Rebase Eigen to 3.3.3.

Bug: 34161771 Test: mm and RenderScript BLAS tests pass on bullhead. Change-Id: Ia448b3202708e395fed9c783ea4323289d69dbef
author: Miao Wang <miaowang@google.com> 2017-03-06 13:45:08 -0800
committer: Miao Wang <miaowang@google.com> 2017-03-07 16:30:11 -0800
commit: 2b8756b6f1de65d3f8bffab45be6c44ceb7411fc (patch)
tree: 0488797fc544fe977bec6418c73445759f052482
parent: 353bba589de58014a35f8f3666b7b96353c300f8 (diff)
download: eigen-2b8756b6f1de65d3f8bffab45be6c44ceb7411fc.tar.gz
1078 files changed, 132132 insertions, 44555 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 76a11b9d2..f5840025b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,6 @@
-project(Eigen)
+project(Eigen3)
 
-cmake_minimum_required(VERSION 2.8.2)
+cmake_minimum_required(VERSION 2.8.5)
 
 # guard against in-source builds
 
@@ -8,6 +8,11 @@ if(${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_BINARY_DIR})
   message(FATAL_ERROR "In-source builds not allowed. Please make a new directory (called a build directory) and run CMake from there. You may need to remove CMakeCache.txt. ")
 endif()
 
+# Alias Eigen_*_DIR to Eigen3_*_DIR:
+
+set(Eigen_SOURCE_DIR ${Eigen3_SOURCE_DIR})
+set(Eigen_BINARY_DIR ${Eigen3_BINARY_DIR})
+
 # guard against bad build-type strings
 
 if (NOT CMAKE_BUILD_TYPE)
@@ -55,6 +60,7 @@ endif(EIGEN_HG_CHANGESET)
 
 
 include(CheckCXXCompilerFlag)
+include(GNUInstallDirs)
 
 set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
 
@@ -92,9 +98,11 @@ else()
 endif()
 
 option(EIGEN_BUILD_BTL "Build benchmark suite" OFF)
-if(NOT WIN32)
+
+# Disable pkgconfig only for native Windows builds
+if(NOT WIN32 OR NOT CMAKE_HOST_SYSTEM_NAME MATCHES Windows)
   option(EIGEN_BUILD_PKGCONFIG "Build pkg-config .pc file for Eigen" ON)
-endif(NOT WIN32)
+endif()
 
 set(CMAKE_INCLUDE_CURRENT_DIR ON)
 
@@ -108,7 +116,8 @@ endif()
 set(EIGEN_TEST_MAX_SIZE "320" CACHE STRING "Maximal matrix/vector size, default is 320")
 
 macro(ei_add_cxx_compiler_flag FLAG)
-  string(REGEX REPLACE "-" "" SFLAG ${FLAG})
+  string(REGEX REPLACE "-" "" SFLAG1 ${FLAG})
+  string(REGEX REPLACE "\\+" "p" SFLAG ${SFLAG1})
   check_cxx_compiler_flag(${FLAG} COMPILER_SUPPORT_${SFLAG})
   if(COMPILER_SUPPORT_${SFLAG})
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAG}")
@@ -117,18 +126,13 @@ endmacro(ei_add_cxx_compiler_flag)
 
 if(NOT MSVC)
   # We assume that other compilers are partly compatible with GNUCC
-  
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions")
-  set(CMAKE_CXX_FLAGS_DEBUG "-g3")
-  set(CMAKE_CXX_FLAGS_RELEASE "-g0 -O2")
-  
-  # clang outputs some warnings for unknwon flags that are not caught by check_cxx_compiler_flag
+
+  # clang outputs some warnings for unknown flags that are not caught by check_cxx_compiler_flag
   # adding -Werror turns such warnings into errors
   check_cxx_compiler_flag("-Werror" COMPILER_SUPPORT_WERROR)
   if(COMPILER_SUPPORT_WERROR)
     set(CMAKE_REQUIRED_FLAGS "-Werror")
   endif()
-  
   ei_add_cxx_compiler_flag("-pedantic")
   ei_add_cxx_compiler_flag("-Wall")
   ei_add_cxx_compiler_flag("-Wextra")
@@ -142,6 +146,18 @@ if(NOT MSVC)
   ei_add_cxx_compiler_flag("-Wpointer-arith")
   ei_add_cxx_compiler_flag("-Wwrite-strings")
   ei_add_cxx_compiler_flag("-Wformat-security")
+  ei_add_cxx_compiler_flag("-Wshorten-64-to-32")
+  ei_add_cxx_compiler_flag("-Wlogical-op")
+  ei_add_cxx_compiler_flag("-Wenum-conversion")
+  ei_add_cxx_compiler_flag("-Wc++11-extensions")
+  ei_add_cxx_compiler_flag("-Wdouble-promotion")
+#  ei_add_cxx_compiler_flag("-Wconversion")
+  
+  # -Wshadow is insanely too strict with gcc, hopefully it will become usable with gcc 6
+  # if(NOT CMAKE_COMPILER_IS_GNUCXX OR (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "5.0.0"))
+  if(NOT CMAKE_COMPILER_IS_GNUCXX)
+    ei_add_cxx_compiler_flag("-Wshadow")
+  endif()
   
   ei_add_cxx_compiler_flag("-Wno-psabi")
   ei_add_cxx_compiler_flag("-Wno-variadic-macros")
@@ -151,7 +167,8 @@ if(NOT MSVC)
   ei_add_cxx_compiler_flag("-fno-common")
   ei_add_cxx_compiler_flag("-fstrict-aliasing")
   ei_add_cxx_compiler_flag("-wd981")                    # disable ICC's "operands are evaluated in unspecified order" remark
-  ei_add_cxx_compiler_flag("-wd2304")                   # disbale ICC's "warning #2304: non-explicit constructor with single argument may cause implicit type conversion" produced by -Wnon-virtual-dtor
+  ei_add_cxx_compiler_flag("-wd2304")                   # disable ICC's "warning #2304: non-explicit constructor with single argument may cause implicit type conversion" produced by -Wnon-virtual-dtor
+  
   
   # The -ansi flag must be added last, otherwise it is also used as a linker flag by check_cxx_compiler_flag making it fails
   # Moreover we should not set both -strict-ansi and -ansi
@@ -163,6 +180,11 @@ if(NOT MSVC)
   else()
     ei_add_cxx_compiler_flag("-ansi")
   endif()
+
+  if(ANDROID_NDK)
+    ei_add_cxx_compiler_flag("-pie")
+    ei_add_cxx_compiler_flag("-fPIE")
+  endif()
   
   set(CMAKE_REQUIRED_FLAGS "")
 
@@ -196,18 +218,65 @@ if(NOT MSVC)
     message(STATUS "Enabling SSE4.2 in tests/examples")
   endif()
 
+  option(EIGEN_TEST_AVX "Enable/Disable AVX in tests/examples" OFF)
+  if(EIGEN_TEST_AVX)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx")
+    message(STATUS "Enabling AVX in tests/examples")
+  endif()
+
+  option(EIGEN_TEST_FMA "Enable/Disable FMA in tests/examples" OFF)
+  if(EIGEN_TEST_FMA AND NOT EIGEN_TEST_NEON)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfma")
+    message(STATUS "Enabling FMA in tests/examples")
+  endif()
+
+  option(EIGEN_TEST_AVX512 "Enable/Disable AVX512 in tests/examples" OFF)
+  if(EIGEN_TEST_AVX512)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -fabi-version=6 -DEIGEN_ENABLE_AVX512")
+    message(STATUS "Enabling AVX512 in tests/examples")
+  endif()
+
+  option(EIGEN_TEST_F16C "Enable/Disable F16C in tests/examples" OFF)
+  if(EIGEN_TEST_F16C)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mf16c")
+    message(STATUS "Enabling F16C in tests/examples")
+  endif()
+
   option(EIGEN_TEST_ALTIVEC "Enable/Disable AltiVec in tests/examples" OFF)
   if(EIGEN_TEST_ALTIVEC)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maltivec -mabi=altivec")
     message(STATUS "Enabling AltiVec in tests/examples")
   endif()
 
+  option(EIGEN_TEST_VSX "Enable/Disable VSX in tests/examples" OFF)
+  if(EIGEN_TEST_VSX)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64 -mvsx")
+    message(STATUS "Enabling VSX in tests/examples")
+  endif()
+
   option(EIGEN_TEST_NEON "Enable/Disable Neon in tests/examples" OFF)
   if(EIGEN_TEST_NEON)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon -mcpu=cortex-a8")
+    if(EIGEN_TEST_FMA)
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon-vfpv4")
+    else()
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon")
+    endif()
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfloat-abi=hard")
     message(STATUS "Enabling NEON in tests/examples")
   endif()
 
+  option(EIGEN_TEST_NEON64 "Enable/Disable Neon in tests/examples" OFF)
+  if(EIGEN_TEST_NEON64)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+    message(STATUS "Enabling NEON in tests/examples")
+  endif()
+
+  option(EIGEN_TEST_ZVECTOR "Enable/Disable S390X(zEC13) ZVECTOR in tests/examples" OFF)
+  if(EIGEN_TEST_ZVECTOR)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z13 -mzvector")
+    message(STATUS "Enabling S390X(zEC13) ZVECTOR in tests/examples")
+  endif()
+
   check_cxx_compiler_flag("-fopenmp" COMPILER_SUPPORT_OPENMP)
   if(COMPILER_SUPPORT_OPENMP)
     option(EIGEN_TEST_OPENMP "Enable/Disable OpenMP in tests/examples" OFF)
@@ -284,28 +353,41 @@ if(EIGEN_TEST_NO_EXPLICIT_ALIGNMENT)
   message(STATUS "Disabling alignment in tests/examples")
 endif()
 
-option(EIGEN_TEST_C++0x "Enables all C++0x features." OFF)
+option(EIGEN_TEST_NO_EXCEPTIONS "Disables C++ exceptions" OFF)
+if(EIGEN_TEST_NO_EXCEPTIONS)
+  ei_add_cxx_compiler_flag("-fno-exceptions")
+  message(STATUS "Disabling exceptions in tests/examples")
+endif()
 
-include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
+option(EIGEN_TEST_CXX11 "Enable testing with C++11 and C++11 features (e.g. Tensor module)." OFF)
 
-# the user modifiable install path for header files
-set(EIGEN_INCLUDE_INSTALL_DIR ${EIGEN_INCLUDE_INSTALL_DIR} CACHE PATH "The directory where we install the header files (optional)")
+set(EIGEN_CUDA_COMPUTE_ARCH 30 CACHE STRING "The CUDA compute architecture level to target when compiling CUDA code")
+
+include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
 
-# set the internal install path for header files which depends on wether the user modifiable
-# EIGEN_INCLUDE_INSTALL_DIR has been set by the user or not.
+# Backward compatibility support for EIGEN_INCLUDE_INSTALL_DIR
 if(EIGEN_INCLUDE_INSTALL_DIR)
-  set(INCLUDE_INSTALL_DIR
-    ${EIGEN_INCLUDE_INSTALL_DIR}
-    CACHE INTERNAL
-    "The directory where we install the header files (internal)"
-  )
+  message(WARNING "EIGEN_INCLUDE_INSTALL_DIR is deprecated. Use INCLUDE_INSTALL_DIR instead.")
+endif()
+
+if(EIGEN_INCLUDE_INSTALL_DIR AND NOT INCLUDE_INSTALL_DIR)
+  set(INCLUDE_INSTALL_DIR ${EIGEN_INCLUDE_INSTALL_DIR}
+      CACHE PATH "The directory relative to CMAKE_PREFIX_PATH where Eigen header files are installed")
 else()
   set(INCLUDE_INSTALL_DIR
-    "${CMAKE_INSTALL_PREFIX}/include/eigen3"
-    CACHE INTERNAL
-    "The directory where we install the header files (internal)"
-  )
+      "${CMAKE_INSTALL_INCLUDEDIR}/eigen3"
+      CACHE PATH "The directory relative to CMAKE_PREFIX_PATH where Eigen header files are installed"
+      )
 endif()
+set(CMAKEPACKAGE_INSTALL_DIR
+    "${CMAKE_INSTALL_DATADIR}/eigen3/cmake"
+    CACHE PATH "The directory relative to CMAKE_PREFIX_PATH where Eigen3Config.cmake is installed"
+    )
+set(PKGCONFIG_INSTALL_DIR
+    "${CMAKE_INSTALL_DATADIR}/pkgconfig"
+    CACHE PATH "The directory relative to CMAKE_PREFIX_PATH where eigen3.pc is installed"
+    )
+
 
 # similar to set_target_properties but append the property instead of overwriting it
 macro(ei_add_target_property target prop value)
@@ -324,23 +406,11 @@ install(FILES
   )
 
 if(EIGEN_BUILD_PKGCONFIG)
-    SET(path_separator ":")
-    STRING(REPLACE ${path_separator} ";" pkg_config_libdir_search "$ENV{PKG_CONFIG_LIBDIR}")
-    message(STATUS "searching for 'pkgconfig' directory in PKG_CONFIG_LIBDIR ( $ENV{PKG_CONFIG_LIBDIR} ), ${CMAKE_INSTALL_PREFIX}/share, and ${CMAKE_INSTALL_PREFIX}/lib")
-    FIND_PATH(pkg_config_libdir pkgconfig ${pkg_config_libdir_search} ${CMAKE_INSTALL_PREFIX}/share ${CMAKE_INSTALL_PREFIX}/lib ${pkg_config_libdir_search})
-    if(pkg_config_libdir)
-        SET(pkg_config_install_dir ${pkg_config_libdir})
-        message(STATUS "found ${pkg_config_libdir}/pkgconfig" )
-    else(pkg_config_libdir)
-        SET(pkg_config_install_dir ${CMAKE_INSTALL_PREFIX}/share)
-        message(STATUS "pkgconfig not found; installing in ${pkg_config_install_dir}" )
-    endif(pkg_config_libdir)
-
-    configure_file(eigen3.pc.in eigen3.pc)
+    configure_file(eigen3.pc.in eigen3.pc @ONLY)
     install(FILES ${CMAKE_CURRENT_BINARY_DIR}/eigen3.pc
-        DESTINATION ${pkg_config_install_dir}/pkgconfig
+        DESTINATION ${PKGCONFIG_INSTALL_DIR}
         )
-endif(EIGEN_BUILD_PKGCONFIG)
+endif()
 
 add_subdirectory(Eigen)
 
@@ -366,6 +436,13 @@ else()
   add_subdirectory(lapack EXCLUDE_FROM_ALL)
 endif()
 
+# add SYCL
+option(EIGEN_TEST_SYCL "Add Sycl support." OFF)
+if(EIGEN_TEST_SYCL)
+  set (CMAKE_MODULE_PATH "${CMAKE_ROOT}/Modules" "cmake/Modules/" "${CMAKE_MODULE_PATH}")
+  include(FindComputeCpp)
+endif()
+
 add_subdirectory(unsupported)
 
 add_subdirectory(demos EXCLUDE_FROM_ALL)
@@ -401,16 +478,20 @@ if(cmake_generator_tolower MATCHES "makefile")
   message(STATUS "--------------+--------------------------------------------------------------")
   message(STATUS "Command       |   Description")
   message(STATUS "--------------+--------------------------------------------------------------")
-  message(STATUS "make install  | Install to ${CMAKE_INSTALL_PREFIX}. To change that:")
-  message(STATUS "              |     cmake . -DCMAKE_INSTALL_PREFIX=yourpath")
-  message(STATUS "              |   Eigen headers will then be installed to:")
-  message(STATUS "              |     ${INCLUDE_INSTALL_DIR}")
-  message(STATUS "              |   To install Eigen headers to a separate location, do:")
-  message(STATUS "              |     cmake . -DEIGEN_INCLUDE_INSTALL_DIR=yourpath")
+  message(STATUS "make install  | Install Eigen. Headers will be installed to:")
+  message(STATUS "              |     <CMAKE_INSTALL_PREFIX>/<INCLUDE_INSTALL_DIR>")
+  message(STATUS "              |   Using the following values:")
+  message(STATUS "              |     CMAKE_INSTALL_PREFIX: ${CMAKE_INSTALL_PREFIX}")
+  message(STATUS "              |     INCLUDE_INSTALL_DIR:  ${INCLUDE_INSTALL_DIR}")
+  message(STATUS "              |   Change the install location of Eigen headers using:")
+  message(STATUS "              |     cmake . -DCMAKE_INSTALL_PREFIX=yourprefix")
+  message(STATUS "              |   Or:")
+  message(STATUS "              |     cmake . -DINCLUDE_INSTALL_DIR=yourdir")
   message(STATUS "make doc      | Generate the API documentation, requires Doxygen & LaTeX")
   message(STATUS "make check    | Build and run the unit-tests. Read this page:")
   message(STATUS "              |   http://eigen.tuxfamily.org/index.php?title=Tests")
   message(STATUS "make blas     | Build BLAS library (not the same thing as Eigen)")
+  message(STATUS "make uninstall| Removes files installed by make install")
   message(STATUS "--------------+--------------------------------------------------------------")
 else()
   message(STATUS "To build/run the unit tests, read this page:")
@@ -418,3 +499,98 @@ else()
 endif()
 
 message(STATUS "")
+
+
+set ( EIGEN_VERSION_STRING ${EIGEN_VERSION_NUMBER} )
+set ( EIGEN_VERSION_MAJOR  ${EIGEN_WORLD_VERSION} )
+set ( EIGEN_VERSION_MINOR  ${EIGEN_MAJOR_VERSION} )
+set ( EIGEN_VERSION_PATCH  ${EIGEN_MINOR_VERSION} )
+set ( EIGEN_DEFINITIONS "")
+set ( EIGEN_INCLUDE_DIR "${CMAKE_INSTALL_PREFIX}/${INCLUDE_INSTALL_DIR}" )
+set ( EIGEN_ROOT_DIR ${CMAKE_INSTALL_PREFIX} )
+
+# Interface libraries require at least CMake 3.0
+if (NOT CMAKE_VERSION VERSION_LESS 3.0)
+  include (CMakePackageConfigHelpers)
+
+  # Imported target support
+  add_library (eigen INTERFACE)
+
+  target_compile_definitions (eigen INTERFACE ${EIGEN_DEFINITIONS})
+  target_include_directories (eigen INTERFACE
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
+    $<INSTALL_INTERFACE:${INCLUDE_INSTALL_DIR}>
+  )
+
+  # Export as title case Eigen
+  set_target_properties (eigen PROPERTIES EXPORT_NAME Eigen)
+
+  install (TARGETS eigen EXPORT Eigen3Targets)
+
+  configure_package_config_file (
+    ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3Config.cmake.in
+    ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
+    PATH_VARS EIGEN_INCLUDE_DIR EIGEN_ROOT_DIR
+    INSTALL_DESTINATION ${CMAKEPACKAGE_INSTALL_DIR}
+    NO_CHECK_REQUIRED_COMPONENTS_MACRO # Eigen does not provide components
+  )
+  # Remove CMAKE_SIZEOF_VOID_P from Eigen3ConfigVersion.cmake since Eigen does
+  # not depend on architecture specific settings or libraries. More
+  # specifically, an Eigen3Config.cmake generated from a 64 bit target can be
+  # used for 32 bit targets as well (and vice versa).
+  set (_Eigen3_CMAKE_SIZEOF_VOID_P ${CMAKE_SIZEOF_VOID_P})
+  unset (CMAKE_SIZEOF_VOID_P)
+  write_basic_package_version_file (Eigen3ConfigVersion.cmake
+                                    VERSION ${EIGEN_VERSION_NUMBER}
+                                    COMPATIBILITY SameMajorVersion)
+  set (CMAKE_SIZEOF_VOID_P ${_Eigen3_CMAKE_SIZEOF_VOID_P})
+
+  # The Eigen target will be located in the Eigen3 namespace. Other CMake
+  # targets can refer to it using Eigen3::Eigen.
+  export (TARGETS eigen NAMESPACE Eigen3:: FILE Eigen3Targets.cmake)
+  # Export Eigen3 package to CMake registry such that it can be easily found by
+  # CMake even if it has not been installed to a standard directory.
+  export (PACKAGE Eigen3)
+
+  install (EXPORT Eigen3Targets NAMESPACE Eigen3:: DESTINATION ${CMAKEPACKAGE_INSTALL_DIR})
+
+else (NOT CMAKE_VERSION VERSION_LESS 3.0)
+  # Fallback to legacy Eigen3Config.cmake without the imported target
+  
+  # If CMakePackageConfigHelpers module is available (CMake >= 2.8.8)
+  # create a relocatable Config file, otherwise leave the hardcoded paths       
+  include(CMakePackageConfigHelpers OPTIONAL RESULT_VARIABLE CPCH_PATH)
+  
+  if(CPCH_PATH)
+    configure_package_config_file (
+      ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3ConfigLegacy.cmake.in
+      ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
+      PATH_VARS EIGEN_INCLUDE_DIR EIGEN_ROOT_DIR
+      INSTALL_DESTINATION ${CMAKEPACKAGE_INSTALL_DIR}
+      NO_CHECK_REQUIRED_COMPONENTS_MACRO # Eigen does not provide components
+    )
+  else() 
+    # The PACKAGE_* variables are defined by the configure_package_config_file
+    # but without it we define them manually to the hardcoded paths
+    set(PACKAGE_INIT "")
+    set(PACKAGE_EIGEN_INCLUDE_DIR ${EIGEN_INCLUDE_DIR})
+    set(PACKAGE_EIGEN_ROOT_DIR ${EIGEN_ROOT_DIR})
+    configure_file ( ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3ConfigLegacy.cmake.in
+                     ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
+                     @ONLY ESCAPE_QUOTES )
+  endif()
+
+  write_basic_package_version_file( Eigen3ConfigVersion.cmake
+                                    VERSION ${EIGEN_VERSION_NUMBER}
+                                    COMPATIBILITY SameMajorVersion )
+
+endif (NOT CMAKE_VERSION VERSION_LESS 3.0)
+
+install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/cmake/UseEigen3.cmake
+                ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
+                ${CMAKE_CURRENT_BINARY_DIR}/Eigen3ConfigVersion.cmake
+          DESTINATION ${CMAKEPACKAGE_INSTALL_DIR} )
+
+# Add uninstall target
+add_custom_target ( uninstall
+    COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/EigenUninstall.cmake)
diff --git a/CTestConfig.cmake b/CTestConfig.cmake
index 0557c491a..755b47323 100644
--- a/CTestConfig.cmake
+++ b/CTestConfig.cmake
@@ -4,10 +4,10 @@
 ## # The following are required to uses Dart and the Cdash dashboard
 ##   ENABLE_TESTING()
 ##   INCLUDE(CTest)
-set(CTEST_PROJECT_NAME "Eigen3.2")
+set(CTEST_PROJECT_NAME "Eigen3.3")
 set(CTEST_NIGHTLY_START_TIME "00:00:00 UTC")
 
 set(CTEST_DROP_METHOD "http")
 set(CTEST_DROP_SITE "manao.inria.fr")
-set(CTEST_DROP_LOCATION "/CDash/submit.php?project=Eigen3.2")
+set(CTEST_DROP_LOCATION "/CDash/submit.php?project=Eigen3.3")
 set(CTEST_DROP_SITE_CDASH TRUE)
diff --git a/Eigen/Array b/Eigen/Array
deleted file mode 100644
index 3d004fb69..000000000
--- a/Eigen/Array
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifndef EIGEN_ARRAY_MODULE_H
-#define EIGEN_ARRAY_MODULE_H
-
-// include Core first to handle Eigen2 support macros
-#include "Core"
-
-#ifndef EIGEN2_SUPPORT
-  #error The Eigen/Array header does no longer exist in Eigen3. All that functionality has moved to Eigen/Core.
-#endif
-
-#endif // EIGEN_ARRAY_MODULE_H
diff --git a/Eigen/CMakeLists.txt b/Eigen/CMakeLists.txt
index a92dd6f6c..9eb502b79 100644
--- a/Eigen/CMakeLists.txt
+++ b/Eigen/CMakeLists.txt
@@ -16,4 +16,4 @@ install(FILES
   DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen COMPONENT Devel
   )
 
-add_subdirectory(src)
+install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen COMPONENT Devel FILES_MATCHING PATTERN "*.h")
diff --git a/Eigen/Cholesky b/Eigen/Cholesky
index f727f5d89..369d1f5ec 100644
--- a/Eigen/Cholesky
+++ b/Eigen/Cholesky
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_CHOLESKY_MODULE_H
 #define EIGEN_CHOLESKY_MODULE_H
 
@@ -10,20 +17,22 @@
   *
   *
   * This module provides two variants of the Cholesky decomposition for selfadjoint (hermitian) matrices.
-  * Those decompositions are accessible via the following MatrixBase methods:
-  *  - MatrixBase::llt(),
+  * Those decompositions are also accessible via the following methods:
+  *  - MatrixBase::llt()
   *  - MatrixBase::ldlt()
+  *  - SelfAdjointView::llt()
+  *  - SelfAdjointView::ldlt()
   *
   * \code
   * #include <Eigen/Cholesky>
   * \endcode
   */
 
-#include "src/misc/Solve.h"
 #include "src/Cholesky/LLT.h"
 #include "src/Cholesky/LDLT.h"
 #ifdef EIGEN_USE_LAPACKE
-#include "src/Cholesky/LLT_MKL.h"
+#include "src/misc/lapacke.h"
+#include "src/Cholesky/LLT_LAPACKE.h"
 #endif
 
 #include "src/Core/util/ReenableStupidWarnings.h"
diff --git a/Eigen/CholmodSupport b/Eigen/CholmodSupport
index 745b884e7..bed8924d3 100644
--- a/Eigen/CholmodSupport
+++ b/Eigen/CholmodSupport
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_CHOLMODSUPPORT_MODULE_H
 #define EIGEN_CHOLMODSUPPORT_MODULE_H
 
@@ -12,7 +19,7 @@ extern "C" {
 /** \ingroup Support_modules
   * \defgroup CholmodSupport_Module CholmodSupport module
   *
-  * This module provides an interface to the Cholmod library which is part of the <a href="http://www.cise.ufl.edu/research/sparse/SuiteSparse/">suitesparse</a> package.
+  * This module provides an interface to the Cholmod library which is part of the <a href="http://www.suitesparse.com">suitesparse</a> package.
   * It provides the two following main factorization classes:
   * - class CholmodSupernodalLLT: a supernodal LLT Cholesky factorization.
   * - class CholmodDecomposiiton: a general L(D)LT Cholesky factorization with automatic or explicit runtime selection of the underlying factorization method (supernodal or simplicial).
@@ -33,12 +40,8 @@ extern "C" {
   *
   */
 
-#include "src/misc/Solve.h"
-#include "src/misc/SparseSolve.h"
-
 #include "src/CholmodSupport/CholmodSupport.h"
 
-
 #include "src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_CHOLMODSUPPORT_MODULE_H
diff --git a/Eigen/Core b/Eigen/Core
index 509c529e1..0f7fa630d 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -14,6 +14,58 @@
 // first thing Eigen does: stop the compiler from committing suicide
 #include "src/Core/util/DisableStupidWarnings.h"
 
+// Handle NVCC/CUDA/SYCL
+#if defined(__CUDACC__) || defined(__SYCL_DEVICE_ONLY__)
+  // Do not try asserts on CUDA and SYCL!
+  #ifndef EIGEN_NO_DEBUG
+  #define EIGEN_NO_DEBUG
+  #endif
+
+  #ifdef EIGEN_INTERNAL_DEBUGGING
+  #undef EIGEN_INTERNAL_DEBUGGING
+  #endif
+
+  #ifdef EIGEN_EXCEPTIONS
+  #undef EIGEN_EXCEPTIONS
+  #endif
+
+  // All functions callable from CUDA code must be qualified with __device__
+  #ifdef __CUDACC__
+    // Do not try to vectorize on CUDA and SYCL!
+    #ifndef EIGEN_DONT_VECTORIZE
+    #define EIGEN_DONT_VECTORIZE
+    #endif
+
+    #define EIGEN_DEVICE_FUNC __host__ __device__
+    // We need math_functions.hpp to ensure that that EIGEN_USING_STD_MATH macro
+    // works properly on the device side
+    #include <math_functions.hpp>
+  #else
+    #define EIGEN_DEVICE_FUNC
+  #endif
+
+#else
+  #define EIGEN_DEVICE_FUNC
+
+#endif
+
+// When compiling CUDA device code with NVCC, pull in math functions from the
+// global namespace.  In host mode, and when device doee with clang, use the
+// std versions.
+#if defined(__CUDA_ARCH__) && defined(__NVCC__)
+  #define EIGEN_USING_STD_MATH(FUNC) using ::FUNC;
+#else
+  #define EIGEN_USING_STD_MATH(FUNC) using std::FUNC;
+#endif
+
+#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL)
+  #define EIGEN_EXCEPTIONS
+#endif
+
+#ifdef EIGEN_EXCEPTIONS
+  #include <new>
+#endif
+
 // then include this file where all our macros are defined. It's really important to do it first because
 // it's where we do all the alignment settings (platform detection and honoring the user's will if he
 // defined e.g. EIGEN_DONT_ALIGN) so it needs to be done before we do anything with vectorization.
@@ -21,7 +73,7 @@
 
 // Disable the ipa-cp-clone optimization flag with MinGW 6.x or newer (enabled by default with -O3)
 // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=556 for details.
-#if defined(__MINGW32__) && EIGEN_GNUC_AT_LEAST(4,6)
+#if EIGEN_COMP_MINGW && EIGEN_GNUC_AT_LEAST(4,6)
   #pragma GCC optimize ("-fno-ipa-cp-clone")
 #endif
 
@@ -31,26 +83,26 @@
 // and inclusion of their respective header files
 #include "src/Core/util/MKL_support.h"
 
-// if alignment is disabled, then disable vectorization. Note: EIGEN_ALIGN is the proper check, it takes into
-// account both the user's will (EIGEN_DONT_ALIGN) and our own platform checks
-#if !EIGEN_ALIGN
+// if alignment is disabled, then disable vectorization. Note: EIGEN_MAX_ALIGN_BYTES is the proper check, it takes into
+// account both the user's will (EIGEN_MAX_ALIGN_BYTES,EIGEN_DONT_ALIGN) and our own platform checks
+#if EIGEN_MAX_ALIGN_BYTES==0
   #ifndef EIGEN_DONT_VECTORIZE
     #define EIGEN_DONT_VECTORIZE
   #endif
 #endif
 
-#ifdef _MSC_VER
+#if EIGEN_COMP_MSVC
   #include <malloc.h> // for _aligned_malloc -- need it regardless of whether vectorization is enabled
-  #if (_MSC_VER >= 1500) // 2008 or later
+  #if (EIGEN_COMP_MSVC >= 1500) // 2008 or later
     // Remember that usage of defined() in a #define is undefined by the standard.
     // a user reported that in 64-bit mode, MSVC doesn't care to define _M_IX86_FP.
-    #if (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || defined(_M_X64)
+    #if (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || EIGEN_ARCH_x86_64
       #define EIGEN_SSE2_ON_MSVC_2008_OR_LATER
     #endif
   #endif
 #else
   // Remember that usage of defined() in a #define is undefined by the standard
-  #if (defined __SSE2__) && ( (!defined __GNUC__) || (defined __INTEL_COMPILER) || EIGEN_GNUC_AT_LEAST(4,2) )
+  #if (defined __SSE2__) && ( (!EIGEN_COMP_GNUC) || EIGEN_COMP_ICC || EIGEN_GNUC_AT_LEAST(4,2) )
     #define EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC
   #endif
 #endif
@@ -82,6 +134,28 @@
     #ifdef __SSE4_2__
       #define EIGEN_VECTORIZE_SSE4_2
     #endif
+    #ifdef __AVX__
+      #define EIGEN_VECTORIZE_AVX
+      #define EIGEN_VECTORIZE_SSE3
+      #define EIGEN_VECTORIZE_SSSE3
+      #define EIGEN_VECTORIZE_SSE4_1
+      #define EIGEN_VECTORIZE_SSE4_2
+    #endif
+    #ifdef __AVX2__
+      #define EIGEN_VECTORIZE_AVX2
+    #endif
+    #ifdef __FMA__
+      #define EIGEN_VECTORIZE_FMA
+    #endif
+    #if defined(__AVX512F__) && defined(EIGEN_ENABLE_AVX512)
+      #define EIGEN_VECTORIZE_AVX512
+      #define EIGEN_VECTORIZE_AVX2
+      #define EIGEN_VECTORIZE_AVX
+      #define EIGEN_VECTORIZE_FMA
+      #ifdef __AVX512DQ__
+        #define EIGEN_VECTORIZE_AVX512DQ
+      #endif
+    #endif
 
     // include files
 
@@ -95,9 +169,10 @@
     extern "C" {
       // In theory we should only include immintrin.h and not the other *mmintrin.h header files directly.
       // Doing so triggers some issues with ICC. However old gcc versions seems to not have this file, thus:
-      #if defined(__INTEL_COMPILER) && __INTEL_COMPILER >= 1110
+      #if EIGEN_COMP_ICC >= 1110
         #include <immintrin.h>
       #else
+        #include <mmintrin.h>
         #include <emmintrin.h>
         #include <xmmintrin.h>
         #ifdef  EIGEN_VECTORIZE_SSE3
@@ -112,8 +187,20 @@
         #ifdef EIGEN_VECTORIZE_SSE4_2
         #include <nmmintrin.h>
         #endif
+        #if defined(EIGEN_VECTORIZE_AVX) || defined(EIGEN_VECTORIZE_AVX512)
+        #include <immintrin.h>
+        #endif
       #endif
     } // end extern "C"
+  #elif defined __VSX__
+    #define EIGEN_VECTORIZE
+    #define EIGEN_VECTORIZE_VSX
+    #include <altivec.h>
+    // We need to #undef all these ugly tokens defined in <altivec.h>
+    // => use __vector instead of vector
+    #undef bool
+    #undef vector
+    #undef pixel
   #elif defined __ALTIVEC__
     #define EIGEN_VECTORIZE
     #define EIGEN_VECTORIZE_ALTIVEC
@@ -123,13 +210,35 @@
     #undef bool
     #undef vector
     #undef pixel
-  #elif defined  __ARM_NEON
+  #elif (defined  __ARM_NEON) || (defined __ARM_NEON__)
     #define EIGEN_VECTORIZE
     #define EIGEN_VECTORIZE_NEON
     #include <arm_neon.h>
+  #elif (defined __s390x__ && defined __VEC__)
+    #define EIGEN_VECTORIZE
+    #define EIGEN_VECTORIZE_ZVECTOR
+    #include <vecintrin.h>
   #endif
 #endif
 
+#if defined(__F16C__) && !defined(EIGEN_COMP_CLANG)
+  // We can use the optimized fp16 to float and float to fp16 conversion routines
+  #define EIGEN_HAS_FP16_C
+#endif
+
+#if defined __CUDACC__
+  #define EIGEN_VECTORIZE_CUDA
+  #include <vector_types.h>
+  #if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+    #define EIGEN_HAS_CUDA_FP16
+  #endif
+#endif
+
+#if defined EIGEN_HAS_CUDA_FP16
+  #include <host_defines.h>
+  #include <cuda_fp16.h>
+#endif
+
 #if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE)
   #define EIGEN_HAS_OPENMP
 #endif
@@ -139,7 +248,7 @@
 #endif
 
 // MSVC for windows mobile does not have the errno.h file
-#if !(defined(_MSC_VER) && defined(_WIN32_WCE)) && !defined(__ARMCC_VERSION)
+#if !(EIGEN_COMP_MSVC && EIGEN_OS_WINCE) && !EIGEN_COMP_ARM
 #define EIGEN_HAS_ERRNO
 #endif
 
@@ -159,29 +268,30 @@
 // for min/max:
 #include <algorithm>
 
+// for std::is_nothrow_move_assignable
+#ifdef EIGEN_INCLUDE_TYPE_TRAITS
+#include <type_traits>
+#endif
+
 // for outputting debug info
 #ifdef EIGEN_DEBUG_ASSIGN
 #include <iostream>
 #endif
 
 // required for __cpuid, needs to be included after cmath
-#if defined(_MSC_VER) && (defined(_M_IX86)||defined(_M_X64)) && (!defined(_WIN32_WCE))
+#if EIGEN_COMP_MSVC && EIGEN_ARCH_i386_OR_x86_64 && !EIGEN_OS_WINCE
   #include <intrin.h>
 #endif
 
-#if defined(_CPPUNWIND) || defined(__EXCEPTIONS)
-  #define EIGEN_EXCEPTIONS
-#endif
-
-#ifdef EIGEN_EXCEPTIONS
-  #include <new>
-#endif
-
 /** \brief Namespace containing all symbols from the %Eigen library. */
 namespace Eigen {
 
 inline static const char *SimdInstructionSetsInUse(void) {
-#if defined(EIGEN_VECTORIZE_SSE4_2)
+#if defined(EIGEN_VECTORIZE_AVX512)
+  return "AVX512, FMA, AVX2, AVX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
+#elif defined(EIGEN_VECTORIZE_AVX)
+  return "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
+#elif defined(EIGEN_VECTORIZE_SSE4_2)
   return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
 #elif defined(EIGEN_VECTORIZE_SSE4_1)
   return "SSE, SSE2, SSE3, SSSE3, SSE4.1";
@@ -193,8 +303,12 @@ inline static const char *SimdInstructionSetsInUse(void) {
   return "SSE, SSE2";
 #elif defined(EIGEN_VECTORIZE_ALTIVEC)
   return "AltiVec";
+#elif defined(EIGEN_VECTORIZE_VSX)
+  return "VSX";
 #elif defined(EIGEN_VECTORIZE_NEON)
   return "ARM NEON";
+#elif defined(EIGEN_VECTORIZE_ZVECTOR)
+  return "S390X ZVECTOR";
 #else
   return "None";
 #endif
@@ -202,42 +316,21 @@ inline static const char *SimdInstructionSetsInUse(void) {
 
 } // end namespace Eigen
 
-#define STAGE10_FULL_EIGEN2_API             10
-#define STAGE20_RESOLVE_API_CONFLICTS       20
-#define STAGE30_FULL_EIGEN3_API             30
-#define STAGE40_FULL_EIGEN3_STRICTNESS      40
-#define STAGE99_NO_EIGEN2_SUPPORT           99
-
-#if   defined EIGEN2_SUPPORT_STAGE40_FULL_EIGEN3_STRICTNESS
-  #define EIGEN2_SUPPORT
-  #define EIGEN2_SUPPORT_STAGE STAGE40_FULL_EIGEN3_STRICTNESS
-#elif defined EIGEN2_SUPPORT_STAGE30_FULL_EIGEN3_API
-  #define EIGEN2_SUPPORT
-  #define EIGEN2_SUPPORT_STAGE STAGE30_FULL_EIGEN3_API
-#elif defined EIGEN2_SUPPORT_STAGE20_RESOLVE_API_CONFLICTS
-  #define EIGEN2_SUPPORT
-  #define EIGEN2_SUPPORT_STAGE STAGE20_RESOLVE_API_CONFLICTS
-#elif defined EIGEN2_SUPPORT_STAGE10_FULL_EIGEN2_API
-  #define EIGEN2_SUPPORT
-  #define EIGEN2_SUPPORT_STAGE STAGE10_FULL_EIGEN2_API
-#elif defined EIGEN2_SUPPORT
-  // default to stage 3, that's what it's always meant
-  #define EIGEN2_SUPPORT_STAGE30_FULL_EIGEN3_API
-  #define EIGEN2_SUPPORT_STAGE STAGE30_FULL_EIGEN3_API
-#else
-  #define EIGEN2_SUPPORT_STAGE STAGE99_NO_EIGEN2_SUPPORT
+#if defined EIGEN2_SUPPORT_STAGE40_FULL_EIGEN3_STRICTNESS || defined EIGEN2_SUPPORT_STAGE30_FULL_EIGEN3_API || defined EIGEN2_SUPPORT_STAGE20_RESOLVE_API_CONFLICTS || defined EIGEN2_SUPPORT_STAGE10_FULL_EIGEN2_API || defined EIGEN2_SUPPORT
+// This will generate an error message:
+#error Eigen2-support is only available up to version 3.2. Please go to "http://eigen.tuxfamily.org/index.php?title=Eigen2" for further information
 #endif
 
-#ifdef EIGEN2_SUPPORT
-#undef minor
-#endif
+namespace Eigen {
 
 // we use size_t frequently and we'll never remember to prepend it with std:: everytime just to
 // ensure QNX/QCC support
 using std::size_t;
-// gcc 4.6.0 wants std:: for ptrdiff_t 
+// gcc 4.6.0 wants std:: for ptrdiff_t
 using std::ptrdiff_t;
 
+}
+
 /** \defgroup Core_Module Core module
   * This is the main module of Eigen providing dense matrix and vector support
   * (both fixed and dynamic size) with all the features corresponding to a BLAS library
@@ -249,8 +342,8 @@ using std::ptrdiff_t;
   */
 
 #include "src/Core/util/Constants.h"
-#include "src/Core/util/ForwardDeclarations.h"
 #include "src/Core/util/Meta.h"
+#include "src/Core/util/ForwardDeclarations.h"
 #include "src/Core/util/StaticAssert.h"
 #include "src/Core/util/XprHelper.h"
 #include "src/Core/util/Memory.h"
@@ -258,41 +351,92 @@ using std::ptrdiff_t;
 #include "src/Core/NumTraits.h"
 #include "src/Core/MathFunctions.h"
 #include "src/Core/GenericPacketMath.h"
+#include "src/Core/MathFunctionsImpl.h"
 
-#if defined EIGEN_VECTORIZE_SSE
+#if defined EIGEN_VECTORIZE_AVX512
+  #include "src/Core/arch/SSE/PacketMath.h"
+  #include "src/Core/arch/AVX/PacketMath.h"
+  #include "src/Core/arch/AVX512/PacketMath.h"
+  #include "src/Core/arch/AVX512/MathFunctions.h"
+#elif defined EIGEN_VECTORIZE_AVX
+  // Use AVX for floats and doubles, SSE for integers
+  #include "src/Core/arch/SSE/PacketMath.h"
+  #include "src/Core/arch/SSE/Complex.h"
+  #include "src/Core/arch/SSE/MathFunctions.h"
+  #include "src/Core/arch/AVX/PacketMath.h"
+  #include "src/Core/arch/AVX/MathFunctions.h"
+  #include "src/Core/arch/AVX/Complex.h"
+  #include "src/Core/arch/AVX/TypeCasting.h"
+#elif defined EIGEN_VECTORIZE_SSE
   #include "src/Core/arch/SSE/PacketMath.h"
   #include "src/Core/arch/SSE/MathFunctions.h"
   #include "src/Core/arch/SSE/Complex.h"
-#elif defined EIGEN_VECTORIZE_ALTIVEC
+  #include "src/Core/arch/SSE/TypeCasting.h"
+#elif defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX)
   #include "src/Core/arch/AltiVec/PacketMath.h"
+  #include "src/Core/arch/AltiVec/MathFunctions.h"
   #include "src/Core/arch/AltiVec/Complex.h"
 #elif defined EIGEN_VECTORIZE_NEON
   #include "src/Core/arch/NEON/PacketMath.h"
+  #include "src/Core/arch/NEON/MathFunctions.h"
   #include "src/Core/arch/NEON/Complex.h"
+#elif defined EIGEN_VECTORIZE_ZVECTOR
+  #include "src/Core/arch/ZVector/PacketMath.h"
+  #include "src/Core/arch/ZVector/MathFunctions.h"
+  #include "src/Core/arch/ZVector/Complex.h"
+#endif
+
+// Half float support
+#include "src/Core/arch/CUDA/Half.h"
+#include "src/Core/arch/CUDA/PacketMathHalf.h"
+#include "src/Core/arch/CUDA/TypeCasting.h"
+
+#if defined EIGEN_VECTORIZE_CUDA
+  #include "src/Core/arch/CUDA/PacketMath.h"
+  #include "src/Core/arch/CUDA/MathFunctions.h"
 #endif
 
 #include "src/Core/arch/Default/Settings.h"
 
-#include "src/Core/Functors.h"
+#include "src/Core/functors/TernaryFunctors.h"
+#include "src/Core/functors/BinaryFunctors.h"
+#include "src/Core/functors/UnaryFunctors.h"
+#include "src/Core/functors/NullaryFunctors.h"
+#include "src/Core/functors/StlFunctors.h"
+#include "src/Core/functors/AssignmentFunctors.h"
+
+// Specialized functors to enable the processing of complex numbers
+// on CUDA devices
+#include "src/Core/arch/CUDA/Complex.h"
+
+#include "src/Core/IO.h"
 #include "src/Core/DenseCoeffsBase.h"
 #include "src/Core/DenseBase.h"
 #include "src/Core/MatrixBase.h"
 #include "src/Core/EigenBase.h"
 
+#include "src/Core/Product.h"
+#include "src/Core/CoreEvaluators.h"
+#include "src/Core/AssignEvaluator.h"
+
 #ifndef EIGEN_PARSED_BY_DOXYGEN // work around Doxygen bug triggered by Assign.h r814874
                                 // at least confirmed with Doxygen 1.5.5 and 1.5.6
   #include "src/Core/Assign.h"
 #endif
 
+#include "src/Core/ArrayBase.h"
 #include "src/Core/util/BlasUtil.h"
 #include "src/Core/DenseStorage.h"
 #include "src/Core/NestByValue.h"
-#include "src/Core/ForceAlignedAccess.h"
+
+// #include "src/Core/ForceAlignedAccess.h"
+
 #include "src/Core/ReturnByValue.h"
 #include "src/Core/NoAlias.h"
 #include "src/Core/PlainObjectBase.h"
 #include "src/Core/Matrix.h"
 #include "src/Core/Array.h"
+#include "src/Core/CwiseTernaryOp.h"
 #include "src/Core/CwiseBinaryOp.h"
 #include "src/Core/CwiseUnaryOp.h"
 #include "src/Core/CwiseNullaryOp.h"
@@ -300,32 +444,32 @@ using std::ptrdiff_t;
 #include "src/Core/SelfCwiseBinaryOp.h"
 #include "src/Core/Dot.h"
 #include "src/Core/StableNorm.h"
-#include "src/Core/MapBase.h"
 #include "src/Core/Stride.h"
+#include "src/Core/MapBase.h"
 #include "src/Core/Map.h"
+#include "src/Core/Ref.h"
 #include "src/Core/Block.h"
 #include "src/Core/VectorBlock.h"
-#include "src/Core/Ref.h"
 #include "src/Core/Transpose.h"
 #include "src/Core/DiagonalMatrix.h"
 #include "src/Core/Diagonal.h"
 #include "src/Core/DiagonalProduct.h"
-#include "src/Core/PermutationMatrix.h"
-#include "src/Core/Transpositions.h"
 #include "src/Core/Redux.h"
 #include "src/Core/Visitor.h"
 #include "src/Core/Fuzzy.h"
-#include "src/Core/IO.h"
 #include "src/Core/Swap.h"
 #include "src/Core/CommaInitializer.h"
-#include "src/Core/Flagged.h"
-#include "src/Core/ProductBase.h"
 #include "src/Core/GeneralProduct.h"
+#include "src/Core/Solve.h"
+#include "src/Core/Inverse.h"
+#include "src/Core/SolverBase.h"
+#include "src/Core/PermutationMatrix.h"
+#include "src/Core/Transpositions.h"
 #include "src/Core/TriangularMatrix.h"
 #include "src/Core/SelfAdjointView.h"
 #include "src/Core/products/GeneralBlockPanelKernel.h"
 #include "src/Core/products/Parallelizer.h"
-#include "src/Core/products/CoeffBasedProduct.h"
+#include "src/Core/ProductEvaluators.h"
 #include "src/Core/products/GeneralMatrixVector.h"
 #include "src/Core/products/GeneralMatrixMatrix.h"
 #include "src/Core/SolveTriangular.h"
@@ -340,6 +484,7 @@ using std::ptrdiff_t;
 #include "src/Core/products/TriangularSolverVector.h"
 #include "src/Core/BandMatrix.h"
 #include "src/Core/CoreIterators.h"
+#include "src/Core/ConditionEstimator.h"
 
 #include "src/Core/BooleanRedux.h"
 #include "src/Core/Select.h"
@@ -347,18 +492,17 @@ using std::ptrdiff_t;
 #include "src/Core/Random.h"
 #include "src/Core/Replicate.h"
 #include "src/Core/Reverse.h"
-#include "src/Core/ArrayBase.h"
 #include "src/Core/ArrayWrapper.h"
 
 #ifdef EIGEN_USE_BLAS
-#include "src/Core/products/GeneralMatrixMatrix_MKL.h"
-#include "src/Core/products/GeneralMatrixVector_MKL.h"
-#include "src/Core/products/GeneralMatrixMatrixTriangular_MKL.h"
-#include "src/Core/products/SelfadjointMatrixMatrix_MKL.h"
-#include "src/Core/products/SelfadjointMatrixVector_MKL.h"
-#include "src/Core/products/TriangularMatrixMatrix_MKL.h"
-#include "src/Core/products/TriangularMatrixVector_MKL.h"
-#include "src/Core/products/TriangularSolverMatrix_MKL.h"
+#include "src/Core/products/GeneralMatrixMatrix_BLAS.h"
+#include "src/Core/products/GeneralMatrixVector_BLAS.h"
+#include "src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h"
+#include "src/Core/products/SelfadjointMatrixMatrix_BLAS.h"
+#include "src/Core/products/SelfadjointMatrixVector_BLAS.h"
+#include "src/Core/products/TriangularMatrixMatrix_BLAS.h"
+#include "src/Core/products/TriangularMatrixVector_BLAS.h"
+#include "src/Core/products/TriangularSolverMatrix_BLAS.h"
 #endif // EIGEN_USE_BLAS
 
 #ifdef EIGEN_USE_MKL_VML
@@ -369,8 +513,4 @@ using std::ptrdiff_t;
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
-#ifdef EIGEN2_SUPPORT
-#include "Eigen2Support"
-#endif
-
 #endif // EIGEN_CORE_H
diff --git a/Eigen/Eigen b/Eigen/Eigen
index 19b40ea4e..654c8dc63 100644
--- a/Eigen/Eigen
+++ b/Eigen/Eigen
@@ -1,2 +1,2 @@
 #include "Dense"
-//#include "Sparse"
+#include "Sparse"
diff --git a/Eigen/Eigen2Support b/Eigen/Eigen2Support
deleted file mode 100644
index 6aa009d20..000000000
--- a/Eigen/Eigen2Support
+++ /dev/null
@@ -1,95 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN2SUPPORT_H
-#define EIGEN2SUPPORT_H
-
-#if (!defined(EIGEN2_SUPPORT)) || (!defined(EIGEN_CORE_H))
-#error Eigen2 support must be enabled by defining EIGEN2_SUPPORT before including any Eigen header
-#endif
-
-#ifndef EIGEN_NO_EIGEN2_DEPRECATED_WARNING
-
-#if defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__clang__)
-#warning "Eigen2 support is deprecated in Eigen 3.2.x and it will be removed in Eigen 3.3. (Define EIGEN_NO_EIGEN2_DEPRECATED_WARNING to disable this warning)"
-#else
-#pragma message ("Eigen2 support is deprecated in Eigen 3.2.x and it will be removed in Eigen 3.3. (Define EIGEN_NO_EIGEN2_DEPRECATED_WARNING to disable this warning)")
-#endif
-
-#endif // EIGEN_NO_EIGEN2_DEPRECATED_WARNING
-
-#include "src/Core/util/DisableStupidWarnings.h"
-
-/** \ingroup Support_modules
-  * \defgroup Eigen2Support_Module Eigen2 support module
-  *
-  * \warning Eigen2 support is deprecated in Eigen 3.2.x and it will be removed in Eigen 3.3.
-  *
-  * This module provides a couple of deprecated functions improving the compatibility with Eigen2.
-  * 
-  * To use it, define EIGEN2_SUPPORT before including any Eigen header
-  * \code
-  * #define EIGEN2_SUPPORT
-  * \endcode
-  *
-  */
-
-#include "src/Eigen2Support/Macros.h"
-#include "src/Eigen2Support/Memory.h"
-#include "src/Eigen2Support/Meta.h"
-#include "src/Eigen2Support/Lazy.h"
-#include "src/Eigen2Support/Cwise.h"
-#include "src/Eigen2Support/CwiseOperators.h"
-#include "src/Eigen2Support/TriangularSolver.h"
-#include "src/Eigen2Support/Block.h"
-#include "src/Eigen2Support/VectorBlock.h"
-#include "src/Eigen2Support/Minor.h"
-#include "src/Eigen2Support/MathFunctions.h"
-
-
-#include "src/Core/util/ReenableStupidWarnings.h"
-
-// Eigen2 used to include iostream
-#include<iostream>
-
-#define EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, SizeSuffix) \
-using Eigen::Matrix##SizeSuffix##TypeSuffix; \
-using Eigen::Vector##SizeSuffix##TypeSuffix; \
-using Eigen::RowVector##SizeSuffix##TypeSuffix;
-
-#define EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE(TypeSuffix) \
-EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, 2) \
-EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, 3) \
-EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, 4) \
-EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, X) \
-
-#define EIGEN_USING_MATRIX_TYPEDEFS \
-EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE(i) \
-EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE(f) \
-EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE(d) \
-EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE(cf) \
-EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE(cd)
-
-#define USING_PART_OF_NAMESPACE_EIGEN \
-EIGEN_USING_MATRIX_TYPEDEFS \
-using Eigen::Matrix; \
-using Eigen::MatrixBase; \
-using Eigen::ei_random; \
-using Eigen::ei_real; \
-using Eigen::ei_imag; \
-using Eigen::ei_conj; \
-using Eigen::ei_abs; \
-using Eigen::ei_abs2; \
-using Eigen::ei_sqrt; \
-using Eigen::ei_exp; \
-using Eigen::ei_log; \
-using Eigen::ei_sin; \
-using Eigen::ei_cos;
-
-#endif // EIGEN2SUPPORT_H
diff --git a/Eigen/Eigenvalues b/Eigen/Eigenvalues
index 53c5a73a2..009e529e1 100644
--- a/Eigen/Eigenvalues
+++ b/Eigen/Eigenvalues
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_EIGENVALUES_MODULE_H
 #define EIGEN_EIGENVALUES_MODULE_H
 
@@ -25,6 +32,7 @@
   * \endcode
   */
 
+#include "src/misc/RealSvd2x2.h"
 #include "src/Eigenvalues/Tridiagonalization.h"
 #include "src/Eigenvalues/RealSchur.h"
 #include "src/Eigenvalues/EigenSolver.h"
@@ -37,9 +45,10 @@
 #include "src/Eigenvalues/GeneralizedEigenSolver.h"
 #include "src/Eigenvalues/MatrixBaseEigenvalues.h"
 #ifdef EIGEN_USE_LAPACKE
-#include "src/Eigenvalues/RealSchur_MKL.h"
-#include "src/Eigenvalues/ComplexSchur_MKL.h"
-#include "src/Eigenvalues/SelfAdjointEigenSolver_MKL.h"
+#include "src/misc/lapacke.h"
+#include "src/Eigenvalues/RealSchur_LAPACKE.h"
+#include "src/Eigenvalues/ComplexSchur_LAPACKE.h"
+#include "src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h"
 #endif
 
 #include "src/Core/util/ReenableStupidWarnings.h"
diff --git a/Eigen/Geometry b/Eigen/Geometry
index efd9d4504..716d52952 100644
--- a/Eigen/Geometry
+++ b/Eigen/Geometry
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_GEOMETRY_MODULE_H
 #define EIGEN_GEOMETRY_MODULE_H
 
@@ -9,21 +16,17 @@
 #include "LU"
 #include <limits>
 
-#ifndef M_PI
-#define M_PI 3.14159265358979323846
-#endif
-
 /** \defgroup Geometry_Module Geometry module
   *
-  *
-  *
   * This module provides support for:
   *  - fixed-size homogeneous transformations
   *  - translation, scaling, 2D and 3D rotations
-  *  - quaternions
-  *  - \ref MatrixBase::cross() "cross product"
-  *  - \ref MatrixBase::unitOrthogonal() "orthognal vector generation"
-  *  - some linear components: parametrized-lines and hyperplanes
+  *  - \link Quaternion quaternions \endlink
+  *  - cross products (\ref MatrixBase::cross, \ref MatrixBase::cross3)
+  *  - orthognal vector generation (\ref MatrixBase::unitOrthogonal)
+  *  - some linear components: \link ParametrizedLine parametrized-lines \endlink and \link Hyperplane hyperplanes \endlink
+  *  - \link AlignedBox axis aligned bounding boxes \endlink
+  *  - \link umeyama least-square transformation fitting \endlink
   *
   * \code
   * #include <Eigen/Geometry>
@@ -33,27 +36,23 @@
 #include "src/Geometry/OrthoMethods.h"
 #include "src/Geometry/EulerAngles.h"
 
-#if EIGEN2_SUPPORT_STAGE > STAGE20_RESOLVE_API_CONFLICTS
-  #include "src/Geometry/Homogeneous.h"
-  #include "src/Geometry/RotationBase.h"
-  #include "src/Geometry/Rotation2D.h"
-  #include "src/Geometry/Quaternion.h"
-  #include "src/Geometry/AngleAxis.h"
-  #include "src/Geometry/Transform.h"
-  #include "src/Geometry/Translation.h"
-  #include "src/Geometry/Scaling.h"
-  #include "src/Geometry/Hyperplane.h"
-  #include "src/Geometry/ParametrizedLine.h"
-  #include "src/Geometry/AlignedBox.h"
-  #include "src/Geometry/Umeyama.h"
-
-  #if defined EIGEN_VECTORIZE_SSE
-    #include "src/Geometry/arch/Geometry_SSE.h"
-  #endif
-#endif
-
-#ifdef EIGEN2_SUPPORT
-#include "src/Eigen2Support/Geometry/All.h"
+#include "src/Geometry/Homogeneous.h"
+#include "src/Geometry/RotationBase.h"
+#include "src/Geometry/Rotation2D.h"
+#include "src/Geometry/Quaternion.h"
+#include "src/Geometry/AngleAxis.h"
+#include "src/Geometry/Transform.h"
+#include "src/Geometry/Translation.h"
+#include "src/Geometry/Scaling.h"
+#include "src/Geometry/Hyperplane.h"
+#include "src/Geometry/ParametrizedLine.h"
+#include "src/Geometry/AlignedBox.h"
+#include "src/Geometry/Umeyama.h"
+
+// Use the SSE optimized version whenever possible. At the moment the
+// SSE version doesn't compile when AVX is enabled
+#if defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX
+#include "src/Geometry/arch/Geometry_SSE.h"
 #endif
 
 #include "src/Core/util/ReenableStupidWarnings.h"
diff --git a/Eigen/Householder b/Eigen/Householder
index 6e348db5c..89cd81b1a 100644
--- a/Eigen/Householder
+++ b/Eigen/Householder
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_HOUSEHOLDER_MODULE_H
 #define EIGEN_HOUSEHOLDER_MODULE_H
 
diff --git a/Eigen/IterativeLinearSolvers b/Eigen/IterativeLinearSolvers
index 0f4159dc1..957d5750b 100644
--- a/Eigen/IterativeLinearSolvers
+++ b/Eigen/IterativeLinearSolvers
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_ITERATIVELINEARSOLVERS_MODULE_H
 #define EIGEN_ITERATIVELINEARSOLVERS_MODULE_H
 
@@ -12,28 +19,29 @@
   * This module currently provides iterative methods to solve problems of the form \c A \c x = \c b, where \c A is a squared matrix, usually very large and sparse.
   * Those solvers are accessible via the following classes:
   *  - ConjugateGradient for selfadjoint (hermitian) matrices,
+  *  - LeastSquaresConjugateGradient for rectangular least-square problems,
   *  - BiCGSTAB for general square matrices.
   *
   * These iterative solvers are associated with some preconditioners:
   *  - IdentityPreconditioner - not really useful
-  *  - DiagonalPreconditioner - also called JAcobi preconditioner, work very well on diagonal dominant matrices.
-  *  - IncompleteILUT - incomplete LU factorization with dual thresholding
+  *  - DiagonalPreconditioner - also called Jacobi preconditioner, work very well on diagonal dominant matrices.
+  *  - IncompleteLUT - incomplete LU factorization with dual thresholding
   *
   * Such problems can also be solved using the direct sparse decomposition modules: SparseCholesky, CholmodSupport, UmfPackSupport, SuperLUSupport.
   *
-  * \code
-  * #include <Eigen/IterativeLinearSolvers>
-  * \endcode
+    \code
+    #include <Eigen/IterativeLinearSolvers>
+    \endcode
   */
 
-#include "src/misc/Solve.h"
-#include "src/misc/SparseSolve.h"
-
+#include "src/IterativeLinearSolvers/SolveWithGuess.h"
 #include "src/IterativeLinearSolvers/IterativeSolverBase.h"
 #include "src/IterativeLinearSolvers/BasicPreconditioners.h"
 #include "src/IterativeLinearSolvers/ConjugateGradient.h"
+#include "src/IterativeLinearSolvers/LeastSquareConjugateGradient.h"
 #include "src/IterativeLinearSolvers/BiCGSTAB.h"
 #include "src/IterativeLinearSolvers/IncompleteLUT.h"
+#include "src/IterativeLinearSolvers/IncompleteCholesky.h"
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
diff --git a/Eigen/Jacobi b/Eigen/Jacobi
index ba8a4dc36..17c1d785a 100644
--- a/Eigen/Jacobi
+++ b/Eigen/Jacobi
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_JACOBI_MODULE_H
 #define EIGEN_JACOBI_MODULE_H
 
diff --git a/Eigen/LU b/Eigen/LU
index db5795504..6f6c55629 100644
--- a/Eigen/LU
+++ b/Eigen/LU
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_LU_MODULE_H
 #define EIGEN_LU_MODULE_H
 
@@ -16,25 +23,23 @@
   * \endcode
   */
 
-#include "src/misc/Solve.h"
 #include "src/misc/Kernel.h"
 #include "src/misc/Image.h"
 #include "src/LU/FullPivLU.h"
 #include "src/LU/PartialPivLU.h"
 #ifdef EIGEN_USE_LAPACKE
-#include "src/LU/PartialPivLU_MKL.h"
+#include "src/misc/lapacke.h"
+#include "src/LU/PartialPivLU_LAPACKE.h"
 #endif
 #include "src/LU/Determinant.h"
-#include "src/LU/Inverse.h"
+#include "src/LU/InverseImpl.h"
 
-#if defined EIGEN_VECTORIZE_SSE
+// Use the SSE optimized version whenever possible. At the moment the
+// SSE version doesn't compile when AVX is enabled
+#if defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX
   #include "src/LU/arch/Inverse_SSE.h"
 #endif
 
-#ifdef EIGEN2_SUPPORT
-  #include "src/Eigen2Support/LU.h"
-#endif
-
 #include "src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_LU_MODULE_H
diff --git a/Eigen/LeastSquares b/Eigen/LeastSquares
deleted file mode 100644
index 35137c25d..000000000
--- a/Eigen/LeastSquares
+++ /dev/null
@@ -1,32 +0,0 @@
-#ifndef EIGEN_REGRESSION_MODULE_H
-#define EIGEN_REGRESSION_MODULE_H
-
-#ifndef EIGEN2_SUPPORT
-#error LeastSquares is only available in Eigen2 support mode (define EIGEN2_SUPPORT)
-#endif
-
-// exclude from normal eigen3-only documentation
-#ifdef EIGEN2_SUPPORT
-
-#include "Core"
-
-#include "src/Core/util/DisableStupidWarnings.h"
-
-#include "Eigenvalues"
-#include "Geometry"
-
-/** \defgroup LeastSquares_Module LeastSquares module
-  * This module provides linear regression and related features.
-  *
-  * \code
-  * #include <Eigen/LeastSquares>
-  * \endcode
-  */
-
-#include "src/Eigen2Support/LeastSquares.h"
-
-#include "src/Core/util/ReenableStupidWarnings.h"
-
-#endif // EIGEN2_SUPPORT
-
-#endif // EIGEN_REGRESSION_MODULE_H
diff --git a/Eigen/MetisSupport b/Eigen/MetisSupport
index 6a113f7a8..85c41bf34 100644
--- a/Eigen/MetisSupport
+++ b/Eigen/MetisSupport
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_METISSUPPORT_MODULE_H
 #define EIGEN_METISSUPPORT_MODULE_H
 
diff --git a/Eigen/OrderingMethods b/Eigen/OrderingMethods
index 7c0f1ffff..d8ea36193 100644
--- a/Eigen/OrderingMethods
+++ b/Eigen/OrderingMethods
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_ORDERINGMETHODS_MODULE_H
 #define EIGEN_ORDERINGMETHODS_MODULE_H
 
diff --git a/Eigen/PaStiXSupport b/Eigen/PaStiXSupport
index 7c616ee5e..de3a63b4d 100644
--- a/Eigen/PaStiXSupport
+++ b/Eigen/PaStiXSupport
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_PASTIXSUPPORT_MODULE_H
 #define EIGEN_PASTIXSUPPORT_MODULE_H
 
@@ -5,7 +12,6 @@
 
 #include "src/Core/util/DisableStupidWarnings.h"
 
-#include <complex.h>
 extern "C" {
 #include <pastix_nompi.h>
 #include <pastix.h>
@@ -35,12 +41,8 @@ extern "C" {
   *
   */
 
-#include "src/misc/Solve.h"
-#include "src/misc/SparseSolve.h"
-
 #include "src/PaStiXSupport/PaStiXSupport.h"
 
-
 #include "src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_PASTIXSUPPORT_MODULE_H
diff --git a/Eigen/PardisoSupport b/Eigen/PardisoSupport
index 99330ce7a..340edf51f 100644..100755
--- a/Eigen/PardisoSupport
+++ b/Eigen/PardisoSupport
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_PARDISOSUPPORT_MODULE_H
 #define EIGEN_PARDISOSUPPORT_MODULE_H
 
@@ -7,8 +14,6 @@
 
 #include <mkl_pardiso.h>
 
-#include <unsupported/Eigen/SparseExtra>
-
 /** \ingroup Support_modules
   * \defgroup PardisoSupport_Module PardisoSupport module
   *
diff --git a/Eigen/QR b/Eigen/QR
index ac5b02693..80838e3bd 100644
--- a/Eigen/QR
+++ b/Eigen/QR
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_QR_MODULE_H
 #define EIGEN_QR_MODULE_H
 
@@ -15,31 +22,26 @@
   *
   * This module provides various QR decompositions
   * This module also provides some MatrixBase methods, including:
-  *  - MatrixBase::qr(),
+  *  - MatrixBase::householderQr()
+  *  - MatrixBase::colPivHouseholderQr()
+  *  - MatrixBase::fullPivHouseholderQr()
   *
   * \code
   * #include <Eigen/QR>
   * \endcode
   */
 
-#include "src/misc/Solve.h"
 #include "src/QR/HouseholderQR.h"
 #include "src/QR/FullPivHouseholderQR.h"
 #include "src/QR/ColPivHouseholderQR.h"
+#include "src/QR/CompleteOrthogonalDecomposition.h"
 #ifdef EIGEN_USE_LAPACKE
-#include "src/QR/HouseholderQR_MKL.h"
-#include "src/QR/ColPivHouseholderQR_MKL.h"
-#endif
-
-#ifdef EIGEN2_SUPPORT
-#include "src/Eigen2Support/QR.h"
+#include "src/misc/lapacke.h"
+#include "src/QR/HouseholderQR_LAPACKE.h"
+#include "src/QR/ColPivHouseholderQR_LAPACKE.h"
 #endif
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
-#ifdef EIGEN2_SUPPORT
-#include "Eigenvalues"
-#endif
-
 #endif // EIGEN_QR_MODULE_H
 /* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/Eigen/QtAlignedMalloc b/Eigen/QtAlignedMalloc
index 46f7d83b7..c6571f129 100644
--- a/Eigen/QtAlignedMalloc
+++ b/Eigen/QtAlignedMalloc
@@ -1,3 +1,9 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 #ifndef EIGEN_QTMALLOC_MODULE_H
 #define EIGEN_QTMALLOC_MODULE_H
@@ -8,7 +14,7 @@
 
 #include "src/Core/util/DisableStupidWarnings.h"
 
-void *qMalloc(size_t size)
+void *qMalloc(std::size_t size)
 {
   return Eigen::internal::aligned_malloc(size);
 }
@@ -18,7 +24,7 @@ void qFree(void *ptr)
   Eigen::internal::aligned_free(ptr);
 }
 
-void *qRealloc(void *ptr, size_t size)
+void *qRealloc(void *ptr, std::size_t size)
 {
   void* newPtr = Eigen::internal::aligned_malloc(size);
   memcpy(newPtr, ptr, size);
diff --git a/Eigen/SPQRSupport b/Eigen/SPQRSupport
index 77016442e..f70390c17 100644
--- a/Eigen/SPQRSupport
+++ b/Eigen/SPQRSupport
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_SPQRSUPPORT_MODULE_H
 #define EIGEN_SPQRSUPPORT_MODULE_H
 
@@ -10,7 +17,7 @@
 /** \ingroup Support_modules
   * \defgroup SPQRSupport_Module SuiteSparseQR module
   * 
-  * This module provides an interface to the SPQR library, which is part of the <a href="http://www.cise.ufl.edu/research/sparse/SuiteSparse/">suitesparse</a> package.
+  * This module provides an interface to the SPQR library, which is part of the <a href="http://www.suitesparse.com">suitesparse</a> package.
   *
   * \code
   * #include <Eigen/SPQRSupport>
@@ -21,8 +28,6 @@
   *
   */
 
-#include "src/misc/Solve.h"
-#include "src/misc/SparseSolve.h"
 #include "src/CholmodSupport/CholmodSupport.h"
 #include "src/SPQRSupport/SuiteSparseQRSupport.h"
 
diff --git a/Eigen/SVD b/Eigen/SVD
index fd310017a..86143c23d 100644
--- a/Eigen/SVD
+++ b/Eigen/SVD
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_SVD_MODULE_H
 #define EIGEN_SVD_MODULE_H
 
@@ -12,23 +19,26 @@
   *
   *
   * This module provides SVD decomposition for matrices (both real and complex).
-  * This decomposition is accessible via the following MatrixBase method:
+  * Two decomposition algorithms are provided:
+  *  - JacobiSVD implementing two-sided Jacobi iterations is numerically very accurate, fast for small matrices, but very slow for larger ones.
+  *  - BDCSVD implementing a recursive divide & conquer strategy on top of an upper-bidiagonalization which remains fast for large problems.
+  * These decompositions are accessible via the respective classes and following MatrixBase methods:
   *  - MatrixBase::jacobiSvd()
+  *  - MatrixBase::bdcSvd()
   *
   * \code
   * #include <Eigen/SVD>
   * \endcode
   */
 
-#include "src/misc/Solve.h"
+#include "src/misc/RealSvd2x2.h"
+#include "src/SVD/UpperBidiagonalization.h"
+#include "src/SVD/SVDBase.h"
 #include "src/SVD/JacobiSVD.h"
+#include "src/SVD/BDCSVD.h"
 #if defined(EIGEN_USE_LAPACKE) && !defined(EIGEN_USE_LAPACKE_STRICT)
-#include "src/SVD/JacobiSVD_MKL.h"
-#endif
-#include "src/SVD/UpperBidiagonalization.h"
-
-#ifdef EIGEN2_SUPPORT
-#include "src/Eigen2Support/SVD.h"
+#include "src/misc/lapacke.h"
+#include "src/SVD/JacobiSVD_LAPACKE.h"
 #endif
 
 #include "src/Core/util/ReenableStupidWarnings.h"
diff --git a/Eigen/Sparse b/Eigen/Sparse
index 7cc9c0913..136e681a1 100644
--- a/Eigen/Sparse
+++ b/Eigen/Sparse
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_SPARSE_MODULE_H
 #define EIGEN_SPARSE_MODULE_H
 
@@ -11,14 +18,16 @@
   * - \ref SparseQR_Module
   * - \ref IterativeLinearSolvers_Module
   *
-  * \code
-  * #include <Eigen/Sparse>
-  * \endcode
+    \code
+    #include <Eigen/Sparse>
+    \endcode
   */
 
 #include "SparseCore"
 #include "OrderingMethods"
+#ifndef EIGEN_MPL2_ONLY
 #include "SparseCholesky"
+#endif
 #include "SparseLU"
 #include "SparseQR"
 #include "IterativeLinearSolvers"
diff --git a/Eigen/SparseCholesky b/Eigen/SparseCholesky
index 9f5056aa1..b6a320c40 100644
--- a/Eigen/SparseCholesky
+++ b/Eigen/SparseCholesky
@@ -34,8 +34,6 @@
 #error The SparseCholesky module has nothing to offer in MPL2 only mode
 #endif
 
-#include "src/misc/Solve.h"
-#include "src/misc/SparseSolve.h"
 #include "src/SparseCholesky/SimplicialCholesky.h"
 
 #ifndef EIGEN_MPL2_ONLY
diff --git a/Eigen/SparseCore b/Eigen/SparseCore
index 9b5be5e15..76966c4c4 100644
--- a/Eigen/SparseCore
+++ b/Eigen/SparseCore
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_SPARSECORE_MODULE_H
 #define EIGEN_SPARSECORE_MODULE_H
 
@@ -14,7 +21,7 @@
 /** 
   * \defgroup SparseCore_Module SparseCore module
   *
-  * This module provides a sparse matrix representation, and basic associatd matrix manipulations
+  * This module provides a sparse matrix representation, and basic associated matrix manipulations
   * and operations.
   *
   * See the \ref TutorialSparse "Sparse tutorial"
@@ -26,37 +33,35 @@
   * This module depends on: Core.
   */
 
-namespace Eigen {
-
-/** The type used to identify a general sparse storage. */
-struct Sparse {};
-
-}
-
 #include "src/SparseCore/SparseUtil.h"
 #include "src/SparseCore/SparseMatrixBase.h"
+#include "src/SparseCore/SparseAssign.h"
 #include "src/SparseCore/CompressedStorage.h"
 #include "src/SparseCore/AmbiVector.h"
+#include "src/SparseCore/SparseCompressedBase.h"
 #include "src/SparseCore/SparseMatrix.h"
+#include "src/SparseCore/SparseMap.h"
 #include "src/SparseCore/MappedSparseMatrix.h"
 #include "src/SparseCore/SparseVector.h"
-#include "src/SparseCore/SparseBlock.h"
-#include "src/SparseCore/SparseTranspose.h"
+#include "src/SparseCore/SparseRef.h"
 #include "src/SparseCore/SparseCwiseUnaryOp.h"
 #include "src/SparseCore/SparseCwiseBinaryOp.h"
+#include "src/SparseCore/SparseTranspose.h"
+#include "src/SparseCore/SparseBlock.h"
 #include "src/SparseCore/SparseDot.h"
-#include "src/SparseCore/SparsePermutation.h"
 #include "src/SparseCore/SparseRedux.h"
-#include "src/SparseCore/SparseFuzzy.h"
+#include "src/SparseCore/SparseView.h"
+#include "src/SparseCore/SparseDiagonalProduct.h"
 #include "src/SparseCore/ConservativeSparseSparseProduct.h"
 #include "src/SparseCore/SparseSparseProductWithPruning.h"
 #include "src/SparseCore/SparseProduct.h"
 #include "src/SparseCore/SparseDenseProduct.h"
-#include "src/SparseCore/SparseDiagonalProduct.h"
-#include "src/SparseCore/SparseTriangularView.h"
 #include "src/SparseCore/SparseSelfAdjointView.h"
+#include "src/SparseCore/SparseTriangularView.h"
 #include "src/SparseCore/TriangularSolver.h"
-#include "src/SparseCore/SparseView.h"
+#include "src/SparseCore/SparsePermutation.h"
+#include "src/SparseCore/SparseFuzzy.h"
+#include "src/SparseCore/SparseSolverBase.h"
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
diff --git a/Eigen/SparseLU b/Eigen/SparseLU
index 8527a49bd..38b38b531 100644
--- a/Eigen/SparseLU
+++ b/Eigen/SparseLU
@@ -20,9 +20,6 @@
   * Please, see the documentation of the SparseLU class for more details.
   */
 
-#include "src/misc/Solve.h"
-#include "src/misc/SparseSolve.h"
-
 // Ordering interface
 #include "OrderingMethods"
 
diff --git a/Eigen/SparseQR b/Eigen/SparseQR
index 4ee42065e..a6f3b7f7d 100644
--- a/Eigen/SparseQR
+++ b/Eigen/SparseQR
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_SPARSEQR_MODULE_H
 #define EIGEN_SPARSEQR_MODULE_H
 
@@ -21,9 +28,6 @@
   * 
   */
 
-#include "src/misc/Solve.h"
-#include "src/misc/SparseSolve.h"
-
 #include "OrderingMethods"
 #include "src/SparseCore/SparseColEtree.h"
 #include "src/SparseQR/SparseQR.h"
diff --git a/Eigen/StdDeque b/Eigen/StdDeque
index f27234778..bc68397be 100644
--- a/Eigen/StdDeque
+++ b/Eigen/StdDeque
@@ -14,7 +14,7 @@
 #include "Core"
 #include <deque>
 
-#if (defined(_MSC_VER) && defined(_WIN64)) /* MSVC auto aligns in 64 bit builds */
+#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 && (EIGEN_MAX_STATIC_ALIGN_BYTES<=16) /* MSVC auto aligns up to 16 bytes in 64 bit builds */
 
 #define EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(...)
 
diff --git a/Eigen/StdList b/Eigen/StdList
index 225c1e18f..4c6262c08 100644
--- a/Eigen/StdList
+++ b/Eigen/StdList
@@ -13,7 +13,7 @@
 #include "Core"
 #include <list>
 
-#if (defined(_MSC_VER) && defined(_WIN64)) /* MSVC auto aligns in 64 bit builds */    
+#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 && (EIGEN_MAX_STATIC_ALIGN_BYTES<=16) /* MSVC auto aligns up to 16 bytes in 64 bit builds */
 
 #define EIGEN_DEFINE_STL_LIST_SPECIALIZATION(...)
 
diff --git a/Eigen/StdVector b/Eigen/StdVector
index 6b22627f6..0c4697ad5 100644
--- a/Eigen/StdVector
+++ b/Eigen/StdVector
@@ -14,7 +14,7 @@
 #include "Core"
 #include <vector>
 
-#if (defined(_MSC_VER) && defined(_WIN64)) /* MSVC auto aligns in 64 bit builds */
+#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 && (EIGEN_MAX_STATIC_ALIGN_BYTES<=16) /* MSVC auto aligns up to 16 bytes in 64 bit builds */
 
 #define EIGEN_DEFINE_STL_VECTOR_SPECIALIZATION(...)
 
diff --git a/Eigen/SuperLUSupport b/Eigen/SuperLUSupport
index 575e14fbc..59312a82d 100644
--- a/Eigen/SuperLUSupport
+++ b/Eigen/SuperLUSupport
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_SUPERLUSUPPORT_MODULE_H
 #define EIGEN_SUPERLUSUPPORT_MODULE_H
 
@@ -36,6 +43,8 @@ namespace Eigen { struct SluMatrix; }
   * - class SuperLU: a supernodal sequential LU factorization.
   * - class SuperILU: a supernodal sequential incomplete LU factorization (to be used as a preconditioner for iterative methods).
   *
+  * \warning This wrapper requires at least versions 4.0 of SuperLU. The 3.x versions are not supported.
+  *
   * \warning When including this module, you have to use SUPERLU_EMPTY instead of EMPTY which is no longer defined because it is too polluting.
   *
   * \code
@@ -48,12 +57,8 @@ namespace Eigen { struct SluMatrix; }
   *
   */
 
-#include "src/misc/Solve.h"
-#include "src/misc/SparseSolve.h"
-
 #include "src/SuperLUSupport/SuperLUSupport.h"
 
-
 #include "src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_SUPERLUSUPPORT_MODULE_H
diff --git a/Eigen/UmfPackSupport b/Eigen/UmfPackSupport
index 984f64a84..00eec8087 100644
--- a/Eigen/UmfPackSupport
+++ b/Eigen/UmfPackSupport
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_UMFPACKSUPPORT_MODULE_H
 #define EIGEN_UMFPACKSUPPORT_MODULE_H
 
@@ -12,7 +19,7 @@ extern "C" {
 /** \ingroup Support_modules
   * \defgroup UmfPackSupport_Module UmfPackSupport module
   *
-  * This module provides an interface to the UmfPack library which is part of the <a href="http://www.cise.ufl.edu/research/sparse/SuiteSparse/">suitesparse</a> package.
+  * This module provides an interface to the UmfPack library which is part of the <a href="http://www.suitesparse.com">suitesparse</a> package.
   * It provides the following factorization class:
   * - class UmfPackLU: a multifrontal sequential LU factorization.
   *
@@ -26,9 +33,6 @@ extern "C" {
   *
   */
 
-#include "src/misc/Solve.h"
-#include "src/misc/SparseSolve.h"
-
 #include "src/UmfPackSupport/UmfPackSupport.h"
 
 #include "src/Core/util/ReenableStupidWarnings.h"
diff --git a/Eigen/src/CMakeLists.txt b/Eigen/src/CMakeLists.txt
deleted file mode 100644
index c326f374d..000000000
--- a/Eigen/src/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-file(GLOB Eigen_src_subdirectories "*")
-escape_string_as_regex(ESCAPED_CMAKE_CURRENT_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
-foreach(f ${Eigen_src_subdirectories})
-  if(NOT f MATCHES "\\.txt" AND NOT f MATCHES "${ESCAPED_CMAKE_CURRENT_SOURCE_DIR}/[.].+" )
-    add_subdirectory(${f})
-  endif()
-endforeach()
diff --git a/Eigen/src/Cholesky/CMakeLists.txt b/Eigen/src/Cholesky/CMakeLists.txt
deleted file mode 100644
index d01488b41..000000000
--- a/Eigen/src/Cholesky/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_Cholesky_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_Cholesky_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Cholesky COMPONENT Devel
-  )
diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h
index e01ae8233..fcee7b2e3 100644
--- a/Eigen/src/Cholesky/LDLT.h
+++ b/Eigen/src/Cholesky/LDLT.h
@@ -13,7 +13,7 @@
 #ifndef EIGEN_LDLT_H
 #define EIGEN_LDLT_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
   template<typename MatrixType, int UpLo> struct LDLT_Traits;
@@ -28,8 +28,8 @@ namespace internal {
   *
   * \brief Robust Cholesky decomposition of a matrix with pivoting
   *
-  * \param MatrixType the type of the matrix of which to compute the LDL^T Cholesky decomposition
-  * \param UpLo the triangular part that will be used for the decompositon: Lower (default) or Upper.
+  * \tparam _MatrixType the type of the matrix of which to compute the LDL^T Cholesky decomposition
+  * \tparam _UpLo the triangular part that will be used for the decompositon: Lower (default) or Upper.
   *             The other triangular part won't be read.
   *
   * Perform a robust Cholesky decomposition of a positive semidefinite or negative semidefinite
@@ -43,7 +43,9 @@ namespace internal {
   * Remember that Cholesky decompositions are not rank-revealing. Also, do not use a Cholesky
   * decomposition to determine whether a system of equations has a solution.
   *
-  * \sa MatrixBase::ldlt(), class LLT
+  * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
+  * 
+  * \sa MatrixBase::ldlt(), SelfAdjointView::ldlt(), class LLT
   */
 template<typename _MatrixType, int _UpLo> class LDLT
 {
@@ -52,15 +54,15 @@ template<typename _MatrixType, int _UpLo> class LDLT
     enum {
       RowsAtCompileTime = MatrixType::RowsAtCompileTime,
       ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-      Options = MatrixType::Options & ~RowMajorBit, // these are the options for the TmpMatrixType, we need a ColMajor matrix here!
       MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
       MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
       UpLo = _UpLo
     };
     typedef typename MatrixType::Scalar Scalar;
     typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
-    typedef typename MatrixType::Index Index;
-    typedef Matrix<Scalar, RowsAtCompileTime, 1, Options, MaxRowsAtCompileTime, 1> TmpMatrixType;
+    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
+    typedef typename MatrixType::StorageIndex StorageIndex;
+    typedef Matrix<Scalar, RowsAtCompileTime, 1, 0, MaxRowsAtCompileTime, 1> TmpMatrixType;
 
     typedef Transpositions<RowsAtCompileTime, MaxRowsAtCompileTime> TranspositionType;
     typedef PermutationMatrix<RowsAtCompileTime, MaxRowsAtCompileTime> PermutationType;
@@ -72,11 +74,11 @@ template<typename _MatrixType, int _UpLo> class LDLT
       * The default constructor is useful in cases in which the user intends to
       * perform decompositions via LDLT::compute(const MatrixType&).
       */
-    LDLT() 
-      : m_matrix(), 
-        m_transpositions(), 
+    LDLT()
+      : m_matrix(),
+        m_transpositions(),
         m_sign(internal::ZeroSign),
-        m_isInitialized(false) 
+        m_isInitialized(false)
     {}
 
     /** \brief Default Constructor with memory preallocation
@@ -85,7 +87,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
       * according to the specified problem \a size.
       * \sa LDLT()
       */
-    LDLT(Index size)
+    explicit LDLT(Index size)
       : m_matrix(size, size),
         m_transpositions(size),
         m_temporary(size),
@@ -96,16 +98,35 @@ template<typename _MatrixType, int _UpLo> class LDLT
     /** \brief Constructor with decomposition
       *
       * This calculates the decomposition for the input \a matrix.
+      *
       * \sa LDLT(Index size)
       */
-    LDLT(const MatrixType& matrix)
+    template<typename InputType>
+    explicit LDLT(const EigenBase<InputType>& matrix)
       : m_matrix(matrix.rows(), matrix.cols()),
         m_transpositions(matrix.rows()),
         m_temporary(matrix.rows()),
         m_sign(internal::ZeroSign),
         m_isInitialized(false)
     {
-      compute(matrix);
+      compute(matrix.derived());
+    }
+
+    /** \brief Constructs a LDLT factorization from a given matrix
+      *
+      * This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when \c MatrixType is a Eigen::Ref.
+      *
+      * \sa LDLT(const EigenBase&)
+      */
+    template<typename InputType>
+    explicit LDLT(EigenBase<InputType>& matrix)
+      : m_matrix(matrix.derived()),
+        m_transpositions(matrix.rows()),
+        m_temporary(matrix.rows()),
+        m_sign(internal::ZeroSign),
+        m_isInitialized(false)
+    {
+      compute(matrix.derived());
     }
 
     /** Clear any existing decomposition
@@ -151,13 +172,6 @@ template<typename _MatrixType, int _UpLo> class LDLT
       eigen_assert(m_isInitialized && "LDLT is not initialized.");
       return m_sign == internal::PositiveSemiDef || m_sign == internal::ZeroSign;
     }
-    
-    #ifdef EIGEN2_SUPPORT
-    inline bool isPositiveDefinite() const
-    {
-      return isPositive();
-    }
-    #endif
 
     /** \returns true if the matrix is negative (semidefinite) */
     inline bool isNegative(void) const
@@ -173,37 +187,38 @@ template<typename _MatrixType, int _UpLo> class LDLT
       * \note_about_checking_solutions
       *
       * More precisely, this method solves \f$ A x = b \f$ using the decomposition \f$ A = P^T L D L^* P \f$
-      * by solving the systems \f$ P^T y_1 = b \f$, \f$ L y_2 = y_1 \f$, \f$ D y_3 = y_2 \f$, 
+      * by solving the systems \f$ P^T y_1 = b \f$, \f$ L y_2 = y_1 \f$, \f$ D y_3 = y_2 \f$,
       * \f$ L^* y_4 = y_3 \f$ and \f$ P x = y_4 \f$ in succession. If the matrix \f$ A \f$ is singular, then
       * \f$ D \f$ will also be singular (all the other matrices are invertible). In that case, the
       * least-square solution of \f$ D y_3 = y_2 \f$ is computed. This does not mean that this function
       * computes the least-square solution of \f$ A x = b \f$ is \f$ A \f$ is singular.
       *
-      * \sa MatrixBase::ldlt()
+      * \sa MatrixBase::ldlt(), SelfAdjointView::ldlt()
       */
     template<typename Rhs>
-    inline const internal::solve_retval<LDLT, Rhs>
+    inline const Solve<LDLT, Rhs>
     solve(const MatrixBase<Rhs>& b) const
     {
       eigen_assert(m_isInitialized && "LDLT is not initialized.");
       eigen_assert(m_matrix.rows()==b.rows()
                 && "LDLT::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::solve_retval<LDLT, Rhs>(*this, b.derived());
+      return Solve<LDLT, Rhs>(*this, b.derived());
     }
 
-    #ifdef EIGEN2_SUPPORT
-    template<typename OtherDerived, typename ResultType>
-    bool solve(const MatrixBase<OtherDerived>& b, ResultType *result) const
-    {
-      *result = this->solve(b);
-      return true;
-    }
-    #endif
-
     template<typename Derived>
     bool solveInPlace(MatrixBase<Derived> &bAndX) const;
 
-    LDLT& compute(const MatrixType& matrix);
+    template<typename InputType>
+    LDLT& compute(const EigenBase<InputType>& matrix);
+
+    /** \returns an estimate of the reciprocal condition number of the matrix of
+     *  which \c *this is the LDLT decomposition.
+     */
+    RealScalar rcond() const
+    {
+      eigen_assert(m_isInitialized && "LDLT is not initialized.");
+      return internal::rcond_estimate_helper(m_l1_norm, *this);
+    }
 
     template <typename Derived>
     LDLT& rankUpdate(const MatrixBase<Derived>& w, const RealScalar& alpha=1);
@@ -220,6 +235,13 @@ template<typename _MatrixType, int _UpLo> class LDLT
 
     MatrixType reconstructedMatrix() const;
 
+    /** \returns the adjoint of \c *this, that is, a const reference to the decomposition itself as the underlying matrix is self-adjoint.
+      *
+      * This method is provided for compatibility with other matrix decompositions, thus enabling generic code such as:
+      * \code x = decomposition.adjoint().solve(b) \endcode
+      */
+    const LDLT& adjoint() const { return *this; };
+
     inline Index rows() const { return m_matrix.rows(); }
     inline Index cols() const { return m_matrix.cols(); }
 
@@ -231,11 +253,17 @@ template<typename _MatrixType, int _UpLo> class LDLT
     ComputationInfo info() const
     {
       eigen_assert(m_isInitialized && "LDLT is not initialized.");
-      return Success;
+      return m_info;
     }
 
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    template<typename RhsType, typename DstType>
+    EIGEN_DEVICE_FUNC
+    void _solve_impl(const RhsType &rhs, DstType &dst) const;
+    #endif
+
   protected:
-    
+
     static void check_template_parameters()
     {
       EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
@@ -248,10 +276,12 @@ template<typename _MatrixType, int _UpLo> class LDLT
       * is not stored), and the diagonal entries correspond to D.
       */
     MatrixType m_matrix;
+    RealScalar m_l1_norm;
     TranspositionType m_transpositions;
     TmpMatrixType m_temporary;
     internal::SignMatrix m_sign;
     bool m_isInitialized;
+    ComputationInfo m_info;
 };
 
 namespace internal {
@@ -266,15 +296,17 @@ template<> struct ldlt_inplace<Lower>
     using std::abs;
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
+    typedef typename TranspositionType::StorageIndex IndexType;
     eigen_assert(mat.rows()==mat.cols());
     const Index size = mat.rows();
+    bool found_zero_pivot = false;
+    bool ret = true;
 
     if (size <= 1)
     {
       transpositions.setIdentity();
-      if (numext::real(mat.coeff(0,0)) > 0) sign = PositiveSemiDef;
-      else if (numext::real(mat.coeff(0,0)) < 0) sign = NegativeSemiDef;
+      if (numext::real(mat.coeff(0,0)) > static_cast<RealScalar>(0) ) sign = PositiveSemiDef;
+      else if (numext::real(mat.coeff(0,0)) < static_cast<RealScalar>(0)) sign = NegativeSemiDef;
       else sign = ZeroSign;
       return true;
     }
@@ -286,7 +318,7 @@ template<> struct ldlt_inplace<Lower>
       mat.diagonal().tail(size-k).cwiseAbs().maxCoeff(&index_of_biggest_in_corner);
       index_of_biggest_in_corner += k;
 
-      transpositions.coeffRef(k) = index_of_biggest_in_corner;
+      transpositions.coeffRef(k) = IndexType(index_of_biggest_in_corner);
       if(k != index_of_biggest_in_corner)
       {
         // apply the transposition while taking care to consider only
@@ -295,7 +327,7 @@ template<> struct ldlt_inplace<Lower>
         mat.row(k).head(k).swap(mat.row(index_of_biggest_in_corner).head(k));
         mat.col(k).tail(s).swap(mat.col(index_of_biggest_in_corner).tail(s));
         std::swap(mat.coeffRef(k,k),mat.coeffRef(index_of_biggest_in_corner,index_of_biggest_in_corner));
-        for(int i=k+1;i<index_of_biggest_in_corner;++i)
+        for(Index i=k+1;i<index_of_biggest_in_corner;++i)
         {
           Scalar tmp = mat.coeffRef(i,k);
           mat.coeffRef(i,k) = numext::conj(mat.coeffRef(index_of_biggest_in_corner,i));
@@ -321,26 +353,44 @@ template<> struct ldlt_inplace<Lower>
         if(rs>0)
           A21.noalias() -= A20 * temp.head(k);
       }
-      
+
       // In some previous versions of Eigen (e.g., 3.2.1), the scaling was omitted if the pivot
-      // was smaller than the cutoff value. However, soince LDLT is not rank-revealing
-      // we should only make sure we do not introduce INF or NaN values.
-      // LAPACK also uses 0 as the cutoff value.
+      // was smaller than the cutoff value. However, since LDLT is not rank-revealing
+      // we should only make sure that we do not introduce INF or NaN values.
+      // Remark that LAPACK also uses 0 as the cutoff value.
       RealScalar realAkk = numext::real(mat.coeffRef(k,k));
-      if((rs>0) && (abs(realAkk) > RealScalar(0)))
+      bool pivot_is_valid = (abs(realAkk) > RealScalar(0));
+
+      if(k==0 && !pivot_is_valid)
+      {
+        // The entire diagonal is zero, there is nothing more to do
+        // except filling the transpositions, and checking whether the matrix is zero.
+        sign = ZeroSign;
+        for(Index j = 0; j<size; ++j)
+        {
+          transpositions.coeffRef(j) = IndexType(j);
+          ret = ret && (mat.col(j).tail(size-j-1).array()==Scalar(0)).all();
+        }
+        return ret;
+      }
+
+      if((rs>0) && pivot_is_valid)
         A21 /= realAkk;
 
+      if(found_zero_pivot && pivot_is_valid) ret = false; // factorization failed
+      else if(!pivot_is_valid) found_zero_pivot = true;
+
       if (sign == PositiveSemiDef) {
-        if (realAkk < 0) sign = Indefinite;
+        if (realAkk < static_cast<RealScalar>(0)) sign = Indefinite;
       } else if (sign == NegativeSemiDef) {
-        if (realAkk > 0) sign = Indefinite;
+        if (realAkk > static_cast<RealScalar>(0)) sign = Indefinite;
       } else if (sign == ZeroSign) {
-        if (realAkk > 0) sign = PositiveSemiDef;
-        else if (realAkk < 0) sign = NegativeSemiDef;
+        if (realAkk > static_cast<RealScalar>(0)) sign = PositiveSemiDef;
+        else if (realAkk < static_cast<RealScalar>(0)) sign = NegativeSemiDef;
       }
     }
 
-    return true;
+    return ret;
   }
 
   // Reference for the algorithm: Davis and Hager, "Multiple Rank
@@ -356,7 +406,6 @@ template<> struct ldlt_inplace<Lower>
     using numext::isfinite;
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
 
     const Index size = mat.rows();
     eigen_assert(mat.cols() == size && w.size()==size);
@@ -420,16 +469,16 @@ template<typename MatrixType> struct LDLT_Traits<MatrixType,Lower>
 {
   typedef const TriangularView<const MatrixType, UnitLower> MatrixL;
   typedef const TriangularView<const typename MatrixType::AdjointReturnType, UnitUpper> MatrixU;
-  static inline MatrixL getL(const MatrixType& m) { return m; }
-  static inline MatrixU getU(const MatrixType& m) { return m.adjoint(); }
+  static inline MatrixL getL(const MatrixType& m) { return MatrixL(m); }
+  static inline MatrixU getU(const MatrixType& m) { return MatrixU(m.adjoint()); }
 };
 
 template<typename MatrixType> struct LDLT_Traits<MatrixType,Upper>
 {
   typedef const TriangularView<const typename MatrixType::AdjointReturnType, UnitLower> MatrixL;
   typedef const TriangularView<const MatrixType, UnitUpper> MatrixU;
-  static inline MatrixL getL(const MatrixType& m) { return m.adjoint(); }
-  static inline MatrixU getU(const MatrixType& m) { return m; }
+  static inline MatrixL getL(const MatrixType& m) { return MatrixL(m.adjoint()); }
+  static inline MatrixU getU(const MatrixType& m) { return MatrixU(m); }
 };
 
 } // end namespace internal
@@ -437,21 +486,35 @@ template<typename MatrixType> struct LDLT_Traits<MatrixType,Upper>
 /** Compute / recompute the LDLT decomposition A = L D L^* = U^* D U of \a matrix
   */
 template<typename MatrixType, int _UpLo>
-LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::compute(const MatrixType& a)
+template<typename InputType>
+LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::compute(const EigenBase<InputType>& a)
 {
   check_template_parameters();
-  
+
   eigen_assert(a.rows()==a.cols());
   const Index size = a.rows();
 
-  m_matrix = a;
+  m_matrix = a.derived();
+
+  // Compute matrix L1 norm = max abs column sum.
+  m_l1_norm = RealScalar(0);
+  // TODO move this code to SelfAdjointView
+  for (Index col = 0; col < size; ++col) {
+    RealScalar abs_col_sum;
+    if (_UpLo == Lower)
+      abs_col_sum = m_matrix.col(col).tail(size - col).template lpNorm<1>() + m_matrix.row(col).head(col).template lpNorm<1>();
+    else
+      abs_col_sum = m_matrix.col(col).head(col).template lpNorm<1>() + m_matrix.row(col).tail(size - col).template lpNorm<1>();
+    if (abs_col_sum > m_l1_norm)
+      m_l1_norm = abs_col_sum;
+  }
 
   m_transpositions.resize(size);
   m_isInitialized = false;
   m_temporary.resize(size);
   m_sign = internal::ZeroSign;
 
-  internal::ldlt_inplace<UpLo>::unblocked(m_matrix, m_transpositions, m_temporary, m_sign);
+  m_info = internal::ldlt_inplace<UpLo>::unblocked(m_matrix, m_transpositions, m_temporary, m_sign) ? Success : NumericalIssue;
 
   m_isInitialized = true;
   return *this;
@@ -464,20 +527,21 @@ LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::compute(const MatrixType& a)
   */
 template<typename MatrixType, int _UpLo>
 template<typename Derived>
-LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::rankUpdate(const MatrixBase<Derived>& w, const typename NumTraits<typename MatrixType::Scalar>::Real& sigma)
+LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::rankUpdate(const MatrixBase<Derived>& w, const typename LDLT<MatrixType,_UpLo>::RealScalar& sigma)
 {
+  typedef typename TranspositionType::StorageIndex IndexType;
   const Index size = w.rows();
   if (m_isInitialized)
   {
     eigen_assert(m_matrix.rows()==size);
   }
   else
-  {    
+  {
     m_matrix.resize(size,size);
     m_matrix.setZero();
     m_transpositions.resize(size);
     for (Index i = 0; i < size; i++)
-      m_transpositions.coeffRef(i) = i;
+      m_transpositions.coeffRef(i) = IndexType(i);
     m_temporary.resize(size);
     m_sign = sigma>=0 ? internal::PositiveSemiDef : internal::NegativeSemiDef;
     m_isInitialized = true;
@@ -488,53 +552,45 @@ LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::rankUpdate(const MatrixBase<Deri
   return *this;
 }
 
-namespace internal {
-template<typename _MatrixType, int _UpLo, typename Rhs>
-struct solve_retval<LDLT<_MatrixType,_UpLo>, Rhs>
-  : solve_retval_base<LDLT<_MatrixType,_UpLo>, Rhs>
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template<typename _MatrixType, int _UpLo>
+template<typename RhsType, typename DstType>
+void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
 {
-  typedef LDLT<_MatrixType,_UpLo> LDLTType;
-  EIGEN_MAKE_SOLVE_HELPERS(LDLTType,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
+  eigen_assert(rhs.rows() == rows());
+  // dst = P b
+  dst = m_transpositions * rhs;
+
+  // dst = L^-1 (P b)
+  matrixL().solveInPlace(dst);
+
+  // dst = D^-1 (L^-1 P b)
+  // more precisely, use pseudo-inverse of D (see bug 241)
+  using std::abs;
+  const typename Diagonal<const MatrixType>::RealReturnType vecD(vectorD());
+  // In some previous versions, tolerance was set to the max of 1/highest and the maximal diagonal entry * epsilon
+  // as motivated by LAPACK's xGELSS:
+  // RealScalar tolerance = numext::maxi(vecD.array().abs().maxCoeff() * NumTraits<RealScalar>::epsilon(),RealScalar(1) / NumTraits<RealScalar>::highest());
+  // However, LDLT is not rank revealing, and so adjusting the tolerance wrt to the highest
+  // diagonal element is not well justified and leads to numerical issues in some cases.
+  // Moreover, Lapack's xSYTRS routines use 0 for the tolerance.
+  RealScalar tolerance = RealScalar(1) / NumTraits<RealScalar>::highest();
+
+  for (Index i = 0; i < vecD.size(); ++i)
   {
-    eigen_assert(rhs().rows() == dec().matrixLDLT().rows());
-    // dst = P b
-    dst = dec().transpositionsP() * rhs();
-
-    // dst = L^-1 (P b)
-    dec().matrixL().solveInPlace(dst);
-
-    // dst = D^-1 (L^-1 P b)
-    // more precisely, use pseudo-inverse of D (see bug 241)
-    using std::abs;
-    using std::max;
-    typedef typename LDLTType::MatrixType MatrixType;
-    typedef typename LDLTType::RealScalar RealScalar;
-    const typename Diagonal<const MatrixType>::RealReturnType vectorD(dec().vectorD());
-    // In some previous versions, tolerance was set to the max of 1/highest and the maximal diagonal entry * epsilon
-    // as motivated by LAPACK's xGELSS:
-    // RealScalar tolerance = (max)(vectorD.array().abs().maxCoeff() *NumTraits<RealScalar>::epsilon(),RealScalar(1) / NumTraits<RealScalar>::highest());
-    // However, LDLT is not rank revealing, and so adjusting the tolerance wrt to the highest
-    // diagonal element is not well justified and to numerical issues in some cases.
-    // Moreover, Lapack's xSYTRS routines use 0 for the tolerance.
-    RealScalar tolerance = RealScalar(1) / NumTraits<RealScalar>::highest();
-    
-    for (Index i = 0; i < vectorD.size(); ++i) {
-      if(abs(vectorD(i)) > tolerance)
-        dst.row(i) /= vectorD(i);
-      else
-        dst.row(i).setZero();
-    }
+    if(abs(vecD(i)) > tolerance)
+      dst.row(i) /= vecD(i);
+    else
+      dst.row(i).setZero();
+  }
 
-    // dst = L^-T (D^-1 L^-1 P b)
-    dec().matrixU().solveInPlace(dst);
+  // dst = L^-T (D^-1 L^-1 P b)
+  matrixU().solveInPlace(dst);
 
-    // dst = P^-1 (L^-T D^-1 L^-1 P b) = A^-1 b
-    dst = dec().transpositionsP().transpose() * dst;
-  }
-};
+  // dst = P^-1 (L^-T D^-1 L^-1 P b) = A^-1 b
+  dst = m_transpositions.transpose() * dst;
 }
+#endif
 
 /** \internal use x = ldlt_object.solve(x);
   *
@@ -588,6 +644,7 @@ MatrixType LDLT<MatrixType,_UpLo>::reconstructedMatrix() const
 
 /** \cholesky_module
   * \returns the Cholesky decomposition with full pivoting without square root of \c *this
+  * \sa MatrixBase::ldlt()
   */
 template<typename MatrixType, unsigned int UpLo>
 inline const LDLT<typename SelfAdjointView<MatrixType, UpLo>::PlainObject, UpLo>
@@ -598,6 +655,7 @@ SelfAdjointView<MatrixType, UpLo>::ldlt() const
 
 /** \cholesky_module
   * \returns the Cholesky decomposition with full pivoting without square root of \c *this
+  * \sa SelfAdjointView::ldlt()
   */
 template<typename Derived>
 inline const LDLT<typename MatrixBase<Derived>::PlainObject>
diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h
index 59723a63d..87ca8d423 100644
--- a/Eigen/src/Cholesky/LLT.h
+++ b/Eigen/src/Cholesky/LLT.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_LLT_H
 #define EIGEN_LLT_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal{
 template<typename MatrixType, int UpLo> struct LLT_Traits;
@@ -22,8 +22,8 @@ template<typename MatrixType, int UpLo> struct LLT_Traits;
   *
   * \brief Standard Cholesky decomposition (LL^T) of a matrix and associated features
   *
-  * \param MatrixType the type of the matrix of which we are computing the LL^T Cholesky decomposition
-  * \param UpLo the triangular part that will be used for the decompositon: Lower (default) or Upper.
+  * \tparam _MatrixType the type of the matrix of which we are computing the LL^T Cholesky decomposition
+  * \tparam _UpLo the triangular part that will be used for the decompositon: Lower (default) or Upper.
   *             The other triangular part won't be read.
   *
   * This class performs a LL^T Cholesky decomposition of a symmetric, positive definite
@@ -40,8 +40,10 @@ template<typename MatrixType, int UpLo> struct LLT_Traits;
   *
   * Example: \include LLT_example.cpp
   * Output: \verbinclude LLT_example.out
-  *    
-  * \sa MatrixBase::llt(), class LDLT
+  *
+  * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
+  *
+  * \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT
   */
  /* HEY THIS DOX IS DISABLED BECAUSE THERE's A BUG EITHER HERE OR IN LDLT ABOUT THAT (OR BOTH)
   * Note that during the decomposition, only the upper triangular part of A is considered. Therefore,
@@ -54,12 +56,12 @@ template<typename _MatrixType, int _UpLo> class LLT
     enum {
       RowsAtCompileTime = MatrixType::RowsAtCompileTime,
       ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-      Options = MatrixType::Options,
       MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
     };
     typedef typename MatrixType::Scalar Scalar;
     typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
-    typedef typename MatrixType::Index Index;
+    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
+    typedef typename MatrixType::StorageIndex StorageIndex;
 
     enum {
       PacketSize = internal::packet_traits<Scalar>::size,
@@ -83,14 +85,30 @@ template<typename _MatrixType, int _UpLo> class LLT
       * according to the specified problem \a size.
       * \sa LLT()
       */
-    LLT(Index size) : m_matrix(size, size),
+    explicit LLT(Index size) : m_matrix(size, size),
                     m_isInitialized(false) {}
 
-    LLT(const MatrixType& matrix)
+    template<typename InputType>
+    explicit LLT(const EigenBase<InputType>& matrix)
       : m_matrix(matrix.rows(), matrix.cols()),
         m_isInitialized(false)
     {
-      compute(matrix);
+      compute(matrix.derived());
+    }
+
+    /** \brief Constructs a LDLT factorization from a given matrix
+      *
+      * This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when
+      * \c MatrixType is a Eigen::Ref.
+      *
+      * \sa LLT(const EigenBase&)
+      */
+    template<typename InputType>
+    explicit LLT(EigenBase<InputType>& matrix)
+      : m_matrix(matrix.derived()),
+        m_isInitialized(false)
+    {
+      compute(matrix.derived());
     }
 
     /** \returns a view of the upper triangular matrix U */
@@ -115,33 +133,33 @@ template<typename _MatrixType, int _UpLo> class LLT
       * Example: \include LLT_solve.cpp
       * Output: \verbinclude LLT_solve.out
       *
-      * \sa solveInPlace(), MatrixBase::llt()
+      * \sa solveInPlace(), MatrixBase::llt(), SelfAdjointView::llt()
       */
     template<typename Rhs>
-    inline const internal::solve_retval<LLT, Rhs>
+    inline const Solve<LLT, Rhs>
     solve(const MatrixBase<Rhs>& b) const
     {
       eigen_assert(m_isInitialized && "LLT is not initialized.");
       eigen_assert(m_matrix.rows()==b.rows()
                 && "LLT::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::solve_retval<LLT, Rhs>(*this, b.derived());
+      return Solve<LLT, Rhs>(*this, b.derived());
     }
 
-    #ifdef EIGEN2_SUPPORT
-    template<typename OtherDerived, typename ResultType>
-    bool solve(const MatrixBase<OtherDerived>& b, ResultType *result) const
-    {
-      *result = this->solve(b);
-      return true;
-    }
-    
-    bool isPositiveDefinite() const { return true; }
-    #endif
-
     template<typename Derived>
     void solveInPlace(MatrixBase<Derived> &bAndX) const;
 
-    LLT& compute(const MatrixType& matrix);
+    template<typename InputType>
+    LLT& compute(const EigenBase<InputType>& matrix);
+
+    /** \returns an estimate of the reciprocal condition number of the matrix of
+      *  which \c *this is the Cholesky decomposition.
+      */
+    RealScalar rcond() const
+    {
+      eigen_assert(m_isInitialized && "LLT is not initialized.");
+      eigen_assert(m_info == Success && "LLT failed because matrix appears to be negative");
+      return internal::rcond_estimate_helper(m_l1_norm, *this);
+    }
 
     /** \returns the LLT decomposition matrix
       *
@@ -167,24 +185,38 @@ template<typename _MatrixType, int _UpLo> class LLT
       return m_info;
     }
 
+    /** \returns the adjoint of \c *this, that is, a const reference to the decomposition itself as the underlying matrix is self-adjoint.
+      *
+      * This method is provided for compatibility with other matrix decompositions, thus enabling generic code such as:
+      * \code x = decomposition.adjoint().solve(b) \endcode
+      */
+    const LLT& adjoint() const { return *this; };
+
     inline Index rows() const { return m_matrix.rows(); }
     inline Index cols() const { return m_matrix.cols(); }
 
     template<typename VectorType>
     LLT rankUpdate(const VectorType& vec, const RealScalar& sigma = 1);
 
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    template<typename RhsType, typename DstType>
+    EIGEN_DEVICE_FUNC
+    void _solve_impl(const RhsType &rhs, DstType &dst) const;
+    #endif
+
   protected:
-    
+
     static void check_template_parameters()
     {
       EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
     }
-    
+
     /** \internal
       * Used to compute and store L
       * The strict upper part is not used and even not initialized.
       */
     MatrixType m_matrix;
+    RealScalar m_l1_norm;
     bool m_isInitialized;
     ComputationInfo m_info;
 };
@@ -194,12 +226,11 @@ namespace internal {
 template<typename Scalar, int UpLo> struct llt_inplace;
 
 template<typename MatrixType, typename VectorType>
-static typename MatrixType::Index llt_rank_update_lower(MatrixType& mat, const VectorType& vec, const typename MatrixType::RealScalar& sigma)
+static Index llt_rank_update_lower(MatrixType& mat, const VectorType& vec, const typename MatrixType::RealScalar& sigma)
 {
   using std::sqrt;
   typedef typename MatrixType::Scalar Scalar;
   typedef typename MatrixType::RealScalar RealScalar;
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::ColXpr ColXpr;
   typedef typename internal::remove_all<ColXpr>::type ColXprCleaned;
   typedef typename ColXprCleaned::SegmentReturnType ColXprSegment;
@@ -268,11 +299,10 @@ template<typename Scalar> struct llt_inplace<Scalar, Lower>
 {
   typedef typename NumTraits<Scalar>::Real RealScalar;
   template<typename MatrixType>
-  static typename MatrixType::Index unblocked(MatrixType& mat)
+  static Index unblocked(MatrixType& mat)
   {
     using std::sqrt;
-    typedef typename MatrixType::Index Index;
-    
+
     eigen_assert(mat.rows()==mat.cols());
     const Index size = mat.rows();
     for(Index k = 0; k < size; ++k)
@@ -289,15 +319,14 @@ template<typename Scalar> struct llt_inplace<Scalar, Lower>
         return k;
       mat.coeffRef(k,k) = x = sqrt(x);
       if (k>0 && rs>0) A21.noalias() -= A20 * A10.adjoint();
-      if (rs>0) A21 *= RealScalar(1)/x;
+      if (rs>0) A21 /= x;
     }
     return -1;
   }
 
   template<typename MatrixType>
-  static typename MatrixType::Index blocked(MatrixType& m)
+  static Index blocked(MatrixType& m)
   {
-    typedef typename MatrixType::Index Index;
     eigen_assert(m.rows()==m.cols());
     Index size = m.rows();
     if(size<32)
@@ -322,36 +351,36 @@ template<typename Scalar> struct llt_inplace<Scalar, Lower>
       Index ret;
       if((ret=unblocked(A11))>=0) return k+ret;
       if(rs>0) A11.adjoint().template triangularView<Upper>().template solveInPlace<OnTheRight>(A21);
-      if(rs>0) A22.template selfadjointView<Lower>().rankUpdate(A21,-1); // bottleneck
+      if(rs>0) A22.template selfadjointView<Lower>().rankUpdate(A21,typename NumTraits<RealScalar>::Literal(-1)); // bottleneck
     }
     return -1;
   }
 
   template<typename MatrixType, typename VectorType>
-  static typename MatrixType::Index rankUpdate(MatrixType& mat, const VectorType& vec, const RealScalar& sigma)
+  static Index rankUpdate(MatrixType& mat, const VectorType& vec, const RealScalar& sigma)
   {
     return Eigen::internal::llt_rank_update_lower(mat, vec, sigma);
   }
 };
-  
+
 template<typename Scalar> struct llt_inplace<Scalar, Upper>
 {
   typedef typename NumTraits<Scalar>::Real RealScalar;
 
   template<typename MatrixType>
-  static EIGEN_STRONG_INLINE typename MatrixType::Index unblocked(MatrixType& mat)
+  static EIGEN_STRONG_INLINE Index unblocked(MatrixType& mat)
   {
     Transpose<MatrixType> matt(mat);
     return llt_inplace<Scalar, Lower>::unblocked(matt);
   }
   template<typename MatrixType>
-  static EIGEN_STRONG_INLINE typename MatrixType::Index blocked(MatrixType& mat)
+  static EIGEN_STRONG_INLINE Index blocked(MatrixType& mat)
   {
     Transpose<MatrixType> matt(mat);
     return llt_inplace<Scalar, Lower>::blocked(matt);
   }
   template<typename MatrixType, typename VectorType>
-  static typename MatrixType::Index rankUpdate(MatrixType& mat, const VectorType& vec, const RealScalar& sigma)
+  static Index rankUpdate(MatrixType& mat, const VectorType& vec, const RealScalar& sigma)
   {
     Transpose<MatrixType> matt(mat);
     return llt_inplace<Scalar, Lower>::rankUpdate(matt, vec.conjugate(), sigma);
@@ -362,8 +391,8 @@ template<typename MatrixType> struct LLT_Traits<MatrixType,Lower>
 {
   typedef const TriangularView<const MatrixType, Lower> MatrixL;
   typedef const TriangularView<const typename MatrixType::AdjointReturnType, Upper> MatrixU;
-  static inline MatrixL getL(const MatrixType& m) { return m; }
-  static inline MatrixU getU(const MatrixType& m) { return m.adjoint(); }
+  static inline MatrixL getL(const MatrixType& m) { return MatrixL(m); }
+  static inline MatrixU getU(const MatrixType& m) { return MatrixU(m.adjoint()); }
   static bool inplace_decomposition(MatrixType& m)
   { return llt_inplace<typename MatrixType::Scalar, Lower>::blocked(m)==-1; }
 };
@@ -372,8 +401,8 @@ template<typename MatrixType> struct LLT_Traits<MatrixType,Upper>
 {
   typedef const TriangularView<const typename MatrixType::AdjointReturnType, Lower> MatrixL;
   typedef const TriangularView<const MatrixType, Upper> MatrixU;
-  static inline MatrixL getL(const MatrixType& m) { return m.adjoint(); }
-  static inline MatrixU getU(const MatrixType& m) { return m; }
+  static inline MatrixL getL(const MatrixType& m) { return MatrixL(m.adjoint()); }
+  static inline MatrixU getU(const MatrixType& m) { return MatrixU(m); }
   static bool inplace_decomposition(MatrixType& m)
   { return llt_inplace<typename MatrixType::Scalar, Upper>::blocked(m)==-1; }
 };
@@ -388,14 +417,28 @@ template<typename MatrixType> struct LLT_Traits<MatrixType,Upper>
   * Output: \verbinclude TutorialLinAlgComputeTwice.out
   */
 template<typename MatrixType, int _UpLo>
-LLT<MatrixType,_UpLo>& LLT<MatrixType,_UpLo>::compute(const MatrixType& a)
+template<typename InputType>
+LLT<MatrixType,_UpLo>& LLT<MatrixType,_UpLo>::compute(const EigenBase<InputType>& a)
 {
   check_template_parameters();
-  
+
   eigen_assert(a.rows()==a.cols());
   const Index size = a.rows();
   m_matrix.resize(size, size);
-  m_matrix = a;
+  m_matrix = a.derived();
+
+  // Compute matrix L1 norm = max abs column sum.
+  m_l1_norm = RealScalar(0);
+  // TODO move this code to SelfAdjointView
+  for (Index col = 0; col < size; ++col) {
+    RealScalar abs_col_sum;
+    if (_UpLo == Lower)
+      abs_col_sum = m_matrix.col(col).tail(size - col).template lpNorm<1>() + m_matrix.row(col).head(col).template lpNorm<1>();
+    else
+      abs_col_sum = m_matrix.col(col).head(col).template lpNorm<1>() + m_matrix.row(col).tail(size - col).template lpNorm<1>();
+    if (abs_col_sum > m_l1_norm)
+      m_l1_norm = abs_col_sum;
+  }
 
   m_isInitialized = true;
   bool ok = Traits::inplace_decomposition(m_matrix);
@@ -423,33 +466,24 @@ LLT<_MatrixType,_UpLo> LLT<_MatrixType,_UpLo>::rankUpdate(const VectorType& v, c
 
   return *this;
 }
-    
-namespace internal {
-template<typename _MatrixType, int UpLo, typename Rhs>
-struct solve_retval<LLT<_MatrixType, UpLo>, Rhs>
-  : solve_retval_base<LLT<_MatrixType, UpLo>, Rhs>
-{
-  typedef LLT<_MatrixType,UpLo> LLTType;
-  EIGEN_MAKE_SOLVE_HELPERS(LLTType,Rhs)
 
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dst = rhs();
-    dec().solveInPlace(dst);
-  }
-};
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template<typename _MatrixType,int _UpLo>
+template<typename RhsType, typename DstType>
+void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
+{
+  dst = rhs;
+  solveInPlace(dst);
 }
+#endif
 
 /** \internal use x = llt_object.solve(x);
-  * 
+  *
   * This is the \em in-place version of solve().
   *
   * \param bAndX represents both the right-hand side matrix b and result x.
   *
-  * \returns true always! If you need to check for existence of solutions, use another decomposition like LU, QR, or SVD.
-  *
-  * This version avoids a copy when the right hand side matrix b is not
-  * needed anymore.
+  * This version avoids a copy when the right hand side matrix b is not needed anymore.
   *
   * \sa LLT::solve(), MatrixBase::llt()
   */
@@ -475,6 +509,7 @@ MatrixType LLT<MatrixType,_UpLo>::reconstructedMatrix() const
 
 /** \cholesky_module
   * \returns the LLT decomposition of \c *this
+  * \sa SelfAdjointView::llt()
   */
 template<typename Derived>
 inline const LLT<typename MatrixBase<Derived>::PlainObject>
@@ -485,6 +520,7 @@ MatrixBase<Derived>::llt() const
 
 /** \cholesky_module
   * \returns the LLT decomposition of \c *this
+  * \sa SelfAdjointView::llt()
   */
 template<typename MatrixType, unsigned int UpLo>
 inline const LLT<typename SelfAdjointView<MatrixType, UpLo>::PlainObject, UpLo>
diff --git a/Eigen/src/Cholesky/LLT_MKL.h b/Eigen/src/Cholesky/LLT_LAPACKE.h
index 66675d747..bc6489e69 100644
--- a/Eigen/src/Cholesky/LLT_MKL.h
+++ b/Eigen/src/Cholesky/LLT_LAPACKE.h
@@ -25,41 +25,38 @@
  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
  ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
+ *   Content : Eigen bindings to LAPACKe
  *     LLt decomposition based on LAPACKE_?potrf function.
  ********************************************************************************
 */
 
-#ifndef EIGEN_LLT_MKL_H
-#define EIGEN_LLT_MKL_H
-
-#include "Eigen/src/Core/util/MKL_support.h"
-#include <iostream>
+#ifndef EIGEN_LLT_LAPACKE_H
+#define EIGEN_LLT_LAPACKE_H
 
 namespace Eigen { 
 
 namespace internal {
 
-template<typename Scalar> struct mkl_llt;
+template<typename Scalar> struct lapacke_llt;
 
-#define EIGEN_MKL_LLT(EIGTYPE, MKLTYPE, MKLPREFIX) \
-template<> struct mkl_llt<EIGTYPE> \
+#define EIGEN_LAPACKE_LLT(EIGTYPE, BLASTYPE, LAPACKE_PREFIX) \
+template<> struct lapacke_llt<EIGTYPE> \
 { \
   template<typename MatrixType> \
-  static inline typename MatrixType::Index potrf(MatrixType& m, char uplo) \
+  static inline Index potrf(MatrixType& m, char uplo) \
   { \
     lapack_int matrix_order; \
     lapack_int size, lda, info, StorageOrder; \
     EIGTYPE* a; \
     eigen_assert(m.rows()==m.cols()); \
     /* Set up parameters for ?potrf */ \
-    size = m.rows(); \
+    size = convert_index<lapack_int>(m.rows()); \
     StorageOrder = MatrixType::Flags&RowMajorBit?RowMajor:ColMajor; \
     matrix_order = StorageOrder==RowMajor ? LAPACK_ROW_MAJOR : LAPACK_COL_MAJOR; \
     a = &(m.coeffRef(0,0)); \
-    lda = m.outerStride(); \
+    lda = convert_index<lapack_int>(m.outerStride()); \
 \
-    info = LAPACKE_##MKLPREFIX##potrf( matrix_order, uplo, size, (MKLTYPE*)a, lda ); \
+    info = LAPACKE_##LAPACKE_PREFIX##potrf( matrix_order, uplo, size, (BLASTYPE*)a, lda ); \
     info = (info==0) ? -1 : info>0 ? info-1 : size; \
     return info; \
   } \
@@ -67,36 +64,36 @@ template<> struct mkl_llt<EIGTYPE> \
 template<> struct llt_inplace<EIGTYPE, Lower> \
 { \
   template<typename MatrixType> \
-  static typename MatrixType::Index blocked(MatrixType& m) \
+  static Index blocked(MatrixType& m) \
   { \
-    return mkl_llt<EIGTYPE>::potrf(m, 'L'); \
+    return lapacke_llt<EIGTYPE>::potrf(m, 'L'); \
   } \
   template<typename MatrixType, typename VectorType> \
-  static typename MatrixType::Index rankUpdate(MatrixType& mat, const VectorType& vec, const typename MatrixType::RealScalar& sigma) \
+  static Index rankUpdate(MatrixType& mat, const VectorType& vec, const typename MatrixType::RealScalar& sigma) \
   { return Eigen::internal::llt_rank_update_lower(mat, vec, sigma); } \
 }; \
 template<> struct llt_inplace<EIGTYPE, Upper> \
 { \
   template<typename MatrixType> \
-  static typename MatrixType::Index blocked(MatrixType& m) \
+  static Index blocked(MatrixType& m) \
   { \
-    return mkl_llt<EIGTYPE>::potrf(m, 'U'); \
+    return lapacke_llt<EIGTYPE>::potrf(m, 'U'); \
   } \
   template<typename MatrixType, typename VectorType> \
-  static typename MatrixType::Index rankUpdate(MatrixType& mat, const VectorType& vec, const typename MatrixType::RealScalar& sigma) \
+  static Index rankUpdate(MatrixType& mat, const VectorType& vec, const typename MatrixType::RealScalar& sigma) \
   { \
     Transpose<MatrixType> matt(mat); \
     return llt_inplace<EIGTYPE, Lower>::rankUpdate(matt, vec.conjugate(), sigma); \
   } \
 };
 
-EIGEN_MKL_LLT(double, double, d)
-EIGEN_MKL_LLT(float, float, s)
-EIGEN_MKL_LLT(dcomplex, MKL_Complex16, z)
-EIGEN_MKL_LLT(scomplex, MKL_Complex8, c)
+EIGEN_LAPACKE_LLT(double, double, d)
+EIGEN_LAPACKE_LLT(float, float, s)
+EIGEN_LAPACKE_LLT(dcomplex, lapack_complex_double, z)
+EIGEN_LAPACKE_LLT(scomplex, lapack_complex_float, c)
 
 } // end namespace internal
 
 } // end namespace Eigen
 
-#endif // EIGEN_LLT_MKL_H
+#endif // EIGEN_LLT_LAPACKE_H
diff --git a/Eigen/src/CholmodSupport/CMakeLists.txt b/Eigen/src/CholmodSupport/CMakeLists.txt
deleted file mode 100644
index 814dfa613..000000000
--- a/Eigen/src/CholmodSupport/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_CholmodSupport_SRCS "*.h")
-
-INSTALL(FILES 
-  ${Eigen_CholmodSupport_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/CholmodSupport COMPONENT Devel
-  )
diff --git a/Eigen/src/CholmodSupport/CholmodSupport.h b/Eigen/src/CholmodSupport/CholmodSupport.h
index c449960de..571972023 100644
--- a/Eigen/src/CholmodSupport/CholmodSupport.h
+++ b/Eigen/src/CholmodSupport/CholmodSupport.h
@@ -14,46 +14,52 @@ namespace Eigen {
 
 namespace internal {
 
-template<typename Scalar, typename CholmodType>
-void cholmod_configure_matrix(CholmodType& mat)
-{
-  if (internal::is_same<Scalar,float>::value)
-  {
-    mat.xtype = CHOLMOD_REAL;
-    mat.dtype = CHOLMOD_SINGLE;
-  }
-  else if (internal::is_same<Scalar,double>::value)
-  {
+template<typename Scalar> struct cholmod_configure_matrix;
+
+template<> struct cholmod_configure_matrix<double> {
+  template<typename CholmodType>
+  static void run(CholmodType& mat) {
     mat.xtype = CHOLMOD_REAL;
     mat.dtype = CHOLMOD_DOUBLE;
   }
-  else if (internal::is_same<Scalar,std::complex<float> >::value)
-  {
-    mat.xtype = CHOLMOD_COMPLEX;
-    mat.dtype = CHOLMOD_SINGLE;
-  }
-  else if (internal::is_same<Scalar,std::complex<double> >::value)
-  {
+};
+
+template<> struct cholmod_configure_matrix<std::complex<double> > {
+  template<typename CholmodType>
+  static void run(CholmodType& mat) {
     mat.xtype = CHOLMOD_COMPLEX;
     mat.dtype = CHOLMOD_DOUBLE;
   }
-  else
-  {
-    eigen_assert(false && "Scalar type not supported by CHOLMOD");
-  }
-}
+};
+
+// Other scalar types are not yet suppotred by Cholmod
+// template<> struct cholmod_configure_matrix<float> {
+//   template<typename CholmodType>
+//   static void run(CholmodType& mat) {
+//     mat.xtype = CHOLMOD_REAL;
+//     mat.dtype = CHOLMOD_SINGLE;
+//   }
+// };
+//
+// template<> struct cholmod_configure_matrix<std::complex<float> > {
+//   template<typename CholmodType>
+//   static void run(CholmodType& mat) {
+//     mat.xtype = CHOLMOD_COMPLEX;
+//     mat.dtype = CHOLMOD_SINGLE;
+//   }
+// };
 
 } // namespace internal
 
 /** Wraps the Eigen sparse matrix \a mat into a Cholmod sparse matrix object.
   * Note that the data are shared.
   */
-template<typename _Scalar, int _Options, typename _Index>
-cholmod_sparse viewAsCholmod(SparseMatrix<_Scalar,_Options,_Index>& mat)
+template<typename _Scalar, int _Options, typename _StorageIndex>
+cholmod_sparse viewAsCholmod(Ref<SparseMatrix<_Scalar,_Options,_StorageIndex> > mat)
 {
   cholmod_sparse res;
   res.nzmax   = mat.nonZeros();
-  res.nrow    = mat.rows();;
+  res.nrow    = mat.rows();
   res.ncol    = mat.cols();
   res.p       = mat.outerIndexPtr();
   res.i       = mat.innerIndexPtr();
@@ -74,11 +80,11 @@ cholmod_sparse viewAsCholmod(SparseMatrix<_Scalar,_Options,_Index>& mat)
   res.dtype   = 0;
   res.stype   = -1;
   
-  if (internal::is_same<_Index,int>::value)
+  if (internal::is_same<_StorageIndex,int>::value)
   {
     res.itype = CHOLMOD_INT;
   }
-  else if (internal::is_same<_Index,UF_long>::value)
+  else if (internal::is_same<_StorageIndex,long>::value)
   {
     res.itype = CHOLMOD_LONG;
   }
@@ -88,7 +94,7 @@ cholmod_sparse viewAsCholmod(SparseMatrix<_Scalar,_Options,_Index>& mat)
   }
 
   // setup res.xtype
-  internal::cholmod_configure_matrix<_Scalar>(res);
+  internal::cholmod_configure_matrix<_Scalar>::run(res);
   
   res.stype = 0;
   
@@ -98,16 +104,23 @@ cholmod_sparse viewAsCholmod(SparseMatrix<_Scalar,_Options,_Index>& mat)
 template<typename _Scalar, int _Options, typename _Index>
 const cholmod_sparse viewAsCholmod(const SparseMatrix<_Scalar,_Options,_Index>& mat)
 {
-  cholmod_sparse res = viewAsCholmod(mat.const_cast_derived());
+  cholmod_sparse res = viewAsCholmod(Ref<SparseMatrix<_Scalar,_Options,_Index> >(mat.const_cast_derived()));
+  return res;
+}
+
+template<typename _Scalar, int _Options, typename _Index>
+const cholmod_sparse viewAsCholmod(const SparseVector<_Scalar,_Options,_Index>& mat)
+{
+  cholmod_sparse res = viewAsCholmod(Ref<SparseMatrix<_Scalar,_Options,_Index> >(mat.const_cast_derived()));
   return res;
 }
 
 /** Returns a view of the Eigen sparse matrix \a mat as Cholmod sparse matrix.
   * The data are not copied but shared. */
 template<typename _Scalar, int _Options, typename _Index, unsigned int UpLo>
-cholmod_sparse viewAsCholmod(const SparseSelfAdjointView<SparseMatrix<_Scalar,_Options,_Index>, UpLo>& mat)
+cholmod_sparse viewAsCholmod(const SparseSelfAdjointView<const SparseMatrix<_Scalar,_Options,_Index>, UpLo>& mat)
 {
-  cholmod_sparse res = viewAsCholmod(mat.matrix().const_cast_derived());
+  cholmod_sparse res = viewAsCholmod(Ref<SparseMatrix<_Scalar,_Options,_Index> >(mat.matrix().const_cast_derived()));
   
   if(UpLo==Upper) res.stype =  1;
   if(UpLo==Lower) res.stype = -1;
@@ -131,19 +144,19 @@ cholmod_dense viewAsCholmod(MatrixBase<Derived>& mat)
   res.x      = (void*)(mat.derived().data());
   res.z      = 0;
 
-  internal::cholmod_configure_matrix<Scalar>(res);
+  internal::cholmod_configure_matrix<Scalar>::run(res);
 
   return res;
 }
 
 /** Returns a view of the Cholmod sparse matrix \a cm as an Eigen sparse matrix.
   * The data are not copied but shared. */
-template<typename Scalar, int Flags, typename Index>
-MappedSparseMatrix<Scalar,Flags,Index> viewAsEigen(cholmod_sparse& cm)
+template<typename Scalar, int Flags, typename StorageIndex>
+MappedSparseMatrix<Scalar,Flags,StorageIndex> viewAsEigen(cholmod_sparse& cm)
 {
-  return MappedSparseMatrix<Scalar,Flags,Index>
-         (cm.nrow, cm.ncol, static_cast<Index*>(cm.p)[cm.ncol],
-          static_cast<Index*>(cm.p), static_cast<Index*>(cm.i),static_cast<Scalar*>(cm.x) );
+  return MappedSparseMatrix<Scalar,Flags,StorageIndex>
+         (cm.nrow, cm.ncol, static_cast<StorageIndex*>(cm.p)[cm.ncol],
+          static_cast<StorageIndex*>(cm.p), static_cast<StorageIndex*>(cm.i),static_cast<Scalar*>(cm.x) );
 }
 
 enum CholmodMode {
@@ -157,29 +170,39 @@ enum CholmodMode {
   * \sa class CholmodSupernodalLLT, class CholmodSimplicialLDLT, class CholmodSimplicialLLT
   */
 template<typename _MatrixType, int _UpLo, typename Derived>
-class CholmodBase : internal::noncopyable
+class CholmodBase : public SparseSolverBase<Derived>
 {
+  protected:
+    typedef SparseSolverBase<Derived> Base;
+    using Base::derived;
+    using Base::m_isInitialized;
   public:
     typedef _MatrixType MatrixType;
     enum { UpLo = _UpLo };
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
     typedef MatrixType CholMatrixType;
-    typedef typename MatrixType::Index Index;
+    typedef typename MatrixType::StorageIndex StorageIndex;
+    enum {
+      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+    };
 
   public:
 
     CholmodBase()
-      : m_cholmodFactor(0), m_info(Success), m_isInitialized(false)
+      : m_cholmodFactor(0), m_info(Success), m_factorizationIsOk(false), m_analysisIsOk(false)
     {
-      m_shiftOffset[0] = m_shiftOffset[1] = RealScalar(0.0);
+      EIGEN_STATIC_ASSERT((internal::is_same<double,RealScalar>::value), CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY);
+      m_shiftOffset[0] = m_shiftOffset[1] = 0.0;
       cholmod_start(&m_cholmod);
     }
 
-    CholmodBase(const MatrixType& matrix)
-      : m_cholmodFactor(0), m_info(Success), m_isInitialized(false)
+    explicit CholmodBase(const MatrixType& matrix)
+      : m_cholmodFactor(0), m_info(Success), m_factorizationIsOk(false), m_analysisIsOk(false)
     {
-      m_shiftOffset[0] = m_shiftOffset[1] = RealScalar(0.0);
+      EIGEN_STATIC_ASSERT((internal::is_same<double,RealScalar>::value), CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY);
+      m_shiftOffset[0] = m_shiftOffset[1] = 0.0;
       cholmod_start(&m_cholmod);
       compute(matrix);
     }
@@ -191,11 +214,8 @@ class CholmodBase : internal::noncopyable
       cholmod_finish(&m_cholmod);
     }
     
-    inline Index cols() const { return m_cholmodFactor->n; }
-    inline Index rows() const { return m_cholmodFactor->n; }
-    
-    Derived& derived() { return *static_cast<Derived*>(this); }
-    const Derived& derived() const { return *static_cast<const Derived*>(this); }
+    inline StorageIndex cols() const { return internal::convert_index<StorageIndex, Index>(m_cholmodFactor->n); }
+    inline StorageIndex rows() const { return internal::convert_index<StorageIndex, Index>(m_cholmodFactor->n); }
     
     /** \brief Reports whether previous computation was successful.
       *
@@ -216,34 +236,6 @@ class CholmodBase : internal::noncopyable
       return derived();
     }
     
-    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::solve_retval<CholmodBase, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "LLT is not initialized.");
-      eigen_assert(rows()==b.rows()
-                && "CholmodDecomposition::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::solve_retval<CholmodBase, Rhs>(*this, b.derived());
-    }
-    
-    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::sparse_solve_retval<CholmodBase, Rhs>
-    solve(const SparseMatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "LLT is not initialized.");
-      eigen_assert(rows()==b.rows()
-                && "CholmodDecomposition::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::sparse_solve_retval<CholmodBase, Rhs>(*this, b.derived());
-    }
-    
     /** Performs a symbolic decomposition on the sparsity pattern of \a matrix.
       *
       * This function is particularly useful when solving for several problems having the same structure.
@@ -277,7 +269,7 @@ class CholmodBase : internal::noncopyable
       eigen_assert(m_analysisIsOk && "You must first call analyzePattern()");
       cholmod_sparse A = viewAsCholmod(matrix.template selfadjointView<UpLo>());
       cholmod_factorize_p(&A, m_shiftOffset, 0, 0, m_cholmodFactor, &m_cholmod);
-      
+
       // If the factorization failed, minor is the column at which it did. On success minor == n.
       this->m_info = (m_cholmodFactor->minor == m_cholmodFactor->n ? Success : NumericalIssue);
       m_factorizationIsOk = true;
@@ -290,20 +282,22 @@ class CholmodBase : internal::noncopyable
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     /** \internal */
     template<typename Rhs,typename Dest>
-    void _solve(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const
+    void _solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const
     {
       eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or symbolic()/numeric()");
       const Index size = m_cholmodFactor->n;
       EIGEN_UNUSED_VARIABLE(size);
       eigen_assert(size==b.rows());
+      
+      // Cholmod needs column-major stoarge without inner-stride, which corresponds to the default behavior of Ref.
+      Ref<const Matrix<typename Rhs::Scalar,Dynamic,Dynamic,ColMajor> > b_ref(b.derived());
 
-      // note: cd stands for Cholmod Dense
-      Rhs& b_ref(b.const_cast_derived());
       cholmod_dense b_cd = viewAsCholmod(b_ref);
       cholmod_dense* x_cd = cholmod_solve(CHOLMOD_A, m_cholmodFactor, &b_cd, &m_cholmod);
       if(!x_cd)
       {
         this->m_info = NumericalIssue;
+        return;
       }
       // TODO optimize this copy by swapping when possible (be careful with alignment, etc.)
       dest = Matrix<Scalar,Dest::RowsAtCompileTime,Dest::ColsAtCompileTime>::Map(reinterpret_cast<Scalar*>(x_cd->x),b.rows(),b.cols());
@@ -311,8 +305,8 @@ class CholmodBase : internal::noncopyable
     }
     
     /** \internal */
-    template<typename RhsScalar, int RhsOptions, typename RhsIndex, typename DestScalar, int DestOptions, typename DestIndex>
-    void _solve(const SparseMatrix<RhsScalar,RhsOptions,RhsIndex> &b, SparseMatrix<DestScalar,DestOptions,DestIndex> &dest) const
+    template<typename RhsDerived, typename DestDerived>
+    void _solve_impl(const SparseMatrixBase<RhsDerived> &b, SparseMatrixBase<DestDerived> &dest) const
     {
       eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or symbolic()/numeric()");
       const Index size = m_cholmodFactor->n;
@@ -320,14 +314,16 @@ class CholmodBase : internal::noncopyable
       eigen_assert(size==b.rows());
 
       // note: cs stands for Cholmod Sparse
-      cholmod_sparse b_cs = viewAsCholmod(b);
+      Ref<SparseMatrix<typename RhsDerived::Scalar,ColMajor,typename RhsDerived::StorageIndex> > b_ref(b.const_cast_derived());
+      cholmod_sparse b_cs = viewAsCholmod(b_ref);
       cholmod_sparse* x_cs = cholmod_spsolve(CHOLMOD_A, m_cholmodFactor, &b_cs, &m_cholmod);
       if(!x_cs)
       {
         this->m_info = NumericalIssue;
+        return;
       }
       // TODO optimize this copy by swapping when possible (be careful with alignment, etc.)
-      dest = viewAsEigen<DestScalar,DestOptions,DestIndex>(*x_cs);
+      dest.derived() = viewAsEigen<typename DestDerived::Scalar,ColMajor,typename DestDerived::StorageIndex>(*x_cs);
       cholmod_free_sparse(&x_cs, &m_cholmod);
     }
     #endif // EIGEN_PARSED_BY_DOXYGEN
@@ -344,10 +340,61 @@ class CholmodBase : internal::noncopyable
       */
     Derived& setShift(const RealScalar& offset)
     {
-      m_shiftOffset[0] = offset;
+      m_shiftOffset[0] = double(offset);
       return derived();
     }
     
+    /** \returns the determinant of the underlying matrix from the current factorization */
+    Scalar determinant() const
+    {
+      using std::exp;
+      return exp(logDeterminant());
+    }
+
+    /** \returns the log determinant of the underlying matrix from the current factorization */
+    Scalar logDeterminant() const
+    {
+      using std::log;
+      using numext::real;
+      eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or symbolic()/numeric()");
+
+      RealScalar logDet = 0;
+      Scalar *x = static_cast<Scalar*>(m_cholmodFactor->x);
+      if (m_cholmodFactor->is_super)
+      {
+        // Supernodal factorization stored as a packed list of dense column-major blocs,
+        // as described by the following structure:
+
+        // super[k] == index of the first column of the j-th super node
+        StorageIndex *super = static_cast<StorageIndex*>(m_cholmodFactor->super);
+        // pi[k] == offset to the description of row indices
+        StorageIndex *pi = static_cast<StorageIndex*>(m_cholmodFactor->pi);
+        // px[k] == offset to the respective dense block
+        StorageIndex *px = static_cast<StorageIndex*>(m_cholmodFactor->px);
+
+        Index nb_super_nodes = m_cholmodFactor->nsuper;
+        for (Index k=0; k < nb_super_nodes; ++k)
+        {
+          StorageIndex ncols = super[k + 1] - super[k];
+          StorageIndex nrows = pi[k + 1] - pi[k];
+
+          Map<const Array<Scalar,1,Dynamic>, 0, InnerStride<> > sk(x + px[k], ncols, InnerStride<>(nrows+1));
+          logDet += sk.real().log().sum();
+        }
+      }
+      else
+      {
+        // Simplicial factorization stored as standard CSC matrix.
+        StorageIndex *p = static_cast<StorageIndex*>(m_cholmodFactor->p);
+        Index size = m_cholmodFactor->n;
+        for (Index k=0; k<size; ++k)
+          logDet += log(real( x[p[k]] ));
+      }
+      if (m_cholmodFactor->is_ll)
+        logDet *= 2.0;
+      return logDet;
+    };
+
     template<typename Stream>
     void dumpMemory(Stream& /*s*/)
     {}
@@ -355,9 +402,8 @@ class CholmodBase : internal::noncopyable
   protected:
     mutable cholmod_common m_cholmod;
     cholmod_factor* m_cholmodFactor;
-    RealScalar m_shiftOffset[2];
+    double m_shiftOffset[2];
     mutable ComputationInfo m_info;
-    bool m_isInitialized;
     int m_factorizationIsOk;
     int m_analysisIsOk;
 };
@@ -376,9 +422,13 @@ class CholmodBase : internal::noncopyable
   * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower
   *               or Upper. Default is Lower.
   *
+  * \implsparsesolverconcept
+  *
   * This class supports all kind of SparseMatrix<>: row or column major; upper, lower, or both; compressed or non compressed.
   *
-  * \sa \ref TutorialSparseDirectSolvers, class CholmodSupernodalLLT, class SimplicialLLT
+  * \warning Only double precision real and complex scalar types are supported by Cholmod.
+  *
+  * \sa \ref TutorialSparseSolverConcept, class CholmodSupernodalLLT, class SimplicialLLT
   */
 template<typename _MatrixType, int _UpLo = Lower>
 class CholmodSimplicialLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLLT<_MatrixType, _UpLo> >
@@ -395,7 +445,7 @@ class CholmodSimplicialLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimpl
     CholmodSimplicialLLT(const MatrixType& matrix) : Base()
     {
       init();
-      compute(matrix);
+      this->compute(matrix);
     }
 
     ~CholmodSimplicialLLT() {}
@@ -423,9 +473,13 @@ class CholmodSimplicialLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimpl
   * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower
   *               or Upper. Default is Lower.
   *
+  * \implsparsesolverconcept
+  *
   * This class supports all kind of SparseMatrix<>: row or column major; upper, lower, or both; compressed or non compressed.
   *
-  * \sa \ref TutorialSparseDirectSolvers, class CholmodSupernodalLLT, class SimplicialLDLT
+  * \warning Only double precision real and complex scalar types are supported by Cholmod.
+  *
+  * \sa \ref TutorialSparseSolverConcept, class CholmodSupernodalLLT, class SimplicialLDLT
   */
 template<typename _MatrixType, int _UpLo = Lower>
 class CholmodSimplicialLDLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLDLT<_MatrixType, _UpLo> >
@@ -442,7 +496,7 @@ class CholmodSimplicialLDLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimp
     CholmodSimplicialLDLT(const MatrixType& matrix) : Base()
     {
       init();
-      compute(matrix);
+      this->compute(matrix);
     }
 
     ~CholmodSimplicialLDLT() {}
@@ -468,9 +522,13 @@ class CholmodSimplicialLDLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimp
   * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower
   *               or Upper. Default is Lower.
   *
+  * \implsparsesolverconcept
+  *
   * This class supports all kind of SparseMatrix<>: row or column major; upper, lower, or both; compressed or non compressed.
   *
-  * \sa \ref TutorialSparseDirectSolvers
+  * \warning Only double precision real and complex scalar types are supported by Cholmod.
+  *
+  * \sa \ref TutorialSparseSolverConcept
   */
 template<typename _MatrixType, int _UpLo = Lower>
 class CholmodSupernodalLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSupernodalLLT<_MatrixType, _UpLo> >
@@ -487,7 +545,7 @@ class CholmodSupernodalLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSuper
     CholmodSupernodalLLT(const MatrixType& matrix) : Base()
     {
       init();
-      compute(matrix);
+      this->compute(matrix);
     }
 
     ~CholmodSupernodalLLT() {}
@@ -515,9 +573,13 @@ class CholmodSupernodalLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSuper
   * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower
   *               or Upper. Default is Lower.
   *
+  * \implsparsesolverconcept
+  *
   * This class supports all kind of SparseMatrix<>: row or column major; upper, lower, or both; compressed or non compressed.
   *
-  * \sa \ref TutorialSparseDirectSolvers
+  * \warning Only double precision real and complex scalar types are supported by Cholmod.
+  *
+  * \sa \ref TutorialSparseSolverConcept
   */
 template<typename _MatrixType, int _UpLo = Lower>
 class CholmodDecomposition : public CholmodBase<_MatrixType, _UpLo, CholmodDecomposition<_MatrixType, _UpLo> >
@@ -534,7 +596,7 @@ class CholmodDecomposition : public CholmodBase<_MatrixType, _UpLo, CholmodDecom
     CholmodDecomposition(const MatrixType& matrix) : Base()
     {
       init();
-      compute(matrix);
+      this->compute(matrix);
     }
 
     ~CholmodDecomposition() {}
@@ -572,36 +634,6 @@ class CholmodDecomposition : public CholmodBase<_MatrixType, _UpLo, CholmodDecom
     }
 };
 
-namespace internal {
-  
-template<typename _MatrixType, int _UpLo, typename Derived, typename Rhs>
-struct solve_retval<CholmodBase<_MatrixType,_UpLo,Derived>, Rhs>
-  : solve_retval_base<CholmodBase<_MatrixType,_UpLo,Derived>, Rhs>
-{
-  typedef CholmodBase<_MatrixType,_UpLo,Derived> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
-  }
-};
-
-template<typename _MatrixType, int _UpLo, typename Derived, typename Rhs>
-struct sparse_solve_retval<CholmodBase<_MatrixType,_UpLo,Derived>, Rhs>
-  : sparse_solve_retval_base<CholmodBase<_MatrixType,_UpLo,Derived>, Rhs>
-{
-  typedef CholmodBase<_MatrixType,_UpLo,Derived> Dec;
-  EIGEN_MAKE_SPARSE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
-  }
-};
-
-} // end namespace internal
-
 } // end namespace Eigen
 
 #endif // EIGEN_CHOLMODSUPPORT_H
diff --git a/Eigen/src/Core/Array.h b/Eigen/src/Core/Array.h
index 0ab03eff0..0d34269fd 100644
--- a/Eigen/src/Core/Array.h
+++ b/Eigen/src/Core/Array.h
@@ -12,7 +12,16 @@
 
 namespace Eigen {
 
-/** \class Array 
+namespace internal {
+template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
+struct traits<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > : traits<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
+{
+  typedef ArrayXpr XprKind;
+  typedef ArrayBase<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > XprBase;
+};
+}
+
+/** \class Array
   * \ingroup Core_Module
   *
   * \brief General-purpose arrays with easy API for coefficient-wise operations
@@ -24,20 +33,14 @@ namespace Eigen {
   * API for the %Matrix class provides easy access to linear-algebra
   * operations.
   *
+  * See documentation of class Matrix for detailed information on the template parameters
+  * storage layout.
+  *
   * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_ARRAY_PLUGIN.
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_ARRAY_PLUGIN.
   *
-  * \sa \ref TutorialArrayClass, \ref TopicClassHierarchy
+  * \sa \blank \ref TutorialArrayClass, \ref TopicClassHierarchy
   */
-namespace internal {
-template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
-struct traits<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > : traits<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
-{
-  typedef ArrayXpr XprKind;
-  typedef ArrayBase<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > XprBase;
-};
-}
-
 template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
 class Array
   : public PlainObjectBase<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
@@ -69,11 +72,27 @@ class Array
       * the usage of 'using'. This should be done only for operator=.
       */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Array& operator=(const EigenBase<OtherDerived> &other)
     {
       return Base::operator=(other);
     }
 
+    /** Set all the entries to \a value.
+      * \sa DenseBase::setConstant(), DenseBase::fill()
+      */
+    /* This overload is needed because the usage of
+      *   using Base::operator=;
+      * fails on MSVC. Since the code below is working with GCC and MSVC, we skipped
+      * the usage of 'using'. This should be done only for operator=.
+      */
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Array& operator=(const Scalar &value)
+    {
+      Base::setConstant(value);
+      return *this;
+    }
+
     /** Copies the value of the expression \a other into \c *this with automatic resizing.
       *
       * *this might be resized to match the dimensions of \a other. If *this was a null matrix (not already initialized),
@@ -84,7 +103,8 @@ class Array
       * remain row-vectors and vectors remain vectors.
       */
     template<typename OtherDerived>
-    EIGEN_STRONG_INLINE Array& operator=(const ArrayBase<OtherDerived>& other)
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Array& operator=(const DenseBase<OtherDerived>& other)
     {
       return Base::_set(other);
     }
@@ -92,11 +112,12 @@ class Array
     /** This is a special case of the templated operator=. Its purpose is to
       * prevent a default operator= from hiding the templated operator=.
       */
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Array& operator=(const Array& other)
     {
       return Base::_set(other);
     }
-
+    
     /** Default constructor.
       *
       * For fixed-size matrices, does nothing.
@@ -107,6 +128,7 @@ class Array
       *
       * \sa resize(Index,Index)
       */
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Array() : Base()
     {
       Base::_check_template_params();
@@ -116,6 +138,7 @@ class Array
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     // FIXME is it still needed ??
     /** \internal */
+    EIGEN_DEVICE_FUNC
     Array(internal::constructor_without_unaligned_array_assert)
       : Base(internal::constructor_without_unaligned_array_assert())
     {
@@ -124,41 +147,64 @@ class Array
     }
 #endif
 
-    /** Constructs a vector or row-vector with given dimension. \only_for_vectors
-      *
-      * Note that this is only useful for dynamic-size vectors. For fixed-size vectors,
-      * it is redundant to pass the dimension here, so it makes more sense to use the default
-      * constructor Matrix() instead.
-      */
-    EIGEN_STRONG_INLINE explicit Array(Index dim)
-      : Base(dim, RowsAtCompileTime == 1 ? 1 : dim, ColsAtCompileTime == 1 ? 1 : dim)
+#if EIGEN_HAS_RVALUE_REFERENCES
+    EIGEN_DEVICE_FUNC
+    Array(Array&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_constructible<Scalar>::value)
+      : Base(std::move(other))
     {
       Base::_check_template_params();
-      EIGEN_STATIC_ASSERT_VECTOR_ONLY(Array)
-      eigen_assert(dim >= 0);
-      eigen_assert(SizeAtCompileTime == Dynamic || SizeAtCompileTime == dim);
-      EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
+      if (RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic)
+        Base::_set_noalias(other);
+    }
+    EIGEN_DEVICE_FUNC
+    Array& operator=(Array&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable<Scalar>::value)
+    {
+      other.swap(*this);
+      return *this;
     }
+#endif
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
+    template<typename T>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE explicit Array(const T& x)
+    {
+      Base::_check_template_params();
+      Base::template _init1<T>(x);
+    }
+
     template<typename T0, typename T1>
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Array(const T0& val0, const T1& val1)
     {
       Base::_check_template_params();
       this->template _init2<T0,T1>(val0, val1);
     }
     #else
-    /** constructs an uninitialized matrix with \a rows rows and \a cols columns.
+    /** \brief Constructs a fixed-sized array initialized with coefficients starting at \a data */
+    EIGEN_DEVICE_FUNC explicit Array(const Scalar *data);
+    /** Constructs a vector or row-vector with given dimension. \only_for_vectors
+      *
+      * Note that this is only useful for dynamic-size vectors. For fixed-size vectors,
+      * it is redundant to pass the dimension here, so it makes more sense to use the default
+      * constructor Array() instead.
+      */
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE explicit Array(Index dim);
+    /** constructs an initialized 1x1 Array with the given coefficient */
+    Array(const Scalar& value);
+    /** constructs an uninitialized array with \a rows rows and \a cols columns.
       *
-      * This is useful for dynamic-size matrices. For fixed-size matrices,
+      * This is useful for dynamic-size arrays. For fixed-size arrays,
       * it is redundant to pass these parameters, so one should use the default constructor
-      * Matrix() instead. */
+      * Array() instead. */
     Array(Index rows, Index cols);
     /** constructs an initialized 2D vector with given coefficients */
     Array(const Scalar& val0, const Scalar& val1);
     #endif
 
     /** constructs an initialized 3D vector with given coefficients */
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2)
     {
       Base::_check_template_params();
@@ -168,6 +214,7 @@ class Array
       m_storage.data()[2] = val2;
     }
     /** constructs an initialized 4D vector with given coefficients */
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2, const Scalar& val3)
     {
       Base::_check_template_params();
@@ -178,51 +225,21 @@ class Array
       m_storage.data()[3] = val3;
     }
 
-    explicit Array(const Scalar *data);
-
-    /** Constructor copying the value of the expression \a other */
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE Array(const ArrayBase<OtherDerived>& other)
-             : Base(other.rows() * other.cols(), other.rows(), other.cols())
-    {
-      Base::_check_template_params();
-      Base::_set_noalias(other);
-    }
     /** Copy constructor */
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Array(const Array& other)
-            : Base(other.rows() * other.cols(), other.rows(), other.cols())
-    {
-      Base::_check_template_params();
-      Base::_set_noalias(other);
-    }
-    /** Copy constructor with in-place evaluation */
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE Array(const ReturnByValue<OtherDerived>& other)
-    {
-      Base::_check_template_params();
-      Base::resize(other.rows(), other.cols());
-      other.evalTo(*this);
-    }
+            : Base(other)
+    { }
 
     /** \sa MatrixBase::operator=(const EigenBase<OtherDerived>&) */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Array(const EigenBase<OtherDerived> &other)
-      : Base(other.derived().rows() * other.derived().cols(), other.derived().rows(), other.derived().cols())
-    {
-      Base::_check_template_params();
-      Base::_resize_to_match(other);
-      *this = other;
-    }
-
-    /** Override MatrixBase::swap() since for dynamic-sized matrices of same type it is enough to swap the
-      * data pointers.
-      */
-    template<typename OtherDerived>
-    void swap(ArrayBase<OtherDerived> const & other)
-    { this->_swap(other.derived()); }
+      : Base(other.derived())
+    { }
 
-    inline Index innerStride() const { return 1; }
-    inline Index outerStride() const { return this->innerSize(); }
+    EIGEN_DEVICE_FUNC inline Index innerStride() const { return 1; }
+    EIGEN_DEVICE_FUNC inline Index outerStride() const { return this->innerSize(); }
 
     #ifdef EIGEN_ARRAY_PLUGIN
     #include EIGEN_ARRAY_PLUGIN
diff --git a/Eigen/src/Core/ArrayBase.h b/Eigen/src/Core/ArrayBase.h
index 38852600d..f0232f65e 100644
--- a/Eigen/src/Core/ArrayBase.h
+++ b/Eigen/src/Core/ArrayBase.h
@@ -32,7 +32,7 @@ template<typename ExpressionType> class MatrixWrapper;
   * \tparam Derived is the derived type, e.g., an array or an expression type.
   *
   * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_ARRAYBASE_PLUGIN.
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_ARRAYBASE_PLUGIN.
   *
   * \sa class MatrixBase, \ref TopicClassHierarchy
   */
@@ -46,11 +46,7 @@ template<typename Derived> class ArrayBase
 
     typedef ArrayBase Eigen_BaseClassForSpecializationOfGlobalMathFuncImpl;
 
-    using internal::special_scalar_op_base<Derived,typename internal::traits<Derived>::Scalar,
-                typename NumTraits<typename internal::traits<Derived>::Scalar>::Real>::operator*;
-
     typedef typename internal::traits<Derived>::StorageKind StorageKind;
-    typedef typename internal::traits<Derived>::Index Index;
     typedef typename internal::traits<Derived>::Scalar Scalar;
     typedef typename internal::packet_traits<Scalar>::type PacketScalar;
     typedef typename NumTraits<Scalar>::Real RealScalar;
@@ -64,8 +60,7 @@ template<typename Derived> class ArrayBase
     using Base::MaxSizeAtCompileTime;
     using Base::IsVectorAtCompileTime;
     using Base::Flags;
-    using Base::CoeffReadCost;
-
+    
     using Base::derived;
     using Base::const_cast_derived;
     using Base::rows;
@@ -85,25 +80,14 @@ template<typename Derived> class ArrayBase
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** \internal the plain matrix type corresponding to this expression. Note that is not necessarily
-      * exactly the return type of eval(): in the case of plain matrices, the return type of eval() is a const
-      * reference to a matrix, not a matrix! It is however guaranteed that the return type of eval() is either
-      * PlainObject or const PlainObject&.
-      */
-    typedef Array<typename internal::traits<Derived>::Scalar,
-                internal::traits<Derived>::RowsAtCompileTime,
-                internal::traits<Derived>::ColsAtCompileTime,
-                AutoAlign | (internal::traits<Derived>::Flags&RowMajorBit ? RowMajor : ColMajor),
-                internal::traits<Derived>::MaxRowsAtCompileTime,
-                internal::traits<Derived>::MaxColsAtCompileTime
-          > PlainObject;
-
+    typedef typename Base::PlainObject PlainObject;
 
     /** \internal Represents a matrix with all coefficients equal to one another*/
-    typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,Derived> ConstantReturnType;
+    typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,PlainObject> ConstantReturnType;
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::ArrayBase
+#define EIGEN_DOC_UNARY_ADDONS(X,Y)
 #   include "../plugins/CommonCwiseUnaryOps.h"
 #   include "../plugins/MatrixCwiseUnaryOps.h"
 #   include "../plugins/ArrayCwiseUnaryOps.h"
@@ -114,44 +98,62 @@ template<typename Derived> class ArrayBase
 #     include EIGEN_ARRAYBASE_PLUGIN
 #   endif
 #undef EIGEN_CURRENT_STORAGE_BASE_CLASS
+#undef EIGEN_DOC_UNARY_ADDONS
 
     /** Special case of the template operator=, in order to prevent the compiler
       * from generating a default operator= (issue hit with g++ 4.1)
       */
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     Derived& operator=(const ArrayBase& other)
     {
-      return internal::assign_selector<Derived,Derived>::run(derived(), other.derived());
+      internal::call_assignment(derived(), other.derived());
+      return derived();
     }
-
-    Derived& operator+=(const Scalar& scalar)
-    { return *this = derived() + scalar; }
-    Derived& operator-=(const Scalar& scalar)
-    { return *this = derived() - scalar; }
+    
+    /** Set all the entries to \a value.
+      * \sa DenseBase::setConstant(), DenseBase::fill() */
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Derived& operator=(const Scalar &value)
+    { Base::setConstant(value); return derived(); }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Derived& operator+=(const Scalar& scalar);
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Derived& operator-=(const Scalar& scalar);
 
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     Derived& operator+=(const ArrayBase<OtherDerived>& other);
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     Derived& operator-=(const ArrayBase<OtherDerived>& other);
 
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     Derived& operator*=(const ArrayBase<OtherDerived>& other);
 
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     Derived& operator/=(const ArrayBase<OtherDerived>& other);
 
   public:
+    EIGEN_DEVICE_FUNC
     ArrayBase<Derived>& array() { return *this; }
+    EIGEN_DEVICE_FUNC
     const ArrayBase<Derived>& array() const { return *this; }
 
     /** \returns an \link Eigen::MatrixBase Matrix \endlink expression of this array
       * \sa MatrixBase::array() */
-    MatrixWrapper<Derived> matrix() { return derived(); }
-    const MatrixWrapper<const Derived> matrix() const { return derived(); }
+    EIGEN_DEVICE_FUNC
+    MatrixWrapper<Derived> matrix() { return MatrixWrapper<Derived>(derived()); }
+    EIGEN_DEVICE_FUNC
+    const MatrixWrapper<const Derived> matrix() const { return MatrixWrapper<const Derived>(derived()); }
 
 //     template<typename Dest>
 //     inline void evalTo(Dest& dst) const { dst = matrix(); }
 
   protected:
+    EIGEN_DEVICE_FUNC
     ArrayBase() : Base() {}
 
   private:
@@ -176,8 +178,7 @@ template<typename OtherDerived>
 EIGEN_STRONG_INLINE Derived &
 ArrayBase<Derived>::operator-=(const ArrayBase<OtherDerived> &other)
 {
-  SelfCwiseBinaryOp<internal::scalar_difference_op<Scalar>, Derived, OtherDerived> tmp(derived());
-  tmp = other.derived();
+  call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
   return derived();
 }
 
@@ -190,8 +191,7 @@ template<typename OtherDerived>
 EIGEN_STRONG_INLINE Derived &
 ArrayBase<Derived>::operator+=(const ArrayBase<OtherDerived>& other)
 {
-  SelfCwiseBinaryOp<internal::scalar_sum_op<Scalar>, Derived, OtherDerived> tmp(derived());
-  tmp = other.derived();
+  call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
   return derived();
 }
 
@@ -204,8 +204,7 @@ template<typename OtherDerived>
 EIGEN_STRONG_INLINE Derived &
 ArrayBase<Derived>::operator*=(const ArrayBase<OtherDerived>& other)
 {
-  SelfCwiseBinaryOp<internal::scalar_product_op<Scalar>, Derived, OtherDerived> tmp(derived());
-  tmp = other.derived();
+  call_assignment(derived(), other.derived(), internal::mul_assign_op<Scalar,typename OtherDerived::Scalar>());
   return derived();
 }
 
@@ -218,8 +217,7 @@ template<typename OtherDerived>
 EIGEN_STRONG_INLINE Derived &
 ArrayBase<Derived>::operator/=(const ArrayBase<OtherDerived>& other)
 {
-  SelfCwiseBinaryOp<internal::scalar_quotient_op<Scalar>, Derived, OtherDerived> tmp(derived());
-  tmp = other.derived();
+  call_assignment(derived(), other.derived(), internal::div_assign_op<Scalar,typename OtherDerived::Scalar>());
   return derived();
 }
 
diff --git a/Eigen/src/Core/ArrayWrapper.h b/Eigen/src/Core/ArrayWrapper.h
index b4641e2a0..a04521a16 100644
--- a/Eigen/src/Core/ArrayWrapper.h
+++ b/Eigen/src/Core/ArrayWrapper.h
@@ -44,6 +44,7 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
     typedef ArrayBase<ArrayWrapper> Base;
     EIGEN_DENSE_PUBLIC_INTERFACE(ArrayWrapper)
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(ArrayWrapper)
+    typedef typename internal::remove_all<ExpressionType>::type NestedExpression;
 
     typedef typename internal::conditional<
                        internal::is_lvalue<ExpressionType>::value,
@@ -51,76 +52,45 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
                        const Scalar
                      >::type ScalarWithConstIfNotLvalue;
 
-    typedef typename internal::nested<ExpressionType>::type NestedExpressionType;
+    typedef typename internal::ref_selector<ExpressionType>::non_const_type NestedExpressionType;
 
-    inline ArrayWrapper(ExpressionType& matrix) : m_expression(matrix) {}
+    using Base::coeffRef;
 
+    EIGEN_DEVICE_FUNC
+    explicit EIGEN_STRONG_INLINE ArrayWrapper(ExpressionType& matrix) : m_expression(matrix) {}
+
+    EIGEN_DEVICE_FUNC
     inline Index rows() const { return m_expression.rows(); }
+    EIGEN_DEVICE_FUNC
     inline Index cols() const { return m_expression.cols(); }
+    EIGEN_DEVICE_FUNC
     inline Index outerStride() const { return m_expression.outerStride(); }
+    EIGEN_DEVICE_FUNC
     inline Index innerStride() const { return m_expression.innerStride(); }
 
-    inline ScalarWithConstIfNotLvalue* data() { return m_expression.const_cast_derived().data(); }
+    EIGEN_DEVICE_FUNC
+    inline ScalarWithConstIfNotLvalue* data() { return m_expression.data(); }
+    EIGEN_DEVICE_FUNC
     inline const Scalar* data() const { return m_expression.data(); }
 
-    inline CoeffReturnType coeff(Index rowId, Index colId) const
-    {
-      return m_expression.coeff(rowId, colId);
-    }
-
-    inline Scalar& coeffRef(Index rowId, Index colId)
-    {
-      return m_expression.const_cast_derived().coeffRef(rowId, colId);
-    }
-
+    EIGEN_DEVICE_FUNC
     inline const Scalar& coeffRef(Index rowId, Index colId) const
     {
-      return m_expression.const_cast_derived().coeffRef(rowId, colId);
-    }
-
-    inline CoeffReturnType coeff(Index index) const
-    {
-      return m_expression.coeff(index);
-    }
-
-    inline Scalar& coeffRef(Index index)
-    {
-      return m_expression.const_cast_derived().coeffRef(index);
+      return m_expression.coeffRef(rowId, colId);
     }
 
+    EIGEN_DEVICE_FUNC
     inline const Scalar& coeffRef(Index index) const
     {
-      return m_expression.const_cast_derived().coeffRef(index);
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index rowId, Index colId) const
-    {
-      return m_expression.template packet<LoadMode>(rowId, colId);
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index rowId, Index colId, const PacketScalar& val)
-    {
-      m_expression.const_cast_derived().template writePacket<LoadMode>(rowId, colId, val);
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index index) const
-    {
-      return m_expression.template packet<LoadMode>(index);
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index index, const PacketScalar& val)
-    {
-      m_expression.const_cast_derived().template writePacket<LoadMode>(index, val);
+      return m_expression.coeffRef(index);
     }
 
     template<typename Dest>
+    EIGEN_DEVICE_FUNC
     inline void evalTo(Dest& dst) const { dst = m_expression; }
 
     const typename internal::remove_all<NestedExpressionType>::type& 
+    EIGEN_DEVICE_FUNC
     nestedExpression() const 
     {
       return m_expression;
@@ -128,10 +98,12 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
 
     /** Forwards the resizing request to the nested expression
       * \sa DenseBase::resize(Index)  */
-    void resize(Index newSize) { m_expression.const_cast_derived().resize(newSize); }
+    EIGEN_DEVICE_FUNC
+    void resize(Index newSize) { m_expression.resize(newSize); }
     /** Forwards the resizing request to the nested expression
       * \sa DenseBase::resize(Index,Index)*/
-    void resize(Index nbRows, Index nbCols) { m_expression.const_cast_derived().resize(nbRows,nbCols); }
+    EIGEN_DEVICE_FUNC
+    void resize(Index rows, Index cols) { m_expression.resize(rows,cols); }
 
   protected:
     NestedExpressionType m_expression;
@@ -169,6 +141,7 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> >
     typedef MatrixBase<MatrixWrapper<ExpressionType> > Base;
     EIGEN_DENSE_PUBLIC_INTERFACE(MatrixWrapper)
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(MatrixWrapper)
+    typedef typename internal::remove_all<ExpressionType>::type NestedExpression;
 
     typedef typename internal::conditional<
                        internal::is_lvalue<ExpressionType>::value,
@@ -176,72 +149,40 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> >
                        const Scalar
                      >::type ScalarWithConstIfNotLvalue;
 
-    typedef typename internal::nested<ExpressionType>::type NestedExpressionType;
+    typedef typename internal::ref_selector<ExpressionType>::non_const_type NestedExpressionType;
 
-    inline MatrixWrapper(ExpressionType& a_matrix) : m_expression(a_matrix) {}
+    using Base::coeffRef;
 
+    EIGEN_DEVICE_FUNC
+    explicit inline MatrixWrapper(ExpressionType& matrix) : m_expression(matrix) {}
+
+    EIGEN_DEVICE_FUNC
     inline Index rows() const { return m_expression.rows(); }
+    EIGEN_DEVICE_FUNC
     inline Index cols() const { return m_expression.cols(); }
+    EIGEN_DEVICE_FUNC
     inline Index outerStride() const { return m_expression.outerStride(); }
+    EIGEN_DEVICE_FUNC
     inline Index innerStride() const { return m_expression.innerStride(); }
 
-    inline ScalarWithConstIfNotLvalue* data() { return m_expression.const_cast_derived().data(); }
+    EIGEN_DEVICE_FUNC
+    inline ScalarWithConstIfNotLvalue* data() { return m_expression.data(); }
+    EIGEN_DEVICE_FUNC
     inline const Scalar* data() const { return m_expression.data(); }
 
-    inline CoeffReturnType coeff(Index rowId, Index colId) const
-    {
-      return m_expression.coeff(rowId, colId);
-    }
-
-    inline Scalar& coeffRef(Index rowId, Index colId)
-    {
-      return m_expression.const_cast_derived().coeffRef(rowId, colId);
-    }
-
+    EIGEN_DEVICE_FUNC
     inline const Scalar& coeffRef(Index rowId, Index colId) const
     {
       return m_expression.derived().coeffRef(rowId, colId);
     }
 
-    inline CoeffReturnType coeff(Index index) const
-    {
-      return m_expression.coeff(index);
-    }
-
-    inline Scalar& coeffRef(Index index)
-    {
-      return m_expression.const_cast_derived().coeffRef(index);
-    }
-
+    EIGEN_DEVICE_FUNC
     inline const Scalar& coeffRef(Index index) const
     {
-      return m_expression.const_cast_derived().coeffRef(index);
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index rowId, Index colId) const
-    {
-      return m_expression.template packet<LoadMode>(rowId, colId);
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index rowId, Index colId, const PacketScalar& val)
-    {
-      m_expression.const_cast_derived().template writePacket<LoadMode>(rowId, colId, val);
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index index) const
-    {
-      return m_expression.template packet<LoadMode>(index);
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index index, const PacketScalar& val)
-    {
-      m_expression.const_cast_derived().template writePacket<LoadMode>(index, val);
+      return m_expression.coeffRef(index);
     }
 
+    EIGEN_DEVICE_FUNC
     const typename internal::remove_all<NestedExpressionType>::type& 
     nestedExpression() const 
     {
@@ -250,10 +191,12 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> >
 
     /** Forwards the resizing request to the nested expression
       * \sa DenseBase::resize(Index)  */
-    void resize(Index newSize) { m_expression.const_cast_derived().resize(newSize); }
+    EIGEN_DEVICE_FUNC
+    void resize(Index newSize) { m_expression.resize(newSize); }
     /** Forwards the resizing request to the nested expression
       * \sa DenseBase::resize(Index,Index)*/
-    void resize(Index nbRows, Index nbCols) { m_expression.const_cast_derived().resize(nbRows,nbCols); }
+    EIGEN_DEVICE_FUNC
+    void resize(Index rows, Index cols) { m_expression.resize(rows,cols); }
 
   protected:
     NestedExpressionType m_expression;
diff --git a/Eigen/src/Core/Assign.h b/Eigen/src/Core/Assign.h
index bcfc261e5..53806ba33 100644
--- a/Eigen/src/Core/Assign.h
+++ b/Eigen/src/Core/Assign.h
@@ -14,478 +14,6 @@
 
 namespace Eigen {
 
-namespace internal {
-
-/***************************************************************************
-* Part 1 : the logic deciding a strategy for traversal and unrolling       *
-***************************************************************************/
-
-template <typename Derived, typename OtherDerived>
-struct assign_traits
-{
-public:
-  enum {
-    DstIsAligned = Derived::Flags & AlignedBit,
-    DstHasDirectAccess = Derived::Flags & DirectAccessBit,
-    SrcIsAligned = OtherDerived::Flags & AlignedBit,
-    JointAlignment = bool(DstIsAligned) && bool(SrcIsAligned) ? Aligned : Unaligned
-  };
-
-private:
-  enum {
-    InnerSize = int(Derived::IsVectorAtCompileTime) ? int(Derived::SizeAtCompileTime)
-              : int(Derived::Flags)&RowMajorBit ? int(Derived::ColsAtCompileTime)
-              : int(Derived::RowsAtCompileTime),
-    InnerMaxSize = int(Derived::IsVectorAtCompileTime) ? int(Derived::MaxSizeAtCompileTime)
-              : int(Derived::Flags)&RowMajorBit ? int(Derived::MaxColsAtCompileTime)
-              : int(Derived::MaxRowsAtCompileTime),
-    MaxSizeAtCompileTime = Derived::SizeAtCompileTime,
-    PacketSize = packet_traits<typename Derived::Scalar>::size
-  };
-
-  enum {
-    StorageOrdersAgree = (int(Derived::IsRowMajor) == int(OtherDerived::IsRowMajor)),
-    MightVectorize = StorageOrdersAgree
-                  && (int(Derived::Flags) & int(OtherDerived::Flags) & ActualPacketAccessBit),
-    MayInnerVectorize  = MightVectorize && int(InnerSize)!=Dynamic && int(InnerSize)%int(PacketSize)==0
-                       && int(DstIsAligned) && int(SrcIsAligned),
-    MayLinearize = StorageOrdersAgree && (int(Derived::Flags) & int(OtherDerived::Flags) & LinearAccessBit),
-    MayLinearVectorize = MightVectorize && MayLinearize && DstHasDirectAccess
-                       && (DstIsAligned || MaxSizeAtCompileTime == Dynamic),
-      /* If the destination isn't aligned, we have to do runtime checks and we don't unroll,
-         so it's only good for large enough sizes. */
-    MaySliceVectorize  = MightVectorize && DstHasDirectAccess
-                       && (int(InnerMaxSize)==Dynamic || int(InnerMaxSize)>=3*PacketSize)
-      /* slice vectorization can be slow, so we only want it if the slices are big, which is
-         indicated by InnerMaxSize rather than InnerSize, think of the case of a dynamic block
-         in a fixed-size matrix */
-  };
-
-public:
-  enum {
-    Traversal = int(MayInnerVectorize)  ? int(InnerVectorizedTraversal)
-              : int(MayLinearVectorize) ? int(LinearVectorizedTraversal)
-              : int(MaySliceVectorize)  ? int(SliceVectorizedTraversal)
-              : int(MayLinearize)       ? int(LinearTraversal)
-                                        : int(DefaultTraversal),
-    Vectorized = int(Traversal) == InnerVectorizedTraversal
-              || int(Traversal) == LinearVectorizedTraversal
-              || int(Traversal) == SliceVectorizedTraversal
-  };
-
-private:
-  enum {
-    UnrollingLimit      = EIGEN_UNROLLING_LIMIT * (Vectorized ? int(PacketSize) : 1),
-    MayUnrollCompletely = int(Derived::SizeAtCompileTime) != Dynamic
-                       && int(OtherDerived::CoeffReadCost) != Dynamic
-                       && int(Derived::SizeAtCompileTime) * int(OtherDerived::CoeffReadCost) <= int(UnrollingLimit),
-    MayUnrollInner      = int(InnerSize) != Dynamic
-                       && int(OtherDerived::CoeffReadCost) != Dynamic
-                       && int(InnerSize) * int(OtherDerived::CoeffReadCost) <= int(UnrollingLimit)
-  };
-
-public:
-  enum {
-    Unrolling = (int(Traversal) == int(InnerVectorizedTraversal) || int(Traversal) == int(DefaultTraversal))
-                ? (
-                    int(MayUnrollCompletely) ? int(CompleteUnrolling)
-                  : int(MayUnrollInner)      ? int(InnerUnrolling)
-                                             : int(NoUnrolling)
-                  )
-              : int(Traversal) == int(LinearVectorizedTraversal)
-                ? ( bool(MayUnrollCompletely) && bool(DstIsAligned) ? int(CompleteUnrolling) : int(NoUnrolling) )
-              : int(Traversal) == int(LinearTraversal)
-                ? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling) : int(NoUnrolling) )
-              : int(NoUnrolling)
-  };
-
-#ifdef EIGEN_DEBUG_ASSIGN
-  static void debug()
-  {
-    EIGEN_DEBUG_VAR(DstIsAligned)
-    EIGEN_DEBUG_VAR(SrcIsAligned)
-    EIGEN_DEBUG_VAR(JointAlignment)
-    EIGEN_DEBUG_VAR(InnerSize)
-    EIGEN_DEBUG_VAR(InnerMaxSize)
-    EIGEN_DEBUG_VAR(PacketSize)
-    EIGEN_DEBUG_VAR(StorageOrdersAgree)
-    EIGEN_DEBUG_VAR(MightVectorize)
-    EIGEN_DEBUG_VAR(MayLinearize)
-    EIGEN_DEBUG_VAR(MayInnerVectorize)
-    EIGEN_DEBUG_VAR(MayLinearVectorize)
-    EIGEN_DEBUG_VAR(MaySliceVectorize)
-    EIGEN_DEBUG_VAR(Traversal)
-    EIGEN_DEBUG_VAR(UnrollingLimit)
-    EIGEN_DEBUG_VAR(MayUnrollCompletely)
-    EIGEN_DEBUG_VAR(MayUnrollInner)
-    EIGEN_DEBUG_VAR(Unrolling)
-  }
-#endif
-};
-
-/***************************************************************************
-* Part 2 : meta-unrollers
-***************************************************************************/
-
-/************************
-*** Default traversal ***
-************************/
-
-template<typename Derived1, typename Derived2, int Index, int Stop>
-struct assign_DefaultTraversal_CompleteUnrolling
-{
-  enum {
-    outer = Index / Derived1::InnerSizeAtCompileTime,
-    inner = Index % Derived1::InnerSizeAtCompileTime
-  };
-
-  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
-  {
-    dst.copyCoeffByOuterInner(outer, inner, src);
-    assign_DefaultTraversal_CompleteUnrolling<Derived1, Derived2, Index+1, Stop>::run(dst, src);
-  }
-};
-
-template<typename Derived1, typename Derived2, int Stop>
-struct assign_DefaultTraversal_CompleteUnrolling<Derived1, Derived2, Stop, Stop>
-{
-  static EIGEN_STRONG_INLINE void run(Derived1 &, const Derived2 &) {}
-};
-
-template<typename Derived1, typename Derived2, int Index, int Stop>
-struct assign_DefaultTraversal_InnerUnrolling
-{
-  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src, typename Derived1::Index outer)
-  {
-    dst.copyCoeffByOuterInner(outer, Index, src);
-    assign_DefaultTraversal_InnerUnrolling<Derived1, Derived2, Index+1, Stop>::run(dst, src, outer);
-  }
-};
-
-template<typename Derived1, typename Derived2, int Stop>
-struct assign_DefaultTraversal_InnerUnrolling<Derived1, Derived2, Stop, Stop>
-{
-  static EIGEN_STRONG_INLINE void run(Derived1 &, const Derived2 &, typename Derived1::Index) {}
-};
-
-/***********************
-*** Linear traversal ***
-***********************/
-
-template<typename Derived1, typename Derived2, int Index, int Stop>
-struct assign_LinearTraversal_CompleteUnrolling
-{
-  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
-  {
-    dst.copyCoeff(Index, src);
-    assign_LinearTraversal_CompleteUnrolling<Derived1, Derived2, Index+1, Stop>::run(dst, src);
-  }
-};
-
-template<typename Derived1, typename Derived2, int Stop>
-struct assign_LinearTraversal_CompleteUnrolling<Derived1, Derived2, Stop, Stop>
-{
-  static EIGEN_STRONG_INLINE void run(Derived1 &, const Derived2 &) {}
-};
-
-/**************************
-*** Inner vectorization ***
-**************************/
-
-template<typename Derived1, typename Derived2, int Index, int Stop>
-struct assign_innervec_CompleteUnrolling
-{
-  enum {
-    outer = Index / Derived1::InnerSizeAtCompileTime,
-    inner = Index % Derived1::InnerSizeAtCompileTime,
-    JointAlignment = assign_traits<Derived1,Derived2>::JointAlignment
-  };
-
-  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
-  {
-    dst.template copyPacketByOuterInner<Derived2, Aligned, JointAlignment>(outer, inner, src);
-    assign_innervec_CompleteUnrolling<Derived1, Derived2,
-      Index+packet_traits<typename Derived1::Scalar>::size, Stop>::run(dst, src);
-  }
-};
-
-template<typename Derived1, typename Derived2, int Stop>
-struct assign_innervec_CompleteUnrolling<Derived1, Derived2, Stop, Stop>
-{
-  static EIGEN_STRONG_INLINE void run(Derived1 &, const Derived2 &) {}
-};
-
-template<typename Derived1, typename Derived2, int Index, int Stop>
-struct assign_innervec_InnerUnrolling
-{
-  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src, typename Derived1::Index outer)
-  {
-    dst.template copyPacketByOuterInner<Derived2, Aligned, Aligned>(outer, Index, src);
-    assign_innervec_InnerUnrolling<Derived1, Derived2,
-      Index+packet_traits<typename Derived1::Scalar>::size, Stop>::run(dst, src, outer);
-  }
-};
-
-template<typename Derived1, typename Derived2, int Stop>
-struct assign_innervec_InnerUnrolling<Derived1, Derived2, Stop, Stop>
-{
-  static EIGEN_STRONG_INLINE void run(Derived1 &, const Derived2 &, typename Derived1::Index) {}
-};
-
-/***************************************************************************
-* Part 3 : implementation of all cases
-***************************************************************************/
-
-template<typename Derived1, typename Derived2,
-         int Traversal = assign_traits<Derived1, Derived2>::Traversal,
-         int Unrolling = assign_traits<Derived1, Derived2>::Unrolling,
-         int Version = Specialized>
-struct assign_impl;
-
-/************************
-*** Default traversal ***
-************************/
-
-template<typename Derived1, typename Derived2, int Unrolling, int Version>
-struct assign_impl<Derived1, Derived2, InvalidTraversal, Unrolling, Version>
-{
-  static inline void run(Derived1 &, const Derived2 &) { }
-};
-
-template<typename Derived1, typename Derived2, int Version>
-struct assign_impl<Derived1, Derived2, DefaultTraversal, NoUnrolling, Version>
-{
-  typedef typename Derived1::Index Index;
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    const Index innerSize = dst.innerSize();
-    const Index outerSize = dst.outerSize();
-    for(Index outer = 0; outer < outerSize; ++outer)
-      for(Index inner = 0; inner < innerSize; ++inner)
-        dst.copyCoeffByOuterInner(outer, inner, src);
-  }
-};
-
-template<typename Derived1, typename Derived2, int Version>
-struct assign_impl<Derived1, Derived2, DefaultTraversal, CompleteUnrolling, Version>
-{
-  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
-  {
-    assign_DefaultTraversal_CompleteUnrolling<Derived1, Derived2, 0, Derived1::SizeAtCompileTime>
-      ::run(dst, src);
-  }
-};
-
-template<typename Derived1, typename Derived2, int Version>
-struct assign_impl<Derived1, Derived2, DefaultTraversal, InnerUnrolling, Version>
-{
-  typedef typename Derived1::Index Index;
-  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
-  {
-    const Index outerSize = dst.outerSize();
-    for(Index outer = 0; outer < outerSize; ++outer)
-      assign_DefaultTraversal_InnerUnrolling<Derived1, Derived2, 0, Derived1::InnerSizeAtCompileTime>
-        ::run(dst, src, outer);
-  }
-};
-
-/***********************
-*** Linear traversal ***
-***********************/
-
-template<typename Derived1, typename Derived2, int Version>
-struct assign_impl<Derived1, Derived2, LinearTraversal, NoUnrolling, Version>
-{
-  typedef typename Derived1::Index Index;
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    const Index size = dst.size();
-    for(Index i = 0; i < size; ++i)
-      dst.copyCoeff(i, src);
-  }
-};
-
-template<typename Derived1, typename Derived2, int Version>
-struct assign_impl<Derived1, Derived2, LinearTraversal, CompleteUnrolling, Version>
-{
-  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
-  {
-    assign_LinearTraversal_CompleteUnrolling<Derived1, Derived2, 0, Derived1::SizeAtCompileTime>
-      ::run(dst, src);
-  }
-};
-
-/**************************
-*** Inner vectorization ***
-**************************/
-
-template<typename Derived1, typename Derived2, int Version>
-struct assign_impl<Derived1, Derived2, InnerVectorizedTraversal, NoUnrolling, Version>
-{
-  typedef typename Derived1::Index Index;
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    const Index innerSize = dst.innerSize();
-    const Index outerSize = dst.outerSize();
-    const Index packetSize = packet_traits<typename Derived1::Scalar>::size;
-    for(Index outer = 0; outer < outerSize; ++outer)
-      for(Index inner = 0; inner < innerSize; inner+=packetSize)
-        dst.template copyPacketByOuterInner<Derived2, Aligned, Aligned>(outer, inner, src);
-  }
-};
-
-template<typename Derived1, typename Derived2, int Version>
-struct assign_impl<Derived1, Derived2, InnerVectorizedTraversal, CompleteUnrolling, Version>
-{
-  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
-  {
-    assign_innervec_CompleteUnrolling<Derived1, Derived2, 0, Derived1::SizeAtCompileTime>
-      ::run(dst, src);
-  }
-};
-
-template<typename Derived1, typename Derived2, int Version>
-struct assign_impl<Derived1, Derived2, InnerVectorizedTraversal, InnerUnrolling, Version>
-{
-  typedef typename Derived1::Index Index;
-  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
-  {
-    const Index outerSize = dst.outerSize();
-    for(Index outer = 0; outer < outerSize; ++outer)
-      assign_innervec_InnerUnrolling<Derived1, Derived2, 0, Derived1::InnerSizeAtCompileTime>
-        ::run(dst, src, outer);
-  }
-};
-
-/***************************
-*** Linear vectorization ***
-***************************/
-
-template <bool IsAligned = false>
-struct unaligned_assign_impl
-{
-  template <typename Derived, typename OtherDerived>
-  static EIGEN_STRONG_INLINE void run(const Derived&, OtherDerived&, typename Derived::Index, typename Derived::Index) {}
-};
-
-template <>
-struct unaligned_assign_impl<false>
-{
-  // MSVC must not inline this functions. If it does, it fails to optimize the
-  // packet access path.
-#ifdef _MSC_VER
-  template <typename Derived, typename OtherDerived>
-  static EIGEN_DONT_INLINE void run(const Derived& src, OtherDerived& dst, typename Derived::Index start, typename Derived::Index end)
-#else
-  template <typename Derived, typename OtherDerived>
-  static EIGEN_STRONG_INLINE void run(const Derived& src, OtherDerived& dst, typename Derived::Index start, typename Derived::Index end)
-#endif
-  {
-    for (typename Derived::Index index = start; index < end; ++index)
-      dst.copyCoeff(index, src);
-  }
-};
-
-template<typename Derived1, typename Derived2, int Version>
-struct assign_impl<Derived1, Derived2, LinearVectorizedTraversal, NoUnrolling, Version>
-{
-  typedef typename Derived1::Index Index;
-  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
-  {
-    const Index size = dst.size();
-    typedef packet_traits<typename Derived1::Scalar> PacketTraits;
-    enum {
-      packetSize = PacketTraits::size,
-      dstAlignment = PacketTraits::AlignedOnScalar ? Aligned : int(assign_traits<Derived1,Derived2>::DstIsAligned) ,
-      srcAlignment = assign_traits<Derived1,Derived2>::JointAlignment
-    };
-    const Index alignedStart = assign_traits<Derived1,Derived2>::DstIsAligned ? 0
-                             : internal::first_aligned(&dst.coeffRef(0), size);
-    const Index alignedEnd = alignedStart + ((size-alignedStart)/packetSize)*packetSize;
-
-    unaligned_assign_impl<assign_traits<Derived1,Derived2>::DstIsAligned!=0>::run(src,dst,0,alignedStart);
-
-    for(Index index = alignedStart; index < alignedEnd; index += packetSize)
-    {
-      dst.template copyPacket<Derived2, dstAlignment, srcAlignment>(index, src);
-    }
-
-    unaligned_assign_impl<>::run(src,dst,alignedEnd,size);
-  }
-};
-
-template<typename Derived1, typename Derived2, int Version>
-struct assign_impl<Derived1, Derived2, LinearVectorizedTraversal, CompleteUnrolling, Version>
-{
-  typedef typename Derived1::Index Index;
-  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
-  {
-    enum { size = Derived1::SizeAtCompileTime,
-           packetSize = packet_traits<typename Derived1::Scalar>::size,
-           alignedSize = (size/packetSize)*packetSize };
-
-    assign_innervec_CompleteUnrolling<Derived1, Derived2, 0, alignedSize>::run(dst, src);
-    assign_DefaultTraversal_CompleteUnrolling<Derived1, Derived2, alignedSize, size>::run(dst, src);
-  }
-};
-
-/**************************
-*** Slice vectorization ***
-***************************/
-
-template<typename Derived1, typename Derived2, int Version>
-struct assign_impl<Derived1, Derived2, SliceVectorizedTraversal, NoUnrolling, Version>
-{
-  typedef typename Derived1::Index Index;
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    typedef typename Derived1::Scalar Scalar;
-    typedef packet_traits<Scalar> PacketTraits;
-    enum {
-      packetSize = PacketTraits::size,
-      alignable = PacketTraits::AlignedOnScalar,
-      dstIsAligned = assign_traits<Derived1,Derived2>::DstIsAligned,
-      dstAlignment = alignable ? Aligned : int(dstIsAligned),
-      srcAlignment = assign_traits<Derived1,Derived2>::JointAlignment
-    };
-    const Scalar *dst_ptr = &dst.coeffRef(0,0);
-    if((!bool(dstIsAligned)) && (Index(dst_ptr) % sizeof(Scalar))>0)
-    {
-      // the pointer is not aligend-on scalar, so alignment is not possible
-      return assign_impl<Derived1,Derived2,DefaultTraversal,NoUnrolling>::run(dst, src);
-    }
-    const Index packetAlignedMask = packetSize - 1;
-    const Index innerSize = dst.innerSize();
-    const Index outerSize = dst.outerSize();
-    const Index alignedStep = alignable ? (packetSize - dst.outerStride() % packetSize) & packetAlignedMask : 0;
-    Index alignedStart = ((!alignable) || bool(dstIsAligned)) ? 0 : internal::first_aligned(dst_ptr, innerSize);
-
-    for(Index outer = 0; outer < outerSize; ++outer)
-    {
-      const Index alignedEnd = alignedStart + ((innerSize-alignedStart) & ~packetAlignedMask);
-      // do the non-vectorizable part of the assignment
-      for(Index inner = 0; inner<alignedStart ; ++inner)
-        dst.copyCoeffByOuterInner(outer, inner, src);
-
-      // do the vectorizable part of the assignment
-      for(Index inner = alignedStart; inner<alignedEnd; inner+=packetSize)
-        dst.template copyPacketByOuterInner<Derived2, dstAlignment, Unaligned>(outer, inner, src);
-
-      // do the non-vectorizable part of the assignment
-      for(Index inner = alignedEnd; inner<innerSize ; ++inner)
-        dst.copyCoeffByOuterInner(outer, inner, src);
-
-      alignedStart = std::min<Index>((alignedStart+alignedStep)%packetSize, innerSize);
-    }
-  }
-};
-
-} // end namespace internal
-
-/***************************************************************************
-* Part 4 : implementation of DenseBase methods
-***************************************************************************/
-
 template<typename Derived>
 template<typename OtherDerived>
 EIGEN_STRONG_INLINE Derived& DenseBase<Derived>
@@ -499,90 +27,62 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>
   EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Derived,OtherDerived)
   EIGEN_STATIC_ASSERT(SameType,YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
 
-#ifdef EIGEN_DEBUG_ASSIGN
-  internal::assign_traits<Derived, OtherDerived>::debug();
-#endif
   eigen_assert(rows() == other.rows() && cols() == other.cols());
-  internal::assign_impl<Derived, OtherDerived, int(SameType) ? int(internal::assign_traits<Derived, OtherDerived>::Traversal)
-                                                       : int(InvalidTraversal)>::run(derived(),other.derived());
-#ifndef EIGEN_NO_DEBUG
-  checkTransposeAliasing(other.derived());
-#endif
+  internal::call_assignment_no_alias(derived(),other.derived());
+  
   return derived();
 }
 
-namespace internal {
-
-template<typename Derived, typename OtherDerived,
-         bool EvalBeforeAssigning = (int(internal::traits<OtherDerived>::Flags) & EvalBeforeAssigningBit) != 0,
-         bool NeedToTranspose = ((int(Derived::RowsAtCompileTime) == 1 && int(OtherDerived::ColsAtCompileTime) == 1)
-                              |   // FIXME | instead of || to please GCC 4.4.0 stupid warning "suggest parentheses around &&".
-                                  // revert to || as soon as not needed anymore.
-                                  (int(Derived::ColsAtCompileTime) == 1 && int(OtherDerived::RowsAtCompileTime) == 1))
-                              && int(Derived::SizeAtCompileTime) != 1>
-struct assign_selector;
-
-template<typename Derived, typename OtherDerived>
-struct assign_selector<Derived,OtherDerived,false,false> {
-  static EIGEN_STRONG_INLINE Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.derived()); }
-  template<typename ActualDerived, typename ActualOtherDerived>
-  static EIGEN_STRONG_INLINE Derived& evalTo(ActualDerived& dst, const ActualOtherDerived& other) { other.evalTo(dst); return dst; }
-};
-template<typename Derived, typename OtherDerived>
-struct assign_selector<Derived,OtherDerived,true,false> {
-  static EIGEN_STRONG_INLINE Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.eval()); }
-};
-template<typename Derived, typename OtherDerived>
-struct assign_selector<Derived,OtherDerived,false,true> {
-  static EIGEN_STRONG_INLINE Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.transpose()); }
-  template<typename ActualDerived, typename ActualOtherDerived>
-  static EIGEN_STRONG_INLINE Derived& evalTo(ActualDerived& dst, const ActualOtherDerived& other) { Transpose<ActualDerived> dstTrans(dst); other.evalTo(dstTrans); return dst; }
-};
-template<typename Derived, typename OtherDerived>
-struct assign_selector<Derived,OtherDerived,true,true> {
-  static EIGEN_STRONG_INLINE Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.transpose().eval()); }
-};
-
-} // end namespace internal
-
 template<typename Derived>
 template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator=(const DenseBase<OtherDerived>& other)
 {
-  return internal::assign_selector<Derived,OtherDerived>::run(derived(), other.derived());
+  internal::call_assignment(derived(), other.derived());
+  return derived();
 }
 
 template<typename Derived>
+EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator=(const DenseBase& other)
 {
-  return internal::assign_selector<Derived,Derived>::run(derived(), other.derived());
+  internal::call_assignment(derived(), other.derived());
+  return derived();
 }
 
 template<typename Derived>
+EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const MatrixBase& other)
 {
-  return internal::assign_selector<Derived,Derived>::run(derived(), other.derived());
+  internal::call_assignment(derived(), other.derived());
+  return derived();
 }
 
 template<typename Derived>
 template <typename OtherDerived>
+EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const DenseBase<OtherDerived>& other)
 {
-  return internal::assign_selector<Derived,OtherDerived>::run(derived(), other.derived());
+  internal::call_assignment(derived(), other.derived());
+  return derived();
 }
 
 template<typename Derived>
 template <typename OtherDerived>
+EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const EigenBase<OtherDerived>& other)
 {
-  return internal::assign_selector<Derived,OtherDerived,false>::evalTo(derived(), other.derived());
+  internal::call_assignment(derived(), other.derived());
+  return derived();
 }
 
 template<typename Derived>
 template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other)
 {
-  return internal::assign_selector<Derived,OtherDerived,false>::evalTo(derived(), other.derived());
+  other.derived().evalTo(derived());
+  return derived();
 }
 
 } // end namespace Eigen
diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h
new file mode 100644
index 000000000..b0ec7b7ca
--- /dev/null
+++ b/Eigen/src/Core/AssignEvaluator.h
@@ -0,0 +1,935 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2011-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2011-2012 Jitse Niesen <jitse@maths.leeds.ac.uk>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ASSIGN_EVALUATOR_H
+#define EIGEN_ASSIGN_EVALUATOR_H
+
+namespace Eigen {
+
+// This implementation is based on Assign.h
+
+namespace internal {
+  
+/***************************************************************************
+* Part 1 : the logic deciding a strategy for traversal and unrolling       *
+***************************************************************************/
+
+// copy_using_evaluator_traits is based on assign_traits
+
+template <typename DstEvaluator, typename SrcEvaluator, typename AssignFunc>
+struct copy_using_evaluator_traits
+{
+  typedef typename DstEvaluator::XprType Dst;
+  typedef typename Dst::Scalar DstScalar;
+  
+  enum {
+    DstFlags = DstEvaluator::Flags,
+    SrcFlags = SrcEvaluator::Flags
+  };
+  
+public:
+  enum {
+    DstAlignment = DstEvaluator::Alignment,
+    SrcAlignment = SrcEvaluator::Alignment,
+    DstHasDirectAccess = DstFlags & DirectAccessBit,
+    JointAlignment = EIGEN_PLAIN_ENUM_MIN(DstAlignment,SrcAlignment)
+  };
+
+private:
+  enum {
+    InnerSize = int(Dst::IsVectorAtCompileTime) ? int(Dst::SizeAtCompileTime)
+              : int(DstFlags)&RowMajorBit ? int(Dst::ColsAtCompileTime)
+              : int(Dst::RowsAtCompileTime),
+    InnerMaxSize = int(Dst::IsVectorAtCompileTime) ? int(Dst::MaxSizeAtCompileTime)
+              : int(DstFlags)&RowMajorBit ? int(Dst::MaxColsAtCompileTime)
+              : int(Dst::MaxRowsAtCompileTime),
+    OuterStride = int(outer_stride_at_compile_time<Dst>::ret),
+    MaxSizeAtCompileTime = Dst::SizeAtCompileTime
+  };
+
+  // TODO distinguish between linear traversal and inner-traversals
+  typedef typename find_best_packet<DstScalar,Dst::SizeAtCompileTime>::type LinearPacketType;
+  typedef typename find_best_packet<DstScalar,InnerSize>::type InnerPacketType;
+
+  enum {
+    LinearPacketSize = unpacket_traits<LinearPacketType>::size,
+    InnerPacketSize = unpacket_traits<InnerPacketType>::size
+  };
+
+public:
+  enum {
+    LinearRequiredAlignment = unpacket_traits<LinearPacketType>::alignment,
+    InnerRequiredAlignment = unpacket_traits<InnerPacketType>::alignment
+  };
+
+private:
+  enum {
+    DstIsRowMajor = DstFlags&RowMajorBit,
+    SrcIsRowMajor = SrcFlags&RowMajorBit,
+    StorageOrdersAgree = (int(DstIsRowMajor) == int(SrcIsRowMajor)),
+    MightVectorize = bool(StorageOrdersAgree)
+                  && (int(DstFlags) & int(SrcFlags) & ActualPacketAccessBit)
+                  && bool(functor_traits<AssignFunc>::PacketAccess),
+    MayInnerVectorize  = MightVectorize
+                       && int(InnerSize)!=Dynamic && int(InnerSize)%int(InnerPacketSize)==0
+                       && int(OuterStride)!=Dynamic && int(OuterStride)%int(InnerPacketSize)==0
+                       && (EIGEN_UNALIGNED_VECTORIZE  || int(JointAlignment)>=int(InnerRequiredAlignment)),
+    MayLinearize = bool(StorageOrdersAgree) && (int(DstFlags) & int(SrcFlags) & LinearAccessBit),
+    MayLinearVectorize = bool(MightVectorize) && MayLinearize && DstHasDirectAccess
+                       && (EIGEN_UNALIGNED_VECTORIZE || (int(DstAlignment)>=int(LinearRequiredAlignment)) || MaxSizeAtCompileTime == Dynamic),
+      /* If the destination isn't aligned, we have to do runtime checks and we don't unroll,
+         so it's only good for large enough sizes. */
+    MaySliceVectorize  = bool(MightVectorize) && bool(DstHasDirectAccess)
+                       && (int(InnerMaxSize)==Dynamic || int(InnerMaxSize)>=(EIGEN_UNALIGNED_VECTORIZE?InnerPacketSize:(3*InnerPacketSize)))
+      /* slice vectorization can be slow, so we only want it if the slices are big, which is
+         indicated by InnerMaxSize rather than InnerSize, think of the case of a dynamic block
+         in a fixed-size matrix
+         However, with EIGEN_UNALIGNED_VECTORIZE and unrolling, slice vectorization is still worth it */
+  };
+
+public:
+  enum {
+    Traversal = int(MayLinearVectorize) && (LinearPacketSize>InnerPacketSize) ? int(LinearVectorizedTraversal)
+              : int(MayInnerVectorize)   ? int(InnerVectorizedTraversal)
+              : int(MayLinearVectorize)  ? int(LinearVectorizedTraversal)
+              : int(MaySliceVectorize)   ? int(SliceVectorizedTraversal)
+              : int(MayLinearize)        ? int(LinearTraversal)
+                                         : int(DefaultTraversal),
+    Vectorized = int(Traversal) == InnerVectorizedTraversal
+              || int(Traversal) == LinearVectorizedTraversal
+              || int(Traversal) == SliceVectorizedTraversal
+  };
+
+  typedef typename conditional<int(Traversal)==LinearVectorizedTraversal, LinearPacketType, InnerPacketType>::type PacketType;
+
+private:
+  enum {
+    ActualPacketSize    = int(Traversal)==LinearVectorizedTraversal ? LinearPacketSize
+                        : Vectorized ? InnerPacketSize
+                        : 1,
+    UnrollingLimit      = EIGEN_UNROLLING_LIMIT * ActualPacketSize,
+    MayUnrollCompletely = int(Dst::SizeAtCompileTime) != Dynamic
+                       && int(Dst::SizeAtCompileTime) * (int(DstEvaluator::CoeffReadCost)+int(SrcEvaluator::CoeffReadCost)) <= int(UnrollingLimit),
+    MayUnrollInner      = int(InnerSize) != Dynamic
+                       && int(InnerSize) * (int(DstEvaluator::CoeffReadCost)+int(SrcEvaluator::CoeffReadCost)) <= int(UnrollingLimit)
+  };
+
+public:
+  enum {
+    Unrolling = (int(Traversal) == int(InnerVectorizedTraversal) || int(Traversal) == int(DefaultTraversal))
+                ? (
+                    int(MayUnrollCompletely) ? int(CompleteUnrolling)
+                  : int(MayUnrollInner)      ? int(InnerUnrolling)
+                                             : int(NoUnrolling)
+                  )
+              : int(Traversal) == int(LinearVectorizedTraversal)
+                ? ( bool(MayUnrollCompletely) && ( EIGEN_UNALIGNED_VECTORIZE || (int(DstAlignment)>=int(LinearRequiredAlignment)))
+                          ? int(CompleteUnrolling)
+                          : int(NoUnrolling) )
+              : int(Traversal) == int(LinearTraversal)
+                ? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling) 
+                                              : int(NoUnrolling) )
+#if EIGEN_UNALIGNED_VECTORIZE
+              : int(Traversal) == int(SliceVectorizedTraversal)
+                ? ( bool(MayUnrollInner) ? int(InnerUnrolling)
+                                         : int(NoUnrolling) )
+#endif
+              : int(NoUnrolling)
+  };
+
+#ifdef EIGEN_DEBUG_ASSIGN
+  static void debug()
+  {
+    std::cerr << "DstXpr: " << typeid(typename DstEvaluator::XprType).name() << std::endl;
+    std::cerr << "SrcXpr: " << typeid(typename SrcEvaluator::XprType).name() << std::endl;
+    std::cerr.setf(std::ios::hex, std::ios::basefield);
+    std::cerr << "DstFlags" << " = " << DstFlags << " (" << demangle_flags(DstFlags) << " )" << std::endl;
+    std::cerr << "SrcFlags" << " = " << SrcFlags << " (" << demangle_flags(SrcFlags) << " )" << std::endl;
+    std::cerr.unsetf(std::ios::hex);
+    EIGEN_DEBUG_VAR(DstAlignment)
+    EIGEN_DEBUG_VAR(SrcAlignment)
+    EIGEN_DEBUG_VAR(LinearRequiredAlignment)
+    EIGEN_DEBUG_VAR(InnerRequiredAlignment)
+    EIGEN_DEBUG_VAR(JointAlignment)
+    EIGEN_DEBUG_VAR(InnerSize)
+    EIGEN_DEBUG_VAR(InnerMaxSize)
+    EIGEN_DEBUG_VAR(LinearPacketSize)
+    EIGEN_DEBUG_VAR(InnerPacketSize)
+    EIGEN_DEBUG_VAR(ActualPacketSize)
+    EIGEN_DEBUG_VAR(StorageOrdersAgree)
+    EIGEN_DEBUG_VAR(MightVectorize)
+    EIGEN_DEBUG_VAR(MayLinearize)
+    EIGEN_DEBUG_VAR(MayInnerVectorize)
+    EIGEN_DEBUG_VAR(MayLinearVectorize)
+    EIGEN_DEBUG_VAR(MaySliceVectorize)
+    std::cerr << "Traversal" << " = " << Traversal << " (" << demangle_traversal(Traversal) << ")" << std::endl;
+    EIGEN_DEBUG_VAR(SrcEvaluator::CoeffReadCost)
+    EIGEN_DEBUG_VAR(UnrollingLimit)
+    EIGEN_DEBUG_VAR(MayUnrollCompletely)
+    EIGEN_DEBUG_VAR(MayUnrollInner)
+    std::cerr << "Unrolling" << " = " << Unrolling << " (" << demangle_unrolling(Unrolling) << ")" << std::endl;
+    std::cerr << std::endl;
+  }
+#endif
+};
+
+/***************************************************************************
+* Part 2 : meta-unrollers
+***************************************************************************/
+
+/************************
+*** Default traversal ***
+************************/
+
+template<typename Kernel, int Index, int Stop>
+struct copy_using_evaluator_DefaultTraversal_CompleteUnrolling
+{
+  // FIXME: this is not very clean, perhaps this information should be provided by the kernel?
+  typedef typename Kernel::DstEvaluatorType DstEvaluatorType;
+  typedef typename DstEvaluatorType::XprType DstXprType;
+  
+  enum {
+    outer = Index / DstXprType::InnerSizeAtCompileTime,
+    inner = Index % DstXprType::InnerSizeAtCompileTime
+  };
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  {
+    kernel.assignCoeffByOuterInner(outer, inner);
+    copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, Index+1, Stop>::run(kernel);
+  }
+};
+
+template<typename Kernel, int Stop>
+struct copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, Stop, Stop>
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&) { }
+};
+
+template<typename Kernel, int Index_, int Stop>
+struct copy_using_evaluator_DefaultTraversal_InnerUnrolling
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel, Index outer)
+  {
+    kernel.assignCoeffByOuterInner(outer, Index_);
+    copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, Index_+1, Stop>::run(kernel, outer);
+  }
+};
+
+template<typename Kernel, int Stop>
+struct copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, Stop, Stop>
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&, Index) { }
+};
+
+/***********************
+*** Linear traversal ***
+***********************/
+
+template<typename Kernel, int Index, int Stop>
+struct copy_using_evaluator_LinearTraversal_CompleteUnrolling
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel)
+  {
+    kernel.assignCoeff(Index);
+    copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, Index+1, Stop>::run(kernel);
+  }
+};
+
+template<typename Kernel, int Stop>
+struct copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, Stop, Stop>
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&) { }
+};
+
+/**************************
+*** Inner vectorization ***
+**************************/
+
+template<typename Kernel, int Index, int Stop>
+struct copy_using_evaluator_innervec_CompleteUnrolling
+{
+  // FIXME: this is not very clean, perhaps this information should be provided by the kernel?
+  typedef typename Kernel::DstEvaluatorType DstEvaluatorType;
+  typedef typename DstEvaluatorType::XprType DstXprType;
+  typedef typename Kernel::PacketType PacketType;
+  
+  enum {
+    outer = Index / DstXprType::InnerSizeAtCompileTime,
+    inner = Index % DstXprType::InnerSizeAtCompileTime,
+    SrcAlignment = Kernel::AssignmentTraits::SrcAlignment,
+    DstAlignment = Kernel::AssignmentTraits::DstAlignment
+  };
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  {
+    kernel.template assignPacketByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, inner);
+    enum { NextIndex = Index + unpacket_traits<PacketType>::size };
+    copy_using_evaluator_innervec_CompleteUnrolling<Kernel, NextIndex, Stop>::run(kernel);
+  }
+};
+
+template<typename Kernel, int Stop>
+struct copy_using_evaluator_innervec_CompleteUnrolling<Kernel, Stop, Stop>
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&) { }
+};
+
+template<typename Kernel, int Index_, int Stop, int SrcAlignment, int DstAlignment>
+struct copy_using_evaluator_innervec_InnerUnrolling
+{
+  typedef typename Kernel::PacketType PacketType;
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel, Index outer)
+  {
+    kernel.template assignPacketByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, Index_);
+    enum { NextIndex = Index_ + unpacket_traits<PacketType>::size };
+    copy_using_evaluator_innervec_InnerUnrolling<Kernel, NextIndex, Stop, SrcAlignment, DstAlignment>::run(kernel, outer);
+  }
+};
+
+template<typename Kernel, int Stop, int SrcAlignment, int DstAlignment>
+struct copy_using_evaluator_innervec_InnerUnrolling<Kernel, Stop, Stop, SrcAlignment, DstAlignment>
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &, Index) { }
+};
+
+/***************************************************************************
+* Part 3 : implementation of all cases
+***************************************************************************/
+
+// dense_assignment_loop is based on assign_impl
+
+template<typename Kernel,
+         int Traversal = Kernel::AssignmentTraits::Traversal,
+         int Unrolling = Kernel::AssignmentTraits::Unrolling>
+struct dense_assignment_loop;
+
+/************************
+*** Default traversal ***
+************************/
+
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, DefaultTraversal, NoUnrolling>
+{
+  EIGEN_DEVICE_FUNC static void EIGEN_STRONG_INLINE run(Kernel &kernel)
+  {
+    for(Index outer = 0; outer < kernel.outerSize(); ++outer) {
+      for(Index inner = 0; inner < kernel.innerSize(); ++inner) {
+        kernel.assignCoeffByOuterInner(outer, inner);
+      }
+    }
+  }
+};
+
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, DefaultTraversal, CompleteUnrolling>
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  {
+    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
+    copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, 0, DstXprType::SizeAtCompileTime>::run(kernel);
+  }
+};
+
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, DefaultTraversal, InnerUnrolling>
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  {
+    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
+
+    const Index outerSize = kernel.outerSize();
+    for(Index outer = 0; outer < outerSize; ++outer)
+      copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, 0, DstXprType::InnerSizeAtCompileTime>::run(kernel, outer);
+  }
+};
+
+/***************************
+*** Linear vectorization ***
+***************************/
+
+
+// The goal of unaligned_dense_assignment_loop is simply to factorize the handling
+// of the non vectorizable beginning and ending parts
+
+template <bool IsAligned = false>
+struct unaligned_dense_assignment_loop
+{
+  // if IsAligned = true, then do nothing
+  template <typename Kernel>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&, Index, Index) {}
+};
+
+template <>
+struct unaligned_dense_assignment_loop<false>
+{
+  // MSVC must not inline this functions. If it does, it fails to optimize the
+  // packet access path.
+  // FIXME check which version exhibits this issue
+#if EIGEN_COMP_MSVC
+  template <typename Kernel>
+  static EIGEN_DONT_INLINE void run(Kernel &kernel,
+                                    Index start,
+                                    Index end)
+#else
+  template <typename Kernel>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel,
+                                      Index start,
+                                      Index end)
+#endif
+  {
+    for (Index index = start; index < end; ++index)
+      kernel.assignCoeff(index);
+  }
+};
+
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, NoUnrolling>
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  {
+    const Index size = kernel.size();
+    typedef typename Kernel::Scalar Scalar;
+    typedef typename Kernel::PacketType PacketType;
+    enum {
+      requestedAlignment = Kernel::AssignmentTraits::LinearRequiredAlignment,
+      packetSize = unpacket_traits<PacketType>::size,
+      dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(requestedAlignment),
+      dstAlignment = packet_traits<Scalar>::AlignedOnScalar ? int(requestedAlignment)
+                                                            : int(Kernel::AssignmentTraits::DstAlignment),
+      srcAlignment = Kernel::AssignmentTraits::JointAlignment
+    };
+    const Index alignedStart = dstIsAligned ? 0 : internal::first_aligned<requestedAlignment>(kernel.dstDataPtr(), size);
+    const Index alignedEnd = alignedStart + ((size-alignedStart)/packetSize)*packetSize;
+
+    unaligned_dense_assignment_loop<dstIsAligned!=0>::run(kernel, 0, alignedStart);
+
+    for(Index index = alignedStart; index < alignedEnd; index += packetSize)
+      kernel.template assignPacket<dstAlignment, srcAlignment, PacketType>(index);
+
+    unaligned_dense_assignment_loop<>::run(kernel, alignedEnd, size);
+  }
+};
+
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, CompleteUnrolling>
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  {
+    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
+    typedef typename Kernel::PacketType PacketType;
+    
+    enum { size = DstXprType::SizeAtCompileTime,
+           packetSize =unpacket_traits<PacketType>::size,
+           alignedSize = (size/packetSize)*packetSize };
+
+    copy_using_evaluator_innervec_CompleteUnrolling<Kernel, 0, alignedSize>::run(kernel);
+    copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, alignedSize, size>::run(kernel);
+  }
+};
+
+/**************************
+*** Inner vectorization ***
+**************************/
+
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, NoUnrolling>
+{
+  typedef typename Kernel::PacketType PacketType;
+  enum {
+    SrcAlignment = Kernel::AssignmentTraits::SrcAlignment,
+    DstAlignment = Kernel::AssignmentTraits::DstAlignment
+  };
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  {
+    const Index innerSize = kernel.innerSize();
+    const Index outerSize = kernel.outerSize();
+    const Index packetSize = unpacket_traits<PacketType>::size;
+    for(Index outer = 0; outer < outerSize; ++outer)
+      for(Index inner = 0; inner < innerSize; inner+=packetSize)
+        kernel.template assignPacketByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, inner);
+  }
+};
+
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, CompleteUnrolling>
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  {
+    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
+    copy_using_evaluator_innervec_CompleteUnrolling<Kernel, 0, DstXprType::SizeAtCompileTime>::run(kernel);
+  }
+};
+
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, InnerUnrolling>
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  {
+    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
+    typedef typename Kernel::AssignmentTraits Traits;
+    const Index outerSize = kernel.outerSize();
+    for(Index outer = 0; outer < outerSize; ++outer)
+      copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, DstXprType::InnerSizeAtCompileTime,
+                                                   Traits::SrcAlignment, Traits::DstAlignment>::run(kernel, outer);
+  }
+};
+
+/***********************
+*** Linear traversal ***
+***********************/
+
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, LinearTraversal, NoUnrolling>
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  {
+    const Index size = kernel.size();
+    for(Index i = 0; i < size; ++i)
+      kernel.assignCoeff(i);
+  }
+};
+
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, LinearTraversal, CompleteUnrolling>
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  {
+    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
+    copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, 0, DstXprType::SizeAtCompileTime>::run(kernel);
+  }
+};
+
+/**************************
+*** Slice vectorization ***
+***************************/
+
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling>
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  {
+    typedef typename Kernel::Scalar Scalar;
+    typedef typename Kernel::PacketType PacketType;
+    enum {
+      packetSize = unpacket_traits<PacketType>::size,
+      requestedAlignment = int(Kernel::AssignmentTraits::InnerRequiredAlignment),
+      alignable = packet_traits<Scalar>::AlignedOnScalar || int(Kernel::AssignmentTraits::DstAlignment)>=sizeof(Scalar),
+      dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(requestedAlignment),
+      dstAlignment = alignable ? int(requestedAlignment)
+                               : int(Kernel::AssignmentTraits::DstAlignment)
+    };
+    const Scalar *dst_ptr = kernel.dstDataPtr();
+    if((!bool(dstIsAligned)) && (UIntPtr(dst_ptr) % sizeof(Scalar))>0)
+    {
+      // the pointer is not aligend-on scalar, so alignment is not possible
+      return dense_assignment_loop<Kernel,DefaultTraversal,NoUnrolling>::run(kernel);
+    }
+    const Index packetAlignedMask = packetSize - 1;
+    const Index innerSize = kernel.innerSize();
+    const Index outerSize = kernel.outerSize();
+    const Index alignedStep = alignable ? (packetSize - kernel.outerStride() % packetSize) & packetAlignedMask : 0;
+    Index alignedStart = ((!alignable) || bool(dstIsAligned)) ? 0 : internal::first_aligned<requestedAlignment>(dst_ptr, innerSize);
+
+    for(Index outer = 0; outer < outerSize; ++outer)
+    {
+      const Index alignedEnd = alignedStart + ((innerSize-alignedStart) & ~packetAlignedMask);
+      // do the non-vectorizable part of the assignment
+      for(Index inner = 0; inner<alignedStart ; ++inner)
+        kernel.assignCoeffByOuterInner(outer, inner);
+
+      // do the vectorizable part of the assignment
+      for(Index inner = alignedStart; inner<alignedEnd; inner+=packetSize)
+        kernel.template assignPacketByOuterInner<dstAlignment, Unaligned, PacketType>(outer, inner);
+
+      // do the non-vectorizable part of the assignment
+      for(Index inner = alignedEnd; inner<innerSize ; ++inner)
+        kernel.assignCoeffByOuterInner(outer, inner);
+
+      alignedStart = numext::mini((alignedStart+alignedStep)%packetSize, innerSize);
+    }
+  }
+};
+
+#if EIGEN_UNALIGNED_VECTORIZE
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, InnerUnrolling>
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  {
+    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
+    typedef typename Kernel::PacketType PacketType;
+
+    enum { size = DstXprType::InnerSizeAtCompileTime,
+           packetSize =unpacket_traits<PacketType>::size,
+           vectorizableSize = (size/packetSize)*packetSize };
+
+    for(Index outer = 0; outer < kernel.outerSize(); ++outer)
+    {
+      copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, vectorizableSize, 0, 0>::run(kernel, outer);
+      copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, vectorizableSize, size>::run(kernel, outer);
+    }
+  }
+};
+#endif
+
+
+/***************************************************************************
+* Part 4 : Generic dense assignment kernel
+***************************************************************************/
+
+// This class generalize the assignment of a coefficient (or packet) from one dense evaluator
+// to another dense writable evaluator.
+// It is parametrized by the two evaluators, and the actual assignment functor.
+// This abstraction level permits to keep the evaluation loops as simple and as generic as possible.
+// One can customize the assignment using this generic dense_assignment_kernel with different
+// functors, or by completely overloading it, by-passing a functor.
+template<typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT, typename Functor, int Version = Specialized>
+class generic_dense_assignment_kernel
+{
+protected:
+  typedef typename DstEvaluatorTypeT::XprType DstXprType;
+  typedef typename SrcEvaluatorTypeT::XprType SrcXprType;
+public:
+  
+  typedef DstEvaluatorTypeT DstEvaluatorType;
+  typedef SrcEvaluatorTypeT SrcEvaluatorType;
+  typedef typename DstEvaluatorType::Scalar Scalar;
+  typedef copy_using_evaluator_traits<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor> AssignmentTraits;
+  typedef typename AssignmentTraits::PacketType PacketType;
+  
+  
+  EIGEN_DEVICE_FUNC generic_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
+    : m_dst(dst), m_src(src), m_functor(func), m_dstExpr(dstExpr)
+  {
+    #ifdef EIGEN_DEBUG_ASSIGN
+    AssignmentTraits::debug();
+    #endif
+  }
+  
+  EIGEN_DEVICE_FUNC Index size() const        { return m_dstExpr.size(); }
+  EIGEN_DEVICE_FUNC Index innerSize() const   { return m_dstExpr.innerSize(); }
+  EIGEN_DEVICE_FUNC Index outerSize() const   { return m_dstExpr.outerSize(); }
+  EIGEN_DEVICE_FUNC Index rows() const        { return m_dstExpr.rows(); }
+  EIGEN_DEVICE_FUNC Index cols() const        { return m_dstExpr.cols(); }
+  EIGEN_DEVICE_FUNC Index outerStride() const { return m_dstExpr.outerStride(); }
+  
+  EIGEN_DEVICE_FUNC DstEvaluatorType& dstEvaluator() { return m_dst; }
+  EIGEN_DEVICE_FUNC const SrcEvaluatorType& srcEvaluator() const { return m_src; }
+  
+  /// Assign src(row,col) to dst(row,col) through the assignment functor.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Index row, Index col)
+  {
+    m_functor.assignCoeff(m_dst.coeffRef(row,col), m_src.coeff(row,col));
+  }
+  
+  /// \sa assignCoeff(Index,Index)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Index index)
+  {
+    m_functor.assignCoeff(m_dst.coeffRef(index), m_src.coeff(index));
+  }
+  
+  /// \sa assignCoeff(Index,Index)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeffByOuterInner(Index outer, Index inner)
+  {
+    Index row = rowIndexByOuterInner(outer, inner); 
+    Index col = colIndexByOuterInner(outer, inner); 
+    assignCoeff(row, col);
+  }
+  
+  
+  template<int StoreMode, int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacket(Index row, Index col)
+  {
+    m_functor.template assignPacket<StoreMode>(&m_dst.coeffRef(row,col), m_src.template packet<LoadMode,PacketType>(row,col));
+  }
+  
+  template<int StoreMode, int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacket(Index index)
+  {
+    m_functor.template assignPacket<StoreMode>(&m_dst.coeffRef(index), m_src.template packet<LoadMode,PacketType>(index));
+  }
+  
+  template<int StoreMode, int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacketByOuterInner(Index outer, Index inner)
+  {
+    Index row = rowIndexByOuterInner(outer, inner); 
+    Index col = colIndexByOuterInner(outer, inner);
+    assignPacket<StoreMode,LoadMode,PacketType>(row, col);
+  }
+  
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index rowIndexByOuterInner(Index outer, Index inner)
+  {
+    typedef typename DstEvaluatorType::ExpressionTraits Traits;
+    return int(Traits::RowsAtCompileTime) == 1 ? 0
+      : int(Traits::ColsAtCompileTime) == 1 ? inner
+      : int(DstEvaluatorType::Flags)&RowMajorBit ? outer
+      : inner;
+  }
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index colIndexByOuterInner(Index outer, Index inner)
+  {
+    typedef typename DstEvaluatorType::ExpressionTraits Traits;
+    return int(Traits::ColsAtCompileTime) == 1 ? 0
+      : int(Traits::RowsAtCompileTime) == 1 ? inner
+      : int(DstEvaluatorType::Flags)&RowMajorBit ? inner
+      : outer;
+  }
+
+  EIGEN_DEVICE_FUNC const Scalar* dstDataPtr() const
+  {
+    return m_dstExpr.data();
+  }
+  
+protected:
+  DstEvaluatorType& m_dst;
+  const SrcEvaluatorType& m_src;
+  const Functor &m_functor;
+  // TODO find a way to avoid the needs of the original expression
+  DstXprType& m_dstExpr;
+};
+
+/***************************************************************************
+* Part 5 : Entry point for dense rectangular assignment
+***************************************************************************/
+
+template<typename DstXprType,typename SrcXprType, typename Functor>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void resize_if_allowed(DstXprType &dst, const SrcXprType& src, const Functor &/*func*/)
+{
+  EIGEN_ONLY_USED_FOR_DEBUG(dst);
+  EIGEN_ONLY_USED_FOR_DEBUG(src);
+  eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
+}
+
+template<typename DstXprType,typename SrcXprType, typename T1, typename T2>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void resize_if_allowed(DstXprType &dst, const SrcXprType& src, const internal::assign_op<T1,T2> &/*func*/)
+{
+  Index dstRows = src.rows();
+  Index dstCols = src.cols();
+  if(((dst.rows()!=dstRows) || (dst.cols()!=dstCols)))
+    dst.resize(dstRows, dstCols);
+  eigen_assert(dst.rows() == dstRows && dst.cols() == dstCols);
+}
+
+template<typename DstXprType, typename SrcXprType, typename Functor>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src, const Functor &func)
+{
+  typedef evaluator<DstXprType> DstEvaluatorType;
+  typedef evaluator<SrcXprType> SrcEvaluatorType;
+
+  SrcEvaluatorType srcEvaluator(src);
+
+  // NOTE To properly handle A = (A*A.transpose())/s with A rectangular,
+  // we need to resize the destination after the source evaluator has been created.
+  resize_if_allowed(dst, src, func);
+
+  DstEvaluatorType dstEvaluator(dst);
+    
+  typedef generic_dense_assignment_kernel<DstEvaluatorType,SrcEvaluatorType,Functor> Kernel;
+  Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());
+
+  dense_assignment_loop<Kernel>::run(kernel);
+}
+
+template<typename DstXprType, typename SrcXprType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src)
+{
+  call_dense_assignment_loop(dst, src, internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar>());
+}
+
+/***************************************************************************
+* Part 6 : Generic assignment
+***************************************************************************/
+
+// Based on the respective shapes of the destination and source,
+// the class AssignmentKind determine the kind of assignment mechanism.
+// AssignmentKind must define a Kind typedef.
+template<typename DstShape, typename SrcShape> struct AssignmentKind;
+
+// Assignement kind defined in this file:
+struct Dense2Dense {};
+struct EigenBase2EigenBase {};
+
+template<typename,typename> struct AssignmentKind { typedef EigenBase2EigenBase Kind; };
+template<> struct AssignmentKind<DenseShape,DenseShape> { typedef Dense2Dense Kind; };
+    
+// This is the main assignment class
+template< typename DstXprType, typename SrcXprType, typename Functor,
+          typename Kind = typename AssignmentKind< typename evaluator_traits<DstXprType>::Shape , typename evaluator_traits<SrcXprType>::Shape >::Kind,
+          typename EnableIf = void>
+struct Assignment;
+
+
+// The only purpose of this call_assignment() function is to deal with noalias() / "assume-aliasing" and automatic transposition.
+// Indeed, I (Gael) think that this concept of "assume-aliasing" was a mistake, and it makes thing quite complicated.
+// So this intermediate function removes everything related to "assume-aliasing" such that Assignment
+// does not has to bother about these annoying details.
+
+template<typename Dst, typename Src>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_assignment(Dst& dst, const Src& src)
+{
+  call_assignment(dst, src, internal::assign_op<typename Dst::Scalar,typename Src::Scalar>());
+}
+template<typename Dst, typename Src>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_assignment(const Dst& dst, const Src& src)
+{
+  call_assignment(dst, src, internal::assign_op<typename Dst::Scalar,typename Src::Scalar>());
+}
+                     
+// Deal with "assume-aliasing"
+template<typename Dst, typename Src, typename Func>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if< evaluator_assume_aliasing<Src>::value, void*>::type = 0)
+{
+  typename plain_matrix_type<Src>::type tmp(src);
+  call_assignment_no_alias(dst, tmp, func);
+}
+
+template<typename Dst, typename Src, typename Func>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if<!evaluator_assume_aliasing<Src>::value, void*>::type = 0)
+{
+  call_assignment_no_alias(dst, src, func);
+}
+
+// by-pass "assume-aliasing"
+// When there is no aliasing, we require that 'dst' has been properly resized
+template<typename Dst, template <typename> class StorageBase, typename Src, typename Func>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_assignment(NoAlias<Dst,StorageBase>& dst, const Src& src, const Func& func)
+{
+  call_assignment_no_alias(dst.expression(), src, func);
+}
+
+
+template<typename Dst, typename Src, typename Func>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_assignment_no_alias(Dst& dst, const Src& src, const Func& func)
+{
+  enum {
+    NeedToTranspose = (    (int(Dst::RowsAtCompileTime) == 1 && int(Src::ColsAtCompileTime) == 1)
+                        || (int(Dst::ColsAtCompileTime) == 1 && int(Src::RowsAtCompileTime) == 1)
+                      ) && int(Dst::SizeAtCompileTime) != 1
+  };
+
+  typedef typename internal::conditional<NeedToTranspose, Transpose<Dst>, Dst>::type ActualDstTypeCleaned;
+  typedef typename internal::conditional<NeedToTranspose, Transpose<Dst>, Dst&>::type ActualDstType;
+  ActualDstType actualDst(dst);
+  
+  // TODO check whether this is the right place to perform these checks:
+  EIGEN_STATIC_ASSERT_LVALUE(Dst)
+  EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(ActualDstTypeCleaned,Src)
+  EIGEN_CHECK_BINARY_COMPATIBILIY(Func,typename ActualDstTypeCleaned::Scalar,typename Src::Scalar);
+  
+  Assignment<ActualDstTypeCleaned,Src,Func>::run(actualDst, src, func);
+}
+template<typename Dst, typename Src>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_assignment_no_alias(Dst& dst, const Src& src)
+{
+  call_assignment_no_alias(dst, src, internal::assign_op<typename Dst::Scalar,typename Src::Scalar>());
+}
+
+template<typename Dst, typename Src, typename Func>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src, const Func& func)
+{
+  // TODO check whether this is the right place to perform these checks:
+  EIGEN_STATIC_ASSERT_LVALUE(Dst)
+  EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Dst,Src)
+  EIGEN_CHECK_BINARY_COMPATIBILIY(Func,typename Dst::Scalar,typename Src::Scalar);
+
+  Assignment<Dst,Src,Func>::run(dst, src, func);
+}
+template<typename Dst, typename Src>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src)
+{
+  call_assignment_no_alias_no_transpose(dst, src, internal::assign_op<typename Dst::Scalar,typename Src::Scalar>());
+}
+
+// forward declaration
+template<typename Dst, typename Src> void check_for_aliasing(const Dst &dst, const Src &src);
+
+// Generic Dense to Dense assignment
+// Note that the last template argument "Weak" is needed to make it possible to perform
+// both partial specialization+SFINAE without ambiguous specialization
+template< typename DstXprType, typename SrcXprType, typename Functor, typename Weak>
+struct Assignment<DstXprType, SrcXprType, Functor, Dense2Dense, Weak>
+{
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
+  {
+#ifndef EIGEN_NO_DEBUG
+    internal::check_for_aliasing(dst, src);
+#endif
+    
+    call_dense_assignment_loop(dst, src, func);
+  }
+};
+
+// Generic assignment through evalTo.
+// TODO: not sure we have to keep that one, but it helps porting current code to new evaluator mechanism.
+// Note that the last template argument "Weak" is needed to make it possible to perform
+// both partial specialization+SFINAE without ambiguous specialization
+template< typename DstXprType, typename SrcXprType, typename Functor, typename Weak>
+struct Assignment<DstXprType, SrcXprType, Functor, EigenBase2EigenBase, Weak>
+{
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
+  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
+
+    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
+    src.evalTo(dst);
+  }
+
+  // NOTE The following two functions are templated to avoid their instanciation if not needed
+  //      This is needed because some expressions supports evalTo only and/or have 'void' as scalar type.
+  template<typename SrcScalarType>
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar,SrcScalarType> &/*func*/)
+  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
+
+    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
+    src.addTo(dst);
+  }
+
+  template<typename SrcScalarType>
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar,SrcScalarType> &/*func*/)
+  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
+
+    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
+    src.subTo(dst);
+  }
+};
+
+} // namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_ASSIGN_EVALUATOR_H
diff --git a/Eigen/src/Core/Assign_MKL.h b/Eigen/src/Core/Assign_MKL.h
index 7772951b9..6c2ab9264 100644..100755
--- a/Eigen/src/Core/Assign_MKL.h
+++ b/Eigen/src/Core/Assign_MKL.h
@@ -1,6 +1,7 @@
 /*
  Copyright (c) 2011, Intel Corporation. All rights reserved.
-
+ Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+ 
  Redistribution and use in source and binary forms, with or without modification,
  are permitted provided that the following conditions are met:
 
@@ -37,17 +38,13 @@ namespace Eigen {
 
 namespace internal {
 
-template<typename Op> struct vml_call
-{ enum { IsSupported = 0 }; };
-
-template<typename Dst, typename Src, typename UnaryOp>
+template<typename Dst, typename Src>
 class vml_assign_traits
 {
   private:
     enum {
       DstHasDirectAccess = Dst::Flags & DirectAccessBit,
       SrcHasDirectAccess = Src::Flags & DirectAccessBit,
-
       StorageOrdersAgree = (int(Dst::IsRowMajor) == int(Src::IsRowMajor)),
       InnerSize = int(Dst::IsVectorAtCompileTime) ? int(Dst::SizeAtCompileTime)
                 : int(Dst::Flags)&RowMajorBit ? int(Dst::ColsAtCompileTime)
@@ -57,165 +54,120 @@ class vml_assign_traits
                     : int(Dst::MaxRowsAtCompileTime),
       MaxSizeAtCompileTime = Dst::SizeAtCompileTime,
 
-      MightEnableVml =  vml_call<UnaryOp>::IsSupported && StorageOrdersAgree && DstHasDirectAccess && SrcHasDirectAccess
-                     && Src::InnerStrideAtCompileTime==1 && Dst::InnerStrideAtCompileTime==1,
+      MightEnableVml = StorageOrdersAgree && DstHasDirectAccess && SrcHasDirectAccess && Src::InnerStrideAtCompileTime==1 && Dst::InnerStrideAtCompileTime==1,
       MightLinearize = MightEnableVml && (int(Dst::Flags) & int(Src::Flags) & LinearAccessBit),
       VmlSize = MightLinearize ? MaxSizeAtCompileTime : InnerMaxSize,
-      LargeEnough = VmlSize==Dynamic || VmlSize>=EIGEN_MKL_VML_THRESHOLD,
-      MayEnableVml = MightEnableVml && LargeEnough,
-      MayLinearize = MayEnableVml && MightLinearize
+      LargeEnough = VmlSize==Dynamic || VmlSize>=EIGEN_MKL_VML_THRESHOLD
     };
   public:
     enum {
-      Traversal = MayLinearize ? LinearVectorizedTraversal
-                : MayEnableVml ? InnerVectorizedTraversal
-                : DefaultTraversal
+      EnableVml = MightEnableVml && LargeEnough,
+      Traversal = MightLinearize ? LinearTraversal : DefaultTraversal
     };
 };
 
-template<typename Derived1, typename Derived2, typename UnaryOp, int Traversal, int Unrolling,
-         int VmlTraversal = vml_assign_traits<Derived1, Derived2, UnaryOp>::Traversal >
-struct vml_assign_impl
-  : assign_impl<Derived1, Eigen::CwiseUnaryOp<UnaryOp, Derived2>,Traversal,Unrolling,BuiltIn>
-{
-};
-
-template<typename Derived1, typename Derived2, typename UnaryOp, int Traversal, int Unrolling>
-struct vml_assign_impl<Derived1, Derived2, UnaryOp, Traversal, Unrolling, InnerVectorizedTraversal>
-{
-  typedef typename Derived1::Scalar Scalar;
-  typedef typename Derived1::Index Index;
-  static inline void run(Derived1& dst, const CwiseUnaryOp<UnaryOp, Derived2>& src)
-  {
-    // in case we want to (or have to) skip VML at runtime we can call:
-    // assign_impl<Derived1,Eigen::CwiseUnaryOp<UnaryOp, Derived2>,Traversal,Unrolling,BuiltIn>::run(dst,src);
-    const Index innerSize = dst.innerSize();
-    const Index outerSize = dst.outerSize();
-    for(Index outer = 0; outer < outerSize; ++outer) {
-      const Scalar *src_ptr = src.IsRowMajor ?  &(src.nestedExpression().coeffRef(outer,0)) :
-                                                &(src.nestedExpression().coeffRef(0, outer));
-      Scalar *dst_ptr = dst.IsRowMajor ? &(dst.coeffRef(outer,0)) : &(dst.coeffRef(0, outer));
-      vml_call<UnaryOp>::run(src.functor(), innerSize, src_ptr, dst_ptr );
-    }
-  }
-};
-
-template<typename Derived1, typename Derived2, typename UnaryOp, int Traversal, int Unrolling>
-struct vml_assign_impl<Derived1, Derived2, UnaryOp, Traversal, Unrolling, LinearVectorizedTraversal>
-{
-  static inline void run(Derived1& dst, const CwiseUnaryOp<UnaryOp, Derived2>& src)
-  {
-    // in case we want to (or have to) skip VML at runtime we can call:
-    // assign_impl<Derived1,Eigen::CwiseUnaryOp<UnaryOp, Derived2>,Traversal,Unrolling,BuiltIn>::run(dst,src);
-    vml_call<UnaryOp>::run(src.functor(), dst.size(), src.nestedExpression().data(), dst.data() );
-  }
-};
-
-// Macroses
-
-#define EIGEN_MKL_VML_SPECIALIZE_ASSIGN(TRAVERSAL,UNROLLING) \
-  template<typename Derived1, typename Derived2, typename UnaryOp> \
-  struct assign_impl<Derived1, Eigen::CwiseUnaryOp<UnaryOp, Derived2>, TRAVERSAL, UNROLLING, Specialized>  {  \
-    static inline void run(Derived1 &dst, const Eigen::CwiseUnaryOp<UnaryOp, Derived2> &src) { \
-      vml_assign_impl<Derived1,Derived2,UnaryOp,TRAVERSAL,UNROLLING>::run(dst, src); \
-    } \
-  };
-
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(DefaultTraversal,NoUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(DefaultTraversal,CompleteUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(DefaultTraversal,InnerUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(LinearTraversal,NoUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(LinearTraversal,CompleteUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(InnerVectorizedTraversal,NoUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(InnerVectorizedTraversal,CompleteUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(InnerVectorizedTraversal,InnerUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(LinearVectorizedTraversal,CompleteUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(LinearVectorizedTraversal,NoUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(SliceVectorizedTraversal,NoUnrolling)
-
-
+#define EIGEN_PP_EXPAND(ARG) ARG
 #if !defined (EIGEN_FAST_MATH) || (EIGEN_FAST_MATH != 1)
-#define  EIGEN_MKL_VML_MODE VML_HA
+#define EIGEN_VMLMODE_EXPAND_LA , VML_HA
 #else
-#define  EIGEN_MKL_VML_MODE VML_LA
+#define EIGEN_VMLMODE_EXPAND_LA , VML_LA
 #endif
 
-#define EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE)     \
-  template<> struct vml_call< scalar_##EIGENOP##_op<EIGENTYPE> > {               \
-    enum { IsSupported = 1 };                                                    \
-    static inline void run( const scalar_##EIGENOP##_op<EIGENTYPE>& /*func*/,        \
-                            int size, const EIGENTYPE* src, EIGENTYPE* dst) {    \
-      VMLOP(size, (const VMLTYPE*)src, (VMLTYPE*)dst);                           \
-    }                                                                            \
+#define EIGEN_VMLMODE_EXPAND__ 
+
+#define EIGEN_VMLMODE_PREFIX_LA vm
+#define EIGEN_VMLMODE_PREFIX__  v
+#define EIGEN_VMLMODE_PREFIX(VMLMODE) EIGEN_CAT(EIGEN_VMLMODE_PREFIX_,VMLMODE)
+
+#define EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE, VMLMODE)                                           \
+  template< typename DstXprType, typename SrcXprNested>                                                                         \
+  struct Assignment<DstXprType, CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested>, assign_op<EIGENTYPE,EIGENTYPE>,   \
+                   Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml>::type> {              \
+    typedef CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested> SrcXprType;                                            \
+    static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &/*func*/) {                   \
+      eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());                                                       \
+      if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal) {                                              \
+        VMLOP(dst.size(), (const VMLTYPE*)src.nestedExpression().data(),                                                        \
+              (VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE) );                                           \
+      } else {                                                                                                                  \
+        const Index outerSize = dst.outerSize();                                                                                \
+        for(Index outer = 0; outer < outerSize; ++outer) {                                                                      \
+          const EIGENTYPE *src_ptr = src.IsRowMajor ? &(src.nestedExpression().coeffRef(outer,0)) :                             \
+                                                      &(src.nestedExpression().coeffRef(0, outer));                             \
+          EIGENTYPE *dst_ptr = dst.IsRowMajor ? &(dst.coeffRef(outer,0)) : &(dst.coeffRef(0, outer));                           \
+          VMLOP( dst.innerSize(), (const VMLTYPE*)src_ptr,                                                                      \
+                (VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE));                                             \
+        }                                                                                                                       \
+      }                                                                                                                         \
+    }                                                                                                                           \
+  };                                                                                                                            \
+
+
+#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(EIGENOP, VMLOP, VMLMODE)                                                         \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, EIGEN_CAT(EIGEN_VMLMODE_PREFIX(VMLMODE),s##VMLOP), float, float, VMLMODE)           \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, EIGEN_CAT(EIGEN_VMLMODE_PREFIX(VMLMODE),d##VMLOP), double, double, VMLMODE)
+
+#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_CPLX(EIGENOP, VMLOP, VMLMODE)                                                         \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, EIGEN_CAT(EIGEN_VMLMODE_PREFIX(VMLMODE),c##VMLOP), scomplex, MKL_Complex8, VMLMODE) \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, EIGEN_CAT(EIGEN_VMLMODE_PREFIX(VMLMODE),z##VMLOP), dcomplex, MKL_Complex16, VMLMODE)
+  
+#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS(EIGENOP, VMLOP, VMLMODE)                                                              \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(EIGENOP, VMLOP, VMLMODE)                                                               \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALLS_CPLX(EIGENOP, VMLOP, VMLMODE)
+
+  
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(sin,   Sin,   LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(asin,  Asin,  LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(sinh,  Sinh,  LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(cos,   Cos,   LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(acos,  Acos,  LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(cosh,  Cosh,  LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(tan,   Tan,   LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(atan,  Atan,  LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(tanh,  Tanh,  LA)
+// EIGEN_MKL_VML_DECLARE_UNARY_CALLS(abs,   Abs,    _)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(exp,   Exp,   LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(log,   Ln,    LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(log10, Log10, LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(sqrt,  Sqrt,  _)
+
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(square, Sqr,   _)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_CPLX(arg, Arg,      _)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(round, Round,  _)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(floor, Floor,  _)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(ceil,  Ceil,   _)
+
+#define EIGEN_MKL_VML_DECLARE_POW_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE, VMLMODE)                                           \
+  template< typename DstXprType, typename SrcXprNested, typename Plain>                                                       \
+  struct Assignment<DstXprType, CwiseBinaryOp<scalar_##EIGENOP##_op<EIGENTYPE,EIGENTYPE>, SrcXprNested,                       \
+                    const CwiseNullaryOp<internal::scalar_constant_op<EIGENTYPE>,Plain> >, assign_op<EIGENTYPE,EIGENTYPE>,    \
+                   Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml>::type> {            \
+    typedef CwiseBinaryOp<scalar_##EIGENOP##_op<EIGENTYPE,EIGENTYPE>, SrcXprNested,                                           \
+                    const CwiseNullaryOp<internal::scalar_constant_op<EIGENTYPE>,Plain> > SrcXprType;                         \
+    static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &/*func*/) {                 \
+      eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());                                                     \
+      VMLTYPE exponent = reinterpret_cast<const VMLTYPE&>(src.rhs().functor().m_other);                                       \
+      if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal)                                              \
+      {                                                                                                                       \
+        VMLOP( dst.size(), (const VMLTYPE*)src.lhs().data(), exponent,                                                        \
+              (VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE) );                                         \
+      } else {                                                                                                                \
+        const Index outerSize = dst.outerSize();                                                                              \
+        for(Index outer = 0; outer < outerSize; ++outer) {                                                                    \
+          const EIGENTYPE *src_ptr = src.IsRowMajor ? &(src.lhs().coeffRef(outer,0)) :                                        \
+                                                      &(src.lhs().coeffRef(0, outer));                                        \
+          EIGENTYPE *dst_ptr = dst.IsRowMajor ? &(dst.coeffRef(outer,0)) : &(dst.coeffRef(0, outer));                         \
+          VMLOP( dst.innerSize(), (const VMLTYPE*)src_ptr, exponent,                                                          \
+                 (VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE));                                          \
+        }                                                                                                                     \
+      }                                                                                                                       \
+    }                                                                                                                         \
   };
-
-#define EIGEN_MKL_VML_DECLARE_UNARY_CALL_LA(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE)  \
-  template<> struct vml_call< scalar_##EIGENOP##_op<EIGENTYPE> > {               \
-    enum { IsSupported = 1 };                                                    \
-    static inline void run( const scalar_##EIGENOP##_op<EIGENTYPE>& /*func*/,        \
-                            int size, const EIGENTYPE* src, EIGENTYPE* dst) {    \
-      MKL_INT64 vmlMode = EIGEN_MKL_VML_MODE;                                    \
-      VMLOP(size, (const VMLTYPE*)src, (VMLTYPE*)dst, vmlMode);                  \
-    }                                                                            \
-  };
-
-#define EIGEN_MKL_VML_DECLARE_POW_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE)       \
-  template<> struct vml_call< scalar_##EIGENOP##_op<EIGENTYPE> > {               \
-    enum { IsSupported = 1 };                                                    \
-    static inline void run( const scalar_##EIGENOP##_op<EIGENTYPE>& func,        \
-                          int size, const EIGENTYPE* src, EIGENTYPE* dst) {      \
-      EIGENTYPE exponent = func.m_exponent;                                      \
-      MKL_INT64 vmlMode = EIGEN_MKL_VML_MODE;                                    \
-      VMLOP(&size, (const VMLTYPE*)src, (const VMLTYPE*)&exponent,               \
-                        (VMLTYPE*)dst, &vmlMode);                                \
-    }                                                                            \
-  };
-
-#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(EIGENOP, VMLOP)                   \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, vs##VMLOP, float, float)             \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, vd##VMLOP, double, double)
-
-#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_COMPLEX(EIGENOP, VMLOP)                \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, vc##VMLOP, scomplex, MKL_Complex8)   \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, vz##VMLOP, dcomplex, MKL_Complex16)
-
-#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS(EIGENOP, VMLOP)                        \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(EIGENOP, VMLOP)                         \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALLS_COMPLEX(EIGENOP, VMLOP)
-
-
-#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL_LA(EIGENOP, VMLOP)                \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALL_LA(EIGENOP, vms##VMLOP, float, float)         \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALL_LA(EIGENOP, vmd##VMLOP, double, double)
-
-#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_COMPLEX_LA(EIGENOP, VMLOP)             \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALL_LA(EIGENOP, vmc##VMLOP, scomplex, MKL_Complex8)  \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALL_LA(EIGENOP, vmz##VMLOP, dcomplex, MKL_Complex16)
-
-#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(EIGENOP, VMLOP)                     \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL_LA(EIGENOP, VMLOP)                      \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALLS_COMPLEX_LA(EIGENOP, VMLOP)
-
-
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(sin,  Sin)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(asin, Asin)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(cos,  Cos)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(acos, Acos)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(tan,  Tan)
-//EIGEN_MKL_VML_DECLARE_UNARY_CALLS(abs,  Abs)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(exp,  Exp)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(log,  Ln)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(sqrt, Sqrt)
-
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(square, Sqr)
-
-// The vm*powx functions are not avaibale in the windows version of MKL.
-#ifndef _WIN32
-EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmspowx_, float, float)
-EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmdpowx_, double, double)
-EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmcpowx_, scomplex, MKL_Complex8)
-EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmzpowx_, dcomplex, MKL_Complex16)
-#endif
+  
+EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmsPowx, float,    float,         LA)
+EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmdPowx, double,   double,        LA)
+EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmcPowx, scomplex, MKL_Complex8,  LA)
+EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmzPowx, dcomplex, MKL_Complex16, LA)
 
 } // end namespace internal
 
diff --git a/Eigen/src/Core/BandMatrix.h b/Eigen/src/Core/BandMatrix.h
index ffd7fe8b3..4978c9140 100644
--- a/Eigen/src/Core/BandMatrix.h
+++ b/Eigen/src/Core/BandMatrix.h
@@ -32,7 +32,7 @@ class BandMatrixBase : public EigenBase<Derived>
     };
     typedef typename internal::traits<Derived>::Scalar Scalar;
     typedef Matrix<Scalar,RowsAtCompileTime,ColsAtCompileTime> DenseMatrixType;
-    typedef typename DenseMatrixType::Index Index;
+    typedef typename DenseMatrixType::StorageIndex StorageIndex;
     typedef typename internal::traits<Derived>::CoefficientsType CoefficientsType;
     typedef EigenBase<Derived> Base;
 
@@ -161,15 +161,15 @@ class BandMatrixBase : public EigenBase<Derived>
   *
   * \brief Represents a rectangular matrix with a banded storage
   *
-  * \param _Scalar Numeric type, i.e. float, double, int
-  * \param Rows Number of rows, or \b Dynamic
-  * \param Cols Number of columns, or \b Dynamic
-  * \param Supers Number of super diagonal
-  * \param Subs Number of sub diagonal
-  * \param _Options A combination of either \b #RowMajor or \b #ColMajor, and of \b #SelfAdjoint
-  *                 The former controls \ref TopicStorageOrders "storage order", and defaults to
-  *                 column-major. The latter controls whether the matrix represents a selfadjoint 
-  *                 matrix in which case either Supers of Subs have to be null.
+  * \tparam _Scalar Numeric type, i.e. float, double, int
+  * \tparam _Rows Number of rows, or \b Dynamic
+  * \tparam _Cols Number of columns, or \b Dynamic
+  * \tparam _Supers Number of super diagonal
+  * \tparam _Subs Number of sub diagonal
+  * \tparam _Options A combination of either \b #RowMajor or \b #ColMajor, and of \b #SelfAdjoint
+  *                  The former controls \ref TopicStorageOrders "storage order", and defaults to
+  *                  column-major. The latter controls whether the matrix represents a selfadjoint
+  *                  matrix in which case either Supers of Subs have to be null.
   *
   * \sa class TridiagonalMatrix
   */
@@ -179,7 +179,7 @@ struct traits<BandMatrix<_Scalar,_Rows,_Cols,_Supers,_Subs,_Options> >
 {
   typedef _Scalar Scalar;
   typedef Dense StorageKind;
-  typedef DenseIndex Index;
+  typedef Eigen::Index StorageIndex;
   enum {
     CoeffReadCost = NumTraits<Scalar>::ReadCost,
     RowsAtCompileTime = _Rows,
@@ -201,10 +201,10 @@ class BandMatrix : public BandMatrixBase<BandMatrix<_Scalar,Rows,Cols,Supers,Sub
   public:
 
     typedef typename internal::traits<BandMatrix>::Scalar Scalar;
-    typedef typename internal::traits<BandMatrix>::Index Index;
+    typedef typename internal::traits<BandMatrix>::StorageIndex StorageIndex;
     typedef typename internal::traits<BandMatrix>::CoefficientsType CoefficientsType;
 
-    inline BandMatrix(Index rows=Rows, Index cols=Cols, Index supers=Supers, Index subs=Subs)
+    explicit inline BandMatrix(Index rows=Rows, Index cols=Cols, Index supers=Supers, Index subs=Subs)
       : m_coeffs(1+supers+subs,cols),
         m_rows(rows), m_supers(supers), m_subs(subs)
     {
@@ -241,7 +241,7 @@ struct traits<BandMatrixWrapper<_CoefficientsType,_Rows,_Cols,_Supers,_Subs,_Opt
 {
   typedef typename _CoefficientsType::Scalar Scalar;
   typedef typename _CoefficientsType::StorageKind StorageKind;
-  typedef typename _CoefficientsType::Index Index;
+  typedef typename _CoefficientsType::StorageIndex StorageIndex;
   enum {
     CoeffReadCost = internal::traits<_CoefficientsType>::CoeffReadCost,
     RowsAtCompileTime = _Rows,
@@ -264,9 +264,9 @@ class BandMatrixWrapper : public BandMatrixBase<BandMatrixWrapper<_CoefficientsT
 
     typedef typename internal::traits<BandMatrixWrapper>::Scalar Scalar;
     typedef typename internal::traits<BandMatrixWrapper>::CoefficientsType CoefficientsType;
-    typedef typename internal::traits<BandMatrixWrapper>::Index Index;
+    typedef typename internal::traits<BandMatrixWrapper>::StorageIndex StorageIndex;
 
-    inline BandMatrixWrapper(const CoefficientsType& coeffs, Index rows=_Rows, Index cols=_Cols, Index supers=_Supers, Index subs=_Subs)
+    explicit inline BandMatrixWrapper(const CoefficientsType& coeffs, Index rows=_Rows, Index cols=_Cols, Index supers=_Supers, Index subs=_Subs)
       : m_coeffs(coeffs),
         m_rows(rows), m_supers(supers), m_subs(subs)
     {
@@ -302,9 +302,9 @@ class BandMatrixWrapper : public BandMatrixBase<BandMatrixWrapper<_CoefficientsT
   *
   * \brief Represents a tridiagonal matrix with a compact banded storage
   *
-  * \param _Scalar Numeric type, i.e. float, double, int
-  * \param Size Number of rows and cols, or \b Dynamic
-  * \param _Options Can be 0 or \b SelfAdjoint
+  * \tparam Scalar Numeric type, i.e. float, double, int
+  * \tparam Size Number of rows and cols, or \b Dynamic
+  * \tparam Options Can be 0 or \b SelfAdjoint
   *
   * \sa class BandMatrix
   */
@@ -312,9 +312,9 @@ template<typename Scalar, int Size, int Options>
 class TridiagonalMatrix : public BandMatrix<Scalar,Size,Size,Options&SelfAdjoint?0:1,1,Options|RowMajor>
 {
     typedef BandMatrix<Scalar,Size,Size,Options&SelfAdjoint?0:1,1,Options|RowMajor> Base;
-    typedef typename Base::Index Index;
+    typedef typename Base::StorageIndex StorageIndex;
   public:
-    TridiagonalMatrix(Index size = Size) : Base(size,size,Options&SelfAdjoint?0:1,1) {}
+    explicit TridiagonalMatrix(Index size = Size) : Base(size,size,Options&SelfAdjoint?0:1,1) {}
 
     inline typename Base::template DiagonalIntReturnType<1>::Type super()
     { return Base::template diagonal<1>(); }
@@ -327,6 +327,25 @@ class TridiagonalMatrix : public BandMatrix<Scalar,Size,Size,Options&SelfAdjoint
   protected:
 };
 
+
+struct BandShape {};
+
+template<typename _Scalar, int _Rows, int _Cols, int _Supers, int _Subs, int _Options>
+struct evaluator_traits<BandMatrix<_Scalar,_Rows,_Cols,_Supers,_Subs,_Options> >
+  : public evaluator_traits_base<BandMatrix<_Scalar,_Rows,_Cols,_Supers,_Subs,_Options> >
+{
+  typedef BandShape Shape;
+};
+
+template<typename _CoefficientsType,int _Rows, int _Cols, int _Supers, int _Subs,int _Options>
+struct evaluator_traits<BandMatrixWrapper<_CoefficientsType,_Rows,_Cols,_Supers,_Subs,_Options> >
+  : public evaluator_traits_base<BandMatrixWrapper<_CoefficientsType,_Rows,_Cols,_Supers,_Subs,_Options> >
+{
+  typedef BandShape Shape;
+};
+
+template<> struct AssignmentKind<DenseShape,BandShape> { typedef EigenBase2EigenBase Kind; };
+
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h
index 827894443..11de45c2e 100644
--- a/Eigen/src/Core/Block.h
+++ b/Eigen/src/Core/Block.h
@@ -13,38 +13,6 @@
 
 namespace Eigen { 
 
-/** \class Block
-  * \ingroup Core_Module
-  *
-  * \brief Expression of a fixed-size or dynamic-size block
-  *
-  * \param XprType the type of the expression in which we are taking a block
-  * \param BlockRows the number of rows of the block we are taking at compile time (optional)
-  * \param BlockCols the number of columns of the block we are taking at compile time (optional)
-  *
-  * This class represents an expression of either a fixed-size or dynamic-size block. It is the return
-  * type of DenseBase::block(Index,Index,Index,Index) and DenseBase::block<int,int>(Index,Index) and
-  * most of the time this is the only way it is used.
-  *
-  * However, if you want to directly maniputate block expressions,
-  * for instance if you want to write a function returning such an expression, you
-  * will need to use this class.
-  *
-  * Here is an example illustrating the dynamic case:
-  * \include class_Block.cpp
-  * Output: \verbinclude class_Block.out
-  *
-  * \note Even though this expression has dynamic size, in the case where \a XprType
-  * has fixed size, this expression inherits a fixed maximal size which means that evaluating
-  * it does not cause a dynamic memory allocation.
-  *
-  * Here is an example illustrating the fixed-size case:
-  * \include class_FixedBlock.cpp
-  * Output: \verbinclude class_FixedBlock.out
-  *
-  * \sa DenseBase::block(Index,Index,Index,Index), DenseBase::block(Index,Index), class VectorBlock
-  */
-
 namespace internal {
 template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel>
 struct traits<Block<XprType, BlockRows, BlockCols, InnerPanel> > : traits<XprType>
@@ -52,7 +20,7 @@ struct traits<Block<XprType, BlockRows, BlockCols, InnerPanel> > : traits<XprTyp
   typedef typename traits<XprType>::Scalar Scalar;
   typedef typename traits<XprType>::StorageKind StorageKind;
   typedef typename traits<XprType>::XprKind XprKind;
-  typedef typename nested<XprType>::type XprTypeNested;
+  typedef typename ref_selector<XprType>::type XprTypeNested;
   typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
   enum{
     MatrixRows = traits<XprType>::RowsAtCompileTime,
@@ -65,10 +33,10 @@ struct traits<Block<XprType, BlockRows, BlockCols, InnerPanel> > : traits<XprTyp
     MaxColsAtCompileTime = BlockCols==0 ? 0
                          : ColsAtCompileTime != Dynamic ? int(ColsAtCompileTime)
                          : int(traits<XprType>::MaxColsAtCompileTime),
+
     XprTypeIsRowMajor = (int(traits<XprType>::Flags)&RowMajorBit) != 0,
-    IsDense = is_same<StorageKind,Dense>::value,
-    IsRowMajor = (IsDense&&MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1
-               : (IsDense&&MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0
+    IsRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1
+               : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0
                : XprTypeIsRowMajor,
     HasSameStorageOrderAsXprType = (IsRowMajor == XprTypeIsRowMajor),
     InnerSize = IsRowMajor ? int(ColsAtCompileTime) : int(RowsAtCompileTime),
@@ -78,18 +46,16 @@ struct traits<Block<XprType, BlockRows, BlockCols, InnerPanel> > : traits<XprTyp
     OuterStrideAtCompileTime = HasSameStorageOrderAsXprType
                              ? int(outer_stride_at_compile_time<XprType>::ret)
                              : int(inner_stride_at_compile_time<XprType>::ret),
-    MaskPacketAccessBit = (InnerSize == Dynamic || (InnerSize % packet_traits<Scalar>::size) == 0)
-                       && (InnerStrideAtCompileTime == 1)
-                        ? PacketAccessBit : 0,
-    MaskAlignedBit = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % 16) == 0)) ? AlignedBit : 0,
-    FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (traits<XprType>::Flags&LinearAccessBit))) ? LinearAccessBit : 0,
+
+    // FIXME, this traits is rather specialized for dense object and it needs to be cleaned further
     FlagsLvalueBit = is_lvalue<XprType>::value ? LvalueBit : 0,
     FlagsRowMajorBit = IsRowMajor ? RowMajorBit : 0,
-    Flags0 = traits<XprType>::Flags & ( (HereditaryBits & ~RowMajorBit) |
-                                        DirectAccessBit |
-                                        MaskPacketAccessBit |
-                                        MaskAlignedBit),
-    Flags = Flags0 | FlagsLinearAccessBit | FlagsLvalueBit | FlagsRowMajorBit
+    Flags = (traits<XprType>::Flags & (DirectAccessBit | (InnerPanel?CompressedAccessBit:0))) | FlagsLvalueBit | FlagsRowMajorBit,
+    // FIXME DirectAccessBit should not be handled by expressions
+    // 
+    // Alignment is needed by MapBase's assertions
+    // We can sefely set it to false here. Internal alignment errors will be detected by an eigen_internal_assert in the respective evaluator
+    Alignment = 0
   };
 };
 
@@ -100,6 +66,40 @@ template<typename XprType, int BlockRows=Dynamic, int BlockCols=Dynamic, bool In
 
 template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, typename StorageKind> class BlockImpl;
 
+/** \class Block
+  * \ingroup Core_Module
+  *
+  * \brief Expression of a fixed-size or dynamic-size block
+  *
+  * \tparam XprType the type of the expression in which we are taking a block
+  * \tparam BlockRows the number of rows of the block we are taking at compile time (optional)
+  * \tparam BlockCols the number of columns of the block we are taking at compile time (optional)
+  * \tparam InnerPanel is true, if the block maps to a set of rows of a row major matrix or
+  *         to set of columns of a column major matrix (optional). The parameter allows to determine
+  *         at compile time whether aligned access is possible on the block expression.
+  *
+  * This class represents an expression of either a fixed-size or dynamic-size block. It is the return
+  * type of DenseBase::block(Index,Index,Index,Index) and DenseBase::block<int,int>(Index,Index) and
+  * most of the time this is the only way it is used.
+  *
+  * However, if you want to directly maniputate block expressions,
+  * for instance if you want to write a function returning such an expression, you
+  * will need to use this class.
+  *
+  * Here is an example illustrating the dynamic case:
+  * \include class_Block.cpp
+  * Output: \verbinclude class_Block.out
+  *
+  * \note Even though this expression has dynamic size, in the case where \a XprType
+  * has fixed size, this expression inherits a fixed maximal size which means that evaluating
+  * it does not cause a dynamic memory allocation.
+  *
+  * Here is an example illustrating the fixed-size case:
+  * \include class_FixedBlock.cpp
+  * Output: \verbinclude class_FixedBlock.out
+  *
+  * \sa DenseBase::block(Index,Index,Index,Index), DenseBase::block(Index,Index), class VectorBlock
+  */
 template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel> class Block
   : public BlockImpl<XprType, BlockRows, BlockCols, InnerPanel, typename internal::traits<XprType>::StorageKind>
 {
@@ -109,9 +109,12 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel> class
     typedef Impl Base;
     EIGEN_GENERIC_PUBLIC_INTERFACE(Block)
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Block)
+    
+    typedef typename internal::remove_all<XprType>::type NestedExpression;
   
     /** Column or Row constructor
       */
+    EIGEN_DEVICE_FUNC
     inline Block(XprType& xpr, Index i) : Impl(xpr,i)
     {
       eigen_assert( (i>=0) && (
@@ -121,25 +124,27 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel> class
 
     /** Fixed-size constructor
       */
-    inline Block(XprType& xpr, Index a_startRow, Index a_startCol)
-      : Impl(xpr, a_startRow, a_startCol)
+    EIGEN_DEVICE_FUNC
+    inline Block(XprType& xpr, Index startRow, Index startCol)
+      : Impl(xpr, startRow, startCol)
     {
       EIGEN_STATIC_ASSERT(RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic,THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE)
-      eigen_assert(a_startRow >= 0 && BlockRows >= 1 && a_startRow + BlockRows <= xpr.rows()
-             && a_startCol >= 0 && BlockCols >= 1 && a_startCol + BlockCols <= xpr.cols());
+      eigen_assert(startRow >= 0 && BlockRows >= 0 && startRow + BlockRows <= xpr.rows()
+             && startCol >= 0 && BlockCols >= 0 && startCol + BlockCols <= xpr.cols());
     }
 
     /** Dynamic-size constructor
       */
+    EIGEN_DEVICE_FUNC
     inline Block(XprType& xpr,
-          Index a_startRow, Index a_startCol,
+          Index startRow, Index startCol,
           Index blockRows, Index blockCols)
-      : Impl(xpr, a_startRow, a_startCol, blockRows, blockCols)
+      : Impl(xpr, startRow, startCol, blockRows, blockCols)
     {
       eigen_assert((RowsAtCompileTime==Dynamic || RowsAtCompileTime==blockRows)
           && (ColsAtCompileTime==Dynamic || ColsAtCompileTime==blockCols));
-      eigen_assert(a_startRow >= 0 && blockRows >= 0 && a_startRow  <= xpr.rows() - blockRows
-          && a_startCol >= 0 && blockCols >= 0 && a_startCol <= xpr.cols() - blockCols);
+      eigen_assert(startRow >= 0 && blockRows >= 0 && startRow  <= xpr.rows() - blockRows
+          && startCol >= 0 && blockCols >= 0 && startCol <= xpr.cols() - blockCols);
     }
 };
          
@@ -150,14 +155,15 @@ class BlockImpl<XprType, BlockRows, BlockCols, InnerPanel, Dense>
   : public internal::BlockImpl_dense<XprType, BlockRows, BlockCols, InnerPanel>
 {
     typedef internal::BlockImpl_dense<XprType, BlockRows, BlockCols, InnerPanel> Impl;
-    typedef typename XprType::Index Index;
+    typedef typename XprType::StorageIndex StorageIndex;
   public:
     typedef Impl Base;
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl)
-    inline BlockImpl(XprType& xpr, Index i) : Impl(xpr,i) {}
-    inline BlockImpl(XprType& xpr, Index a_startRow, Index a_startCol) : Impl(xpr, a_startRow, a_startCol) {}
-    inline BlockImpl(XprType& xpr, Index a_startRow, Index a_startCol, Index blockRows, Index blockCols)
-      : Impl(xpr, a_startRow, a_startCol, blockRows, blockCols) {}
+    EIGEN_DEVICE_FUNC inline BlockImpl(XprType& xpr, Index i) : Impl(xpr,i) {}
+    EIGEN_DEVICE_FUNC inline BlockImpl(XprType& xpr, Index startRow, Index startCol) : Impl(xpr, startRow, startCol) {}
+    EIGEN_DEVICE_FUNC
+    inline BlockImpl(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols)
+      : Impl(xpr, startRow, startCol, blockRows, blockCols) {}
 };
 
 namespace internal {
@@ -167,16 +173,18 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
   : public internal::dense_xpr_base<Block<XprType, BlockRows, BlockCols, InnerPanel> >::type
 {
     typedef Block<XprType, BlockRows, BlockCols, InnerPanel> BlockType;
+    typedef typename internal::ref_selector<XprType>::non_const_type XprTypeNested;
   public:
 
     typedef typename internal::dense_xpr_base<BlockType>::type Base;
     EIGEN_DENSE_PUBLIC_INTERFACE(BlockType)
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl_dense)
 
-    class InnerIterator;
+    // class InnerIterator; // FIXME apparently never used
 
     /** Column or Row constructor
       */
+    EIGEN_DEVICE_FUNC
     inline BlockImpl_dense(XprType& xpr, Index i)
       : m_xpr(xpr),
         // It is a row if and only if BlockRows==1 and BlockCols==XprType::ColsAtCompileTime,
@@ -191,75 +199,76 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
 
     /** Fixed-size constructor
       */
-    inline BlockImpl_dense(XprType& xpr, Index a_startRow, Index a_startCol)
-      : m_xpr(xpr), m_startRow(a_startRow), m_startCol(a_startCol),
+    EIGEN_DEVICE_FUNC
+    inline BlockImpl_dense(XprType& xpr, Index startRow, Index startCol)
+      : m_xpr(xpr), m_startRow(startRow), m_startCol(startCol),
                     m_blockRows(BlockRows), m_blockCols(BlockCols)
     {}
 
     /** Dynamic-size constructor
       */
+    EIGEN_DEVICE_FUNC
     inline BlockImpl_dense(XprType& xpr,
-          Index a_startRow, Index a_startCol,
+          Index startRow, Index startCol,
           Index blockRows, Index blockCols)
-      : m_xpr(xpr), m_startRow(a_startRow), m_startCol(a_startCol),
+      : m_xpr(xpr), m_startRow(startRow), m_startCol(startCol),
                     m_blockRows(blockRows), m_blockCols(blockCols)
     {}
 
-    inline Index rows() const { return m_blockRows.value(); }
-    inline Index cols() const { return m_blockCols.value(); }
+    EIGEN_DEVICE_FUNC inline Index rows() const { return m_blockRows.value(); }
+    EIGEN_DEVICE_FUNC inline Index cols() const { return m_blockCols.value(); }
 
+    EIGEN_DEVICE_FUNC
     inline Scalar& coeffRef(Index rowId, Index colId)
     {
       EIGEN_STATIC_ASSERT_LVALUE(XprType)
-      return m_xpr.const_cast_derived()
-               .coeffRef(rowId + m_startRow.value(), colId + m_startCol.value());
+      return m_xpr.coeffRef(rowId + m_startRow.value(), colId + m_startCol.value());
     }
 
+    EIGEN_DEVICE_FUNC
     inline const Scalar& coeffRef(Index rowId, Index colId) const
     {
-      return m_xpr.derived()
-               .coeffRef(rowId + m_startRow.value(), colId + m_startCol.value());
+      return m_xpr.derived().coeffRef(rowId + m_startRow.value(), colId + m_startCol.value());
     }
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index rowId, Index colId) const
     {
       return m_xpr.coeff(rowId + m_startRow.value(), colId + m_startCol.value());
     }
 
+    EIGEN_DEVICE_FUNC
     inline Scalar& coeffRef(Index index)
     {
       EIGEN_STATIC_ASSERT_LVALUE(XprType)
-      return m_xpr.const_cast_derived()
-             .coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
-                       m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
+      return m_xpr.coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
+                            m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
     }
 
+    EIGEN_DEVICE_FUNC
     inline const Scalar& coeffRef(Index index) const
     {
-      return m_xpr.const_cast_derived()
-             .coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
-                       m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
+      return m_xpr.coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
+                            m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
     }
 
+    EIGEN_DEVICE_FUNC
     inline const CoeffReturnType coeff(Index index) const
     {
-      return m_xpr
-             .coeff(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
-                    m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
+      return m_xpr.coeff(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
+                         m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
     }
 
     template<int LoadMode>
     inline PacketScalar packet(Index rowId, Index colId) const
     {
-      return m_xpr.template packet<Unaligned>
-              (rowId + m_startRow.value(), colId + m_startCol.value());
+      return m_xpr.template packet<Unaligned>(rowId + m_startRow.value(), colId + m_startCol.value());
     }
 
     template<int LoadMode>
     inline void writePacket(Index rowId, Index colId, const PacketScalar& val)
     {
-      m_xpr.const_cast_derived().template writePacket<Unaligned>
-              (rowId + m_startRow.value(), colId + m_startCol.value(), val);
+      m_xpr.template writePacket<Unaligned>(rowId + m_startRow.value(), colId + m_startCol.value(), val);
     }
 
     template<int LoadMode>
@@ -273,40 +282,46 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
     template<int LoadMode>
     inline void writePacket(Index index, const PacketScalar& val)
     {
-      m_xpr.const_cast_derived().template writePacket<Unaligned>
+      m_xpr.template writePacket<Unaligned>
          (m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
           m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0), val);
     }
 
     #ifdef EIGEN_PARSED_BY_DOXYGEN
     /** \sa MapBase::data() */
-    inline const Scalar* data() const;
-    inline Index innerStride() const;
-    inline Index outerStride() const;
+    EIGEN_DEVICE_FUNC inline const Scalar* data() const;
+    EIGEN_DEVICE_FUNC inline Index innerStride() const;
+    EIGEN_DEVICE_FUNC inline Index outerStride() const;
     #endif
 
-    const typename internal::remove_all<typename XprType::Nested>::type& nestedExpression() const 
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<XprTypeNested>::type& nestedExpression() const
     { 
       return m_xpr; 
     }
+
+    EIGEN_DEVICE_FUNC
+    XprType& nestedExpression() { return m_xpr; }
       
-    Index startRow() const 
+    EIGEN_DEVICE_FUNC
+    StorageIndex startRow() const
     { 
       return m_startRow.value(); 
     }
       
-    Index startCol() const 
+    EIGEN_DEVICE_FUNC
+    StorageIndex startCol() const
     { 
       return m_startCol.value(); 
     }
 
   protected:
 
-    const typename XprType::Nested m_xpr;
-    const internal::variable_if_dynamic<Index, XprType::RowsAtCompileTime == 1 ? 0 : Dynamic> m_startRow;
-    const internal::variable_if_dynamic<Index, XprType::ColsAtCompileTime == 1 ? 0 : Dynamic> m_startCol;
-    const internal::variable_if_dynamic<Index, RowsAtCompileTime> m_blockRows;
-    const internal::variable_if_dynamic<Index, ColsAtCompileTime> m_blockCols;
+    XprTypeNested m_xpr;
+    const internal::variable_if_dynamic<StorageIndex, (XprType::RowsAtCompileTime == 1 && BlockRows==1) ? 0 : Dynamic> m_startRow;
+    const internal::variable_if_dynamic<StorageIndex, (XprType::ColsAtCompileTime == 1 && BlockCols==1) ? 0 : Dynamic> m_startCol;
+    const internal::variable_if_dynamic<StorageIndex, RowsAtCompileTime> m_blockRows;
+    const internal::variable_if_dynamic<StorageIndex, ColsAtCompileTime> m_blockCols;
 };
 
 /** \internal Internal implementation of dense Blocks in the direct access case.*/
@@ -315,6 +330,10 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
   : public MapBase<Block<XprType, BlockRows, BlockCols, InnerPanel> >
 {
     typedef Block<XprType, BlockRows, BlockCols, InnerPanel> BlockType;
+    typedef typename internal::ref_selector<XprType>::non_const_type XprTypeNested;
+    enum {
+      XprTypeIsRowMajor = (int(traits<XprType>::Flags)&RowMajorBit) != 0
+    };
   public:
 
     typedef MapBase<BlockType> Base;
@@ -323,42 +342,52 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
 
     /** Column or Row constructor
       */
+    EIGEN_DEVICE_FUNC
     inline BlockImpl_dense(XprType& xpr, Index i)
-      : Base(internal::const_cast_ptr(&xpr.coeffRef(
-              (BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) ? i : 0,
-              (BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) ? i : 0)),
+      : Base(xpr.data() + i * (    ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && (!XprTypeIsRowMajor)) 
+                                || ((BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) && ( XprTypeIsRowMajor)) ? xpr.innerStride() : xpr.outerStride()),
              BlockRows==1 ? 1 : xpr.rows(),
              BlockCols==1 ? 1 : xpr.cols()),
-        m_xpr(xpr)
+        m_xpr(xpr),
+        m_startRow( (BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) ? i : 0),
+        m_startCol( (BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) ? i : 0)
     {
       init();
     }
 
     /** Fixed-size constructor
       */
+    EIGEN_DEVICE_FUNC
     inline BlockImpl_dense(XprType& xpr, Index startRow, Index startCol)
-      : Base(internal::const_cast_ptr(&xpr.coeffRef(startRow,startCol))), m_xpr(xpr)
+      : Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol)),
+        m_xpr(xpr), m_startRow(startRow), m_startCol(startCol)
     {
       init();
     }
 
     /** Dynamic-size constructor
       */
+    EIGEN_DEVICE_FUNC
     inline BlockImpl_dense(XprType& xpr,
           Index startRow, Index startCol,
           Index blockRows, Index blockCols)
-      : Base(internal::const_cast_ptr(&xpr.coeffRef(startRow,startCol)), blockRows, blockCols),
-        m_xpr(xpr)
+      : Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol), blockRows, blockCols),
+        m_xpr(xpr), m_startRow(startRow), m_startCol(startCol)
     {
       init();
     }
 
-    const typename internal::remove_all<typename XprType::Nested>::type& nestedExpression() const 
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<XprTypeNested>::type& nestedExpression() const
     { 
       return m_xpr; 
     }
+
+    EIGEN_DEVICE_FUNC
+    XprType& nestedExpression() { return m_xpr; }
       
     /** \sa MapBase::innerStride() */
+    EIGEN_DEVICE_FUNC
     inline Index innerStride() const
     {
       return internal::traits<BlockType>::HasSameStorageOrderAsXprType
@@ -367,11 +396,24 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
     }
 
     /** \sa MapBase::outerStride() */
+    EIGEN_DEVICE_FUNC
     inline Index outerStride() const
     {
       return m_outerStride;
     }
 
+    EIGEN_DEVICE_FUNC
+    StorageIndex startRow() const
+    {
+      return m_startRow.value();
+    }
+
+    EIGEN_DEVICE_FUNC
+    StorageIndex startCol() const
+    {
+      return m_startCol.value();
+    }
+
   #ifndef __SUNPRO_CC
   // FIXME sunstudio is not friendly with the above friend...
   // META-FIXME there is no 'friend' keyword around here. Is this obsolete?
@@ -380,6 +422,7 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     /** \internal used by allowAligned() */
+    EIGEN_DEVICE_FUNC
     inline BlockImpl_dense(XprType& xpr, const Scalar* data, Index blockRows, Index blockCols)
       : Base(data, blockRows, blockCols), m_xpr(xpr)
     {
@@ -388,6 +431,7 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
     #endif
 
   protected:
+    EIGEN_DEVICE_FUNC
     void init()
     {
       m_outerStride = internal::traits<BlockType>::HasSameStorageOrderAsXprType
@@ -395,7 +439,9 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
                     : m_xpr.innerStride();
     }
 
-    typename XprType::Nested m_xpr;
+    XprTypeNested m_xpr;
+    const internal::variable_if_dynamic<StorageIndex, (XprType::RowsAtCompileTime == 1 && BlockRows==1) ? 0 : Dynamic> m_startRow;
+    const internal::variable_if_dynamic<StorageIndex, (XprType::ColsAtCompileTime == 1 && BlockCols==1) ? 0 : Dynamic> m_startCol;
     Index m_outerStride;
 };
 
diff --git a/Eigen/src/Core/BooleanRedux.h b/Eigen/src/Core/BooleanRedux.h
index be9f48a8c..8409d8749 100644
--- a/Eigen/src/Core/BooleanRedux.h
+++ b/Eigen/src/Core/BooleanRedux.h
@@ -17,9 +17,10 @@ namespace internal {
 template<typename Derived, int UnrollCount>
 struct all_unroller
 {
+  typedef typename Derived::ExpressionTraits Traits;
   enum {
-    col = (UnrollCount-1) / Derived::RowsAtCompileTime,
-    row = (UnrollCount-1) % Derived::RowsAtCompileTime
+    col = (UnrollCount-1) / Traits::RowsAtCompileTime,
+    row = (UnrollCount-1) % Traits::RowsAtCompileTime
   };
 
   static inline bool run(const Derived &mat)
@@ -43,11 +44,12 @@ struct all_unroller<Derived, Dynamic>
 template<typename Derived, int UnrollCount>
 struct any_unroller
 {
+  typedef typename Derived::ExpressionTraits Traits;
   enum {
-    col = (UnrollCount-1) / Derived::RowsAtCompileTime,
-    row = (UnrollCount-1) % Derived::RowsAtCompileTime
+    col = (UnrollCount-1) / Traits::RowsAtCompileTime,
+    row = (UnrollCount-1) % Traits::RowsAtCompileTime
   };
-
+  
   static inline bool run(const Derived &mat)
   {
     return any_unroller<Derived, UnrollCount-1>::run(mat) || mat.coeff(row, col);
@@ -78,19 +80,19 @@ struct any_unroller<Derived, Dynamic>
 template<typename Derived>
 inline bool DenseBase<Derived>::all() const
 {
+  typedef internal::evaluator<Derived> Evaluator;
   enum {
     unroll = SizeAtCompileTime != Dynamic
-          && CoeffReadCost != Dynamic
-          && NumTraits<Scalar>::AddCost != Dynamic
-          && SizeAtCompileTime * (CoeffReadCost + NumTraits<Scalar>::AddCost) <= EIGEN_UNROLLING_LIMIT
+          && SizeAtCompileTime * (Evaluator::CoeffReadCost + NumTraits<Scalar>::AddCost) <= EIGEN_UNROLLING_LIMIT
   };
+  Evaluator evaluator(derived());
   if(unroll)
-    return internal::all_unroller<Derived, unroll ? int(SizeAtCompileTime) : Dynamic>::run(derived());
+    return internal::all_unroller<Evaluator, unroll ? int(SizeAtCompileTime) : Dynamic>::run(evaluator);
   else
   {
     for(Index j = 0; j < cols(); ++j)
       for(Index i = 0; i < rows(); ++i)
-        if (!coeff(i, j)) return false;
+        if (!evaluator.coeff(i, j)) return false;
     return true;
   }
 }
@@ -102,19 +104,19 @@ inline bool DenseBase<Derived>::all() const
 template<typename Derived>
 inline bool DenseBase<Derived>::any() const
 {
+  typedef internal::evaluator<Derived> Evaluator;
   enum {
     unroll = SizeAtCompileTime != Dynamic
-          && CoeffReadCost != Dynamic
-          && NumTraits<Scalar>::AddCost != Dynamic
-          && SizeAtCompileTime * (CoeffReadCost + NumTraits<Scalar>::AddCost) <= EIGEN_UNROLLING_LIMIT
+          && SizeAtCompileTime * (Evaluator::CoeffReadCost + NumTraits<Scalar>::AddCost) <= EIGEN_UNROLLING_LIMIT
   };
+  Evaluator evaluator(derived());
   if(unroll)
-    return internal::any_unroller<Derived, unroll ? int(SizeAtCompileTime) : Dynamic>::run(derived());
+    return internal::any_unroller<Evaluator, unroll ? int(SizeAtCompileTime) : Dynamic>::run(evaluator);
   else
   {
     for(Index j = 0; j < cols(); ++j)
       for(Index i = 0; i < rows(); ++i)
-        if (coeff(i, j)) return true;
+        if (evaluator.coeff(i, j)) return true;
     return false;
   }
 }
@@ -124,7 +126,7 @@ inline bool DenseBase<Derived>::any() const
   * \sa all(), any()
   */
 template<typename Derived>
-inline typename DenseBase<Derived>::Index DenseBase<Derived>::count() const
+inline Eigen::Index DenseBase<Derived>::count() const
 {
   return derived().template cast<bool>().template cast<Index>().sum();
 }
@@ -136,7 +138,11 @@ inline typename DenseBase<Derived>::Index DenseBase<Derived>::count() const
 template<typename Derived>
 inline bool DenseBase<Derived>::hasNaN() const
 {
+#if EIGEN_COMP_MSVC || (defined __FAST_MATH__)
+  return derived().array().isNaN().any();
+#else
   return !((derived().array()==derived().array()).all());
+#endif
 }
 
 /** \returns true if \c *this contains only finite numbers, i.e., no NaN and no +/-INF values.
@@ -146,7 +152,11 @@ inline bool DenseBase<Derived>::hasNaN() const
 template<typename Derived>
 inline bool DenseBase<Derived>::allFinite() const
 {
+#if EIGEN_COMP_MSVC || (defined __FAST_MATH__)
+  return derived().array().isFinite().all();
+#else
   return !((derived()-derived()).hasNaN());
+#endif
 }
     
 } // end namespace Eigen
diff --git a/Eigen/src/Core/CMakeLists.txt b/Eigen/src/Core/CMakeLists.txt
deleted file mode 100644
index 2346fc2bb..000000000
--- a/Eigen/src/Core/CMakeLists.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-FILE(GLOB Eigen_Core_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_Core_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core COMPONENT Devel
-  )
-
-ADD_SUBDIRECTORY(products)
-ADD_SUBDIRECTORY(util)
-ADD_SUBDIRECTORY(arch)
diff --git a/Eigen/src/Core/CommaInitializer.h b/Eigen/src/Core/CommaInitializer.h
index a036d8c3b..d218e9814 100644
--- a/Eigen/src/Core/CommaInitializer.h
+++ b/Eigen/src/Core/CommaInitializer.h
@@ -22,14 +22,14 @@ namespace Eigen {
   * the return type of MatrixBase::operator<<, and most of the time this is the only
   * way it is used.
   *
-  * \sa \ref MatrixBaseCommaInitRef "MatrixBase::operator<<", CommaInitializer::finished()
+  * \sa \blank \ref MatrixBaseCommaInitRef "MatrixBase::operator<<", CommaInitializer::finished()
   */
 template<typename XprType>
 struct CommaInitializer
 {
   typedef typename XprType::Scalar Scalar;
-  typedef typename XprType::Index Index;
 
+  EIGEN_DEVICE_FUNC
   inline CommaInitializer(XprType& xpr, const Scalar& s)
     : m_xpr(xpr), m_row(0), m_col(1), m_currentBlockRows(1)
   {
@@ -37,6 +37,7 @@ struct CommaInitializer
   }
 
   template<typename OtherDerived>
+  EIGEN_DEVICE_FUNC
   inline CommaInitializer(XprType& xpr, const DenseBase<OtherDerived>& other)
     : m_xpr(xpr), m_row(0), m_col(other.cols()), m_currentBlockRows(other.rows())
   {
@@ -46,6 +47,7 @@ struct CommaInitializer
   /* Copy/Move constructor which transfers ownership. This is crucial in 
    * absence of return value optimization to avoid assertions during destruction. */
   // FIXME in C++11 mode this could be replaced by a proper RValue constructor
+  EIGEN_DEVICE_FUNC
   inline CommaInitializer(const CommaInitializer& o)
   : m_xpr(o.m_xpr), m_row(o.m_row), m_col(o.m_col), m_currentBlockRows(o.m_currentBlockRows) {
     // Mark original object as finished. In absence of R-value references we need to const_cast:
@@ -55,6 +57,7 @@ struct CommaInitializer
   }
 
   /* inserts a scalar value in the target matrix */
+  EIGEN_DEVICE_FUNC
   CommaInitializer& operator,(const Scalar& s)
   {
     if (m_col==m_xpr.cols())
@@ -74,11 +77,10 @@ struct CommaInitializer
 
   /* inserts a matrix expression in the target matrix */
   template<typename OtherDerived>
+  EIGEN_DEVICE_FUNC
   CommaInitializer& operator,(const DenseBase<OtherDerived>& other)
   {
-    if(other.cols()==0 || other.rows()==0)
-      return *this;
-    if (m_col==m_xpr.cols())
+    if (m_col==m_xpr.cols() && (other.cols()!=0 || other.rows()!=m_currentBlockRows))
     {
       m_row+=m_currentBlockRows;
       m_col = 0;
@@ -86,24 +88,22 @@ struct CommaInitializer
       eigen_assert(m_row+m_currentBlockRows<=m_xpr.rows()
         && "Too many rows passed to comma initializer (operator<<)");
     }
-    eigen_assert(m_col<m_xpr.cols()
+    eigen_assert((m_col + other.cols() <= m_xpr.cols())
       && "Too many coefficients passed to comma initializer (operator<<)");
     eigen_assert(m_currentBlockRows==other.rows());
-    if (OtherDerived::SizeAtCompileTime != Dynamic)
-      m_xpr.template block<OtherDerived::RowsAtCompileTime != Dynamic ? OtherDerived::RowsAtCompileTime : 1,
-                              OtherDerived::ColsAtCompileTime != Dynamic ? OtherDerived::ColsAtCompileTime : 1>
-                    (m_row, m_col) = other;
-    else
-      m_xpr.block(m_row, m_col, other.rows(), other.cols()) = other;
+    m_xpr.template block<OtherDerived::RowsAtCompileTime, OtherDerived::ColsAtCompileTime>
+                    (m_row, m_col, other.rows(), other.cols()) = other;
     m_col += other.cols();
     return *this;
   }
 
+  EIGEN_DEVICE_FUNC
   inline ~CommaInitializer()
+#if defined VERIFY_RAISES_ASSERT && (!defined EIGEN_NO_ASSERTION_CHECKING) && defined EIGEN_EXCEPTIONS
+  EIGEN_EXCEPTION_SPEC(Eigen::eigen_assert_exception)
+#endif
   {
-    eigen_assert((m_row+m_currentBlockRows) == m_xpr.rows()
-         && m_col == m_xpr.cols()
-         && "Too few coefficients passed to comma initializer (operator<<)");
+      finished();
   }
 
   /** \returns the built matrix once all its coefficients have been set.
@@ -113,9 +113,15 @@ struct CommaInitializer
     * quaternion.fromRotationMatrix((Matrix3f() << axis0, axis1, axis2).finished());
     * \endcode
     */
-  inline XprType& finished() { return m_xpr; }
+  EIGEN_DEVICE_FUNC
+  inline XprType& finished() {
+      eigen_assert(((m_row+m_currentBlockRows) == m_xpr.rows() || m_xpr.cols() == 0)
+           && m_col == m_xpr.cols()
+           && "Too few coefficients passed to comma initializer (operator<<)");
+      return m_xpr;
+  }
 
-  XprType& m_xpr;   // target expression
+  XprType& m_xpr;           // target expression
   Index m_row;              // current row id
   Index m_col;              // current col id
   Index m_currentBlockRows; // current block height
diff --git a/Eigen/src/Core/ConditionEstimator.h b/Eigen/src/Core/ConditionEstimator.h
new file mode 100644
index 000000000..aa7efdc76
--- /dev/null
+++ b/Eigen/src/Core/ConditionEstimator.h
@@ -0,0 +1,175 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Rasmus Munk Larsen (rmlarsen@google.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CONDITIONESTIMATOR_H
+#define EIGEN_CONDITIONESTIMATOR_H
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename Vector, typename RealVector, bool IsComplex>
+struct rcond_compute_sign {
+  static inline Vector run(const Vector& v) {
+    const RealVector v_abs = v.cwiseAbs();
+    return (v_abs.array() == static_cast<typename Vector::RealScalar>(0))
+            .select(Vector::Ones(v.size()), v.cwiseQuotient(v_abs));
+  }
+};
+
+// Partial specialization to avoid elementwise division for real vectors.
+template <typename Vector>
+struct rcond_compute_sign<Vector, Vector, false> {
+  static inline Vector run(const Vector& v) {
+    return (v.array() < static_cast<typename Vector::RealScalar>(0))
+           .select(-Vector::Ones(v.size()), Vector::Ones(v.size()));
+  }
+};
+
+/**
+  * \returns an estimate of ||inv(matrix)||_1 given a decomposition of
+  * \a matrix that implements .solve() and .adjoint().solve() methods.
+  *
+  * This function implements Algorithms 4.1 and 5.1 from
+  *   http://www.maths.manchester.ac.uk/~higham/narep/narep135.pdf
+  * which also forms the basis for the condition number estimators in
+  * LAPACK. Since at most 10 calls to the solve method of dec are
+  * performed, the total cost is O(dims^2), as opposed to O(dims^3)
+  * needed to compute the inverse matrix explicitly.
+  *
+  * The most common usage is in estimating the condition number
+  * ||matrix||_1 * ||inv(matrix)||_1. The first term ||matrix||_1 can be
+  * computed directly in O(n^2) operations.
+  *
+  * Supports the following decompositions: FullPivLU, PartialPivLU, LDLT, and
+  * LLT.
+  *
+  * \sa FullPivLU, PartialPivLU, LDLT, LLT.
+  */
+template <typename Decomposition>
+typename Decomposition::RealScalar rcond_invmatrix_L1_norm_estimate(const Decomposition& dec)
+{
+  typedef typename Decomposition::MatrixType MatrixType;
+  typedef typename Decomposition::Scalar Scalar;
+  typedef typename Decomposition::RealScalar RealScalar;
+  typedef typename internal::plain_col_type<MatrixType>::type Vector;
+  typedef typename internal::plain_col_type<MatrixType, RealScalar>::type RealVector;
+  const bool is_complex = (NumTraits<Scalar>::IsComplex != 0);
+
+  eigen_assert(dec.rows() == dec.cols());
+  const Index n = dec.rows();
+  if (n == 0)
+    return 0;
+
+  // Disable Index to float conversion warning
+#ifdef __INTEL_COMPILER
+  #pragma warning push
+  #pragma warning ( disable : 2259 )
+#endif
+  Vector v = dec.solve(Vector::Ones(n) / Scalar(n));
+#ifdef __INTEL_COMPILER
+  #pragma warning pop
+#endif
+
+  // lower_bound is a lower bound on
+  //   ||inv(matrix)||_1  = sup_v ||inv(matrix) v||_1 / ||v||_1
+  // and is the objective maximized by the ("super-") gradient ascent
+  // algorithm below.
+  RealScalar lower_bound = v.template lpNorm<1>();
+  if (n == 1)
+    return lower_bound;
+
+  // Gradient ascent algorithm follows: We know that the optimum is achieved at
+  // one of the simplices v = e_i, so in each iteration we follow a
+  // super-gradient to move towards the optimal one.
+  RealScalar old_lower_bound = lower_bound;
+  Vector sign_vector(n);
+  Vector old_sign_vector;
+  Index v_max_abs_index = -1;
+  Index old_v_max_abs_index = v_max_abs_index;
+  for (int k = 0; k < 4; ++k)
+  {
+    sign_vector = internal::rcond_compute_sign<Vector, RealVector, is_complex>::run(v);
+    if (k > 0 && !is_complex && sign_vector == old_sign_vector) {
+      // Break if the solution stagnated.
+      break;
+    }
+    // v_max_abs_index = argmax |real( inv(matrix)^T * sign_vector )|
+    v = dec.adjoint().solve(sign_vector);
+    v.real().cwiseAbs().maxCoeff(&v_max_abs_index);
+    if (v_max_abs_index == old_v_max_abs_index) {
+      // Break if the solution stagnated.
+      break;
+    }
+    // Move to the new simplex e_j, where j = v_max_abs_index.
+    v = dec.solve(Vector::Unit(n, v_max_abs_index));  // v = inv(matrix) * e_j.
+    lower_bound = v.template lpNorm<1>();
+    if (lower_bound <= old_lower_bound) {
+      // Break if the gradient step did not increase the lower_bound.
+      break;
+    }
+    if (!is_complex) {
+      old_sign_vector = sign_vector;
+    }
+    old_v_max_abs_index = v_max_abs_index;
+    old_lower_bound = lower_bound;
+  }
+  // The following calculates an independent estimate of ||matrix||_1 by
+  // multiplying matrix by a vector with entries of slowly increasing
+  // magnitude and alternating sign:
+  //   v_i = (-1)^{i} (1 + (i / (dim-1))), i = 0,...,dim-1.
+  // This improvement to Hager's algorithm above is due to Higham. It was
+  // added to make the algorithm more robust in certain corner cases where
+  // large elements in the matrix might otherwise escape detection due to
+  // exact cancellation (especially when op and op_adjoint correspond to a
+  // sequence of backsubstitutions and permutations), which could cause
+  // Hager's algorithm to vastly underestimate ||matrix||_1.
+  Scalar alternating_sign(RealScalar(1));
+  for (Index i = 0; i < n; ++i) {
+    // The static_cast is needed when Scalar is a complex and RealScalar implements expression templates
+    v[i] = alternating_sign * static_cast<RealScalar>(RealScalar(1) + (RealScalar(i) / (RealScalar(n - 1))));
+    alternating_sign = -alternating_sign;
+  }
+  v = dec.solve(v);
+  const RealScalar alternate_lower_bound = (2 * v.template lpNorm<1>()) / (3 * RealScalar(n));
+  return numext::maxi(lower_bound, alternate_lower_bound);
+}
+
+/** \brief Reciprocal condition number estimator.
+  *
+  * Computing a decomposition of a dense matrix takes O(n^3) operations, while
+  * this method estimates the condition number quickly and reliably in O(n^2)
+  * operations.
+  *
+  * \returns an estimate of the reciprocal condition number
+  * (1 / (||matrix||_1 * ||inv(matrix)||_1)) of matrix, given ||matrix||_1 and
+  * its decomposition. Supports the following decompositions: FullPivLU,
+  * PartialPivLU, LDLT, and LLT.
+  *
+  * \sa FullPivLU, PartialPivLU, LDLT, LLT.
+  */
+template <typename Decomposition>
+typename Decomposition::RealScalar
+rcond_estimate_helper(typename Decomposition::RealScalar matrix_norm, const Decomposition& dec)
+{
+  typedef typename Decomposition::RealScalar RealScalar;
+  eigen_assert(dec.rows() == dec.cols());
+  if (dec.rows() == 0)              return RealScalar(1);
+  if (matrix_norm == RealScalar(0)) return RealScalar(0);
+  if (dec.rows() == 1)              return RealScalar(1);
+  const RealScalar inverse_matrix_norm = rcond_invmatrix_L1_norm_estimate(dec);
+  return (inverse_matrix_norm == RealScalar(0) ? RealScalar(0)
+                                               : (RealScalar(1) / inverse_matrix_norm) / matrix_norm);
+}
+
+}  // namespace internal
+
+}  // namespace Eigen
+
+#endif
diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h
new file mode 100644
index 000000000..f7c1effca
--- /dev/null
+++ b/Eigen/src/Core/CoreEvaluators.h
@@ -0,0 +1,1671 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2011-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2011-2012 Jitse Niesen <jitse@maths.leeds.ac.uk>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#ifndef EIGEN_COREEVALUATORS_H
+#define EIGEN_COREEVALUATORS_H
+
+namespace Eigen {
+  
+namespace internal {
+
+// This class returns the evaluator kind from the expression storage kind.
+// Default assumes index based accessors
+template<typename StorageKind>
+struct storage_kind_to_evaluator_kind {
+  typedef IndexBased Kind;
+};
+
+// This class returns the evaluator shape from the expression storage kind.
+// It can be Dense, Sparse, Triangular, Diagonal, SelfAdjoint, Band, etc.
+template<typename StorageKind> struct storage_kind_to_shape;
+
+template<> struct storage_kind_to_shape<Dense>                  { typedef DenseShape Shape;           };
+template<> struct storage_kind_to_shape<SolverStorage>          { typedef SolverShape Shape;           };
+template<> struct storage_kind_to_shape<PermutationStorage>     { typedef PermutationShape Shape;     };
+template<> struct storage_kind_to_shape<TranspositionsStorage>  { typedef TranspositionsShape Shape;  };
+
+// Evaluators have to be specialized with respect to various criteria such as:
+//  - storage/structure/shape
+//  - scalar type
+//  - etc.
+// Therefore, we need specialization of evaluator providing additional template arguments for each kind of evaluators.
+// We currently distinguish the following kind of evaluators:
+// - unary_evaluator    for expressions taking only one arguments (CwiseUnaryOp, CwiseUnaryView, Transpose, MatrixWrapper, ArrayWrapper, Reverse, Replicate)
+// - binary_evaluator   for expression taking two arguments (CwiseBinaryOp)
+// - ternary_evaluator   for expression taking three arguments (CwiseTernaryOp)
+// - product_evaluator  for linear algebra products (Product); special case of binary_evaluator because it requires additional tags for dispatching.
+// - mapbase_evaluator  for Map, Block, Ref
+// - block_evaluator    for Block (special dispatching to a mapbase_evaluator or unary_evaluator)
+
+template< typename T,
+          typename Arg1Kind   = typename evaluator_traits<typename T::Arg1>::Kind,
+          typename Arg2Kind   = typename evaluator_traits<typename T::Arg2>::Kind,
+          typename Arg3Kind   = typename evaluator_traits<typename T::Arg3>::Kind,
+          typename Arg1Scalar = typename traits<typename T::Arg1>::Scalar,
+          typename Arg2Scalar = typename traits<typename T::Arg2>::Scalar,
+          typename Arg3Scalar = typename traits<typename T::Arg3>::Scalar> struct ternary_evaluator;
+
+template< typename T,
+          typename LhsKind   = typename evaluator_traits<typename T::Lhs>::Kind,
+          typename RhsKind   = typename evaluator_traits<typename T::Rhs>::Kind,
+          typename LhsScalar = typename traits<typename T::Lhs>::Scalar,
+          typename RhsScalar = typename traits<typename T::Rhs>::Scalar> struct binary_evaluator;
+
+template< typename T,
+          typename Kind   = typename evaluator_traits<typename T::NestedExpression>::Kind,
+          typename Scalar = typename T::Scalar> struct unary_evaluator;
+          
+// evaluator_traits<T> contains traits for evaluator<T> 
+
+template<typename T>
+struct evaluator_traits_base
+{
+  // by default, get evaluator kind and shape from storage
+  typedef typename storage_kind_to_evaluator_kind<typename traits<T>::StorageKind>::Kind Kind;
+  typedef typename storage_kind_to_shape<typename traits<T>::StorageKind>::Shape Shape;
+};
+
+// Default evaluator traits
+template<typename T>
+struct evaluator_traits : public evaluator_traits_base<T>
+{
+};
+
+template<typename T, typename Shape = typename evaluator_traits<T>::Shape >
+struct evaluator_assume_aliasing {
+  static const bool value = false;
+};
+
+// By default, we assume a unary expression:
+template<typename T>
+struct evaluator : public unary_evaluator<T>
+{
+  typedef unary_evaluator<T> Base;
+  EIGEN_DEVICE_FUNC explicit evaluator(const T& xpr) : Base(xpr) {}
+};
+
+
+// TODO: Think about const-correctness
+template<typename T>
+struct evaluator<const T>
+  : evaluator<T>
+{
+  EIGEN_DEVICE_FUNC
+  explicit evaluator(const T& xpr) : evaluator<T>(xpr) {}
+};
+
+// ---------- base class for all evaluators ----------
+
+template<typename ExpressionType>
+struct evaluator_base : public noncopyable
+{
+  // TODO that's not very nice to have to propagate all these traits. They are currently only needed to handle outer,inner indices.
+  typedef traits<ExpressionType> ExpressionTraits;
+  
+  enum {
+    Alignment = 0
+  };
+};
+
+// -------------------- Matrix and Array --------------------
+//
+// evaluator<PlainObjectBase> is a common base class for the
+// Matrix and Array evaluators.
+// Here we directly specialize evaluator. This is not really a unary expression, and it is, by definition, dense,
+// so no need for more sophisticated dispatching.
+
+template<typename Derived>
+struct evaluator<PlainObjectBase<Derived> >
+  : evaluator_base<Derived>
+{
+  typedef PlainObjectBase<Derived> PlainObjectType;
+  typedef typename PlainObjectType::Scalar Scalar;
+  typedef typename PlainObjectType::CoeffReturnType CoeffReturnType;
+
+  enum {
+    IsRowMajor = PlainObjectType::IsRowMajor,
+    IsVectorAtCompileTime = PlainObjectType::IsVectorAtCompileTime,
+    RowsAtCompileTime = PlainObjectType::RowsAtCompileTime,
+    ColsAtCompileTime = PlainObjectType::ColsAtCompileTime,
+    
+    CoeffReadCost = NumTraits<Scalar>::ReadCost,
+    Flags = traits<Derived>::EvaluatorFlags,
+    Alignment = traits<Derived>::Alignment
+  };
+  
+  EIGEN_DEVICE_FUNC evaluator()
+    : m_data(0),
+      m_outerStride(IsVectorAtCompileTime  ? 0 
+                                           : int(IsRowMajor) ? ColsAtCompileTime 
+                                           : RowsAtCompileTime)
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+  
+  EIGEN_DEVICE_FUNC explicit evaluator(const PlainObjectType& m)
+    : m_data(m.data()), m_outerStride(IsVectorAtCompileTime ? 0 : m.outerStride()) 
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index row, Index col) const
+  {
+    if (IsRowMajor)
+      return m_data[row * m_outerStride.value() + col];
+    else
+      return m_data[row + col * m_outerStride.value()];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index index) const
+  {
+    return m_data[index];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& coeffRef(Index row, Index col)
+  {
+    if (IsRowMajor)
+      return const_cast<Scalar*>(m_data)[row * m_outerStride.value() + col];
+    else
+      return const_cast<Scalar*>(m_data)[row + col * m_outerStride.value()];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& coeffRef(Index index)
+  {
+    return const_cast<Scalar*>(m_data)[index];
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index row, Index col) const
+  {
+    if (IsRowMajor)
+      return ploadt<PacketType, LoadMode>(m_data + row * m_outerStride.value() + col);
+    else
+      return ploadt<PacketType, LoadMode>(m_data + row + col * m_outerStride.value());
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index index) const
+  {
+    return ploadt<PacketType, LoadMode>(m_data + index);
+  }
+
+  template<int StoreMode,typename PacketType>
+  EIGEN_STRONG_INLINE
+  void writePacket(Index row, Index col, const PacketType& x)
+  {
+    if (IsRowMajor)
+      return pstoret<Scalar, PacketType, StoreMode>
+	            (const_cast<Scalar*>(m_data) + row * m_outerStride.value() + col, x);
+    else
+      return pstoret<Scalar, PacketType, StoreMode>
+                    (const_cast<Scalar*>(m_data) + row + col * m_outerStride.value(), x);
+  }
+
+  template<int StoreMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  void writePacket(Index index, const PacketType& x)
+  {
+    return pstoret<Scalar, PacketType, StoreMode>(const_cast<Scalar*>(m_data) + index, x);
+  }
+
+protected:
+  const Scalar *m_data;
+
+  // We do not need to know the outer stride for vectors
+  variable_if_dynamic<Index, IsVectorAtCompileTime  ? 0 
+                                                    : int(IsRowMajor) ? ColsAtCompileTime 
+                                                    : RowsAtCompileTime> m_outerStride;
+};
+
+template<typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
+struct evaluator<Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
+  : evaluator<PlainObjectBase<Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols> > >
+{
+  typedef Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols> XprType;
+  
+  EIGEN_DEVICE_FUNC evaluator() {}
+
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& m)
+    : evaluator<PlainObjectBase<XprType> >(m) 
+  { }
+};
+
+template<typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
+struct evaluator<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
+  : evaluator<PlainObjectBase<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> > >
+{
+  typedef Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> XprType;
+
+  EIGEN_DEVICE_FUNC evaluator() {}
+  
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& m)
+    : evaluator<PlainObjectBase<XprType> >(m) 
+  { }
+};
+
+// -------------------- Transpose --------------------
+
+template<typename ArgType>
+struct unary_evaluator<Transpose<ArgType>, IndexBased>
+  : evaluator_base<Transpose<ArgType> >
+{
+  typedef Transpose<ArgType> XprType;
+  
+  enum {
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,    
+    Flags = evaluator<ArgType>::Flags ^ RowMajorBit,
+    Alignment = evaluator<ArgType>::Alignment
+  };
+
+  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& t) : m_argImpl(t.nestedExpression()) {}
+
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index row, Index col) const
+  {
+    return m_argImpl.coeff(col, row);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index index) const
+  {
+    return m_argImpl.coeff(index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& coeffRef(Index row, Index col)
+  {
+    return m_argImpl.coeffRef(col, row);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  typename XprType::Scalar& coeffRef(Index index)
+  {
+    return m_argImpl.coeffRef(index);
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index row, Index col) const
+  {
+    return m_argImpl.template packet<LoadMode,PacketType>(col, row);
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index index) const
+  {
+    return m_argImpl.template packet<LoadMode,PacketType>(index);
+  }
+
+  template<int StoreMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  void writePacket(Index row, Index col, const PacketType& x)
+  {
+    m_argImpl.template writePacket<StoreMode,PacketType>(col, row, x);
+  }
+
+  template<int StoreMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  void writePacket(Index index, const PacketType& x)
+  {
+    m_argImpl.template writePacket<StoreMode,PacketType>(index, x);
+  }
+
+protected:
+  evaluator<ArgType> m_argImpl;
+};
+
+// -------------------- CwiseNullaryOp --------------------
+// Like Matrix and Array, this is not really a unary expression, so we directly specialize evaluator.
+// Likewise, there is not need to more sophisticated dispatching here.
+
+template<typename Scalar,typename NullaryOp,
+         bool has_nullary = has_nullary_operator<NullaryOp>::value,
+         bool has_unary   = has_unary_operator<NullaryOp>::value,
+         bool has_binary  = has_binary_operator<NullaryOp>::value>
+struct nullary_wrapper
+{
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i, IndexType j) const { return op(i,j); }
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i) const { return op(i); }
+
+  template <typename T, typename IndexType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i, IndexType j) const { return op.template packetOp<T>(i,j); }
+  template <typename T, typename IndexType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i) const { return op.template packetOp<T>(i); }
+};
+
+template<typename Scalar,typename NullaryOp>
+struct nullary_wrapper<Scalar,NullaryOp,true,false,false>
+{
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType=0, IndexType=0) const { return op(); }
+  template <typename T, typename IndexType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType=0, IndexType=0) const { return op.template packetOp<T>(); }
+};
+
+template<typename Scalar,typename NullaryOp>
+struct nullary_wrapper<Scalar,NullaryOp,false,false,true>
+{
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i, IndexType j=0) const { return op(i,j); }
+  template <typename T, typename IndexType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i, IndexType j=0) const { return op.template packetOp<T>(i,j); }
+};
+
+// We need the following specialization for vector-only functors assigned to a runtime vector,
+// for instance, using linspace and assigning a RowVectorXd to a MatrixXd or even a row of a MatrixXd.
+// In this case, i==0 and j is used for the actual iteration.
+template<typename Scalar,typename NullaryOp>
+struct nullary_wrapper<Scalar,NullaryOp,false,true,false>
+{
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i, IndexType j) const {
+    eigen_assert(i==0 || j==0);
+    return op(i+j);
+  }
+  template <typename T, typename IndexType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i, IndexType j) const {
+    eigen_assert(i==0 || j==0);
+    return op.template packetOp<T>(i+j);
+  }
+
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i) const { return op(i); }
+  template <typename T, typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i) const { return op.template packetOp<T>(i); }
+};
+
+template<typename Scalar,typename NullaryOp>
+struct nullary_wrapper<Scalar,NullaryOp,false,false,false> {};
+
+#if 0 && EIGEN_COMP_MSVC>0
+// Disable this ugly workaround. This is now handled in traits<Ref>::match,
+// but this piece of code might still become handly if some other weird compilation
+// erros pop up again.
+
+// MSVC exhibits a weird compilation error when
+// compiling:
+//    Eigen::MatrixXf A = MatrixXf::Random(3,3);
+//    Ref<const MatrixXf> R = 2.f*A;
+// and that has_*ary_operator<scalar_constant_op<float>> have not been instantiated yet.
+// The "problem" is that evaluator<2.f*A> is instantiated by traits<Ref>::match<2.f*A>
+// and at that time has_*ary_operator<T> returns true regardless of T.
+// Then nullary_wrapper is badly instantiated as nullary_wrapper<.,.,true,true,true>.
+// The trick is thus to defer the proper instantiation of nullary_wrapper when coeff(),
+// and packet() are really instantiated as implemented below:
+
+// This is a simple wrapper around Index to enforce the re-instantiation of
+// has_*ary_operator when needed.
+template<typename T> struct nullary_wrapper_workaround_msvc {
+  nullary_wrapper_workaround_msvc(const T&);
+  operator T()const;
+};
+
+template<typename Scalar,typename NullaryOp>
+struct nullary_wrapper<Scalar,NullaryOp,true,true,true>
+{
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i, IndexType j) const {
+    return nullary_wrapper<Scalar,NullaryOp,
+    has_nullary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
+    has_unary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
+    has_binary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value>().operator()(op,i,j);
+  }
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i) const {
+    return nullary_wrapper<Scalar,NullaryOp,
+    has_nullary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
+    has_unary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
+    has_binary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value>().operator()(op,i);
+  }
+
+  template <typename T, typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i, IndexType j) const {
+    return nullary_wrapper<Scalar,NullaryOp,
+    has_nullary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
+    has_unary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
+    has_binary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value>().template packetOp<T>(op,i,j);
+  }
+  template <typename T, typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i) const {
+    return nullary_wrapper<Scalar,NullaryOp,
+    has_nullary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
+    has_unary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
+    has_binary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value>().template packetOp<T>(op,i);
+  }
+};
+#endif // MSVC workaround
+
+template<typename NullaryOp, typename PlainObjectType>
+struct evaluator<CwiseNullaryOp<NullaryOp,PlainObjectType> >
+  : evaluator_base<CwiseNullaryOp<NullaryOp,PlainObjectType> >
+{
+  typedef CwiseNullaryOp<NullaryOp,PlainObjectType> XprType;
+  typedef typename internal::remove_all<PlainObjectType>::type PlainObjectTypeCleaned;
+  
+  enum {
+    CoeffReadCost = internal::functor_traits<NullaryOp>::Cost,
+    
+    Flags = (evaluator<PlainObjectTypeCleaned>::Flags
+          &  (  HereditaryBits
+              | (functor_has_linear_access<NullaryOp>::ret  ? LinearAccessBit : 0)
+              | (functor_traits<NullaryOp>::PacketAccess    ? PacketAccessBit : 0)))
+          | (functor_traits<NullaryOp>::IsRepeatable ? 0 : EvalBeforeNestingBit),
+    Alignment = AlignedMax
+  };
+
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& n)
+    : m_functor(n.functor()), m_wrapper()
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(IndexType row, IndexType col) const
+  {
+    return m_wrapper(m_functor, row, col);
+  }
+
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(IndexType index) const
+  {
+    return m_wrapper(m_functor,index);
+  }
+
+  template<int LoadMode, typename PacketType, typename IndexType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(IndexType row, IndexType col) const
+  {
+    return m_wrapper.template packetOp<PacketType>(m_functor, row, col);
+  }
+
+  template<int LoadMode, typename PacketType, typename IndexType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(IndexType index) const
+  {
+    return m_wrapper.template packetOp<PacketType>(m_functor, index);
+  }
+
+protected:
+  const NullaryOp m_functor;
+  const internal::nullary_wrapper<CoeffReturnType,NullaryOp> m_wrapper;
+};
+
+// -------------------- CwiseUnaryOp --------------------
+
+template<typename UnaryOp, typename ArgType>
+struct unary_evaluator<CwiseUnaryOp<UnaryOp, ArgType>, IndexBased >
+  : evaluator_base<CwiseUnaryOp<UnaryOp, ArgType> >
+{
+  typedef CwiseUnaryOp<UnaryOp, ArgType> XprType;
+  
+  enum {
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost + functor_traits<UnaryOp>::Cost,
+    
+    Flags = evaluator<ArgType>::Flags
+          & (HereditaryBits | LinearAccessBit | (functor_traits<UnaryOp>::PacketAccess ? PacketAccessBit : 0)),
+    Alignment = evaluator<ArgType>::Alignment
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit unary_evaluator(const XprType& op)
+    : m_functor(op.functor()), 
+      m_argImpl(op.nestedExpression()) 
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<UnaryOp>::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index row, Index col) const
+  {
+    return m_functor(m_argImpl.coeff(row, col));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index index) const
+  {
+    return m_functor(m_argImpl.coeff(index));
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index row, Index col) const
+  {
+    return m_functor.packetOp(m_argImpl.template packet<LoadMode, PacketType>(row, col));
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index index) const
+  {
+    return m_functor.packetOp(m_argImpl.template packet<LoadMode, PacketType>(index));
+  }
+
+protected:
+  const UnaryOp m_functor;
+  evaluator<ArgType> m_argImpl;
+};
+
+// -------------------- CwiseTernaryOp --------------------
+
+// this is a ternary expression
+template<typename TernaryOp, typename Arg1, typename Arg2, typename Arg3>
+struct evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> >
+  : public ternary_evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> >
+{
+  typedef CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> XprType;
+  typedef ternary_evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> > Base;
+  
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {}
+};
+
+template<typename TernaryOp, typename Arg1, typename Arg2, typename Arg3>
+struct ternary_evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3>, IndexBased, IndexBased>
+  : evaluator_base<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> >
+{
+  typedef CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> XprType;
+  
+  enum {
+    CoeffReadCost = evaluator<Arg1>::CoeffReadCost + evaluator<Arg2>::CoeffReadCost + evaluator<Arg3>::CoeffReadCost + functor_traits<TernaryOp>::Cost,
+    
+    Arg1Flags = evaluator<Arg1>::Flags,
+    Arg2Flags = evaluator<Arg2>::Flags,
+    Arg3Flags = evaluator<Arg3>::Flags,
+    SameType = is_same<typename Arg1::Scalar,typename Arg2::Scalar>::value && is_same<typename Arg1::Scalar,typename Arg3::Scalar>::value,
+    StorageOrdersAgree = (int(Arg1Flags)&RowMajorBit)==(int(Arg2Flags)&RowMajorBit) && (int(Arg1Flags)&RowMajorBit)==(int(Arg3Flags)&RowMajorBit),
+    Flags0 = (int(Arg1Flags) | int(Arg2Flags) | int(Arg3Flags)) & (
+        HereditaryBits
+        | (int(Arg1Flags) & int(Arg2Flags) & int(Arg3Flags) &
+           ( (StorageOrdersAgree ? LinearAccessBit : 0)
+           | (functor_traits<TernaryOp>::PacketAccess && StorageOrdersAgree && SameType ? PacketAccessBit : 0)
+           )
+        )
+     ),
+    Flags = (Flags0 & ~RowMajorBit) | (Arg1Flags & RowMajorBit),
+    Alignment = EIGEN_PLAIN_ENUM_MIN(
+        EIGEN_PLAIN_ENUM_MIN(evaluator<Arg1>::Alignment, evaluator<Arg2>::Alignment),
+        evaluator<Arg3>::Alignment)
+  };
+
+  EIGEN_DEVICE_FUNC explicit ternary_evaluator(const XprType& xpr)
+    : m_functor(xpr.functor()),
+      m_arg1Impl(xpr.arg1()), 
+      m_arg2Impl(xpr.arg2()), 
+      m_arg3Impl(xpr.arg3())  
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<TernaryOp>::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index row, Index col) const
+  {
+    return m_functor(m_arg1Impl.coeff(row, col), m_arg2Impl.coeff(row, col), m_arg3Impl.coeff(row, col));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index index) const
+  {
+    return m_functor(m_arg1Impl.coeff(index), m_arg2Impl.coeff(index), m_arg3Impl.coeff(index));
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index row, Index col) const
+  {
+    return m_functor.packetOp(m_arg1Impl.template packet<LoadMode,PacketType>(row, col),
+                              m_arg2Impl.template packet<LoadMode,PacketType>(row, col),
+                              m_arg3Impl.template packet<LoadMode,PacketType>(row, col));
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index index) const
+  {
+    return m_functor.packetOp(m_arg1Impl.template packet<LoadMode,PacketType>(index),
+                              m_arg2Impl.template packet<LoadMode,PacketType>(index),
+                              m_arg3Impl.template packet<LoadMode,PacketType>(index));
+  }
+
+protected:
+  const TernaryOp m_functor;
+  evaluator<Arg1> m_arg1Impl;
+  evaluator<Arg2> m_arg2Impl;
+  evaluator<Arg3> m_arg3Impl;
+};
+
+// -------------------- CwiseBinaryOp --------------------
+
+// this is a binary expression
+template<typename BinaryOp, typename Lhs, typename Rhs>
+struct evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
+  : public binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
+{
+  typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
+  typedef binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs> > Base;
+  
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {}
+};
+
+template<typename BinaryOp, typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IndexBased, IndexBased>
+  : evaluator_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
+{
+  typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
+  
+  enum {
+    CoeffReadCost = evaluator<Lhs>::CoeffReadCost + evaluator<Rhs>::CoeffReadCost + functor_traits<BinaryOp>::Cost,
+    
+    LhsFlags = evaluator<Lhs>::Flags,
+    RhsFlags = evaluator<Rhs>::Flags,
+    SameType = is_same<typename Lhs::Scalar,typename Rhs::Scalar>::value,
+    StorageOrdersAgree = (int(LhsFlags)&RowMajorBit)==(int(RhsFlags)&RowMajorBit),
+    Flags0 = (int(LhsFlags) | int(RhsFlags)) & (
+        HereditaryBits
+      | (int(LhsFlags) & int(RhsFlags) &
+           ( (StorageOrdersAgree ? LinearAccessBit : 0)
+           | (functor_traits<BinaryOp>::PacketAccess && StorageOrdersAgree && SameType ? PacketAccessBit : 0)
+           )
+        )
+     ),
+    Flags = (Flags0 & ~RowMajorBit) | (LhsFlags & RowMajorBit),
+    Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator<Lhs>::Alignment,evaluator<Rhs>::Alignment)
+  };
+
+  EIGEN_DEVICE_FUNC explicit binary_evaluator(const XprType& xpr)
+    : m_functor(xpr.functor()),
+      m_lhsImpl(xpr.lhs()), 
+      m_rhsImpl(xpr.rhs())  
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<BinaryOp>::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index row, Index col) const
+  {
+    return m_functor(m_lhsImpl.coeff(row, col), m_rhsImpl.coeff(row, col));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index index) const
+  {
+    return m_functor(m_lhsImpl.coeff(index), m_rhsImpl.coeff(index));
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index row, Index col) const
+  {
+    return m_functor.packetOp(m_lhsImpl.template packet<LoadMode,PacketType>(row, col),
+                              m_rhsImpl.template packet<LoadMode,PacketType>(row, col));
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index index) const
+  {
+    return m_functor.packetOp(m_lhsImpl.template packet<LoadMode,PacketType>(index),
+                              m_rhsImpl.template packet<LoadMode,PacketType>(index));
+  }
+
+protected:
+  const BinaryOp m_functor;
+  evaluator<Lhs> m_lhsImpl;
+  evaluator<Rhs> m_rhsImpl;
+};
+
+// -------------------- CwiseUnaryView --------------------
+
+template<typename UnaryOp, typename ArgType>
+struct unary_evaluator<CwiseUnaryView<UnaryOp, ArgType>, IndexBased>
+  : evaluator_base<CwiseUnaryView<UnaryOp, ArgType> >
+{
+  typedef CwiseUnaryView<UnaryOp, ArgType> XprType;
+  
+  enum {
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost + functor_traits<UnaryOp>::Cost,
+    
+    Flags = (evaluator<ArgType>::Flags & (HereditaryBits | LinearAccessBit | DirectAccessBit)),
+    
+    Alignment = 0 // FIXME it is not very clear why alignment is necessarily lost...
+  };
+
+  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& op)
+    : m_unaryOp(op.functor()), 
+      m_argImpl(op.nestedExpression()) 
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<UnaryOp>::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index row, Index col) const
+  {
+    return m_unaryOp(m_argImpl.coeff(row, col));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index index) const
+  {
+    return m_unaryOp(m_argImpl.coeff(index));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& coeffRef(Index row, Index col)
+  {
+    return m_unaryOp(m_argImpl.coeffRef(row, col));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& coeffRef(Index index)
+  {
+    return m_unaryOp(m_argImpl.coeffRef(index));
+  }
+
+protected:
+  const UnaryOp m_unaryOp;
+  evaluator<ArgType> m_argImpl;
+};
+
+// -------------------- Map --------------------
+
+// FIXME perhaps the PlainObjectType could be provided by Derived::PlainObject ?
+// but that might complicate template specialization
+template<typename Derived, typename PlainObjectType>
+struct mapbase_evaluator;
+
+template<typename Derived, typename PlainObjectType>
+struct mapbase_evaluator : evaluator_base<Derived>
+{
+  typedef Derived  XprType;
+  typedef typename XprType::PointerType PointerType;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  
+  enum {
+    IsRowMajor = XprType::RowsAtCompileTime,
+    ColsAtCompileTime = XprType::ColsAtCompileTime,
+    CoeffReadCost = NumTraits<Scalar>::ReadCost
+  };
+
+  EIGEN_DEVICE_FUNC explicit mapbase_evaluator(const XprType& map)
+    : m_data(const_cast<PointerType>(map.data())),
+      m_innerStride(map.innerStride()),
+      m_outerStride(map.outerStride())
+  {
+    EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(evaluator<Derived>::Flags&PacketAccessBit, internal::inner_stride_at_compile_time<Derived>::ret==1),
+                        PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index row, Index col) const
+  {
+    return m_data[col * colStride() + row * rowStride()];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index index) const
+  {
+    return m_data[index * m_innerStride.value()];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& coeffRef(Index row, Index col)
+  {
+    return m_data[col * colStride() + row * rowStride()];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& coeffRef(Index index)
+  {
+    return m_data[index * m_innerStride.value()];
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index row, Index col) const
+  {
+    PointerType ptr = m_data + row * rowStride() + col * colStride();
+    return internal::ploadt<PacketType, LoadMode>(ptr);
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index index) const
+  {
+    return internal::ploadt<PacketType, LoadMode>(m_data + index * m_innerStride.value());
+  }
+
+  template<int StoreMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  void writePacket(Index row, Index col, const PacketType& x)
+  {
+    PointerType ptr = m_data + row * rowStride() + col * colStride();
+    return internal::pstoret<Scalar, PacketType, StoreMode>(ptr, x);
+  }
+
+  template<int StoreMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  void writePacket(Index index, const PacketType& x)
+  {
+    internal::pstoret<Scalar, PacketType, StoreMode>(m_data + index * m_innerStride.value(), x);
+  }
+protected:
+  EIGEN_DEVICE_FUNC
+  inline Index rowStride() const { return XprType::IsRowMajor ? m_outerStride.value() : m_innerStride.value(); }
+  EIGEN_DEVICE_FUNC
+  inline Index colStride() const { return XprType::IsRowMajor ? m_innerStride.value() : m_outerStride.value(); }
+
+  PointerType m_data;
+  const internal::variable_if_dynamic<Index, XprType::InnerStrideAtCompileTime> m_innerStride;
+  const internal::variable_if_dynamic<Index, XprType::OuterStrideAtCompileTime> m_outerStride;
+};
+
+template<typename PlainObjectType, int MapOptions, typename StrideType> 
+struct evaluator<Map<PlainObjectType, MapOptions, StrideType> >
+  : public mapbase_evaluator<Map<PlainObjectType, MapOptions, StrideType>, PlainObjectType>
+{
+  typedef Map<PlainObjectType, MapOptions, StrideType> XprType;
+  typedef typename XprType::Scalar Scalar;
+  // TODO: should check for smaller packet types once we can handle multi-sized packet types
+  typedef typename packet_traits<Scalar>::type PacketScalar;
+  
+  enum {
+    InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0
+                             ? int(PlainObjectType::InnerStrideAtCompileTime)
+                             : int(StrideType::InnerStrideAtCompileTime),
+    OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0
+                             ? int(PlainObjectType::OuterStrideAtCompileTime)
+                             : int(StrideType::OuterStrideAtCompileTime),
+    HasNoInnerStride = InnerStrideAtCompileTime == 1,
+    HasNoOuterStride = StrideType::OuterStrideAtCompileTime == 0,
+    HasNoStride = HasNoInnerStride && HasNoOuterStride,
+    IsDynamicSize = PlainObjectType::SizeAtCompileTime==Dynamic,
+    
+    PacketAccessMask = bool(HasNoInnerStride) ? ~int(0) : ~int(PacketAccessBit),
+    LinearAccessMask = bool(HasNoStride) || bool(PlainObjectType::IsVectorAtCompileTime) ? ~int(0) : ~int(LinearAccessBit),
+    Flags = int( evaluator<PlainObjectType>::Flags) & (LinearAccessMask&PacketAccessMask),
+    
+    Alignment = int(MapOptions)&int(AlignedMask)
+  };
+
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& map)
+    : mapbase_evaluator<XprType, PlainObjectType>(map) 
+  { }
+};
+
+// -------------------- Ref --------------------
+
+template<typename PlainObjectType, int RefOptions, typename StrideType> 
+struct evaluator<Ref<PlainObjectType, RefOptions, StrideType> >
+  : public mapbase_evaluator<Ref<PlainObjectType, RefOptions, StrideType>, PlainObjectType>
+{
+  typedef Ref<PlainObjectType, RefOptions, StrideType> XprType;
+  
+  enum {
+    Flags = evaluator<Map<PlainObjectType, RefOptions, StrideType> >::Flags,
+    Alignment = evaluator<Map<PlainObjectType, RefOptions, StrideType> >::Alignment
+  };
+
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& ref)
+    : mapbase_evaluator<XprType, PlainObjectType>(ref) 
+  { }
+};
+
+// -------------------- Block --------------------
+
+template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel,
+         bool HasDirectAccess = internal::has_direct_access<ArgType>::ret> struct block_evaluator;
+         
+template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel> 
+struct evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
+  : block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel>
+{
+  typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
+  typedef typename XprType::Scalar Scalar;
+  // TODO: should check for smaller packet types once we can handle multi-sized packet types
+  typedef typename packet_traits<Scalar>::type PacketScalar;
+  
+  enum {
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
+    
+    RowsAtCompileTime = traits<XprType>::RowsAtCompileTime,
+    ColsAtCompileTime = traits<XprType>::ColsAtCompileTime,
+    MaxRowsAtCompileTime = traits<XprType>::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = traits<XprType>::MaxColsAtCompileTime,
+    
+    ArgTypeIsRowMajor = (int(evaluator<ArgType>::Flags)&RowMajorBit) != 0,
+    IsRowMajor = (MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1) ? 1
+               : (MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1) ? 0
+               : ArgTypeIsRowMajor,
+    HasSameStorageOrderAsArgType = (IsRowMajor == ArgTypeIsRowMajor),
+    InnerSize = IsRowMajor ? int(ColsAtCompileTime) : int(RowsAtCompileTime),
+    InnerStrideAtCompileTime = HasSameStorageOrderAsArgType
+                             ? int(inner_stride_at_compile_time<ArgType>::ret)
+                             : int(outer_stride_at_compile_time<ArgType>::ret),
+    OuterStrideAtCompileTime = HasSameStorageOrderAsArgType
+                             ? int(outer_stride_at_compile_time<ArgType>::ret)
+                             : int(inner_stride_at_compile_time<ArgType>::ret),
+    MaskPacketAccessBit = (InnerStrideAtCompileTime == 1) ? PacketAccessBit : 0,
+    
+    FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (evaluator<ArgType>::Flags&LinearAccessBit))) ? LinearAccessBit : 0,    
+    FlagsRowMajorBit = XprType::Flags&RowMajorBit,
+    Flags0 = evaluator<ArgType>::Flags & ( (HereditaryBits & ~RowMajorBit) |
+                                           DirectAccessBit |
+                                           MaskPacketAccessBit),
+    Flags = Flags0 | FlagsLinearAccessBit | FlagsRowMajorBit,
+    
+    PacketAlignment = unpacket_traits<PacketScalar>::alignment,
+    Alignment0 = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % int(PacketAlignment)) == 0)) ? int(PacketAlignment) : 0,
+    Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator<ArgType>::Alignment, Alignment0)
+  };
+  typedef block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel> block_evaluator_type;
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& block) : block_evaluator_type(block)
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+};
+
+// no direct-access => dispatch to a unary evaluator
+template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
+struct block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel, /*HasDirectAccess*/ false>
+  : unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
+{
+  typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
+
+  EIGEN_DEVICE_FUNC explicit block_evaluator(const XprType& block)
+    : unary_evaluator<XprType>(block) 
+  {}
+};
+
+template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
+struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBased>
+  : evaluator_base<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
+{
+  typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
+
+  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& block)
+    : m_argImpl(block.nestedExpression()), 
+      m_startRow(block.startRow()), 
+      m_startCol(block.startCol()) 
+  { }
+ 
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  enum {
+    RowsAtCompileTime = XprType::RowsAtCompileTime
+  };
+ 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index row, Index col) const
+  { 
+    return m_argImpl.coeff(m_startRow.value() + row, m_startCol.value() + col); 
+  }
+  
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index index) const
+  { 
+    return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& coeffRef(Index row, Index col)
+  { 
+    return m_argImpl.coeffRef(m_startRow.value() + row, m_startCol.value() + col); 
+  }
+  
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& coeffRef(Index index)
+  { 
+    return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
+  }
+ 
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index row, Index col) const 
+  { 
+    return m_argImpl.template packet<LoadMode,PacketType>(m_startRow.value() + row, m_startCol.value() + col); 
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index index) const 
+  { 
+    return packet<LoadMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
+                                       RowsAtCompileTime == 1 ? index : 0);
+  }
+  
+  template<int StoreMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  void writePacket(Index row, Index col, const PacketType& x) 
+  {
+    return m_argImpl.template writePacket<StoreMode,PacketType>(m_startRow.value() + row, m_startCol.value() + col, x); 
+  }
+  
+  template<int StoreMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  void writePacket(Index index, const PacketType& x) 
+  {
+    return writePacket<StoreMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
+                                             RowsAtCompileTime == 1 ? index : 0,
+                                             x);
+  }
+ 
+protected:
+  evaluator<ArgType> m_argImpl;
+  const variable_if_dynamic<Index, (ArgType::RowsAtCompileTime == 1 && BlockRows==1) ? 0 : Dynamic> m_startRow;
+  const variable_if_dynamic<Index, (ArgType::ColsAtCompileTime == 1 && BlockCols==1) ? 0 : Dynamic> m_startCol;
+};
+
+// TODO: This evaluator does not actually use the child evaluator; 
+// all action is via the data() as returned by the Block expression.
+
+template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel> 
+struct block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel, /* HasDirectAccess */ true>
+  : mapbase_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>,
+                      typename Block<ArgType, BlockRows, BlockCols, InnerPanel>::PlainObject>
+{
+  typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
+  typedef typename XprType::Scalar Scalar;
+
+  EIGEN_DEVICE_FUNC explicit block_evaluator(const XprType& block)
+    : mapbase_evaluator<XprType, typename XprType::PlainObject>(block) 
+  {
+    // TODO: for the 3.3 release, this should be turned to an internal assertion, but let's keep it as is for the beta lifetime
+    eigen_assert(((internal::UIntPtr(block.data()) % EIGEN_PLAIN_ENUM_MAX(1,evaluator<XprType>::Alignment)) == 0) && "data is not aligned");
+  }
+};
+
+
+// -------------------- Select --------------------
+// NOTE shall we introduce a ternary_evaluator?
+
+// TODO enable vectorization for Select
+template<typename ConditionMatrixType, typename ThenMatrixType, typename ElseMatrixType>
+struct evaluator<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >
+  : evaluator_base<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >
+{
+  typedef Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> XprType;
+  enum {
+    CoeffReadCost = evaluator<ConditionMatrixType>::CoeffReadCost
+                  + EIGEN_PLAIN_ENUM_MAX(evaluator<ThenMatrixType>::CoeffReadCost,
+                                         evaluator<ElseMatrixType>::CoeffReadCost),
+
+    Flags = (unsigned int)evaluator<ThenMatrixType>::Flags & evaluator<ElseMatrixType>::Flags & HereditaryBits,
+    
+    Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator<ThenMatrixType>::Alignment, evaluator<ElseMatrixType>::Alignment)
+  };
+
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& select)
+    : m_conditionImpl(select.conditionMatrix()),
+      m_thenImpl(select.thenMatrix()),
+      m_elseImpl(select.elseMatrix())
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+ 
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index row, Index col) const
+  {
+    if (m_conditionImpl.coeff(row, col))
+      return m_thenImpl.coeff(row, col);
+    else
+      return m_elseImpl.coeff(row, col);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index index) const
+  {
+    if (m_conditionImpl.coeff(index))
+      return m_thenImpl.coeff(index);
+    else
+      return m_elseImpl.coeff(index);
+  }
+ 
+protected:
+  evaluator<ConditionMatrixType> m_conditionImpl;
+  evaluator<ThenMatrixType> m_thenImpl;
+  evaluator<ElseMatrixType> m_elseImpl;
+};
+
+
+// -------------------- Replicate --------------------
+
+template<typename ArgType, int RowFactor, int ColFactor> 
+struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor> >
+  : evaluator_base<Replicate<ArgType, RowFactor, ColFactor> >
+{
+  typedef Replicate<ArgType, RowFactor, ColFactor> XprType;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  enum {
+    Factor = (RowFactor==Dynamic || ColFactor==Dynamic) ? Dynamic : RowFactor*ColFactor
+  };
+  typedef typename internal::nested_eval<ArgType,Factor>::type ArgTypeNested;
+  typedef typename internal::remove_all<ArgTypeNested>::type ArgTypeNestedCleaned;
+  
+  enum {
+    CoeffReadCost = evaluator<ArgTypeNestedCleaned>::CoeffReadCost,
+    LinearAccessMask = XprType::IsVectorAtCompileTime ? LinearAccessBit : 0,
+    Flags = (evaluator<ArgTypeNestedCleaned>::Flags & (HereditaryBits|LinearAccessMask) & ~RowMajorBit) | (traits<XprType>::Flags & RowMajorBit),
+    
+    Alignment = evaluator<ArgTypeNestedCleaned>::Alignment
+  };
+
+  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& replicate)
+    : m_arg(replicate.nestedExpression()),
+      m_argImpl(m_arg),
+      m_rows(replicate.nestedExpression().rows()),
+      m_cols(replicate.nestedExpression().cols())
+  {}
+ 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index row, Index col) const
+  {
+    // try to avoid using modulo; this is a pure optimization strategy
+    const Index actual_row = internal::traits<XprType>::RowsAtCompileTime==1 ? 0
+                           : RowFactor==1 ? row
+                           : row % m_rows.value();
+    const Index actual_col = internal::traits<XprType>::ColsAtCompileTime==1 ? 0
+                           : ColFactor==1 ? col
+                           : col % m_cols.value();
+    
+    return m_argImpl.coeff(actual_row, actual_col);
+  }
+  
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index index) const
+  {
+    // try to avoid using modulo; this is a pure optimization strategy
+    const Index actual_index = internal::traits<XprType>::RowsAtCompileTime==1
+                                  ? (ColFactor==1 ?  index : index%m_cols.value())
+                                  : (RowFactor==1 ?  index : index%m_rows.value());
+    
+    return m_argImpl.coeff(actual_index);
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index row, Index col) const
+  {
+    const Index actual_row = internal::traits<XprType>::RowsAtCompileTime==1 ? 0
+                           : RowFactor==1 ? row
+                           : row % m_rows.value();
+    const Index actual_col = internal::traits<XprType>::ColsAtCompileTime==1 ? 0
+                           : ColFactor==1 ? col
+                           : col % m_cols.value();
+
+    return m_argImpl.template packet<LoadMode,PacketType>(actual_row, actual_col);
+  }
+  
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index index) const
+  {
+    const Index actual_index = internal::traits<XprType>::RowsAtCompileTime==1
+                                  ? (ColFactor==1 ?  index : index%m_cols.value())
+                                  : (RowFactor==1 ?  index : index%m_rows.value());
+
+    return m_argImpl.template packet<LoadMode,PacketType>(actual_index);
+  }
+ 
+protected:
+  const ArgTypeNested m_arg;
+  evaluator<ArgTypeNestedCleaned> m_argImpl;
+  const variable_if_dynamic<Index, ArgType::RowsAtCompileTime> m_rows;
+  const variable_if_dynamic<Index, ArgType::ColsAtCompileTime> m_cols;
+};
+
+
+// -------------------- PartialReduxExpr --------------------
+
+template< typename ArgType, typename MemberOp, int Direction>
+struct evaluator<PartialReduxExpr<ArgType, MemberOp, Direction> >
+  : evaluator_base<PartialReduxExpr<ArgType, MemberOp, Direction> >
+{
+  typedef PartialReduxExpr<ArgType, MemberOp, Direction> XprType;
+  typedef typename internal::nested_eval<ArgType,1>::type ArgTypeNested;
+  typedef typename internal::remove_all<ArgTypeNested>::type ArgTypeNestedCleaned;
+  typedef typename ArgType::Scalar InputScalar;
+  typedef typename XprType::Scalar Scalar;
+  enum {
+    TraversalSize = Direction==int(Vertical) ? int(ArgType::RowsAtCompileTime) :  int(ArgType::ColsAtCompileTime)
+  };
+  typedef typename MemberOp::template Cost<InputScalar,int(TraversalSize)> CostOpType;
+  enum {
+    CoeffReadCost = TraversalSize==Dynamic ? HugeCost
+                  : TraversalSize * evaluator<ArgType>::CoeffReadCost + int(CostOpType::value),
+    
+    Flags = (traits<XprType>::Flags&RowMajorBit) | (evaluator<ArgType>::Flags&(HereditaryBits&(~RowMajorBit))) | LinearAccessBit,
+    
+    Alignment = 0 // FIXME this will need to be improved once PartialReduxExpr is vectorized
+  };
+
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType xpr)
+    : m_arg(xpr.nestedExpression()), m_functor(xpr.functor())
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(TraversalSize==Dynamic ? HugeCost : int(CostOpType::value));
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const Scalar coeff(Index i, Index j) const
+  {
+    if (Direction==Vertical)
+      return m_functor(m_arg.col(j));
+    else
+      return m_functor(m_arg.row(i));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const Scalar coeff(Index index) const
+  {
+    if (Direction==Vertical)
+      return m_functor(m_arg.col(index));
+    else
+      return m_functor(m_arg.row(index));
+  }
+
+protected:
+  typename internal::add_const_on_value_type<ArgTypeNested>::type m_arg;
+  const MemberOp m_functor;
+};
+
+
+// -------------------- MatrixWrapper and ArrayWrapper --------------------
+//
+// evaluator_wrapper_base<T> is a common base class for the
+// MatrixWrapper and ArrayWrapper evaluators.
+
+template<typename XprType>
+struct evaluator_wrapper_base
+  : evaluator_base<XprType>
+{
+  typedef typename remove_all<typename XprType::NestedExpressionType>::type ArgType;
+  enum {
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
+    Flags = evaluator<ArgType>::Flags,
+    Alignment = evaluator<ArgType>::Alignment
+  };
+
+  EIGEN_DEVICE_FUNC explicit evaluator_wrapper_base(const ArgType& arg) : m_argImpl(arg) {}
+
+  typedef typename ArgType::Scalar Scalar;
+  typedef typename ArgType::CoeffReturnType CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index row, Index col) const
+  {
+    return m_argImpl.coeff(row, col);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index index) const
+  {
+    return m_argImpl.coeff(index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& coeffRef(Index row, Index col)
+  {
+    return m_argImpl.coeffRef(row, col);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& coeffRef(Index index)
+  {
+    return m_argImpl.coeffRef(index);
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index row, Index col) const
+  {
+    return m_argImpl.template packet<LoadMode,PacketType>(row, col);
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index index) const
+  {
+    return m_argImpl.template packet<LoadMode,PacketType>(index);
+  }
+
+  template<int StoreMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  void writePacket(Index row, Index col, const PacketType& x)
+  {
+    m_argImpl.template writePacket<StoreMode>(row, col, x);
+  }
+
+  template<int StoreMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  void writePacket(Index index, const PacketType& x)
+  {
+    m_argImpl.template writePacket<StoreMode>(index, x);
+  }
+
+protected:
+  evaluator<ArgType> m_argImpl;
+};
+
+template<typename TArgType>
+struct unary_evaluator<MatrixWrapper<TArgType> >
+  : evaluator_wrapper_base<MatrixWrapper<TArgType> >
+{
+  typedef MatrixWrapper<TArgType> XprType;
+
+  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& wrapper)
+    : evaluator_wrapper_base<MatrixWrapper<TArgType> >(wrapper.nestedExpression())
+  { }
+};
+
+template<typename TArgType>
+struct unary_evaluator<ArrayWrapper<TArgType> >
+  : evaluator_wrapper_base<ArrayWrapper<TArgType> >
+{
+  typedef ArrayWrapper<TArgType> XprType;
+
+  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& wrapper)
+    : evaluator_wrapper_base<ArrayWrapper<TArgType> >(wrapper.nestedExpression())
+  { }
+};
+
+
+// -------------------- Reverse --------------------
+
+// defined in Reverse.h:
+template<typename PacketType, bool ReversePacket> struct reverse_packet_cond;
+
+template<typename ArgType, int Direction>
+struct unary_evaluator<Reverse<ArgType, Direction> >
+  : evaluator_base<Reverse<ArgType, Direction> >
+{
+  typedef Reverse<ArgType, Direction> XprType;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  enum {
+    IsRowMajor = XprType::IsRowMajor,
+    IsColMajor = !IsRowMajor,
+    ReverseRow = (Direction == Vertical)   || (Direction == BothDirections),
+    ReverseCol = (Direction == Horizontal) || (Direction == BothDirections),
+    ReversePacket = (Direction == BothDirections)
+                    || ((Direction == Vertical)   && IsColMajor)
+                    || ((Direction == Horizontal) && IsRowMajor),
+                    
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
+    
+    // let's enable LinearAccess only with vectorization because of the product overhead
+    // FIXME enable DirectAccess with negative strides?
+    Flags0 = evaluator<ArgType>::Flags,
+    LinearAccess = ( (Direction==BothDirections) && (int(Flags0)&PacketAccessBit) )
+                  || ((ReverseRow && XprType::ColsAtCompileTime==1) || (ReverseCol && XprType::RowsAtCompileTime==1))
+                 ? LinearAccessBit : 0,
+
+    Flags = int(Flags0) & (HereditaryBits | PacketAccessBit | LinearAccess),
+    
+    Alignment = 0 // FIXME in some rare cases, Alignment could be preserved, like a Vector4f.
+  };
+
+  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& reverse)
+    : m_argImpl(reverse.nestedExpression()),
+      m_rows(ReverseRow ? reverse.nestedExpression().rows() : 1),
+      m_cols(ReverseCol ? reverse.nestedExpression().cols() : 1)
+  { }
+ 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index row, Index col) const
+  {
+    return m_argImpl.coeff(ReverseRow ? m_rows.value() - row - 1 : row,
+                           ReverseCol ? m_cols.value() - col - 1 : col);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index index) const
+  {
+    return m_argImpl.coeff(m_rows.value() * m_cols.value() - index - 1);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& coeffRef(Index row, Index col)
+  {
+    return m_argImpl.coeffRef(ReverseRow ? m_rows.value() - row - 1 : row,
+                              ReverseCol ? m_cols.value() - col - 1 : col);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& coeffRef(Index index)
+  {
+    return m_argImpl.coeffRef(m_rows.value() * m_cols.value() - index - 1);
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index row, Index col) const
+  {
+    enum {
+      PacketSize = unpacket_traits<PacketType>::size,
+      OffsetRow  = ReverseRow && IsColMajor ? PacketSize : 1,
+      OffsetCol  = ReverseCol && IsRowMajor ? PacketSize : 1
+    };
+    typedef internal::reverse_packet_cond<PacketType,ReversePacket> reverse_packet;
+    return reverse_packet::run(m_argImpl.template packet<LoadMode,PacketType>(
+                                  ReverseRow ? m_rows.value() - row - OffsetRow : row,
+                                  ReverseCol ? m_cols.value() - col - OffsetCol : col));
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index index) const
+  {
+    enum { PacketSize = unpacket_traits<PacketType>::size };
+    return preverse(m_argImpl.template packet<LoadMode,PacketType>(m_rows.value() * m_cols.value() - index - PacketSize));
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  void writePacket(Index row, Index col, const PacketType& x)
+  {
+    // FIXME we could factorize some code with packet(i,j)
+    enum {
+      PacketSize = unpacket_traits<PacketType>::size,
+      OffsetRow  = ReverseRow && IsColMajor ? PacketSize : 1,
+      OffsetCol  = ReverseCol && IsRowMajor ? PacketSize : 1
+    };
+    typedef internal::reverse_packet_cond<PacketType,ReversePacket> reverse_packet;
+    m_argImpl.template writePacket<LoadMode>(
+                                  ReverseRow ? m_rows.value() - row - OffsetRow : row,
+                                  ReverseCol ? m_cols.value() - col - OffsetCol : col,
+                                  reverse_packet::run(x));
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  void writePacket(Index index, const PacketType& x)
+  {
+    enum { PacketSize = unpacket_traits<PacketType>::size };
+    m_argImpl.template writePacket<LoadMode>
+      (m_rows.value() * m_cols.value() - index - PacketSize, preverse(x));
+  }
+ 
+protected:
+  evaluator<ArgType> m_argImpl;
+
+  // If we do not reverse rows, then we do not need to know the number of rows; same for columns
+  // Nonetheless, in this case it is important to set to 1 such that the coeff(index) method works fine for vectors.
+  const variable_if_dynamic<Index, ReverseRow ? ArgType::RowsAtCompileTime : 1> m_rows;
+  const variable_if_dynamic<Index, ReverseCol ? ArgType::ColsAtCompileTime : 1> m_cols;
+};
+
+
+// -------------------- Diagonal --------------------
+
+template<typename ArgType, int DiagIndex>
+struct evaluator<Diagonal<ArgType, DiagIndex> >
+  : evaluator_base<Diagonal<ArgType, DiagIndex> >
+{
+  typedef Diagonal<ArgType, DiagIndex> XprType;
+  
+  enum {
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
+    
+    Flags = (unsigned int)(evaluator<ArgType>::Flags & (HereditaryBits | DirectAccessBit) & ~RowMajorBit) | LinearAccessBit,
+    
+    Alignment = 0
+  };
+
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& diagonal)
+    : m_argImpl(diagonal.nestedExpression()),
+      m_index(diagonal.index())
+  { }
+ 
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index row, Index) const
+  {
+    return m_argImpl.coeff(row + rowOffset(), row + colOffset());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index index) const
+  {
+    return m_argImpl.coeff(index + rowOffset(), index + colOffset());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& coeffRef(Index row, Index)
+  {
+    return m_argImpl.coeffRef(row + rowOffset(), row + colOffset());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& coeffRef(Index index)
+  {
+    return m_argImpl.coeffRef(index + rowOffset(), index + colOffset());
+  }
+
+protected:
+  evaluator<ArgType> m_argImpl;
+  const internal::variable_if_dynamicindex<Index, XprType::DiagIndex> m_index;
+
+private:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowOffset() const { return m_index.value() > 0 ? 0 : -m_index.value(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colOffset() const { return m_index.value() > 0 ? m_index.value() : 0; }
+};
+
+
+//----------------------------------------------------------------------
+// deprecated code
+//----------------------------------------------------------------------
+
+// -------------------- EvalToTemp --------------------
+
+// expression class for evaluating nested expression to a temporary
+
+template<typename ArgType> class EvalToTemp;
+
+template<typename ArgType>
+struct traits<EvalToTemp<ArgType> >
+  : public traits<ArgType>
+{ };
+
+template<typename ArgType>
+class EvalToTemp
+  : public dense_xpr_base<EvalToTemp<ArgType> >::type
+{
+ public:
+ 
+  typedef typename dense_xpr_base<EvalToTemp>::type Base;
+  EIGEN_GENERIC_PUBLIC_INTERFACE(EvalToTemp)
+ 
+  explicit EvalToTemp(const ArgType& arg)
+    : m_arg(arg)
+  { }
+ 
+  const ArgType& arg() const
+  {
+    return m_arg;
+  }
+
+  Index rows() const 
+  {
+    return m_arg.rows();
+  }
+
+  Index cols() const 
+  {
+    return m_arg.cols();
+  }
+
+ private:
+  const ArgType& m_arg;
+};
+ 
+template<typename ArgType>
+struct evaluator<EvalToTemp<ArgType> >
+  : public evaluator<typename ArgType::PlainObject>
+{
+  typedef EvalToTemp<ArgType>                   XprType;
+  typedef typename ArgType::PlainObject         PlainObject;
+  typedef evaluator<PlainObject> Base;
+  
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
+    : m_result(xpr.arg())
+  {
+    ::new (static_cast<Base*>(this)) Base(m_result);
+  }
+
+  // This constructor is used when nesting an EvalTo evaluator in another evaluator
+  EIGEN_DEVICE_FUNC evaluator(const ArgType& arg)
+    : m_result(arg)
+  {
+    ::new (static_cast<Base*>(this)) Base(m_result);
+  }
+
+protected:
+  PlainObject m_result;
+};
+
+} // namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_COREEVALUATORS_H
diff --git a/Eigen/src/Core/CoreIterators.h b/Eigen/src/Core/CoreIterators.h
index 6da4683d2..4eb42b93a 100644
--- a/Eigen/src/Core/CoreIterators.h
+++ b/Eigen/src/Core/CoreIterators.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -15,47 +15,113 @@ namespace Eigen {
 /* This file contains the respective InnerIterator definition of the expressions defined in Eigen/Core
  */
 
-/** \ingroup SparseCore_Module
-  * \class InnerIterator
-  * \brief An InnerIterator allows to loop over the element of a sparse (or dense) matrix or expression
-  *
-  * todo
+namespace internal {
+
+template<typename XprType, typename EvaluatorKind>
+class inner_iterator_selector;
+
+}
+
+/** \class InnerIterator
+  * \brief An InnerIterator allows to loop over the element of any matrix expression.
+  * 
+  * \warning To be used with care because an evaluator is constructed every time an InnerIterator iterator is constructed.
+  * 
+  * TODO: add a usage example
   */
+template<typename XprType>
+class InnerIterator
+{
+protected:
+  typedef internal::inner_iterator_selector<XprType, typename internal::evaluator_traits<XprType>::Kind> IteratorType;
+  typedef internal::evaluator<XprType> EvaluatorType;
+  typedef typename internal::traits<XprType>::Scalar Scalar;
+public:
+  /** Construct an iterator over the \a outerId -th row or column of \a xpr */
+  InnerIterator(const XprType &xpr, const Index &outerId)
+    : m_eval(xpr), m_iter(m_eval, outerId, xpr.innerSize())
+  {}
+  
+  /// \returns the value of the current coefficient.
+  EIGEN_STRONG_INLINE Scalar value() const          { return m_iter.value(); }
+  /** Increment the iterator \c *this to the next non-zero coefficient.
+    * Explicit zeros are not skipped over. To skip explicit zeros, see class SparseView
+    */
+  EIGEN_STRONG_INLINE InnerIterator& operator++()   { m_iter.operator++(); return *this; }
+  /// \returns the column or row index of the current coefficient.
+  EIGEN_STRONG_INLINE Index index() const           { return m_iter.index(); }
+  /// \returns the row index of the current coefficient.
+  EIGEN_STRONG_INLINE Index row() const             { return m_iter.row(); }
+  /// \returns the column index of the current coefficient.
+  EIGEN_STRONG_INLINE Index col() const             { return m_iter.col(); }
+  /// \returns \c true if the iterator \c *this still references a valid coefficient.
+  EIGEN_STRONG_INLINE operator bool() const         { return m_iter; }
+  
+protected:
+  EvaluatorType m_eval;
+  IteratorType m_iter;
+private:
+  // If you get here, then you're not using the right InnerIterator type, e.g.:
+  //   SparseMatrix<double,RowMajor> A;
+  //   SparseMatrix<double>::InnerIterator it(A,0);
+  template<typename T> InnerIterator(const EigenBase<T>&,Index outer);
+};
+
+namespace internal {
 
-// generic version for dense matrix and expressions
-template<typename Derived> class DenseBase<Derived>::InnerIterator
+// Generic inner iterator implementation for dense objects
+template<typename XprType>
+class inner_iterator_selector<XprType, IndexBased>
 {
-  protected:
-    typedef typename Derived::Scalar Scalar;
-    typedef typename Derived::Index Index;
-
-    enum { IsRowMajor = (Derived::Flags&RowMajorBit)==RowMajorBit };
-  public:
-    EIGEN_STRONG_INLINE InnerIterator(const Derived& expr, Index outer)
-      : m_expression(expr), m_inner(0), m_outer(outer), m_end(expr.innerSize())
-    {}
-
-    EIGEN_STRONG_INLINE Scalar value() const
-    {
-      return (IsRowMajor) ? m_expression.coeff(m_outer, m_inner)
-                          : m_expression.coeff(m_inner, m_outer);
-    }
-
-    EIGEN_STRONG_INLINE InnerIterator& operator++() { m_inner++; return *this; }
-
-    EIGEN_STRONG_INLINE Index index() const { return m_inner; }
-    inline Index row() const { return IsRowMajor ? m_outer : index(); }
-    inline Index col() const { return IsRowMajor ? index() : m_outer; }
-
-    EIGEN_STRONG_INLINE operator bool() const { return m_inner < m_end && m_inner>=0; }
-
-  protected:
-    const Derived& m_expression;
-    Index m_inner;
-    const Index m_outer;
-    const Index m_end;
+protected:
+  typedef evaluator<XprType> EvaluatorType;
+  typedef typename traits<XprType>::Scalar Scalar;
+  enum { IsRowMajor = (XprType::Flags&RowMajorBit)==RowMajorBit };
+  
+public:
+  EIGEN_STRONG_INLINE inner_iterator_selector(const EvaluatorType &eval, const Index &outerId, const Index &innerSize)
+    : m_eval(eval), m_inner(0), m_outer(outerId), m_end(innerSize)
+  {}
+
+  EIGEN_STRONG_INLINE Scalar value() const
+  {
+    return (IsRowMajor) ? m_eval.coeff(m_outer, m_inner)
+                        : m_eval.coeff(m_inner, m_outer);
+  }
+
+  EIGEN_STRONG_INLINE inner_iterator_selector& operator++() { m_inner++; return *this; }
+
+  EIGEN_STRONG_INLINE Index index() const { return m_inner; }
+  inline Index row() const { return IsRowMajor ? m_outer : index(); }
+  inline Index col() const { return IsRowMajor ? index() : m_outer; }
+
+  EIGEN_STRONG_INLINE operator bool() const { return m_inner < m_end && m_inner>=0; }
+
+protected:
+  const EvaluatorType& m_eval;
+  Index m_inner;
+  const Index m_outer;
+  const Index m_end;
 };
 
+// For iterator-based evaluator, inner-iterator is already implemented as
+// evaluator<>::InnerIterator
+template<typename XprType>
+class inner_iterator_selector<XprType, IteratorBased>
+ : public evaluator<XprType>::InnerIterator
+{
+protected:
+  typedef typename evaluator<XprType>::InnerIterator Base;
+  typedef evaluator<XprType> EvaluatorType;
+  
+public:
+  EIGEN_STRONG_INLINE inner_iterator_selector(const EvaluatorType &eval, const Index &outerId, const Index &/*innerSize*/)
+    : Base(eval, outerId)
+  {}  
+};
+
+} // end namespace internal
+
 } // end namespace Eigen
 
 #endif // EIGEN_COREITERATORS_H
diff --git a/Eigen/src/Core/CwiseBinaryOp.h b/Eigen/src/Core/CwiseBinaryOp.h
index 586f77aaf..a36765e39 100644
--- a/Eigen/src/Core/CwiseBinaryOp.h
+++ b/Eigen/src/Core/CwiseBinaryOp.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -13,26 +13,6 @@
 
 namespace Eigen {
 
-/** \class CwiseBinaryOp
-  * \ingroup Core_Module
-  *
-  * \brief Generic expression where a coefficient-wise binary operator is applied to two expressions
-  *
-  * \param BinaryOp template functor implementing the operator
-  * \param Lhs the type of the left-hand side
-  * \param Rhs the type of the right-hand side
-  *
-  * This class represents an expression  where a coefficient-wise binary operator is applied to two expressions.
-  * It is the return type of binary operators, by which we mean only those binary operators where
-  * both the left-hand side and the right-hand side are Eigen expressions.
-  * For example, the return type of matrix1+matrix2 is a CwiseBinaryOp.
-  *
-  * Most of the time, this is the only way that it is used, so you typically don't have to name
-  * CwiseBinaryOp types explicitly.
-  *
-  * \sa MatrixBase::binaryExpr(const MatrixBase<OtherDerived> &,const CustomBinaryOp &) const, class CwiseUnaryOp, class CwiseNullaryOp
-  */
-
 namespace internal {
 template<typename BinaryOp, typename Lhs, typename Rhs>
 struct traits<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
@@ -52,76 +32,75 @@ struct traits<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
   // we still want to handle the case when the result type is different.
   typedef typename result_of<
                      BinaryOp(
-                       typename Lhs::Scalar,
-                       typename Rhs::Scalar
+                       const typename Lhs::Scalar&,
+                       const typename Rhs::Scalar&
                      )
                    >::type Scalar;
-  typedef typename promote_storage_type<typename traits<Lhs>::StorageKind,
-                                           typename traits<Rhs>::StorageKind>::ret StorageKind;
-  typedef typename promote_index_type<typename traits<Lhs>::Index,
-                                         typename traits<Rhs>::Index>::type Index;
+  typedef typename cwise_promote_storage_type<typename traits<Lhs>::StorageKind,
+                                              typename traits<Rhs>::StorageKind,
+                                              BinaryOp>::ret StorageKind;
+  typedef typename promote_index_type<typename traits<Lhs>::StorageIndex,
+                                      typename traits<Rhs>::StorageIndex>::type StorageIndex;
   typedef typename Lhs::Nested LhsNested;
   typedef typename Rhs::Nested RhsNested;
   typedef typename remove_reference<LhsNested>::type _LhsNested;
   typedef typename remove_reference<RhsNested>::type _RhsNested;
   enum {
-    LhsCoeffReadCost = _LhsNested::CoeffReadCost,
-    RhsCoeffReadCost = _RhsNested::CoeffReadCost,
-    LhsFlags = _LhsNested::Flags,
-    RhsFlags = _RhsNested::Flags,
-    SameType = is_same<typename _LhsNested::Scalar,typename _RhsNested::Scalar>::value,
-    StorageOrdersAgree = (int(Lhs::Flags)&RowMajorBit)==(int(Rhs::Flags)&RowMajorBit),
-    Flags0 = (int(LhsFlags) | int(RhsFlags)) & (
-        HereditaryBits
-      | (int(LhsFlags) & int(RhsFlags) &
-           ( AlignedBit
-           | (StorageOrdersAgree ? LinearAccessBit : 0)
-           | (functor_traits<BinaryOp>::PacketAccess && StorageOrdersAgree && SameType ? PacketAccessBit : 0)
-           )
-        )
-     ),
-    Flags = (Flags0 & ~RowMajorBit) | (LhsFlags & RowMajorBit),
-    CoeffReadCost = LhsCoeffReadCost + RhsCoeffReadCost + functor_traits<BinaryOp>::Cost
+    Flags = cwise_promote_storage_order<typename traits<Lhs>::StorageKind,typename traits<Rhs>::StorageKind,_LhsNested::Flags & RowMajorBit,_RhsNested::Flags & RowMajorBit>::value
   };
 };
 } // end namespace internal
 
-// we require Lhs and Rhs to have the same scalar type. Currently there is no example of a binary functor
-// that would take two operands of different types. If there were such an example, then this check should be
-// moved to the BinaryOp functors, on a per-case basis. This would however require a change in the BinaryOp functors, as
-// currently they take only one typename Scalar template parameter.
-// It is tempting to always allow mixing different types but remember that this is often impossible in the vectorized paths.
-// So allowing mixing different types gives very unexpected errors when enabling vectorization, when the user tries to
-// add together a float matrix and a double matrix.
-#define EIGEN_CHECK_BINARY_COMPATIBILIY(BINOP,LHS,RHS) \
-  EIGEN_STATIC_ASSERT((internal::functor_is_product_like<BINOP>::ret \
-                        ? int(internal::scalar_product_traits<LHS, RHS>::Defined) \
-                        : int(internal::is_same<LHS, RHS>::value)), \
-    YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
-
 template<typename BinaryOp, typename Lhs, typename Rhs, typename StorageKind>
 class CwiseBinaryOpImpl;
 
-template<typename BinaryOp, typename Lhs, typename Rhs>
-class CwiseBinaryOp : internal::no_assignment_operator,
+/** \class CwiseBinaryOp
+  * \ingroup Core_Module
+  *
+  * \brief Generic expression where a coefficient-wise binary operator is applied to two expressions
+  *
+  * \tparam BinaryOp template functor implementing the operator
+  * \tparam LhsType the type of the left-hand side
+  * \tparam RhsType the type of the right-hand side
+  *
+  * This class represents an expression  where a coefficient-wise binary operator is applied to two expressions.
+  * It is the return type of binary operators, by which we mean only those binary operators where
+  * both the left-hand side and the right-hand side are Eigen expressions.
+  * For example, the return type of matrix1+matrix2 is a CwiseBinaryOp.
+  *
+  * Most of the time, this is the only way that it is used, so you typically don't have to name
+  * CwiseBinaryOp types explicitly.
+  *
+  * \sa MatrixBase::binaryExpr(const MatrixBase<OtherDerived> &,const CustomBinaryOp &) const, class CwiseUnaryOp, class CwiseNullaryOp
+  */
+template<typename BinaryOp, typename LhsType, typename RhsType>
+class CwiseBinaryOp : 
   public CwiseBinaryOpImpl<
-          BinaryOp, Lhs, Rhs,
-          typename internal::promote_storage_type<typename internal::traits<Lhs>::StorageKind,
-                                           typename internal::traits<Rhs>::StorageKind>::ret>
+          BinaryOp, LhsType, RhsType,
+          typename internal::cwise_promote_storage_type<typename internal::traits<LhsType>::StorageKind,
+                                                        typename internal::traits<RhsType>::StorageKind,
+                                                        BinaryOp>::ret>,
+  internal::no_assignment_operator
 {
   public:
+    
+    typedef typename internal::remove_all<BinaryOp>::type Functor;
+    typedef typename internal::remove_all<LhsType>::type Lhs;
+    typedef typename internal::remove_all<RhsType>::type Rhs;
 
     typedef typename CwiseBinaryOpImpl<
-        BinaryOp, Lhs, Rhs,
-        typename internal::promote_storage_type<typename internal::traits<Lhs>::StorageKind,
-                                         typename internal::traits<Rhs>::StorageKind>::ret>::Base Base;
+        BinaryOp, LhsType, RhsType,
+        typename internal::cwise_promote_storage_type<typename internal::traits<LhsType>::StorageKind,
+                                                      typename internal::traits<Rhs>::StorageKind,
+                                                      BinaryOp>::ret>::Base Base;
     EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseBinaryOp)
 
-    typedef typename internal::nested<Lhs>::type LhsNested;
-    typedef typename internal::nested<Rhs>::type RhsNested;
+    typedef typename internal::ref_selector<LhsType>::type LhsNested;
+    typedef typename internal::ref_selector<RhsType>::type RhsNested;
     typedef typename internal::remove_reference<LhsNested>::type _LhsNested;
     typedef typename internal::remove_reference<RhsNested>::type _RhsNested;
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE CwiseBinaryOp(const Lhs& aLhs, const Rhs& aRhs, const BinaryOp& func = BinaryOp())
       : m_lhs(aLhs), m_rhs(aRhs), m_functor(func)
     {
@@ -131,6 +110,7 @@ class CwiseBinaryOp : internal::no_assignment_operator,
       eigen_assert(aLhs.rows() == aRhs.rows() && aLhs.cols() == aRhs.cols());
     }
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Index rows() const {
       // return the fixed size type if available to enable compile time optimizations
       if (internal::traits<typename internal::remove_all<LhsNested>::type>::RowsAtCompileTime==Dynamic)
@@ -138,6 +118,7 @@ class CwiseBinaryOp : internal::no_assignment_operator,
       else
         return m_lhs.rows();
     }
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Index cols() const {
       // return the fixed size type if available to enable compile time optimizations
       if (internal::traits<typename internal::remove_all<LhsNested>::type>::ColsAtCompileTime==Dynamic)
@@ -147,10 +128,13 @@ class CwiseBinaryOp : internal::no_assignment_operator,
     }
 
     /** \returns the left hand side nested expression */
+    EIGEN_DEVICE_FUNC
     const _LhsNested& lhs() const { return m_lhs; }
     /** \returns the right hand side nested expression */
+    EIGEN_DEVICE_FUNC
     const _RhsNested& rhs() const { return m_rhs; }
     /** \returns the functor representing the binary operation */
+    EIGEN_DEVICE_FUNC
     const BinaryOp& functor() const { return m_functor; }
 
   protected:
@@ -159,41 +143,13 @@ class CwiseBinaryOp : internal::no_assignment_operator,
     const BinaryOp m_functor;
 };
 
-template<typename BinaryOp, typename Lhs, typename Rhs>
-class CwiseBinaryOpImpl<BinaryOp, Lhs, Rhs, Dense>
-  : public internal::dense_xpr_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >::type
+// Generic API dispatcher
+template<typename BinaryOp, typename Lhs, typename Rhs, typename StorageKind>
+class CwiseBinaryOpImpl
+  : public internal::generic_xpr_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >::type
 {
-    typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> Derived;
-  public:
-
-    typedef typename internal::dense_xpr_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >::type Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE( Derived )
-
-    EIGEN_STRONG_INLINE const Scalar coeff(Index rowId, Index colId) const
-    {
-      return derived().functor()(derived().lhs().coeff(rowId, colId),
-                                 derived().rhs().coeff(rowId, colId));
-    }
-
-    template<int LoadMode>
-    EIGEN_STRONG_INLINE PacketScalar packet(Index rowId, Index colId) const
-    {
-      return derived().functor().packetOp(derived().lhs().template packet<LoadMode>(rowId, colId),
-                                          derived().rhs().template packet<LoadMode>(rowId, colId));
-    }
-
-    EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
-    {
-      return derived().functor()(derived().lhs().coeff(index),
-                                 derived().rhs().coeff(index));
-    }
-
-    template<int LoadMode>
-    EIGEN_STRONG_INLINE PacketScalar packet(Index index) const
-    {
-      return derived().functor().packetOp(derived().lhs().template packet<LoadMode>(index),
-                                          derived().rhs().template packet<LoadMode>(index));
-    }
+public:
+  typedef typename internal::generic_xpr_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >::type Base;
 };
 
 /** replaces \c *this by \c *this - \a other.
@@ -205,8 +161,7 @@ template<typename OtherDerived>
 EIGEN_STRONG_INLINE Derived &
 MatrixBase<Derived>::operator-=(const MatrixBase<OtherDerived> &other)
 {
-  SelfCwiseBinaryOp<internal::scalar_difference_op<Scalar>, Derived, OtherDerived> tmp(derived());
-  tmp = other.derived();
+  call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
   return derived();
 }
 
@@ -219,11 +174,11 @@ template<typename OtherDerived>
 EIGEN_STRONG_INLINE Derived &
 MatrixBase<Derived>::operator+=(const MatrixBase<OtherDerived>& other)
 {
-  SelfCwiseBinaryOp<internal::scalar_sum_op<Scalar>, Derived, OtherDerived> tmp(derived());
-  tmp = other.derived();
+  call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
   return derived();
 }
 
 } // end namespace Eigen
 
 #endif // EIGEN_CWISE_BINARY_OP_H
+
diff --git a/Eigen/src/Core/CwiseNullaryOp.h b/Eigen/src/Core/CwiseNullaryOp.h
index a93bab2d0..dd498f758 100644
--- a/Eigen/src/Core/CwiseNullaryOp.h
+++ b/Eigen/src/Core/CwiseNullaryOp.h
@@ -12,13 +12,24 @@
 
 namespace Eigen {
 
+namespace internal {
+template<typename NullaryOp, typename PlainObjectType>
+struct traits<CwiseNullaryOp<NullaryOp, PlainObjectType> > : traits<PlainObjectType>
+{
+  enum {
+    Flags = traits<PlainObjectType>::Flags & RowMajorBit
+  };
+};
+
+} // namespace internal
+
 /** \class CwiseNullaryOp
   * \ingroup Core_Module
   *
   * \brief Generic expression of a matrix where all coefficients are defined by a functor
   *
-  * \param NullaryOp template functor implementing the operator
-  * \param PlainObjectType the underlying plain matrix/array type
+  * \tparam NullaryOp template functor implementing the operator
+  * \tparam PlainObjectType the underlying plain matrix/array type
   *
   * This class represents an expression of a generic nullary operator.
   * It is the return type of the Ones(), Zero(), Constant(), Identity() and Random() methods,
@@ -27,68 +38,49 @@ namespace Eigen {
   * However, if you want to write a function returning such an expression, you
   * will need to use this class.
   *
-  * \sa class CwiseUnaryOp, class CwiseBinaryOp, DenseBase::NullaryExpr()
+  * The functor NullaryOp must expose one of the following method:
+    <table class="manual">
+    <tr            ><td>\c operator()() </td><td>if the procedural generation does not depend on the coefficient entries (e.g., random numbers)</td></tr>
+    <tr class="alt"><td>\c operator()(Index i)</td><td>if the procedural generation makes sense for vectors only and that it depends on the coefficient index \c i (e.g., linspace) </td></tr>
+    <tr            ><td>\c operator()(Index i,Index j)</td><td>if the procedural generation depends on the matrix coordinates \c i, \c j (e.g., to generate a checkerboard with 0 and 1)</td></tr>
+    </table>
+  * It is also possible to expose the last two operators if the generation makes sense for matrices but can be optimized for vectors.
+  *
+  * See DenseBase::NullaryExpr(Index,const CustomNullaryOp&) for an example binding
+  * C++11 random number generators.
+  *
+  * A nullary expression can also be used to implement custom sophisticated matrix manipulations
+  * that cannot be covered by the existing set of natively supported matrix manipulations.
+  * See this \ref TopicCustomizing_NullaryExpr "page" for some examples and additional explanations
+  * on the behavior of CwiseNullaryOp.
+  *
+  * \sa class CwiseUnaryOp, class CwiseBinaryOp, DenseBase::NullaryExpr
   */
-
-namespace internal {
 template<typename NullaryOp, typename PlainObjectType>
-struct traits<CwiseNullaryOp<NullaryOp, PlainObjectType> > : traits<PlainObjectType>
-{
-  enum {
-    Flags = (traits<PlainObjectType>::Flags
-      & (  HereditaryBits
-         | (functor_has_linear_access<NullaryOp>::ret ? LinearAccessBit : 0)
-         | (functor_traits<NullaryOp>::PacketAccess ? PacketAccessBit : 0)))
-      | (functor_traits<NullaryOp>::IsRepeatable ? 0 : EvalBeforeNestingBit),
-    CoeffReadCost = functor_traits<NullaryOp>::Cost
-  };
-};
-}
-
-template<typename NullaryOp, typename PlainObjectType>
-class CwiseNullaryOp : internal::no_assignment_operator,
-  public internal::dense_xpr_base< CwiseNullaryOp<NullaryOp, PlainObjectType> >::type
+class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp<NullaryOp, PlainObjectType> >::type, internal::no_assignment_operator
 {
   public:
 
     typedef typename internal::dense_xpr_base<CwiseNullaryOp>::type Base;
     EIGEN_DENSE_PUBLIC_INTERFACE(CwiseNullaryOp)
 
-    CwiseNullaryOp(Index nbRows, Index nbCols, const NullaryOp& func = NullaryOp())
-      : m_rows(nbRows), m_cols(nbCols), m_functor(func)
+    EIGEN_DEVICE_FUNC
+    CwiseNullaryOp(Index rows, Index cols, const NullaryOp& func = NullaryOp())
+      : m_rows(rows), m_cols(cols), m_functor(func)
     {
-      eigen_assert(nbRows >= 0
-            && (RowsAtCompileTime == Dynamic || RowsAtCompileTime == nbRows)
-            &&  nbCols >= 0
-            && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == nbCols));
+      eigen_assert(rows >= 0
+            && (RowsAtCompileTime == Dynamic || RowsAtCompileTime == rows)
+            &&  cols >= 0
+            && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == cols));
     }
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Index rows() const { return m_rows.value(); }
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Index cols() const { return m_cols.value(); }
 
-    EIGEN_STRONG_INLINE const Scalar coeff(Index rowId, Index colId) const
-    {
-      return m_functor(rowId, colId);
-    }
-
-    template<int LoadMode>
-    EIGEN_STRONG_INLINE PacketScalar packet(Index rowId, Index colId) const
-    {
-      return m_functor.packetOp(rowId, colId);
-    }
-
-    EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
-    {
-      return m_functor(index);
-    }
-
-    template<int LoadMode>
-    EIGEN_STRONG_INLINE PacketScalar packet(Index index) const
-    {
-      return m_functor.packetOp(index);
-    }
-
     /** \returns the functor representing the nullary operation */
+    EIGEN_DEVICE_FUNC
     const NullaryOp& functor() const { return m_functor; }
 
   protected:
@@ -113,10 +105,10 @@ class CwiseNullaryOp : internal::no_assignment_operator,
   */
 template<typename Derived>
 template<typename CustomNullaryOp>
-EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, Derived>
+EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
 DenseBase<Derived>::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& func)
 {
-  return CwiseNullaryOp<CustomNullaryOp, Derived>(rows, cols, func);
+  return CwiseNullaryOp<CustomNullaryOp, PlainObject>(rows, cols, func);
 }
 
 /** \returns an expression of a matrix defined by a custom functor \a func
@@ -132,16 +124,19 @@ DenseBase<Derived>::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& f
   *
   * The template parameter \a CustomNullaryOp is the type of the functor.
   *
+  * Here is an example with C++11 random generators: \include random_cpp11.cpp
+  * Output: \verbinclude random_cpp11.out
+  * 
   * \sa class CwiseNullaryOp
   */
 template<typename Derived>
 template<typename CustomNullaryOp>
-EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, Derived>
+EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
 DenseBase<Derived>::NullaryExpr(Index size, const CustomNullaryOp& func)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  if(RowsAtCompileTime == 1) return CwiseNullaryOp<CustomNullaryOp, Derived>(1, size, func);
-  else return CwiseNullaryOp<CustomNullaryOp, Derived>(size, 1, func);
+  if(RowsAtCompileTime == 1) return CwiseNullaryOp<CustomNullaryOp, PlainObject>(1, size, func);
+  else return CwiseNullaryOp<CustomNullaryOp, PlainObject>(size, 1, func);
 }
 
 /** \returns an expression of a matrix defined by a custom functor \a func
@@ -155,19 +150,19 @@ DenseBase<Derived>::NullaryExpr(Index size, const CustomNullaryOp& func)
   */
 template<typename Derived>
 template<typename CustomNullaryOp>
-EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, Derived>
+EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
 DenseBase<Derived>::NullaryExpr(const CustomNullaryOp& func)
 {
-  return CwiseNullaryOp<CustomNullaryOp, Derived>(RowsAtCompileTime, ColsAtCompileTime, func);
+  return CwiseNullaryOp<CustomNullaryOp, PlainObject>(RowsAtCompileTime, ColsAtCompileTime, func);
 }
 
 /** \returns an expression of a constant matrix of value \a value
   *
-  * The parameters \a nbRows and \a nbCols are the number of rows and of columns of
+  * The parameters \a rows and \a cols are the number of rows and of columns of
   * the returned matrix. Must be compatible with this DenseBase type.
   *
   * This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
-  * it is redundant to pass \a nbRows and \a nbCols as arguments, so Zero() should be used
+  * it is redundant to pass \a rows and \a cols as arguments, so Zero() should be used
   * instead.
   *
   * The template parameter \a CustomNullaryOp is the type of the functor.
@@ -176,9 +171,9 @@ DenseBase<Derived>::NullaryExpr(const CustomNullaryOp& func)
   */
 template<typename Derived>
 EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
-DenseBase<Derived>::Constant(Index nbRows, Index nbCols, const Scalar& value)
+DenseBase<Derived>::Constant(Index rows, Index cols, const Scalar& value)
 {
-  return DenseBase<Derived>::NullaryExpr(nbRows, nbCols, internal::scalar_constant_op<Scalar>(value));
+  return DenseBase<Derived>::NullaryExpr(rows, cols, internal::scalar_constant_op<Scalar>(value));
 }
 
 /** \returns an expression of a constant matrix of value \a value
@@ -220,46 +215,33 @@ DenseBase<Derived>::Constant(const Scalar& value)
   return DenseBase<Derived>::NullaryExpr(RowsAtCompileTime, ColsAtCompileTime, internal::scalar_constant_op<Scalar>(value));
 }
 
-/**
-  * \brief Sets a linearly space vector.
-  *
-  * The function generates 'size' equally spaced values in the closed interval [low,high].
-  * This particular version of LinSpaced() uses sequential access, i.e. vector access is
-  * assumed to be a(0), a(1), ..., a(size). This assumption allows for better vectorization
-  * and yields faster code than the random access version.
+/** \deprecated because of accuracy loss. In Eigen 3.3, it is an alias for LinSpaced(Index,const Scalar&,const Scalar&)
   *
-  * When size is set to 1, a vector of length 1 containing 'high' is returned.
-  *
-  * \only_for_vectors
-  *
-  * Example: \include DenseBase_LinSpaced_seq.cpp
-  * Output: \verbinclude DenseBase_LinSpaced_seq.out
-  *
-  * \sa setLinSpaced(Index,const Scalar&,const Scalar&), LinSpaced(Index,Scalar,Scalar), CwiseNullaryOp
+  * \sa LinSpaced(Index,Scalar,Scalar), setLinSpaced(Index,const Scalar&,const Scalar&)
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::SequentialLinSpacedReturnType
+EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
 DenseBase<Derived>::LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar,false>(low,high,size));
+  return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar,PacketScalar>(low,high,size));
 }
 
-/**
-  * \copydoc DenseBase::LinSpaced(Sequential_t, Index, const Scalar&, const Scalar&)
-  * Special version for fixed size types which does not require the size parameter.
+/** \deprecated because of accuracy loss. In Eigen 3.3, it is an alias for LinSpaced(const Scalar&,const Scalar&)
+  *
+  * \sa LinSpaced(Scalar,Scalar)
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::SequentialLinSpacedReturnType
+EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
 DenseBase<Derived>::LinSpaced(Sequential_t, const Scalar& low, const Scalar& high)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
-  return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op<Scalar,false>(low,high,Derived::SizeAtCompileTime));
+  return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op<Scalar,PacketScalar>(low,high,Derived::SizeAtCompileTime));
 }
 
 /**
-  * \brief Sets a linearly space vector.
+  * \brief Sets a linearly spaced vector.
   *
   * The function generates 'size' equally spaced values in the closed interval [low,high].
   * When size is set to 1, a vector of length 1 containing 'high' is returned.
@@ -269,14 +251,24 @@ DenseBase<Derived>::LinSpaced(Sequential_t, const Scalar& low, const Scalar& hig
   * Example: \include DenseBase_LinSpaced.cpp
   * Output: \verbinclude DenseBase_LinSpaced.out
   *
-  * \sa setLinSpaced(Index,const Scalar&,const Scalar&), LinSpaced(Sequential_t,Index,const Scalar&,const Scalar&,Index), CwiseNullaryOp
+  * For integer scalar types, an even spacing is possible if and only if the length of the range,
+  * i.e., \c high-low is a scalar multiple of \c size-1, or if \c size is a scalar multiple of the
+  * number of values \c high-low+1 (meaning each value can be repeated the same number of time).
+  * If one of these two considions is not satisfied, then \c high is lowered to the largest value
+  * satisfying one of this constraint.
+  * Here are some examples:
+  *
+  * Example: \include DenseBase_LinSpacedInt.cpp
+  * Output: \verbinclude DenseBase_LinSpacedInt.out
+  *
+  * \sa setLinSpaced(Index,const Scalar&,const Scalar&), CwiseNullaryOp
   */
 template<typename Derived>
 EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
 DenseBase<Derived>::LinSpaced(Index size, const Scalar& low, const Scalar& high)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar,true>(low,high,size));
+  return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar,PacketScalar>(low,high,size));
 }
 
 /**
@@ -289,7 +281,7 @@ DenseBase<Derived>::LinSpaced(const Scalar& low, const Scalar& high)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
-  return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op<Scalar,true>(low,high,Derived::SizeAtCompileTime));
+  return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op<Scalar,PacketScalar>(low,high,Derived::SizeAtCompileTime));
 }
 
 /** \returns true if all coefficients in this matrix are approximately equal to \a val, to within precision \a prec */
@@ -297,9 +289,10 @@ template<typename Derived>
 bool DenseBase<Derived>::isApproxToConstant
 (const Scalar& val, const RealScalar& prec) const
 {
+  typename internal::nested_eval<Derived,1>::type self(derived());
   for(Index j = 0; j < cols(); ++j)
     for(Index i = 0; i < rows(); ++i)
-      if(!internal::isApprox(this->coeff(i, j), val, prec))
+      if(!internal::isApprox(self.coeff(i, j), val, prec))
         return false;
   return true;
 }
@@ -324,7 +317,7 @@ EIGEN_STRONG_INLINE void DenseBase<Derived>::fill(const Scalar& val)
   setConstant(val);
 }
 
-/** Sets all coefficients in this expression to \a value.
+/** Sets all coefficients in this expression to value \a val.
   *
   * \sa fill(), setConstant(Index,const Scalar&), setConstant(Index,Index,const Scalar&), setZero(), setOnes(), Constant(), class CwiseNullaryOp, setZero(), setOnes()
   */
@@ -334,7 +327,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setConstant(const Scalar& val)
   return derived() = Constant(rows(), cols(), val);
 }
 
-/** Resizes to the given \a size, and sets all coefficients in this expression to the given \a value.
+/** Resizes to the given \a size, and sets all coefficients in this expression to the given value \a val.
   *
   * \only_for_vectors
   *
@@ -351,10 +344,10 @@ PlainObjectBase<Derived>::setConstant(Index size, const Scalar& val)
   return setConstant(val);
 }
 
-/** Resizes to the given size, and sets all coefficients in this expression to the given \a value.
+/** Resizes to the given size, and sets all coefficients in this expression to the given value \a val.
   *
-  * \param nbRows the new number of rows
-  * \param nbCols the new number of columns
+  * \param rows the new number of rows
+  * \param cols the new number of columns
   * \param val the value to which all coefficients are set
   *
   * Example: \include Matrix_setConstant_int_int.cpp
@@ -364,14 +357,14 @@ PlainObjectBase<Derived>::setConstant(Index size, const Scalar& val)
   */
 template<typename Derived>
 EIGEN_STRONG_INLINE Derived&
-PlainObjectBase<Derived>::setConstant(Index nbRows, Index nbCols, const Scalar& val)
+PlainObjectBase<Derived>::setConstant(Index rows, Index cols, const Scalar& val)
 {
-  resize(nbRows, nbCols);
+  resize(rows, cols);
   return setConstant(val);
 }
 
 /**
-  * \brief Sets a linearly space vector.
+  * \brief Sets a linearly spaced vector.
   *
   * The function generates 'size' equally spaced values in the closed interval [low,high].
   * When size is set to 1, a vector of length 1 containing 'high' is returned.
@@ -381,24 +374,30 @@ PlainObjectBase<Derived>::setConstant(Index nbRows, Index nbCols, const Scalar&
   * Example: \include DenseBase_setLinSpaced.cpp
   * Output: \verbinclude DenseBase_setLinSpaced.out
   *
-  * \sa CwiseNullaryOp
+  * For integer scalar types, do not miss the explanations on the definition
+  * of \link LinSpaced(Index,const Scalar&,const Scalar&) even spacing \endlink.
+  *
+  * \sa LinSpaced(Index,const Scalar&,const Scalar&), CwiseNullaryOp
   */
 template<typename Derived>
 EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op<Scalar,false>(low,high,newSize));
+  return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op<Scalar,PacketScalar>(low,high,newSize));
 }
 
 /**
-  * \brief Sets a linearly space vector.
+  * \brief Sets a linearly spaced vector.
   *
-  * The function fill *this with equally spaced values in the closed interval [low,high].
+  * The function fills \c *this with equally spaced values in the closed interval [low,high].
   * When size is set to 1, a vector of length 1 containing 'high' is returned.
   *
   * \only_for_vectors
   *
-  * \sa setLinSpaced(Index, const Scalar&, const Scalar&), CwiseNullaryOp
+  * For integer scalar types, do not miss the explanations on the definition
+  * of \link LinSpaced(Index,const Scalar&,const Scalar&) even spacing \endlink.
+  *
+  * \sa LinSpaced(Index,const Scalar&,const Scalar&), setLinSpaced(Index, const Scalar&, const Scalar&), CwiseNullaryOp
   */
 template<typename Derived>
 EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(const Scalar& low, const Scalar& high)
@@ -425,9 +424,9 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(const Scalar& low,
   */
 template<typename Derived>
 EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
-DenseBase<Derived>::Zero(Index nbRows, Index nbCols)
+DenseBase<Derived>::Zero(Index rows, Index cols)
 {
-  return Constant(nbRows, nbCols, Scalar(0));
+  return Constant(rows, cols, Scalar(0));
 }
 
 /** \returns an expression of a zero vector.
@@ -481,9 +480,10 @@ DenseBase<Derived>::Zero()
 template<typename Derived>
 bool DenseBase<Derived>::isZero(const RealScalar& prec) const
 {
+  typename internal::nested_eval<Derived,1>::type self(derived());
   for(Index j = 0; j < cols(); ++j)
     for(Index i = 0; i < rows(); ++i)
-      if(!internal::isMuchSmallerThan(this->coeff(i, j), static_cast<Scalar>(1), prec))
+      if(!internal::isMuchSmallerThan(self.coeff(i, j), static_cast<Scalar>(1), prec))
         return false;
   return true;
 }
@@ -520,8 +520,8 @@ PlainObjectBase<Derived>::setZero(Index newSize)
 
 /** Resizes to the given size, and sets all coefficients in this expression to zero.
   *
-  * \param nbRows the new number of rows
-  * \param nbCols the new number of columns
+  * \param rows the new number of rows
+  * \param cols the new number of columns
   *
   * Example: \include Matrix_setZero_int_int.cpp
   * Output: \verbinclude Matrix_setZero_int_int.out
@@ -530,9 +530,9 @@ PlainObjectBase<Derived>::setZero(Index newSize)
   */
 template<typename Derived>
 EIGEN_STRONG_INLINE Derived&
-PlainObjectBase<Derived>::setZero(Index nbRows, Index nbCols)
+PlainObjectBase<Derived>::setZero(Index rows, Index cols)
 {
-  resize(nbRows, nbCols);
+  resize(rows, cols);
   return setConstant(Scalar(0));
 }
 
@@ -540,7 +540,7 @@ PlainObjectBase<Derived>::setZero(Index nbRows, Index nbCols)
 
 /** \returns an expression of a matrix where all coefficients equal one.
   *
-  * The parameters \a nbRows and \a nbCols are the number of rows and of columns of
+  * The parameters \a rows and \a cols are the number of rows and of columns of
   * the returned matrix. Must be compatible with this MatrixBase type.
   *
   * This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
@@ -554,9 +554,9 @@ PlainObjectBase<Derived>::setZero(Index nbRows, Index nbCols)
   */
 template<typename Derived>
 EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
-DenseBase<Derived>::Ones(Index nbRows, Index nbCols)
+DenseBase<Derived>::Ones(Index rows, Index cols)
 {
-  return Constant(nbRows, nbCols, Scalar(1));
+  return Constant(rows, cols, Scalar(1));
 }
 
 /** \returns an expression of a vector where all coefficients equal one.
@@ -646,8 +646,8 @@ PlainObjectBase<Derived>::setOnes(Index newSize)
 
 /** Resizes to the given size, and sets all coefficients in this expression to one.
   *
-  * \param nbRows the new number of rows
-  * \param nbCols the new number of columns
+  * \param rows the new number of rows
+  * \param cols the new number of columns
   *
   * Example: \include Matrix_setOnes_int_int.cpp
   * Output: \verbinclude Matrix_setOnes_int_int.out
@@ -656,9 +656,9 @@ PlainObjectBase<Derived>::setOnes(Index newSize)
   */
 template<typename Derived>
 EIGEN_STRONG_INLINE Derived&
-PlainObjectBase<Derived>::setOnes(Index nbRows, Index nbCols)
+PlainObjectBase<Derived>::setOnes(Index rows, Index cols)
 {
-  resize(nbRows, nbCols);
+  resize(rows, cols);
   return setConstant(Scalar(1));
 }
 
@@ -666,7 +666,7 @@ PlainObjectBase<Derived>::setOnes(Index nbRows, Index nbCols)
 
 /** \returns an expression of the identity matrix (not necessarily square).
   *
-  * The parameters \a nbRows and \a nbCols are the number of rows and of columns of
+  * The parameters \a rows and \a cols are the number of rows and of columns of
   * the returned matrix. Must be compatible with this MatrixBase type.
   *
   * This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
@@ -680,9 +680,9 @@ PlainObjectBase<Derived>::setOnes(Index nbRows, Index nbCols)
   */
 template<typename Derived>
 EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
-MatrixBase<Derived>::Identity(Index nbRows, Index nbCols)
+MatrixBase<Derived>::Identity(Index rows, Index cols)
 {
-  return DenseBase<Derived>::NullaryExpr(nbRows, nbCols, internal::scalar_identity_op<Scalar>());
+  return DenseBase<Derived>::NullaryExpr(rows, cols, internal::scalar_identity_op<Scalar>());
 }
 
 /** \returns an expression of the identity matrix (not necessarily square).
@@ -716,18 +716,19 @@ template<typename Derived>
 bool MatrixBase<Derived>::isIdentity
 (const RealScalar& prec) const
 {
+  typename internal::nested_eval<Derived,1>::type self(derived());
   for(Index j = 0; j < cols(); ++j)
   {
     for(Index i = 0; i < rows(); ++i)
     {
       if(i == j)
       {
-        if(!internal::isApprox(this->coeff(i, j), static_cast<Scalar>(1), prec))
+        if(!internal::isApprox(self.coeff(i, j), static_cast<Scalar>(1), prec))
           return false;
       }
       else
       {
-        if(!internal::isMuchSmallerThan(this->coeff(i, j), static_cast<RealScalar>(1), prec))
+        if(!internal::isMuchSmallerThan(self.coeff(i, j), static_cast<RealScalar>(1), prec))
           return false;
       }
     }
@@ -740,6 +741,7 @@ namespace internal {
 template<typename Derived, bool Big = (Derived::SizeAtCompileTime>=16)>
 struct setIdentity_impl
 {
+  EIGEN_DEVICE_FUNC
   static EIGEN_STRONG_INLINE Derived& run(Derived& m)
   {
     return m = Derived::Identity(m.rows(), m.cols());
@@ -749,11 +751,11 @@ struct setIdentity_impl
 template<typename Derived>
 struct setIdentity_impl<Derived, true>
 {
-  typedef typename Derived::Index Index;
+  EIGEN_DEVICE_FUNC
   static EIGEN_STRONG_INLINE Derived& run(Derived& m)
   {
     m.setZero();
-    const Index size = (std::min)(m.rows(), m.cols());
+    const Index size = numext::mini(m.rows(), m.cols());
     for(Index i = 0; i < size; ++i) m.coeffRef(i,i) = typename Derived::Scalar(1);
     return m;
   }
@@ -776,8 +778,8 @@ EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity()
 
 /** \brief Resizes to the given size, and writes the identity expression (not necessarily square) into *this.
   *
-  * \param nbRows the new number of rows
-  * \param nbCols the new number of columns
+  * \param rows the new number of rows
+  * \param cols the new number of columns
   *
   * Example: \include Matrix_setIdentity_int_int.cpp
   * Output: \verbinclude Matrix_setIdentity_int_int.out
@@ -785,9 +787,9 @@ EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity()
   * \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Identity()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity(Index nbRows, Index nbCols)
+EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity(Index rows, Index cols)
 {
-  derived().resize(nbRows, nbCols);
+  derived().resize(rows, cols);
   return setIdentity();
 }
 
diff --git a/Eigen/src/Core/CwiseTernaryOp.h b/Eigen/src/Core/CwiseTernaryOp.h
new file mode 100644
index 000000000..9f3576fec
--- /dev/null
+++ b/Eigen/src/Core/CwiseTernaryOp.h
@@ -0,0 +1,197 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2016 Eugene Brevdo <ebrevdo@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CWISE_TERNARY_OP_H
+#define EIGEN_CWISE_TERNARY_OP_H
+
+namespace Eigen {
+
+namespace internal {
+template <typename TernaryOp, typename Arg1, typename Arg2, typename Arg3>
+struct traits<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> > {
+  // we must not inherit from traits<Arg1> since it has
+  // the potential to cause problems with MSVC
+  typedef typename remove_all<Arg1>::type Ancestor;
+  typedef typename traits<Ancestor>::XprKind XprKind;
+  enum {
+    RowsAtCompileTime = traits<Ancestor>::RowsAtCompileTime,
+    ColsAtCompileTime = traits<Ancestor>::ColsAtCompileTime,
+    MaxRowsAtCompileTime = traits<Ancestor>::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = traits<Ancestor>::MaxColsAtCompileTime
+  };
+
+  // even though we require Arg1, Arg2, and Arg3 to have the same scalar type
+  // (see CwiseTernaryOp constructor),
+  // we still want to handle the case when the result type is different.
+  typedef typename result_of<TernaryOp(
+      const typename Arg1::Scalar&, const typename Arg2::Scalar&,
+      const typename Arg3::Scalar&)>::type Scalar;
+
+  typedef typename internal::traits<Arg1>::StorageKind StorageKind;
+  typedef typename internal::traits<Arg1>::StorageIndex StorageIndex;
+
+  typedef typename Arg1::Nested Arg1Nested;
+  typedef typename Arg2::Nested Arg2Nested;
+  typedef typename Arg3::Nested Arg3Nested;
+  typedef typename remove_reference<Arg1Nested>::type _Arg1Nested;
+  typedef typename remove_reference<Arg2Nested>::type _Arg2Nested;
+  typedef typename remove_reference<Arg3Nested>::type _Arg3Nested;
+  enum { Flags = _Arg1Nested::Flags & RowMajorBit };
+};
+}  // end namespace internal
+
+template <typename TernaryOp, typename Arg1, typename Arg2, typename Arg3,
+          typename StorageKind>
+class CwiseTernaryOpImpl;
+
+/** \class CwiseTernaryOp
+  * \ingroup Core_Module
+  *
+  * \brief Generic expression where a coefficient-wise ternary operator is
+ * applied to two expressions
+  *
+  * \tparam TernaryOp template functor implementing the operator
+  * \tparam Arg1Type the type of the first argument
+  * \tparam Arg2Type the type of the second argument
+  * \tparam Arg3Type the type of the third argument
+  *
+  * This class represents an expression where a coefficient-wise ternary
+ * operator is applied to three expressions.
+  * It is the return type of ternary operators, by which we mean only those
+ * ternary operators where
+  * all three arguments are Eigen expressions.
+  * For example, the return type of betainc(matrix1, matrix2, matrix3) is a
+ * CwiseTernaryOp.
+  *
+  * Most of the time, this is the only way that it is used, so you typically
+ * don't have to name
+  * CwiseTernaryOp types explicitly.
+  *
+  * \sa MatrixBase::ternaryExpr(const MatrixBase<Argument2> &, const
+ * MatrixBase<Argument3> &, const CustomTernaryOp &) const, class CwiseBinaryOp,
+ * class CwiseUnaryOp, class CwiseNullaryOp
+  */
+template <typename TernaryOp, typename Arg1Type, typename Arg2Type,
+          typename Arg3Type>
+class CwiseTernaryOp : public CwiseTernaryOpImpl<
+                           TernaryOp, Arg1Type, Arg2Type, Arg3Type,
+                           typename internal::traits<Arg1Type>::StorageKind>,
+                       internal::no_assignment_operator
+{
+ public:
+  typedef typename internal::remove_all<Arg1Type>::type Arg1;
+  typedef typename internal::remove_all<Arg2Type>::type Arg2;
+  typedef typename internal::remove_all<Arg3Type>::type Arg3;
+
+  typedef typename CwiseTernaryOpImpl<
+      TernaryOp, Arg1Type, Arg2Type, Arg3Type,
+      typename internal::traits<Arg1Type>::StorageKind>::Base Base;
+  EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseTernaryOp)
+
+  typedef typename internal::ref_selector<Arg1Type>::type Arg1Nested;
+  typedef typename internal::ref_selector<Arg2Type>::type Arg2Nested;
+  typedef typename internal::ref_selector<Arg3Type>::type Arg3Nested;
+  typedef typename internal::remove_reference<Arg1Nested>::type _Arg1Nested;
+  typedef typename internal::remove_reference<Arg2Nested>::type _Arg2Nested;
+  typedef typename internal::remove_reference<Arg3Nested>::type _Arg3Nested;
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE CwiseTernaryOp(const Arg1& a1, const Arg2& a2,
+                                     const Arg3& a3,
+                                     const TernaryOp& func = TernaryOp())
+      : m_arg1(a1), m_arg2(a2), m_arg3(a3), m_functor(func) {
+    // require the sizes to match
+    EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Arg1, Arg2)
+    EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Arg1, Arg3)
+
+    // The index types should match
+    EIGEN_STATIC_ASSERT((internal::is_same<
+                         typename internal::traits<Arg1Type>::StorageKind,
+                         typename internal::traits<Arg2Type>::StorageKind>::value),
+                        STORAGE_KIND_MUST_MATCH)
+    EIGEN_STATIC_ASSERT((internal::is_same<
+                         typename internal::traits<Arg1Type>::StorageKind,
+                         typename internal::traits<Arg3Type>::StorageKind>::value),
+                        STORAGE_KIND_MUST_MATCH)
+
+    eigen_assert(a1.rows() == a2.rows() && a1.cols() == a2.cols() &&
+                 a1.rows() == a3.rows() && a1.cols() == a3.cols());
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE Index rows() const {
+    // return the fixed size type if available to enable compile time
+    // optimizations
+    if (internal::traits<typename internal::remove_all<Arg1Nested>::type>::
+                RowsAtCompileTime == Dynamic &&
+        internal::traits<typename internal::remove_all<Arg2Nested>::type>::
+                RowsAtCompileTime == Dynamic)
+      return m_arg3.rows();
+    else if (internal::traits<typename internal::remove_all<Arg1Nested>::type>::
+                     RowsAtCompileTime == Dynamic &&
+             internal::traits<typename internal::remove_all<Arg3Nested>::type>::
+                     RowsAtCompileTime == Dynamic)
+      return m_arg2.rows();
+    else
+      return m_arg1.rows();
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE Index cols() const {
+    // return the fixed size type if available to enable compile time
+    // optimizations
+    if (internal::traits<typename internal::remove_all<Arg1Nested>::type>::
+                ColsAtCompileTime == Dynamic &&
+        internal::traits<typename internal::remove_all<Arg2Nested>::type>::
+                ColsAtCompileTime == Dynamic)
+      return m_arg3.cols();
+    else if (internal::traits<typename internal::remove_all<Arg1Nested>::type>::
+                     ColsAtCompileTime == Dynamic &&
+             internal::traits<typename internal::remove_all<Arg3Nested>::type>::
+                     ColsAtCompileTime == Dynamic)
+      return m_arg2.cols();
+    else
+      return m_arg1.cols();
+  }
+
+  /** \returns the first argument nested expression */
+  EIGEN_DEVICE_FUNC
+  const _Arg1Nested& arg1() const { return m_arg1; }
+  /** \returns the first argument nested expression */
+  EIGEN_DEVICE_FUNC
+  const _Arg2Nested& arg2() const { return m_arg2; }
+  /** \returns the third argument nested expression */
+  EIGEN_DEVICE_FUNC
+  const _Arg3Nested& arg3() const { return m_arg3; }
+  /** \returns the functor representing the ternary operation */
+  EIGEN_DEVICE_FUNC
+  const TernaryOp& functor() const { return m_functor; }
+
+ protected:
+  Arg1Nested m_arg1;
+  Arg2Nested m_arg2;
+  Arg3Nested m_arg3;
+  const TernaryOp m_functor;
+};
+
+// Generic API dispatcher
+template <typename TernaryOp, typename Arg1, typename Arg2, typename Arg3,
+          typename StorageKind>
+class CwiseTernaryOpImpl
+    : public internal::generic_xpr_base<
+          CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> >::type {
+ public:
+  typedef typename internal::generic_xpr_base<
+      CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> >::type Base;
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CWISE_TERNARY_OP_H
diff --git a/Eigen/src/Core/CwiseUnaryOp.h b/Eigen/src/Core/CwiseUnaryOp.h
index f2de749f9..1d2dd19f2 100644
--- a/Eigen/src/Core/CwiseUnaryOp.h
+++ b/Eigen/src/Core/CwiseUnaryOp.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -13,41 +13,18 @@
 
 namespace Eigen { 
 
-/** \class CwiseUnaryOp
-  * \ingroup Core_Module
-  *
-  * \brief Generic expression where a coefficient-wise unary operator is applied to an expression
-  *
-  * \param UnaryOp template functor implementing the operator
-  * \param XprType the type of the expression to which we are applying the unary operator
-  *
-  * This class represents an expression where a unary operator is applied to an expression.
-  * It is the return type of all operations taking exactly 1 input expression, regardless of the
-  * presence of other inputs such as scalars. For example, the operator* in the expression 3*matrix
-  * is considered unary, because only the right-hand side is an expression, and its
-  * return type is a specialization of CwiseUnaryOp.
-  *
-  * Most of the time, this is the only way that it is used, so you typically don't have to name
-  * CwiseUnaryOp types explicitly.
-  *
-  * \sa MatrixBase::unaryExpr(const CustomUnaryOp &) const, class CwiseBinaryOp, class CwiseNullaryOp
-  */
-
 namespace internal {
 template<typename UnaryOp, typename XprType>
 struct traits<CwiseUnaryOp<UnaryOp, XprType> >
  : traits<XprType>
 {
   typedef typename result_of<
-                     UnaryOp(typename XprType::Scalar)
+                     UnaryOp(const typename XprType::Scalar&)
                    >::type Scalar;
   typedef typename XprType::Nested XprTypeNested;
   typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
   enum {
-    Flags = _XprTypeNested::Flags & (
-      HereditaryBits | LinearAccessBit | AlignedBit
-      | (functor_traits<UnaryOp>::PacketAccess ? PacketAccessBit : 0)),
-    CoeffReadCost = _XprTypeNested::CoeffReadCost + functor_traits<UnaryOp>::Cost
+    Flags = _XprTypeNested::Flags & RowMajorBit 
   };
 };
 }
@@ -55,70 +32,70 @@ struct traits<CwiseUnaryOp<UnaryOp, XprType> >
 template<typename UnaryOp, typename XprType, typename StorageKind>
 class CwiseUnaryOpImpl;
 
+/** \class CwiseUnaryOp
+  * \ingroup Core_Module
+  *
+  * \brief Generic expression where a coefficient-wise unary operator is applied to an expression
+  *
+  * \tparam UnaryOp template functor implementing the operator
+  * \tparam XprType the type of the expression to which we are applying the unary operator
+  *
+  * This class represents an expression where a unary operator is applied to an expression.
+  * It is the return type of all operations taking exactly 1 input expression, regardless of the
+  * presence of other inputs such as scalars. For example, the operator* in the expression 3*matrix
+  * is considered unary, because only the right-hand side is an expression, and its
+  * return type is a specialization of CwiseUnaryOp.
+  *
+  * Most of the time, this is the only way that it is used, so you typically don't have to name
+  * CwiseUnaryOp types explicitly.
+  *
+  * \sa MatrixBase::unaryExpr(const CustomUnaryOp &) const, class CwiseBinaryOp, class CwiseNullaryOp
+  */
 template<typename UnaryOp, typename XprType>
-class CwiseUnaryOp : internal::no_assignment_operator,
-  public CwiseUnaryOpImpl<UnaryOp, XprType, typename internal::traits<XprType>::StorageKind>
+class CwiseUnaryOp : public CwiseUnaryOpImpl<UnaryOp, XprType, typename internal::traits<XprType>::StorageKind>, internal::no_assignment_operator
 {
   public:
 
     typedef typename CwiseUnaryOpImpl<UnaryOp, XprType,typename internal::traits<XprType>::StorageKind>::Base Base;
     EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseUnaryOp)
+    typedef typename internal::ref_selector<XprType>::type XprTypeNested;
+    typedef typename internal::remove_all<XprType>::type NestedExpression;
 
-    inline CwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp())
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    explicit CwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp())
       : m_xpr(xpr), m_functor(func) {}
 
-    EIGEN_STRONG_INLINE Index rows() const { return m_xpr.rows(); }
-    EIGEN_STRONG_INLINE Index cols() const { return m_xpr.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Index rows() const { return m_xpr.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Index cols() const { return m_xpr.cols(); }
 
     /** \returns the functor representing the unary operation */
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const UnaryOp& functor() const { return m_functor; }
 
     /** \returns the nested expression */
-    const typename internal::remove_all<typename XprType::Nested>::type&
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const typename internal::remove_all<XprTypeNested>::type&
     nestedExpression() const { return m_xpr; }
 
     /** \returns the nested expression */
-    typename internal::remove_all<typename XprType::Nested>::type&
-    nestedExpression() { return m_xpr.const_cast_derived(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    typename internal::remove_all<XprTypeNested>::type&
+    nestedExpression() { return m_xpr; }
 
   protected:
-    typename XprType::Nested m_xpr;
+    XprTypeNested m_xpr;
     const UnaryOp m_functor;
 };
 
-// This is the generic implementation for dense storage.
-// It can be used for any expression types implementing the dense concept.
-template<typename UnaryOp, typename XprType>
-class CwiseUnaryOpImpl<UnaryOp,XprType,Dense>
-  : public internal::dense_xpr_base<CwiseUnaryOp<UnaryOp, XprType> >::type
+// Generic API dispatcher
+template<typename UnaryOp, typename XprType, typename StorageKind>
+class CwiseUnaryOpImpl
+  : public internal::generic_xpr_base<CwiseUnaryOp<UnaryOp, XprType> >::type
 {
-  public:
-
-    typedef CwiseUnaryOp<UnaryOp, XprType> Derived;
-    typedef typename internal::dense_xpr_base<CwiseUnaryOp<UnaryOp, XprType> >::type Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
-
-    EIGEN_STRONG_INLINE const Scalar coeff(Index rowId, Index colId) const
-    {
-      return derived().functor()(derived().nestedExpression().coeff(rowId, colId));
-    }
-
-    template<int LoadMode>
-    EIGEN_STRONG_INLINE PacketScalar packet(Index rowId, Index colId) const
-    {
-      return derived().functor().packetOp(derived().nestedExpression().template packet<LoadMode>(rowId, colId));
-    }
-
-    EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
-    {
-      return derived().functor()(derived().nestedExpression().coeff(index));
-    }
-
-    template<int LoadMode>
-    EIGEN_STRONG_INLINE PacketScalar packet(Index index) const
-    {
-      return derived().functor().packetOp(derived().nestedExpression().template packet<LoadMode>(index));
-    }
+public:
+  typedef typename internal::generic_xpr_base<CwiseUnaryOp<UnaryOp, XprType> >::type Base;
 };
 
 } // end namespace Eigen
diff --git a/Eigen/src/Core/CwiseUnaryView.h b/Eigen/src/Core/CwiseUnaryView.h
index b2638d326..271033056 100644
--- a/Eigen/src/Core/CwiseUnaryView.h
+++ b/Eigen/src/Core/CwiseUnaryView.h
@@ -12,33 +12,19 @@
 
 namespace Eigen {
 
-/** \class CwiseUnaryView
-  * \ingroup Core_Module
-  *
-  * \brief Generic lvalue expression of a coefficient-wise unary operator of a matrix or a vector
-  *
-  * \param ViewOp template functor implementing the view
-  * \param MatrixType the type of the matrix we are applying the unary operator
-  *
-  * This class represents a lvalue expression of a generic unary view operator of a matrix or a vector.
-  * It is the return type of real() and imag(), and most of the time this is the only way it is used.
-  *
-  * \sa MatrixBase::unaryViewExpr(const CustomUnaryOp &) const, class CwiseUnaryOp
-  */
-
 namespace internal {
 template<typename ViewOp, typename MatrixType>
 struct traits<CwiseUnaryView<ViewOp, MatrixType> >
  : traits<MatrixType>
 {
   typedef typename result_of<
-                     ViewOp(typename traits<MatrixType>::Scalar)
+                     ViewOp(const typename traits<MatrixType>::Scalar&)
                    >::type Scalar;
   typedef typename MatrixType::Nested MatrixTypeNested;
   typedef typename remove_all<MatrixTypeNested>::type _MatrixTypeNested;
   enum {
-    Flags = (traits<_MatrixTypeNested>::Flags & (HereditaryBits | LvalueBit | LinearAccessBit | DirectAccessBit)),
-    CoeffReadCost = traits<_MatrixTypeNested>::CoeffReadCost + functor_traits<ViewOp>::Cost,
+    FlagsLvalueBit = is_lvalue<MatrixType>::value ? LvalueBit : 0,
+    Flags = traits<_MatrixTypeNested>::Flags & (RowMajorBit | FlagsLvalueBit | DirectAccessBit), // FIXME DirectAccessBit should not be handled by expressions
     MatrixTypeInnerStride =  inner_stride_at_compile_time<MatrixType>::ret,
     // need to cast the sizeof's from size_t to int explicitly, otherwise:
     // "error: no integral type can represent all of the enumerator values
@@ -55,6 +41,19 @@ struct traits<CwiseUnaryView<ViewOp, MatrixType> >
 template<typename ViewOp, typename MatrixType, typename StorageKind>
 class CwiseUnaryViewImpl;
 
+/** \class CwiseUnaryView
+  * \ingroup Core_Module
+  *
+  * \brief Generic lvalue expression of a coefficient-wise unary operator of a matrix or a vector
+  *
+  * \tparam ViewOp template functor implementing the view
+  * \tparam MatrixType the type of the matrix we are applying the unary operator
+  *
+  * This class represents a lvalue expression of a generic unary view operator of a matrix or a vector.
+  * It is the return type of real() and imag(), and most of the time this is the only way it is used.
+  *
+  * \sa MatrixBase::unaryViewExpr(const CustomUnaryOp &) const, class CwiseUnaryOp
+  */
 template<typename ViewOp, typename MatrixType>
 class CwiseUnaryView : public CwiseUnaryViewImpl<ViewOp, MatrixType, typename internal::traits<MatrixType>::StorageKind>
 {
@@ -62,8 +61,10 @@ class CwiseUnaryView : public CwiseUnaryViewImpl<ViewOp, MatrixType, typename in
 
     typedef typename CwiseUnaryViewImpl<ViewOp, MatrixType,typename internal::traits<MatrixType>::StorageKind>::Base Base;
     EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseUnaryView)
+    typedef typename internal::ref_selector<MatrixType>::non_const_type MatrixTypeNested;
+    typedef typename internal::remove_all<MatrixType>::type NestedExpression;
 
-    inline CwiseUnaryView(const MatrixType& mat, const ViewOp& func = ViewOp())
+    explicit inline CwiseUnaryView(MatrixType& mat, const ViewOp& func = ViewOp())
       : m_matrix(mat), m_functor(func) {}
 
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(CwiseUnaryView)
@@ -75,19 +76,27 @@ class CwiseUnaryView : public CwiseUnaryViewImpl<ViewOp, MatrixType, typename in
     const ViewOp& functor() const { return m_functor; }
 
     /** \returns the nested expression */
-    const typename internal::remove_all<typename MatrixType::Nested>::type&
+    const typename internal::remove_all<MatrixTypeNested>::type&
     nestedExpression() const { return m_matrix; }
 
     /** \returns the nested expression */
-    typename internal::remove_all<typename MatrixType::Nested>::type&
+    typename internal::remove_reference<MatrixTypeNested>::type&
     nestedExpression() { return m_matrix.const_cast_derived(); }
 
   protected:
-    // FIXME changed from MatrixType::Nested because of a weird compilation error with sun CC
-    typename internal::nested<MatrixType>::type m_matrix;
+    MatrixTypeNested m_matrix;
     ViewOp m_functor;
 };
 
+// Generic API dispatcher
+template<typename ViewOp, typename XprType, typename StorageKind>
+class CwiseUnaryViewImpl
+  : public internal::generic_xpr_base<CwiseUnaryView<ViewOp, XprType> >::type
+{
+public:
+  typedef typename internal::generic_xpr_base<CwiseUnaryView<ViewOp, XprType> >::type Base;
+};
+
 template<typename ViewOp, typename MatrixType>
 class CwiseUnaryViewImpl<ViewOp,MatrixType,Dense>
   : public internal::dense_xpr_base< CwiseUnaryView<ViewOp, MatrixType> >::type
@@ -100,38 +109,18 @@ class CwiseUnaryViewImpl<ViewOp,MatrixType,Dense>
     EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(CwiseUnaryViewImpl)
     
-    inline Scalar* data() { return &coeffRef(0); }
-    inline const Scalar* data() const { return &coeff(0); }
+    EIGEN_DEVICE_FUNC inline Scalar* data() { return &(this->coeffRef(0)); }
+    EIGEN_DEVICE_FUNC inline const Scalar* data() const { return &(this->coeff(0)); }
 
-    inline Index innerStride() const
+    EIGEN_DEVICE_FUNC inline Index innerStride() const
     {
       return derived().nestedExpression().innerStride() * sizeof(typename internal::traits<MatrixType>::Scalar) / sizeof(Scalar);
     }
 
-    inline Index outerStride() const
+    EIGEN_DEVICE_FUNC inline Index outerStride() const
     {
       return derived().nestedExpression().outerStride() * sizeof(typename internal::traits<MatrixType>::Scalar) / sizeof(Scalar);
     }
-
-    EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const
-    {
-      return derived().functor()(derived().nestedExpression().coeff(row, col));
-    }
-
-    EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
-    {
-      return derived().functor()(derived().nestedExpression().coeff(index));
-    }
-
-    EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col)
-    {
-      return derived().functor()(const_cast_derived().nestedExpression().coeffRef(row, col));
-    }
-
-    EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
-    {
-      return derived().functor()(const_cast_derived().nestedExpression().coeffRef(index));
-    }
 };
 
 } // end namespace Eigen
diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h
index 32f60d8dc..46fe5193c 100644
--- a/Eigen/src/Core/DenseBase.h
+++ b/Eigen/src/Core/DenseBase.h
@@ -34,37 +34,45 @@ static inline void check_DenseIndex_is_signed() {
   * \tparam Derived is the derived type, e.g., a matrix type or an expression.
   *
   * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_DENSEBASE_PLUGIN.
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_DENSEBASE_PLUGIN.
   *
-  * \sa \ref TopicClassHierarchy
+  * \sa \blank \ref TopicClassHierarchy
   */
 template<typename Derived> class DenseBase
 #ifndef EIGEN_PARSED_BY_DOXYGEN
-  : public internal::special_scalar_op_base<Derived,typename internal::traits<Derived>::Scalar,
-                                     typename NumTraits<typename internal::traits<Derived>::Scalar>::Real>
-#else
   : public DenseCoeffsBase<Derived>
+#else
+  : public DenseCoeffsBase<Derived,DirectWriteAccessors>
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 {
   public:
-    using internal::special_scalar_op_base<Derived,typename internal::traits<Derived>::Scalar,
-                typename NumTraits<typename internal::traits<Derived>::Scalar>::Real>::operator*;
 
-    class InnerIterator;
+    /** Inner iterator type to iterate over the coefficients of a row or column.
+      * \sa class InnerIterator
+      */
+    typedef Eigen::InnerIterator<Derived> InnerIterator;
 
     typedef typename internal::traits<Derived>::StorageKind StorageKind;
 
-    /** \brief The type of indices 
-      * \details To change this, \c \#define the preprocessor symbol \c EIGEN_DEFAULT_DENSE_INDEX_TYPE.
-      * \sa \ref TopicPreprocessorDirectives.
-      */
-    typedef typename internal::traits<Derived>::Index Index; 
+    /**
+      * \brief The type used to store indices
+      * \details This typedef is relevant for types that store multiple indices such as
+      *          PermutationMatrix or Transpositions, otherwise it defaults to Eigen::Index
+      * \sa \blank \ref TopicPreprocessorDirectives, Eigen::Index, SparseMatrixBase.
+     */
+    typedef typename internal::traits<Derived>::StorageIndex StorageIndex;
 
+    /** The numeric type of the expression' coefficients, e.g. float, double, int or std::complex<float>, etc. */
     typedef typename internal::traits<Derived>::Scalar Scalar;
-    typedef typename internal::packet_traits<Scalar>::type PacketScalar;
+    
+    /** The numeric type of the expression' coefficients, e.g. float, double, int or std::complex<float>, etc.
+      *
+      * It is an alias for the Scalar type */
+    typedef Scalar value_type;
+    
     typedef typename NumTraits<Scalar>::Real RealScalar;
-
     typedef DenseCoeffsBase<Derived> Base;
+
     using Base::derived;
     using Base::const_cast_derived;
     using Base::rows;
@@ -74,16 +82,6 @@ template<typename Derived> class DenseBase
     using Base::colIndexByOuterInner;
     using Base::coeff;
     using Base::coeffByOuterInner;
-    using Base::packet;
-    using Base::packetByOuterInner;
-    using Base::writePacket;
-    using Base::writePacketByOuterInner;
-    using Base::coeffRef;
-    using Base::coeffRefByOuterInner;
-    using Base::copyCoeff;
-    using Base::copyCoeffByOuterInner;
-    using Base::copyPacket;
-    using Base::copyPacketByOuterInner;
     using Base::operator();
     using Base::operator[];
     using Base::x;
@@ -169,30 +167,54 @@ template<typename Derived> class DenseBase
       InnerSizeAtCompileTime = int(IsVectorAtCompileTime) ? int(SizeAtCompileTime)
                              : int(IsRowMajor) ? int(ColsAtCompileTime) : int(RowsAtCompileTime),
 
-      CoeffReadCost = internal::traits<Derived>::CoeffReadCost,
-        /**< This is a rough measure of how expensive it is to read one coefficient from
-          * this expression.
-          */
-
       InnerStrideAtCompileTime = internal::inner_stride_at_compile_time<Derived>::ret,
       OuterStrideAtCompileTime = internal::outer_stride_at_compile_time<Derived>::ret
     };
+    
+    typedef typename internal::find_best_packet<Scalar,SizeAtCompileTime>::type PacketScalar;
 
-    enum { ThisConstantIsPrivateInPlainObjectBase };
+    enum { IsPlainObjectBase = 0 };
+    
+    /** The plain matrix type corresponding to this expression.
+      * \sa PlainObject */
+    typedef Matrix<typename internal::traits<Derived>::Scalar,
+                internal::traits<Derived>::RowsAtCompileTime,
+                internal::traits<Derived>::ColsAtCompileTime,
+                AutoAlign | (internal::traits<Derived>::Flags&RowMajorBit ? RowMajor : ColMajor),
+                internal::traits<Derived>::MaxRowsAtCompileTime,
+                internal::traits<Derived>::MaxColsAtCompileTime
+          > PlainMatrix;
+    
+    /** The plain array type corresponding to this expression.
+      * \sa PlainObject */
+    typedef Array<typename internal::traits<Derived>::Scalar,
+                internal::traits<Derived>::RowsAtCompileTime,
+                internal::traits<Derived>::ColsAtCompileTime,
+                AutoAlign | (internal::traits<Derived>::Flags&RowMajorBit ? RowMajor : ColMajor),
+                internal::traits<Derived>::MaxRowsAtCompileTime,
+                internal::traits<Derived>::MaxColsAtCompileTime
+          > PlainArray;
+
+    /** \brief The plain matrix or array type corresponding to this expression.
+      *
+      * This is not necessarily exactly the return type of eval(). In the case of plain matrices,
+      * the return type of eval() is a const reference to a matrix, not a matrix! It is however guaranteed
+      * that the return type of eval() is either PlainObject or const PlainObject&.
+      */
+    typedef typename internal::conditional<internal::is_same<typename internal::traits<Derived>::XprKind,MatrixXpr >::value,
+                                 PlainMatrix, PlainArray>::type PlainObject;
 
     /** \returns the number of nonzero coefficients which is in practice the number
       * of stored coefficients. */
+    EIGEN_DEVICE_FUNC
     inline Index nonZeros() const { return size(); }
-    /** \returns true if either the number of rows or the number of columns is equal to 1.
-      * In other words, this function returns
-      * \code rows()==1 || cols()==1 \endcode
-      * \sa rows(), cols(), IsVectorAtCompileTime. */
 
     /** \returns the outer size.
       *
       * \note For a vector, this returns just 1. For a matrix (non-vector), this is the major dimension
       * with respect to the \ref TopicStorageOrders "storage order", i.e., the number of columns for a
       * column-major matrix, and the number of rows for a row-major matrix. */
+    EIGEN_DEVICE_FUNC
     Index outerSize() const
     {
       return IsVectorAtCompileTime ? 1
@@ -204,6 +226,7 @@ template<typename Derived> class DenseBase
       * \note For a vector, this is just the size. For a matrix (non-vector), this is the minor dimension
       * with respect to the \ref TopicStorageOrders "storage order", i.e., the number of rows for a 
       * column-major matrix, and the number of columns for a row-major matrix. */
+    EIGEN_DEVICE_FUNC
     Index innerSize() const
     {
       return IsVectorAtCompileTime ? this->size()
@@ -214,6 +237,7 @@ template<typename Derived> class DenseBase
       * Matrix::resize() and Array::resize(). The present method only asserts that the new size equals the old size, and does
       * nothing else.
       */
+    EIGEN_DEVICE_FUNC
     void resize(Index newSize)
     {
       EIGEN_ONLY_USED_FOR_DEBUG(newSize);
@@ -224,22 +248,22 @@ template<typename Derived> class DenseBase
       * Matrix::resize() and Array::resize(). The present method only asserts that the new size equals the old size, and does
       * nothing else.
       */
-    void resize(Index nbRows, Index nbCols)
+    EIGEN_DEVICE_FUNC
+    void resize(Index rows, Index cols)
     {
-      EIGEN_ONLY_USED_FOR_DEBUG(nbRows);
-      EIGEN_ONLY_USED_FOR_DEBUG(nbCols);
-      eigen_assert(nbRows == this->rows() && nbCols == this->cols()
+      EIGEN_ONLY_USED_FOR_DEBUG(rows);
+      EIGEN_ONLY_USED_FOR_DEBUG(cols);
+      eigen_assert(rows == this->rows() && cols == this->cols()
                 && "DenseBase::resize() does not actually allow to resize.");
     }
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
-
     /** \internal Represents a matrix with all coefficients equal to one another*/
-    typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,Derived> ConstantReturnType;
-    /** \internal Represents a vector with linearly spaced coefficients that allows sequential access only. */
-    typedef CwiseNullaryOp<internal::linspaced_op<Scalar,false>,Derived> SequentialLinSpacedReturnType;
+    typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,PlainObject> ConstantReturnType;
+    /** \internal \deprecated Represents a vector with linearly spaced coefficients that allows sequential access only. */
+    typedef CwiseNullaryOp<internal::linspaced_op<Scalar,PacketScalar>,PlainObject> SequentialLinSpacedReturnType;
     /** \internal Represents a vector with linearly spaced coefficients that allows random access. */
-    typedef CwiseNullaryOp<internal::linspaced_op<Scalar,true>,Derived> RandomAccessLinSpacedReturnType;
+    typedef CwiseNullaryOp<internal::linspaced_op<Scalar,PacketScalar>,PlainObject> RandomAccessLinSpacedReturnType;
     /** \internal the return type of MatrixBase::eigenvalues() */
     typedef Matrix<typename NumTraits<typename internal::traits<Derived>::Scalar>::Real, internal::traits<Derived>::ColsAtCompileTime, 1> EigenvaluesReturnType;
 
@@ -247,120 +271,133 @@ template<typename Derived> class DenseBase
 
     /** Copies \a other into *this. \returns a reference to *this. */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     Derived& operator=(const DenseBase<OtherDerived>& other);
 
     /** Special case of the template operator=, in order to prevent the compiler
       * from generating a default operator= (issue hit with g++ 4.1)
       */
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     Derived& operator=(const DenseBase& other);
 
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     Derived& operator=(const EigenBase<OtherDerived> &other);
 
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     Derived& operator+=(const EigenBase<OtherDerived> &other);
 
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     Derived& operator-=(const EigenBase<OtherDerived> &other);
 
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     Derived& operator=(const ReturnByValue<OtherDerived>& func);
 
-    /** \internal Copies \a other into *this without evaluating other. \returns a reference to *this. */
+    /** \ínternal
+      * Copies \a other into *this without evaluating other. \returns a reference to *this.
+      * \deprecated */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     Derived& lazyAssign(const DenseBase<OtherDerived>& other);
 
-    /** \internal Evaluates \a other into *this. \returns a reference to *this. */
-    template<typename OtherDerived>
-    Derived& lazyAssign(const ReturnByValue<OtherDerived>& other);
-
+    EIGEN_DEVICE_FUNC
     CommaInitializer<Derived> operator<< (const Scalar& s);
 
+    /** \deprecated it now returns \c *this */
     template<unsigned int Added,unsigned int Removed>
-    const Flagged<Derived, Added, Removed> flagged() const;
+    EIGEN_DEPRECATED
+    const Derived& flagged() const
+    { return derived(); }
 
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     CommaInitializer<Derived> operator<< (const DenseBase<OtherDerived>& other);
 
-    Eigen::Transpose<Derived> transpose();
-	typedef typename internal::add_const<Transpose<const Derived> >::type ConstTransposeReturnType;
+    typedef Transpose<Derived> TransposeReturnType;
+    EIGEN_DEVICE_FUNC
+    TransposeReturnType transpose();
+    typedef typename internal::add_const<Transpose<const Derived> >::type ConstTransposeReturnType;
+    EIGEN_DEVICE_FUNC
     ConstTransposeReturnType transpose() const;
+    EIGEN_DEVICE_FUNC
     void transposeInPlace();
-#ifndef EIGEN_NO_DEBUG
-  protected:
-    template<typename OtherDerived>
-    void checkTransposeAliasing(const OtherDerived& other) const;
-  public:
-#endif
 
-
-    static const ConstantReturnType
+    EIGEN_DEVICE_FUNC static const ConstantReturnType
     Constant(Index rows, Index cols, const Scalar& value);
-    static const ConstantReturnType
+    EIGEN_DEVICE_FUNC static const ConstantReturnType
     Constant(Index size, const Scalar& value);
-    static const ConstantReturnType
+    EIGEN_DEVICE_FUNC static const ConstantReturnType
     Constant(const Scalar& value);
 
-    static const SequentialLinSpacedReturnType
+    EIGEN_DEVICE_FUNC static const SequentialLinSpacedReturnType
     LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high);
-    static const RandomAccessLinSpacedReturnType
+    EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType
     LinSpaced(Index size, const Scalar& low, const Scalar& high);
-    static const SequentialLinSpacedReturnType
+    EIGEN_DEVICE_FUNC static const SequentialLinSpacedReturnType
     LinSpaced(Sequential_t, const Scalar& low, const Scalar& high);
-    static const RandomAccessLinSpacedReturnType
+    EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType
     LinSpaced(const Scalar& low, const Scalar& high);
 
-    template<typename CustomNullaryOp>
-    static const CwiseNullaryOp<CustomNullaryOp, Derived>
+    template<typename CustomNullaryOp> EIGEN_DEVICE_FUNC
+    static const CwiseNullaryOp<CustomNullaryOp, PlainObject>
     NullaryExpr(Index rows, Index cols, const CustomNullaryOp& func);
-    template<typename CustomNullaryOp>
-    static const CwiseNullaryOp<CustomNullaryOp, Derived>
+    template<typename CustomNullaryOp> EIGEN_DEVICE_FUNC
+    static const CwiseNullaryOp<CustomNullaryOp, PlainObject>
     NullaryExpr(Index size, const CustomNullaryOp& func);
-    template<typename CustomNullaryOp>
-    static const CwiseNullaryOp<CustomNullaryOp, Derived>
+    template<typename CustomNullaryOp> EIGEN_DEVICE_FUNC
+    static const CwiseNullaryOp<CustomNullaryOp, PlainObject>
     NullaryExpr(const CustomNullaryOp& func);
 
-    static const ConstantReturnType Zero(Index rows, Index cols);
-    static const ConstantReturnType Zero(Index size);
-    static const ConstantReturnType Zero();
-    static const ConstantReturnType Ones(Index rows, Index cols);
-    static const ConstantReturnType Ones(Index size);
-    static const ConstantReturnType Ones();
-
-    void fill(const Scalar& value);
-    Derived& setConstant(const Scalar& value);
-    Derived& setLinSpaced(Index size, const Scalar& low, const Scalar& high);
-    Derived& setLinSpaced(const Scalar& low, const Scalar& high);
-    Derived& setZero();
-    Derived& setOnes();
-    Derived& setRandom();
-
-    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC static const ConstantReturnType Zero(Index rows, Index cols);
+    EIGEN_DEVICE_FUNC static const ConstantReturnType Zero(Index size);
+    EIGEN_DEVICE_FUNC static const ConstantReturnType Zero();
+    EIGEN_DEVICE_FUNC static const ConstantReturnType Ones(Index rows, Index cols);
+    EIGEN_DEVICE_FUNC static const ConstantReturnType Ones(Index size);
+    EIGEN_DEVICE_FUNC static const ConstantReturnType Ones();
+
+    EIGEN_DEVICE_FUNC void fill(const Scalar& value);
+    EIGEN_DEVICE_FUNC Derived& setConstant(const Scalar& value);
+    EIGEN_DEVICE_FUNC Derived& setLinSpaced(Index size, const Scalar& low, const Scalar& high);
+    EIGEN_DEVICE_FUNC Derived& setLinSpaced(const Scalar& low, const Scalar& high);
+    EIGEN_DEVICE_FUNC Derived& setZero();
+    EIGEN_DEVICE_FUNC Derived& setOnes();
+    EIGEN_DEVICE_FUNC Derived& setRandom();
+
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC
     bool isApprox(const DenseBase<OtherDerived>& other,
                   const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+    EIGEN_DEVICE_FUNC 
     bool isMuchSmallerThan(const RealScalar& other,
                            const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
-    template<typename OtherDerived>
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC
     bool isMuchSmallerThan(const DenseBase<OtherDerived>& other,
                            const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
 
-    bool isApproxToConstant(const Scalar& value, const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
-    bool isConstant(const Scalar& value, const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
-    bool isZero(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
-    bool isOnes(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+    EIGEN_DEVICE_FUNC bool isApproxToConstant(const Scalar& value, const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+    EIGEN_DEVICE_FUNC bool isConstant(const Scalar& value, const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+    EIGEN_DEVICE_FUNC bool isZero(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+    EIGEN_DEVICE_FUNC bool isOnes(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
     
     inline bool hasNaN() const;
     inline bool allFinite() const;
 
-    inline Derived& operator*=(const Scalar& other);
-    inline Derived& operator/=(const Scalar& other);
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Derived& operator*=(const Scalar& other);
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Derived& operator/=(const Scalar& other);
 
     typedef typename internal::add_const_on_value_type<typename internal::eval<Derived>::type>::type EvalReturnType;
     /** \returns the matrix or vector obtained by evaluating this expression.
       *
       * Notice that in the case of a plain matrix or vector (not an expression) this function just returns
       * a const reference, in order to avoid a useless copy.
+      * 
+      * \warning Be carefull with eval() and the auto C++ keyword, as detailed in this \link TopicPitfalls_auto_keyword page \endlink.
       */
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE EvalReturnType eval() const
     {
       // Even though MSVC does not honor strong inlining when the return type
@@ -368,61 +405,78 @@ template<typename Derived> class DenseBase
       // size types on MSVC.
       return typename internal::eval<Derived>::type(derived());
     }
-
+    
     /** swaps *this with the expression \a other.
       *
       */
     template<typename OtherDerived>
-    void swap(const DenseBase<OtherDerived>& other,
-              int = OtherDerived::ThisConstantIsPrivateInPlainObjectBase)
+    EIGEN_DEVICE_FUNC
+    void swap(const DenseBase<OtherDerived>& other)
     {
-      SwapWrapper<Derived>(derived()).lazyAssign(other.derived());
+      EIGEN_STATIC_ASSERT(!OtherDerived::IsPlainObjectBase,THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
+      eigen_assert(rows()==other.rows() && cols()==other.cols());
+      call_assignment(derived(), other.const_cast_derived(), internal::swap_assign_op<Scalar>());
     }
 
     /** swaps *this with the matrix or array \a other.
       *
       */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     void swap(PlainObjectBase<OtherDerived>& other)
     {
-      SwapWrapper<Derived>(derived()).lazyAssign(other.derived());
+      eigen_assert(rows()==other.rows() && cols()==other.cols());
+      call_assignment(derived(), other.derived(), internal::swap_assign_op<Scalar>());
     }
 
+    EIGEN_DEVICE_FUNC inline const NestByValue<Derived> nestByValue() const;
+    EIGEN_DEVICE_FUNC inline const ForceAlignedAccess<Derived> forceAlignedAccess() const;
+    EIGEN_DEVICE_FUNC inline ForceAlignedAccess<Derived> forceAlignedAccess();
+    template<bool Enable> EIGEN_DEVICE_FUNC
+    inline const typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type forceAlignedAccessIf() const;
+    template<bool Enable> EIGEN_DEVICE_FUNC
+    inline typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type forceAlignedAccessIf();
 
-    inline const NestByValue<Derived> nestByValue() const;
-    inline const ForceAlignedAccess<Derived> forceAlignedAccess() const;
-    inline ForceAlignedAccess<Derived> forceAlignedAccess();
-    template<bool Enable> inline const typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type forceAlignedAccessIf() const;
-    template<bool Enable> inline typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type forceAlignedAccessIf();
+    EIGEN_DEVICE_FUNC Scalar sum() const;
+    EIGEN_DEVICE_FUNC Scalar mean() const;
+    EIGEN_DEVICE_FUNC Scalar trace() const;
 
-    Scalar sum() const;
-    Scalar mean() const;
-    Scalar trace() const;
+    EIGEN_DEVICE_FUNC Scalar prod() const;
 
-    Scalar prod() const;
+    EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar minCoeff() const;
+    EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar maxCoeff() const;
 
-    typename internal::traits<Derived>::Scalar minCoeff() const;
-    typename internal::traits<Derived>::Scalar maxCoeff() const;
-
-    template<typename IndexType>
+    template<typename IndexType> EIGEN_DEVICE_FUNC
     typename internal::traits<Derived>::Scalar minCoeff(IndexType* row, IndexType* col) const;
-    template<typename IndexType>
+    template<typename IndexType> EIGEN_DEVICE_FUNC
     typename internal::traits<Derived>::Scalar maxCoeff(IndexType* row, IndexType* col) const;
-    template<typename IndexType>
+    template<typename IndexType> EIGEN_DEVICE_FUNC
     typename internal::traits<Derived>::Scalar minCoeff(IndexType* index) const;
-    template<typename IndexType>
+    template<typename IndexType> EIGEN_DEVICE_FUNC
     typename internal::traits<Derived>::Scalar maxCoeff(IndexType* index) const;
 
     template<typename BinaryOp>
-    typename internal::result_of<BinaryOp(typename internal::traits<Derived>::Scalar)>::type
-    redux(const BinaryOp& func) const;
+    EIGEN_DEVICE_FUNC
+    Scalar redux(const BinaryOp& func) const;
 
     template<typename Visitor>
+    EIGEN_DEVICE_FUNC
     void visit(Visitor& func) const;
 
-    inline const WithFormat<Derived> format(const IOFormat& fmt) const;
+    /** \returns a WithFormat proxy object allowing to print a matrix the with given
+      * format \a fmt.
+      *
+      * See class IOFormat for some examples.
+      *
+      * \sa class IOFormat, class WithFormat
+      */
+    inline const WithFormat<Derived> format(const IOFormat& fmt) const
+    {
+      return WithFormat<Derived>(derived(), fmt);
+    }
 
     /** \returns the unique coefficient of a 1x1 expression */
+    EIGEN_DEVICE_FUNC
     CoeffReturnType value() const
     {
       EIGEN_STATIC_ASSERT_SIZE_1x1(Derived)
@@ -430,8 +484,8 @@ template<typename Derived> class DenseBase
       return derived().coeff(0,0);
     }
 
-    bool all(void) const;
-    bool any(void) const;
+    bool all() const;
+    bool any() const;
     Index count() const;
 
     typedef VectorwiseOp<Derived, Horizontal> RowwiseReturnType;
@@ -439,14 +493,35 @@ template<typename Derived> class DenseBase
     typedef VectorwiseOp<Derived, Vertical> ColwiseReturnType;
     typedef const VectorwiseOp<const Derived, Vertical> ConstColwiseReturnType;
 
-    ConstRowwiseReturnType rowwise() const;
-    RowwiseReturnType rowwise();
-    ConstColwiseReturnType colwise() const;
-    ColwiseReturnType colwise();
+    /** \returns a VectorwiseOp wrapper of *this providing additional partial reduction operations
+    *
+    * Example: \include MatrixBase_rowwise.cpp
+    * Output: \verbinclude MatrixBase_rowwise.out
+    *
+    * \sa colwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
+    */
+    //Code moved here due to a CUDA compiler bug
+    EIGEN_DEVICE_FUNC inline ConstRowwiseReturnType rowwise() const {
+      return ConstRowwiseReturnType(derived());
+    }
+    EIGEN_DEVICE_FUNC RowwiseReturnType rowwise();
+
+    /** \returns a VectorwiseOp wrapper of *this providing additional partial reduction operations
+    *
+    * Example: \include MatrixBase_colwise.cpp
+    * Output: \verbinclude MatrixBase_colwise.out
+    *
+    * \sa rowwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
+    */
+    EIGEN_DEVICE_FUNC inline ConstColwiseReturnType colwise() const {
+      return ConstColwiseReturnType(derived());
+    }
+    EIGEN_DEVICE_FUNC ColwiseReturnType colwise();
 
-    static const CwiseNullaryOp<internal::scalar_random_op<Scalar>,Derived> Random(Index rows, Index cols);
-    static const CwiseNullaryOp<internal::scalar_random_op<Scalar>,Derived> Random(Index size);
-    static const CwiseNullaryOp<internal::scalar_random_op<Scalar>,Derived> Random();
+    typedef CwiseNullaryOp<internal::scalar_random_op<Scalar>,PlainObject> RandomReturnType;
+    static const RandomReturnType Random(Index rows, Index cols);
+    static const RandomReturnType Random(Index size);
+    static const RandomReturnType Random();
 
     template<typename ThenDerived,typename ElseDerived>
     const Select<Derived,ThenDerived,ElseDerived>
@@ -464,45 +539,56 @@ template<typename Derived> class DenseBase
     template<int p> RealScalar lpNorm() const;
 
     template<int RowFactor, int ColFactor>
-    inline const Replicate<Derived,RowFactor,ColFactor> replicate() const;
-    
-    typedef Replicate<Derived,Dynamic,Dynamic> ReplicateReturnType;
-    inline const ReplicateReturnType replicate(Index rowFacor,Index colFactor) const;
+    EIGEN_DEVICE_FUNC
+    const Replicate<Derived,RowFactor,ColFactor> replicate() const;
+    /**
+    * \return an expression of the replication of \c *this
+    *
+    * Example: \include MatrixBase_replicate_int_int.cpp
+    * Output: \verbinclude MatrixBase_replicate_int_int.out
+    *
+    * \sa VectorwiseOp::replicate(), DenseBase::replicate<int,int>(), class Replicate
+    */
+    //Code moved here due to a CUDA compiler bug
+    EIGEN_DEVICE_FUNC
+    const Replicate<Derived, Dynamic, Dynamic> replicate(Index rowFactor, Index colFactor) const
+    {
+      return Replicate<Derived, Dynamic, Dynamic>(derived(), rowFactor, colFactor);
+    }
 
     typedef Reverse<Derived, BothDirections> ReverseReturnType;
     typedef const Reverse<const Derived, BothDirections> ConstReverseReturnType;
-    ReverseReturnType reverse();
-    ConstReverseReturnType reverse() const;
-    void reverseInPlace();
+    EIGEN_DEVICE_FUNC ReverseReturnType reverse();
+    /** This is the const version of reverse(). */
+    //Code moved here due to a CUDA compiler bug
+    EIGEN_DEVICE_FUNC ConstReverseReturnType reverse() const
+    {
+      return ConstReverseReturnType(derived());
+    }
+    EIGEN_DEVICE_FUNC void reverseInPlace();
 
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::DenseBase
+#define EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+#define EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(COND)
 #   include "../plugins/BlockMethods.h"
 #   ifdef EIGEN_DENSEBASE_PLUGIN
 #     include EIGEN_DENSEBASE_PLUGIN
 #   endif
 #undef EIGEN_CURRENT_STORAGE_BASE_CLASS
-
-#ifdef EIGEN2_SUPPORT
-
-    Block<Derived> corner(CornerType type, Index cRows, Index cCols);
-    const Block<Derived> corner(CornerType type, Index cRows, Index cCols) const;
-    template<int CRows, int CCols>
-    Block<Derived, CRows, CCols> corner(CornerType type);
-    template<int CRows, int CCols>
-    const Block<Derived, CRows, CCols> corner(CornerType type) const;
-
-#endif // EIGEN2_SUPPORT
-
+#undef EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+#undef EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF
 
     // disable the use of evalTo for dense objects with a nice compilation error
-    template<typename Dest> inline void evalTo(Dest& ) const
+    template<typename Dest>
+    EIGEN_DEVICE_FUNC
+    inline void evalTo(Dest& ) const
     {
       EIGEN_STATIC_ASSERT((internal::is_same<Dest,void>::value),THE_EVAL_EVALTO_FUNCTION_SHOULD_NEVER_BE_CALLED_FOR_DENSE_OBJECTS);
     }
 
   protected:
     /** Default constructor. Do nothing. */
-    DenseBase()
+    EIGEN_DEVICE_FUNC DenseBase()
     {
       /* Just checks for self-consistency of the flags.
        * Only do it when debugging Eigen, as this borders on paranoiac and could slow compilation down
@@ -515,9 +601,9 @@ template<typename Derived> class DenseBase
     }
 
   private:
-    explicit DenseBase(int);
-    DenseBase(int,int);
-    template<typename OtherDerived> explicit DenseBase(const DenseBase<OtherDerived>&);
+    EIGEN_DEVICE_FUNC explicit DenseBase(int);
+    EIGEN_DEVICE_FUNC DenseBase(int,int);
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC explicit DenseBase(const DenseBase<OtherDerived>&);
 };
 
 } // end namespace Eigen
diff --git a/Eigen/src/Core/DenseCoeffsBase.h b/Eigen/src/Core/DenseCoeffsBase.h
index 3c890f215..c4af48ab6 100644
--- a/Eigen/src/Core/DenseCoeffsBase.h
+++ b/Eigen/src/Core/DenseCoeffsBase.h
@@ -35,7 +35,6 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
 {
   public:
     typedef typename internal::traits<Derived>::StorageKind StorageKind;
-    typedef typename internal::traits<Derived>::Index Index;
     typedef typename internal::traits<Derived>::Scalar Scalar;
     typedef typename internal::packet_traits<Scalar>::type PacketScalar;
 
@@ -61,6 +60,7 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
     using Base::size;
     using Base::derived;
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Index rowIndexByOuterInner(Index outer, Index inner) const
     {
       return int(Derived::RowsAtCompileTime) == 1 ? 0
@@ -69,6 +69,7 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
           : inner;
     }
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Index colIndexByOuterInner(Index outer, Index inner) const
     {
       return int(Derived::ColsAtCompileTime) == 1 ? 0
@@ -91,13 +92,15 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
       *
       * \sa operator()(Index,Index) const, coeffRef(Index,Index), coeff(Index) const
       */
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const
     {
       eigen_internal_assert(row >= 0 && row < rows()
-                        && col >= 0 && col < cols());
-      return derived().coeff(row, col);
+                         && col >= 0 && col < cols());
+      return internal::evaluator<Derived>(derived()).coeff(row,col);
     }
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE CoeffReturnType coeffByOuterInner(Index outer, Index inner) const
     {
       return coeff(rowIndexByOuterInner(outer, inner),
@@ -108,11 +111,12 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
       *
       * \sa operator()(Index,Index), operator[](Index)
       */
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE CoeffReturnType operator()(Index row, Index col) const
     {
       eigen_assert(row >= 0 && row < rows()
           && col >= 0 && col < cols());
-      return derived().coeff(row, col);
+      return coeff(row, col);
     }
 
     /** Short version: don't use this function, use
@@ -130,11 +134,14 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
       * \sa operator[](Index) const, coeffRef(Index), coeff(Index,Index) const
       */
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE CoeffReturnType
     coeff(Index index) const
     {
+      EIGEN_STATIC_ASSERT(internal::evaluator<Derived>::Flags & LinearAccessBit,
+                          THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS)
       eigen_internal_assert(index >= 0 && index < size());
-      return derived().coeff(index);
+      return internal::evaluator<Derived>(derived()).coeff(index);
     }
 
 
@@ -146,15 +153,14 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
       * z() const, w() const
       */
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE CoeffReturnType
     operator[](Index index) const
     {
-      #ifndef EIGEN2_SUPPORT
       EIGEN_STATIC_ASSERT(Derived::IsVectorAtCompileTime,
                           THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD)
-      #endif
       eigen_assert(index >= 0 && index < size());
-      return derived().coeff(index);
+      return coeff(index);
     }
 
     /** \returns the coefficient at given index.
@@ -167,32 +173,49 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
       * z() const, w() const
       */
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE CoeffReturnType
     operator()(Index index) const
     {
       eigen_assert(index >= 0 && index < size());
-      return derived().coeff(index);
+      return coeff(index);
     }
 
     /** equivalent to operator[](0).  */
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE CoeffReturnType
     x() const { return (*this)[0]; }
 
     /** equivalent to operator[](1).  */
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE CoeffReturnType
-    y() const { return (*this)[1]; }
+    y() const
+    {
+      EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=2, OUT_OF_RANGE_ACCESS);
+      return (*this)[1];
+    }
 
     /** equivalent to operator[](2).  */
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE CoeffReturnType
-    z() const { return (*this)[2]; }
+    z() const
+    {
+      EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=3, OUT_OF_RANGE_ACCESS);
+      return (*this)[2];
+    }
 
     /** equivalent to operator[](3).  */
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE CoeffReturnType
-    w() const { return (*this)[3]; }
+    w() const
+    {
+      EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=4, OUT_OF_RANGE_ACCESS);
+      return (*this)[3];
+    }
 
     /** \internal
       * \returns the packet of coefficients starting at the given row and column. It is your responsibility
@@ -207,9 +230,9 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
     template<int LoadMode>
     EIGEN_STRONG_INLINE PacketReturnType packet(Index row, Index col) const
     {
-      eigen_internal_assert(row >= 0 && row < rows()
-                      && col >= 0 && col < cols());
-      return derived().template packet<LoadMode>(row,col);
+      typedef typename internal::packet_traits<Scalar>::type DefaultPacketType;
+      eigen_internal_assert(row >= 0 && row < rows() && col >= 0 && col < cols());
+      return internal::evaluator<Derived>(derived()).template packet<LoadMode,DefaultPacketType>(row,col);
     }
 
 
@@ -234,8 +257,11 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
     template<int LoadMode>
     EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
     {
+      EIGEN_STATIC_ASSERT(internal::evaluator<Derived>::Flags & LinearAccessBit,
+                          THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS)
+      typedef typename internal::packet_traits<Scalar>::type DefaultPacketType;
       eigen_internal_assert(index >= 0 && index < size());
-      return derived().template packet<LoadMode>(index);
+      return internal::evaluator<Derived>(derived()).template packet<LoadMode,DefaultPacketType>(index);
     }
 
   protected:
@@ -278,7 +304,6 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
     typedef DenseCoeffsBase<Derived, ReadOnlyAccessors> Base;
 
     typedef typename internal::traits<Derived>::StorageKind StorageKind;
-    typedef typename internal::traits<Derived>::Index Index;
     typedef typename internal::traits<Derived>::Scalar Scalar;
     typedef typename internal::packet_traits<Scalar>::type PacketScalar;
     typedef typename NumTraits<Scalar>::Real RealScalar;
@@ -311,13 +336,15 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
       *
       * \sa operator()(Index,Index), coeff(Index, Index) const, coeffRef(Index)
       */
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col)
     {
       eigen_internal_assert(row >= 0 && row < rows()
-                        && col >= 0 && col < cols());
-      return derived().coeffRef(row, col);
+                         && col >= 0 && col < cols());
+      return internal::evaluator<Derived>(derived()).coeffRef(row,col);
     }
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Scalar&
     coeffRefByOuterInner(Index outer, Index inner)
     {
@@ -330,12 +357,13 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
       * \sa operator[](Index)
       */
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Scalar&
     operator()(Index row, Index col)
     {
       eigen_assert(row >= 0 && row < rows()
           && col >= 0 && col < cols());
-      return derived().coeffRef(row, col);
+      return coeffRef(row, col);
     }
 
 
@@ -354,11 +382,14 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
       * \sa operator[](Index), coeff(Index) const, coeffRef(Index,Index)
       */
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Scalar&
     coeffRef(Index index)
     {
+      EIGEN_STATIC_ASSERT(internal::evaluator<Derived>::Flags & LinearAccessBit,
+                          THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS)
       eigen_internal_assert(index >= 0 && index < size());
-      return derived().coeffRef(index);
+      return internal::evaluator<Derived>(derived()).coeffRef(index);
     }
 
     /** \returns a reference to the coefficient at given index.
@@ -368,15 +399,14 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
       * \sa operator[](Index) const, operator()(Index,Index), x(), y(), z(), w()
       */
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Scalar&
     operator[](Index index)
     {
-      #ifndef EIGEN2_SUPPORT
       EIGEN_STATIC_ASSERT(Derived::IsVectorAtCompileTime,
                           THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD)
-      #endif
       eigen_assert(index >= 0 && index < size());
-      return derived().coeffRef(index);
+      return coeffRef(index);
     }
 
     /** \returns a reference to the coefficient at given index.
@@ -388,167 +418,49 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
       * \sa operator[](Index) const, operator()(Index,Index), x(), y(), z(), w()
       */
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Scalar&
     operator()(Index index)
     {
       eigen_assert(index >= 0 && index < size());
-      return derived().coeffRef(index);
+      return coeffRef(index);
     }
 
     /** equivalent to operator[](0).  */
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Scalar&
     x() { return (*this)[0]; }
 
     /** equivalent to operator[](1).  */
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Scalar&
-    y() { return (*this)[1]; }
-
-    /** equivalent to operator[](2).  */
-
-    EIGEN_STRONG_INLINE Scalar&
-    z() { return (*this)[2]; }
-
-    /** equivalent to operator[](3).  */
-
-    EIGEN_STRONG_INLINE Scalar&
-    w() { return (*this)[3]; }
-
-    /** \internal
-      * Stores the given packet of coefficients, at the given row and column of this expression. It is your responsibility
-      * to ensure that a packet really starts there. This method is only available on expressions having the
-      * PacketAccessBit.
-      *
-      * The \a LoadMode parameter may have the value \a #Aligned or \a #Unaligned. Its effect is to select
-      * the appropriate vectorization instruction. Aligned access is faster, but is only possible for packets
-      * starting at an address which is a multiple of the packet size.
-      */
-
-    template<int StoreMode>
-    EIGEN_STRONG_INLINE void writePacket
-    (Index row, Index col, const typename internal::packet_traits<Scalar>::type& val)
+    y()
     {
-      eigen_internal_assert(row >= 0 && row < rows()
-                        && col >= 0 && col < cols());
-      derived().template writePacket<StoreMode>(row,col,val);
+      EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=2, OUT_OF_RANGE_ACCESS);
+      return (*this)[1];
     }
 
+    /** equivalent to operator[](2).  */
 
-    /** \internal */
-    template<int StoreMode>
-    EIGEN_STRONG_INLINE void writePacketByOuterInner
-    (Index outer, Index inner, const typename internal::packet_traits<Scalar>::type& val)
-    {
-      writePacket<StoreMode>(rowIndexByOuterInner(outer, inner),
-                            colIndexByOuterInner(outer, inner),
-                            val);
-    }
-
-    /** \internal
-      * Stores the given packet of coefficients, at the given index in this expression. It is your responsibility
-      * to ensure that a packet really starts there. This method is only available on expressions having the
-      * PacketAccessBit and the LinearAccessBit.
-      *
-      * The \a LoadMode parameter may have the value \a Aligned or \a Unaligned. Its effect is to select
-      * the appropriate vectorization instruction. Aligned access is faster, but is only possible for packets
-      * starting at an address which is a multiple of the packet size.
-      */
-    template<int StoreMode>
-    EIGEN_STRONG_INLINE void writePacket
-    (Index index, const typename internal::packet_traits<Scalar>::type& val)
-    {
-      eigen_internal_assert(index >= 0 && index < size());
-      derived().template writePacket<StoreMode>(index,val);
-    }
-
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-
-    /** \internal Copies the coefficient at position (row,col) of other into *this.
-      *
-      * This method is overridden in SwapWrapper, allowing swap() assignments to share 99% of their code
-      * with usual assignments.
-      *
-      * Outside of this internal usage, this method has probably no usefulness. It is hidden in the public API dox.
-      */
-
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE void copyCoeff(Index row, Index col, const DenseBase<OtherDerived>& other)
-    {
-      eigen_internal_assert(row >= 0 && row < rows()
-                        && col >= 0 && col < cols());
-      derived().coeffRef(row, col) = other.derived().coeff(row, col);
-    }
-
-    /** \internal Copies the coefficient at the given index of other into *this.
-      *
-      * This method is overridden in SwapWrapper, allowing swap() assignments to share 99% of their code
-      * with usual assignments.
-      *
-      * Outside of this internal usage, this method has probably no usefulness. It is hidden in the public API dox.
-      */
-
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE void copyCoeff(Index index, const DenseBase<OtherDerived>& other)
-    {
-      eigen_internal_assert(index >= 0 && index < size());
-      derived().coeffRef(index) = other.derived().coeff(index);
-    }
-
-
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE void copyCoeffByOuterInner(Index outer, Index inner, const DenseBase<OtherDerived>& other)
-    {
-      const Index row = rowIndexByOuterInner(outer,inner);
-      const Index col = colIndexByOuterInner(outer,inner);
-      // derived() is important here: copyCoeff() may be reimplemented in Derived!
-      derived().copyCoeff(row, col, other);
-    }
-
-    /** \internal Copies the packet at position (row,col) of other into *this.
-      *
-      * This method is overridden in SwapWrapper, allowing swap() assignments to share 99% of their code
-      * with usual assignments.
-      *
-      * Outside of this internal usage, this method has probably no usefulness. It is hidden in the public API dox.
-      */
-
-    template<typename OtherDerived, int StoreMode, int LoadMode>
-    EIGEN_STRONG_INLINE void copyPacket(Index row, Index col, const DenseBase<OtherDerived>& other)
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar&
+    z()
     {
-      eigen_internal_assert(row >= 0 && row < rows()
-                        && col >= 0 && col < cols());
-      derived().template writePacket<StoreMode>(row, col,
-        other.derived().template packet<LoadMode>(row, col));
+      EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=3, OUT_OF_RANGE_ACCESS);
+      return (*this)[2];
     }
 
-    /** \internal Copies the packet at the given index of other into *this.
-      *
-      * This method is overridden in SwapWrapper, allowing swap() assignments to share 99% of their code
-      * with usual assignments.
-      *
-      * Outside of this internal usage, this method has probably no usefulness. It is hidden in the public API dox.
-      */
-
-    template<typename OtherDerived, int StoreMode, int LoadMode>
-    EIGEN_STRONG_INLINE void copyPacket(Index index, const DenseBase<OtherDerived>& other)
-    {
-      eigen_internal_assert(index >= 0 && index < size());
-      derived().template writePacket<StoreMode>(index,
-        other.derived().template packet<LoadMode>(index));
-    }
+    /** equivalent to operator[](3).  */
 
-    /** \internal */
-    template<typename OtherDerived, int StoreMode, int LoadMode>
-    EIGEN_STRONG_INLINE void copyPacketByOuterInner(Index outer, Index inner, const DenseBase<OtherDerived>& other)
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar&
+    w()
     {
-      const Index row = rowIndexByOuterInner(outer,inner);
-      const Index col = colIndexByOuterInner(outer,inner);
-      // derived() is important here: copyCoeff() may be reimplemented in Derived!
-      derived().template copyPacket< OtherDerived, StoreMode, LoadMode>(row, col, other);
+      EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=4, OUT_OF_RANGE_ACCESS);
+      return (*this)[3];
     }
-#endif
-
 };
 
 /** \brief Base class providing direct read-only coefficient access to matrices and arrays.
@@ -560,7 +472,7 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
   * inherits DenseCoeffsBase<Derived, ReadOnlyAccessors> which defines functions to access entries read-only using
   * \c operator() .
   *
-  * \sa \ref TopicClassHierarchy
+  * \sa \blank \ref TopicClassHierarchy
   */
 template<typename Derived>
 class DenseCoeffsBase<Derived, DirectAccessors> : public DenseCoeffsBase<Derived, ReadOnlyAccessors>
@@ -568,7 +480,6 @@ class DenseCoeffsBase<Derived, DirectAccessors> : public DenseCoeffsBase<Derived
   public:
 
     typedef DenseCoeffsBase<Derived, ReadOnlyAccessors> Base;
-    typedef typename internal::traits<Derived>::Index Index;
     typedef typename internal::traits<Derived>::Scalar Scalar;
     typedef typename NumTraits<Scalar>::Real RealScalar;
 
@@ -581,6 +492,7 @@ class DenseCoeffsBase<Derived, DirectAccessors> : public DenseCoeffsBase<Derived
       *
       * \sa outerStride(), rowStride(), colStride()
       */
+    EIGEN_DEVICE_FUNC
     inline Index innerStride() const
     {
       return derived().innerStride();
@@ -591,6 +503,7 @@ class DenseCoeffsBase<Derived, DirectAccessors> : public DenseCoeffsBase<Derived
       *
       * \sa innerStride(), rowStride(), colStride()
       */
+    EIGEN_DEVICE_FUNC
     inline Index outerStride() const
     {
       return derived().outerStride();
@@ -606,6 +519,7 @@ class DenseCoeffsBase<Derived, DirectAccessors> : public DenseCoeffsBase<Derived
       *
       * \sa innerStride(), outerStride(), colStride()
       */
+    EIGEN_DEVICE_FUNC
     inline Index rowStride() const
     {
       return Derived::IsRowMajor ? outerStride() : innerStride();
@@ -615,6 +529,7 @@ class DenseCoeffsBase<Derived, DirectAccessors> : public DenseCoeffsBase<Derived
       *
       * \sa innerStride(), outerStride(), rowStride()
       */
+    EIGEN_DEVICE_FUNC
     inline Index colStride() const
     {
       return Derived::IsRowMajor ? innerStride() : outerStride();
@@ -630,7 +545,7 @@ class DenseCoeffsBase<Derived, DirectAccessors> : public DenseCoeffsBase<Derived
   * inherits DenseCoeffsBase<Derived, WriteAccessors> which defines functions to access entries read/write using
   * \c operator().
   *
-  * \sa \ref TopicClassHierarchy
+  * \sa \blank \ref TopicClassHierarchy
   */
 template<typename Derived>
 class DenseCoeffsBase<Derived, DirectWriteAccessors>
@@ -639,7 +554,6 @@ class DenseCoeffsBase<Derived, DirectWriteAccessors>
   public:
 
     typedef DenseCoeffsBase<Derived, WriteAccessors> Base;
-    typedef typename internal::traits<Derived>::Index Index;
     typedef typename internal::traits<Derived>::Scalar Scalar;
     typedef typename NumTraits<Scalar>::Real RealScalar;
 
@@ -652,6 +566,7 @@ class DenseCoeffsBase<Derived, DirectWriteAccessors>
       *
       * \sa outerStride(), rowStride(), colStride()
       */
+    EIGEN_DEVICE_FUNC
     inline Index innerStride() const
     {
       return derived().innerStride();
@@ -662,6 +577,7 @@ class DenseCoeffsBase<Derived, DirectWriteAccessors>
       *
       * \sa innerStride(), rowStride(), colStride()
       */
+    EIGEN_DEVICE_FUNC
     inline Index outerStride() const
     {
       return derived().outerStride();
@@ -677,6 +593,7 @@ class DenseCoeffsBase<Derived, DirectWriteAccessors>
       *
       * \sa innerStride(), outerStride(), colStride()
       */
+    EIGEN_DEVICE_FUNC
     inline Index rowStride() const
     {
       return Derived::IsRowMajor ? outerStride() : innerStride();
@@ -686,6 +603,7 @@ class DenseCoeffsBase<Derived, DirectWriteAccessors>
       *
       * \sa innerStride(), outerStride(), rowStride()
       */
+    EIGEN_DEVICE_FUNC
     inline Index colStride() const
     {
       return Derived::IsRowMajor ? innerStride() : outerStride();
@@ -694,33 +612,42 @@ class DenseCoeffsBase<Derived, DirectWriteAccessors>
 
 namespace internal {
 
-template<typename Derived, bool JustReturnZero>
+template<int Alignment, typename Derived, bool JustReturnZero>
 struct first_aligned_impl
 {
-  static inline typename Derived::Index run(const Derived&)
+  static inline Index run(const Derived&)
   { return 0; }
 };
 
-template<typename Derived>
-struct first_aligned_impl<Derived, false>
+template<int Alignment, typename Derived>
+struct first_aligned_impl<Alignment, Derived, false>
 {
-  static inline typename Derived::Index run(const Derived& m)
+  static inline Index run(const Derived& m)
   {
-    return internal::first_aligned(&m.const_cast_derived().coeffRef(0,0), m.size());
+    return internal::first_aligned<Alignment>(m.data(), m.size());
   }
 };
 
-/** \internal \returns the index of the first element of the array that is well aligned for vectorization.
+/** \internal \returns the index of the first element of the array stored by \a m that is properly aligned with respect to \a Alignment for vectorization.
+  *
+  * \tparam Alignment requested alignment in Bytes.
   *
   * There is also the variant first_aligned(const Scalar*, Integer) defined in Memory.h. See it for more
   * documentation.
   */
+template<int Alignment, typename Derived>
+static inline Index first_aligned(const DenseBase<Derived>& m)
+{
+  enum { ReturnZero = (int(evaluator<Derived>::Alignment) >= Alignment) || !(Derived::Flags & DirectAccessBit) };
+  return first_aligned_impl<Alignment, Derived, ReturnZero>::run(m.derived());
+}
+
 template<typename Derived>
-static inline typename Derived::Index first_aligned(const Derived& m)
+static inline Index first_default_aligned(const DenseBase<Derived>& m)
 {
-  return first_aligned_impl
-          <Derived, (Derived::Flags & AlignedBit) || !(Derived::Flags & DirectAccessBit)>
-          ::run(m);
+  typedef typename Derived::Scalar Scalar;
+  typedef typename packet_traits<Scalar>::type DefaultPacketType;
+  return internal::first_aligned<int(unpacket_traits<DefaultPacketType>::alignment),Derived>(m);
 }
 
 template<typename Derived, bool HasDirectAccess = has_direct_access<Derived>::ret>
diff --git a/Eigen/src/Core/DenseStorage.h b/Eigen/src/Core/DenseStorage.h
index a72738e55..7958feeb9 100644
--- a/Eigen/src/Core/DenseStorage.h
+++ b/Eigen/src/Core/DenseStorage.h
@@ -3,7 +3,7 @@
 //
 // Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2006-2009 Benoit Jacob <jacob.benoit.1@gmail.com>
-// Copyright (C) 2010 Hauke Heibel <hauke.heibel@gmail.com>
+// Copyright (C) 2010-2013 Hauke Heibel <hauke.heibel@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -13,9 +13,9 @@
 #define EIGEN_MATRIXSTORAGE_H
 
 #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-  #define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN EIGEN_DENSE_STORAGE_CTOR_PLUGIN;
+  #define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(X) X; EIGEN_DENSE_STORAGE_CTOR_PLUGIN;
 #else
-  #define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+  #define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(X)
 #endif
 
 namespace Eigen {
@@ -24,7 +24,9 @@ namespace internal {
 
 struct constructor_without_unaligned_array_assert {};
 
-template<typename T, int Size> void check_static_allocation_size()
+template<typename T, int Size>
+EIGEN_DEVICE_FUNC
+void check_static_allocation_size()
 {
   // if EIGEN_STACK_ALLOCATION_LIMIT is defined to 0, then no limit
   #if EIGEN_STACK_ALLOCATION_LIMIT
@@ -38,18 +40,19 @@ template<typename T, int Size> void check_static_allocation_size()
   */
 template <typename T, int Size, int MatrixOrArrayOptions,
           int Alignment = (MatrixOrArrayOptions&DontAlign) ? 0
-                        : (((Size*sizeof(T))%16)==0) ? 16
-                        : 0 >
+                        : compute_default_alignment<T,Size>::value >
 struct plain_array
 {
   T array[Size];
 
-  plain_array() 
+  EIGEN_DEVICE_FUNC
+  plain_array()
   { 
     check_static_allocation_size<T,Size>();
   }
 
-  plain_array(constructor_without_unaligned_array_assert) 
+  EIGEN_DEVICE_FUNC
+  plain_array(constructor_without_unaligned_array_assert)
   { 
     check_static_allocation_size<T,Size>();
   }
@@ -64,29 +67,88 @@ struct plain_array
   template<typename PtrType>
   EIGEN_ALWAYS_INLINE PtrType eigen_unaligned_array_assert_workaround_gcc47(PtrType array) { return array; }
   #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) \
-    eigen_assert((reinterpret_cast<size_t>(eigen_unaligned_array_assert_workaround_gcc47(array)) & sizemask) == 0 \
+    eigen_assert((internal::UIntPtr(eigen_unaligned_array_assert_workaround_gcc47(array)) & (sizemask)) == 0 \
               && "this assertion is explained here: " \
               "http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArrayAssert.html" \
               " **** READ THIS WEB PAGE !!! ****");
 #else
   #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) \
-    eigen_assert((reinterpret_cast<size_t>(array) & sizemask) == 0 \
+    eigen_assert((internal::UIntPtr(array) & (sizemask)) == 0 \
               && "this assertion is explained here: " \
               "http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArrayAssert.html" \
               " **** READ THIS WEB PAGE !!! ****");
 #endif
 
 template <typename T, int Size, int MatrixOrArrayOptions>
+struct plain_array<T, Size, MatrixOrArrayOptions, 8>
+{
+  EIGEN_ALIGN_TO_BOUNDARY(8) T array[Size];
+
+  EIGEN_DEVICE_FUNC
+  plain_array() 
+  {
+    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(7);
+    check_static_allocation_size<T,Size>();
+  }
+
+  EIGEN_DEVICE_FUNC
+  plain_array(constructor_without_unaligned_array_assert) 
+  { 
+    check_static_allocation_size<T,Size>();
+  }
+};
+
+template <typename T, int Size, int MatrixOrArrayOptions>
 struct plain_array<T, Size, MatrixOrArrayOptions, 16>
 {
-  EIGEN_USER_ALIGN16 T array[Size];
+  EIGEN_ALIGN_TO_BOUNDARY(16) T array[Size];
 
+  EIGEN_DEVICE_FUNC
   plain_array() 
   { 
-    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(0xf);
+    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(15);
     check_static_allocation_size<T,Size>();
   }
 
+  EIGEN_DEVICE_FUNC
+  plain_array(constructor_without_unaligned_array_assert) 
+  { 
+    check_static_allocation_size<T,Size>();
+  }
+};
+
+template <typename T, int Size, int MatrixOrArrayOptions>
+struct plain_array<T, Size, MatrixOrArrayOptions, 32>
+{
+  EIGEN_ALIGN_TO_BOUNDARY(32) T array[Size];
+
+  EIGEN_DEVICE_FUNC
+  plain_array() 
+  {
+    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(31);
+    check_static_allocation_size<T,Size>();
+  }
+
+  EIGEN_DEVICE_FUNC
+  plain_array(constructor_without_unaligned_array_assert) 
+  { 
+    check_static_allocation_size<T,Size>();
+  }
+};
+
+template <typename T, int Size, int MatrixOrArrayOptions>
+struct plain_array<T, Size, MatrixOrArrayOptions, 64>
+{
+  EIGEN_ALIGN_TO_BOUNDARY(64) T array[Size];
+
+  EIGEN_DEVICE_FUNC
+  plain_array() 
+  { 
+    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(63);
+    check_static_allocation_size<T,Size>();
+  }
+
+  EIGEN_DEVICE_FUNC
   plain_array(constructor_without_unaligned_array_assert) 
   { 
     check_static_allocation_size<T,Size>();
@@ -96,9 +158,9 @@ struct plain_array<T, Size, MatrixOrArrayOptions, 16>
 template <typename T, int MatrixOrArrayOptions, int Alignment>
 struct plain_array<T, 0, MatrixOrArrayOptions, Alignment>
 {
-  EIGEN_USER_ALIGN16 T array[1];
-  plain_array() {}
-  plain_array(constructor_without_unaligned_array_assert) {}
+  T array[1];
+  EIGEN_DEVICE_FUNC plain_array() {}
+  EIGEN_DEVICE_FUNC plain_array(constructor_without_unaligned_array_assert) {}
 };
 
 } // end namespace internal
@@ -122,33 +184,54 @@ template<typename T, int Size, int _Rows, int _Cols, int _Options> class DenseSt
 {
     internal::plain_array<T,Size,_Options> m_data;
   public:
-    inline DenseStorage() {}
-    inline DenseStorage(internal::constructor_without_unaligned_array_assert)
+    EIGEN_DEVICE_FUNC DenseStorage() {
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = Size)
+    }
+    EIGEN_DEVICE_FUNC
+    explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
       : m_data(internal::constructor_without_unaligned_array_assert()) {}
-    inline DenseStorage(DenseIndex,DenseIndex,DenseIndex) {}
-    inline void swap(DenseStorage& other) { std::swap(m_data,other.m_data); }
-    static inline DenseIndex rows(void) {return _Rows;}
-    static inline DenseIndex cols(void) {return _Cols;}
-    inline void conservativeResize(DenseIndex,DenseIndex,DenseIndex) {}
-    inline void resize(DenseIndex,DenseIndex,DenseIndex) {}
-    inline const T *data() const { return m_data.array; }
-    inline T *data() { return m_data.array; }
+    EIGEN_DEVICE_FUNC 
+    DenseStorage(const DenseStorage& other) : m_data(other.m_data) {
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = Size)
+    }
+    EIGEN_DEVICE_FUNC 
+    DenseStorage& operator=(const DenseStorage& other)
+    { 
+      if (this != &other) m_data = other.m_data;
+      return *this; 
+    }
+    EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) {
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
+      eigen_internal_assert(size==rows*cols && rows==_Rows && cols==_Cols);
+      EIGEN_UNUSED_VARIABLE(size);
+      EIGEN_UNUSED_VARIABLE(rows);
+      EIGEN_UNUSED_VARIABLE(cols);
+    }
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); }
+    EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
+    EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
+    EIGEN_DEVICE_FUNC void conservativeResize(Index,Index,Index) {}
+    EIGEN_DEVICE_FUNC void resize(Index,Index,Index) {}
+    EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
+    EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
 };
 
 // null matrix
 template<typename T, int _Rows, int _Cols, int _Options> class DenseStorage<T, 0, _Rows, _Cols, _Options>
 {
   public:
-    inline DenseStorage() {}
-    inline DenseStorage(internal::constructor_without_unaligned_array_assert) {}
-    inline DenseStorage(DenseIndex,DenseIndex,DenseIndex) {}
-    inline void swap(DenseStorage& ) {}
-    static inline DenseIndex rows(void) {return _Rows;}
-    static inline DenseIndex cols(void) {return _Cols;}
-    inline void conservativeResize(DenseIndex,DenseIndex,DenseIndex) {}
-    inline void resize(DenseIndex,DenseIndex,DenseIndex) {}
-    inline const T *data() const { return 0; }
-    inline T *data() { return 0; }
+    EIGEN_DEVICE_FUNC DenseStorage() {}
+    EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert) {}
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage&) {}
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage&) { return *this; }
+    EIGEN_DEVICE_FUNC DenseStorage(Index,Index,Index) {}
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& ) {}
+    EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
+    EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
+    EIGEN_DEVICE_FUNC void conservativeResize(Index,Index,Index) {}
+    EIGEN_DEVICE_FUNC void resize(Index,Index,Index) {}
+    EIGEN_DEVICE_FUNC const T *data() const { return 0; }
+    EIGEN_DEVICE_FUNC T *data() { return 0; }
 };
 
 // more specializations for null matrices; these are necessary to resolve ambiguities
@@ -165,86 +248,158 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, 0, Dynamic,
 template<typename T, int Size, int _Options> class DenseStorage<T, Size, Dynamic, Dynamic, _Options>
 {
     internal::plain_array<T,Size,_Options> m_data;
-    DenseIndex m_rows;
-    DenseIndex m_cols;
+    Index m_rows;
+    Index m_cols;
   public:
-    inline DenseStorage() : m_rows(0), m_cols(0) {}
-    inline DenseStorage(internal::constructor_without_unaligned_array_assert)
+    EIGEN_DEVICE_FUNC DenseStorage() : m_rows(0), m_cols(0) {}
+    EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
       : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(0), m_cols(0) {}
-    inline DenseStorage(DenseIndex, DenseIndex nbRows, DenseIndex nbCols) : m_rows(nbRows), m_cols(nbCols) {}
-    inline void swap(DenseStorage& other)
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) : m_data(other.m_data), m_rows(other.m_rows), m_cols(other.m_cols) {}
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) 
+    { 
+      if (this != &other)
+      {
+        m_data = other.m_data;
+        m_rows = other.m_rows;
+        m_cols = other.m_cols;
+      }
+      return *this; 
+    }
+    EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index cols) : m_rows(rows), m_cols(cols) {}
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
     { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
-    inline DenseIndex rows() const {return m_rows;}
-    inline DenseIndex cols() const {return m_cols;}
-    inline void conservativeResize(DenseIndex, DenseIndex nbRows, DenseIndex nbCols) { m_rows = nbRows; m_cols = nbCols; }
-    inline void resize(DenseIndex, DenseIndex nbRows, DenseIndex nbCols) { m_rows = nbRows; m_cols = nbCols; }
-    inline const T *data() const { return m_data.array; }
-    inline T *data() { return m_data.array; }
+    EIGEN_DEVICE_FUNC Index rows() const {return m_rows;}
+    EIGEN_DEVICE_FUNC Index cols() const {return m_cols;}
+    EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index cols) { m_rows = rows; m_cols = cols; }
+    EIGEN_DEVICE_FUNC void resize(Index, Index rows, Index cols) { m_rows = rows; m_cols = cols; }
+    EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
+    EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
 };
 
 // dynamic-size matrix with fixed-size storage and fixed width
 template<typename T, int Size, int _Cols, int _Options> class DenseStorage<T, Size, Dynamic, _Cols, _Options>
 {
     internal::plain_array<T,Size,_Options> m_data;
-    DenseIndex m_rows;
+    Index m_rows;
   public:
-    inline DenseStorage() : m_rows(0) {}
-    inline DenseStorage(internal::constructor_without_unaligned_array_assert)
+    EIGEN_DEVICE_FUNC DenseStorage() : m_rows(0) {}
+    EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
       : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(0) {}
-    inline DenseStorage(DenseIndex, DenseIndex nbRows, DenseIndex) : m_rows(nbRows) {}
-    inline void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
-    inline DenseIndex rows(void) const {return m_rows;}
-    inline DenseIndex cols(void) const {return _Cols;}
-    inline void conservativeResize(DenseIndex, DenseIndex nbRows, DenseIndex) { m_rows = nbRows; }
-    inline void resize(DenseIndex, DenseIndex nbRows, DenseIndex) { m_rows = nbRows; }
-    inline const T *data() const { return m_data.array; }
-    inline T *data() { return m_data.array; }
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) : m_data(other.m_data), m_rows(other.m_rows) {}
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) 
+    {
+      if (this != &other)
+      {
+        m_data = other.m_data;
+        m_rows = other.m_rows;
+      }
+      return *this; 
+    }
+    EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index) : m_rows(rows) {}
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
+    EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
+    EIGEN_DEVICE_FUNC Index cols(void) const {return _Cols;}
+    EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index) { m_rows = rows; }
+    EIGEN_DEVICE_FUNC void resize(Index, Index rows, Index) { m_rows = rows; }
+    EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
+    EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
 };
 
 // dynamic-size matrix with fixed-size storage and fixed height
 template<typename T, int Size, int _Rows, int _Options> class DenseStorage<T, Size, _Rows, Dynamic, _Options>
 {
     internal::plain_array<T,Size,_Options> m_data;
-    DenseIndex m_cols;
+    Index m_cols;
   public:
-    inline DenseStorage() : m_cols(0) {}
-    inline DenseStorage(internal::constructor_without_unaligned_array_assert)
+    EIGEN_DEVICE_FUNC DenseStorage() : m_cols(0) {}
+    EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
       : m_data(internal::constructor_without_unaligned_array_assert()), m_cols(0) {}
-    inline DenseStorage(DenseIndex, DenseIndex, DenseIndex nbCols) : m_cols(nbCols) {}
-    inline void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
-    inline DenseIndex rows(void) const {return _Rows;}
-    inline DenseIndex cols(void) const {return m_cols;}
-    inline void conservativeResize(DenseIndex, DenseIndex, DenseIndex nbCols) { m_cols = nbCols; }
-    inline void resize(DenseIndex, DenseIndex, DenseIndex nbCols) { m_cols = nbCols; }
-    inline const T *data() const { return m_data.array; }
-    inline T *data() { return m_data.array; }
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) : m_data(other.m_data), m_cols(other.m_cols) {}
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
+    {
+      if (this != &other)
+      {
+        m_data = other.m_data;
+        m_cols = other.m_cols;
+      }
+      return *this;
+    }
+    EIGEN_DEVICE_FUNC DenseStorage(Index, Index, Index cols) : m_cols(cols) {}
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
+    EIGEN_DEVICE_FUNC Index rows(void) const {return _Rows;}
+    EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
+    void conservativeResize(Index, Index, Index cols) { m_cols = cols; }
+    void resize(Index, Index, Index cols) { m_cols = cols; }
+    EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
+    EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
 };
 
 // purely dynamic matrix.
 template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynamic, _Options>
 {
     T *m_data;
-    DenseIndex m_rows;
-    DenseIndex m_cols;
+    Index m_rows;
+    Index m_cols;
   public:
-    inline DenseStorage() : m_data(0), m_rows(0), m_cols(0) {}
-    inline DenseStorage(internal::constructor_without_unaligned_array_assert)
+    EIGEN_DEVICE_FUNC DenseStorage() : m_data(0), m_rows(0), m_cols(0) {}
+    EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
        : m_data(0), m_rows(0), m_cols(0) {}
-    inline DenseStorage(DenseIndex size, DenseIndex nbRows, DenseIndex nbCols)
-      : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(nbRows), m_cols(nbCols)
-    { EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN }
-    inline ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, m_rows*m_cols); }
-    inline void swap(DenseStorage& other)
+    EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols)
+      : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(rows), m_cols(cols)
+    {
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
+      eigen_internal_assert(size==rows*cols && rows>=0 && cols >=0);
+    }
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other)
+      : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(other.m_rows*other.m_cols))
+      , m_rows(other.m_rows)
+      , m_cols(other.m_cols)
+    {
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = m_rows*m_cols)
+      internal::smart_copy(other.m_data, other.m_data+other.m_rows*other.m_cols, m_data);
+    }
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
+    {
+      if (this != &other)
+      {
+        DenseStorage tmp(other);
+        this->swap(tmp);
+      }
+      return *this;
+    }
+#if EIGEN_HAS_RVALUE_REFERENCES
+    EIGEN_DEVICE_FUNC
+    DenseStorage(DenseStorage&& other) EIGEN_NOEXCEPT
+      : m_data(std::move(other.m_data))
+      , m_rows(std::move(other.m_rows))
+      , m_cols(std::move(other.m_cols))
+    {
+      other.m_data = nullptr;
+      other.m_rows = 0;
+      other.m_cols = 0;
+    }
+    EIGEN_DEVICE_FUNC
+    DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
+    {
+      using std::swap;
+      swap(m_data, other.m_data);
+      swap(m_rows, other.m_rows);
+      swap(m_cols, other.m_cols);
+      return *this;
+    }
+#endif
+    EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, m_rows*m_cols); }
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
     { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
-    inline DenseIndex rows(void) const {return m_rows;}
-    inline DenseIndex cols(void) const {return m_cols;}
-    inline void conservativeResize(DenseIndex size, DenseIndex nbRows, DenseIndex nbCols)
+    EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
+    EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
+    void conservativeResize(Index size, Index rows, Index cols)
     {
       m_data = internal::conditional_aligned_realloc_new_auto<T,(_Options&DontAlign)==0>(m_data, size, m_rows*m_cols);
-      m_rows = nbRows;
-      m_cols = nbCols;
+      m_rows = rows;
+      m_cols = cols;
     }
-    void resize(DenseIndex size, DenseIndex nbRows, DenseIndex nbCols)
+    EIGEN_DEVICE_FUNC void resize(Index size, Index rows, Index cols)
     {
       if(size != m_rows*m_cols)
       {
@@ -253,35 +408,73 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
           m_data = internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size);
         else
           m_data = 0;
-        EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+        EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
       }
-      m_rows = nbRows;
-      m_cols = nbCols;
+      m_rows = rows;
+      m_cols = cols;
     }
-    inline const T *data() const { return m_data; }
-    inline T *data() { return m_data; }
+    EIGEN_DEVICE_FUNC const T *data() const { return m_data; }
+    EIGEN_DEVICE_FUNC T *data() { return m_data; }
 };
 
 // matrix with dynamic width and fixed height (so that matrix has dynamic size).
 template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Rows, Dynamic, _Options>
 {
     T *m_data;
-    DenseIndex m_cols;
+    Index m_cols;
   public:
-    inline DenseStorage() : m_data(0), m_cols(0) {}
-    inline DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_cols(0) {}
-    inline DenseStorage(DenseIndex size, DenseIndex, DenseIndex nbCols) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_cols(nbCols)
-    { EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN }
-    inline ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Rows*m_cols); }
-    inline void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
-    static inline DenseIndex rows(void) {return _Rows;}
-    inline DenseIndex cols(void) const {return m_cols;}
-    inline void conservativeResize(DenseIndex size, DenseIndex, DenseIndex nbCols)
+    EIGEN_DEVICE_FUNC DenseStorage() : m_data(0), m_cols(0) {}
+    explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_cols(0) {}
+    EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_cols(cols)
+    {
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
+      eigen_internal_assert(size==rows*cols && rows==_Rows && cols >=0);
+      EIGEN_UNUSED_VARIABLE(rows);
+    }
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other)
+      : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(_Rows*other.m_cols))
+      , m_cols(other.m_cols)
+    {
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = m_cols*_Rows)
+      internal::smart_copy(other.m_data, other.m_data+_Rows*m_cols, m_data);
+    }
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
+    {
+      if (this != &other)
+      {
+        DenseStorage tmp(other);
+        this->swap(tmp);
+      }
+      return *this;
+    }    
+#if EIGEN_HAS_RVALUE_REFERENCES
+    EIGEN_DEVICE_FUNC
+    DenseStorage(DenseStorage&& other) EIGEN_NOEXCEPT
+      : m_data(std::move(other.m_data))
+      , m_cols(std::move(other.m_cols))
+    {
+      other.m_data = nullptr;
+      other.m_cols = 0;
+    }
+    EIGEN_DEVICE_FUNC
+    DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
+    {
+      using std::swap;
+      swap(m_data, other.m_data);
+      swap(m_cols, other.m_cols);
+      return *this;
+    }
+#endif
+    EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Rows*m_cols); }
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
+    EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
+    EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
+    EIGEN_DEVICE_FUNC void conservativeResize(Index size, Index, Index cols)
     {
       m_data = internal::conditional_aligned_realloc_new_auto<T,(_Options&DontAlign)==0>(m_data, size, _Rows*m_cols);
-      m_cols = nbCols;
+      m_cols = cols;
     }
-    EIGEN_STRONG_INLINE void resize(DenseIndex size, DenseIndex, DenseIndex nbCols)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resize(Index size, Index, Index cols)
     {
       if(size != _Rows*m_cols)
       {
@@ -290,34 +483,72 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
           m_data = internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size);
         else
           m_data = 0;
-        EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+        EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
       }
-      m_cols = nbCols;
+      m_cols = cols;
     }
-    inline const T *data() const { return m_data; }
-    inline T *data() { return m_data; }
+    EIGEN_DEVICE_FUNC const T *data() const { return m_data; }
+    EIGEN_DEVICE_FUNC T *data() { return m_data; }
 };
 
 // matrix with dynamic height and fixed width (so that matrix has dynamic size).
 template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dynamic, _Cols, _Options>
 {
     T *m_data;
-    DenseIndex m_rows;
+    Index m_rows;
   public:
-    inline DenseStorage() : m_data(0), m_rows(0) {}
-    inline DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_rows(0) {}
-    inline DenseStorage(DenseIndex size, DenseIndex nbRows, DenseIndex) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(nbRows)
-    { EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN }
-    inline ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Cols*m_rows); }
-    inline void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
-    inline DenseIndex rows(void) const {return m_rows;}
-    static inline DenseIndex cols(void) {return _Cols;}
-    inline void conservativeResize(DenseIndex size, DenseIndex nbRows, DenseIndex)
+    EIGEN_DEVICE_FUNC DenseStorage() : m_data(0), m_rows(0) {}
+    explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_rows(0) {}
+    EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(rows)
+    {
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
+      eigen_internal_assert(size==rows*cols && rows>=0 && cols == _Cols);
+      EIGEN_UNUSED_VARIABLE(cols);
+    }
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other)
+      : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(other.m_rows*_Cols))
+      , m_rows(other.m_rows)
+    {
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = m_rows*_Cols)
+      internal::smart_copy(other.m_data, other.m_data+other.m_rows*_Cols, m_data);
+    }
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
+    {
+      if (this != &other)
+      {
+        DenseStorage tmp(other);
+        this->swap(tmp);
+      }
+      return *this;
+    }    
+#if EIGEN_HAS_RVALUE_REFERENCES
+    EIGEN_DEVICE_FUNC
+    DenseStorage(DenseStorage&& other) EIGEN_NOEXCEPT
+      : m_data(std::move(other.m_data))
+      , m_rows(std::move(other.m_rows))
+    {
+      other.m_data = nullptr;
+      other.m_rows = 0;
+    }
+    EIGEN_DEVICE_FUNC
+    DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
+    {
+      using std::swap;
+      swap(m_data, other.m_data);
+      swap(m_rows, other.m_rows);
+      return *this;
+    }
+#endif
+    EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Cols*m_rows); }
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
+    EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
+    EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
+    void conservativeResize(Index size, Index rows, Index)
     {
       m_data = internal::conditional_aligned_realloc_new_auto<T,(_Options&DontAlign)==0>(m_data, size, m_rows*_Cols);
-      m_rows = nbRows;
+      m_rows = rows;
     }
-    EIGEN_STRONG_INLINE void resize(DenseIndex size, DenseIndex nbRows, DenseIndex)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resize(Index size, Index rows, Index)
     {
       if(size != m_rows*_Cols)
       {
@@ -326,12 +557,12 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
           m_data = internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size);
         else
           m_data = 0;
-        EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+        EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
       }
-      m_rows = nbRows;
+      m_rows = rows;
     }
-    inline const T *data() const { return m_data; }
-    inline T *data() { return m_data; }
+    EIGEN_DEVICE_FUNC const T *data() const { return m_data; }
+    EIGEN_DEVICE_FUNC T *data() { return m_data; }
 };
 
 } // end namespace Eigen
diff --git a/Eigen/src/Core/Diagonal.h b/Eigen/src/Core/Diagonal.h
index 68cf6d4b0..49e711257 100644
--- a/Eigen/src/Core/Diagonal.h
+++ b/Eigen/src/Core/Diagonal.h
@@ -21,7 +21,7 @@ namespace Eigen {
   * \param MatrixType the type of the object in which we are taking a sub/main/super diagonal
   * \param DiagIndex the index of the sub/super diagonal. The default is 0 and it means the main diagonal.
   *              A positive value means a superdiagonal, a negative value means a subdiagonal.
-  *              You can also use Dynamic so the index can be set at runtime.
+  *              You can also use DynamicIndex so the index can be set at runtime.
   *
   * The matrix is not required to be square.
   *
@@ -37,7 +37,7 @@ template<typename MatrixType, int DiagIndex>
 struct traits<Diagonal<MatrixType,DiagIndex> >
  : traits<MatrixType>
 {
-  typedef typename nested<MatrixType>::type MatrixTypeNested;
+  typedef typename ref_selector<MatrixType>::type MatrixTypeNested;
   typedef typename remove_reference<MatrixTypeNested>::type _MatrixTypeNested;
   typedef typename MatrixType::StorageKind StorageKind;
   enum {
@@ -52,8 +52,7 @@ struct traits<Diagonal<MatrixType,DiagIndex> >
                                                  MatrixType::MaxColsAtCompileTime - EIGEN_PLAIN_ENUM_MAX( DiagIndex, 0))),
     MaxColsAtCompileTime = 1,
     MaskLvalueBit = is_lvalue<MatrixType>::value ? LvalueBit : 0,
-    Flags = (unsigned int)_MatrixTypeNested::Flags & (HereditaryBits | LinearAccessBit | MaskLvalueBit | DirectAccessBit) & ~RowMajorBit,
-    CoeffReadCost = _MatrixTypeNested::CoeffReadCost,
+    Flags = (unsigned int)_MatrixTypeNested::Flags & (RowMajorBit | MaskLvalueBit | DirectAccessBit) & ~RowMajorBit, // FIXME DirectAccessBit should not be handled by expressions
     MatrixTypeOuterStride = outer_stride_at_compile_time<MatrixType>::ret,
     InnerStrideAtCompileTime = MatrixTypeOuterStride == Dynamic ? Dynamic : MatrixTypeOuterStride+1,
     OuterStrideAtCompileTime = 0
@@ -70,20 +69,28 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
     typedef typename internal::dense_xpr_base<Diagonal>::type Base;
     EIGEN_DENSE_PUBLIC_INTERFACE(Diagonal)
 
-    inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex) : m_matrix(matrix), m_index(a_index) {}
+    EIGEN_DEVICE_FUNC
+    explicit inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex) : m_matrix(matrix), m_index(a_index) {}
 
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Diagonal)
 
+    EIGEN_DEVICE_FUNC
     inline Index rows() const
-    { return m_index.value()<0 ? (std::min<Index>)(m_matrix.cols(),m_matrix.rows()+m_index.value()) : (std::min<Index>)(m_matrix.rows(),m_matrix.cols()-m_index.value()); }
+    {
+      return m_index.value()<0 ? numext::mini<Index>(m_matrix.cols(),m_matrix.rows()+m_index.value())
+                               : numext::mini<Index>(m_matrix.rows(),m_matrix.cols()-m_index.value());
+    }
 
+    EIGEN_DEVICE_FUNC
     inline Index cols() const { return 1; }
 
+    EIGEN_DEVICE_FUNC
     inline Index innerStride() const
     {
       return m_matrix.outerStride() + 1;
     }
 
+    EIGEN_DEVICE_FUNC
     inline Index outerStride() const
     {
       return 0;
@@ -95,62 +102,75 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
                        const Scalar
                      >::type ScalarWithConstIfNotLvalue;
 
-    inline ScalarWithConstIfNotLvalue* data() { return &(m_matrix.const_cast_derived().coeffRef(rowOffset(), colOffset())); }
-    inline const Scalar* data() const { return &(m_matrix.const_cast_derived().coeffRef(rowOffset(), colOffset())); }
+    EIGEN_DEVICE_FUNC
+    inline ScalarWithConstIfNotLvalue* data() { return &(m_matrix.coeffRef(rowOffset(), colOffset())); }
+    EIGEN_DEVICE_FUNC
+    inline const Scalar* data() const { return &(m_matrix.coeffRef(rowOffset(), colOffset())); }
 
+    EIGEN_DEVICE_FUNC
     inline Scalar& coeffRef(Index row, Index)
     {
       EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
-      return m_matrix.const_cast_derived().coeffRef(row+rowOffset(), row+colOffset());
+      return m_matrix.coeffRef(row+rowOffset(), row+colOffset());
     }
 
+    EIGEN_DEVICE_FUNC
     inline const Scalar& coeffRef(Index row, Index) const
     {
-      return m_matrix.const_cast_derived().coeffRef(row+rowOffset(), row+colOffset());
+      return m_matrix.coeffRef(row+rowOffset(), row+colOffset());
     }
 
+    EIGEN_DEVICE_FUNC
     inline CoeffReturnType coeff(Index row, Index) const
     {
       return m_matrix.coeff(row+rowOffset(), row+colOffset());
     }
 
+    EIGEN_DEVICE_FUNC
     inline Scalar& coeffRef(Index idx)
     {
       EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
-      return m_matrix.const_cast_derived().coeffRef(idx+rowOffset(), idx+colOffset());
+      return m_matrix.coeffRef(idx+rowOffset(), idx+colOffset());
     }
 
+    EIGEN_DEVICE_FUNC
     inline const Scalar& coeffRef(Index idx) const
     {
-      return m_matrix.const_cast_derived().coeffRef(idx+rowOffset(), idx+colOffset());
+      return m_matrix.coeffRef(idx+rowOffset(), idx+colOffset());
     }
 
+    EIGEN_DEVICE_FUNC
     inline CoeffReturnType coeff(Index idx) const
     {
       return m_matrix.coeff(idx+rowOffset(), idx+colOffset());
     }
 
-    const typename internal::remove_all<typename MatrixType::Nested>::type& 
+    EIGEN_DEVICE_FUNC
+    inline const typename internal::remove_all<typename MatrixType::Nested>::type& 
     nestedExpression() const 
     {
       return m_matrix;
     }
 
-    int index() const
+    EIGEN_DEVICE_FUNC
+    inline Index index() const
     {
       return m_index.value();
     }
 
   protected:
-    typename MatrixType::Nested m_matrix;
+    typename internal::ref_selector<MatrixType>::non_const_type m_matrix;
     const internal::variable_if_dynamicindex<Index, DiagIndex> m_index;
 
   private:
     // some compilers may fail to optimize std::max etc in case of compile-time constants...
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Index absDiagIndex() const { return m_index.value()>0 ? m_index.value() : -m_index.value(); }
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Index rowOffset() const { return m_index.value()>0 ? 0 : -m_index.value(); }
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Index colOffset() const { return m_index.value()>0 ? m_index.value() : 0; }
-    // triger a compile time error is someone try to call packet
+    // trigger a compile-time error if someone try to call packet
     template<int LoadMode> typename MatrixType::PacketReturnType packet(Index) const;
     template<int LoadMode> typename MatrixType::PacketReturnType packet(Index,Index) const;
 };
@@ -167,7 +187,7 @@ template<typename Derived>
 inline typename MatrixBase<Derived>::DiagonalReturnType
 MatrixBase<Derived>::diagonal()
 {
-  return derived();
+  return DiagonalReturnType(derived());
 }
 
 /** This is the const version of diagonal(). */
@@ -216,20 +236,20 @@ MatrixBase<Derived>::diagonal(Index index) const
   *
   * \sa MatrixBase::diagonal(), class Diagonal */
 template<typename Derived>
-template<int Index>
-inline typename MatrixBase<Derived>::template DiagonalIndexReturnType<Index>::Type
+template<int Index_>
+inline typename MatrixBase<Derived>::template DiagonalIndexReturnType<Index_>::Type
 MatrixBase<Derived>::diagonal()
 {
-  return derived();
+  return typename DiagonalIndexReturnType<Index_>::Type(derived());
 }
 
 /** This is the const version of diagonal<int>(). */
 template<typename Derived>
-template<int Index>
-inline typename MatrixBase<Derived>::template ConstDiagonalIndexReturnType<Index>::Type
+template<int Index_>
+inline typename MatrixBase<Derived>::template ConstDiagonalIndexReturnType<Index_>::Type
 MatrixBase<Derived>::diagonal() const
 {
-  return derived();
+  return typename ConstDiagonalIndexReturnType<Index_>::Type(derived());
 }
 
 } // end namespace Eigen
diff --git a/Eigen/src/Core/DiagonalMatrix.h b/Eigen/src/Core/DiagonalMatrix.h
index e6c220f41..ecfdce8ef 100644
--- a/Eigen/src/Core/DiagonalMatrix.h
+++ b/Eigen/src/Core/DiagonalMatrix.h
@@ -22,7 +22,7 @@ class DiagonalBase : public EigenBase<Derived>
     typedef typename DiagonalVectorType::Scalar Scalar;
     typedef typename DiagonalVectorType::RealScalar RealScalar;
     typedef typename internal::traits<Derived>::StorageKind StorageKind;
-    typedef typename internal::traits<Derived>::Index Index;
+    typedef typename internal::traits<Derived>::StorageIndex StorageIndex;
 
     enum {
       RowsAtCompileTime = DiagonalVectorType::SizeAtCompileTime,
@@ -30,79 +30,61 @@ class DiagonalBase : public EigenBase<Derived>
       MaxRowsAtCompileTime = DiagonalVectorType::MaxSizeAtCompileTime,
       MaxColsAtCompileTime = DiagonalVectorType::MaxSizeAtCompileTime,
       IsVectorAtCompileTime = 0,
-      Flags = 0
+      Flags = NoPreferredStorageOrderBit
     };
 
     typedef Matrix<Scalar, RowsAtCompileTime, ColsAtCompileTime, 0, MaxRowsAtCompileTime, MaxColsAtCompileTime> DenseMatrixType;
     typedef DenseMatrixType DenseType;
     typedef DiagonalMatrix<Scalar,DiagonalVectorType::SizeAtCompileTime,DiagonalVectorType::MaxSizeAtCompileTime> PlainObject;
 
+    EIGEN_DEVICE_FUNC
     inline const Derived& derived() const { return *static_cast<const Derived*>(this); }
+    EIGEN_DEVICE_FUNC
     inline Derived& derived() { return *static_cast<Derived*>(this); }
 
+    EIGEN_DEVICE_FUNC
     DenseMatrixType toDenseMatrix() const { return derived(); }
-    template<typename DenseDerived>
-    void evalTo(MatrixBase<DenseDerived> &other) const;
-    template<typename DenseDerived>
-    void addTo(MatrixBase<DenseDerived> &other) const
-    { other.diagonal() += diagonal(); }
-    template<typename DenseDerived>
-    void subTo(MatrixBase<DenseDerived> &other) const
-    { other.diagonal() -= diagonal(); }
-
+    
+    EIGEN_DEVICE_FUNC
     inline const DiagonalVectorType& diagonal() const { return derived().diagonal(); }
+    EIGEN_DEVICE_FUNC
     inline DiagonalVectorType& diagonal() { return derived().diagonal(); }
 
+    EIGEN_DEVICE_FUNC
     inline Index rows() const { return diagonal().size(); }
+    EIGEN_DEVICE_FUNC
     inline Index cols() const { return diagonal().size(); }
 
-    /** \returns the diagonal matrix product of \c *this by the matrix \a matrix.
-      */
     template<typename MatrixDerived>
-    const DiagonalProduct<MatrixDerived, Derived, OnTheLeft>
+    EIGEN_DEVICE_FUNC
+    const Product<Derived,MatrixDerived,LazyProduct>
     operator*(const MatrixBase<MatrixDerived> &matrix) const
     {
-      return DiagonalProduct<MatrixDerived, Derived, OnTheLeft>(matrix.derived(), derived());
+      return Product<Derived, MatrixDerived, LazyProduct>(derived(),matrix.derived());
     }
 
-    inline const DiagonalWrapper<const CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const DiagonalVectorType> >
+    typedef DiagonalWrapper<const CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const DiagonalVectorType> > InverseReturnType;
+    EIGEN_DEVICE_FUNC
+    inline const InverseReturnType
     inverse() const
     {
-      return diagonal().cwiseInverse();
+      return InverseReturnType(diagonal().cwiseInverse());
     }
     
-    inline const DiagonalWrapper<const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DiagonalVectorType> >
+    EIGEN_DEVICE_FUNC
+    inline const DiagonalWrapper<const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DiagonalVectorType,Scalar,product) >
     operator*(const Scalar& scalar) const
     {
-      return diagonal() * scalar;
+      return DiagonalWrapper<const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DiagonalVectorType,Scalar,product) >(diagonal() * scalar);
     }
-    friend inline const DiagonalWrapper<const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DiagonalVectorType> >
+    EIGEN_DEVICE_FUNC
+    friend inline const DiagonalWrapper<const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,DiagonalVectorType,product) >
     operator*(const Scalar& scalar, const DiagonalBase& other)
     {
-      return other.diagonal() * scalar;
-    }
-    
-    #ifdef EIGEN2_SUPPORT
-    template<typename OtherDerived>
-    bool isApprox(const DiagonalBase<OtherDerived>& other, typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision()) const
-    {
-      return diagonal().isApprox(other.diagonal(), precision);
+      return DiagonalWrapper<const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,DiagonalVectorType,product) >(scalar * other.diagonal());
     }
-    template<typename OtherDerived>
-    bool isApprox(const MatrixBase<OtherDerived>& other, typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision()) const
-    {
-      return toDenseMatrix().isApprox(other, precision);
-    }
-    #endif
 };
 
-template<typename Derived>
-template<typename DenseDerived>
-void DiagonalBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
-{
-  other.setZero();
-  other.diagonal() = diagonal();
-}
 #endif
 
 /** \class DiagonalMatrix
@@ -124,10 +106,9 @@ struct traits<DiagonalMatrix<_Scalar,SizeAtCompileTime,MaxSizeAtCompileTime> >
  : traits<Matrix<_Scalar,SizeAtCompileTime,SizeAtCompileTime,0,MaxSizeAtCompileTime,MaxSizeAtCompileTime> >
 {
   typedef Matrix<_Scalar,SizeAtCompileTime,1,0,MaxSizeAtCompileTime,1> DiagonalVectorType;
-  typedef Dense StorageKind;
-  typedef DenseIndex Index;
+  typedef DiagonalShape StorageKind;
   enum {
-    Flags = LvalueBit
+    Flags = LvalueBit | NoPreferredStorageOrderBit
   };
 };
 }
@@ -141,7 +122,7 @@ class DiagonalMatrix
     typedef const DiagonalMatrix& Nested;
     typedef _Scalar Scalar;
     typedef typename internal::traits<DiagonalMatrix>::StorageKind StorageKind;
-    typedef typename internal::traits<DiagonalMatrix>::Index Index;
+    typedef typename internal::traits<DiagonalMatrix>::StorageIndex StorageIndex;
     #endif
 
   protected:
@@ -151,24 +132,31 @@ class DiagonalMatrix
   public:
 
     /** const version of diagonal(). */
+    EIGEN_DEVICE_FUNC
     inline const DiagonalVectorType& diagonal() const { return m_diagonal; }
     /** \returns a reference to the stored vector of diagonal coefficients. */
+    EIGEN_DEVICE_FUNC
     inline DiagonalVectorType& diagonal() { return m_diagonal; }
 
     /** Default constructor without initialization */
+    EIGEN_DEVICE_FUNC
     inline DiagonalMatrix() {}
 
     /** Constructs a diagonal matrix with given dimension  */
-    inline DiagonalMatrix(Index dim) : m_diagonal(dim) {}
+    EIGEN_DEVICE_FUNC
+    explicit inline DiagonalMatrix(Index dim) : m_diagonal(dim) {}
 
     /** 2D constructor. */
+    EIGEN_DEVICE_FUNC
     inline DiagonalMatrix(const Scalar& x, const Scalar& y) : m_diagonal(x,y) {}
 
     /** 3D constructor. */
+    EIGEN_DEVICE_FUNC
     inline DiagonalMatrix(const Scalar& x, const Scalar& y, const Scalar& z) : m_diagonal(x,y,z) {}
 
     /** Copy constructor. */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     inline DiagonalMatrix(const DiagonalBase<OtherDerived>& other) : m_diagonal(other.diagonal()) {}
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
@@ -178,11 +166,13 @@ class DiagonalMatrix
 
     /** generic constructor from expression of the diagonal coefficients */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     explicit inline DiagonalMatrix(const MatrixBase<OtherDerived>& other) : m_diagonal(other)
     {}
 
     /** Copy operator. */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     DiagonalMatrix& operator=(const DiagonalBase<OtherDerived>& other)
     {
       m_diagonal = other.diagonal();
@@ -193,6 +183,7 @@ class DiagonalMatrix
     /** This is a special case of the templated operator=. Its purpose is to
       * prevent a default operator= from hiding the templated operator=.
       */
+    EIGEN_DEVICE_FUNC
     DiagonalMatrix& operator=(const DiagonalMatrix& other)
     {
       m_diagonal = other.diagonal();
@@ -201,14 +192,19 @@ class DiagonalMatrix
     #endif
 
     /** Resizes to given size. */
+    EIGEN_DEVICE_FUNC
     inline void resize(Index size) { m_diagonal.resize(size); }
     /** Sets all coefficients to zero. */
+    EIGEN_DEVICE_FUNC
     inline void setZero() { m_diagonal.setZero(); }
     /** Resizes and sets all coefficients to zero. */
+    EIGEN_DEVICE_FUNC
     inline void setZero(Index size) { m_diagonal.setZero(size); }
     /** Sets this matrix to be the identity matrix of the current size. */
+    EIGEN_DEVICE_FUNC
     inline void setIdentity() { m_diagonal.setOnes(); }
     /** Sets this matrix to be the identity matrix of the given size. */
+    EIGEN_DEVICE_FUNC
     inline void setIdentity(Index size) { m_diagonal.setOnes(size); }
 };
 
@@ -232,14 +228,15 @@ struct traits<DiagonalWrapper<_DiagonalVectorType> >
 {
   typedef _DiagonalVectorType DiagonalVectorType;
   typedef typename DiagonalVectorType::Scalar Scalar;
-  typedef typename DiagonalVectorType::Index Index;
-  typedef typename DiagonalVectorType::StorageKind StorageKind;
+  typedef typename DiagonalVectorType::StorageIndex StorageIndex;
+  typedef DiagonalShape StorageKind;
+  typedef typename traits<DiagonalVectorType>::XprKind XprKind;
   enum {
     RowsAtCompileTime = DiagonalVectorType::SizeAtCompileTime,
     ColsAtCompileTime = DiagonalVectorType::SizeAtCompileTime,
-    MaxRowsAtCompileTime = DiagonalVectorType::SizeAtCompileTime,
-    MaxColsAtCompileTime = DiagonalVectorType::SizeAtCompileTime,
-    Flags =  traits<DiagonalVectorType>::Flags & LvalueBit
+    MaxRowsAtCompileTime = DiagonalVectorType::MaxSizeAtCompileTime,
+    MaxColsAtCompileTime = DiagonalVectorType::MaxSizeAtCompileTime,
+    Flags =  (traits<DiagonalVectorType>::Flags & LvalueBit) | NoPreferredStorageOrderBit
   };
 };
 }
@@ -255,9 +252,11 @@ class DiagonalWrapper
     #endif
 
     /** Constructor from expression of diagonal coefficients to wrap. */
-    inline DiagonalWrapper(DiagonalVectorType& a_diagonal) : m_diagonal(a_diagonal) {}
+    EIGEN_DEVICE_FUNC
+    explicit inline DiagonalWrapper(DiagonalVectorType& a_diagonal) : m_diagonal(a_diagonal) {}
 
     /** \returns a const reference to the wrapped expression of diagonal coefficients. */
+    EIGEN_DEVICE_FUNC
     const DiagonalVectorType& diagonal() const { return m_diagonal; }
 
   protected:
@@ -277,7 +276,7 @@ template<typename Derived>
 inline const DiagonalWrapper<const Derived>
 MatrixBase<Derived>::asDiagonal() const
 {
-  return derived();
+  return DiagonalWrapper<const Derived>(derived());
 }
 
 /** \returns true if *this is approximately equal to a diagonal matrix,
@@ -291,12 +290,11 @@ MatrixBase<Derived>::asDiagonal() const
 template<typename Derived>
 bool MatrixBase<Derived>::isDiagonal(const RealScalar& prec) const
 {
-  using std::abs;
   if(cols() != rows()) return false;
   RealScalar maxAbsOnDiagonal = static_cast<RealScalar>(-1);
   for(Index j = 0; j < cols(); ++j)
   {
-    RealScalar absOnDiagonal = abs(coeff(j,j));
+    RealScalar absOnDiagonal = numext::abs(coeff(j,j));
     if(absOnDiagonal > maxAbsOnDiagonal) maxAbsOnDiagonal = absOnDiagonal;
   }
   for(Index j = 0; j < cols(); ++j)
@@ -308,6 +306,38 @@ bool MatrixBase<Derived>::isDiagonal(const RealScalar& prec) const
   return true;
 }
 
+namespace internal {
+
+template<> struct storage_kind_to_shape<DiagonalShape> { typedef DiagonalShape Shape; };
+
+struct Diagonal2Dense {};
+
+template<> struct AssignmentKind<DenseShape,DiagonalShape> { typedef Diagonal2Dense Kind; };
+
+// Diagonal matrix to Dense assignment
+template< typename DstXprType, typename SrcXprType, typename Functor>
+struct Assignment<DstXprType, SrcXprType, Functor, Diagonal2Dense>
+{
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
+  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
+    
+    dst.setZero();
+    dst.diagonal() = src.diagonal();
+  }
+  
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
+  { dst.diagonal() += src.diagonal(); }
+  
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
+  { dst.diagonal() -= src.diagonal(); }
+};
+
+} // namespace internal
+
 } // end namespace Eigen
 
 #endif // EIGEN_DIAGONALMATRIX_H
diff --git a/Eigen/src/Core/DiagonalProduct.h b/Eigen/src/Core/DiagonalProduct.h
index 00f8f2915..d372b938f 100644
--- a/Eigen/src/Core/DiagonalProduct.h
+++ b/Eigen/src/Core/DiagonalProduct.h
@@ -13,116 +13,14 @@
 
 namespace Eigen { 
 
-namespace internal {
-template<typename MatrixType, typename DiagonalType, int ProductOrder>
-struct traits<DiagonalProduct<MatrixType, DiagonalType, ProductOrder> >
- : traits<MatrixType>
-{
-  typedef typename scalar_product_traits<typename MatrixType::Scalar, typename DiagonalType::Scalar>::ReturnType Scalar;
-  enum {
-    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-    ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
-    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
-
-    _StorageOrder = MatrixType::Flags & RowMajorBit ? RowMajor : ColMajor,
-    _ScalarAccessOnDiag =  !((int(_StorageOrder) == ColMajor && int(ProductOrder) == OnTheLeft)
-                          ||(int(_StorageOrder) == RowMajor && int(ProductOrder) == OnTheRight)),
-    _SameTypes = is_same<typename MatrixType::Scalar, typename DiagonalType::Scalar>::value,
-    // FIXME currently we need same types, but in the future the next rule should be the one
-    //_Vectorizable = bool(int(MatrixType::Flags)&PacketAccessBit) && ((!_PacketOnDiag) || (_SameTypes && bool(int(DiagonalType::DiagonalVectorType::Flags)&PacketAccessBit))),
-    _Vectorizable = bool(int(MatrixType::Flags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagonalType::DiagonalVectorType::Flags)&PacketAccessBit))),
-    _LinearAccessMask = (RowsAtCompileTime==1 || ColsAtCompileTime==1) ? LinearAccessBit : 0,
-
-    Flags = ((HereditaryBits|_LinearAccessMask|AlignedBit) & (unsigned int)(MatrixType::Flags)) | (_Vectorizable ? PacketAccessBit : 0),//(int(MatrixType::Flags)&int(DiagonalType::DiagonalVectorType::Flags)&AlignedBit),
-    CoeffReadCost = NumTraits<Scalar>::MulCost + MatrixType::CoeffReadCost + DiagonalType::DiagonalVectorType::CoeffReadCost
-  };
-};
-}
-
-template<typename MatrixType, typename DiagonalType, int ProductOrder>
-class DiagonalProduct : internal::no_assignment_operator,
-                        public MatrixBase<DiagonalProduct<MatrixType, DiagonalType, ProductOrder> >
-{
-  public:
-
-    typedef MatrixBase<DiagonalProduct> Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(DiagonalProduct)
-
-    inline DiagonalProduct(const MatrixType& matrix, const DiagonalType& diagonal)
-      : m_matrix(matrix), m_diagonal(diagonal)
-    {
-      eigen_assert(diagonal.diagonal().size() == (ProductOrder == OnTheLeft ? matrix.rows() : matrix.cols()));
-    }
-
-    EIGEN_STRONG_INLINE Index rows() const { return m_matrix.rows(); }
-    EIGEN_STRONG_INLINE Index cols() const { return m_matrix.cols(); }
-
-    EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const
-    {
-      return m_diagonal.diagonal().coeff(ProductOrder == OnTheLeft ? row : col) * m_matrix.coeff(row, col);
-    }
-    
-    EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const
-    {
-      enum {
-        StorageOrder = int(MatrixType::Flags) & RowMajorBit ? RowMajor : ColMajor
-      };
-      return coeff(int(StorageOrder)==ColMajor?idx:0,int(StorageOrder)==ColMajor?0:idx);
-    }
-
-    template<int LoadMode>
-    EIGEN_STRONG_INLINE PacketScalar packet(Index row, Index col) const
-    {
-      enum {
-        StorageOrder = Flags & RowMajorBit ? RowMajor : ColMajor
-      };
-      const Index indexInDiagonalVector = ProductOrder == OnTheLeft ? row : col;
-      return packet_impl<LoadMode>(row,col,indexInDiagonalVector,typename internal::conditional<
-        ((int(StorageOrder) == RowMajor && int(ProductOrder) == OnTheLeft)
-       ||(int(StorageOrder) == ColMajor && int(ProductOrder) == OnTheRight)), internal::true_type, internal::false_type>::type());
-    }
-    
-    template<int LoadMode>
-    EIGEN_STRONG_INLINE PacketScalar packet(Index idx) const
-    {
-      enum {
-        StorageOrder = int(MatrixType::Flags) & RowMajorBit ? RowMajor : ColMajor
-      };
-      return packet<LoadMode>(int(StorageOrder)==ColMajor?idx:0,int(StorageOrder)==ColMajor?0:idx);
-    }
-
-  protected:
-    template<int LoadMode>
-    EIGEN_STRONG_INLINE PacketScalar packet_impl(Index row, Index col, Index id, internal::true_type) const
-    {
-      return internal::pmul(m_matrix.template packet<LoadMode>(row, col),
-                     internal::pset1<PacketScalar>(m_diagonal.diagonal().coeff(id)));
-    }
-
-    template<int LoadMode>
-    EIGEN_STRONG_INLINE PacketScalar packet_impl(Index row, Index col, Index id, internal::false_type) const
-    {
-      enum {
-        InnerSize = (MatrixType::Flags & RowMajorBit) ? MatrixType::ColsAtCompileTime : MatrixType::RowsAtCompileTime,
-        DiagonalVectorPacketLoadMode = (LoadMode == Aligned && (((InnerSize%16) == 0) || (int(DiagonalType::DiagonalVectorType::Flags)&AlignedBit)==AlignedBit) ? Aligned : Unaligned)
-      };
-      return internal::pmul(m_matrix.template packet<LoadMode>(row, col),
-                     m_diagonal.diagonal().template packet<DiagonalVectorPacketLoadMode>(id));
-    }
-
-    typename MatrixType::Nested m_matrix;
-    typename DiagonalType::Nested m_diagonal;
-};
-
 /** \returns the diagonal matrix product of \c *this by the diagonal matrix \a diagonal.
   */
 template<typename Derived>
 template<typename DiagonalDerived>
-inline const DiagonalProduct<Derived, DiagonalDerived, OnTheRight>
+inline const Product<Derived, DiagonalDerived, LazyProduct>
 MatrixBase<Derived>::operator*(const DiagonalBase<DiagonalDerived> &a_diagonal) const
 {
-  return DiagonalProduct<Derived, DiagonalDerived, OnTheRight>(derived(), a_diagonal.derived());
+  return Product<Derived, DiagonalDerived, LazyProduct>(derived(),a_diagonal.derived());
 }
 
 } // end namespace Eigen
diff --git a/Eigen/src/Core/Dot.h b/Eigen/src/Core/Dot.h
index 9d7651f1f..06ef18b8b 100644
--- a/Eigen/src/Core/Dot.h
+++ b/Eigen/src/Core/Dot.h
@@ -28,26 +28,31 @@ template<typename T, typename U,
 >
 struct dot_nocheck
 {
-  typedef typename scalar_product_traits<typename traits<T>::Scalar,typename traits<U>::Scalar>::ReturnType ResScalar;
+  typedef scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> conj_prod;
+  typedef typename conj_prod::result_type ResScalar;
+  EIGEN_DEVICE_FUNC
   static inline ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
   {
-    return a.template binaryExpr<scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> >(b).sum();
+    return a.template binaryExpr<conj_prod>(b).sum();
   }
 };
 
 template<typename T, typename U>
 struct dot_nocheck<T, U, true>
 {
-  typedef typename scalar_product_traits<typename traits<T>::Scalar,typename traits<U>::Scalar>::ReturnType ResScalar;
+  typedef scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> conj_prod;
+  typedef typename conj_prod::result_type ResScalar;
+  EIGEN_DEVICE_FUNC
   static inline ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
   {
-    return a.transpose().template binaryExpr<scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> >(b).sum();
+    return a.transpose().template binaryExpr<conj_prod>(b).sum();
   }
 };
 
 } // end namespace internal
 
-/** \returns the dot product of *this with other.
+/** \fn MatrixBase::dot
+  * \returns the dot product of *this with other.
   *
   * \only_for_vectors
   *
@@ -59,55 +64,30 @@ struct dot_nocheck<T, U, true>
   */
 template<typename Derived>
 template<typename OtherDerived>
-typename internal::scalar_product_traits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType
+EIGEN_DEVICE_FUNC
+typename ScalarBinaryOpTraits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType
 MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
   EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(Derived,OtherDerived)
+#if !(defined(EIGEN_NO_STATIC_ASSERT) && defined(EIGEN_NO_DEBUG))
   typedef internal::scalar_conj_product_op<Scalar,typename OtherDerived::Scalar> func;
   EIGEN_CHECK_BINARY_COMPATIBILIY(func,Scalar,typename OtherDerived::Scalar);
-
+#endif
+  
   eigen_assert(size() == other.size());
 
   return internal::dot_nocheck<Derived,OtherDerived>::run(*this, other);
 }
 
-#ifdef EIGEN2_SUPPORT
-/** \returns the dot product of *this with other, with the Eigen2 convention that the dot product is linear in the first variable
-  * (conjugating the second variable). Of course this only makes a difference in the complex case.
-  *
-  * This method is only available in EIGEN2_SUPPORT mode.
-  *
-  * \only_for_vectors
-  *
-  * \sa dot()
-  */
-template<typename Derived>
-template<typename OtherDerived>
-typename internal::traits<Derived>::Scalar
-MatrixBase<Derived>::eigen2_dot(const MatrixBase<OtherDerived>& other) const
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
-  EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(Derived,OtherDerived)
-  EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename OtherDerived::Scalar>::value),
-    YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
-
-  eigen_assert(size() == other.size());
-
-  return internal::dot_nocheck<OtherDerived,Derived>::run(other,*this);
-}
-#endif
-
-
 //---------- implementation of L2 norm and related functions ----------
 
 /** \returns, for vectors, the squared \em l2 norm of \c *this, and for matrices the Frobenius norm.
   * In both cases, it consists in the sum of the square of all the matrix entries.
   * For vectors, this is also equals to the dot product of \c *this with itself.
   *
-  * \sa dot(), norm()
+  * \sa dot(), norm(), lpNorm()
   */
 template<typename Derived>
 EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::squaredNorm() const
@@ -119,16 +99,18 @@ EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scala
   * In both cases, it consists in the square root of the sum of the square of all the matrix entries.
   * For vectors, this is also equals to the square root of the dot product of \c *this with itself.
   *
-  * \sa dot(), squaredNorm()
+  * \sa lpNorm(), dot(), squaredNorm()
   */
 template<typename Derived>
 inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::norm() const
 {
-  using std::sqrt;
-  return sqrt(squaredNorm());
+  return numext::sqrt(squaredNorm());
 }
 
-/** \returns an expression of the quotient of *this by its own norm.
+/** \returns an expression of the quotient of \c *this by its own norm.
+  *
+  * \warning If the input vector is too small (i.e., this->norm()==0),
+  *          then this function returns a copy of the input.
   *
   * \only_for_vectors
   *
@@ -138,22 +120,77 @@ template<typename Derived>
 inline const typename MatrixBase<Derived>::PlainObject
 MatrixBase<Derived>::normalized() const
 {
-  typedef typename internal::nested<Derived>::type Nested;
-  typedef typename internal::remove_reference<Nested>::type _Nested;
+  typedef typename internal::nested_eval<Derived,2>::type _Nested;
   _Nested n(derived());
-  return n / n.norm();
+  RealScalar z = n.squaredNorm();
+  // NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU
+  if(z>RealScalar(0))
+    return n / numext::sqrt(z);
+  else
+    return n;
 }
 
 /** Normalizes the vector, i.e. divides it by its own norm.
   *
   * \only_for_vectors
   *
+  * \warning If the input vector is too small (i.e., this->norm()==0), then \c *this is left unchanged.
+  *
   * \sa norm(), normalized()
   */
 template<typename Derived>
 inline void MatrixBase<Derived>::normalize()
 {
-  *this /= norm();
+  RealScalar z = squaredNorm();
+  // NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU
+  if(z>RealScalar(0))
+    derived() /= numext::sqrt(z);
+}
+
+/** \returns an expression of the quotient of \c *this by its own norm while avoiding underflow and overflow.
+  *
+  * \only_for_vectors
+  *
+  * This method is analogue to the normalized() method, but it reduces the risk of
+  * underflow and overflow when computing the norm.
+  *
+  * \warning If the input vector is too small (i.e., this->norm()==0),
+  *          then this function returns a copy of the input.
+  *
+  * \sa stableNorm(), stableNormalize(), normalized()
+  */
+template<typename Derived>
+inline const typename MatrixBase<Derived>::PlainObject
+MatrixBase<Derived>::stableNormalized() const
+{
+  typedef typename internal::nested_eval<Derived,3>::type _Nested;
+  _Nested n(derived());
+  RealScalar w = n.cwiseAbs().maxCoeff();
+  RealScalar z = (n/w).squaredNorm();
+  if(z>RealScalar(0))
+    return n / (numext::sqrt(z)*w);
+  else
+    return n;
+}
+
+/** Normalizes the vector while avoid underflow and overflow
+  *
+  * \only_for_vectors
+  *
+  * This method is analogue to the normalize() method, but it reduces the risk of
+  * underflow and overflow when computing the norm.
+  *
+  * \warning If the input vector is too small (i.e., this->norm()==0), then \c *this is left unchanged.
+  *
+  * \sa stableNorm(), stableNormalized(), normalize()
+  */
+template<typename Derived>
+inline void MatrixBase<Derived>::stableNormalize()
+{
+  RealScalar w = cwiseAbs().maxCoeff();
+  RealScalar z = (derived()/w).squaredNorm();
+  if(z>RealScalar(0))
+    derived() /= numext::sqrt(z)*w;
 }
 
 //---------- implementation of other norms ----------
@@ -164,9 +201,10 @@ template<typename Derived, int p>
 struct lpNorm_selector
 {
   typedef typename NumTraits<typename traits<Derived>::Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
   static inline RealScalar run(const MatrixBase<Derived>& m)
   {
-    using std::pow;
+    EIGEN_USING_STD_MATH(pow)
     return pow(m.cwiseAbs().array().pow(p).sum(), RealScalar(1)/p);
   }
 };
@@ -174,6 +212,7 @@ struct lpNorm_selector
 template<typename Derived>
 struct lpNorm_selector<Derived, 1>
 {
+  EIGEN_DEVICE_FUNC
   static inline typename NumTraits<typename traits<Derived>::Scalar>::Real run(const MatrixBase<Derived>& m)
   {
     return m.cwiseAbs().sum();
@@ -183,6 +222,7 @@ struct lpNorm_selector<Derived, 1>
 template<typename Derived>
 struct lpNorm_selector<Derived, 2>
 {
+  EIGEN_DEVICE_FUNC
   static inline typename NumTraits<typename traits<Derived>::Scalar>::Real run(const MatrixBase<Derived>& m)
   {
     return m.norm();
@@ -192,23 +232,35 @@ struct lpNorm_selector<Derived, 2>
 template<typename Derived>
 struct lpNorm_selector<Derived, Infinity>
 {
-  static inline typename NumTraits<typename traits<Derived>::Scalar>::Real run(const MatrixBase<Derived>& m)
+  typedef typename NumTraits<typename traits<Derived>::Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline RealScalar run(const MatrixBase<Derived>& m)
   {
+    if(Derived::SizeAtCompileTime==0 || (Derived::SizeAtCompileTime==Dynamic && m.size()==0))
+      return RealScalar(0);
     return m.cwiseAbs().maxCoeff();
   }
 };
 
 } // end namespace internal
 
-/** \returns the \f$ \ell^p \f$ norm of *this, that is, returns the p-th root of the sum of the p-th powers of the absolute values
-  *          of the coefficients of *this. If \a p is the special value \a Eigen::Infinity, this function returns the \f$ \ell^\infty \f$
-  *          norm, that is the maximum of the absolute values of the coefficients of *this.
+/** \returns the \b coefficient-wise \f$ \ell^p \f$ norm of \c *this, that is, returns the p-th root of the sum of the p-th powers of the absolute values
+  *          of the coefficients of \c *this. If \a p is the special value \a Eigen::Infinity, this function returns the \f$ \ell^\infty \f$
+  *          norm, that is the maximum of the absolute values of the coefficients of \c *this.
+  *
+  * In all cases, if \c *this is empty, then the value 0 is returned.
+  *
+  * \note For matrices, this function does not compute the <a href="https://en.wikipedia.org/wiki/Operator_norm">operator-norm</a>. That is, if \c *this is a matrix, then its coefficients are interpreted as a 1D vector. Nonetheless, you can easily compute the 1-norm and \f$\infty\f$-norm matrix operator norms using \link TutorialReductionsVisitorsBroadcastingReductionsNorm partial reductions \endlink.
   *
   * \sa norm()
   */
 template<typename Derived>
 template<int p>
+#ifndef EIGEN_PARSED_BY_DOXYGEN
 inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
+#else
+MatrixBase<Derived>::RealScalar
+#endif
 MatrixBase<Derived>::lpNorm() const
 {
   return internal::lpNorm_selector<Derived, p>::run(*this);
@@ -227,8 +279,8 @@ template<typename OtherDerived>
 bool MatrixBase<Derived>::isOrthogonal
 (const MatrixBase<OtherDerived>& other, const RealScalar& prec) const
 {
-  typename internal::nested<Derived,2>::type nested(derived());
-  typename internal::nested<OtherDerived,2>::type otherNested(other.derived());
+  typename internal::nested_eval<Derived,2>::type nested(derived());
+  typename internal::nested_eval<OtherDerived,2>::type otherNested(other.derived());
   return numext::abs2(nested.dot(otherNested)) <= prec * prec * nested.squaredNorm() * otherNested.squaredNorm();
 }
 
@@ -246,13 +298,13 @@ bool MatrixBase<Derived>::isOrthogonal
 template<typename Derived>
 bool MatrixBase<Derived>::isUnitary(const RealScalar& prec) const
 {
-  typename Derived::Nested nested(derived());
+  typename internal::nested_eval<Derived,1>::type self(derived());
   for(Index i = 0; i < cols(); ++i)
   {
-    if(!internal::isApprox(nested.col(i).squaredNorm(), static_cast<RealScalar>(1), prec))
+    if(!internal::isApprox(self.col(i).squaredNorm(), static_cast<RealScalar>(1), prec))
       return false;
     for(Index j = 0; j < i; ++j)
-      if(!internal::isMuchSmallerThan(nested.col(i).dot(nested.col(j)), static_cast<Scalar>(1), prec))
+      if(!internal::isMuchSmallerThan(self.col(i).dot(self.col(j)), static_cast<Scalar>(1), prec))
         return false;
   }
   return true;
diff --git a/Eigen/src/Core/EigenBase.h b/Eigen/src/Core/EigenBase.h
index fadb45852..f76995af9 100644
--- a/Eigen/src/Core/EigenBase.h
+++ b/Eigen/src/Core/EigenBase.h
@@ -13,7 +13,9 @@
 
 namespace Eigen {
 
-/** Common base class for all classes T such that MatrixBase has an operator=(T) and a constructor MatrixBase(T).
+/** \class EigenBase
+  * 
+  * Common base class for all classes T such that MatrixBase has an operator=(T) and a constructor MatrixBase(T).
   *
   * In other words, an EigenBase object is an object that can be copied into a MatrixBase.
   *
@@ -21,39 +23,57 @@ namespace Eigen {
   *
   * Notice that this class is trivial, it is only used to disambiguate overloaded functions.
   *
-  * \sa \ref TopicClassHierarchy
+  * \sa \blank \ref TopicClassHierarchy
   */
 template<typename Derived> struct EigenBase
 {
 //   typedef typename internal::plain_matrix_type<Derived>::type PlainObject;
-
+  
+  /** \brief The interface type of indices
+    * \details To change this, \c \#define the preprocessor symbol \c EIGEN_DEFAULT_DENSE_INDEX_TYPE.
+    * \deprecated Since Eigen 3.3, its usage is deprecated. Use Eigen::Index instead.
+    * \sa StorageIndex, \ref TopicPreprocessorDirectives.
+    */
+  typedef Eigen::Index Index;
+
+  // FIXME is it needed?
   typedef typename internal::traits<Derived>::StorageKind StorageKind;
-  typedef typename internal::traits<Derived>::Index Index;
 
   /** \returns a reference to the derived object */
+  EIGEN_DEVICE_FUNC
   Derived& derived() { return *static_cast<Derived*>(this); }
   /** \returns a const reference to the derived object */
+  EIGEN_DEVICE_FUNC
   const Derived& derived() const { return *static_cast<const Derived*>(this); }
 
+  EIGEN_DEVICE_FUNC
   inline Derived& const_cast_derived() const
   { return *static_cast<Derived*>(const_cast<EigenBase*>(this)); }
+  EIGEN_DEVICE_FUNC
   inline const Derived& const_derived() const
   { return *static_cast<const Derived*>(this); }
 
   /** \returns the number of rows. \sa cols(), RowsAtCompileTime */
+  EIGEN_DEVICE_FUNC
   inline Index rows() const { return derived().rows(); }
   /** \returns the number of columns. \sa rows(), ColsAtCompileTime*/
+  EIGEN_DEVICE_FUNC
   inline Index cols() const { return derived().cols(); }
   /** \returns the number of coefficients, which is rows()*cols().
     * \sa rows(), cols(), SizeAtCompileTime. */
+  EIGEN_DEVICE_FUNC
   inline Index size() const { return rows() * cols(); }
 
   /** \internal Don't use it, but do the equivalent: \code dst = *this; \endcode */
-  template<typename Dest> inline void evalTo(Dest& dst) const
+  template<typename Dest>
+  EIGEN_DEVICE_FUNC
+  inline void evalTo(Dest& dst) const
   { derived().evalTo(dst); }
 
   /** \internal Don't use it, but do the equivalent: \code dst += *this; \endcode */
-  template<typename Dest> inline void addTo(Dest& dst) const
+  template<typename Dest>
+  EIGEN_DEVICE_FUNC
+  inline void addTo(Dest& dst) const
   {
     // This is the default implementation,
     // derived class can reimplement it in a more optimized way.
@@ -63,7 +83,9 @@ template<typename Derived> struct EigenBase
   }
 
   /** \internal Don't use it, but do the equivalent: \code dst -= *this; \endcode */
-  template<typename Dest> inline void subTo(Dest& dst) const
+  template<typename Dest>
+  EIGEN_DEVICE_FUNC
+  inline void subTo(Dest& dst) const
   {
     // This is the default implementation,
     // derived class can reimplement it in a more optimized way.
@@ -73,7 +95,8 @@ template<typename Derived> struct EigenBase
   }
 
   /** \internal Don't use it, but do the equivalent: \code dst.applyOnTheRight(*this); \endcode */
-  template<typename Dest> inline void applyThisOnTheRight(Dest& dst) const
+  template<typename Dest>
+  EIGEN_DEVICE_FUNC inline void applyThisOnTheRight(Dest& dst) const
   {
     // This is the default implementation,
     // derived class can reimplement it in a more optimized way.
@@ -81,7 +104,8 @@ template<typename Derived> struct EigenBase
   }
 
   /** \internal Don't use it, but do the equivalent: \code dst.applyOnTheLeft(*this); \endcode */
-  template<typename Dest> inline void applyThisOnTheLeft(Dest& dst) const
+  template<typename Dest>
+  EIGEN_DEVICE_FUNC inline void applyThisOnTheLeft(Dest& dst) const
   {
     // This is the default implementation,
     // derived class can reimplement it in a more optimized way.
@@ -106,7 +130,7 @@ template<typename Derived>
 template<typename OtherDerived>
 Derived& DenseBase<Derived>::operator=(const EigenBase<OtherDerived> &other)
 {
-  other.derived().evalTo(derived());
+  call_assignment(derived(), other.derived());
   return derived();
 }
 
@@ -114,7 +138,7 @@ template<typename Derived>
 template<typename OtherDerived>
 Derived& DenseBase<Derived>::operator+=(const EigenBase<OtherDerived> &other)
 {
-  other.derived().addTo(derived());
+  call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
   return derived();
 }
 
@@ -122,7 +146,7 @@ template<typename Derived>
 template<typename OtherDerived>
 Derived& DenseBase<Derived>::operator-=(const EigenBase<OtherDerived> &other)
 {
-  other.derived().subTo(derived());
+  call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
   return derived();
 }
 
diff --git a/Eigen/src/Core/Flagged.h b/Eigen/src/Core/Flagged.h
deleted file mode 100644
index 1f2955fc1..000000000
--- a/Eigen/src/Core/Flagged.h
+++ /dev/null
@@ -1,140 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_FLAGGED_H
-#define EIGEN_FLAGGED_H
-
-namespace Eigen { 
-
-/** \class Flagged
-  * \ingroup Core_Module
-  *
-  * \brief Expression with modified flags
-  *
-  * \param ExpressionType the type of the object of which we are modifying the flags
-  * \param Added the flags added to the expression
-  * \param Removed the flags removed from the expression (has priority over Added).
-  *
-  * This class represents an expression whose flags have been modified.
-  * It is the return type of MatrixBase::flagged()
-  * and most of the time this is the only way it is used.
-  *
-  * \sa MatrixBase::flagged()
-  */
-
-namespace internal {
-template<typename ExpressionType, unsigned int Added, unsigned int Removed>
-struct traits<Flagged<ExpressionType, Added, Removed> > : traits<ExpressionType>
-{
-  enum { Flags = (ExpressionType::Flags | Added) & ~Removed };
-};
-}
-
-template<typename ExpressionType, unsigned int Added, unsigned int Removed> class Flagged
-  : public MatrixBase<Flagged<ExpressionType, Added, Removed> >
-{
-  public:
-
-    typedef MatrixBase<Flagged> Base;
-    
-    EIGEN_DENSE_PUBLIC_INTERFACE(Flagged)
-    typedef typename internal::conditional<internal::must_nest_by_value<ExpressionType>::ret,
-        ExpressionType, const ExpressionType&>::type ExpressionTypeNested;
-    typedef typename ExpressionType::InnerIterator InnerIterator;
-
-    inline Flagged(const ExpressionType& matrix) : m_matrix(matrix) {}
-
-    inline Index rows() const { return m_matrix.rows(); }
-    inline Index cols() const { return m_matrix.cols(); }
-    inline Index outerStride() const { return m_matrix.outerStride(); }
-    inline Index innerStride() const { return m_matrix.innerStride(); }
-
-    inline CoeffReturnType coeff(Index row, Index col) const
-    {
-      return m_matrix.coeff(row, col);
-    }
-
-    inline CoeffReturnType coeff(Index index) const
-    {
-      return m_matrix.coeff(index);
-    }
-    
-    inline const Scalar& coeffRef(Index row, Index col) const
-    {
-      return m_matrix.const_cast_derived().coeffRef(row, col);
-    }
-
-    inline const Scalar& coeffRef(Index index) const
-    {
-      return m_matrix.const_cast_derived().coeffRef(index);
-    }
-
-    inline Scalar& coeffRef(Index row, Index col)
-    {
-      return m_matrix.const_cast_derived().coeffRef(row, col);
-    }
-
-    inline Scalar& coeffRef(Index index)
-    {
-      return m_matrix.const_cast_derived().coeffRef(index);
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index row, Index col) const
-    {
-      return m_matrix.template packet<LoadMode>(row, col);
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index row, Index col, const PacketScalar& x)
-    {
-      m_matrix.const_cast_derived().template writePacket<LoadMode>(row, col, x);
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index index) const
-    {
-      return m_matrix.template packet<LoadMode>(index);
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index index, const PacketScalar& x)
-    {
-      m_matrix.const_cast_derived().template writePacket<LoadMode>(index, x);
-    }
-
-    const ExpressionType& _expression() const { return m_matrix; }
-
-    template<typename OtherDerived>
-    typename ExpressionType::PlainObject solveTriangular(const MatrixBase<OtherDerived>& other) const;
-
-    template<typename OtherDerived>
-    void solveTriangularInPlace(const MatrixBase<OtherDerived>& other) const;
-
-  protected:
-    ExpressionTypeNested m_matrix;
-};
-
-/** \returns an expression of *this with added and removed flags
-  *
-  * This is mostly for internal use.
-  *
-  * \sa class Flagged
-  */
-template<typename Derived>
-template<unsigned int Added,unsigned int Removed>
-inline const Flagged<Derived, Added, Removed>
-DenseBase<Derived>::flagged() const
-{
-  return derived();
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN_FLAGGED_H
diff --git a/Eigen/src/Core/ForceAlignedAccess.h b/Eigen/src/Core/ForceAlignedAccess.h
index 807c7a293..7b08b45e6 100644
--- a/Eigen/src/Core/ForceAlignedAccess.h
+++ b/Eigen/src/Core/ForceAlignedAccess.h
@@ -39,29 +39,29 @@ template<typename ExpressionType> class ForceAlignedAccess
     typedef typename internal::dense_xpr_base<ForceAlignedAccess>::type Base;
     EIGEN_DENSE_PUBLIC_INTERFACE(ForceAlignedAccess)
 
-    inline ForceAlignedAccess(const ExpressionType& matrix) : m_expression(matrix) {}
+    EIGEN_DEVICE_FUNC explicit inline ForceAlignedAccess(const ExpressionType& matrix) : m_expression(matrix) {}
 
-    inline Index rows() const { return m_expression.rows(); }
-    inline Index cols() const { return m_expression.cols(); }
-    inline Index outerStride() const { return m_expression.outerStride(); }
-    inline Index innerStride() const { return m_expression.innerStride(); }
+    EIGEN_DEVICE_FUNC inline Index rows() const { return m_expression.rows(); }
+    EIGEN_DEVICE_FUNC inline Index cols() const { return m_expression.cols(); }
+    EIGEN_DEVICE_FUNC inline Index outerStride() const { return m_expression.outerStride(); }
+    EIGEN_DEVICE_FUNC inline Index innerStride() const { return m_expression.innerStride(); }
 
-    inline const CoeffReturnType coeff(Index row, Index col) const
+    EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index row, Index col) const
     {
       return m_expression.coeff(row, col);
     }
 
-    inline Scalar& coeffRef(Index row, Index col)
+    EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index row, Index col)
     {
       return m_expression.const_cast_derived().coeffRef(row, col);
     }
 
-    inline const CoeffReturnType coeff(Index index) const
+    EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index index) const
     {
       return m_expression.coeff(index);
     }
 
-    inline Scalar& coeffRef(Index index)
+    EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index index)
     {
       return m_expression.const_cast_derived().coeffRef(index);
     }
@@ -90,7 +90,7 @@ template<typename ExpressionType> class ForceAlignedAccess
       m_expression.const_cast_derived().template writePacket<Aligned>(index, x);
     }
 
-    operator const ExpressionType&() const { return m_expression; }
+    EIGEN_DEVICE_FUNC operator const ExpressionType&() const { return m_expression; }
 
   protected:
     const ExpressionType& m_expression;
@@ -127,7 +127,7 @@ template<bool Enable>
 inline typename internal::add_const_on_value_type<typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type>::type
 MatrixBase<Derived>::forceAlignedAccessIf() const
 {
-  return derived();
+  return derived();  // FIXME This should not work but apparently is never used
 }
 
 /** \returns an expression of *this with forced aligned access if \a Enable is true.
@@ -138,7 +138,7 @@ template<bool Enable>
 inline typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type
 MatrixBase<Derived>::forceAlignedAccessIf()
 {
-  return derived();
+  return derived();  // FIXME This should not work but apparently is never used
 }
 
 } // end namespace Eigen
diff --git a/Eigen/src/Core/Functors.h b/Eigen/src/Core/Functors.h
deleted file mode 100644
index 5f14c6587..000000000
--- a/Eigen/src/Core/Functors.h
+++ /dev/null
@@ -1,1026 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_FUNCTORS_H
-#define EIGEN_FUNCTORS_H
-
-namespace Eigen {
-
-namespace internal {
-
-// associative functors:
-
-/** \internal
-  * \brief Template functor to compute the sum of two scalars
-  *
-  * \sa class CwiseBinaryOp, MatrixBase::operator+, class VectorwiseOp, MatrixBase::sum()
-  */
-template<typename Scalar> struct scalar_sum_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_sum_op)
-  EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return a + b; }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
-  { return internal::padd(a,b); }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const
-  { return internal::predux(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_sum_op<Scalar> > {
-  enum {
-    Cost = NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasAdd
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the product of two scalars
-  *
-  * \sa class CwiseBinaryOp, Cwise::operator*(), class VectorwiseOp, MatrixBase::redux()
-  */
-template<typename LhsScalar,typename RhsScalar> struct scalar_product_op {
-  enum {
-    // TODO vectorize mixed product
-    Vectorizable = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasMul && packet_traits<RhsScalar>::HasMul
-  };
-  typedef typename scalar_product_traits<LhsScalar,RhsScalar>::ReturnType result_type;
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op)
-  EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a * b; }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
-  { return internal::pmul(a,b); }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const
-  { return internal::predux_mul(a); }
-};
-template<typename LhsScalar,typename RhsScalar>
-struct functor_traits<scalar_product_op<LhsScalar,RhsScalar> > {
-  enum {
-    Cost = (NumTraits<LhsScalar>::MulCost + NumTraits<RhsScalar>::MulCost)/2, // rough estimate!
-    PacketAccess = scalar_product_op<LhsScalar,RhsScalar>::Vectorizable
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the conjugate product of two scalars
-  *
-  * This is a short cut for conj(x) * y which is needed for optimization purpose; in Eigen2 support mode, this becomes x * conj(y)
-  */
-template<typename LhsScalar,typename RhsScalar> struct scalar_conj_product_op {
-
-  enum {
-    Conj = NumTraits<LhsScalar>::IsComplex
-  };
-  
-  typedef typename scalar_product_traits<LhsScalar,RhsScalar>::ReturnType result_type;
-  
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_conj_product_op)
-  EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const
-  { return conj_helper<LhsScalar,RhsScalar,Conj,false>().pmul(a,b); }
-  
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
-  { return conj_helper<Packet,Packet,Conj,false>().pmul(a,b); }
-};
-template<typename LhsScalar,typename RhsScalar>
-struct functor_traits<scalar_conj_product_op<LhsScalar,RhsScalar> > {
-  enum {
-    Cost = NumTraits<LhsScalar>::MulCost,
-    PacketAccess = internal::is_same<LhsScalar, RhsScalar>::value && packet_traits<LhsScalar>::HasMul
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the min of two scalars
-  *
-  * \sa class CwiseBinaryOp, MatrixBase::cwiseMin, class VectorwiseOp, MatrixBase::minCoeff()
-  */
-template<typename Scalar> struct scalar_min_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_min_op)
-  EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { using std::min; return (min)(a, b); }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
-  { return internal::pmin(a,b); }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const
-  { return internal::predux_min(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_min_op<Scalar> > {
-  enum {
-    Cost = NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasMin
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the max of two scalars
-  *
-  * \sa class CwiseBinaryOp, MatrixBase::cwiseMax, class VectorwiseOp, MatrixBase::maxCoeff()
-  */
-template<typename Scalar> struct scalar_max_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_max_op)
-  EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { using std::max; return (max)(a, b); }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
-  { return internal::pmax(a,b); }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const
-  { return internal::predux_max(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_max_op<Scalar> > {
-  enum {
-    Cost = NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasMax
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the hypot of two scalars
-  *
-  * \sa MatrixBase::stableNorm(), class Redux
-  */
-template<typename Scalar> struct scalar_hypot_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_hypot_op)
-//   typedef typename NumTraits<Scalar>::Real result_type;
-  EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& _x, const Scalar& _y) const
-  {
-    using std::max;
-    using std::min;
-    using std::sqrt;
-    Scalar p = (max)(_x, _y);
-    Scalar q = (min)(_x, _y);
-    Scalar qp = q/p;
-    return p * sqrt(Scalar(1) + qp*qp);
-  }
-};
-template<typename Scalar>
-struct functor_traits<scalar_hypot_op<Scalar> > {
-  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess=0 };
-};
-
-/** \internal
-  * \brief Template functor to compute the pow of two scalars
-  */
-template<typename Scalar, typename OtherScalar> struct scalar_binary_pow_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_binary_pow_op)
-  inline Scalar operator() (const Scalar& a, const OtherScalar& b) const { return numext::pow(a, b); }
-};
-template<typename Scalar, typename OtherScalar>
-struct functor_traits<scalar_binary_pow_op<Scalar,OtherScalar> > {
-  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
-};
-
-// other binary functors:
-
-/** \internal
-  * \brief Template functor to compute the difference of two scalars
-  *
-  * \sa class CwiseBinaryOp, MatrixBase::operator-
-  */
-template<typename Scalar> struct scalar_difference_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_difference_op)
-  EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return a - b; }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
-  { return internal::psub(a,b); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_difference_op<Scalar> > {
-  enum {
-    Cost = NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasSub
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the quotient of two scalars
-  *
-  * \sa class CwiseBinaryOp, Cwise::operator/()
-  */
-template<typename LhsScalar,typename RhsScalar> struct scalar_quotient_op {
-  enum {
-    // TODO vectorize mixed product
-    Vectorizable = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasDiv && packet_traits<RhsScalar>::HasDiv
-  };
-  typedef typename scalar_product_traits<LhsScalar,RhsScalar>::ReturnType result_type;
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_quotient_op)
-  EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a / b; }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
-  { return internal::pdiv(a,b); }
-};
-template<typename LhsScalar,typename RhsScalar>
-struct functor_traits<scalar_quotient_op<LhsScalar,RhsScalar> > {
-  enum {
-    Cost = (NumTraits<LhsScalar>::MulCost + NumTraits<RhsScalar>::MulCost), // rough estimate!
-    PacketAccess = scalar_quotient_op<LhsScalar,RhsScalar>::Vectorizable
-  };
-};
-
-
-
-/** \internal
-  * \brief Template functor to compute the and of two booleans
-  *
-  * \sa class CwiseBinaryOp, ArrayBase::operator&&
-  */
-struct scalar_boolean_and_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_and_op)
-  EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a && b; }
-};
-template<> struct functor_traits<scalar_boolean_and_op> {
-  enum {
-    Cost = NumTraits<bool>::AddCost,
-    PacketAccess = false
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the or of two booleans
-  *
-  * \sa class CwiseBinaryOp, ArrayBase::operator||
-  */
-struct scalar_boolean_or_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_or_op)
-  EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a || b; }
-};
-template<> struct functor_traits<scalar_boolean_or_op> {
-  enum {
-    Cost = NumTraits<bool>::AddCost,
-    PacketAccess = false
-  };
-};
-
-/** \internal
-  * \brief Template functors for comparison of two scalars
-  * \todo Implement packet-comparisons
-  */
-template<typename Scalar, ComparisonName cmp> struct scalar_cmp_op;
-
-template<typename Scalar, ComparisonName cmp>
-struct functor_traits<scalar_cmp_op<Scalar, cmp> > {
-  enum {
-    Cost = NumTraits<Scalar>::AddCost,
-    PacketAccess = false
-  };
-};
-
-template<ComparisonName Cmp, typename Scalar>
-struct result_of<scalar_cmp_op<Scalar, Cmp>(Scalar,Scalar)> {
-  typedef bool type;
-};
-
-
-template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_EQ> {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
-  EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return a==b;}
-};
-template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_LT> {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
-  EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return a<b;}
-};
-template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_LE> {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
-  EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return a<=b;}
-};
-template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_UNORD> {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
-  EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return !(a<=b || b<=a);}
-};
-template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_NEQ> {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
-  EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return a!=b;}
-};
-
-// unary functors:
-
-/** \internal
-  * \brief Template functor to compute the opposite of a scalar
-  *
-  * \sa class CwiseUnaryOp, MatrixBase::operator-
-  */
-template<typename Scalar> struct scalar_opposite_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_opposite_op)
-  EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return -a; }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
-  { return internal::pnegate(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_opposite_op<Scalar> >
-{ enum {
-    Cost = NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasNegate };
-};
-
-/** \internal
-  * \brief Template functor to compute the absolute value of a scalar
-  *
-  * \sa class CwiseUnaryOp, Cwise::abs
-  */
-template<typename Scalar> struct scalar_abs_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_abs_op)
-  typedef typename NumTraits<Scalar>::Real result_type;
-  EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { using std::abs; return abs(a); }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
-  { return internal::pabs(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_abs_op<Scalar> >
-{
-  enum {
-    Cost = NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasAbs
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the squared absolute value of a scalar
-  *
-  * \sa class CwiseUnaryOp, Cwise::abs2
-  */
-template<typename Scalar> struct scalar_abs2_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_abs2_op)
-  typedef typename NumTraits<Scalar>::Real result_type;
-  EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { return numext::abs2(a); }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
-  { return internal::pmul(a,a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_abs2_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasAbs2 }; };
-
-/** \internal
-  * \brief Template functor to compute the conjugate of a complex value
-  *
-  * \sa class CwiseUnaryOp, MatrixBase::conjugate()
-  */
-template<typename Scalar> struct scalar_conjugate_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_conjugate_op)
-  EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { using numext::conj; return conj(a); }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return internal::pconj(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_conjugate_op<Scalar> >
-{
-  enum {
-    Cost = NumTraits<Scalar>::IsComplex ? NumTraits<Scalar>::AddCost : 0,
-    PacketAccess = packet_traits<Scalar>::HasConj
-  };
-};
-
-/** \internal
-  * \brief Template functor to cast a scalar to another type
-  *
-  * \sa class CwiseUnaryOp, MatrixBase::cast()
-  */
-template<typename Scalar, typename NewType>
-struct scalar_cast_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
-  typedef NewType result_type;
-  EIGEN_STRONG_INLINE const NewType operator() (const Scalar& a) const { return cast<Scalar, NewType>(a); }
-};
-template<typename Scalar, typename NewType>
-struct functor_traits<scalar_cast_op<Scalar,NewType> >
-{ enum { Cost = is_same<Scalar, NewType>::value ? 0 : NumTraits<NewType>::AddCost, PacketAccess = false }; };
-
-/** \internal
-  * \brief Template functor to extract the real part of a complex
-  *
-  * \sa class CwiseUnaryOp, MatrixBase::real()
-  */
-template<typename Scalar>
-struct scalar_real_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_real_op)
-  typedef typename NumTraits<Scalar>::Real result_type;
-  EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return numext::real(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_real_op<Scalar> >
-{ enum { Cost = 0, PacketAccess = false }; };
-
-/** \internal
-  * \brief Template functor to extract the imaginary part of a complex
-  *
-  * \sa class CwiseUnaryOp, MatrixBase::imag()
-  */
-template<typename Scalar>
-struct scalar_imag_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_imag_op)
-  typedef typename NumTraits<Scalar>::Real result_type;
-  EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return numext::imag(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_imag_op<Scalar> >
-{ enum { Cost = 0, PacketAccess = false }; };
-
-/** \internal
-  * \brief Template functor to extract the real part of a complex as a reference
-  *
-  * \sa class CwiseUnaryOp, MatrixBase::real()
-  */
-template<typename Scalar>
-struct scalar_real_ref_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_real_ref_op)
-  typedef typename NumTraits<Scalar>::Real result_type;
-  EIGEN_STRONG_INLINE result_type& operator() (const Scalar& a) const { return numext::real_ref(*const_cast<Scalar*>(&a)); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_real_ref_op<Scalar> >
-{ enum { Cost = 0, PacketAccess = false }; };
-
-/** \internal
-  * \brief Template functor to extract the imaginary part of a complex as a reference
-  *
-  * \sa class CwiseUnaryOp, MatrixBase::imag()
-  */
-template<typename Scalar>
-struct scalar_imag_ref_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_imag_ref_op)
-  typedef typename NumTraits<Scalar>::Real result_type;
-  EIGEN_STRONG_INLINE result_type& operator() (const Scalar& a) const { return numext::imag_ref(*const_cast<Scalar*>(&a)); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_imag_ref_op<Scalar> >
-{ enum { Cost = 0, PacketAccess = false }; };
-
-/** \internal
-  *
-  * \brief Template functor to compute the exponential of a scalar
-  *
-  * \sa class CwiseUnaryOp, Cwise::exp()
-  */
-template<typename Scalar> struct scalar_exp_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_exp_op)
-  inline const Scalar operator() (const Scalar& a) const { using std::exp; return exp(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
-  inline Packet packetOp(const Packet& a) const { return internal::pexp(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_exp_op<Scalar> >
-{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasExp }; };
-
-/** \internal
-  *
-  * \brief Template functor to compute the logarithm of a scalar
-  *
-  * \sa class CwiseUnaryOp, Cwise::log()
-  */
-template<typename Scalar> struct scalar_log_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_log_op)
-  inline const Scalar operator() (const Scalar& a) const { using std::log; return log(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
-  inline Packet packetOp(const Packet& a) const { return internal::plog(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_log_op<Scalar> >
-{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasLog }; };
-
-/** \internal
-  * \brief Template functor to multiply a scalar by a fixed other one
-  *
-  * \sa class CwiseUnaryOp, MatrixBase::operator*, MatrixBase::operator/
-  */
-/* NOTE why doing the pset1() in packetOp *is* an optimization ?
- * indeed it seems better to declare m_other as a Packet and do the pset1() once
- * in the constructor. However, in practice:
- *  - GCC does not like m_other as a Packet and generate a load every time it needs it
- *  - on the other hand GCC is able to moves the pset1() outside the loop :)
- *  - simpler code ;)
- * (ICC and gcc 4.4 seems to perform well in both cases, the issue is visible with y = a*x + b*y)
- */
-template<typename Scalar>
-struct scalar_multiple_op {
-  typedef typename packet_traits<Scalar>::type Packet;
-  // FIXME default copy constructors seems bugged with std::complex<>
-  EIGEN_STRONG_INLINE scalar_multiple_op(const scalar_multiple_op& other) : m_other(other.m_other) { }
-  EIGEN_STRONG_INLINE scalar_multiple_op(const Scalar& other) : m_other(other) { }
-  EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a) const { return a * m_other; }
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
-  { return internal::pmul(a, pset1<Packet>(m_other)); }
-  typename add_const_on_value_type<typename NumTraits<Scalar>::Nested>::type m_other;
-};
-template<typename Scalar>
-struct functor_traits<scalar_multiple_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasMul }; };
-
-template<typename Scalar1, typename Scalar2>
-struct scalar_multiple2_op {
-  typedef typename scalar_product_traits<Scalar1,Scalar2>::ReturnType result_type;
-  EIGEN_STRONG_INLINE scalar_multiple2_op(const scalar_multiple2_op& other) : m_other(other.m_other) { }
-  EIGEN_STRONG_INLINE scalar_multiple2_op(const Scalar2& other) : m_other(other) { }
-  EIGEN_STRONG_INLINE result_type operator() (const Scalar1& a) const { return a * m_other; }
-  typename add_const_on_value_type<typename NumTraits<Scalar2>::Nested>::type m_other;
-};
-template<typename Scalar1,typename Scalar2>
-struct functor_traits<scalar_multiple2_op<Scalar1,Scalar2> >
-{ enum { Cost = NumTraits<Scalar1>::MulCost, PacketAccess = false }; };
-
-/** \internal
-  * \brief Template functor to divide a scalar by a fixed other one
-  *
-  * This functor is used to implement the quotient of a matrix by
-  * a scalar where the scalar type is not necessarily a floating point type.
-  *
-  * \sa class CwiseUnaryOp, MatrixBase::operator/
-  */
-template<typename Scalar>
-struct scalar_quotient1_op {
-  typedef typename packet_traits<Scalar>::type Packet;
-  // FIXME default copy constructors seems bugged with std::complex<>
-  EIGEN_STRONG_INLINE scalar_quotient1_op(const scalar_quotient1_op& other) : m_other(other.m_other) { }
-  EIGEN_STRONG_INLINE scalar_quotient1_op(const Scalar& other) : m_other(other) {}
-  EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a) const { return a / m_other; }
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
-  { return internal::pdiv(a, pset1<Packet>(m_other)); }
-  typename add_const_on_value_type<typename NumTraits<Scalar>::Nested>::type m_other;
-};
-template<typename Scalar>
-struct functor_traits<scalar_quotient1_op<Scalar> >
-{ enum { Cost = 2 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasDiv }; };
-
-// nullary functors
-
-template<typename Scalar>
-struct scalar_constant_op {
-  typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_STRONG_INLINE scalar_constant_op(const scalar_constant_op& other) : m_other(other.m_other) { }
-  EIGEN_STRONG_INLINE scalar_constant_op(const Scalar& other) : m_other(other) { }
-  template<typename Index>
-  EIGEN_STRONG_INLINE const Scalar operator() (Index, Index = 0) const { return m_other; }
-  template<typename Index>
-  EIGEN_STRONG_INLINE const Packet packetOp(Index, Index = 0) const { return internal::pset1<Packet>(m_other); }
-  const Scalar m_other;
-};
-template<typename Scalar>
-struct functor_traits<scalar_constant_op<Scalar> >
-// FIXME replace this packet test by a safe one
-{ enum { Cost = 1, PacketAccess = packet_traits<Scalar>::Vectorizable, IsRepeatable = true }; };
-
-template<typename Scalar> struct scalar_identity_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_identity_op)
-  template<typename Index>
-  EIGEN_STRONG_INLINE const Scalar operator() (Index row, Index col) const { return row==col ? Scalar(1) : Scalar(0); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_identity_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = false, IsRepeatable = true }; };
-
-template <typename Scalar, bool RandomAccess> struct linspaced_op_impl;
-
-// linear access for packet ops:
-// 1) initialization
-//   base = [low, ..., low] + ([step, ..., step] * [-size, ..., 0])
-// 2) each step (where size is 1 for coeff access or PacketSize for packet access)
-//   base += [size*step, ..., size*step]
-//
-// TODO: Perhaps it's better to initialize lazily (so not in the constructor but in packetOp)
-//       in order to avoid the padd() in operator() ?
-template <typename Scalar>
-struct linspaced_op_impl<Scalar,false>
-{
-  typedef typename packet_traits<Scalar>::type Packet;
-
-  linspaced_op_impl(const Scalar& low, const Scalar& step) :
-  m_low(low), m_step(step),
-  m_packetStep(pset1<Packet>(packet_traits<Scalar>::size*step)),
-  m_base(padd(pset1<Packet>(low), pmul(pset1<Packet>(step),plset<Scalar>(-packet_traits<Scalar>::size)))) {}
-
-  template<typename Index>
-  EIGEN_STRONG_INLINE const Scalar operator() (Index i) const 
-  { 
-    m_base = padd(m_base, pset1<Packet>(m_step));
-    return m_low+Scalar(i)*m_step; 
-  }
-
-  template<typename Index>
-  EIGEN_STRONG_INLINE const Packet packetOp(Index) const { return m_base = padd(m_base,m_packetStep); }
-
-  const Scalar m_low;
-  const Scalar m_step;
-  const Packet m_packetStep;
-  mutable Packet m_base;
-};
-
-// random access for packet ops:
-// 1) each step
-//   [low, ..., low] + ( [step, ..., step] * ( [i, ..., i] + [0, ..., size] ) )
-template <typename Scalar>
-struct linspaced_op_impl<Scalar,true>
-{
-  typedef typename packet_traits<Scalar>::type Packet;
-
-  linspaced_op_impl(const Scalar& low, const Scalar& step) :
-  m_low(low), m_step(step),
-  m_lowPacket(pset1<Packet>(m_low)), m_stepPacket(pset1<Packet>(m_step)), m_interPacket(plset<Scalar>(0)) {}
-
-  template<typename Index>
-  EIGEN_STRONG_INLINE const Scalar operator() (Index i) const { return m_low+i*m_step; }
-
-  template<typename Index>
-  EIGEN_STRONG_INLINE const Packet packetOp(Index i) const
-  { return internal::padd(m_lowPacket, pmul(m_stepPacket, padd(pset1<Packet>(Scalar(i)),m_interPacket))); }
-
-  const Scalar m_low;
-  const Scalar m_step;
-  const Packet m_lowPacket;
-  const Packet m_stepPacket;
-  const Packet m_interPacket;
-};
-
-// ----- Linspace functor ----------------------------------------------------------------
-
-// Forward declaration (we default to random access which does not really give
-// us a speed gain when using packet access but it allows to use the functor in
-// nested expressions).
-template <typename Scalar, bool RandomAccess = true> struct linspaced_op;
-template <typename Scalar, bool RandomAccess> struct functor_traits< linspaced_op<Scalar,RandomAccess> >
-{ enum { Cost = 1, PacketAccess = packet_traits<Scalar>::HasSetLinear, IsRepeatable = true }; };
-template <typename Scalar, bool RandomAccess> struct linspaced_op
-{
-  typedef typename packet_traits<Scalar>::type Packet;
-  linspaced_op(const Scalar& low, const Scalar& high, DenseIndex num_steps) : impl((num_steps==1 ? high : low), (num_steps==1 ? Scalar() : (high-low)/Scalar(num_steps-1))) {}
-
-  template<typename Index>
-  EIGEN_STRONG_INLINE const Scalar operator() (Index i) const { return impl(i); }
-
-  // We need this function when assigning e.g. a RowVectorXd to a MatrixXd since
-  // there row==0 and col is used for the actual iteration.
-  template<typename Index>
-  EIGEN_STRONG_INLINE const Scalar operator() (Index row, Index col) const 
-  {
-    eigen_assert(col==0 || row==0);
-    return impl(col + row);
-  }
-
-  template<typename Index>
-  EIGEN_STRONG_INLINE const Packet packetOp(Index i) const { return impl.packetOp(i); }
-
-  // We need this function when assigning e.g. a RowVectorXd to a MatrixXd since
-  // there row==0 and col is used for the actual iteration.
-  template<typename Index>
-  EIGEN_STRONG_INLINE const Packet packetOp(Index row, Index col) const
-  {
-    eigen_assert(col==0 || row==0);
-    return impl.packetOp(col + row);
-  }
-
-  // This proxy object handles the actual required temporaries, the different
-  // implementations (random vs. sequential access) as well as the
-  // correct piping to size 2/4 packet operations.
-  const linspaced_op_impl<Scalar,RandomAccess> impl;
-};
-
-// all functors allow linear access, except scalar_identity_op. So we fix here a quick meta
-// to indicate whether a functor allows linear access, just always answering 'yes' except for
-// scalar_identity_op.
-// FIXME move this to functor_traits adding a functor_default
-template<typename Functor> struct functor_has_linear_access { enum { ret = 1 }; };
-template<typename Scalar> struct functor_has_linear_access<scalar_identity_op<Scalar> > { enum { ret = 0 }; };
-
-// In Eigen, any binary op (Product, CwiseBinaryOp) require the Lhs and Rhs to have the same scalar type, except for multiplication
-// where the mixing of different types is handled by scalar_product_traits
-// In particular, real * complex<real> is allowed.
-// FIXME move this to functor_traits adding a functor_default
-template<typename Functor> struct functor_is_product_like { enum { ret = 0 }; };
-template<typename LhsScalar,typename RhsScalar> struct functor_is_product_like<scalar_product_op<LhsScalar,RhsScalar> > { enum { ret = 1 }; };
-template<typename LhsScalar,typename RhsScalar> struct functor_is_product_like<scalar_conj_product_op<LhsScalar,RhsScalar> > { enum { ret = 1 }; };
-template<typename LhsScalar,typename RhsScalar> struct functor_is_product_like<scalar_quotient_op<LhsScalar,RhsScalar> > { enum { ret = 1 }; };
-
-
-/** \internal
-  * \brief Template functor to add a scalar to a fixed other one
-  * \sa class CwiseUnaryOp, Array::operator+
-  */
-/* If you wonder why doing the pset1() in packetOp() is an optimization check scalar_multiple_op */
-template<typename Scalar>
-struct scalar_add_op {
-  typedef typename packet_traits<Scalar>::type Packet;
-  // FIXME default copy constructors seems bugged with std::complex<>
-  inline scalar_add_op(const scalar_add_op& other) : m_other(other.m_other) { }
-  inline scalar_add_op(const Scalar& other) : m_other(other) { }
-  inline Scalar operator() (const Scalar& a) const { return a + m_other; }
-  inline const Packet packetOp(const Packet& a) const
-  { return internal::padd(a, pset1<Packet>(m_other)); }
-  const Scalar m_other;
-};
-template<typename Scalar>
-struct functor_traits<scalar_add_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = packet_traits<Scalar>::HasAdd }; };
-
-/** \internal
-  * \brief Template functor to compute the square root of a scalar
-  * \sa class CwiseUnaryOp, Cwise::sqrt()
-  */
-template<typename Scalar> struct scalar_sqrt_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_sqrt_op)
-  inline const Scalar operator() (const Scalar& a) const { using std::sqrt; return sqrt(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
-  inline Packet packetOp(const Packet& a) const { return internal::psqrt(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_sqrt_op<Scalar> >
-{ enum {
-    Cost = 5 * NumTraits<Scalar>::MulCost,
-    PacketAccess = packet_traits<Scalar>::HasSqrt
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the cosine of a scalar
-  * \sa class CwiseUnaryOp, ArrayBase::cos()
-  */
-template<typename Scalar> struct scalar_cos_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_cos_op)
-  inline Scalar operator() (const Scalar& a) const { using std::cos; return cos(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
-  inline Packet packetOp(const Packet& a) const { return internal::pcos(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_cos_op<Scalar> >
-{
-  enum {
-    Cost = 5 * NumTraits<Scalar>::MulCost,
-    PacketAccess = packet_traits<Scalar>::HasCos
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the sine of a scalar
-  * \sa class CwiseUnaryOp, ArrayBase::sin()
-  */
-template<typename Scalar> struct scalar_sin_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_sin_op)
-  inline const Scalar operator() (const Scalar& a) const { using std::sin; return sin(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
-  inline Packet packetOp(const Packet& a) const { return internal::psin(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_sin_op<Scalar> >
-{
-  enum {
-    Cost = 5 * NumTraits<Scalar>::MulCost,
-    PacketAccess = packet_traits<Scalar>::HasSin
-  };
-};
-
-
-/** \internal
-  * \brief Template functor to compute the tan of a scalar
-  * \sa class CwiseUnaryOp, ArrayBase::tan()
-  */
-template<typename Scalar> struct scalar_tan_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_tan_op)
-  inline const Scalar operator() (const Scalar& a) const { using std::tan; return tan(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
-  inline Packet packetOp(const Packet& a) const { return internal::ptan(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_tan_op<Scalar> >
-{
-  enum {
-    Cost = 5 * NumTraits<Scalar>::MulCost,
-    PacketAccess = packet_traits<Scalar>::HasTan
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the arc cosine of a scalar
-  * \sa class CwiseUnaryOp, ArrayBase::acos()
-  */
-template<typename Scalar> struct scalar_acos_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_acos_op)
-  inline const Scalar operator() (const Scalar& a) const { using std::acos; return acos(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
-  inline Packet packetOp(const Packet& a) const { return internal::pacos(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_acos_op<Scalar> >
-{
-  enum {
-    Cost = 5 * NumTraits<Scalar>::MulCost,
-    PacketAccess = packet_traits<Scalar>::HasACos
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the arc sine of a scalar
-  * \sa class CwiseUnaryOp, ArrayBase::asin()
-  */
-template<typename Scalar> struct scalar_asin_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_asin_op)
-  inline const Scalar operator() (const Scalar& a) const { using std::asin; return asin(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
-  inline Packet packetOp(const Packet& a) const { return internal::pasin(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_asin_op<Scalar> >
-{
-  enum {
-    Cost = 5 * NumTraits<Scalar>::MulCost,
-    PacketAccess = packet_traits<Scalar>::HasASin
-  };
-};
-
-/** \internal
-  * \brief Template functor to raise a scalar to a power
-  * \sa class CwiseUnaryOp, Cwise::pow
-  */
-template<typename Scalar>
-struct scalar_pow_op {
-  // FIXME default copy constructors seems bugged with std::complex<>
-  inline scalar_pow_op(const scalar_pow_op& other) : m_exponent(other.m_exponent) { }
-  inline scalar_pow_op(const Scalar& exponent) : m_exponent(exponent) {}
-  inline Scalar operator() (const Scalar& a) const { return numext::pow(a, m_exponent); }
-  const Scalar m_exponent;
-};
-template<typename Scalar>
-struct functor_traits<scalar_pow_op<Scalar> >
-{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false }; };
-
-/** \internal
-  * \brief Template functor to compute the quotient between a scalar and array entries.
-  * \sa class CwiseUnaryOp, Cwise::inverse()
-  */
-template<typename Scalar>
-struct scalar_inverse_mult_op {
-  scalar_inverse_mult_op(const Scalar& other) : m_other(other) {}
-  inline Scalar operator() (const Scalar& a) const { return m_other / a; }
-  template<typename Packet>
-  inline const Packet packetOp(const Packet& a) const
-  { return internal::pdiv(pset1<Packet>(m_other),a); }
-  Scalar m_other;
-};
-
-/** \internal
-  * \brief Template functor to compute the inverse of a scalar
-  * \sa class CwiseUnaryOp, Cwise::inverse()
-  */
-template<typename Scalar>
-struct scalar_inverse_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_inverse_op)
-  inline Scalar operator() (const Scalar& a) const { return Scalar(1)/a; }
-  template<typename Packet>
-  inline const Packet packetOp(const Packet& a) const
-  { return internal::pdiv(pset1<Packet>(Scalar(1)),a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_inverse_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasDiv }; };
-
-/** \internal
-  * \brief Template functor to compute the square of a scalar
-  * \sa class CwiseUnaryOp, Cwise::square()
-  */
-template<typename Scalar>
-struct scalar_square_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_square_op)
-  inline Scalar operator() (const Scalar& a) const { return a*a; }
-  template<typename Packet>
-  inline const Packet packetOp(const Packet& a) const
-  { return internal::pmul(a,a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_square_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasMul }; };
-
-/** \internal
-  * \brief Template functor to compute the cube of a scalar
-  * \sa class CwiseUnaryOp, Cwise::cube()
-  */
-template<typename Scalar>
-struct scalar_cube_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_cube_op)
-  inline Scalar operator() (const Scalar& a) const { return a*a*a; }
-  template<typename Packet>
-  inline const Packet packetOp(const Packet& a) const
-  { return internal::pmul(a,pmul(a,a)); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_cube_op<Scalar> >
-{ enum { Cost = 2*NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasMul }; };
-
-// default functor traits for STL functors:
-
-template<typename T>
-struct functor_traits<std::multiplies<T> >
-{ enum { Cost = NumTraits<T>::MulCost, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::divides<T> >
-{ enum { Cost = NumTraits<T>::MulCost, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::plus<T> >
-{ enum { Cost = NumTraits<T>::AddCost, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::minus<T> >
-{ enum { Cost = NumTraits<T>::AddCost, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::negate<T> >
-{ enum { Cost = NumTraits<T>::AddCost, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::logical_or<T> >
-{ enum { Cost = 1, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::logical_and<T> >
-{ enum { Cost = 1, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::logical_not<T> >
-{ enum { Cost = 1, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::greater<T> >
-{ enum { Cost = 1, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::less<T> >
-{ enum { Cost = 1, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::greater_equal<T> >
-{ enum { Cost = 1, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::less_equal<T> >
-{ enum { Cost = 1, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::equal_to<T> >
-{ enum { Cost = 1, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::not_equal_to<T> >
-{ enum { Cost = 1, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::binder2nd<T> >
-{ enum { Cost = functor_traits<T>::Cost, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::binder1st<T> >
-{ enum { Cost = functor_traits<T>::Cost, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::unary_negate<T> >
-{ enum { Cost = 1 + functor_traits<T>::Cost, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::binary_negate<T> >
-{ enum { Cost = 1 + functor_traits<T>::Cost, PacketAccess = false }; };
-
-#ifdef EIGEN_STDEXT_SUPPORT
-
-template<typename T0,typename T1>
-struct functor_traits<std::project1st<T0,T1> >
-{ enum { Cost = 0, PacketAccess = false }; };
-
-template<typename T0,typename T1>
-struct functor_traits<std::project2nd<T0,T1> >
-{ enum { Cost = 0, PacketAccess = false }; };
-
-template<typename T0,typename T1>
-struct functor_traits<std::select2nd<std::pair<T0,T1> > >
-{ enum { Cost = 0, PacketAccess = false }; };
-
-template<typename T0,typename T1>
-struct functor_traits<std::select1st<std::pair<T0,T1> > >
-{ enum { Cost = 0, PacketAccess = false }; };
-
-template<typename T0,typename T1>
-struct functor_traits<std::unary_compose<T0,T1> >
-{ enum { Cost = functor_traits<T0>::Cost + functor_traits<T1>::Cost, PacketAccess = false }; };
-
-template<typename T0,typename T1,typename T2>
-struct functor_traits<std::binary_compose<T0,T1,T2> >
-{ enum { Cost = functor_traits<T0>::Cost + functor_traits<T1>::Cost + functor_traits<T2>::Cost, PacketAccess = false }; };
-
-#endif // EIGEN_STDEXT_SUPPORT
-
-// allow to add new functors and specializations of functor_traits from outside Eigen.
-// this macro is really needed because functor_traits must be specialized after it is declared but before it is used...
-#ifdef EIGEN_FUNCTORS_PLUGIN
-#include EIGEN_FUNCTORS_PLUGIN
-#endif
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_FUNCTORS_H
diff --git a/Eigen/src/Core/Fuzzy.h b/Eigen/src/Core/Fuzzy.h
index fe63bd298..3e403a09d 100644
--- a/Eigen/src/Core/Fuzzy.h
+++ b/Eigen/src/Core/Fuzzy.h
@@ -19,18 +19,19 @@ namespace internal
 template<typename Derived, typename OtherDerived, bool is_integer = NumTraits<typename Derived::Scalar>::IsInteger>
 struct isApprox_selector
 {
+  EIGEN_DEVICE_FUNC
   static bool run(const Derived& x, const OtherDerived& y, const typename Derived::RealScalar& prec)
   {
-    using std::min;
-    typename internal::nested<Derived,2>::type nested(x);
-    typename internal::nested<OtherDerived,2>::type otherNested(y);
-    return (nested - otherNested).cwiseAbs2().sum() <= prec * prec * (min)(nested.cwiseAbs2().sum(), otherNested.cwiseAbs2().sum());
+    typename internal::nested_eval<Derived,2>::type nested(x);
+    typename internal::nested_eval<OtherDerived,2>::type otherNested(y);
+    return (nested - otherNested).cwiseAbs2().sum() <= prec * prec * numext::mini(nested.cwiseAbs2().sum(), otherNested.cwiseAbs2().sum());
   }
 };
 
 template<typename Derived, typename OtherDerived>
 struct isApprox_selector<Derived, OtherDerived, true>
 {
+  EIGEN_DEVICE_FUNC
   static bool run(const Derived& x, const OtherDerived& y, const typename Derived::RealScalar&)
   {
     return x.matrix() == y.matrix();
@@ -40,6 +41,7 @@ struct isApprox_selector<Derived, OtherDerived, true>
 template<typename Derived, typename OtherDerived, bool is_integer = NumTraits<typename Derived::Scalar>::IsInteger>
 struct isMuchSmallerThan_object_selector
 {
+  EIGEN_DEVICE_FUNC
   static bool run(const Derived& x, const OtherDerived& y, const typename Derived::RealScalar& prec)
   {
     return x.cwiseAbs2().sum() <= numext::abs2(prec) * y.cwiseAbs2().sum();
@@ -49,6 +51,7 @@ struct isMuchSmallerThan_object_selector
 template<typename Derived, typename OtherDerived>
 struct isMuchSmallerThan_object_selector<Derived, OtherDerived, true>
 {
+  EIGEN_DEVICE_FUNC
   static bool run(const Derived& x, const OtherDerived&, const typename Derived::RealScalar&)
   {
     return x.matrix() == Derived::Zero(x.rows(), x.cols()).matrix();
@@ -58,6 +61,7 @@ struct isMuchSmallerThan_object_selector<Derived, OtherDerived, true>
 template<typename Derived, bool is_integer = NumTraits<typename Derived::Scalar>::IsInteger>
 struct isMuchSmallerThan_scalar_selector
 {
+  EIGEN_DEVICE_FUNC
   static bool run(const Derived& x, const typename Derived::RealScalar& y, const typename Derived::RealScalar& prec)
   {
     return x.cwiseAbs2().sum() <= numext::abs2(prec * y);
@@ -67,6 +71,7 @@ struct isMuchSmallerThan_scalar_selector
 template<typename Derived>
 struct isMuchSmallerThan_scalar_selector<Derived, true>
 {
+  EIGEN_DEVICE_FUNC
   static bool run(const Derived& x, const typename Derived::RealScalar&, const typename Derived::RealScalar&)
   {
     return x.matrix() == Derived::Zero(x.rows(), x.cols()).matrix();
diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h
index 9e805a80f..0f16cd8e3 100644
--- a/Eigen/src/Core/GeneralProduct.h
+++ b/Eigen/src/Core/GeneralProduct.h
@@ -11,29 +11,7 @@
 #ifndef EIGEN_GENERAL_PRODUCT_H
 #define EIGEN_GENERAL_PRODUCT_H
 
-namespace Eigen { 
-
-/** \class GeneralProduct
-  * \ingroup Core_Module
-  *
-  * \brief Expression of the product of two general matrices or vectors
-  *
-  * \param LhsNested the type used to store the left-hand side
-  * \param RhsNested the type used to store the right-hand side
-  * \param ProductMode the type of the product
-  *
-  * This class represents an expression of the product of two general matrices.
-  * We call a general matrix, a dense matrix with full storage. For instance,
-  * This excludes triangular, selfadjoint, and sparse matrices.
-  * It is the return type of the operator* between general matrices. Its template
-  * arguments are determined automatically by ProductReturnType. Therefore,
-  * GeneralProduct should never be used direclty. To determine the result type of a
-  * function which involves a matrix product, use ProductReturnType::Type.
-  *
-  * \sa ProductReturnType, MatrixBase::operator*(const MatrixBase<OtherDerived>&)
-  */
-template<typename Lhs, typename Rhs, int ProductType = internal::product_type<Lhs,Rhs>::value>
-class GeneralProduct;
+namespace Eigen {
 
 enum {
   Large = 2,
@@ -47,7 +25,8 @@ template<int Rows, int Cols, int Depth> struct product_type_selector;
 template<int Size, int MaxSize> struct product_size_category
 {
   enum { is_large = MaxSize == Dynamic ||
-                    Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD,
+                    Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ||
+                    (Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD),
          value = is_large  ? Large
                : Size == 1 ? 1
                            : Small
@@ -59,15 +38,14 @@ template<typename Lhs, typename Rhs> struct product_type
   typedef typename remove_all<Lhs>::type _Lhs;
   typedef typename remove_all<Rhs>::type _Rhs;
   enum {
-    MaxRows  = _Lhs::MaxRowsAtCompileTime,
-    Rows  = _Lhs::RowsAtCompileTime,
-    MaxCols  = _Rhs::MaxColsAtCompileTime,
-    Cols  = _Rhs::ColsAtCompileTime,
-    MaxDepth = EIGEN_SIZE_MIN_PREFER_FIXED(_Lhs::MaxColsAtCompileTime,
-                                           _Rhs::MaxRowsAtCompileTime),
-    Depth = EIGEN_SIZE_MIN_PREFER_FIXED(_Lhs::ColsAtCompileTime,
-                                        _Rhs::RowsAtCompileTime),
-    LargeThreshold = EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+    MaxRows = traits<_Lhs>::MaxRowsAtCompileTime,
+    Rows    = traits<_Lhs>::RowsAtCompileTime,
+    MaxCols = traits<_Rhs>::MaxColsAtCompileTime,
+    Cols    = traits<_Rhs>::ColsAtCompileTime,
+    MaxDepth = EIGEN_SIZE_MIN_PREFER_FIXED(traits<_Lhs>::MaxColsAtCompileTime,
+                                           traits<_Rhs>::MaxRowsAtCompileTime),
+    Depth = EIGEN_SIZE_MIN_PREFER_FIXED(traits<_Lhs>::ColsAtCompileTime,
+                                        traits<_Rhs>::RowsAtCompileTime)
   };
 
   // the splitting into different lines of code here, introducing the _select enums and the typedef below,
@@ -82,7 +60,8 @@ private:
 
 public:
   enum {
-    value = selector::ret
+    value = selector::ret,
+    ret = selector::ret
   };
 #ifdef EIGEN_DEBUG_PRODUCT
   static void debug()
@@ -98,12 +77,13 @@ public:
 #endif
 };
 
-
 /* The following allows to select the kind of product at compile time
  * based on the three dimensions of the product.
  * This is a compile time mapping from {1,Small,Large}^3 -> {product types} */
 // FIXME I'm not sure the current mapping is the ideal one.
 template<int M, int N>  struct product_type_selector<M,N,1>              { enum { ret = OuterProduct }; };
+template<int M>         struct product_type_selector<M, 1, 1>            { enum { ret = LazyCoeffBasedProductMode }; };
+template<int N>         struct product_type_selector<1, N, 1>            { enum { ret = LazyCoeffBasedProductMode }; };
 template<int Depth>     struct product_type_selector<1,    1,    Depth>  { enum { ret = InnerProduct }; };
 template<>              struct product_type_selector<1,    1,    1>      { enum { ret = InnerProduct }; };
 template<>              struct product_type_selector<Small,1,    Small>  { enum { ret = CoeffBasedProductMode }; };
@@ -122,60 +102,12 @@ template<>              struct product_type_selector<Small,Small,Large>  { enum
 template<>              struct product_type_selector<Large,Small,Large>  { enum { ret = GemmProduct }; };
 template<>              struct product_type_selector<Small,Large,Large>  { enum { ret = GemmProduct }; };
 template<>              struct product_type_selector<Large,Large,Large>  { enum { ret = GemmProduct }; };
-template<>              struct product_type_selector<Large,Small,Small>  { enum { ret = GemmProduct }; };
-template<>              struct product_type_selector<Small,Large,Small>  { enum { ret = GemmProduct }; };
+template<>              struct product_type_selector<Large,Small,Small>  { enum { ret = CoeffBasedProductMode }; };
+template<>              struct product_type_selector<Small,Large,Small>  { enum { ret = CoeffBasedProductMode }; };
 template<>              struct product_type_selector<Large,Large,Small>  { enum { ret = GemmProduct }; };
 
 } // end namespace internal
 
-/** \class ProductReturnType
-  * \ingroup Core_Module
-  *
-  * \brief Helper class to get the correct and optimized returned type of operator*
-  *
-  * \param Lhs the type of the left-hand side
-  * \param Rhs the type of the right-hand side
-  * \param ProductMode the type of the product (determined automatically by internal::product_mode)
-  *
-  * This class defines the typename Type representing the optimized product expression
-  * between two matrix expressions. In practice, using ProductReturnType<Lhs,Rhs>::Type
-  * is the recommended way to define the result type of a function returning an expression
-  * which involve a matrix product. The class Product should never be
-  * used directly.
-  *
-  * \sa class Product, MatrixBase::operator*(const MatrixBase<OtherDerived>&)
-  */
-template<typename Lhs, typename Rhs, int ProductType>
-struct ProductReturnType
-{
-  // TODO use the nested type to reduce instanciations ????
-//   typedef typename internal::nested<Lhs,Rhs::ColsAtCompileTime>::type LhsNested;
-//   typedef typename internal::nested<Rhs,Lhs::RowsAtCompileTime>::type RhsNested;
-
-  typedef GeneralProduct<Lhs/*Nested*/, Rhs/*Nested*/, ProductType> Type;
-};
-
-template<typename Lhs, typename Rhs>
-struct ProductReturnType<Lhs,Rhs,CoeffBasedProductMode>
-{
-  typedef typename internal::nested<Lhs, Rhs::ColsAtCompileTime, typename internal::plain_matrix_type<Lhs>::type >::type LhsNested;
-  typedef typename internal::nested<Rhs, Lhs::RowsAtCompileTime, typename internal::plain_matrix_type<Rhs>::type >::type RhsNested;
-  typedef CoeffBasedProduct<LhsNested, RhsNested, EvalBeforeAssigningBit | EvalBeforeNestingBit> Type;
-};
-
-template<typename Lhs, typename Rhs>
-struct ProductReturnType<Lhs,Rhs,LazyCoeffBasedProductMode>
-{
-  typedef typename internal::nested<Lhs, Rhs::ColsAtCompileTime, typename internal::plain_matrix_type<Lhs>::type >::type LhsNested;
-  typedef typename internal::nested<Rhs, Lhs::RowsAtCompileTime, typename internal::plain_matrix_type<Rhs>::type >::type RhsNested;
-  typedef CoeffBasedProduct<LhsNested, RhsNested, NestByRefBit> Type;
-};
-
-// this is a workaround for sun CC
-template<typename Lhs, typename Rhs>
-struct LazyProductReturnType : public ProductReturnType<Lhs,Rhs,LazyCoeffBasedProductMode>
-{};
-
 /***********************************************************************
 *  Implementation of Inner Vector Vector Product
 ***********************************************************************/
@@ -187,119 +119,10 @@ struct LazyProductReturnType : public ProductReturnType<Lhs,Rhs,LazyCoeffBasedPr
 // product ends up to a row-vector times col-vector product... To tackle this use
 // case, we could have a specialization for Block<MatrixType,1,1> with: operator=(Scalar x);
 
-namespace internal {
-
-template<typename Lhs, typename Rhs>
-struct traits<GeneralProduct<Lhs,Rhs,InnerProduct> >
- : traits<Matrix<typename scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType,1,1> >
-{};
-
-}
-
-template<typename Lhs, typename Rhs>
-class GeneralProduct<Lhs, Rhs, InnerProduct>
-  : internal::no_assignment_operator,
-    public Matrix<typename internal::scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType,1,1>
-{
-    typedef Matrix<typename internal::scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType,1,1> Base;
-  public:
-    GeneralProduct(const Lhs& lhs, const Rhs& rhs)
-    {
-      EIGEN_STATIC_ASSERT((internal::is_same<typename Lhs::RealScalar, typename Rhs::RealScalar>::value),
-        YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
-
-      Base::coeffRef(0,0) = (lhs.transpose().cwiseProduct(rhs)).sum();
-    }
-
-    /** Convertion to scalar */
-    operator const typename Base::Scalar() const {
-      return Base::coeff(0,0);
-    }
-};
-
 /***********************************************************************
 *  Implementation of Outer Vector Vector Product
 ***********************************************************************/
 
-namespace internal {
-
-// Column major
-template<typename ProductType, typename Dest, typename Func>
-EIGEN_DONT_INLINE void outer_product_selector_run(const ProductType& prod, Dest& dest, const Func& func, const false_type&)
-{
-  typedef typename Dest::Index Index;
-  // FIXME make sure lhs is sequentially stored
-  // FIXME not very good if rhs is real and lhs complex while alpha is real too
-  const Index cols = dest.cols();
-  for (Index j=0; j<cols; ++j)
-    func(dest.col(j), prod.rhs().coeff(0,j) * prod.lhs());
-}
-
-// Row major
-template<typename ProductType, typename Dest, typename Func>
-EIGEN_DONT_INLINE void outer_product_selector_run(const ProductType& prod, Dest& dest, const Func& func, const true_type&) {
-  typedef typename Dest::Index Index;
-  // FIXME make sure rhs is sequentially stored
-  // FIXME not very good if lhs is real and rhs complex while alpha is real too
-  const Index rows = dest.rows();
-  for (Index i=0; i<rows; ++i)
-    func(dest.row(i), prod.lhs().coeff(i,0) * prod.rhs());
-}
-
-template<typename Lhs, typename Rhs>
-struct traits<GeneralProduct<Lhs,Rhs,OuterProduct> >
- : traits<ProductBase<GeneralProduct<Lhs,Rhs,OuterProduct>, Lhs, Rhs> >
-{};
-
-}
-
-template<typename Lhs, typename Rhs>
-class GeneralProduct<Lhs, Rhs, OuterProduct>
-  : public ProductBase<GeneralProduct<Lhs,Rhs,OuterProduct>, Lhs, Rhs>
-{
-    template<typename T> struct IsRowMajor : internal::conditional<(int(T::Flags)&RowMajorBit), internal::true_type, internal::false_type>::type {};
-    
-  public:
-    EIGEN_PRODUCT_PUBLIC_INTERFACE(GeneralProduct)
-
-    GeneralProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs)
-    {
-      EIGEN_STATIC_ASSERT((internal::is_same<typename Lhs::RealScalar, typename Rhs::RealScalar>::value),
-        YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
-    }
-    
-    struct set  { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived()  = src; } };
-    struct add  { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } };
-    struct sub  { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } };
-    struct adds {
-      Scalar m_scale;
-      adds(const Scalar& s) : m_scale(s) {}
-      template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const {
-        dst.const_cast_derived() += m_scale * src;
-      }
-    };
-    
-    template<typename Dest>
-    inline void evalTo(Dest& dest) const {
-      internal::outer_product_selector_run(*this, dest, set(), IsRowMajor<Dest>());
-    }
-    
-    template<typename Dest>
-    inline void addTo(Dest& dest) const {
-      internal::outer_product_selector_run(*this, dest, add(), IsRowMajor<Dest>());
-    }
-
-    template<typename Dest>
-    inline void subTo(Dest& dest) const {
-      internal::outer_product_selector_run(*this, dest, sub(), IsRowMajor<Dest>());
-    }
-
-    template<typename Dest> void scaleAndAddTo(Dest& dest, const Scalar& alpha) const
-    {
-      internal::outer_product_selector_run(*this, dest, adds(alpha), IsRowMajor<Dest>());
-    }
-};
-
 /***********************************************************************
 *  Implementation of General Matrix Vector Product
 ***********************************************************************/
@@ -313,60 +136,13 @@ class GeneralProduct<Lhs, Rhs, OuterProduct>
  */
 namespace internal {
 
-template<typename Lhs, typename Rhs>
-struct traits<GeneralProduct<Lhs,Rhs,GemvProduct> >
- : traits<ProductBase<GeneralProduct<Lhs,Rhs,GemvProduct>, Lhs, Rhs> >
-{};
-
 template<int Side, int StorageOrder, bool BlasCompatible>
-struct gemv_selector;
+struct gemv_dense_selector;
 
 } // end namespace internal
 
-template<typename Lhs, typename Rhs>
-class GeneralProduct<Lhs, Rhs, GemvProduct>
-  : public ProductBase<GeneralProduct<Lhs,Rhs,GemvProduct>, Lhs, Rhs>
-{
-  public:
-    EIGEN_PRODUCT_PUBLIC_INTERFACE(GeneralProduct)
-
-    typedef typename Lhs::Scalar LhsScalar;
-    typedef typename Rhs::Scalar RhsScalar;
-
-    GeneralProduct(const Lhs& a_lhs, const Rhs& a_rhs) : Base(a_lhs,a_rhs)
-    {
-//       EIGEN_STATIC_ASSERT((internal::is_same<typename Lhs::Scalar, typename Rhs::Scalar>::value),
-//         YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
-    }
-
-    enum { Side = Lhs::IsVectorAtCompileTime ? OnTheLeft : OnTheRight };
-    typedef typename internal::conditional<int(Side)==OnTheRight,_LhsNested,_RhsNested>::type MatrixType;
-
-    template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
-    {
-      eigen_assert(m_lhs.rows() == dst.rows() && m_rhs.cols() == dst.cols());
-      internal::gemv_selector<Side,(int(MatrixType::Flags)&RowMajorBit) ? RowMajor : ColMajor,
-                       bool(internal::blas_traits<MatrixType>::HasUsableDirectAccess)>::run(*this, dst, alpha);
-    }
-};
-
 namespace internal {
 
-// The vector is on the left => transposition
-template<int StorageOrder, bool BlasCompatible>
-struct gemv_selector<OnTheLeft,StorageOrder,BlasCompatible>
-{
-  template<typename ProductType, typename Dest>
-  static void run(const ProductType& prod, Dest& dest, const typename ProductType::Scalar& alpha)
-  {
-    Transpose<Dest> destT(dest);
-    enum { OtherStorageOrder = StorageOrder == RowMajor ? ColMajor : RowMajor };
-    gemv_selector<OnTheRight,OtherStorageOrder,BlasCompatible>
-      ::run(GeneralProduct<Transpose<const typename ProductType::_RhsNested>,Transpose<const typename ProductType::_LhsNested>, GemvProduct>
-        (prod.rhs().transpose(), prod.lhs().transpose()), destT, alpha);
-  }
-};
-
 template<typename Scalar,int Size,int MaxSize,bool Cond> struct gemv_static_vector_if;
 
 template<typename Scalar,int Size,int MaxSize>
@@ -384,126 +160,161 @@ struct gemv_static_vector_if<Scalar,Size,Dynamic,true>
 template<typename Scalar,int Size,int MaxSize>
 struct gemv_static_vector_if<Scalar,Size,MaxSize,true>
 {
-  #if EIGEN_ALIGN_STATICALLY
-  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize),0> m_data;
-  EIGEN_STRONG_INLINE Scalar* data() { return m_data.array; }
-  #else
-  // Some architectures cannot align on the stack,
-  // => let's manually enforce alignment by allocating more data and return the address of the first aligned element.
   enum {
     ForceAlignment  = internal::packet_traits<Scalar>::Vectorizable,
     PacketSize      = internal::packet_traits<Scalar>::size
   };
-  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize)+(ForceAlignment?PacketSize:0),0> m_data;
+  #if EIGEN_MAX_STATIC_ALIGN_BYTES!=0
+  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize),0,EIGEN_PLAIN_ENUM_MIN(AlignedMax,PacketSize)> m_data;
+  EIGEN_STRONG_INLINE Scalar* data() { return m_data.array; }
+  #else
+  // Some architectures cannot align on the stack,
+  // => let's manually enforce alignment by allocating more data and return the address of the first aligned element.
+  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize)+(ForceAlignment?EIGEN_MAX_ALIGN_BYTES:0),0> m_data;
   EIGEN_STRONG_INLINE Scalar* data() {
     return ForceAlignment
-            ? reinterpret_cast<Scalar*>((reinterpret_cast<size_t>(m_data.array) & ~(size_t(15))) + 16)
+            ? reinterpret_cast<Scalar*>((internal::UIntPtr(m_data.array) & ~(std::size_t(EIGEN_MAX_ALIGN_BYTES-1))) + EIGEN_MAX_ALIGN_BYTES)
             : m_data.array;
   }
   #endif
 };
 
-template<> struct gemv_selector<OnTheRight,ColMajor,true>
+// The vector is on the left => transposition
+template<int StorageOrder, bool BlasCompatible>
+struct gemv_dense_selector<OnTheLeft,StorageOrder,BlasCompatible>
 {
-  template<typename ProductType, typename Dest>
-  static inline void run(const ProductType& prod, Dest& dest, const typename ProductType::Scalar& alpha)
+  template<typename Lhs, typename Rhs, typename Dest>
+  static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
   {
-    typedef typename ProductType::Index Index;
-    typedef typename ProductType::LhsScalar   LhsScalar;
-    typedef typename ProductType::RhsScalar   RhsScalar;
-    typedef typename ProductType::Scalar      ResScalar;
-    typedef typename ProductType::RealScalar  RealScalar;
-    typedef typename ProductType::ActualLhsType ActualLhsType;
-    typedef typename ProductType::ActualRhsType ActualRhsType;
-    typedef typename ProductType::LhsBlasTraits LhsBlasTraits;
-    typedef typename ProductType::RhsBlasTraits RhsBlasTraits;
-    typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
-
-    ActualLhsType actualLhs = LhsBlasTraits::extract(prod.lhs());
-    ActualRhsType actualRhs = RhsBlasTraits::extract(prod.rhs());
-
-    ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs())
-                                  * RhsBlasTraits::extractScalarFactor(prod.rhs());
+    Transpose<Dest> destT(dest);
+    enum { OtherStorageOrder = StorageOrder == RowMajor ? ColMajor : RowMajor };
+    gemv_dense_selector<OnTheRight,OtherStorageOrder,BlasCompatible>
+      ::run(rhs.transpose(), lhs.transpose(), destT, alpha);
+  }
+};
+
+template<> struct gemv_dense_selector<OnTheRight,ColMajor,true>
+{
+  template<typename Lhs, typename Rhs, typename Dest>
+  static inline void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
+  {
+    typedef typename Lhs::Scalar   LhsScalar;
+    typedef typename Rhs::Scalar   RhsScalar;
+    typedef typename Dest::Scalar  ResScalar;
+    typedef typename Dest::RealScalar  RealScalar;
+    
+    typedef internal::blas_traits<Lhs> LhsBlasTraits;
+    typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+    typedef internal::blas_traits<Rhs> RhsBlasTraits;
+    typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+  
+    typedef Map<Matrix<ResScalar,Dynamic,1>, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits<ResScalar>::size)> MappedDest;
+
+    ActualLhsType actualLhs = LhsBlasTraits::extract(lhs);
+    ActualRhsType actualRhs = RhsBlasTraits::extract(rhs);
+
+    ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs)
+                                  * RhsBlasTraits::extractScalarFactor(rhs);
+
+    // make sure Dest is a compile-time vector type (bug 1166)
+    typedef typename conditional<Dest::IsVectorAtCompileTime, Dest, typename Dest::ColXpr>::type ActualDest;
 
     enum {
       // FIXME find a way to allow an inner stride on the result if packet_traits<Scalar>::size==1
       // on, the other hand it is good for the cache to pack the vector anyways...
-      EvalToDestAtCompileTime = Dest::InnerStrideAtCompileTime==1,
+      EvalToDestAtCompileTime = (ActualDest::InnerStrideAtCompileTime==1),
       ComplexByReal = (NumTraits<LhsScalar>::IsComplex) && (!NumTraits<RhsScalar>::IsComplex),
-      MightCannotUseDest = (Dest::InnerStrideAtCompileTime!=1) || ComplexByReal
+      MightCannotUseDest = (!EvalToDestAtCompileTime) || ComplexByReal
     };
 
-    gemv_static_vector_if<ResScalar,Dest::SizeAtCompileTime,Dest::MaxSizeAtCompileTime,MightCannotUseDest> static_dest;
-
-    bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0));
-    bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible;
-    
+    typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper;
+    typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper;
     RhsScalar compatibleAlpha = get_factor<ResScalar,RhsScalar>::run(actualAlpha);
 
-    ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(),
-                                                  evalToDest ? dest.data() : static_dest.data());
-    
-    if(!evalToDest)
+    if(!MightCannotUseDest)
     {
-      #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-      int size = dest.size();
-      EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-      #endif
-      if(!alphaIsCompatible)
+      // shortcut if we are sure to be able to use dest directly,
+      // this ease the compiler to generate cleaner and more optimzized code for most common cases
+      general_matrix_vector_product
+          <Index,LhsScalar,LhsMapper,ColMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsMapper,RhsBlasTraits::NeedToConjugate>::run(
+          actualLhs.rows(), actualLhs.cols(),
+          LhsMapper(actualLhs.data(), actualLhs.outerStride()),
+          RhsMapper(actualRhs.data(), actualRhs.innerStride()),
+          dest.data(), 1,
+          compatibleAlpha);
+    }
+    else
+    {
+      gemv_static_vector_if<ResScalar,ActualDest::SizeAtCompileTime,ActualDest::MaxSizeAtCompileTime,MightCannotUseDest> static_dest;
+
+      const bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0));
+      const bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible;
+
+      ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(),
+                                                    evalToDest ? dest.data() : static_dest.data());
+
+      if(!evalToDest)
       {
-        MappedDest(actualDestPtr, dest.size()).setZero();
-        compatibleAlpha = RhsScalar(1);
+        #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+        Index size = dest.size();
+        EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+        #endif
+        if(!alphaIsCompatible)
+        {
+          MappedDest(actualDestPtr, dest.size()).setZero();
+          compatibleAlpha = RhsScalar(1);
+        }
+        else
+          MappedDest(actualDestPtr, dest.size()) = dest;
       }
-      else
-        MappedDest(actualDestPtr, dest.size()) = dest;
-    }
 
-    general_matrix_vector_product
-      <Index,LhsScalar,ColMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsBlasTraits::NeedToConjugate>::run(
-        actualLhs.rows(), actualLhs.cols(),
-        actualLhs.data(), actualLhs.outerStride(),
-        actualRhs.data(), actualRhs.innerStride(),
-        actualDestPtr, 1,
-        compatibleAlpha);
+      general_matrix_vector_product
+          <Index,LhsScalar,LhsMapper,ColMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsMapper,RhsBlasTraits::NeedToConjugate>::run(
+          actualLhs.rows(), actualLhs.cols(),
+          LhsMapper(actualLhs.data(), actualLhs.outerStride()),
+          RhsMapper(actualRhs.data(), actualRhs.innerStride()),
+          actualDestPtr, 1,
+          compatibleAlpha);
 
-    if (!evalToDest)
-    {
-      if(!alphaIsCompatible)
-        dest += actualAlpha * MappedDest(actualDestPtr, dest.size());
-      else
-        dest = MappedDest(actualDestPtr, dest.size());
+      if (!evalToDest)
+      {
+        if(!alphaIsCompatible)
+          dest.matrix() += actualAlpha * MappedDest(actualDestPtr, dest.size());
+        else
+          dest = MappedDest(actualDestPtr, dest.size());
+      }
     }
   }
 };
 
-template<> struct gemv_selector<OnTheRight,RowMajor,true>
+template<> struct gemv_dense_selector<OnTheRight,RowMajor,true>
 {
-  template<typename ProductType, typename Dest>
-  static void run(const ProductType& prod, Dest& dest, const typename ProductType::Scalar& alpha)
+  template<typename Lhs, typename Rhs, typename Dest>
+  static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
   {
-    typedef typename ProductType::LhsScalar LhsScalar;
-    typedef typename ProductType::RhsScalar RhsScalar;
-    typedef typename ProductType::Scalar    ResScalar;
-    typedef typename ProductType::Index Index;
-    typedef typename ProductType::ActualLhsType ActualLhsType;
-    typedef typename ProductType::ActualRhsType ActualRhsType;
-    typedef typename ProductType::_ActualRhsType _ActualRhsType;
-    typedef typename ProductType::LhsBlasTraits LhsBlasTraits;
-    typedef typename ProductType::RhsBlasTraits RhsBlasTraits;
-
-    typename add_const<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(prod.lhs());
-    typename add_const<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(prod.rhs());
-
-    ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs())
-                                  * RhsBlasTraits::extractScalarFactor(prod.rhs());
+    typedef typename Lhs::Scalar   LhsScalar;
+    typedef typename Rhs::Scalar   RhsScalar;
+    typedef typename Dest::Scalar  ResScalar;
+    
+    typedef internal::blas_traits<Lhs> LhsBlasTraits;
+    typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+    typedef internal::blas_traits<Rhs> RhsBlasTraits;
+    typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+    typedef typename internal::remove_all<ActualRhsType>::type ActualRhsTypeCleaned;
+
+    typename add_const<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(lhs);
+    typename add_const<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(rhs);
+
+    ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs)
+                                  * RhsBlasTraits::extractScalarFactor(rhs);
 
     enum {
       // FIXME find a way to allow an inner stride on the result if packet_traits<Scalar>::size==1
       // on, the other hand it is good for the cache to pack the vector anyways...
-      DirectlyUseRhs = _ActualRhsType::InnerStrideAtCompileTime==1
+      DirectlyUseRhs = ActualRhsTypeCleaned::InnerStrideAtCompileTime==1
     };
 
-    gemv_static_vector_if<RhsScalar,_ActualRhsType::SizeAtCompileTime,_ActualRhsType::MaxSizeAtCompileTime,!DirectlyUseRhs> static_rhs;
+    gemv_static_vector_if<RhsScalar,ActualRhsTypeCleaned::SizeAtCompileTime,ActualRhsTypeCleaned::MaxSizeAtCompileTime,!DirectlyUseRhs> static_rhs;
 
     ei_declare_aligned_stack_constructed_variable(RhsScalar,actualRhsPtr,actualRhs.size(),
         DirectlyUseRhs ? const_cast<RhsScalar*>(actualRhs.data()) : static_rhs.data());
@@ -511,45 +322,48 @@ template<> struct gemv_selector<OnTheRight,RowMajor,true>
     if(!DirectlyUseRhs)
     {
       #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-      int size = actualRhs.size();
+      Index size = actualRhs.size();
       EIGEN_DENSE_STORAGE_CTOR_PLUGIN
       #endif
-      Map<typename _ActualRhsType::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs;
+      Map<typename ActualRhsTypeCleaned::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs;
     }
 
+    typedef const_blas_data_mapper<LhsScalar,Index,RowMajor> LhsMapper;
+    typedef const_blas_data_mapper<RhsScalar,Index,ColMajor> RhsMapper;
     general_matrix_vector_product
-      <Index,LhsScalar,RowMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsBlasTraits::NeedToConjugate>::run(
+        <Index,LhsScalar,LhsMapper,RowMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsMapper,RhsBlasTraits::NeedToConjugate>::run(
         actualLhs.rows(), actualLhs.cols(),
-        actualLhs.data(), actualLhs.outerStride(),
-        actualRhsPtr, 1,
-        dest.data(), dest.innerStride(),
+        LhsMapper(actualLhs.data(), actualLhs.outerStride()),
+        RhsMapper(actualRhsPtr, 1),
+        dest.data(), dest.col(0).innerStride(), //NOTE  if dest is not a vector at compile-time, then dest.innerStride() might be wrong. (bug 1166)
         actualAlpha);
   }
 };
 
-template<> struct gemv_selector<OnTheRight,ColMajor,false>
+template<> struct gemv_dense_selector<OnTheRight,ColMajor,false>
 {
-  template<typename ProductType, typename Dest>
-  static void run(const ProductType& prod, Dest& dest, const typename ProductType::Scalar& alpha)
+  template<typename Lhs, typename Rhs, typename Dest>
+  static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
   {
-    typedef typename Dest::Index Index;
-    // TODO makes sure dest is sequentially stored in memory, otherwise use a temp
-    const Index size = prod.rhs().rows();
+    EIGEN_STATIC_ASSERT((!nested_eval<Lhs,1>::Evaluate),EIGEN_INTERNAL_COMPILATION_ERROR_OR_YOU_MADE_A_PROGRAMMING_MISTAKE);
+    // TODO if rhs is large enough it might be beneficial to make sure that dest is sequentially stored in memory, otherwise use a temp
+    typename nested_eval<Rhs,1>::type actual_rhs(rhs);
+    const Index size = rhs.rows();
     for(Index k=0; k<size; ++k)
-      dest += (alpha*prod.rhs().coeff(k)) * prod.lhs().col(k);
+      dest += (alpha*actual_rhs.coeff(k)) * lhs.col(k);
   }
 };
 
-template<> struct gemv_selector<OnTheRight,RowMajor,false>
+template<> struct gemv_dense_selector<OnTheRight,RowMajor,false>
 {
-  template<typename ProductType, typename Dest>
-  static void run(const ProductType& prod, Dest& dest, const typename ProductType::Scalar& alpha)
+  template<typename Lhs, typename Rhs, typename Dest>
+  static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
   {
-    typedef typename Dest::Index Index;
-    // TODO makes sure rhs is sequentially stored in memory, otherwise use a temp
-    const Index rows = prod.rows();
+    EIGEN_STATIC_ASSERT((!nested_eval<Lhs,1>::Evaluate),EIGEN_INTERNAL_COMPILATION_ERROR_OR_YOU_MADE_A_PROGRAMMING_MISTAKE);
+    typename nested_eval<Rhs,Lhs::RowsAtCompileTime>::type actual_rhs(rhs);
+    const Index rows = dest.rows();
     for(Index i=0; i<rows; ++i)
-      dest.coeffRef(i) += alpha * (prod.lhs().row(i).cwiseProduct(prod.rhs().transpose())).sum();
+      dest.coeffRef(i) += alpha * (lhs.row(i).cwiseProduct(actual_rhs.transpose())).sum();
   }
 };
 
@@ -565,9 +379,11 @@ template<> struct gemv_selector<OnTheRight,RowMajor,false>
   *
   * \sa lazyProduct(), operator*=(const MatrixBase&), Cwise::operator*()
   */
+#ifndef __CUDACC__
+
 template<typename Derived>
 template<typename OtherDerived>
-inline const typename ProductReturnType<Derived, OtherDerived>::Type
+inline const Product<Derived, OtherDerived>
 MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
 {
   // A note regarding the function declaration: In MSVC, this function will sometimes
@@ -592,9 +408,12 @@ MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
 #ifdef EIGEN_DEBUG_PRODUCT
   internal::product_type<Derived,OtherDerived>::debug();
 #endif
-  return typename ProductReturnType<Derived,OtherDerived>::Type(derived(), other.derived());
+
+  return Product<Derived, OtherDerived>(derived(), other.derived());
 }
 
+#endif // __CUDACC__
+
 /** \returns an expression of the matrix product of \c *this and \a other without implicit evaluation.
   *
   * The returned product will behave like any other expressions: the coefficients of the product will be
@@ -608,7 +427,7 @@ MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
   */
 template<typename Derived>
 template<typename OtherDerived>
-const typename LazyProductReturnType<Derived,OtherDerived>::Type
+const Product<Derived,OtherDerived,LazyProduct>
 MatrixBase<Derived>::lazyProduct(const MatrixBase<OtherDerived> &other) const
 {
   enum {
@@ -627,7 +446,7 @@ MatrixBase<Derived>::lazyProduct(const MatrixBase<OtherDerived> &other) const
     INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION)
   EIGEN_STATIC_ASSERT(ProductIsValid || SameSizes, INVALID_MATRIX_PRODUCT)
 
-  return typename LazyProductReturnType<Derived,OtherDerived>::Type(derived(), other.derived());
+  return Product<Derived,OtherDerived,LazyProduct>(derived(), other.derived());
 }
 
 } // end namespace Eigen
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index 5f783ebee..27033a2dd 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -42,21 +42,28 @@ namespace internal {
 struct default_packet_traits
 {
   enum {
+    HasHalfPacket = 0,
+
     HasAdd    = 1,
     HasSub    = 1,
     HasMul    = 1,
     HasNegate = 1,
     HasAbs    = 1,
+    HasArg    = 0,
     HasAbs2   = 1,
     HasMin    = 1,
     HasMax    = 1,
     HasConj   = 1,
     HasSetLinear = 1,
+    HasBlend  = 0,
 
     HasDiv    = 0,
     HasSqrt   = 0,
+    HasRsqrt  = 0,
     HasExp    = 0,
     HasLog    = 0,
+    HasLog1p  = 0,
+    HasLog10  = 0,
     HasPow    = 0,
 
     HasSin    = 0,
@@ -64,17 +71,37 @@ struct default_packet_traits
     HasTan    = 0,
     HasASin   = 0,
     HasACos   = 0,
-    HasATan   = 0
+    HasATan   = 0,
+    HasSinh   = 0,
+    HasCosh   = 0,
+    HasTanh   = 0,
+    HasLGamma = 0,
+    HasDiGamma = 0,
+    HasZeta = 0,
+    HasPolygamma = 0,
+    HasErf = 0,
+    HasErfc = 0,
+    HasIGamma = 0,
+    HasIGammac = 0,
+    HasBetaInc = 0,
+
+    HasRound  = 0,
+    HasFloor  = 0,
+    HasCeil   = 0,
+
+    HasSign   = 0
   };
 };
 
 template<typename T> struct packet_traits : default_packet_traits
 {
   typedef T type;
+  typedef T half;
   enum {
     Vectorizable = 0,
     size = 1,
-    AlignedOnScalar = 0
+    AlignedOnScalar = 0,
+    HasHalfPacket = 0
   };
   enum {
     HasAdd    = 0,
@@ -90,135 +117,239 @@ template<typename T> struct packet_traits : default_packet_traits
   };
 };
 
+template<typename T> struct packet_traits<const T> : packet_traits<T> { };
+
+template <typename Src, typename Tgt> struct type_casting_traits {
+  enum {
+    VectorizedCast = 0,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+
+/** \internal \returns static_cast<TgtType>(a) (coeff-wise) */
+template <typename SrcPacket, typename TgtPacket>
+EIGEN_DEVICE_FUNC inline TgtPacket
+pcast(const SrcPacket& a) {
+  return static_cast<TgtPacket>(a);
+}
+template <typename SrcPacket, typename TgtPacket>
+EIGEN_DEVICE_FUNC inline TgtPacket
+pcast(const SrcPacket& a, const SrcPacket& /*b*/) {
+  return static_cast<TgtPacket>(a);
+}
+
+template <typename SrcPacket, typename TgtPacket>
+EIGEN_DEVICE_FUNC inline TgtPacket
+pcast(const SrcPacket& a, const SrcPacket& /*b*/, const SrcPacket& /*c*/, const SrcPacket& /*d*/) {
+  return static_cast<TgtPacket>(a);
+}
+
 /** \internal \returns a + b (coeff-wise) */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 padd(const Packet& a,
         const Packet& b) { return a+b; }
 
 /** \internal \returns a - b (coeff-wise) */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 psub(const Packet& a,
         const Packet& b) { return a-b; }
 
 /** \internal \returns -a (coeff-wise) */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pnegate(const Packet& a) { return -a; }
 
 /** \internal \returns conj(a) (coeff-wise) */
-template<typename Packet> inline Packet
+
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pconj(const Packet& a) { return numext::conj(a); }
 
 /** \internal \returns a * b (coeff-wise) */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pmul(const Packet& a,
         const Packet& b) { return a*b; }
 
 /** \internal \returns a / b (coeff-wise) */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pdiv(const Packet& a,
         const Packet& b) { return a/b; }
 
 /** \internal \returns the min of \a a and \a b  (coeff-wise) */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pmin(const Packet& a,
-        const Packet& b) { using std::min; return (min)(a, b); }
+        const Packet& b) { return numext::mini(a, b); }
 
 /** \internal \returns the max of \a a and \a b  (coeff-wise) */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pmax(const Packet& a,
-        const Packet& b) { using std::max; return (max)(a, b); }
+        const Packet& b) { return numext::maxi(a, b); }
 
 /** \internal \returns the absolute value of \a a */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pabs(const Packet& a) { using std::abs; return abs(a); }
 
+/** \internal \returns the phase angle of \a a */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+parg(const Packet& a) { using numext::arg; return arg(a); }
+
 /** \internal \returns the bitwise and of \a a and \a b */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pand(const Packet& a, const Packet& b) { return a & b; }
 
 /** \internal \returns the bitwise or of \a a and \a b */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 por(const Packet& a, const Packet& b) { return a | b; }
 
 /** \internal \returns the bitwise xor of \a a and \a b */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pxor(const Packet& a, const Packet& b) { return a ^ b; }
 
 /** \internal \returns the bitwise andnot of \a a and \a b */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pandnot(const Packet& a, const Packet& b) { return a & (!b); }
 
 /** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pload(const typename unpacket_traits<Packet>::type* from) { return *from; }
 
 /** \internal \returns a packet version of \a *from, (un-aligned load) */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 ploadu(const typename unpacket_traits<Packet>::type* from) { return *from; }
 
+/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pset1(const typename unpacket_traits<Packet>::type& a) { return a; }
+
+/** \internal \returns a packet with constant coefficients \a a[0], e.g.: (a[0],a[0],a[0],a[0]) */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pload1(const typename unpacket_traits<Packet>::type  *a) { return pset1<Packet>(*a); }
+
 /** \internal \returns a packet with elements of \a *from duplicated.
-  * For instance, for a packet of 8 elements, 4 scalar will be read from \a *from and
-  * duplicated to form: {from[0],from[0],from[1],from[1],,from[2],from[2],,from[3],from[3]}
+  * For instance, for a packet of 8 elements, 4 scalars will be read from \a *from and
+  * duplicated to form: {from[0],from[0],from[1],from[1],from[2],from[2],from[3],from[3]}
   * Currently, this function is only used for scalar * complex products.
- */
-template<typename Packet> inline Packet
+  */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 ploaddup(const typename unpacket_traits<Packet>::type* from) { return *from; }
 
-/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */
-template<typename Packet> inline Packet
-pset1(const typename unpacket_traits<Packet>::type& a) { return a; }
+/** \internal \returns a packet with elements of \a *from quadrupled.
+  * For instance, for a packet of 8 elements, 2 scalars will be read from \a *from and
+  * replicated to form: {from[0],from[0],from[0],from[0],from[1],from[1],from[1],from[1]}
+  * Currently, this function is only used in matrix products.
+  * For packet-size smaller or equal to 4, this function is equivalent to pload1 
+  */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+ploadquad(const typename unpacket_traits<Packet>::type* from)
+{ return pload1<Packet>(from); }
+
+/** \internal equivalent to
+  * \code
+  * a0 = pload1(a+0);
+  * a1 = pload1(a+1);
+  * a2 = pload1(a+2);
+  * a3 = pload1(a+3);
+  * \endcode
+  * \sa pset1, pload1, ploaddup, pbroadcast2
+  */
+template<typename Packet> EIGEN_DEVICE_FUNC
+inline void pbroadcast4(const typename unpacket_traits<Packet>::type *a,
+                        Packet& a0, Packet& a1, Packet& a2, Packet& a3)
+{
+  a0 = pload1<Packet>(a+0);
+  a1 = pload1<Packet>(a+1);
+  a2 = pload1<Packet>(a+2);
+  a3 = pload1<Packet>(a+3);
+}
+
+/** \internal equivalent to
+  * \code
+  * a0 = pload1(a+0);
+  * a1 = pload1(a+1);
+  * \endcode
+  * \sa pset1, pload1, ploaddup, pbroadcast4
+  */
+template<typename Packet> EIGEN_DEVICE_FUNC
+inline void pbroadcast2(const typename unpacket_traits<Packet>::type *a,
+                        Packet& a0, Packet& a1)
+{
+  a0 = pload1<Packet>(a+0);
+  a1 = pload1<Packet>(a+1);
+}
 
 /** \internal \brief Returns a packet with coefficients (a,a+1,...,a+packet_size-1). */
-template<typename Scalar> inline typename packet_traits<Scalar>::type
-plset(const Scalar& a) { return a; }
+template<typename Packet> inline Packet
+plset(const typename unpacket_traits<Packet>::type& a) { return a; }
 
 /** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */
-template<typename Scalar, typename Packet> inline void pstore(Scalar* to, const Packet& from)
+template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstore(Scalar* to, const Packet& from)
 { (*to) = from; }
 
 /** \internal copy the packet \a from to \a *to, (un-aligned store) */
-template<typename Scalar, typename Packet> inline void pstoreu(Scalar* to, const Packet& from)
-{ (*to) = from; }
+template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu(Scalar* to, const Packet& from)
+{  (*to) = from; }
+
+ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather(const Scalar* from, Index /*stride*/)
+ { return ploadu<Packet>(from); }
+
+ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pscatter(Scalar* to, const Packet& from, Index /*stride*/)
+ { pstore(to, from); }
 
 /** \internal tries to do cache prefetching of \a addr */
-template<typename Scalar> inline void prefetch(const Scalar* addr)
+template<typename Scalar> EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr)
 {
-#if !defined(_MSC_VER)
-__builtin_prefetch(addr);
+#ifdef __CUDA_ARCH__
+#if defined(__LP64__)
+  // 64-bit pointer operand constraint for inlined asm
+  asm(" prefetch.L1 [ %1 ];" : "=l"(addr) : "l"(addr));
+#else
+  // 32-bit pointer operand constraint for inlined asm
+  asm(" prefetch.L1 [ %1 ];" : "=r"(addr) : "r"(addr));
+#endif
+#elif (!EIGEN_COMP_MSVC) && (EIGEN_COMP_GNUC || EIGEN_COMP_CLANG || EIGEN_COMP_ICC)
+  __builtin_prefetch(addr);
 #endif
 }
 
 /** \internal \returns the first element of a packet */
-template<typename Packet> inline typename unpacket_traits<Packet>::type pfirst(const Packet& a)
+template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type pfirst(const Packet& a)
 { return a; }
 
 /** \internal \returns a packet where the element i contains the sum of the packet of \a vec[i] */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 preduxp(const Packet* vecs) { return vecs[0]; }
 
 /** \internal \returns the sum of the elements of \a a*/
-template<typename Packet> inline typename unpacket_traits<Packet>::type predux(const Packet& a)
+template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux(const Packet& a)
+{ return a; }
+
+/** \internal \returns the sum of the elements of \a a by block of 4 elements.
+  * For a packet {a0, a1, a2, a3, a4, a5, a6, a7}, it returns a half packet {a0+a4, a1+a5, a2+a6, a3+a7}
+  * For packet-size smaller or equal to 4, this boils down to a noop.
+  */
+template<typename Packet> EIGEN_DEVICE_FUNC inline
+typename conditional<(unpacket_traits<Packet>::size%8)==0,typename unpacket_traits<Packet>::half,Packet>::type
+predux_downto4(const Packet& a)
 { return a; }
 
 /** \internal \returns the product of the elements of \a a*/
-template<typename Packet> inline typename unpacket_traits<Packet>::type predux_mul(const Packet& a)
+template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_mul(const Packet& a)
 { return a; }
 
 /** \internal \returns the min of the elements of \a a*/
-template<typename Packet> inline typename unpacket_traits<Packet>::type predux_min(const Packet& a)
+template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_min(const Packet& a)
 { return a; }
 
 /** \internal \returns the max of the elements of \a a*/
-template<typename Packet> inline typename unpacket_traits<Packet>::type predux_max(const Packet& a)
+template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_max(const Packet& a)
 { return a; }
 
 /** \internal \returns the reversed elements of \a a*/
-template<typename Packet> inline Packet preverse(const Packet& a)
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& a)
 { return a; }
 
-
 /** \internal \returns \a a with real and imaginary part flipped (for complex type only) */
-template<typename Packet> inline Packet pcplxflip(const Packet& a)
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet& a)
 {
   // FIXME: uncomment the following in case we drop the internal imag and real functions.
 //   using std::imag;
@@ -250,6 +381,22 @@ Packet pasin(const Packet& a) { using std::asin; return asin(a); }
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pacos(const Packet& a) { using std::acos; return acos(a); }
 
+/** \internal \returns the arc tangent of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet patan(const Packet& a) { using std::atan; return atan(a); }
+
+/** \internal \returns the hyperbolic sine of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet psinh(const Packet& a) { using std::sinh; return sinh(a); }
+
+/** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pcosh(const Packet& a) { using std::cosh; return cosh(a); }
+
+/** \internal \returns the hyperbolic tan of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet ptanh(const Packet& a) { using std::tanh; return tanh(a); }
+
 /** \internal \returns the exp of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pexp(const Packet& a) { using std::exp; return exp(a); }
@@ -258,10 +405,36 @@ Packet pexp(const Packet& a) { using std::exp; return exp(a); }
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet plog(const Packet& a) { using std::log; return log(a); }
 
+/** \internal \returns the log1p of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet plog1p(const Packet& a) { return numext::log1p(a); }
+
+/** \internal \returns the log10 of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet plog10(const Packet& a) { using std::log10; return log10(a); }
+
 /** \internal \returns the square-root of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet psqrt(const Packet& a) { using std::sqrt; return sqrt(a); }
 
+/** \internal \returns the reciprocal square-root of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet prsqrt(const Packet& a) {
+  return pdiv(pset1<Packet>(1), psqrt(a));
+}
+
+/** \internal \returns the rounded value of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pround(const Packet& a) { using numext::round; return round(a); }
+
+/** \internal \returns the floor of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pfloor(const Packet& a) { using numext::floor; return floor(a); }
+
+/** \internal \returns the ceil of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pceil(const Packet& a) { using numext::ceil; return ceil(a); }
+
 /***************************************************************************
 * The following functions might not have to be overwritten for vectorized types
 ***************************************************************************/
@@ -275,34 +448,45 @@ inline void pstore1(typename unpacket_traits<Packet>::type* to, const typename u
 }
 
 /** \internal \returns a * b + c (coeff-wise) */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pmadd(const Packet&  a,
          const Packet&  b,
          const Packet&  c)
 { return padd(pmul(a, b),c); }
 
 /** \internal \returns a packet version of \a *from.
-  * If LoadMode equals #Aligned, \a from must be 16 bytes aligned */
-template<typename Packet, int LoadMode>
-inline Packet ploadt(const typename unpacket_traits<Packet>::type* from)
+  * The pointer \a from must be aligned on a \a Alignment bytes boundary. */
+template<typename Packet, int Alignment>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt(const typename unpacket_traits<Packet>::type* from)
 {
-  if(LoadMode == Aligned)
+  if(Alignment >= unpacket_traits<Packet>::alignment)
     return pload<Packet>(from);
   else
     return ploadu<Packet>(from);
 }
 
 /** \internal copy the packet \a from to \a *to.
-  * If StoreMode equals #Aligned, \a to must be 16 bytes aligned */
-template<typename Scalar, typename Packet, int LoadMode>
-inline void pstoret(Scalar* to, const Packet& from)
+  * The pointer \a from must be aligned on a \a Alignment bytes boundary. */
+template<typename Scalar, typename Packet, int Alignment>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(Scalar* to, const Packet& from)
 {
-  if(LoadMode == Aligned)
+  if(Alignment >= unpacket_traits<Packet>::alignment)
     pstore(to, from);
   else
     pstoreu(to, from);
 }
 
+/** \internal \returns a packet version of \a *from.
+  * Unlike ploadt, ploadt_ro takes advantage of the read-only memory path on the
+  * hardware if available to speedup the loading of data that won't be modified
+  * by the current computation.
+  */
+template<typename Packet, int LoadMode>
+inline Packet ploadt_ro(const typename unpacket_traits<Packet>::type* from)
+{
+  return ploadt<Packet, LoadMode>(from);
+}
+
 /** \internal default implementation of palign() allowing partial specialization */
 template<int Offset,typename PacketType>
 struct palign_impl
@@ -336,15 +520,74 @@ inline void palign(PacketType& first, const PacketType& second)
 * Fast complex products (GCC generates a function call which is very slow)
 ***************************************************************************/
 
+// Eigen+CUDA does not support complexes.
+#ifndef __CUDACC__
+
 template<> inline std::complex<float> pmul(const std::complex<float>& a, const std::complex<float>& b)
 { return std::complex<float>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); }
 
 template<> inline std::complex<double> pmul(const std::complex<double>& a, const std::complex<double>& b)
 { return std::complex<double>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); }
 
+#endif
+
+
+/***************************************************************************
+ * PacketBlock, that is a collection of N packets where the number of words
+ * in the packet is a multiple of N.
+***************************************************************************/
+template <typename Packet,int N=unpacket_traits<Packet>::size> struct PacketBlock {
+  Packet packet[N];
+};
+
+template<typename Packet> EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet,1>& /*kernel*/) {
+  // Nothing to do in the scalar case, i.e. a 1x1 matrix.
+}
+
+/***************************************************************************
+ * Selector, i.e. vector of N boolean values used to select (i.e. blend)
+ * words from 2 packets.
+***************************************************************************/
+template <size_t N> struct Selector {
+  bool select[N];
+};
+
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pblend(const Selector<unpacket_traits<Packet>::size>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) {
+  return ifPacket.select[0] ? thenPacket : elsePacket;
+}
+
+/** \internal \returns \a a with the first coefficient replaced by the scalar b */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pinsertfirst(const Packet& a, typename unpacket_traits<Packet>::type b)
+{
+  // Default implementation based on pblend.
+  // It must be specialized for higher performance.
+  Selector<unpacket_traits<Packet>::size> mask;
+  mask.select[0] = true;
+  // This for loop should be optimized away by the compiler.
+  for(Index i=1; i<unpacket_traits<Packet>::size; ++i)
+    mask.select[i] = false;
+  return pblend(mask, pset1<Packet>(b), a);
+}
+
+/** \internal \returns \a a with the last coefficient replaced by the scalar b */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pinsertlast(const Packet& a, typename unpacket_traits<Packet>::type b)
+{
+  // Default implementation based on pblend.
+  // It must be specialized for higher performance.
+  Selector<unpacket_traits<Packet>::size> mask;
+  // This for loop should be optimized away by the compiler.
+  for(Index i=0; i<unpacket_traits<Packet>::size-1; ++i)
+    mask.select[i] = false;
+  mask.select[unpacket_traits<Packet>::size-1] = true;
+  return pblend(mask, pset1<Packet>(b), a);
+}
+
 } // end namespace internal
 
 } // end namespace Eigen
 
 #endif // EIGEN_GENERIC_PACKET_MATH_H
-
diff --git a/Eigen/src/Core/GlobalFunctions.h b/Eigen/src/Core/GlobalFunctions.h
index 2acf97723..769dc255c 100644
--- a/Eigen/src/Core/GlobalFunctions.h
+++ b/Eigen/src/Core/GlobalFunctions.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2010-2012 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2010-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2010 Benoit Jacob <jacob.benoit.1@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -11,13 +11,30 @@
 #ifndef EIGEN_GLOBAL_FUNCTIONS_H
 #define EIGEN_GLOBAL_FUNCTIONS_H
 
-#define EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(NAME,FUNCTOR) \
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+
+#define EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(NAME,FUNCTOR,DOC_OP,DOC_DETAILS) \
+  /** \returns an expression of the coefficient-wise DOC_OP of \a x
+
+    DOC_DETAILS
+
+    \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_##NAME">Math functions</a>, class CwiseUnaryOp
+    */ \
+  template<typename Derived> \
+  inline const Eigen::CwiseUnaryOp<Eigen::internal::FUNCTOR<typename Derived::Scalar>, const Derived> \
+  NAME(const Eigen::ArrayBase<Derived>& x);
+
+#else
+
+#define EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(NAME,FUNCTOR,DOC_OP,DOC_DETAILS) \
   template<typename Derived> \
   inline const Eigen::CwiseUnaryOp<Eigen::internal::FUNCTOR<typename Derived::Scalar>, const Derived> \
-  NAME(const Eigen::ArrayBase<Derived>& x) { \
-    return x.derived(); \
+  (NAME)(const Eigen::ArrayBase<Derived>& x) { \
+    return Eigen::CwiseUnaryOp<Eigen::internal::FUNCTOR<typename Derived::Scalar>, const Derived>(x.derived()); \
   }
 
+#endif // EIGEN_PARSED_BY_DOXYGEN
+
 #define EIGEN_ARRAY_DECLARE_GLOBAL_EIGEN_UNARY(NAME,FUNCTOR) \
   \
   template<typename Derived> \
@@ -30,55 +47,133 @@
   { \
     static inline typename NAME##_retval<ArrayBase<Derived> >::type run(const Eigen::ArrayBase<Derived>& x) \
     { \
-      return x.derived(); \
+      return typename NAME##_retval<ArrayBase<Derived> >::type(x.derived()); \
     } \
   };
 
-
 namespace Eigen
 {
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(real,scalar_real_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(imag,scalar_imag_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(conj,scalar_conjugate_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sin,scalar_sin_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cos,scalar_cos_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(asin,scalar_asin_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(acos,scalar_acos_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tan,scalar_tan_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(exp,scalar_exp_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log,scalar_log_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs,scalar_abs_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sqrt,scalar_sqrt_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(real,scalar_real_op,real part,\sa ArrayBase::real)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(imag,scalar_imag_op,imaginary part,\sa ArrayBase::imag)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(conj,scalar_conjugate_op,complex conjugate,\sa ArrayBase::conjugate)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(inverse,scalar_inverse_op,inverse,\sa ArrayBase::inverse)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sin,scalar_sin_op,sine,\sa ArrayBase::sin)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cos,scalar_cos_op,cosine,\sa ArrayBase::cos)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tan,scalar_tan_op,tangent,\sa ArrayBase::tan)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(atan,scalar_atan_op,arc-tangent,\sa ArrayBase::atan)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(asin,scalar_asin_op,arc-sine,\sa ArrayBase::asin)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(acos,scalar_acos_op,arc-consine,\sa ArrayBase::acos)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sinh,scalar_sinh_op,hyperbolic sine,\sa ArrayBase::sinh)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cosh,scalar_cosh_op,hyperbolic cosine,\sa ArrayBase::cosh)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tanh,scalar_tanh_op,hyperbolic tangent,\sa ArrayBase::tanh)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(lgamma,scalar_lgamma_op,natural logarithm of the gamma function,\sa ArrayBase::lgamma)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(digamma,scalar_digamma_op,derivative of lgamma,\sa ArrayBase::digamma)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erf,scalar_erf_op,error function,\sa ArrayBase::erf)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erfc,scalar_erfc_op,complement error function,\sa ArrayBase::erfc)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(exp,scalar_exp_op,exponential,\sa ArrayBase::exp)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log,scalar_log_op,natural logarithm,\sa Eigen::log10 DOXCOMMA ArrayBase::log)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log1p,scalar_log1p_op,natural logarithm of 1 plus the value,\sa ArrayBase::log1p)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log10,scalar_log10_op,base 10 logarithm,\sa Eigen::log DOXCOMMA ArrayBase::log)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs,scalar_abs_op,absolute value,\sa ArrayBase::abs DOXCOMMA MatrixBase::cwiseAbs)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs2,scalar_abs2_op,squared absolute value,\sa ArrayBase::abs2 DOXCOMMA MatrixBase::cwiseAbs2)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(arg,scalar_arg_op,complex argument,\sa ArrayBase::arg)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sqrt,scalar_sqrt_op,square root,\sa ArrayBase::sqrt DOXCOMMA MatrixBase::cwiseSqrt)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(rsqrt,scalar_rsqrt_op,reciprocal square root,\sa ArrayBase::rsqrt)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(square,scalar_square_op,square (power 2),\sa Eigen::abs2 DOXCOMMA Eigen::pow DOXCOMMA ArrayBase::square)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cube,scalar_cube_op,cube (power 3),\sa Eigen::pow DOXCOMMA ArrayBase::cube)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(round,scalar_round_op,nearest integer,\sa Eigen::floor DOXCOMMA Eigen::ceil DOXCOMMA ArrayBase::round)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(floor,scalar_floor_op,nearest integer not greater than the giben value,\sa Eigen::ceil DOXCOMMA ArrayBase::floor)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(ceil,scalar_ceil_op,nearest integer not less than the giben value,\sa Eigen::floor DOXCOMMA ArrayBase::ceil)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isnan,scalar_isnan_op,not-a-number test,\sa Eigen::isinf DOXCOMMA Eigen::isfinite DOXCOMMA ArrayBase::isnan)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isinf,scalar_isinf_op,infinite value test,\sa Eigen::isnan DOXCOMMA Eigen::isfinite DOXCOMMA ArrayBase::isinf)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isfinite,scalar_isfinite_op,finite value test,\sa Eigen::isinf DOXCOMMA Eigen::isnan DOXCOMMA ArrayBase::isfinite)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sign,scalar_sign_op,sign (or 0),\sa ArrayBase::sign)
   
+  /** \returns an expression of the coefficient-wise power of \a x to the given constant \a exponent.
+    *
+    * \tparam ScalarExponent is the scalar type of \a exponent. It must be compatible with the scalar type of the given expression (\c Derived::Scalar).
+    *
+    * \sa ArrayBase::pow()
+    *
+    * \relates ArrayBase
+    */
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+  template<typename Derived,typename ScalarExponent>
+  inline const CwiseBinaryOp<internal::scalar_pow_op<Derived::Scalar,ScalarExponent>,Derived,Constant<ScalarExponent> >
+  pow(const Eigen::ArrayBase<Derived>& x, const ScalarExponent& exponent);
+#else
+  template<typename Derived,typename ScalarExponent>
+  inline typename internal::enable_if<   !(internal::is_same<typename Derived::Scalar,ScalarExponent>::value) && EIGEN_SCALAR_BINARY_SUPPORTED(pow,typename Derived::Scalar,ScalarExponent),
+          const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,ScalarExponent,pow) >::type
+  pow(const Eigen::ArrayBase<Derived>& x, const ScalarExponent& exponent) {
+    return x.derived().pow(exponent);
+  }
+
   template<typename Derived>
-  inline const Eigen::CwiseUnaryOp<Eigen::internal::scalar_pow_op<typename Derived::Scalar>, const Derived>
+  inline const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename Derived::Scalar,pow)
   pow(const Eigen::ArrayBase<Derived>& x, const typename Derived::Scalar& exponent) {
     return x.derived().pow(exponent);
   }
+#endif
 
-  template<typename Derived>
-  inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_binary_pow_op<typename Derived::Scalar, typename Derived::Scalar>, const Derived, const Derived>
-  pow(const Eigen::ArrayBase<Derived>& x, const Eigen::ArrayBase<Derived>& exponents) 
+  /** \returns an expression of the coefficient-wise power of \a x to the given array of \a exponents.
+    *
+    * This function computes the coefficient-wise power.
+    *
+    * Example: \include Cwise_array_power_array.cpp
+    * Output: \verbinclude Cwise_array_power_array.out
+    * 
+    * \sa ArrayBase::pow()
+    *
+    * \relates ArrayBase
+    */
+  template<typename Derived,typename ExponentDerived>
+  inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_pow_op<typename Derived::Scalar, typename ExponentDerived::Scalar>, const Derived, const ExponentDerived>
+  pow(const Eigen::ArrayBase<Derived>& x, const Eigen::ArrayBase<ExponentDerived>& exponents) 
   {
-    return Eigen::CwiseBinaryOp<Eigen::internal::scalar_binary_pow_op<typename Derived::Scalar, typename Derived::Scalar>, const Derived, const Derived>(
+    return Eigen::CwiseBinaryOp<Eigen::internal::scalar_pow_op<typename Derived::Scalar, typename ExponentDerived::Scalar>, const Derived, const ExponentDerived>(
       x.derived(),
       exponents.derived()
     );
   }
   
-  /**
-  * \brief Component-wise division of a scalar by array elements.
-  **/
-  template <typename Derived>
-  inline const Eigen::CwiseUnaryOp<Eigen::internal::scalar_inverse_mult_op<typename Derived::Scalar>, const Derived>
-    operator/(const typename Derived::Scalar& s, const Eigen::ArrayBase<Derived>& a)
+  /** \returns an expression of the coefficient-wise power of the scalar \a x to the given array of \a exponents.
+    *
+    * This function computes the coefficient-wise power between a scalar and an array of exponents.
+    *
+    * \tparam Scalar is the scalar type of \a x. It must be compatible with the scalar type of the given array expression (\c Derived::Scalar).
+    *
+    * Example: \include Cwise_scalar_power_array.cpp
+    * Output: \verbinclude Cwise_scalar_power_array.out
+    * 
+    * \sa ArrayBase::pow()
+    *
+    * \relates ArrayBase
+    */
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+  template<typename Scalar,typename Derived>
+  inline const CwiseBinaryOp<internal::scalar_pow_op<Scalar,Derived::Scalar>,Constant<Scalar>,Derived>
+  pow(const Scalar& x,const Eigen::ArrayBase<Derived>& x);
+#else
+  template<typename Scalar, typename Derived>
+  inline typename internal::enable_if<   !(internal::is_same<typename Derived::Scalar,Scalar>::value) && EIGEN_SCALAR_BINARY_SUPPORTED(pow,Scalar,typename Derived::Scalar),
+          const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,Derived,pow) >::type
+  pow(const Scalar& x, const Eigen::ArrayBase<Derived>& exponents)
   {
-    return Eigen::CwiseUnaryOp<Eigen::internal::scalar_inverse_mult_op<typename Derived::Scalar>, const Derived>(
-      a.derived(),
-      Eigen::internal::scalar_inverse_mult_op<typename Derived::Scalar>(s)  
-    );
+    return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,Derived,pow)(
+            typename internal::plain_constant_type<Derived,Scalar>::type(exponents.rows(), exponents.cols(), x), exponents.derived() );
   }
 
+  template<typename Derived>
+  inline const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename Derived::Scalar,Derived,pow)
+  pow(const typename Derived::Scalar& x, const Eigen::ArrayBase<Derived>& exponents)
+  {
+    return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename Derived::Scalar,Derived,pow)(
+      typename internal::plain_constant_type<Derived,typename Derived::Scalar>::type(exponents.rows(), exponents.cols(), x), exponents.derived() );
+  }
+#endif
+
+
   namespace internal
   {
     EIGEN_ARRAY_DECLARE_GLOBAL_EIGEN_UNARY(real,scalar_real_op)
diff --git a/Eigen/src/Core/IO.h b/Eigen/src/Core/IO.h
index 8d4bc59e9..da7fd6cce 100644
--- a/Eigen/src/Core/IO.h
+++ b/Eigen/src/Core/IO.h
@@ -49,7 +49,7 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat&
   */
 struct IOFormat
 {
-  /** Default contructor, see class IOFormat for the meaning of the parameters */
+  /** Default constructor, see class IOFormat for the meaning of the parameters */
   IOFormat(int _precision = StreamPrecision, int _flags = 0,
     const std::string& _coeffSeparator = " ",
     const std::string& _rowSeparator = "\n", const std::string& _rowPrefix="", const std::string& _rowSuffix="",
@@ -57,6 +57,10 @@ struct IOFormat
   : matPrefix(_matPrefix), matSuffix(_matSuffix), rowPrefix(_rowPrefix), rowSuffix(_rowSuffix), rowSeparator(_rowSeparator),
     rowSpacer(""), coeffSeparator(_coeffSeparator), precision(_precision), flags(_flags)
   {
+    // TODO check if rowPrefix, rowSuffix or rowSeparator contains a newline
+    // don't add rowSpacer if columns are not to be aligned
+    if((flags & DontAlignCols))
+      return;
     int i = int(matSuffix.length())-1;
     while (i>=0 && matSuffix[i]!='\n')
     {
@@ -76,7 +80,7 @@ struct IOFormat
   *
   * \brief Pseudo expression providing matrix output with given format
   *
-  * \param ExpressionType the type of the object on which IO stream operations are performed
+  * \tparam ExpressionType the type of the object on which IO stream operations are performed
   *
   * This class represents an expression with stream operators controlled by a given IOFormat.
   * It is the return type of DenseBase::format()
@@ -101,52 +105,24 @@ class WithFormat
     }
 
   protected:
-    const typename ExpressionType::Nested m_matrix;
+    typename ExpressionType::Nested m_matrix;
     IOFormat m_format;
 };
 
-/** \returns a WithFormat proxy object allowing to print a matrix the with given
-  * format \a fmt.
-  *
-  * See class IOFormat for some examples.
-  *
-  * \sa class IOFormat, class WithFormat
-  */
-template<typename Derived>
-inline const WithFormat<Derived>
-DenseBase<Derived>::format(const IOFormat& fmt) const
-{
-  return WithFormat<Derived>(derived(), fmt);
-}
-
 namespace internal {
 
-template<typename Scalar, bool IsInteger>
-struct significant_decimals_default_impl
-{
-  typedef typename NumTraits<Scalar>::Real RealScalar;
-  static inline int run()
-  {
-    using std::ceil;
-    using std::log;
-    return cast<RealScalar,int>(ceil(-log(NumTraits<RealScalar>::epsilon())/log(RealScalar(10))));
-  }
-};
-
+// NOTE: This helper is kept for backward compatibility with previous code specializing
+//       this internal::significant_decimals_impl structure. In the future we should directly
+//       call digits10() which has been introduced in July 2016 in 3.3.
 template<typename Scalar>
-struct significant_decimals_default_impl<Scalar, true>
+struct significant_decimals_impl
 {
   static inline int run()
   {
-    return 0;
+    return NumTraits<Scalar>::digits10();
   }
 };
 
-template<typename Scalar>
-struct significant_decimals_impl
-  : significant_decimals_default_impl<Scalar, NumTraits<Scalar>::IsInteger>
-{};
-
 /** \internal
   * print the matrix \a _m to the output stream \a s using the output format \a fmt */
 template<typename Derived>
@@ -160,7 +136,6 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat&
   
   typename Derived::Nested m = _m;
   typedef typename Derived::Scalar Scalar;
-  typedef typename Derived::Index Index;
 
   Index width = 0;
 
diff --git a/Eigen/src/Core/Inverse.h b/Eigen/src/Core/Inverse.h
new file mode 100644
index 000000000..b76f0439d
--- /dev/null
+++ b/Eigen/src/Core/Inverse.h
@@ -0,0 +1,118 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_INVERSE_H
+#define EIGEN_INVERSE_H
+
+namespace Eigen { 
+
+template<typename XprType,typename StorageKind> class InverseImpl;
+
+namespace internal {
+
+template<typename XprType>
+struct traits<Inverse<XprType> >
+  : traits<typename XprType::PlainObject>
+{
+  typedef typename XprType::PlainObject PlainObject;
+  typedef traits<PlainObject> BaseTraits;
+  enum {
+    Flags = BaseTraits::Flags & RowMajorBit
+  };
+};
+
+} // end namespace internal
+
+/** \class Inverse
+  *
+  * \brief Expression of the inverse of another expression
+  *
+  * \tparam XprType the type of the expression we are taking the inverse
+  *
+  * This class represents an abstract expression of A.inverse()
+  * and most of the time this is the only way it is used.
+  *
+  */
+template<typename XprType>
+class Inverse : public InverseImpl<XprType,typename internal::traits<XprType>::StorageKind>
+{
+public:
+  typedef typename XprType::StorageIndex StorageIndex;
+  typedef typename XprType::PlainObject                       PlainObject;
+  typedef typename XprType::Scalar                            Scalar;
+  typedef typename internal::ref_selector<XprType>::type      XprTypeNested;
+  typedef typename internal::remove_all<XprTypeNested>::type  XprTypeNestedCleaned;
+  typedef typename internal::ref_selector<Inverse>::type Nested;
+  typedef typename internal::remove_all<XprType>::type NestedExpression;
+  
+  explicit EIGEN_DEVICE_FUNC Inverse(const XprType &xpr)
+    : m_xpr(xpr)
+  {}
+
+  EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); }
+  EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); }
+
+  EIGEN_DEVICE_FUNC const XprTypeNestedCleaned& nestedExpression() const { return m_xpr; }
+
+protected:
+  XprTypeNested m_xpr;
+};
+
+// Generic API dispatcher
+template<typename XprType, typename StorageKind>
+class InverseImpl
+  : public internal::generic_xpr_base<Inverse<XprType> >::type
+{
+public:
+  typedef typename internal::generic_xpr_base<Inverse<XprType> >::type Base;
+  typedef typename XprType::Scalar Scalar;
+private:
+
+  Scalar coeff(Index row, Index col) const;
+  Scalar coeff(Index i) const;
+};
+
+namespace internal {
+
+/** \internal
+  * \brief Default evaluator for Inverse expression.
+  * 
+  * This default evaluator for Inverse expression simply evaluate the inverse into a temporary
+  * by a call to internal::call_assignment_no_alias.
+  * Therefore, inverse implementers only have to specialize Assignment<Dst,Inverse<...>, ...> for
+  * there own nested expression.
+  *
+  * \sa class Inverse
+  */
+template<typename ArgType>
+struct unary_evaluator<Inverse<ArgType> >
+  : public evaluator<typename Inverse<ArgType>::PlainObject>
+{
+  typedef Inverse<ArgType> InverseType;
+  typedef typename InverseType::PlainObject PlainObject;
+  typedef evaluator<PlainObject> Base;
+  
+  enum { Flags = Base::Flags | EvalBeforeNestingBit };
+
+  unary_evaluator(const InverseType& inv_xpr)
+    : m_result(inv_xpr.rows(), inv_xpr.cols())
+  {
+    ::new (static_cast<Base*>(this)) Base(m_result);
+    internal::call_assignment_no_alias(m_result, inv_xpr);
+  }
+  
+protected:
+  PlainObject m_result;
+};
+  
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_INVERSE_H
diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h
index f804c89d6..06d196702 100644
--- a/Eigen/src/Core/Map.h
+++ b/Eigen/src/Core/Map.h
@@ -13,13 +13,35 @@
 
 namespace Eigen { 
 
+namespace internal {
+template<typename PlainObjectType, int MapOptions, typename StrideType>
+struct traits<Map<PlainObjectType, MapOptions, StrideType> >
+  : public traits<PlainObjectType>
+{
+  typedef traits<PlainObjectType> TraitsBase;
+  enum {
+    InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0
+                             ? int(PlainObjectType::InnerStrideAtCompileTime)
+                             : int(StrideType::InnerStrideAtCompileTime),
+    OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0
+                             ? int(PlainObjectType::OuterStrideAtCompileTime)
+                             : int(StrideType::OuterStrideAtCompileTime),
+    Alignment = int(MapOptions)&int(AlignedMask),
+    Flags0 = TraitsBase::Flags & (~NestByRefBit),
+    Flags = is_lvalue<PlainObjectType>::value ? int(Flags0) : (int(Flags0) & ~LvalueBit)
+  };
+private:
+  enum { Options }; // Expressions don't have Options
+};
+}
+
 /** \class Map
   * \ingroup Core_Module
   *
   * \brief A matrix or vector expression mapping an existing array of data.
   *
   * \tparam PlainObjectType the equivalent matrix type of the mapped data
-  * \tparam MapOptions specifies whether the pointer is \c #Aligned, or \c #Unaligned.
+  * \tparam MapOptions specifies the pointer alignment in bytes. It can be: \c #Aligned128, , \c #Aligned64, \c #Aligned32, \c #Aligned16, \c #Aligned8 or \c #Unaligned.
   *                The default is \c #Unaligned.
   * \tparam StrideType optionally specifies strides. By default, Map assumes the memory layout
   *                   of an ordinary, contiguous array. This can be overridden by specifying strides.
@@ -63,44 +85,6 @@ namespace Eigen {
   *
   * \sa PlainObjectBase::Map(), \ref TopicStorageOrders
   */
-
-namespace internal {
-template<typename PlainObjectType, int MapOptions, typename StrideType>
-struct traits<Map<PlainObjectType, MapOptions, StrideType> >
-  : public traits<PlainObjectType>
-{
-  typedef traits<PlainObjectType> TraitsBase;
-  typedef typename PlainObjectType::Index Index;
-  typedef typename PlainObjectType::Scalar Scalar;
-  enum {
-    InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0
-                             ? int(PlainObjectType::InnerStrideAtCompileTime)
-                             : int(StrideType::InnerStrideAtCompileTime),
-    OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0
-                             ? int(PlainObjectType::OuterStrideAtCompileTime)
-                             : int(StrideType::OuterStrideAtCompileTime),
-    HasNoInnerStride = InnerStrideAtCompileTime == 1,
-    HasNoOuterStride = StrideType::OuterStrideAtCompileTime == 0,
-    HasNoStride = HasNoInnerStride && HasNoOuterStride,
-    IsAligned = bool(EIGEN_ALIGN) && ((int(MapOptions)&Aligned)==Aligned),
-    IsDynamicSize = PlainObjectType::SizeAtCompileTime==Dynamic,
-    KeepsPacketAccess = bool(HasNoInnerStride)
-                        && ( bool(IsDynamicSize)
-                           || HasNoOuterStride
-                           || ( OuterStrideAtCompileTime!=Dynamic
-                           && ((static_cast<int>(sizeof(Scalar))*OuterStrideAtCompileTime)%16)==0 ) ),
-    Flags0 = TraitsBase::Flags & (~NestByRefBit),
-    Flags1 = IsAligned ? (int(Flags0) | AlignedBit) : (int(Flags0) & ~AlignedBit),
-    Flags2 = (bool(HasNoStride) || bool(PlainObjectType::IsVectorAtCompileTime))
-           ? int(Flags1) : int(Flags1 & ~LinearAccessBit),
-    Flags3 = is_lvalue<PlainObjectType>::value ? int(Flags2) : (int(Flags2) & ~LvalueBit),
-    Flags = KeepsPacketAccess ? int(Flags3) : (int(Flags3) & ~PacketAccessBit)
-  };
-private:
-  enum { Options }; // Expressions don't have Options
-};
-}
-
 template<typename PlainObjectType, int MapOptions, typename StrideType> class Map
   : public MapBase<Map<PlainObjectType, MapOptions, StrideType> >
 {
@@ -110,19 +94,17 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
     EIGEN_DENSE_PUBLIC_INTERFACE(Map)
 
     typedef typename Base::PointerType PointerType;
-#if EIGEN2_SUPPORT_STAGE <= STAGE30_FULL_EIGEN3_API
-    typedef const Scalar* PointerArgType;
-    inline PointerType cast_to_pointer_type(PointerArgType ptr) { return const_cast<PointerType>(ptr); }
-#else
     typedef PointerType PointerArgType;
+    EIGEN_DEVICE_FUNC
     inline PointerType cast_to_pointer_type(PointerArgType ptr) { return ptr; }
-#endif
 
+    EIGEN_DEVICE_FUNC
     inline Index innerStride() const
     {
       return StrideType::InnerStrideAtCompileTime != 0 ? m_stride.inner() : 1;
     }
 
+    EIGEN_DEVICE_FUNC
     inline Index outerStride() const
     {
       return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
@@ -134,10 +116,11 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
     /** Constructor in the fixed-size case.
       *
       * \param dataPtr pointer to the array to map
-      * \param a_stride optional Stride object, passing the strides.
+      * \param stride optional Stride object, passing the strides.
       */
-    inline Map(PointerArgType dataPtr, const StrideType& a_stride = StrideType())
-      : Base(cast_to_pointer_type(dataPtr)), m_stride(a_stride)
+    EIGEN_DEVICE_FUNC
+    explicit inline Map(PointerArgType dataPtr, const StrideType& stride = StrideType())
+      : Base(cast_to_pointer_type(dataPtr)), m_stride(stride)
     {
       PlainObjectType::Base::_check_template_params();
     }
@@ -145,11 +128,12 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
     /** Constructor in the dynamic-size vector case.
       *
       * \param dataPtr pointer to the array to map
-      * \param a_size the size of the vector expression
-      * \param a_stride optional Stride object, passing the strides.
+      * \param size the size of the vector expression
+      * \param stride optional Stride object, passing the strides.
       */
-    inline Map(PointerArgType dataPtr, Index a_size, const StrideType& a_stride = StrideType())
-      : Base(cast_to_pointer_type(dataPtr), a_size), m_stride(a_stride)
+    EIGEN_DEVICE_FUNC
+    inline Map(PointerArgType dataPtr, Index size, const StrideType& stride = StrideType())
+      : Base(cast_to_pointer_type(dataPtr), size), m_stride(stride)
     {
       PlainObjectType::Base::_check_template_params();
     }
@@ -157,12 +141,13 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
     /** Constructor in the dynamic-size matrix case.
       *
       * \param dataPtr pointer to the array to map
-      * \param nbRows the number of rows of the matrix expression
-      * \param nbCols the number of columns of the matrix expression
-      * \param a_stride optional Stride object, passing the strides.
+      * \param rows the number of rows of the matrix expression
+      * \param cols the number of columns of the matrix expression
+      * \param stride optional Stride object, passing the strides.
       */
-    inline Map(PointerArgType dataPtr, Index nbRows, Index nbCols, const StrideType& a_stride = StrideType())
-      : Base(cast_to_pointer_type(dataPtr), nbRows, nbCols), m_stride(a_stride)
+    EIGEN_DEVICE_FUNC
+    inline Map(PointerArgType dataPtr, Index rows, Index cols, const StrideType& stride = StrideType())
+      : Base(cast_to_pointer_type(dataPtr), rows, cols), m_stride(stride)
     {
       PlainObjectType::Base::_check_template_params();
     }
@@ -173,19 +158,6 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
     StrideType m_stride;
 };
 
-template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
-inline Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>
-  ::Array(const Scalar *data)
-{
-  this->_set_noalias(Eigen::Map<const Array>(data));
-}
-
-template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
-inline Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>
-  ::Matrix(const Scalar *data)
-{
-  this->_set_noalias(Eigen::Map<const Matrix>(data));
-}
 
 } // end namespace Eigen
 
diff --git a/Eigen/src/Core/MapBase.h b/Eigen/src/Core/MapBase.h
index 92e114129..020f939ad 100644
--- a/Eigen/src/Core/MapBase.h
+++ b/Eigen/src/Core/MapBase.h
@@ -12,15 +12,25 @@
 #define EIGEN_MAPBASE_H
 
 #define EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived) \
-      EIGEN_STATIC_ASSERT((int(internal::traits<Derived>::Flags) & LinearAccessBit) || Derived::IsVectorAtCompileTime, \
+      EIGEN_STATIC_ASSERT((int(internal::evaluator<Derived>::Flags) & LinearAccessBit) || Derived::IsVectorAtCompileTime, \
                           YOU_ARE_TRYING_TO_USE_AN_INDEX_BASED_ACCESSOR_ON_AN_EXPRESSION_THAT_DOES_NOT_SUPPORT_THAT)
 
 namespace Eigen { 
 
-/** \class MapBase
-  * \ingroup Core_Module
+/** \ingroup Core_Module
   *
-  * \brief Base class for Map and Block expression with direct access
+  * \brief Base class for dense Map and Block expression with direct access
+  *
+  * This base class provides the const low-level accessors (e.g. coeff, coeffRef) of dense
+  * Map and Block objects with direct access.
+  * Typical users do not have to directly deal with this class.
+  *
+  * This class can be extended by through the macro plugin \c EIGEN_MAPBASE_PLUGIN.
+  * See \link TopicCustomizing_Plugins customizing Eigen \endlink for details.
+  *
+  * The \c Derived class has to provide the following two methods describing the memory layout:
+  *  \code Index innerStride() const; \endcode
+  *  \code Index outerStride() const; \endcode
   *
   * \sa class Map, class Block
   */
@@ -37,7 +47,6 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
     };
 
     typedef typename internal::traits<Derived>::StorageKind StorageKind;
-    typedef typename internal::traits<Derived>::Index Index;
     typedef typename internal::traits<Derived>::Scalar Scalar;
     typedef typename internal::packet_traits<Scalar>::type PacketScalar;
     typedef typename NumTraits<Scalar>::Real RealScalar;
@@ -76,8 +85,10 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
 
     typedef typename Base::CoeffReturnType CoeffReturnType;
 
-    inline Index rows() const { return m_rows.value(); }
-    inline Index cols() const { return m_cols.value(); }
+    /** \copydoc DenseBase::rows() */
+    EIGEN_DEVICE_FUNC inline Index rows() const { return m_rows.value(); }
+    /** \copydoc DenseBase::cols() */
+    EIGEN_DEVICE_FUNC inline Index cols() const { return m_cols.value(); }
 
     /** Returns a pointer to the first coefficient of the matrix or vector.
       *
@@ -85,30 +96,39 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
       *
       * \sa innerStride(), outerStride()
       */
-    inline const Scalar* data() const { return m_data; }
+    EIGEN_DEVICE_FUNC inline const Scalar* data() const { return m_data; }
 
+    /** \copydoc PlainObjectBase::coeff(Index,Index) const */
+    EIGEN_DEVICE_FUNC
     inline const Scalar& coeff(Index rowId, Index colId) const
     {
       return m_data[colId * colStride() + rowId * rowStride()];
     }
 
+    /** \copydoc PlainObjectBase::coeff(Index) const */
+    EIGEN_DEVICE_FUNC
     inline const Scalar& coeff(Index index) const
     {
       EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived)
       return m_data[index * innerStride()];
     }
 
+    /** \copydoc PlainObjectBase::coeffRef(Index,Index) const */
+    EIGEN_DEVICE_FUNC
     inline const Scalar& coeffRef(Index rowId, Index colId) const
     {
       return this->m_data[colId * colStride() + rowId * rowStride()];
     }
 
+    /** \copydoc PlainObjectBase::coeffRef(Index) const */
+    EIGEN_DEVICE_FUNC
     inline const Scalar& coeffRef(Index index) const
     {
       EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived)
       return this->m_data[index * innerStride()];
     }
 
+    /** \internal */
     template<int LoadMode>
     inline PacketScalar packet(Index rowId, Index colId) const
     {
@@ -116,6 +136,7 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
                (m_data + (colId * colStride() + rowId * rowStride()));
     }
 
+    /** \internal */
     template<int LoadMode>
     inline PacketScalar packet(Index index) const
     {
@@ -123,12 +144,16 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
       return internal::ploadt<PacketScalar, LoadMode>(m_data + index * innerStride());
     }
 
-    inline MapBase(PointerType dataPtr) : m_data(dataPtr), m_rows(RowsAtCompileTime), m_cols(ColsAtCompileTime)
+    /** \internal Constructor for fixed size matrices or vectors */
+    EIGEN_DEVICE_FUNC
+    explicit inline MapBase(PointerType dataPtr) : m_data(dataPtr), m_rows(RowsAtCompileTime), m_cols(ColsAtCompileTime)
     {
       EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
-      checkSanity();
+      checkSanity<Derived>();
     }
 
+    /** \internal Constructor for dynamically sized vectors */
+    EIGEN_DEVICE_FUNC
     inline MapBase(PointerType dataPtr, Index vecSize)
             : m_data(dataPtr),
               m_rows(RowsAtCompileTime == Dynamic ? vecSize : Index(RowsAtCompileTime)),
@@ -137,34 +162,56 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
       EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
       eigen_assert(vecSize >= 0);
       eigen_assert(dataPtr == 0 || SizeAtCompileTime == Dynamic || SizeAtCompileTime == vecSize);
-      checkSanity();
+      checkSanity<Derived>();
     }
 
-    inline MapBase(PointerType dataPtr, Index nbRows, Index nbCols)
-            : m_data(dataPtr), m_rows(nbRows), m_cols(nbCols)
+    /** \internal Constructor for dynamically sized matrices */
+    EIGEN_DEVICE_FUNC
+    inline MapBase(PointerType dataPtr, Index rows, Index cols)
+            : m_data(dataPtr), m_rows(rows), m_cols(cols)
     {
       eigen_assert( (dataPtr == 0)
-              || (   nbRows >= 0 && (RowsAtCompileTime == Dynamic || RowsAtCompileTime == nbRows)
-                  && nbCols >= 0 && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == nbCols)));
-      checkSanity();
+              || (   rows >= 0 && (RowsAtCompileTime == Dynamic || RowsAtCompileTime == rows)
+                  && cols >= 0 && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == cols)));
+      checkSanity<Derived>();
     }
 
+    #ifdef EIGEN_MAPBASE_PLUGIN
+    #include EIGEN_MAPBASE_PLUGIN
+    #endif
+
   protected:
 
-    void checkSanity() const
+    template<typename T>
+    EIGEN_DEVICE_FUNC
+    void checkSanity(typename internal::enable_if<(internal::traits<T>::Alignment>0),void*>::type = 0) const
     {
-      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(internal::traits<Derived>::Flags&PacketAccessBit,
-                                        internal::inner_stride_at_compile_time<Derived>::ret==1),
-                          PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1);
-      eigen_assert(EIGEN_IMPLIES(internal::traits<Derived>::Flags&AlignedBit, (size_t(m_data) % 16) == 0)
-                   && "input pointer is not aligned on a 16 byte boundary");
+#if EIGEN_MAX_ALIGN_BYTES>0
+      eigen_assert((   ((internal::UIntPtr(m_data) % internal::traits<Derived>::Alignment) == 0)
+                    || (cols() * rows() * innerStride() * sizeof(Scalar)) < internal::traits<Derived>::Alignment ) && "data is not aligned");
+#endif
     }
 
+    template<typename T>
+    EIGEN_DEVICE_FUNC
+    void checkSanity(typename internal::enable_if<internal::traits<T>::Alignment==0,void*>::type = 0) const
+    {}
+
     PointerType m_data;
     const internal::variable_if_dynamic<Index, RowsAtCompileTime> m_rows;
     const internal::variable_if_dynamic<Index, ColsAtCompileTime> m_cols;
 };
 
+/** \ingroup Core_Module
+  *
+  * \brief Base class for non-const dense Map and Block expression with direct access
+  *
+  * This base class provides the non-const low-level accessors (e.g. coeff and coeffRef) of
+  * dense Map and Block objects with direct access.
+  * It inherits MapBase<Derived, ReadOnlyAccessors> which defines the const variant for reading specific entries.
+  *
+  * \sa class Map, class Block
+  */
 template<typename Derived> class MapBase<Derived, WriteAccessors>
   : public MapBase<Derived, ReadOnlyAccessors>
 {
@@ -175,7 +222,7 @@ template<typename Derived> class MapBase<Derived, WriteAccessors>
 
     typedef typename Base::Scalar Scalar;
     typedef typename Base::PacketScalar PacketScalar;
-    typedef typename Base::Index Index;
+    typedef typename Base::StorageIndex StorageIndex;
     typedef typename Base::PointerType PointerType;
 
     using Base::derived;
@@ -196,14 +243,18 @@ template<typename Derived> class MapBase<Derived, WriteAccessors>
                     const Scalar
                   >::type ScalarWithConstIfNotLvalue;
 
+    EIGEN_DEVICE_FUNC
     inline const Scalar* data() const { return this->m_data; }
+    EIGEN_DEVICE_FUNC
     inline ScalarWithConstIfNotLvalue* data() { return this->m_data; } // no const-cast here so non-const-correct code will give a compile error
 
+    EIGEN_DEVICE_FUNC
     inline ScalarWithConstIfNotLvalue& coeffRef(Index row, Index col)
     {
       return this->m_data[col * colStride() + row * rowStride()];
     }
 
+    EIGEN_DEVICE_FUNC
     inline ScalarWithConstIfNotLvalue& coeffRef(Index index)
     {
       EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived)
@@ -225,10 +276,11 @@ template<typename Derived> class MapBase<Derived, WriteAccessors>
                 (this->m_data + index * innerStride(), val);
     }
 
-    explicit inline MapBase(PointerType dataPtr) : Base(dataPtr) {}
-    inline MapBase(PointerType dataPtr, Index vecSize) : Base(dataPtr, vecSize) {}
-    inline MapBase(PointerType dataPtr, Index nbRows, Index nbCols) : Base(dataPtr, nbRows, nbCols) {}
+    EIGEN_DEVICE_FUNC explicit inline MapBase(PointerType dataPtr) : Base(dataPtr) {}
+    EIGEN_DEVICE_FUNC inline MapBase(PointerType dataPtr, Index vecSize) : Base(dataPtr, vecSize) {}
+    EIGEN_DEVICE_FUNC inline MapBase(PointerType dataPtr, Index rows, Index cols) : Base(dataPtr, rows, cols) {}
 
+    EIGEN_DEVICE_FUNC
     Derived& operator=(const MapBase& other)
     {
       ReadOnlyMapBase::Base::operator=(other);
diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index 2bfc5ebd9..8d47fb8a4 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -10,11 +10,25 @@
 #ifndef EIGEN_MATHFUNCTIONS_H
 #define EIGEN_MATHFUNCTIONS_H
 
+// source: http://www.geom.uiuc.edu/~huberty/math5337/groupe/digits.html
+// TODO this should better be moved to NumTraits
+#define EIGEN_PI 3.141592653589793238462643383279502884197169399375105820974944592307816406L
+
+
 namespace Eigen {
 
+// On WINCE, std::abs is defined for int only, so let's defined our own overloads:
+// This issue has been confirmed with MSVC 2008 only, but the issue might exist for more recent versions too.
+#if EIGEN_OS_WINCE && EIGEN_COMP_MSVC && EIGEN_COMP_MSVC<=1500
+long        abs(long        x) { return (labs(x));  }
+double      abs(double      x) { return (fabs(x));  }
+float       abs(float       x) { return (fabsf(x)); }
+long double abs(long double x) { return (fabsl(x)); }
+#endif
+
 namespace internal {
 
-/** \internal \struct global_math_functions_filtering_base
+/** \internal \class global_math_functions_filtering_base
   *
   * What it does:
   * Defines a typedef 'type' as follows:
@@ -62,6 +76,7 @@ template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
 struct real_default_impl
 {
   typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
   static inline RealScalar run(const Scalar& x)
   {
     return x;
@@ -72,6 +87,7 @@ template<typename Scalar>
 struct real_default_impl<Scalar,true>
 {
   typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
   static inline RealScalar run(const Scalar& x)
   {
     using std::real;
@@ -81,13 +97,25 @@ struct real_default_impl<Scalar,true>
 
 template<typename Scalar> struct real_impl : real_default_impl<Scalar> {};
 
+#ifdef __CUDA_ARCH__
+template<typename T>
+struct real_impl<std::complex<T> >
+{
+  typedef T RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline T run(const std::complex<T>& x)
+  {
+    return x.real();
+  }
+};
+#endif
+
 template<typename Scalar>
 struct real_retval
 {
   typedef typename NumTraits<Scalar>::Real type;
 };
 
-
 /****************************************************************************
 * Implementation of imag                                                 *
 ****************************************************************************/
@@ -96,6 +124,7 @@ template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
 struct imag_default_impl
 {
   typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
   static inline RealScalar run(const Scalar&)
   {
     return RealScalar(0);
@@ -106,6 +135,7 @@ template<typename Scalar>
 struct imag_default_impl<Scalar,true>
 {
   typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
   static inline RealScalar run(const Scalar& x)
   {
     using std::imag;
@@ -115,6 +145,19 @@ struct imag_default_impl<Scalar,true>
 
 template<typename Scalar> struct imag_impl : imag_default_impl<Scalar> {};
 
+#ifdef __CUDA_ARCH__
+template<typename T>
+struct imag_impl<std::complex<T> >
+{
+  typedef T RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline T run(const std::complex<T>& x)
+  {
+    return x.imag();
+  }
+};
+#endif
+
 template<typename Scalar>
 struct imag_retval
 {
@@ -129,10 +172,12 @@ template<typename Scalar>
 struct real_ref_impl
 {
   typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
   static inline RealScalar& run(Scalar& x)
   {
     return reinterpret_cast<RealScalar*>(&x)[0];
   }
+  EIGEN_DEVICE_FUNC
   static inline const RealScalar& run(const Scalar& x)
   {
     return reinterpret_cast<const RealScalar*>(&x)[0];
@@ -153,10 +198,12 @@ template<typename Scalar, bool IsComplex>
 struct imag_ref_default_impl
 {
   typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
   static inline RealScalar& run(Scalar& x)
   {
     return reinterpret_cast<RealScalar*>(&x)[1];
   }
+  EIGEN_DEVICE_FUNC
   static inline const RealScalar& run(const Scalar& x)
   {
     return reinterpret_cast<RealScalar*>(&x)[1];
@@ -166,10 +213,12 @@ struct imag_ref_default_impl
 template<typename Scalar>
 struct imag_ref_default_impl<Scalar, false>
 {
+  EIGEN_DEVICE_FUNC
   static inline Scalar run(Scalar&)
   {
     return Scalar(0);
   }
+  EIGEN_DEVICE_FUNC
   static inline const Scalar run(const Scalar&)
   {
     return Scalar(0);
@@ -192,6 +241,7 @@ struct imag_ref_retval
 template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
 struct conj_impl
 {
+  EIGEN_DEVICE_FUNC
   static inline Scalar run(const Scalar& x)
   {
     return x;
@@ -201,6 +251,7 @@ struct conj_impl
 template<typename Scalar>
 struct conj_impl<Scalar,true>
 {
+  EIGEN_DEVICE_FUNC
   static inline Scalar run(const Scalar& x)
   {
     using std::conj;
@@ -218,26 +269,40 @@ struct conj_retval
 * Implementation of abs2                                                 *
 ****************************************************************************/
 
-template<typename Scalar>
-struct abs2_impl
+template<typename Scalar,bool IsComplex>
+struct abs2_impl_default
 {
   typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
   static inline RealScalar run(const Scalar& x)
   {
     return x*x;
   }
 };
 
-template<typename RealScalar>
-struct abs2_impl<std::complex<RealScalar> >
+template<typename Scalar>
+struct abs2_impl_default<Scalar, true> // IsComplex
 {
-  static inline RealScalar run(const std::complex<RealScalar>& x)
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline RealScalar run(const Scalar& x)
   {
     return real(x)*real(x) + imag(x)*imag(x);
   }
 };
 
 template<typename Scalar>
+struct abs2_impl
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline RealScalar run(const Scalar& x)
+  {
+    return abs2_impl_default<Scalar,NumTraits<Scalar>::IsComplex>::run(x);
+  }
+};
+
+template<typename Scalar>
 struct abs2_retval
 {
   typedef typename NumTraits<Scalar>::Real type;
@@ -251,9 +316,10 @@ template<typename Scalar, bool IsComplex>
 struct norm1_default_impl
 {
   typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
   static inline RealScalar run(const Scalar& x)
   {
-    using std::abs;
+    EIGEN_USING_STD_MATH(abs);
     return abs(real(x)) + abs(imag(x));
   }
 };
@@ -261,9 +327,10 @@ struct norm1_default_impl
 template<typename Scalar>
 struct norm1_default_impl<Scalar, false>
 {
+  EIGEN_DEVICE_FUNC
   static inline Scalar run(const Scalar& x)
   {
-    using std::abs;
+    EIGEN_USING_STD_MATH(abs);
     return abs(x);
   }
 };
@@ -287,16 +354,22 @@ struct hypot_impl
   typedef typename NumTraits<Scalar>::Real RealScalar;
   static inline RealScalar run(const Scalar& x, const Scalar& y)
   {
-    using std::max;
-    using std::min;
-    using std::abs;
-    using std::sqrt;
+    EIGEN_USING_STD_MATH(abs);
+    EIGEN_USING_STD_MATH(sqrt);
     RealScalar _x = abs(x);
     RealScalar _y = abs(y);
-    RealScalar p = (max)(_x, _y);
-    if(p==RealScalar(0)) return 0;
-    RealScalar q = (min)(_x, _y);
-    RealScalar qp = q/p;
+    Scalar p, qp;
+    if(_x>_y)
+    {
+      p = _x;
+      qp = _y / p;
+    }
+    else
+    {
+      p = _y;
+      qp = _x / p;
+    }
+    if(p==RealScalar(0)) return RealScalar(0);
     return p * sqrt(RealScalar(1) + qp*qp);
   }
 };
@@ -314,6 +387,7 @@ struct hypot_retval
 template<typename OldType, typename NewType>
 struct cast_impl
 {
+  EIGEN_DEVICE_FUNC
   static inline NewType run(const OldType& x)
   {
     return static_cast<NewType>(x);
@@ -323,48 +397,124 @@ struct cast_impl
 // here, for once, we're plainly returning NewType: we don't want cast to do weird things.
 
 template<typename OldType, typename NewType>
+EIGEN_DEVICE_FUNC
 inline NewType cast(const OldType& x)
 {
   return cast_impl<OldType, NewType>::run(x);
 }
 
 /****************************************************************************
-* Implementation of atanh2                                                *
+* Implementation of round                                                   *
 ****************************************************************************/
 
-template<typename Scalar, bool IsInteger>
-struct atanh2_default_impl
-{
-  typedef Scalar retval;
-  typedef typename NumTraits<Scalar>::Real RealScalar;
-  static inline Scalar run(const Scalar& x, const Scalar& y)
+#if EIGEN_HAS_CXX11_MATH
+  template<typename Scalar>
+  struct round_impl {
+    static inline Scalar run(const Scalar& x)
+    {
+      EIGEN_STATIC_ASSERT((!NumTraits<Scalar>::IsComplex), NUMERIC_TYPE_MUST_BE_REAL)
+      using std::round;
+      return round(x);
+    }
+  };
+#else
+  template<typename Scalar>
+  struct round_impl
   {
-    using std::abs;
-    using std::log;
-    using std::sqrt;
-    Scalar z = x / y;
-    if (y == Scalar(0) || abs(z) > sqrt(NumTraits<RealScalar>::epsilon()))
-      return RealScalar(0.5) * log((y + x) / (y - x));
-    else
-      return z + z*z*z / RealScalar(3);
-  }
+    static inline Scalar run(const Scalar& x)
+    {
+      EIGEN_STATIC_ASSERT((!NumTraits<Scalar>::IsComplex), NUMERIC_TYPE_MUST_BE_REAL)
+      EIGEN_USING_STD_MATH(floor);
+      EIGEN_USING_STD_MATH(ceil);
+      return (x > Scalar(0)) ? floor(x + Scalar(0.5)) : ceil(x - Scalar(0.5));
+    }
+  };
+#endif
+
+template<typename Scalar>
+struct round_retval
+{
+  typedef Scalar type;
 };
 
+/****************************************************************************
+* Implementation of arg                                                     *
+****************************************************************************/
+
+#if EIGEN_HAS_CXX11_MATH
+  template<typename Scalar>
+  struct arg_impl {
+    static inline Scalar run(const Scalar& x)
+    {
+      EIGEN_USING_STD_MATH(arg);
+      return arg(x);
+    }
+  };
+#else
+  template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
+  struct arg_default_impl
+  {
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+    EIGEN_DEVICE_FUNC
+    static inline RealScalar run(const Scalar& x)
+    {
+      return (x < Scalar(0)) ? Scalar(EIGEN_PI) : Scalar(0); }
+  };
+
+  template<typename Scalar>
+  struct arg_default_impl<Scalar,true>
+  {
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+    EIGEN_DEVICE_FUNC
+    static inline RealScalar run(const Scalar& x)
+    {
+      EIGEN_USING_STD_MATH(arg);
+      return arg(x);
+    }
+  };
+
+  template<typename Scalar> struct arg_impl : arg_default_impl<Scalar> {};
+#endif
+
 template<typename Scalar>
-struct atanh2_default_impl<Scalar, true>
+struct arg_retval
 {
-  static inline Scalar run(const Scalar&, const Scalar&)
+  typedef typename NumTraits<Scalar>::Real type;
+};
+
+/****************************************************************************
+* Implementation of log1p                                                   *
+****************************************************************************/
+
+namespace std_fallback {
+  // fallback log1p implementation in case there is no log1p(Scalar) function in namespace of Scalar,
+  // or that there is no suitable std::log1p function available
+  template<typename Scalar>
+  EIGEN_DEVICE_FUNC inline Scalar log1p(const Scalar& x) {
+    EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+    EIGEN_USING_STD_MATH(log);
+    Scalar x1p = RealScalar(1) + x;
+    return ( x1p == Scalar(1) ) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) );
+  }
+}
+
+template<typename Scalar>
+struct log1p_impl {
+  static inline Scalar run(const Scalar& x)
   {
     EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
-    return Scalar(0);
+    #if EIGEN_HAS_CXX11_MATH
+    using std::log1p;
+    #endif
+    using std_fallback::log1p;
+    return log1p(x);
   }
 };
 
-template<typename Scalar>
-struct atanh2_impl : atanh2_default_impl<Scalar, NumTraits<Scalar>::IsInteger> {};
 
 template<typename Scalar>
-struct atanh2_retval
+struct log1p_retval
 {
   typedef Scalar type;
 };
@@ -373,24 +523,26 @@ struct atanh2_retval
 * Implementation of pow                                                  *
 ****************************************************************************/
 
-template<typename Scalar, bool IsInteger>
-struct pow_default_impl
+template<typename ScalarX,typename ScalarY, bool IsInteger = NumTraits<ScalarX>::IsInteger&&NumTraits<ScalarY>::IsInteger>
+struct pow_impl
 {
-  typedef Scalar retval;
-  static inline Scalar run(const Scalar& x, const Scalar& y)
+  //typedef Scalar retval;
+  typedef typename ScalarBinaryOpTraits<ScalarX,ScalarY,internal::scalar_pow_op<ScalarX,ScalarY> >::ReturnType result_type;
+  static EIGEN_DEVICE_FUNC inline result_type run(const ScalarX& x, const ScalarY& y)
   {
-    using std::pow;
+    EIGEN_USING_STD_MATH(pow);
     return pow(x, y);
   }
 };
 
-template<typename Scalar>
-struct pow_default_impl<Scalar, true>
+template<typename ScalarX,typename ScalarY>
+struct pow_impl<ScalarX,ScalarY, true>
 {
-  static inline Scalar run(Scalar x, Scalar y)
+  typedef ScalarX result_type;
+  static EIGEN_DEVICE_FUNC inline ScalarX run(ScalarX x, ScalarY y)
   {
-    Scalar res(1);
-    eigen_assert(!NumTraits<Scalar>::IsSigned || y >= 0);
+    ScalarX res(1);
+    eigen_assert(!NumTraits<ScalarY>::IsSigned || y >= 0);
     if(y & 1) res *= x;
     y >>= 1;
     while(y)
@@ -403,15 +555,6 @@ struct pow_default_impl<Scalar, true>
   }
 };
 
-template<typename Scalar>
-struct pow_impl : pow_default_impl<Scalar, NumTraits<Scalar>::IsInteger> {};
-
-template<typename Scalar>
-struct pow_retval
-{
-  typedef Scalar type;
-};
-
 /****************************************************************************
 * Implementation of random                                               *
 ****************************************************************************/
@@ -447,48 +590,48 @@ struct random_default_impl<Scalar, false, false>
 };
 
 enum {
-  floor_log2_terminate,
-  floor_log2_move_up,
-  floor_log2_move_down,
-  floor_log2_bogus
+  meta_floor_log2_terminate,
+  meta_floor_log2_move_up,
+  meta_floor_log2_move_down,
+  meta_floor_log2_bogus
 };
 
-template<unsigned int n, int lower, int upper> struct floor_log2_selector
+template<unsigned int n, int lower, int upper> struct meta_floor_log2_selector
 {
   enum { middle = (lower + upper) / 2,
-         value = (upper <= lower + 1) ? int(floor_log2_terminate)
-               : (n < (1 << middle)) ? int(floor_log2_move_down)
-               : (n==0) ? int(floor_log2_bogus)
-               : int(floor_log2_move_up)
+         value = (upper <= lower + 1) ? int(meta_floor_log2_terminate)
+               : (n < (1 << middle)) ? int(meta_floor_log2_move_down)
+               : (n==0) ? int(meta_floor_log2_bogus)
+               : int(meta_floor_log2_move_up)
   };
 };
 
 template<unsigned int n,
          int lower = 0,
          int upper = sizeof(unsigned int) * CHAR_BIT - 1,
-         int selector = floor_log2_selector<n, lower, upper>::value>
-struct floor_log2 {};
+         int selector = meta_floor_log2_selector<n, lower, upper>::value>
+struct meta_floor_log2 {};
 
 template<unsigned int n, int lower, int upper>
-struct floor_log2<n, lower, upper, floor_log2_move_down>
+struct meta_floor_log2<n, lower, upper, meta_floor_log2_move_down>
 {
-  enum { value = floor_log2<n, lower, floor_log2_selector<n, lower, upper>::middle>::value };
+  enum { value = meta_floor_log2<n, lower, meta_floor_log2_selector<n, lower, upper>::middle>::value };
 };
 
 template<unsigned int n, int lower, int upper>
-struct floor_log2<n, lower, upper, floor_log2_move_up>
+struct meta_floor_log2<n, lower, upper, meta_floor_log2_move_up>
 {
-  enum { value = floor_log2<n, floor_log2_selector<n, lower, upper>::middle, upper>::value };
+  enum { value = meta_floor_log2<n, meta_floor_log2_selector<n, lower, upper>::middle, upper>::value };
 };
 
 template<unsigned int n, int lower, int upper>
-struct floor_log2<n, lower, upper, floor_log2_terminate>
+struct meta_floor_log2<n, lower, upper, meta_floor_log2_terminate>
 {
   enum { value = (n >= ((unsigned int)(1) << (lower+1))) ? lower+1 : lower };
 };
 
 template<unsigned int n, int lower, int upper>
-struct floor_log2<n, lower, upper, floor_log2_bogus>
+struct meta_floor_log2<n, lower, upper, meta_floor_log2_bogus>
 {
   // no value, error at compile time
 };
@@ -496,11 +639,24 @@ struct floor_log2<n, lower, upper, floor_log2_bogus>
 template<typename Scalar>
 struct random_default_impl<Scalar, false, true>
 {
-  typedef typename NumTraits<Scalar>::NonInteger NonInteger;
-
   static inline Scalar run(const Scalar& x, const Scalar& y)
-  {
-    return x + Scalar((NonInteger(y)-x+1) * std::rand() / (RAND_MAX + NonInteger(1)));
+  { 
+    typedef typename conditional<NumTraits<Scalar>::IsSigned,std::ptrdiff_t,std::size_t>::type ScalarX;
+    if(y<x)
+      return x;
+    // the following difference might overflow on a 32 bits system,
+    // but since y>=x the result converted to an unsigned long is still correct.
+    std::size_t range = ScalarX(y)-ScalarX(x);
+    std::size_t offset = 0;
+    // rejection sampling
+    std::size_t divisor = 1;
+    std::size_t multiplier = 1;
+    if(range<RAND_MAX) divisor = (std::size_t(RAND_MAX)+1)/(range+1);
+    else               multiplier = 1 + range/(std::size_t(RAND_MAX)+1);
+    do {
+      offset = (std::size_t(std::rand()) * multiplier) / divisor;
+    } while (offset > range);
+    return Scalar(ScalarX(x) + offset);
   }
 
   static inline Scalar run()
@@ -508,7 +664,7 @@ struct random_default_impl<Scalar, false, true>
 #ifdef EIGEN_MAKING_DOCS
     return run(Scalar(NumTraits<Scalar>::IsSigned ? -10 : 0), Scalar(10));
 #else
-    enum { rand_bits = floor_log2<(unsigned int)(RAND_MAX)+1>::value,
+    enum { rand_bits = meta_floor_log2<(unsigned int)(RAND_MAX)+1>::value,
            scalar_bits = sizeof(Scalar) * CHAR_BIT,
            shift = EIGEN_PLAIN_ENUM_MAX(0, int(rand_bits) - int(scalar_bits)),
            offset = NumTraits<Scalar>::IsSigned ? (1 << (EIGEN_PLAIN_ENUM_MIN(rand_bits,scalar_bits)-1)) : 0
@@ -545,97 +701,588 @@ inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random()
   return EIGEN_MATHFUNC_IMPL(random, Scalar)::run();
 }
 
+// Implementatin of is* functions
+
+// std::is* do not work with fast-math and gcc, std::is* are available on MSVC 2013 and newer, as well as in clang.
+#if (EIGEN_HAS_CXX11_MATH && !(EIGEN_COMP_GNUC_STRICT && __FINITE_MATH_ONLY__)) || (EIGEN_COMP_MSVC>=1800) || (EIGEN_COMP_CLANG)
+#define EIGEN_USE_STD_FPCLASSIFY 1
+#else
+#define EIGEN_USE_STD_FPCLASSIFY 0
+#endif
+
+template<typename T>
+EIGEN_DEVICE_FUNC
+typename internal::enable_if<internal::is_integral<T>::value,bool>::type
+isnan_impl(const T&) { return false; }
+
+template<typename T>
+EIGEN_DEVICE_FUNC
+typename internal::enable_if<internal::is_integral<T>::value,bool>::type
+isinf_impl(const T&) { return false; }
+
+template<typename T>
+EIGEN_DEVICE_FUNC
+typename internal::enable_if<internal::is_integral<T>::value,bool>::type
+isfinite_impl(const T&) { return true; }
+
+template<typename T>
+EIGEN_DEVICE_FUNC
+typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
+isfinite_impl(const T& x)
+{
+  #ifdef __CUDA_ARCH__
+    return (::isfinite)(x);
+  #elif EIGEN_USE_STD_FPCLASSIFY
+    using std::isfinite;
+    return isfinite EIGEN_NOT_A_MACRO (x);
+  #else
+    return x<=NumTraits<T>::highest() && x>=NumTraits<T>::lowest();
+  #endif
+}
+
+template<typename T>
+EIGEN_DEVICE_FUNC
+typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
+isinf_impl(const T& x)
+{
+  #ifdef __CUDA_ARCH__
+    return (::isinf)(x);
+  #elif EIGEN_USE_STD_FPCLASSIFY
+    using std::isinf;
+    return isinf EIGEN_NOT_A_MACRO (x);
+  #else
+    return x>NumTraits<T>::highest() || x<NumTraits<T>::lowest();
+  #endif
+}
+
+template<typename T>
+EIGEN_DEVICE_FUNC
+typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
+isnan_impl(const T& x)
+{
+  #ifdef __CUDA_ARCH__
+    return (::isnan)(x);
+  #elif EIGEN_USE_STD_FPCLASSIFY
+    using std::isnan;
+    return isnan EIGEN_NOT_A_MACRO (x);
+  #else
+    return x != x;
+  #endif
+}
+
+#if (!EIGEN_USE_STD_FPCLASSIFY)
+
+#if EIGEN_COMP_MSVC
+
+template<typename T> EIGEN_DEVICE_FUNC bool isinf_msvc_helper(T x)
+{
+  return _fpclass(x)==_FPCLASS_NINF || _fpclass(x)==_FPCLASS_PINF;
+}
+
+//MSVC defines a _isnan builtin function, but for double only
+EIGEN_DEVICE_FUNC inline bool isnan_impl(const long double& x) { return _isnan(x)!=0; }
+EIGEN_DEVICE_FUNC inline bool isnan_impl(const double& x)      { return _isnan(x)!=0; }
+EIGEN_DEVICE_FUNC inline bool isnan_impl(const float& x)       { return _isnan(x)!=0; }
+
+EIGEN_DEVICE_FUNC inline bool isinf_impl(const long double& x) { return isinf_msvc_helper(x); }
+EIGEN_DEVICE_FUNC inline bool isinf_impl(const double& x)      { return isinf_msvc_helper(x); }
+EIGEN_DEVICE_FUNC inline bool isinf_impl(const float& x)       { return isinf_msvc_helper(x); }
+
+#elif (defined __FINITE_MATH_ONLY__ && __FINITE_MATH_ONLY__ && EIGEN_COMP_GNUC)
+
+#if EIGEN_GNUC_AT_LEAST(5,0)
+  #define EIGEN_TMP_NOOPT_ATTRIB EIGEN_DEVICE_FUNC inline __attribute__((optimize("no-finite-math-only")))
+#else
+  // NOTE the inline qualifier and noinline attribute are both needed: the former is to avoid linking issue (duplicate symbol),
+  //      while the second prevent too aggressive optimizations in fast-math mode:
+  #define EIGEN_TMP_NOOPT_ATTRIB EIGEN_DEVICE_FUNC inline __attribute__((noinline,optimize("no-finite-math-only")))
+#endif
+
+template<> EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const long double& x) { return __builtin_isnan(x); }
+template<> EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const double& x)      { return __builtin_isnan(x); }
+template<> EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const float& x)       { return __builtin_isnan(x); }
+template<> EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const double& x)      { return __builtin_isinf(x); }
+template<> EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const float& x)       { return __builtin_isinf(x); }
+template<> EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const long double& x) { return __builtin_isinf(x); }
+
+#undef EIGEN_TMP_NOOPT_ATTRIB
+
+#endif
+
+#endif
+
+// The following overload are defined at the end of this file
+template<typename T> EIGEN_DEVICE_FUNC bool isfinite_impl(const std::complex<T>& x);
+template<typename T> EIGEN_DEVICE_FUNC bool isnan_impl(const std::complex<T>& x);
+template<typename T> EIGEN_DEVICE_FUNC bool isinf_impl(const std::complex<T>& x);
+
+template<typename T> T generic_fast_tanh_float(const T& a_x);
+
 } // end namespace internal
 
 /****************************************************************************
-* Generic math function                                                    *
+* Generic math functions                                                    *
 ****************************************************************************/
 
 namespace numext {
 
+#ifndef __CUDA_ARCH__
+template<typename T>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
+{
+  EIGEN_USING_STD_MATH(min);
+  return min EIGEN_NOT_A_MACRO (x,y);
+}
+
+template<typename T>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
+{
+  EIGEN_USING_STD_MATH(max);
+  return max EIGEN_NOT_A_MACRO (x,y);
+}
+#else
+template<typename T>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
+{
+  return y < x ? y : x;
+}
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE float mini(const float& x, const float& y)
+{
+  return fminf(x, y);
+}
+template<typename T>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
+{
+  return x < y ? y : x;
+}
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE float maxi(const float& x, const float& y)
+{
+  return fmaxf(x, y);
+}
+#endif
+
+
 template<typename Scalar>
+EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(real, Scalar) real(const Scalar& x)
 {
   return EIGEN_MATHFUNC_IMPL(real, Scalar)::run(x);
-}  
+}
 
 template<typename Scalar>
+EIGEN_DEVICE_FUNC
 inline typename internal::add_const_on_value_type< EIGEN_MATHFUNC_RETVAL(real_ref, Scalar) >::type real_ref(const Scalar& x)
 {
   return internal::real_ref_impl<Scalar>::run(x);
 }
 
 template<typename Scalar>
+EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(real_ref, Scalar) real_ref(Scalar& x)
 {
   return EIGEN_MATHFUNC_IMPL(real_ref, Scalar)::run(x);
 }
 
 template<typename Scalar>
+EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(imag, Scalar) imag(const Scalar& x)
 {
   return EIGEN_MATHFUNC_IMPL(imag, Scalar)::run(x);
 }
 
 template<typename Scalar>
+EIGEN_DEVICE_FUNC
+inline EIGEN_MATHFUNC_RETVAL(arg, Scalar) arg(const Scalar& x)
+{
+  return EIGEN_MATHFUNC_IMPL(arg, Scalar)::run(x);
+}
+
+template<typename Scalar>
+EIGEN_DEVICE_FUNC
 inline typename internal::add_const_on_value_type< EIGEN_MATHFUNC_RETVAL(imag_ref, Scalar) >::type imag_ref(const Scalar& x)
 {
   return internal::imag_ref_impl<Scalar>::run(x);
 }
 
 template<typename Scalar>
+EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(imag_ref, Scalar) imag_ref(Scalar& x)
 {
   return EIGEN_MATHFUNC_IMPL(imag_ref, Scalar)::run(x);
 }
 
 template<typename Scalar>
+EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(conj, Scalar) conj(const Scalar& x)
 {
   return EIGEN_MATHFUNC_IMPL(conj, Scalar)::run(x);
 }
 
 template<typename Scalar>
+EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(abs2, Scalar) abs2(const Scalar& x)
 {
   return EIGEN_MATHFUNC_IMPL(abs2, Scalar)::run(x);
 }
 
 template<typename Scalar>
+EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(norm1, Scalar) norm1(const Scalar& x)
 {
   return EIGEN_MATHFUNC_IMPL(norm1, Scalar)::run(x);
 }
 
 template<typename Scalar>
+EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(hypot, Scalar) hypot(const Scalar& x, const Scalar& y)
 {
   return EIGEN_MATHFUNC_IMPL(hypot, Scalar)::run(x, y);
 }
 
 template<typename Scalar>
-inline EIGEN_MATHFUNC_RETVAL(atanh2, Scalar) atanh2(const Scalar& x, const Scalar& y)
+EIGEN_DEVICE_FUNC
+inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) log1p(const Scalar& x)
 {
-  return EIGEN_MATHFUNC_IMPL(atanh2, Scalar)::run(x, y);
+  return EIGEN_MATHFUNC_IMPL(log1p, Scalar)::run(x);
 }
 
+#ifdef __CUDACC__
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float log1p(const float &x) { return ::log1pf(x); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double log1p(const double &x) { return ::log1p(x); }
+#endif
+
+template<typename ScalarX,typename ScalarY>
+EIGEN_DEVICE_FUNC
+inline typename internal::pow_impl<ScalarX,ScalarY>::result_type pow(const ScalarX& x, const ScalarY& y)
+{
+  return internal::pow_impl<ScalarX,ScalarY>::run(x, y);
+}
+
+template<typename T> EIGEN_DEVICE_FUNC bool (isnan)   (const T &x) { return internal::isnan_impl(x); }
+template<typename T> EIGEN_DEVICE_FUNC bool (isinf)   (const T &x) { return internal::isinf_impl(x); }
+template<typename T> EIGEN_DEVICE_FUNC bool (isfinite)(const T &x) { return internal::isfinite_impl(x); }
+
 template<typename Scalar>
-inline EIGEN_MATHFUNC_RETVAL(pow, Scalar) pow(const Scalar& x, const Scalar& y)
+EIGEN_DEVICE_FUNC
+inline EIGEN_MATHFUNC_RETVAL(round, Scalar) round(const Scalar& x)
+{
+  return EIGEN_MATHFUNC_IMPL(round, Scalar)::run(x);
+}
+
+template<typename T>
+EIGEN_DEVICE_FUNC
+T (floor)(const T& x)
+{
+  EIGEN_USING_STD_MATH(floor);
+  return floor(x);
+}
+
+#ifdef __CUDACC__
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float floor(const float &x) { return ::floorf(x); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double floor(const double &x) { return ::floor(x); }
+#endif
+
+template<typename T>
+EIGEN_DEVICE_FUNC
+T (ceil)(const T& x)
+{
+  EIGEN_USING_STD_MATH(ceil);
+  return ceil(x);
+}
+
+#ifdef __CUDACC__
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float ceil(const float &x) { return ::ceilf(x); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double ceil(const double &x) { return ::ceil(x); }
+#endif
+
+
+/** Log base 2 for 32 bits positive integers.
+  * Conveniently returns 0 for x==0. */
+inline int log2(int x)
 {
-  return EIGEN_MATHFUNC_IMPL(pow, Scalar)::run(x, y);
+  eigen_assert(x>=0);
+  unsigned int v(x);
+  static const int table[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 };
+  v |= v >> 1;
+  v |= v >> 2;
+  v |= v >> 4;
+  v |= v >> 8;
+  v |= v >> 16;
+  return table[(v * 0x07C4ACDDU) >> 27];
 }
 
-// std::isfinite is non standard, so let's define our own version,
-// even though it is not very efficient.
-template<typename T> bool (isfinite)(const T& x)
+/** \returns the square root of \a x.
+  *
+  * It is essentially equivalent to \code using std::sqrt; return sqrt(x); \endcode,
+  * but slightly faster for float/double and some compilers (e.g., gcc), thanks to
+  * specializations when SSE is enabled.
+  *
+  * It's usage is justified in performance critical functions, like norm/normalize.
+  */
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T sqrt(const T &x)
 {
-  return x<NumTraits<T>::highest() && x>NumTraits<T>::lowest();
+  EIGEN_USING_STD_MATH(sqrt);
+  return sqrt(x);
+}
+
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T log(const T &x) {
+  EIGEN_USING_STD_MATH(log);
+  return log(x);
+}
+
+#ifdef __CUDACC__
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float log(const float &x) { return ::logf(x); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double log(const double &x) { return ::log(x); }
+#endif
+
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+typename NumTraits<T>::Real abs(const T &x) {
+  EIGEN_USING_STD_MATH(abs);
+  return abs(x);
 }
 
+#ifdef __CUDACC__
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float abs(const float &x) { return ::fabsf(x); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double abs(const double &x) { return ::fabs(x); }
+
+template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float abs(const std::complex<float>& x) {
+  return ::hypotf(x.real(), x.imag());
+}
+
+template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double abs(const std::complex<double>& x) {
+  return ::hypot(x.real(), x.imag());
+}
+#endif
+
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T exp(const T &x) {
+  EIGEN_USING_STD_MATH(exp);
+  return exp(x);
+}
+
+#ifdef __CUDACC__
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float exp(const float &x) { return ::expf(x); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double exp(const double &x) { return ::exp(x); }
+#endif
+
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T cos(const T &x) {
+  EIGEN_USING_STD_MATH(cos);
+  return cos(x);
+}
+
+#ifdef __CUDACC__
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float cos(const float &x) { return ::cosf(x); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double cos(const double &x) { return ::cos(x); }
+#endif
+
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T sin(const T &x) {
+  EIGEN_USING_STD_MATH(sin);
+  return sin(x);
+}
+
+#ifdef __CUDACC__
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float sin(const float &x) { return ::sinf(x); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double sin(const double &x) { return ::sin(x); }
+#endif
+
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T tan(const T &x) {
+  EIGEN_USING_STD_MATH(tan);
+  return tan(x);
+}
+
+#ifdef __CUDACC__
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float tan(const float &x) { return ::tanf(x); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double tan(const double &x) { return ::tan(x); }
+#endif
+
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T acos(const T &x) {
+  EIGEN_USING_STD_MATH(acos);
+  return acos(x);
+}
+
+#ifdef __CUDACC__
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float acos(const float &x) { return ::acosf(x); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double acos(const double &x) { return ::acos(x); }
+#endif
+
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T asin(const T &x) {
+  EIGEN_USING_STD_MATH(asin);
+  return asin(x);
+}
+
+#ifdef __CUDACC__
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float asin(const float &x) { return ::asinf(x); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double asin(const double &x) { return ::asin(x); }
+#endif
+
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T atan(const T &x) {
+  EIGEN_USING_STD_MATH(atan);
+  return atan(x);
+}
+
+#ifdef __CUDACC__
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float atan(const float &x) { return ::atanf(x); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double atan(const double &x) { return ::atan(x); }
+#endif
+
+
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T cosh(const T &x) {
+  EIGEN_USING_STD_MATH(cosh);
+  return cosh(x);
+}
+
+#ifdef __CUDACC__
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float cosh(const float &x) { return ::coshf(x); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double cosh(const double &x) { return ::cosh(x); }
+#endif
+
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T sinh(const T &x) {
+  EIGEN_USING_STD_MATH(sinh);
+  return sinh(x);
+}
+
+#ifdef __CUDACC__
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float sinh(const float &x) { return ::sinhf(x); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double sinh(const double &x) { return ::sinh(x); }
+#endif
+
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T tanh(const T &x) {
+  EIGEN_USING_STD_MATH(tanh);
+  return tanh(x);
+}
+
+#if (!defined(__CUDACC__)) && EIGEN_FAST_MATH
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float tanh(float x) { return internal::generic_fast_tanh_float(x); }
+#endif
+
+#ifdef __CUDACC__
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float tanh(const float &x) { return ::tanhf(x); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double tanh(const double &x) { return ::tanh(x); }
+#endif
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T fmod(const T& a, const T& b) {
+  EIGEN_USING_STD_MATH(fmod);
+  return fmod(a, b);
+}
+
+#ifdef __CUDACC__
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float fmod(const float& a, const float& b) {
+  return ::fmodf(a, b);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double fmod(const double& a, const double& b) {
+  return ::fmod(a, b);
+}
+#endif
+
 } // end namespace numext
 
 namespace internal {
 
+template<typename T>
+EIGEN_DEVICE_FUNC bool isfinite_impl(const std::complex<T>& x)
+{
+  return (numext::isfinite)(numext::real(x)) && (numext::isfinite)(numext::imag(x));
+}
+
+template<typename T>
+EIGEN_DEVICE_FUNC bool isnan_impl(const std::complex<T>& x)
+{
+  return (numext::isnan)(numext::real(x)) || (numext::isnan)(numext::imag(x));
+}
+
+template<typename T>
+EIGEN_DEVICE_FUNC bool isinf_impl(const std::complex<T>& x)
+{
+  return ((numext::isinf)(numext::real(x)) || (numext::isinf)(numext::imag(x))) && (!(numext::isnan)(x));
+}
+
 /****************************************************************************
 * Implementation of fuzzy comparisons                                       *
 ****************************************************************************/
@@ -649,18 +1296,17 @@ template<typename Scalar>
 struct scalar_fuzzy_default_impl<Scalar, false, false>
 {
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  template<typename OtherScalar>
+  template<typename OtherScalar> EIGEN_DEVICE_FUNC
   static inline bool isMuchSmallerThan(const Scalar& x, const OtherScalar& y, const RealScalar& prec)
   {
-    using std::abs;
-    return abs(x) <= abs(y) * prec;
+    return numext::abs(x) <= numext::abs(y) * prec;
   }
+  EIGEN_DEVICE_FUNC
   static inline bool isApprox(const Scalar& x, const Scalar& y, const RealScalar& prec)
   {
-    using std::min;
-    using std::abs;
-    return abs(x - y) <= (min)(abs(x), abs(y)) * prec;
+    return numext::abs(x - y) <= numext::mini(numext::abs(x), numext::abs(y)) * prec;
   }
+  EIGEN_DEVICE_FUNC
   static inline bool isApproxOrLessThan(const Scalar& x, const Scalar& y, const RealScalar& prec)
   {
     return x <= y || isApprox(x, y, prec);
@@ -671,15 +1317,17 @@ template<typename Scalar>
 struct scalar_fuzzy_default_impl<Scalar, false, true>
 {
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  template<typename OtherScalar>
+  template<typename OtherScalar> EIGEN_DEVICE_FUNC
   static inline bool isMuchSmallerThan(const Scalar& x, const Scalar&, const RealScalar&)
   {
     return x == Scalar(0);
   }
+  EIGEN_DEVICE_FUNC
   static inline bool isApprox(const Scalar& x, const Scalar& y, const RealScalar&)
   {
     return x == y;
   }
+  EIGEN_DEVICE_FUNC
   static inline bool isApproxOrLessThan(const Scalar& x, const Scalar& y, const RealScalar&)
   {
     return x <= y;
@@ -690,38 +1338,38 @@ template<typename Scalar>
 struct scalar_fuzzy_default_impl<Scalar, true, false>
 {
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  template<typename OtherScalar>
+  template<typename OtherScalar> EIGEN_DEVICE_FUNC
   static inline bool isMuchSmallerThan(const Scalar& x, const OtherScalar& y, const RealScalar& prec)
   {
     return numext::abs2(x) <= numext::abs2(y) * prec * prec;
   }
+  EIGEN_DEVICE_FUNC
   static inline bool isApprox(const Scalar& x, const Scalar& y, const RealScalar& prec)
   {
-    using std::min;
-    return numext::abs2(x - y) <= (min)(numext::abs2(x), numext::abs2(y)) * prec * prec;
+    return numext::abs2(x - y) <= numext::mini(numext::abs2(x), numext::abs2(y)) * prec * prec;
   }
 };
 
 template<typename Scalar>
 struct scalar_fuzzy_impl : scalar_fuzzy_default_impl<Scalar, NumTraits<Scalar>::IsComplex, NumTraits<Scalar>::IsInteger> {};
 
-template<typename Scalar, typename OtherScalar>
+template<typename Scalar, typename OtherScalar> EIGEN_DEVICE_FUNC
 inline bool isMuchSmallerThan(const Scalar& x, const OtherScalar& y,
-                                   typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision())
+                              const typename NumTraits<Scalar>::Real &precision = NumTraits<Scalar>::dummy_precision())
 {
   return scalar_fuzzy_impl<Scalar>::template isMuchSmallerThan<OtherScalar>(x, y, precision);
 }
 
-template<typename Scalar>
+template<typename Scalar> EIGEN_DEVICE_FUNC
 inline bool isApprox(const Scalar& x, const Scalar& y,
-                          typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision())
+                     const typename NumTraits<Scalar>::Real &precision = NumTraits<Scalar>::dummy_precision())
 {
   return scalar_fuzzy_impl<Scalar>::isApprox(x, y, precision);
 }
 
-template<typename Scalar>
+template<typename Scalar> EIGEN_DEVICE_FUNC
 inline bool isApproxOrLessThan(const Scalar& x, const Scalar& y,
-                                    typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision())
+                               const typename NumTraits<Scalar>::Real &precision = NumTraits<Scalar>::dummy_precision())
 {
   return scalar_fuzzy_impl<Scalar>::isApproxOrLessThan(x, y, precision);
 }
@@ -742,17 +1390,19 @@ template<> struct scalar_fuzzy_impl<bool>
 {
   typedef bool RealScalar;
   
-  template<typename OtherScalar>
+  template<typename OtherScalar> EIGEN_DEVICE_FUNC
   static inline bool isMuchSmallerThan(const bool& x, const bool&, const bool&)
   {
     return !x;
   }
   
+  EIGEN_DEVICE_FUNC
   static inline bool isApprox(bool x, bool y, bool)
   {
     return x == y;
   }
 
+  EIGEN_DEVICE_FUNC
   static inline bool isApproxOrLessThan(const bool& x, const bool& y, const bool&)
   {
     return (!x) || y;
diff --git a/Eigen/src/Core/MathFunctionsImpl.h b/Eigen/src/Core/MathFunctionsImpl.h
new file mode 100644
index 000000000..3c9ef22fa
--- /dev/null
+++ b/Eigen/src/Core/MathFunctionsImpl.h
@@ -0,0 +1,78 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com)
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATHFUNCTIONSIMPL_H
+#define EIGEN_MATHFUNCTIONSIMPL_H
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal \returns the hyperbolic tan of \a a (coeff-wise)
+    Doesn't do anything fancy, just a 13/6-degree rational interpolant which
+    is accurate up to a couple of ulp in the range [-9, 9], outside of which
+    the tanh(x) = +/-1.
+
+    This implementation works on both scalars and packets.
+*/
+template<typename T>
+T generic_fast_tanh_float(const T& a_x)
+{
+  // Clamp the inputs to the range [-9, 9] since anything outside
+  // this range is +/-1.0f in single-precision.
+  const T plus_9 = pset1<T>(9.f);
+  const T minus_9 = pset1<T>(-9.f);
+  // NOTE GCC prior to 6.3 might improperly optimize this max/min
+  //      step such that if a_x is nan, x will be either 9 or -9,
+  //      and tanh will return 1 or -1 instead of nan.
+  //      This is supposed to be fixed in gcc6.3,
+  //      see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
+  const T x = pmax(minus_9,pmin(plus_9,a_x));
+  // The monomial coefficients of the numerator polynomial (odd).
+  const T alpha_1 = pset1<T>(4.89352455891786e-03f);
+  const T alpha_3 = pset1<T>(6.37261928875436e-04f);
+  const T alpha_5 = pset1<T>(1.48572235717979e-05f);
+  const T alpha_7 = pset1<T>(5.12229709037114e-08f);
+  const T alpha_9 = pset1<T>(-8.60467152213735e-11f);
+  const T alpha_11 = pset1<T>(2.00018790482477e-13f);
+  const T alpha_13 = pset1<T>(-2.76076847742355e-16f);
+
+  // The monomial coefficients of the denominator polynomial (even).
+  const T beta_0 = pset1<T>(4.89352518554385e-03f);
+  const T beta_2 = pset1<T>(2.26843463243900e-03f);
+  const T beta_4 = pset1<T>(1.18534705686654e-04f);
+  const T beta_6 = pset1<T>(1.19825839466702e-06f);
+
+  // Since the polynomials are odd/even, we need x^2.
+  const T x2 = pmul(x, x);
+
+  // Evaluate the numerator polynomial p.
+  T p = pmadd(x2, alpha_13, alpha_11);
+  p = pmadd(x2, p, alpha_9);
+  p = pmadd(x2, p, alpha_7);
+  p = pmadd(x2, p, alpha_5);
+  p = pmadd(x2, p, alpha_3);
+  p = pmadd(x2, p, alpha_1);
+  p = pmul(x, p);
+
+  // Evaluate the denominator polynomial p.
+  T q = pmadd(x2, beta_6, beta_4);
+  q = pmadd(x2, q, beta_2);
+  q = pmadd(x2, q, beta_0);
+
+  // Divide the numerator by the denominator.
+  return pdiv(p, q);
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_MATHFUNCTIONSIMPL_H
diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h
index d7d0b5b9a..90c336d8c 100644
--- a/Eigen/src/Core/Matrix.h
+++ b/Eigen/src/Core/Matrix.h
@@ -13,6 +13,45 @@
 
 namespace Eigen {
 
+namespace internal {
+template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
+struct traits<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
+{
+private:
+  enum { size = internal::size_at_compile_time<_Rows,_Cols>::ret };
+  typedef typename find_best_packet<_Scalar,size>::type PacketScalar;
+  enum {
+      row_major_bit = _Options&RowMajor ? RowMajorBit : 0,
+      is_dynamic_size_storage = _MaxRows==Dynamic || _MaxCols==Dynamic,
+      max_size = is_dynamic_size_storage ? Dynamic : _MaxRows*_MaxCols,
+      default_alignment = compute_default_alignment<_Scalar,max_size>::value,
+      actual_alignment = ((_Options&DontAlign)==0) ? default_alignment : 0,
+      required_alignment = unpacket_traits<PacketScalar>::alignment,
+      packet_access_bit = (packet_traits<_Scalar>::Vectorizable && (EIGEN_UNALIGNED_VECTORIZE || (actual_alignment>=required_alignment))) ? PacketAccessBit : 0
+    };
+    
+public:
+  typedef _Scalar Scalar;
+  typedef Dense StorageKind;
+  typedef Eigen::Index StorageIndex;
+  typedef MatrixXpr XprKind;
+  enum {
+    RowsAtCompileTime = _Rows,
+    ColsAtCompileTime = _Cols,
+    MaxRowsAtCompileTime = _MaxRows,
+    MaxColsAtCompileTime = _MaxCols,
+    Flags = compute_matrix_flags<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>::ret,
+    Options = _Options,
+    InnerStrideAtCompileTime = 1,
+    OuterStrideAtCompileTime = (Options&RowMajor) ? ColsAtCompileTime : RowsAtCompileTime,
+    
+    // FIXME, the following flag in only used to define NeedsToAlign in PlainObjectBase
+    EvaluatorFlags = LinearAccessBit | DirectAccessBit | packet_access_bit | row_major_bit,
+    Alignment = actual_alignment
+  };
+};
+}
+
 /** \class Matrix
   * \ingroup Core_Module
   *
@@ -24,13 +63,13 @@ namespace Eigen {
   * The %Matrix class encompasses \em both fixed-size and dynamic-size objects (\ref fixedsize "note").
   *
   * The first three template parameters are required:
-  * \tparam _Scalar \anchor matrix_tparam_scalar Numeric type, e.g. float, double, int or std::complex<float>.
-  *                 User defined sclar types are supported as well (see \ref user_defined_scalars "here").
+  * \tparam _Scalar Numeric type, e.g. float, double, int or std::complex<float>.
+  *                 User defined scalar types are supported as well (see \ref user_defined_scalars "here").
   * \tparam _Rows Number of rows, or \b Dynamic
   * \tparam _Cols Number of columns, or \b Dynamic
   *
   * The remaining template parameters are optional -- in most cases you don't have to worry about them.
-  * \tparam _Options \anchor matrix_tparam_options A combination of either \b #RowMajor or \b #ColMajor, and of either
+  * \tparam _Options A combination of either \b #RowMajor or \b #ColMajor, and of either
   *                 \b #AutoAlign or \b #DontAlign.
   *                 The former controls \ref TopicStorageOrders "storage order", and defaults to column-major. The latter controls alignment, which is required
   *                 for vectorization. It defaults to aligning matrices except for fixed sizes that aren't a multiple of the packet size.
@@ -67,7 +106,7 @@ namespace Eigen {
   * \endcode
   *
   * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_MATRIX_PLUGIN.
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_MATRIX_PLUGIN.
   *
   * <i><b>Some notes:</b></i>
   *
@@ -97,32 +136,44 @@ namespace Eigen {
   * are the dimensions of the original matrix, while _Rows and _Cols are Dynamic.</dd>
   * </dl>
   *
-  * \see MatrixBase for the majority of the API methods for matrices, \ref TopicClassHierarchy, 
-  * \ref TopicStorageOrders 
+  * <i><b>ABI and storage layout</b></i>
+  *
+  * The table below summarizes the ABI of some possible Matrix instances which is fixed thorough the lifetime of Eigen 3.
+  * <table  class="manual">
+  * <tr><th>Matrix type</th><th>Equivalent C structure</th></tr>
+  * <tr><td>\code Matrix<T,Dynamic,Dynamic> \endcode</td><td>\code
+  * struct {
+  *   T *data;                  // with (size_t(data)%EIGEN_MAX_ALIGN_BYTES)==0
+  *   Eigen::Index rows, cols;
+  *  };
+  * \endcode</td></tr>
+  * <tr class="alt"><td>\code
+  * Matrix<T,Dynamic,1>
+  * Matrix<T,1,Dynamic> \endcode</td><td>\code
+  * struct {
+  *   T *data;                  // with (size_t(data)%EIGEN_MAX_ALIGN_BYTES)==0
+  *   Eigen::Index size;
+  *  };
+  * \endcode</td></tr>
+  * <tr><td>\code Matrix<T,Rows,Cols> \endcode</td><td>\code
+  * struct {
+  *   T data[Rows*Cols];        // with (size_t(data)%A(Rows*Cols*sizeof(T)))==0
+  *  };
+  * \endcode</td></tr>
+  * <tr class="alt"><td>\code Matrix<T,Dynamic,Dynamic,0,MaxRows,MaxCols> \endcode</td><td>\code
+  * struct {
+  *   T data[MaxRows*MaxCols];  // with (size_t(data)%A(MaxRows*MaxCols*sizeof(T)))==0
+  *   Eigen::Index rows, cols;
+  *  };
+  * \endcode</td></tr>
+  * </table>
+  * Note that in this table Rows, Cols, MaxRows and MaxCols are all positive integers. A(S) is defined to the largest possible power-of-two
+  * smaller to EIGEN_MAX_STATIC_ALIGN_BYTES.
+  *
+  * \see MatrixBase for the majority of the API methods for matrices, \ref TopicClassHierarchy,
+  * \ref TopicStorageOrders
   */
 
-namespace internal {
-template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
-struct traits<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
-{
-  typedef _Scalar Scalar;
-  typedef Dense StorageKind;
-  typedef DenseIndex Index;
-  typedef MatrixXpr XprKind;
-  enum {
-    RowsAtCompileTime = _Rows,
-    ColsAtCompileTime = _Cols,
-    MaxRowsAtCompileTime = _MaxRows,
-    MaxColsAtCompileTime = _MaxCols,
-    Flags = compute_matrix_flags<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>::ret,
-    CoeffReadCost = NumTraits<Scalar>::ReadCost,
-    Options = _Options,
-    InnerStrideAtCompileTime = 1,
-    OuterStrideAtCompileTime = (Options&RowMajor) ? ColsAtCompileTime : RowsAtCompileTime
-  };
-};
-}
-
 template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
 class Matrix
   : public PlainObjectBase<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
@@ -151,6 +202,7 @@ class Matrix
       *
       * \callgraph
       */
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Matrix& operator=(const Matrix& other)
     {
       return Base::_set(other);
@@ -167,7 +219,8 @@ class Matrix
       * remain row-vectors and vectors remain vectors.
       */
     template<typename OtherDerived>
-    EIGEN_STRONG_INLINE Matrix& operator=(const MatrixBase<OtherDerived>& other)
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Matrix& operator=(const DenseBase<OtherDerived>& other)
     {
       return Base::_set(other);
     }
@@ -179,12 +232,14 @@ class Matrix
       * \copydetails DenseBase::operator=(const EigenBase<OtherDerived> &other)
       */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Matrix& operator=(const EigenBase<OtherDerived> &other)
     {
       return Base::operator=(other);
     }
 
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Matrix& operator=(const ReturnByValue<OtherDerived>& func)
     {
       return Base::operator=(func);
@@ -200,6 +255,7 @@ class Matrix
       *
       * \sa resize(Index,Index)
       */
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Matrix() : Base()
     {
       Base::_check_template_params();
@@ -207,45 +263,87 @@ class Matrix
     }
 
     // FIXME is it still needed
-    Matrix(internal::constructor_without_unaligned_array_assert)
+    EIGEN_DEVICE_FUNC
+    explicit Matrix(internal::constructor_without_unaligned_array_assert)
       : Base(internal::constructor_without_unaligned_array_assert())
     { Base::_check_template_params(); EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED }
 
-    /** \brief Constructs a vector or row-vector with given dimension. \only_for_vectors
-      *
-      * Note that this is only useful for dynamic-size vectors. For fixed-size vectors,
-      * it is redundant to pass the dimension here, so it makes more sense to use the default
-      * constructor Matrix() instead.
-      */
-    EIGEN_STRONG_INLINE explicit Matrix(Index dim)
-      : Base(dim, RowsAtCompileTime == 1 ? 1 : dim, ColsAtCompileTime == 1 ? 1 : dim)
+#if EIGEN_HAS_RVALUE_REFERENCES
+    EIGEN_DEVICE_FUNC
+    Matrix(Matrix&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_constructible<Scalar>::value)
+      : Base(std::move(other))
     {
       Base::_check_template_params();
-      EIGEN_STATIC_ASSERT_VECTOR_ONLY(Matrix)
-      eigen_assert(dim >= 0);
-      eigen_assert(SizeAtCompileTime == Dynamic || SizeAtCompileTime == dim);
-      EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
+      if (RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic)
+        Base::_set_noalias(other);
     }
+    EIGEN_DEVICE_FUNC
+    Matrix& operator=(Matrix&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable<Scalar>::value)
+    {
+      other.swap(*this);
+      return *this;
+    }
+#endif
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
+
+    // This constructor is for both 1x1 matrices and dynamic vectors
+    template<typename T>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE explicit Matrix(const T& x)
+    {
+      Base::_check_template_params();
+      Base::template _init1<T>(x);
+    }
+
     template<typename T0, typename T1>
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Matrix(const T0& x, const T1& y)
     {
       Base::_check_template_params();
       Base::template _init2<T0,T1>(x, y);
     }
     #else
+    /** \brief Constructs a fixed-sized matrix initialized with coefficients starting at \a data */
+    EIGEN_DEVICE_FUNC
+    explicit Matrix(const Scalar *data);
+
+    /** \brief Constructs a vector or row-vector with given dimension. \only_for_vectors
+      *
+      * This is useful for dynamic-size vectors. For fixed-size vectors,
+      * it is redundant to pass these parameters, so one should use the default constructor
+      * Matrix() instead.
+      * 
+      * \warning This constructor is disabled for fixed-size \c 1x1 matrices. For instance,
+      * calling Matrix<double,1,1>(1) will call the initialization constructor: Matrix(const Scalar&).
+      * For fixed-size \c 1x1 matrices it is therefore recommended to use the default
+      * constructor Matrix() instead, especially when using one of the non standard
+      * \c EIGEN_INITIALIZE_MATRICES_BY_{ZERO,\c NAN} macros (see \ref TopicPreprocessorDirectives).
+      */
+    EIGEN_STRONG_INLINE explicit Matrix(Index dim);
+    /** \brief Constructs an initialized 1x1 matrix with the given coefficient */
+    Matrix(const Scalar& x);
     /** \brief Constructs an uninitialized matrix with \a rows rows and \a cols columns.
       *
       * This is useful for dynamic-size matrices. For fixed-size matrices,
       * it is redundant to pass these parameters, so one should use the default constructor
-      * Matrix() instead. */
+      * Matrix() instead.
+      * 
+      * \warning This constructor is disabled for fixed-size \c 1x2 and \c 2x1 vectors. For instance,
+      * calling Matrix2f(2,1) will call the initialization constructor: Matrix(const Scalar& x, const Scalar& y).
+      * For fixed-size \c 1x2 or \c 2x1 vectors it is therefore recommended to use the default
+      * constructor Matrix() instead, especially when using one of the non standard
+      * \c EIGEN_INITIALIZE_MATRICES_BY_{ZERO,\c NAN} macros (see \ref TopicPreprocessorDirectives).
+      */
+    EIGEN_DEVICE_FUNC
     Matrix(Index rows, Index cols);
+    
     /** \brief Constructs an initialized 2D vector with given coefficients */
     Matrix(const Scalar& x, const Scalar& y);
     #endif
 
     /** \brief Constructs an initialized 3D vector with given coefficients */
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z)
     {
       Base::_check_template_params();
@@ -255,6 +353,7 @@ class Matrix
       m_storage.data()[2] = z;
     }
     /** \brief Constructs an initialized 4D vector with given coefficients */
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z, const Scalar& w)
     {
       Base::_check_template_params();
@@ -265,76 +364,33 @@ class Matrix
       m_storage.data()[3] = w;
     }
 
-    explicit Matrix(const Scalar *data);
-
-    /** \brief Constructor copying the value of the expression \a other */
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE Matrix(const MatrixBase<OtherDerived>& other)
-             : Base(other.rows() * other.cols(), other.rows(), other.cols())
-    {
-      // This test resides here, to bring the error messages closer to the user. Normally, these checks
-      // are performed deeply within the library, thus causing long and scary error traces.
-      EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename OtherDerived::Scalar>::value),
-        YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
 
-      Base::_check_template_params();
-      Base::_set_noalias(other);
-    }
     /** \brief Copy constructor */
-    EIGEN_STRONG_INLINE Matrix(const Matrix& other)
-            : Base(other.rows() * other.cols(), other.rows(), other.cols())
-    {
-      Base::_check_template_params();
-      Base::_set_noalias(other);
-    }
-    /** \brief Copy constructor with in-place evaluation */
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE Matrix(const ReturnByValue<OtherDerived>& other)
-    {
-      Base::_check_template_params();
-      Base::resize(other.rows(), other.cols());
-      other.evalTo(*this);
-    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Matrix(const Matrix& other) : Base(other)
+    { }
 
     /** \brief Copy constructor for generic expressions.
       * \sa MatrixBase::operator=(const EigenBase<OtherDerived>&)
       */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Matrix(const EigenBase<OtherDerived> &other)
-      : Base(other.derived().rows() * other.derived().cols(), other.derived().rows(), other.derived().cols())
-    {
-      Base::_check_template_params();
-      Base::_resize_to_match(other);
-      // FIXME/CHECK: isn't *this = other.derived() more efficient. it allows to
-      //              go for pure _set() implementations, right?
-      *this = other;
-    }
-
-    /** \internal
-      * \brief Override MatrixBase::swap() since for dynamic-sized matrices
-      * of same type it is enough to swap the data pointers.
-      */
-    template<typename OtherDerived>
-    void swap(MatrixBase<OtherDerived> const & other)
-    { this->_swap(other.derived()); }
+      : Base(other.derived())
+    { }
 
-    inline Index innerStride() const { return 1; }
-    inline Index outerStride() const { return this->innerSize(); }
+    EIGEN_DEVICE_FUNC inline Index innerStride() const { return 1; }
+    EIGEN_DEVICE_FUNC inline Index outerStride() const { return this->innerSize(); }
 
     /////////// Geometry module ///////////
 
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     explicit Matrix(const RotationBase<OtherDerived,ColsAtCompileTime>& r);
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     Matrix& operator=(const RotationBase<OtherDerived,ColsAtCompileTime>& r);
 
-    #ifdef EIGEN2_SUPPORT
-    template<typename OtherDerived>
-    explicit Matrix(const eigen2_RotationBase<OtherDerived,ColsAtCompileTime>& r);
-    template<typename OtherDerived>
-    Matrix& operator=(const eigen2_RotationBase<OtherDerived,ColsAtCompileTime>& r);
-    #endif
-
     // allow to extend Matrix outside Eigen
     #ifdef EIGEN_MATRIX_PLUGIN
     #include EIGEN_MATRIX_PLUGIN
diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h
index b67a7c119..f7cf04cde 100644
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@@ -41,9 +41,9 @@ namespace Eigen {
   * \endcode
   *
   * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_MATRIXBASE_PLUGIN.
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_MATRIXBASE_PLUGIN.
   *
-  * \sa \ref TopicClassHierarchy
+  * \sa \blank \ref TopicClassHierarchy
   */
 template<typename Derived> class MatrixBase
   : public DenseBase<Derived>
@@ -52,7 +52,7 @@ template<typename Derived> class MatrixBase
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     typedef MatrixBase StorageBaseType;
     typedef typename internal::traits<Derived>::StorageKind StorageKind;
-    typedef typename internal::traits<Derived>::Index Index;
+    typedef typename internal::traits<Derived>::StorageIndex StorageIndex;
     typedef typename internal::traits<Derived>::Scalar Scalar;
     typedef typename internal::packet_traits<Scalar>::type PacketScalar;
     typedef typename NumTraits<Scalar>::Real RealScalar;
@@ -66,7 +66,6 @@ template<typename Derived> class MatrixBase
     using Base::MaxSizeAtCompileTime;
     using Base::IsVectorAtCompileTime;
     using Base::Flags;
-    using Base::CoeffReadCost;
 
     using Base::derived;
     using Base::const_cast_derived;
@@ -98,25 +97,14 @@ template<typename Derived> class MatrixBase
 
     /** \returns the size of the main diagonal, which is min(rows(),cols()).
       * \sa rows(), cols(), SizeAtCompileTime. */
-    inline Index diagonalSize() const { return (std::min)(rows(),cols()); }
+    EIGEN_DEVICE_FUNC
+    inline Index diagonalSize() const { return (numext::mini)(rows(),cols()); }
 
-    /** \brief The plain matrix type corresponding to this expression.
-      *
-      * This is not necessarily exactly the return type of eval(). In the case of plain matrices,
-      * the return type of eval() is a const reference to a matrix, not a matrix! It is however guaranteed
-      * that the return type of eval() is either PlainObject or const PlainObject&.
-      */
-    typedef Matrix<typename internal::traits<Derived>::Scalar,
-                internal::traits<Derived>::RowsAtCompileTime,
-                internal::traits<Derived>::ColsAtCompileTime,
-                AutoAlign | (internal::traits<Derived>::Flags&RowMajorBit ? RowMajor : ColMajor),
-                internal::traits<Derived>::MaxRowsAtCompileTime,
-                internal::traits<Derived>::MaxColsAtCompileTime
-          > PlainObject;
+    typedef typename Base::PlainObject PlainObject;
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     /** \internal Represents a matrix with all coefficients equal to one another*/
-    typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,Derived> ConstantReturnType;
+    typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,PlainObject> ConstantReturnType;
     /** \internal the return type of MatrixBase::adjoint() */
     typedef typename internal::conditional<NumTraits<Scalar>::IsComplex,
                         CwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, ConstTransposeReturnType>,
@@ -125,7 +113,7 @@ template<typename Derived> class MatrixBase
     /** \internal Return type of eigenvalues() */
     typedef Matrix<std::complex<RealScalar>, internal::traits<Derived>::ColsAtCompileTime, 1, ColMajor> EigenvaluesReturnType;
     /** \internal the return type of identity */
-    typedef CwiseNullaryOp<internal::scalar_identity_op<Scalar>,Derived> IdentityReturnType;
+    typedef CwiseNullaryOp<internal::scalar_identity_op<Scalar>,PlainObject> IdentityReturnType;
     /** \internal the return type of unit vectors */
     typedef Block<const CwiseNullaryOp<internal::scalar_identity_op<Scalar>, SquareMatrixType>,
                   internal::traits<Derived>::RowsAtCompileTime,
@@ -133,6 +121,7 @@ template<typename Derived> class MatrixBase
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::MatrixBase
+#define EIGEN_DOC_UNARY_ADDONS(X,Y)
 #   include "../plugins/CommonCwiseUnaryOps.h"
 #   include "../plugins/CommonCwiseBinaryOps.h"
 #   include "../plugins/MatrixCwiseUnaryOps.h"
@@ -141,41 +130,53 @@ template<typename Derived> class MatrixBase
 #     include EIGEN_MATRIXBASE_PLUGIN
 #   endif
 #undef EIGEN_CURRENT_STORAGE_BASE_CLASS
+#undef EIGEN_DOC_UNARY_ADDONS
 
     /** Special case of the template operator=, in order to prevent the compiler
       * from generating a default operator= (issue hit with g++ 4.1)
       */
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     Derived& operator=(const MatrixBase& other);
 
     // We cannot inherit here via Base::operator= since it is causing
     // trouble with MSVC.
 
     template <typename OtherDerived>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     Derived& operator=(const DenseBase<OtherDerived>& other);
 
     template <typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     Derived& operator=(const EigenBase<OtherDerived>& other);
 
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     Derived& operator=(const ReturnByValue<OtherDerived>& other);
 
-    template<typename ProductDerived, typename Lhs, typename Rhs>
-    Derived& lazyAssign(const ProductBase<ProductDerived, Lhs,Rhs>& other);
-
-    template<typename MatrixPower, typename Lhs, typename Rhs>
-    Derived& lazyAssign(const MatrixPowerProduct<MatrixPower, Lhs,Rhs>& other);
-
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     Derived& operator+=(const MatrixBase<OtherDerived>& other);
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     Derived& operator-=(const MatrixBase<OtherDerived>& other);
 
+#ifdef __CUDACC__
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    const Product<Derived,OtherDerived,LazyProduct>
+    operator*(const MatrixBase<OtherDerived> &other) const
+    { return this->lazyProduct(other); }
+#else
+
     template<typename OtherDerived>
-    const typename ProductReturnType<Derived,OtherDerived>::Type
+    const Product<Derived,OtherDerived>
     operator*(const MatrixBase<OtherDerived> &other) const;
 
+#endif
+
     template<typename OtherDerived>
-    const typename LazyProductReturnType<Derived,OtherDerived>::Type
+    EIGEN_DEVICE_FUNC
+    const Product<Derived,OtherDerived,LazyProduct>
     lazyProduct(const MatrixBase<OtherDerived> &other) const;
 
     template<typename OtherDerived>
@@ -188,84 +189,93 @@ template<typename Derived> class MatrixBase
     void applyOnTheRight(const EigenBase<OtherDerived>& other);
 
     template<typename DiagonalDerived>
-    const DiagonalProduct<Derived, DiagonalDerived, OnTheRight>
+    EIGEN_DEVICE_FUNC
+    const Product<Derived, DiagonalDerived, LazyProduct>
     operator*(const DiagonalBase<DiagonalDerived> &diagonal) const;
 
     template<typename OtherDerived>
-    typename internal::scalar_product_traits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType
+    EIGEN_DEVICE_FUNC
+    typename ScalarBinaryOpTraits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType
     dot(const MatrixBase<OtherDerived>& other) const;
 
-    #ifdef EIGEN2_SUPPORT
-      template<typename OtherDerived>
-      Scalar eigen2_dot(const MatrixBase<OtherDerived>& other) const;
-    #endif
-
-    RealScalar squaredNorm() const;
-    RealScalar norm() const;
+    EIGEN_DEVICE_FUNC RealScalar squaredNorm() const;
+    EIGEN_DEVICE_FUNC RealScalar norm() const;
     RealScalar stableNorm() const;
     RealScalar blueNorm() const;
     RealScalar hypotNorm() const;
-    const PlainObject normalized() const;
-    void normalize();
+    EIGEN_DEVICE_FUNC const PlainObject normalized() const;
+    EIGEN_DEVICE_FUNC const PlainObject stableNormalized() const;
+    EIGEN_DEVICE_FUNC void normalize();
+    EIGEN_DEVICE_FUNC void stableNormalize();
 
-    const AdjointReturnType adjoint() const;
-    void adjointInPlace();
+    EIGEN_DEVICE_FUNC const AdjointReturnType adjoint() const;
+    EIGEN_DEVICE_FUNC void adjointInPlace();
 
     typedef Diagonal<Derived> DiagonalReturnType;
+    EIGEN_DEVICE_FUNC
     DiagonalReturnType diagonal();
+
     typedef typename internal::add_const<Diagonal<const Derived> >::type ConstDiagonalReturnType;
+    EIGEN_DEVICE_FUNC
     ConstDiagonalReturnType diagonal() const;
 
     template<int Index> struct DiagonalIndexReturnType { typedef Diagonal<Derived,Index> Type; };
     template<int Index> struct ConstDiagonalIndexReturnType { typedef const Diagonal<const Derived,Index> Type; };
 
-    template<int Index> typename DiagonalIndexReturnType<Index>::Type diagonal();
-    template<int Index> typename ConstDiagonalIndexReturnType<Index>::Type diagonal() const;
-    
+    template<int Index>
+    EIGEN_DEVICE_FUNC
+    typename DiagonalIndexReturnType<Index>::Type diagonal();
+
+    template<int Index>
+    EIGEN_DEVICE_FUNC
+    typename ConstDiagonalIndexReturnType<Index>::Type diagonal() const;
+
     typedef Diagonal<Derived,DynamicIndex> DiagonalDynamicIndexReturnType;
     typedef typename internal::add_const<Diagonal<const Derived,DynamicIndex> >::type ConstDiagonalDynamicIndexReturnType;
 
+    EIGEN_DEVICE_FUNC
     DiagonalDynamicIndexReturnType diagonal(Index index);
+    EIGEN_DEVICE_FUNC
     ConstDiagonalDynamicIndexReturnType diagonal(Index index) const;
 
-    #ifdef EIGEN2_SUPPORT
-    template<unsigned int Mode> typename internal::eigen2_part_return_type<Derived, Mode>::type part();
-    template<unsigned int Mode> const typename internal::eigen2_part_return_type<Derived, Mode>::type part() const;
-    
-    // huuuge hack. make Eigen2's matrix.part<Diagonal>() work in eigen3. Problem: Diagonal is now a class template instead
-    // of an integer constant. Solution: overload the part() method template wrt template parameters list.
-    template<template<typename T, int N> class U>
-    const DiagonalWrapper<ConstDiagonalReturnType> part() const
-    { return diagonal().asDiagonal(); }
-    #endif // EIGEN2_SUPPORT
-
     template<unsigned int Mode> struct TriangularViewReturnType { typedef TriangularView<Derived, Mode> Type; };
     template<unsigned int Mode> struct ConstTriangularViewReturnType { typedef const TriangularView<const Derived, Mode> Type; };
 
-    template<unsigned int Mode> typename TriangularViewReturnType<Mode>::Type triangularView();
-    template<unsigned int Mode> typename ConstTriangularViewReturnType<Mode>::Type triangularView() const;
+    template<unsigned int Mode>
+    EIGEN_DEVICE_FUNC
+    typename TriangularViewReturnType<Mode>::Type triangularView();
+    template<unsigned int Mode>
+    EIGEN_DEVICE_FUNC
+    typename ConstTriangularViewReturnType<Mode>::Type triangularView() const;
 
     template<unsigned int UpLo> struct SelfAdjointViewReturnType { typedef SelfAdjointView<Derived, UpLo> Type; };
     template<unsigned int UpLo> struct ConstSelfAdjointViewReturnType { typedef const SelfAdjointView<const Derived, UpLo> Type; };
 
-    template<unsigned int UpLo> typename SelfAdjointViewReturnType<UpLo>::Type selfadjointView();
-    template<unsigned int UpLo> typename ConstSelfAdjointViewReturnType<UpLo>::Type selfadjointView() const;
+    template<unsigned int UpLo>
+    EIGEN_DEVICE_FUNC
+    typename SelfAdjointViewReturnType<UpLo>::Type selfadjointView();
+    template<unsigned int UpLo>
+    EIGEN_DEVICE_FUNC
+    typename ConstSelfAdjointViewReturnType<UpLo>::Type selfadjointView() const;
 
     const SparseView<Derived> sparseView(const Scalar& m_reference = Scalar(0),
                                          const typename NumTraits<Scalar>::Real& m_epsilon = NumTraits<Scalar>::dummy_precision()) const;
-    static const IdentityReturnType Identity();
-    static const IdentityReturnType Identity(Index rows, Index cols);
-    static const BasisReturnType Unit(Index size, Index i);
-    static const BasisReturnType Unit(Index i);
-    static const BasisReturnType UnitX();
-    static const BasisReturnType UnitY();
-    static const BasisReturnType UnitZ();
-    static const BasisReturnType UnitW();
-
+    EIGEN_DEVICE_FUNC static const IdentityReturnType Identity();
+    EIGEN_DEVICE_FUNC static const IdentityReturnType Identity(Index rows, Index cols);
+    EIGEN_DEVICE_FUNC static const BasisReturnType Unit(Index size, Index i);
+    EIGEN_DEVICE_FUNC static const BasisReturnType Unit(Index i);
+    EIGEN_DEVICE_FUNC static const BasisReturnType UnitX();
+    EIGEN_DEVICE_FUNC static const BasisReturnType UnitY();
+    EIGEN_DEVICE_FUNC static const BasisReturnType UnitZ();
+    EIGEN_DEVICE_FUNC static const BasisReturnType UnitW();
+
+    EIGEN_DEVICE_FUNC
     const DiagonalWrapper<const Derived> asDiagonal() const;
     const PermutationWrapper<const Derived> asPermutation() const;
 
+    EIGEN_DEVICE_FUNC
     Derived& setIdentity();
+    EIGEN_DEVICE_FUNC
     Derived& setIdentity(Index rows, Index cols);
 
     bool isIdentity(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
@@ -297,59 +307,45 @@ template<typename Derived> class MatrixBase
 
     NoAlias<Derived,Eigen::MatrixBase > noalias();
 
-    inline const ForceAlignedAccess<Derived> forceAlignedAccess() const;
-    inline ForceAlignedAccess<Derived> forceAlignedAccess();
-    template<bool Enable> inline typename internal::add_const_on_value_type<typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type>::type forceAlignedAccessIf() const;
-    template<bool Enable> inline typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type forceAlignedAccessIf();
+    // TODO forceAlignedAccess is temporarily disabled
+    // Need to find a nicer workaround.
+    inline const Derived& forceAlignedAccess() const { return derived(); }
+    inline Derived& forceAlignedAccess() { return derived(); }
+    template<bool Enable> inline const Derived& forceAlignedAccessIf() const { return derived(); }
+    template<bool Enable> inline Derived& forceAlignedAccessIf() { return derived(); }
 
-    Scalar trace() const;
+    EIGEN_DEVICE_FUNC Scalar trace() const;
 
-/////////// Array module ///////////
+    template<int p> EIGEN_DEVICE_FUNC RealScalar lpNorm() const;
 
-    template<int p> RealScalar lpNorm() const;
-
-    MatrixBase<Derived>& matrix() { return *this; }
-    const MatrixBase<Derived>& matrix() const { return *this; }
+    EIGEN_DEVICE_FUNC MatrixBase<Derived>& matrix() { return *this; }
+    EIGEN_DEVICE_FUNC const MatrixBase<Derived>& matrix() const { return *this; }
 
     /** \returns an \link Eigen::ArrayBase Array \endlink expression of this matrix
       * \sa ArrayBase::matrix() */
-    ArrayWrapper<Derived> array() { return derived(); }
-    const ArrayWrapper<const Derived> array() const { return derived(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ArrayWrapper<Derived> array() { return ArrayWrapper<Derived>(derived()); }
+    /** \returns a const \link Eigen::ArrayBase Array \endlink expression of this matrix
+      * \sa ArrayBase::matrix() */
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const ArrayWrapper<const Derived> array() const { return ArrayWrapper<const Derived>(derived()); }
 
 /////////// LU module ///////////
 
-    const FullPivLU<PlainObject> fullPivLu() const;
-    const PartialPivLU<PlainObject> partialPivLu() const;
+    inline const FullPivLU<PlainObject> fullPivLu() const;
+    inline const PartialPivLU<PlainObject> partialPivLu() const;
 
-    #if EIGEN2_SUPPORT_STAGE < STAGE20_RESOLVE_API_CONFLICTS
-    const LU<PlainObject> lu() const;
-    #endif
+    inline const PartialPivLU<PlainObject> lu() const;
 
-    #ifdef EIGEN2_SUPPORT
-    const LU<PlainObject> eigen2_lu() const;
-    #endif
+    inline const Inverse<Derived> inverse() const;
 
-    #if EIGEN2_SUPPORT_STAGE > STAGE20_RESOLVE_API_CONFLICTS
-    const PartialPivLU<PlainObject> lu() const;
-    #endif
-    
-    #ifdef EIGEN2_SUPPORT
     template<typename ResultType>
-    void computeInverse(MatrixBase<ResultType> *result) const {
-      *result = this->inverse();
-    }
-    #endif
-
-    const internal::inverse_impl<Derived> inverse() const;
-    template<typename ResultType>
-    void computeInverseAndDetWithCheck(
+    inline void computeInverseAndDetWithCheck(
       ResultType& inverse,
       typename ResultType::Scalar& determinant,
       bool& invertible,
       const RealScalar& absDeterminantThreshold = NumTraits<Scalar>::dummy_precision()
     ) const;
     template<typename ResultType>
-    void computeInverseWithCheck(
+    inline void computeInverseWithCheck(
       ResultType& inverse,
       bool& invertible,
       const RealScalar& absDeterminantThreshold = NumTraits<Scalar>::dummy_precision()
@@ -358,65 +354,70 @@ template<typename Derived> class MatrixBase
 
 /////////// Cholesky module ///////////
 
-    const LLT<PlainObject>  llt() const;
-    const LDLT<PlainObject> ldlt() const;
+    inline const LLT<PlainObject>  llt() const;
+    inline const LDLT<PlainObject> ldlt() const;
 
 /////////// QR module ///////////
 
-    const HouseholderQR<PlainObject> householderQr() const;
-    const ColPivHouseholderQR<PlainObject> colPivHouseholderQr() const;
-    const FullPivHouseholderQR<PlainObject> fullPivHouseholderQr() const;
-    
-    #ifdef EIGEN2_SUPPORT
-    const QR<PlainObject> qr() const;
-    #endif
+    inline const HouseholderQR<PlainObject> householderQr() const;
+    inline const ColPivHouseholderQR<PlainObject> colPivHouseholderQr() const;
+    inline const FullPivHouseholderQR<PlainObject> fullPivHouseholderQr() const;
+    inline const CompleteOrthogonalDecomposition<PlainObject> completeOrthogonalDecomposition() const;
 
-    EigenvaluesReturnType eigenvalues() const;
-    RealScalar operatorNorm() const;
+/////////// Eigenvalues module ///////////
 
-/////////// SVD module ///////////
+    inline EigenvaluesReturnType eigenvalues() const;
+    inline RealScalar operatorNorm() const;
 
-    JacobiSVD<PlainObject> jacobiSvd(unsigned int computationOptions = 0) const;
+/////////// SVD module ///////////
 
-    #ifdef EIGEN2_SUPPORT
-    SVD<PlainObject> svd() const;
-    #endif
+    inline JacobiSVD<PlainObject> jacobiSvd(unsigned int computationOptions = 0) const;
+    inline BDCSVD<PlainObject>    bdcSvd(unsigned int computationOptions = 0) const;
 
 /////////// Geometry module ///////////
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     /// \internal helper struct to form the return type of the cross product
     template<typename OtherDerived> struct cross_product_return_type {
-      typedef typename internal::scalar_product_traits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType Scalar;
+      typedef typename ScalarBinaryOpTraits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType Scalar;
       typedef Matrix<Scalar,MatrixBase::RowsAtCompileTime,MatrixBase::ColsAtCompileTime> type;
     };
     #endif // EIGEN_PARSED_BY_DOXYGEN
     template<typename OtherDerived>
-    typename cross_product_return_type<OtherDerived>::type
+    EIGEN_DEVICE_FUNC
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    inline typename cross_product_return_type<OtherDerived>::type
+#else
+    inline PlainObject
+#endif
     cross(const MatrixBase<OtherDerived>& other) const;
+
     template<typename OtherDerived>
-    PlainObject cross3(const MatrixBase<OtherDerived>& other) const;
-    PlainObject unitOrthogonal(void) const;
-    Matrix<Scalar,3,1> eulerAngles(Index a0, Index a1, Index a2) const;
-    
-    #if EIGEN2_SUPPORT_STAGE > STAGE20_RESOLVE_API_CONFLICTS
-    ScalarMultipleReturnType operator*(const UniformScaling<Scalar>& s) const;
+    EIGEN_DEVICE_FUNC
+    inline PlainObject cross3(const MatrixBase<OtherDerived>& other) const;
+
+    EIGEN_DEVICE_FUNC
+    inline PlainObject unitOrthogonal(void) const;
+
+    EIGEN_DEVICE_FUNC
+    inline Matrix<Scalar,3,1> eulerAngles(Index a0, Index a1, Index a2) const;
+
     // put this as separate enum value to work around possible GCC 4.3 bug (?)
-    enum { HomogeneousReturnTypeDirection = ColsAtCompileTime==1?Vertical:Horizontal };
+    enum { HomogeneousReturnTypeDirection = ColsAtCompileTime==1&&RowsAtCompileTime==1 ? ((internal::traits<Derived>::Flags&RowMajorBit)==RowMajorBit ? Horizontal : Vertical)
+                                          : ColsAtCompileTime==1 ? Vertical : Horizontal };
     typedef Homogeneous<Derived, HomogeneousReturnTypeDirection> HomogeneousReturnType;
-    HomogeneousReturnType homogeneous() const;
-    #endif
-    
+    EIGEN_DEVICE_FUNC
+    inline HomogeneousReturnType homogeneous() const;
+
     enum {
       SizeMinusOne = SizeAtCompileTime==Dynamic ? Dynamic : SizeAtCompileTime-1
     };
     typedef Block<const Derived,
                   internal::traits<Derived>::ColsAtCompileTime==1 ? SizeMinusOne : 1,
                   internal::traits<Derived>::ColsAtCompileTime==1 ? 1 : SizeMinusOne> ConstStartMinusOne;
-    typedef CwiseUnaryOp<internal::scalar_quotient1_op<typename internal::traits<Derived>::Scalar>,
-                const ConstStartMinusOne > HNormalizedReturnType;
-
-    const HNormalizedReturnType hnormalized() const;
+    typedef EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(ConstStartMinusOne,Scalar,quotient) HNormalizedReturnType;
+    EIGEN_DEVICE_FUNC
+    inline const HNormalizedReturnType hnormalized() const;
 
 ////////// Householder module ///////////
 
@@ -440,6 +441,15 @@ template<typename Derived> class MatrixBase
     template<typename OtherScalar>
     void applyOnTheRight(Index p, Index q, const JacobiRotation<OtherScalar>& j);
 
+///////// SparseCore module /////////
+
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE const typename SparseMatrixBase<OtherDerived>::template CwiseProductDenseReturnType<Derived>::Type
+    cwiseProduct(const SparseMatrixBase<OtherDerived> &other) const
+    {
+      return other.cwiseProduct(derived());
+    }
+
 ///////// MatrixFunctions module /////////
 
     typedef typename internal::stem_function<Scalar>::type StemFunction;
@@ -452,49 +462,15 @@ template<typename Derived> class MatrixBase
     const MatrixSquareRootReturnValue<Derived> sqrt() const;
     const MatrixLogarithmReturnValue<Derived> log() const;
     const MatrixPowerReturnValue<Derived> pow(const RealScalar& p) const;
-
-#ifdef EIGEN2_SUPPORT
-    template<typename ProductDerived, typename Lhs, typename Rhs>
-    Derived& operator+=(const Flagged<ProductBase<ProductDerived, Lhs,Rhs>, 0,
-                                      EvalBeforeAssigningBit>& other);
-
-    template<typename ProductDerived, typename Lhs, typename Rhs>
-    Derived& operator-=(const Flagged<ProductBase<ProductDerived, Lhs,Rhs>, 0,
-                                      EvalBeforeAssigningBit>& other);
-
-    /** \deprecated because .lazy() is deprecated
-      * Overloaded for cache friendly product evaluation */
-    template<typename OtherDerived>
-    Derived& lazyAssign(const Flagged<OtherDerived, 0, EvalBeforeAssigningBit>& other)
-    { return lazyAssign(other._expression()); }
-
-    template<unsigned int Added>
-    const Flagged<Derived, Added, 0> marked() const;
-    const Flagged<Derived, 0, EvalBeforeAssigningBit> lazy() const;
-
-    inline const Cwise<Derived> cwise() const;
-    inline Cwise<Derived> cwise();
-
-    VectorBlock<Derived> start(Index size);
-    const VectorBlock<const Derived> start(Index size) const;
-    VectorBlock<Derived> end(Index size);
-    const VectorBlock<const Derived> end(Index size) const;
-    template<int Size> VectorBlock<Derived,Size> start();
-    template<int Size> const VectorBlock<const Derived,Size> start() const;
-    template<int Size> VectorBlock<Derived,Size> end();
-    template<int Size> const VectorBlock<const Derived,Size> end() const;
-
-    Minor<Derived> minor(Index row, Index col);
-    const Minor<Derived> minor(Index row, Index col) const;
-#endif
+    const MatrixComplexPowerReturnValue<Derived> pow(const std::complex<RealScalar>& p) const;
 
   protected:
-    MatrixBase() : Base() {}
+    EIGEN_DEVICE_FUNC MatrixBase() : Base() {}
 
   private:
-    explicit MatrixBase(int);
-    MatrixBase(int,int);
-    template<typename OtherDerived> explicit MatrixBase(const MatrixBase<OtherDerived>&);
+    EIGEN_DEVICE_FUNC explicit MatrixBase(int);
+    EIGEN_DEVICE_FUNC MatrixBase(int,int);
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC explicit MatrixBase(const MatrixBase<OtherDerived>&);
   protected:
     // mixing arrays and matrices is not legal
     template<typename OtherDerived> Derived& operator+=(const ArrayBase<OtherDerived>& )
diff --git a/Eigen/src/Core/NestByValue.h b/Eigen/src/Core/NestByValue.h
index a893b1761..13adf070e 100644
--- a/Eigen/src/Core/NestByValue.h
+++ b/Eigen/src/Core/NestByValue.h
@@ -13,25 +13,24 @@
 
 namespace Eigen {
 
+namespace internal {
+template<typename ExpressionType>
+struct traits<NestByValue<ExpressionType> > : public traits<ExpressionType>
+{};
+}
+
 /** \class NestByValue
   * \ingroup Core_Module
   *
   * \brief Expression which must be nested by value
   *
-  * \param ExpressionType the type of the object of which we are requiring nesting-by-value
+  * \tparam ExpressionType the type of the object of which we are requiring nesting-by-value
   *
   * This class is the return type of MatrixBase::nestByValue()
   * and most of the time this is the only way it is used.
   *
   * \sa MatrixBase::nestByValue()
   */
-
-namespace internal {
-template<typename ExpressionType>
-struct traits<NestByValue<ExpressionType> > : public traits<ExpressionType>
-{};
-}
-
 template<typename ExpressionType> class NestByValue
   : public internal::dense_xpr_base< NestByValue<ExpressionType> >::type
 {
@@ -40,29 +39,29 @@ template<typename ExpressionType> class NestByValue
     typedef typename internal::dense_xpr_base<NestByValue>::type Base;
     EIGEN_DENSE_PUBLIC_INTERFACE(NestByValue)
 
-    inline NestByValue(const ExpressionType& matrix) : m_expression(matrix) {}
+    EIGEN_DEVICE_FUNC explicit inline NestByValue(const ExpressionType& matrix) : m_expression(matrix) {}
 
-    inline Index rows() const { return m_expression.rows(); }
-    inline Index cols() const { return m_expression.cols(); }
-    inline Index outerStride() const { return m_expression.outerStride(); }
-    inline Index innerStride() const { return m_expression.innerStride(); }
+    EIGEN_DEVICE_FUNC inline Index rows() const { return m_expression.rows(); }
+    EIGEN_DEVICE_FUNC inline Index cols() const { return m_expression.cols(); }
+    EIGEN_DEVICE_FUNC inline Index outerStride() const { return m_expression.outerStride(); }
+    EIGEN_DEVICE_FUNC inline Index innerStride() const { return m_expression.innerStride(); }
 
-    inline const CoeffReturnType coeff(Index row, Index col) const
+    EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index row, Index col) const
     {
       return m_expression.coeff(row, col);
     }
 
-    inline Scalar& coeffRef(Index row, Index col)
+    EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index row, Index col)
     {
       return m_expression.const_cast_derived().coeffRef(row, col);
     }
 
-    inline const CoeffReturnType coeff(Index index) const
+    EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index index) const
     {
       return m_expression.coeff(index);
     }
 
-    inline Scalar& coeffRef(Index index)
+    EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index index)
     {
       return m_expression.const_cast_derived().coeffRef(index);
     }
@@ -91,7 +90,7 @@ template<typename ExpressionType> class NestByValue
       m_expression.const_cast_derived().template writePacket<LoadMode>(index, x);
     }
 
-    operator const ExpressionType&() const { return m_expression; }
+    EIGEN_DEVICE_FUNC operator const ExpressionType&() const { return m_expression; }
 
   protected:
     const ExpressionType m_expression;
diff --git a/Eigen/src/Core/NoAlias.h b/Eigen/src/Core/NoAlias.h
index 768bfb18c..33908010b 100644
--- a/Eigen/src/Core/NoAlias.h
+++ b/Eigen/src/Core/NoAlias.h
@@ -17,7 +17,7 @@ namespace Eigen {
   *
   * \brief Pseudo expression providing an operator = assuming no aliasing
   *
-  * \param ExpressionType the type of the object on which to do the lazy assignment
+  * \tparam ExpressionType the type of the object on which to do the lazy assignment
   *
   * This class represents an expression with special assignment operators
   * assuming no aliasing between the target expression and the source expression.
@@ -30,62 +30,36 @@ namespace Eigen {
 template<typename ExpressionType, template <typename> class StorageBase>
 class NoAlias
 {
-    typedef typename ExpressionType::Scalar Scalar;
   public:
-    NoAlias(ExpressionType& expression) : m_expression(expression) {}
-
-    /** Behaves like MatrixBase::lazyAssign(other)
-      * \sa MatrixBase::lazyAssign() */
+    typedef typename ExpressionType::Scalar Scalar;
+    
+    explicit NoAlias(ExpressionType& expression) : m_expression(expression) {}
+    
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE ExpressionType& operator=(const StorageBase<OtherDerived>& other)
-    { return internal::assign_selector<ExpressionType,OtherDerived,false>::run(m_expression,other.derived()); }
-
-    /** \sa MatrixBase::operator+= */
+    {
+      call_assignment_no_alias(m_expression, other.derived(), internal::assign_op<Scalar,typename OtherDerived::Scalar>());
+      return m_expression;
+    }
+    
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE ExpressionType& operator+=(const StorageBase<OtherDerived>& other)
     {
-      typedef SelfCwiseBinaryOp<internal::scalar_sum_op<Scalar>, ExpressionType, OtherDerived> SelfAdder;
-      SelfAdder tmp(m_expression);
-      typedef typename internal::nested<OtherDerived>::type OtherDerivedNested;
-      typedef typename internal::remove_all<OtherDerivedNested>::type _OtherDerivedNested;
-      internal::assign_selector<SelfAdder,_OtherDerivedNested,false>::run(tmp,OtherDerivedNested(other.derived()));
+      call_assignment_no_alias(m_expression, other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
       return m_expression;
     }
-
-    /** \sa MatrixBase::operator-= */
+    
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE ExpressionType& operator-=(const StorageBase<OtherDerived>& other)
     {
-      typedef SelfCwiseBinaryOp<internal::scalar_difference_op<Scalar>, ExpressionType, OtherDerived> SelfAdder;
-      SelfAdder tmp(m_expression);
-      typedef typename internal::nested<OtherDerived>::type OtherDerivedNested;
-      typedef typename internal::remove_all<OtherDerivedNested>::type _OtherDerivedNested;
-      internal::assign_selector<SelfAdder,_OtherDerivedNested,false>::run(tmp,OtherDerivedNested(other.derived()));
+      call_assignment_no_alias(m_expression, other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
       return m_expression;
     }
 
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-    template<typename ProductDerived, typename Lhs, typename Rhs>
-    EIGEN_STRONG_INLINE ExpressionType& operator+=(const ProductBase<ProductDerived, Lhs,Rhs>& other)
-    { other.derived().addTo(m_expression); return m_expression; }
-
-    template<typename ProductDerived, typename Lhs, typename Rhs>
-    EIGEN_STRONG_INLINE ExpressionType& operator-=(const ProductBase<ProductDerived, Lhs,Rhs>& other)
-    { other.derived().subTo(m_expression); return m_expression; }
-
-    template<typename Lhs, typename Rhs, int NestingFlags>
-    EIGEN_STRONG_INLINE ExpressionType& operator+=(const CoeffBasedProduct<Lhs,Rhs,NestingFlags>& other)
-    { return m_expression.derived() += CoeffBasedProduct<Lhs,Rhs,NestByRefBit>(other.lhs(), other.rhs()); }
-
-    template<typename Lhs, typename Rhs, int NestingFlags>
-    EIGEN_STRONG_INLINE ExpressionType& operator-=(const CoeffBasedProduct<Lhs,Rhs,NestingFlags>& other)
-    { return m_expression.derived() -= CoeffBasedProduct<Lhs,Rhs,NestByRefBit>(other.lhs(), other.rhs()); }
-    
-    template<typename OtherDerived>
-    ExpressionType& operator=(const ReturnByValue<OtherDerived>& func)
-    { return m_expression = func; }
-#endif
-
+    EIGEN_DEVICE_FUNC
     ExpressionType& expression() const
     {
       return m_expression;
@@ -126,7 +100,7 @@ class NoAlias
 template<typename Derived>
 NoAlias<Derived,MatrixBase> MatrixBase<Derived>::noalias()
 {
-  return derived();
+  return NoAlias<Derived, Eigen::MatrixBase >(derived());
 }
 
 } // end namespace Eigen
diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h
index bac9e50b8..dd61195bc 100644
--- a/Eigen/src/Core/NumTraits.h
+++ b/Eigen/src/Core/NumTraits.h
@@ -12,24 +12,57 @@
 
 namespace Eigen {
 
+namespace internal {
+
+// default implementation of digits10(), based on numeric_limits if specialized,
+// 0 for integer types, and log10(epsilon()) otherwise.
+template< typename T,
+          bool use_numeric_limits = std::numeric_limits<T>::is_specialized,
+          bool is_integer = NumTraits<T>::IsInteger>
+struct default_digits10_impl
+{
+  static int run() { return std::numeric_limits<T>::digits10; }
+};
+
+template<typename T>
+struct default_digits10_impl<T,false,false> // Floating point
+{
+  static int run() {
+    using std::log10;
+    using std::ceil;
+    typedef typename NumTraits<T>::Real Real;
+    return int(ceil(-log10(NumTraits<Real>::epsilon())));
+  }
+};
+
+template<typename T>
+struct default_digits10_impl<T,false,true> // Integer
+{
+  static int run() { return 0; }
+};
+
+} // end namespace internal
+
 /** \class NumTraits
   * \ingroup Core_Module
   *
   * \brief Holds information about the various numeric (i.e. scalar) types allowed by Eigen.
   *
-  * \param T the numeric type at hand
+  * \tparam T the numeric type at hand
   *
   * This class stores enums, typedefs and static methods giving information about a numeric type.
   *
   * The provided data consists of:
-  * \li A typedef \a Real, giving the "real part" type of \a T. If \a T is already real,
-  *     then \a Real is just a typedef to \a T. If \a T is \c std::complex<U> then \a Real
+  * \li A typedef \c Real, giving the "real part" type of \a T. If \a T is already real,
+  *     then \c Real is just a typedef to \a T. If \a T is \c std::complex<U> then \c Real
   *     is a typedef to \a U.
-  * \li A typedef \a NonInteger, giving the type that should be used for operations producing non-integral values,
+  * \li A typedef \c NonInteger, giving the type that should be used for operations producing non-integral values,
   *     such as quotients, square roots, etc. If \a T is a floating-point type, then this typedef just gives
   *     \a T again. Note however that many Eigen functions such as internal::sqrt simply refuse to
   *     take integers. Outside of a few cases, Eigen doesn't do automatic type promotion. Thus, this typedef is
   *     only intended as a helper for code that needs to explicitly promote types.
+  * \li A typedef \c Literal giving the type to use for numeric literals such as "2" or "0.5". For instance, for \c std::complex<U>, Literal is defined as \c U.
+  *     Of course, this type must be fully compatible with \a T. In doubt, just use \a T here.
   * \li A typedef \a Nested giving the type to use to nest a value inside of the expression tree. If you don't know what
   *     this means, just use \a T here.
   * \li An enum value \a IsComplex. It is equal to 1 if \a T is a \c std::complex
@@ -42,10 +75,14 @@ namespace Eigen {
   * \li An enum value \a IsSigned. It is equal to \c 1 if \a T is a signed type and to 0 if \a T is unsigned.
   * \li An enum value \a RequireInitialization. It is equal to \c 1 if the constructor of the numeric type \a T must
   *     be called, and to 0 if it is safe not to call it. Default is 0 if \a T is an arithmetic type, and 1 otherwise.
-  * \li An epsilon() function which, unlike std::numeric_limits::epsilon(), returns a \a Real instead of a \a T.
+  * \li An epsilon() function which, unlike <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/epsilon">std::numeric_limits::epsilon()</a>,
+  *     it returns a \a Real instead of a \a T.
   * \li A dummy_precision() function returning a weak epsilon value. It is mainly used as a default
   *     value by the fuzzy comparison operators.
   * \li highest() and lowest() functions returning the highest and lowest possible values respectively.
+  * \li digits10() function returning the number of decimal digits that can be represented without change. This is
+  *     the analogue of <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/digits10">std::numeric_limits<T>::digits10</a>
+  *     which is used as the default implementation if specialized.
   */
 
 template<typename T> struct GenericNumTraits
@@ -67,22 +104,47 @@ template<typename T> struct GenericNumTraits
                      T
                    >::type NonInteger;
   typedef T Nested;
+  typedef T Literal;
+
+  EIGEN_DEVICE_FUNC
+  static inline Real epsilon()
+  {
+    return numext::numeric_limits<T>::epsilon();
+  }
 
-  static inline Real epsilon() { return std::numeric_limits<T>::epsilon(); }
+  EIGEN_DEVICE_FUNC
+  static inline int digits10()
+  {
+    return internal::default_digits10_impl<T>::run();
+  }
+
+  EIGEN_DEVICE_FUNC
   static inline Real dummy_precision()
   {
     // make sure to override this for floating-point types
     return Real(0);
   }
-  static inline T highest() { return (std::numeric_limits<T>::max)(); }
-  static inline T lowest()  { return IsInteger ? (std::numeric_limits<T>::min)() : (-(std::numeric_limits<T>::max)()); }
-  
-#ifdef EIGEN2_SUPPORT
-  enum {
-    HasFloatingPoint = !IsInteger
-  };
-  typedef NonInteger FloatingPoint;
-#endif
+
+
+  EIGEN_DEVICE_FUNC
+  static inline T highest() {
+    return (numext::numeric_limits<T>::max)();
+  }
+
+  EIGEN_DEVICE_FUNC
+  static inline T lowest()  {
+    return IsInteger ? (numext::numeric_limits<T>::min)() : (-(numext::numeric_limits<T>::max)());
+  }
+
+  EIGEN_DEVICE_FUNC
+  static inline T infinity() {
+    return numext::numeric_limits<T>::infinity();
+  }
+
+  EIGEN_DEVICE_FUNC
+  static inline T quiet_NaN() {
+    return numext::numeric_limits<T>::quiet_NaN();
+  }
 };
 
 template<typename T> struct NumTraits : GenericNumTraits<T>
@@ -91,11 +153,13 @@ template<typename T> struct NumTraits : GenericNumTraits<T>
 template<> struct NumTraits<float>
   : GenericNumTraits<float>
 {
+  EIGEN_DEVICE_FUNC
   static inline float dummy_precision() { return 1e-5f; }
 };
 
 template<> struct NumTraits<double> : GenericNumTraits<double>
 {
+  EIGEN_DEVICE_FUNC
   static inline double dummy_precision() { return 1e-12; }
 };
 
@@ -109,6 +173,7 @@ template<typename _Real> struct NumTraits<std::complex<_Real> >
   : GenericNumTraits<std::complex<_Real> >
 {
   typedef _Real Real;
+  typedef typename NumTraits<_Real>::Literal Literal;
   enum {
     IsComplex = 1,
     RequireInitialization = NumTraits<_Real>::RequireInitialization,
@@ -117,8 +182,12 @@ template<typename _Real> struct NumTraits<std::complex<_Real> >
     MulCost = 4 * NumTraits<Real>::MulCost + 2 * NumTraits<Real>::AddCost
   };
 
+  EIGEN_DEVICE_FUNC
   static inline Real epsilon() { return NumTraits<Real>::epsilon(); }
+  EIGEN_DEVICE_FUNC
   static inline Real dummy_precision() { return NumTraits<Real>::dummy_precision(); }
+  EIGEN_DEVICE_FUNC
+  static inline int digits10() { return NumTraits<Real>::digits10(); }
 };
 
 template<typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
@@ -130,21 +199,48 @@ struct NumTraits<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
   typedef typename NumTraits<Scalar>::NonInteger NonIntegerScalar;
   typedef Array<NonIntegerScalar, Rows, Cols, Options, MaxRows, MaxCols> NonInteger;
   typedef ArrayType & Nested;
-  
+  typedef typename NumTraits<Scalar>::Literal Literal;
+
   enum {
     IsComplex = NumTraits<Scalar>::IsComplex,
     IsInteger = NumTraits<Scalar>::IsInteger,
     IsSigned  = NumTraits<Scalar>::IsSigned,
     RequireInitialization = 1,
-    ReadCost = ArrayType::SizeAtCompileTime==Dynamic ? Dynamic : ArrayType::SizeAtCompileTime * NumTraits<Scalar>::ReadCost,
-    AddCost  = ArrayType::SizeAtCompileTime==Dynamic ? Dynamic : ArrayType::SizeAtCompileTime * NumTraits<Scalar>::AddCost,
-    MulCost  = ArrayType::SizeAtCompileTime==Dynamic ? Dynamic : ArrayType::SizeAtCompileTime * NumTraits<Scalar>::MulCost
+    ReadCost = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * NumTraits<Scalar>::ReadCost,
+    AddCost  = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * NumTraits<Scalar>::AddCost,
+    MulCost  = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * NumTraits<Scalar>::MulCost
   };
-  
+
+  EIGEN_DEVICE_FUNC
   static inline RealScalar epsilon() { return NumTraits<RealScalar>::epsilon(); }
+  EIGEN_DEVICE_FUNC
   static inline RealScalar dummy_precision() { return NumTraits<RealScalar>::dummy_precision(); }
 };
 
+template<> struct NumTraits<std::string>
+  : GenericNumTraits<std::string>
+{
+  enum {
+    RequireInitialization = 1,
+    ReadCost = HugeCost,
+    AddCost  = HugeCost,
+    MulCost  = HugeCost
+  };
+
+  static inline int digits10() { return 0; }
+
+private:
+  static inline std::string epsilon();
+  static inline std::string dummy_precision();
+  static inline std::string lowest();
+  static inline std::string highest();
+  static inline std::string infinity();
+  static inline std::string quiet_NaN();
+};
+
+// Empty specialization for void to allow template specialization based on NumTraits<T>::Real with T==void and SFINAE.
+template<> struct NumTraits<void> {};
+
 } // end namespace Eigen
 
 #endif // EIGEN_NUMTRAITS_H
diff --git a/Eigen/src/Core/PermutationMatrix.h b/Eigen/src/Core/PermutationMatrix.h
index 85ffae265..b1fb455b9 100644
--- a/Eigen/src/Core/PermutationMatrix.h
+++ b/Eigen/src/Core/PermutationMatrix.h
@@ -2,7 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
-// Copyright (C) 2009-2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2009-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -13,14 +13,18 @@
 
 namespace Eigen { 
 
-template<int RowCol,typename IndicesType,typename MatrixType, typename StorageKind> class PermutedImpl;
+namespace internal {
+
+enum PermPermProduct_t {PermPermProduct};
+
+} // end namespace internal
 
 /** \class PermutationBase
   * \ingroup Core_Module
   *
   * \brief Base class for permutations
   *
-  * \param Derived the derived class
+  * \tparam Derived the derived class
   *
   * This class is the base class for all expressions representing a permutation matrix,
   * internally stored as a vector of integers.
@@ -38,17 +42,6 @@ template<int RowCol,typename IndicesType,typename MatrixType, typename StorageKi
   *
   * \sa class PermutationMatrix, class PermutationWrapper
   */
-
-namespace internal {
-
-template<typename PermutationType, typename MatrixType, int Side, bool Transposed=false>
-struct permut_matrix_product_retval;
-template<typename PermutationType, typename MatrixType, int Side, bool Transposed=false>
-struct permut_sparsematrix_product_retval;
-enum PermPermProduct_t {PermPermProduct};
-
-} // end namespace internal
-
 template<typename Derived>
 class PermutationBase : public EigenBase<Derived>
 {
@@ -60,19 +53,20 @@ class PermutationBase : public EigenBase<Derived>
     typedef typename Traits::IndicesType IndicesType;
     enum {
       Flags = Traits::Flags,
-      CoeffReadCost = Traits::CoeffReadCost,
       RowsAtCompileTime = Traits::RowsAtCompileTime,
       ColsAtCompileTime = Traits::ColsAtCompileTime,
       MaxRowsAtCompileTime = Traits::MaxRowsAtCompileTime,
       MaxColsAtCompileTime = Traits::MaxColsAtCompileTime
     };
-    typedef typename Traits::Scalar Scalar;
-    typedef typename Traits::Index Index;
-    typedef Matrix<Scalar,RowsAtCompileTime,ColsAtCompileTime,0,MaxRowsAtCompileTime,MaxColsAtCompileTime>
+    typedef typename Traits::StorageIndex StorageIndex;
+    typedef Matrix<StorageIndex,RowsAtCompileTime,ColsAtCompileTime,0,MaxRowsAtCompileTime,MaxColsAtCompileTime>
             DenseMatrixType;
-    typedef PermutationMatrix<IndicesType::SizeAtCompileTime,IndicesType::MaxSizeAtCompileTime,Index>
+    typedef PermutationMatrix<IndicesType::SizeAtCompileTime,IndicesType::MaxSizeAtCompileTime,StorageIndex>
             PlainPermutationType;
+    typedef PlainPermutationType PlainObject;
     using Base::derived;
+    typedef Inverse<Derived> InverseReturnType;
+    typedef void Scalar;
     #endif
 
     /** Copies the other permutation into *this */
@@ -118,7 +112,7 @@ class PermutationBase : public EigenBase<Derived>
     void evalTo(MatrixBase<DenseDerived>& other) const
     {
       other.setZero();
-      for (int i=0; i<rows();++i)
+      for (Index i=0; i<rows(); ++i)
         other.coeffRef(indices().coeff(i),i) = typename DenseDerived::Scalar(1);
     }
     #endif
@@ -147,7 +141,8 @@ class PermutationBase : public EigenBase<Derived>
     /** Sets *this to be the identity permutation matrix */
     void setIdentity()
     {
-      for(Index i = 0; i < size(); ++i)
+      StorageIndex n = StorageIndex(size());
+      for(StorageIndex i = 0; i < n; ++i)
         indices().coeffRef(i) = i;
     }
 
@@ -163,18 +158,18 @@ class PermutationBase : public EigenBase<Derived>
       *
       * \returns a reference to *this.
       *
-      * \warning This is much slower than applyTranspositionOnTheRight(int,int):
+      * \warning This is much slower than applyTranspositionOnTheRight(Index,Index):
       * this has linear complexity and requires a lot of branching.
       *
-      * \sa applyTranspositionOnTheRight(int,int)
+      * \sa applyTranspositionOnTheRight(Index,Index)
       */
     Derived& applyTranspositionOnTheLeft(Index i, Index j)
     {
       eigen_assert(i>=0 && j>=0 && i<size() && j<size());
       for(Index k = 0; k < size(); ++k)
       {
-        if(indices().coeff(k) == i) indices().coeffRef(k) = j;
-        else if(indices().coeff(k) == j) indices().coeffRef(k) = i;
+        if(indices().coeff(k) == i) indices().coeffRef(k) = StorageIndex(j);
+        else if(indices().coeff(k) == j) indices().coeffRef(k) = StorageIndex(i);
       }
       return derived();
     }
@@ -185,7 +180,7 @@ class PermutationBase : public EigenBase<Derived>
       *
       * This is a fast operation, it only consists in swapping two indices.
       *
-      * \sa applyTranspositionOnTheLeft(int,int)
+      * \sa applyTranspositionOnTheLeft(Index,Index)
       */
     Derived& applyTranspositionOnTheRight(Index i, Index j)
     {
@@ -196,16 +191,16 @@ class PermutationBase : public EigenBase<Derived>
 
     /** \returns the inverse permutation matrix.
       *
-      * \note \note_try_to_help_rvo
+      * \note \blank \note_try_to_help_rvo
       */
-    inline Transpose<PermutationBase> inverse() const
-    { return derived(); }
+    inline InverseReturnType inverse() const
+    { return InverseReturnType(derived()); }
     /** \returns the tranpose permutation matrix.
       *
-      * \note \note_try_to_help_rvo
+      * \note \blank \note_try_to_help_rvo
       */
-    inline Transpose<PermutationBase> transpose() const
-    { return derived(); }
+    inline InverseReturnType transpose() const
+    { return InverseReturnType(derived()); }
 
     /**** multiplication helpers to hopefully get RVO ****/
 
@@ -215,13 +210,13 @@ class PermutationBase : public EigenBase<Derived>
     template<typename OtherDerived>
     void assignTranspose(const PermutationBase<OtherDerived>& other)
     {
-      for (int i=0; i<rows();++i) indices().coeffRef(other.indices().coeff(i)) = i;
+      for (Index i=0; i<rows();++i) indices().coeffRef(other.indices().coeff(i)) = i;
     }
     template<typename Lhs,typename Rhs>
     void assignProduct(const Lhs& lhs, const Rhs& rhs)
     {
       eigen_assert(lhs.cols() == rhs.rows());
-      for (int i=0; i<rows();++i) indices().coeffRef(i) = lhs.indices().coeff(rhs.indices().coeff(i));
+      for (Index i=0; i<rows();++i) indices().coeffRef(i) = lhs.indices().coeff(rhs.indices().coeff(i));
     }
 #endif
 
@@ -229,7 +224,7 @@ class PermutationBase : public EigenBase<Derived>
 
     /** \returns the product permutation matrix.
       *
-      * \note \note_try_to_help_rvo
+      * \note \blank \note_try_to_help_rvo
       */
     template<typename Other>
     inline PlainPermutationType operator*(const PermutationBase<Other>& other) const
@@ -237,18 +232,18 @@ class PermutationBase : public EigenBase<Derived>
 
     /** \returns the product of a permutation with another inverse permutation.
       *
-      * \note \note_try_to_help_rvo
+      * \note \blank \note_try_to_help_rvo
       */
     template<typename Other>
-    inline PlainPermutationType operator*(const Transpose<PermutationBase<Other> >& other) const
+    inline PlainPermutationType operator*(const InverseImpl<Other,PermutationStorage>& other) const
     { return PlainPermutationType(internal::PermPermProduct, *this, other.eval()); }
 
     /** \returns the product of an inverse permutation with another permutation.
       *
-      * \note \note_try_to_help_rvo
+      * \note \blank \note_try_to_help_rvo
       */
     template<typename Other> friend
-    inline PlainPermutationType operator*(const Transpose<PermutationBase<Other> >& other, const PermutationBase& perm)
+    inline PlainPermutationType operator*(const InverseImpl<Other, PermutationStorage>& other, const PermutationBase& perm)
     { return PlainPermutationType(internal::PermPermProduct, other.eval(), perm); }
     
     /** \returns the determinant of the permutation matrix, which is either 1 or -1 depending on the parity of the permutation.
@@ -284,39 +279,43 @@ class PermutationBase : public EigenBase<Derived>
 
 };
 
+namespace internal {
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex>
+struct traits<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageIndex> >
+ : traits<Matrix<_StorageIndex,SizeAtCompileTime,SizeAtCompileTime,0,MaxSizeAtCompileTime,MaxSizeAtCompileTime> >
+{
+  typedef PermutationStorage StorageKind;
+  typedef Matrix<_StorageIndex, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1> IndicesType;
+  typedef _StorageIndex StorageIndex;
+  typedef void Scalar;
+};
+}
+
 /** \class PermutationMatrix
   * \ingroup Core_Module
   *
   * \brief Permutation matrix
   *
-  * \param SizeAtCompileTime the number of rows/cols, or Dynamic
-  * \param MaxSizeAtCompileTime the maximum number of rows/cols, or Dynamic. This optional parameter defaults to SizeAtCompileTime. Most of the time, you should not have to specify it.
-  * \param IndexType the interger type of the indices
+  * \tparam SizeAtCompileTime the number of rows/cols, or Dynamic
+  * \tparam MaxSizeAtCompileTime the maximum number of rows/cols, or Dynamic. This optional parameter defaults to SizeAtCompileTime. Most of the time, you should not have to specify it.
+  * \tparam _StorageIndex the integer type of the indices
   *
   * This class represents a permutation matrix, internally stored as a vector of integers.
   *
   * \sa class PermutationBase, class PermutationWrapper, class DiagonalMatrix
   */
-
-namespace internal {
-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType>
-struct traits<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, IndexType> >
- : traits<Matrix<IndexType,SizeAtCompileTime,SizeAtCompileTime,0,MaxSizeAtCompileTime,MaxSizeAtCompileTime> >
-{
-  typedef IndexType Index;
-  typedef Matrix<IndexType, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1> IndicesType;
-};
-}
-
-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType>
-class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, IndexType> >
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex>
+class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageIndex> >
 {
     typedef PermutationBase<PermutationMatrix> Base;
     typedef internal::traits<PermutationMatrix> Traits;
   public:
 
+    typedef const PermutationMatrix& Nested;
+
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     typedef typename Traits::IndicesType IndicesType;
+    typedef typename Traits::StorageIndex StorageIndex;
     #endif
 
     inline PermutationMatrix()
@@ -324,8 +323,10 @@ class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompile
 
     /** Constructs an uninitialized permutation matrix of given size.
       */
-    inline PermutationMatrix(int size) : m_indices(size)
-    {}
+    explicit inline PermutationMatrix(Index size) : m_indices(size)
+    {
+      eigen_internal_assert(size <= NumTraits<StorageIndex>::highest());
+    }
 
     /** Copy constructor. */
     template<typename OtherDerived>
@@ -346,7 +347,7 @@ class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompile
       * array's size.
       */
     template<typename Other>
-    explicit inline PermutationMatrix(const MatrixBase<Other>& a_indices) : m_indices(a_indices)
+    explicit inline PermutationMatrix(const MatrixBase<Other>& indices) : m_indices(indices)
     {}
 
     /** Convert the Transpositions \a tr to a permutation matrix */
@@ -393,10 +394,13 @@ class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompile
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename Other>
-    PermutationMatrix(const Transpose<PermutationBase<Other> >& other)
-      : m_indices(other.nestedPermutation().size())
+    PermutationMatrix(const InverseImpl<Other,PermutationStorage>& other)
+      : m_indices(other.derived().nestedExpression().size())
     {
-      for (int i=0; i<m_indices.size();++i) m_indices.coeffRef(other.nestedPermutation().indices().coeff(i)) = i;
+      eigen_internal_assert(m_indices.size() <= NumTraits<StorageIndex>::highest());
+      StorageIndex end = StorageIndex(m_indices.size());
+      for (StorageIndex i=0; i<end;++i)
+        m_indices.coeffRef(other.derived().nestedExpression().indices().coeff(i)) = i;
     }
     template<typename Lhs,typename Rhs>
     PermutationMatrix(internal::PermPermProduct_t, const Lhs& lhs, const Rhs& rhs)
@@ -413,18 +417,20 @@ class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompile
 
 
 namespace internal {
-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType, int _PacketAccess>
-struct traits<Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, IndexType>,_PacketAccess> >
- : traits<Matrix<IndexType,SizeAtCompileTime,SizeAtCompileTime,0,MaxSizeAtCompileTime,MaxSizeAtCompileTime> >
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex, int _PacketAccess>
+struct traits<Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageIndex>,_PacketAccess> >
+ : traits<Matrix<_StorageIndex,SizeAtCompileTime,SizeAtCompileTime,0,MaxSizeAtCompileTime,MaxSizeAtCompileTime> >
 {
-  typedef IndexType Index;
-  typedef Map<const Matrix<IndexType, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1>, _PacketAccess> IndicesType;
+  typedef PermutationStorage StorageKind;
+  typedef Map<const Matrix<_StorageIndex, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1>, _PacketAccess> IndicesType;
+  typedef _StorageIndex StorageIndex;
+  typedef void Scalar;
 };
 }
 
-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType, int _PacketAccess>
-class Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, IndexType>,_PacketAccess>
-  : public PermutationBase<Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, IndexType>,_PacketAccess> >
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex, int _PacketAccess>
+class Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageIndex>,_PacketAccess>
+  : public PermutationBase<Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageIndex>,_PacketAccess> >
 {
     typedef PermutationBase<Map> Base;
     typedef internal::traits<Map> Traits;
@@ -432,14 +438,14 @@ class Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, IndexType>,
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     typedef typename Traits::IndicesType IndicesType;
-    typedef typename IndicesType::Scalar Index;
+    typedef typename IndicesType::Scalar StorageIndex;
     #endif
 
-    inline Map(const Index* indicesPtr)
+    inline Map(const StorageIndex* indicesPtr)
       : m_indices(indicesPtr)
     {}
 
-    inline Map(const Index* indicesPtr, Index size)
+    inline Map(const StorageIndex* indicesPtr, Index size)
       : m_indices(indicesPtr,size)
     {}
 
@@ -474,40 +480,36 @@ class Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, IndexType>,
     IndicesType m_indices;
 };
 
-/** \class PermutationWrapper
-  * \ingroup Core_Module
-  *
-  * \brief Class to view a vector of integers as a permutation matrix
-  *
-  * \param _IndicesType the type of the vector of integer (can be any compatible expression)
-  *
-  * This class allows to view any vector expression of integers as a permutation matrix.
-  *
-  * \sa class PermutationBase, class PermutationMatrix
-  */
-
-struct PermutationStorage {};
-
 template<typename _IndicesType> class TranspositionsWrapper;
 namespace internal {
 template<typename _IndicesType>
 struct traits<PermutationWrapper<_IndicesType> >
 {
   typedef PermutationStorage StorageKind;
-  typedef typename _IndicesType::Scalar Scalar;
-  typedef typename _IndicesType::Scalar Index;
+  typedef void Scalar;
+  typedef typename _IndicesType::Scalar StorageIndex;
   typedef _IndicesType IndicesType;
   enum {
     RowsAtCompileTime = _IndicesType::SizeAtCompileTime,
     ColsAtCompileTime = _IndicesType::SizeAtCompileTime,
-    MaxRowsAtCompileTime = IndicesType::MaxRowsAtCompileTime,
-    MaxColsAtCompileTime = IndicesType::MaxColsAtCompileTime,
-    Flags = 0,
-    CoeffReadCost = _IndicesType::CoeffReadCost
+    MaxRowsAtCompileTime = IndicesType::MaxSizeAtCompileTime,
+    MaxColsAtCompileTime = IndicesType::MaxSizeAtCompileTime,
+    Flags = 0
   };
 };
 }
 
+/** \class PermutationWrapper
+  * \ingroup Core_Module
+  *
+  * \brief Class to view a vector of integers as a permutation matrix
+  *
+  * \tparam _IndicesType the type of the vector of integer (can be any compatible expression)
+  *
+  * This class allows to view any vector expression of integers as a permutation matrix.
+  *
+  * \sa class PermutationBase, class PermutationMatrix
+  */
 template<typename _IndicesType>
 class PermutationWrapper : public PermutationBase<PermutationWrapper<_IndicesType> >
 {
@@ -519,8 +521,8 @@ class PermutationWrapper : public PermutationBase<PermutationWrapper<_IndicesTyp
     typedef typename Traits::IndicesType IndicesType;
     #endif
 
-    inline PermutationWrapper(const IndicesType& a_indices)
-      : m_indices(a_indices)
+    inline PermutationWrapper(const IndicesType& indices)
+      : m_indices(indices)
     {}
 
     /** const version of indices(). */
@@ -532,182 +534,86 @@ class PermutationWrapper : public PermutationBase<PermutationWrapper<_IndicesTyp
     typename IndicesType::Nested m_indices;
 };
 
+
 /** \returns the matrix with the permutation applied to the columns.
   */
-template<typename Derived, typename PermutationDerived>
-inline const internal::permut_matrix_product_retval<PermutationDerived, Derived, OnTheRight>
-operator*(const MatrixBase<Derived>& matrix,
-          const PermutationBase<PermutationDerived> &permutation)
+template<typename MatrixDerived, typename PermutationDerived>
+EIGEN_DEVICE_FUNC
+const Product<MatrixDerived, PermutationDerived, AliasFreeProduct>
+operator*(const MatrixBase<MatrixDerived> &matrix,
+          const PermutationBase<PermutationDerived>& permutation)
 {
-  return internal::permut_matrix_product_retval
-           <PermutationDerived, Derived, OnTheRight>
-           (permutation.derived(), matrix.derived());
+  return Product<MatrixDerived, PermutationDerived, AliasFreeProduct>
+            (matrix.derived(), permutation.derived());
 }
 
 /** \returns the matrix with the permutation applied to the rows.
   */
-template<typename Derived, typename PermutationDerived>
-inline const internal::permut_matrix_product_retval
-               <PermutationDerived, Derived, OnTheLeft>
+template<typename PermutationDerived, typename MatrixDerived>
+EIGEN_DEVICE_FUNC
+const Product<PermutationDerived, MatrixDerived, AliasFreeProduct>
 operator*(const PermutationBase<PermutationDerived> &permutation,
-          const MatrixBase<Derived>& matrix)
+          const MatrixBase<MatrixDerived>& matrix)
 {
-  return internal::permut_matrix_product_retval
-           <PermutationDerived, Derived, OnTheLeft>
-           (permutation.derived(), matrix.derived());
+  return Product<PermutationDerived, MatrixDerived, AliasFreeProduct>
+            (permutation.derived(), matrix.derived());
 }
 
-namespace internal {
 
-template<typename PermutationType, typename MatrixType, int Side, bool Transposed>
-struct traits<permut_matrix_product_retval<PermutationType, MatrixType, Side, Transposed> >
+template<typename PermutationType>
+class InverseImpl<PermutationType, PermutationStorage>
+  : public EigenBase<Inverse<PermutationType> >
 {
-  typedef typename MatrixType::PlainObject ReturnType;
-};
-
-template<typename PermutationType, typename MatrixType, int Side, bool Transposed>
-struct permut_matrix_product_retval
- : public ReturnByValue<permut_matrix_product_retval<PermutationType, MatrixType, Side, Transposed> >
-{
-    typedef typename remove_all<typename MatrixType::Nested>::type MatrixTypeNestedCleaned;
-    typedef typename MatrixType::Index Index;
-
-    permut_matrix_product_retval(const PermutationType& perm, const MatrixType& matrix)
-      : m_permutation(perm), m_matrix(matrix)
-    {}
-
-    inline Index rows() const { return m_matrix.rows(); }
-    inline Index cols() const { return m_matrix.cols(); }
-
-    template<typename Dest> inline void evalTo(Dest& dst) const
-    {
-      const Index n = Side==OnTheLeft ? rows() : cols();
-      // FIXME we need an is_same for expression that is not sensitive to constness. For instance
-      // is_same_xpr<Block<const Matrix>, Block<Matrix> >::value should be true.
-      if(    is_same<MatrixTypeNestedCleaned,Dest>::value
-          && blas_traits<MatrixTypeNestedCleaned>::HasUsableDirectAccess
-          && blas_traits<Dest>::HasUsableDirectAccess
-          && extract_data(dst) == extract_data(m_matrix))
-      {
-        // apply the permutation inplace
-        Matrix<bool,PermutationType::RowsAtCompileTime,1,0,PermutationType::MaxRowsAtCompileTime> mask(m_permutation.size());
-        mask.fill(false);
-        Index r = 0;
-        while(r < m_permutation.size())
-        {
-          // search for the next seed
-          while(r<m_permutation.size() && mask[r]) r++;
-          if(r>=m_permutation.size())
-            break;
-          // we got one, let's follow it until we are back to the seed
-          Index k0 = r++;
-          Index kPrev = k0;
-          mask.coeffRef(k0) = true;
-          for(Index k=m_permutation.indices().coeff(k0); k!=k0; k=m_permutation.indices().coeff(k))
-          {
-                  Block<Dest, Side==OnTheLeft ? 1 : Dest::RowsAtCompileTime, Side==OnTheRight ? 1 : Dest::ColsAtCompileTime>(dst, k)
-            .swap(Block<Dest, Side==OnTheLeft ? 1 : Dest::RowsAtCompileTime, Side==OnTheRight ? 1 : Dest::ColsAtCompileTime>
-                       (dst,((Side==OnTheLeft) ^ Transposed) ? k0 : kPrev));
-
-            mask.coeffRef(k) = true;
-            kPrev = k;
-          }
-        }
-      }
-      else
-      {
-        for(int i = 0; i < n; ++i)
-        {
-          Block<Dest, Side==OnTheLeft ? 1 : Dest::RowsAtCompileTime, Side==OnTheRight ? 1 : Dest::ColsAtCompileTime>
-               (dst, ((Side==OnTheLeft) ^ Transposed) ? m_permutation.indices().coeff(i) : i)
-
-          =
-
-          Block<const MatrixTypeNestedCleaned,Side==OnTheLeft ? 1 : MatrixType::RowsAtCompileTime,Side==OnTheRight ? 1 : MatrixType::ColsAtCompileTime>
-               (m_matrix, ((Side==OnTheRight) ^ Transposed) ? m_permutation.indices().coeff(i) : i);
-        }
-      }
-    }
-
-  protected:
-    const PermutationType& m_permutation;
-    typename MatrixType::Nested m_matrix;
-};
-
-/* Template partial specialization for transposed/inverse permutations */
-
-template<typename Derived>
-struct traits<Transpose<PermutationBase<Derived> > >
- : traits<Derived>
-{};
-
-} // end namespace internal
-
-template<typename Derived>
-class Transpose<PermutationBase<Derived> >
-  : public EigenBase<Transpose<PermutationBase<Derived> > >
-{
-    typedef Derived PermutationType;
-    typedef typename PermutationType::IndicesType IndicesType;
     typedef typename PermutationType::PlainPermutationType PlainPermutationType;
+    typedef internal::traits<PermutationType> PermTraits;
+  protected:
+    InverseImpl() {}
   public:
+    typedef Inverse<PermutationType> InverseType;
+    using EigenBase<Inverse<PermutationType> >::derived;
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
-    typedef internal::traits<PermutationType> Traits;
-    typedef typename Derived::DenseMatrixType DenseMatrixType;
+    typedef typename PermutationType::DenseMatrixType DenseMatrixType;
     enum {
-      Flags = Traits::Flags,
-      CoeffReadCost = Traits::CoeffReadCost,
-      RowsAtCompileTime = Traits::RowsAtCompileTime,
-      ColsAtCompileTime = Traits::ColsAtCompileTime,
-      MaxRowsAtCompileTime = Traits::MaxRowsAtCompileTime,
-      MaxColsAtCompileTime = Traits::MaxColsAtCompileTime
+      RowsAtCompileTime = PermTraits::RowsAtCompileTime,
+      ColsAtCompileTime = PermTraits::ColsAtCompileTime,
+      MaxRowsAtCompileTime = PermTraits::MaxRowsAtCompileTime,
+      MaxColsAtCompileTime = PermTraits::MaxColsAtCompileTime
     };
-    typedef typename Traits::Scalar Scalar;
     #endif
 
-    Transpose(const PermutationType& p) : m_permutation(p) {}
-
-    inline int rows() const { return m_permutation.rows(); }
-    inline int cols() const { return m_permutation.cols(); }
-
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename DenseDerived>
     void evalTo(MatrixBase<DenseDerived>& other) const
     {
       other.setZero();
-      for (int i=0; i<rows();++i)
-        other.coeffRef(i, m_permutation.indices().coeff(i)) = typename DenseDerived::Scalar(1);
+      for (Index i=0; i<derived().rows();++i)
+        other.coeffRef(i, derived().nestedExpression().indices().coeff(i)) = typename DenseDerived::Scalar(1);
     }
     #endif
 
     /** \return the equivalent permutation matrix */
-    PlainPermutationType eval() const { return *this; }
+    PlainPermutationType eval() const { return derived(); }
 
-    DenseMatrixType toDenseMatrix() const { return *this; }
+    DenseMatrixType toDenseMatrix() const { return derived(); }
 
     /** \returns the matrix with the inverse permutation applied to the columns.
       */
     template<typename OtherDerived> friend
-    inline const internal::permut_matrix_product_retval<PermutationType, OtherDerived, OnTheRight, true>
-    operator*(const MatrixBase<OtherDerived>& matrix, const Transpose& trPerm)
+    const Product<OtherDerived, InverseType, AliasFreeProduct>
+    operator*(const MatrixBase<OtherDerived>& matrix, const InverseType& trPerm)
     {
-      return internal::permut_matrix_product_retval<PermutationType, OtherDerived, OnTheRight, true>(trPerm.m_permutation, matrix.derived());
+      return Product<OtherDerived, InverseType, AliasFreeProduct>(matrix.derived(), trPerm.derived());
     }
 
     /** \returns the matrix with the inverse permutation applied to the rows.
       */
     template<typename OtherDerived>
-    inline const internal::permut_matrix_product_retval<PermutationType, OtherDerived, OnTheLeft, true>
+    const Product<InverseType, OtherDerived, AliasFreeProduct>
     operator*(const MatrixBase<OtherDerived>& matrix) const
     {
-      return internal::permut_matrix_product_retval<PermutationType, OtherDerived, OnTheLeft, true>(m_permutation, matrix.derived());
+      return Product<InverseType, OtherDerived, AliasFreeProduct>(derived(), matrix.derived());
     }
-
-    const PermutationType& nestedPermutation() const { return m_permutation; }
-
-  protected:
-    const PermutationType& m_permutation;
 };
 
 template<typename Derived>
@@ -716,6 +622,12 @@ const PermutationWrapper<const Derived> MatrixBase<Derived>::asPermutation() con
   return derived();
 }
 
+namespace internal {
+
+template<> struct AssignmentKind<DenseShape,PermutationShape> { typedef EigenBase2EigenBase Kind; };
+
+} // end namespace internal
+
 } // end namespace Eigen
 
 #endif // EIGEN_PERMUTATIONMATRIX_H
diff --git a/Eigen/src/Core/PlainObjectBase.h b/Eigen/src/Core/PlainObjectBase.h
index 8a3e4545a..77f4f6066 100644
--- a/Eigen/src/Core/PlainObjectBase.h
+++ b/Eigen/src/Core/PlainObjectBase.h
@@ -28,6 +28,7 @@ namespace internal {
 
 template<int MaxSizeAtCompileTime> struct check_rows_cols_for_overflow {
   template<typename Index>
+  EIGEN_DEVICE_FUNC
   static EIGEN_ALWAYS_INLINE void run(Index, Index)
   {
   }
@@ -35,11 +36,12 @@ template<int MaxSizeAtCompileTime> struct check_rows_cols_for_overflow {
 
 template<> struct check_rows_cols_for_overflow<Dynamic> {
   template<typename Index>
+  EIGEN_DEVICE_FUNC
   static EIGEN_ALWAYS_INLINE void run(Index rows, Index cols)
   {
     // http://hg.mozilla.org/mozilla-central/file/6c8a909977d3/xpcom/ds/CheckedInt.h#l242
     // we assume Index is signed
-    Index max_index = (size_t(1) << (8 * sizeof(Index) - 1)) - 1; // assume Index is signed
+    Index max_index = (std::size_t(1) << (8 * sizeof(Index) - 1)) - 1; // assume Index is signed
     bool error = (rows == 0 || cols == 0) ? false
                : (rows > max_index / cols);
     if (error)
@@ -56,33 +58,41 @@ template<typename MatrixTypeA, typename MatrixTypeB, bool SwapPointers> struct m
 
 } // end namespace internal
 
-/** \class PlainObjectBase
-  * \brief %Dense storage base class for matrices and arrays.
-  *
-  * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_PLAINOBJECTBASE_PLUGIN.
-  *
-  * \sa \ref TopicClassHierarchy
-  */
 #ifdef EIGEN_PARSED_BY_DOXYGEN
-namespace internal {
+namespace doxygen {
 
-// this is a warkaround to doxygen not being able to understand the inheritence logic
+// This is a workaround to doxygen not being able to understand the inheritance logic
 // when it is hidden by the dense_xpr_base helper struct.
-template<typename Derived> struct dense_xpr_base_dispatcher_for_doxygen;// : public MatrixBase<Derived> {};
+// Moreover, doxygen fails to include members that are not documented in the declaration body of
+// MatrixBase if we inherits MatrixBase<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >,
+// this is why we simply inherits MatrixBase, though this does not make sense.
+
+/** This class is just a workaround for Doxygen and it does not not actually exist. */
+template<typename Derived> struct dense_xpr_base_dispatcher;
 /** This class is just a workaround for Doxygen and it does not not actually exist. */
 template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
-struct dense_xpr_base_dispatcher_for_doxygen<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
-    : public MatrixBase<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > {};
+struct dense_xpr_base_dispatcher<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
+    : public MatrixBase {};
 /** This class is just a workaround for Doxygen and it does not not actually exist. */
 template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
-struct dense_xpr_base_dispatcher_for_doxygen<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
-    : public ArrayBase<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > {};
+struct dense_xpr_base_dispatcher<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
+    : public ArrayBase {};
 
-} // namespace internal
+} // namespace doxygen
 
+/** \class PlainObjectBase
+  * \ingroup Core_Module
+  * \brief %Dense storage base class for matrices and arrays.
+  *
+  * This class can be extended with the help of the plugin mechanism described on the page
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_PLAINOBJECTBASE_PLUGIN.
+  *
+  * \tparam Derived is the derived type, e.g., a Matrix or Array
+  *
+  * \sa \ref TopicClassHierarchy
+  */
 template<typename Derived>
-class PlainObjectBase : public internal::dense_xpr_base_dispatcher_for_doxygen<Derived>
+class PlainObjectBase : public doxygen::dense_xpr_base_dispatcher<Derived>
 #else
 template<typename Derived>
 class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
@@ -93,8 +103,8 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
     typedef typename internal::dense_xpr_base<Derived>::type Base;
 
     typedef typename internal::traits<Derived>::StorageKind StorageKind;
-    typedef typename internal::traits<Derived>::Index Index;
     typedef typename internal::traits<Derived>::Scalar Scalar;
+    
     typedef typename internal::packet_traits<Scalar>::type PacketScalar;
     typedef typename NumTraits<Scalar>::Real RealScalar;
     typedef Derived DenseType;
@@ -113,28 +123,40 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
     typedef Eigen::Map<Derived, Unaligned>  MapType;
     friend  class Eigen::Map<const Derived, Unaligned>;
     typedef const Eigen::Map<const Derived, Unaligned> ConstMapType;
-    friend  class Eigen::Map<Derived, Aligned>;
-    typedef Eigen::Map<Derived, Aligned> AlignedMapType;
-    friend  class Eigen::Map<const Derived, Aligned>;
-    typedef const Eigen::Map<const Derived, Aligned> ConstAlignedMapType;
+#if EIGEN_MAX_ALIGN_BYTES>0
+    // for EIGEN_MAX_ALIGN_BYTES==0, AlignedMax==Unaligned, and many compilers generate warnings for friend-ing a class twice.
+    friend  class Eigen::Map<Derived, AlignedMax>;
+    friend  class Eigen::Map<const Derived, AlignedMax>;
+#endif
+    typedef Eigen::Map<Derived, AlignedMax> AlignedMapType;
+    typedef const Eigen::Map<const Derived, AlignedMax> ConstAlignedMapType;
     template<typename StrideType> struct StridedMapType { typedef Eigen::Map<Derived, Unaligned, StrideType> type; };
     template<typename StrideType> struct StridedConstMapType { typedef Eigen::Map<const Derived, Unaligned, StrideType> type; };
-    template<typename StrideType> struct StridedAlignedMapType { typedef Eigen::Map<Derived, Aligned, StrideType> type; };
-    template<typename StrideType> struct StridedConstAlignedMapType { typedef Eigen::Map<const Derived, Aligned, StrideType> type; };
+    template<typename StrideType> struct StridedAlignedMapType { typedef Eigen::Map<Derived, AlignedMax, StrideType> type; };
+    template<typename StrideType> struct StridedConstAlignedMapType { typedef Eigen::Map<const Derived, AlignedMax, StrideType> type; };
 
   protected:
     DenseStorage<Scalar, Base::MaxSizeAtCompileTime, Base::RowsAtCompileTime, Base::ColsAtCompileTime, Options> m_storage;
 
   public:
-    enum { NeedsToAlign = SizeAtCompileTime != Dynamic && (internal::traits<Derived>::Flags & AlignedBit) != 0 };
+    enum { NeedsToAlign = (SizeAtCompileTime != Dynamic) && (internal::traits<Derived>::Alignment>0) };
     EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign)
 
+    EIGEN_DEVICE_FUNC
     Base& base() { return *static_cast<Base*>(this); }
+    EIGEN_DEVICE_FUNC
     const Base& base() const { return *static_cast<const Base*>(this); }
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Index rows() const { return m_storage.rows(); }
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Index cols() const { return m_storage.cols(); }
 
+    /** This is an overloaded version of DenseCoeffsBase<Derived,ReadOnlyAccessors>::coeff(Index,Index) const
+      * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts.
+      *
+      * See DenseCoeffsBase<Derived,ReadOnlyAccessors>::coeff(Index) const for details. */
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const Scalar& coeff(Index rowId, Index colId) const
     {
       if(Flags & RowMajorBit)
@@ -143,11 +165,21 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
         return m_storage.data()[rowId + colId * m_storage.rows()];
     }
 
+    /** This is an overloaded version of DenseCoeffsBase<Derived,ReadOnlyAccessors>::coeff(Index) const
+      * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts.
+      *
+      * See DenseCoeffsBase<Derived,ReadOnlyAccessors>::coeff(Index) const for details. */
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const
     {
       return m_storage.data()[index];
     }
 
+    /** This is an overloaded version of DenseCoeffsBase<Derived,WriteAccessors>::coeffRef(Index,Index) const
+      * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts.
+      *
+      * See DenseCoeffsBase<Derived,WriteAccessors>::coeffRef(Index,Index) const for details. */
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Scalar& coeffRef(Index rowId, Index colId)
     {
       if(Flags & RowMajorBit)
@@ -156,11 +188,19 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
         return m_storage.data()[rowId + colId * m_storage.rows()];
     }
 
+    /** This is an overloaded version of DenseCoeffsBase<Derived,WriteAccessors>::coeffRef(Index) const
+      * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts.
+      *
+      * See DenseCoeffsBase<Derived,WriteAccessors>::coeffRef(Index) const for details. */
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
     {
       return m_storage.data()[index];
     }
 
+    /** This is the const version of coeffRef(Index,Index) which is thus synonym of coeff(Index,Index).
+      * It is provided for convenience. */
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const Scalar& coeffRef(Index rowId, Index colId) const
     {
       if(Flags & RowMajorBit)
@@ -169,6 +209,9 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
         return m_storage.data()[rowId + colId * m_storage.rows()];
     }
 
+    /** This is the const version of coeffRef(Index) which is thus synonym of coeff(Index).
+      * It is provided for convenience. */
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const Scalar& coeffRef(Index index) const
     {
       return m_storage.data()[index];
@@ -209,11 +252,11 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
     }
 
     /** \returns a const pointer to the data array of this matrix */
-    EIGEN_STRONG_INLINE const Scalar *data() const
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar *data() const
     { return m_storage.data(); }
 
     /** \returns a pointer to the data array of this matrix */
-    EIGEN_STRONG_INLINE Scalar *data()
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar *data()
     { return m_storage.data(); }
 
     /** Resizes \c *this to a \a rows x \a cols matrix.
@@ -232,22 +275,22 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
       *
       * \sa resize(Index) for vectors, resize(NoChange_t, Index), resize(Index, NoChange_t)
       */
-    EIGEN_STRONG_INLINE void resize(Index nbRows, Index nbCols)
-    {
-      eigen_assert(   EIGEN_IMPLIES(RowsAtCompileTime!=Dynamic,nbRows==RowsAtCompileTime)
-                   && EIGEN_IMPLIES(ColsAtCompileTime!=Dynamic,nbCols==ColsAtCompileTime)
-                   && EIGEN_IMPLIES(RowsAtCompileTime==Dynamic && MaxRowsAtCompileTime!=Dynamic,nbRows<=MaxRowsAtCompileTime)
-                   && EIGEN_IMPLIES(ColsAtCompileTime==Dynamic && MaxColsAtCompileTime!=Dynamic,nbCols<=MaxColsAtCompileTime)
-                   && nbRows>=0 && nbCols>=0 && "Invalid sizes when resizing a matrix or array.");
-      internal::check_rows_cols_for_overflow<MaxSizeAtCompileTime>::run(nbRows, nbCols);
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void resize(Index rows, Index cols)
+    {
+      eigen_assert(   EIGEN_IMPLIES(RowsAtCompileTime!=Dynamic,rows==RowsAtCompileTime)
+                   && EIGEN_IMPLIES(ColsAtCompileTime!=Dynamic,cols==ColsAtCompileTime)
+                   && EIGEN_IMPLIES(RowsAtCompileTime==Dynamic && MaxRowsAtCompileTime!=Dynamic,rows<=MaxRowsAtCompileTime)
+                   && EIGEN_IMPLIES(ColsAtCompileTime==Dynamic && MaxColsAtCompileTime!=Dynamic,cols<=MaxColsAtCompileTime)
+                   && rows>=0 && cols>=0 && "Invalid sizes when resizing a matrix or array.");
+      internal::check_rows_cols_for_overflow<MaxSizeAtCompileTime>::run(rows, cols);
       #ifdef EIGEN_INITIALIZE_COEFFS
-        Index size = nbRows*nbCols;
+        Index size = rows*cols;
         bool size_changed = size != this->size();
-        m_storage.resize(size, nbRows, nbCols);
+        m_storage.resize(size, rows, cols);
         if(size_changed) EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
       #else
-        internal::check_rows_cols_for_overflow<MaxSizeAtCompileTime>::run(nbRows, nbCols);
-        m_storage.resize(nbRows*nbCols, nbRows, nbCols);
+        m_storage.resize(rows*cols, rows, cols);
       #endif
     }
 
@@ -262,6 +305,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
       *
       * \sa resize(Index,Index), resize(NoChange_t, Index), resize(Index, NoChange_t)
       */
+    EIGEN_DEVICE_FUNC
     inline void resize(Index size)
     {
       EIGEN_STATIC_ASSERT_VECTOR_ONLY(PlainObjectBase)
@@ -286,9 +330,10 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
       *
       * \sa resize(Index,Index)
       */
-    inline void resize(NoChange_t, Index nbCols)
+    EIGEN_DEVICE_FUNC
+    inline void resize(NoChange_t, Index cols)
     {
-      resize(rows(), nbCols);
+      resize(rows(), cols);
     }
 
     /** Resizes the matrix, changing only the number of rows. For the parameter of type NoChange_t, just pass the special value \c NoChange
@@ -299,9 +344,10 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
       *
       * \sa resize(Index,Index)
       */
-    inline void resize(Index nbRows, NoChange_t)
+    EIGEN_DEVICE_FUNC
+    inline void resize(Index rows, NoChange_t)
     {
-      resize(nbRows, cols());
+      resize(rows, cols());
     }
 
     /** Resizes \c *this to have the same dimensions as \a other.
@@ -312,6 +358,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
       * remain row-vectors and vectors remain vectors.
       */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC 
     EIGEN_STRONG_INLINE void resizeLike(const EigenBase<OtherDerived>& _other)
     {
       const OtherDerived& other = _other.derived();
@@ -339,9 +386,10 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
       * Matrices are resized relative to the top-left element. In case values need to be 
       * appended to the matrix they will be uninitialized.
       */
-    EIGEN_STRONG_INLINE void conservativeResize(Index nbRows, Index nbCols)
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void conservativeResize(Index rows, Index cols)
     {
-      internal::conservative_resize_like_impl<Derived>::run(*this, nbRows, nbCols);
+      internal::conservative_resize_like_impl<Derived>::run(*this, rows, cols);
     }
 
     /** Resizes the matrix to \a rows x \a cols while leaving old values untouched.
@@ -351,10 +399,11 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
       *
       * In case the matrix is growing, new rows will be uninitialized.
       */
-    EIGEN_STRONG_INLINE void conservativeResize(Index nbRows, NoChange_t)
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void conservativeResize(Index rows, NoChange_t)
     {
       // Note: see the comment in conservativeResize(Index,Index)
-      conservativeResize(nbRows, cols());
+      conservativeResize(rows, cols());
     }
 
     /** Resizes the matrix to \a rows x \a cols while leaving old values untouched.
@@ -364,10 +413,11 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
       *
       * In case the matrix is growing, new columns will be uninitialized.
       */
-    EIGEN_STRONG_INLINE void conservativeResize(NoChange_t, Index nbCols)
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void conservativeResize(NoChange_t, Index cols)
     {
       // Note: see the comment in conservativeResize(Index,Index)
-      conservativeResize(rows(), nbCols);
+      conservativeResize(rows(), cols);
     }
 
     /** Resizes the vector to \a size while retaining old values.
@@ -378,6 +428,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
       *
       * When values are appended, they will be uninitialized.
       */
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE void conservativeResize(Index size)
     {
       internal::conservative_resize_like_impl<Derived>::run(*this, size);
@@ -393,6 +444,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
       * appended to the matrix they will copied from \c other.
       */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE void conservativeResizeLike(const DenseBase<OtherDerived>& other)
     {
       internal::conservative_resize_like_impl<Derived,OtherDerived>::run(*this, other);
@@ -401,6 +453,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
     /** This is a special case of the templated operator=. Its purpose is to
       * prevent a default operator= from hiding the templated operator=.
       */
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Derived& operator=(const PlainObjectBase& other)
     {
       return _set(other);
@@ -408,6 +461,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
 
     /** \sa MatrixBase::lazyAssign() */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Derived& lazyAssign(const DenseBase<OtherDerived>& other)
     {
       _resize_to_match(other);
@@ -415,12 +469,18 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
     }
 
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Derived& operator=(const ReturnByValue<OtherDerived>& func)
     {
       resize(func.rows(), func.cols());
       return Base::operator=(func);
     }
 
+    // Prevent user from trying to instantiate PlainObjectBase objects
+    // by making all its constructor protected. See bug 1074.
+  protected:
+
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE PlainObjectBase() : m_storage()
     {
 //       _check_template_params();
@@ -430,38 +490,86 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     // FIXME is it still needed ?
     /** \internal */
-    PlainObjectBase(internal::constructor_without_unaligned_array_assert)
+    EIGEN_DEVICE_FUNC
+    explicit PlainObjectBase(internal::constructor_without_unaligned_array_assert)
       : m_storage(internal::constructor_without_unaligned_array_assert())
     {
 //       _check_template_params(); EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
     }
 #endif
 
-    EIGEN_STRONG_INLINE PlainObjectBase(Index a_size, Index nbRows, Index nbCols)
-      : m_storage(a_size, nbRows, nbCols)
+#if EIGEN_HAS_RVALUE_REFERENCES
+    EIGEN_DEVICE_FUNC
+    PlainObjectBase(PlainObjectBase&& other) EIGEN_NOEXCEPT
+      : m_storage( std::move(other.m_storage) )
+    {
+    }
+
+    EIGEN_DEVICE_FUNC
+    PlainObjectBase& operator=(PlainObjectBase&& other) EIGEN_NOEXCEPT
+    {
+      using std::swap;
+      swap(m_storage, other.m_storage);
+      return *this;
+    }
+#endif
+
+    /** Copy constructor */
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE PlainObjectBase(const PlainObjectBase& other)
+      : Base(), m_storage(other.m_storage) { }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE PlainObjectBase(Index size, Index rows, Index cols)
+      : m_storage(size, rows, cols)
     {
 //       _check_template_params();
 //       EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
     }
 
-    /** \copydoc MatrixBase::operator=(const EigenBase<OtherDerived>&)
-      */
+    /** \sa PlainObjectBase::operator=(const EigenBase<OtherDerived>&) */
     template<typename OtherDerived>
-    EIGEN_STRONG_INLINE Derived& operator=(const EigenBase<OtherDerived> &other)
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE PlainObjectBase(const DenseBase<OtherDerived> &other)
+      : m_storage()
     {
-      _resize_to_match(other);
-      Base::operator=(other.derived());
-      return this->derived();
+      _check_template_params();
+      resizeLike(other);
+      _set_noalias(other);
     }
 
-    /** \sa MatrixBase::operator=(const EigenBase<OtherDerived>&) */
+    /** \sa PlainObjectBase::operator=(const EigenBase<OtherDerived>&) */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE PlainObjectBase(const EigenBase<OtherDerived> &other)
-      : m_storage(other.derived().rows() * other.derived().cols(), other.derived().rows(), other.derived().cols())
+      : m_storage()
+    {
+      _check_template_params();
+      resizeLike(other);
+      *this = other.derived();
+    }
+    /** \brief Copy constructor with in-place evaluation */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE PlainObjectBase(const ReturnByValue<OtherDerived>& other)
     {
       _check_template_params();
-      internal::check_rows_cols_for_overflow<MaxSizeAtCompileTime>::run(other.derived().rows(), other.derived().cols());
+      // FIXME this does not automatically transpose vectors if necessary
+      resize(other.rows(), other.cols());
+      other.evalTo(this->derived());
+    }
+
+  public:
+
+    /** \brief Copies the generic expression \a other into *this.
+      * \copydetails DenseBase::operator=(const EigenBase<OtherDerived> &other)
+      */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC 
+    EIGEN_STRONG_INLINE Derived& operator=(const EigenBase<OtherDerived> &other)
+    {
+      _resize_to_match(other);
       Base::operator=(other.derived());
+      return this->derived();
     }
 
     /** \name Map
@@ -538,16 +646,16 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
     //@}
 
     using Base::setConstant;
-    Derived& setConstant(Index size, const Scalar& value);
-    Derived& setConstant(Index rows, Index cols, const Scalar& value);
+    EIGEN_DEVICE_FUNC Derived& setConstant(Index size, const Scalar& val);
+    EIGEN_DEVICE_FUNC Derived& setConstant(Index rows, Index cols, const Scalar& val);
 
     using Base::setZero;
-    Derived& setZero(Index size);
-    Derived& setZero(Index rows, Index cols);
+    EIGEN_DEVICE_FUNC Derived& setZero(Index size);
+    EIGEN_DEVICE_FUNC Derived& setZero(Index rows, Index cols);
 
     using Base::setOnes;
-    Derived& setOnes(Index size);
-    Derived& setOnes(Index rows, Index cols);
+    EIGEN_DEVICE_FUNC Derived& setOnes(Index size);
+    EIGEN_DEVICE_FUNC Derived& setOnes(Index rows, Index cols);
 
     using Base::setRandom;
     Derived& setRandom(Index size);
@@ -566,6 +674,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
       * remain row-vectors and vectors remain vectors.
       */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC 
     EIGEN_STRONG_INLINE void _resize_to_match(const EigenBase<OtherDerived>& other)
     {
       #ifdef EIGEN_NO_AUTOMATIC_RESIZING
@@ -573,8 +682,6 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
                  : (rows() == other.rows() && cols() == other.cols())))
         && "Size mismatch. Automatic resizing is disabled because EIGEN_NO_AUTOMATIC_RESIZING is defined");
       EIGEN_ONLY_USED_FOR_DEBUG(other);
-      if(this->size()==0)
-        resizeLike(other);
       #else
       resizeLike(other);
       #endif
@@ -594,25 +701,23 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
       *
       * \internal
       */
+    // aliasing is dealt once in internall::call_assignment
+    // so at this stage we have to assume aliasing... and resising has to be done later.
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC 
     EIGEN_STRONG_INLINE Derived& _set(const DenseBase<OtherDerived>& other)
     {
-      _set_selector(other.derived(), typename internal::conditional<static_cast<bool>(int(OtherDerived::Flags) & EvalBeforeAssigningBit), internal::true_type, internal::false_type>::type());
+      internal::call_assignment(this->derived(), other.derived());
       return this->derived();
     }
 
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE void _set_selector(const OtherDerived& other, const internal::true_type&) { _set_noalias(other.eval()); }
-
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE void _set_selector(const OtherDerived& other, const internal::false_type&) { _set_noalias(other); }
-
     /** \internal Like _set() but additionally makes the assumption that no aliasing effect can happen (which
       * is the case when creating a new matrix) so one can enforce lazy evaluation.
       *
       * \sa operator=(const MatrixBase<OtherDerived>&), _set()
       */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC 
     EIGEN_STRONG_INLINE Derived& _set_noalias(const DenseBase<OtherDerived>& other)
     {
       // I don't think we need this resize call since the lazyAssign will anyways resize
@@ -620,40 +725,175 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
       //_resize_to_match(other);
       // the 'false' below means to enforce lazy evaluation. We don't use lazyAssign() because
       // it wouldn't allow to copy a row-vector into a column-vector.
-      return internal::assign_selector<Derived,OtherDerived,false>::run(this->derived(), other.derived());
+      internal::call_assignment_no_alias(this->derived(), other.derived(), internal::assign_op<Scalar,typename OtherDerived::Scalar>());
+      return this->derived();
     }
 
     template<typename T0, typename T1>
-    EIGEN_STRONG_INLINE void _init2(Index nbRows, Index nbCols, typename internal::enable_if<Base::SizeAtCompileTime!=2,T0>::type* = 0)
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init2(Index rows, Index cols, typename internal::enable_if<Base::SizeAtCompileTime!=2,T0>::type* = 0)
     {
       EIGEN_STATIC_ASSERT(bool(NumTraits<T0>::IsInteger) &&
                           bool(NumTraits<T1>::IsInteger),
                           FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED)
-      resize(nbRows,nbCols);
+      resize(rows,cols);
     }
+    
     template<typename T0, typename T1>
-    EIGEN_STRONG_INLINE void _init2(const Scalar& val0, const Scalar& val1, typename internal::enable_if<Base::SizeAtCompileTime==2,T0>::type* = 0)
+    EIGEN_DEVICE_FUNC 
+    EIGEN_STRONG_INLINE void _init2(const T0& val0, const T1& val1, typename internal::enable_if<Base::SizeAtCompileTime==2,T0>::type* = 0)
     {
       EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 2)
+      m_storage.data()[0] = Scalar(val0);
+      m_storage.data()[1] = Scalar(val1);
+    }
+    
+    template<typename T0, typename T1>
+    EIGEN_DEVICE_FUNC 
+    EIGEN_STRONG_INLINE void _init2(const Index& val0, const Index& val1,
+                                    typename internal::enable_if<    (!internal::is_same<Index,Scalar>::value)
+                                                                  && (internal::is_same<T0,Index>::value)
+                                                                  && (internal::is_same<T1,Index>::value)
+                                                                  && Base::SizeAtCompileTime==2,T1>::type* = 0)
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 2)
+      m_storage.data()[0] = Scalar(val0);
+      m_storage.data()[1] = Scalar(val1);
+    }
+
+    // The argument is convertible to the Index type and we either have a non 1x1 Matrix, or a dynamic-sized Array,
+    // then the argument is meant to be the size of the object.
+    template<typename T>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init1(Index size, typename internal::enable_if<    (Base::SizeAtCompileTime!=1 || !internal::is_convertible<T, Scalar>::value)
+                                                                              && ((!internal::is_same<typename internal::traits<Derived>::XprKind,ArrayXpr>::value || Base::SizeAtCompileTime==Dynamic)),T>::type* = 0)
+    {
+      // NOTE MSVC 2008 complains if we directly put bool(NumTraits<T>::IsInteger) as the EIGEN_STATIC_ASSERT argument.
+      const bool is_integer = NumTraits<T>::IsInteger;
+      EIGEN_UNUSED_VARIABLE(is_integer);
+      EIGEN_STATIC_ASSERT(is_integer,
+                          FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED)
+      resize(size);
+    }
+    
+    // We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type can be implicitely converted)
+    template<typename T>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init1(const Scalar& val0, typename internal::enable_if<Base::SizeAtCompileTime==1 && internal::is_convertible<T, Scalar>::value,T>::type* = 0)
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 1)
       m_storage.data()[0] = val0;
-      m_storage.data()[1] = val1;
+    }
+    
+    // We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type match the index type)
+    template<typename T>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init1(const Index& val0,
+                                    typename internal::enable_if<    (!internal::is_same<Index,Scalar>::value)
+                                                                  && (internal::is_same<Index,T>::value)
+                                                                  && Base::SizeAtCompileTime==1
+                                                                  && internal::is_convertible<T, Scalar>::value,T*>::type* = 0)
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 1)
+      m_storage.data()[0] = Scalar(val0);
+    }
+
+    // Initialize a fixed size matrix from a pointer to raw data
+    template<typename T>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init1(const Scalar* data){
+      this->_set_noalias(ConstMapType(data));
     }
 
+    // Initialize an arbitrary matrix from a dense expression
+    template<typename T, typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init1(const DenseBase<OtherDerived>& other){
+      this->_set_noalias(other);
+    }
+
+    // Initialize an arbitrary matrix from an object convertible to the Derived type.
+    template<typename T>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init1(const Derived& other){
+      this->_set_noalias(other);
+    }
+
+    // Initialize an arbitrary matrix from a generic Eigen expression
+    template<typename T, typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init1(const EigenBase<OtherDerived>& other){
+      this->derived() = other;
+    }
+
+    template<typename T, typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init1(const ReturnByValue<OtherDerived>& other)
+    {
+      resize(other.rows(), other.cols());
+      other.evalTo(this->derived());
+    }
+
+    template<typename T, typename OtherDerived, int ColsAtCompileTime>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init1(const RotationBase<OtherDerived,ColsAtCompileTime>& r)
+    {
+      this->derived() = r;
+    }
+    
+    // For fixed-size Array<Scalar,...>
+    template<typename T>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init1(const Scalar& val0,
+                                    typename internal::enable_if<    Base::SizeAtCompileTime!=Dynamic
+                                                                  && Base::SizeAtCompileTime!=1
+                                                                  && internal::is_convertible<T, Scalar>::value
+                                                                  && internal::is_same<typename internal::traits<Derived>::XprKind,ArrayXpr>::value,T>::type* = 0)
+    {
+      Base::setConstant(val0);
+    }
+    
+    // For fixed-size Array<Index,...>
+    template<typename T>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init1(const Index& val0,
+                                    typename internal::enable_if<    (!internal::is_same<Index,Scalar>::value)
+                                                                  && (internal::is_same<Index,T>::value)
+                                                                  && Base::SizeAtCompileTime!=Dynamic
+                                                                  && Base::SizeAtCompileTime!=1
+                                                                  && internal::is_convertible<T, Scalar>::value
+                                                                  && internal::is_same<typename internal::traits<Derived>::XprKind,ArrayXpr>::value,T*>::type* = 0)
+    {
+      Base::setConstant(val0);
+    }
+    
     template<typename MatrixTypeA, typename MatrixTypeB, bool SwapPointers>
     friend struct internal::matrix_swap_impl;
 
-    /** \internal generic implementation of swap for dense storage since for dynamic-sized matrices of same type it is enough to swap the
-      * data pointers.
+  public:
+    
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** \internal
+      * \brief Override DenseBase::swap() since for dynamic-sized matrices
+      * of same type it is enough to swap the data pointers.
       */
     template<typename OtherDerived>
-    void _swap(DenseBase<OtherDerived> const & other)
+    EIGEN_DEVICE_FUNC
+    void swap(DenseBase<OtherDerived> & other)
     {
       enum { SwapPointers = internal::is_same<Derived, OtherDerived>::value && Base::SizeAtCompileTime==Dynamic };
-      internal::matrix_swap_impl<Derived, OtherDerived, bool(SwapPointers)>::run(this->derived(), other.const_cast_derived());
+      internal::matrix_swap_impl<Derived, OtherDerived, bool(SwapPointers)>::run(this->derived(), other.derived());
     }
-
-  public:
-#ifndef EIGEN_PARSED_BY_DOXYGEN
+    
+    /** \internal
+      * \brief const version forwarded to DenseBase::swap
+      */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    void swap(DenseBase<OtherDerived> const & other)
+    { Base::swap(other.derived()); }
+    
+    EIGEN_DEVICE_FUNC 
     static EIGEN_STRONG_INLINE void _check_template_params()
     {
       EIGEN_STATIC_ASSERT((EIGEN_IMPLIES(MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1, (Options&RowMajor)==RowMajor)
@@ -667,10 +907,9 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
                         && (Options & (DontAlign|RowMajor)) == Options),
         INVALID_MATRIX_TEMPLATE_PARAMETERS)
     }
-#endif
 
-private:
-    enum { ThisConstantIsPrivateInPlainObjectBase };
+    enum { IsPlainObjectBase = 1 };
+#endif
 };
 
 namespace internal {
@@ -678,7 +917,6 @@ namespace internal {
 template <typename Derived, typename OtherDerived, bool IsVector>
 struct conservative_resize_like_impl
 {
-  typedef typename Derived::Index Index;
   static void run(DenseBase<Derived>& _this, Index rows, Index cols)
   {
     if (_this.rows() == rows && _this.cols() == cols) return;
@@ -694,8 +932,8 @@ struct conservative_resize_like_impl
     {
       // The storage order does not allow us to use reallocation.
       typename Derived::PlainObject tmp(rows,cols);
-      const Index common_rows = (std::min)(rows, _this.rows());
-      const Index common_cols = (std::min)(cols, _this.cols());
+      const Index common_rows = numext::mini(rows, _this.rows());
+      const Index common_cols = numext::mini(cols, _this.cols());
       tmp.block(0,0,common_rows,common_cols) = _this.block(0,0,common_rows,common_cols);
       _this.derived().swap(tmp);
     }
@@ -728,8 +966,8 @@ struct conservative_resize_like_impl
     {
       // The storage order does not allow us to use reallocation.
       typename Derived::PlainObject tmp(other);
-      const Index common_rows = (std::min)(tmp.rows(), _this.rows());
-      const Index common_cols = (std::min)(tmp.cols(), _this.cols());
+      const Index common_rows = numext::mini(tmp.rows(), _this.rows());
+      const Index common_cols = numext::mini(tmp.cols(), _this.cols());
       tmp.block(0,0,common_rows,common_cols) = _this.block(0,0,common_rows,common_cols);
       _this.derived().swap(tmp);
     }
@@ -744,7 +982,6 @@ struct conservative_resize_like_impl<Derived,OtherDerived,true>
 {
   using conservative_resize_like_impl<Derived,OtherDerived,false>::run;
   
-  typedef typename Derived::Index Index;
   static void run(DenseBase<Derived>& _this, Index size)
   {
     const Index new_rows = Derived::RowsAtCompileTime==1 ? 1 : size;
@@ -770,6 +1007,7 @@ struct conservative_resize_like_impl<Derived,OtherDerived,true>
 template<typename MatrixTypeA, typename MatrixTypeB, bool SwapPointers>
 struct matrix_swap_impl
 {
+  EIGEN_DEVICE_FUNC
   static inline void run(MatrixTypeA& a, MatrixTypeB& b)
   {
     a.base().swap(b);
@@ -779,6 +1017,7 @@ struct matrix_swap_impl
 template<typename MatrixTypeA, typename MatrixTypeB>
 struct matrix_swap_impl<MatrixTypeA, MatrixTypeB, true>
 {
+  EIGEN_DEVICE_FUNC
   static inline void run(MatrixTypeA& a, MatrixTypeB& b)
   {
     static_cast<typename MatrixTypeA::Base&>(a).m_storage.swap(static_cast<typename MatrixTypeB::Base&>(b).m_storage);
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
new file mode 100644
index 000000000..ae0c94b38
--- /dev/null
+++ b/Eigen/src/Core/Product.h
@@ -0,0 +1,186 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PRODUCT_H
+#define EIGEN_PRODUCT_H
+
+namespace Eigen {
+
+template<typename Lhs, typename Rhs, int Option, typename StorageKind> class ProductImpl;
+
+namespace internal {
+
+template<typename Lhs, typename Rhs, int Option>
+struct traits<Product<Lhs, Rhs, Option> >
+{
+  typedef typename remove_all<Lhs>::type LhsCleaned;
+  typedef typename remove_all<Rhs>::type RhsCleaned;
+  typedef traits<LhsCleaned> LhsTraits;
+  typedef traits<RhsCleaned> RhsTraits;
+  
+  typedef MatrixXpr XprKind;
+  
+  typedef typename ScalarBinaryOpTraits<typename traits<LhsCleaned>::Scalar, typename traits<RhsCleaned>::Scalar>::ReturnType Scalar;
+  typedef typename product_promote_storage_type<typename LhsTraits::StorageKind,
+                                                typename RhsTraits::StorageKind,
+                                                internal::product_type<Lhs,Rhs>::ret>::ret StorageKind;
+  typedef typename promote_index_type<typename LhsTraits::StorageIndex,
+                                      typename RhsTraits::StorageIndex>::type StorageIndex;
+  
+  enum {
+    RowsAtCompileTime    = LhsTraits::RowsAtCompileTime,
+    ColsAtCompileTime    = RhsTraits::ColsAtCompileTime,
+    MaxRowsAtCompileTime = LhsTraits::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = RhsTraits::MaxColsAtCompileTime,
+    
+    // FIXME: only needed by GeneralMatrixMatrixTriangular
+    InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(LhsTraits::ColsAtCompileTime, RhsTraits::RowsAtCompileTime),
+    
+    // The storage order is somewhat arbitrary here. The correct one will be determined through the evaluator.
+    Flags = (MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1) ? RowMajorBit
+          : (MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1) ? 0
+          : (   ((LhsTraits::Flags&NoPreferredStorageOrderBit) && (RhsTraits::Flags&RowMajorBit))
+             || ((RhsTraits::Flags&NoPreferredStorageOrderBit) && (LhsTraits::Flags&RowMajorBit)) ) ? RowMajorBit
+          : NoPreferredStorageOrderBit
+  };
+};
+
+} // end namespace internal
+
+/** \class Product
+  * \ingroup Core_Module
+  *
+  * \brief Expression of the product of two arbitrary matrices or vectors
+  *
+  * \tparam _Lhs the type of the left-hand side expression
+  * \tparam _Rhs the type of the right-hand side expression
+  *
+  * This class represents an expression of the product of two arbitrary matrices.
+  *
+  * The other template parameters are:
+  * \tparam Option     can be DefaultProduct, AliasFreeProduct, or LazyProduct
+  *
+  */
+template<typename _Lhs, typename _Rhs, int Option>
+class Product : public ProductImpl<_Lhs,_Rhs,Option,
+                                   typename internal::product_promote_storage_type<typename internal::traits<_Lhs>::StorageKind,
+                                                                                   typename internal::traits<_Rhs>::StorageKind,
+                                                                                   internal::product_type<_Lhs,_Rhs>::ret>::ret>
+{
+  public:
+    
+    typedef _Lhs Lhs;
+    typedef _Rhs Rhs;
+    
+    typedef typename ProductImpl<
+        Lhs, Rhs, Option,
+        typename internal::product_promote_storage_type<typename internal::traits<Lhs>::StorageKind,
+                                                        typename internal::traits<Rhs>::StorageKind,
+                                                        internal::product_type<Lhs,Rhs>::ret>::ret>::Base Base;
+    EIGEN_GENERIC_PUBLIC_INTERFACE(Product)
+
+    typedef typename internal::ref_selector<Lhs>::type LhsNested;
+    typedef typename internal::ref_selector<Rhs>::type RhsNested;
+    typedef typename internal::remove_all<LhsNested>::type LhsNestedCleaned;
+    typedef typename internal::remove_all<RhsNested>::type RhsNestedCleaned;
+
+    EIGEN_DEVICE_FUNC Product(const Lhs& lhs, const Rhs& rhs) : m_lhs(lhs), m_rhs(rhs)
+    {
+      eigen_assert(lhs.cols() == rhs.rows()
+        && "invalid matrix product"
+        && "if you wanted a coeff-wise or a dot product use the respective explicit functions");
+    }
+
+    EIGEN_DEVICE_FUNC inline Index rows() const { return m_lhs.rows(); }
+    EIGEN_DEVICE_FUNC inline Index cols() const { return m_rhs.cols(); }
+
+    EIGEN_DEVICE_FUNC const LhsNestedCleaned& lhs() const { return m_lhs; }
+    EIGEN_DEVICE_FUNC const RhsNestedCleaned& rhs() const { return m_rhs; }
+
+  protected:
+
+    LhsNested m_lhs;
+    RhsNested m_rhs;
+};
+
+namespace internal {
+  
+template<typename Lhs, typename Rhs, int Option, int ProductTag = internal::product_type<Lhs,Rhs>::ret>
+class dense_product_base
+ : public internal::dense_xpr_base<Product<Lhs,Rhs,Option> >::type
+{};
+
+/** Convertion to scalar for inner-products */
+template<typename Lhs, typename Rhs, int Option>
+class dense_product_base<Lhs, Rhs, Option, InnerProduct>
+ : public internal::dense_xpr_base<Product<Lhs,Rhs,Option> >::type
+{
+  typedef Product<Lhs,Rhs,Option> ProductXpr;
+  typedef typename internal::dense_xpr_base<ProductXpr>::type Base;
+public:
+  using Base::derived;
+  typedef typename Base::Scalar Scalar;
+  
+  operator const Scalar() const
+  {
+    return internal::evaluator<ProductXpr>(derived()).coeff(0,0);
+  }
+};
+
+} // namespace internal
+
+// Generic API dispatcher
+template<typename Lhs, typename Rhs, int Option, typename StorageKind>
+class ProductImpl : public internal::generic_xpr_base<Product<Lhs,Rhs,Option>, MatrixXpr, StorageKind>::type
+{
+  public:
+    typedef typename internal::generic_xpr_base<Product<Lhs,Rhs,Option>, MatrixXpr, StorageKind>::type Base;
+};
+
+template<typename Lhs, typename Rhs, int Option>
+class ProductImpl<Lhs,Rhs,Option,Dense>
+  : public internal::dense_product_base<Lhs,Rhs,Option>
+{
+    typedef Product<Lhs, Rhs, Option> Derived;
+    
+  public:
+    
+    typedef typename internal::dense_product_base<Lhs, Rhs, Option> Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
+  protected:
+    enum {
+      IsOneByOne = (RowsAtCompileTime == 1 || RowsAtCompileTime == Dynamic) && 
+                   (ColsAtCompileTime == 1 || ColsAtCompileTime == Dynamic),
+      EnableCoeff = IsOneByOne || Option==LazyProduct
+    };
+    
+  public:
+  
+    EIGEN_DEVICE_FUNC Scalar coeff(Index row, Index col) const
+    {
+      EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS);
+      eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) );
+      
+      return internal::evaluator<Derived>(derived()).coeff(row,col);
+    }
+
+    EIGEN_DEVICE_FUNC Scalar coeff(Index i) const
+    {
+      EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS);
+      eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) );
+      
+      return internal::evaluator<Derived>(derived()).coeff(i);
+    }
+    
+  
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_PRODUCT_H
diff --git a/Eigen/src/Core/ProductBase.h b/Eigen/src/Core/ProductBase.h
deleted file mode 100644
index cf74470a9..000000000
--- a/Eigen/src/Core/ProductBase.h
+++ /dev/null
@@ -1,290 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_PRODUCTBASE_H
-#define EIGEN_PRODUCTBASE_H
-
-namespace Eigen { 
-
-/** \class ProductBase
-  * \ingroup Core_Module
-  *
-  */
-
-namespace internal {
-template<typename Derived, typename _Lhs, typename _Rhs>
-struct traits<ProductBase<Derived,_Lhs,_Rhs> >
-{
-  typedef MatrixXpr XprKind;
-  typedef typename remove_all<_Lhs>::type Lhs;
-  typedef typename remove_all<_Rhs>::type Rhs;
-  typedef typename scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar;
-  typedef typename promote_storage_type<typename traits<Lhs>::StorageKind,
-                                           typename traits<Rhs>::StorageKind>::ret StorageKind;
-  typedef typename promote_index_type<typename traits<Lhs>::Index,
-                                         typename traits<Rhs>::Index>::type Index;
-  enum {
-    RowsAtCompileTime = traits<Lhs>::RowsAtCompileTime,
-    ColsAtCompileTime = traits<Rhs>::ColsAtCompileTime,
-    MaxRowsAtCompileTime = traits<Lhs>::MaxRowsAtCompileTime,
-    MaxColsAtCompileTime = traits<Rhs>::MaxColsAtCompileTime,
-    Flags = (MaxRowsAtCompileTime==1 ? RowMajorBit : 0)
-          | EvalBeforeNestingBit | EvalBeforeAssigningBit | NestByRefBit,
-                  // Note that EvalBeforeNestingBit and NestByRefBit
-                  // are not used in practice because nested is overloaded for products
-    CoeffReadCost = 0 // FIXME why is it needed ?
-  };
-};
-}
-
-#define EIGEN_PRODUCT_PUBLIC_INTERFACE(Derived) \
-  typedef ProductBase<Derived, Lhs, Rhs > Base; \
-  EIGEN_DENSE_PUBLIC_INTERFACE(Derived) \
-  typedef typename Base::LhsNested LhsNested; \
-  typedef typename Base::_LhsNested _LhsNested; \
-  typedef typename Base::LhsBlasTraits LhsBlasTraits; \
-  typedef typename Base::ActualLhsType ActualLhsType; \
-  typedef typename Base::_ActualLhsType _ActualLhsType; \
-  typedef typename Base::RhsNested RhsNested; \
-  typedef typename Base::_RhsNested _RhsNested; \
-  typedef typename Base::RhsBlasTraits RhsBlasTraits; \
-  typedef typename Base::ActualRhsType ActualRhsType; \
-  typedef typename Base::_ActualRhsType _ActualRhsType; \
-  using Base::m_lhs; \
-  using Base::m_rhs;
-
-template<typename Derived, typename Lhs, typename Rhs>
-class ProductBase : public MatrixBase<Derived>
-{
-  public:
-    typedef MatrixBase<Derived> Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(ProductBase)
-    
-    typedef typename Lhs::Nested LhsNested;
-    typedef typename internal::remove_all<LhsNested>::type _LhsNested;
-    typedef internal::blas_traits<_LhsNested> LhsBlasTraits;
-    typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
-    typedef typename internal::remove_all<ActualLhsType>::type _ActualLhsType;
-    typedef typename internal::traits<Lhs>::Scalar LhsScalar;
-
-    typedef typename Rhs::Nested RhsNested;
-    typedef typename internal::remove_all<RhsNested>::type _RhsNested;
-    typedef internal::blas_traits<_RhsNested> RhsBlasTraits;
-    typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
-    typedef typename internal::remove_all<ActualRhsType>::type _ActualRhsType;
-    typedef typename internal::traits<Rhs>::Scalar RhsScalar;
-
-    // Diagonal of a product: no need to evaluate the arguments because they are going to be evaluated only once
-    typedef CoeffBasedProduct<LhsNested, RhsNested, 0> FullyLazyCoeffBaseProductType;
-
-  public:
-
-#ifndef EIGEN_NO_MALLOC
-    typedef typename Base::PlainObject BasePlainObject;
-    typedef Matrix<Scalar,RowsAtCompileTime==1?1:Dynamic,ColsAtCompileTime==1?1:Dynamic,BasePlainObject::Options> DynPlainObject;
-    typedef typename internal::conditional<(BasePlainObject::SizeAtCompileTime==Dynamic) || (BasePlainObject::SizeAtCompileTime*int(sizeof(Scalar)) < int(EIGEN_STACK_ALLOCATION_LIMIT)),
-                                           BasePlainObject, DynPlainObject>::type PlainObject;
-#else
-    typedef typename Base::PlainObject PlainObject;
-#endif
-
-    ProductBase(const Lhs& a_lhs, const Rhs& a_rhs)
-      : m_lhs(a_lhs), m_rhs(a_rhs)
-    {
-      eigen_assert(a_lhs.cols() == a_rhs.rows()
-        && "invalid matrix product"
-        && "if you wanted a coeff-wise or a dot product use the respective explicit functions");
-    }
-
-    inline Index rows() const { return m_lhs.rows(); }
-    inline Index cols() const { return m_rhs.cols(); }
-
-    template<typename Dest>
-    inline void evalTo(Dest& dst) const { dst.setZero(); scaleAndAddTo(dst,Scalar(1)); }
-
-    template<typename Dest>
-    inline void addTo(Dest& dst) const { scaleAndAddTo(dst,Scalar(1)); }
-
-    template<typename Dest>
-    inline void subTo(Dest& dst) const { scaleAndAddTo(dst,Scalar(-1)); }
-
-    template<typename Dest>
-    inline void scaleAndAddTo(Dest& dst, const Scalar& alpha) const { derived().scaleAndAddTo(dst,alpha); }
-
-    const _LhsNested& lhs() const { return m_lhs; }
-    const _RhsNested& rhs() const { return m_rhs; }
-
-    // Implicit conversion to the nested type (trigger the evaluation of the product)
-    operator const PlainObject& () const
-    {
-      m_result.resize(m_lhs.rows(), m_rhs.cols());
-      derived().evalTo(m_result);
-      return m_result;
-    }
-
-    const Diagonal<const FullyLazyCoeffBaseProductType,0> diagonal() const
-    { return FullyLazyCoeffBaseProductType(m_lhs, m_rhs); }
-
-    template<int Index>
-    const Diagonal<FullyLazyCoeffBaseProductType,Index> diagonal() const
-    { return FullyLazyCoeffBaseProductType(m_lhs, m_rhs); }
-
-    const Diagonal<FullyLazyCoeffBaseProductType,Dynamic> diagonal(Index index) const
-    { return FullyLazyCoeffBaseProductType(m_lhs, m_rhs).diagonal(index); }
-
-    // restrict coeff accessors to 1x1 expressions. No need to care about mutators here since this isnt a Lvalue expression
-    typename Base::CoeffReturnType coeff(Index row, Index col) const
-    {
-#ifdef EIGEN2_SUPPORT
-      return lhs().row(row).cwiseProduct(rhs().col(col).transpose()).sum();
-#else
-      EIGEN_STATIC_ASSERT_SIZE_1x1(Derived)
-      eigen_assert(this->rows() == 1 && this->cols() == 1);
-      Matrix<Scalar,1,1> result = *this;
-      return result.coeff(row,col);
-#endif
-    }
-
-    typename Base::CoeffReturnType coeff(Index i) const
-    {
-      EIGEN_STATIC_ASSERT_SIZE_1x1(Derived)
-      eigen_assert(this->rows() == 1 && this->cols() == 1);
-      Matrix<Scalar,1,1> result = *this;
-      return result.coeff(i);
-    }
-
-    const Scalar& coeffRef(Index row, Index col) const
-    {
-      EIGEN_STATIC_ASSERT_SIZE_1x1(Derived)
-      eigen_assert(this->rows() == 1 && this->cols() == 1);
-      return derived().coeffRef(row,col);
-    }
-
-    const Scalar& coeffRef(Index i) const
-    {
-      EIGEN_STATIC_ASSERT_SIZE_1x1(Derived)
-      eigen_assert(this->rows() == 1 && this->cols() == 1);
-      return derived().coeffRef(i);
-    }
-
-  protected:
-
-    LhsNested m_lhs;
-    RhsNested m_rhs;
-
-    mutable PlainObject m_result;
-};
-
-// here we need to overload the nested rule for products
-// such that the nested type is a const reference to a plain matrix
-namespace internal {
-template<typename Lhs, typename Rhs, int Mode, int N, typename PlainObject>
-struct nested<GeneralProduct<Lhs,Rhs,Mode>, N, PlainObject>
-{
-  typedef typename GeneralProduct<Lhs,Rhs,Mode>::PlainObject const& type;
-};
-template<typename Lhs, typename Rhs, int Mode, int N, typename PlainObject>
-struct nested<const GeneralProduct<Lhs,Rhs,Mode>, N, PlainObject>
-{
-  typedef typename GeneralProduct<Lhs,Rhs,Mode>::PlainObject const& type;
-};
-}
-
-template<typename NestedProduct>
-class ScaledProduct;
-
-// Note that these two operator* functions are not defined as member
-// functions of ProductBase, because, otherwise we would have to
-// define all overloads defined in MatrixBase. Furthermore, Using
-// "using Base::operator*" would not work with MSVC.
-//
-// Also note that here we accept any compatible scalar types
-template<typename Derived,typename Lhs,typename Rhs>
-const ScaledProduct<Derived>
-operator*(const ProductBase<Derived,Lhs,Rhs>& prod, const typename Derived::Scalar& x)
-{ return ScaledProduct<Derived>(prod.derived(), x); }
-
-template<typename Derived,typename Lhs,typename Rhs>
-typename internal::enable_if<!internal::is_same<typename Derived::Scalar,typename Derived::RealScalar>::value,
-                      const ScaledProduct<Derived> >::type
-operator*(const ProductBase<Derived,Lhs,Rhs>& prod, const typename Derived::RealScalar& x)
-{ return ScaledProduct<Derived>(prod.derived(), x); }
-
-
-template<typename Derived,typename Lhs,typename Rhs>
-const ScaledProduct<Derived>
-operator*(const typename Derived::Scalar& x,const ProductBase<Derived,Lhs,Rhs>& prod)
-{ return ScaledProduct<Derived>(prod.derived(), x); }
-
-template<typename Derived,typename Lhs,typename Rhs>
-typename internal::enable_if<!internal::is_same<typename Derived::Scalar,typename Derived::RealScalar>::value,
-                      const ScaledProduct<Derived> >::type
-operator*(const typename Derived::RealScalar& x,const ProductBase<Derived,Lhs,Rhs>& prod)
-{ return ScaledProduct<Derived>(prod.derived(), x); }
-
-namespace internal {
-template<typename NestedProduct>
-struct traits<ScaledProduct<NestedProduct> >
- : traits<ProductBase<ScaledProduct<NestedProduct>,
-                         typename NestedProduct::_LhsNested,
-                         typename NestedProduct::_RhsNested> >
-{
-  typedef typename traits<NestedProduct>::StorageKind StorageKind;
-};
-}
-
-template<typename NestedProduct>
-class ScaledProduct
-  : public ProductBase<ScaledProduct<NestedProduct>,
-                       typename NestedProduct::_LhsNested,
-                       typename NestedProduct::_RhsNested>
-{
-  public:
-    typedef ProductBase<ScaledProduct<NestedProduct>,
-                       typename NestedProduct::_LhsNested,
-                       typename NestedProduct::_RhsNested> Base;
-    typedef typename Base::Scalar Scalar;
-    typedef typename Base::PlainObject PlainObject;
-//     EIGEN_PRODUCT_PUBLIC_INTERFACE(ScaledProduct)
-
-    ScaledProduct(const NestedProduct& prod, const Scalar& x)
-    : Base(prod.lhs(),prod.rhs()), m_prod(prod), m_alpha(x) {}
-
-    template<typename Dest>
-    inline void evalTo(Dest& dst) const { dst.setZero(); scaleAndAddTo(dst, Scalar(1)); }
-
-    template<typename Dest>
-    inline void addTo(Dest& dst) const { scaleAndAddTo(dst, Scalar(1)); }
-
-    template<typename Dest>
-    inline void subTo(Dest& dst) const { scaleAndAddTo(dst, Scalar(-1)); }
-
-    template<typename Dest>
-    inline void scaleAndAddTo(Dest& dst, const Scalar& a_alpha) const { m_prod.derived().scaleAndAddTo(dst,a_alpha * m_alpha); }
-
-    const Scalar& alpha() const { return m_alpha; }
-    
-  protected:
-    const NestedProduct& m_prod;
-    Scalar m_alpha;
-};
-
-/** \internal
-  * Overloaded to perform an efficient C = (A*B).lazy() */
-template<typename Derived>
-template<typename ProductDerived, typename Lhs, typename Rhs>
-Derived& MatrixBase<Derived>::lazyAssign(const ProductBase<ProductDerived, Lhs,Rhs>& other)
-{
-  other.derived().evalTo(derived());
-  return derived();
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN_PRODUCTBASE_H
diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h
new file mode 100644
index 000000000..583b7f59e
--- /dev/null
+++ b/Eigen/src/Core/ProductEvaluators.h
@@ -0,0 +1,1099 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2011 Jitse Niesen <jitse@maths.leeds.ac.uk>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#ifndef EIGEN_PRODUCTEVALUATORS_H
+#define EIGEN_PRODUCTEVALUATORS_H
+
+namespace Eigen {
+  
+namespace internal {
+
+/** \internal
+  * Evaluator of a product expression.
+  * Since products require special treatments to handle all possible cases,
+  * we simply deffer the evaluation logic to a product_evaluator class
+  * which offers more partial specialization possibilities.
+  * 
+  * \sa class product_evaluator
+  */
+template<typename Lhs, typename Rhs, int Options>
+struct evaluator<Product<Lhs, Rhs, Options> > 
+ : public product_evaluator<Product<Lhs, Rhs, Options> >
+{
+  typedef Product<Lhs, Rhs, Options> XprType;
+  typedef product_evaluator<XprType> Base;
+  
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {}
+};
+ 
+// Catch "scalar * ( A * B )" and transform it to "(A*scalar) * B"
+// TODO we should apply that rule only if that's really helpful
+template<typename Lhs, typename Rhs, typename Scalar1, typename Scalar2, typename Plain1>
+struct evaluator_assume_aliasing<CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
+                                               const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>,
+                                               const Product<Lhs, Rhs, DefaultProduct> > >
+{
+  static const bool value = true;
+};
+template<typename Lhs, typename Rhs, typename Scalar1, typename Scalar2, typename Plain1>
+struct evaluator<CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
+                               const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>,
+                               const Product<Lhs, Rhs, DefaultProduct> > >
+ : public evaluator<Product<EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar1,Lhs,product), Rhs, DefaultProduct> >
+{
+  typedef CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
+                               const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>,
+                               const Product<Lhs, Rhs, DefaultProduct> > XprType;
+  typedef evaluator<Product<EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar1,Lhs,product), Rhs, DefaultProduct> > Base;
+
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
+    : Base(xpr.lhs().functor().m_other * xpr.rhs().lhs() * xpr.rhs().rhs())
+  {}
+};
+
+
+template<typename Lhs, typename Rhs, int DiagIndex>
+struct evaluator<Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> > 
+ : public evaluator<Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex> >
+{
+  typedef Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> XprType;
+  typedef evaluator<Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex> > Base;
+  
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
+    : Base(Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex>(
+        Product<Lhs, Rhs, LazyProduct>(xpr.nestedExpression().lhs(), xpr.nestedExpression().rhs()),
+        xpr.index() ))
+  {}
+};
+
+
+// Helper class to perform a matrix product with the destination at hand.
+// Depending on the sizes of the factors, there are different evaluation strategies
+// as controlled by internal::product_type.
+template< typename Lhs, typename Rhs,
+          typename LhsShape = typename evaluator_traits<Lhs>::Shape,
+          typename RhsShape = typename evaluator_traits<Rhs>::Shape,
+          int ProductType = internal::product_type<Lhs,Rhs>::value>
+struct generic_product_impl;
+
+template<typename Lhs, typename Rhs>
+struct evaluator_assume_aliasing<Product<Lhs, Rhs, DefaultProduct> > {
+  static const bool value = true;
+};
+
+// This is the default evaluator implementation for products:
+// It creates a temporary and call generic_product_impl
+template<typename Lhs, typename Rhs, int Options, int ProductTag, typename LhsShape, typename RhsShape>
+struct product_evaluator<Product<Lhs, Rhs, Options>, ProductTag, LhsShape, RhsShape>
+  : public evaluator<typename Product<Lhs, Rhs, Options>::PlainObject>
+{
+  typedef Product<Lhs, Rhs, Options> XprType;
+  typedef typename XprType::PlainObject PlainObject;
+  typedef evaluator<PlainObject> Base;
+  enum {
+    Flags = Base::Flags | EvalBeforeNestingBit
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit product_evaluator(const XprType& xpr)
+    : m_result(xpr.rows(), xpr.cols())
+  {
+    ::new (static_cast<Base*>(this)) Base(m_result);
+    
+// FIXME shall we handle nested_eval here?,
+// if so, then we must take care at removing the call to nested_eval in the specializations (e.g., in permutation_matrix_product, transposition_matrix_product, etc.)
+//     typedef typename internal::nested_eval<Lhs,Rhs::ColsAtCompileTime>::type LhsNested;
+//     typedef typename internal::nested_eval<Rhs,Lhs::RowsAtCompileTime>::type RhsNested;
+//     typedef typename internal::remove_all<LhsNested>::type LhsNestedCleaned;
+//     typedef typename internal::remove_all<RhsNested>::type RhsNestedCleaned;
+//     
+//     const LhsNested lhs(xpr.lhs());
+//     const RhsNested rhs(xpr.rhs());
+//   
+//     generic_product_impl<LhsNestedCleaned, RhsNestedCleaned>::evalTo(m_result, lhs, rhs);
+
+    generic_product_impl<Lhs, Rhs, LhsShape, RhsShape, ProductTag>::evalTo(m_result, xpr.lhs(), xpr.rhs());
+  }
+  
+protected:  
+  PlainObject m_result;
+};
+
+// The following three shortcuts are enabled only if the scalar types match excatly.
+// TODO: we could enable them for different scalar types when the product is not vectorized.
+
+// Dense = Product
+template< typename DstXprType, typename Lhs, typename Rhs, int Options, typename Scalar>
+struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::assign_op<Scalar,Scalar>, Dense2Dense,
+  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
+{
+  typedef Product<Lhs,Rhs,Options> SrcXprType;
+  static EIGEN_STRONG_INLINE
+  void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
+  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
+    // FIXME shall we handle nested_eval here?
+    generic_product_impl<Lhs, Rhs>::evalTo(dst, src.lhs(), src.rhs());
+  }
+};
+
+// Dense += Product
+template< typename DstXprType, typename Lhs, typename Rhs, int Options, typename Scalar>
+struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::add_assign_op<Scalar,Scalar>, Dense2Dense,
+  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
+{
+  typedef Product<Lhs,Rhs,Options> SrcXprType;
+  static EIGEN_STRONG_INLINE
+  void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar,Scalar> &)
+  {
+    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
+    // FIXME shall we handle nested_eval here?
+    generic_product_impl<Lhs, Rhs>::addTo(dst, src.lhs(), src.rhs());
+  }
+};
+
+// Dense -= Product
+template< typename DstXprType, typename Lhs, typename Rhs, int Options, typename Scalar>
+struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::sub_assign_op<Scalar,Scalar>, Dense2Dense,
+  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
+{
+  typedef Product<Lhs,Rhs,Options> SrcXprType;
+  static EIGEN_STRONG_INLINE
+  void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar,Scalar> &)
+  {
+    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
+    // FIXME shall we handle nested_eval here?
+    generic_product_impl<Lhs, Rhs>::subTo(dst, src.lhs(), src.rhs());
+  }
+};
+
+
+// Dense ?= scalar * Product
+// TODO we should apply that rule if that's really helpful
+// for instance, this is not good for inner products
+template< typename DstXprType, typename Lhs, typename Rhs, typename AssignFunc, typename Scalar, typename ScalarBis, typename Plain>
+struct Assignment<DstXprType, CwiseBinaryOp<internal::scalar_product_op<ScalarBis,Scalar>, const CwiseNullaryOp<internal::scalar_constant_op<ScalarBis>,Plain>,
+                                           const Product<Lhs,Rhs,DefaultProduct> >, AssignFunc, Dense2Dense>
+{
+  typedef CwiseBinaryOp<internal::scalar_product_op<ScalarBis,Scalar>,
+                        const CwiseNullaryOp<internal::scalar_constant_op<ScalarBis>,Plain>,
+                        const Product<Lhs,Rhs,DefaultProduct> > SrcXprType;
+  static EIGEN_STRONG_INLINE
+  void run(DstXprType &dst, const SrcXprType &src, const AssignFunc& func)
+  {
+    call_assignment_no_alias(dst, (src.lhs().functor().m_other * src.rhs().lhs())*src.rhs().rhs(), func);
+  }
+};
+
+//----------------------------------------
+// Catch "Dense ?= xpr + Product<>" expression to save one temporary
+// FIXME we could probably enable these rules for any product, i.e., not only Dense and DefaultProduct
+
+template<typename OtherXpr, typename Lhs, typename Rhs>
+struct evaluator_assume_aliasing<CwiseBinaryOp<internal::scalar_sum_op<typename OtherXpr::Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, const OtherXpr,
+                                               const Product<Lhs,Rhs,DefaultProduct> >, DenseShape > {
+  static const bool value = true;
+};
+
+template<typename DstXprType, typename OtherXpr, typename ProductType, typename Func1, typename Func2>
+struct assignment_from_xpr_op_product
+{
+  template<typename SrcXprType, typename InitialFunc>
+  static EIGEN_STRONG_INLINE
+  void run(DstXprType &dst, const SrcXprType &src, const InitialFunc& /*func*/)
+  {
+    call_assignment_no_alias(dst, src.lhs(), Func1());
+    call_assignment_no_alias(dst, src.rhs(), Func2());
+  }
+};
+
+#define EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(ASSIGN_OP,BINOP,ASSIGN_OP2) \
+  template< typename DstXprType, typename OtherXpr, typename Lhs, typename Rhs, typename DstScalar, typename SrcScalar, typename OtherScalar,typename ProdScalar> \
+  struct Assignment<DstXprType, CwiseBinaryOp<internal::BINOP<OtherScalar,ProdScalar>, const OtherXpr, \
+                                            const Product<Lhs,Rhs,DefaultProduct> >, internal::ASSIGN_OP<DstScalar,SrcScalar>, Dense2Dense> \
+    : assignment_from_xpr_op_product<DstXprType, OtherXpr, Product<Lhs,Rhs,DefaultProduct>, internal::ASSIGN_OP<DstScalar,OtherScalar>, internal::ASSIGN_OP2<DstScalar,ProdScalar> > \
+  {}
+
+EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(assign_op,    scalar_sum_op,add_assign_op);
+EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(add_assign_op,scalar_sum_op,add_assign_op);
+EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(sub_assign_op,scalar_sum_op,sub_assign_op);
+
+EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(assign_op,    scalar_difference_op,sub_assign_op);
+EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(add_assign_op,scalar_difference_op,sub_assign_op);
+EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(sub_assign_op,scalar_difference_op,add_assign_op);
+
+//----------------------------------------
+
+template<typename Lhs, typename Rhs>
+struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,InnerProduct>
+{
+  template<typename Dst>
+  static inline void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    dst.coeffRef(0,0) = (lhs.transpose().cwiseProduct(rhs)).sum();
+  }
+  
+  template<typename Dst>
+  static inline void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    dst.coeffRef(0,0) += (lhs.transpose().cwiseProduct(rhs)).sum();
+  }
+  
+  template<typename Dst>
+  static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  { dst.coeffRef(0,0) -= (lhs.transpose().cwiseProduct(rhs)).sum(); }
+};
+
+
+/***********************************************************************
+*  Implementation of outer dense * dense vector product
+***********************************************************************/
+
+// Column major result
+template<typename Dst, typename Lhs, typename Rhs, typename Func>
+void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&)
+{
+  evaluator<Rhs> rhsEval(rhs);
+  typename nested_eval<Lhs,Rhs::SizeAtCompileTime>::type actual_lhs(lhs);
+  // FIXME if cols is large enough, then it might be useful to make sure that lhs is sequentially stored
+  // FIXME not very good if rhs is real and lhs complex while alpha is real too
+  const Index cols = dst.cols();
+  for (Index j=0; j<cols; ++j)
+    func(dst.col(j), rhsEval.coeff(Index(0),j) * actual_lhs);
+}
+
+// Row major result
+template<typename Dst, typename Lhs, typename Rhs, typename Func>
+void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&)
+{
+  evaluator<Lhs> lhsEval(lhs);
+  typename nested_eval<Rhs,Lhs::SizeAtCompileTime>::type actual_rhs(rhs);
+  // FIXME if rows is large enough, then it might be useful to make sure that rhs is sequentially stored
+  // FIXME not very good if lhs is real and rhs complex while alpha is real too
+  const Index rows = dst.rows();
+  for (Index i=0; i<rows; ++i)
+    func(dst.row(i), lhsEval.coeff(i,Index(0)) * actual_rhs);
+}
+
+template<typename Lhs, typename Rhs>
+struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,OuterProduct>
+{
+  template<typename T> struct is_row_major : internal::conditional<(int(T::Flags)&RowMajorBit), internal::true_type, internal::false_type>::type {};
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+  
+  // TODO it would be nice to be able to exploit our *_assign_op functors for that purpose
+  struct set  { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived()  = src; } };
+  struct add  { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } };
+  struct sub  { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } };
+  struct adds {
+    Scalar m_scale;
+    explicit adds(const Scalar& s) : m_scale(s) {}
+    template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const {
+      dst.const_cast_derived() += m_scale * src;
+    }
+  };
+  
+  template<typename Dst>
+  static inline void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    internal::outer_product_selector_run(dst, lhs, rhs, set(), is_row_major<Dst>());
+  }
+  
+  template<typename Dst>
+  static inline void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    internal::outer_product_selector_run(dst, lhs, rhs, add(), is_row_major<Dst>());
+  }
+  
+  template<typename Dst>
+  static inline void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    internal::outer_product_selector_run(dst, lhs, rhs, sub(), is_row_major<Dst>());
+  }
+  
+  template<typename Dst>
+  static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  {
+    internal::outer_product_selector_run(dst, lhs, rhs, adds(alpha), is_row_major<Dst>());
+  }
+  
+};
+
+
+// This base class provides default implementations for evalTo, addTo, subTo, in terms of scaleAndAddTo
+template<typename Lhs, typename Rhs, typename Derived>
+struct generic_product_impl_base
+{
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+  
+  template<typename Dst>
+  static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  { dst.setZero(); scaleAndAddTo(dst, lhs, rhs, Scalar(1)); }
+
+  template<typename Dst>
+  static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  { scaleAndAddTo(dst,lhs, rhs, Scalar(1)); }
+
+  template<typename Dst>
+  static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  { scaleAndAddTo(dst, lhs, rhs, Scalar(-1)); }
+  
+  template<typename Dst>
+  static EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  { Derived::scaleAndAddTo(dst,lhs,rhs,alpha); }
+
+};
+
+template<typename Lhs, typename Rhs>
+struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemvProduct>
+  : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemvProduct> >
+{
+  typedef typename nested_eval<Lhs,1>::type LhsNested;
+  typedef typename nested_eval<Rhs,1>::type RhsNested;
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+  enum { Side = Lhs::IsVectorAtCompileTime ? OnTheLeft : OnTheRight };
+  typedef typename internal::remove_all<typename internal::conditional<int(Side)==OnTheRight,LhsNested,RhsNested>::type>::type MatrixType;
+
+  template<typename Dest>
+  static EIGEN_STRONG_INLINE void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  {
+    LhsNested actual_lhs(lhs);
+    RhsNested actual_rhs(rhs);
+    internal::gemv_dense_selector<Side,
+                            (int(MatrixType::Flags)&RowMajorBit) ? RowMajor : ColMajor,
+                            bool(internal::blas_traits<MatrixType>::HasUsableDirectAccess)
+                           >::run(actual_lhs, actual_rhs, dst, alpha);
+  }
+};
+
+template<typename Lhs, typename Rhs>
+struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode> 
+{
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+  
+  template<typename Dst>
+  static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    // Same as: dst.noalias() = lhs.lazyProduct(rhs);
+    // but easier on the compiler side
+    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::assign_op<typename Dst::Scalar,Scalar>());
+  }
+  
+  template<typename Dst>
+  static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    // dst.noalias() += lhs.lazyProduct(rhs);
+    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::add_assign_op<typename Dst::Scalar,Scalar>());
+  }
+  
+  template<typename Dst>
+  static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    // dst.noalias() -= lhs.lazyProduct(rhs);
+    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op<typename Dst::Scalar,Scalar>());
+  }
+  
+//   template<typename Dst>
+//   static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+//   { dst.noalias() += alpha * lhs.lazyProduct(rhs); }
+};
+
+// This specialization enforces the use of a coefficient-based evaluation strategy
+template<typename Lhs, typename Rhs>
+struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,LazyCoeffBasedProductMode>
+  : generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode> {};
+
+// Case 2: Evaluate coeff by coeff
+//
+// This is mostly taken from CoeffBasedProduct.h
+// The main difference is that we add an extra argument to the etor_product_*_impl::run() function
+// for the inner dimension of the product, because evaluator object do not know their size.
+
+template<int Traversal, int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
+struct etor_product_coeff_impl;
+
+template<int StorageOrder, int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl;
+
+template<typename Lhs, typename Rhs, int ProductTag>
+struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape, DenseShape>
+    : evaluator_base<Product<Lhs, Rhs, LazyProduct> >
+{
+  typedef Product<Lhs, Rhs, LazyProduct> XprType;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit product_evaluator(const XprType& xpr)
+    : m_lhs(xpr.lhs()),
+      m_rhs(xpr.rhs()),
+      m_lhsImpl(m_lhs),     // FIXME the creation of the evaluator objects should result in a no-op, but check that!
+      m_rhsImpl(m_rhs),     //       Moreover, they are only useful for the packet path, so we could completely disable them when not needed,
+                            //       or perhaps declare them on the fly on the packet method... We have experiment to check what's best.
+      m_innerDim(xpr.lhs().cols())
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits<Scalar>::MulCost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits<Scalar>::AddCost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+#if 0
+    std::cerr << "LhsOuterStrideBytes=  " << LhsOuterStrideBytes << "\n";
+    std::cerr << "RhsOuterStrideBytes=  " << RhsOuterStrideBytes << "\n";
+    std::cerr << "LhsAlignment=         " << LhsAlignment << "\n";
+    std::cerr << "RhsAlignment=         " << RhsAlignment << "\n";
+    std::cerr << "CanVectorizeLhs=      " << CanVectorizeLhs << "\n";
+    std::cerr << "CanVectorizeRhs=      " << CanVectorizeRhs << "\n";
+    std::cerr << "CanVectorizeInner=    " << CanVectorizeInner << "\n";
+    std::cerr << "EvalToRowMajor=       " << EvalToRowMajor << "\n";
+    std::cerr << "Alignment=            " << Alignment << "\n";
+    std::cerr << "Flags=                " << Flags << "\n";
+#endif
+  }
+
+  // Everything below here is taken from CoeffBasedProduct.h
+
+  typedef typename internal::nested_eval<Lhs,Rhs::ColsAtCompileTime>::type LhsNested;
+  typedef typename internal::nested_eval<Rhs,Lhs::RowsAtCompileTime>::type RhsNested;
+  
+  typedef typename internal::remove_all<LhsNested>::type LhsNestedCleaned;
+  typedef typename internal::remove_all<RhsNested>::type RhsNestedCleaned;
+
+  typedef evaluator<LhsNestedCleaned> LhsEtorType;
+  typedef evaluator<RhsNestedCleaned> RhsEtorType;
+
+  enum {
+    RowsAtCompileTime = LhsNestedCleaned::RowsAtCompileTime,
+    ColsAtCompileTime = RhsNestedCleaned::ColsAtCompileTime,
+    InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(LhsNestedCleaned::ColsAtCompileTime, RhsNestedCleaned::RowsAtCompileTime),
+    MaxRowsAtCompileTime = LhsNestedCleaned::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = RhsNestedCleaned::MaxColsAtCompileTime
+  };
+
+  typedef typename find_best_packet<Scalar,RowsAtCompileTime>::type LhsVecPacketType;
+  typedef typename find_best_packet<Scalar,ColsAtCompileTime>::type RhsVecPacketType;
+
+  enum {
+      
+    LhsCoeffReadCost = LhsEtorType::CoeffReadCost,
+    RhsCoeffReadCost = RhsEtorType::CoeffReadCost,
+    CoeffReadCost = InnerSize==0 ? NumTraits<Scalar>::ReadCost
+                  : InnerSize == Dynamic ? HugeCost
+                  : InnerSize * (NumTraits<Scalar>::MulCost + LhsCoeffReadCost + RhsCoeffReadCost)
+                    + (InnerSize - 1) * NumTraits<Scalar>::AddCost,
+
+    Unroll = CoeffReadCost <= EIGEN_UNROLLING_LIMIT,
+    
+    LhsFlags = LhsEtorType::Flags,
+    RhsFlags = RhsEtorType::Flags,
+    
+    LhsRowMajor = LhsFlags & RowMajorBit,
+    RhsRowMajor = RhsFlags & RowMajorBit,
+
+    LhsVecPacketSize = unpacket_traits<LhsVecPacketType>::size,
+    RhsVecPacketSize = unpacket_traits<RhsVecPacketType>::size,
+
+    // Here, we don't care about alignment larger than the usable packet size.
+    LhsAlignment = EIGEN_PLAIN_ENUM_MIN(LhsEtorType::Alignment,LhsVecPacketSize*int(sizeof(typename LhsNestedCleaned::Scalar))),
+    RhsAlignment = EIGEN_PLAIN_ENUM_MIN(RhsEtorType::Alignment,RhsVecPacketSize*int(sizeof(typename RhsNestedCleaned::Scalar))),
+      
+    SameType = is_same<typename LhsNestedCleaned::Scalar,typename RhsNestedCleaned::Scalar>::value,
+
+    CanVectorizeRhs = bool(RhsRowMajor) && (RhsFlags & PacketAccessBit) && (ColsAtCompileTime!=1),
+    CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit) && (RowsAtCompileTime!=1),
+
+    EvalToRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1
+                    : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0
+                    : (bool(RhsRowMajor) && !CanVectorizeLhs),
+
+    Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & ~RowMajorBit)
+          | (EvalToRowMajor ? RowMajorBit : 0)
+          // TODO enable vectorization for mixed types
+          | (SameType && (CanVectorizeLhs || CanVectorizeRhs) ? PacketAccessBit : 0)
+          | (XprType::IsVectorAtCompileTime ? LinearAccessBit : 0),
+          
+    LhsOuterStrideBytes = int(LhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename LhsNestedCleaned::Scalar)),
+    RhsOuterStrideBytes = int(RhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename RhsNestedCleaned::Scalar)),
+
+    Alignment = bool(CanVectorizeLhs) ? (LhsOuterStrideBytes<=0 || (int(LhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,LhsAlignment))!=0 ? 0 : LhsAlignment)
+              : bool(CanVectorizeRhs) ? (RhsOuterStrideBytes<=0 || (int(RhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,RhsAlignment))!=0 ? 0 : RhsAlignment)
+              : 0,
+
+    /* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside
+     * of Product. If the Product itself is not a packet-access expression, there is still a chance that the inner
+     * loop of the product might be vectorized. This is the meaning of CanVectorizeInner. Since it doesn't affect
+     * the Flags, it is safe to make this value depend on ActualPacketAccessBit, that doesn't affect the ABI.
+     */
+    CanVectorizeInner =    SameType
+                        && LhsRowMajor
+                        && (!RhsRowMajor)
+                        && (LhsFlags & RhsFlags & ActualPacketAccessBit)
+                        && (InnerSize % packet_traits<Scalar>::size == 0)
+  };
+  
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index row, Index col) const
+  {
+    return (m_lhs.row(row).transpose().cwiseProduct( m_rhs.col(col) )).sum();
+  }
+
+  /* Allow index-based non-packet access. It is impossible though to allow index-based packed access,
+   * which is why we don't set the LinearAccessBit.
+   * TODO: this seems possible when the result is a vector
+   */
+  EIGEN_DEVICE_FUNC const CoeffReturnType coeff(Index index) const
+  {
+    const Index row = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? 0 : index;
+    const Index col = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? index : 0;
+    return (m_lhs.row(row).transpose().cwiseProduct( m_rhs.col(col) )).sum();
+  }
+
+  template<int LoadMode, typename PacketType>
+  const PacketType packet(Index row, Index col) const
+  {
+    PacketType res;
+    typedef etor_product_packet_impl<bool(int(Flags)&RowMajorBit) ? RowMajor : ColMajor,
+                                     Unroll ? int(InnerSize) : Dynamic,
+                                     LhsEtorType, RhsEtorType, PacketType, LoadMode> PacketImpl;
+    PacketImpl::run(row, col, m_lhsImpl, m_rhsImpl, m_innerDim, res);
+    return res;
+  }
+
+  template<int LoadMode, typename PacketType>
+  const PacketType packet(Index index) const
+  {
+    const Index row = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? 0 : index;
+    const Index col = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? index : 0;
+    return packet<LoadMode,PacketType>(row,col);
+  }
+
+protected:
+  typename internal::add_const_on_value_type<LhsNested>::type m_lhs;
+  typename internal::add_const_on_value_type<RhsNested>::type m_rhs;
+  
+  LhsEtorType m_lhsImpl;
+  RhsEtorType m_rhsImpl;
+
+  // TODO: Get rid of m_innerDim if known at compile time
+  Index m_innerDim;
+};
+
+template<typename Lhs, typename Rhs>
+struct product_evaluator<Product<Lhs, Rhs, DefaultProduct>, LazyCoeffBasedProductMode, DenseShape, DenseShape>
+  : product_evaluator<Product<Lhs, Rhs, LazyProduct>, CoeffBasedProductMode, DenseShape, DenseShape>
+{
+  typedef Product<Lhs, Rhs, DefaultProduct> XprType;
+  typedef Product<Lhs, Rhs, LazyProduct> BaseProduct;
+  typedef product_evaluator<BaseProduct, CoeffBasedProductMode, DenseShape, DenseShape> Base;
+  enum {
+    Flags = Base::Flags | EvalBeforeNestingBit
+  };
+  EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
+    : Base(BaseProduct(xpr.lhs(),xpr.rhs()))
+  {}
+};
+
+/****************************************
+*** Coeff based product, Packet path  ***
+****************************************/
+
+template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode>
+{
+  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
+  {
+    etor_product_packet_impl<RowMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res);
+    res =  pmadd(pset1<Packet>(lhs.coeff(row, Index(UnrollingIndex-1))), rhs.template packet<LoadMode,Packet>(Index(UnrollingIndex-1), col), res);
+  }
+};
+
+template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode>
+{
+  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
+  {
+    etor_product_packet_impl<ColMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res);
+    res =  pmadd(lhs.template packet<LoadMode,Packet>(row, Index(UnrollingIndex-1)), pset1<Packet>(rhs.coeff(Index(UnrollingIndex-1), col)), res);
+  }
+};
+
+template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<RowMajor, 1, Lhs, Rhs, Packet, LoadMode>
+{
+  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
+  {
+    res = pmul(pset1<Packet>(lhs.coeff(row, Index(0))),rhs.template packet<LoadMode,Packet>(Index(0), col));
+  }
+};
+
+template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<ColMajor, 1, Lhs, Rhs, Packet, LoadMode>
+{
+  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
+  {
+    res = pmul(lhs.template packet<LoadMode,Packet>(row, Index(0)), pset1<Packet>(rhs.coeff(Index(0), col)));
+  }
+};
+
+template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode>
+{
+  static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res)
+  {
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
+  }
+};
+
+template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode>
+{
+  static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res)
+  {
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
+  }
+};
+
+template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
+{
+  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
+  {
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
+    for(Index i = 0; i < innerDim; ++i)
+      res =  pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode,Packet>(i, col), res);
+  }
+};
+
+template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
+{
+  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
+  {
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
+    for(Index i = 0; i < innerDim; ++i)
+      res =  pmadd(lhs.template packet<LoadMode,Packet>(row, i), pset1<Packet>(rhs.coeff(i, col)), res);
+  }
+};
+
+
+/***************************************************************************
+* Triangular products
+***************************************************************************/
+template<int Mode, bool LhsIsTriangular,
+         typename Lhs, bool LhsIsVector,
+         typename Rhs, bool RhsIsVector>
+struct triangular_product_impl;
+
+template<typename Lhs, typename Rhs, int ProductTag>
+struct generic_product_impl<Lhs,Rhs,TriangularShape,DenseShape,ProductTag>
+  : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,TriangularShape,DenseShape,ProductTag> >
+{
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+  
+  template<typename Dest>
+  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  {
+    triangular_product_impl<Lhs::Mode,true,typename Lhs::MatrixType,false,Rhs, Rhs::ColsAtCompileTime==1>
+        ::run(dst, lhs.nestedExpression(), rhs, alpha);
+  }
+};
+
+template<typename Lhs, typename Rhs, int ProductTag>
+struct generic_product_impl<Lhs,Rhs,DenseShape,TriangularShape,ProductTag>
+: generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,TriangularShape,ProductTag> >
+{
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+  
+  template<typename Dest>
+  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  {
+    triangular_product_impl<Rhs::Mode,false,Lhs,Lhs::RowsAtCompileTime==1, typename Rhs::MatrixType, false>::run(dst, lhs, rhs.nestedExpression(), alpha);
+  }
+};
+
+
+/***************************************************************************
+* SelfAdjoint products
+***************************************************************************/
+template <typename Lhs, int LhsMode, bool LhsIsVector,
+          typename Rhs, int RhsMode, bool RhsIsVector>
+struct selfadjoint_product_impl;
+
+template<typename Lhs, typename Rhs, int ProductTag>
+struct generic_product_impl<Lhs,Rhs,SelfAdjointShape,DenseShape,ProductTag>
+  : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,SelfAdjointShape,DenseShape,ProductTag> >
+{
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+  
+  template<typename Dest>
+  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  {
+    selfadjoint_product_impl<typename Lhs::MatrixType,Lhs::Mode,false,Rhs,0,Rhs::IsVectorAtCompileTime>::run(dst, lhs.nestedExpression(), rhs, alpha);
+  }
+};
+
+template<typename Lhs, typename Rhs, int ProductTag>
+struct generic_product_impl<Lhs,Rhs,DenseShape,SelfAdjointShape,ProductTag>
+: generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,SelfAdjointShape,ProductTag> >
+{
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+  
+  template<typename Dest>
+  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  {
+    selfadjoint_product_impl<Lhs,0,Lhs::IsVectorAtCompileTime,typename Rhs::MatrixType,Rhs::Mode,false>::run(dst, lhs, rhs.nestedExpression(), alpha);
+  }
+};
+
+
+/***************************************************************************
+* Diagonal products
+***************************************************************************/
+  
+template<typename MatrixType, typename DiagonalType, typename Derived, int ProductOrder>
+struct diagonal_product_evaluator_base
+  : evaluator_base<Derived>
+{
+   typedef typename ScalarBinaryOpTraits<typename MatrixType::Scalar, typename DiagonalType::Scalar>::ReturnType Scalar;
+public:
+  enum {
+    CoeffReadCost = NumTraits<Scalar>::MulCost + evaluator<MatrixType>::CoeffReadCost + evaluator<DiagonalType>::CoeffReadCost,
+    
+    MatrixFlags = evaluator<MatrixType>::Flags,
+    DiagFlags = evaluator<DiagonalType>::Flags,
+    _StorageOrder = MatrixFlags & RowMajorBit ? RowMajor : ColMajor,
+    _ScalarAccessOnDiag =  !((int(_StorageOrder) == ColMajor && int(ProductOrder) == OnTheLeft)
+                           ||(int(_StorageOrder) == RowMajor && int(ProductOrder) == OnTheRight)),
+    _SameTypes = is_same<typename MatrixType::Scalar, typename DiagonalType::Scalar>::value,
+    // FIXME currently we need same types, but in the future the next rule should be the one
+    //_Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && ((!_PacketOnDiag) || (_SameTypes && bool(int(DiagFlags)&PacketAccessBit))),
+    _Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))),
+    _LinearAccessMask = (MatrixType::RowsAtCompileTime==1 || MatrixType::ColsAtCompileTime==1) ? LinearAccessBit : 0,
+    Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? PacketAccessBit : 0),
+    Alignment = evaluator<MatrixType>::Alignment
+  };
+  
+  diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag)
+    : m_diagImpl(diag), m_matImpl(mat)
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits<Scalar>::MulCost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+  
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const
+  {
+    return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx);
+  }
+  
+protected:
+  template<int LoadMode,typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet_impl(Index row, Index col, Index id, internal::true_type) const
+  {
+    return internal::pmul(m_matImpl.template packet<LoadMode,PacketType>(row, col),
+                          internal::pset1<PacketType>(m_diagImpl.coeff(id)));
+  }
+  
+  template<int LoadMode,typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet_impl(Index row, Index col, Index id, internal::false_type) const
+  {
+    enum {
+      InnerSize = (MatrixType::Flags & RowMajorBit) ? MatrixType::ColsAtCompileTime : MatrixType::RowsAtCompileTime,
+      DiagonalPacketLoadMode = EIGEN_PLAIN_ENUM_MIN(LoadMode,((InnerSize%16) == 0) ? int(Aligned16) : int(evaluator<DiagonalType>::Alignment)) // FIXME hardcoded 16!!
+    };
+    return internal::pmul(m_matImpl.template packet<LoadMode,PacketType>(row, col),
+                          m_diagImpl.template packet<DiagonalPacketLoadMode,PacketType>(id));
+  }
+  
+  evaluator<DiagonalType> m_diagImpl;
+  evaluator<MatrixType>   m_matImpl;
+};
+
+// diagonal * dense
+template<typename Lhs, typename Rhs, int ProductKind, int ProductTag>
+struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalShape, DenseShape>
+  : diagonal_product_evaluator_base<Rhs, typename Lhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct>, OnTheLeft>
+{
+  typedef diagonal_product_evaluator_base<Rhs, typename Lhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct>, OnTheLeft> Base;
+  using Base::m_diagImpl;
+  using Base::m_matImpl;
+  using Base::coeff;
+  typedef typename Base::Scalar Scalar;
+  
+  typedef Product<Lhs, Rhs, ProductKind> XprType;
+  typedef typename XprType::PlainObject PlainObject;
+  
+  enum {
+    StorageOrder = int(Rhs::Flags) & RowMajorBit ? RowMajor : ColMajor
+  };
+
+  EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
+    : Base(xpr.rhs(), xpr.lhs().diagonal())
+  {
+  }
+  
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const
+  {
+    return m_diagImpl.coeff(row) * m_matImpl.coeff(row, col);
+  }
+  
+#ifndef __CUDACC__
+  template<int LoadMode,typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
+  {
+    // FIXME: NVCC used to complain about the template keyword, but we have to check whether this is still the case.
+    // See also similar calls below.
+    return this->template packet_impl<LoadMode,PacketType>(row,col, row,
+                                 typename internal::conditional<int(StorageOrder)==RowMajor, internal::true_type, internal::false_type>::type());
+  }
+  
+  template<int LoadMode,typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet(Index idx) const
+  {
+    return packet<LoadMode,PacketType>(int(StorageOrder)==ColMajor?idx:0,int(StorageOrder)==ColMajor?0:idx);
+  }
+#endif
+};
+
+// dense * diagonal
+template<typename Lhs, typename Rhs, int ProductKind, int ProductTag>
+struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape, DiagonalShape>
+  : diagonal_product_evaluator_base<Lhs, typename Rhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct>, OnTheRight>
+{
+  typedef diagonal_product_evaluator_base<Lhs, typename Rhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct>, OnTheRight> Base;
+  using Base::m_diagImpl;
+  using Base::m_matImpl;
+  using Base::coeff;
+  typedef typename Base::Scalar Scalar;
+  
+  typedef Product<Lhs, Rhs, ProductKind> XprType;
+  typedef typename XprType::PlainObject PlainObject;
+  
+  enum { StorageOrder = int(Lhs::Flags) & RowMajorBit ? RowMajor : ColMajor };
+
+  EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
+    : Base(xpr.lhs(), xpr.rhs().diagonal())
+  {
+  }
+  
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const
+  {
+    return m_matImpl.coeff(row, col) * m_diagImpl.coeff(col);
+  }
+  
+#ifndef __CUDACC__
+  template<int LoadMode,typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
+  {
+    return this->template packet_impl<LoadMode,PacketType>(row,col, col,
+                                 typename internal::conditional<int(StorageOrder)==ColMajor, internal::true_type, internal::false_type>::type());
+  }
+  
+  template<int LoadMode,typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet(Index idx) const
+  {
+    return packet<LoadMode,PacketType>(int(StorageOrder)==ColMajor?idx:0,int(StorageOrder)==ColMajor?0:idx);
+  }
+#endif
+};
+
+/***************************************************************************
+* Products with permutation matrices
+***************************************************************************/
+
+/** \internal
+  * \class permutation_matrix_product
+  * Internal helper class implementing the product between a permutation matrix and a matrix.
+  * This class is specialized for DenseShape below and for SparseShape in SparseCore/SparsePermutation.h
+  */
+template<typename ExpressionType, int Side, bool Transposed, typename ExpressionShape>
+struct permutation_matrix_product;
+
+template<typename ExpressionType, int Side, bool Transposed>
+struct permutation_matrix_product<ExpressionType, Side, Transposed, DenseShape>
+{
+    typedef typename nested_eval<ExpressionType, 1>::type MatrixType;
+    typedef typename remove_all<MatrixType>::type MatrixTypeCleaned;
+
+    template<typename Dest, typename PermutationType>
+    static inline void run(Dest& dst, const PermutationType& perm, const ExpressionType& xpr)
+    {
+      MatrixType mat(xpr);
+      const Index n = Side==OnTheLeft ? mat.rows() : mat.cols();
+      // FIXME we need an is_same for expression that is not sensitive to constness. For instance
+      // is_same_xpr<Block<const Matrix>, Block<Matrix> >::value should be true.
+      //if(is_same<MatrixTypeCleaned,Dest>::value && extract_data(dst) == extract_data(mat))
+      if(is_same_dense(dst, mat))
+      {
+        // apply the permutation inplace
+        Matrix<bool,PermutationType::RowsAtCompileTime,1,0,PermutationType::MaxRowsAtCompileTime> mask(perm.size());
+        mask.fill(false);
+        Index r = 0;
+        while(r < perm.size())
+        {
+          // search for the next seed
+          while(r<perm.size() && mask[r]) r++;
+          if(r>=perm.size())
+            break;
+          // we got one, let's follow it until we are back to the seed
+          Index k0 = r++;
+          Index kPrev = k0;
+          mask.coeffRef(k0) = true;
+          for(Index k=perm.indices().coeff(k0); k!=k0; k=perm.indices().coeff(k))
+          {
+                  Block<Dest, Side==OnTheLeft ? 1 : Dest::RowsAtCompileTime, Side==OnTheRight ? 1 : Dest::ColsAtCompileTime>(dst, k)
+            .swap(Block<Dest, Side==OnTheLeft ? 1 : Dest::RowsAtCompileTime, Side==OnTheRight ? 1 : Dest::ColsAtCompileTime>
+                       (dst,((Side==OnTheLeft) ^ Transposed) ? k0 : kPrev));
+
+            mask.coeffRef(k) = true;
+            kPrev = k;
+          }
+        }
+      }
+      else
+      {
+        for(Index i = 0; i < n; ++i)
+        {
+          Block<Dest, Side==OnTheLeft ? 1 : Dest::RowsAtCompileTime, Side==OnTheRight ? 1 : Dest::ColsAtCompileTime>
+               (dst, ((Side==OnTheLeft) ^ Transposed) ? perm.indices().coeff(i) : i)
+
+          =
+
+          Block<const MatrixTypeCleaned,Side==OnTheLeft ? 1 : MatrixTypeCleaned::RowsAtCompileTime,Side==OnTheRight ? 1 : MatrixTypeCleaned::ColsAtCompileTime>
+               (mat, ((Side==OnTheRight) ^ Transposed) ? perm.indices().coeff(i) : i);
+        }
+      }
+    }
+};
+
+template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Rhs, PermutationShape, MatrixShape, ProductTag>
+{
+  template<typename Dest>
+  static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    permutation_matrix_product<Rhs, OnTheLeft, false, MatrixShape>::run(dst, lhs, rhs);
+  }
+};
+
+template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Rhs, MatrixShape, PermutationShape, ProductTag>
+{
+  template<typename Dest>
+  static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    permutation_matrix_product<Lhs, OnTheRight, false, MatrixShape>::run(dst, rhs, lhs);
+  }
+};
+
+template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Inverse<Lhs>, Rhs, PermutationShape, MatrixShape, ProductTag>
+{
+  template<typename Dest>
+  static void evalTo(Dest& dst, const Inverse<Lhs>& lhs, const Rhs& rhs)
+  {
+    permutation_matrix_product<Rhs, OnTheLeft, true, MatrixShape>::run(dst, lhs.nestedExpression(), rhs);
+  }
+};
+
+template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Inverse<Rhs>, MatrixShape, PermutationShape, ProductTag>
+{
+  template<typename Dest>
+  static void evalTo(Dest& dst, const Lhs& lhs, const Inverse<Rhs>& rhs)
+  {
+    permutation_matrix_product<Lhs, OnTheRight, true, MatrixShape>::run(dst, rhs.nestedExpression(), lhs);
+  }
+};
+
+
+/***************************************************************************
+* Products with transpositions matrices
+***************************************************************************/
+
+// FIXME could we unify Transpositions and Permutation into a single "shape"??
+
+/** \internal
+  * \class transposition_matrix_product
+  * Internal helper class implementing the product between a permutation matrix and a matrix.
+  */
+template<typename ExpressionType, int Side, bool Transposed, typename ExpressionShape>
+struct transposition_matrix_product
+{
+  typedef typename nested_eval<ExpressionType, 1>::type MatrixType;
+  typedef typename remove_all<MatrixType>::type MatrixTypeCleaned;
+  
+  template<typename Dest, typename TranspositionType>
+  static inline void run(Dest& dst, const TranspositionType& tr, const ExpressionType& xpr)
+  {
+    MatrixType mat(xpr);
+    typedef typename TranspositionType::StorageIndex StorageIndex;
+    const Index size = tr.size();
+    StorageIndex j = 0;
+
+    if(!is_same_dense(dst,mat))
+      dst = mat;
+
+    for(Index k=(Transposed?size-1:0) ; Transposed?k>=0:k<size ; Transposed?--k:++k)
+      if(Index(j=tr.coeff(k))!=k)
+      {
+        if(Side==OnTheLeft)        dst.row(k).swap(dst.row(j));
+        else if(Side==OnTheRight)  dst.col(k).swap(dst.col(j));
+      }
+  }
+};
+
+template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Rhs, TranspositionsShape, MatrixShape, ProductTag>
+{
+  template<typename Dest>
+  static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    transposition_matrix_product<Rhs, OnTheLeft, false, MatrixShape>::run(dst, lhs, rhs);
+  }
+};
+
+template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Rhs, MatrixShape, TranspositionsShape, ProductTag>
+{
+  template<typename Dest>
+  static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    transposition_matrix_product<Lhs, OnTheRight, false, MatrixShape>::run(dst, rhs, lhs);
+  }
+};
+
+
+template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Transpose<Lhs>, Rhs, TranspositionsShape, MatrixShape, ProductTag>
+{
+  template<typename Dest>
+  static void evalTo(Dest& dst, const Transpose<Lhs>& lhs, const Rhs& rhs)
+  {
+    transposition_matrix_product<Rhs, OnTheLeft, true, MatrixShape>::run(dst, lhs.nestedExpression(), rhs);
+  }
+};
+
+template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Transpose<Rhs>, MatrixShape, TranspositionsShape, ProductTag>
+{
+  template<typename Dest>
+  static void evalTo(Dest& dst, const Lhs& lhs, const Transpose<Rhs>& rhs)
+  {
+    transposition_matrix_product<Lhs, OnTheRight, true, MatrixShape>::run(dst, rhs.nestedExpression(), lhs);
+  }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_PRODUCT_EVALUATORS_H
diff --git a/Eigen/src/Core/Random.h b/Eigen/src/Core/Random.h
index 480fea408..6faf789c7 100644
--- a/Eigen/src/Core/Random.h
+++ b/Eigen/src/Core/Random.h
@@ -16,8 +16,7 @@ namespace internal {
 
 template<typename Scalar> struct scalar_random_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_random_op)
-  template<typename Index>
-  inline const Scalar operator() (Index, Index = 0) const { return random<Scalar>(); }
+  inline const Scalar operator() () const { return random<Scalar>(); }
 };
 
 template<typename Scalar>
@@ -28,12 +27,18 @@ struct functor_traits<scalar_random_op<Scalar> >
 
 /** \returns a random matrix expression
   *
+  * Numbers are uniformly spread through their whole definition range for integer types,
+  * and in the [-1:1] range for floating point scalar types.
+  * 
   * The parameters \a rows and \a cols are the number of rows and of columns of
   * the returned matrix. Must be compatible with this MatrixBase type.
   *
+  * \not_reentrant
+  * 
   * This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
   * it is redundant to pass \a rows and \a cols as arguments, so Random() should be used
   * instead.
+  * 
   *
   * Example: \include MatrixBase_random_int_int.cpp
   * Output: \verbinclude MatrixBase_random_int_int.out
@@ -41,11 +46,13 @@ struct functor_traits<scalar_random_op<Scalar> >
   * This expression has the "evaluate before nesting" flag so that it will be evaluated into
   * a temporary matrix whenever it is nested in a larger expression. This prevents unexpected
   * behavior with expressions involving random matrices.
+  * 
+  * See DenseBase::NullaryExpr(Index, const CustomNullaryOp&) for an example using C++11 random generators.
   *
-  * \sa MatrixBase::setRandom(), MatrixBase::Random(Index), MatrixBase::Random()
+  * \sa DenseBase::setRandom(), DenseBase::Random(Index), DenseBase::Random()
   */
 template<typename Derived>
-inline const CwiseNullaryOp<internal::scalar_random_op<typename internal::traits<Derived>::Scalar>, Derived>
+inline const typename DenseBase<Derived>::RandomReturnType
 DenseBase<Derived>::Random(Index rows, Index cols)
 {
   return NullaryExpr(rows, cols, internal::scalar_random_op<Scalar>());
@@ -53,10 +60,14 @@ DenseBase<Derived>::Random(Index rows, Index cols)
 
 /** \returns a random vector expression
   *
+  * Numbers are uniformly spread through their whole definition range for integer types,
+  * and in the [-1:1] range for floating point scalar types.
+  *
   * The parameter \a size is the size of the returned vector.
   * Must be compatible with this MatrixBase type.
   *
   * \only_for_vectors
+  * \not_reentrant
   *
   * This variant is meant to be used for dynamic-size vector types. For fixed-size types,
   * it is redundant to pass \a size as argument, so Random() should be used
@@ -69,10 +80,10 @@ DenseBase<Derived>::Random(Index rows, Index cols)
   * a temporary vector whenever it is nested in a larger expression. This prevents unexpected
   * behavior with expressions involving random matrices.
   *
-  * \sa MatrixBase::setRandom(), MatrixBase::Random(Index,Index), MatrixBase::Random()
+  * \sa DenseBase::setRandom(), DenseBase::Random(Index,Index), DenseBase::Random()
   */
 template<typename Derived>
-inline const CwiseNullaryOp<internal::scalar_random_op<typename internal::traits<Derived>::Scalar>, Derived>
+inline const typename DenseBase<Derived>::RandomReturnType
 DenseBase<Derived>::Random(Index size)
 {
   return NullaryExpr(size, internal::scalar_random_op<Scalar>());
@@ -80,6 +91,9 @@ DenseBase<Derived>::Random(Index size)
 
 /** \returns a fixed-size random matrix or vector expression
   *
+  * Numbers are uniformly spread through their whole definition range for integer types,
+  * and in the [-1:1] range for floating point scalar types.
+  * 
   * This variant is only for fixed-size MatrixBase types. For dynamic-size types, you
   * need to use the variants taking size arguments.
   *
@@ -89,11 +103,13 @@ DenseBase<Derived>::Random(Index size)
   * This expression has the "evaluate before nesting" flag so that it will be evaluated into
   * a temporary matrix whenever it is nested in a larger expression. This prevents unexpected
   * behavior with expressions involving random matrices.
+  * 
+  * \not_reentrant
   *
-  * \sa MatrixBase::setRandom(), MatrixBase::Random(Index,Index), MatrixBase::Random(Index)
+  * \sa DenseBase::setRandom(), DenseBase::Random(Index,Index), DenseBase::Random(Index)
   */
 template<typename Derived>
-inline const CwiseNullaryOp<internal::scalar_random_op<typename internal::traits<Derived>::Scalar>, Derived>
+inline const typename DenseBase<Derived>::RandomReturnType
 DenseBase<Derived>::Random()
 {
   return NullaryExpr(RowsAtCompileTime, ColsAtCompileTime, internal::scalar_random_op<Scalar>());
@@ -101,6 +117,11 @@ DenseBase<Derived>::Random()
 
 /** Sets all coefficients in this expression to random values.
   *
+  * Numbers are uniformly spread through their whole definition range for integer types,
+  * and in the [-1:1] range for floating point scalar types.
+  * 
+  * \not_reentrant
+  * 
   * Example: \include MatrixBase_setRandom.cpp
   * Output: \verbinclude MatrixBase_setRandom.out
   *
@@ -114,12 +135,16 @@ inline Derived& DenseBase<Derived>::setRandom()
 
 /** Resizes to the given \a newSize, and sets all coefficients in this expression to random values.
   *
+  * Numbers are uniformly spread through their whole definition range for integer types,
+  * and in the [-1:1] range for floating point scalar types.
+  * 
   * \only_for_vectors
+  * \not_reentrant
   *
   * Example: \include Matrix_setRandom_int.cpp
   * Output: \verbinclude Matrix_setRandom_int.out
   *
-  * \sa MatrixBase::setRandom(), setRandom(Index,Index), class CwiseNullaryOp, MatrixBase::Random()
+  * \sa DenseBase::setRandom(), setRandom(Index,Index), class CwiseNullaryOp, DenseBase::Random()
   */
 template<typename Derived>
 EIGEN_STRONG_INLINE Derived&
@@ -131,19 +156,24 @@ PlainObjectBase<Derived>::setRandom(Index newSize)
 
 /** Resizes to the given size, and sets all coefficients in this expression to random values.
   *
-  * \param nbRows the new number of rows
-  * \param nbCols the new number of columns
+  * Numbers are uniformly spread through their whole definition range for integer types,
+  * and in the [-1:1] range for floating point scalar types.
+  *
+  * \not_reentrant
+  * 
+  * \param rows the new number of rows
+  * \param cols the new number of columns
   *
   * Example: \include Matrix_setRandom_int_int.cpp
   * Output: \verbinclude Matrix_setRandom_int_int.out
   *
-  * \sa MatrixBase::setRandom(), setRandom(Index), class CwiseNullaryOp, MatrixBase::Random()
+  * \sa DenseBase::setRandom(), setRandom(Index), class CwiseNullaryOp, DenseBase::Random()
   */
 template<typename Derived>
 EIGEN_STRONG_INLINE Derived&
-PlainObjectBase<Derived>::setRandom(Index nbRows, Index nbCols)
+PlainObjectBase<Derived>::setRandom(Index rows, Index cols)
 {
-  resize(nbRows, nbCols);
+  resize(rows, cols);
   return setRandom();
 }
 
diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h
index 50548fa9a..b6e8f8887 100644
--- a/Eigen/src/Core/Redux.h
+++ b/Eigen/src/Core/Redux.h
@@ -27,8 +27,9 @@ template<typename Func, typename Derived>
 struct redux_traits
 {
 public:
+    typedef typename find_best_packet<typename Derived::Scalar,Derived::SizeAtCompileTime>::type PacketType;
   enum {
-    PacketSize = packet_traits<typename Derived::Scalar>::size,
+    PacketSize = unpacket_traits<PacketType>::size,
     InnerMaxSize = int(Derived::IsRowMajor)
                  ? Derived::MaxColsAtCompileTime
                  : Derived::MaxRowsAtCompileTime
@@ -37,8 +38,8 @@ public:
   enum {
     MightVectorize = (int(Derived::Flags)&ActualPacketAccessBit)
                   && (functor_traits<Func>::PacketAccess),
-    MayLinearVectorize = MightVectorize && (int(Derived::Flags)&LinearAccessBit),
-    MaySliceVectorize  = MightVectorize && int(InnerMaxSize)>=3*PacketSize
+    MayLinearVectorize = bool(MightVectorize) && (int(Derived::Flags)&LinearAccessBit),
+    MaySliceVectorize  = bool(MightVectorize) && int(InnerMaxSize)>=3*PacketSize
   };
 
 public:
@@ -50,21 +51,34 @@ public:
 
 public:
   enum {
-    Cost = (  Derived::SizeAtCompileTime == Dynamic
-           || Derived::CoeffReadCost == Dynamic
-           || (Derived::SizeAtCompileTime!=1 && functor_traits<Func>::Cost == Dynamic)
-           ) ? Dynamic
-           : Derived::SizeAtCompileTime * Derived::CoeffReadCost
-               + (Derived::SizeAtCompileTime-1) * functor_traits<Func>::Cost,
+    Cost = Derived::SizeAtCompileTime == Dynamic ? HugeCost
+         : Derived::SizeAtCompileTime * Derived::CoeffReadCost + (Derived::SizeAtCompileTime-1) * functor_traits<Func>::Cost,
     UnrollingLimit = EIGEN_UNROLLING_LIMIT * (int(Traversal) == int(DefaultTraversal) ? 1 : int(PacketSize))
   };
 
 public:
   enum {
-    Unrolling = Cost != Dynamic && Cost <= UnrollingLimit
-              ? CompleteUnrolling
-              : NoUnrolling
+    Unrolling = Cost <= UnrollingLimit ? CompleteUnrolling : NoUnrolling
   };
+  
+#ifdef EIGEN_DEBUG_ASSIGN
+  static void debug()
+  {
+    std::cerr << "Xpr: " << typeid(typename Derived::XprType).name() << std::endl;
+    std::cerr.setf(std::ios::hex, std::ios::basefield);
+    EIGEN_DEBUG_VAR(Derived::Flags)
+    std::cerr.unsetf(std::ios::hex);
+    EIGEN_DEBUG_VAR(InnerMaxSize)
+    EIGEN_DEBUG_VAR(PacketSize)
+    EIGEN_DEBUG_VAR(MightVectorize)
+    EIGEN_DEBUG_VAR(MayLinearVectorize)
+    EIGEN_DEBUG_VAR(MaySliceVectorize)
+    EIGEN_DEBUG_VAR(Traversal)
+    EIGEN_DEBUG_VAR(UnrollingLimit)
+    EIGEN_DEBUG_VAR(Unrolling)
+    std::cerr << std::endl;
+  }
+#endif
 };
 
 /***************************************************************************
@@ -82,6 +96,7 @@ struct redux_novec_unroller
 
   typedef typename Derived::Scalar Scalar;
 
+  EIGEN_DEVICE_FUNC
   static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
   {
     return func(redux_novec_unroller<Func, Derived, Start, HalfLength>::run(mat,func),
@@ -99,6 +114,7 @@ struct redux_novec_unroller<Func, Derived, Start, 1>
 
   typedef typename Derived::Scalar Scalar;
 
+  EIGEN_DEVICE_FUNC
   static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func&)
   {
     return mat.coeffByOuterInner(outer, inner);
@@ -112,6 +128,7 @@ template<typename Func, typename Derived, int Start>
 struct redux_novec_unroller<Func, Derived, Start, 0>
 {
   typedef typename Derived::Scalar Scalar;
+  EIGEN_DEVICE_FUNC 
   static EIGEN_STRONG_INLINE Scalar run(const Derived&, const Func&) { return Scalar(); }
 };
 
@@ -121,12 +138,12 @@ template<typename Func, typename Derived, int Start, int Length>
 struct redux_vec_unroller
 {
   enum {
-    PacketSize = packet_traits<typename Derived::Scalar>::size,
+    PacketSize = redux_traits<Func, Derived>::PacketSize,
     HalfLength = Length/2
   };
 
   typedef typename Derived::Scalar Scalar;
-  typedef typename packet_traits<Scalar>::type PacketScalar;
+  typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
 
   static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func& func)
   {
@@ -140,18 +157,18 @@ template<typename Func, typename Derived, int Start>
 struct redux_vec_unroller<Func, Derived, Start, 1>
 {
   enum {
-    index = Start * packet_traits<typename Derived::Scalar>::size,
+    index = Start * redux_traits<Func, Derived>::PacketSize,
     outer = index / int(Derived::InnerSizeAtCompileTime),
     inner = index % int(Derived::InnerSizeAtCompileTime),
-    alignment = (Derived::Flags & AlignedBit) ? Aligned : Unaligned
+    alignment = Derived::Alignment
   };
 
   typedef typename Derived::Scalar Scalar;
-  typedef typename packet_traits<Scalar>::type PacketScalar;
+  typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
 
   static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func&)
   {
-    return mat.template packetByOuterInner<alignment>(outer, inner);
+    return mat.template packetByOuterInner<alignment,PacketScalar>(outer, inner);
   }
 };
 
@@ -169,8 +186,8 @@ template<typename Func, typename Derived>
 struct redux_impl<Func, Derived, DefaultTraversal, NoUnrolling>
 {
   typedef typename Derived::Scalar Scalar;
-  typedef typename Derived::Index Index;
-  static EIGEN_STRONG_INLINE Scalar run(const Derived& mat, const Func& func)
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
   {
     eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
     Scalar res;
@@ -193,19 +210,19 @@ template<typename Func, typename Derived>
 struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
 {
   typedef typename Derived::Scalar Scalar;
-  typedef typename packet_traits<Scalar>::type PacketScalar;
-  typedef typename Derived::Index Index;
+  typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
 
-  static Scalar run(const Derived& mat, const Func& func)
+  static Scalar run(const Derived &mat, const Func& func)
   {
     const Index size = mat.size();
-    eigen_assert(size && "you are using an empty matrix");
-    const Index packetSize = packet_traits<Scalar>::size;
-    const Index alignedStart = internal::first_aligned(mat);
+    
+    const Index packetSize = redux_traits<Func, Derived>::PacketSize;
+    const int packetAlignment = unpacket_traits<PacketScalar>::alignment;
     enum {
-      alignment = bool(Derived::Flags & DirectAccessBit) || bool(Derived::Flags & AlignedBit)
-                ? Aligned : Unaligned
+      alignment0 = (bool(Derived::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned),
+      alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Derived::Alignment)
     };
+    const Index alignedStart = internal::first_default_aligned(mat.nestedExpression());
     const Index alignedSize2 = ((size-alignedStart)/(2*packetSize))*(2*packetSize);
     const Index alignedSize = ((size-alignedStart)/(packetSize))*(packetSize);
     const Index alignedEnd2 = alignedStart + alignedSize2;
@@ -213,19 +230,19 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
     Scalar res;
     if(alignedSize)
     {
-      PacketScalar packet_res0 = mat.template packet<alignment>(alignedStart);
+      PacketScalar packet_res0 = mat.template packet<alignment,PacketScalar>(alignedStart);
       if(alignedSize>packetSize) // we have at least two packets to partly unroll the loop
       {
-        PacketScalar packet_res1 = mat.template packet<alignment>(alignedStart+packetSize);
+        PacketScalar packet_res1 = mat.template packet<alignment,PacketScalar>(alignedStart+packetSize);
         for(Index index = alignedStart + 2*packetSize; index < alignedEnd2; index += 2*packetSize)
         {
-          packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment>(index));
-          packet_res1 = func.packetOp(packet_res1, mat.template packet<alignment>(index+packetSize));
+          packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment,PacketScalar>(index));
+          packet_res1 = func.packetOp(packet_res1, mat.template packet<alignment,PacketScalar>(index+packetSize));
         }
 
         packet_res0 = func.packetOp(packet_res0,packet_res1);
         if(alignedEnd>alignedEnd2)
-          packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment>(alignedEnd2));
+          packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment,PacketScalar>(alignedEnd2));
       }
       res = func.predux(packet_res0);
 
@@ -247,29 +264,29 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
   }
 };
 
-template<typename Func, typename Derived>
-struct redux_impl<Func, Derived, SliceVectorizedTraversal, NoUnrolling>
+// NOTE: for SliceVectorizedTraversal we simply bypass unrolling
+template<typename Func, typename Derived, int Unrolling>
+struct redux_impl<Func, Derived, SliceVectorizedTraversal, Unrolling>
 {
   typedef typename Derived::Scalar Scalar;
-  typedef typename packet_traits<Scalar>::type PacketScalar;
-  typedef typename Derived::Index Index;
+  typedef typename redux_traits<Func, Derived>::PacketType PacketType;
 
-  static Scalar run(const Derived& mat, const Func& func)
+  EIGEN_DEVICE_FUNC static Scalar run(const Derived &mat, const Func& func)
   {
     eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
     const Index innerSize = mat.innerSize();
     const Index outerSize = mat.outerSize();
     enum {
-      packetSize = packet_traits<Scalar>::size
+      packetSize = redux_traits<Func, Derived>::PacketSize
     };
     const Index packetedInnerSize = ((innerSize)/packetSize)*packetSize;
     Scalar res;
     if(packetedInnerSize)
     {
-      PacketScalar packet_res = mat.template packet<Unaligned>(0,0);
+      PacketType packet_res = mat.template packet<Unaligned,PacketType>(0,0);
       for(Index j=0; j<outerSize; ++j)
         for(Index i=(j==0?packetSize:0); i<packetedInnerSize; i+=Index(packetSize))
-          packet_res = func.packetOp(packet_res, mat.template packetByOuterInner<Unaligned>(j,i));
+          packet_res = func.packetOp(packet_res, mat.template packetByOuterInner<Unaligned,PacketType>(j,i));
 
       res = func.predux(packet_res);
       for(Index j=0; j<outerSize; ++j)
@@ -290,22 +307,90 @@ template<typename Func, typename Derived>
 struct redux_impl<Func, Derived, LinearVectorizedTraversal, CompleteUnrolling>
 {
   typedef typename Derived::Scalar Scalar;
-  typedef typename packet_traits<Scalar>::type PacketScalar;
+
+  typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
   enum {
-    PacketSize = packet_traits<Scalar>::size,
+    PacketSize = redux_traits<Func, Derived>::PacketSize,
     Size = Derived::SizeAtCompileTime,
     VectorizedSize = (Size / PacketSize) * PacketSize
   };
-  static EIGEN_STRONG_INLINE Scalar run(const Derived& mat, const Func& func)
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
   {
     eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
-    Scalar res = func.predux(redux_vec_unroller<Func, Derived, 0, Size / PacketSize>::run(mat,func));
-    if (VectorizedSize != Size)
-      res = func(res,redux_novec_unroller<Func, Derived, VectorizedSize, Size-VectorizedSize>::run(mat,func));
-    return res;
+    if (VectorizedSize > 0) {
+      Scalar res = func.predux(redux_vec_unroller<Func, Derived, 0, Size / PacketSize>::run(mat,func));
+      if (VectorizedSize != Size)
+        res = func(res,redux_novec_unroller<Func, Derived, VectorizedSize, Size-VectorizedSize>::run(mat,func));
+      return res;
+    }
+    else {
+      return redux_novec_unroller<Func, Derived, 0, Size>::run(mat,func);
+    }
   }
 };
 
+// evaluator adaptor
+template<typename _XprType>
+class redux_evaluator
+{
+public:
+  typedef _XprType XprType;
+  EIGEN_DEVICE_FUNC explicit redux_evaluator(const XprType &xpr) : m_evaluator(xpr), m_xpr(xpr) {}
+  
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename XprType::PacketScalar PacketScalar;
+  typedef typename XprType::PacketReturnType PacketReturnType;
+  
+  enum {
+    MaxRowsAtCompileTime = XprType::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = XprType::MaxColsAtCompileTime,
+    // TODO we should not remove DirectAccessBit and rather find an elegant way to query the alignment offset at runtime from the evaluator
+    Flags = evaluator<XprType>::Flags & ~DirectAccessBit,
+    IsRowMajor = XprType::IsRowMajor,
+    SizeAtCompileTime = XprType::SizeAtCompileTime,
+    InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime,
+    CoeffReadCost = evaluator<XprType>::CoeffReadCost,
+    Alignment = evaluator<XprType>::Alignment
+  };
+  
+  EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); }
+  EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); }
+  EIGEN_DEVICE_FUNC Index size() const { return m_xpr.size(); }
+  EIGEN_DEVICE_FUNC Index innerSize() const { return m_xpr.innerSize(); }
+  EIGEN_DEVICE_FUNC Index outerSize() const { return m_xpr.outerSize(); }
+
+  EIGEN_DEVICE_FUNC
+  CoeffReturnType coeff(Index row, Index col) const
+  { return m_evaluator.coeff(row, col); }
+
+  EIGEN_DEVICE_FUNC
+  CoeffReturnType coeff(Index index) const
+  { return m_evaluator.coeff(index); }
+
+  template<int LoadMode, typename PacketType>
+  PacketType packet(Index row, Index col) const
+  { return m_evaluator.template packet<LoadMode,PacketType>(row, col); }
+
+  template<int LoadMode, typename PacketType>
+  PacketType packet(Index index) const
+  { return m_evaluator.template packet<LoadMode,PacketType>(index); }
+  
+  EIGEN_DEVICE_FUNC
+  CoeffReturnType coeffByOuterInner(Index outer, Index inner) const
+  { return m_evaluator.coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
+  
+  template<int LoadMode, typename PacketType>
+  PacketType packetByOuterInner(Index outer, Index inner) const
+  { return m_evaluator.template packet<LoadMode,PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
+  
+  const XprType & nestedExpression() const { return m_xpr; }
+  
+protected:
+  internal::evaluator<XprType> m_evaluator;
+  const XprType &m_xpr;
+};
+
 } // end namespace internal
 
 /***************************************************************************
@@ -316,18 +401,21 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, CompleteUnrolling>
 /** \returns the result of a full redux operation on the whole matrix or vector using \a func
   *
   * The template parameter \a BinaryOp is the type of the functor \a func which must be
-  * an associative operator. Both current STL and TR1 functor styles are handled.
+  * an associative operator. Both current C++98 and C++11 functor styles are handled.
   *
   * \sa DenseBase::sum(), DenseBase::minCoeff(), DenseBase::maxCoeff(), MatrixBase::colwise(), MatrixBase::rowwise()
   */
 template<typename Derived>
 template<typename Func>
-EIGEN_STRONG_INLINE typename internal::result_of<Func(typename internal::traits<Derived>::Scalar)>::type
+typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::redux(const Func& func) const
 {
-  typedef typename internal::remove_all<typename Derived::Nested>::type ThisNested;
-  return internal::redux_impl<Func, ThisNested>
-            ::run(derived(), func);
+  eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
+
+  typedef typename internal::redux_evaluator<Derived> ThisEvaluator;
+  ThisEvaluator thisEval(derived());
+  
+  return internal::redux_impl<Func, ThisEvaluator>::run(thisEval, func);
 }
 
 /** \returns the minimum of all coefficients of \c *this.
@@ -337,7 +425,7 @@ template<typename Derived>
 EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::minCoeff() const
 {
-  return this->redux(Eigen::internal::scalar_min_op<Scalar>());
+  return derived().redux(Eigen::internal::scalar_min_op<Scalar,Scalar>());
 }
 
 /** \returns the maximum of all coefficients of \c *this.
@@ -347,10 +435,12 @@ template<typename Derived>
 EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::maxCoeff() const
 {
-  return this->redux(Eigen::internal::scalar_max_op<Scalar>());
+  return derived().redux(Eigen::internal::scalar_max_op<Scalar,Scalar>());
 }
 
-/** \returns the sum of all coefficients of *this
+/** \returns the sum of all coefficients of \c *this
+  *
+  * If \c *this is empty, then the value 0 is returned.
   *
   * \sa trace(), prod(), mean()
   */
@@ -360,7 +450,7 @@ DenseBase<Derived>::sum() const
 {
   if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0))
     return Scalar(0);
-  return this->redux(Eigen::internal::scalar_sum_op<Scalar>());
+  return derived().redux(Eigen::internal::scalar_sum_op<Scalar,Scalar>());
 }
 
 /** \returns the mean of all coefficients of *this
@@ -371,7 +461,14 @@ template<typename Derived>
 EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::mean() const
 {
-  return Scalar(this->redux(Eigen::internal::scalar_sum_op<Scalar>())) / Scalar(this->size());
+#ifdef __INTEL_COMPILER
+  #pragma warning push
+  #pragma warning ( disable : 2259 )
+#endif
+  return Scalar(derived().redux(Eigen::internal::scalar_sum_op<Scalar,Scalar>())) / Scalar(this->size());
+#ifdef __INTEL_COMPILER
+  #pragma warning pop
+#endif
 }
 
 /** \returns the product of all coefficients of *this
@@ -387,7 +484,7 @@ DenseBase<Derived>::prod() const
 {
   if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0))
     return Scalar(1);
-  return this->redux(Eigen::internal::scalar_product_op<Scalar>());
+  return derived().redux(Eigen::internal::scalar_product_op<Scalar>());
 }
 
 /** \returns the trace of \c *this, i.e. the sum of the coefficients on the main diagonal.
diff --git a/Eigen/src/Core/Ref.h b/Eigen/src/Core/Ref.h
index f53674cff..bdf24f52a 100644
--- a/Eigen/src/Core/Ref.h
+++ b/Eigen/src/Core/Ref.h
@@ -12,79 +12,6 @@
 
 namespace Eigen { 
 
-template<typename Derived> class RefBase;
-template<typename PlainObjectType, int Options = 0,
-         typename StrideType = typename internal::conditional<PlainObjectType::IsVectorAtCompileTime,InnerStride<1>,OuterStride<> >::type > class Ref;
-
-/** \class Ref
-  * \ingroup Core_Module
-  *
-  * \brief A matrix or vector expression mapping an existing expressions
-  *
-  * \tparam PlainObjectType the equivalent matrix type of the mapped data
-  * \tparam Options specifies whether the pointer is \c #Aligned, or \c #Unaligned.
-  *                The default is \c #Unaligned.
-  * \tparam StrideType optionally specifies strides. By default, Ref implies a contiguous storage along the inner dimension (inner stride==1),
-  *                   but accept a variable outer stride (leading dimension).
-  *                   This can be overridden by specifying strides.
-  *                   The type passed here must be a specialization of the Stride template, see examples below.
-  *
-  * This class permits to write non template functions taking Eigen's object as parameters while limiting the number of copies.
-  * A Ref<> object can represent either a const expression or a l-value:
-  * \code
-  * // in-out argument:
-  * void foo1(Ref<VectorXf> x);
-  *
-  * // read-only const argument:
-  * void foo2(const Ref<const VectorXf>& x);
-  * \endcode
-  *
-  * In the in-out case, the input argument must satisfies the constraints of the actual Ref<> type, otherwise a compilation issue will be triggered.
-  * By default, a Ref<VectorXf> can reference any dense vector expression of float having a contiguous memory layout.
-  * Likewise, a Ref<MatrixXf> can reference any column major dense matrix expression of float whose column's elements are contiguously stored with
-  * the possibility to have a constant space inbetween each column, i.e.: the inner stride mmust be equal to 1, but the outer-stride (or leading dimension),
-  * can be greater than the number of rows.
-  *
-  * In the const case, if the input expression does not match the above requirement, then it is evaluated into a temporary before being passed to the function.
-  * Here are some examples:
-  * \code
-  * MatrixXf A;
-  * VectorXf a;
-  * foo1(a.head());             // OK
-  * foo1(A.col());              // OK
-  * foo1(A.row());              // compilation error because here innerstride!=1
-  * foo2(A.row());              // The row is copied into a contiguous temporary
-  * foo2(2*a);                  // The expression is evaluated into a temporary
-  * foo2(A.col().segment(2,4)); // No temporary
-  * \endcode
-  *
-  * The range of inputs that can be referenced without temporary can be enlarged using the last two template parameter.
-  * Here is an example accepting an innerstride!=1:
-  * \code
-  * // in-out argument:
-  * void foo3(Ref<VectorXf,0,InnerStride<> > x);
-  * foo3(A.row());              // OK
-  * \endcode
-  * The downside here is that the function foo3 might be significantly slower than foo1 because it won't be able to exploit vectorization, and will involved more
-  * expensive address computations even if the input is contiguously stored in memory. To overcome this issue, one might propose to overloads internally calling a
-  * template function, e.g.:
-  * \code
-  * // in the .h:
-  * void foo(const Ref<MatrixXf>& A);
-  * void foo(const Ref<MatrixXf,0,Stride<> >& A);
-  *
-  * // in the .cpp:
-  * template<typename TypeOfA> void foo_impl(const TypeOfA& A) {
-  *     ... // crazy code goes here
-  * }
-  * void foo(const Ref<MatrixXf>& A) { foo_impl(A); }
-  * void foo(const Ref<MatrixXf,0,Stride<> >& A) { foo_impl(A); }
-  * \endcode
-  *
-  *
-  * \sa PlainObjectBase::Map(), \ref TopicStorageOrders
-  */
-
 namespace internal {
 
 template<typename _PlainObjectType, int _Options, typename _StrideType>
@@ -95,7 +22,8 @@ struct traits<Ref<_PlainObjectType, _Options, _StrideType> >
   typedef _StrideType StrideType;
   enum {
     Options = _Options,
-    Flags = traits<Map<_PlainObjectType, _Options, _StrideType> >::Flags | NestByRefBit
+    Flags = traits<Map<_PlainObjectType, _Options, _StrideType> >::Flags | NestByRefBit,
+    Alignment = traits<Map<_PlainObjectType, _Options, _StrideType> >::Alignment
   };
 
   template<typename Derived> struct match {
@@ -107,7 +35,13 @@ struct traits<Ref<_PlainObjectType, _Options, _StrideType> >
                       || (int(StrideType::InnerStrideAtCompileTime)==0 && int(Derived::InnerStrideAtCompileTime)==1),
       OuterStrideMatch = Derived::IsVectorAtCompileTime
                       || int(StrideType::OuterStrideAtCompileTime)==int(Dynamic) || int(StrideType::OuterStrideAtCompileTime)==int(Derived::OuterStrideAtCompileTime),
-      AlignmentMatch = (_Options!=Aligned) || ((PlainObjectType::Flags&AlignedBit)==0) || ((traits<Derived>::Flags&AlignedBit)==AlignedBit),
+      // NOTE, this indirection of evaluator<Derived>::Alignment is needed
+      // to workaround a very strange bug in MSVC related to the instantiation
+      // of has_*ary_operator in evaluator<CwiseNullaryOp>.
+      // This line is surprisingly very sensitive. For instance, simply adding parenthesis
+      // as "DerivedAlignment = (int(evaluator<Derived>::Alignment))," will make MSVC fail...
+      DerivedAlignment = int(evaluator<Derived>::Alignment),
+      AlignmentMatch = (int(traits<PlainObjectType>::Alignment)==int(Unaligned)) || (DerivedAlignment >= int(Alignment)), // FIXME the first condition is not very clear, it should be replaced by the required alignment
       ScalarTypeMatch = internal::is_same<typename PlainObjectType::Scalar, typename Derived::Scalar>::value,
       MatchAtCompileTime = HasDirectAccess && StorageOrderMatch && InnerStrideMatch && OuterStrideMatch && AlignmentMatch && ScalarTypeMatch
     };
@@ -132,12 +66,12 @@ public:
   typedef MapBase<Derived> Base;
   EIGEN_DENSE_PUBLIC_INTERFACE(RefBase)
 
-  inline Index innerStride() const
+  EIGEN_DEVICE_FUNC inline Index innerStride() const
   {
     return StrideType::InnerStrideAtCompileTime != 0 ? m_stride.inner() : 1;
   }
 
-  inline Index outerStride() const
+  EIGEN_DEVICE_FUNC inline Index outerStride() const
   {
     return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
          : IsVectorAtCompileTime ? this->size()
@@ -145,7 +79,7 @@ public:
          : this->rows();
   }
 
-  RefBase()
+  EIGEN_DEVICE_FUNC RefBase()
     : Base(0,RowsAtCompileTime==Dynamic?0:RowsAtCompileTime,ColsAtCompileTime==Dynamic?0:ColsAtCompileTime),
       // Stride<> does not allow default ctor for Dynamic strides, so let' initialize it with dummy values:
       m_stride(StrideType::OuterStrideAtCompileTime==Dynamic?0:StrideType::OuterStrideAtCompileTime,
@@ -159,7 +93,7 @@ protected:
   typedef Stride<StrideType::OuterStrideAtCompileTime,StrideType::InnerStrideAtCompileTime> StrideBase;
 
   template<typename Expression>
-  void construct(Expression& expr)
+  EIGEN_DEVICE_FUNC void construct(Expression& expr)
   {
     if(PlainObjectType::RowsAtCompileTime==1)
     {
@@ -184,15 +118,83 @@ protected:
   StrideBase m_stride;
 };
 
-
+/** \class Ref
+  * \ingroup Core_Module
+  *
+  * \brief A matrix or vector expression mapping an existing expression
+  *
+  * \tparam PlainObjectType the equivalent matrix type of the mapped data
+  * \tparam Options specifies the pointer alignment in bytes. It can be: \c #Aligned128, , \c #Aligned64, \c #Aligned32, \c #Aligned16, \c #Aligned8 or \c #Unaligned.
+  *                 The default is \c #Unaligned.
+  * \tparam StrideType optionally specifies strides. By default, Ref implies a contiguous storage along the inner dimension (inner stride==1),
+  *                   but accepts a variable outer stride (leading dimension).
+  *                   This can be overridden by specifying strides.
+  *                   The type passed here must be a specialization of the Stride template, see examples below.
+  *
+  * This class provides a way to write non-template functions taking Eigen objects as parameters while limiting the number of copies.
+  * A Ref<> object can represent either a const expression or a l-value:
+  * \code
+  * // in-out argument:
+  * void foo1(Ref<VectorXf> x);
+  *
+  * // read-only const argument:
+  * void foo2(const Ref<const VectorXf>& x);
+  * \endcode
+  *
+  * In the in-out case, the input argument must satisfy the constraints of the actual Ref<> type, otherwise a compilation issue will be triggered.
+  * By default, a Ref<VectorXf> can reference any dense vector expression of float having a contiguous memory layout.
+  * Likewise, a Ref<MatrixXf> can reference any column-major dense matrix expression of float whose column's elements are contiguously stored with
+  * the possibility to have a constant space in-between each column, i.e. the inner stride must be equal to 1, but the outer stride (or leading dimension)
+  * can be greater than the number of rows.
+  *
+  * In the const case, if the input expression does not match the above requirement, then it is evaluated into a temporary before being passed to the function.
+  * Here are some examples:
+  * \code
+  * MatrixXf A;
+  * VectorXf a;
+  * foo1(a.head());             // OK
+  * foo1(A.col());              // OK
+  * foo1(A.row());              // Compilation error because here innerstride!=1
+  * foo2(A.row());              // Compilation error because A.row() is a 1xN object while foo2 is expecting a Nx1 object
+  * foo2(A.row().transpose());  // The row is copied into a contiguous temporary
+  * foo2(2*a);                  // The expression is evaluated into a temporary
+  * foo2(A.col().segment(2,4)); // No temporary
+  * \endcode
+  *
+  * The range of inputs that can be referenced without temporary can be enlarged using the last two template parameters.
+  * Here is an example accepting an innerstride!=1:
+  * \code
+  * // in-out argument:
+  * void foo3(Ref<VectorXf,0,InnerStride<> > x);
+  * foo3(A.row());              // OK
+  * \endcode
+  * The downside here is that the function foo3 might be significantly slower than foo1 because it won't be able to exploit vectorization, and will involve more
+  * expensive address computations even if the input is contiguously stored in memory. To overcome this issue, one might propose to overload internally calling a
+  * template function, e.g.:
+  * \code
+  * // in the .h:
+  * void foo(const Ref<MatrixXf>& A);
+  * void foo(const Ref<MatrixXf,0,Stride<> >& A);
+  *
+  * // in the .cpp:
+  * template<typename TypeOfA> void foo_impl(const TypeOfA& A) {
+  *     ... // crazy code goes here
+  * }
+  * void foo(const Ref<MatrixXf>& A) { foo_impl(A); }
+  * void foo(const Ref<MatrixXf,0,Stride<> >& A) { foo_impl(A); }
+  * \endcode
+  *
+  *
+  * \sa PlainObjectBase::Map(), \ref TopicStorageOrders
+  */
 template<typename PlainObjectType, int Options, typename StrideType> class Ref
   : public RefBase<Ref<PlainObjectType, Options, StrideType> >
 {
   private:
     typedef internal::traits<Ref> Traits;
     template<typename Derived>
-    inline Ref(const PlainObjectBase<Derived>& expr,
-               typename internal::enable_if<bool(Traits::template match<Derived>::MatchAtCompileTime),Derived>::type* = 0);
+    EIGEN_DEVICE_FUNC inline Ref(const PlainObjectBase<Derived>& expr,
+                                 typename internal::enable_if<bool(Traits::template match<Derived>::MatchAtCompileTime),Derived>::type* = 0);
   public:
 
     typedef RefBase<Ref> Base;
@@ -201,23 +203,24 @@ template<typename PlainObjectType, int Options, typename StrideType> class Ref
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename Derived>
-    inline Ref(PlainObjectBase<Derived>& expr,
-               typename internal::enable_if<bool(Traits::template match<Derived>::MatchAtCompileTime),Derived>::type* = 0)
+    EIGEN_DEVICE_FUNC inline Ref(PlainObjectBase<Derived>& expr,
+                                 typename internal::enable_if<bool(Traits::template match<Derived>::MatchAtCompileTime),Derived>::type* = 0)
     {
-      EIGEN_STATIC_ASSERT(static_cast<bool>(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
+      EIGEN_STATIC_ASSERT(bool(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
       Base::construct(expr.derived());
     }
     template<typename Derived>
-    inline Ref(const DenseBase<Derived>& expr,
-               typename internal::enable_if<bool(Traits::template match<Derived>::MatchAtCompileTime),Derived>::type* = 0)
+    EIGEN_DEVICE_FUNC inline Ref(const DenseBase<Derived>& expr,
+                                 typename internal::enable_if<bool(Traits::template match<Derived>::MatchAtCompileTime),Derived>::type* = 0)
     #else
+    /** Implicit constructor from any dense expression */
     template<typename Derived>
     inline Ref(DenseBase<Derived>& expr)
     #endif
     {
-      EIGEN_STATIC_ASSERT(static_cast<bool>(internal::is_lvalue<Derived>::value), THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
-      EIGEN_STATIC_ASSERT(static_cast<bool>(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
-      enum { THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY = Derived::ThisConstantIsPrivateInPlainObjectBase};
+      EIGEN_STATIC_ASSERT(bool(internal::is_lvalue<Derived>::value), THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
+      EIGEN_STATIC_ASSERT(bool(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
+      EIGEN_STATIC_ASSERT(!Derived::IsPlainObjectBase,THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
       Base::construct(expr.const_cast_derived());
     }
 
@@ -236,8 +239,8 @@ template<typename TPlainObjectType, int Options, typename StrideType> class Ref<
     EIGEN_DENSE_PUBLIC_INTERFACE(Ref)
 
     template<typename Derived>
-    inline Ref(const DenseBase<Derived>& expr,
-               typename internal::enable_if<bool(Traits::template match<Derived>::ScalarTypeMatch),Derived>::type* = 0)
+    EIGEN_DEVICE_FUNC inline Ref(const DenseBase<Derived>& expr,
+                                 typename internal::enable_if<bool(Traits::template match<Derived>::ScalarTypeMatch),Derived>::type* = 0)
     {
 //      std::cout << match_helper<Derived>::HasDirectAccess << "," << match_helper<Derived>::OuterStrideMatch << "," << match_helper<Derived>::InnerStrideMatch << "\n";
 //      std::cout << int(StrideType::OuterStrideAtCompileTime) << " - " << int(Derived::OuterStrideAtCompileTime) << "\n";
@@ -245,18 +248,27 @@ template<typename TPlainObjectType, int Options, typename StrideType> class Ref<
       construct(expr.derived(), typename Traits::template match<Derived>::type());
     }
 
+    EIGEN_DEVICE_FUNC inline Ref(const Ref& other) : Base(other) {
+      // copy constructor shall not copy the m_object, to avoid unnecessary malloc and copy
+    }
+
+    template<typename OtherRef>
+    EIGEN_DEVICE_FUNC inline Ref(const RefBase<OtherRef>& other) {
+      construct(other.derived(), typename Traits::template match<OtherRef>::type());
+    }
+
   protected:
 
     template<typename Expression>
-    void construct(const Expression& expr,internal::true_type)
+    EIGEN_DEVICE_FUNC void construct(const Expression& expr,internal::true_type)
     {
       Base::construct(expr);
     }
 
     template<typename Expression>
-    void construct(const Expression& expr, internal::false_type)
+    EIGEN_DEVICE_FUNC void construct(const Expression& expr, internal::false_type)
     {
-      m_object.lazyAssign(expr);
+      internal::call_assignment_no_alias(m_object,expr,internal::assign_op<Scalar,Scalar>());
       Base::construct(m_object);
     }
 
diff --git a/Eigen/src/Core/Replicate.h b/Eigen/src/Core/Replicate.h
index ac4537c14..9960ef884 100644
--- a/Eigen/src/Core/Replicate.h
+++ b/Eigen/src/Core/Replicate.h
@@ -12,21 +12,6 @@
 
 namespace Eigen { 
 
-/**
-  * \class Replicate
-  * \ingroup Core_Module
-  *
-  * \brief Expression of the multiple replication of a matrix or vector
-  *
-  * \param MatrixType the type of the object we are replicating
-  *
-  * This class represents an expression of the multiple replication of a matrix or vector.
-  * It is the return type of DenseBase::replicate() and most of the time
-  * this is the only way it is used.
-  *
-  * \sa DenseBase::replicate()
-  */
-
 namespace internal {
 template<typename MatrixType,int RowFactor,int ColFactor>
 struct traits<Replicate<MatrixType,RowFactor,ColFactor> >
@@ -35,10 +20,7 @@ struct traits<Replicate<MatrixType,RowFactor,ColFactor> >
   typedef typename MatrixType::Scalar Scalar;
   typedef typename traits<MatrixType>::StorageKind StorageKind;
   typedef typename traits<MatrixType>::XprKind XprKind;
-  enum {
-    Factor = (RowFactor==Dynamic || ColFactor==Dynamic) ? Dynamic : RowFactor*ColFactor
-  };
-  typedef typename nested<MatrixType,Factor>::type MatrixTypeNested;
+  typedef typename ref_selector<MatrixType>::type MatrixTypeNested;
   typedef typename remove_reference<MatrixTypeNested>::type _MatrixTypeNested;
   enum {
     RowsAtCompileTime = RowFactor==Dynamic || int(MatrixType::RowsAtCompileTime)==Dynamic
@@ -53,12 +35,29 @@ struct traits<Replicate<MatrixType,RowFactor,ColFactor> >
     IsRowMajor = MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1 ? 1
                : MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1 ? 0
                : (MatrixType::Flags & RowMajorBit) ? 1 : 0,
-    Flags = (_MatrixTypeNested::Flags & HereditaryBits & ~RowMajorBit) | (IsRowMajor ? RowMajorBit : 0),
-    CoeffReadCost = _MatrixTypeNested::CoeffReadCost
+    
+    // FIXME enable DirectAccess with negative strides?
+    Flags = IsRowMajor ? RowMajorBit : 0
   };
 };
 }
 
+/**
+  * \class Replicate
+  * \ingroup Core_Module
+  *
+  * \brief Expression of the multiple replication of a matrix or vector
+  *
+  * \tparam MatrixType the type of the object we are replicating
+  * \tparam RowFactor number of repetitions at compile time along the vertical direction, can be Dynamic.
+  * \tparam ColFactor number of repetitions at compile time along the horizontal direction, can be Dynamic.
+  *
+  * This class represents an expression of the multiple replication of a matrix or vector.
+  * It is the return type of DenseBase::replicate() and most of the time
+  * this is the only way it is used.
+  *
+  * \sa DenseBase::replicate()
+  */
 template<typename MatrixType,int RowFactor,int ColFactor> class Replicate
   : public internal::dense_xpr_base< Replicate<MatrixType,RowFactor,ColFactor> >::type
 {
@@ -68,10 +67,12 @@ template<typename MatrixType,int RowFactor,int ColFactor> class Replicate
 
     typedef typename internal::dense_xpr_base<Replicate>::type Base;
     EIGEN_DENSE_PUBLIC_INTERFACE(Replicate)
+    typedef typename internal::remove_all<MatrixType>::type NestedExpression;
 
     template<typename OriginalMatrixType>
-    inline explicit Replicate(const OriginalMatrixType& a_matrix)
-      : m_matrix(a_matrix), m_rowFactor(RowFactor), m_colFactor(ColFactor)
+    EIGEN_DEVICE_FUNC
+    inline explicit Replicate(const OriginalMatrixType& matrix)
+      : m_matrix(matrix), m_rowFactor(RowFactor), m_colFactor(ColFactor)
     {
       EIGEN_STATIC_ASSERT((internal::is_same<typename internal::remove_const<MatrixType>::type,OriginalMatrixType>::value),
                           THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE)
@@ -79,41 +80,20 @@ template<typename MatrixType,int RowFactor,int ColFactor> class Replicate
     }
 
     template<typename OriginalMatrixType>
-    inline Replicate(const OriginalMatrixType& a_matrix, Index rowFactor, Index colFactor)
-      : m_matrix(a_matrix), m_rowFactor(rowFactor), m_colFactor(colFactor)
+    EIGEN_DEVICE_FUNC
+    inline Replicate(const OriginalMatrixType& matrix, Index rowFactor, Index colFactor)
+      : m_matrix(matrix), m_rowFactor(rowFactor), m_colFactor(colFactor)
     {
       EIGEN_STATIC_ASSERT((internal::is_same<typename internal::remove_const<MatrixType>::type,OriginalMatrixType>::value),
                           THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE)
     }
 
+    EIGEN_DEVICE_FUNC
     inline Index rows() const { return m_matrix.rows() * m_rowFactor.value(); }
+    EIGEN_DEVICE_FUNC
     inline Index cols() const { return m_matrix.cols() * m_colFactor.value(); }
 
-    inline Scalar coeff(Index rowId, Index colId) const
-    {
-      // try to avoid using modulo; this is a pure optimization strategy
-      const Index actual_row  = internal::traits<MatrixType>::RowsAtCompileTime==1 ? 0
-                            : RowFactor==1 ? rowId
-                            : rowId%m_matrix.rows();
-      const Index actual_col  = internal::traits<MatrixType>::ColsAtCompileTime==1 ? 0
-                            : ColFactor==1 ? colId
-                            : colId%m_matrix.cols();
-
-      return m_matrix.coeff(actual_row, actual_col);
-    }
-    template<int LoadMode>
-    inline PacketScalar packet(Index rowId, Index colId) const
-    {
-      const Index actual_row  = internal::traits<MatrixType>::RowsAtCompileTime==1 ? 0
-                            : RowFactor==1 ? rowId
-                            : rowId%m_matrix.rows();
-      const Index actual_col  = internal::traits<MatrixType>::ColsAtCompileTime==1 ? 0
-                            : ColFactor==1 ? colId
-                            : colId%m_matrix.cols();
-
-      return m_matrix.template packet<LoadMode>(actual_row, actual_col);
-    }
-
+    EIGEN_DEVICE_FUNC
     const _MatrixTypeNested& nestedExpression() const
     { 
       return m_matrix; 
@@ -142,21 +122,6 @@ DenseBase<Derived>::replicate() const
 }
 
 /**
-  * \return an expression of the replication of \c *this
-  *
-  * Example: \include MatrixBase_replicate_int_int.cpp
-  * Output: \verbinclude MatrixBase_replicate_int_int.out
-  *
-  * \sa VectorwiseOp::replicate(), DenseBase::replicate<int,int>(), class Replicate
-  */
-template<typename Derived>
-const typename DenseBase<Derived>::ReplicateReturnType
-DenseBase<Derived>::replicate(Index rowFactor,Index colFactor) const
-{
-  return Replicate<Derived,Dynamic,Dynamic>(derived(),rowFactor,colFactor);
-}
-
-/**
   * \return an expression of the replication of each column (or row) of \c *this
   *
   * Example: \include DirectionWise_replicate_int.cpp
diff --git a/Eigen/src/Core/ReturnByValue.h b/Eigen/src/Core/ReturnByValue.h
index f635598dc..c44b7673b 100644
--- a/Eigen/src/Core/ReturnByValue.h
+++ b/Eigen/src/Core/ReturnByValue.h
@@ -13,11 +13,6 @@
 
 namespace Eigen {
 
-/** \class ReturnByValue
-  * \ingroup Core_Module
-  *
-  */
-
 namespace internal {
 
 template<typename Derived>
@@ -38,17 +33,22 @@ struct traits<ReturnByValue<Derived> >
  * So internal::nested always gives the plain return matrix type.
  *
  * FIXME: I don't understand why we need this specialization: isn't this taken care of by the EvalBeforeNestingBit ??
+ * Answer: EvalBeforeNestingBit should be deprecated since we have the evaluators
  */
 template<typename Derived,int n,typename PlainObject>
-struct nested<ReturnByValue<Derived>, n, PlainObject>
+struct nested_eval<ReturnByValue<Derived>, n, PlainObject>
 {
   typedef typename traits<Derived>::ReturnType type;
 };
 
 } // end namespace internal
 
+/** \class ReturnByValue
+  * \ingroup Core_Module
+  *
+  */
 template<typename Derived> class ReturnByValue
-  : internal::no_assignment_operator, public internal::dense_xpr_base< ReturnByValue<Derived> >::type
+  : public internal::dense_xpr_base< ReturnByValue<Derived> >::type, internal::no_assignment_operator
 {
   public:
     typedef typename internal::traits<Derived>::ReturnType ReturnType;
@@ -57,10 +57,11 @@ template<typename Derived> class ReturnByValue
     EIGEN_DENSE_PUBLIC_INTERFACE(ReturnByValue)
 
     template<typename Dest>
+    EIGEN_DEVICE_FUNC
     inline void evalTo(Dest& dst) const
     { static_cast<const Derived*>(this)->evalTo(dst); }
-    inline Index rows() const { return static_cast<const Derived*>(this)->rows(); }
-    inline Index cols() const { return static_cast<const Derived*>(this)->cols(); }
+    EIGEN_DEVICE_FUNC inline Index rows() const { return static_cast<const Derived*>(this)->rows(); }
+    EIGEN_DEVICE_FUNC inline Index cols() const { return static_cast<const Derived*>(this)->cols(); }
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
 #define Unusable YOU_ARE_TRYING_TO_ACCESS_A_SINGLE_COEFFICIENT_IN_A_SPECIAL_EXPRESSION_WHERE_THAT_IS_NOT_ALLOWED_BECAUSE_THAT_WOULD_BE_INEFFICIENT
@@ -72,8 +73,7 @@ template<typename Derived> class ReturnByValue
     const Unusable& coeff(Index,Index) const { return *reinterpret_cast<const Unusable*>(this); }
     Unusable& coeffRef(Index) { return *reinterpret_cast<Unusable*>(this); }
     Unusable& coeffRef(Index,Index) { return *reinterpret_cast<Unusable*>(this); }
-    template<int LoadMode>  Unusable& packet(Index) const;
-    template<int LoadMode>  Unusable& packet(Index, Index) const;
+#undef Unusable
 #endif
 };
 
@@ -85,14 +85,32 @@ Derived& DenseBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other)
   return derived();
 }
 
+namespace internal {
+
+// Expression is evaluated in a temporary; default implementation of Assignment is bypassed so that
+// when a ReturnByValue expression is assigned, the evaluator is not constructed.
+// TODO: Finalize port to new regime; ReturnByValue should not exist in the expression world
+  
 template<typename Derived>
-template<typename OtherDerived>
-Derived& DenseBase<Derived>::lazyAssign(const ReturnByValue<OtherDerived>& other)
+struct evaluator<ReturnByValue<Derived> >
+  : public evaluator<typename internal::traits<Derived>::ReturnType>
 {
-  other.evalTo(derived());
-  return derived();
-}
+  typedef ReturnByValue<Derived> XprType;
+  typedef typename internal::traits<Derived>::ReturnType PlainObject;
+  typedef evaluator<PlainObject> Base;
+  
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
+    : m_result(xpr.rows(), xpr.cols())
+  {
+    ::new (static_cast<Base*>(this)) Base(m_result);
+    xpr.evalTo(m_result);
+  }
+
+protected:
+  PlainObject m_result;
+};
 
+} // end namespace internal
 
 } // end namespace Eigen
 
diff --git a/Eigen/src/Core/Reverse.h b/Eigen/src/Core/Reverse.h
index e30ae3d28..0640cda2a 100644
--- a/Eigen/src/Core/Reverse.h
+++ b/Eigen/src/Core/Reverse.h
@@ -14,20 +14,6 @@
 
 namespace Eigen { 
 
-/** \class Reverse
-  * \ingroup Core_Module
-  *
-  * \brief Expression of the reverse of a vector or matrix
-  *
-  * \param MatrixType the type of the object of which we are taking the reverse
-  *
-  * This class represents an expression of the reverse of a vector.
-  * It is the return type of MatrixBase::reverse() and VectorwiseOp::reverse()
-  * and most of the time this is the only way it is used.
-  *
-  * \sa MatrixBase::reverse(), VectorwiseOp::reverse()
-  */
-
 namespace internal {
 
 template<typename MatrixType, int Direction>
@@ -37,36 +23,43 @@ struct traits<Reverse<MatrixType, Direction> >
   typedef typename MatrixType::Scalar Scalar;
   typedef typename traits<MatrixType>::StorageKind StorageKind;
   typedef typename traits<MatrixType>::XprKind XprKind;
-  typedef typename nested<MatrixType>::type MatrixTypeNested;
+  typedef typename ref_selector<MatrixType>::type MatrixTypeNested;
   typedef typename remove_reference<MatrixTypeNested>::type _MatrixTypeNested;
   enum {
     RowsAtCompileTime = MatrixType::RowsAtCompileTime,
     ColsAtCompileTime = MatrixType::ColsAtCompileTime,
     MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
-
-    // let's enable LinearAccess only with vectorization because of the product overhead
-    LinearAccess = ( (Direction==BothDirections) && (int(_MatrixTypeNested::Flags)&PacketAccessBit) )
-                 ? LinearAccessBit : 0,
-
-    Flags = int(_MatrixTypeNested::Flags) & (HereditaryBits | LvalueBit | PacketAccessBit | LinearAccess),
-
-    CoeffReadCost = _MatrixTypeNested::CoeffReadCost
+    Flags = _MatrixTypeNested::Flags & (RowMajorBit | LvalueBit)
   };
 };
 
-template<typename PacketScalar, bool ReversePacket> struct reverse_packet_cond
+template<typename PacketType, bool ReversePacket> struct reverse_packet_cond
 {
-  static inline PacketScalar run(const PacketScalar& x) { return preverse(x); }
+  static inline PacketType run(const PacketType& x) { return preverse(x); }
 };
 
-template<typename PacketScalar> struct reverse_packet_cond<PacketScalar,false>
+template<typename PacketType> struct reverse_packet_cond<PacketType,false>
 {
-  static inline PacketScalar run(const PacketScalar& x) { return x; }
+  static inline PacketType run(const PacketType& x) { return x; }
 };
 
 } // end namespace internal 
 
+/** \class Reverse
+  * \ingroup Core_Module
+  *
+  * \brief Expression of the reverse of a vector or matrix
+  *
+  * \tparam MatrixType the type of the object of which we are taking the reverse
+  * \tparam Direction defines the direction of the reverse operation, can be Vertical, Horizontal, or BothDirections
+  *
+  * This class represents an expression of the reverse of a vector.
+  * It is the return type of MatrixBase::reverse() and VectorwiseOp::reverse()
+  * and most of the time this is the only way it is used.
+  *
+  * \sa MatrixBase::reverse(), VectorwiseOp::reverse()
+  */
 template<typename MatrixType, int Direction> class Reverse
   : public internal::dense_xpr_base< Reverse<MatrixType, Direction> >::type
 {
@@ -74,12 +67,9 @@ template<typename MatrixType, int Direction> class Reverse
 
     typedef typename internal::dense_xpr_base<Reverse>::type Base;
     EIGEN_DENSE_PUBLIC_INTERFACE(Reverse)
+    typedef typename internal::remove_all<MatrixType>::type NestedExpression;
     using Base::IsRowMajor;
 
-    // next line is necessary because otherwise const version of operator()
-    // is hidden by non-const version defined in this file
-    using Base::operator(); 
-
   protected:
     enum {
       PacketSize = internal::packet_traits<Scalar>::size,
@@ -95,82 +85,19 @@ template<typename MatrixType, int Direction> class Reverse
     typedef internal::reverse_packet_cond<PacketScalar,ReversePacket> reverse_packet;
   public:
 
-    inline Reverse(const MatrixType& matrix) : m_matrix(matrix) { }
+    EIGEN_DEVICE_FUNC explicit inline Reverse(const MatrixType& matrix) : m_matrix(matrix) { }
 
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Reverse)
 
-    inline Index rows() const { return m_matrix.rows(); }
-    inline Index cols() const { return m_matrix.cols(); }
+    EIGEN_DEVICE_FUNC inline Index rows() const { return m_matrix.rows(); }
+    EIGEN_DEVICE_FUNC inline Index cols() const { return m_matrix.cols(); }
 
-    inline Index innerStride() const
+    EIGEN_DEVICE_FUNC inline Index innerStride() const
     {
       return -m_matrix.innerStride();
     }
 
-    inline Scalar& operator()(Index row, Index col)
-    {
-      eigen_assert(row >= 0 && row < rows() && col >= 0 && col < cols());
-      return coeffRef(row, col);
-    }
-
-    inline Scalar& coeffRef(Index row, Index col)
-    {
-      return m_matrix.const_cast_derived().coeffRef(ReverseRow ? m_matrix.rows() - row - 1 : row,
-                                                    ReverseCol ? m_matrix.cols() - col - 1 : col);
-    }
-
-    inline CoeffReturnType coeff(Index row, Index col) const
-    {
-      return m_matrix.coeff(ReverseRow ? m_matrix.rows() - row - 1 : row,
-                            ReverseCol ? m_matrix.cols() - col - 1 : col);
-    }
-
-    inline CoeffReturnType coeff(Index index) const
-    {
-      return m_matrix.coeff(m_matrix.size() - index - 1);
-    }
-
-    inline Scalar& coeffRef(Index index)
-    {
-      return m_matrix.const_cast_derived().coeffRef(m_matrix.size() - index - 1);
-    }
-
-    inline Scalar& operator()(Index index)
-    {
-      eigen_assert(index >= 0 && index < m_matrix.size());
-      return coeffRef(index);
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index row, Index col) const
-    {
-      return reverse_packet::run(m_matrix.template packet<LoadMode>(
-                                    ReverseRow ? m_matrix.rows() - row - OffsetRow : row,
-                                    ReverseCol ? m_matrix.cols() - col - OffsetCol : col));
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index row, Index col, const PacketScalar& x)
-    {
-      m_matrix.const_cast_derived().template writePacket<LoadMode>(
-                                      ReverseRow ? m_matrix.rows() - row - OffsetRow : row,
-                                      ReverseCol ? m_matrix.cols() - col - OffsetCol : col,
-                                      reverse_packet::run(x));
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index index) const
-    {
-      return internal::preverse(m_matrix.template packet<LoadMode>( m_matrix.size() - index - PacketSize ));
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index index, const PacketScalar& x)
-    {
-      m_matrix.const_cast_derived().template writePacket<LoadMode>(m_matrix.size() - index - PacketSize, internal::preverse(x));
-    }
-
-    const typename internal::remove_all<typename MatrixType::Nested>::type& 
+    EIGEN_DEVICE_FUNC const typename internal::remove_all<typename MatrixType::Nested>::type&
     nestedExpression() const 
     {
       return m_matrix;
@@ -190,33 +117,93 @@ template<typename Derived>
 inline typename DenseBase<Derived>::ReverseReturnType
 DenseBase<Derived>::reverse()
 {
-  return derived();
+  return ReverseReturnType(derived());
 }
 
-/** This is the const version of reverse(). */
-template<typename Derived>
-inline const typename DenseBase<Derived>::ConstReverseReturnType
-DenseBase<Derived>::reverse() const
-{
-  return derived();
-}
+
+//reverse const overload moved DenseBase.h due to a CUDA compiler bug
 
 /** This is the "in place" version of reverse: it reverses \c *this.
   *
   * In most cases it is probably better to simply use the reversed expression
   * of a matrix. However, when reversing the matrix data itself is really needed,
   * then this "in-place" version is probably the right choice because it provides
-  * the following additional features:
+  * the following additional benefits:
   *  - less error prone: doing the same operation with .reverse() requires special care:
   *    \code m = m.reverse().eval(); \endcode
-  *  - this API allows to avoid creating a temporary (the current implementation creates a temporary, but that could be avoided using swap)
+  *  - this API enables reverse operations without the need for a temporary
   *  - it allows future optimizations (cache friendliness, etc.)
   *
-  * \sa reverse() */
+  * \sa VectorwiseOp::reverseInPlace(), reverse() */
 template<typename Derived>
 inline void DenseBase<Derived>::reverseInPlace()
 {
-  derived() = derived().reverse().eval();
+  if(cols()>rows())
+  {
+    Index half = cols()/2;
+    leftCols(half).swap(rightCols(half).reverse());
+    if((cols()%2)==1)
+    {
+      Index half2 = rows()/2;
+      col(half).head(half2).swap(col(half).tail(half2).reverse());
+    }
+  }
+  else
+  {
+    Index half = rows()/2;
+    topRows(half).swap(bottomRows(half).reverse());
+    if((rows()%2)==1)
+    {
+      Index half2 = cols()/2;
+      row(half).head(half2).swap(row(half).tail(half2).reverse());
+    }
+  }
+}
+
+namespace internal {
+  
+template<int Direction>
+struct vectorwise_reverse_inplace_impl;
+
+template<>
+struct vectorwise_reverse_inplace_impl<Vertical>
+{
+  template<typename ExpressionType>
+  static void run(ExpressionType &xpr)
+  {
+    Index half = xpr.rows()/2;
+    xpr.topRows(half).swap(xpr.bottomRows(half).colwise().reverse());
+  }
+};
+
+template<>
+struct vectorwise_reverse_inplace_impl<Horizontal>
+{
+  template<typename ExpressionType>
+  static void run(ExpressionType &xpr)
+  {
+    Index half = xpr.cols()/2;
+    xpr.leftCols(half).swap(xpr.rightCols(half).rowwise().reverse());
+  }
+};
+
+} // end namespace internal
+
+/** This is the "in place" version of VectorwiseOp::reverse: it reverses each column or row of \c *this.
+  *
+  * In most cases it is probably better to simply use the reversed expression
+  * of a matrix. However, when reversing the matrix data itself is really needed,
+  * then this "in-place" version is probably the right choice because it provides
+  * the following additional benefits:
+  *  - less error prone: doing the same operation with .reverse() requires special care:
+  *    \code m = m.reverse().eval(); \endcode
+  *  - this API enables reverse operations without the need for a temporary
+  *
+  * \sa DenseBase::reverseInPlace(), reverse() */
+template<typename ExpressionType, int Direction>
+void VectorwiseOp<ExpressionType,Direction>::reverseInPlace()
+{
+  internal::vectorwise_reverse_inplace_impl<Direction>::run(_expression().const_cast_derived());
 }
 
 } // end namespace Eigen
diff --git a/Eigen/src/Core/Select.h b/Eigen/src/Core/Select.h
index 87993bbb5..79eec1b5b 100644
--- a/Eigen/src/Core/Select.h
+++ b/Eigen/src/Core/Select.h
@@ -43,23 +43,21 @@ struct traits<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >
     ColsAtCompileTime = ConditionMatrixType::ColsAtCompileTime,
     MaxRowsAtCompileTime = ConditionMatrixType::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = ConditionMatrixType::MaxColsAtCompileTime,
-    Flags = (unsigned int)ThenMatrixType::Flags & ElseMatrixType::Flags & HereditaryBits,
-    CoeffReadCost = traits<typename remove_all<ConditionMatrixNested>::type>::CoeffReadCost
-                  + EIGEN_SIZE_MAX(traits<typename remove_all<ThenMatrixNested>::type>::CoeffReadCost,
-                                   traits<typename remove_all<ElseMatrixNested>::type>::CoeffReadCost)
+    Flags = (unsigned int)ThenMatrixType::Flags & ElseMatrixType::Flags & RowMajorBit
   };
 };
 }
 
 template<typename ConditionMatrixType, typename ThenMatrixType, typename ElseMatrixType>
-class Select : internal::no_assignment_operator,
-  public internal::dense_xpr_base< Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >::type
+class Select : public internal::dense_xpr_base< Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >::type,
+               internal::no_assignment_operator
 {
   public:
 
     typedef typename internal::dense_xpr_base<Select>::type Base;
     EIGEN_DENSE_PUBLIC_INTERFACE(Select)
 
+    inline EIGEN_DEVICE_FUNC
     Select(const ConditionMatrixType& a_conditionMatrix,
            const ThenMatrixType& a_thenMatrix,
            const ElseMatrixType& a_elseMatrix)
@@ -69,9 +67,10 @@ class Select : internal::no_assignment_operator,
       eigen_assert(m_condition.cols() == m_then.cols() && m_condition.cols() == m_else.cols());
     }
 
-    Index rows() const { return m_condition.rows(); }
-    Index cols() const { return m_condition.cols(); }
+    inline EIGEN_DEVICE_FUNC Index rows() const { return m_condition.rows(); }
+    inline EIGEN_DEVICE_FUNC Index cols() const { return m_condition.cols(); }
 
+    inline EIGEN_DEVICE_FUNC
     const Scalar coeff(Index i, Index j) const
     {
       if (m_condition.coeff(i,j))
@@ -80,6 +79,7 @@ class Select : internal::no_assignment_operator,
         return m_else.coeff(i,j);
     }
 
+    inline EIGEN_DEVICE_FUNC
     const Scalar coeff(Index i) const
     {
       if (m_condition.coeff(i))
@@ -88,17 +88,17 @@ class Select : internal::no_assignment_operator,
         return m_else.coeff(i);
     }
 
-    const ConditionMatrixType& conditionMatrix() const
+    inline EIGEN_DEVICE_FUNC const ConditionMatrixType& conditionMatrix() const
     {
       return m_condition;
     }
 
-    const ThenMatrixType& thenMatrix() const
+    inline EIGEN_DEVICE_FUNC const ThenMatrixType& thenMatrix() const
     {
       return m_then;
     }
 
-    const ElseMatrixType& elseMatrix() const
+    inline EIGEN_DEVICE_FUNC const ElseMatrixType& elseMatrix() const
     {
       return m_else;
     }
diff --git a/Eigen/src/Core/SelfAdjointView.h b/Eigen/src/Core/SelfAdjointView.h
index 6fa7cd15e..504c98f0e 100644
--- a/Eigen/src/Core/SelfAdjointView.h
+++ b/Eigen/src/Core/SelfAdjointView.h
@@ -32,54 +32,60 @@ namespace internal {
 template<typename MatrixType, unsigned int UpLo>
 struct traits<SelfAdjointView<MatrixType, UpLo> > : traits<MatrixType>
 {
-  typedef typename nested<MatrixType>::type MatrixTypeNested;
+  typedef typename ref_selector<MatrixType>::non_const_type MatrixTypeNested;
   typedef typename remove_all<MatrixTypeNested>::type MatrixTypeNestedCleaned;
   typedef MatrixType ExpressionType;
-  typedef typename MatrixType::PlainObject DenseMatrixType;
+  typedef typename MatrixType::PlainObject FullMatrixType;
   enum {
     Mode = UpLo | SelfAdjoint,
-    Flags =  MatrixTypeNestedCleaned::Flags & (HereditaryBits)
-           & (~(PacketAccessBit | DirectAccessBit | LinearAccessBit)), // FIXME these flags should be preserved
-    CoeffReadCost = MatrixTypeNestedCleaned::CoeffReadCost
+    FlagsLvalueBit = is_lvalue<MatrixType>::value ? LvalueBit : 0,
+    Flags =  MatrixTypeNestedCleaned::Flags & (HereditaryBits|FlagsLvalueBit)
+           & (~(PacketAccessBit | DirectAccessBit | LinearAccessBit)) // FIXME these flags should be preserved
   };
 };
 }
 
-template <typename Lhs, int LhsMode, bool LhsIsVector,
-          typename Rhs, int RhsMode, bool RhsIsVector>
-struct SelfadjointProductMatrix;
 
-// FIXME could also be called SelfAdjointWrapper to be consistent with DiagonalWrapper ??
-template<typename MatrixType, unsigned int UpLo> class SelfAdjointView
-  : public TriangularBase<SelfAdjointView<MatrixType, UpLo> >
+template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
+  : public TriangularBase<SelfAdjointView<_MatrixType, UpLo> >
 {
   public:
 
+    typedef _MatrixType MatrixType;
     typedef TriangularBase<SelfAdjointView> Base;
     typedef typename internal::traits<SelfAdjointView>::MatrixTypeNested MatrixTypeNested;
     typedef typename internal::traits<SelfAdjointView>::MatrixTypeNestedCleaned MatrixTypeNestedCleaned;
+    typedef MatrixTypeNestedCleaned NestedExpression;
 
     /** \brief The type of coefficients in this matrix */
     typedef typename internal::traits<SelfAdjointView>::Scalar Scalar; 
-
-    typedef typename MatrixType::Index Index;
+    typedef typename MatrixType::StorageIndex StorageIndex;
+    typedef typename internal::remove_all<typename MatrixType::ConjugateReturnType>::type MatrixConjugateReturnType;
 
     enum {
-      Mode = internal::traits<SelfAdjointView>::Mode
+      Mode = internal::traits<SelfAdjointView>::Mode,
+      Flags = internal::traits<SelfAdjointView>::Flags,
+      TransposeMode = ((Mode & Upper) ? Lower : 0) | ((Mode & Lower) ? Upper : 0)
     };
     typedef typename MatrixType::PlainObject PlainObject;
 
-    inline SelfAdjointView(MatrixType& matrix) : m_matrix(matrix)
+    EIGEN_DEVICE_FUNC
+    explicit inline SelfAdjointView(MatrixType& matrix) : m_matrix(matrix)
     {}
 
+    EIGEN_DEVICE_FUNC
     inline Index rows() const { return m_matrix.rows(); }
+    EIGEN_DEVICE_FUNC
     inline Index cols() const { return m_matrix.cols(); }
+    EIGEN_DEVICE_FUNC
     inline Index outerStride() const { return m_matrix.outerStride(); }
+    EIGEN_DEVICE_FUNC
     inline Index innerStride() const { return m_matrix.innerStride(); }
 
     /** \sa MatrixBase::coeff()
       * \warning the coordinates must fit into the referenced triangular part
       */
+    EIGEN_DEVICE_FUNC
     inline Scalar coeff(Index row, Index col) const
     {
       Base::check_coordinates_internal(row, col);
@@ -89,36 +95,46 @@ template<typename MatrixType, unsigned int UpLo> class SelfAdjointView
     /** \sa MatrixBase::coeffRef()
       * \warning the coordinates must fit into the referenced triangular part
       */
+    EIGEN_DEVICE_FUNC
     inline Scalar& coeffRef(Index row, Index col)
     {
+      EIGEN_STATIC_ASSERT_LVALUE(SelfAdjointView);
       Base::check_coordinates_internal(row, col);
-      return m_matrix.const_cast_derived().coeffRef(row, col);
+      return m_matrix.coeffRef(row, col);
     }
 
     /** \internal */
+    EIGEN_DEVICE_FUNC
     const MatrixTypeNestedCleaned& _expression() const { return m_matrix; }
 
+    EIGEN_DEVICE_FUNC
     const MatrixTypeNestedCleaned& nestedExpression() const { return m_matrix; }
-    MatrixTypeNestedCleaned& nestedExpression() { return *const_cast<MatrixTypeNestedCleaned*>(&m_matrix); }
+    EIGEN_DEVICE_FUNC
+    MatrixTypeNestedCleaned& nestedExpression() { return m_matrix; }
 
-    /** Efficient self-adjoint matrix times vector/matrix product */
+    /** Efficient triangular matrix times vector/matrix product */
     template<typename OtherDerived>
-    SelfadjointProductMatrix<MatrixType,Mode,false,OtherDerived,0,OtherDerived::IsVectorAtCompileTime>
+    EIGEN_DEVICE_FUNC
+    const Product<SelfAdjointView,OtherDerived>
     operator*(const MatrixBase<OtherDerived>& rhs) const
     {
-      return SelfadjointProductMatrix
-              <MatrixType,Mode,false,OtherDerived,0,OtherDerived::IsVectorAtCompileTime>
-              (m_matrix, rhs.derived());
+      return Product<SelfAdjointView,OtherDerived>(*this, rhs.derived());
     }
 
-    /** Efficient vector/matrix times self-adjoint matrix product */
+    /** Efficient vector/matrix times triangular matrix product */
     template<typename OtherDerived> friend
-    SelfadjointProductMatrix<OtherDerived,0,OtherDerived::IsVectorAtCompileTime,MatrixType,Mode,false>
+    EIGEN_DEVICE_FUNC
+    const Product<OtherDerived,SelfAdjointView>
     operator*(const MatrixBase<OtherDerived>& lhs, const SelfAdjointView& rhs)
     {
-      return SelfadjointProductMatrix
-              <OtherDerived,0,OtherDerived::IsVectorAtCompileTime,MatrixType,Mode,false>
-              (lhs.derived(),rhs.m_matrix);
+      return Product<OtherDerived,SelfAdjointView>(lhs.derived(),rhs);
+    }
+    
+    friend EIGEN_DEVICE_FUNC
+    const SelfAdjointView<const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,MatrixType,product),UpLo>
+    operator*(const Scalar& s, const SelfAdjointView& mat)
+    {
+      return (s*mat.nestedExpression()).template selfadjointView<UpLo>();
     }
 
     /** Perform a symmetric rank 2 update of the selfadjoint matrix \c *this:
@@ -132,6 +148,7 @@ template<typename MatrixType, unsigned int UpLo> class SelfAdjointView
       * \sa rankUpdate(const MatrixBase<DerivedU>&, Scalar)
       */
     template<typename DerivedU, typename DerivedV>
+    EIGEN_DEVICE_FUNC
     SelfAdjointView& rankUpdate(const MatrixBase<DerivedU>& u, const MatrixBase<DerivedV>& v, const Scalar& alpha = Scalar(1));
 
     /** Perform a symmetric rank K update of the selfadjoint matrix \c *this:
@@ -145,8 +162,74 @@ template<typename MatrixType, unsigned int UpLo> class SelfAdjointView
       * \sa rankUpdate(const MatrixBase<DerivedU>&, const MatrixBase<DerivedV>&, Scalar)
       */
     template<typename DerivedU>
+    EIGEN_DEVICE_FUNC
     SelfAdjointView& rankUpdate(const MatrixBase<DerivedU>& u, const Scalar& alpha = Scalar(1));
 
+    /** \returns an expression of a triangular view extracted from the current selfadjoint view of a given triangular part
+      *
+      * The parameter \a TriMode can have the following values: \c #Upper, \c #StrictlyUpper, \c #UnitUpper,
+      * \c #Lower, \c #StrictlyLower, \c #UnitLower.
+      *
+      * If \c TriMode references the same triangular part than \c *this, then this method simply return a \c TriangularView of the nested expression,
+      * otherwise, the nested expression is first transposed, thus returning a \c TriangularView<Transpose<MatrixType>> object.
+      *
+      * \sa MatrixBase::triangularView(), class TriangularView
+      */
+    template<unsigned int TriMode>
+    EIGEN_DEVICE_FUNC
+    typename internal::conditional<(TriMode&(Upper|Lower))==(UpLo&(Upper|Lower)),
+                                   TriangularView<MatrixType,TriMode>,
+                                   TriangularView<typename MatrixType::AdjointReturnType,TriMode> >::type
+    triangularView() const
+    {
+      typename internal::conditional<(TriMode&(Upper|Lower))==(UpLo&(Upper|Lower)), MatrixType&, typename MatrixType::ConstTransposeReturnType>::type tmp1(m_matrix);
+      typename internal::conditional<(TriMode&(Upper|Lower))==(UpLo&(Upper|Lower)), MatrixType&, typename MatrixType::AdjointReturnType>::type tmp2(tmp1);
+      return typename internal::conditional<(TriMode&(Upper|Lower))==(UpLo&(Upper|Lower)),
+                                   TriangularView<MatrixType,TriMode>,
+                                   TriangularView<typename MatrixType::AdjointReturnType,TriMode> >::type(tmp2);
+    }
+
+    typedef SelfAdjointView<const MatrixConjugateReturnType,Mode> ConjugateReturnType;
+    /** \sa MatrixBase::conjugate() const */
+    EIGEN_DEVICE_FUNC
+    inline const ConjugateReturnType conjugate() const
+    { return ConjugateReturnType(m_matrix.conjugate()); }
+
+    typedef SelfAdjointView<const typename MatrixType::AdjointReturnType,TransposeMode> AdjointReturnType;
+    /** \sa MatrixBase::adjoint() const */
+    EIGEN_DEVICE_FUNC
+    inline const AdjointReturnType adjoint() const
+    { return AdjointReturnType(m_matrix.adjoint()); }
+
+    typedef SelfAdjointView<typename MatrixType::TransposeReturnType,TransposeMode> TransposeReturnType;
+     /** \sa MatrixBase::transpose() */
+    EIGEN_DEVICE_FUNC
+    inline TransposeReturnType transpose()
+    {
+      EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
+      typename MatrixType::TransposeReturnType tmp(m_matrix);
+      return TransposeReturnType(tmp);
+    }
+
+    typedef SelfAdjointView<const typename MatrixType::ConstTransposeReturnType,TransposeMode> ConstTransposeReturnType;
+    /** \sa MatrixBase::transpose() const */
+    EIGEN_DEVICE_FUNC
+    inline const ConstTransposeReturnType transpose() const
+    {
+      return ConstTransposeReturnType(m_matrix.transpose());
+    }
+
+    /** \returns a const expression of the main diagonal of the matrix \c *this
+      *
+      * This method simply returns the diagonal of the nested expression, thus by-passing the SelfAdjointView decorator.
+      *
+      * \sa MatrixBase::diagonal(), class Diagonal */
+    EIGEN_DEVICE_FUNC
+    typename MatrixType::ConstDiagonalReturnType diagonal() const
+    {
+      return typename MatrixType::ConstDiagonalReturnType(m_matrix);
+    }
+
 /////////// Cholesky module ///////////
 
     const LLT<PlainObject, UpLo> llt() const;
@@ -159,31 +242,10 @@ template<typename MatrixType, unsigned int UpLo> class SelfAdjointView
     /** Return type of eigenvalues() */
     typedef Matrix<RealScalar, internal::traits<MatrixType>::ColsAtCompileTime, 1> EigenvaluesReturnType;
 
+    EIGEN_DEVICE_FUNC
     EigenvaluesReturnType eigenvalues() const;
+    EIGEN_DEVICE_FUNC
     RealScalar operatorNorm() const;
-    
-    #ifdef EIGEN2_SUPPORT
-    template<typename OtherDerived>
-    SelfAdjointView& operator=(const MatrixBase<OtherDerived>& other)
-    {
-      enum {
-        OtherPart = UpLo == Upper ? StrictlyLower : StrictlyUpper
-      };
-      m_matrix.const_cast_derived().template triangularView<UpLo>() = other;
-      m_matrix.const_cast_derived().template triangularView<OtherPart>() = other.adjoint();
-      return *this;
-    }
-    template<typename OtherMatrixType, unsigned int OtherMode>
-    SelfAdjointView& operator=(const TriangularView<OtherMatrixType, OtherMode>& other)
-    {
-      enum {
-        OtherPart = UpLo == Upper ? StrictlyLower : StrictlyUpper
-      };
-      m_matrix.const_cast_derived().template triangularView<UpLo>() = other.toDenseMatrix();
-      m_matrix.const_cast_derived().template triangularView<OtherPart>() = other.toDenseMatrix().adjoint();
-      return *this;
-    }
-    #endif
 
   protected:
     MatrixTypeNested m_matrix;
@@ -201,90 +263,54 @@ template<typename MatrixType, unsigned int UpLo> class SelfAdjointView
 
 namespace internal {
 
-template<typename Derived1, typename Derived2, int UnrollCount, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, (SelfAdjoint|Upper), UnrollCount, ClearOpposite>
-{
-  enum {
-    col = (UnrollCount-1) / Derived1::RowsAtCompileTime,
-    row = (UnrollCount-1) % Derived1::RowsAtCompileTime
-  };
-
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    triangular_assignment_selector<Derived1, Derived2, (SelfAdjoint|Upper), UnrollCount-1, ClearOpposite>::run(dst, src);
-
-    if(row == col)
-      dst.coeffRef(row, col) = numext::real(src.coeff(row, col));
-    else if(row < col)
-      dst.coeffRef(col, row) = numext::conj(dst.coeffRef(row, col) = src.coeff(row, col));
-  }
-};
-
-template<typename Derived1, typename Derived2, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, SelfAdjoint|Upper, 0, ClearOpposite>
-{
-  static inline void run(Derived1 &, const Derived2 &) {}
-};
-
-template<typename Derived1, typename Derived2, int UnrollCount, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, (SelfAdjoint|Lower), UnrollCount, ClearOpposite>
-{
-  enum {
-    col = (UnrollCount-1) / Derived1::RowsAtCompileTime,
-    row = (UnrollCount-1) % Derived1::RowsAtCompileTime
-  };
-
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    triangular_assignment_selector<Derived1, Derived2, (SelfAdjoint|Lower), UnrollCount-1, ClearOpposite>::run(dst, src);
-
-    if(row == col)
-      dst.coeffRef(row, col) = numext::real(src.coeff(row, col));
-    else if(row > col)
-      dst.coeffRef(col, row) = numext::conj(dst.coeffRef(row, col) = src.coeff(row, col));
-  }
-};
-
-template<typename Derived1, typename Derived2, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, SelfAdjoint|Lower, 0, ClearOpposite>
+// TODO currently a selfadjoint expression has the form SelfAdjointView<.,.>
+//      in the future selfadjoint-ness should be defined by the expression traits
+//      such that Transpose<SelfAdjointView<.,.> > is valid. (currently TriangularBase::transpose() is overloaded to make it work)
+template<typename MatrixType, unsigned int Mode>
+struct evaluator_traits<SelfAdjointView<MatrixType,Mode> >
 {
-  static inline void run(Derived1 &, const Derived2 &) {}
+  typedef typename storage_kind_to_evaluator_kind<typename MatrixType::StorageKind>::Kind Kind;
+  typedef SelfAdjointShape Shape;
 };
 
-template<typename Derived1, typename Derived2, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, SelfAdjoint|Upper, Dynamic, ClearOpposite>
+template<int UpLo, int SetOpposite, typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT, typename Functor, int Version>
+class triangular_dense_assignment_kernel<UpLo,SelfAdjoint,SetOpposite,DstEvaluatorTypeT,SrcEvaluatorTypeT,Functor,Version>
+  : public generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, Version>
 {
-  typedef typename Derived1::Index Index;
-  static inline void run(Derived1 &dst, const Derived2 &src)
+protected:
+  typedef generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, Version> Base;
+  typedef typename Base::DstXprType DstXprType;
+  typedef typename Base::SrcXprType SrcXprType;
+  using Base::m_dst;
+  using Base::m_src;
+  using Base::m_functor;
+public:
+  
+  typedef typename Base::DstEvaluatorType DstEvaluatorType;
+  typedef typename Base::SrcEvaluatorType SrcEvaluatorType;
+  typedef typename Base::Scalar Scalar;
+  typedef typename Base::AssignmentTraits AssignmentTraits;
+  
+  
+  EIGEN_DEVICE_FUNC triangular_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
+    : Base(dst, src, func, dstExpr)
+  {}
+  
+  EIGEN_DEVICE_FUNC void assignCoeff(Index row, Index col)
   {
-    for(Index j = 0; j < dst.cols(); ++j)
-    {
-      for(Index i = 0; i < j; ++i)
-      {
-        dst.copyCoeff(i, j, src);
-        dst.coeffRef(j,i) = numext::conj(dst.coeff(i,j));
-      }
-      dst.copyCoeff(j, j, src);
-    }
+    eigen_internal_assert(row!=col);
+    Scalar tmp = m_src.coeff(row,col);
+    m_functor.assignCoeff(m_dst.coeffRef(row,col), tmp);
+    m_functor.assignCoeff(m_dst.coeffRef(col,row), numext::conj(tmp));
   }
-};
-
-template<typename Derived1, typename Derived2, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, SelfAdjoint|Lower, Dynamic, ClearOpposite>
-{
-  static inline void run(Derived1 &dst, const Derived2 &src)
+  
+  EIGEN_DEVICE_FUNC void assignDiagonalCoeff(Index id)
   {
-  typedef typename Derived1::Index Index;
-    for(Index i = 0; i < dst.rows(); ++i)
-    {
-      for(Index j = 0; j < i; ++j)
-      {
-        dst.copyCoeff(i, j, src);
-        dst.coeffRef(j,i) = numext::conj(dst.coeff(i,j));
-      }
-      dst.copyCoeff(i, i, src);
-    }
+    Base::assignCoeff(id,id);
   }
+  
+  EIGEN_DEVICE_FUNC void assignOppositeCoeff(Index, Index)
+  { eigen_internal_assert(false && "should never be called"); }
 };
 
 } // end namespace internal
@@ -293,20 +319,30 @@ struct triangular_assignment_selector<Derived1, Derived2, SelfAdjoint|Lower, Dyn
 * Implementation of MatrixBase methods
 ***************************************************************************/
 
+/** This is the const version of MatrixBase::selfadjointView() */
 template<typename Derived>
 template<unsigned int UpLo>
 typename MatrixBase<Derived>::template ConstSelfAdjointViewReturnType<UpLo>::Type
 MatrixBase<Derived>::selfadjointView() const
 {
-  return derived();
+  return typename ConstSelfAdjointViewReturnType<UpLo>::Type(derived());
 }
 
+/** \returns an expression of a symmetric/self-adjoint view extracted from the upper or lower triangular part of the current matrix
+  *
+  * The parameter \a UpLo can be either \c #Upper or \c #Lower
+  *
+  * Example: \include MatrixBase_selfadjointView.cpp
+  * Output: \verbinclude MatrixBase_selfadjointView.out
+  *
+  * \sa class SelfAdjointView
+  */
 template<typename Derived>
 template<unsigned int UpLo>
 typename MatrixBase<Derived>::template SelfAdjointViewReturnType<UpLo>::Type
 MatrixBase<Derived>::selfadjointView()
 {
-  return derived();
+  return typename SelfAdjointViewReturnType<UpLo>::Type(derived());
 }
 
 } // end namespace Eigen
diff --git a/Eigen/src/Core/SelfCwiseBinaryOp.h b/Eigen/src/Core/SelfCwiseBinaryOp.h
index 22f3047b4..719ed72a5 100644
--- a/Eigen/src/Core/SelfCwiseBinaryOp.h
+++ b/Eigen/src/Core/SelfCwiseBinaryOp.h
@@ -12,183 +12,37 @@
 
 namespace Eigen { 
 
-/** \class SelfCwiseBinaryOp
-  * \ingroup Core_Module
-  *
-  * \internal
-  *
-  * \brief Internal helper class for optimizing operators like +=, -=
-  *
-  * This is a pseudo expression class re-implementing the copyCoeff/copyPacket
-  * method to directly performs a +=/-= operations in an optimal way. In particular,
-  * this allows to make sure that the input/output data are loaded only once using
-  * aligned packet loads.
-  *
-  * \sa class SwapWrapper for a similar trick.
-  */
+// TODO generalize the scalar type of 'other'
 
-namespace internal {
-template<typename BinaryOp, typename Lhs, typename Rhs>
-struct traits<SelfCwiseBinaryOp<BinaryOp,Lhs,Rhs> >
-  : traits<CwiseBinaryOp<BinaryOp,Lhs,Rhs> >
+template<typename Derived>
+EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator*=(const Scalar& other)
 {
-  enum {
-    // Note that it is still a good idea to preserve the DirectAccessBit
-    // so that assign can correctly align the data.
-    Flags = traits<CwiseBinaryOp<BinaryOp,Lhs,Rhs> >::Flags | (Lhs::Flags&DirectAccessBit) | (Lhs::Flags&LvalueBit),
-    OuterStrideAtCompileTime = Lhs::OuterStrideAtCompileTime,
-    InnerStrideAtCompileTime = Lhs::InnerStrideAtCompileTime
-  };
-};
+  typedef typename Derived::PlainObject PlainObject;
+  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::mul_assign_op<Scalar,Scalar>());
+  return derived();
 }
 
-template<typename BinaryOp, typename Lhs, typename Rhs> class SelfCwiseBinaryOp
-  : public internal::dense_xpr_base< SelfCwiseBinaryOp<BinaryOp, Lhs, Rhs> >::type
+template<typename Derived>
+EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator+=(const Scalar& other)
 {
-  public:
-
-    typedef typename internal::dense_xpr_base<SelfCwiseBinaryOp>::type Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(SelfCwiseBinaryOp)
-
-    typedef typename internal::packet_traits<Scalar>::type Packet;
-
-    inline SelfCwiseBinaryOp(Lhs& xpr, const BinaryOp& func = BinaryOp()) : m_matrix(xpr), m_functor(func) {}
-
-    inline Index rows() const { return m_matrix.rows(); }
-    inline Index cols() const { return m_matrix.cols(); }
-    inline Index outerStride() const { return m_matrix.outerStride(); }
-    inline Index innerStride() const { return m_matrix.innerStride(); }
-    inline const Scalar* data() const { return m_matrix.data(); }
-
-    // note that this function is needed by assign to correctly align loads/stores
-    // TODO make Assign use .data()
-    inline Scalar& coeffRef(Index row, Index col)
-    {
-      EIGEN_STATIC_ASSERT_LVALUE(Lhs)
-      return m_matrix.const_cast_derived().coeffRef(row, col);
-    }
-    inline const Scalar& coeffRef(Index row, Index col) const
-    {
-      return m_matrix.coeffRef(row, col);
-    }
-
-    // note that this function is needed by assign to correctly align loads/stores
-    // TODO make Assign use .data()
-    inline Scalar& coeffRef(Index index)
-    {
-      EIGEN_STATIC_ASSERT_LVALUE(Lhs)
-      return m_matrix.const_cast_derived().coeffRef(index);
-    }
-    inline const Scalar& coeffRef(Index index) const
-    {
-      return m_matrix.const_cast_derived().coeffRef(index);
-    }
-
-    template<typename OtherDerived>
-    void copyCoeff(Index row, Index col, const DenseBase<OtherDerived>& other)
-    {
-      OtherDerived& _other = other.const_cast_derived();
-      eigen_internal_assert(row >= 0 && row < rows()
-                         && col >= 0 && col < cols());
-      Scalar& tmp = m_matrix.coeffRef(row,col);
-      tmp = m_functor(tmp, _other.coeff(row,col));
-    }
-
-    template<typename OtherDerived>
-    void copyCoeff(Index index, const DenseBase<OtherDerived>& other)
-    {
-      OtherDerived& _other = other.const_cast_derived();
-      eigen_internal_assert(index >= 0 && index < m_matrix.size());
-      Scalar& tmp = m_matrix.coeffRef(index);
-      tmp = m_functor(tmp, _other.coeff(index));
-    }
-
-    template<typename OtherDerived, int StoreMode, int LoadMode>
-    void copyPacket(Index row, Index col, const DenseBase<OtherDerived>& other)
-    {
-      OtherDerived& _other = other.const_cast_derived();
-      eigen_internal_assert(row >= 0 && row < rows()
-                        && col >= 0 && col < cols());
-      m_matrix.template writePacket<StoreMode>(row, col,
-        m_functor.packetOp(m_matrix.template packet<StoreMode>(row, col),_other.template packet<LoadMode>(row, col)) );
-    }
-
-    template<typename OtherDerived, int StoreMode, int LoadMode>
-    void copyPacket(Index index, const DenseBase<OtherDerived>& other)
-    {
-      OtherDerived& _other = other.const_cast_derived();
-      eigen_internal_assert(index >= 0 && index < m_matrix.size());
-      m_matrix.template writePacket<StoreMode>(index,
-        m_functor.packetOp(m_matrix.template packet<StoreMode>(index),_other.template packet<LoadMode>(index)) );
-    }
-
-    // reimplement lazyAssign to handle complex *= real
-    // see CwiseBinaryOp ctor for details
-    template<typename RhsDerived>
-    EIGEN_STRONG_INLINE SelfCwiseBinaryOp& lazyAssign(const DenseBase<RhsDerived>& rhs)
-    {
-      EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Lhs,RhsDerived)
-      EIGEN_CHECK_BINARY_COMPATIBILIY(BinaryOp,typename Lhs::Scalar,typename RhsDerived::Scalar);
-      
-    #ifdef EIGEN_DEBUG_ASSIGN
-      internal::assign_traits<SelfCwiseBinaryOp, RhsDerived>::debug();
-    #endif
-      eigen_assert(rows() == rhs.rows() && cols() == rhs.cols());
-      internal::assign_impl<SelfCwiseBinaryOp, RhsDerived>::run(*this,rhs.derived());
-    #ifndef EIGEN_NO_DEBUG
-      this->checkTransposeAliasing(rhs.derived());
-    #endif
-      return *this;
-    }
-    
-    // overloaded to honor evaluation of special matrices
-    // maybe another solution would be to not use SelfCwiseBinaryOp
-    // at first...
-    SelfCwiseBinaryOp& operator=(const Rhs& _rhs)
-    {
-      typename internal::nested<Rhs>::type rhs(_rhs);
-      return Base::operator=(rhs);
-    }
-
-    Lhs& expression() const 
-    { 
-      return m_matrix;
-    }
-
-    const BinaryOp& functor() const 
-    { 
-      return m_functor;
-    }
-
-  protected:
-    Lhs& m_matrix;
-    const BinaryOp& m_functor;
-
-  private:
-    SelfCwiseBinaryOp& operator=(const SelfCwiseBinaryOp&);
-};
+  typedef typename Derived::PlainObject PlainObject;
+  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::add_assign_op<Scalar,Scalar>());
+  return derived();
+}
 
 template<typename Derived>
-inline Derived& DenseBase<Derived>::operator*=(const Scalar& other)
+EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator-=(const Scalar& other)
 {
   typedef typename Derived::PlainObject PlainObject;
-  SelfCwiseBinaryOp<internal::scalar_product_op<Scalar>, Derived, typename PlainObject::ConstantReturnType> tmp(derived());
-  tmp = PlainObject::Constant(rows(),cols(),other);
+  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::sub_assign_op<Scalar,Scalar>());
   return derived();
 }
 
 template<typename Derived>
-inline Derived& DenseBase<Derived>::operator/=(const Scalar& other)
+EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator/=(const Scalar& other)
 {
-  typedef typename internal::conditional<NumTraits<Scalar>::IsInteger,
-                                        internal::scalar_quotient_op<Scalar>,
-                                        internal::scalar_product_op<Scalar> >::type BinOp;
   typedef typename Derived::PlainObject PlainObject;
-  SelfCwiseBinaryOp<BinOp, Derived, typename PlainObject::ConstantReturnType> tmp(derived());
-  Scalar actual_other;
-  if(NumTraits<Scalar>::IsInteger)  actual_other = other;
-  else                              actual_other = Scalar(1)/other;
-  tmp = PlainObject::Constant(rows(),cols(), actual_other);
+  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::div_assign_op<Scalar,Scalar>());
   return derived();
 }
 
diff --git a/Eigen/src/Core/Solve.h b/Eigen/src/Core/Solve.h
new file mode 100644
index 000000000..960a58597
--- /dev/null
+++ b/Eigen/src/Core/Solve.h
@@ -0,0 +1,188 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SOLVE_H
+#define EIGEN_SOLVE_H
+
+namespace Eigen {
+
+template<typename Decomposition, typename RhsType, typename StorageKind> class SolveImpl;
+  
+/** \class Solve
+  * \ingroup Core_Module
+  *
+  * \brief Pseudo expression representing a solving operation
+  *
+  * \tparam Decomposition the type of the matrix or decomposion object
+  * \tparam Rhstype the type of the right-hand side
+  *
+  * This class represents an expression of A.solve(B)
+  * and most of the time this is the only way it is used.
+  *
+  */
+namespace internal {
+
+// this solve_traits class permits to determine the evaluation type with respect to storage kind (Dense vs Sparse)
+template<typename Decomposition, typename RhsType,typename StorageKind> struct solve_traits;
+
+template<typename Decomposition, typename RhsType>
+struct solve_traits<Decomposition,RhsType,Dense>
+{
+  typedef Matrix<typename RhsType::Scalar,
+                 Decomposition::ColsAtCompileTime,
+                 RhsType::ColsAtCompileTime,
+                 RhsType::PlainObject::Options,
+                 Decomposition::MaxColsAtCompileTime,
+                 RhsType::MaxColsAtCompileTime> PlainObject;  
+};
+
+template<typename Decomposition, typename RhsType>
+struct traits<Solve<Decomposition, RhsType> >
+  : traits<typename solve_traits<Decomposition,RhsType,typename internal::traits<RhsType>::StorageKind>::PlainObject>
+{
+  typedef typename solve_traits<Decomposition,RhsType,typename internal::traits<RhsType>::StorageKind>::PlainObject PlainObject;
+  typedef typename promote_index_type<typename Decomposition::StorageIndex, typename RhsType::StorageIndex>::type StorageIndex;
+  typedef traits<PlainObject> BaseTraits;
+  enum {
+    Flags = BaseTraits::Flags & RowMajorBit,
+    CoeffReadCost = HugeCost
+  };
+};
+
+}
+
+
+template<typename Decomposition, typename RhsType>
+class Solve : public SolveImpl<Decomposition,RhsType,typename internal::traits<RhsType>::StorageKind>
+{
+public:
+  typedef typename internal::traits<Solve>::PlainObject PlainObject;
+  typedef typename internal::traits<Solve>::StorageIndex StorageIndex;
+  
+  Solve(const Decomposition &dec, const RhsType &rhs)
+    : m_dec(dec), m_rhs(rhs)
+  {}
+  
+  EIGEN_DEVICE_FUNC Index rows() const { return m_dec.cols(); }
+  EIGEN_DEVICE_FUNC Index cols() const { return m_rhs.cols(); }
+
+  EIGEN_DEVICE_FUNC const Decomposition& dec() const { return m_dec; }
+  EIGEN_DEVICE_FUNC const RhsType&       rhs() const { return m_rhs; }
+
+protected:
+  const Decomposition &m_dec;
+  const RhsType       &m_rhs;
+};
+
+
+// Specialization of the Solve expression for dense results
+template<typename Decomposition, typename RhsType>
+class SolveImpl<Decomposition,RhsType,Dense>
+  : public MatrixBase<Solve<Decomposition,RhsType> >
+{
+  typedef Solve<Decomposition,RhsType> Derived;
+  
+public:
+  
+  typedef MatrixBase<Solve<Decomposition,RhsType> > Base;
+  EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
+
+private:
+  
+  Scalar coeff(Index row, Index col) const;
+  Scalar coeff(Index i) const;
+};
+
+// Generic API dispatcher
+template<typename Decomposition, typename RhsType, typename StorageKind>
+class SolveImpl : public internal::generic_xpr_base<Solve<Decomposition,RhsType>, MatrixXpr, StorageKind>::type
+{
+  public:
+    typedef typename internal::generic_xpr_base<Solve<Decomposition,RhsType>, MatrixXpr, StorageKind>::type Base;
+};
+
+namespace internal {
+
+// Evaluator of Solve -> eval into a temporary
+template<typename Decomposition, typename RhsType>
+struct evaluator<Solve<Decomposition,RhsType> >
+  : public evaluator<typename Solve<Decomposition,RhsType>::PlainObject>
+{
+  typedef Solve<Decomposition,RhsType> SolveType;
+  typedef typename SolveType::PlainObject PlainObject;
+  typedef evaluator<PlainObject> Base;
+
+  enum { Flags = Base::Flags | EvalBeforeNestingBit };
+  
+  EIGEN_DEVICE_FUNC explicit evaluator(const SolveType& solve)
+    : m_result(solve.rows(), solve.cols())
+  {
+    ::new (static_cast<Base*>(this)) Base(m_result);
+    solve.dec()._solve_impl(solve.rhs(), m_result);
+  }
+  
+protected:  
+  PlainObject m_result;
+};
+
+// Specialization for "dst = dec.solve(rhs)"
+// NOTE we need to specialize it for Dense2Dense to avoid ambiguous specialization error and a Sparse2Sparse specialization must exist somewhere
+template<typename DstXprType, typename DecType, typename RhsType, typename Scalar>
+struct Assignment<DstXprType, Solve<DecType,RhsType>, internal::assign_op<Scalar,Scalar>, Dense2Dense>
+{
+  typedef Solve<DecType,RhsType> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
+  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
+
+    src.dec()._solve_impl(src.rhs(), dst);
+  }
+};
+
+// Specialization for "dst = dec.transpose().solve(rhs)"
+template<typename DstXprType, typename DecType, typename RhsType, typename Scalar>
+struct Assignment<DstXprType, Solve<Transpose<const DecType>,RhsType>, internal::assign_op<Scalar,Scalar>, Dense2Dense>
+{
+  typedef Solve<Transpose<const DecType>,RhsType> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
+  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
+
+    src.dec().nestedExpression().template _solve_impl_transposed<false>(src.rhs(), dst);
+  }
+};
+
+// Specialization for "dst = dec.adjoint().solve(rhs)"
+template<typename DstXprType, typename DecType, typename RhsType, typename Scalar>
+struct Assignment<DstXprType, Solve<CwiseUnaryOp<internal::scalar_conjugate_op<typename DecType::Scalar>, const Transpose<const DecType> >,RhsType>,
+                  internal::assign_op<Scalar,Scalar>, Dense2Dense>
+{
+  typedef Solve<CwiseUnaryOp<internal::scalar_conjugate_op<typename DecType::Scalar>, const Transpose<const DecType> >,RhsType> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
+  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
+    
+    src.dec().nestedExpression().nestedExpression().template _solve_impl_transposed<true>(src.rhs(), dst);
+  }
+};
+
+} // end namepsace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SOLVE_H
diff --git a/Eigen/src/Core/SolveTriangular.h b/Eigen/src/Core/SolveTriangular.h
index ef17f288e..049890b25 100644
--- a/Eigen/src/Core/SolveTriangular.h
+++ b/Eigen/src/Core/SolveTriangular.h
@@ -68,7 +68,7 @@ struct triangular_solver_selector<Lhs,Rhs,Side,Mode,NoUnrolling,1>
     if(!useRhsDirectly)
       MappedRhs(actualRhs,rhs.size()) = rhs;
 
-    triangular_solve_vector<LhsScalar, RhsScalar, typename Lhs::Index, Side, Mode, LhsProductTraits::NeedToConjugate,
+    triangular_solve_vector<LhsScalar, RhsScalar, Index, Side, Mode, LhsProductTraits::NeedToConjugate,
                             (int(Lhs::Flags) & RowMajorBit) ? RowMajor : ColMajor>
       ::run(actualLhs.cols(), actualLhs.data(), actualLhs.outerStride(), actualRhs);
 
@@ -82,7 +82,6 @@ template<typename Lhs, typename Rhs, int Side, int Mode>
 struct triangular_solver_selector<Lhs,Rhs,Side,Mode,NoUnrolling,Dynamic>
 {
   typedef typename Rhs::Scalar Scalar;
-  typedef typename Rhs::Index Index;
   typedef blas_traits<Lhs> LhsProductTraits;
   typedef typename LhsProductTraits::DirectLinearAccessType ActualLhsType;
 
@@ -96,7 +95,7 @@ struct triangular_solver_selector<Lhs,Rhs,Side,Mode,NoUnrolling,Dynamic>
     typedef internal::gemm_blocking_space<(Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar,
               Rhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxRowsAtCompileTime,4> BlockingType;
 
-    BlockingType blocking(rhs.rows(), rhs.cols(), size);
+    BlockingType blocking(rhs.rows(), rhs.cols(), size, 1, false);
 
     triangular_solve_matrix<Scalar,Index,Side,Mode,LhsProductTraits::NeedToConjugate,(int(Lhs::Flags) & RowMajorBit) ? RowMajor : ColMajor,
                                (Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor>
@@ -108,32 +107,32 @@ struct triangular_solver_selector<Lhs,Rhs,Side,Mode,NoUnrolling,Dynamic>
 * meta-unrolling implementation
 ***************************************************************************/
 
-template<typename Lhs, typename Rhs, int Mode, int Index, int Size,
-         bool Stop = Index==Size>
+template<typename Lhs, typename Rhs, int Mode, int LoopIndex, int Size,
+         bool Stop = LoopIndex==Size>
 struct triangular_solver_unroller;
 
-template<typename Lhs, typename Rhs, int Mode, int Index, int Size>
-struct triangular_solver_unroller<Lhs,Rhs,Mode,Index,Size,false> {
+template<typename Lhs, typename Rhs, int Mode, int LoopIndex, int Size>
+struct triangular_solver_unroller<Lhs,Rhs,Mode,LoopIndex,Size,false> {
   enum {
     IsLower = ((Mode&Lower)==Lower),
-    I = IsLower ? Index : Size - Index - 1,
-    S = IsLower ? 0     : I+1
+    DiagIndex  = IsLower ? LoopIndex : Size - LoopIndex - 1,
+    StartIndex = IsLower ? 0         : DiagIndex+1
   };
   static void run(const Lhs& lhs, Rhs& rhs)
   {
-    if (Index>0)
-      rhs.coeffRef(I) -= lhs.row(I).template segment<Index>(S).transpose()
-                         .cwiseProduct(rhs.template segment<Index>(S)).sum();
+    if (LoopIndex>0)
+      rhs.coeffRef(DiagIndex) -= lhs.row(DiagIndex).template segment<LoopIndex>(StartIndex).transpose()
+                                .cwiseProduct(rhs.template segment<LoopIndex>(StartIndex)).sum();
 
     if(!(Mode & UnitDiag))
-      rhs.coeffRef(I) /= lhs.coeff(I,I);
+      rhs.coeffRef(DiagIndex) /= lhs.coeff(DiagIndex,DiagIndex);
 
-    triangular_solver_unroller<Lhs,Rhs,Mode,Index+1,Size>::run(lhs,rhs);
+    triangular_solver_unroller<Lhs,Rhs,Mode,LoopIndex+1,Size>::run(lhs,rhs);
   }
 };
 
-template<typename Lhs, typename Rhs, int Mode, int Index, int Size>
-struct triangular_solver_unroller<Lhs,Rhs,Mode,Index,Size,true> {
+template<typename Lhs, typename Rhs, int Mode, int LoopIndex, int Size>
+struct triangular_solver_unroller<Lhs,Rhs,Mode,LoopIndex,Size,true> {
   static void run(const Lhs&, Rhs&) {}
 };
 
@@ -162,61 +161,35 @@ struct triangular_solver_selector<Lhs,Rhs,OnTheRight,Mode,CompleteUnrolling,1> {
 * TriangularView methods
 ***************************************************************************/
 
-/** "in-place" version of TriangularView::solve() where the result is written in \a other
-  *
-  * \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here.
-  * This function will const_cast it, so constness isn't honored here.
-  *
-  * See TriangularView:solve() for the details.
-  */
+#ifndef EIGEN_PARSED_BY_DOXYGEN
 template<typename MatrixType, unsigned int Mode>
 template<int Side, typename OtherDerived>
-void TriangularView<MatrixType,Mode>::solveInPlace(const MatrixBase<OtherDerived>& _other) const
+void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(const MatrixBase<OtherDerived>& _other) const
 {
   OtherDerived& other = _other.const_cast_derived();
-  eigen_assert( cols() == rows() && ((Side==OnTheLeft && cols() == other.rows()) || (Side==OnTheRight && cols() == other.cols())) );
+  eigen_assert( derived().cols() == derived().rows() && ((Side==OnTheLeft && derived().cols() == other.rows()) || (Side==OnTheRight && derived().cols() == other.cols())) );
   eigen_assert((!(Mode & ZeroDiag)) && bool(Mode & (Upper|Lower)));
 
-  enum { copy = internal::traits<OtherDerived>::Flags & RowMajorBit  && OtherDerived::IsVectorAtCompileTime };
+  enum { copy = (internal::traits<OtherDerived>::Flags & RowMajorBit)  && OtherDerived::IsVectorAtCompileTime && OtherDerived::SizeAtCompileTime!=1};
   typedef typename internal::conditional<copy,
     typename internal::plain_matrix_type_column_major<OtherDerived>::type, OtherDerived&>::type OtherCopy;
   OtherCopy otherCopy(other);
 
   internal::triangular_solver_selector<MatrixType, typename internal::remove_reference<OtherCopy>::type,
-    Side, Mode>::run(nestedExpression(), otherCopy);
+    Side, Mode>::run(derived().nestedExpression(), otherCopy);
 
   if (copy)
     other = otherCopy;
 }
 
-/** \returns the product of the inverse of \c *this with \a other, \a *this being triangular.
-  *
-  * This function computes the inverse-matrix matrix product inverse(\c *this) * \a other if
-  * \a Side==OnTheLeft (the default), or the right-inverse-multiply  \a other * inverse(\c *this) if
-  * \a Side==OnTheRight.
-  *
-  * The matrix \c *this must be triangular and invertible (i.e., all the coefficients of the
-  * diagonal must be non zero). It works as a forward (resp. backward) substitution if \c *this
-  * is an upper (resp. lower) triangular matrix.
-  *
-  * Example: \include MatrixBase_marked.cpp
-  * Output: \verbinclude MatrixBase_marked.out
-  *
-  * This function returns an expression of the inverse-multiply and can works in-place if it is assigned
-  * to the same matrix or vector \a other.
-  *
-  * For users coming from BLAS, this function (and more specifically solveInPlace()) offer
-  * all the operations supported by the \c *TRSV and \c *TRSM BLAS routines.
-  *
-  * \sa TriangularView::solveInPlace()
-  */
 template<typename Derived, unsigned int Mode>
 template<int Side, typename Other>
 const internal::triangular_solve_retval<Side,TriangularView<Derived,Mode>,Other>
-TriangularView<Derived,Mode>::solve(const MatrixBase<Other>& other) const
+TriangularViewImpl<Derived,Mode,Dense>::solve(const MatrixBase<Other>& other) const
 {
-  return internal::triangular_solve_retval<Side,TriangularView,Other>(*this, other.derived());
+  return internal::triangular_solve_retval<Side,TriangularViewType,Other>(derived(), other.derived());
 }
+#endif
 
 namespace internal {
 
@@ -232,7 +205,6 @@ template<int Side, typename TriangularType, typename Rhs> struct triangular_solv
 {
   typedef typename remove_all<typename Rhs::Nested>::type RhsNestedCleaned;
   typedef ReturnByValue<triangular_solve_retval> Base;
-  typedef typename Base::Index Index;
 
   triangular_solve_retval(const TriangularType& tri, const Rhs& rhs)
     : m_triangularMatrix(tri), m_rhs(rhs)
@@ -243,7 +215,7 @@ template<int Side, typename TriangularType, typename Rhs> struct triangular_solv
 
   template<typename Dest> inline void evalTo(Dest& dst) const
   {
-    if(!(is_same<RhsNestedCleaned,Dest>::value && extract_data(dst) == extract_data(m_rhs)))
+    if(!is_same_dense(dst,m_rhs))
       dst = m_rhs;
     m_triangularMatrix.template solveInPlace<Side>(dst);
   }
diff --git a/Eigen/src/Core/SolverBase.h b/Eigen/src/Core/SolverBase.h
new file mode 100644
index 000000000..8a4adc229
--- /dev/null
+++ b/Eigen/src/Core/SolverBase.h
@@ -0,0 +1,130 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SOLVERBASE_H
+#define EIGEN_SOLVERBASE_H
+
+namespace Eigen {
+
+namespace internal {
+
+
+
+} // end namespace internal
+
+/** \class SolverBase
+  * \brief A base class for matrix decomposition and solvers
+  *
+  * \tparam Derived the actual type of the decomposition/solver.
+  *
+  * Any matrix decomposition inheriting this base class provide the following API:
+  *
+  * \code
+  * MatrixType A, b, x;
+  * DecompositionType dec(A);
+  * x = dec.solve(b);             // solve A   * x = b
+  * x = dec.transpose().solve(b); // solve A^T * x = b
+  * x = dec.adjoint().solve(b);   // solve A'  * x = b
+  * \endcode
+  *
+  * \warning Currently, any other usage of transpose() and adjoint() are not supported and will produce compilation errors.
+  *
+  * \sa class PartialPivLU, class FullPivLU
+  */
+template<typename Derived>
+class SolverBase : public EigenBase<Derived>
+{
+  public:
+
+    typedef EigenBase<Derived> Base;
+    typedef typename internal::traits<Derived>::Scalar Scalar;
+    typedef Scalar CoeffReturnType;
+
+    enum {
+      RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
+      ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
+      SizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::RowsAtCompileTime,
+                                                          internal::traits<Derived>::ColsAtCompileTime>::ret),
+      MaxRowsAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime,
+      MaxColsAtCompileTime = internal::traits<Derived>::MaxColsAtCompileTime,
+      MaxSizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::MaxRowsAtCompileTime,
+                                                             internal::traits<Derived>::MaxColsAtCompileTime>::ret),
+      IsVectorAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime == 1
+                           || internal::traits<Derived>::MaxColsAtCompileTime == 1
+    };
+
+    /** Default constructor */
+    SolverBase()
+    {}
+
+    ~SolverBase()
+    {}
+
+    using Base::derived;
+
+    /** \returns an expression of the solution x of \f$ A x = b \f$ using the current decomposition of A.
+      */
+    template<typename Rhs>
+    inline const Solve<Derived, Rhs>
+    solve(const MatrixBase<Rhs>& b) const
+    {
+      eigen_assert(derived().rows()==b.rows() && "solve(): invalid number of rows of the right hand side matrix b");
+      return Solve<Derived, Rhs>(derived(), b.derived());
+    }
+
+    /** \internal the return type of transpose() */
+    typedef typename internal::add_const<Transpose<const Derived> >::type ConstTransposeReturnType;
+    /** \returns an expression of the transposed of the factored matrix.
+      *
+      * A typical usage is to solve for the transposed problem A^T x = b:
+      * \code x = dec.transpose().solve(b); \endcode
+      *
+      * \sa adjoint(), solve()
+      */
+    inline ConstTransposeReturnType transpose() const
+    {
+      return ConstTransposeReturnType(derived());
+    }
+
+    /** \internal the return type of adjoint() */
+    typedef typename internal::conditional<NumTraits<Scalar>::IsComplex,
+                        CwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, ConstTransposeReturnType>,
+                        ConstTransposeReturnType
+                     >::type AdjointReturnType;
+    /** \returns an expression of the adjoint of the factored matrix
+      *
+      * A typical usage is to solve for the adjoint problem A' x = b:
+      * \code x = dec.adjoint().solve(b); \endcode
+      *
+      * For real scalar types, this function is equivalent to transpose().
+      *
+      * \sa transpose(), solve()
+      */
+    inline AdjointReturnType adjoint() const
+    {
+      return AdjointReturnType(derived().transpose());
+    }
+
+  protected:
+};
+
+namespace internal {
+
+template<typename Derived>
+struct generic_xpr_base<Derived, MatrixXpr, SolverStorage>
+{
+  typedef SolverBase<Derived> type;
+
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SOLVERBASE_H
diff --git a/Eigen/src/Core/StableNorm.h b/Eigen/src/Core/StableNorm.h
index 389d94275..d2fe1e199 100644
--- a/Eigen/src/Core/StableNorm.h
+++ b/Eigen/src/Core/StableNorm.h
@@ -17,10 +17,9 @@ namespace internal {
 template<typename ExpressionType, typename Scalar>
 inline void stable_norm_kernel(const ExpressionType& bl, Scalar& ssq, Scalar& scale, Scalar& invScale)
 {
-  using std::max;
   Scalar maxCoeff = bl.cwiseAbs().maxCoeff();
   
-  if (maxCoeff>scale)
+  if(maxCoeff>scale)
   {
     ssq = ssq * numext::abs2(scale/maxCoeff);
     Scalar tmp = Scalar(1)/maxCoeff;
@@ -29,12 +28,21 @@ inline void stable_norm_kernel(const ExpressionType& bl, Scalar& ssq, Scalar& sc
       invScale = NumTraits<Scalar>::highest();
       scale = Scalar(1)/invScale;
     }
+    else if(maxCoeff>NumTraits<Scalar>::highest()) // we got a INF
+    {
+      invScale = Scalar(1);
+      scale = maxCoeff;
+    }
     else
     {
       scale = maxCoeff;
       invScale = tmp;
     }
   }
+  else if(maxCoeff!=maxCoeff) // we got a NaN
+  {
+    scale = maxCoeff;
+  }
   
   // TODO if the maxCoeff is much much smaller than the current scale,
   // then we can neglect this sub vector
@@ -47,15 +55,12 @@ inline typename NumTraits<typename traits<Derived>::Scalar>::Real
 blueNorm_impl(const EigenBase<Derived>& _vec)
 {
   typedef typename Derived::RealScalar RealScalar;  
-  typedef typename Derived::Index Index;
   using std::pow;
-  using std::min;
-  using std::max;
   using std::sqrt;
   using std::abs;
   const Derived& vec(_vec.derived());
   static bool initialized = false;
-  static RealScalar b1, b2, s1m, s2m, overfl, rbig, relerr;
+  static RealScalar b1, b2, s1m, s2m, rbig, relerr;
   if(!initialized)
   {
     int ibeta, it, iemin, iemax, iexp;
@@ -84,7 +89,6 @@ blueNorm_impl(const EigenBase<Derived>& _vec)
     iexp  = - ((iemax+it)/2);
     s2m   = RealScalar(pow(RealScalar(ibeta),RealScalar(iexp)));    // scaling factor for upper range
 
-    overfl  = rbig*s2m;                                             // overflow boundary for abig
     eps     = RealScalar(pow(double(ibeta), 1-it));
     relerr  = sqrt(eps);                                            // tolerance for neglecting asml
     initialized = true;
@@ -101,13 +105,13 @@ blueNorm_impl(const EigenBase<Derived>& _vec)
     else if(ax < b1) asml += numext::abs2(ax*s1m);
     else             amed += numext::abs2(ax);
   }
+  if(amed!=amed)
+    return amed;  // we got a NaN
   if(abig > RealScalar(0))
   {
     abig = sqrt(abig);
-    if(abig > overfl)
-    {
-      return rbig;
-    }
+    if(abig > rbig) // overflow, or *this contains INF values
+      return abig;  // return INF
     if(amed > RealScalar(0))
     {
       abig = abig/s2m;
@@ -128,8 +132,8 @@ blueNorm_impl(const EigenBase<Derived>& _vec)
   }
   else
     return sqrt(amed);
-  asml = (min)(abig, amed);
-  abig = (max)(abig, amed);
+  asml = numext::mini(abig, amed);
+  abig = numext::maxi(abig, amed);
   if(asml <= abig*relerr)
     return abig;
   else
@@ -152,21 +156,34 @@ template<typename Derived>
 inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
 MatrixBase<Derived>::stableNorm() const
 {
-  using std::min;
   using std::sqrt;
+  using std::abs;
   const Index blockSize = 4096;
   RealScalar scale(0);
   RealScalar invScale(1);
   RealScalar ssq(0); // sum of square
+  
+  typedef typename internal::nested_eval<Derived,2>::type DerivedCopy;
+  typedef typename internal::remove_all<DerivedCopy>::type DerivedCopyClean;
+  DerivedCopy copy(derived());
+  
   enum {
-    Alignment = (int(Flags)&DirectAccessBit) || (int(Flags)&AlignedBit) ? 1 : 0
+    CanAlign = (   (int(DerivedCopyClean::Flags)&DirectAccessBit)
+                || (int(internal::evaluator<DerivedCopyClean>::Alignment)>0) // FIXME Alignment)>0 might not be enough
+               ) && (blockSize*sizeof(Scalar)*2<EIGEN_STACK_ALLOCATION_LIMIT) // ifwe cannot allocate on the stack, then let's not bother about this optimization
   };
+  typedef typename internal::conditional<CanAlign, Ref<const Matrix<Scalar,Dynamic,1,0,blockSize,1>, internal::evaluator<DerivedCopyClean>::Alignment>,
+                                                   typename DerivedCopyClean::ConstSegmentReturnType>::type SegmentWrapper;
   Index n = size();
-  Index bi = internal::first_aligned(derived());
+  
+  if(n==1)
+    return abs(this->coeff(0));
+  
+  Index bi = internal::first_default_aligned(copy);
   if (bi>0)
-    internal::stable_norm_kernel(this->head(bi), ssq, scale, invScale);
+    internal::stable_norm_kernel(copy.head(bi), ssq, scale, invScale);
   for (; bi<n; bi+=blockSize)
-    internal::stable_norm_kernel(this->segment(bi,(min)(blockSize, n - bi)).template forceAlignedAccessIf<Alignment>(), ssq, scale, invScale);
+    internal::stable_norm_kernel(SegmentWrapper(copy.segment(bi,numext::mini(blockSize, n - bi))), ssq, scale, invScale);
   return scale * sqrt(ssq);
 }
 
diff --git a/Eigen/src/Core/Stride.h b/Eigen/src/Core/Stride.h
index 1e3f5fe9f..513742f34 100644
--- a/Eigen/src/Core/Stride.h
+++ b/Eigen/src/Core/Stride.h
@@ -31,8 +31,8 @@ namespace Eigen {
   * arguments to the constructor.
   *
   * Indeed, this class takes two template parameters:
-  *  \param _OuterStrideAtCompileTime the outer stride, or Dynamic if you want to specify it at runtime.
-  *  \param _InnerStrideAtCompileTime the inner stride, or Dynamic if you want to specify it at runtime.
+  *  \tparam _OuterStrideAtCompileTime the outer stride, or Dynamic if you want to specify it at runtime.
+  *  \tparam _InnerStrideAtCompileTime the inner stride, or Dynamic if you want to specify it at runtime.
   *
   * Here is an example:
   * \include Map_general_stride.cpp
@@ -44,13 +44,14 @@ template<int _OuterStrideAtCompileTime, int _InnerStrideAtCompileTime>
 class Stride
 {
   public:
-    typedef DenseIndex Index;
+    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
     enum {
       InnerStrideAtCompileTime = _InnerStrideAtCompileTime,
       OuterStrideAtCompileTime = _OuterStrideAtCompileTime
     };
 
     /** Default constructor, for use when strides are fixed at compile time */
+    EIGEN_DEVICE_FUNC
     Stride()
       : m_outer(OuterStrideAtCompileTime), m_inner(InnerStrideAtCompileTime)
     {
@@ -58,6 +59,7 @@ class Stride
     }
 
     /** Constructor allowing to pass the strides at runtime */
+    EIGEN_DEVICE_FUNC
     Stride(Index outerStride, Index innerStride)
       : m_outer(outerStride), m_inner(innerStride)
     {
@@ -65,13 +67,16 @@ class Stride
     }
 
     /** Copy constructor */
+    EIGEN_DEVICE_FUNC
     Stride(const Stride& other)
       : m_outer(other.outer()), m_inner(other.inner())
     {}
 
     /** \returns the outer stride */
+    EIGEN_DEVICE_FUNC
     inline Index outer() const { return m_outer.value(); }
     /** \returns the inner stride */
+    EIGEN_DEVICE_FUNC
     inline Index inner() const { return m_inner.value(); }
 
   protected:
@@ -81,26 +86,24 @@ class Stride
 
 /** \brief Convenience specialization of Stride to specify only an inner stride
   * See class Map for some examples */
-template<int Value = Dynamic>
+template<int Value>
 class InnerStride : public Stride<0, Value>
 {
     typedef Stride<0, Value> Base;
   public:
-    typedef DenseIndex Index;
-    InnerStride() : Base() {}
-    InnerStride(Index v) : Base(0, v) {}
+    EIGEN_DEVICE_FUNC InnerStride() : Base() {}
+    EIGEN_DEVICE_FUNC InnerStride(Index v) : Base(0, v) {} // FIXME making this explicit could break valid code
 };
 
 /** \brief Convenience specialization of Stride to specify only an outer stride
   * See class Map for some examples */
-template<int Value = Dynamic>
+template<int Value>
 class OuterStride : public Stride<Value, 0>
 {
     typedef Stride<Value, 0> Base;
   public:
-    typedef DenseIndex Index;
-    OuterStride() : Base() {}
-    OuterStride(Index v) : Base(v,0) {}
+    EIGEN_DEVICE_FUNC OuterStride() : Base() {}
+    EIGEN_DEVICE_FUNC OuterStride(Index v) : Base(v,0) {} // FIXME making this explicit could break valid code
 };
 
 } // end namespace Eigen
diff --git a/Eigen/src/Core/Swap.h b/Eigen/src/Core/Swap.h
index bf58bd599..d70200918 100644
--- a/Eigen/src/Core/Swap.h
+++ b/Eigen/src/Core/Swap.h
@@ -12,115 +12,56 @@
 
 namespace Eigen { 
 
-/** \class SwapWrapper
-  * \ingroup Core_Module
-  *
-  * \internal
-  *
-  * \brief Internal helper class for swapping two expressions
-  */
 namespace internal {
-template<typename ExpressionType>
-struct traits<SwapWrapper<ExpressionType> > : traits<ExpressionType> {};
-}
 
-template<typename ExpressionType> class SwapWrapper
-  : public internal::dense_xpr_base<SwapWrapper<ExpressionType> >::type
+// Overload default assignPacket behavior for swapping them
+template<typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT>
+class generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, swap_assign_op<typename DstEvaluatorTypeT::Scalar>, Specialized>
+ : public generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, swap_assign_op<typename DstEvaluatorTypeT::Scalar>, BuiltIn>
 {
-  public:
-
-    typedef typename internal::dense_xpr_base<SwapWrapper>::type Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(SwapWrapper)
-    typedef typename internal::packet_traits<Scalar>::type Packet;
-
-    inline SwapWrapper(ExpressionType& xpr) : m_expression(xpr) {}
-
-    inline Index rows() const { return m_expression.rows(); }
-    inline Index cols() const { return m_expression.cols(); }
-    inline Index outerStride() const { return m_expression.outerStride(); }
-    inline Index innerStride() const { return m_expression.innerStride(); }
-    
-    typedef typename internal::conditional<
-                       internal::is_lvalue<ExpressionType>::value,
-                       Scalar,
-                       const Scalar
-                     >::type ScalarWithConstIfNotLvalue;
-                     
-    inline ScalarWithConstIfNotLvalue* data() { return m_expression.data(); }
-    inline const Scalar* data() const { return m_expression.data(); }
-
-    inline Scalar& coeffRef(Index rowId, Index colId)
-    {
-      return m_expression.const_cast_derived().coeffRef(rowId, colId);
-    }
-
-    inline Scalar& coeffRef(Index index)
-    {
-      return m_expression.const_cast_derived().coeffRef(index);
-    }
-
-    inline Scalar& coeffRef(Index rowId, Index colId) const
-    {
-      return m_expression.coeffRef(rowId, colId);
-    }
-
-    inline Scalar& coeffRef(Index index) const
-    {
-      return m_expression.coeffRef(index);
-    }
-
-    template<typename OtherDerived>
-    void copyCoeff(Index rowId, Index colId, const DenseBase<OtherDerived>& other)
-    {
-      OtherDerived& _other = other.const_cast_derived();
-      eigen_internal_assert(rowId >= 0 && rowId < rows()
-                         && colId >= 0 && colId < cols());
-      Scalar tmp = m_expression.coeff(rowId, colId);
-      m_expression.coeffRef(rowId, colId) = _other.coeff(rowId, colId);
-      _other.coeffRef(rowId, colId) = tmp;
-    }
-
-    template<typename OtherDerived>
-    void copyCoeff(Index index, const DenseBase<OtherDerived>& other)
-    {
-      OtherDerived& _other = other.const_cast_derived();
-      eigen_internal_assert(index >= 0 && index < m_expression.size());
-      Scalar tmp = m_expression.coeff(index);
-      m_expression.coeffRef(index) = _other.coeff(index);
-      _other.coeffRef(index) = tmp;
-    }
-
-    template<typename OtherDerived, int StoreMode, int LoadMode>
-    void copyPacket(Index rowId, Index colId, const DenseBase<OtherDerived>& other)
-    {
-      OtherDerived& _other = other.const_cast_derived();
-      eigen_internal_assert(rowId >= 0 && rowId < rows()
-                        && colId >= 0 && colId < cols());
-      Packet tmp = m_expression.template packet<StoreMode>(rowId, colId);
-      m_expression.template writePacket<StoreMode>(rowId, colId,
-        _other.template packet<LoadMode>(rowId, colId)
-      );
-      _other.template writePacket<LoadMode>(rowId, colId, tmp);
-    }
-
-    template<typename OtherDerived, int StoreMode, int LoadMode>
-    void copyPacket(Index index, const DenseBase<OtherDerived>& other)
-    {
-      OtherDerived& _other = other.const_cast_derived();
-      eigen_internal_assert(index >= 0 && index < m_expression.size());
-      Packet tmp = m_expression.template packet<StoreMode>(index);
-      m_expression.template writePacket<StoreMode>(index,
-        _other.template packet<LoadMode>(index)
-      );
-      _other.template writePacket<LoadMode>(index, tmp);
-    }
-
-    ExpressionType& expression() const { return m_expression; }
-
-  protected:
-    ExpressionType& m_expression;
+protected:
+  typedef generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, swap_assign_op<typename DstEvaluatorTypeT::Scalar>, BuiltIn> Base;
+  using Base::m_dst;
+  using Base::m_src;
+  using Base::m_functor;
+  
+public:
+  typedef typename Base::Scalar Scalar;
+  typedef typename Base::DstXprType DstXprType;
+  typedef swap_assign_op<Scalar> Functor;
+  
+  EIGEN_DEVICE_FUNC generic_dense_assignment_kernel(DstEvaluatorTypeT &dst, const SrcEvaluatorTypeT &src, const Functor &func, DstXprType& dstExpr)
+    : Base(dst, src, func, dstExpr)
+  {}
+  
+  template<int StoreMode, int LoadMode, typename PacketType>
+  void assignPacket(Index row, Index col)
+  {
+    PacketType tmp = m_src.template packet<LoadMode,PacketType>(row,col);
+    const_cast<SrcEvaluatorTypeT&>(m_src).template writePacket<LoadMode>(row,col, m_dst.template packet<StoreMode,PacketType>(row,col));
+    m_dst.template writePacket<StoreMode>(row,col,tmp);
+  }
+  
+  template<int StoreMode, int LoadMode, typename PacketType>
+  void assignPacket(Index index)
+  {
+    PacketType tmp = m_src.template packet<LoadMode,PacketType>(index);
+    const_cast<SrcEvaluatorTypeT&>(m_src).template writePacket<LoadMode>(index, m_dst.template packet<StoreMode,PacketType>(index));
+    m_dst.template writePacket<StoreMode>(index,tmp);
+  }
+  
+  // TODO find a simple way not to have to copy/paste this function from generic_dense_assignment_kernel, by simple I mean no CRTP (Gael)
+  template<int StoreMode, int LoadMode, typename PacketType>
+  void assignPacketByOuterInner(Index outer, Index inner)
+  {
+    Index row = Base::rowIndexByOuterInner(outer, inner); 
+    Index col = Base::colIndexByOuterInner(outer, inner);
+    assignPacket<StoreMode,LoadMode,PacketType>(row, col);
+  }
 };
 
+} // namespace internal
+
 } // end namespace Eigen
 
 #endif // EIGEN_SWAP_H
diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h
index 22096ea2f..79b767bcc 100644
--- a/Eigen/src/Core/Transpose.h
+++ b/Eigen/src/Core/Transpose.h
@@ -2,7 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2009-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -13,39 +13,21 @@
 
 namespace Eigen { 
 
-/** \class Transpose
-  * \ingroup Core_Module
-  *
-  * \brief Expression of the transpose of a matrix
-  *
-  * \param MatrixType the type of the object of which we are taking the transpose
-  *
-  * This class represents an expression of the transpose of a matrix.
-  * It is the return type of MatrixBase::transpose() and MatrixBase::adjoint()
-  * and most of the time this is the only way it is used.
-  *
-  * \sa MatrixBase::transpose(), MatrixBase::adjoint()
-  */
-
 namespace internal {
 template<typename MatrixType>
-struct traits<Transpose<MatrixType> > : traits<MatrixType>
+struct traits<Transpose<MatrixType> > : public traits<MatrixType>
 {
-  typedef typename MatrixType::Scalar Scalar;
-  typedef typename nested<MatrixType>::type MatrixTypeNested;
+  typedef typename ref_selector<MatrixType>::type MatrixTypeNested;
   typedef typename remove_reference<MatrixTypeNested>::type MatrixTypeNestedPlain;
-  typedef typename traits<MatrixType>::StorageKind StorageKind;
-  typedef typename traits<MatrixType>::XprKind XprKind;
   enum {
     RowsAtCompileTime = MatrixType::ColsAtCompileTime,
     ColsAtCompileTime = MatrixType::RowsAtCompileTime,
     MaxRowsAtCompileTime = MatrixType::MaxColsAtCompileTime,
     MaxColsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
     FlagsLvalueBit = is_lvalue<MatrixType>::value ? LvalueBit : 0,
-    Flags0 = MatrixTypeNestedPlain::Flags & ~(LvalueBit | NestByRefBit),
+    Flags0 = traits<MatrixTypeNestedPlain>::Flags & ~(LvalueBit | NestByRefBit),
     Flags1 = Flags0 | FlagsLvalueBit,
     Flags = Flags1 ^ RowMajorBit,
-    CoeffReadCost = MatrixTypeNestedPlain::CoeffReadCost,
     InnerStrideAtCompileTime = inner_stride_at_compile_time<MatrixType>::ret,
     OuterStrideAtCompileTime = outer_stride_at_compile_time<MatrixType>::ret
   };
@@ -54,31 +36,55 @@ struct traits<Transpose<MatrixType> > : traits<MatrixType>
 
 template<typename MatrixType, typename StorageKind> class TransposeImpl;
 
+/** \class Transpose
+  * \ingroup Core_Module
+  *
+  * \brief Expression of the transpose of a matrix
+  *
+  * \tparam MatrixType the type of the object of which we are taking the transpose
+  *
+  * This class represents an expression of the transpose of a matrix.
+  * It is the return type of MatrixBase::transpose() and MatrixBase::adjoint()
+  * and most of the time this is the only way it is used.
+  *
+  * \sa MatrixBase::transpose(), MatrixBase::adjoint()
+  */
 template<typename MatrixType> class Transpose
   : public TransposeImpl<MatrixType,typename internal::traits<MatrixType>::StorageKind>
 {
   public:
 
+    typedef typename internal::ref_selector<MatrixType>::non_const_type MatrixTypeNested;
+
     typedef typename TransposeImpl<MatrixType,typename internal::traits<MatrixType>::StorageKind>::Base Base;
     EIGEN_GENERIC_PUBLIC_INTERFACE(Transpose)
+    typedef typename internal::remove_all<MatrixType>::type NestedExpression;
 
-    inline Transpose(MatrixType& a_matrix) : m_matrix(a_matrix) {}
+    EIGEN_DEVICE_FUNC
+    explicit inline Transpose(MatrixType& matrix) : m_matrix(matrix) {}
 
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Transpose)
 
-    inline Index rows() const { return m_matrix.cols(); }
-    inline Index cols() const { return m_matrix.rows(); }
+    EIGEN_DEVICE_FUNC inline Index rows() const { return m_matrix.cols(); }
+    EIGEN_DEVICE_FUNC inline Index cols() const { return m_matrix.rows(); }
 
     /** \returns the nested expression */
-    const typename internal::remove_all<typename MatrixType::Nested>::type&
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<MatrixTypeNested>::type&
     nestedExpression() const { return m_matrix; }
 
     /** \returns the nested expression */
-    typename internal::remove_all<typename MatrixType::Nested>::type&
-    nestedExpression() { return m_matrix.const_cast_derived(); }
+    EIGEN_DEVICE_FUNC
+    typename internal::remove_reference<MatrixTypeNested>::type&
+    nestedExpression() { return m_matrix; }
+
+    /** \internal */
+    void resize(Index nrows, Index ncols) {
+      m_matrix.resize(ncols,nrows);
+    }
 
   protected:
-    typename MatrixType::Nested m_matrix;
+    typename internal::ref_selector<MatrixType>::non_const_type m_matrix;
 };
 
 namespace internal {
@@ -97,17 +103,27 @@ struct TransposeImpl_base<MatrixType, false>
 
 } // end namespace internal
 
+// Generic API dispatcher
+template<typename XprType, typename StorageKind>
+class TransposeImpl
+  : public internal::generic_xpr_base<Transpose<XprType> >::type
+{
+public:
+  typedef typename internal::generic_xpr_base<Transpose<XprType> >::type Base;
+};
+
 template<typename MatrixType> class TransposeImpl<MatrixType,Dense>
   : public internal::TransposeImpl_base<MatrixType>::type
 {
   public:
 
     typedef typename internal::TransposeImpl_base<MatrixType>::type Base;
+    using Base::coeffRef;
     EIGEN_DENSE_PUBLIC_INTERFACE(Transpose<MatrixType>)
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(TransposeImpl)
 
-    inline Index innerStride() const { return derived().nestedExpression().innerStride(); }
-    inline Index outerStride() const { return derived().nestedExpression().outerStride(); }
+    EIGEN_DEVICE_FUNC inline Index innerStride() const { return derived().nestedExpression().innerStride(); }
+    EIGEN_DEVICE_FUNC inline Index outerStride() const { return derived().nestedExpression().outerStride(); }
 
     typedef typename internal::conditional<
                        internal::is_lvalue<MatrixType>::value,
@@ -115,64 +131,21 @@ template<typename MatrixType> class TransposeImpl<MatrixType,Dense>
                        const Scalar
                      >::type ScalarWithConstIfNotLvalue;
 
-    inline ScalarWithConstIfNotLvalue* data() { return derived().nestedExpression().data(); }
-    inline const Scalar* data() const { return derived().nestedExpression().data(); }
-
-    inline ScalarWithConstIfNotLvalue& coeffRef(Index rowId, Index colId)
-    {
-      EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
-      return derived().nestedExpression().const_cast_derived().coeffRef(colId, rowId);
-    }
-
-    inline ScalarWithConstIfNotLvalue& coeffRef(Index index)
-    {
-      EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
-      return derived().nestedExpression().const_cast_derived().coeffRef(index);
-    }
+    EIGEN_DEVICE_FUNC inline ScalarWithConstIfNotLvalue* data() { return derived().nestedExpression().data(); }
+    EIGEN_DEVICE_FUNC inline const Scalar* data() const { return derived().nestedExpression().data(); }
 
+    // FIXME: shall we keep the const version of coeffRef?
+    EIGEN_DEVICE_FUNC
     inline const Scalar& coeffRef(Index rowId, Index colId) const
     {
       return derived().nestedExpression().coeffRef(colId, rowId);
     }
 
+    EIGEN_DEVICE_FUNC
     inline const Scalar& coeffRef(Index index) const
     {
       return derived().nestedExpression().coeffRef(index);
     }
-
-    inline CoeffReturnType coeff(Index rowId, Index colId) const
-    {
-      return derived().nestedExpression().coeff(colId, rowId);
-    }
-
-    inline CoeffReturnType coeff(Index index) const
-    {
-      return derived().nestedExpression().coeff(index);
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index rowId, Index colId) const
-    {
-      return derived().nestedExpression().template packet<LoadMode>(colId, rowId);
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index rowId, Index colId, const PacketScalar& x)
-    {
-      derived().nestedExpression().const_cast_derived().template writePacket<LoadMode>(colId, rowId, x);
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index index) const
-    {
-      return derived().nestedExpression().template packet<LoadMode>(index);
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index index, const PacketScalar& x)
-    {
-      derived().nestedExpression().const_cast_derived().template writePacket<LoadMode>(index, x);
-    }
 };
 
 /** \returns an expression of the transpose of *this.
@@ -198,7 +171,7 @@ template<typename Derived>
 inline Transpose<Derived>
 DenseBase<Derived>::transpose()
 {
-  return derived();
+  return TransposeReturnType(derived());
 }
 
 /** This is the const version of transpose().
@@ -236,8 +209,7 @@ template<typename Derived>
 inline const typename MatrixBase<Derived>::AdjointReturnType
 MatrixBase<Derived>::adjoint() const
 {
-  return this->transpose(); // in the complex case, the .conjugate() is be implicit here
-                            // due to implicit conversion to return type
+  return AdjointReturnType(this->transpose());
 }
 
 /***************************************************************************
@@ -247,18 +219,38 @@ MatrixBase<Derived>::adjoint() const
 namespace internal {
 
 template<typename MatrixType,
-  bool IsSquare = (MatrixType::RowsAtCompileTime == MatrixType::ColsAtCompileTime) && MatrixType::RowsAtCompileTime!=Dynamic>
+  bool IsSquare = (MatrixType::RowsAtCompileTime == MatrixType::ColsAtCompileTime) && MatrixType::RowsAtCompileTime!=Dynamic,
+  bool MatchPacketSize =
+        (int(MatrixType::RowsAtCompileTime) == int(internal::packet_traits<typename MatrixType::Scalar>::size))
+    &&  (internal::evaluator<MatrixType>::Flags&PacketAccessBit) >
 struct inplace_transpose_selector;
 
 template<typename MatrixType>
-struct inplace_transpose_selector<MatrixType,true> { // square matrix
+struct inplace_transpose_selector<MatrixType,true,false> { // square matrix
   static void run(MatrixType& m) {
     m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose());
   }
 };
 
+// TODO: vectorized path is currently limited to LargestPacketSize x LargestPacketSize cases only.
 template<typename MatrixType>
-struct inplace_transpose_selector<MatrixType,false> { // non square matrix
+struct inplace_transpose_selector<MatrixType,true,true> { // PacketSize x PacketSize
+  static void run(MatrixType& m) {
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename internal::packet_traits<typename MatrixType::Scalar>::type Packet;
+    const Index PacketSize = internal::packet_traits<Scalar>::size;
+    const Index Alignment = internal::evaluator<MatrixType>::Alignment;
+    PacketBlock<Packet> A;
+    for (Index i=0; i<PacketSize; ++i)
+      A.packet[i] = m.template packetByOuterInner<Alignment>(i,0);
+    internal::ptranspose(A);
+    for (Index i=0; i<PacketSize; ++i)
+      m.template writePacket<Alignment>(m.rowIndexByOuterInner(i,0), m.colIndexByOuterInner(i,0), A.packet[i]);
+  }
+};
+
+template<typename MatrixType,bool MatchPacketSize>
+struct inplace_transpose_selector<MatrixType,false,MatchPacketSize> { // non square matrix
   static void run(MatrixType& m) {
     if (m.rows()==m.cols())
       m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose());
@@ -331,14 +323,6 @@ inline void MatrixBase<Derived>::adjointInPlace()
 
 namespace internal {
 
-template<typename BinOp,typename NestedXpr,typename Rhs>
-struct blas_traits<SelfCwiseBinaryOp<BinOp,NestedXpr,Rhs> >
- : blas_traits<NestedXpr>
-{
-  typedef SelfCwiseBinaryOp<BinOp,NestedXpr,Rhs> XprType;
-  static inline const XprType extract(const XprType& x) { return x; }
-};
-
 template<bool DestIsTransposed, typename OtherDerived>
 struct check_transpose_aliasing_compile_time_selector
 {
@@ -404,15 +388,15 @@ struct checkTransposeAliasing_impl<Derived, OtherDerived, false>
     }
 };
 
-} // end namespace internal
-
-template<typename Derived>
-template<typename OtherDerived>
-void DenseBase<Derived>::checkTransposeAliasing(const OtherDerived& other) const
+template<typename Dst, typename Src>
+void check_for_aliasing(const Dst &dst, const Src &src)
 {
-    internal::checkTransposeAliasing_impl<Derived, OtherDerived>::run(derived(), other);
+  internal::checkTransposeAliasing_impl<Dst, Src>::run(dst, src);
 }
-#endif
+
+} // end namespace internal
+
+#endif // EIGEN_NO_DEBUG
 
 } // end namespace Eigen
 
diff --git a/Eigen/src/Core/Transpositions.h b/Eigen/src/Core/Transpositions.h
index e4ba0756f..19c17bb4a 100644
--- a/Eigen/src/Core/Transpositions.h
+++ b/Eigen/src/Core/Transpositions.h
@@ -12,39 +12,6 @@
 
 namespace Eigen { 
 
-/** \class Transpositions
-  * \ingroup Core_Module
-  *
-  * \brief Represents a sequence of transpositions (row/column interchange)
-  *
-  * \param SizeAtCompileTime the number of transpositions, or Dynamic
-  * \param MaxSizeAtCompileTime the maximum number of transpositions, or Dynamic. This optional parameter defaults to SizeAtCompileTime. Most of the time, you should not have to specify it.
-  *
-  * This class represents a permutation transformation as a sequence of \em n transpositions
-  * \f$[T_{n-1} \ldots T_{i} \ldots T_{0}]\f$. It is internally stored as a vector of integers \c indices.
-  * Each transposition \f$ T_{i} \f$ applied on the left of a matrix (\f$ T_{i} M\f$) interchanges
-  * the rows \c i and \c indices[i] of the matrix \c M.
-  * A transposition applied on the right (e.g., \f$ M T_{i}\f$) yields a column interchange.
-  *
-  * Compared to the class PermutationMatrix, such a sequence of transpositions is what is
-  * computed during a decomposition with pivoting, and it is faster when applying the permutation in-place.
-  * 
-  * To apply a sequence of transpositions to a matrix, simply use the operator * as in the following example:
-  * \code
-  * Transpositions tr;
-  * MatrixXf mat;
-  * mat = tr * mat;
-  * \endcode
-  * In this example, we detect that the matrix appears on both side, and so the transpositions
-  * are applied in-place without any temporary or extra copy.
-  *
-  * \sa class PermutationMatrix
-  */
-
-namespace internal {
-template<typename TranspositionType, typename MatrixType, int Side, bool Transposed=false> struct transposition_matrix_product_retval;
-}
-
 template<typename Derived>
 class TranspositionsBase
 {
@@ -53,7 +20,8 @@ class TranspositionsBase
   public:
 
     typedef typename Traits::IndicesType IndicesType;
-    typedef typename IndicesType::Scalar Index;
+    typedef typename IndicesType::Scalar StorageIndex;
+    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
 
     Derived& derived() { return *static_cast<Derived*>(this); }
     const Derived& derived() const { return *static_cast<const Derived*>(this); }
@@ -65,7 +33,7 @@ class TranspositionsBase
       indices() = other.indices();
       return derived();
     }
-
+    
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     /** This is a special case of the templated operator=. Its purpose is to
       * prevent a default operator= from hiding the templated operator=.
@@ -78,20 +46,24 @@ class TranspositionsBase
     #endif
 
     /** \returns the number of transpositions */
-    inline Index size() const { return indices().size(); }
+    Index size() const { return indices().size(); }
+    /** \returns the number of rows of the equivalent permutation matrix */
+    Index rows() const { return indices().size(); }
+    /** \returns the number of columns of the equivalent permutation matrix */
+    Index cols() const { return indices().size(); }
 
     /** Direct access to the underlying index vector */
-    inline const Index& coeff(Index i) const { return indices().coeff(i); }
+    inline const StorageIndex& coeff(Index i) const { return indices().coeff(i); }
     /** Direct access to the underlying index vector */
-    inline Index& coeffRef(Index i) { return indices().coeffRef(i); }
+    inline StorageIndex& coeffRef(Index i) { return indices().coeffRef(i); }
     /** Direct access to the underlying index vector */
-    inline const Index& operator()(Index i) const { return indices()(i); }
+    inline const StorageIndex& operator()(Index i) const { return indices()(i); }
     /** Direct access to the underlying index vector */
-    inline Index& operator()(Index i) { return indices()(i); }
+    inline StorageIndex& operator()(Index i) { return indices()(i); }
     /** Direct access to the underlying index vector */
-    inline const Index& operator[](Index i) const { return indices()(i); }
+    inline const StorageIndex& operator[](Index i) const { return indices()(i); }
     /** Direct access to the underlying index vector */
-    inline Index& operator[](Index i) { return indices()(i); }
+    inline StorageIndex& operator[](Index i) { return indices()(i); }
 
     /** const version of indices(). */
     const IndicesType& indices() const { return derived().indices(); }
@@ -99,7 +71,7 @@ class TranspositionsBase
     IndicesType& indices() { return derived().indices(); }
 
     /** Resizes to given size. */
-    inline void resize(int newSize)
+    inline void resize(Index newSize)
     {
       indices().resize(newSize);
     }
@@ -107,7 +79,7 @@ class TranspositionsBase
     /** Sets \c *this to represents an identity transformation */
     void setIdentity()
     {
-      for(int i = 0; i < indices().size(); ++i)
+      for(StorageIndex i = 0; i < indices().size(); ++i)
         coeffRef(i) = i;
     }
 
@@ -144,23 +116,53 @@ class TranspositionsBase
 };
 
 namespace internal {
-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType>
-struct traits<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,IndexType> >
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex>
+struct traits<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndex> >
+ : traits<PermutationMatrix<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndex> >
 {
-  typedef IndexType Index;
-  typedef Matrix<Index, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1> IndicesType;
+  typedef Matrix<_StorageIndex, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1> IndicesType;
+  typedef TranspositionsStorage StorageKind;
 };
 }
 
-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType>
-class Transpositions : public TranspositionsBase<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,IndexType> >
+/** \class Transpositions
+  * \ingroup Core_Module
+  *
+  * \brief Represents a sequence of transpositions (row/column interchange)
+  *
+  * \tparam SizeAtCompileTime the number of transpositions, or Dynamic
+  * \tparam MaxSizeAtCompileTime the maximum number of transpositions, or Dynamic. This optional parameter defaults to SizeAtCompileTime. Most of the time, you should not have to specify it.
+  *
+  * This class represents a permutation transformation as a sequence of \em n transpositions
+  * \f$[T_{n-1} \ldots T_{i} \ldots T_{0}]\f$. It is internally stored as a vector of integers \c indices.
+  * Each transposition \f$ T_{i} \f$ applied on the left of a matrix (\f$ T_{i} M\f$) interchanges
+  * the rows \c i and \c indices[i] of the matrix \c M.
+  * A transposition applied on the right (e.g., \f$ M T_{i}\f$) yields a column interchange.
+  *
+  * Compared to the class PermutationMatrix, such a sequence of transpositions is what is
+  * computed during a decomposition with pivoting, and it is faster when applying the permutation in-place.
+  *
+  * To apply a sequence of transpositions to a matrix, simply use the operator * as in the following example:
+  * \code
+  * Transpositions tr;
+  * MatrixXf mat;
+  * mat = tr * mat;
+  * \endcode
+  * In this example, we detect that the matrix appears on both side, and so the transpositions
+  * are applied in-place without any temporary or extra copy.
+  *
+  * \sa class PermutationMatrix
+  */
+
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex>
+class Transpositions : public TranspositionsBase<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndex> >
 {
     typedef internal::traits<Transpositions> Traits;
   public:
 
     typedef TranspositionsBase<Transpositions> Base;
     typedef typename Traits::IndicesType IndicesType;
-    typedef typename IndicesType::Scalar Index;
+    typedef typename IndicesType::Scalar StorageIndex;
 
     inline Transpositions() {}
 
@@ -177,7 +179,7 @@ class Transpositions : public TranspositionsBase<Transpositions<SizeAtCompileTim
 
     /** Generic constructor from expression of the transposition indices. */
     template<typename Other>
-    explicit inline Transpositions(const MatrixBase<Other>& a_indices) : m_indices(a_indices)
+    explicit inline Transpositions(const MatrixBase<Other>& indices) : m_indices(indices)
     {}
 
     /** Copies the \a other transpositions into \c *this */
@@ -215,30 +217,32 @@ class Transpositions : public TranspositionsBase<Transpositions<SizeAtCompileTim
 
 
 namespace internal {
-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType, int _PacketAccess>
-struct traits<Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,IndexType>,_PacketAccess> >
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex, int _PacketAccess>
+struct traits<Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndex>,_PacketAccess> >
+ : traits<PermutationMatrix<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndex> >
 {
-  typedef IndexType Index;
-  typedef Map<const Matrix<Index,SizeAtCompileTime,1,0,MaxSizeAtCompileTime,1>, _PacketAccess> IndicesType;
+  typedef Map<const Matrix<_StorageIndex,SizeAtCompileTime,1,0,MaxSizeAtCompileTime,1>, _PacketAccess> IndicesType;
+  typedef _StorageIndex StorageIndex;
+  typedef TranspositionsStorage StorageKind;
 };
 }
 
-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType, int PacketAccess>
-class Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,IndexType>,PacketAccess>
- : public TranspositionsBase<Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,IndexType>,PacketAccess> >
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex, int PacketAccess>
+class Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndex>,PacketAccess>
+ : public TranspositionsBase<Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndex>,PacketAccess> >
 {
     typedef internal::traits<Map> Traits;
   public:
 
     typedef TranspositionsBase<Map> Base;
     typedef typename Traits::IndicesType IndicesType;
-    typedef typename IndicesType::Scalar Index;
+    typedef typename IndicesType::Scalar StorageIndex;
 
-    inline Map(const Index* indicesPtr)
+    explicit inline Map(const StorageIndex* indicesPtr)
       : m_indices(indicesPtr)
     {}
 
-    inline Map(const Index* indicesPtr, Index size)
+    inline Map(const StorageIndex* indicesPtr, Index size)
       : m_indices(indicesPtr,size)
     {}
 
@@ -274,9 +278,9 @@ class Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,IndexType>,Packe
 namespace internal {
 template<typename _IndicesType>
 struct traits<TranspositionsWrapper<_IndicesType> >
+ : traits<PermutationWrapper<_IndicesType> >
 {
-  typedef typename _IndicesType::Scalar Index;
-  typedef _IndicesType IndicesType;
+  typedef TranspositionsStorage StorageKind;
 };
 }
 
@@ -289,10 +293,10 @@ class TranspositionsWrapper
 
     typedef TranspositionsBase<TranspositionsWrapper> Base;
     typedef typename Traits::IndicesType IndicesType;
-    typedef typename IndicesType::Scalar Index;
+    typedef typename IndicesType::Scalar StorageIndex;
 
-    inline TranspositionsWrapper(IndicesType& a_indices)
-      : m_indices(a_indices)
+    explicit inline TranspositionsWrapper(IndicesType& indices)
+      : m_indices(indices)
     {}
 
     /** Copies the \a other transpositions into \c *this */
@@ -321,83 +325,46 @@ class TranspositionsWrapper
 
   protected:
 
-    const typename IndicesType::Nested m_indices;
+    typename IndicesType::Nested m_indices;
 };
 
+
+
 /** \returns the \a matrix with the \a transpositions applied to the columns.
   */
-template<typename Derived, typename TranspositionsDerived>
-inline const internal::transposition_matrix_product_retval<TranspositionsDerived, Derived, OnTheRight>
-operator*(const MatrixBase<Derived>& matrix,
-          const TranspositionsBase<TranspositionsDerived> &transpositions)
+template<typename MatrixDerived, typename TranspositionsDerived>
+EIGEN_DEVICE_FUNC
+const Product<MatrixDerived, TranspositionsDerived, AliasFreeProduct>
+operator*(const MatrixBase<MatrixDerived> &matrix,
+          const TranspositionsBase<TranspositionsDerived>& transpositions)
 {
-  return internal::transposition_matrix_product_retval
-           <TranspositionsDerived, Derived, OnTheRight>
-           (transpositions.derived(), matrix.derived());
+  return Product<MatrixDerived, TranspositionsDerived, AliasFreeProduct>
+            (matrix.derived(), transpositions.derived());
 }
 
 /** \returns the \a matrix with the \a transpositions applied to the rows.
   */
-template<typename Derived, typename TranspositionDerived>
-inline const internal::transposition_matrix_product_retval
-               <TranspositionDerived, Derived, OnTheLeft>
-operator*(const TranspositionsBase<TranspositionDerived> &transpositions,
-          const MatrixBase<Derived>& matrix)
+template<typename TranspositionsDerived, typename MatrixDerived>
+EIGEN_DEVICE_FUNC
+const Product<TranspositionsDerived, MatrixDerived, AliasFreeProduct>
+operator*(const TranspositionsBase<TranspositionsDerived> &transpositions,
+          const MatrixBase<MatrixDerived>& matrix)
 {
-  return internal::transposition_matrix_product_retval
-           <TranspositionDerived, Derived, OnTheLeft>
-           (transpositions.derived(), matrix.derived());
+  return Product<TranspositionsDerived, MatrixDerived, AliasFreeProduct>
+            (transpositions.derived(), matrix.derived());
 }
 
-namespace internal {
-
-template<typename TranspositionType, typename MatrixType, int Side, bool Transposed>
-struct traits<transposition_matrix_product_retval<TranspositionType, MatrixType, Side, Transposed> >
-{
-  typedef typename MatrixType::PlainObject ReturnType;
-};
-
-template<typename TranspositionType, typename MatrixType, int Side, bool Transposed>
-struct transposition_matrix_product_retval
- : public ReturnByValue<transposition_matrix_product_retval<TranspositionType, MatrixType, Side, Transposed> >
-{
-    typedef typename remove_all<typename MatrixType::Nested>::type MatrixTypeNestedCleaned;
-    typedef typename TranspositionType::Index Index;
-
-    transposition_matrix_product_retval(const TranspositionType& tr, const MatrixType& matrix)
-      : m_transpositions(tr), m_matrix(matrix)
-    {}
+// Template partial specialization for transposed/inverse transpositions
 
-    inline int rows() const { return m_matrix.rows(); }
-    inline int cols() const { return m_matrix.cols(); }
-
-    template<typename Dest> inline void evalTo(Dest& dst) const
-    {
-      const int size = m_transpositions.size();
-      Index j = 0;
-
-      if(!(is_same<MatrixTypeNestedCleaned,Dest>::value && extract_data(dst) == extract_data(m_matrix)))
-        dst = m_matrix;
-
-      for(int k=(Transposed?size-1:0) ; Transposed?k>=0:k<size ; Transposed?--k:++k)
-        if((j=m_transpositions.coeff(k))!=k)
-        {
-          if(Side==OnTheLeft)
-            dst.row(k).swap(dst.row(j));
-          else if(Side==OnTheRight)
-            dst.col(k).swap(dst.col(j));
-        }
-    }
+namespace internal {
 
-  protected:
-    const TranspositionType& m_transpositions;
-    typename MatrixType::Nested m_matrix;
-};
+template<typename Derived>
+struct traits<Transpose<TranspositionsBase<Derived> > >
+ : traits<Derived>
+{};
 
 } // end namespace internal
 
-/* Template partial specialization for transposed/inverse transpositions */
-
 template<typename TranspositionsDerived>
 class Transpose<TranspositionsBase<TranspositionsDerived> >
 {
@@ -405,27 +372,31 @@ class Transpose<TranspositionsBase<TranspositionsDerived> >
     typedef typename TranspositionType::IndicesType IndicesType;
   public:
 
-    Transpose(const TranspositionType& t) : m_transpositions(t) {}
+    explicit Transpose(const TranspositionType& t) : m_transpositions(t) {}
 
-    inline int size() const { return m_transpositions.size(); }
+    Index size() const { return m_transpositions.size(); }
+    Index rows() const { return m_transpositions.size(); }
+    Index cols() const { return m_transpositions.size(); }
 
     /** \returns the \a matrix with the inverse transpositions applied to the columns.
       */
-    template<typename Derived> friend
-    inline const internal::transposition_matrix_product_retval<TranspositionType, Derived, OnTheRight, true>
-    operator*(const MatrixBase<Derived>& matrix, const Transpose& trt)
+    template<typename OtherDerived> friend
+    const Product<OtherDerived, Transpose, AliasFreeProduct>
+    operator*(const MatrixBase<OtherDerived>& matrix, const Transpose& trt)
     {
-      return internal::transposition_matrix_product_retval<TranspositionType, Derived, OnTheRight, true>(trt.m_transpositions, matrix.derived());
+      return Product<OtherDerived, Transpose, AliasFreeProduct>(matrix.derived(), trt.derived());
     }
 
     /** \returns the \a matrix with the inverse transpositions applied to the rows.
       */
-    template<typename Derived>
-    inline const internal::transposition_matrix_product_retval<TranspositionType, Derived, OnTheLeft, true>
-    operator*(const MatrixBase<Derived>& matrix) const
+    template<typename OtherDerived>
+    const Product<Transpose, OtherDerived, AliasFreeProduct>
+    operator*(const MatrixBase<OtherDerived>& matrix) const
     {
-      return internal::transposition_matrix_product_retval<TranspositionType, Derived, OnTheLeft, true>(m_transpositions, matrix.derived());
+      return Product<Transpose, OtherDerived, AliasFreeProduct>(*this, matrix.derived());
     }
+    
+    const TranspositionType& nestedExpression() const { return m_transpositions; }
 
   protected:
     const TranspositionType& m_transpositions;
diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h
index 4d65392c6..667ef09dc 100644
--- a/Eigen/src/Core/TriangularMatrix.h
+++ b/Eigen/src/Core/TriangularMatrix.h
@@ -19,9 +19,7 @@ template<int Side, typename TriangularType, typename Rhs> struct triangular_solv
   
 }
 
-/** \internal
-  *
-  * \class TriangularBase
+/** \class TriangularBase
   * \ingroup Core_Module
   *
   * \brief Base class for triangular part in a matrix
@@ -32,41 +30,69 @@ template<typename Derived> class TriangularBase : public EigenBase<Derived>
 
     enum {
       Mode = internal::traits<Derived>::Mode,
-      CoeffReadCost = internal::traits<Derived>::CoeffReadCost,
       RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
       ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
       MaxRowsAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime,
-      MaxColsAtCompileTime = internal::traits<Derived>::MaxColsAtCompileTime
+      MaxColsAtCompileTime = internal::traits<Derived>::MaxColsAtCompileTime,
+      
+      SizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::RowsAtCompileTime,
+                                                   internal::traits<Derived>::ColsAtCompileTime>::ret),
+      /**< This is equal to the number of coefficients, i.e. the number of
+          * rows times the number of columns, or to \a Dynamic if this is not
+          * known at compile-time. \sa RowsAtCompileTime, ColsAtCompileTime */
+      
+      MaxSizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::MaxRowsAtCompileTime,
+                                                   internal::traits<Derived>::MaxColsAtCompileTime>::ret)
+        
     };
     typedef typename internal::traits<Derived>::Scalar Scalar;
     typedef typename internal::traits<Derived>::StorageKind StorageKind;
-    typedef typename internal::traits<Derived>::Index Index;
-    typedef typename internal::traits<Derived>::DenseMatrixType DenseMatrixType;
+    typedef typename internal::traits<Derived>::StorageIndex StorageIndex;
+    typedef typename internal::traits<Derived>::FullMatrixType DenseMatrixType;
     typedef DenseMatrixType DenseType;
+    typedef Derived const& Nested;
 
+    EIGEN_DEVICE_FUNC
     inline TriangularBase() { eigen_assert(!((Mode&UnitDiag) && (Mode&ZeroDiag))); }
 
+    EIGEN_DEVICE_FUNC
     inline Index rows() const { return derived().rows(); }
+    EIGEN_DEVICE_FUNC
     inline Index cols() const { return derived().cols(); }
+    EIGEN_DEVICE_FUNC
     inline Index outerStride() const { return derived().outerStride(); }
+    EIGEN_DEVICE_FUNC
     inline Index innerStride() const { return derived().innerStride(); }
+    
+    // dummy resize function
+    void resize(Index rows, Index cols)
+    {
+      EIGEN_UNUSED_VARIABLE(rows);
+      EIGEN_UNUSED_VARIABLE(cols);
+      eigen_assert(rows==this->rows() && cols==this->cols());
+    }
 
+    EIGEN_DEVICE_FUNC
     inline Scalar coeff(Index row, Index col) const  { return derived().coeff(row,col); }
+    EIGEN_DEVICE_FUNC
     inline Scalar& coeffRef(Index row, Index col) { return derived().coeffRef(row,col); }
 
     /** \see MatrixBase::copyCoeff(row,col)
       */
     template<typename Other>
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE void copyCoeff(Index row, Index col, Other& other)
     {
       derived().coeffRef(row, col) = other.coeff(row, col);
     }
 
+    EIGEN_DEVICE_FUNC
     inline Scalar operator()(Index row, Index col) const
     {
       check_coordinates(row, col);
       return coeff(row,col);
     }
+    EIGEN_DEVICE_FUNC
     inline Scalar& operator()(Index row, Index col)
     {
       check_coordinates(row, col);
@@ -74,15 +100,20 @@ template<typename Derived> class TriangularBase : public EigenBase<Derived>
     }
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
+    EIGEN_DEVICE_FUNC
     inline const Derived& derived() const { return *static_cast<const Derived*>(this); }
+    EIGEN_DEVICE_FUNC
     inline Derived& derived() { return *static_cast<Derived*>(this); }
     #endif // not EIGEN_PARSED_BY_DOXYGEN
 
     template<typename DenseDerived>
+    EIGEN_DEVICE_FUNC
     void evalTo(MatrixBase<DenseDerived> &other) const;
     template<typename DenseDerived>
+    EIGEN_DEVICE_FUNC
     void evalToLazy(MatrixBase<DenseDerived> &other) const;
 
+    EIGEN_DEVICE_FUNC
     DenseMatrixType toDenseMatrix() const
     {
       DenseMatrixType res(rows(), cols());
@@ -119,17 +150,17 @@ template<typename Derived> class TriangularBase : public EigenBase<Derived>
 /** \class TriangularView
   * \ingroup Core_Module
   *
-  * \brief Base class for triangular part in a matrix
+  * \brief Expression of a triangular part in a matrix
   *
   * \param MatrixType the type of the object in which we are taking the triangular part
   * \param Mode the kind of triangular matrix expression to construct. Can be #Upper,
   *             #Lower, #UnitUpper, #UnitLower, #StrictlyUpper, or #StrictlyLower.
   *             This is in fact a bit field; it must have either #Upper or #Lower, 
-  *             and additionnaly it may have #UnitDiag or #ZeroDiag or neither.
+  *             and additionally it may have #UnitDiag or #ZeroDiag or neither.
   *
   * This class represents a triangular part of a matrix, not necessarily square. Strictly speaking, for rectangular
   * matrices one should speak of "trapezoid" parts. This class is the return type
-  * of MatrixBase::triangularView() and most of the time this is the only way it is used.
+  * of MatrixBase::triangularView() and SparseMatrixBase::triangularView(), and most of the time this is the only way it is used.
   *
   * \sa MatrixBase::triangularView()
   */
@@ -137,499 +168,405 @@ namespace internal {
 template<typename MatrixType, unsigned int _Mode>
 struct traits<TriangularView<MatrixType, _Mode> > : traits<MatrixType>
 {
-  typedef typename nested<MatrixType>::type MatrixTypeNested;
+  typedef typename ref_selector<MatrixType>::non_const_type MatrixTypeNested;
   typedef typename remove_reference<MatrixTypeNested>::type MatrixTypeNestedNonRef;
   typedef typename remove_all<MatrixTypeNested>::type MatrixTypeNestedCleaned;
+  typedef typename MatrixType::PlainObject FullMatrixType;
   typedef MatrixType ExpressionType;
-  typedef typename MatrixType::PlainObject DenseMatrixType;
   enum {
     Mode = _Mode,
-    Flags = (MatrixTypeNestedCleaned::Flags & (HereditaryBits) & (~(PacketAccessBit | DirectAccessBit | LinearAccessBit))) | Mode,
-    CoeffReadCost = MatrixTypeNestedCleaned::CoeffReadCost
+    FlagsLvalueBit = is_lvalue<MatrixType>::value ? LvalueBit : 0,
+    Flags = (MatrixTypeNestedCleaned::Flags & (HereditaryBits | FlagsLvalueBit) & (~(PacketAccessBit | DirectAccessBit | LinearAccessBit)))
   };
 };
 }
 
-template<int Mode, bool LhsIsTriangular,
-         typename Lhs, bool LhsIsVector,
-         typename Rhs, bool RhsIsVector>
-struct TriangularProduct;
+template<typename _MatrixType, unsigned int _Mode, typename StorageKind> class TriangularViewImpl;
 
 template<typename _MatrixType, unsigned int _Mode> class TriangularView
-  : public TriangularBase<TriangularView<_MatrixType, _Mode> >
+  : public TriangularViewImpl<_MatrixType, _Mode, typename internal::traits<_MatrixType>::StorageKind >
 {
   public:
 
-    typedef TriangularBase<TriangularView> Base;
+    typedef TriangularViewImpl<_MatrixType, _Mode, typename internal::traits<_MatrixType>::StorageKind > Base;
     typedef typename internal::traits<TriangularView>::Scalar Scalar;
-
     typedef _MatrixType MatrixType;
-    typedef typename internal::traits<TriangularView>::DenseMatrixType DenseMatrixType;
-    typedef DenseMatrixType PlainObject;
 
   protected:
     typedef typename internal::traits<TriangularView>::MatrixTypeNested MatrixTypeNested;
     typedef typename internal::traits<TriangularView>::MatrixTypeNestedNonRef MatrixTypeNestedNonRef;
-    typedef typename internal::traits<TriangularView>::MatrixTypeNestedCleaned MatrixTypeNestedCleaned;
 
     typedef typename internal::remove_all<typename MatrixType::ConjugateReturnType>::type MatrixConjugateReturnType;
     
   public:
-    using Base::evalToLazy;
-  
 
     typedef typename internal::traits<TriangularView>::StorageKind StorageKind;
-    typedef typename internal::traits<TriangularView>::Index Index;
+    typedef typename internal::traits<TriangularView>::MatrixTypeNestedCleaned NestedExpression;
 
     enum {
       Mode = _Mode,
+      Flags = internal::traits<TriangularView>::Flags,
       TransposeMode = (Mode & Upper ? Lower : 0)
                     | (Mode & Lower ? Upper : 0)
                     | (Mode & (UnitDiag))
-                    | (Mode & (ZeroDiag))
+                    | (Mode & (ZeroDiag)),
+      IsVectorAtCompileTime = false
     };
 
-    inline TriangularView(const MatrixType& matrix) : m_matrix(matrix)
+    EIGEN_DEVICE_FUNC
+    explicit inline TriangularView(MatrixType& matrix) : m_matrix(matrix)
     {}
+    
+    using Base::operator=;
+    TriangularView& operator=(const TriangularView &other)
+    { return Base::operator=(other); }
 
+    /** \copydoc EigenBase::rows() */
+    EIGEN_DEVICE_FUNC
     inline Index rows() const { return m_matrix.rows(); }
+    /** \copydoc EigenBase::cols() */
+    EIGEN_DEVICE_FUNC
     inline Index cols() const { return m_matrix.cols(); }
-    inline Index outerStride() const { return m_matrix.outerStride(); }
-    inline Index innerStride() const { return m_matrix.innerStride(); }
+
+    /** \returns a const reference to the nested expression */
+    EIGEN_DEVICE_FUNC
+    const NestedExpression& nestedExpression() const { return m_matrix; }
+
+    /** \returns a reference to the nested expression */
+    EIGEN_DEVICE_FUNC
+    NestedExpression& nestedExpression() { return m_matrix; }
+    
+    typedef TriangularView<const MatrixConjugateReturnType,Mode> ConjugateReturnType;
+    /** \sa MatrixBase::conjugate() const */
+    EIGEN_DEVICE_FUNC
+    inline const ConjugateReturnType conjugate() const
+    { return ConjugateReturnType(m_matrix.conjugate()); }
+
+    typedef TriangularView<const typename MatrixType::AdjointReturnType,TransposeMode> AdjointReturnType;
+    /** \sa MatrixBase::adjoint() const */
+    EIGEN_DEVICE_FUNC
+    inline const AdjointReturnType adjoint() const
+    { return AdjointReturnType(m_matrix.adjoint()); }
+
+    typedef TriangularView<typename MatrixType::TransposeReturnType,TransposeMode> TransposeReturnType;
+     /** \sa MatrixBase::transpose() */
+    EIGEN_DEVICE_FUNC
+    inline TransposeReturnType transpose()
+    {
+      EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
+      typename MatrixType::TransposeReturnType tmp(m_matrix);
+      return TransposeReturnType(tmp);
+    }
+    
+    typedef TriangularView<const typename MatrixType::ConstTransposeReturnType,TransposeMode> ConstTransposeReturnType;
+    /** \sa MatrixBase::transpose() const */
+    EIGEN_DEVICE_FUNC
+    inline const ConstTransposeReturnType transpose() const
+    {
+      return ConstTransposeReturnType(m_matrix.transpose());
+    }
+
+    template<typename Other>
+    EIGEN_DEVICE_FUNC
+    inline const Solve<TriangularView, Other> 
+    solve(const MatrixBase<Other>& other) const
+    { return Solve<TriangularView, Other>(*this, other.derived()); }
+    
+  // workaround MSVC ICE
+  #if EIGEN_COMP_MSVC
+    template<int Side, typename Other>
+    EIGEN_DEVICE_FUNC
+    inline const internal::triangular_solve_retval<Side,TriangularView, Other>
+    solve(const MatrixBase<Other>& other) const
+    { return Base::template solve<Side>(other); }
+  #else
+    using Base::solve;
+  #endif
+
+    /** \returns a selfadjoint view of the referenced triangular part which must be either \c #Upper or \c #Lower.
+      *
+      * This is a shortcut for \code this->nestedExpression().selfadjointView<(*this)::Mode>() \endcode
+      * \sa MatrixBase::selfadjointView() */
+    EIGEN_DEVICE_FUNC
+    SelfAdjointView<MatrixTypeNestedNonRef,Mode> selfadjointView()
+    {
+      EIGEN_STATIC_ASSERT((Mode&(UnitDiag|ZeroDiag))==0,PROGRAMMING_ERROR);
+      return SelfAdjointView<MatrixTypeNestedNonRef,Mode>(m_matrix);
+    }
+
+    /** This is the const version of selfadjointView() */
+    EIGEN_DEVICE_FUNC
+    const SelfAdjointView<MatrixTypeNestedNonRef,Mode> selfadjointView() const
+    {
+      EIGEN_STATIC_ASSERT((Mode&(UnitDiag|ZeroDiag))==0,PROGRAMMING_ERROR);
+      return SelfAdjointView<MatrixTypeNestedNonRef,Mode>(m_matrix);
+    }
+
+
+    /** \returns the determinant of the triangular matrix
+      * \sa MatrixBase::determinant() */
+    EIGEN_DEVICE_FUNC
+    Scalar determinant() const
+    {
+      if (Mode & UnitDiag)
+        return 1;
+      else if (Mode & ZeroDiag)
+        return 0;
+      else
+        return m_matrix.diagonal().prod();
+    }
+      
+  protected:
+
+    MatrixTypeNested m_matrix;
+};
+
+/** \ingroup Core_Module
+  *
+  * \brief Base class for a triangular part in a \b dense matrix
+  *
+  * This class is an abstract base class of class TriangularView, and objects of type TriangularViewImpl cannot be instantiated.
+  * It extends class TriangularView with additional methods which available for dense expressions only.
+  *
+  * \sa class TriangularView, MatrixBase::triangularView()
+  */
+template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_MatrixType,_Mode,Dense>
+  : public TriangularBase<TriangularView<_MatrixType, _Mode> >
+{
+  public:
+
+    typedef TriangularView<_MatrixType, _Mode> TriangularViewType;
+    typedef TriangularBase<TriangularViewType> Base;
+    typedef typename internal::traits<TriangularViewType>::Scalar Scalar;
+
+    typedef _MatrixType MatrixType;
+    typedef typename MatrixType::PlainObject DenseMatrixType;
+    typedef DenseMatrixType PlainObject;
+
+  public:
+    using Base::evalToLazy;
+    using Base::derived;
+
+    typedef typename internal::traits<TriangularViewType>::StorageKind StorageKind;
+
+    enum {
+      Mode = _Mode,
+      Flags = internal::traits<TriangularViewType>::Flags
+    };
+
+    /** \returns the outer-stride of the underlying dense matrix
+      * \sa DenseCoeffsBase::outerStride() */
+    EIGEN_DEVICE_FUNC
+    inline Index outerStride() const { return derived().nestedExpression().outerStride(); }
+    /** \returns the inner-stride of the underlying dense matrix
+      * \sa DenseCoeffsBase::innerStride() */
+    EIGEN_DEVICE_FUNC
+    inline Index innerStride() const { return derived().nestedExpression().innerStride(); }
 
     /** \sa MatrixBase::operator+=() */
-    template<typename Other> TriangularView&  operator+=(const DenseBase<Other>& other) { return *this = m_matrix + other.derived(); }
+    template<typename Other>
+    EIGEN_DEVICE_FUNC
+    TriangularViewType&  operator+=(const DenseBase<Other>& other) {
+      internal::call_assignment_no_alias(derived(), other.derived(), internal::add_assign_op<Scalar,typename Other::Scalar>());
+      return derived();
+    }
     /** \sa MatrixBase::operator-=() */
-    template<typename Other> TriangularView&  operator-=(const DenseBase<Other>& other) { return *this = m_matrix - other.derived(); }
+    template<typename Other>
+    EIGEN_DEVICE_FUNC
+    TriangularViewType&  operator-=(const DenseBase<Other>& other) {
+      internal::call_assignment_no_alias(derived(), other.derived(), internal::sub_assign_op<Scalar,typename Other::Scalar>());
+      return derived();
+    }
+    
     /** \sa MatrixBase::operator*=() */
-    TriangularView&  operator*=(const typename internal::traits<MatrixType>::Scalar& other) { return *this = m_matrix * other; }
-    /** \sa MatrixBase::operator/=() */
-    TriangularView&  operator/=(const typename internal::traits<MatrixType>::Scalar& other) { return *this = m_matrix / other; }
+    EIGEN_DEVICE_FUNC
+    TriangularViewType&  operator*=(const typename internal::traits<MatrixType>::Scalar& other) { return *this = derived().nestedExpression() * other; }
+    /** \sa DenseBase::operator/=() */
+    EIGEN_DEVICE_FUNC
+    TriangularViewType&  operator/=(const typename internal::traits<MatrixType>::Scalar& other) { return *this = derived().nestedExpression() / other; }
 
     /** \sa MatrixBase::fill() */
+    EIGEN_DEVICE_FUNC
     void fill(const Scalar& value) { setConstant(value); }
     /** \sa MatrixBase::setConstant() */
-    TriangularView& setConstant(const Scalar& value)
-    { return *this = MatrixType::Constant(rows(), cols(), value); }
+    EIGEN_DEVICE_FUNC
+    TriangularViewType& setConstant(const Scalar& value)
+    { return *this = MatrixType::Constant(derived().rows(), derived().cols(), value); }
     /** \sa MatrixBase::setZero() */
-    TriangularView& setZero() { return setConstant(Scalar(0)); }
+    EIGEN_DEVICE_FUNC
+    TriangularViewType& setZero() { return setConstant(Scalar(0)); }
     /** \sa MatrixBase::setOnes() */
-    TriangularView& setOnes() { return setConstant(Scalar(1)); }
+    EIGEN_DEVICE_FUNC
+    TriangularViewType& setOnes() { return setConstant(Scalar(1)); }
 
     /** \sa MatrixBase::coeff()
       * \warning the coordinates must fit into the referenced triangular part
       */
+    EIGEN_DEVICE_FUNC
     inline Scalar coeff(Index row, Index col) const
     {
       Base::check_coordinates_internal(row, col);
-      return m_matrix.coeff(row, col);
+      return derived().nestedExpression().coeff(row, col);
     }
 
     /** \sa MatrixBase::coeffRef()
       * \warning the coordinates must fit into the referenced triangular part
       */
+    EIGEN_DEVICE_FUNC
     inline Scalar& coeffRef(Index row, Index col)
     {
+      EIGEN_STATIC_ASSERT_LVALUE(TriangularViewType);
       Base::check_coordinates_internal(row, col);
-      return m_matrix.const_cast_derived().coeffRef(row, col);
+      return derived().nestedExpression().coeffRef(row, col);
     }
 
-    const MatrixTypeNestedCleaned& nestedExpression() const { return m_matrix; }
-    MatrixTypeNestedCleaned& nestedExpression() { return *const_cast<MatrixTypeNestedCleaned*>(&m_matrix); }
-
     /** Assigns a triangular matrix to a triangular part of a dense matrix */
     template<typename OtherDerived>
-    TriangularView& operator=(const TriangularBase<OtherDerived>& other);
+    EIGEN_DEVICE_FUNC
+    TriangularViewType& operator=(const TriangularBase<OtherDerived>& other);
 
+    /** Shortcut for\code *this = other.other.triangularView<(*this)::Mode>() \endcode */
     template<typename OtherDerived>
-    TriangularView& operator=(const MatrixBase<OtherDerived>& other);
+    EIGEN_DEVICE_FUNC
+    TriangularViewType& operator=(const MatrixBase<OtherDerived>& other);
 
-    TriangularView& operator=(const TriangularView& other)
-    { return *this = other.nestedExpression(); }
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    EIGEN_DEVICE_FUNC
+    TriangularViewType& operator=(const TriangularViewImpl& other)
+    { return *this = other.derived().nestedExpression(); }
 
+    /** \deprecated */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     void lazyAssign(const TriangularBase<OtherDerived>& other);
 
+    /** \deprecated */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     void lazyAssign(const MatrixBase<OtherDerived>& other);
-
-    /** \sa MatrixBase::conjugate() */
-    inline TriangularView<MatrixConjugateReturnType,Mode> conjugate()
-    { return m_matrix.conjugate(); }
-    /** \sa MatrixBase::conjugate() const */
-    inline const TriangularView<MatrixConjugateReturnType,Mode> conjugate() const
-    { return m_matrix.conjugate(); }
-
-    /** \sa MatrixBase::adjoint() const */
-    inline const TriangularView<const typename MatrixType::AdjointReturnType,TransposeMode> adjoint() const
-    { return m_matrix.adjoint(); }
-
-    /** \sa MatrixBase::transpose() */
-    inline TriangularView<Transpose<MatrixType>,TransposeMode> transpose()
-    {
-      EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
-      return m_matrix.const_cast_derived().transpose();
-    }
-    /** \sa MatrixBase::transpose() const */
-    inline const TriangularView<Transpose<MatrixType>,TransposeMode> transpose() const
-    {
-      return m_matrix.transpose();
-    }
+#endif
 
     /** Efficient triangular matrix times vector/matrix product */
     template<typename OtherDerived>
-    TriangularProduct<Mode, true, MatrixType, false, OtherDerived, OtherDerived::ColsAtCompileTime==1>
+    EIGEN_DEVICE_FUNC
+    const Product<TriangularViewType,OtherDerived>
     operator*(const MatrixBase<OtherDerived>& rhs) const
     {
-      return TriangularProduct
-              <Mode, true, MatrixType, false, OtherDerived, OtherDerived::ColsAtCompileTime==1>
-              (m_matrix, rhs.derived());
+      return Product<TriangularViewType,OtherDerived>(derived(), rhs.derived());
     }
 
     /** Efficient vector/matrix times triangular matrix product */
     template<typename OtherDerived> friend
-    TriangularProduct<Mode, false, OtherDerived, OtherDerived::RowsAtCompileTime==1, MatrixType, false>
-    operator*(const MatrixBase<OtherDerived>& lhs, const TriangularView& rhs)
-    {
-      return TriangularProduct
-              <Mode, false, OtherDerived, OtherDerived::RowsAtCompileTime==1, MatrixType, false>
-              (lhs.derived(),rhs.m_matrix);
-    }
-
-    #ifdef EIGEN2_SUPPORT
-    template<typename OtherDerived>
-    struct eigen2_product_return_type
-    {
-      typedef typename TriangularView<MatrixType,Mode>::DenseMatrixType DenseMatrixType;
-      typedef typename OtherDerived::PlainObject::DenseType OtherPlainObject;
-      typedef typename ProductReturnType<DenseMatrixType, OtherPlainObject>::Type ProdRetType;
-      typedef typename ProdRetType::PlainObject type;
-    };
-    template<typename OtherDerived>
-    const typename eigen2_product_return_type<OtherDerived>::type
-    operator*(const EigenBase<OtherDerived>& rhs) const
-    {
-      typename OtherDerived::PlainObject::DenseType rhsPlainObject;
-      rhs.evalTo(rhsPlainObject);
-      return this->toDenseMatrix() * rhsPlainObject;
-    }
-    template<typename OtherMatrixType>
-    bool isApprox(const TriangularView<OtherMatrixType, Mode>& other, typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision()) const
-    {
-      return this->toDenseMatrix().isApprox(other.toDenseMatrix(), precision);
-    }
-    template<typename OtherDerived>
-    bool isApprox(const MatrixBase<OtherDerived>& other, typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision()) const
-    {
-      return this->toDenseMatrix().isApprox(other, precision);
-    }
-    #endif // EIGEN2_SUPPORT
-
+    EIGEN_DEVICE_FUNC
+    const Product<OtherDerived,TriangularViewType>
+    operator*(const MatrixBase<OtherDerived>& lhs, const TriangularViewImpl& rhs)
+    {
+      return Product<OtherDerived,TriangularViewType>(lhs.derived(),rhs.derived());
+    }
+
+    /** \returns the product of the inverse of \c *this with \a other, \a *this being triangular.
+      *
+      * This function computes the inverse-matrix matrix product inverse(\c *this) * \a other if
+      * \a Side==OnTheLeft (the default), or the right-inverse-multiply  \a other * inverse(\c *this) if
+      * \a Side==OnTheRight.
+      *
+      * Note that the template parameter \c Side can be ommitted, in which case \c Side==OnTheLeft
+      *
+      * The matrix \c *this must be triangular and invertible (i.e., all the coefficients of the
+      * diagonal must be non zero). It works as a forward (resp. backward) substitution if \c *this
+      * is an upper (resp. lower) triangular matrix.
+      *
+      * Example: \include Triangular_solve.cpp
+      * Output: \verbinclude Triangular_solve.out
+      *
+      * This function returns an expression of the inverse-multiply and can works in-place if it is assigned
+      * to the same matrix or vector \a other.
+      *
+      * For users coming from BLAS, this function (and more specifically solveInPlace()) offer
+      * all the operations supported by the \c *TRSV and \c *TRSM BLAS routines.
+      *
+      * \sa TriangularView::solveInPlace()
+      */
     template<int Side, typename Other>
-    inline const internal::triangular_solve_retval<Side,TriangularView, Other>
+    EIGEN_DEVICE_FUNC
+    inline const internal::triangular_solve_retval<Side,TriangularViewType, Other>
     solve(const MatrixBase<Other>& other) const;
 
+    /** "in-place" version of TriangularView::solve() where the result is written in \a other
+      *
+      * \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here.
+      * This function will const_cast it, so constness isn't honored here.
+      *
+      * Note that the template parameter \c Side can be ommitted, in which case \c Side==OnTheLeft
+      *
+      * See TriangularView:solve() for the details.
+      */
     template<int Side, typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     void solveInPlace(const MatrixBase<OtherDerived>& other) const;
 
-    template<typename Other>
-    inline const internal::triangular_solve_retval<OnTheLeft,TriangularView, Other> 
-    solve(const MatrixBase<Other>& other) const
-    { return solve<OnTheLeft>(other); }
-
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     void solveInPlace(const MatrixBase<OtherDerived>& other) const
     { return solveInPlace<OnTheLeft>(other); }
 
-    const SelfAdjointView<MatrixTypeNestedNonRef,Mode> selfadjointView() const
-    {
-      EIGEN_STATIC_ASSERT((Mode&UnitDiag)==0,PROGRAMMING_ERROR);
-      return SelfAdjointView<MatrixTypeNestedNonRef,Mode>(m_matrix);
-    }
-    SelfAdjointView<MatrixTypeNestedNonRef,Mode> selfadjointView()
-    {
-      EIGEN_STATIC_ASSERT((Mode&UnitDiag)==0,PROGRAMMING_ERROR);
-      return SelfAdjointView<MatrixTypeNestedNonRef,Mode>(m_matrix);
-    }
-
+    /** Swaps the coefficients of the common triangular parts of two matrices */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+    void swap(TriangularBase<OtherDerived> &other)
+#else
     void swap(TriangularBase<OtherDerived> const & other)
+#endif
     {
-      TriangularView<SwapWrapper<MatrixType>,Mode>(const_cast<MatrixType&>(m_matrix)).lazyAssign(other.derived());
+      EIGEN_STATIC_ASSERT_LVALUE(OtherDerived);
+      call_assignment(derived(), other.const_cast_derived(), internal::swap_assign_op<Scalar>());
     }
 
+    /** \deprecated
+      * Shortcut for \code (*this).swap(other.triangularView<(*this)::Mode>()) \endcode */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     void swap(MatrixBase<OtherDerived> const & other)
     {
-      SwapWrapper<MatrixType> swaper(const_cast<MatrixType&>(m_matrix));
-      TriangularView<SwapWrapper<MatrixType>,Mode>(swaper).lazyAssign(other.derived());
+      EIGEN_STATIC_ASSERT_LVALUE(OtherDerived);
+      call_assignment(derived(), other.const_cast_derived(), internal::swap_assign_op<Scalar>());
     }
 
-    Scalar determinant() const
-    {
-      if (Mode & UnitDiag)
-        return 1;
-      else if (Mode & ZeroDiag)
-        return 0;
-      else
-        return m_matrix.diagonal().prod();
-    }
-    
-    // TODO simplify the following:
-    template<typename ProductDerived, typename Lhs, typename Rhs>
-    EIGEN_STRONG_INLINE TriangularView& operator=(const ProductBase<ProductDerived, Lhs,Rhs>& other)
-    {
-      setZero();
-      return assignProduct(other.derived(),1);
-    }
-    
-    template<typename ProductDerived, typename Lhs, typename Rhs>
-    EIGEN_STRONG_INLINE TriangularView& operator+=(const ProductBase<ProductDerived, Lhs,Rhs>& other)
-    {
-      return assignProduct(other.derived(),1);
-    }
-    
-    template<typename ProductDerived, typename Lhs, typename Rhs>
-    EIGEN_STRONG_INLINE TriangularView& operator-=(const ProductBase<ProductDerived, Lhs,Rhs>& other)
-    {
-      return assignProduct(other.derived(),-1);
-    }
-    
-    
-    template<typename ProductDerived>
-    EIGEN_STRONG_INLINE TriangularView& operator=(const ScaledProduct<ProductDerived>& other)
-    {
-      setZero();
-      return assignProduct(other.derived(),other.alpha());
-    }
-    
-    template<typename ProductDerived>
-    EIGEN_STRONG_INLINE TriangularView& operator+=(const ScaledProduct<ProductDerived>& other)
-    {
-      return assignProduct(other.derived(),other.alpha());
-    }
-    
-    template<typename ProductDerived>
-    EIGEN_STRONG_INLINE TriangularView& operator-=(const ScaledProduct<ProductDerived>& other)
-    {
-      return assignProduct(other.derived(),-other.alpha());
-    }
-    
-  protected:
-    
-    template<typename ProductDerived, typename Lhs, typename Rhs>
-    EIGEN_STRONG_INLINE TriangularView& assignProduct(const ProductBase<ProductDerived, Lhs,Rhs>& prod, const Scalar& alpha);
-    
-    template<int Mode, bool LhsIsTriangular,
-         typename Lhs, bool LhsIsVector,
-         typename Rhs, bool RhsIsVector>
-    EIGEN_STRONG_INLINE TriangularView& assignProduct(const TriangularProduct<Mode, LhsIsTriangular, Lhs, LhsIsVector, Rhs, RhsIsVector>& prod, const Scalar& alpha)
-    {
-      lazyAssign(alpha*prod.eval());
-      return *this;
+    template<typename RhsType, typename DstType>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _solve_impl(const RhsType &rhs, DstType &dst) const {
+      if(!internal::is_same_dense(dst,rhs))
+        dst = rhs;
+      this->solveInPlace(dst);
     }
 
-    MatrixTypeNested m_matrix;
+    template<typename ProductType>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE TriangularViewType& _assignProduct(const ProductType& prod, const Scalar& alpha, bool beta);
 };
 
 /***************************************************************************
 * Implementation of triangular evaluation/assignment
 ***************************************************************************/
 
-namespace internal {
-
-template<typename Derived1, typename Derived2, unsigned int Mode, int UnrollCount, bool ClearOpposite>
-struct triangular_assignment_selector
-{
-  enum {
-    col = (UnrollCount-1) / Derived1::RowsAtCompileTime,
-    row = (UnrollCount-1) % Derived1::RowsAtCompileTime
-  };
-  
-  typedef typename Derived1::Scalar Scalar;
-
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    triangular_assignment_selector<Derived1, Derived2, Mode, UnrollCount-1, ClearOpposite>::run(dst, src);
-
-    eigen_assert( Mode == Upper || Mode == Lower
-            || Mode == StrictlyUpper || Mode == StrictlyLower
-            || Mode == UnitUpper || Mode == UnitLower);
-    if((Mode == Upper && row <= col)
-    || (Mode == Lower && row >= col)
-    || (Mode == StrictlyUpper && row < col)
-    || (Mode == StrictlyLower && row > col)
-    || (Mode == UnitUpper && row < col)
-    || (Mode == UnitLower && row > col))
-      dst.copyCoeff(row, col, src);
-    else if(ClearOpposite)
-    {
-      if (Mode&UnitDiag && row==col)
-        dst.coeffRef(row, col) = Scalar(1);
-      else
-        dst.coeffRef(row, col) = Scalar(0);
-    }
-  }
-};
-
-// prevent buggy user code from causing an infinite recursion
-template<typename Derived1, typename Derived2, unsigned int Mode, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, Mode, 0, ClearOpposite>
-{
-  static inline void run(Derived1 &, const Derived2 &) {}
-};
-
-template<typename Derived1, typename Derived2, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, Upper, Dynamic, ClearOpposite>
-{
-  typedef typename Derived1::Index Index;
-  typedef typename Derived1::Scalar Scalar;
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    for(Index j = 0; j < dst.cols(); ++j)
-    {
-      Index maxi = (std::min)(j, dst.rows()-1);
-      for(Index i = 0; i <= maxi; ++i)
-        dst.copyCoeff(i, j, src);
-      if (ClearOpposite)
-        for(Index i = maxi+1; i < dst.rows(); ++i)
-          dst.coeffRef(i, j) = Scalar(0);
-    }
-  }
-};
-
-template<typename Derived1, typename Derived2, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, Lower, Dynamic, ClearOpposite>
-{
-  typedef typename Derived1::Index Index;
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    for(Index j = 0; j < dst.cols(); ++j)
-    {
-      for(Index i = j; i < dst.rows(); ++i)
-        dst.copyCoeff(i, j, src);
-      Index maxi = (std::min)(j, dst.rows());
-      if (ClearOpposite)
-        for(Index i = 0; i < maxi; ++i)
-          dst.coeffRef(i, j) = static_cast<typename Derived1::Scalar>(0);
-    }
-  }
-};
-
-template<typename Derived1, typename Derived2, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, StrictlyUpper, Dynamic, ClearOpposite>
-{
-  typedef typename Derived1::Index Index;
-  typedef typename Derived1::Scalar Scalar;
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    for(Index j = 0; j < dst.cols(); ++j)
-    {
-      Index maxi = (std::min)(j, dst.rows());
-      for(Index i = 0; i < maxi; ++i)
-        dst.copyCoeff(i, j, src);
-      if (ClearOpposite)
-        for(Index i = maxi; i < dst.rows(); ++i)
-          dst.coeffRef(i, j) = Scalar(0);
-    }
-  }
-};
-
-template<typename Derived1, typename Derived2, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, StrictlyLower, Dynamic, ClearOpposite>
-{
-  typedef typename Derived1::Index Index;
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    for(Index j = 0; j < dst.cols(); ++j)
-    {
-      for(Index i = j+1; i < dst.rows(); ++i)
-        dst.copyCoeff(i, j, src);
-      Index maxi = (std::min)(j, dst.rows()-1);
-      if (ClearOpposite)
-        for(Index i = 0; i <= maxi; ++i)
-          dst.coeffRef(i, j) = static_cast<typename Derived1::Scalar>(0);
-    }
-  }
-};
-
-template<typename Derived1, typename Derived2, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, UnitUpper, Dynamic, ClearOpposite>
-{
-  typedef typename Derived1::Index Index;
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    for(Index j = 0; j < dst.cols(); ++j)
-    {
-      Index maxi = (std::min)(j, dst.rows());
-      for(Index i = 0; i < maxi; ++i)
-        dst.copyCoeff(i, j, src);
-      if (ClearOpposite)
-      {
-        for(Index i = maxi+1; i < dst.rows(); ++i)
-          dst.coeffRef(i, j) = 0;
-      }
-    }
-    dst.diagonal().setOnes();
-  }
-};
-template<typename Derived1, typename Derived2, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, UnitLower, Dynamic, ClearOpposite>
-{
-  typedef typename Derived1::Index Index;
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    for(Index j = 0; j < dst.cols(); ++j)
-    {
-      Index maxi = (std::min)(j, dst.rows());
-      for(Index i = maxi+1; i < dst.rows(); ++i)
-        dst.copyCoeff(i, j, src);
-      if (ClearOpposite)
-      {
-        for(Index i = 0; i < maxi; ++i)
-          dst.coeffRef(i, j) = 0;
-      }
-    }
-    dst.diagonal().setOnes();
-  }
-};
-
-} // end namespace internal
-
+#ifndef EIGEN_PARSED_BY_DOXYGEN
 // FIXME should we keep that possibility
 template<typename MatrixType, unsigned int Mode>
 template<typename OtherDerived>
 inline TriangularView<MatrixType, Mode>&
-TriangularView<MatrixType, Mode>::operator=(const MatrixBase<OtherDerived>& other)
+TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const MatrixBase<OtherDerived>& other)
 {
-  if(OtherDerived::Flags & EvalBeforeAssigningBit)
-  {
-    typename internal::plain_matrix_type<OtherDerived>::type other_evaluated(other.rows(), other.cols());
-    other_evaluated.template triangularView<Mode>().lazyAssign(other.derived());
-    lazyAssign(other_evaluated);
-  }
-  else
-    lazyAssign(other.derived());
-  return *this;
+  internal::call_assignment_no_alias(derived(), other.derived(), internal::assign_op<Scalar,typename OtherDerived::Scalar>());
+  return derived();
 }
 
 // FIXME should we keep that possibility
 template<typename MatrixType, unsigned int Mode>
 template<typename OtherDerived>
-void TriangularView<MatrixType, Mode>::lazyAssign(const MatrixBase<OtherDerived>& other)
+void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const MatrixBase<OtherDerived>& other)
 {
-  enum {
-    unroll = MatrixType::SizeAtCompileTime != Dynamic
-          && internal::traits<OtherDerived>::CoeffReadCost != Dynamic
-          && MatrixType::SizeAtCompileTime*internal::traits<OtherDerived>::CoeffReadCost/2 <= EIGEN_UNROLLING_LIMIT
-  };
-  eigen_assert(m_matrix.rows() == other.rows() && m_matrix.cols() == other.cols());
-
-  internal::triangular_assignment_selector
-    <MatrixType, OtherDerived, int(Mode),
-    unroll ? int(MatrixType::SizeAtCompileTime) : Dynamic,
-    false // do not change the opposite triangular part
-    >::run(m_matrix.const_cast_derived(), other.derived());
+  internal::call_assignment_no_alias(derived(), other.template triangularView<Mode>());
 }
 
 
@@ -637,38 +574,21 @@ void TriangularView<MatrixType, Mode>::lazyAssign(const MatrixBase<OtherDerived>
 template<typename MatrixType, unsigned int Mode>
 template<typename OtherDerived>
 inline TriangularView<MatrixType, Mode>&
-TriangularView<MatrixType, Mode>::operator=(const TriangularBase<OtherDerived>& other)
+TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const TriangularBase<OtherDerived>& other)
 {
   eigen_assert(Mode == int(OtherDerived::Mode));
-  if(internal::traits<OtherDerived>::Flags & EvalBeforeAssigningBit)
-  {
-    typename OtherDerived::DenseMatrixType other_evaluated(other.rows(), other.cols());
-    other_evaluated.template triangularView<Mode>().lazyAssign(other.derived().nestedExpression());
-    lazyAssign(other_evaluated);
-  }
-  else
-    lazyAssign(other.derived().nestedExpression());
-  return *this;
+  internal::call_assignment(derived(), other.derived());
+  return derived();
 }
 
 template<typename MatrixType, unsigned int Mode>
 template<typename OtherDerived>
-void TriangularView<MatrixType, Mode>::lazyAssign(const TriangularBase<OtherDerived>& other)
+void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const TriangularBase<OtherDerived>& other)
 {
-  enum {
-    unroll = MatrixType::SizeAtCompileTime != Dynamic
-                   && internal::traits<OtherDerived>::CoeffReadCost != Dynamic
-                   && MatrixType::SizeAtCompileTime * internal::traits<OtherDerived>::CoeffReadCost / 2
-                        <= EIGEN_UNROLLING_LIMIT
-  };
-  eigen_assert(m_matrix.rows() == other.rows() && m_matrix.cols() == other.cols());
-
-  internal::triangular_assignment_selector
-    <MatrixType, OtherDerived, int(Mode),
-    unroll ? int(MatrixType::SizeAtCompileTime) : Dynamic,
-    false // preserve the opposite triangular part
-    >::run(m_matrix.const_cast_derived(), other.derived().nestedExpression());
+  eigen_assert(Mode == int(OtherDerived::Mode));
+  internal::call_assignment_no_alias(derived(), other.derived());
 }
+#endif
 
 /***************************************************************************
 * Implementation of TriangularBase methods
@@ -680,35 +600,7 @@ template<typename Derived>
 template<typename DenseDerived>
 void TriangularBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
 {
-  if(internal::traits<Derived>::Flags & EvalBeforeAssigningBit)
-  {
-    typename internal::plain_matrix_type<Derived>::type other_evaluated(rows(), cols());
-    evalToLazy(other_evaluated);
-    other.derived().swap(other_evaluated);
-  }
-  else
-    evalToLazy(other.derived());
-}
-
-/** Assigns a triangular or selfadjoint matrix to a dense matrix.
-  * If the matrix is triangular, the opposite part is set to zero. */
-template<typename Derived>
-template<typename DenseDerived>
-void TriangularBase<Derived>::evalToLazy(MatrixBase<DenseDerived> &other) const
-{
-  enum {
-    unroll = DenseDerived::SizeAtCompileTime != Dynamic
-                   && internal::traits<Derived>::CoeffReadCost != Dynamic
-                   && DenseDerived::SizeAtCompileTime * internal::traits<Derived>::CoeffReadCost / 2
-                        <= EIGEN_UNROLLING_LIMIT
-  };
-  other.derived().resize(this->rows(), this->cols());
-
-  internal::triangular_assignment_selector
-    <DenseDerived, typename internal::traits<Derived>::MatrixTypeNestedCleaned, Derived::Mode,
-    unroll ? int(DenseDerived::SizeAtCompileTime) : Dynamic,
-    true // clear the opposite triangular part
-    >::run(other.derived(), derived().nestedExpression());
+  evalToLazy(other.derived());
 }
 
 /***************************************************************************
@@ -719,49 +611,14 @@ void TriangularBase<Derived>::evalToLazy(MatrixBase<DenseDerived> &other) const
 * Implementation of MatrixBase methods
 ***************************************************************************/
 
-#ifdef EIGEN2_SUPPORT
-
-// implementation of part<>(), including the SelfAdjoint case.
-
-namespace internal {
-template<typename MatrixType, unsigned int Mode>
-struct eigen2_part_return_type
-{
-  typedef TriangularView<MatrixType, Mode> type;
-};
-
-template<typename MatrixType>
-struct eigen2_part_return_type<MatrixType, SelfAdjoint>
-{
-  typedef SelfAdjointView<MatrixType, Upper> type;
-};
-}
-
-/** \deprecated use MatrixBase::triangularView() */
-template<typename Derived>
-template<unsigned int Mode>
-const typename internal::eigen2_part_return_type<Derived, Mode>::type MatrixBase<Derived>::part() const
-{
-  return derived();
-}
-
-/** \deprecated use MatrixBase::triangularView() */
-template<typename Derived>
-template<unsigned int Mode>
-typename internal::eigen2_part_return_type<Derived, Mode>::type MatrixBase<Derived>::part()
-{
-  return derived();
-}
-#endif
-
 /**
   * \returns an expression of a triangular view extracted from the current matrix
   *
   * The parameter \a Mode can have the following values: \c #Upper, \c #StrictlyUpper, \c #UnitUpper,
   * \c #Lower, \c #StrictlyLower, \c #UnitLower.
   *
-  * Example: \include MatrixBase_extract.cpp
-  * Output: \verbinclude MatrixBase_extract.out
+  * Example: \include MatrixBase_triangularView.cpp
+  * Output: \verbinclude MatrixBase_triangularView.out
   *
   * \sa class TriangularView
   */
@@ -770,7 +627,7 @@ template<unsigned int Mode>
 typename MatrixBase<Derived>::template TriangularViewReturnType<Mode>::Type
 MatrixBase<Derived>::triangularView()
 {
-  return derived();
+  return typename TriangularViewReturnType<Mode>::Type(derived());
 }
 
 /** This is the const version of MatrixBase::triangularView() */
@@ -779,7 +636,7 @@ template<unsigned int Mode>
 typename MatrixBase<Derived>::template ConstTriangularViewReturnType<Mode>::Type
 MatrixBase<Derived>::triangularView() const
 {
-  return derived();
+  return typename ConstTriangularViewReturnType<Mode>::Type(derived());
 }
 
 /** \returns true if *this is approximately equal to an upper triangular matrix,
@@ -790,21 +647,20 @@ MatrixBase<Derived>::triangularView() const
 template<typename Derived>
 bool MatrixBase<Derived>::isUpperTriangular(const RealScalar& prec) const
 {
-  using std::abs;
   RealScalar maxAbsOnUpperPart = static_cast<RealScalar>(-1);
   for(Index j = 0; j < cols(); ++j)
   {
-    Index maxi = (std::min)(j, rows()-1);
+    Index maxi = numext::mini(j, rows()-1);
     for(Index i = 0; i <= maxi; ++i)
     {
-      RealScalar absValue = abs(coeff(i,j));
+      RealScalar absValue = numext::abs(coeff(i,j));
       if(absValue > maxAbsOnUpperPart) maxAbsOnUpperPart = absValue;
     }
   }
   RealScalar threshold = maxAbsOnUpperPart * prec;
   for(Index j = 0; j < cols(); ++j)
     for(Index i = j+1; i < rows(); ++i)
-      if(abs(coeff(i, j)) > threshold) return false;
+      if(numext::abs(coeff(i, j)) > threshold) return false;
   return true;
 }
 
@@ -816,24 +672,312 @@ bool MatrixBase<Derived>::isUpperTriangular(const RealScalar& prec) const
 template<typename Derived>
 bool MatrixBase<Derived>::isLowerTriangular(const RealScalar& prec) const
 {
-  using std::abs;
   RealScalar maxAbsOnLowerPart = static_cast<RealScalar>(-1);
   for(Index j = 0; j < cols(); ++j)
     for(Index i = j; i < rows(); ++i)
     {
-      RealScalar absValue = abs(coeff(i,j));
+      RealScalar absValue = numext::abs(coeff(i,j));
       if(absValue > maxAbsOnLowerPart) maxAbsOnLowerPart = absValue;
     }
   RealScalar threshold = maxAbsOnLowerPart * prec;
   for(Index j = 1; j < cols(); ++j)
   {
-    Index maxi = (std::min)(j, rows()-1);
+    Index maxi = numext::mini(j, rows()-1);
     for(Index i = 0; i < maxi; ++i)
-      if(abs(coeff(i, j)) > threshold) return false;
+      if(numext::abs(coeff(i, j)) > threshold) return false;
   }
   return true;
 }
 
+
+/***************************************************************************
+****************************************************************************
+* Evaluators and Assignment of triangular expressions
+***************************************************************************
+***************************************************************************/
+
+namespace internal {
+
+  
+// TODO currently a triangular expression has the form TriangularView<.,.>
+//      in the future triangular-ness should be defined by the expression traits
+//      such that Transpose<TriangularView<.,.> > is valid. (currently TriangularBase::transpose() is overloaded to make it work)
+template<typename MatrixType, unsigned int Mode>
+struct evaluator_traits<TriangularView<MatrixType,Mode> >
+{
+  typedef typename storage_kind_to_evaluator_kind<typename MatrixType::StorageKind>::Kind Kind;
+  typedef typename glue_shapes<typename evaluator_traits<MatrixType>::Shape, TriangularShape>::type Shape;
+};
+
+template<typename MatrixType, unsigned int Mode>
+struct unary_evaluator<TriangularView<MatrixType,Mode>, IndexBased>
+ : evaluator<typename internal::remove_all<MatrixType>::type>
+{
+  typedef TriangularView<MatrixType,Mode> XprType;
+  typedef evaluator<typename internal::remove_all<MatrixType>::type> Base;
+  unary_evaluator(const XprType &xpr) : Base(xpr.nestedExpression()) {}
+};
+
+// Additional assignment kinds:
+struct Triangular2Triangular    {};
+struct Triangular2Dense         {};
+struct Dense2Triangular         {};
+
+
+template<typename Kernel, unsigned int Mode, int UnrollCount, bool ClearOpposite> struct triangular_assignment_loop;
+
+ 
+/** \internal Specialization of the dense assignment kernel for triangular matrices.
+  * The main difference is that the triangular, diagonal, and opposite parts are processed through three different functions.
+  * \tparam UpLo must be either Lower or Upper
+  * \tparam Mode must be either 0, UnitDiag, ZeroDiag, or SelfAdjoint
+  */
+template<int UpLo, int Mode, int SetOpposite, typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT, typename Functor, int Version = Specialized>
+class triangular_dense_assignment_kernel : public generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, Version>
+{
+protected:
+  typedef generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, Version> Base;
+  typedef typename Base::DstXprType DstXprType;
+  typedef typename Base::SrcXprType SrcXprType;
+  using Base::m_dst;
+  using Base::m_src;
+  using Base::m_functor;
+public:
+  
+  typedef typename Base::DstEvaluatorType DstEvaluatorType;
+  typedef typename Base::SrcEvaluatorType SrcEvaluatorType;
+  typedef typename Base::Scalar Scalar;
+  typedef typename Base::AssignmentTraits AssignmentTraits;
+  
+  
+  EIGEN_DEVICE_FUNC triangular_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
+    : Base(dst, src, func, dstExpr)
+  {}
+  
+#ifdef EIGEN_INTERNAL_DEBUGGING
+  EIGEN_DEVICE_FUNC void assignCoeff(Index row, Index col)
+  {
+    eigen_internal_assert(row!=col);
+    Base::assignCoeff(row,col);
+  }
+#else
+  using Base::assignCoeff;
+#endif
+  
+  EIGEN_DEVICE_FUNC void assignDiagonalCoeff(Index id)
+  {
+         if(Mode==UnitDiag && SetOpposite) m_functor.assignCoeff(m_dst.coeffRef(id,id), Scalar(1));
+    else if(Mode==ZeroDiag && SetOpposite) m_functor.assignCoeff(m_dst.coeffRef(id,id), Scalar(0));
+    else if(Mode==0)                       Base::assignCoeff(id,id);
+  }
+  
+  EIGEN_DEVICE_FUNC void assignOppositeCoeff(Index row, Index col)
+  { 
+    eigen_internal_assert(row!=col);
+    if(SetOpposite)
+      m_functor.assignCoeff(m_dst.coeffRef(row,col), Scalar(0));
+  }
+};
+
+template<int Mode, bool SetOpposite, typename DstXprType, typename SrcXprType, typename Functor>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_triangular_assignment_loop(DstXprType& dst, const SrcXprType& src, const Functor &func)
+{
+  typedef evaluator<DstXprType> DstEvaluatorType;
+  typedef evaluator<SrcXprType> SrcEvaluatorType;
+
+  SrcEvaluatorType srcEvaluator(src);
+
+  Index dstRows = src.rows();
+  Index dstCols = src.cols();
+  if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+    dst.resize(dstRows, dstCols);
+  DstEvaluatorType dstEvaluator(dst);
+    
+  typedef triangular_dense_assignment_kernel< Mode&(Lower|Upper),Mode&(UnitDiag|ZeroDiag|SelfAdjoint),SetOpposite,
+                                              DstEvaluatorType,SrcEvaluatorType,Functor> Kernel;
+  Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());
+  
+  enum {
+      unroll = DstXprType::SizeAtCompileTime != Dynamic
+            && SrcEvaluatorType::CoeffReadCost < HugeCost
+            && DstXprType::SizeAtCompileTime * (DstEvaluatorType::CoeffReadCost+SrcEvaluatorType::CoeffReadCost) / 2 <= EIGEN_UNROLLING_LIMIT
+    };
+  
+  triangular_assignment_loop<Kernel, Mode, unroll ? int(DstXprType::SizeAtCompileTime) : Dynamic, SetOpposite>::run(kernel);
+}
+
+template<int Mode, bool SetOpposite, typename DstXprType, typename SrcXprType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_triangular_assignment_loop(DstXprType& dst, const SrcXprType& src)
+{
+  call_triangular_assignment_loop<Mode,SetOpposite>(dst, src, internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar>());
+}
+
+template<> struct AssignmentKind<TriangularShape,TriangularShape> { typedef Triangular2Triangular Kind; };
+template<> struct AssignmentKind<DenseShape,TriangularShape>      { typedef Triangular2Dense      Kind; };
+template<> struct AssignmentKind<TriangularShape,DenseShape>      { typedef Dense2Triangular      Kind; };
+
+
+template< typename DstXprType, typename SrcXprType, typename Functor>
+struct Assignment<DstXprType, SrcXprType, Functor, Triangular2Triangular>
+{
+  EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
+  {
+    eigen_assert(int(DstXprType::Mode) == int(SrcXprType::Mode));
+    
+    call_triangular_assignment_loop<DstXprType::Mode, false>(dst, src, func);  
+  }
+};
+
+template< typename DstXprType, typename SrcXprType, typename Functor>
+struct Assignment<DstXprType, SrcXprType, Functor, Triangular2Dense>
+{
+  EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
+  {
+    call_triangular_assignment_loop<SrcXprType::Mode, (SrcXprType::Mode&SelfAdjoint)==0>(dst, src, func);  
+  }
+};
+
+template< typename DstXprType, typename SrcXprType, typename Functor>
+struct Assignment<DstXprType, SrcXprType, Functor, Dense2Triangular>
+{
+  EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
+  {
+    call_triangular_assignment_loop<DstXprType::Mode, false>(dst, src, func);  
+  }
+};
+
+
+template<typename Kernel, unsigned int Mode, int UnrollCount, bool SetOpposite>
+struct triangular_assignment_loop
+{
+  // FIXME: this is not very clean, perhaps this information should be provided by the kernel?
+  typedef typename Kernel::DstEvaluatorType DstEvaluatorType;
+  typedef typename DstEvaluatorType::XprType DstXprType;
+  
+  enum {
+    col = (UnrollCount-1) / DstXprType::RowsAtCompileTime,
+    row = (UnrollCount-1) % DstXprType::RowsAtCompileTime
+  };
+  
+  typedef typename Kernel::Scalar Scalar;
+
+  EIGEN_DEVICE_FUNC
+  static inline void run(Kernel &kernel)
+  {
+    triangular_assignment_loop<Kernel, Mode, UnrollCount-1, SetOpposite>::run(kernel);
+    
+    if(row==col)
+      kernel.assignDiagonalCoeff(row);
+    else if( ((Mode&Lower) && row>col) || ((Mode&Upper) && row<col) )
+      kernel.assignCoeff(row,col);
+    else if(SetOpposite)
+      kernel.assignOppositeCoeff(row,col);
+  }
+};
+
+// prevent buggy user code from causing an infinite recursion
+template<typename Kernel, unsigned int Mode, bool SetOpposite>
+struct triangular_assignment_loop<Kernel, Mode, 0, SetOpposite>
+{
+  EIGEN_DEVICE_FUNC
+  static inline void run(Kernel &) {}
+};
+
+
+
+// TODO: experiment with a recursive assignment procedure splitting the current
+//       triangular part into one rectangular and two triangular parts.
+
+
+template<typename Kernel, unsigned int Mode, bool SetOpposite>
+struct triangular_assignment_loop<Kernel, Mode, Dynamic, SetOpposite>
+{
+  typedef typename Kernel::Scalar Scalar;
+  EIGEN_DEVICE_FUNC
+  static inline void run(Kernel &kernel)
+  {
+    for(Index j = 0; j < kernel.cols(); ++j)
+    {
+      Index maxi = numext::mini(j, kernel.rows());
+      Index i = 0;
+      if (((Mode&Lower) && SetOpposite) || (Mode&Upper))
+      {
+        for(; i < maxi; ++i)
+          if(Mode&Upper) kernel.assignCoeff(i, j);
+          else           kernel.assignOppositeCoeff(i, j);
+      }
+      else
+        i = maxi;
+      
+      if(i<kernel.rows()) // then i==j
+        kernel.assignDiagonalCoeff(i++);
+      
+      if (((Mode&Upper) && SetOpposite) || (Mode&Lower))
+      {
+        for(; i < kernel.rows(); ++i)
+          if(Mode&Lower) kernel.assignCoeff(i, j);
+          else           kernel.assignOppositeCoeff(i, j);
+      }
+    }
+  }
+};
+
+} // end namespace internal
+
+/** Assigns a triangular or selfadjoint matrix to a dense matrix.
+  * If the matrix is triangular, the opposite part is set to zero. */
+template<typename Derived>
+template<typename DenseDerived>
+void TriangularBase<Derived>::evalToLazy(MatrixBase<DenseDerived> &other) const
+{
+  other.derived().resize(this->rows(), this->cols());
+  internal::call_triangular_assignment_loop<Derived::Mode,(Derived::Mode&SelfAdjoint)==0 /* SetOpposite */>(other.derived(), derived().nestedExpression());
+}
+
+namespace internal {
+  
+// Triangular = Product
+template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar>
+struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::assign_op<Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, Dense2Triangular>
+{
+  typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,typename SrcXprType::Scalar> &)
+  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
+
+    dst._assignProduct(src, 1, 0);
+  }
+};
+
+// Triangular += Product
+template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar>
+struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::add_assign_op<Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, Dense2Triangular>
+{
+  typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar,typename SrcXprType::Scalar> &)
+  {
+    dst._assignProduct(src, 1, 1);
+  }
+};
+
+// Triangular -= Product
+template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar>
+struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::sub_assign_op<Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, Dense2Triangular>
+{
+  typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar,typename SrcXprType::Scalar> &)
+  {
+    dst._assignProduct(src, -1, 1);
+  }
+};
+
+} // end namespace internal
+
 } // end namespace Eigen
 
 #endif // EIGEN_TRIANGULARMATRIX_H
diff --git a/Eigen/src/Core/VectorBlock.h b/Eigen/src/Core/VectorBlock.h
index 1a7330f3c..d72fbf7e9 100644
--- a/Eigen/src/Core/VectorBlock.h
+++ b/Eigen/src/Core/VectorBlock.h
@@ -13,13 +13,23 @@
 
 namespace Eigen { 
 
+namespace internal {
+template<typename VectorType, int Size>
+struct traits<VectorBlock<VectorType, Size> >
+  : public traits<Block<VectorType,
+                     traits<VectorType>::Flags & RowMajorBit ? 1 : Size,
+                     traits<VectorType>::Flags & RowMajorBit ? Size : 1> >
+{
+};
+}
+
 /** \class VectorBlock
   * \ingroup Core_Module
   *
   * \brief Expression of a fixed-size or dynamic-size sub-vector
   *
-  * \param VectorType the type of the object in which we are taking a sub-vector
-  * \param Size size of the sub-vector we are taking at compile time (optional)
+  * \tparam VectorType the type of the object in which we are taking a sub-vector
+  * \tparam Size size of the sub-vector we are taking at compile time (optional)
   *
   * This class represents an expression of either a fixed-size or dynamic-size sub-vector.
   * It is the return type of DenseBase::segment(Index,Index) and DenseBase::segment<int>(Index) and
@@ -43,17 +53,6 @@ namespace Eigen {
   *
   * \sa class Block, DenseBase::segment(Index,Index,Index,Index), DenseBase::segment(Index,Index)
   */
-
-namespace internal {
-template<typename VectorType, int Size>
-struct traits<VectorBlock<VectorType, Size> >
-  : public traits<Block<VectorType,
-                     traits<VectorType>::Flags & RowMajorBit ? 1 : Size,
-                     traits<VectorType>::Flags & RowMajorBit ? Size : 1> >
-{
-};
-}
-
 template<typename VectorType, int Size> class VectorBlock
   : public Block<VectorType,
                      internal::traits<VectorType>::Flags & RowMajorBit ? 1 : Size,
@@ -72,6 +71,7 @@ template<typename VectorType, int Size> class VectorBlock
 
     /** Dynamic-size constructor
       */
+    EIGEN_DEVICE_FUNC
     inline VectorBlock(VectorType& vector, Index start, Index size)
       : Base(vector,
              IsColVector ? start : 0, IsColVector ? 0 : start,
@@ -82,6 +82,7 @@ template<typename VectorType, int Size> class VectorBlock
 
     /** Fixed-size constructor
       */
+    EIGEN_DEVICE_FUNC
     inline VectorBlock(VectorType& vector, Index start)
       : Base(vector, IsColVector ? start : 0, IsColVector ? 0 : start)
     {
diff --git a/Eigen/src/Core/VectorwiseOp.h b/Eigen/src/Core/VectorwiseOp.h
index d5ab03664..4fe267e9f 100644
--- a/Eigen/src/Core/VectorwiseOp.h
+++ b/Eigen/src/Core/VectorwiseOp.h
@@ -11,7 +11,7 @@
 #ifndef EIGEN_PARTIAL_REDUX_H
 #define EIGEN_PARTIAL_REDUX_H
 
-namespace Eigen { 
+namespace Eigen {
 
 /** \class PartialReduxExpr
   * \ingroup Core_Module
@@ -41,64 +41,43 @@ struct traits<PartialReduxExpr<MatrixType, MemberOp, Direction> >
   typedef typename traits<MatrixType>::StorageKind StorageKind;
   typedef typename traits<MatrixType>::XprKind XprKind;
   typedef typename MatrixType::Scalar InputScalar;
-  typedef typename nested<MatrixType>::type MatrixTypeNested;
-  typedef typename remove_all<MatrixTypeNested>::type _MatrixTypeNested;
   enum {
     RowsAtCompileTime = Direction==Vertical   ? 1 : MatrixType::RowsAtCompileTime,
     ColsAtCompileTime = Direction==Horizontal ? 1 : MatrixType::ColsAtCompileTime,
     MaxRowsAtCompileTime = Direction==Vertical   ? 1 : MatrixType::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = Direction==Horizontal ? 1 : MatrixType::MaxColsAtCompileTime,
-    Flags0 = (unsigned int)_MatrixTypeNested::Flags & HereditaryBits,
-    Flags = (Flags0 & ~RowMajorBit) | (RowsAtCompileTime == 1 ? RowMajorBit : 0),
+    Flags = RowsAtCompileTime == 1 ? RowMajorBit : 0,
     TraversalSize = Direction==Vertical ? MatrixType::RowsAtCompileTime :  MatrixType::ColsAtCompileTime
   };
-  #if EIGEN_GNUC_AT_LEAST(3,4)
-  typedef typename MemberOp::template Cost<InputScalar,int(TraversalSize)> CostOpType;
-  #else
-  typedef typename MemberOp::template Cost<InputScalar,TraversalSize> CostOpType;
-  #endif
-  enum {
-    CoeffReadCost = TraversalSize==Dynamic ? Dynamic
-                  : TraversalSize * traits<_MatrixTypeNested>::CoeffReadCost + int(CostOpType::value)
-  };
 };
 }
 
 template< typename MatrixType, typename MemberOp, int Direction>
-class PartialReduxExpr : internal::no_assignment_operator,
-  public internal::dense_xpr_base< PartialReduxExpr<MatrixType, MemberOp, Direction> >::type
+class PartialReduxExpr : public internal::dense_xpr_base< PartialReduxExpr<MatrixType, MemberOp, Direction> >::type,
+                         internal::no_assignment_operator
 {
   public:
 
     typedef typename internal::dense_xpr_base<PartialReduxExpr>::type Base;
     EIGEN_DENSE_PUBLIC_INTERFACE(PartialReduxExpr)
-    typedef typename internal::traits<PartialReduxExpr>::MatrixTypeNested MatrixTypeNested;
-    typedef typename internal::traits<PartialReduxExpr>::_MatrixTypeNested _MatrixTypeNested;
 
-    PartialReduxExpr(const MatrixType& mat, const MemberOp& func = MemberOp())
+    EIGEN_DEVICE_FUNC
+    explicit PartialReduxExpr(const MatrixType& mat, const MemberOp& func = MemberOp())
       : m_matrix(mat), m_functor(func) {}
 
+    EIGEN_DEVICE_FUNC
     Index rows() const { return (Direction==Vertical   ? 1 : m_matrix.rows()); }
+    EIGEN_DEVICE_FUNC
     Index cols() const { return (Direction==Horizontal ? 1 : m_matrix.cols()); }
 
-    EIGEN_STRONG_INLINE const Scalar coeff(Index i, Index j) const
-    {
-      if (Direction==Vertical)
-        return m_functor(m_matrix.col(j));
-      else
-        return m_functor(m_matrix.row(i));
-    }
+    EIGEN_DEVICE_FUNC
+    typename MatrixType::Nested nestedExpression() const { return m_matrix; }
 
-    const Scalar coeff(Index index) const
-    {
-      if (Direction==Vertical)
-        return m_functor(m_matrix.col(index));
-      else
-        return m_functor(m_matrix.row(index));
-    }
+    EIGEN_DEVICE_FUNC
+    const MemberOp& functor() const { return m_functor; }
 
   protected:
-    MatrixTypeNested m_matrix;
+    typename MatrixType::Nested m_matrix;
     const MemberOp m_functor;
 };
 
@@ -110,7 +89,8 @@ class PartialReduxExpr : internal::no_assignment_operator,
     template<typename Scalar, int Size> struct Cost                     \
     { enum { value = COST }; };                                         \
     template<typename XprType>                                          \
-    EIGEN_STRONG_INLINE ResultType operator()(const XprType& mat) const \
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                               \
+    ResultType operator()(const XprType& mat) const                     \
     { return mat.MEMBER(); } \
   }
 
@@ -130,17 +110,27 @@ EIGEN_MEMBER_FUNCTOR(any, (Size-1)*NumTraits<Scalar>::AddCost);
 EIGEN_MEMBER_FUNCTOR(count, (Size-1)*NumTraits<Scalar>::AddCost);
 EIGEN_MEMBER_FUNCTOR(prod, (Size-1)*NumTraits<Scalar>::MulCost);
 
+template <int p, typename ResultType>
+struct member_lpnorm {
+  typedef ResultType result_type;
+  template<typename Scalar, int Size> struct Cost
+  { enum { value = (Size+5) * NumTraits<Scalar>::MulCost + (Size-1)*NumTraits<Scalar>::AddCost }; };
+  EIGEN_DEVICE_FUNC member_lpnorm() {}
+  template<typename XprType>
+  EIGEN_DEVICE_FUNC inline ResultType operator()(const XprType& mat) const
+  { return mat.template lpNorm<p>(); }
+};
 
 template <typename BinaryOp, typename Scalar>
 struct member_redux {
   typedef typename result_of<
-                     BinaryOp(Scalar)
+                     BinaryOp(const Scalar&,const Scalar&)
                    >::type  result_type;
   template<typename _Scalar, int Size> struct Cost
   { enum { value = (Size-1) * functor_traits<BinaryOp>::Cost }; };
-  member_redux(const BinaryOp func) : m_functor(func) {}
+  EIGEN_DEVICE_FUNC explicit member_redux(const BinaryOp func) : m_functor(func) {}
   template<typename Derived>
-  inline result_type operator()(const DenseBase<Derived>& mat) const
+  EIGEN_DEVICE_FUNC inline result_type operator()(const DenseBase<Derived>& mat) const
   { return mat.redux(m_functor); }
   const BinaryOp m_functor;
 };
@@ -151,8 +141,8 @@ struct member_redux {
   *
   * \brief Pseudo expression providing partial reduction operations
   *
-  * \param ExpressionType the type of the object on which to do partial reductions
-  * \param Direction indicates the direction of the redux (#Vertical or #Horizontal)
+  * \tparam ExpressionType the type of the object on which to do partial reductions
+  * \tparam Direction indicates the direction of the redux (#Vertical or #Horizontal)
   *
   * This class represents a pseudo expression with partial reduction features.
   * It is the return type of DenseBase::colwise() and DenseBase::rowwise()
@@ -169,16 +159,15 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
 
     typedef typename ExpressionType::Scalar Scalar;
     typedef typename ExpressionType::RealScalar RealScalar;
-    typedef typename ExpressionType::Index Index;
-    typedef typename internal::conditional<internal::must_nest_by_value<ExpressionType>::ret,
-        ExpressionType, ExpressionType&>::type ExpressionTypeNested;
+    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
+    typedef typename internal::ref_selector<ExpressionType>::non_const_type ExpressionTypeNested;
     typedef typename internal::remove_all<ExpressionTypeNested>::type ExpressionTypeNestedCleaned;
 
     template<template<typename _Scalar> class Functor,
-                      typename Scalar=typename internal::traits<ExpressionType>::Scalar> struct ReturnType
+                      typename Scalar_=Scalar> struct ReturnType
     {
       typedef PartialReduxExpr<ExpressionType,
-                               Functor<Scalar>,
+                               Functor<Scalar_>,
                                Direction
                               > Type;
     };
@@ -186,23 +175,24 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
     template<typename BinaryOp> struct ReduxReturnType
     {
       typedef PartialReduxExpr<ExpressionType,
-                               internal::member_redux<BinaryOp,typename internal::traits<ExpressionType>::Scalar>,
+                               internal::member_redux<BinaryOp,Scalar>,
                                Direction
                               > Type;
     };
 
     enum {
-      IsVertical   = (Direction==Vertical) ? 1 : 0,
-      IsHorizontal = (Direction==Horizontal) ? 1 : 0
+      isVertical   = (Direction==Vertical) ? 1 : 0,
+      isHorizontal = (Direction==Horizontal) ? 1 : 0
     };
 
   protected:
 
-    /** \internal
-      * \returns the i-th subvector according to the \c Direction */
-    typedef typename internal::conditional<Direction==Vertical,
+    typedef typename internal::conditional<isVertical,
                                typename ExpressionType::ColXpr,
                                typename ExpressionType::RowXpr>::type SubVector;
+    /** \internal
+      * \returns the i-th subvector according to the \c Direction */
+    EIGEN_DEVICE_FUNC
     SubVector subVector(Index i)
     {
       return SubVector(m_matrix.derived(),i);
@@ -210,58 +200,62 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
 
     /** \internal
       * \returns the number of subvectors in the direction \c Direction */
+    EIGEN_DEVICE_FUNC
     Index subVectors() const
-    { return Direction==Vertical?m_matrix.cols():m_matrix.rows(); }
+    { return isVertical?m_matrix.cols():m_matrix.rows(); }
 
     template<typename OtherDerived> struct ExtendedType {
       typedef Replicate<OtherDerived,
-                        Direction==Vertical   ? 1 : ExpressionType::RowsAtCompileTime,
-                        Direction==Horizontal ? 1 : ExpressionType::ColsAtCompileTime> Type;
+                        isVertical   ? 1 : ExpressionType::RowsAtCompileTime,
+                        isHorizontal ? 1 : ExpressionType::ColsAtCompileTime> Type;
     };
 
     /** \internal
       * Replicates a vector to match the size of \c *this */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     typename ExtendedType<OtherDerived>::Type
     extendedTo(const DenseBase<OtherDerived>& other) const
     {
-      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(Direction==Vertical, OtherDerived::MaxColsAtCompileTime==1),
+      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(isVertical, OtherDerived::MaxColsAtCompileTime==1),
                           YOU_PASSED_A_ROW_VECTOR_BUT_A_COLUMN_VECTOR_WAS_EXPECTED)
-      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(Direction==Horizontal, OtherDerived::MaxRowsAtCompileTime==1),
+      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(isHorizontal, OtherDerived::MaxRowsAtCompileTime==1),
                           YOU_PASSED_A_COLUMN_VECTOR_BUT_A_ROW_VECTOR_WAS_EXPECTED)
       return typename ExtendedType<OtherDerived>::Type
                       (other.derived(),
-                       Direction==Vertical   ? 1 : m_matrix.rows(),
-                       Direction==Horizontal ? 1 : m_matrix.cols());
+                       isVertical   ? 1 : m_matrix.rows(),
+                       isHorizontal ? 1 : m_matrix.cols());
     }
-    
+
     template<typename OtherDerived> struct OppositeExtendedType {
       typedef Replicate<OtherDerived,
-                        Direction==Horizontal ? 1 : ExpressionType::RowsAtCompileTime,
-                        Direction==Vertical   ? 1 : ExpressionType::ColsAtCompileTime> Type;
+                        isHorizontal ? 1 : ExpressionType::RowsAtCompileTime,
+                        isVertical   ? 1 : ExpressionType::ColsAtCompileTime> Type;
     };
 
     /** \internal
       * Replicates a vector in the opposite direction to match the size of \c *this */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     typename OppositeExtendedType<OtherDerived>::Type
     extendedToOpposite(const DenseBase<OtherDerived>& other) const
     {
-      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(Direction==Horizontal, OtherDerived::MaxColsAtCompileTime==1),
+      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(isHorizontal, OtherDerived::MaxColsAtCompileTime==1),
                           YOU_PASSED_A_ROW_VECTOR_BUT_A_COLUMN_VECTOR_WAS_EXPECTED)
-      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(Direction==Vertical, OtherDerived::MaxRowsAtCompileTime==1),
+      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(isVertical, OtherDerived::MaxRowsAtCompileTime==1),
                           YOU_PASSED_A_COLUMN_VECTOR_BUT_A_ROW_VECTOR_WAS_EXPECTED)
       return typename OppositeExtendedType<OtherDerived>::Type
                       (other.derived(),
-                       Direction==Horizontal  ? 1 : m_matrix.rows(),
-                       Direction==Vertical    ? 1 : m_matrix.cols());
+                       isHorizontal  ? 1 : m_matrix.rows(),
+                       isVertical    ? 1 : m_matrix.cols());
     }
 
   public:
-
-    inline VectorwiseOp(ExpressionType& matrix) : m_matrix(matrix) {}
+    EIGEN_DEVICE_FUNC
+    explicit inline VectorwiseOp(ExpressionType& matrix) : m_matrix(matrix) {}
 
     /** \internal */
+    EIGEN_DEVICE_FUNC
     inline const ExpressionType& _expression() const { return m_matrix; }
 
     /** \returns a row or column vector expression of \c *this reduxed by \a func
@@ -272,80 +266,126 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
       * \sa class VectorwiseOp, DenseBase::colwise(), DenseBase::rowwise()
       */
     template<typename BinaryOp>
+    EIGEN_DEVICE_FUNC
     const typename ReduxReturnType<BinaryOp>::Type
     redux(const BinaryOp& func = BinaryOp()) const
-    { return typename ReduxReturnType<BinaryOp>::Type(_expression(), func); }
+    { return typename ReduxReturnType<BinaryOp>::Type(_expression(), internal::member_redux<BinaryOp,Scalar>(func)); }
+
+    typedef typename ReturnType<internal::member_minCoeff>::Type MinCoeffReturnType;
+    typedef typename ReturnType<internal::member_maxCoeff>::Type MaxCoeffReturnType;
+    typedef typename ReturnType<internal::member_squaredNorm,RealScalar>::Type SquaredNormReturnType;
+    typedef typename ReturnType<internal::member_norm,RealScalar>::Type NormReturnType;
+    typedef typename ReturnType<internal::member_blueNorm,RealScalar>::Type BlueNormReturnType;
+    typedef typename ReturnType<internal::member_stableNorm,RealScalar>::Type StableNormReturnType;
+    typedef typename ReturnType<internal::member_hypotNorm,RealScalar>::Type HypotNormReturnType;
+    typedef typename ReturnType<internal::member_sum>::Type SumReturnType;
+    typedef typename ReturnType<internal::member_mean>::Type MeanReturnType;
+    typedef typename ReturnType<internal::member_all>::Type AllReturnType;
+    typedef typename ReturnType<internal::member_any>::Type AnyReturnType;
+    typedef PartialReduxExpr<ExpressionType, internal::member_count<Index>, Direction> CountReturnType;
+    typedef typename ReturnType<internal::member_prod>::Type ProdReturnType;
+    typedef Reverse<const ExpressionType, Direction> ConstReverseReturnType;
+    typedef Reverse<ExpressionType, Direction> ReverseReturnType;
+
+    template<int p> struct LpNormReturnType {
+      typedef PartialReduxExpr<ExpressionType, internal::member_lpnorm<p,RealScalar>,Direction> Type;
+    };
 
     /** \returns a row (or column) vector expression of the smallest coefficient
       * of each column (or row) of the referenced expression.
-      * 
+      *
       * \warning the result is undefined if \c *this contains NaN.
       *
       * Example: \include PartialRedux_minCoeff.cpp
       * Output: \verbinclude PartialRedux_minCoeff.out
       *
       * \sa DenseBase::minCoeff() */
-    const typename ReturnType<internal::member_minCoeff>::Type minCoeff() const
-    { return _expression(); }
+    EIGEN_DEVICE_FUNC
+    const MinCoeffReturnType minCoeff() const
+    { return MinCoeffReturnType(_expression()); }
 
     /** \returns a row (or column) vector expression of the largest coefficient
       * of each column (or row) of the referenced expression.
-      * 
+      *
       * \warning the result is undefined if \c *this contains NaN.
       *
       * Example: \include PartialRedux_maxCoeff.cpp
       * Output: \verbinclude PartialRedux_maxCoeff.out
       *
       * \sa DenseBase::maxCoeff() */
-    const typename ReturnType<internal::member_maxCoeff>::Type maxCoeff() const
-    { return _expression(); }
+    EIGEN_DEVICE_FUNC
+    const MaxCoeffReturnType maxCoeff() const
+    { return MaxCoeffReturnType(_expression()); }
 
     /** \returns a row (or column) vector expression of the squared norm
       * of each column (or row) of the referenced expression.
+      * This is a vector with real entries, even if the original matrix has complex entries.
       *
       * Example: \include PartialRedux_squaredNorm.cpp
       * Output: \verbinclude PartialRedux_squaredNorm.out
       *
       * \sa DenseBase::squaredNorm() */
-    const typename ReturnType<internal::member_squaredNorm,RealScalar>::Type squaredNorm() const
-    { return _expression(); }
+    EIGEN_DEVICE_FUNC
+    const SquaredNormReturnType squaredNorm() const
+    { return SquaredNormReturnType(_expression()); }
+
+    /** \returns a row (or column) vector expression of the norm
+      * of each column (or row) of the referenced expression.
+      * This is a vector with real entries, even if the original matrix has complex entries.
+      *
+      * Example: \include PartialRedux_norm.cpp
+      * Output: \verbinclude PartialRedux_norm.out
+      *
+      * \sa DenseBase::norm() */
+    EIGEN_DEVICE_FUNC
+    const NormReturnType norm() const
+    { return NormReturnType(_expression()); }
 
     /** \returns a row (or column) vector expression of the norm
       * of each column (or row) of the referenced expression.
+      * This is a vector with real entries, even if the original matrix has complex entries.
       *
       * Example: \include PartialRedux_norm.cpp
       * Output: \verbinclude PartialRedux_norm.out
       *
       * \sa DenseBase::norm() */
-    const typename ReturnType<internal::member_norm,RealScalar>::Type norm() const
-    { return _expression(); }
+    template<int p>
+    EIGEN_DEVICE_FUNC
+    const typename LpNormReturnType<p>::Type lpNorm() const
+    { return typename LpNormReturnType<p>::Type(_expression()); }
 
 
     /** \returns a row (or column) vector expression of the norm
       * of each column (or row) of the referenced expression, using
-      * blue's algorithm.
+      * Blue's algorithm.
+      * This is a vector with real entries, even if the original matrix has complex entries.
       *
       * \sa DenseBase::blueNorm() */
-    const typename ReturnType<internal::member_blueNorm,RealScalar>::Type blueNorm() const
-    { return _expression(); }
+    EIGEN_DEVICE_FUNC
+    const BlueNormReturnType blueNorm() const
+    { return BlueNormReturnType(_expression()); }
 
 
     /** \returns a row (or column) vector expression of the norm
       * of each column (or row) of the referenced expression, avoiding
       * underflow and overflow.
+      * This is a vector with real entries, even if the original matrix has complex entries.
       *
       * \sa DenseBase::stableNorm() */
-    const typename ReturnType<internal::member_stableNorm,RealScalar>::Type stableNorm() const
-    { return _expression(); }
+    EIGEN_DEVICE_FUNC
+    const StableNormReturnType stableNorm() const
+    { return StableNormReturnType(_expression()); }
 
 
     /** \returns a row (or column) vector expression of the norm
       * of each column (or row) of the referenced expression, avoiding
       * underflow and overflow using a concatenation of hypot() calls.
+      * This is a vector with real entries, even if the original matrix has complex entries.
       *
       * \sa DenseBase::hypotNorm() */
-    const typename ReturnType<internal::member_hypotNorm,RealScalar>::Type hypotNorm() const
-    { return _expression(); }
+    EIGEN_DEVICE_FUNC
+    const HypotNormReturnType hypotNorm() const
+    { return HypotNormReturnType(_expression()); }
 
     /** \returns a row (or column) vector expression of the sum
       * of each column (or row) of the referenced expression.
@@ -354,39 +394,48 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
       * Output: \verbinclude PartialRedux_sum.out
       *
       * \sa DenseBase::sum() */
-    const typename ReturnType<internal::member_sum>::Type sum() const
-    { return _expression(); }
+    EIGEN_DEVICE_FUNC
+    const SumReturnType sum() const
+    { return SumReturnType(_expression()); }
 
     /** \returns a row (or column) vector expression of the mean
     * of each column (or row) of the referenced expression.
     *
     * \sa DenseBase::mean() */
-    const typename ReturnType<internal::member_mean>::Type mean() const
-    { return _expression(); }
+    EIGEN_DEVICE_FUNC
+    const MeanReturnType mean() const
+    { return MeanReturnType(_expression()); }
 
     /** \returns a row (or column) vector expression representing
       * whether \b all coefficients of each respective column (or row) are \c true.
+      * This expression can be assigned to a vector with entries of type \c bool.
       *
       * \sa DenseBase::all() */
-    const typename ReturnType<internal::member_all>::Type all() const
-    { return _expression(); }
+    EIGEN_DEVICE_FUNC
+    const AllReturnType all() const
+    { return AllReturnType(_expression()); }
 
     /** \returns a row (or column) vector expression representing
       * whether \b at \b least one coefficient of each respective column (or row) is \c true.
+      * This expression can be assigned to a vector with entries of type \c bool.
       *
       * \sa DenseBase::any() */
-    const typename ReturnType<internal::member_any>::Type any() const
-    { return _expression(); }
+    EIGEN_DEVICE_FUNC
+    const AnyReturnType any() const
+    { return AnyReturnType(_expression()); }
 
     /** \returns a row (or column) vector expression representing
       * the number of \c true coefficients of each respective column (or row).
+      * This expression can be assigned to a vector whose entries have the same type as is used to
+      * index entries of the original matrix; for dense matrices, this is \c std::ptrdiff_t .
       *
       * Example: \include PartialRedux_count.cpp
       * Output: \verbinclude PartialRedux_count.out
       *
       * \sa DenseBase::count() */
-    const PartialReduxExpr<ExpressionType, internal::member_count<Index>, Direction> count() const
-    { return _expression(); }
+    EIGEN_DEVICE_FUNC
+    const CountReturnType count() const
+    { return CountReturnType(_expression()); }
 
     /** \returns a row (or column) vector expression of the product
       * of each column (or row) of the referenced expression.
@@ -395,8 +444,9 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
       * Output: \verbinclude PartialRedux_prod.out
       *
       * \sa DenseBase::prod() */
-    const typename ReturnType<internal::member_prod>::Type prod() const
-    { return _expression(); }
+    EIGEN_DEVICE_FUNC
+    const ProdReturnType prod() const
+    { return ProdReturnType(_expression()); }
 
 
     /** \returns a matrix expression
@@ -406,10 +456,20 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
       * Output: \verbinclude Vectorwise_reverse.out
       *
       * \sa DenseBase::reverse() */
-    const Reverse<ExpressionType, Direction> reverse() const
-    { return Reverse<ExpressionType, Direction>( _expression() ); }
+    EIGEN_DEVICE_FUNC
+    const ConstReverseReturnType reverse() const
+    { return ConstReverseReturnType( _expression() ); }
 
-    typedef Replicate<ExpressionType,Direction==Vertical?Dynamic:1,Direction==Horizontal?Dynamic:1> ReplicateReturnType;
+    /** \returns a writable matrix expression
+      * where each column (or row) are reversed.
+      *
+      * \sa reverse() const */
+    EIGEN_DEVICE_FUNC
+    ReverseReturnType reverse()
+    { return ReverseReturnType( _expression() ); }
+
+    typedef Replicate<ExpressionType,(isVertical?Dynamic:1),(isHorizontal?Dynamic:1)> ReplicateReturnType;
+    EIGEN_DEVICE_FUNC
     const ReplicateReturnType replicate(Index factor) const;
 
     /**
@@ -421,17 +481,20 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
       * \sa VectorwiseOp::replicate(Index), DenseBase::replicate(), class Replicate
       */
     // NOTE implemented here because of sunstudio's compilation errors
-    template<int Factor> const Replicate<ExpressionType,(IsVertical?Factor:1),(IsHorizontal?Factor:1)>
+    // isVertical*Factor+isHorizontal instead of (isVertical?Factor:1) to handle CUDA bug with ternary operator
+    template<int Factor> const Replicate<ExpressionType,isVertical*Factor+isHorizontal,isHorizontal*Factor+isVertical>
+    EIGEN_DEVICE_FUNC
     replicate(Index factor = Factor) const
     {
-      return Replicate<ExpressionType,Direction==Vertical?Factor:1,Direction==Horizontal?Factor:1>
-          (_expression(),Direction==Vertical?factor:1,Direction==Horizontal?factor:1);
+      return Replicate<ExpressionType,(isVertical?Factor:1),(isHorizontal?Factor:1)>
+          (_expression(),isVertical?factor:1,isHorizontal?factor:1);
     }
 
 /////////// Artithmetic operators ///////////
 
     /** Copies the vector \a other to each subvector of \c *this */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     ExpressionType& operator=(const DenseBase<OtherDerived>& other)
     {
       EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
@@ -442,6 +505,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
 
     /** Adds the vector \a other to each subvector of \c *this */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     ExpressionType& operator+=(const DenseBase<OtherDerived>& other)
     {
       EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
@@ -451,6 +515,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
 
     /** Substracts the vector \a other to each subvector of \c *this */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     ExpressionType& operator-=(const DenseBase<OtherDerived>& other)
     {
       EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
@@ -460,6 +525,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
 
     /** Multiples each subvector of \c *this by the vector \a other */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     ExpressionType& operator*=(const DenseBase<OtherDerived>& other)
     {
       EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
@@ -471,6 +537,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
 
     /** Divides each subvector of \c *this by the vector \a other */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     ExpressionType& operator/=(const DenseBase<OtherDerived>& other)
     {
       EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
@@ -481,8 +548,8 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
     }
 
     /** Returns the expression of the sum of the vector \a other to each subvector of \c *this */
-    template<typename OtherDerived> EIGEN_STRONG_INLINE
-    CwiseBinaryOp<internal::scalar_sum_op<Scalar>,
+    template<typename OtherDerived> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+    CwiseBinaryOp<internal::scalar_sum_op<Scalar,typename OtherDerived::Scalar>,
                   const ExpressionTypeNestedCleaned,
                   const typename ExtendedType<OtherDerived>::Type>
     operator+(const DenseBase<OtherDerived>& other) const
@@ -494,7 +561,8 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
 
     /** Returns the expression of the difference between each subvector of \c *this and the vector \a other */
     template<typename OtherDerived>
-    CwiseBinaryOp<internal::scalar_difference_op<Scalar>,
+    EIGEN_DEVICE_FUNC
+    CwiseBinaryOp<internal::scalar_difference_op<Scalar,typename OtherDerived::Scalar>,
                   const ExpressionTypeNestedCleaned,
                   const typename ExtendedType<OtherDerived>::Type>
     operator-(const DenseBase<OtherDerived>& other) const
@@ -506,10 +574,11 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
 
     /** Returns the expression where each subvector is the product of the vector \a other
       * by the corresponding subvector of \c *this */
-    template<typename OtherDerived> EIGEN_STRONG_INLINE
+    template<typename OtherDerived> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
     CwiseBinaryOp<internal::scalar_product_op<Scalar>,
                   const ExpressionTypeNestedCleaned,
                   const typename ExtendedType<OtherDerived>::Type>
+    EIGEN_DEVICE_FUNC
     operator*(const DenseBase<OtherDerived>& other) const
     {
       EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
@@ -521,6 +590,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
     /** Returns the expression where each subvector is the quotient of the corresponding
       * subvector of \c *this by the vector \a other */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     CwiseBinaryOp<internal::scalar_quotient_op<Scalar>,
                   const ExpressionTypeNestedCleaned,
                   const typename ExtendedType<OtherDerived>::Type>
@@ -531,32 +601,36 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
       EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
       return m_matrix / extendedTo(other.derived());
     }
-    
-    /** \returns an expression where each column of row of the referenced matrix are normalized.
+
+    /** \returns an expression where each column (or row) of the referenced matrix are normalized.
       * The referenced matrix is \b not modified.
       * \sa MatrixBase::normalized(), normalize()
       */
+    EIGEN_DEVICE_FUNC
     CwiseBinaryOp<internal::scalar_quotient_op<Scalar>,
                   const ExpressionTypeNestedCleaned,
                   const typename OppositeExtendedType<typename ReturnType<internal::member_norm,RealScalar>::Type>::Type>
     normalized() const { return m_matrix.cwiseQuotient(extendedToOpposite(this->norm())); }
-    
-    
+
+
     /** Normalize in-place each row or columns of the referenced matrix.
       * \sa MatrixBase::normalize(), normalized()
       */
-    void normalize() {
+    EIGEN_DEVICE_FUNC void normalize() {
       m_matrix = this->normalized();
     }
 
+    EIGEN_DEVICE_FUNC inline void reverseInPlace();
+
 /////////// Geometry module ///////////
 
-    #if EIGEN2_SUPPORT_STAGE > STAGE20_RESOLVE_API_CONFLICTS
-    Homogeneous<ExpressionType,Direction> homogeneous() const;
-    #endif
+    typedef Homogeneous<ExpressionType,Direction> HomogeneousReturnType;
+    EIGEN_DEVICE_FUNC
+    HomogeneousReturnType homogeneous() const;
 
     typedef typename ExpressionType::PlainObject CrossReturnType;
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     const CrossReturnType cross(const MatrixBase<OtherDerived>& other) const;
 
     enum {
@@ -581,25 +655,15 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
                   Direction==Horizontal ? HNormalized_SizeMinusOne : 1> >
             HNormalizedReturnType;
 
+    EIGEN_DEVICE_FUNC
     const HNormalizedReturnType hnormalized() const;
 
   protected:
     ExpressionTypeNested m_matrix;
 };
 
-/** \returns a VectorwiseOp wrapper of *this providing additional partial reduction operations
-  *
-  * Example: \include MatrixBase_colwise.cpp
-  * Output: \verbinclude MatrixBase_colwise.out
-  *
-  * \sa rowwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
-  */
-template<typename Derived>
-inline const typename DenseBase<Derived>::ConstColwiseReturnType
-DenseBase<Derived>::colwise() const
-{
-  return derived();
-}
+//const colwise moved to DenseBase.h due to CUDA compiler bug
+
 
 /** \returns a writable VectorwiseOp wrapper of *this providing additional partial reduction operations
   *
@@ -609,22 +673,11 @@ template<typename Derived>
 inline typename DenseBase<Derived>::ColwiseReturnType
 DenseBase<Derived>::colwise()
 {
-  return derived();
+  return ColwiseReturnType(derived());
 }
 
-/** \returns a VectorwiseOp wrapper of *this providing additional partial reduction operations
-  *
-  * Example: \include MatrixBase_rowwise.cpp
-  * Output: \verbinclude MatrixBase_rowwise.out
-  *
-  * \sa colwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
-  */
-template<typename Derived>
-inline const typename DenseBase<Derived>::ConstRowwiseReturnType
-DenseBase<Derived>::rowwise() const
-{
-  return derived();
-}
+//const rowwise moved to DenseBase.h due to CUDA compiler bug
+
 
 /** \returns a writable VectorwiseOp wrapper of *this providing additional partial reduction operations
   *
@@ -634,7 +687,7 @@ template<typename Derived>
 inline typename DenseBase<Derived>::RowwiseReturnType
 DenseBase<Derived>::rowwise()
 {
-  return derived();
+  return RowwiseReturnType(derived());
 }
 
 } // end namespace Eigen
diff --git a/Eigen/src/Core/Visitor.h b/Eigen/src/Core/Visitor.h
index 64867b7a2..54c1883d9 100644
--- a/Eigen/src/Core/Visitor.h
+++ b/Eigen/src/Core/Visitor.h
@@ -22,6 +22,7 @@ struct visitor_impl
     row = (UnrollCount-1) % Derived::RowsAtCompileTime
   };
 
+  EIGEN_DEVICE_FUNC
   static inline void run(const Derived &mat, Visitor& visitor)
   {
     visitor_impl<Visitor, Derived, UnrollCount-1>::run(mat, visitor);
@@ -32,6 +33,7 @@ struct visitor_impl
 template<typename Visitor, typename Derived>
 struct visitor_impl<Visitor, Derived, 1>
 {
+  EIGEN_DEVICE_FUNC
   static inline void run(const Derived &mat, Visitor& visitor)
   {
     return visitor.init(mat.coeff(0, 0), 0, 0);
@@ -41,7 +43,7 @@ struct visitor_impl<Visitor, Derived, 1>
 template<typename Visitor, typename Derived>
 struct visitor_impl<Visitor, Derived, Dynamic>
 {
-  typedef typename Derived::Index Index;
+  EIGEN_DEVICE_FUNC
   static inline void run(const Derived& mat, Visitor& visitor)
   {
     visitor.init(mat.coeff(0,0), 0, 0);
@@ -53,6 +55,33 @@ struct visitor_impl<Visitor, Derived, Dynamic>
   }
 };
 
+// evaluator adaptor
+template<typename XprType>
+class visitor_evaluator
+{
+public:
+  EIGEN_DEVICE_FUNC
+  explicit visitor_evaluator(const XprType &xpr) : m_evaluator(xpr), m_xpr(xpr) {}
+  
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  
+  enum {
+    RowsAtCompileTime = XprType::RowsAtCompileTime,
+    CoeffReadCost = internal::evaluator<XprType>::CoeffReadCost
+  };
+  
+  EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); }
+  EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); }
+  EIGEN_DEVICE_FUNC Index size() const { return m_xpr.size(); }
+
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
+  { return m_evaluator.coeff(row, col); }
+  
+protected:
+  internal::evaluator<XprType> m_evaluator;
+  const XprType &m_xpr;
+};
 } // end namespace internal
 
 /** Applies the visitor \a visitor to the whole coefficients of the matrix or vector.
@@ -74,16 +103,17 @@ struct visitor_impl<Visitor, Derived, Dynamic>
   */
 template<typename Derived>
 template<typename Visitor>
+EIGEN_DEVICE_FUNC
 void DenseBase<Derived>::visit(Visitor& visitor) const
 {
-  enum { unroll = SizeAtCompileTime != Dynamic
-                   && CoeffReadCost != Dynamic
-                   && (SizeAtCompileTime == 1 || internal::functor_traits<Visitor>::Cost != Dynamic)
-                   && SizeAtCompileTime * CoeffReadCost + (SizeAtCompileTime-1) * internal::functor_traits<Visitor>::Cost
-                      <= EIGEN_UNROLLING_LIMIT };
-  return internal::visitor_impl<Visitor, Derived,
-      unroll ? int(SizeAtCompileTime) : Dynamic
-    >::run(derived(), visitor);
+  typedef typename internal::visitor_evaluator<Derived> ThisEvaluator;
+  ThisEvaluator thisEval(derived());
+  
+  enum {
+    unroll =  SizeAtCompileTime != Dynamic
+           && SizeAtCompileTime * ThisEvaluator::CoeffReadCost + (SizeAtCompileTime-1) * internal::functor_traits<Visitor>::Cost <= EIGEN_UNROLLING_LIMIT
+  };
+  return internal::visitor_impl<Visitor, ThisEvaluator, unroll ? int(SizeAtCompileTime) : Dynamic>::run(thisEval, visitor);
 }
 
 namespace internal {
@@ -94,10 +124,10 @@ namespace internal {
 template <typename Derived>
 struct coeff_visitor
 {
-  typedef typename Derived::Index Index;
   typedef typename Derived::Scalar Scalar;
   Index row, col;
   Scalar res;
+  EIGEN_DEVICE_FUNC
   inline void init(const Scalar& value, Index i, Index j)
   {
     res = value;
@@ -114,8 +144,8 @@ struct coeff_visitor
 template <typename Derived>
 struct min_coeff_visitor : coeff_visitor<Derived>
 {
-  typedef typename Derived::Index Index;
   typedef typename Derived::Scalar Scalar;
+  EIGEN_DEVICE_FUNC
   void operator() (const Scalar& value, Index i, Index j)
   {
     if(value < this->res)
@@ -142,8 +172,8 @@ struct functor_traits<min_coeff_visitor<Scalar> > {
 template <typename Derived>
 struct max_coeff_visitor : coeff_visitor<Derived>
 {
-  typedef typename Derived::Index Index;
-  typedef typename Derived::Scalar Scalar;
+  typedef typename Derived::Scalar Scalar; 
+  EIGEN_DEVICE_FUNC
   void operator() (const Scalar& value, Index i, Index j)
   {
     if(value > this->res)
@@ -164,13 +194,15 @@ struct functor_traits<max_coeff_visitor<Scalar> > {
 
 } // end namespace internal
 
-/** \returns the minimum of all coefficients of *this and puts in *row and *col its location.
+/** \fn DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
+  * \returns the minimum of all coefficients of *this and puts in *row and *col its location.
   * \warning the result is undefined if \c *this contains NaN.
   *
-  * \sa DenseBase::minCoeff(Index*), DenseBase::maxCoeff(Index*,Index*), DenseBase::visitor(), DenseBase::minCoeff()
+  * \sa DenseBase::minCoeff(Index*), DenseBase::maxCoeff(Index*,Index*), DenseBase::visit(), DenseBase::minCoeff()
   */
 template<typename Derived>
 template<typename IndexType>
+EIGEN_DEVICE_FUNC
 typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
 {
@@ -184,27 +216,30 @@ DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
 /** \returns the minimum of all coefficients of *this and puts in *index its location.
   * \warning the result is undefined if \c *this contains NaN. 
   *
-  * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::visitor(), DenseBase::minCoeff()
+  * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::minCoeff()
   */
 template<typename Derived>
 template<typename IndexType>
+EIGEN_DEVICE_FUNC
 typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::minCoeff(IndexType* index) const
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   internal::min_coeff_visitor<Derived> minVisitor;
   this->visit(minVisitor);
-  *index = (RowsAtCompileTime==1) ? minVisitor.col : minVisitor.row;
+  *index = IndexType((RowsAtCompileTime==1) ? minVisitor.col : minVisitor.row);
   return minVisitor.res;
 }
 
-/** \returns the maximum of all coefficients of *this and puts in *row and *col its location.
+/** \fn DenseBase<Derived>::maxCoeff(IndexType* rowId, IndexType* colId) const
+  * \returns the maximum of all coefficients of *this and puts in *row and *col its location.
   * \warning the result is undefined if \c *this contains NaN. 
   *
-  * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visitor(), DenseBase::maxCoeff()
+  * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::maxCoeff()
   */
 template<typename Derived>
 template<typename IndexType>
+EIGEN_DEVICE_FUNC
 typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::maxCoeff(IndexType* rowPtr, IndexType* colPtr) const
 {
@@ -222,6 +257,7 @@ DenseBase<Derived>::maxCoeff(IndexType* rowPtr, IndexType* colPtr) const
   */
 template<typename Derived>
 template<typename IndexType>
+EIGEN_DEVICE_FUNC
 typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::maxCoeff(IndexType* index) const
 {
diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h
new file mode 100644
index 000000000..99439c8aa
--- /dev/null
+++ b/Eigen/src/Core/arch/AVX/Complex.h
@@ -0,0 +1,483 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPLEX_AVX_H
+#define EIGEN_COMPLEX_AVX_H
+
+namespace Eigen {
+
+namespace internal {
+
+//---------- float ----------
+struct Packet4cf
+{
+  EIGEN_STRONG_INLINE Packet4cf() {}
+  EIGEN_STRONG_INLINE explicit Packet4cf(const __m256& a) : v(a) {}
+  __m256  v;
+};
+
+template<> struct packet_traits<std::complex<float> >  : default_packet_traits
+{
+  typedef Packet4cf type;
+  typedef Packet2cf half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+    HasHalfPacket = 1,
+
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 1,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasSetLinear = 0
+  };
+};
+
+template<> struct unpacket_traits<Packet4cf> { typedef std::complex<float> type; enum {size=4, alignment=Aligned32}; typedef Packet2cf half; };
+
+template<> EIGEN_STRONG_INLINE Packet4cf padd<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_add_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cf psub<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_sub_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cf pnegate(const Packet4cf& a)
+{
+  return Packet4cf(pnegate(a.v));
+}
+template<> EIGEN_STRONG_INLINE Packet4cf pconj(const Packet4cf& a)
+{
+  const __m256 mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000));
+  return Packet4cf(_mm256_xor_ps(a.v,mask));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf pmul<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
+{
+  __m256 tmp1 = _mm256_mul_ps(_mm256_moveldup_ps(a.v), b.v);
+  __m256 tmp2 = _mm256_mul_ps(_mm256_movehdup_ps(a.v), _mm256_permute_ps(b.v, _MM_SHUFFLE(2,3,0,1)));
+  __m256 result = _mm256_addsub_ps(tmp1, tmp2);
+  return Packet4cf(result);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf pand   <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cf por    <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cf pxor   <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cf pandnot<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_andnot_ps(a.v,b.v)); }
+
+template<> EIGEN_STRONG_INLINE Packet4cf pload <Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet4cf(pload<Packet8f>(&numext::real_ref(*from))); }
+template<> EIGEN_STRONG_INLINE Packet4cf ploadu<Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cf(ploadu<Packet8f>(&numext::real_ref(*from))); }
+
+
+template<> EIGEN_STRONG_INLINE Packet4cf pset1<Packet4cf>(const std::complex<float>& from)
+{
+  return Packet4cf(_mm256_castpd_ps(_mm256_broadcast_sd((const double*)(const void*)&from)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf ploaddup<Packet4cf>(const std::complex<float>* from)
+{
+  // FIXME The following might be optimized using _mm256_movedup_pd
+  Packet2cf a = ploaddup<Packet2cf>(from);
+  Packet2cf b = ploaddup<Packet2cf>(from+1);
+  return  Packet4cf(_mm256_insertf128_ps(_mm256_castps128_ps256(a.v), b.v, 1));
+}
+
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float>* to, const Packet4cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet4cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); }
+
+template<> EIGEN_DEVICE_FUNC inline Packet4cf pgather<std::complex<float>, Packet4cf>(const std::complex<float>* from, Index stride)
+{
+  return Packet4cf(_mm256_set_ps(std::imag(from[3*stride]), std::real(from[3*stride]),
+                                 std::imag(from[2*stride]), std::real(from[2*stride]),
+                                 std::imag(from[1*stride]), std::real(from[1*stride]),
+                                 std::imag(from[0*stride]), std::real(from[0*stride])));
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet4cf>(std::complex<float>* to, const Packet4cf& from, Index stride)
+{
+  __m128 low = _mm256_extractf128_ps(from.v, 0);
+  to[stride*0] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 0)),
+                                     _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1)));
+  to[stride*1] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 2)),
+                                     _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3)));
+
+  __m128 high = _mm256_extractf128_ps(from.v, 1);
+  to[stride*2] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 0)),
+                                     _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1)));
+  to[stride*3] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 2)),
+                                     _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3)));
+
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet4cf>(const Packet4cf& a)
+{
+  return pfirst(Packet2cf(_mm256_castps256_ps128(a.v)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf preverse(const Packet4cf& a) {
+  __m128 low  = _mm256_extractf128_ps(a.v, 0);
+  __m128 high = _mm256_extractf128_ps(a.v, 1);
+  __m128d lowd  = _mm_castps_pd(low);
+  __m128d highd = _mm_castps_pd(high);
+  low  = _mm_castpd_ps(_mm_shuffle_pd(lowd,lowd,0x1));
+  high = _mm_castpd_ps(_mm_shuffle_pd(highd,highd,0x1));
+  __m256 result = _mm256_setzero_ps();
+  result = _mm256_insertf128_ps(result, low, 1);
+  result = _mm256_insertf128_ps(result, high, 0);
+  return Packet4cf(result);
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet4cf>(const Packet4cf& a)
+{
+  return predux(padd(Packet2cf(_mm256_extractf128_ps(a.v,0)),
+                     Packet2cf(_mm256_extractf128_ps(a.v,1))));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf preduxp<Packet4cf>(const Packet4cf* vecs)
+{
+  Packet8f t0 = _mm256_shuffle_ps(vecs[0].v, vecs[0].v, _MM_SHUFFLE(3, 1, 2 ,0));
+  Packet8f t1 = _mm256_shuffle_ps(vecs[1].v, vecs[1].v, _MM_SHUFFLE(3, 1, 2 ,0));
+  t0 = _mm256_hadd_ps(t0,t1);
+  Packet8f t2 = _mm256_shuffle_ps(vecs[2].v, vecs[2].v, _MM_SHUFFLE(3, 1, 2 ,0));
+  Packet8f t3 = _mm256_shuffle_ps(vecs[3].v, vecs[3].v, _MM_SHUFFLE(3, 1, 2 ,0));
+  t2 = _mm256_hadd_ps(t2,t3);
+  
+  t1 = _mm256_permute2f128_ps(t0,t2, 0 + (2<<4));
+  t3 = _mm256_permute2f128_ps(t0,t2, 1 + (3<<4));
+
+  return Packet4cf(_mm256_add_ps(t1,t3));
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet4cf>(const Packet4cf& a)
+{
+  return predux_mul(pmul(Packet2cf(_mm256_extractf128_ps(a.v, 0)),
+                         Packet2cf(_mm256_extractf128_ps(a.v, 1))));
+}
+
+template<int Offset>
+struct palign_impl<Offset,Packet4cf>
+{
+  static EIGEN_STRONG_INLINE void run(Packet4cf& first, const Packet4cf& second)
+  {
+    if (Offset==0) return;
+    palign_impl<Offset*2,Packet8f>::run(first.v, second.v);
+  }
+};
+
+template<> struct conj_helper<Packet4cf, Packet4cf, false,true>
+{
+  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const
+  {
+    return internal::pmul(a, pconj(b));
+  }
+};
+
+template<> struct conj_helper<Packet4cf, Packet4cf, true,false>
+{
+  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const
+  {
+    return internal::pmul(pconj(a), b);
+  }
+};
+
+template<> struct conj_helper<Packet4cf, Packet4cf, true,true>
+{
+  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const
+  {
+    return pconj(internal::pmul(a, b));
+  }
+};
+
+template<> struct conj_helper<Packet8f, Packet4cf, false,false>
+{
+  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet8f& x, const Packet4cf& y, const Packet4cf& c) const
+  { return padd(c, pmul(x,y)); }
+
+  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet8f& x, const Packet4cf& y) const
+  { return Packet4cf(Eigen::internal::pmul(x, y.v)); }
+};
+
+template<> struct conj_helper<Packet4cf, Packet8f, false,false>
+{
+  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet8f& y, const Packet4cf& c) const
+  { return padd(c, pmul(x,y)); }
+
+  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& x, const Packet8f& y) const
+  { return Packet4cf(Eigen::internal::pmul(x.v, y)); }
+};
+
+template<> EIGEN_STRONG_INLINE Packet4cf pdiv<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
+{
+  Packet4cf num = pmul(a, pconj(b));
+  __m256 tmp = _mm256_mul_ps(b.v, b.v);
+  __m256 tmp2    = _mm256_shuffle_ps(tmp,tmp,0xB1);
+  __m256 denom = _mm256_add_ps(tmp, tmp2);
+  return Packet4cf(_mm256_div_ps(num.v, denom));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf pcplxflip<Packet4cf>(const Packet4cf& x)
+{
+  return Packet4cf(_mm256_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0 ,1)));
+}
+
+//---------- double ----------
+struct Packet2cd
+{
+  EIGEN_STRONG_INLINE Packet2cd() {}
+  EIGEN_STRONG_INLINE explicit Packet2cd(const __m256d& a) : v(a) {}
+  __m256d  v;
+};
+
+template<> struct packet_traits<std::complex<double> >  : default_packet_traits
+{
+  typedef Packet2cd type;
+  typedef Packet1cd half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 0,
+    size = 2,
+    HasHalfPacket = 1,
+
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 1,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasSetLinear = 0
+  };
+};
+
+template<> struct unpacket_traits<Packet2cd> { typedef std::complex<double> type; enum {size=2, alignment=Aligned32}; typedef Packet1cd half; };
+
+template<> EIGEN_STRONG_INLINE Packet2cd padd<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_add_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cd psub<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_sub_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cd pnegate(const Packet2cd& a) { return Packet2cd(pnegate(a.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cd pconj(const Packet2cd& a)
+{
+  const __m256d mask = _mm256_castsi256_pd(_mm256_set_epi32(0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0));
+  return Packet2cd(_mm256_xor_pd(a.v,mask));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cd pmul<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
+{
+  __m256d tmp1 = _mm256_shuffle_pd(a.v,a.v,0x0);
+  __m256d even = _mm256_mul_pd(tmp1, b.v);
+  __m256d tmp2 = _mm256_shuffle_pd(a.v,a.v,0xF);
+  __m256d tmp3 = _mm256_shuffle_pd(b.v,b.v,0x5);
+  __m256d odd  = _mm256_mul_pd(tmp2, tmp3);
+  return Packet2cd(_mm256_addsub_pd(even, odd));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cd pand   <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cd por    <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cd pxor   <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cd pandnot<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_andnot_pd(a.v,b.v)); }
+
+template<> EIGEN_STRONG_INLINE Packet2cd pload <Packet2cd>(const std::complex<double>* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return Packet2cd(pload<Packet4d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE Packet2cd ploadu<Packet2cd>(const std::complex<double>* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cd(ploadu<Packet4d>((const double*)from)); }
+
+template<> EIGEN_STRONG_INLINE Packet2cd pset1<Packet2cd>(const std::complex<double>& from)
+{
+  // in case casting to a __m128d* is really not safe, then we can still fallback to this version: (much slower though)
+//   return Packet2cd(_mm256_loadu2_m128d((const double*)&from,(const double*)&from));
+    return Packet2cd(_mm256_broadcast_pd((const __m128d*)(const void*)&from));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cd ploaddup<Packet2cd>(const std::complex<double>* from) { return pset1<Packet2cd>(*from); }
+
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet2cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet2cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
+
+template<> EIGEN_DEVICE_FUNC inline Packet2cd pgather<std::complex<double>, Packet2cd>(const std::complex<double>* from, Index stride)
+{
+  return Packet2cd(_mm256_set_pd(std::imag(from[1*stride]), std::real(from[1*stride]),
+				 std::imag(from[0*stride]), std::real(from[0*stride])));
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet2cd>(std::complex<double>* to, const Packet2cd& from, Index stride)
+{
+  __m128d low = _mm256_extractf128_pd(from.v, 0);
+  to[stride*0] = std::complex<double>(_mm_cvtsd_f64(low), _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1)));
+  __m128d high = _mm256_extractf128_pd(from.v, 1);
+  to[stride*1] = std::complex<double>(_mm_cvtsd_f64(high), _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1)));
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet2cd>(const Packet2cd& a)
+{
+  __m128d low = _mm256_extractf128_pd(a.v, 0);
+  EIGEN_ALIGN16 double res[2];
+  _mm_store_pd(res, low);
+  return std::complex<double>(res[0],res[1]);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cd preverse(const Packet2cd& a) {
+  __m256d result = _mm256_permute2f128_pd(a.v, a.v, 1);
+  return Packet2cd(result);
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet2cd>(const Packet2cd& a)
+{
+  return predux(padd(Packet1cd(_mm256_extractf128_pd(a.v,0)),
+                     Packet1cd(_mm256_extractf128_pd(a.v,1))));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cd preduxp<Packet2cd>(const Packet2cd* vecs)
+{
+  Packet4d t0 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 0 + (2<<4));
+  Packet4d t1 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 1 + (3<<4));
+
+  return Packet2cd(_mm256_add_pd(t0,t1));
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet2cd>(const Packet2cd& a)
+{
+  return predux(pmul(Packet1cd(_mm256_extractf128_pd(a.v,0)),
+                     Packet1cd(_mm256_extractf128_pd(a.v,1))));
+}
+
+template<int Offset>
+struct palign_impl<Offset,Packet2cd>
+{
+  static EIGEN_STRONG_INLINE void run(Packet2cd& first, const Packet2cd& second)
+  {
+    if (Offset==0) return;
+    palign_impl<Offset*2,Packet4d>::run(first.v, second.v);
+  }
+};
+
+template<> struct conj_helper<Packet2cd, Packet2cd, false,true>
+{
+  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
+  {
+    return internal::pmul(a, pconj(b));
+  }
+};
+
+template<> struct conj_helper<Packet2cd, Packet2cd, true,false>
+{
+  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
+  {
+    return internal::pmul(pconj(a), b);
+  }
+};
+
+template<> struct conj_helper<Packet2cd, Packet2cd, true,true>
+{
+  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
+  {
+    return pconj(internal::pmul(a, b));
+  }
+};
+
+template<> struct conj_helper<Packet4d, Packet2cd, false,false>
+{
+  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet4d& x, const Packet2cd& y, const Packet2cd& c) const
+  { return padd(c, pmul(x,y)); }
+
+  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet4d& x, const Packet2cd& y) const
+  { return Packet2cd(Eigen::internal::pmul(x, y.v)); }
+};
+
+template<> struct conj_helper<Packet2cd, Packet4d, false,false>
+{
+  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet4d& y, const Packet2cd& c) const
+  { return padd(c, pmul(x,y)); }
+
+  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& x, const Packet4d& y) const
+  { return Packet2cd(Eigen::internal::pmul(x.v, y)); }
+};
+
+template<> EIGEN_STRONG_INLINE Packet2cd pdiv<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
+{
+  Packet2cd num = pmul(a, pconj(b));
+  __m256d tmp = _mm256_mul_pd(b.v, b.v);
+  __m256d denom = _mm256_hadd_pd(tmp, tmp);
+  return Packet2cd(_mm256_div_pd(num.v, denom));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cd pcplxflip<Packet2cd>(const Packet2cd& x)
+{
+  return Packet2cd(_mm256_shuffle_pd(x.v, x.v, 0x5));
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4cf,4>& kernel) {
+  __m256d P0 = _mm256_castps_pd(kernel.packet[0].v);
+  __m256d P1 = _mm256_castps_pd(kernel.packet[1].v);
+  __m256d P2 = _mm256_castps_pd(kernel.packet[2].v);
+  __m256d P3 = _mm256_castps_pd(kernel.packet[3].v);
+
+  __m256d T0 = _mm256_shuffle_pd(P0, P1, 15);
+  __m256d T1 = _mm256_shuffle_pd(P0, P1, 0);
+  __m256d T2 = _mm256_shuffle_pd(P2, P3, 15);
+  __m256d T3 = _mm256_shuffle_pd(P2, P3, 0);
+
+  kernel.packet[1].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T0, T2, 32));
+  kernel.packet[3].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T0, T2, 49));
+  kernel.packet[0].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T1, T3, 32));
+  kernel.packet[2].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T1, T3, 49));
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet2cd,2>& kernel) {
+  __m256d tmp = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 0+(2<<4));
+  kernel.packet[1].v = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 1+(3<<4));
+ kernel.packet[0].v = tmp;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf pinsertfirst(const Packet4cf& a, std::complex<float> b)
+{
+  return Packet4cf(_mm256_blend_ps(a.v,pset1<Packet4cf>(b).v,1|2));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cd pinsertfirst(const Packet2cd& a, std::complex<double> b)
+{
+  return Packet2cd(_mm256_blend_pd(a.v,pset1<Packet2cd>(b).v,1|2));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf pinsertlast(const Packet4cf& a, std::complex<float> b)
+{
+  return Packet4cf(_mm256_blend_ps(a.v,pset1<Packet4cf>(b).v,(1<<7)|(1<<6)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cd pinsertlast(const Packet2cd& a, std::complex<double> b)
+{
+  return Packet2cd(_mm256_blend_pd(a.v,pset1<Packet2cd>(b).v,(1<<3)|(1<<2)));
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_COMPLEX_AVX_H
diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h
new file mode 100644
index 000000000..6af67ce2d
--- /dev/null
+++ b/Eigen/src/Core/arch/AVX/MathFunctions.h
@@ -0,0 +1,439 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATH_FUNCTIONS_AVX_H
+#define EIGEN_MATH_FUNCTIONS_AVX_H
+
+/* The sin, cos, exp, and log functions of this file are loosely derived from
+ * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
+ */
+
+namespace Eigen {
+
+namespace internal {
+
+inline Packet8i pshiftleft(Packet8i v, int n)
+{
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_slli_epi32(v, n);
+#else
+  __m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(v, 0), n);
+  __m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(v, 1), n);
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
+
+inline Packet8f pshiftright(Packet8f v, int n)
+{
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_cvtepi32_ps(_mm256_srli_epi32(_mm256_castps_si256(v), n));
+#else
+  __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 0), n);
+  __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 1), n);
+  return _mm256_cvtepi32_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1));
+#endif
+}
+
+// Sine function
+// Computes sin(x) by wrapping x to the interval [-Pi/4,3*Pi/4] and
+// evaluating interpolants in [-Pi/4,Pi/4] or [Pi/4,3*Pi/4]. The interpolants
+// are (anti-)symmetric and thus have only odd/even coefficients
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
+psin<Packet8f>(const Packet8f& _x) {
+  Packet8f x = _x;
+
+  // Some useful values.
+  _EIGEN_DECLARE_CONST_Packet8i(one, 1);
+  _EIGEN_DECLARE_CONST_Packet8f(one, 1.0f);
+  _EIGEN_DECLARE_CONST_Packet8f(two, 2.0f);
+  _EIGEN_DECLARE_CONST_Packet8f(one_over_four, 0.25f);
+  _EIGEN_DECLARE_CONST_Packet8f(one_over_pi, 3.183098861837907e-01f);
+  _EIGEN_DECLARE_CONST_Packet8f(neg_pi_first, -3.140625000000000e+00f);
+  _EIGEN_DECLARE_CONST_Packet8f(neg_pi_second, -9.670257568359375e-04f);
+  _EIGEN_DECLARE_CONST_Packet8f(neg_pi_third, -6.278329571784980e-07f);
+  _EIGEN_DECLARE_CONST_Packet8f(four_over_pi, 1.273239544735163e+00f);
+
+  // Map x from [-Pi/4,3*Pi/4] to z in [-1,3] and subtract the shifted period.
+  Packet8f z = pmul(x, p8f_one_over_pi);
+  Packet8f shift = _mm256_floor_ps(padd(z, p8f_one_over_four));
+  x = pmadd(shift, p8f_neg_pi_first, x);
+  x = pmadd(shift, p8f_neg_pi_second, x);
+  x = pmadd(shift, p8f_neg_pi_third, x);
+  z = pmul(x, p8f_four_over_pi);
+
+  // Make a mask for the entries that need flipping, i.e. wherever the shift
+  // is odd.
+  Packet8i shift_ints = _mm256_cvtps_epi32(shift);
+  Packet8i shift_isodd = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(shift_ints), _mm256_castsi256_ps(p8i_one)));
+  Packet8i sign_flip_mask = pshiftleft(shift_isodd, 31);
+
+  // Create a mask for which interpolant to use, i.e. if z > 1, then the mask
+  // is set to ones for that entry.
+  Packet8f ival_mask = _mm256_cmp_ps(z, p8f_one, _CMP_GT_OQ);
+
+  // Evaluate the polynomial for the interval [1,3] in z.
+  _EIGEN_DECLARE_CONST_Packet8f(coeff_right_0, 9.999999724233232e-01f);
+  _EIGEN_DECLARE_CONST_Packet8f(coeff_right_2, -3.084242535619928e-01f);
+  _EIGEN_DECLARE_CONST_Packet8f(coeff_right_4, 1.584991525700324e-02f);
+  _EIGEN_DECLARE_CONST_Packet8f(coeff_right_6, -3.188805084631342e-04f);
+  Packet8f z_minus_two = psub(z, p8f_two);
+  Packet8f z_minus_two2 = pmul(z_minus_two, z_minus_two);
+  Packet8f right = pmadd(p8f_coeff_right_6, z_minus_two2, p8f_coeff_right_4);
+  right = pmadd(right, z_minus_two2, p8f_coeff_right_2);
+  right = pmadd(right, z_minus_two2, p8f_coeff_right_0);
+
+  // Evaluate the polynomial for the interval [-1,1] in z.
+  _EIGEN_DECLARE_CONST_Packet8f(coeff_left_1, 7.853981525427295e-01f);
+  _EIGEN_DECLARE_CONST_Packet8f(coeff_left_3, -8.074536727092352e-02f);
+  _EIGEN_DECLARE_CONST_Packet8f(coeff_left_5, 2.489871967827018e-03f);
+  _EIGEN_DECLARE_CONST_Packet8f(coeff_left_7, -3.587725841214251e-05f);
+  Packet8f z2 = pmul(z, z);
+  Packet8f left = pmadd(p8f_coeff_left_7, z2, p8f_coeff_left_5);
+  left = pmadd(left, z2, p8f_coeff_left_3);
+  left = pmadd(left, z2, p8f_coeff_left_1);
+  left = pmul(left, z);
+
+  // Assemble the results, i.e. select the left and right polynomials.
+  left = _mm256_andnot_ps(ival_mask, left);
+  right = _mm256_and_ps(ival_mask, right);
+  Packet8f res = _mm256_or_ps(left, right);
+
+  // Flip the sign on the odd intervals and return the result.
+  res = _mm256_xor_ps(res, _mm256_castsi256_ps(sign_flip_mask));
+  return res;
+}
+
+// Natural logarithm
+// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2)
+// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can
+// be easily approximated by a polynomial centered on m=1 for stability.
+// TODO(gonnet): Further reduce the interval allowing for lower-degree
+//               polynomial interpolants -> ... -> profit!
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
+plog<Packet8f>(const Packet8f& _x) {
+  Packet8f x = _x;
+  _EIGEN_DECLARE_CONST_Packet8f(1, 1.0f);
+  _EIGEN_DECLARE_CONST_Packet8f(half, 0.5f);
+  _EIGEN_DECLARE_CONST_Packet8f(126f, 126.0f);
+
+  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inv_mant_mask, ~0x7f800000);
+
+  // The smallest non denormalized float number.
+  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(min_norm_pos, 0x00800000);
+  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(minus_inf, 0xff800000);
+
+  // Polynomial coefficients.
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_SQRTHF, 0.707106781186547524f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p0, 7.0376836292E-2f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p1, -1.1514610310E-1f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p2, 1.1676998740E-1f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p3, -1.2420140846E-1f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p4, +1.4249322787E-1f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p5, -1.6668057665E-1f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p6, +2.0000714765E-1f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p7, -2.4999993993E-1f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p8, +3.3333331174E-1f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_q1, -2.12194440e-4f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_q2, 0.693359375f);
+
+  Packet8f invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_NGE_UQ); // not greater equal is true if x is NaN
+  Packet8f iszero_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_EQ_OQ);
+
+  // Truncate input values to the minimum positive normal.
+  x = pmax(x, p8f_min_norm_pos);
+
+  Packet8f emm0 = pshiftright(x,23);
+  Packet8f e = _mm256_sub_ps(emm0, p8f_126f);
+
+  // Set the exponents to -1, i.e. x are in the range [0.5,1).
+  x = _mm256_and_ps(x, p8f_inv_mant_mask);
+  x = _mm256_or_ps(x, p8f_half);
+
+  // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
+  // and shift by -1. The values are then centered around 0, which improves
+  // the stability of the polynomial evaluation.
+  //   if( x < SQRTHF ) {
+  //     e -= 1;
+  //     x = x + x - 1.0;
+  //   } else { x = x - 1.0; }
+  Packet8f mask = _mm256_cmp_ps(x, p8f_cephes_SQRTHF, _CMP_LT_OQ);
+  Packet8f tmp = _mm256_and_ps(x, mask);
+  x = psub(x, p8f_1);
+  e = psub(e, _mm256_and_ps(p8f_1, mask));
+  x = padd(x, tmp);
+
+  Packet8f x2 = pmul(x, x);
+  Packet8f x3 = pmul(x2, x);
+
+  // Evaluate the polynomial approximant of degree 8 in three parts, probably
+  // to improve instruction-level parallelism.
+  Packet8f y, y1, y2;
+  y = pmadd(p8f_cephes_log_p0, x, p8f_cephes_log_p1);
+  y1 = pmadd(p8f_cephes_log_p3, x, p8f_cephes_log_p4);
+  y2 = pmadd(p8f_cephes_log_p6, x, p8f_cephes_log_p7);
+  y = pmadd(y, x, p8f_cephes_log_p2);
+  y1 = pmadd(y1, x, p8f_cephes_log_p5);
+  y2 = pmadd(y2, x, p8f_cephes_log_p8);
+  y = pmadd(y, x3, y1);
+  y = pmadd(y, x3, y2);
+  y = pmul(y, x3);
+
+  // Add the logarithm of the exponent back to the result of the interpolation.
+  y1 = pmul(e, p8f_cephes_log_q1);
+  tmp = pmul(x2, p8f_half);
+  y = padd(y, y1);
+  x = psub(x, tmp);
+  y2 = pmul(e, p8f_cephes_log_q2);
+  x = padd(x, y);
+  x = padd(x, y2);
+
+  // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF.
+  return _mm256_or_ps(
+      _mm256_andnot_ps(iszero_mask, _mm256_or_ps(x, invalid_mask)),
+      _mm256_and_ps(iszero_mask, p8f_minus_inf));
+}
+
+// Exponential function. Works by writing "x = m*log(2) + r" where
+// "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then
+// "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1).
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
+pexp<Packet8f>(const Packet8f& _x) {
+  _EIGEN_DECLARE_CONST_Packet8f(1, 1.0f);
+  _EIGEN_DECLARE_CONST_Packet8f(half, 0.5f);
+  _EIGEN_DECLARE_CONST_Packet8f(127, 127.0f);
+
+  _EIGEN_DECLARE_CONST_Packet8f(exp_hi, 88.3762626647950f);
+  _EIGEN_DECLARE_CONST_Packet8f(exp_lo, -88.3762626647949f);
+
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_LOG2EF, 1.44269504088896341f);
+
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p0, 1.9875691500E-4f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p1, 1.3981999507E-3f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p2, 8.3334519073E-3f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p3, 4.1665795894E-2f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p4, 1.6666665459E-1f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p5, 5.0000001201E-1f);
+
+  // Clamp x.
+  Packet8f x = pmax(pmin(_x, p8f_exp_hi), p8f_exp_lo);
+
+  // Express exp(x) as exp(m*ln(2) + r), start by extracting
+  // m = floor(x/ln(2) + 0.5).
+  Packet8f m = _mm256_floor_ps(pmadd(x, p8f_cephes_LOG2EF, p8f_half));
+
+// Get r = x - m*ln(2). If no FMA instructions are available, m*ln(2) is
+// subtracted out in two parts, m*C1+m*C2 = m*ln(2), to avoid accumulating
+// truncation errors. Note that we don't use the "pmadd" function here to
+// ensure that a precision-preserving FMA instruction is used.
+#ifdef EIGEN_VECTORIZE_FMA
+  _EIGEN_DECLARE_CONST_Packet8f(nln2, -0.6931471805599453f);
+  Packet8f r = _mm256_fmadd_ps(m, p8f_nln2, x);
+#else
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_C1, 0.693359375f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_C2, -2.12194440e-4f);
+  Packet8f r = psub(x, pmul(m, p8f_cephes_exp_C1));
+  r = psub(r, pmul(m, p8f_cephes_exp_C2));
+#endif
+
+  Packet8f r2 = pmul(r, r);
+
+  // TODO(gonnet): Split into odd/even polynomials and try to exploit
+  //               instruction-level parallelism.
+  Packet8f y = p8f_cephes_exp_p0;
+  y = pmadd(y, r, p8f_cephes_exp_p1);
+  y = pmadd(y, r, p8f_cephes_exp_p2);
+  y = pmadd(y, r, p8f_cephes_exp_p3);
+  y = pmadd(y, r, p8f_cephes_exp_p4);
+  y = pmadd(y, r, p8f_cephes_exp_p5);
+  y = pmadd(y, r2, r);
+  y = padd(y, p8f_1);
+
+  // Build emm0 = 2^m.
+  Packet8i emm0 = _mm256_cvttps_epi32(padd(m, p8f_127));
+  emm0 = pshiftleft(emm0, 23);
+
+  // Return 2^m * exp(r).
+  return pmax(pmul(y, _mm256_castsi256_ps(emm0)), _x);
+}
+
+// Hyperbolic Tangent function.
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
+ptanh<Packet8f>(const Packet8f& x) {
+  return internal::generic_fast_tanh_float(x);
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d
+pexp<Packet4d>(const Packet4d& _x) {
+  Packet4d x = _x;
+
+  _EIGEN_DECLARE_CONST_Packet4d(1, 1.0);
+  _EIGEN_DECLARE_CONST_Packet4d(2, 2.0);
+  _EIGEN_DECLARE_CONST_Packet4d(half, 0.5);
+
+  _EIGEN_DECLARE_CONST_Packet4d(exp_hi, 709.437);
+  _EIGEN_DECLARE_CONST_Packet4d(exp_lo, -709.436139303);
+
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_LOG2EF, 1.4426950408889634073599);
+
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p0, 1.26177193074810590878e-4);
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p1, 3.02994407707441961300e-2);
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p2, 9.99999999999999999910e-1);
+
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q0, 3.00198505138664455042e-6);
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q1, 2.52448340349684104192e-3);
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q2, 2.27265548208155028766e-1);
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q3, 2.00000000000000000009e0);
+
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C1, 0.693145751953125);
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C2, 1.42860682030941723212e-6);
+  _EIGEN_DECLARE_CONST_Packet4i(1023, 1023);
+
+  Packet4d tmp, fx;
+
+  // clamp x
+  x = pmax(pmin(x, p4d_exp_hi), p4d_exp_lo);
+  // Express exp(x) as exp(g + n*log(2)).
+  fx = pmadd(p4d_cephes_LOG2EF, x, p4d_half);
+
+  // Get the integer modulus of log(2), i.e. the "n" described above.
+  fx = _mm256_floor_pd(fx);
+
+  // Get the remainder modulo log(2), i.e. the "g" described above. Subtract
+  // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last
+  // digits right.
+  tmp = pmul(fx, p4d_cephes_exp_C1);
+  Packet4d z = pmul(fx, p4d_cephes_exp_C2);
+  x = psub(x, tmp);
+  x = psub(x, z);
+
+  Packet4d x2 = pmul(x, x);
+
+  // Evaluate the numerator polynomial of the rational interpolant.
+  Packet4d px = p4d_cephes_exp_p0;
+  px = pmadd(px, x2, p4d_cephes_exp_p1);
+  px = pmadd(px, x2, p4d_cephes_exp_p2);
+  px = pmul(px, x);
+
+  // Evaluate the denominator polynomial of the rational interpolant.
+  Packet4d qx = p4d_cephes_exp_q0;
+  qx = pmadd(qx, x2, p4d_cephes_exp_q1);
+  qx = pmadd(qx, x2, p4d_cephes_exp_q2);
+  qx = pmadd(qx, x2, p4d_cephes_exp_q3);
+
+  // I don't really get this bit, copied from the SSE2 routines, so...
+  // TODO(gonnet): Figure out what is going on here, perhaps find a better
+  // rational interpolant?
+  x = _mm256_div_pd(px, psub(qx, px));
+  x = pmadd(p4d_2, x, p4d_1);
+
+  // Build e=2^n by constructing the exponents in a 128-bit vector and
+  // shifting them to where they belong in double-precision values.
+  __m128i emm0 = _mm256_cvtpd_epi32(fx);
+  emm0 = _mm_add_epi32(emm0, p4i_1023);
+  emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(3, 1, 2, 0));
+  __m128i lo = _mm_slli_epi64(emm0, 52);
+  __m128i hi = _mm_slli_epi64(_mm_srli_epi64(emm0, 32), 52);
+  __m256i e = _mm256_insertf128_si256(_mm256_setzero_si256(), lo, 0);
+  e = _mm256_insertf128_si256(e, hi, 1);
+
+  // Construct the result 2^n * exp(g) = e * x. The max is used to catch
+  // non-finite values in the input.
+  return pmax(pmul(x, _mm256_castsi256_pd(e)), _x);
+}
+
+// Functions for sqrt.
+// The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step
+// of Newton's method, at a cost of 1-2 bits of precision as opposed to the
+// exact solution. It does not handle +inf, or denormalized numbers correctly.
+// The main advantage of this approach is not just speed, but also the fact that
+// it can be inlined and pipelined with other computations, further reducing its
+// effective latency. This is similar to Quake3's fast inverse square root.
+// For detail see here: http://www.beyond3d.com/content/articles/8/
+#if EIGEN_FAST_MATH
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
+psqrt<Packet8f>(const Packet8f& _x) {
+  Packet8f half = pmul(_x, pset1<Packet8f>(.5f));
+  Packet8f denormal_mask = _mm256_and_ps(
+      _mm256_cmp_ps(_x, pset1<Packet8f>((std::numeric_limits<float>::min)()),
+                    _CMP_LT_OQ),
+      _mm256_cmp_ps(_x, _mm256_setzero_ps(), _CMP_GE_OQ));
+
+  // Compute approximate reciprocal sqrt.
+  Packet8f x = _mm256_rsqrt_ps(_x);
+  // Do a single step of Newton's iteration.
+  x = pmul(x, psub(pset1<Packet8f>(1.5f), pmul(half, pmul(x,x))));
+  // Flush results for denormals to zero.
+  return _mm256_andnot_ps(denormal_mask, pmul(_x,x));
+}
+#else
+template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet8f psqrt<Packet8f>(const Packet8f& x) {
+  return _mm256_sqrt_ps(x);
+}
+#endif
+template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4d psqrt<Packet4d>(const Packet4d& x) {
+  return _mm256_sqrt_pd(x);
+}
+#if EIGEN_FAST_MATH
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet8f prsqrt<Packet8f>(const Packet8f& _x) {
+  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inf, 0x7f800000);
+  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(nan, 0x7fc00000);
+  _EIGEN_DECLARE_CONST_Packet8f(one_point_five, 1.5f);
+  _EIGEN_DECLARE_CONST_Packet8f(minus_half, -0.5f);
+  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(flt_min, 0x00800000);
+
+  Packet8f neg_half = pmul(_x, p8f_minus_half);
+
+  // select only the inverse sqrt of positive normal inputs (denormals are
+  // flushed to zero and cause infs as well).
+  Packet8f le_zero_mask = _mm256_cmp_ps(_x, p8f_flt_min, _CMP_LT_OQ);
+  Packet8f x = _mm256_andnot_ps(le_zero_mask, _mm256_rsqrt_ps(_x));
+
+  // Fill in NaNs and Infs for the negative/zero entries.
+  Packet8f neg_mask = _mm256_cmp_ps(_x, _mm256_setzero_ps(), _CMP_LT_OQ);
+  Packet8f zero_mask = _mm256_andnot_ps(neg_mask, le_zero_mask);
+  Packet8f infs_and_nans = _mm256_or_ps(_mm256_and_ps(neg_mask, p8f_nan),
+                                        _mm256_and_ps(zero_mask, p8f_inf));
+
+  // Do a single step of Newton's iteration.
+  x = pmul(x, pmadd(neg_half, pmul(x, x), p8f_one_point_five));
+
+  // Insert NaNs and Infs in all the right places.
+  return _mm256_or_ps(x, infs_and_nans);
+}
+
+#else
+template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet8f prsqrt<Packet8f>(const Packet8f& x) {
+  _EIGEN_DECLARE_CONST_Packet8f(one, 1.0f);
+  return _mm256_div_ps(p8f_one, _mm256_sqrt_ps(x));
+}
+#endif
+
+template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4d prsqrt<Packet4d>(const Packet4d& x) {
+  _EIGEN_DECLARE_CONST_Packet4d(one, 1.0);
+  return _mm256_div_pd(p4d_one, _mm256_sqrt_pd(x));
+}
+
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_MATH_FUNCTIONS_AVX_H
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
new file mode 100644
index 000000000..195d40fb4
--- /dev/null
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -0,0 +1,633 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_AVX_H
+#define EIGEN_PACKET_MATH_AVX_H
+
+namespace Eigen {
+
+namespace internal {
+
+#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
+#endif
+
+#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
+#endif
+
+#ifdef __FMA__
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#endif
+#endif
+
+typedef __m256  Packet8f;
+typedef __m256i Packet8i;
+typedef __m256d Packet4d;
+
+template<> struct is_arithmetic<__m256>  { enum { value = true }; };
+template<> struct is_arithmetic<__m256i> { enum { value = true }; };
+template<> struct is_arithmetic<__m256d> { enum { value = true }; };
+
+#define _EIGEN_DECLARE_CONST_Packet8f(NAME,X) \
+  const Packet8f p8f_##NAME = pset1<Packet8f>(X)
+
+#define _EIGEN_DECLARE_CONST_Packet4d(NAME,X) \
+  const Packet4d p4d_##NAME = pset1<Packet4d>(X)
+
+#define _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(NAME,X) \
+  const Packet8f p8f_##NAME = _mm256_castsi256_ps(pset1<Packet8i>(X))
+
+#define _EIGEN_DECLARE_CONST_Packet8i(NAME,X) \
+  const Packet8i p8i_##NAME = pset1<Packet8i>(X)
+
+// Use the packet_traits defined in AVX512/PacketMath.h instead if we're going
+// to leverage AVX512 instructions.
+#ifndef EIGEN_VECTORIZE_AVX512
+template<> struct packet_traits<float>  : default_packet_traits
+{
+  typedef Packet8f type;
+  typedef Packet4f half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=8,
+    HasHalfPacket = 1,
+
+    HasDiv  = 1,
+    HasSin  = EIGEN_FAST_MATH,
+    HasCos  = 0,
+    HasLog  = 1,
+    HasExp  = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasTanh  = EIGEN_FAST_MATH,
+    HasBlend = 1,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1
+  };
+};
+template<> struct packet_traits<double> : default_packet_traits
+{
+  typedef Packet4d type;
+  typedef Packet2d half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=4,
+    HasHalfPacket = 1,
+
+    HasDiv  = 1,
+    HasExp  = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasBlend = 1,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1
+  };
+};
+#endif
+
+template<> struct scalar_div_cost<float,true> { enum { value = 14 }; };
+template<> struct scalar_div_cost<double,true> { enum { value = 16 }; };
+
+/* Proper support for integers is only provided by AVX2. In the meantime, we'll
+   use SSE instructions and packets to deal with integers.
+template<> struct packet_traits<int>    : default_packet_traits
+{
+  typedef Packet8i type;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=8
+  };
+};
+*/
+
+template<> struct unpacket_traits<Packet8f> { typedef float  type; typedef Packet4f half; enum {size=8, alignment=Aligned32}; };
+template<> struct unpacket_traits<Packet4d> { typedef double type; typedef Packet2d half; enum {size=4, alignment=Aligned32}; };
+template<> struct unpacket_traits<Packet8i> { typedef int    type; typedef Packet4i half; enum {size=8, alignment=Aligned32}; };
+
+template<> EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float&  from) { return _mm256_set1_ps(from); }
+template<> EIGEN_STRONG_INLINE Packet4d pset1<Packet4d>(const double& from) { return _mm256_set1_pd(from); }
+template<> EIGEN_STRONG_INLINE Packet8i pset1<Packet8i>(const int&    from) { return _mm256_set1_epi32(from); }
+
+template<> EIGEN_STRONG_INLINE Packet8f pload1<Packet8f>(const float*  from) { return _mm256_broadcast_ss(from); }
+template<> EIGEN_STRONG_INLINE Packet4d pload1<Packet4d>(const double* from) { return _mm256_broadcast_sd(from); }
+
+template<> EIGEN_STRONG_INLINE Packet8f plset<Packet8f>(const float& a) { return _mm256_add_ps(_mm256_set1_ps(a), _mm256_set_ps(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)); }
+template<> EIGEN_STRONG_INLINE Packet4d plset<Packet4d>(const double& a) { return _mm256_add_pd(_mm256_set1_pd(a), _mm256_set_pd(3.0,2.0,1.0,0.0)); }
+
+template<> EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_add_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d padd<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_add_pd(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_sub_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d psub<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_sub_pd(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a)
+{
+  return _mm256_sub_ps(_mm256_set1_ps(0.0),a);
+}
+template<> EIGEN_STRONG_INLINE Packet4d pnegate(const Packet4d& a)
+{
+  return _mm256_sub_pd(_mm256_set1_pd(0.0),a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pconj(const Packet8f& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4d pconj(const Packet4d& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet8i pconj(const Packet8i& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE Packet8f pmul<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_mul_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pmul<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_mul_pd(a,b); }
+
+
+template<> EIGEN_STRONG_INLINE Packet8f pdiv<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_div_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pdiv<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_div_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& /*a*/, const Packet8i& /*b*/)
+{ eigen_assert(false && "packet integer division are not supported by AVX");
+  return pset1<Packet8i>(0);
+}
+
+#ifdef __FMA__
+template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
+#if ( EIGEN_COMP_GNUC_STRICT || (EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<308)) )
+  // clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers,
+  // and gcc stupidly generates a vfmadd132ps instruction,
+  // so let's enforce it to generate a vfmadd231ps instruction since the most common use case is to accumulate
+  // the result of the product.
+  Packet8f res = c;
+  __asm__("vfmadd231ps %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
+  return res;
+#else
+  return _mm256_fmadd_ps(a,b,c);
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) {
+#if ( EIGEN_COMP_GNUC_STRICT || (EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<308)) )
+  // see above
+  Packet4d res = c;
+  __asm__("vfmadd231pd %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
+  return res;
+#else
+  return _mm256_fmadd_pd(a,b,c);
+#endif
+}
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_min_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_min_pd(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_max_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_max_pd(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet8f pround<Packet8f>(const Packet8f& a) { return _mm256_round_ps(a, _MM_FROUND_CUR_DIRECTION); }
+template<> EIGEN_STRONG_INLINE Packet4d pround<Packet4d>(const Packet4d& a) { return _mm256_round_pd(a, _MM_FROUND_CUR_DIRECTION); }
+
+template<> EIGEN_STRONG_INLINE Packet8f pceil<Packet8f>(const Packet8f& a) { return _mm256_ceil_ps(a); }
+template<> EIGEN_STRONG_INLINE Packet4d pceil<Packet4d>(const Packet4d& a) { return _mm256_ceil_pd(a); }
+
+template<> EIGEN_STRONG_INLINE Packet8f pfloor<Packet8f>(const Packet8f& a) { return _mm256_floor_ps(a); }
+template<> EIGEN_STRONG_INLINE Packet4d pfloor<Packet4d>(const Packet4d& a) { return _mm256_floor_pd(a); }
+
+template<> EIGEN_STRONG_INLINE Packet8f pand<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pand<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet8f por<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_or_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d por<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_or_pd(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet8f pxor<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_xor_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pxor<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_xor_pd(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet8f pandnot<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pandnot<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float*   from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ps(from); }
+template<> EIGEN_STRONG_INLINE Packet4d pload<Packet4d>(const double*  from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_pd(from); }
+template<> EIGEN_STRONG_INLINE Packet8i pload<Packet8i>(const int*     from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from)); }
+
+template<> EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_ps(from); }
+template<> EIGEN_STRONG_INLINE Packet4d ploadu<Packet4d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_pd(from); }
+template<> EIGEN_STRONG_INLINE Packet8i ploadu<Packet8i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from)); }
+
+// Loads 4 floats from memory a returns the packet {a0, a0  a1, a1, a2, a2, a3, a3}
+template<> EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from)
+{
+  // TODO try to find a way to avoid the need of a temporary register
+//   Packet8f tmp  = _mm256_castps128_ps256(_mm_loadu_ps(from));
+//   tmp = _mm256_insertf128_ps(tmp, _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)), 1);
+//   return _mm256_unpacklo_ps(tmp,tmp);
+  
+  // _mm256_insertf128_ps is very slow on Haswell, thus:
+  Packet8f tmp = _mm256_broadcast_ps((const __m128*)(const void*)from);
+  // mimic an "inplace" permutation of the lower 128bits using a blend
+  tmp = _mm256_blend_ps(tmp,_mm256_castps128_ps256(_mm_permute_ps( _mm256_castps256_ps128(tmp), _MM_SHUFFLE(1,0,1,0))), 15);
+  // then we can perform a consistent permutation on the global register to get everything in shape:
+  return  _mm256_permute_ps(tmp, _MM_SHUFFLE(3,3,2,2));
+}
+// Loads 2 doubles from memory a returns the packet {a0, a0  a1, a1}
+template<> EIGEN_STRONG_INLINE Packet4d ploaddup<Packet4d>(const double* from)
+{
+  Packet4d tmp = _mm256_broadcast_pd((const __m128d*)(const void*)from);
+  return  _mm256_permute_pd(tmp, 3<<2);
+}
+
+// Loads 2 floats from memory a returns the packet {a0, a0  a0, a0, a1, a1, a1, a1}
+template<> EIGEN_STRONG_INLINE Packet8f ploadquad<Packet8f>(const float* from)
+{
+  Packet8f tmp = _mm256_castps128_ps256(_mm_broadcast_ss(from));
+  return _mm256_insertf128_ps(tmp, _mm_broadcast_ss(from+1), 1);
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet8f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ps(to, from); }
+template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_pd(to, from); }
+template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet8i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }
+
+template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*   to, const Packet8f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_ps(to, from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd(to, from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*       to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }
+
+// NOTE: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available
+// NOTE: for the record the following seems to be slower: return _mm256_i32gather_ps(from, _mm256_set1_epi32(stride), 4);
+template<> EIGEN_DEVICE_FUNC inline Packet8f pgather<float, Packet8f>(const float* from, Index stride)
+{
+  return _mm256_set_ps(from[7*stride], from[6*stride], from[5*stride], from[4*stride],
+                       from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
+}
+template<> EIGEN_DEVICE_FUNC inline Packet4d pgather<double, Packet4d>(const double* from, Index stride)
+{
+  return _mm256_set_pd(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet8f>(float* to, const Packet8f& from, Index stride)
+{
+  __m128 low = _mm256_extractf128_ps(from, 0);
+  to[stride*0] = _mm_cvtss_f32(low);
+  to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1));
+  to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 2));
+  to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3));
+
+  __m128 high = _mm256_extractf128_ps(from, 1);
+  to[stride*4] = _mm_cvtss_f32(high);
+  to[stride*5] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1));
+  to[stride*6] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 2));
+  to[stride*7] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3));
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet4d>(double* to, const Packet4d& from, Index stride)
+{
+  __m128d low = _mm256_extractf128_pd(from, 0);
+  to[stride*0] = _mm_cvtsd_f64(low);
+  to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1));
+  __m128d high = _mm256_extractf128_pd(from, 1);
+  to[stride*2] = _mm_cvtsd_f64(high);
+  to[stride*3] = _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1));
+}
+
+template<> EIGEN_STRONG_INLINE void pstore1<Packet8f>(float* to, const float& a)
+{
+  Packet8f pa = pset1<Packet8f>(a);
+  pstore(to, pa);
+}
+template<> EIGEN_STRONG_INLINE void pstore1<Packet4d>(double* to, const double& a)
+{
+  Packet4d pa = pset1<Packet4d>(a);
+  pstore(to, pa);
+}
+template<> EIGEN_STRONG_INLINE void pstore1<Packet8i>(int* to, const int& a)
+{
+  Packet8i pa = pset1<Packet8i>(a);
+  pstore(to, pa);
+}
+
+#ifndef EIGEN_VECTORIZE_AVX512
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+#endif
+
+template<> EIGEN_STRONG_INLINE float  pfirst<Packet8f>(const Packet8f& a) {
+  return _mm_cvtss_f32(_mm256_castps256_ps128(a));
+}
+template<> EIGEN_STRONG_INLINE double pfirst<Packet4d>(const Packet4d& a) {
+  return _mm_cvtsd_f64(_mm256_castpd256_pd128(a));
+}
+template<> EIGEN_STRONG_INLINE int    pfirst<Packet8i>(const Packet8i& a) {
+  return _mm_cvtsi128_si32(_mm256_castsi256_si128(a));
+}
+
+
+template<> EIGEN_STRONG_INLINE Packet8f preverse(const Packet8f& a)
+{
+  __m256 tmp = _mm256_shuffle_ps(a,a,0x1b);
+  return _mm256_permute2f128_ps(tmp, tmp, 1);
+}
+template<> EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a)
+{
+   __m256d tmp = _mm256_shuffle_pd(a,a,5);
+  return _mm256_permute2f128_pd(tmp, tmp, 1);
+
+  __m256d swap_halves = _mm256_permute2f128_pd(a,a,1);
+    return _mm256_permute_pd(swap_halves,5);
+}
+
+// pabs should be ok
+template<> EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a)
+{
+  const Packet8f mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
+  return _mm256_and_ps(a,mask);
+}
+template<> EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a)
+{
+  const Packet4d mask = _mm256_castsi256_pd(_mm256_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
+  return _mm256_and_pd(a,mask);
+}
+
+// preduxp should be ok
+// FIXME: why is this ok? why isn't the simply implementation working as expected?
+template<> EIGEN_STRONG_INLINE Packet8f preduxp<Packet8f>(const Packet8f* vecs)
+{
+    __m256 hsum1 = _mm256_hadd_ps(vecs[0], vecs[1]);
+    __m256 hsum2 = _mm256_hadd_ps(vecs[2], vecs[3]);
+    __m256 hsum3 = _mm256_hadd_ps(vecs[4], vecs[5]);
+    __m256 hsum4 = _mm256_hadd_ps(vecs[6], vecs[7]);
+
+    __m256 hsum5 = _mm256_hadd_ps(hsum1, hsum1);
+    __m256 hsum6 = _mm256_hadd_ps(hsum2, hsum2);
+    __m256 hsum7 = _mm256_hadd_ps(hsum3, hsum3);
+    __m256 hsum8 = _mm256_hadd_ps(hsum4, hsum4);
+
+    __m256 perm1 =  _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
+    __m256 perm2 =  _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
+    __m256 perm3 =  _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
+    __m256 perm4 =  _mm256_permute2f128_ps(hsum8, hsum8, 0x23);
+
+    __m256 sum1 = _mm256_add_ps(perm1, hsum5);
+    __m256 sum2 = _mm256_add_ps(perm2, hsum6);
+    __m256 sum3 = _mm256_add_ps(perm3, hsum7);
+    __m256 sum4 = _mm256_add_ps(perm4, hsum8);
+
+    __m256 blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
+    __m256 blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);
+
+    __m256 final = _mm256_blend_ps(blend1, blend2, 0xf0);
+    return final;
+}
+template<> EIGEN_STRONG_INLINE Packet4d preduxp<Packet4d>(const Packet4d* vecs)
+{
+ Packet4d tmp0, tmp1;
+
+  tmp0 = _mm256_hadd_pd(vecs[0], vecs[1]);
+  tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));
+
+  tmp1 = _mm256_hadd_pd(vecs[2], vecs[3]);
+  tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));
+
+  return _mm256_blend_pd(tmp0, tmp1, 0xC);
+}
+
+template<> EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a)
+{
+  return predux(Packet4f(_mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1))));
+}
+template<> EIGEN_STRONG_INLINE double predux<Packet4d>(const Packet4d& a)
+{
+  return predux(Packet2d(_mm_add_pd(_mm256_castpd256_pd128(a),_mm256_extractf128_pd(a,1))));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f predux_downto4<Packet8f>(const Packet8f& a)
+{
+  return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1));
+}
+
+template<> EIGEN_STRONG_INLINE float predux_mul<Packet8f>(const Packet8f& a)
+{
+  Packet8f tmp;
+  tmp = _mm256_mul_ps(a, _mm256_permute2f128_ps(a,a,1));
+  tmp = _mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
+  return pfirst(_mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
+}
+template<> EIGEN_STRONG_INLINE double predux_mul<Packet4d>(const Packet4d& a)
+{
+  Packet4d tmp;
+  tmp = _mm256_mul_pd(a, _mm256_permute2f128_pd(a,a,1));
+  return pfirst(_mm256_mul_pd(tmp, _mm256_shuffle_pd(tmp,tmp,1)));
+}
+
+template<> EIGEN_STRONG_INLINE float predux_min<Packet8f>(const Packet8f& a)
+{
+  Packet8f tmp = _mm256_min_ps(a, _mm256_permute2f128_ps(a,a,1));
+  tmp = _mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
+  return pfirst(_mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
+}
+template<> EIGEN_STRONG_INLINE double predux_min<Packet4d>(const Packet4d& a)
+{
+  Packet4d tmp = _mm256_min_pd(a, _mm256_permute2f128_pd(a,a,1));
+  return pfirst(_mm256_min_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
+}
+
+template<> EIGEN_STRONG_INLINE float predux_max<Packet8f>(const Packet8f& a)
+{
+  Packet8f tmp = _mm256_max_ps(a, _mm256_permute2f128_ps(a,a,1));
+  tmp = _mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
+  return pfirst(_mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
+}
+
+template<> EIGEN_STRONG_INLINE double predux_max<Packet4d>(const Packet4d& a)
+{
+  Packet4d tmp = _mm256_max_pd(a, _mm256_permute2f128_pd(a,a,1));
+  return pfirst(_mm256_max_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
+}
+
+
+template<int Offset>
+struct palign_impl<Offset,Packet8f>
+{
+  static EIGEN_STRONG_INLINE void run(Packet8f& first, const Packet8f& second)
+  {
+    if (Offset==1)
+    {
+      first = _mm256_blend_ps(first, second, 1);
+      Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1));
+      Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1);
+      first = _mm256_blend_ps(tmp1, tmp2, 0x88);
+    }
+    else if (Offset==2)
+    {
+      first = _mm256_blend_ps(first, second, 3);
+      Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2));
+      Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1);
+      first = _mm256_blend_ps(tmp1, tmp2, 0xcc);
+    }
+    else if (Offset==3)
+    {
+      first = _mm256_blend_ps(first, second, 7);
+      Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3));
+      Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1);
+      first = _mm256_blend_ps(tmp1, tmp2, 0xee);
+    }
+    else if (Offset==4)
+    {
+      first = _mm256_blend_ps(first, second, 15);
+      Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(3,2,1,0));
+      Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1);
+      first = _mm256_permute_ps(tmp2, _MM_SHUFFLE(3,2,1,0));
+    }
+    else if (Offset==5)
+    {
+      first = _mm256_blend_ps(first, second, 31);
+      first = _mm256_permute2f128_ps(first, first, 1);
+      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1));
+      first = _mm256_permute2f128_ps(tmp, tmp, 1);
+      first = _mm256_blend_ps(tmp, first, 0x88);
+    }
+    else if (Offset==6)
+    {
+      first = _mm256_blend_ps(first, second, 63);
+      first = _mm256_permute2f128_ps(first, first, 1);
+      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2));
+      first = _mm256_permute2f128_ps(tmp, tmp, 1);
+      first = _mm256_blend_ps(tmp, first, 0xcc);
+    }
+    else if (Offset==7)
+    {
+      first = _mm256_blend_ps(first, second, 127);
+      first = _mm256_permute2f128_ps(first, first, 1);
+      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3));
+      first = _mm256_permute2f128_ps(tmp, tmp, 1);
+      first = _mm256_blend_ps(tmp, first, 0xee);
+    }
+  }
+};
+
+template<int Offset>
+struct palign_impl<Offset,Packet4d>
+{
+  static EIGEN_STRONG_INLINE void run(Packet4d& first, const Packet4d& second)
+  {
+    if (Offset==1)
+    {
+      first = _mm256_blend_pd(first, second, 1);
+      __m256d tmp = _mm256_permute_pd(first, 5);
+      first = _mm256_permute2f128_pd(tmp, tmp, 1);
+      first = _mm256_blend_pd(tmp, first, 0xA);
+    }
+    else if (Offset==2)
+    {
+      first = _mm256_blend_pd(first, second, 3);
+      first = _mm256_permute2f128_pd(first, first, 1);
+    }
+    else if (Offset==3)
+    {
+      first = _mm256_blend_pd(first, second, 7);
+      __m256d tmp = _mm256_permute_pd(first, 5);
+      first = _mm256_permute2f128_pd(tmp, tmp, 1);
+      first = _mm256_blend_pd(tmp, first, 5);
+    }
+  }
+};
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet8f,8>& kernel) {
+  __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
+  __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
+  __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
+  __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
+  __m256 T4 = _mm256_unpacklo_ps(kernel.packet[4], kernel.packet[5]);
+  __m256 T5 = _mm256_unpackhi_ps(kernel.packet[4], kernel.packet[5]);
+  __m256 T6 = _mm256_unpacklo_ps(kernel.packet[6], kernel.packet[7]);
+  __m256 T7 = _mm256_unpackhi_ps(kernel.packet[6], kernel.packet[7]);
+  __m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0));
+  __m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2));
+  __m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0));
+  __m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2));
+  __m256 S4 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(1,0,1,0));
+  __m256 S5 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(3,2,3,2));
+  __m256 S6 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(1,0,1,0));
+  __m256 S7 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(3,2,3,2));
+  kernel.packet[0] = _mm256_permute2f128_ps(S0, S4, 0x20);
+  kernel.packet[1] = _mm256_permute2f128_ps(S1, S5, 0x20);
+  kernel.packet[2] = _mm256_permute2f128_ps(S2, S6, 0x20);
+  kernel.packet[3] = _mm256_permute2f128_ps(S3, S7, 0x20);
+  kernel.packet[4] = _mm256_permute2f128_ps(S0, S4, 0x31);
+  kernel.packet[5] = _mm256_permute2f128_ps(S1, S5, 0x31);
+  kernel.packet[6] = _mm256_permute2f128_ps(S2, S6, 0x31);
+  kernel.packet[7] = _mm256_permute2f128_ps(S3, S7, 0x31);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet8f,4>& kernel) {
+  __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
+  __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
+  __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
+  __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
+
+  __m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0));
+  __m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2));
+  __m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0));
+  __m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2));
+
+  kernel.packet[0] = _mm256_permute2f128_ps(S0, S1, 0x20);
+  kernel.packet[1] = _mm256_permute2f128_ps(S2, S3, 0x20);
+  kernel.packet[2] = _mm256_permute2f128_ps(S0, S1, 0x31);
+  kernel.packet[3] = _mm256_permute2f128_ps(S2, S3, 0x31);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4d,4>& kernel) {
+  __m256d T0 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 15);
+  __m256d T1 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 0);
+  __m256d T2 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 15);
+  __m256d T3 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 0);
+
+  kernel.packet[1] = _mm256_permute2f128_pd(T0, T2, 32);
+  kernel.packet[3] = _mm256_permute2f128_pd(T0, T2, 49);
+  kernel.packet[0] = _mm256_permute2f128_pd(T1, T3, 32);
+  kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pblend(const Selector<8>& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket) {
+  const __m256 zero = _mm256_setzero_ps();
+  const __m256 select = _mm256_set_ps(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4], ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
+  __m256 false_mask = _mm256_cmp_ps(select, zero, _CMP_EQ_UQ);
+  return _mm256_blendv_ps(thenPacket, elsePacket, false_mask);
+}
+template<> EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket) {
+  const __m256d zero = _mm256_setzero_pd();
+  const __m256d select = _mm256_set_pd(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
+  __m256d false_mask = _mm256_cmp_pd(select, zero, _CMP_EQ_UQ);
+  return _mm256_blendv_pd(thenPacket, elsePacket, false_mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pinsertfirst(const Packet8f& a, float b)
+{
+  return _mm256_blend_ps(a,pset1<Packet8f>(b),1);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4d pinsertfirst(const Packet4d& a, double b)
+{
+  return _mm256_blend_pd(a,pset1<Packet4d>(b),1);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pinsertlast(const Packet8f& a, float b)
+{
+  return _mm256_blend_ps(a,pset1<Packet8f>(b),(1<<7));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4d pinsertlast(const Packet4d& a, double b)
+{
+  return _mm256_blend_pd(a,pset1<Packet4d>(b),(1<<3));
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_PACKET_MATH_AVX_H
diff --git a/Eigen/src/Core/arch/AVX/TypeCasting.h b/Eigen/src/Core/arch/AVX/TypeCasting.h
new file mode 100644
index 000000000..83bfdc604
--- /dev/null
+++ b/Eigen/src/Core/arch/AVX/TypeCasting.h
@@ -0,0 +1,51 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TYPE_CASTING_AVX_H
+#define EIGEN_TYPE_CASTING_AVX_H
+
+namespace Eigen {
+
+namespace internal {
+
+// For now we use SSE to handle integers, so we can't use AVX instructions to cast
+// from int to float
+template <>
+struct type_casting_traits<float, int> {
+  enum {
+    VectorizedCast = 0,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template <>
+struct type_casting_traits<int, float> {
+  enum {
+    VectorizedCast = 0,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+
+
+template<> EIGEN_STRONG_INLINE Packet8i pcast<Packet8f, Packet8i>(const Packet8f& a) {
+  return _mm256_cvtps_epi32(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8i, Packet8f>(const Packet8i& a) {
+  return _mm256_cvtepi32_ps(a);
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TYPE_CASTING_AVX_H
diff --git a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h
new file mode 100644
index 000000000..399be0ee4
--- /dev/null
+++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h
@@ -0,0 +1,396 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Pedro Gonnet (pedro.gonnet@gmail.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef THIRD_PARTY_EIGEN3_EIGEN_SRC_CORE_ARCH_AVX512_MATHFUNCTIONS_H_
+#define THIRD_PARTY_EIGEN3_EIGEN_SRC_CORE_ARCH_AVX512_MATHFUNCTIONS_H_
+
+namespace Eigen {
+
+namespace internal {
+
+// Disable the code for older versions of gcc that don't support many of the required avx512 instrinsics.
+#if EIGEN_GNUC_AT_LEAST(5, 3)
+
+#define _EIGEN_DECLARE_CONST_Packet16f(NAME, X) \
+  const Packet16f p16f_##NAME = pset1<Packet16f>(X)
+
+#define _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(NAME, X) \
+  const Packet16f p16f_##NAME = (__m512)pset1<Packet16i>(X)
+
+#define _EIGEN_DECLARE_CONST_Packet8d(NAME, X) \
+  const Packet8d p8d_##NAME = pset1<Packet8d>(X)
+
+#define _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(NAME, X) \
+  const Packet8d p8d_##NAME = _mm512_castsi512_pd(_mm512_set1_epi64(X))
+
+// Natural logarithm
+// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2)
+// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can
+// be easily approximated by a polynomial centered on m=1 for stability.
+#if defined(EIGEN_VECTORIZE_AVX512DQ)
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
+plog<Packet16f>(const Packet16f& _x) {
+  Packet16f x = _x;
+  _EIGEN_DECLARE_CONST_Packet16f(1, 1.0f);
+  _EIGEN_DECLARE_CONST_Packet16f(half, 0.5f);
+  _EIGEN_DECLARE_CONST_Packet16f(126f, 126.0f);
+
+  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(inv_mant_mask, ~0x7f800000);
+
+  // The smallest non denormalized float number.
+  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(min_norm_pos, 0x00800000);
+  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(minus_inf, 0xff800000);
+  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000);
+
+  // Polynomial coefficients.
+  _EIGEN_DECLARE_CONST_Packet16f(cephes_SQRTHF, 0.707106781186547524f);
+  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p0, 7.0376836292E-2f);
+  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p1, -1.1514610310E-1f);
+  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p2, 1.1676998740E-1f);
+  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p3, -1.2420140846E-1f);
+  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p4, +1.4249322787E-1f);
+  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p5, -1.6668057665E-1f);
+  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p6, +2.0000714765E-1f);
+  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p7, -2.4999993993E-1f);
+  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p8, +3.3333331174E-1f);
+  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_q1, -2.12194440e-4f);
+  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_q2, 0.693359375f);
+
+  // invalid_mask is set to true when x is NaN
+  __mmask16 invalid_mask =
+      _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_NGE_UQ);
+  __mmask16 iszero_mask =
+      _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_EQ_UQ);
+
+  // Truncate input values to the minimum positive normal.
+  x = pmax(x, p16f_min_norm_pos);
+
+  // Extract the shifted exponents.
+  Packet16f emm0 = _mm512_cvtepi32_ps(_mm512_srli_epi32((__m512i)x, 23));
+  Packet16f e = _mm512_sub_ps(emm0, p16f_126f);
+
+  // Set the exponents to -1, i.e. x are in the range [0.5,1).
+  x = _mm512_and_ps(x, p16f_inv_mant_mask);
+  x = _mm512_or_ps(x, p16f_half);
+
+  // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
+  // and shift by -1. The values are then centered around 0, which improves
+  // the stability of the polynomial evaluation.
+  //   if( x < SQRTHF ) {
+  //     e -= 1;
+  //     x = x + x - 1.0;
+  //   } else { x = x - 1.0; }
+  __mmask16 mask = _mm512_cmp_ps_mask(x, p16f_cephes_SQRTHF, _CMP_LT_OQ);
+  Packet16f tmp = _mm512_mask_blend_ps(mask, x, _mm512_setzero_ps());
+  x = psub(x, p16f_1);
+  e = psub(e, _mm512_mask_blend_ps(mask, p16f_1, _mm512_setzero_ps()));
+  x = padd(x, tmp);
+
+  Packet16f x2 = pmul(x, x);
+  Packet16f x3 = pmul(x2, x);
+
+  // Evaluate the polynomial approximant of degree 8 in three parts, probably
+  // to improve instruction-level parallelism.
+  Packet16f y, y1, y2;
+  y = pmadd(p16f_cephes_log_p0, x, p16f_cephes_log_p1);
+  y1 = pmadd(p16f_cephes_log_p3, x, p16f_cephes_log_p4);
+  y2 = pmadd(p16f_cephes_log_p6, x, p16f_cephes_log_p7);
+  y = pmadd(y, x, p16f_cephes_log_p2);
+  y1 = pmadd(y1, x, p16f_cephes_log_p5);
+  y2 = pmadd(y2, x, p16f_cephes_log_p8);
+  y = pmadd(y, x3, y1);
+  y = pmadd(y, x3, y2);
+  y = pmul(y, x3);
+
+  // Add the logarithm of the exponent back to the result of the interpolation.
+  y1 = pmul(e, p16f_cephes_log_q1);
+  tmp = pmul(x2, p16f_half);
+  y = padd(y, y1);
+  x = psub(x, tmp);
+  y2 = pmul(e, p16f_cephes_log_q2);
+  x = padd(x, y);
+  x = padd(x, y2);
+
+  // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF.
+  return _mm512_mask_blend_ps(iszero_mask, p16f_minus_inf,
+                              _mm512_mask_blend_ps(invalid_mask, p16f_nan, x));
+}
+#endif
+
+// Exponential function. Works by writing "x = m*log(2) + r" where
+// "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then
+// "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1).
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
+pexp<Packet16f>(const Packet16f& _x) {
+  _EIGEN_DECLARE_CONST_Packet16f(1, 1.0f);
+  _EIGEN_DECLARE_CONST_Packet16f(half, 0.5f);
+  _EIGEN_DECLARE_CONST_Packet16f(127, 127.0f);
+
+  _EIGEN_DECLARE_CONST_Packet16f(exp_hi, 88.3762626647950f);
+  _EIGEN_DECLARE_CONST_Packet16f(exp_lo, -88.3762626647949f);
+
+  _EIGEN_DECLARE_CONST_Packet16f(cephes_LOG2EF, 1.44269504088896341f);
+
+  _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p0, 1.9875691500E-4f);
+  _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p1, 1.3981999507E-3f);
+  _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p2, 8.3334519073E-3f);
+  _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p3, 4.1665795894E-2f);
+  _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p4, 1.6666665459E-1f);
+  _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p5, 5.0000001201E-1f);
+
+  // Clamp x.
+  Packet16f x = pmax(pmin(_x, p16f_exp_hi), p16f_exp_lo);
+
+  // Express exp(x) as exp(m*ln(2) + r), start by extracting
+  // m = floor(x/ln(2) + 0.5).
+  Packet16f m = _mm512_floor_ps(pmadd(x, p16f_cephes_LOG2EF, p16f_half));
+
+  // Get r = x - m*ln(2). Note that we can do this without losing more than one
+  // ulp precision due to the FMA instruction.
+  _EIGEN_DECLARE_CONST_Packet16f(nln2, -0.6931471805599453f);
+  Packet16f r = _mm512_fmadd_ps(m, p16f_nln2, x);
+  Packet16f r2 = pmul(r, r);
+
+  // TODO(gonnet): Split into odd/even polynomials and try to exploit
+  //               instruction-level parallelism.
+  Packet16f y = p16f_cephes_exp_p0;
+  y = pmadd(y, r, p16f_cephes_exp_p1);
+  y = pmadd(y, r, p16f_cephes_exp_p2);
+  y = pmadd(y, r, p16f_cephes_exp_p3);
+  y = pmadd(y, r, p16f_cephes_exp_p4);
+  y = pmadd(y, r, p16f_cephes_exp_p5);
+  y = pmadd(y, r2, r);
+  y = padd(y, p16f_1);
+
+  // Build emm0 = 2^m.
+  Packet16i emm0 = _mm512_cvttps_epi32(padd(m, p16f_127));
+  emm0 = _mm512_slli_epi32(emm0, 23);
+
+  // Return 2^m * exp(r).
+  return pmax(pmul(y, _mm512_castsi512_ps(emm0)), _x);
+}
+
+/*template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
+pexp<Packet8d>(const Packet8d& _x) {
+  Packet8d x = _x;
+
+  _EIGEN_DECLARE_CONST_Packet8d(1, 1.0);
+  _EIGEN_DECLARE_CONST_Packet8d(2, 2.0);
+
+  _EIGEN_DECLARE_CONST_Packet8d(exp_hi, 709.437);
+  _EIGEN_DECLARE_CONST_Packet8d(exp_lo, -709.436139303);
+
+  _EIGEN_DECLARE_CONST_Packet8d(cephes_LOG2EF, 1.4426950408889634073599);
+
+  _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p0, 1.26177193074810590878e-4);
+  _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p1, 3.02994407707441961300e-2);
+  _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p2, 9.99999999999999999910e-1);
+
+  _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q0, 3.00198505138664455042e-6);
+  _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q1, 2.52448340349684104192e-3);
+  _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q2, 2.27265548208155028766e-1);
+  _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q3, 2.00000000000000000009e0);
+
+  _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_C1, 0.693145751953125);
+  _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_C2, 1.42860682030941723212e-6);
+
+  // clamp x
+  x = pmax(pmin(x, p8d_exp_hi), p8d_exp_lo);
+
+  // Express exp(x) as exp(g + n*log(2)).
+  const Packet8d n =
+      _mm512_mul_round_pd(p8d_cephes_LOG2EF, x, _MM_FROUND_TO_NEAREST_INT);
+
+  // Get the remainder modulo log(2), i.e. the "g" described above. Subtract
+  // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last
+  // digits right.
+  const Packet8d nC1 = pmul(n, p8d_cephes_exp_C1);
+  const Packet8d nC2 = pmul(n, p8d_cephes_exp_C2);
+  x = psub(x, nC1);
+  x = psub(x, nC2);
+
+  const Packet8d x2 = pmul(x, x);
+
+  // Evaluate the numerator polynomial of the rational interpolant.
+  Packet8d px = p8d_cephes_exp_p0;
+  px = pmadd(px, x2, p8d_cephes_exp_p1);
+  px = pmadd(px, x2, p8d_cephes_exp_p2);
+  px = pmul(px, x);
+
+  // Evaluate the denominator polynomial of the rational interpolant.
+  Packet8d qx = p8d_cephes_exp_q0;
+  qx = pmadd(qx, x2, p8d_cephes_exp_q1);
+  qx = pmadd(qx, x2, p8d_cephes_exp_q2);
+  qx = pmadd(qx, x2, p8d_cephes_exp_q3);
+
+  // I don't really get this bit, copied from the SSE2 routines, so...
+  // TODO(gonnet): Figure out what is going on here, perhaps find a better
+  // rational interpolant?
+  x = _mm512_div_pd(px, psub(qx, px));
+  x = pmadd(p8d_2, x, p8d_1);
+
+  // Build e=2^n.
+  const Packet8d e = _mm512_castsi512_pd(_mm512_slli_epi64(
+      _mm512_add_epi64(_mm512_cvtpd_epi64(n), _mm512_set1_epi64(1023)), 52));
+
+  // Construct the result 2^n * exp(g) = e * x. The max is used to catch
+  // non-finite values in the input.
+  return pmax(pmul(x, e), _x);
+  }*/
+
+// Functions for sqrt.
+// The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step
+// of Newton's method, at a cost of 1-2 bits of precision as opposed to the
+// exact solution. The main advantage of this approach is not just speed, but
+// also the fact that it can be inlined and pipelined with other computations,
+// further reducing its effective latency.
+#if EIGEN_FAST_MATH
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
+psqrt<Packet16f>(const Packet16f& _x) {
+  _EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f);
+  _EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f);
+  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000);
+
+  Packet16f neg_half = pmul(_x, p16f_minus_half);
+
+  // select only the inverse sqrt of positive normal inputs (denormals are
+  // flushed to zero and cause infs as well).
+  __mmask16 non_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_GE_OQ);
+  Packet16f x = _mm512_mask_blend_ps(non_zero_mask, _mm512_rsqrt14_ps(_x),
+                                     _mm512_setzero_ps());
+
+  // Do a single step of Newton's iteration.
+  x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five));
+
+  // Multiply the original _x by it's reciprocal square root to extract the
+  // square root.
+  return pmul(_x, x);
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
+psqrt<Packet8d>(const Packet8d& _x) {
+  _EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5);
+  _EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5);
+  _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL);
+
+  Packet8d neg_half = pmul(_x, p8d_minus_half);
+
+  // select only the inverse sqrt of positive normal inputs (denormals are
+  // flushed to zero and cause infs as well).
+  __mmask8 non_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_GE_OQ);
+  Packet8d x = _mm512_mask_blend_pd(non_zero_mask, _mm512_rsqrt14_pd(_x),
+                                    _mm512_setzero_pd());
+
+  // Do a first step of Newton's iteration.
+  x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
+
+  // Do a second step of Newton's iteration.
+  x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
+
+  // Multiply the original _x by it's reciprocal square root to extract the
+  // square root.
+  return pmul(_x, x);
+}
+#else
+template <>
+EIGEN_STRONG_INLINE Packet16f psqrt<Packet16f>(const Packet16f& x) {
+  return _mm512_sqrt_ps(x);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d psqrt<Packet8d>(const Packet8d& x) {
+  return _mm512_sqrt_pd(x);
+}
+#endif
+
+// Functions for rsqrt.
+// Almost identical to the sqrt routine, just leave out the last multiplication
+// and fill in NaN/Inf where needed. Note that this function only exists as an
+// iterative version for doubles since there is no instruction for diretly
+// computing the reciprocal square root in AVX-512.
+#ifdef EIGEN_FAST_MATH
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
+prsqrt<Packet16f>(const Packet16f& _x) {
+  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(inf, 0x7f800000);
+  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000);
+  _EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f);
+  _EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f);
+  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000);
+
+  Packet16f neg_half = pmul(_x, p16f_minus_half);
+
+  // select only the inverse sqrt of positive normal inputs (denormals are
+  // flushed to zero and cause infs as well).
+  __mmask16 le_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_LT_OQ);
+  Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(),
+                                     _mm512_rsqrt14_ps(_x));
+
+  // Fill in NaNs and Infs for the negative/zero entries.
+  __mmask16 neg_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LT_OQ);
+  Packet16f infs_and_nans = _mm512_mask_blend_ps(
+      neg_mask, p16f_nan,
+      _mm512_mask_blend_ps(le_zero_mask, p16f_inf, _mm512_setzero_ps()));
+
+  // Do a single step of Newton's iteration.
+  x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five));
+
+  // Insert NaNs and Infs in all the right places.
+  return _mm512_mask_blend_ps(le_zero_mask, infs_and_nans, x);
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
+prsqrt<Packet8d>(const Packet8d& _x) {
+  _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(inf, 0x7ff0000000000000LL);
+  _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(nan, 0x7ff1000000000000LL);
+  _EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5);
+  _EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5);
+  _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL);
+
+  Packet8d neg_half = pmul(_x, p8d_minus_half);
+
+  // select only the inverse sqrt of positive normal inputs (denormals are
+  // flushed to zero and cause infs as well).
+  __mmask8 le_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_LT_OQ);
+  Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(),
+                                    _mm512_rsqrt14_pd(_x));
+
+  // Fill in NaNs and Infs for the negative/zero entries.
+  __mmask8 neg_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LT_OQ);
+  Packet8d infs_and_nans = _mm512_mask_blend_pd(
+      neg_mask, p8d_nan,
+      _mm512_mask_blend_pd(le_zero_mask, p8d_inf, _mm512_setzero_pd()));
+
+  // Do a first step of Newton's iteration.
+  x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
+
+  // Do a second step of Newton's iteration.
+  x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
+
+  // Insert NaNs and Infs in all the right places.
+  return _mm512_mask_blend_pd(le_zero_mask, infs_and_nans, x);
+}
+#else
+template <>
+EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) {
+  return _mm512_rsqrt28_ps(x);
+}
+#endif
+#endif
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // THIRD_PARTY_EIGEN3_EIGEN_SRC_CORE_ARCH_AVX512_MATHFUNCTIONS_H_
diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h
new file mode 100644
index 000000000..f6500a16e
--- /dev/null
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -0,0 +1,1316 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner (benoit.steiner.goog@gmail.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_AVX512_H
+#define EIGEN_PACKET_MATH_AVX512_H
+
+namespace Eigen {
+
+namespace internal {
+
+#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
+#endif
+
+#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
+#endif
+
+#ifdef __FMA__
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#endif
+#endif
+
+typedef __m512 Packet16f;
+typedef __m512i Packet16i;
+typedef __m512d Packet8d;
+
+template <>
+struct is_arithmetic<__m512> {
+  enum { value = true };
+};
+template <>
+struct is_arithmetic<__m512i> {
+  enum { value = true };
+};
+template <>
+struct is_arithmetic<__m512d> {
+  enum { value = true };
+};
+
+template<> struct packet_traits<float>  : default_packet_traits
+{
+  typedef Packet16f type;
+  typedef Packet8f half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 16,
+    HasHalfPacket = 1,
+#if EIGEN_GNUC_AT_LEAST(5, 3)
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+    HasLog = 1,
+#endif
+    HasExp = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+#endif
+    HasDiv = 1
+  };
+ };
+template<> struct packet_traits<double> : default_packet_traits
+{
+  typedef Packet8d type;
+  typedef Packet4d half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 8,
+    HasHalfPacket = 1,
+#if EIGEN_GNUC_AT_LEAST(5, 3)
+    HasSqrt = 1,
+    HasRsqrt = EIGEN_FAST_MATH,
+#endif
+    HasDiv = 1
+  };
+};
+
+/* TODO Implement AVX512 for integers
+template<> struct packet_traits<int>    : default_packet_traits
+{
+  typedef Packet16i type;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=8
+  };
+};
+*/
+
+template <>
+struct unpacket_traits<Packet16f> {
+  typedef float type;
+  typedef Packet8f half;
+  enum { size = 16, alignment=Aligned64 };
+};
+template <>
+struct unpacket_traits<Packet8d> {
+  typedef double type;
+  typedef Packet4d half;
+  enum { size = 8, alignment=Aligned64 };
+};
+template <>
+struct unpacket_traits<Packet16i> {
+  typedef int type;
+  typedef Packet8i half;
+  enum { size = 16, alignment=Aligned64 };
+};
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pset1<Packet16f>(const float& from) {
+  return _mm512_set1_ps(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pset1<Packet8d>(const double& from) {
+  return _mm512_set1_pd(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16i pset1<Packet16i>(const int& from) {
+  return _mm512_set1_epi32(from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pload1<Packet16f>(const float* from) {
+  return _mm512_broadcastss_ps(_mm_load_ps1(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pload1<Packet8d>(const double* from) {
+  return _mm512_broadcastsd_pd(_mm_load_pd1(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f plset<Packet16f>(const float& a) {
+  return _mm512_add_ps(
+      _mm512_set1_ps(a),
+      _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f,
+                    4.0f, 3.0f, 2.0f, 1.0f, 0.0f));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d plset<Packet8d>(const double& a) {
+  return _mm512_add_pd(_mm512_set1_pd(a),
+                       _mm512_set_pd(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f padd<Packet16f>(const Packet16f& a,
+                                              const Packet16f& b) {
+  return _mm512_add_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d padd<Packet8d>(const Packet8d& a,
+                                            const Packet8d& b) {
+  return _mm512_add_pd(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f psub<Packet16f>(const Packet16f& a,
+                                              const Packet16f& b) {
+  return _mm512_sub_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d psub<Packet8d>(const Packet8d& a,
+                                            const Packet8d& b) {
+  return _mm512_sub_pd(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pnegate(const Packet16f& a) {
+  return _mm512_sub_ps(_mm512_set1_ps(0.0), a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pnegate(const Packet8d& a) {
+  return _mm512_sub_pd(_mm512_set1_pd(0.0), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pconj(const Packet16f& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pconj(const Packet8d& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet16i pconj(const Packet16i& a) {
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pmul<Packet16f>(const Packet16f& a,
+                                              const Packet16f& b) {
+  return _mm512_mul_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pmul<Packet8d>(const Packet8d& a,
+                                            const Packet8d& b) {
+  return _mm512_mul_pd(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pdiv<Packet16f>(const Packet16f& a,
+                                              const Packet16f& b) {
+  return _mm512_div_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pdiv<Packet8d>(const Packet8d& a,
+                                            const Packet8d& b) {
+  return _mm512_div_pd(a, b);
+}
+
+#ifdef __FMA__
+template <>
+EIGEN_STRONG_INLINE Packet16f pmadd(const Packet16f& a, const Packet16f& b,
+                                    const Packet16f& c) {
+  return _mm512_fmadd_ps(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pmadd(const Packet8d& a, const Packet8d& b,
+                                   const Packet8d& c) {
+  return _mm512_fmadd_pd(a, b, c);
+}
+#endif
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pmin<Packet16f>(const Packet16f& a,
+                                              const Packet16f& b) {
+  return _mm512_min_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pmin<Packet8d>(const Packet8d& a,
+                                            const Packet8d& b) {
+  return _mm512_min_pd(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pmax<Packet16f>(const Packet16f& a,
+                                              const Packet16f& b) {
+  return _mm512_max_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pmax<Packet8d>(const Packet8d& a,
+                                            const Packet8d& b) {
+  return _mm512_max_pd(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pand<Packet16f>(const Packet16f& a,
+                                              const Packet16f& b) {
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+  return _mm512_and_ps(a, b);
+#else
+  Packet16f res = _mm512_undefined_ps();
+  Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
+  Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
+  res = _mm512_insertf32x4(res, _mm_and_ps(lane0_a, lane0_b), 0);
+
+  Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
+  Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
+  res = _mm512_insertf32x4(res, _mm_and_ps(lane1_a, lane1_b), 1);
+
+  Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
+  Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
+  res = _mm512_insertf32x4(res, _mm_and_ps(lane2_a, lane2_b), 2);
+
+  Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
+  Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
+  res = _mm512_insertf32x4(res, _mm_and_ps(lane3_a, lane3_b), 3);
+
+  return res;
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pand<Packet8d>(const Packet8d& a,
+                                            const Packet8d& b) {
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+  return _mm512_and_pd(a, b);
+#else
+  Packet8d res = _mm512_undefined_pd();
+  Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
+  Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
+  res = _mm512_insertf64x4(res, _mm256_and_pd(lane0_a, lane0_b), 0);
+
+  Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
+  Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
+  res = _mm512_insertf64x4(res, _mm256_and_pd(lane1_a, lane1_b), 1);
+
+  return res;
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f por<Packet16f>(const Packet16f& a,
+                                             const Packet16f& b) {
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+  return _mm512_or_ps(a, b);
+#else
+  Packet16f res = _mm512_undefined_ps();
+  Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
+  Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
+  res = _mm512_insertf32x4(res, _mm_or_ps(lane0_a, lane0_b), 0);
+
+  Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
+  Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
+  res = _mm512_insertf32x4(res, _mm_or_ps(lane1_a, lane1_b), 1);
+
+  Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
+  Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
+  res = _mm512_insertf32x4(res, _mm_or_ps(lane2_a, lane2_b), 2);
+
+  Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
+  Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
+  res = _mm512_insertf32x4(res, _mm_or_ps(lane3_a, lane3_b), 3);
+
+  return res;
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8d por<Packet8d>(const Packet8d& a,
+                                           const Packet8d& b) {
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+  return _mm512_or_pd(a, b);
+#else
+  Packet8d res = _mm512_undefined_pd();
+  Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
+  Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
+  res = _mm512_insertf64x4(res, _mm256_or_pd(lane0_a, lane0_b), 0);
+
+  Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
+  Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
+  res = _mm512_insertf64x4(res, _mm256_or_pd(lane1_a, lane1_b), 1);
+
+  return res;
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pxor<Packet16f>(const Packet16f& a,
+                                              const Packet16f& b) {
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+  return _mm512_xor_ps(a, b);
+#else
+  Packet16f res = _mm512_undefined_ps();
+  Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
+  Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
+  res = _mm512_insertf32x4(res, _mm_xor_ps(lane0_a, lane0_b), 0);
+
+  Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
+  Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
+  res = _mm512_insertf32x4(res, _mm_xor_ps(lane1_a, lane1_b), 1);
+
+  Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
+  Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
+  res = _mm512_insertf32x4(res, _mm_xor_ps(lane2_a, lane2_b), 2);
+
+  Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
+  Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
+  res = _mm512_insertf32x4(res, _mm_xor_ps(lane3_a, lane3_b), 3);
+
+  return res;
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pxor<Packet8d>(const Packet8d& a,
+                                            const Packet8d& b) {
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+  return _mm512_xor_pd(a, b);
+#else
+  Packet8d res = _mm512_undefined_pd();
+  Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
+  Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
+  res = _mm512_insertf64x4(res, _mm256_xor_pd(lane0_a, lane0_b), 0);
+
+  Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
+  Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
+  res = _mm512_insertf64x4(res, _mm256_xor_pd(lane1_a, lane1_b), 1);
+
+  return res;
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pandnot<Packet16f>(const Packet16f& a,
+                                                 const Packet16f& b) {
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+  return _mm512_andnot_ps(a, b);
+#else
+  Packet16f res = _mm512_undefined_ps();
+  Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
+  Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
+  res = _mm512_insertf32x4(res, _mm_andnot_ps(lane0_a, lane0_b), 0);
+
+  Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
+  Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
+  res = _mm512_insertf32x4(res, _mm_andnot_ps(lane1_a, lane1_b), 1);
+
+  Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
+  Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
+  res = _mm512_insertf32x4(res, _mm_andnot_ps(lane2_a, lane2_b), 2);
+
+  Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
+  Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
+  res = _mm512_insertf32x4(res, _mm_andnot_ps(lane3_a, lane3_b), 3);
+
+  return res;
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pandnot<Packet8d>(const Packet8d& a,
+                                               const Packet8d& b) {
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+  return _mm512_andnot_pd(a, b);
+#else
+  Packet8d res = _mm512_undefined_pd();
+  Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
+  Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
+  res = _mm512_insertf64x4(res, _mm256_andnot_pd(lane0_a, lane0_b), 0);
+
+  Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
+  Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
+  res = _mm512_insertf64x4(res, _mm256_andnot_pd(lane1_a, lane1_b), 1);
+
+  return res;
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pload<Packet16f>(const float* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_ps(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pload<Packet8d>(const double* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_pd(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16i pload<Packet16i>(const int* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512(
+      reinterpret_cast<const __m512i*>(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f ploadu<Packet16f>(const float* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_ps(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d ploadu<Packet8d>(const double* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_pd(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16i ploadu<Packet16i>(const int* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512(
+      reinterpret_cast<const __m512i*>(from));
+}
+
+// Loads 8 floats from memory a returns the packet
+// {a0, a0  a1, a1, a2, a2, a3, a3, a4, a4, a5, a5, a6, a6, a7, a7}
+template <>
+EIGEN_STRONG_INLINE Packet16f ploaddup<Packet16f>(const float* from) {
+  Packet8f lane0 = _mm256_broadcast_ps((const __m128*)(const void*)from);
+  // mimic an "inplace" permutation of the lower 128bits using a blend
+  lane0 = _mm256_blend_ps(
+      lane0, _mm256_castps128_ps256(_mm_permute_ps(
+                 _mm256_castps256_ps128(lane0), _MM_SHUFFLE(1, 0, 1, 0))),
+      15);
+  // then we can perform a consistent permutation on the global register to get
+  // everything in shape:
+  lane0 = _mm256_permute_ps(lane0, _MM_SHUFFLE(3, 3, 2, 2));
+
+  Packet8f lane1 = _mm256_broadcast_ps((const __m128*)(const void*)(from + 4));
+  // mimic an "inplace" permutation of the lower 128bits using a blend
+  lane1 = _mm256_blend_ps(
+      lane1, _mm256_castps128_ps256(_mm_permute_ps(
+                 _mm256_castps256_ps128(lane1), _MM_SHUFFLE(1, 0, 1, 0))),
+      15);
+  // then we can perform a consistent permutation on the global register to get
+  // everything in shape:
+  lane1 = _mm256_permute_ps(lane1, _MM_SHUFFLE(3, 3, 2, 2));
+
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+  Packet16f res = _mm512_undefined_ps();
+  return _mm512_insertf32x8(res, lane0, 0);
+  return _mm512_insertf32x8(res, lane1, 1);
+  return res;
+#else
+  Packet16f res = _mm512_undefined_ps();
+  res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane0, 0), 0);
+  res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane0, 1), 1);
+  res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane1, 0), 2);
+  res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane1, 1), 3);
+  return res;
+#endif
+}
+// Loads 4 doubles from memory a returns the packet {a0, a0  a1, a1, a2, a2, a3,
+// a3}
+template <>
+EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
+  Packet4d lane0 = _mm256_broadcast_pd((const __m128d*)(const void*)from);
+  lane0 = _mm256_permute_pd(lane0, 3 << 2);
+
+  Packet4d lane1 = _mm256_broadcast_pd((const __m128d*)(const void*)(from + 2));
+  lane1 = _mm256_permute_pd(lane1, 3 << 2);
+
+  Packet8d res = _mm512_undefined_pd();
+  res = _mm512_insertf64x4(res, lane0, 0);
+  return _mm512_insertf64x4(res, lane1, 1);
+}
+
+// Loads 4 floats from memory a returns the packet
+// {a0, a0  a0, a0, a1, a1, a1, a1, a2, a2, a2, a2, a3, a3, a3, a3}
+template <>
+EIGEN_STRONG_INLINE Packet16f ploadquad<Packet16f>(const float* from) {
+  Packet16f tmp = _mm512_undefined_ps();
+  tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from), 0);
+  tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 1), 1);
+  tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 2), 2);
+  tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 3), 3);
+  return tmp;
+}
+// Loads 2 doubles from memory a returns the packet
+// {a0, a0  a0, a0, a1, a1, a1, a1}
+template <>
+EIGEN_STRONG_INLINE Packet8d ploadquad<Packet8d>(const double* from) {
+  Packet8d tmp = _mm512_undefined_pd();
+  Packet2d tmp0 = _mm_load_pd1(from);
+  Packet2d tmp1 = _mm_load_pd1(from + 1);
+  Packet4d lane0 = _mm256_broadcastsd_pd(tmp0);
+  Packet4d lane1 = _mm256_broadcastsd_pd(tmp1);
+  tmp = _mm512_insertf64x4(tmp, lane0, 0);
+  return _mm512_insertf64x4(tmp, lane1, 1);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet16f& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm512_store_ps(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet8d& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm512_store_pd(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet16i& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm512_storeu_si512(reinterpret_cast<__m512i*>(to),
+                                                from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet16f& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_ps(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet8d& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_pd(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet16i& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512(
+      reinterpret_cast<__m512i*>(to), from);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from,
+                                                             Index stride) {
+  Packet16i stride_vector = _mm512_set1_epi32(stride);
+  Packet16i stride_multiplier =
+      _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
+
+  return _mm512_i32gather_ps(indices, from, 4);
+}
+template <>
+EIGEN_DEVICE_FUNC inline Packet8d pgather<double, Packet8d>(const double* from,
+                                                            Index stride) {
+  Packet8i stride_vector = _mm256_set1_epi32(stride);
+  Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+  Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
+
+  return _mm512_i32gather_pd(indices, from, 8);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<float, Packet16f>(float* to,
+                                                         const Packet16f& from,
+                                                         Index stride) {
+  Packet16i stride_vector = _mm512_set1_epi32(stride);
+  Packet16i stride_multiplier =
+      _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
+  _mm512_i32scatter_ps(to, indices, from, 4);
+}
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<double, Packet8d>(double* to,
+                                                         const Packet8d& from,
+                                                         Index stride) {
+  Packet8i stride_vector = _mm256_set1_epi32(stride);
+  Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+  Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
+  _mm512_i32scatter_pd(to, indices, from, 8);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore1<Packet16f>(float* to, const float& a) {
+  Packet16f pa = pset1<Packet16f>(a);
+  pstore(to, pa);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore1<Packet8d>(double* to, const double& a) {
+  Packet8d pa = pset1<Packet8d>(a);
+  pstore(to, pa);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore1<Packet16i>(int* to, const int& a) {
+  Packet16i pa = pset1<Packet16i>(a);
+  pstore(to, pa);
+}
+
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+
+template <>
+EIGEN_STRONG_INLINE float pfirst<Packet16f>(const Packet16f& a) {
+  return _mm_cvtss_f32(_mm512_extractf32x4_ps(a, 0));
+}
+template <>
+EIGEN_STRONG_INLINE double pfirst<Packet8d>(const Packet8d& a) {
+  return _mm_cvtsd_f64(_mm256_extractf128_pd(_mm512_extractf64x4_pd(a, 0), 0));
+}
+template <>
+EIGEN_STRONG_INLINE int pfirst<Packet16i>(const Packet16i& a) {
+  return _mm_extract_epi32(_mm512_extracti32x4_epi32(a, 0), 0);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16f preverse(const Packet16f& a)
+{
+  return _mm512_permutexvar_ps(_mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8d preverse(const Packet8d& a)
+{
+  return _mm512_permutexvar_pd(_mm512_set_epi32(0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7), a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a)
+{
+  // _mm512_abs_ps intrinsic not found, so hack around it
+  return (__m512)_mm512_and_si512((__m512i)a, _mm512_set1_epi32(0x7fffffff));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) {
+  // _mm512_abs_ps intrinsic not found, so hack around it
+  return (__m512d)_mm512_and_si512((__m512i)a,
+                                   _mm512_set1_epi64(0x7fffffffffffffff));
+}
+
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+// AVX512F does not define _mm512_extractf32x8_ps to extract _m256 from _m512
+#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT)                           \
+  __m256 OUTPUT##_0 = _mm512_extractf32x8_ps(INPUT, 0) __m256 OUTPUT##_1 = \
+      _mm512_extractf32x8_ps(INPUT, 1)
+#else
+#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT)                \
+  __m256 OUTPUT##_0 = _mm256_insertf128_ps(                     \
+      _mm256_castps128_ps256(_mm512_extractf32x4_ps(INPUT, 0)), \
+      _mm512_extractf32x4_ps(INPUT, 1), 1);                     \
+  __m256 OUTPUT##_1 = _mm256_insertf128_ps(                     \
+      _mm256_castps128_ps256(_mm512_extractf32x4_ps(INPUT, 2)), \
+      _mm512_extractf32x4_ps(INPUT, 3), 1);
+#endif
+
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+#define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \
+  OUTPUT = _mm512_insertf32x8(OUTPUT, INPUTA, 0);        \
+  OUTPUT = _mm512_insertf32x8(OUTPUT, INPUTB, 1);
+#else
+#define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB)                    \
+  OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 0), 0); \
+  OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 1), 1); \
+  OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 0), 2); \
+  OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 1), 3);
+#endif
+template<> EIGEN_STRONG_INLINE Packet16f preduxp<Packet16f>(const Packet16f*
+vecs)
+{
+  EIGEN_EXTRACT_8f_FROM_16f(vecs[0], vecs0);
+  EIGEN_EXTRACT_8f_FROM_16f(vecs[1], vecs1);
+  EIGEN_EXTRACT_8f_FROM_16f(vecs[2], vecs2);
+  EIGEN_EXTRACT_8f_FROM_16f(vecs[3], vecs3);
+  EIGEN_EXTRACT_8f_FROM_16f(vecs[4], vecs4);
+  EIGEN_EXTRACT_8f_FROM_16f(vecs[5], vecs5);
+  EIGEN_EXTRACT_8f_FROM_16f(vecs[6], vecs6);
+  EIGEN_EXTRACT_8f_FROM_16f(vecs[7], vecs7);
+  EIGEN_EXTRACT_8f_FROM_16f(vecs[8], vecs8);
+  EIGEN_EXTRACT_8f_FROM_16f(vecs[9], vecs9);
+  EIGEN_EXTRACT_8f_FROM_16f(vecs[10], vecs10);
+  EIGEN_EXTRACT_8f_FROM_16f(vecs[11], vecs11);
+  EIGEN_EXTRACT_8f_FROM_16f(vecs[12], vecs12);
+  EIGEN_EXTRACT_8f_FROM_16f(vecs[13], vecs13);
+  EIGEN_EXTRACT_8f_FROM_16f(vecs[14], vecs14);
+  EIGEN_EXTRACT_8f_FROM_16f(vecs[15], vecs15);
+
+  __m256 hsum1 = _mm256_hadd_ps(vecs0_0, vecs1_0);
+  __m256 hsum2 = _mm256_hadd_ps(vecs2_0, vecs3_0);
+  __m256 hsum3 = _mm256_hadd_ps(vecs4_0, vecs5_0);
+  __m256 hsum4 = _mm256_hadd_ps(vecs6_0, vecs7_0);
+
+  __m256 hsum5 = _mm256_hadd_ps(hsum1, hsum1);
+  __m256 hsum6 = _mm256_hadd_ps(hsum2, hsum2);
+  __m256 hsum7 = _mm256_hadd_ps(hsum3, hsum3);
+  __m256 hsum8 = _mm256_hadd_ps(hsum4, hsum4);
+
+  __m256 perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
+  __m256 perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
+  __m256 perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
+  __m256 perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23);
+
+  __m256 sum1 = _mm256_add_ps(perm1, hsum5);
+  __m256 sum2 = _mm256_add_ps(perm2, hsum6);
+  __m256 sum3 = _mm256_add_ps(perm3, hsum7);
+  __m256 sum4 = _mm256_add_ps(perm4, hsum8);
+
+  __m256 blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
+  __m256 blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);
+
+  __m256 final = _mm256_blend_ps(blend1, blend2, 0xf0);
+
+  hsum1 = _mm256_hadd_ps(vecs0_1, vecs1_1);
+  hsum2 = _mm256_hadd_ps(vecs2_1, vecs3_1);
+  hsum3 = _mm256_hadd_ps(vecs4_1, vecs5_1);
+  hsum4 = _mm256_hadd_ps(vecs6_1, vecs7_1);
+
+  hsum5 = _mm256_hadd_ps(hsum1, hsum1);
+  hsum6 = _mm256_hadd_ps(hsum2, hsum2);
+  hsum7 = _mm256_hadd_ps(hsum3, hsum3);
+  hsum8 = _mm256_hadd_ps(hsum4, hsum4);
+
+  perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
+  perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
+  perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
+  perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23);
+
+  sum1 = _mm256_add_ps(perm1, hsum5);
+  sum2 = _mm256_add_ps(perm2, hsum6);
+  sum3 = _mm256_add_ps(perm3, hsum7);
+  sum4 = _mm256_add_ps(perm4, hsum8);
+
+  blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
+  blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);
+
+  final = padd(final, _mm256_blend_ps(blend1, blend2, 0xf0));
+
+  hsum1 = _mm256_hadd_ps(vecs8_0, vecs9_0);
+  hsum2 = _mm256_hadd_ps(vecs10_0, vecs11_0);
+  hsum3 = _mm256_hadd_ps(vecs12_0, vecs13_0);
+  hsum4 = _mm256_hadd_ps(vecs14_0, vecs15_0);
+
+  hsum5 = _mm256_hadd_ps(hsum1, hsum1);
+  hsum6 = _mm256_hadd_ps(hsum2, hsum2);
+  hsum7 = _mm256_hadd_ps(hsum3, hsum3);
+  hsum8 = _mm256_hadd_ps(hsum4, hsum4);
+
+  perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
+  perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
+  perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
+  perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23);
+
+  sum1 = _mm256_add_ps(perm1, hsum5);
+  sum2 = _mm256_add_ps(perm2, hsum6);
+  sum3 = _mm256_add_ps(perm3, hsum7);
+  sum4 = _mm256_add_ps(perm4, hsum8);
+
+  blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
+  blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);
+
+  __m256 final_1 = _mm256_blend_ps(blend1, blend2, 0xf0);
+
+  hsum1 = _mm256_hadd_ps(vecs8_1, vecs9_1);
+  hsum2 = _mm256_hadd_ps(vecs10_1, vecs11_1);
+  hsum3 = _mm256_hadd_ps(vecs12_1, vecs13_1);
+  hsum4 = _mm256_hadd_ps(vecs14_1, vecs15_1);
+
+  hsum5 = _mm256_hadd_ps(hsum1, hsum1);
+  hsum6 = _mm256_hadd_ps(hsum2, hsum2);
+  hsum7 = _mm256_hadd_ps(hsum3, hsum3);
+  hsum8 = _mm256_hadd_ps(hsum4, hsum4);
+
+  perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
+  perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
+  perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
+  perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23);
+
+  sum1 = _mm256_add_ps(perm1, hsum5);
+  sum2 = _mm256_add_ps(perm2, hsum6);
+  sum3 = _mm256_add_ps(perm3, hsum7);
+  sum4 = _mm256_add_ps(perm4, hsum8);
+
+  blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
+  blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);
+
+  final_1 = padd(final_1, _mm256_blend_ps(blend1, blend2, 0xf0));
+
+  __m512 final_output;
+
+  EIGEN_INSERT_8f_INTO_16f(final_output, final, final_1);
+  return final_output;
+}
+
+template<> EIGEN_STRONG_INLINE Packet8d preduxp<Packet8d>(const Packet8d* vecs)
+{
+  Packet4d vecs0_0 = _mm512_extractf64x4_pd(vecs[0], 0);
+  Packet4d vecs0_1 = _mm512_extractf64x4_pd(vecs[0], 1);
+
+  Packet4d vecs1_0 = _mm512_extractf64x4_pd(vecs[1], 0);
+  Packet4d vecs1_1 = _mm512_extractf64x4_pd(vecs[1], 1);
+
+  Packet4d vecs2_0 = _mm512_extractf64x4_pd(vecs[2], 0);
+  Packet4d vecs2_1 = _mm512_extractf64x4_pd(vecs[2], 1);
+
+  Packet4d vecs3_0 = _mm512_extractf64x4_pd(vecs[3], 0);
+  Packet4d vecs3_1 = _mm512_extractf64x4_pd(vecs[3], 1);
+
+  Packet4d vecs4_0 = _mm512_extractf64x4_pd(vecs[4], 0);
+  Packet4d vecs4_1 = _mm512_extractf64x4_pd(vecs[4], 1);
+
+  Packet4d vecs5_0 = _mm512_extractf64x4_pd(vecs[5], 0);
+  Packet4d vecs5_1 = _mm512_extractf64x4_pd(vecs[5], 1);
+
+  Packet4d vecs6_0 = _mm512_extractf64x4_pd(vecs[6], 0);
+  Packet4d vecs6_1 = _mm512_extractf64x4_pd(vecs[6], 1);
+
+  Packet4d vecs7_0 = _mm512_extractf64x4_pd(vecs[7], 0);
+  Packet4d vecs7_1 = _mm512_extractf64x4_pd(vecs[7], 1);
+
+  Packet4d tmp0, tmp1;
+
+  tmp0 = _mm256_hadd_pd(vecs0_0, vecs1_0);
+  tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));
+
+  tmp1 = _mm256_hadd_pd(vecs2_0, vecs3_0);
+  tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));
+
+  __m256d final_0 = _mm256_blend_pd(tmp0, tmp1, 0xC);
+
+  tmp0 = _mm256_hadd_pd(vecs0_1, vecs1_1);
+  tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));
+
+  tmp1 = _mm256_hadd_pd(vecs2_1, vecs3_1);
+  tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));
+
+  final_0 = padd(final_0, _mm256_blend_pd(tmp0, tmp1, 0xC));
+
+  tmp0 = _mm256_hadd_pd(vecs4_0, vecs5_0);
+  tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));
+
+  tmp1 = _mm256_hadd_pd(vecs6_0, vecs7_0);
+  tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));
+
+  __m256d final_1 = _mm256_blend_pd(tmp0, tmp1, 0xC);
+
+  tmp0 = _mm256_hadd_pd(vecs4_1, vecs5_1);
+  tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));
+
+  tmp1 = _mm256_hadd_pd(vecs6_1, vecs7_1);
+  tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));
+
+  final_1 = padd(final_1, _mm256_blend_pd(tmp0, tmp1, 0xC));
+
+  __m512d final_output = _mm512_insertf64x4(final_output, final_0, 0);
+
+  return _mm512_insertf64x4(final_output, final_1, 1);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux<Packet16f>(const Packet16f& a) {
+  //#ifdef EIGEN_VECTORIZE_AVX512DQ
+#if 0
+  Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
+  Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
+  Packet8f sum = padd(lane0, lane1);
+  Packet8f tmp0 = _mm256_hadd_ps(sum, _mm256_permute2f128_ps(a, a, 1));
+  tmp0 = _mm256_hadd_ps(tmp0, tmp0);
+  return pfirst(_mm256_hadd_ps(tmp0, tmp0));
+#else
+  Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
+  Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
+  Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
+  Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
+  Packet4f sum = padd(padd(lane0, lane1), padd(lane2, lane3));
+  sum = _mm_hadd_ps(sum, sum);
+  sum = _mm_hadd_ps(sum, _mm_permute_ps(sum, 1));
+  return pfirst(sum);
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE double predux<Packet8d>(const Packet8d& a) {
+  Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
+  Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
+  Packet4d sum = padd(lane0, lane1);
+  Packet4d tmp0 = _mm256_hadd_pd(sum, _mm256_permute2f128_pd(sum, sum, 1));
+  return pfirst(_mm256_hadd_pd(tmp0, tmp0));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8f predux_downto4<Packet16f>(const Packet16f& a) {
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+  Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
+  Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
+  return padd(lane0, lane1);
+#else
+  Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
+  Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
+  Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
+  Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
+  Packet4f sum0 = padd(lane0, lane2);
+  Packet4f sum1 = padd(lane1, lane3);
+  return _mm256_insertf128_ps(_mm256_castps128_ps256(sum0), sum1, 1);
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d predux_downto4<Packet8d>(const Packet8d& a) {
+  Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
+  Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
+  Packet4d res = padd(lane0, lane1);
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_mul<Packet16f>(const Packet16f& a) {
+//#ifdef EIGEN_VECTORIZE_AVX512DQ
+#if 0
+  Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
+  Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
+  Packet8f res = pmul(lane0, lane1);
+  res = pmul(res, _mm256_permute2f128_ps(res, res, 1));
+  res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
+  return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
+#else
+  Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
+  Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
+  Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
+  Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
+  Packet4f res = pmul(pmul(lane0, lane1), pmul(lane2, lane3));
+  res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
+  return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE double predux_mul<Packet8d>(const Packet8d& a) {
+  Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
+  Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
+  Packet4d res = pmul(lane0, lane1);
+  res = pmul(res, _mm256_permute2f128_pd(res, res, 1));
+  return pfirst(pmul(res, _mm256_shuffle_pd(res, res, 1)));
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_min<Packet16f>(const Packet16f& a) {
+  Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
+  Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
+  Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
+  Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
+  Packet4f res = _mm_min_ps(_mm_min_ps(lane0, lane1), _mm_min_ps(lane2, lane3));
+  res = _mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
+  return pfirst(_mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
+}
+template <>
+EIGEN_STRONG_INLINE double predux_min<Packet8d>(const Packet8d& a) {
+  Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
+  Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
+  Packet4d res = _mm256_min_pd(lane0, lane1);
+  res = _mm256_min_pd(res, _mm256_permute2f128_pd(res, res, 1));
+  return pfirst(_mm256_min_pd(res, _mm256_shuffle_pd(res, res, 1)));
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_max<Packet16f>(const Packet16f& a) {
+  Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
+  Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
+  Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
+  Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
+  Packet4f res = _mm_max_ps(_mm_max_ps(lane0, lane1), _mm_max_ps(lane2, lane3));
+  res = _mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
+  return pfirst(_mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
+}
+template <>
+EIGEN_STRONG_INLINE double predux_max<Packet8d>(const Packet8d& a) {
+  Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
+  Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
+  Packet4d res = _mm256_max_pd(lane0, lane1);
+  res = _mm256_max_pd(res, _mm256_permute2f128_pd(res, res, 1));
+  return pfirst(_mm256_max_pd(res, _mm256_shuffle_pd(res, res, 1)));
+}
+
+template <int Offset>
+struct palign_impl<Offset, Packet16f> {
+  static EIGEN_STRONG_INLINE void run(Packet16f& first,
+                                      const Packet16f& second) {
+    if (Offset != 0) {
+      __m512i first_idx = _mm512_set_epi32(
+          Offset + 15, Offset + 14, Offset + 13, Offset + 12, Offset + 11,
+          Offset + 10, Offset + 9, Offset + 8, Offset + 7, Offset + 6,
+          Offset + 5, Offset + 4, Offset + 3, Offset + 2, Offset + 1, Offset);
+
+      __m512i second_idx =
+          _mm512_set_epi32(Offset - 1, Offset - 2, Offset - 3, Offset - 4,
+                           Offset - 5, Offset - 6, Offset - 7, Offset - 8,
+                           Offset - 9, Offset - 10, Offset - 11, Offset - 12,
+                           Offset - 13, Offset - 14, Offset - 15, Offset - 16);
+
+      unsigned short mask = 0xFFFF;
+      mask <<= (16 - Offset);
+
+      first = _mm512_permutexvar_ps(first_idx, first);
+      Packet16f tmp = _mm512_permutexvar_ps(second_idx, second);
+      first = _mm512_mask_blend_ps(mask, first, tmp);
+    }
+  }
+};
+template <int Offset>
+struct palign_impl<Offset, Packet8d> {
+  static EIGEN_STRONG_INLINE void run(Packet8d& first, const Packet8d& second) {
+    if (Offset != 0) {
+      __m512i first_idx = _mm512_set_epi32(
+          0, Offset + 7, 0, Offset + 6, 0, Offset + 5, 0, Offset + 4, 0,
+          Offset + 3, 0, Offset + 2, 0, Offset + 1, 0, Offset);
+
+      __m512i second_idx = _mm512_set_epi32(
+          0, Offset - 1, 0, Offset - 2, 0, Offset - 3, 0, Offset - 4, 0,
+          Offset - 5, 0, Offset - 6, 0, Offset - 7, 0, Offset - 8);
+
+      unsigned char mask = 0xFF;
+      mask <<= (8 - Offset);
+
+      first = _mm512_permutexvar_pd(first_idx, first);
+      Packet8d tmp = _mm512_permutexvar_pd(second_idx, second);
+      first = _mm512_mask_blend_pd(mask, first, tmp);
+    }
+  }
+};
+
+
+#define PACK_OUTPUT(OUTPUT, INPUT, INDEX, STRIDE) \
+  EIGEN_INSERT_8f_INTO_16f(OUTPUT[INDEX], INPUT[INDEX], INPUT[INDEX + STRIDE]);
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16f, 16>& kernel) {
+  __m512 T0 = _mm512_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
+  __m512 T1 = _mm512_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
+  __m512 T2 = _mm512_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
+  __m512 T3 = _mm512_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
+  __m512 T4 = _mm512_unpacklo_ps(kernel.packet[4], kernel.packet[5]);
+  __m512 T5 = _mm512_unpackhi_ps(kernel.packet[4], kernel.packet[5]);
+  __m512 T6 = _mm512_unpacklo_ps(kernel.packet[6], kernel.packet[7]);
+  __m512 T7 = _mm512_unpackhi_ps(kernel.packet[6], kernel.packet[7]);
+  __m512 T8 = _mm512_unpacklo_ps(kernel.packet[8], kernel.packet[9]);
+  __m512 T9 = _mm512_unpackhi_ps(kernel.packet[8], kernel.packet[9]);
+  __m512 T10 = _mm512_unpacklo_ps(kernel.packet[10], kernel.packet[11]);
+  __m512 T11 = _mm512_unpackhi_ps(kernel.packet[10], kernel.packet[11]);
+  __m512 T12 = _mm512_unpacklo_ps(kernel.packet[12], kernel.packet[13]);
+  __m512 T13 = _mm512_unpackhi_ps(kernel.packet[12], kernel.packet[13]);
+  __m512 T14 = _mm512_unpacklo_ps(kernel.packet[14], kernel.packet[15]);
+  __m512 T15 = _mm512_unpackhi_ps(kernel.packet[14], kernel.packet[15]);
+  __m512 S0 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
+  __m512 S1 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
+  __m512 S2 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
+  __m512 S3 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
+  __m512 S4 = _mm512_shuffle_ps(T4, T6, _MM_SHUFFLE(1, 0, 1, 0));
+  __m512 S5 = _mm512_shuffle_ps(T4, T6, _MM_SHUFFLE(3, 2, 3, 2));
+  __m512 S6 = _mm512_shuffle_ps(T5, T7, _MM_SHUFFLE(1, 0, 1, 0));
+  __m512 S7 = _mm512_shuffle_ps(T5, T7, _MM_SHUFFLE(3, 2, 3, 2));
+  __m512 S8 = _mm512_shuffle_ps(T8, T10, _MM_SHUFFLE(1, 0, 1, 0));
+  __m512 S9 = _mm512_shuffle_ps(T8, T10, _MM_SHUFFLE(3, 2, 3, 2));
+  __m512 S10 = _mm512_shuffle_ps(T9, T11, _MM_SHUFFLE(1, 0, 1, 0));
+  __m512 S11 = _mm512_shuffle_ps(T9, T11, _MM_SHUFFLE(3, 2, 3, 2));
+  __m512 S12 = _mm512_shuffle_ps(T12, T14, _MM_SHUFFLE(1, 0, 1, 0));
+  __m512 S13 = _mm512_shuffle_ps(T12, T14, _MM_SHUFFLE(3, 2, 3, 2));
+  __m512 S14 = _mm512_shuffle_ps(T13, T15, _MM_SHUFFLE(1, 0, 1, 0));
+  __m512 S15 = _mm512_shuffle_ps(T13, T15, _MM_SHUFFLE(3, 2, 3, 2));
+
+  EIGEN_EXTRACT_8f_FROM_16f(S0, S0);
+  EIGEN_EXTRACT_8f_FROM_16f(S1, S1);
+  EIGEN_EXTRACT_8f_FROM_16f(S2, S2);
+  EIGEN_EXTRACT_8f_FROM_16f(S3, S3);
+  EIGEN_EXTRACT_8f_FROM_16f(S4, S4);
+  EIGEN_EXTRACT_8f_FROM_16f(S5, S5);
+  EIGEN_EXTRACT_8f_FROM_16f(S6, S6);
+  EIGEN_EXTRACT_8f_FROM_16f(S7, S7);
+  EIGEN_EXTRACT_8f_FROM_16f(S8, S8);
+  EIGEN_EXTRACT_8f_FROM_16f(S9, S9);
+  EIGEN_EXTRACT_8f_FROM_16f(S10, S10);
+  EIGEN_EXTRACT_8f_FROM_16f(S11, S11);
+  EIGEN_EXTRACT_8f_FROM_16f(S12, S12);
+  EIGEN_EXTRACT_8f_FROM_16f(S13, S13);
+  EIGEN_EXTRACT_8f_FROM_16f(S14, S14);
+  EIGEN_EXTRACT_8f_FROM_16f(S15, S15);
+
+  PacketBlock<Packet8f, 32> tmp;
+
+  tmp.packet[0] = _mm256_permute2f128_ps(S0_0, S4_0, 0x20);
+  tmp.packet[1] = _mm256_permute2f128_ps(S1_0, S5_0, 0x20);
+  tmp.packet[2] = _mm256_permute2f128_ps(S2_0, S6_0, 0x20);
+  tmp.packet[3] = _mm256_permute2f128_ps(S3_0, S7_0, 0x20);
+  tmp.packet[4] = _mm256_permute2f128_ps(S0_0, S4_0, 0x31);
+  tmp.packet[5] = _mm256_permute2f128_ps(S1_0, S5_0, 0x31);
+  tmp.packet[6] = _mm256_permute2f128_ps(S2_0, S6_0, 0x31);
+  tmp.packet[7] = _mm256_permute2f128_ps(S3_0, S7_0, 0x31);
+
+  tmp.packet[8] = _mm256_permute2f128_ps(S0_1, S4_1, 0x20);
+  tmp.packet[9] = _mm256_permute2f128_ps(S1_1, S5_1, 0x20);
+  tmp.packet[10] = _mm256_permute2f128_ps(S2_1, S6_1, 0x20);
+  tmp.packet[11] = _mm256_permute2f128_ps(S3_1, S7_1, 0x20);
+  tmp.packet[12] = _mm256_permute2f128_ps(S0_1, S4_1, 0x31);
+  tmp.packet[13] = _mm256_permute2f128_ps(S1_1, S5_1, 0x31);
+  tmp.packet[14] = _mm256_permute2f128_ps(S2_1, S6_1, 0x31);
+  tmp.packet[15] = _mm256_permute2f128_ps(S3_1, S7_1, 0x31);
+
+  // Second set of _m256 outputs
+  tmp.packet[16] = _mm256_permute2f128_ps(S8_0, S12_0, 0x20);
+  tmp.packet[17] = _mm256_permute2f128_ps(S9_0, S13_0, 0x20);
+  tmp.packet[18] = _mm256_permute2f128_ps(S10_0, S14_0, 0x20);
+  tmp.packet[19] = _mm256_permute2f128_ps(S11_0, S15_0, 0x20);
+  tmp.packet[20] = _mm256_permute2f128_ps(S8_0, S12_0, 0x31);
+  tmp.packet[21] = _mm256_permute2f128_ps(S9_0, S13_0, 0x31);
+  tmp.packet[22] = _mm256_permute2f128_ps(S10_0, S14_0, 0x31);
+  tmp.packet[23] = _mm256_permute2f128_ps(S11_0, S15_0, 0x31);
+
+  tmp.packet[24] = _mm256_permute2f128_ps(S8_1, S12_1, 0x20);
+  tmp.packet[25] = _mm256_permute2f128_ps(S9_1, S13_1, 0x20);
+  tmp.packet[26] = _mm256_permute2f128_ps(S10_1, S14_1, 0x20);
+  tmp.packet[27] = _mm256_permute2f128_ps(S11_1, S15_1, 0x20);
+  tmp.packet[28] = _mm256_permute2f128_ps(S8_1, S12_1, 0x31);
+  tmp.packet[29] = _mm256_permute2f128_ps(S9_1, S13_1, 0x31);
+  tmp.packet[30] = _mm256_permute2f128_ps(S10_1, S14_1, 0x31);
+  tmp.packet[31] = _mm256_permute2f128_ps(S11_1, S15_1, 0x31);
+
+  // Pack them into the output
+  PACK_OUTPUT(kernel.packet, tmp.packet, 0, 16);
+  PACK_OUTPUT(kernel.packet, tmp.packet, 1, 16);
+  PACK_OUTPUT(kernel.packet, tmp.packet, 2, 16);
+  PACK_OUTPUT(kernel.packet, tmp.packet, 3, 16);
+
+  PACK_OUTPUT(kernel.packet, tmp.packet, 4, 16);
+  PACK_OUTPUT(kernel.packet, tmp.packet, 5, 16);
+  PACK_OUTPUT(kernel.packet, tmp.packet, 6, 16);
+  PACK_OUTPUT(kernel.packet, tmp.packet, 7, 16);
+
+  PACK_OUTPUT(kernel.packet, tmp.packet, 8, 16);
+  PACK_OUTPUT(kernel.packet, tmp.packet, 9, 16);
+  PACK_OUTPUT(kernel.packet, tmp.packet, 10, 16);
+  PACK_OUTPUT(kernel.packet, tmp.packet, 11, 16);
+
+  PACK_OUTPUT(kernel.packet, tmp.packet, 12, 16);
+  PACK_OUTPUT(kernel.packet, tmp.packet, 13, 16);
+  PACK_OUTPUT(kernel.packet, tmp.packet, 14, 16);
+  PACK_OUTPUT(kernel.packet, tmp.packet, 15, 16);
+}
+#define PACK_OUTPUT_2(OUTPUT, INPUT, INDEX, STRIDE)         \
+  EIGEN_INSERT_8f_INTO_16f(OUTPUT[INDEX], INPUT[2 * INDEX], \
+                           INPUT[2 * INDEX + STRIDE]);
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16f, 4>& kernel) {
+  __m512 T0 = _mm512_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
+  __m512 T1 = _mm512_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
+  __m512 T2 = _mm512_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
+  __m512 T3 = _mm512_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
+
+  __m512 S0 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
+  __m512 S1 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
+  __m512 S2 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
+  __m512 S3 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
+
+  EIGEN_EXTRACT_8f_FROM_16f(S0, S0);
+  EIGEN_EXTRACT_8f_FROM_16f(S1, S1);
+  EIGEN_EXTRACT_8f_FROM_16f(S2, S2);
+  EIGEN_EXTRACT_8f_FROM_16f(S3, S3);
+
+  PacketBlock<Packet8f, 8> tmp;
+
+  tmp.packet[0] = _mm256_permute2f128_ps(S0_0, S1_0, 0x20);
+  tmp.packet[1] = _mm256_permute2f128_ps(S2_0, S3_0, 0x20);
+  tmp.packet[2] = _mm256_permute2f128_ps(S0_0, S1_0, 0x31);
+  tmp.packet[3] = _mm256_permute2f128_ps(S2_0, S3_0, 0x31);
+
+  tmp.packet[4] = _mm256_permute2f128_ps(S0_1, S1_1, 0x20);
+  tmp.packet[5] = _mm256_permute2f128_ps(S2_1, S3_1, 0x20);
+  tmp.packet[6] = _mm256_permute2f128_ps(S0_1, S1_1, 0x31);
+  tmp.packet[7] = _mm256_permute2f128_ps(S2_1, S3_1, 0x31);
+
+  PACK_OUTPUT_2(kernel.packet, tmp.packet, 0, 1);
+  PACK_OUTPUT_2(kernel.packet, tmp.packet, 1, 1);
+  PACK_OUTPUT_2(kernel.packet, tmp.packet, 2, 1);
+  PACK_OUTPUT_2(kernel.packet, tmp.packet, 3, 1);
+}
+
+#define PACK_OUTPUT_SQ_D(OUTPUT, INPUT, INDEX, STRIDE)                \
+  OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[INDEX], 0); \
+  OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[INDEX + STRIDE], 1);
+
+#define PACK_OUTPUT_D(OUTPUT, INPUT, INDEX, STRIDE)                         \
+  OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[(2 * INDEX)], 0); \
+  OUTPUT[INDEX] =                                                           \
+      _mm512_insertf64x4(OUTPUT[INDEX], INPUT[(2 * INDEX) + STRIDE], 1);
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8d, 4>& kernel) {
+  __m512d T0 = _mm512_shuffle_pd(kernel.packet[0], kernel.packet[1], 0);
+  __m512d T1 = _mm512_shuffle_pd(kernel.packet[0], kernel.packet[1], 0xff);
+  __m512d T2 = _mm512_shuffle_pd(kernel.packet[2], kernel.packet[3], 0);
+  __m512d T3 = _mm512_shuffle_pd(kernel.packet[2], kernel.packet[3], 0xff);
+
+  PacketBlock<Packet4d, 8> tmp;
+
+  tmp.packet[0] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0),
+                                         _mm512_extractf64x4_pd(T2, 0), 0x20);
+  tmp.packet[1] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0),
+                                         _mm512_extractf64x4_pd(T3, 0), 0x20);
+  tmp.packet[2] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0),
+                                         _mm512_extractf64x4_pd(T2, 0), 0x31);
+  tmp.packet[3] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0),
+                                         _mm512_extractf64x4_pd(T3, 0), 0x31);
+
+  tmp.packet[4] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1),
+                                         _mm512_extractf64x4_pd(T2, 1), 0x20);
+  tmp.packet[5] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1),
+                                         _mm512_extractf64x4_pd(T3, 1), 0x20);
+  tmp.packet[6] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1),
+                                         _mm512_extractf64x4_pd(T2, 1), 0x31);
+  tmp.packet[7] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1),
+                                         _mm512_extractf64x4_pd(T3, 1), 0x31);
+
+  PACK_OUTPUT_D(kernel.packet, tmp.packet, 0, 1);
+  PACK_OUTPUT_D(kernel.packet, tmp.packet, 1, 1);
+  PACK_OUTPUT_D(kernel.packet, tmp.packet, 2, 1);
+  PACK_OUTPUT_D(kernel.packet, tmp.packet, 3, 1);
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8d, 8>& kernel) {
+  __m512d T0 = _mm512_unpacklo_pd(kernel.packet[0], kernel.packet[1]);
+  __m512d T1 = _mm512_unpackhi_pd(kernel.packet[0], kernel.packet[1]);
+  __m512d T2 = _mm512_unpacklo_pd(kernel.packet[2], kernel.packet[3]);
+  __m512d T3 = _mm512_unpackhi_pd(kernel.packet[2], kernel.packet[3]);
+  __m512d T4 = _mm512_unpacklo_pd(kernel.packet[4], kernel.packet[5]);
+  __m512d T5 = _mm512_unpackhi_pd(kernel.packet[4], kernel.packet[5]);
+  __m512d T6 = _mm512_unpacklo_pd(kernel.packet[6], kernel.packet[7]);
+  __m512d T7 = _mm512_unpackhi_pd(kernel.packet[6], kernel.packet[7]);
+
+  PacketBlock<Packet4d, 16> tmp;
+
+  tmp.packet[0] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0),
+                                         _mm512_extractf64x4_pd(T2, 0), 0x20);
+  tmp.packet[1] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0),
+                                         _mm512_extractf64x4_pd(T3, 0), 0x20);
+  tmp.packet[2] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0),
+                                         _mm512_extractf64x4_pd(T2, 0), 0x31);
+  tmp.packet[3] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0),
+                                         _mm512_extractf64x4_pd(T3, 0), 0x31);
+
+  tmp.packet[4] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1),
+                                         _mm512_extractf64x4_pd(T2, 1), 0x20);
+  tmp.packet[5] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1),
+                                         _mm512_extractf64x4_pd(T3, 1), 0x20);
+  tmp.packet[6] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1),
+                                         _mm512_extractf64x4_pd(T2, 1), 0x31);
+  tmp.packet[7] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1),
+                                         _mm512_extractf64x4_pd(T3, 1), 0x31);
+
+  tmp.packet[8] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 0),
+                                         _mm512_extractf64x4_pd(T6, 0), 0x20);
+  tmp.packet[9] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 0),
+                                         _mm512_extractf64x4_pd(T7, 0), 0x20);
+  tmp.packet[10] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 0),
+                                          _mm512_extractf64x4_pd(T6, 0), 0x31);
+  tmp.packet[11] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 0),
+                                          _mm512_extractf64x4_pd(T7, 0), 0x31);
+
+  tmp.packet[12] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 1),
+                                          _mm512_extractf64x4_pd(T6, 1), 0x20);
+  tmp.packet[13] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 1),
+                                          _mm512_extractf64x4_pd(T7, 1), 0x20);
+  tmp.packet[14] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 1),
+                                          _mm512_extractf64x4_pd(T6, 1), 0x31);
+  tmp.packet[15] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 1),
+                                          _mm512_extractf64x4_pd(T7, 1), 0x31);
+
+  PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 0, 8);
+  PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 1, 8);
+  PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 2, 8);
+  PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 3, 8);
+
+  PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 4, 8);
+  PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 5, 8);
+  PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 6, 8);
+  PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 7, 8);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pblend(const Selector<16>& /*ifPacket*/,
+                                     const Packet16f& /*thenPacket*/,
+                                     const Packet16f& /*elsePacket*/) {
+  assert(false && "To be implemented");
+  return Packet16f();
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& /*ifPacket*/,
+                                    const Packet8d& /*thenPacket*/,
+                                    const Packet8d& /*elsePacket*/) {
+  assert(false && "To be implemented");
+  return Packet8d();
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_PACKET_MATH_AVX512_H
diff --git a/Eigen/src/Core/arch/AltiVec/CMakeLists.txt b/Eigen/src/Core/arch/AltiVec/CMakeLists.txt
deleted file mode 100644
index 9f8d2e9c4..000000000
--- a/Eigen/src/Core/arch/AltiVec/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_Core_arch_AltiVec_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_Core_arch_AltiVec_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/AltiVec COMPONENT Devel
-)
diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h
index 68d9a2bff..67db2f8ee 100644
--- a/Eigen/src/Core/arch/AltiVec/Complex.h
+++ b/Eigen/src/Core/arch/AltiVec/Complex.h
@@ -2,30 +2,34 @@
 // for linear algebra.
 //
 // Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2010-2016 Konstantinos Margaritis <markos@freevec.org>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-#ifndef EIGEN_COMPLEX_ALTIVEC_H
-#define EIGEN_COMPLEX_ALTIVEC_H
+#ifndef EIGEN_COMPLEX32_ALTIVEC_H
+#define EIGEN_COMPLEX32_ALTIVEC_H
 
 namespace Eigen {
 
 namespace internal {
 
-static Packet4ui  p4ui_CONJ_XOR = vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_ZERO_);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
-static Packet16uc p16uc_COMPLEX_RE   = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
-static Packet16uc p16uc_COMPLEX_IM   = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
-static Packet16uc p16uc_COMPLEX_REV  = vec_sld(p16uc_REVERSE, p16uc_REVERSE, 8);//{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
-static Packet16uc p16uc_COMPLEX_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8);//{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
-static Packet16uc p16uc_PSET_HI = (Packet16uc) vec_mergeh((Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 1));//{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
-static Packet16uc p16uc_PSET_LO = (Packet16uc) vec_mergeh((Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 2), (Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 3));//{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
+static Packet4ui  p4ui_CONJ_XOR = vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
+#ifdef __VSX__
+#if defined(_BIG_ENDIAN)
+static Packet2ul  p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_MZERO, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
+static Packet2ul  p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO,  (Packet4ui) p2d_MZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
+#else
+static Packet2ul  p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO,  (Packet4ui) p2d_MZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
+static Packet2ul  p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2d_MZERO, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
+#endif
+#endif
 
 //---------- float ----------
 struct Packet2cf
 {
-  EIGEN_STRONG_INLINE Packet2cf() {}
+  EIGEN_STRONG_INLINE explicit Packet2cf() : v(p4f_ZERO) {}
   EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
   Packet4f  v;
 };
@@ -33,10 +37,12 @@ struct Packet2cf
 template<> struct packet_traits<std::complex<float> >  : default_packet_traits
 {
   typedef Packet2cf type;
+  typedef Packet2cf half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 2,
+    HasHalfPacket = 0,
 
     HasAdd    = 1,
     HasSub    = 1,
@@ -47,65 +53,78 @@ template<> struct packet_traits<std::complex<float> >  : default_packet_traits
     HasAbs2   = 0,
     HasMin    = 0,
     HasMax    = 0,
+#ifdef __VSX__
+    HasBlend  = 1,
+#endif
     HasSetLinear = 0
   };
 };
 
-template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; };
+template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; };
 
 template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
 {
   Packet2cf res;
-  /* On AltiVec we cannot load 64-bit registers, so wa have to take care of alignment */
-  if((ptrdiff_t(&from) % 16) == 0)
+  if((std::ptrdiff_t(&from) % 16) == 0)
     res.v = pload<Packet4f>((const float *)&from);
   else
     res.v = ploadu<Packet4f>((const float *)&from);
-  res.v = vec_perm(res.v, res.v, p16uc_PSET_HI);
+  res.v = vec_perm(res.v, res.v, p16uc_PSET64_HI);
   return res;
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_add(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_sub(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>*        from) { return Packet2cf(pload<Packet4f>((const float *) from)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>*       from) { return Packet2cf(ploadu<Packet4f>((const float*) from)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>*     from) { return pset1<Packet2cf>(*from); }
+
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { pstore((float*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { pstoreu((float*)to, from.v); }
+
+template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
+{
+  std::complex<float> EIGEN_ALIGN16 af[2];
+  af[0] = from[0*stride];
+  af[1] = from[1*stride];
+  return pload<Packet2cf>(af);
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
+{
+  std::complex<float> EIGEN_ALIGN16 af[2];
+  pstore<std::complex<float> >((std::complex<float> *) af, from);
+  to[0*stride] = af[0];
+  to[1*stride] = af[1];
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(a.v + b.v); }
+template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(a.v - b.v); }
 template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(a.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf((Packet4f)vec_xor((Packet4ui)a.v, p4ui_CONJ_XOR)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor<Packet4f>(a.v, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR))); }
 
 template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 {
   Packet4f v1, v2;
 
   // Permute and multiply the real parts of a and b
-  v1 = vec_perm(a.v, a.v, p16uc_COMPLEX_RE);
+  v1 = vec_perm(a.v, a.v, p16uc_PSET32_WODD);
   // Get the imaginary parts of a
-  v2 = vec_perm(a.v, a.v, p16uc_COMPLEX_IM);
+  v2 = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN);
   // multiply a_re * b 
   v1 = vec_madd(v1, b.v, p4f_ZERO);
   // multiply a_im * b and get the conjugate result
   v2 = vec_madd(v2, b.v, p4f_ZERO);
-  v2 = (Packet4f) vec_xor((Packet4ui)v2, p4ui_CONJ_XOR);
+  v2 = reinterpret_cast<Packet4f>(pxor(v2, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR)));
   // permute back to a proper order
-  v2 = vec_perm(v2, v2, p16uc_COMPLEX_REV);
+  v2 = vec_perm(v2, v2, p16uc_COMPLEX32_REV);
   
-  return Packet2cf(vec_add(v1, v2));
+  return Packet2cf(padd<Packet4f>(v1, v2));
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_and(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_or(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_xor(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_and(a.v, vec_nor(b.v,b.v))); }
+template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand<Packet4f>(a.v, b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por<Packet4f>(a.v, b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pxor<Packet4f>(a.v, b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pandnot<Packet4f>(a.v, b.v)); }
 
-template<> EIGEN_STRONG_INLINE Packet2cf pload <Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>((const float*)from)); }
-template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>((const float*)from)); }
-
-template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>*     from)
-{
-  return pset1<Packet2cf>(*from);
-}
-
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); }
-
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *   addr) { vec_dstt((float *)addr, DST_CTRL(2,2,32), DST_CHAN); }
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr)    { EIGEN_PPC_PREFETCH(addr); }
 
 template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Packet2cf& a)
 {
@@ -118,26 +137,30 @@ template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Pack
 template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
 {
   Packet4f rev_a;
-  rev_a = vec_perm(a.v, a.v, p16uc_COMPLEX_REV2);
+  rev_a = vec_perm(a.v, a.v, p16uc_COMPLEX32_REV2);
   return Packet2cf(rev_a);
 }
 
 template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
 {
   Packet4f b;
-  b = (Packet4f) vec_sld(a.v, a.v, 8);
-  b = padd(a.v, b);
-  return pfirst(Packet2cf(b));
+  b = vec_sld(a.v, a.v, 8);
+  b = padd<Packet4f>(a.v, b);
+  return pfirst<Packet2cf>(Packet2cf(b));
 }
 
 template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs)
 {
   Packet4f b1, b2;
-  
-  b1 = (Packet4f) vec_sld(vecs[0].v, vecs[1].v, 8);
-  b2 = (Packet4f) vec_sld(vecs[1].v, vecs[0].v, 8);
-  b2 = (Packet4f) vec_sld(b2, b2, 8);
-  b2 = padd(b1, b2);
+#ifdef _BIG_ENDIAN  
+  b1 = vec_sld(vecs[0].v, vecs[1].v, 8);
+  b2 = vec_sld(vecs[1].v, vecs[0].v, 8);
+#else
+  b1 = vec_sld(vecs[1].v, vecs[0].v, 8);
+  b2 = vec_sld(vecs[0].v, vecs[1].v, 8);
+#endif
+  b2 = vec_sld(b2, b2, 8);
+  b2 = padd<Packet4f>(b1, b2);
 
   return Packet2cf(b2);
 }
@@ -146,10 +169,10 @@ template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const P
 {
   Packet4f b;
   Packet2cf prod;
-  b = (Packet4f) vec_sld(a.v, a.v, 8);
-  prod = pmul(a, Packet2cf(b));
+  b = vec_sld(a.v, a.v, 8);
+  prod = pmul<Packet2cf>(a, Packet2cf(b));
 
-  return pfirst(prod);
+  return pfirst<Packet2cf>(prod);
 }
 
 template<int Offset>
@@ -159,7 +182,11 @@ struct palign_impl<Offset,Packet2cf>
   {
     if (Offset==1)
     {
+#ifdef _BIG_ENDIAN
       first.v = vec_sld(first.v, second.v, 8);
+#else
+      first.v = vec_sld(second.v, first.v, 8);
+#endif
     }
   }
 };
@@ -197,21 +224,238 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
   }
 };
 
+template<> struct conj_helper<Packet4f, Packet2cf, false,false>
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const
+  { return padd(c, pmul(x,y)); }
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const
+  { return Packet2cf(internal::pmul<Packet4f>(x, y.v)); }
+};
+
+template<> struct conj_helper<Packet2cf, Packet4f, false,false>
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const
+  { return padd(c, pmul(x,y)); }
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const
+  { return Packet2cf(internal::pmul<Packet4f>(x.v, y)); }
+};
+
 template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 {
   // TODO optimize it for AltiVec
-  Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a,b);
-  Packet4f s = vec_madd(b.v, b.v, p4f_ZERO);
-  return Packet2cf(pdiv(res.v, vec_add(s,vec_perm(s, s, p16uc_COMPLEX_REV))));
+  Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a, b);
+  Packet4f s = pmul<Packet4f>(b.v, b.v);
+  return Packet2cf(pdiv(res.v, padd<Packet4f>(s, vec_perm(s, s, p16uc_COMPLEX32_REV))));
 }
 
 template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& x)
 {
-  return Packet2cf(vec_perm(x.v, x.v, p16uc_COMPLEX_REV));
+  return Packet2cf(vec_perm(x.v, x.v, p16uc_COMPLEX32_REV));
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel)
+{
+  Packet4f tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI);
+  kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO);
+  kernel.packet[0].v = tmp;
+}
+
+#ifdef __VSX__
+template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
+  Packet2cf result;
+  result.v = reinterpret_cast<Packet4f>(pblend<Packet2d>(ifPacket, reinterpret_cast<Packet2d>(thenPacket.v), reinterpret_cast<Packet2d>(elsePacket.v)));
+  return result;
+}
+#endif
+
+//---------- double ----------
+#ifdef __VSX__
+struct Packet1cd
+{
+  EIGEN_STRONG_INLINE Packet1cd() {}
+  EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {}
+  Packet2d v;
+};
+
+template<> struct packet_traits<std::complex<double> >  : default_packet_traits
+{
+  typedef Packet1cd type;
+  typedef Packet1cd half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 0,
+    size = 1,
+    HasHalfPacket = 0,
+
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 1,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasSetLinear = 0
+  };
+};
+
+template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; };
+
+template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from) { return Packet1cd(pload<Packet2d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { return Packet1cd(ploadu<Packet2d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { pstore((double*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { pstoreu((double*)to, from.v); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>&  from)
+{ /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
+
+template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride)
+{
+  std::complex<double> EIGEN_ALIGN16 af[2];
+  af[0] = from[0*stride];
+  af[1] = from[1*stride];
+  return pload<Packet1cd>(af);
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index stride)
+{
+  std::complex<double> EIGEN_ALIGN16 af[2];
+  pstore<std::complex<double> >(af, from);
+  to[0*stride] = af[0];
+  to[1*stride] = af[1];
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); }
+template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v - b.v); }
+template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd(pxor(a.v, reinterpret_cast<Packet2d>(p2ul_CONJ_XOR2))); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{
+  Packet2d a_re, a_im, v1, v2;
+
+  // Permute and multiply the real parts of a and b
+  a_re = vec_perm(a.v, a.v, p16uc_PSET64_HI);
+  // Get the imaginary parts of a
+  a_im = vec_perm(a.v, a.v, p16uc_PSET64_LO);
+  // multiply a_re * b
+  v1 = vec_madd(a_re, b.v, p2d_ZERO);
+  // multiply a_im * b and get the conjugate result
+  v2 = vec_madd(a_im, b.v, p2d_ZERO);
+  v2 = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(v2), reinterpret_cast<Packet4ui>(v2), 8));
+  v2 = pxor(v2, reinterpret_cast<Packet2d>(p2ul_CONJ_XOR1));
+
+  return Packet1cd(padd<Packet2d>(v1, v2));
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd pand   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pand(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd por    <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(por(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pxor   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pxor(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pandnot(a.v, b.v)); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>*     from)  { return pset1<Packet1cd>(*from); }
+
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr)    { EIGEN_PPC_PREFETCH(addr); }
+
+template<> EIGEN_STRONG_INLINE std::complex<double>  pfirst<Packet1cd>(const Packet1cd& a)
+{
+  std::complex<double> EIGEN_ALIGN16 res[2];
+  pstore<std::complex<double> >(res, a);
+
+  return res[0];
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) { return pfirst(a); }
+template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs)        { return vecs[0]; }
+
+template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) { return pfirst(a); }
+
+template<int Offset>
+struct palign_impl<Offset,Packet1cd>
+{
+  static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/)
+  {
+    // FIXME is it sure we never have to align a Packet1cd?
+    // Even though a std::complex<double> has 16 bytes, it is not necessarily aligned on a 16 bytes boundary...
+  }
+};
+
+template<> struct conj_helper<Packet1cd, Packet1cd, false,true>
+{
+  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
+  {
+    return internal::pmul(a, pconj(b));
+  }
+};
+
+template<> struct conj_helper<Packet1cd, Packet1cd, true,false>
+{
+  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
+  {
+    return internal::pmul(pconj(a), b);
+  }
+};
+
+template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
+{
+  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
+  {
+    return pconj(internal::pmul(a, b));
+  }
+};
+template<> struct conj_helper<Packet2d, Packet1cd, false,false>
+{
+  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const
+  { return padd(c, pmul(x,y)); }
+
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const
+  { return Packet1cd(internal::pmul<Packet2d>(x, y.v)); }
+};
+
+template<> struct conj_helper<Packet1cd, Packet2d, false,false>
+{
+  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const
+  { return padd(c, pmul(x,y)); }
+
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const
+  { return Packet1cd(internal::pmul<Packet2d>(x.v, y)); }
+};
+
+template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{
+  // TODO optimize it for AltiVec
+  Packet1cd res = conj_helper<Packet1cd,Packet1cd,false,true>().pmul(a,b);
+  Packet2d s = pmul<Packet2d>(b.v, b.v);
+  return Packet1cd(pdiv(res.v, padd<Packet2d>(s, vec_perm(s, s, p16uc_REVERSE64))));
 }
 
+EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
+{
+  return Packet1cd(preverse(Packet2d(x.v)));
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd,2>& kernel)
+{
+  Packet2d tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI);
+  kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO);
+  kernel.packet[0].v = tmp;
+}
+#endif // __VSX__
 } // end namespace internal
 
 } // end namespace Eigen
 
-#endif // EIGEN_COMPLEX_ALTIVEC_H
+#endif // EIGEN_COMPLEX32_ALTIVEC_H
diff --git a/Eigen/src/Core/arch/AltiVec/MathFunctions.h b/Eigen/src/Core/arch/AltiVec/MathFunctions.h
new file mode 100644
index 000000000..c5e4bede7
--- /dev/null
+++ b/Eigen/src/Core/arch/AltiVec/MathFunctions.h
@@ -0,0 +1,322 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2007 Julien Pommier
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2016 Konstantinos Margaritis <markos@freevec.org>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/* The sin, cos, exp, and log functions of this file come from
+ * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
+ */
+
+#ifndef EIGEN_MATH_FUNCTIONS_ALTIVEC_H
+#define EIGEN_MATH_FUNCTIONS_ALTIVEC_H
+
+namespace Eigen {
+
+namespace internal {
+
+static _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
+static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+static _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
+static _EIGEN_DECLARE_CONST_Packet4i(23, 23);
+
+static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000);
+
+/* the smallest non denormalized float number */
+static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos,  0x00800000);
+static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf,     0xff800000); // -1.f/0.f
+static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_nan,     0xffffffff);
+  
+/* natural logarithm computed for 4 simultaneous float
+  return NaN for x <= 0
+*/
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
+
+static _EIGEN_DECLARE_CONST_Packet4f(exp_hi,  88.3762626647950f);
+static _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
+
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
+
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
+
+#ifdef __VSX__
+static _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
+static _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
+static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
+
+static _EIGEN_DECLARE_CONST_Packet2d(exp_hi,  709.437);
+static _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303);
+
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
+
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
+
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
+
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
+
+#ifdef __POWER8_VECTOR__
+static Packet2l p2l_1023 = { 1023, 1023 };
+static Packet2ul p2ul_52 = { 52, 52 };
+#endif
+
+#endif
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f plog<Packet4f>(const Packet4f& _x)
+{
+  Packet4f x = _x;
+
+  Packet4i emm0;
+
+  /* isvalid_mask is 0 if x < 0 or x is NaN. */
+  Packet4ui isvalid_mask = reinterpret_cast<Packet4ui>(vec_cmpge(x, p4f_ZERO));
+  Packet4ui iszero_mask = reinterpret_cast<Packet4ui>(vec_cmpeq(x, p4f_ZERO));
+
+  x = pmax(x, p4f_min_norm_pos);  /* cut off denormalized stuff */
+  emm0 = vec_sr(reinterpret_cast<Packet4i>(x),
+                reinterpret_cast<Packet4ui>(p4i_23));
+
+  /* keep only the fractional part */
+  x = pand(x, p4f_inv_mant_mask);
+  x = por(x, p4f_half);
+
+  emm0 = psub(emm0, p4i_0x7f);
+  Packet4f e = padd(vec_ctf(emm0, 0), p4f_1);
+
+  /* part2:
+     if( x < SQRTHF ) {
+       e -= 1;
+       x = x + x - 1.0;
+     } else { x = x - 1.0; }
+  */
+  Packet4f mask = reinterpret_cast<Packet4f>(vec_cmplt(x, p4f_cephes_SQRTHF));
+  Packet4f tmp = pand(x, mask);
+  x = psub(x, p4f_1);
+  e = psub(e, pand(p4f_1, mask));
+  x = padd(x, tmp);
+
+  Packet4f x2 = pmul(x,x);
+  Packet4f x3 = pmul(x2,x);
+
+  Packet4f y, y1, y2;
+  y  = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1);
+  y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4);
+  y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7);
+  y  = pmadd(y , x, p4f_cephes_log_p2);
+  y1 = pmadd(y1, x, p4f_cephes_log_p5);
+  y2 = pmadd(y2, x, p4f_cephes_log_p8);
+  y = pmadd(y, x3, y1);
+  y = pmadd(y, x3, y2);
+  y = pmul(y, x3);
+
+  y1 = pmul(e, p4f_cephes_log_q1);
+  tmp = pmul(x2, p4f_half);
+  y = padd(y, y1);
+  x = psub(x, tmp);
+  y2 = pmul(e, p4f_cephes_log_q2);
+  x = padd(x, y);
+  x = padd(x, y2);
+  // negative arg will be NAN, 0 will be -INF
+  x = vec_sel(x, p4f_minus_inf, iszero_mask);
+  x = vec_sel(p4f_minus_nan, x, isvalid_mask);
+  return x;
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f pexp<Packet4f>(const Packet4f& _x)
+{
+  Packet4f x = _x;
+
+  Packet4f tmp, fx;
+  Packet4i emm0;
+
+  // clamp x
+  x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo);
+
+  // express exp(x) as exp(g + n*log(2))
+  fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half);
+
+  fx = pfloor(fx);
+
+  tmp = pmul(fx, p4f_cephes_exp_C1);
+  Packet4f z = pmul(fx, p4f_cephes_exp_C2);
+  x = psub(x, tmp);
+  x = psub(x, z);
+
+  z = pmul(x,x);
+
+  Packet4f y = p4f_cephes_exp_p0;
+  y = pmadd(y, x, p4f_cephes_exp_p1);
+  y = pmadd(y, x, p4f_cephes_exp_p2);
+  y = pmadd(y, x, p4f_cephes_exp_p3);
+  y = pmadd(y, x, p4f_cephes_exp_p4);
+  y = pmadd(y, x, p4f_cephes_exp_p5);
+  y = pmadd(y, z, x);
+  y = padd(y, p4f_1);
+
+  // build 2^n
+  emm0 = vec_cts(fx, 0);
+  emm0 = vec_add(emm0, p4i_0x7f);
+  emm0 = vec_sl(emm0, reinterpret_cast<Packet4ui>(p4i_23));
+
+  // Altivec's max & min operators just drop silent NaNs. Check NaNs in 
+  // inputs and return them unmodified.
+  Packet4ui isnumber_mask = reinterpret_cast<Packet4ui>(vec_cmpeq(_x, _x));
+  return vec_sel(_x, pmax(pmul(y, reinterpret_cast<Packet4f>(emm0)), _x),
+                 isnumber_mask);
+}
+
+#ifndef EIGEN_COMP_CLANG
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f prsqrt<Packet4f>(const Packet4f& x)
+{
+  return  vec_rsqrt(x);
+}
+#endif
+
+#ifdef __VSX__
+#ifndef EIGEN_COMP_CLANG
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet2d prsqrt<Packet2d>(const Packet2d& x)
+{
+  return  vec_rsqrt(x);
+}
+#endif
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f psqrt<Packet4f>(const Packet4f& x)
+{
+  return  vec_sqrt(x);
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet2d psqrt<Packet2d>(const Packet2d& x)
+{
+  return  vec_sqrt(x);
+}
+
+// VSX support varies between different compilers and even different
+// versions of the same compiler.  For gcc version >= 4.9.3, we can use
+// vec_cts to efficiently convert Packet2d to Packet2l.  Otherwise, use
+// a slow version that works with older compilers. 
+// Update: apparently vec_cts/vec_ctf intrinsics for 64-bit doubles
+// are buggy, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70963
+static inline Packet2l ConvertToPacket2l(const Packet2d& x) {
+#if EIGEN_GNUC_AT_LEAST(5, 4) || \
+    (EIGEN_GNUC_AT(6, 1) && __GNUC_PATCHLEVEL__ >= 1)
+  return vec_cts(x, 0);    // TODO: check clang version.
+#else
+  double tmp[2];
+  memcpy(tmp, &x, sizeof(tmp));
+  Packet2l l = { static_cast<long long>(tmp[0]),
+                 static_cast<long long>(tmp[1]) };
+  return l;
+#endif
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet2d pexp<Packet2d>(const Packet2d& _x)
+{
+  Packet2d x = _x;
+
+  Packet2d tmp, fx;
+  Packet2l emm0;
+
+  // clamp x
+  x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo);
+
+  /* express exp(x) as exp(g + n*log(2)) */
+  fx = pmadd(x, p2d_cephes_LOG2EF, p2d_half);
+
+  fx = pfloor(fx);
+
+  tmp = pmul(fx, p2d_cephes_exp_C1);
+  Packet2d z = pmul(fx, p2d_cephes_exp_C2);
+  x = psub(x, tmp);
+  x = psub(x, z);
+
+  Packet2d x2 = pmul(x,x);
+
+  Packet2d px = p2d_cephes_exp_p0;
+  px = pmadd(px, x2, p2d_cephes_exp_p1);
+  px = pmadd(px, x2, p2d_cephes_exp_p2);
+  px = pmul (px, x);
+
+  Packet2d qx = p2d_cephes_exp_q0;
+  qx = pmadd(qx, x2, p2d_cephes_exp_q1);
+  qx = pmadd(qx, x2, p2d_cephes_exp_q2);
+  qx = pmadd(qx, x2, p2d_cephes_exp_q3);
+
+  x = pdiv(px,psub(qx,px));
+  x = pmadd(p2d_2,x,p2d_1);
+
+  // build 2^n
+  emm0 = ConvertToPacket2l(fx);
+
+#ifdef __POWER8_VECTOR__ 
+  emm0 = vec_add(emm0, p2l_1023);
+  emm0 = vec_sl(emm0, p2ul_52);
+#else
+  // Code is a bit complex for POWER7.  There is actually a
+  // vec_xxsldi intrinsic but it is not supported by some gcc versions.
+  // So we shift (52-32) bits and do a word swap with zeros.
+  _EIGEN_DECLARE_CONST_Packet4i(1023, 1023);
+  _EIGEN_DECLARE_CONST_Packet4i(20, 20);    // 52 - 32
+
+  Packet4i emm04i = reinterpret_cast<Packet4i>(emm0);
+  emm04i = vec_add(emm04i, p4i_1023);
+  emm04i = vec_sl(emm04i, reinterpret_cast<Packet4ui>(p4i_20));
+  static const Packet16uc perm = {
+    0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03, 
+    0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b };
+#ifdef  _BIG_ENDIAN
+  emm0 = reinterpret_cast<Packet2l>(vec_perm(p4i_ZERO, emm04i, perm));
+#else
+  emm0 = reinterpret_cast<Packet2l>(vec_perm(emm04i, p4i_ZERO, perm));
+#endif
+
+#endif
+
+  // Altivec's max & min operators just drop silent NaNs. Check NaNs in 
+  // inputs and return them unmodified.
+  Packet2ul isnumber_mask = reinterpret_cast<Packet2ul>(vec_cmpeq(_x, _x));
+  return vec_sel(_x, pmax(pmul(x, reinterpret_cast<Packet2d>(emm0)), _x),
+                 isnumber_mask);
+}
+#endif
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_MATH_FUNCTIONS_ALTIVEC_H
diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index e4089962d..b3f1ea199 100644..100755
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008 Konstantinos Margaritis <markos@codex.gr>
+// Copyright (C) 2008-2016 Konstantinos Margaritis <markos@freevec.org>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -18,13 +18,17 @@ namespace internal {
 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4
 #endif
 
-#ifndef EIGEN_HAS_FUSE_CJMADD
-#define EIGEN_HAS_FUSE_CJMADD 1
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#endif
+
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
 #endif
 
 // NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16
 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
-#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS  32
 #endif
 
 typedef __vector float          Packet4f;
@@ -38,7 +42,7 @@ typedef __vector unsigned char  Packet16uc;
 // and it doesn't really work to declare them global, so we define macros instead
 
 #define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \
-  Packet4f p4f_##NAME = (Packet4f) vec_splat_s32(X)
+  Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(vec_splat_s32(X))
 
 #define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \
   Packet4i p4i_##NAME = vec_splat_s32(X)
@@ -46,60 +50,158 @@ typedef __vector unsigned char  Packet16uc;
 #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
   Packet4f p4f_##NAME = pset1<Packet4f>(X)
 
-#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
-  Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int>(X))
-
 #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
   Packet4i p4i_##NAME = pset1<Packet4i>(X)
 
+#define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \
+  Packet2d p2d_##NAME = pset1<Packet2d>(X)
+
+#define _EIGEN_DECLARE_CONST_Packet2l(NAME,X) \
+  Packet2l p2l_##NAME = pset1<Packet2l>(X)
+
+#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
+  const Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(pset1<Packet4i>(X))
+
 #define DST_CHAN 1
 #define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride))
 
+
+// These constants are endian-agnostic
+static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0}
+static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
+static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1); //{ 1, 1, 1, 1}
+static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16); //{ -16, -16, -16, -16}
+static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1}
+static Packet4f p4f_MZERO = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000}
+#ifndef __VSX__
+static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0}
+#endif
+
+static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 };
+static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 };
+
+static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 };
+static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 };
+
+// Mask alignment
+#ifdef __PPC64__
+#define _EIGEN_MASK_ALIGNMENT	0xfffffffffffffff0
+#else
+#define _EIGEN_MASK_ALIGNMENT	0xfffffff0
+#endif
+
+#define _EIGEN_ALIGNED_PTR(x)	((std::ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT)
+
+// Handle endianness properly while loading constants
 // Define global static constants:
-static Packet4f p4f_COUNTDOWN = { 3.0, 2.0, 1.0, 0.0 };
-static Packet4i p4i_COUNTDOWN = { 3, 2, 1, 0 };
-static Packet16uc p16uc_REVERSE = {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3};
+#ifdef _BIG_ENDIAN
 static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0);
-static Packet16uc p16uc_DUPLICATE = {0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7};
-
-static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0);
-static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0);
-static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1);
-static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16);
-static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1);
-static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0);
-static Packet4f p4f_ZERO_ = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1);
+#ifdef __VSX__
+static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
+#endif
+static Packet16uc p16uc_PSET32_WODD   = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
+static Packet16uc p16uc_PSET32_WEVEN  = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
+static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8);      //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
+#else
+static Packet16uc p16uc_FORWARD = p16uc_REVERSE32; 
+static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
+static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
+static Packet16uc p16uc_PSET32_WEVEN = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
+static Packet16uc p16uc_HALF64_0_16 = vec_sld(vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 0), (Packet16uc)p4i_ZERO, 8);      //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
+#endif // _BIG_ENDIAN
+
+static Packet16uc p16uc_PSET64_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN);     //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
+static Packet16uc p16uc_PSET64_LO = (Packet16uc) vec_mergel((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN);     //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
+static Packet16uc p16uc_TRANSPOSE64_HI = p16uc_PSET64_HI + p16uc_HALF64_0_16;                                         //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
+static Packet16uc p16uc_TRANSPOSE64_LO = p16uc_PSET64_LO + p16uc_HALF64_0_16;                                         //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};
+
+static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8);                                         //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
+
+#ifdef _BIG_ENDIAN
+static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8);                                            //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
+#else
+static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_PSET64_HI, p16uc_PSET64_LO, 8);                                            //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
+#endif // _BIG_ENDIAN
+
+#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
+  #define EIGEN_PPC_PREFETCH(ADDR) __builtin_prefetch(ADDR);
+#else
+  #define EIGEN_PPC_PREFETCH(ADDR) asm( "   dcbt [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
+#endif
 
 template<> struct packet_traits<float>  : default_packet_traits
 {
   typedef Packet4f type;
+  typedef Packet4f half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size=4,
-
-    // FIXME check the Has*
+    HasHalfPacket = 1,
+
+    HasAdd  = 1,
+    HasSub  = 1,
+    HasMul  = 1,
+    HasDiv  = 1,
+    HasMin  = 1,
+    HasMax  = 1,
+    HasAbs  = 1,
     HasSin  = 0,
     HasCos  = 0,
     HasLog  = 0,
-    HasExp  = 0,
-    HasSqrt = 0
+    HasExp  = 1,
+#ifdef __VSX__
+    HasSqrt = 1,
+#if !EIGEN_COMP_CLANG
+    HasRsqrt = 1,
+#else
+    HasRsqrt = 0,
+#endif
+#else
+    HasSqrt = 0,
+    HasRsqrt = 0,
+#endif
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasNegate = 1,
+    HasBlend = 1
   };
 };
 template<> struct packet_traits<int>    : default_packet_traits
 {
   typedef Packet4i type;
+  typedef Packet4i half;
   enum {
-    // FIXME check the Has*
     Vectorizable = 1,
     AlignedOnScalar = 1,
-    size=4
+    size = 4,
+    HasHalfPacket = 0,
+
+    HasAdd  = 1,
+    HasSub  = 1,
+    HasMul  = 1,
+    HasDiv  = 0,
+    HasBlend = 1
   };
 };
 
-template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4}; };
-template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4}; };
-/*
+
+template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
+template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
+
+inline std::ostream & operator <<(std::ostream & s, const Packet16uc & v)
+{
+  union {
+    Packet16uc   v;
+    unsigned char n[16];
+  } vt;
+  vt.v = v;
+  for (int i=0; i< 16; i++)
+    s << (int)vt.n[i] << ", ";
+  return s;
+}
+
 inline std::ostream & operator <<(std::ostream & s, const Packet4f & v)
 {
   union {
@@ -133,89 +235,136 @@ inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v)
   return s;
 }
 
-inline std::ostream & operator <<(std::ostream & s, const Packetbi & v)
+// Need to define them first or we get specialization after instantiation errors
+template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
 {
-  union {
-    Packet4bi v;
-    unsigned int n[4];
-  } vt;
-  vt.v = v;
-  s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
-  return s;
-}
-*/
-template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) {
-  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
-  float EIGEN_ALIGN16 af[4];
-  af[0] = from;
-  Packet4f vc = vec_ld(0, af);
-  vc = vec_splat(vc, 0);
-  return vc;
+  EIGEN_DEBUG_ALIGNED_LOAD
+#ifdef __VSX__
+  return vec_vsx_ld(0, from);
+#else
+  return vec_ld(0, from);
+#endif
 }
 
-template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from)   {
-  int EIGEN_ALIGN16 ai[4];
-  ai[0] = from;
-  Packet4i vc = vec_ld(0, ai);
-  vc = vec_splat(vc, 0);
-  return vc;
+template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from)
+{
+  EIGEN_DEBUG_ALIGNED_LOAD
+#ifdef __VSX__
+  return vec_vsx_ld(0, from);
+#else
+  return vec_ld(0, from);
+#endif
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) { return vec_add(pset1<Packet4f>(a), p4f_COUNTDOWN); }
-template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a)     { return vec_add(pset1<Packet4i>(a), p4i_COUNTDOWN); }
-
-template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_add(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_add(a,b); }
+template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from)
+{
+  EIGEN_DEBUG_ALIGNED_STORE
+#ifdef __VSX__
+  vec_vsx_st(from, 0, to);
+#else
+  vec_st(from, 0, to);
+#endif
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_sub(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_sub(a,b); }
+template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from)
+{
+  EIGEN_DEBUG_ALIGNED_STORE
+#ifdef __VSX__
+  vec_vsx_st(from, 0, to);
+#else
+  vec_st(from, 0, to);
+#endif
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return psub<Packet4f>(p4f_ZERO, a); }
-template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return psub<Packet4i>(p4i_ZERO, a); }
+template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) {
+  Packet4f v = {from, from, from, from};
+  return v;
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from)   {
+  Packet4i v = {from, from, from, from};
+  return v;
+}
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet4f>(const float *a,
+                      Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
+{
+  a3 = pload<Packet4f>(a);
+  a0 = vec_splat(a3, 0);
+  a1 = vec_splat(a3, 1);
+  a2 = vec_splat(a3, 2);
+  a3 = vec_splat(a3, 3);
+}
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet4i>(const int *a,
+                      Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3)
+{
+  a3 = pload<Packet4i>(a);
+  a0 = vec_splat(a3, 0);
+  a1 = vec_splat(a3, 1);
+  a2 = vec_splat(a3, 2);
+  a3 = vec_splat(a3, 3);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_madd(a,b,p4f_ZERO); }
-/* Commented out: it's actually slower than processing it scalar
- *
-template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b)
+template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
+{
+  float EIGEN_ALIGN16 af[4];
+  af[0] = from[0*stride];
+  af[1] = from[1*stride];
+  af[2] = from[2*stride];
+  af[3] = from[3*stride];
+ return pload<Packet4f>(af);
+}
+template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
+{
+  int EIGEN_ALIGN16 ai[4];
+  ai[0] = from[0*stride];
+  ai[1] = from[1*stride];
+  ai[2] = from[2*stride];
+  ai[3] = from[3*stride];
+ return pload<Packet4i>(ai);
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
 {
-  // Detailed in: http://freevec.org/content/32bit_signed_integer_multiplication_altivec
-  //Set up constants, variables
-  Packet4i a1, b1, bswap, low_prod, high_prod, prod, prod_, v1sel;
+  float EIGEN_ALIGN16 af[4];
+  pstore<float>(af, from);
+  to[0*stride] = af[0];
+  to[1*stride] = af[1];
+  to[2*stride] = af[2];
+  to[3*stride] = af[3];
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
+{
+  int EIGEN_ALIGN16 ai[4];
+  pstore<int>((int *)ai, from);
+  to[0*stride] = ai[0];
+  to[1*stride] = ai[1];
+  to[2*stride] = ai[2];
+  to[3*stride] = ai[3];
+}
 
-  // Get the absolute values
-  a1  = vec_abs(a);
-  b1  = vec_abs(b);
+template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return pset1<Packet4f>(a) + p4f_COUNTDOWN; }
+template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a)   { return pset1<Packet4i>(a) + p4i_COUNTDOWN; }
 
-  // Get the signs using xor
-  Packet4bi sgn = (Packet4bi) vec_cmplt(vec_xor(a, b), p4i_ZERO);
+template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return a + b; }
+template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return a + b; }
 
-  // Do the multiplication for the asbolute values.
-  bswap = (Packet4i) vec_rl((Packet4ui) b1, (Packet4ui) p4i_MINUS16 );
-  low_prod = vec_mulo((Packet8i) a1, (Packet8i)b1);
-  high_prod = vec_msum((Packet8i) a1, (Packet8i) bswap, p4i_ZERO);
-  high_prod = (Packet4i) vec_sl((Packet4ui) high_prod, (Packet4ui) p4i_MINUS16);
-  prod = vec_add( low_prod, high_prod );
+template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return a - b; }
+template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return a - b; }
 
-  // NOR the product and select only the negative elements according to the sign mask
-  prod_ = vec_nor(prod, prod);
-  prod_ = vec_sel(p4i_ZERO, prod_, sgn);
+template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return p4f_ZERO - a; }
+template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return p4i_ZERO - a; }
 
-  // Add 1 to the result to get the negative numbers
-  v1sel = vec_sel(p4i_ZERO, p4i_ONE, sgn);
-  prod_ = vec_add(prod_, v1sel);
+template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
 
-  // Merge the results back to the final vector.
-  prod = vec_sel(prod, prod_, sgn);
+template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_madd(a,b, p4f_MZERO); }
+template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return a * b; }
 
-  return prod;
-}
-*/
 template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
 {
-  Packet4f t, y_0, y_1, res;
+#ifndef __VSX__  // VSX actually provides a div instruction
+  Packet4f t, y_0, y_1;
 
   // Altivec does not offer a divide instruction, we have to do a reciprocal approximation
   y_0 = vec_re(b);
@@ -224,8 +373,10 @@ template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const
   t   = vec_nmsub(y_0, b, p4f_ONE);
   y_1 = vec_madd(y_0, t, y_0);
 
-  res = vec_madd(a, y_1, p4f_ZERO);
-  return res;
+  return vec_madd(a, y_1, p4f_MZERO);
+#else
+  return vec_div(a, b);
+#endif
 }
 
 template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
@@ -234,8 +385,8 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co
 }
 
 // for some weird raisons, it has to be overloaded for packet of integers
-template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a, b, c); }
-template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
+template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a,b,c); }
+template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return a*b + c; }
 
 template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_min(a, b); }
 template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
@@ -243,7 +394,6 @@ template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const
 template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_max(a, b); }
 template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
 
-// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics
 template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
 template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); }
 
@@ -256,13 +406,14 @@ template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const
 template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); }
 template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); }
 
-template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); }
-template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); }
+template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) { return vec_round(a); }
+template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const  Packet4f& a) { return vec_ceil(a); }
+template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return vec_floor(a); }
 
+#ifdef _BIG_ENDIAN
 template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
 {
   EIGEN_DEBUG_ALIGNED_LOAD
-  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
   Packet16uc MSQ, LSQ;
   Packet16uc mask;
   MSQ = vec_ld(0, (unsigned char *)from);          // most significant quadword
@@ -282,25 +433,36 @@ template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
   mask = vec_lvsl(0, from);                        // create the permute mask
   return (Packet4i) vec_perm(MSQ, LSQ, mask);    // align the data
 }
+#else
+// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX
+template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
+{
+  EIGEN_DEBUG_UNALIGNED_LOAD
+  return (Packet4i) vec_vsx_ld((long)from & 15, (const int*) _EIGEN_ALIGNED_PTR(from));
+}
+template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
+{
+  EIGEN_DEBUG_UNALIGNED_LOAD
+  return (Packet4f) vec_vsx_ld((long)from & 15, (const float*) _EIGEN_ALIGNED_PTR(from));
+}
+#endif
 
 template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float*   from)
 {
   Packet4f p;
-  if((ptrdiff_t(&from) % 16) == 0)  p = pload<Packet4f>(from);
-  else                              p = ploadu<Packet4f>(from);
-  return vec_perm(p, p, p16uc_DUPLICATE);
+  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet4f>(from);
+  else                                  p = ploadu<Packet4f>(from);
+  return vec_perm(p, p, p16uc_DUPLICATE32_HI);
 }
 template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int*     from)
 {
   Packet4i p;
-  if((ptrdiff_t(&from) % 16) == 0)  p = pload<Packet4i>(from);
-  else                              p = ploadu<Packet4i>(from);
-  return vec_perm(p, p, p16uc_DUPLICATE);
+  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet4i>(from);
+  else                                  p = ploadu<Packet4i>(from);
+  return vec_perm(p, p, p16uc_DUPLICATE32_HI);
 }
 
-template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); }
-template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); }
-
+#ifdef _BIG_ENDIAN
 template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*  to, const Packet4f& from)
 {
   EIGEN_DEBUG_UNALIGNED_STORE
@@ -337,15 +499,33 @@ template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*      to, const Packet4i& f
   vec_st( LSQ, 15, (unsigned char *)to );                   // Store the LSQ part first
   vec_st( MSQ, 0, (unsigned char *)to );                    // Store the MSQ part
 }
+#else
+// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX
+template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*       to, const Packet4i& from)
+{
+  EIGEN_DEBUG_ALIGNED_STORE
+  vec_vsx_st(from, (long)to & 15, (int*) _EIGEN_ALIGNED_PTR(to));
+}
+template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*   to, const Packet4f& from)
+{
+  EIGEN_DEBUG_ALIGNED_STORE
+  vec_vsx_st(from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to));
+}
+#endif
 
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN); }
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*     addr) { vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN); }
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr)    { EIGEN_PPC_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*     addr)    { EIGEN_PPC_PREFETCH(addr); }
 
-template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; }
-template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int   EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; }
+template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; }
+template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int   EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; }
 
-template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { return (Packet4f)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE); }
-template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { return (Packet4i)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE); }
+template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
+{
+  return reinterpret_cast<Packet4f>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
+}
+template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
+{
+  return reinterpret_cast<Packet4i>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32)); }
 
 template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vec_abs(a); }
 template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); }
@@ -353,10 +533,10 @@ template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs
 template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
 {
   Packet4f b, sum;
-  b   = (Packet4f) vec_sld(a, a, 8);
-  sum = vec_add(a, b);
-  b   = (Packet4f) vec_sld(sum, sum, 4);
-  sum = vec_add(sum, b);
+  b   = vec_sld(a, a, 8);
+  sum = a + b;
+  b   = vec_sld(sum, sum, 4);
+  sum += b;
   return pfirst(sum);
 }
 
@@ -379,11 +559,11 @@ template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
 
   // Now do the summation:
   // Lines 0+1
-  sum[0] = vec_add(sum[0], sum[1]);
+  sum[0] = sum[0] + sum[1];
   // Lines 2+3
-  sum[1] = vec_add(sum[2], sum[3]);
+  sum[1] = sum[2] + sum[3];
   // Add the results
-  sum[0] = vec_add(sum[0], sum[1]);
+  sum[0] = sum[0] + sum[1];
 
   return sum[0];
 }
@@ -392,7 +572,11 @@ template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
 {
   Packet4i sum;
   sum = vec_sums(a, p4i_ZERO);
+#ifdef _BIG_ENDIAN
   sum = vec_sld(sum, p4i_ZERO, 12);
+#else
+  sum = vec_sld(p4i_ZERO, sum, 4);
+#endif
   return pfirst(sum);
 }
 
@@ -415,11 +599,11 @@ template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
 
   // Now do the summation:
   // Lines 0+1
-  sum[0] = vec_add(sum[0], sum[1]);
+  sum[0] = sum[0] + sum[1];
   // Lines 2+3
-  sum[1] = vec_add(sum[2], sum[3]);
+  sum[1] = sum[2] + sum[3];
   // Add the results
-  sum[0] = vec_add(sum[0], sum[1]);
+  sum[0] = sum[0] + sum[1];
 
   return sum[0];
 }
@@ -429,8 +613,8 @@ template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
 template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
 {
   Packet4f prod;
-  prod = pmul(a, (Packet4f)vec_sld(a, a, 8));
-  return pfirst(pmul(prod, (Packet4f)vec_sld(prod, prod, 4)));
+  prod = pmul(a, vec_sld(a, a, 8));
+  return pfirst(pmul(prod, vec_sld(prod, prod, 4)));
 }
 
 template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
@@ -479,8 +663,25 @@ struct palign_impl<Offset,Packet4f>
 {
   static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
   {
-    if (Offset!=0)
-      first = vec_sld(first, second, Offset*4);
+#ifdef _BIG_ENDIAN
+    switch (Offset % 4) {
+    case 1:
+      first = vec_sld(first, second, 4); break;
+    case 2:
+      first = vec_sld(first, second, 8); break;
+    case 3:
+      first = vec_sld(first, second, 12); break;
+    }
+#else
+    switch (Offset % 4) {
+    case 1:
+      first = vec_sld(second, first, 12); break;
+    case 2:
+      first = vec_sld(second, first, 8); break;
+    case 3:
+      first = vec_sld(second, first, 4); break;
+    }
+#endif
   }
 };
 
@@ -489,11 +690,342 @@ struct palign_impl<Offset,Packet4i>
 {
   static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
   {
-    if (Offset!=0)
-      first = vec_sld(first, second, Offset*4);
+#ifdef _BIG_ENDIAN
+    switch (Offset % 4) {
+    case 1:
+      first = vec_sld(first, second, 4); break;
+    case 2:
+      first = vec_sld(first, second, 8); break;
+    case 3:
+      first = vec_sld(first, second, 12); break;
+    }
+#else
+    switch (Offset % 4) {
+    case 1:
+      first = vec_sld(second, first, 12); break;
+    case 2:
+      first = vec_sld(second, first, 8); break;
+    case 3:
+      first = vec_sld(second, first, 4); break;
+    }
+#endif
   }
 };
 
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4f,4>& kernel) {
+  Packet4f t0, t1, t2, t3;
+  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
+  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
+  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
+  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
+  kernel.packet[0] = vec_mergeh(t0, t2);
+  kernel.packet[1] = vec_mergel(t0, t2);
+  kernel.packet[2] = vec_mergeh(t1, t3);
+  kernel.packet[3] = vec_mergel(t1, t3);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4i,4>& kernel) {
+  Packet4i t0, t1, t2, t3;
+  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
+  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
+  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
+  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
+  kernel.packet[0] = vec_mergeh(t0, t2);
+  kernel.packet[1] = vec_mergel(t0, t2);
+  kernel.packet[2] = vec_mergeh(t1, t3);
+  kernel.packet[3] = vec_mergel(t1, t3);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
+  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
+  Packet4ui mask = reinterpret_cast<Packet4ui>(vec_cmpeq(reinterpret_cast<Packet4ui>(select), reinterpret_cast<Packet4ui>(p4i_ONE)));
+  return vec_sel(elsePacket, thenPacket, mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
+  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
+  Packet4ui mask = reinterpret_cast<Packet4ui>(vec_cmpeq(reinterpret_cast<Packet4ui>(select), reinterpret_cast<Packet4ui>(p4i_ONE)));
+  return vec_sel(elsePacket, thenPacket, mask);
+}
+
+
+//---------- double ----------
+#ifdef __VSX__
+typedef __vector double              Packet2d;
+typedef __vector unsigned long long  Packet2ul;
+typedef __vector long long           Packet2l;
+#if EIGEN_COMP_CLANG
+typedef Packet2ul                    Packet2bl;
+#else
+typedef __vector __bool long         Packet2bl;
+#endif
+
+static Packet2l  p2l_ONE  = { 1, 1 };
+static Packet2l  p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO);
+static Packet2d  p2d_ONE  = { 1.0, 1.0 }; 
+static Packet2d  p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO);
+static Packet2d  p2d_MZERO = { -0.0, -0.0 };
+
+#ifdef _BIG_ENDIAN
+static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ZERO), reinterpret_cast<Packet4f>(p2d_ONE), 8));
+#else
+static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ONE), reinterpret_cast<Packet4f>(p2d_ZERO), 8));
+#endif
+
+template<int index> Packet2d vec_splat_dbl(Packet2d& a);
+
+template<> EIGEN_STRONG_INLINE Packet2d vec_splat_dbl<0>(Packet2d& a)
+{
+  return reinterpret_cast<Packet2d>(vec_perm(a, a, p16uc_PSET64_HI));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d vec_splat_dbl<1>(Packet2d& a)
+{
+  return reinterpret_cast<Packet2d>(vec_perm(a, a, p16uc_PSET64_LO));
+}
+
+template<> struct packet_traits<double> : default_packet_traits
+{
+  typedef Packet2d type;
+  typedef Packet2d half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=2,
+    HasHalfPacket = 1,
+
+    HasAdd  = 1,
+    HasSub  = 1,
+    HasMul  = 1,
+    HasDiv  = 1,
+    HasMin  = 1,
+    HasMax  = 1,
+    HasAbs  = 1,
+    HasSin  = 0,
+    HasCos  = 0,
+    HasLog  = 0,
+    HasExp  = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasNegate = 1,
+    HasBlend = 1
+  };
+};
+
+template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
+
+inline std::ostream & operator <<(std::ostream & s, const Packet2l & v)
+{
+  union {
+    Packet2l   v;
+    int64_t n[2];
+  } vt;
+  vt.v = v;
+  s << vt.n[0] << ", " << vt.n[1];
+  return s;
+}
+
+inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
+{
+  union {
+    Packet2d   v;
+    double n[2];
+  } vt;
+  vt.v = v;
+  s << vt.n[0] << ", " << vt.n[1];
+  return s;
+}
+
+// Need to define them first or we get specialization after instantiation errors
+template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
+{
+  EIGEN_DEBUG_ALIGNED_LOAD
+#ifdef __VSX__
+  return vec_vsx_ld(0, from);
+#else
+  return vec_ld(0, from);
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<double>(double*   to, const Packet2d& from)
+{
+  EIGEN_DEBUG_ALIGNED_STORE
+#ifdef __VSX__
+  vec_vsx_st(from, 0, to);
+#else
+  vec_st(from, 0, to);
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double&  from) {
+  Packet2d v = {from, from};
+  return v;
+}
+
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet2d>(const double *a,
+                      Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
+{
+  a1 = pload<Packet2d>(a);
+  a0 = vec_splat_dbl<0>(a1);
+  a1 = vec_splat_dbl<1>(a1);
+  a3 = pload<Packet2d>(a+2);
+  a2 = vec_splat_dbl<0>(a3);
+  a3 = vec_splat_dbl<1>(a3);
+}
+
+template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
+{
+  double EIGEN_ALIGN16 af[2];
+  af[0] = from[0*stride];
+  af[1] = from[1*stride];
+ return pload<Packet2d>(af);
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
+{
+  double EIGEN_ALIGN16 af[2];
+  pstore<double>(af, from);
+  to[0*stride] = af[0];
+  to[1*stride] = af[1];
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return pset1<Packet2d>(a) + p2d_COUNTDOWN; }
+
+template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return a + b; }
+
+template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return a - b; }
+
+template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return p2d_ZERO - a; }
+
+template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_madd(a,b,p2d_MZERO); }
+template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_div(a,b); }
+
+// for some weird raisons, it has to be overloaded for packet of integers
+template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return vec_round(a); }
+template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const  Packet2d& a) { return vec_ceil(a); }
+template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return vec_floor(a); }
+
+template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
+{
+  EIGEN_DEBUG_ALIGNED_LOAD
+  return (Packet2d) vec_vsx_ld((long)from & 15, (const double*) _EIGEN_ALIGNED_PTR(from));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double*   from)
+{
+  Packet2d p;
+  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet2d>(from);
+  else                                  p = ploadu<Packet2d>(from);
+  return vec_splat_dbl<0>(p);
+}
+
+template<> EIGEN_STRONG_INLINE void pstoreu<double>(double*  to, const Packet2d& from)
+{
+  EIGEN_DEBUG_ALIGNED_STORE
+  vec_vsx_st((Packet4f)from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to));
+}
+
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_PPC_PREFETCH(addr); }
+
+template<> EIGEN_STRONG_INLINE double  pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore<double>(x, a); return x[0]; }
+
+template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
+{
+  return reinterpret_cast<Packet2d>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE64));
+}
+template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); }
+
+template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
+{
+  Packet2d b, sum;
+  b   = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(a), reinterpret_cast<Packet4f>(a), 8));
+  sum = a + b;
+  return pfirst<Packet2d>(sum);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
+{
+  Packet2d v[2], sum;
+  v[0] = vecs[0] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[0]), reinterpret_cast<Packet4f>(vecs[0]), 8));
+  v[1] = vecs[1] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[1]), reinterpret_cast<Packet4f>(vecs[1]), 8));
+ 
+#ifdef _BIG_ENDIAN
+  sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(v[0]), reinterpret_cast<Packet4f>(v[1]), 8));
+#else
+  sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(v[1]), reinterpret_cast<Packet4f>(v[0]), 8));
+#endif
+
+  return sum;
+}
+// Other reduction functions:
+// mul
+template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
+{
+  return pfirst(pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
+}
+
+// min
+template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
+{
+  return pfirst(pmin(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
+}
+
+// max
+template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
+{
+  return pfirst(pmax(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
+}
+
+template<int Offset>
+struct palign_impl<Offset,Packet2d>
+{
+  static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
+  {
+    if (Offset == 1)
+#ifdef _BIG_ENDIAN
+      first = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(first), reinterpret_cast<Packet4ui>(second), 8));
+#else
+      first = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(second), reinterpret_cast<Packet4ui>(first), 8));
+#endif
+  }
+};
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet2d,2>& kernel) {
+  Packet2d t0, t1;
+  t0 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_HI);
+  t1 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_LO);
+  kernel.packet[0] = t0;
+  kernel.packet[1] = t1;
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
+  Packet2l select = { ifPacket.select[0], ifPacket.select[1] };
+  Packet2bl mask = vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE));
+  return vec_sel(elsePacket, thenPacket, mask);
+}
+#endif // __VSX__
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/Eigen/src/Core/arch/CMakeLists.txt b/Eigen/src/Core/arch/CMakeLists.txt
deleted file mode 100644
index 8456dec15..000000000
--- a/Eigen/src/Core/arch/CMakeLists.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-ADD_SUBDIRECTORY(SSE)
-ADD_SUBDIRECTORY(AltiVec)
-ADD_SUBDIRECTORY(NEON)
-ADD_SUBDIRECTORY(Default)
diff --git a/Eigen/src/Core/arch/CUDA/Complex.h b/Eigen/src/Core/arch/CUDA/Complex.h
new file mode 100644
index 000000000..9c2536509
--- /dev/null
+++ b/Eigen/src/Core/arch/CUDA/Complex.h
@@ -0,0 +1,103 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPLEX_CUDA_H
+#define EIGEN_COMPLEX_CUDA_H
+
+// clang-format off
+
+namespace Eigen {
+
+namespace internal {
+
+#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
+
+// Many std::complex methods such as operator+, operator-, operator* and
+// operator/ are not constexpr. Due to this, clang does not treat them as device
+// functions and thus Eigen functors making use of these operators fail to
+// compile. Here, we manually specialize these functors for complex types when
+// building for CUDA to avoid non-constexpr methods.
+
+// Sum
+template<typename T> struct scalar_sum_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
+  typedef typename std::complex<T> result_type;
+
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sum_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
+    return std::complex<T>(numext::real(a) + numext::real(b),
+                           numext::imag(a) + numext::imag(b));
+  }
+};
+
+template<typename T> struct scalar_sum_op<std::complex<T>, std::complex<T> > : scalar_sum_op<const std::complex<T>, const std::complex<T> > {};
+
+
+// Difference
+template<typename T> struct scalar_difference_op<const std::complex<T>, const std::complex<T> >  : binary_op_base<const std::complex<T>, const std::complex<T> > {
+  typedef typename std::complex<T> result_type;
+
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_difference_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
+    return std::complex<T>(numext::real(a) - numext::real(b),
+                           numext::imag(a) - numext::imag(b));
+  }
+};
+
+template<typename T> struct scalar_difference_op<std::complex<T>, std::complex<T> > : scalar_difference_op<const std::complex<T>, const std::complex<T> > {};
+
+
+// Product
+template<typename T> struct scalar_product_op<const std::complex<T>, const std::complex<T> >  : binary_op_base<const std::complex<T>, const std::complex<T> > {
+  enum {
+    Vectorizable = packet_traits<std::complex<T>>::HasMul
+  };
+  typedef typename std::complex<T> result_type;
+
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
+    const T a_real = numext::real(a);
+    const T a_imag = numext::imag(a);
+    const T b_real = numext::real(b);
+    const T b_imag = numext::imag(b);
+    return std::complex<T>(a_real * b_real - a_imag * b_imag,
+                           a_real * b_imag + a_imag * b_real);
+  }
+};
+
+template<typename T> struct scalar_product_op<std::complex<T>, std::complex<T> > : scalar_product_op<const std::complex<T>, const std::complex<T> > {};
+
+
+// Quotient
+template<typename T> struct scalar_quotient_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
+  enum {
+    Vectorizable = packet_traits<std::complex<T>>::HasDiv
+  };
+  typedef typename std::complex<T> result_type;
+
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_quotient_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
+    const T a_real = numext::real(a);
+    const T a_imag = numext::imag(a);
+    const T b_real = numext::real(b);
+    const T b_imag = numext::imag(b);
+    const T norm = T(1) / (b_real * b_real + b_imag * b_imag);
+    return std::complex<T>((a_real * b_real + a_imag * b_imag) * norm,
+                           (a_imag * b_real - a_real * b_imag) * norm);
+  }
+};
+
+template<typename T> struct scalar_quotient_op<std::complex<T>, std::complex<T> > : scalar_quotient_op<const std::complex<T>, const std::complex<T> > {};
+
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_COMPLEX_CUDA_H
diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h
new file mode 100644
index 000000000..52892db38
--- /dev/null
+++ b/Eigen/src/Core/arch/CUDA/Half.h
@@ -0,0 +1,585 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+//
+// The conversion routines are Copyright (c) Fabian Giesen, 2016.
+// The original license follows:
+//
+// Copyright (c) Fabian Giesen, 2016
+// All rights reserved.
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted.
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+// Standard 16-bit float type, mostly useful for GPUs. Defines a new
+// type Eigen::half (inheriting from CUDA's __half struct) with
+// operator overloads such that it behaves basically as an arithmetic
+// type. It will be quite slow on CPUs (so it is recommended to stay
+// in fp32 for CPUs, except for simple parameter conversions, I/O
+// to disk and the likes), but fast on GPUs.
+
+
+#ifndef EIGEN_HALF_CUDA_H
+#define EIGEN_HALF_CUDA_H
+
+#if __cplusplus > 199711L
+#define EIGEN_EXPLICIT_CAST(tgt_type) explicit operator tgt_type()
+#else
+#define EIGEN_EXPLICIT_CAST(tgt_type) operator tgt_type()
+#endif
+
+
+namespace Eigen {
+
+struct half;
+
+namespace half_impl {
+
+#if !defined(EIGEN_HAS_CUDA_FP16)
+
+// Make our own __half definition that is similar to CUDA's.
+struct __half {
+  EIGEN_DEVICE_FUNC __half() {}
+  explicit EIGEN_DEVICE_FUNC __half(unsigned short raw) : x(raw) {}
+  unsigned short x;
+};
+
+#endif
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h);
+
+struct half_base : public __half {
+  EIGEN_DEVICE_FUNC half_base() {}
+  EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half(h) {}
+  EIGEN_DEVICE_FUNC half_base(const __half& h) : __half(h) {}
+};
+
+} // namespace half_impl
+
+// Class definition.
+struct half : public half_impl::half_base {
+  #if !defined(EIGEN_HAS_CUDA_FP16)
+    typedef half_impl::__half __half;
+  #endif
+
+  EIGEN_DEVICE_FUNC half() {}
+
+  EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
+  EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {}
+
+  explicit EIGEN_DEVICE_FUNC half(bool b)
+      : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
+  template<class T>
+  explicit EIGEN_DEVICE_FUNC half(const T& val)
+      : half_impl::half_base(half_impl::float_to_half_rtne(static_cast<float>(val))) {}
+  explicit EIGEN_DEVICE_FUNC half(float f)
+      : half_impl::half_base(half_impl::float_to_half_rtne(f)) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(bool) const {
+    // +0.0 and -0.0 become false, everything else becomes true.
+    return (x & 0x7fff) != 0;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(signed char) const {
+    return static_cast<signed char>(half_impl::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned char) const {
+    return static_cast<unsigned char>(half_impl::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(short) const {
+    return static_cast<short>(half_impl::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned short) const {
+    return static_cast<unsigned short>(half_impl::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(int) const {
+    return static_cast<int>(half_impl::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned int) const {
+    return static_cast<unsigned int>(half_impl::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long) const {
+    return static_cast<long>(half_impl::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long) const {
+    return static_cast<unsigned long>(half_impl::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long long) const {
+    return static_cast<long long>(half_impl::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long long) const {
+    return static_cast<unsigned long long>(half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(float) const {
+    return half_impl::half_to_float(*this);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(double) const {
+    return static_cast<double>(half_impl::half_to_float(*this));
+  }
+
+  EIGEN_DEVICE_FUNC half& operator=(const half& other) {
+    x = other.x;
+    return *this;
+  }
+};
+
+namespace half_impl {
+
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+
+// Intrinsics for native fp16 support. Note that on current hardware,
+// these are no faster than fp32 arithmetic (you need to use the half2
+// versions to get the ALU speed increased), but you do save the
+// conversion steps back and forth.
+
+__device__ half operator + (const half& a, const half& b) {
+  return __hadd(a, b);
+}
+__device__ half operator * (const half& a, const half& b) {
+  return __hmul(a, b);
+}
+__device__ half operator - (const half& a, const half& b) {
+  return __hsub(a, b);
+}
+__device__ half operator / (const half& a, const half& b) {
+  float num = __half2float(a);
+  float denom = __half2float(b);
+  return __float2half(num / denom);
+}
+__device__ half operator - (const half& a) {
+  return __hneg(a);
+}
+__device__ half& operator += (half& a, const half& b) {
+  a = a + b;
+  return a;
+}
+__device__ half& operator *= (half& a, const half& b) {
+  a = a * b;
+  return a;
+}
+__device__ half& operator -= (half& a, const half& b) {
+  a = a - b;
+  return a;
+}
+__device__ half& operator /= (half& a, const half& b) {
+  a = a / b;
+  return a;
+}
+__device__ bool operator == (const half& a, const half& b) {
+  return __heq(a, b);
+}
+__device__ bool operator != (const half& a, const half& b) {
+  return __hne(a, b);
+}
+__device__ bool operator < (const half& a, const half& b) {
+  return __hlt(a, b);
+}
+__device__ bool operator <= (const half& a, const half& b) {
+  return __hle(a, b);
+}
+__device__ bool operator > (const half& a, const half& b) {
+  return __hgt(a, b);
+}
+__device__ bool operator >= (const half& a, const half& b) {
+  return __hge(a, b);
+}
+
+#else  // Emulate support for half floats
+
+// Definitions for CPUs and older CUDA, mostly working through conversion
+// to/from fp32.
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) {
+  return half(float(a) + float(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator * (const half& a, const half& b) {
+  return half(float(a) * float(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a, const half& b) {
+  return half(float(a) - float(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, const half& b) {
+  return half(float(a) / float(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a) {
+  half result;
+  result.x = a.x ^ 0x8000;
+  return result;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator += (half& a, const half& b) {
+  a = half(float(a) + float(b));
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator *= (half& a, const half& b) {
+  a = half(float(a) * float(b));
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator -= (half& a, const half& b) {
+  a = half(float(a) - float(b));
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) {
+  a = half(float(a) / float(b));
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) {
+  return float(a) == float(b);
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) {
+  return float(a) != float(b);
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) {
+  return float(a) < float(b);
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const half& a, const half& b) {
+  return float(a) <= float(b);
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) {
+  return float(a) > float(b);
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, const half& b) {
+  return float(a) >= float(b);
+}
+
+#endif  // Emulate support for half floats
+
+// Division by an index. Do it in full float precision to avoid accuracy
+// issues in converting the denominator to half.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) {
+  return half(static_cast<float>(a) / static_cast<float>(b));
+}
+
+// Conversion routines, including fallbacks for the host or older CUDA.
+// Note that newer Intel CPUs (Haswell or newer) have vectorized versions of
+// these in hardware. If we need more performance on older/other CPUs, they are
+// also possible to vectorize directly.
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x) {
+  __half h;
+  h.x = x;
+  return h;
+}
+
+union FP32 {
+  unsigned int u;
+  float f;
+};
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+  return __float2half(ff);
+
+#elif defined(EIGEN_HAS_FP16_C)
+  __half h;
+  h.x = _cvtss_sh(ff, 0);
+  return h;
+
+#else
+  FP32 f; f.f = ff;
+
+  const FP32 f32infty = { 255 << 23 };
+  const FP32 f16max = { (127 + 16) << 23 };
+  const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
+  unsigned int sign_mask = 0x80000000u;
+  __half o;
+  o.x = static_cast<unsigned short>(0x0u);
+
+  unsigned int sign = f.u & sign_mask;
+  f.u ^= sign;
+
+  // NOTE all the integer compares in this function can be safely
+  // compiled into signed compares since all operands are below
+  // 0x80000000. Important if you want fast straight SSE2 code
+  // (since there's no unsigned PCMPGTD).
+
+  if (f.u >= f16max.u) {  // result is Inf or NaN (all exponent bits set)
+    o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf
+  } else {  // (De)normalized number or zero
+    if (f.u < (113 << 23)) {  // resulting FP16 is subnormal or zero
+      // use a magic value to align our 10 mantissa bits at the bottom of
+      // the float. as long as FP addition is round-to-nearest-even this
+      // just works.
+      f.f += denorm_magic.f;
+
+      // and one integer subtract of the bias later, we have our final float!
+      o.x = static_cast<unsigned short>(f.u - denorm_magic.u);
+    } else {
+      unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd
+
+      // update exponent, rounding bias part 1
+      f.u += ((unsigned int)(15 - 127) << 23) + 0xfff;
+      // rounding bias part 2
+      f.u += mant_odd;
+      // take the bits!
+      o.x = static_cast<unsigned short>(f.u >> 13);
+    }
+  }
+
+  o.x |= static_cast<unsigned short>(sign >> 16);
+  return o;
+#endif
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h) {
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+  return __half2float(h);
+
+#elif defined(EIGEN_HAS_FP16_C)
+  return _cvtsh_ss(h.x);
+
+#else
+  const FP32 magic = { 113 << 23 };
+  const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift
+  FP32 o;
+
+  o.u = (h.x & 0x7fff) << 13;             // exponent/mantissa bits
+  unsigned int exp = shifted_exp & o.u;   // just the exponent
+  o.u += (127 - 15) << 23;                // exponent adjust
+
+  // handle exponent special cases
+  if (exp == shifted_exp) {     // Inf/NaN?
+    o.u += (128 - 16) << 23;    // extra exp adjust
+  } else if (exp == 0) {        // Zero/Denormal?
+    o.u += 1 << 23;             // extra exp adjust
+    o.f -= magic.f;             // renormalize
+  }
+
+  o.u |= (h.x & 0x8000) << 16;    // sign bit
+  return o.f;
+#endif
+}
+
+// --- standard functions ---
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) {
+  return (a.x & 0x7fff) == 0x7c00;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) {
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hisnan(a);
+#else
+  return (a.x & 0x7fff) > 0x7c00;
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isfinite)(const half& a) {
+  return !(isinf EIGEN_NOT_A_MACRO (a)) && !(isnan EIGEN_NOT_A_MACRO (a));
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
+  half result;
+  result.x = a.x & 0x7FFF;
+  return result;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
+  return half(::expf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
+#if defined(EIGEN_HAS_CUDA_FP16) && defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return Eigen::half(::hlog(a));
+#else
+  return half(::logf(float(a)));
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log1p(const half& a) {
+  return half(numext::log1p(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) {
+  return half(::log10f(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
+  return half(::sqrtf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half pow(const half& a, const half& b) {
+  return half(::powf(float(a), float(b)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sin(const half& a) {
+  return half(::sinf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half cos(const half& a) {
+  return half(::cosf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tan(const half& a) {
+  return half(::tanf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) {
+  return half(::tanhf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
+  return half(::floorf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
+  return half(::ceilf(float(a)));
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hlt(b, a) ? b : a;
+#else
+  const float f1 = static_cast<float>(a);
+  const float f2 = static_cast<float>(b);
+  return f2 < f1 ? b : a;
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) {
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hlt(a, b) ? b : a;
+#else
+  const float f1 = static_cast<float>(a);
+  const float f2 = static_cast<float>(b);
+  return f1 < f2 ? b : a;
+#endif
+}
+
+EIGEN_ALWAYS_INLINE std::ostream& operator << (std::ostream& os, const half& v) {
+  os << static_cast<float>(v);
+  return os;
+}
+
+} // end namespace half_impl
+
+// import Eigen::half_impl::half into Eigen namespace
+// using half_impl::half;
+
+namespace internal {
+
+template<>
+struct random_default_impl<half, false, false>
+{
+  static inline half run(const half& x, const half& y)
+  {
+    return x + (y-x) * half(float(std::rand()) / float(RAND_MAX));
+  }
+  static inline half run()
+  {
+    return run(half(-1.f), half(1.f));
+  }
+};
+
+template<> struct is_arithmetic<half> { enum { value = true }; };
+
+} // end namespace internal
+
+template<> struct NumTraits<Eigen::half>
+    : GenericNumTraits<Eigen::half>
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half epsilon() {
+    return half_impl::raw_uint16_to_half(0x0800);
+  }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half dummy_precision() { return Eigen::half(1e-2f); }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half highest() {
+    return half_impl::raw_uint16_to_half(0x7bff);
+  }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half lowest() {
+    return half_impl::raw_uint16_to_half(0xfbff);
+  }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half infinity() {
+    return half_impl::raw_uint16_to_half(0x7c00);
+  }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half quiet_NaN() {
+    return half_impl::raw_uint16_to_half(0x7c01);
+  }
+};
+
+} // end namespace Eigen
+
+// C-like standard mathematical functions and trancendentals.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half fabsh(const Eigen::half& a) {
+  Eigen::half result;
+  result.x = a.x & 0x7FFF;
+  return result;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) {
+  return Eigen::half(::expf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) {
+#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return Eigen::half(::hlog(a));
+#else
+  return Eigen::half(::logf(float(a)));
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sqrth(const Eigen::half& a) {
+  return Eigen::half(::sqrtf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half powh(const Eigen::half& a, const Eigen::half& b) {
+  return Eigen::half(::powf(float(a), float(b)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half floorh(const Eigen::half& a) {
+  return Eigen::half(::floorf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ceilh(const Eigen::half& a) {
+  return Eigen::half(::ceilf(float(a)));
+}
+
+namespace std {
+
+#if __cplusplus > 199711L
+template <>
+struct hash<Eigen::half> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t operator()(const Eigen::half& a) const {
+    return static_cast<std::size_t>(a.x);
+  }
+};
+#endif
+
+} // end namespace std
+
+
+// Add the missing shfl_xor intrinsic
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
+  return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
+}
+#endif
+
+// ldg() has an overload for __half, but we also need one for Eigen::half.
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) {
+  return Eigen::half_impl::raw_uint16_to_half(
+      __ldg(reinterpret_cast<const unsigned short*>(ptr)));
+}
+#endif
+
+
+#if defined(__CUDA_ARCH__)
+namespace Eigen {
+namespace numext {
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+bool (isnan)(const Eigen::half& h) {
+  return (half_impl::isnan)(h);
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+bool (isinf)(const Eigen::half& h) {
+  return (half_impl::isinf)(h);
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+bool (isfinite)(const Eigen::half& h) {
+  return (half_impl::isfinite)(h);
+}
+
+} // namespace Eigen
+}  // namespace numext
+#endif
+
+#endif // EIGEN_HALF_CUDA_H
diff --git a/Eigen/src/Core/arch/CUDA/MathFunctions.h b/Eigen/src/Core/arch/CUDA/MathFunctions.h
new file mode 100644
index 000000000..0348b41db
--- /dev/null
+++ b/Eigen/src/Core/arch/CUDA/MathFunctions.h
@@ -0,0 +1,91 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATH_FUNCTIONS_CUDA_H
+#define EIGEN_MATH_FUNCTIONS_CUDA_H
+
+namespace Eigen {
+
+namespace internal {
+
+// Make sure this is only available when targeting a GPU: we don't want to
+// introduce conflicts between these packet_traits definitions and the ones
+// we'll use on the host side (SSE, AVX, ...)
+#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 plog<float4>(const float4& a)
+{
+  return make_float4(logf(a.x), logf(a.y), logf(a.z), logf(a.w));
+}
+
+template<>  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 plog<double2>(const double2& a)
+{
+  using ::log;
+  return make_double2(log(a.x), log(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 plog1p<float4>(const float4& a)
+{
+  return make_float4(log1pf(a.x), log1pf(a.y), log1pf(a.z), log1pf(a.w));
+}
+
+template<>  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 plog1p<double2>(const double2& a)
+{
+  return make_double2(log1p(a.x), log1p(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pexp<float4>(const float4& a)
+{
+  return make_float4(expf(a.x), expf(a.y), expf(a.z), expf(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pexp<double2>(const double2& a)
+{
+  using ::exp;
+  return make_double2(exp(a.x), exp(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 psqrt<float4>(const float4& a)
+{
+  return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 psqrt<double2>(const double2& a)
+{
+  using ::sqrt;
+  return make_double2(sqrt(a.x), sqrt(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 prsqrt<float4>(const float4& a)
+{
+  return make_float4(rsqrtf(a.x), rsqrtf(a.y), rsqrtf(a.z), rsqrtf(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 prsqrt<double2>(const double2& a)
+{
+  return make_double2(rsqrt(a.x), rsqrt(a.y));
+}
+
+
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_MATH_FUNCTIONS_CUDA_H
diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h
new file mode 100644
index 000000000..ad66399e0
--- /dev/null
+++ b/Eigen/src/Core/arch/CUDA/PacketMath.h
@@ -0,0 +1,333 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_CUDA_H
+#define EIGEN_PACKET_MATH_CUDA_H
+
+namespace Eigen {
+
+namespace internal {
+
+// Make sure this is only available when targeting a GPU: we don't want to
+// introduce conflicts between these packet_traits definitions and the ones
+// we'll use on the host side (SSE, AVX, ...)
+#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
+template<> struct is_arithmetic<float4>  { enum { value = true }; };
+template<> struct is_arithmetic<double2> { enum { value = true }; };
+
+template<> struct packet_traits<float> : default_packet_traits
+{
+  typedef float4 type;
+  typedef float4 half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=4,
+    HasHalfPacket = 0,
+
+    HasDiv  = 1,
+    HasSin  = 0,
+    HasCos  = 0,
+    HasLog  = 1,
+    HasExp  = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasLGamma = 1,
+    HasDiGamma = 1,
+    HasZeta = 1,
+    HasPolygamma = 1,
+    HasErf = 1,
+    HasErfc = 1,
+    HasIGamma = 1,
+    HasIGammac = 1,
+    HasBetaInc = 1,
+
+    HasBlend = 0,
+  };
+};
+
+template<> struct packet_traits<double> : default_packet_traits
+{
+  typedef double2 type;
+  typedef double2 half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=2,
+    HasHalfPacket = 0,
+
+    HasDiv  = 1,
+    HasLog  = 1,
+    HasExp  = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasLGamma = 1,
+    HasDiGamma = 1,
+    HasZeta = 1,
+    HasPolygamma = 1,
+    HasErf = 1,
+    HasErfc = 1,
+    HasIGamma = 1,
+    HasIGammac = 1,
+    HasBetaInc = 1,
+
+    HasBlend = 0,
+  };
+};
+
+
+template<> struct unpacket_traits<float4>  { typedef float  type; enum {size=4, alignment=Aligned16}; typedef float4 half; };
+template<> struct unpacket_traits<double2> { typedef double type; enum {size=2, alignment=Aligned16}; typedef double2 half; };
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(const float&  from) {
+  return make_float4(from, from, from, from);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const double& from) {
+  return make_double2(from, from);
+}
+
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(const float& a) {
+  return make_float4(a, a+1, a+2, a+3);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset<double2>(const double& a) {
+  return make_double2(a, a+1);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd<float4>(const float4& a, const float4& b) {
+  return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd<double2>(const double2& a, const double2& b) {
+  return make_double2(a.x+b.x, a.y+b.y);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub<float4>(const float4& a, const float4& b) {
+  return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub<double2>(const double2& a, const double2& b) {
+  return make_double2(a.x-b.x, a.y-b.y);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(const float4& a) {
+  return make_float4(-a.x, -a.y, -a.z, -a.w);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(const double2& a) {
+  return make_double2(-a.x, -a.y);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(const float4& a) { return a; }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(const double2& a) { return a; }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul<float4>(const float4& a, const float4& b) {
+  return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul<double2>(const double2& a, const double2& b) {
+  return make_double2(a.x*b.x, a.y*b.y);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv<float4>(const float4& a, const float4& b) {
+  return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv<double2>(const double2& a, const double2& b) {
+  return make_double2(a.x/b.x, a.y/b.y);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin<float4>(const float4& a, const float4& b) {
+  return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w));
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin<double2>(const double2& a, const double2& b) {
+  return make_double2(fmin(a.x, b.x), fmin(a.y, b.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax<float4>(const float4& a, const float4& b) {
+  return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w));
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax<double2>(const double2& a, const double2& b) {
+  return make_double2(fmax(a.x, b.x), fmax(a.y, b.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload<float4>(const float* from) {
+  return *reinterpret_cast<const float4*>(from);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload<double2>(const double* from) {
+  return *reinterpret_cast<const double2*>(from);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu<float4>(const float* from) {
+  return make_float4(from[0], from[1], from[2], from[3]);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(const double* from) {
+  return make_double2(from[0], from[1]);
+}
+
+template<> EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float*   from) {
+  return make_float4(from[0], from[0], from[1], from[1]);
+}
+template<> EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double*  from) {
+  return make_double2(from[0], from[0]);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<float>(float*   to, const float4& from) {
+  *reinterpret_cast<float4*>(to) = from;
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<double>(double* to, const double2& from) {
+  *reinterpret_cast<double2*>(to) = from;
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<float>(float*  to, const float4& from) {
+  to[0] = from.x;
+  to[1] = from.y;
+  to[2] = from.z;
+  to[3] = from.w;
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const double2& from) {
+  to[0] = from.x;
+  to[1] = from.y;
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+  return __ldg((const float4*)from);
+#else
+  return make_float4(from[0], from[1], from[2], from[3]);
+#endif
+}
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+  return __ldg((const double2*)from);
+#else
+  return make_double2(from[0], from[1]);
+#endif
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+  return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3));
+#else
+  return make_float4(from[0], from[1], from[2], from[3]);
+#endif
+}
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+  return make_double2(__ldg(from+0), __ldg(from+1));
+#else
+  return make_double2(from[0], from[1]);
+#endif
+}
+
+template<> EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, Index stride) {
+  return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]);
+}
+
+template<> EIGEN_DEVICE_FUNC inline double2 pgather<double, double2>(const double* from, Index stride) {
+  return make_double2(from[0*stride], from[1*stride]);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(float* to, const float4& from, Index stride) {
+  to[stride*0] = from.x;
+  to[stride*1] = from.y;
+  to[stride*2] = from.z;
+  to[stride*3] = from.w;
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, const double2& from, Index stride) {
+  to[stride*0] = from.x;
+  to[stride*1] = from.y;
+}
+
+template<> EIGEN_DEVICE_FUNC inline float  pfirst<float4>(const float4& a) {
+  return a.x;
+}
+template<> EIGEN_DEVICE_FUNC inline double pfirst<double2>(const double2& a) {
+  return a.x;
+}
+
+template<> EIGEN_DEVICE_FUNC inline float  predux<float4>(const float4& a) {
+  return a.x + a.y + a.z + a.w;
+}
+template<> EIGEN_DEVICE_FUNC inline double predux<double2>(const double2& a) {
+  return a.x + a.y;
+}
+
+template<> EIGEN_DEVICE_FUNC inline float  predux_max<float4>(const float4& a) {
+  return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w));
+}
+template<> EIGEN_DEVICE_FUNC inline double predux_max<double2>(const double2& a) {
+  return fmax(a.x, a.y);
+}
+
+template<> EIGEN_DEVICE_FUNC inline float  predux_min<float4>(const float4& a) {
+  return fminf(fminf(a.x, a.y), fminf(a.z, a.w));
+}
+template<> EIGEN_DEVICE_FUNC inline double predux_min<double2>(const double2& a) {
+  return fmin(a.x, a.y);
+}
+
+template<> EIGEN_DEVICE_FUNC inline float  predux_mul<float4>(const float4& a) {
+  return a.x * a.y * a.z * a.w;
+}
+template<> EIGEN_DEVICE_FUNC inline double predux_mul<double2>(const double2& a) {
+  return a.x * a.y;
+}
+
+template<> EIGEN_DEVICE_FUNC inline float4  pabs<float4>(const float4& a) {
+  return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
+}
+template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
+  return make_double2(fabs(a.x), fabs(a.y));
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<float4,4>& kernel) {
+  double tmp = kernel.packet[0].y;
+  kernel.packet[0].y = kernel.packet[1].x;
+  kernel.packet[1].x = tmp;
+
+  tmp = kernel.packet[0].z;
+  kernel.packet[0].z = kernel.packet[2].x;
+  kernel.packet[2].x = tmp;
+
+  tmp = kernel.packet[0].w;
+  kernel.packet[0].w = kernel.packet[3].x;
+  kernel.packet[3].x = tmp;
+
+  tmp = kernel.packet[1].z;
+  kernel.packet[1].z = kernel.packet[2].y;
+  kernel.packet[2].y = tmp;
+
+  tmp = kernel.packet[1].w;
+  kernel.packet[1].w = kernel.packet[3].y;
+  kernel.packet[3].y = tmp;
+
+  tmp = kernel.packet[2].w;
+  kernel.packet[2].w = kernel.packet[3].z;
+  kernel.packet[3].z = tmp;
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<double2,2>& kernel) {
+  double tmp = kernel.packet[0].y;
+  kernel.packet[0].y = kernel.packet[1].x;
+  kernel.packet[1].x = tmp;
+}
+
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+
+#endif // EIGEN_PACKET_MATH_CUDA_H
diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
new file mode 100644
index 000000000..ae54225f8
--- /dev/null
+++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
@@ -0,0 +1,1123 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_HALF_CUDA_H
+#define EIGEN_PACKET_MATH_HALF_CUDA_H
+
+
+namespace Eigen {
+namespace internal {
+
+// Most of the following operations require arch >= 3.0
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDACC__) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+
+template<> struct is_arithmetic<half2> { enum { value = true }; };
+
+template<> struct packet_traits<Eigen::half> : default_packet_traits
+{
+  typedef half2 type;
+  typedef half2 half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=2,
+    HasHalfPacket = 0,
+    HasAdd    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasSqrt   = 1,
+    HasRsqrt  = 1,
+    HasExp    = 1,
+    HasLog    = 1,
+    HasLog1p  = 1
+  };
+};
+
+template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; };
+
+template<> __device__ EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
+  return __half2half2(from);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
+  return *reinterpret_cast<const half2*>(from);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half* from) {
+  return __halves2half2(from[0], from[1]);
+}
+
+template<> EIGEN_STRONG_INLINE half2 ploaddup<half2>(const Eigen::half*  from) {
+  return __halves2half2(from[0], from[0]);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const half2& from) {
+  *reinterpret_cast<half2*>(to) = from;
+}
+
+template<> __device__ EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) {
+  to[0] = __low2half(from);
+  to[1] = __high2half(from);
+}
+
+template<>
+ __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
+#if __CUDA_ARCH__ >= 350
+   return __ldg((const half2*)from);
+#else
+  return __halves2half2(*(from+0), *(from+1));
+#endif
+}
+
+template<>
+__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
+#if __CUDA_ARCH__ >= 350
+   return __halves2half2(__ldg(from+0), __ldg(from+1));
+#else
+  return __halves2half2(*(from+0), *(from+1));
+#endif
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) {
+  return __halves2half2(from[0*stride], from[1*stride]);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE void pscatter<Eigen::half, half2>(Eigen::half* to, const half2& from, Index stride) {
+  to[stride*0] = __low2half(from);
+  to[stride*1] = __high2half(from);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& a) {
+  return __low2half(a);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
+  half2 result;
+  result.x = a.x & 0x7FFF7FFF;
+  return result;
+}
+
+
+__device__ EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<half2,2>& kernel) {
+  __half a1 = __low2half(kernel.packet[0]);
+  __half a2 = __high2half(kernel.packet[0]);
+  __half b1 = __low2half(kernel.packet[1]);
+  __half b2 = __high2half(kernel.packet[1]);
+  kernel.packet[0] = __halves2half2(a1, b1);
+  kernel.packet[1] = __halves2half2(a2, b2);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
+#if __CUDA_ARCH__ >= 530
+  return __halves2half2(a, __hadd(a, __float2half(1.0f)));
+#else
+  float f = __half2float(a) + 1.0f;
+  return __halves2half2(a, __float2half(f));
+#endif
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
+#if __CUDA_ARCH__ >= 530
+  return __hadd2(a, b);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float r1 = a1 + b1;
+  float r2 = a2 + b2;
+  return __floats2half2_rn(r1, r2);
+#endif
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
+#if __CUDA_ARCH__ >= 530
+  return __hsub2(a, b);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float r1 = a1 - b1;
+  float r2 = a2 - b2;
+  return __floats2half2_rn(r1, r2);
+#endif
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
+#if __CUDA_ARCH__ >= 530
+  return __hneg2(a);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  return __floats2half2_rn(-a1, -a2);
+#endif
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
+
+template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
+#if __CUDA_ARCH__ >= 530
+  return __hmul2(a, b);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float r1 = a1 * b1;
+  float r2 = a2 * b2;
+  return __floats2half2_rn(r1, r2);
+#endif
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
+#if __CUDA_ARCH__ >= 530
+   return __hfma2(a, b, c);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float c1 = __low2float(c);
+  float c2 = __high2float(c);
+  float r1 = a1 * b1 + c1;
+  float r2 = a2 * b2 + c2;
+  return __floats2half2_rn(r1, r2);
+#endif
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float r1 = a1 / b1;
+  float r2 = a2 / b2;
+  return __floats2half2_rn(r1, r2);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  __half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
+  __half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
+  return __halves2half2(r1, r2);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  __half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
+  __half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
+  return __halves2half2(r1, r2);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
+#if __CUDA_ARCH__ >= 530
+  return __hadd(__low2half(a), __high2half(a));
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 + a2)));
+#endif
+}
+
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
+#if __CUDA_ARCH__ >= 530
+  __half first = __low2half(a);
+  __half second = __high2half(a);
+  return __hgt(first, second) ? first : second;
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  return a1 > a2 ? __low2half(a) : __high2half(a);
+#endif
+}
+
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
+#if __CUDA_ARCH__ >= 530
+  __half first = __low2half(a);
+  __half second = __high2half(a);
+  return __hlt(first, second) ? first : second;
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  return a1 < a2 ? __low2half(a) : __high2half(a);
+#endif
+}
+
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
+#if __CUDA_ARCH__ >= 530
+  return __hmul(__low2half(a), __high2half(a));
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 * a2)));
+#endif
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float r1 = log1pf(a1);
+  float r2 = log1pf(a2);
+  return __floats2half2_rn(r1, r2);
+}
+
+#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530
+
+template<>  __device__ EIGEN_STRONG_INLINE
+half2 plog<half2>(const half2& a) {
+  return h2log(a);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE
+half2 pexp<half2>(const half2& a) {
+  return h2exp(a);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE
+half2 psqrt<half2>(const half2& a) {
+  return h2sqrt(a);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE
+half2 prsqrt<half2>(const half2& a) {
+  return h2rsqrt(a);
+}
+
+#else
+
+template<> __device__ EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float r1 = logf(a1);
+  float r2 = logf(a2);
+  return __floats2half2_rn(r1, r2);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float r1 = expf(a1);
+  float r2 = expf(a2);
+  return __floats2half2_rn(r1, r2);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float r1 = sqrtf(a1);
+  float r2 = sqrtf(a2);
+  return __floats2half2_rn(r1, r2);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 prsqrt<half2>(const half2& a) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float r1 = rsqrtf(a1);
+  float r2 = rsqrtf(a2);
+  return __floats2half2_rn(r1, r2);
+}
+
+#endif
+
+#elif defined EIGEN_VECTORIZE_AVX512
+
+typedef struct {
+  __m256i x;
+} Packet16h;
+
+
+template<> struct is_arithmetic<Packet16h> { enum { value = true }; };
+
+template <>
+struct packet_traits<half> : default_packet_traits {
+  typedef Packet16h type;
+  // There is no half-size packet for Packet16h.
+  typedef Packet16h half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 16,
+    HasHalfPacket = 0,
+    HasAdd    = 0,
+    HasSub    = 0,
+    HasMul    = 0,
+    HasNegate = 0,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasConj   = 0,
+    HasSetLinear = 0,
+    HasDiv = 0,
+    HasSqrt = 0,
+    HasRsqrt = 0,
+    HasExp = 0,
+    HasLog = 0,
+    HasBlend = 0
+  };
+};
+
+
+template<> struct unpacket_traits<Packet16h> { typedef Eigen::half type; enum {size=16, alignment=Aligned32}; typedef Packet16h half; };
+
+template<> EIGEN_STRONG_INLINE Packet16h pset1<Packet16h>(const Eigen::half& from) {
+  Packet16h result;
+  result.x = _mm256_set1_epi16(from.x);
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet16h>(const Packet16h& from) {
+  return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm256_extract_epi16(from.x, 0)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pload<Packet16h>(const Eigen::half* from) {
+  Packet16h result;
+  result.x = _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h ploadu<Packet16h>(const Eigen::half* from) {
+  Packet16h result;
+  result.x = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet16h& from) {
+  _mm256_store_si256((__m256i*)to, from.x);
+}
+
+template<> EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet16h& from) {
+  _mm256_storeu_si256((__m256i*)to, from.x);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h
+ploadquad(const Eigen::half* from) {
+  Packet16h result;
+  unsigned short a = from[0].x;
+  unsigned short b = from[1].x;
+  unsigned short c = from[2].x;
+  unsigned short d = from[3].x;
+  result.x = _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a);
+  return result;
+}
+
+EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) {
+#ifdef EIGEN_HAS_FP16_C
+  return _mm512_cvtph_ps(a.x);
+#else
+  EIGEN_ALIGN64 half aux[16];
+  pstore(aux, a);
+  float f0(aux[0]);
+  float f1(aux[1]);
+  float f2(aux[2]);
+  float f3(aux[3]);
+  float f4(aux[4]);
+  float f5(aux[5]);
+  float f6(aux[6]);
+  float f7(aux[7]);
+  float f8(aux[8]);
+  float f9(aux[9]);
+  float fa(aux[10]);
+  float fb(aux[11]);
+  float fc(aux[12]);
+  float fd(aux[13]);
+  float fe(aux[14]);
+  float ff(aux[15]);
+
+  return _mm512_set_ps(
+      ff, fe, fd, fc, fb, fa, f9, f8, f7, f6, f5, f4, f3, f2, f1, f0);
+#endif
+}
+
+EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) {
+#ifdef EIGEN_HAS_FP16_C
+  Packet16h result;
+  result.x = _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
+  return result;
+#else
+  EIGEN_ALIGN64 float aux[16];
+  pstore(aux, a);
+  half h0(aux[0]);
+  half h1(aux[1]);
+  half h2(aux[2]);
+  half h3(aux[3]);
+  half h4(aux[4]);
+  half h5(aux[5]);
+  half h6(aux[6]);
+  half h7(aux[7]);
+  half h8(aux[8]);
+  half h9(aux[9]);
+  half ha(aux[10]);
+  half hb(aux[11]);
+  half hc(aux[12]);
+  half hd(aux[13]);
+  half he(aux[14]);
+  half hf(aux[15]);
+
+  Packet16h result;
+  result.x = _mm256_set_epi16(
+      hf.x, he.x, hd.x, hc.x, hb.x, ha.x, h9.x, h8.x,
+      h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x);
+  return result;
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
+  Packet16f af = half2float(a);
+  Packet16f bf = half2float(b);
+  Packet16f rf = padd(af, bf);
+  return float2half(rf);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
+  Packet16f af = half2float(a);
+  Packet16f bf = half2float(b);
+  Packet16f rf = pmul(af, bf);
+  return float2half(rf);
+}
+
+template<> EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& from) {
+  Packet16f from_float = half2float(from);
+  return half(predux(from_float));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pgather<Eigen::half, Packet16h>(const Eigen::half* from, Index stride)
+{
+  Packet16h result;
+  result.x = _mm256_set_epi16(
+      from[15*stride].x, from[14*stride].x, from[13*stride].x, from[12*stride].x,
+      from[11*stride].x, from[10*stride].x, from[9*stride].x, from[8*stride].x,
+      from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x,
+      from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE void pscatter<half, Packet16h>(half* to, const Packet16h& from, Index stride)
+{
+  EIGEN_ALIGN64 half aux[16];
+  pstore(aux, from);
+  to[stride*0].x = aux[0].x;
+  to[stride*1].x = aux[1].x;
+  to[stride*2].x = aux[2].x;
+  to[stride*3].x = aux[3].x;
+  to[stride*4].x = aux[4].x;
+  to[stride*5].x = aux[5].x;
+  to[stride*6].x = aux[6].x;
+  to[stride*7].x = aux[7].x;
+  to[stride*8].x = aux[8].x;
+  to[stride*9].x = aux[9].x;
+  to[stride*10].x = aux[10].x;
+  to[stride*11].x = aux[11].x;
+  to[stride*12].x = aux[12].x;
+  to[stride*13].x = aux[13].x;
+  to[stride*14].x = aux[14].x;
+  to[stride*15].x = aux[15].x;
+}
+
+EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet16h,16>& kernel) {
+  __m256i a = kernel.packet[0].x;
+  __m256i b = kernel.packet[1].x;
+  __m256i c = kernel.packet[2].x;
+  __m256i d = kernel.packet[3].x;
+  __m256i e = kernel.packet[4].x;
+  __m256i f = kernel.packet[5].x;
+  __m256i g = kernel.packet[6].x;
+  __m256i h = kernel.packet[7].x;
+  __m256i i = kernel.packet[8].x;
+  __m256i j = kernel.packet[9].x;
+  __m256i k = kernel.packet[10].x;
+  __m256i l = kernel.packet[11].x;
+  __m256i m = kernel.packet[12].x;
+  __m256i n = kernel.packet[13].x;
+  __m256i o = kernel.packet[14].x;
+  __m256i p = kernel.packet[15].x;
+
+  __m256i ab_07 = _mm256_unpacklo_epi16(a, b);
+  __m256i cd_07 = _mm256_unpacklo_epi16(c, d);
+  __m256i ef_07 = _mm256_unpacklo_epi16(e, f);
+  __m256i gh_07 = _mm256_unpacklo_epi16(g, h);
+  __m256i ij_07 = _mm256_unpacklo_epi16(i, j);
+  __m256i kl_07 = _mm256_unpacklo_epi16(k, l);
+  __m256i mn_07 = _mm256_unpacklo_epi16(m, n);
+  __m256i op_07 = _mm256_unpacklo_epi16(o, p);
+
+  __m256i ab_8f = _mm256_unpackhi_epi16(a, b);
+  __m256i cd_8f = _mm256_unpackhi_epi16(c, d);
+  __m256i ef_8f = _mm256_unpackhi_epi16(e, f);
+  __m256i gh_8f = _mm256_unpackhi_epi16(g, h);
+  __m256i ij_8f = _mm256_unpackhi_epi16(i, j);
+  __m256i kl_8f = _mm256_unpackhi_epi16(k, l);
+  __m256i mn_8f = _mm256_unpackhi_epi16(m, n);
+  __m256i op_8f = _mm256_unpackhi_epi16(o, p);
+
+  __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07);
+  __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07);
+  __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07);
+  __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07);
+  __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07);
+  __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07);
+  __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07);
+  __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07);
+
+  __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f);
+  __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f);
+  __m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f);
+  __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f);
+  __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f);
+  __m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f);
+  __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f);
+  __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f);
+
+  __m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03);
+  __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03);
+  __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03);
+  __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03);
+  __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47);
+  __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47);
+  __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47);
+  __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47);
+  __m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b);
+  __m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b);
+  __m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b);
+  __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b);
+  __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf);
+  __m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf);
+  __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf);
+  __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf);
+
+  // NOTE: no unpacklo/hi instr in this case, so using permute instr.
+  __m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20);
+  __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
+  __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
+  __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
+  __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
+  __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
+  __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
+  __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
+  __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
+  __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
+  __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
+  __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
+  __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
+  __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
+  __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
+  __m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31);
+
+  kernel.packet[0].x = a_p_0;
+  kernel.packet[1].x = a_p_1;
+  kernel.packet[2].x = a_p_2;
+  kernel.packet[3].x = a_p_3;
+  kernel.packet[4].x = a_p_4;
+  kernel.packet[5].x = a_p_5;
+  kernel.packet[6].x = a_p_6;
+  kernel.packet[7].x = a_p_7;
+  kernel.packet[8].x = a_p_8;
+  kernel.packet[9].x = a_p_9;
+  kernel.packet[10].x = a_p_a;
+  kernel.packet[11].x = a_p_b;
+  kernel.packet[12].x = a_p_c;
+  kernel.packet[13].x = a_p_d;
+  kernel.packet[14].x = a_p_e;
+  kernel.packet[15].x = a_p_f;
+}
+
+EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet16h,8>& kernel) {
+  EIGEN_ALIGN64 half in[8][16];
+  pstore<half>(in[0], kernel.packet[0]);
+  pstore<half>(in[1], kernel.packet[1]);
+  pstore<half>(in[2], kernel.packet[2]);
+  pstore<half>(in[3], kernel.packet[3]);
+  pstore<half>(in[4], kernel.packet[4]);
+  pstore<half>(in[5], kernel.packet[5]);
+  pstore<half>(in[6], kernel.packet[6]);
+  pstore<half>(in[7], kernel.packet[7]);
+
+  EIGEN_ALIGN64 half out[8][16];
+
+  for (int i = 0; i < 8; ++i) {
+    for (int j = 0; j < 8; ++j) {
+      out[i][j] = in[j][2*i];
+    }
+    for (int j = 0; j < 8; ++j) {
+      out[i][j+8] = in[j][2*i+1];
+    }
+  }
+
+  kernel.packet[0] = pload<Packet16h>(out[0]);
+  kernel.packet[1] = pload<Packet16h>(out[1]);
+  kernel.packet[2] = pload<Packet16h>(out[2]);
+  kernel.packet[3] = pload<Packet16h>(out[3]);
+  kernel.packet[4] = pload<Packet16h>(out[4]);
+  kernel.packet[5] = pload<Packet16h>(out[5]);
+  kernel.packet[6] = pload<Packet16h>(out[6]);
+  kernel.packet[7] = pload<Packet16h>(out[7]);
+}
+
+EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet16h,4>& kernel) {
+  EIGEN_ALIGN64 half in[4][16];
+  pstore<half>(in[0], kernel.packet[0]);
+  pstore<half>(in[1], kernel.packet[1]);
+  pstore<half>(in[2], kernel.packet[2]);
+  pstore<half>(in[3], kernel.packet[3]);
+
+  EIGEN_ALIGN64 half out[4][16];
+
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 4; ++j) {
+      out[i][j] = in[j][4*i];
+    }
+    for (int j = 0; j < 4; ++j) {
+      out[i][j+4] = in[j][4*i+1];
+    }
+    for (int j = 0; j < 4; ++j) {
+      out[i][j+8] = in[j][4*i+2];
+    }
+    for (int j = 0; j < 4; ++j) {
+      out[i][j+12] = in[j][4*i+3];
+    }
+  }
+
+  kernel.packet[0] = pload<Packet16h>(out[0]);
+  kernel.packet[1] = pload<Packet16h>(out[1]);
+  kernel.packet[2] = pload<Packet16h>(out[2]);
+  kernel.packet[3] = pload<Packet16h>(out[3]);
+}
+
+
+#elif defined EIGEN_VECTORIZE_AVX
+
+typedef struct {
+  __m128i x;
+} Packet8h;
+
+
+template<> struct is_arithmetic<Packet8h> { enum { value = true }; };
+
+template <>
+struct packet_traits<Eigen::half> : default_packet_traits {
+  typedef Packet8h type;
+  // There is no half-size packet for Packet8h.
+  typedef Packet8h half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 8,
+    HasHalfPacket = 0,
+    HasAdd    = 0,
+    HasSub    = 0,
+    HasMul    = 0,
+    HasNegate = 0,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasConj   = 0,
+    HasSetLinear = 0,
+    HasDiv = 0,
+    HasSqrt = 0,
+    HasRsqrt = 0,
+    HasExp = 0,
+    HasLog = 0,
+    HasBlend = 0
+  };
+};
+
+
+template<> struct unpacket_traits<Packet8h> { typedef Eigen::half type; enum {size=8, alignment=Aligned16}; typedef Packet8h half; };
+
+template<> EIGEN_STRONG_INLINE Packet8h pset1<Packet8h>(const Eigen::half& from) {
+  Packet8h result;
+  result.x = _mm_set1_epi16(from.x);
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8h>(const Packet8h& from) {
+  return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm_extract_epi16(from.x, 0)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pload<Packet8h>(const Eigen::half* from) {
+  Packet8h result;
+  result.x = _mm_load_si128(reinterpret_cast<const __m128i*>(from));
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h ploadu<Packet8h>(const Eigen::half* from) {
+  Packet8h result;
+  result.x = _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet8h& from) {
+  _mm_store_si128(reinterpret_cast<__m128i*>(to), from.x);
+}
+
+template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet8h& from) {
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from.x);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h
+ploadquad<Packet8h>(const Eigen::half* from) {
+  Packet8h result;
+  unsigned short a = from[0].x;
+  unsigned short b = from[1].x;
+  result.x = _mm_set_epi16(b, b, b, b, a, a, a, a);
+  return result;
+}
+
+EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) {
+#ifdef EIGEN_HAS_FP16_C
+  return _mm256_cvtph_ps(a.x);
+#else
+  EIGEN_ALIGN32 Eigen::half aux[8];
+  pstore(aux, a);
+  float f0(aux[0]);
+  float f1(aux[1]);
+  float f2(aux[2]);
+  float f3(aux[3]);
+  float f4(aux[4]);
+  float f5(aux[5]);
+  float f6(aux[6]);
+  float f7(aux[7]);
+
+  return _mm256_set_ps(f7, f6, f5, f4, f3, f2, f1, f0);
+#endif
+}
+
+EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) {
+#ifdef EIGEN_HAS_FP16_C
+  Packet8h result;
+  result.x = _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
+  return result;
+#else
+  EIGEN_ALIGN32 float aux[8];
+  pstore(aux, a);
+  Eigen::half h0(aux[0]);
+  Eigen::half h1(aux[1]);
+  Eigen::half h2(aux[2]);
+  Eigen::half h3(aux[3]);
+  Eigen::half h4(aux[4]);
+  Eigen::half h5(aux[5]);
+  Eigen::half h6(aux[6]);
+  Eigen::half h7(aux[7]);
+
+  Packet8h result;
+  result.x = _mm_set_epi16(h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x);
+  return result;
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
+  Packet8f af = half2float(a);
+  Packet8f bf = half2float(b);
+  Packet8f rf = padd(af, bf);
+  return float2half(rf);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
+  Packet8f af = half2float(a);
+  Packet8f bf = half2float(b);
+  Packet8f rf = pmul(af, bf);
+  return float2half(rf);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pgather<Eigen::half, Packet8h>(const Eigen::half* from, Index stride)
+{
+  Packet8h result;
+  result.x = _mm_set_epi16(from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x, from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8h>(Eigen::half* to, const Packet8h& from, Index stride)
+{
+  EIGEN_ALIGN32 Eigen::half aux[8];
+  pstore(aux, from);
+  to[stride*0].x = aux[0].x;
+  to[stride*1].x = aux[1].x;
+  to[stride*2].x = aux[2].x;
+  to[stride*3].x = aux[3].x;
+  to[stride*4].x = aux[4].x;
+  to[stride*5].x = aux[5].x;
+  to[stride*6].x = aux[6].x;
+  to[stride*7].x = aux[7].x;
+}
+
+template<> EIGEN_STRONG_INLINE Eigen::half predux<Packet8h>(const Packet8h& a) {
+  Packet8f af = half2float(a);
+  float reduced = predux<Packet8f>(af);
+  return Eigen::half(reduced);
+}
+
+template<> EIGEN_STRONG_INLINE Eigen::half predux_max<Packet8h>(const Packet8h& a) {
+  Packet8f af = half2float(a);
+  float reduced = predux_max<Packet8f>(af);
+  return Eigen::half(reduced);
+}
+
+template<> EIGEN_STRONG_INLINE Eigen::half predux_min<Packet8h>(const Packet8h& a) {
+  Packet8f af = half2float(a);
+  float reduced = predux_min<Packet8f>(af);
+  return Eigen::half(reduced);
+}
+
+template<> EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8h>(const Packet8h& a) {
+  Packet8f af = half2float(a);
+  float reduced = predux_mul<Packet8f>(af);
+  return Eigen::half(reduced);
+}
+
+EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet8h,8>& kernel) {
+  __m128i a = kernel.packet[0].x;
+  __m128i b = kernel.packet[1].x;
+  __m128i c = kernel.packet[2].x;
+  __m128i d = kernel.packet[3].x;
+  __m128i e = kernel.packet[4].x;
+  __m128i f = kernel.packet[5].x;
+  __m128i g = kernel.packet[6].x;
+  __m128i h = kernel.packet[7].x;
+
+  __m128i a03b03 = _mm_unpacklo_epi16(a, b);
+  __m128i c03d03 = _mm_unpacklo_epi16(c, d);
+  __m128i e03f03 = _mm_unpacklo_epi16(e, f);
+  __m128i g03h03 = _mm_unpacklo_epi16(g, h);
+  __m128i a47b47 = _mm_unpackhi_epi16(a, b);
+  __m128i c47d47 = _mm_unpackhi_epi16(c, d);
+  __m128i e47f47 = _mm_unpackhi_epi16(e, f);
+  __m128i g47h47 = _mm_unpackhi_epi16(g, h);
+
+  __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03);
+  __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03);
+  __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03);
+  __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03);
+  __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47);
+  __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47);
+  __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47);
+  __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47);
+
+  __m128i a0b0c0d0e0f0g0h0 = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01);
+  __m128i a1b1c1d1e1f1g1h1 = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01);
+  __m128i a2b2c2d2e2f2g2h2 = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23);
+  __m128i a3b3c3d3e3f3g3h3 = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23);
+  __m128i a4b4c4d4e4f4g4h4 = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45);
+  __m128i a5b5c5d5e5f5g5h5 = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45);
+  __m128i a6b6c6d6e6f6g6h6 = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67);
+  __m128i a7b7c7d7e7f7g7h7 = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67);
+
+  kernel.packet[0].x = a0b0c0d0e0f0g0h0;
+  kernel.packet[1].x = a1b1c1d1e1f1g1h1;
+  kernel.packet[2].x = a2b2c2d2e2f2g2h2;
+  kernel.packet[3].x = a3b3c3d3e3f3g3h3;
+  kernel.packet[4].x = a4b4c4d4e4f4g4h4;
+  kernel.packet[5].x = a5b5c5d5e5f5g5h5;
+  kernel.packet[6].x = a6b6c6d6e6f6g6h6;
+  kernel.packet[7].x = a7b7c7d7e7f7g7h7;
+}
+
+EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet8h,4>& kernel) {
+  EIGEN_ALIGN32 Eigen::half in[4][8];
+  pstore<Eigen::half>(in[0], kernel.packet[0]);
+  pstore<Eigen::half>(in[1], kernel.packet[1]);
+  pstore<Eigen::half>(in[2], kernel.packet[2]);
+  pstore<Eigen::half>(in[3], kernel.packet[3]);
+
+  EIGEN_ALIGN32 Eigen::half out[4][8];
+
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 4; ++j) {
+      out[i][j] = in[j][2*i];
+    }
+    for (int j = 0; j < 4; ++j) {
+      out[i][j+4] = in[j][2*i+1];
+    }
+  }
+
+  kernel.packet[0] = pload<Packet8h>(out[0]);
+  kernel.packet[1] = pload<Packet8h>(out[1]);
+  kernel.packet[2] = pload<Packet8h>(out[2]);
+  kernel.packet[3] = pload<Packet8h>(out[3]);
+}
+
+
+// Disable the following code since it's broken on too many platforms / compilers.
+//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
+#elif 0
+
+typedef struct {
+  __m64 x;
+} Packet4h;
+
+
+template<> struct is_arithmetic<Packet4h> { enum { value = true }; };
+
+template <>
+struct packet_traits<Eigen::half> : default_packet_traits {
+  typedef Packet4h type;
+  // There is no half-size packet for Packet4h.
+  typedef Packet4h half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+    HasHalfPacket = 0,
+    HasAdd    = 0,
+    HasSub    = 0,
+    HasMul    = 0,
+    HasNegate = 0,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasConj   = 0,
+    HasSetLinear = 0,
+    HasDiv = 0,
+    HasSqrt = 0,
+    HasRsqrt = 0,
+    HasExp = 0,
+    HasLog = 0,
+    HasBlend = 0
+  };
+};
+
+
+template<> struct unpacket_traits<Packet4h> { typedef Eigen::half type; enum {size=4, alignment=Aligned16}; typedef Packet4h half; };
+
+template<> EIGEN_STRONG_INLINE Packet4h pset1<Packet4h>(const Eigen::half& from) {
+  Packet4h result;
+  result.x = _mm_set1_pi16(from.x);
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h>(const Packet4h& from) {
+  return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm_cvtsi64_si32(from.x)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h pconj(const Packet4h& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE Packet4h padd<Packet4h>(const Packet4h& a, const Packet4h& b) {
+  __int64_t a64 = _mm_cvtm64_si64(a.x);
+  __int64_t b64 = _mm_cvtm64_si64(b.x);
+
+  Eigen::half h[4];
+
+  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
+  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
+  h[0] = ha + hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
+  h[1] = ha + hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
+  h[2] = ha + hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
+  h[3] = ha + hb;
+  Packet4h result;
+  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h pmul<Packet4h>(const Packet4h& a, const Packet4h& b) {
+  __int64_t a64 = _mm_cvtm64_si64(a.x);
+  __int64_t b64 = _mm_cvtm64_si64(b.x);
+
+  Eigen::half h[4];
+
+  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
+  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
+  h[0] = ha * hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
+  h[1] = ha * hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
+  h[2] = ha * hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
+  h[3] = ha * hb;
+  Packet4h result;
+  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h pload<Packet4h>(const Eigen::half* from) {
+  Packet4h result;
+  result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h ploadu<Packet4h>(const Eigen::half* from) {
+  Packet4h result;
+  result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4h& from) {
+  __int64_t r = _mm_cvtm64_si64(from.x);
+  *(reinterpret_cast<__int64_t*>(to)) = r;
+}
+
+template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4h& from) {
+  __int64_t r = _mm_cvtm64_si64(from.x);
+  *(reinterpret_cast<__int64_t*>(to)) = r;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h
+ploadquad<Packet4h>(const Eigen::half* from) {
+  return pset1<Packet4h>(*from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h pgather<Eigen::half, Packet4h>(const Eigen::half* from, Index stride)
+{
+  Packet4h result;
+  result.x = _mm_set_pi16(from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h>(Eigen::half* to, const Packet4h& from, Index stride)
+{
+  __int64_t a = _mm_cvtm64_si64(from.x);
+  to[stride*0].x = static_cast<unsigned short>(a);
+  to[stride*1].x = static_cast<unsigned short>(a >> 16);
+  to[stride*2].x = static_cast<unsigned short>(a >> 32);
+  to[stride*3].x = static_cast<unsigned short>(a >> 48);
+}
+
+EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet4h,4>& kernel) {
+  __m64 T0 = _mm_unpacklo_pi16(kernel.packet[0].x, kernel.packet[1].x);
+  __m64 T1 = _mm_unpacklo_pi16(kernel.packet[2].x, kernel.packet[3].x);
+  __m64 T2 = _mm_unpackhi_pi16(kernel.packet[0].x, kernel.packet[1].x);
+  __m64 T3 = _mm_unpackhi_pi16(kernel.packet[2].x, kernel.packet[3].x);
+
+  kernel.packet[0].x = _mm_unpacklo_pi32(T0, T1);
+  kernel.packet[1].x = _mm_unpackhi_pi32(T0, T1);
+  kernel.packet[2].x = _mm_unpacklo_pi32(T2, T3);
+  kernel.packet[3].x = _mm_unpackhi_pi32(T2, T3);
+}
+
+#endif
+
+}
+}
+
+#endif // EIGEN_PACKET_MATH_HALF_CUDA_H
diff --git a/Eigen/src/Core/arch/CUDA/TypeCasting.h b/Eigen/src/Core/arch/CUDA/TypeCasting.h
new file mode 100644
index 000000000..aa5fbce8e
--- /dev/null
+++ b/Eigen/src/Core/arch/CUDA/TypeCasting.h
@@ -0,0 +1,212 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TYPE_CASTING_CUDA_H
+#define EIGEN_TYPE_CASTING_CUDA_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<>
+struct scalar_cast_op<float, Eigen::half> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
+  typedef Eigen::half result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const {
+    #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+      return __float2half(a);
+    #else
+      return Eigen::half(a);
+    #endif
+  }
+};
+
+template<>
+struct functor_traits<scalar_cast_op<float, Eigen::half> >
+{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };
+
+
+template<>
+struct scalar_cast_op<int, Eigen::half> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
+  typedef Eigen::half result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const {
+    #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+      return __float2half(static_cast<float>(a));
+    #else
+      return Eigen::half(static_cast<float>(a));
+    #endif
+  }
+};
+
+template<>
+struct functor_traits<scalar_cast_op<int, Eigen::half> >
+{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };
+
+
+template<>
+struct scalar_cast_op<Eigen::half, float> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
+  typedef float result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const {
+    #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+      return __half2float(a);
+    #else
+      return static_cast<float>(a);
+    #endif
+  }
+};
+
+template<>
+struct functor_traits<scalar_cast_op<Eigen::half, float> >
+{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };
+
+
+
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+
+template <>
+struct type_casting_traits<Eigen::half, float> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 2,
+    TgtCoeffRatio = 1
+  };
+};
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast<half2, float4>(const half2& a, const half2& b) {
+  float2 r1 = __half22float2(a);
+  float2 r2 = __half22float2(b);
+  return make_float4(r1.x, r1.y, r2.x, r2.y);
+}
+
+template <>
+struct type_casting_traits<float, Eigen::half> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 2
+  };
+};
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcast<float4, half2>(const float4& a) {
+  // Simply discard the second half of the input
+  return __floats2half2_rn(a.x, a.y);
+}
+
+#elif defined EIGEN_VECTORIZE_AVX512
+template <>
+struct type_casting_traits<half, float> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet16f pcast<Packet16h, Packet16f>(const Packet16h& a) {
+  return half2float(a);
+}
+
+template <>
+struct type_casting_traits<float, half> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet16h pcast<Packet16f, Packet16h>(const Packet16f& a) {
+  return float2half(a);
+}
+
+#elif defined EIGEN_VECTORIZE_AVX
+
+template <>
+struct type_casting_traits<Eigen::half, float> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8h, Packet8f>(const Packet8h& a) {
+  return half2float(a);
+}
+
+template <>
+struct type_casting_traits<float, Eigen::half> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet8h pcast<Packet8f, Packet8h>(const Packet8f& a) {
+  return float2half(a);
+}
+
+// Disable the following code since it's broken on too many platforms / compilers.
+//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
+#elif 0
+
+template <>
+struct type_casting_traits<Eigen::half, float> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4h, Packet4f>(const Packet4h& a) {
+  __int64_t a64 = _mm_cvtm64_si64(a.x);
+  Eigen::half h = raw_uint16_to_half(static_cast<unsigned short>(a64));
+  float f1 = static_cast<float>(h);
+  h = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
+  float f2 = static_cast<float>(h);
+  h = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
+  float f3 = static_cast<float>(h);
+  h = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
+  float f4 = static_cast<float>(h);
+  return _mm_set_ps(f4, f3, f2, f1);
+}
+
+template <>
+struct type_casting_traits<float, Eigen::half> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet4h pcast<Packet4f, Packet4h>(const Packet4f& a) {
+  EIGEN_ALIGN16 float aux[4];
+  pstore(aux, a);
+  Eigen::half h0(aux[0]);
+  Eigen::half h1(aux[1]);
+  Eigen::half h2(aux[2]);
+  Eigen::half h3(aux[3]);
+
+  Packet4h result;
+  result.x = _mm_set_pi16(h3.x, h2.x, h1.x, h0.x);
+  return result;
+}
+
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TYPE_CASTING_CUDA_H
diff --git a/Eigen/src/Core/arch/Default/CMakeLists.txt b/Eigen/src/Core/arch/Default/CMakeLists.txt
deleted file mode 100644
index 339c091d1..000000000
--- a/Eigen/src/Core/arch/Default/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_Core_arch_Default_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_Core_arch_Default_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/Default COMPONENT Devel
-)
diff --git a/Eigen/src/Core/arch/NEON/CMakeLists.txt b/Eigen/src/Core/arch/NEON/CMakeLists.txt
deleted file mode 100644
index fd4d4af50..000000000
--- a/Eigen/src/Core/arch/NEON/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_Core_arch_NEON_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_Core_arch_NEON_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/NEON COMPONENT Devel
-)
diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h
index 8d9255eef..57e9b431f 100644
--- a/Eigen/src/Core/arch/NEON/Complex.h
+++ b/Eigen/src/Core/arch/NEON/Complex.h
@@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2010 Konstantinos Margaritis <markos@freevec.org>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -14,8 +15,21 @@ namespace Eigen {
 
 namespace internal {
 
-static uint32x4_t p4ui_CONJ_XOR = EIGEN_INIT_NEON_PACKET4(0x00000000, 0x80000000, 0x00000000, 0x80000000);
-static uint32x2_t p2ui_CONJ_XOR = EIGEN_INIT_NEON_PACKET2(0x00000000, 0x80000000);
+inline uint32x4_t p4ui_CONJ_XOR() {
+// See bug 1325, clang fails to call vld1q_u64.
+#if EIGEN_COMP_CLANG
+  uint32x4_t ret = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
+  return ret;
+#else
+  static const uint32_t conj_XOR_DATA[] = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
+  return vld1q_u32( conj_XOR_DATA );
+#endif
+}
+
+inline uint32x2_t p2ui_CONJ_XOR() {
+  static const uint32_t conj_XOR_DATA[] = { 0x00000000, 0x80000000 };
+  return vld1_u32( conj_XOR_DATA );
+}
 
 //---------- float ----------
 struct Packet2cf
@@ -28,10 +42,12 @@ struct Packet2cf
 template<> struct packet_traits<std::complex<float> >  : default_packet_traits
 {
   typedef Packet2cf type;
+  typedef Packet2cf half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 2,
+    HasHalfPacket = 0,
 
     HasAdd    = 1,
     HasSub    = 1,
@@ -46,7 +62,7 @@ template<> struct packet_traits<std::complex<float> >  : default_packet_traits
   };
 };
 
-template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; };
+template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; };
 
 template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
 {
@@ -62,7 +78,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Pa
 template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
 {
   Packet4ui b = vreinterpretq_u32_f32(a.v);
-  return Packet2cf(vreinterpretq_f32_u32(veorq_u32(b, p4ui_CONJ_XOR)));
+  return Packet2cf(vreinterpretq_f32_u32(veorq_u32(b, p4ui_CONJ_XOR())));
 }
 
 template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
@@ -71,14 +87,14 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, con
 
   // Get the real values of a | a1_re | a1_re | a2_re | a2_re |
   v1 = vcombine_f32(vdup_lane_f32(vget_low_f32(a.v), 0), vdup_lane_f32(vget_high_f32(a.v), 0));
-  // Get the real values of a | a1_im | a1_im | a2_im | a2_im |
+  // Get the imag values of a | a1_im | a1_im | a2_im | a2_im |
   v2 = vcombine_f32(vdup_lane_f32(vget_low_f32(a.v), 1), vdup_lane_f32(vget_high_f32(a.v), 1));
   // Multiply the real a with b
   v1 = vmulq_f32(v1, b.v);
   // Multiply the imag a with b
   v2 = vmulq_f32(v2, b.v);
   // Conjugate v2 
-  v2 = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v2), p4ui_CONJ_XOR));
+  v2 = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v2), p4ui_CONJ_XOR()));
   // Swap real/imag elements in v2.
   v2 = vrev64q_f32(v2);
   // Add and return the result
@@ -87,7 +103,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, con
 
 template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 {
-  return Packet2cf(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
+  return Packet2cf(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
 }
 template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 {
@@ -110,6 +126,22 @@ template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<
 template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); }
 template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); }
 
+template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
+{
+  Packet4f res = pset1<Packet4f>(0.f);
+  res = vsetq_lane_f32(std::real(from[0*stride]), res, 0);
+  res = vsetq_lane_f32(std::imag(from[0*stride]), res, 1);
+  res = vsetq_lane_f32(std::real(from[1*stride]), res, 2);
+  res = vsetq_lane_f32(std::imag(from[1*stride]), res, 3);
+  return Packet2cf(res);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
+{
+  to[stride*0] = std::complex<float>(vgetq_lane_f32(from.v, 0), vgetq_lane_f32(from.v, 1));
+  to[stride*1] = std::complex<float>(vgetq_lane_f32(from.v, 2), vgetq_lane_f32(from.v, 3));
+}
+
 template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *   addr) { EIGEN_ARM_PREFETCH((float *)addr); }
 
 template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Packet2cf& a)
@@ -177,7 +209,7 @@ template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const P
   // Multiply the imag a with b
   v2 = vmul_f32(v2, a2);
   // Conjugate v2 
-  v2 = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v2), p2ui_CONJ_XOR));
+  v2 = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v2), p2ui_CONJ_XOR()));
   // Swap real/imag elements in v2.
   v2 = vrev64_f32(v2);
   // Add v1, v2
@@ -235,7 +267,7 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
 
 template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 {
-  // TODO optimize it for AltiVec
+  // TODO optimize it for NEON
   Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a,b);
   Packet4f s, rev_s;
 
@@ -246,6 +278,207 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, con
   return Packet2cf(pdiv(res.v, vaddq_f32(s,rev_s)));
 }
 
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet2cf,2>& kernel) {
+  Packet4f tmp = vcombine_f32(vget_high_f32(kernel.packet[0].v), vget_high_f32(kernel.packet[1].v));
+  kernel.packet[0].v = vcombine_f32(vget_low_f32(kernel.packet[0].v), vget_low_f32(kernel.packet[1].v));
+  kernel.packet[1].v = tmp;
+}
+
+//---------- double ----------
+#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
+
+// See bug 1325, clang fails to call vld1q_u64.
+#if EIGEN_COMP_CLANG
+  static uint64x2_t p2ul_CONJ_XOR = {0x0, 0x8000000000000000};
+#else
+  const uint64_t  p2ul_conj_XOR_DATA[] = { 0x0, 0x8000000000000000 };
+  static uint64x2_t p2ul_CONJ_XOR = vld1q_u64( p2ul_conj_XOR_DATA );
+#endif
+
+struct Packet1cd
+{
+  EIGEN_STRONG_INLINE Packet1cd() {}
+  EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {}
+  Packet2d v;
+};
+
+template<> struct packet_traits<std::complex<double> >  : default_packet_traits
+{
+  typedef Packet1cd type;
+  typedef Packet1cd half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 0,
+    size = 1,
+    HasHalfPacket = 0,
+
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 1,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasSetLinear = 0
+  };
+};
+
+template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; };
+
+template<> EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>&  from)
+{ /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(padd<Packet2d>(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(psub<Packet2d>(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate<Packet2d>(a.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v), p2ul_CONJ_XOR))); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{
+  Packet2d v1, v2;
+
+  // Get the real values of a 
+  v1 = vdupq_lane_f64(vget_low_f64(a.v), 0);
+  // Get the imag values of a
+  v2 = vdupq_lane_f64(vget_high_f64(a.v), 0);
+  // Multiply the real a with b
+  v1 = vmulq_f64(v1, b.v);
+  // Multiply the imag a with b
+  v2 = vmulq_f64(v2, b.v);
+  // Conjugate v2 
+  v2 = vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(v2), p2ul_CONJ_XOR));
+  // Swap real/imag elements in v2.
+  v2 = preverse<Packet2d>(v2);
+  // Add and return the result
+  return Packet1cd(vaddq_f64(v1, v2));
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd pand   <Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{
+  return Packet1cd(vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v))));
+}
+template<> EIGEN_STRONG_INLINE Packet1cd por    <Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{
+  return Packet1cd(vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v))));
+}
+template<> EIGEN_STRONG_INLINE Packet1cd pxor   <Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{
+  return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v))));
+}
+template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{
+  return Packet1cd(vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v))));
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) { return pset1<Packet1cd>(*from); }
+
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
+
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> *   addr) { EIGEN_ARM_PREFETCH((double *)addr); }
+
+template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride)
+{
+  Packet2d res = pset1<Packet2d>(0.0);
+  res = vsetq_lane_f64(std::real(from[0*stride]), res, 0);
+  res = vsetq_lane_f64(std::imag(from[0*stride]), res, 1);
+  return Packet1cd(res);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index stride)
+{
+  to[stride*0] = std::complex<double>(vgetq_lane_f64(from.v, 0), vgetq_lane_f64(from.v, 1));
+}
+
+
+template<> EIGEN_STRONG_INLINE std::complex<double>  pfirst<Packet1cd>(const Packet1cd& a)
+{
+  std::complex<double> EIGEN_ALIGN16 res;
+  pstore<std::complex<double> >(&res, a);
+
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) { return pfirst(a); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs) { return vecs[0]; }
+
+template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) { return pfirst(a); }
+
+template<int Offset>
+struct palign_impl<Offset,Packet1cd>
+{
+  static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/)
+  {
+    // FIXME is it sure we never have to align a Packet1cd?
+    // Even though a std::complex<double> has 16 bytes, it is not necessarily aligned on a 16 bytes boundary...
+  }
+};
+
+template<> struct conj_helper<Packet1cd, Packet1cd, false,true>
+{
+  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
+  {
+    return internal::pmul(a, pconj(b));
+  }
+};
+
+template<> struct conj_helper<Packet1cd, Packet1cd, true,false>
+{
+  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
+  {
+    return internal::pmul(pconj(a), b);
+  }
+};
+
+template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
+{
+  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
+  {
+    return pconj(internal::pmul(a, b));
+  }
+};
+
+template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{
+  // TODO optimize it for NEON
+  Packet1cd res = conj_helper<Packet1cd,Packet1cd,false,true>().pmul(a,b);
+  Packet2d s = pmul<Packet2d>(b.v, b.v);
+  Packet2d rev_s = preverse<Packet2d>(s);
+
+  return Packet1cd(pdiv(res.v, padd<Packet2d>(s,rev_s)));
+}
+
+EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
+{
+  return Packet1cd(preverse(Packet2d(x.v)));
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd,2>& kernel)
+{
+  Packet2d tmp = vcombine_f64(vget_high_f64(kernel.packet[0].v), vget_high_f64(kernel.packet[1].v));
+  kernel.packet[0].v = vcombine_f64(vget_low_f64(kernel.packet[0].v), vget_low_f64(kernel.packet[1].v));
+  kernel.packet[1].v = tmp;
+}
+#endif // EIGEN_ARCH_ARM64
+
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/Eigen/src/Core/arch/NEON/MathFunctions.h b/Eigen/src/Core/arch/NEON/MathFunctions.h
new file mode 100644
index 000000000..6bb05bb92
--- /dev/null
+++ b/Eigen/src/Core/arch/NEON/MathFunctions.h
@@ -0,0 +1,91 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/* The sin, cos, exp, and log functions of this file come from
+ * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
+ */
+
+#ifndef EIGEN_MATH_FUNCTIONS_NEON_H
+#define EIGEN_MATH_FUNCTIONS_NEON_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f pexp<Packet4f>(const Packet4f& _x)
+{
+  Packet4f x = _x;
+  Packet4f tmp, fx;
+
+  _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
+  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+  _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
+  _EIGEN_DECLARE_CONST_Packet4f(exp_hi,  88.3762626647950f);
+  _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
+
+  x = vminq_f32(x, p4f_exp_hi);
+  x = vmaxq_f32(x, p4f_exp_lo);
+
+  /* express exp(x) as exp(g + n*log(2)) */
+  fx = vmlaq_f32(p4f_half, x, p4f_cephes_LOG2EF);
+
+  /* perform a floorf */
+  tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx));
+
+  /* if greater, substract 1 */
+  Packet4ui mask = vcgtq_f32(tmp, fx);
+  mask = vandq_u32(mask, vreinterpretq_u32_f32(p4f_1));
+
+  fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask));
+
+  tmp = vmulq_f32(fx, p4f_cephes_exp_C1);
+  Packet4f z = vmulq_f32(fx, p4f_cephes_exp_C2);
+  x = vsubq_f32(x, tmp);
+  x = vsubq_f32(x, z);
+
+  Packet4f y = vmulq_f32(p4f_cephes_exp_p0, x);
+  z = vmulq_f32(x, x);
+  y = vaddq_f32(y, p4f_cephes_exp_p1);
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, p4f_cephes_exp_p2);
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, p4f_cephes_exp_p3);
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, p4f_cephes_exp_p4);
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, p4f_cephes_exp_p5);
+
+  y = vmulq_f32(y, z);
+  y = vaddq_f32(y, x);
+  y = vaddq_f32(y, p4f_1);
+
+  /* build 2^n */
+  int32x4_t mm;
+  mm = vcvtq_s32_f32(fx);
+  mm = vaddq_s32(mm, p4i_0x7f);
+  mm = vshlq_n_s32(mm, 23);
+  Packet4f pow2n = vreinterpretq_f32_s32(mm);
+
+  y = vmulq_f32(y, pow2n);
+  return y;
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_MATH_FUNCTIONS_NEON_H
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 94dfab330..84a56bdcc 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -2,7 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
-// Copyright (C) 2010 Konstantinos Margaritis <markos@codex.gr>
+// Copyright (C) 2010 Konstantinos Margaritis <markos@freevec.org>
 // Heavily based on Gael's SSE version.
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -20,42 +20,44 @@ namespace internal {
 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
 #endif
 
-// FIXME NEON has 16 quad registers, but since the current register allocator
-// is so bad, it is much better to reduce it to 8
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#endif
+
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
+#endif
+
 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
-#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 8
+#if EIGEN_ARCH_ARM64
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
+#else
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16 
+#endif
 #endif
 
+typedef float32x2_t Packet2f;
 typedef float32x4_t Packet4f;
 typedef int32x4_t   Packet4i;
+typedef int32x2_t   Packet2i;
 typedef uint32x4_t  Packet4ui;
 
 #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
   const Packet4f p4f_##NAME = pset1<Packet4f>(X)
 
 #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
-  const Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int>(X))
+  const Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int32_t>(X))
 
 #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
   const Packet4i p4i_##NAME = pset1<Packet4i>(X)
 
-#if defined(__llvm__) && !defined(__clang__)
-  //Special treatment for Apple's llvm-gcc, its NEON packet types are unions
-  #define EIGEN_INIT_NEON_PACKET2(X, Y)       {{X, Y}}
-  #define EIGEN_INIT_NEON_PACKET4(X, Y, Z, W) {{X, Y, Z, W}}
-#else
-  //Default initializer for packets
-  #define EIGEN_INIT_NEON_PACKET2(X, Y)       {X, Y}
-  #define EIGEN_INIT_NEON_PACKET4(X, Y, Z, W) {X, Y, Z, W}
-#endif
-
 // arm64 does have the pld instruction. If available, let's trust the __builtin_prefetch built-in function
 // which available on LLVM and GCC (at least)
-#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || defined(__GNUC__)
+#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
   #define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR);
 #elif defined __pld
   #define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR)
-#elif !defined(__aarch64__)
+#elif !EIGEN_ARCH_ARM64
   #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ( "   pld [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
 #else
   // by default no explicit prefetching
@@ -65,53 +67,60 @@ typedef uint32x4_t  Packet4ui;
 template<> struct packet_traits<float>  : default_packet_traits
 {
   typedef Packet4f type;
+  typedef Packet4f half; // Packet2f intrinsics not implemented yet
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 4,
+    HasHalfPacket=0, // Packet2f intrinsics not implemented yet
    
     HasDiv  = 1,
     // FIXME check the Has*
     HasSin  = 0,
     HasCos  = 0,
     HasLog  = 0,
-    HasExp  = 0,
+    HasExp  = 1,
     HasSqrt = 0
   };
 };
-template<> struct packet_traits<int>    : default_packet_traits
+template<> struct packet_traits<int32_t>    : default_packet_traits
 {
   typedef Packet4i type;
+  typedef Packet4i half; // Packet2i intrinsics not implemented yet
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
-    size=4
+    size=4,
+    HasHalfPacket=0 // Packet2i intrinsics not implemented yet
     // FIXME check the Has*
   };
 };
 
-#if EIGEN_GNUC_AT_MOST(4,4) && !defined(__llvm__)
+#if EIGEN_GNUC_AT_MOST(4,4) && !EIGEN_COMP_LLVM
 // workaround gcc 4.2, 4.3 and 4.4 compilatin issue
 EIGEN_STRONG_INLINE float32x4_t vld1q_f32(const float* x) { return ::vld1q_f32((const float32_t*)x); }
 EIGEN_STRONG_INLINE float32x2_t vld1_f32 (const float* x) { return ::vld1_f32 ((const float32_t*)x); }
+EIGEN_STRONG_INLINE float32x2_t vld1_dup_f32 (const float* x) { return ::vld1_dup_f32 ((const float32_t*)x); }
 EIGEN_STRONG_INLINE void        vst1q_f32(float* to, float32x4_t from) { ::vst1q_f32((float32_t*)to,from); }
 EIGEN_STRONG_INLINE void        vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); }
 #endif
 
-template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4}; };
-template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4}; };
+template<> struct unpacket_traits<Packet4f> { typedef float   type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
+template<> struct unpacket_traits<Packet4i> { typedef int32_t type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
 
 template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) { return vdupq_n_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from)   { return vdupq_n_s32(from); }
+template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t&    from)   { return vdupq_n_s32(from); }
 
-template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a)
+template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a)
 {
-  Packet4f countdown = EIGEN_INIT_NEON_PACKET4(0, 1, 2, 3);
+  const float32_t f[] = {0, 1, 2, 3};
+  Packet4f countdown = vld1q_f32(f);
   return vaddq_f32(pset1<Packet4f>(a), countdown);
 }
-template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a)
+template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int32_t& a)
 {
-  Packet4i countdown = EIGEN_INIT_NEON_PACKET4(0, 1, 2, 3);
+  const int32_t i[] = {0, 1, 2, 3};
+  Packet4i countdown = vld1q_s32(i);
   return vaddq_s32(pset1<Packet4i>(a), countdown);
 }
 
@@ -132,6 +141,9 @@ template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const
 
 template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
 {
+#if EIGEN_ARCH_ARM64
+  return vdivq_f32(a,b);
+#else
   Packet4f inv, restep, div;
 
   // NEON does not offer a divide instruction, we have to do a reciprocal approximation
@@ -150,14 +162,51 @@ template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const
   div = vmulq_f32(a, inv);
 
   return div;
+#endif
 }
+
 template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
 { eigen_assert(false && "packet integer division are not supported by NEON");
   return pset1<Packet4i>(0);
 }
 
-// for some weird raisons, it has to be overloaded for packet of integers
-template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vmlaq_f32(c,a,b); }
+// Clang/ARM wrongly advertises __ARM_FEATURE_FMA even when it's not available,
+// then implements a slow software scalar fallback calling fmaf()!
+// Filed LLVM bug:
+//     https://llvm.org/bugs/show_bug.cgi?id=27216
+#if (defined __ARM_FEATURE_FMA) && !(EIGEN_COMP_CLANG && EIGEN_ARCH_ARM)
+// See bug 936.
+// FMA is available on VFPv4 i.e. when compiling with -mfpu=neon-vfpv4.
+// FMA is a true fused multiply-add i.e. only 1 rounding at the end, no intermediate rounding.
+// MLA is not fused i.e. does 2 roundings.
+// In addition to giving better accuracy, FMA also gives better performance here on a Krait (Nexus 4):
+// MLA: 10 GFlop/s ; FMA: 12 GFlops/s.
+template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vfmaq_f32(c,a,b); }
+#else
+template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+#if EIGEN_COMP_CLANG && EIGEN_ARCH_ARM
+  // Clang/ARM will replace VMLA by VMUL+VADD at least for some values of -mcpu,
+  // at least -mcpu=cortex-a8 and -mcpu=cortex-a7. Since the former is the default on
+  // -march=armv7-a, that is a very common case.
+  // See e.g. this thread:
+  //     http://lists.llvm.org/pipermail/llvm-dev/2013-December/068806.html
+  // Filed LLVM bug:
+  //     https://llvm.org/bugs/show_bug.cgi?id=27219
+  Packet4f r = c;
+  asm volatile(
+    "vmla.f32 %q[r], %q[a], %q[b]"
+    : [r] "+w" (r)
+    : [a] "w" (a),
+      [b] "w" (b)
+    : );
+  return r;
+#else
+  return vmlaq_f32(c,a,b);
+#endif
+}
+#endif
+
+// No FMA instruction for int, so use MLA unconditionally.
 template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return vmlaq_s32(c,a,b); }
 
 template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); }
@@ -191,20 +240,20 @@ template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, con
 }
 template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vbicq_s32(a,b); }
 
-template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*   from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); }
+template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float*    from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); }
+template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t*  from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); }
 
-template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)   { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); }
+template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float*   from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); }
+template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int32_t* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); }
 
-template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float*   from)
+template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
 {
   float32x2_t lo, hi;
   lo = vld1_dup_f32(from);
   hi = vld1_dup_f32(from+1);
   return vcombine_f32(lo, hi);
 }
-template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int*     from)
+template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int32_t* from)
 {
   int32x2_t lo, hi;
   lo = vld1_dup_s32(from);
@@ -212,18 +261,52 @@ template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int*     from)
   return vcombine_s32(lo, hi);
 }
 
-template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); }
-template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); }
+template<> EIGEN_STRONG_INLINE void pstore<float>  (float*    to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); }
+template<> EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t*  to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); }
+
+template<> EIGEN_STRONG_INLINE void pstoreu<float>  (float*   to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); }
 
-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*  to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*      to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); }
+template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
+{
+  Packet4f res = pset1<Packet4f>(0.f);
+  res = vsetq_lane_f32(from[0*stride], res, 0);
+  res = vsetq_lane_f32(from[1*stride], res, 1);
+  res = vsetq_lane_f32(from[2*stride], res, 2);
+  res = vsetq_lane_f32(from[3*stride], res, 3);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride)
+{
+  Packet4i res = pset1<Packet4i>(0);
+  res = vsetq_lane_s32(from[0*stride], res, 0);
+  res = vsetq_lane_s32(from[1*stride], res, 1);
+  res = vsetq_lane_s32(from[2*stride], res, 2);
+  res = vsetq_lane_s32(from[3*stride], res, 3);
+  return res;
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
+{
+  to[stride*0] = vgetq_lane_f32(from, 0);
+  to[stride*1] = vgetq_lane_f32(from, 1);
+  to[stride*2] = vgetq_lane_f32(from, 2);
+  to[stride*3] = vgetq_lane_f32(from, 3);
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from, Index stride)
+{
+  to[stride*0] = vgetq_lane_s32(from, 0);
+  to[stride*1] = vgetq_lane_s32(from, 1);
+  to[stride*2] = vgetq_lane_s32(from, 2);
+  to[stride*3] = vgetq_lane_s32(from, 3);
+}
 
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_ARM_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*     addr) { EIGEN_ARM_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<float>  (const float*    addr) { EIGEN_ARM_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t*  addr) { EIGEN_ARM_PREFETCH(addr); }
 
 // FIXME only store the 2 first elements ?
-template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; }
-template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int   EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; }
+template<> EIGEN_STRONG_INLINE float   pfirst<Packet4f>(const Packet4f& a) { float   EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; }
+template<> EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) { int32_t EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; }
 
 template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
   float32x2_t a_lo, a_hi;
@@ -243,6 +326,7 @@ template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
   a_hi = vget_high_s32(a_r64);
   return vcombine_s32(a_hi, a_lo);
 }
+
 template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vabsq_f32(a); }
 template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vabsq_s32(a); }
 
@@ -277,7 +361,7 @@ template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
   return sum;
 }
 
-template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
+template<> EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a)
 {
   int32x2_t a_lo, a_hi, sum;
 
@@ -324,7 +408,7 @@ template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
 
   return vget_lane_f32(prod, 0);
 }
-template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
+template<> EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a)
 {
   int32x2_t a_lo, a_hi, prod;
 
@@ -352,7 +436,7 @@ template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
   return vget_lane_f32(min, 0);
 }
 
-template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
+template<> EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a)
 {
   int32x2_t a_lo, a_hi, min;
 
@@ -377,13 +461,14 @@ template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
   return vget_lane_f32(max, 0);
 }
 
-template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
+template<> EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a)
 {
   int32x2_t a_lo, a_hi, max;
 
   a_lo = vget_low_s32(a);
   a_hi = vget_high_s32(a);
   max = vpmax_s32(a_lo, a_hi);
+  max = vpmax_s32(max, max);
 
   return vget_lane_s32(max, 0);
 }
@@ -409,9 +494,231 @@ PALIGN_NEON(0,Packet4i,vextq_s32)
 PALIGN_NEON(1,Packet4i,vextq_s32)
 PALIGN_NEON(2,Packet4i,vextq_s32)
 PALIGN_NEON(3,Packet4i,vextq_s32)
-    
+
 #undef PALIGN_NEON
 
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4f,4>& kernel) {
+  float32x4x2_t tmp1 = vzipq_f32(kernel.packet[0], kernel.packet[1]);
+  float32x4x2_t tmp2 = vzipq_f32(kernel.packet[2], kernel.packet[3]);
+
+  kernel.packet[0] = vcombine_f32(vget_low_f32(tmp1.val[0]), vget_low_f32(tmp2.val[0]));
+  kernel.packet[1] = vcombine_f32(vget_high_f32(tmp1.val[0]), vget_high_f32(tmp2.val[0]));
+  kernel.packet[2] = vcombine_f32(vget_low_f32(tmp1.val[1]), vget_low_f32(tmp2.val[1]));
+  kernel.packet[3] = vcombine_f32(vget_high_f32(tmp1.val[1]), vget_high_f32(tmp2.val[1]));
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4i,4>& kernel) {
+  int32x4x2_t tmp1 = vzipq_s32(kernel.packet[0], kernel.packet[1]);
+  int32x4x2_t tmp2 = vzipq_s32(kernel.packet[2], kernel.packet[3]);
+  kernel.packet[0] = vcombine_s32(vget_low_s32(tmp1.val[0]), vget_low_s32(tmp2.val[0]));
+  kernel.packet[1] = vcombine_s32(vget_high_s32(tmp1.val[0]), vget_high_s32(tmp2.val[0]));
+  kernel.packet[2] = vcombine_s32(vget_low_s32(tmp1.val[1]), vget_low_s32(tmp2.val[1]));
+  kernel.packet[3] = vcombine_s32(vget_high_s32(tmp1.val[1]), vget_high_s32(tmp2.val[1]));
+}
+
+//---------- double ----------
+
+// Clang 3.5 in the iOS toolchain has an ICE triggered by NEON intrisics for double.
+// Confirmed at least with __apple_build_version__ = 6000054.
+#ifdef __apple_build_version__
+// Let's hope that by the time __apple_build_version__ hits the 601* range, the bug will be fixed.
+// https://gist.github.com/yamaya/2924292 suggests that the 3 first digits are only updated with
+// major toolchain updates.
+#define EIGEN_APPLE_DOUBLE_NEON_BUG (__apple_build_version__ < 6010000)
+#else
+#define EIGEN_APPLE_DOUBLE_NEON_BUG 0
+#endif
+
+#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
+
+// Bug 907: workaround missing declarations of the following two functions in the ADK
+// Defining these functions as templates ensures that if these intrinsics are
+// already defined in arm_neon.h, then our workaround doesn't cause a conflict
+// and has lower priority in overload resolution.
+template <typename T>
+uint64x2_t vreinterpretq_u64_f64(T a)
+{
+  return (uint64x2_t) a;
+}
+
+template <typename T>
+float64x2_t vreinterpretq_f64_u64(T a)
+{
+  return (float64x2_t) a;
+}
+
+typedef float64x2_t Packet2d;
+typedef float64x1_t Packet1d;
+
+template<> struct packet_traits<double>  : default_packet_traits
+{
+  typedef Packet2d type;
+  typedef Packet2d half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
+    HasHalfPacket=0,
+   
+    HasDiv  = 1,
+    // FIXME check the Has*
+    HasSin  = 0,
+    HasCos  = 0,
+    HasLog  = 0,
+    HasExp  = 0,
+    HasSqrt = 0
+  };
+};
+
+template<> struct unpacket_traits<Packet2d> { typedef double  type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
+
+template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double&  from) { return vdupq_n_f64(from); }
+
+template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a)
+{
+  const double countdown_raw[] = {0.0,1.0};
+  const Packet2d countdown = vld1q_f64(countdown_raw);
+  return vaddq_f64(pset1<Packet2d>(a), countdown);
+}
+template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return vaddq_f64(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return vsubq_f64(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return vnegq_f64(a); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vmulq_f64(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vdivq_f64(a,b); }
+
+#ifdef __ARM_FEATURE_FMA
+// See bug 936. See above comment about FMA for float.
+template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vfmaq_f64(c,a,b); }
+#else
+template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vmlaq_f64(c,a,b); }
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vminq_f64(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vmaxq_f64(a,b); }
+
+// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics
+template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b)
+{
+  return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b)
+{
+  return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b)
+{
+  return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b)
+{
+  return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from); }
+
+template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from); }
+
+template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double*   from)
+{
+  return vld1q_dup_f64(from);
+}
+template<> EIGEN_STRONG_INLINE void pstore<double>(double*   to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(to, from); }
+
+template<> EIGEN_STRONG_INLINE void pstoreu<double>(double*  to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f64(to, from); }
+
+template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
+{
+  Packet2d res = pset1<Packet2d>(0.0);
+  res = vsetq_lane_f64(from[0*stride], res, 0);
+  res = vsetq_lane_f64(from[1*stride], res, 1);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
+{
+  to[stride*0] = vgetq_lane_f64(from, 0);
+  to[stride*1] = vgetq_lane_f64(from, 1);
+}
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_ARM_PREFETCH(addr); }
+
+// FIXME only store the 2 first elements ?
+template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(a, 0); }
+
+template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return vcombine_f64(vget_high_f64(a), vget_low_f64(a)); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); }
+
+#if EIGEN_COMP_CLANG && defined(__apple_build_version__)
+// workaround ICE, see bug 907
+template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return (vget_low_f64(a) + vget_high_f64(a))[0]; }
+#else
+template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); }
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
+{
+  float64x2_t trn1, trn2;
+
+  // NEON zip performs interleaving of the supplied vectors.
+  // We perform two interleaves in a row to acquire the transposed vector
+  trn1 = vzip1q_f64(vecs[0], vecs[1]);
+  trn2 = vzip2q_f64(vecs[0], vecs[1]);
+
+  // Do the addition of the resulting vectors
+  return vaddq_f64(trn1, trn2);
+}
+// Other reduction functions:
+// mul
+#if EIGEN_COMP_CLANG && defined(__apple_build_version__)
+template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { return (vget_low_f64(a) * vget_high_f64(a))[0]; }
+#else
+template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); }
+#endif
+
+// min
+template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(vpminq_f64(a, a), 0); }
+
+// max
+template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(vpmaxq_f64(a, a), 0); }
+
+// this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors,
+// see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074
+#define PALIGN_NEON(Offset,Type,Command) \
+template<>\
+struct palign_impl<Offset,Type>\
+{\
+    EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\
+    {\
+        if (Offset!=0)\
+            first = Command(first, second, Offset);\
+    }\
+};\
+
+PALIGN_NEON(0,Packet2d,vextq_f64)
+PALIGN_NEON(1,Packet2d,vextq_f64)
+#undef PALIGN_NEON
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet2d,2>& kernel) {
+  float64x2_t trn1 = vzip1q_f64(kernel.packet[0], kernel.packet[1]);
+  float64x2_t trn2 = vzip2q_f64(kernel.packet[0], kernel.packet[1]);
+
+  kernel.packet[0] = trn1;
+  kernel.packet[1] = trn2;
+}
+#endif // EIGEN_ARCH_ARM64 
+
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/Eigen/src/Core/arch/SSE/CMakeLists.txt b/Eigen/src/Core/arch/SSE/CMakeLists.txt
deleted file mode 100644
index 46ea7cc62..000000000
--- a/Eigen/src/Core/arch/SSE/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_Core_arch_SSE_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_Core_arch_SSE_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/SSE COMPONENT Devel
-)
diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h
index 91bba5e38..5607fe0ab 100644
--- a/Eigen/src/Core/arch/SSE/Complex.h
+++ b/Eigen/src/Core/arch/SSE/Complex.h
@@ -22,13 +22,18 @@ struct Packet2cf
   __m128  v;
 };
 
+// Use the packet_traits defined in AVX/PacketMath.h instead if we're going
+// to leverage AVX instructions.
+#ifndef EIGEN_VECTORIZE_AVX
 template<> struct packet_traits<std::complex<float> >  : default_packet_traits
 {
   typedef Packet2cf type;
+  typedef Packet2cf half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 2,
+    HasHalfPacket = 0,
 
     HasAdd    = 1,
     HasSub    = 1,
@@ -39,11 +44,13 @@ template<> struct packet_traits<std::complex<float> >  : default_packet_traits
     HasAbs2   = 0,
     HasMin    = 0,
     HasMax    = 0,
-    HasSetLinear = 0
+    HasSetLinear = 0,
+    HasBlend = 1
   };
 };
+#endif
 
-template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; };
+template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; };
 
 template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); }
@@ -60,7 +67,6 @@ template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
 
 template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 {
-  // TODO optimize it for SSE3 and 4
   #ifdef EIGEN_VECTORIZE_SSE3
   return Packet2cf(_mm_addsub_ps(_mm_mul_ps(_mm_moveldup_ps(a.v), b.v),
                                  _mm_mul_ps(_mm_movehdup_ps(a.v),
@@ -104,8 +110,23 @@ template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<flo
 
 template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }
 
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); }
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), Packet4f(from.v)); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), Packet4f(from.v)); }
+
+
+template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
+{
+  return Packet2cf(_mm_set_ps(std::imag(from[1*stride]), std::real(from[1*stride]),
+                              std::imag(from[0*stride]), std::real(from[0*stride])));
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
+{
+  to[stride*0] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 0)),
+                                     _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 1)));
+  to[stride*1] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 2)),
+                                     _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3)));
+}
 
 template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
 
@@ -124,7 +145,7 @@ template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Pack
   #endif
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { return Packet2cf(_mm_castpd_ps(preverse(_mm_castps_pd(a.v)))); }
+template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { return Packet2cf(_mm_castpd_ps(preverse(Packet2d(_mm_castps_pd(a.v))))); }
 
 template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
 {
@@ -214,7 +235,7 @@ template<> struct conj_helper<Packet4f, Packet2cf, false,false>
   { return padd(c, pmul(x,y)); }
 
   EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const
-  { return Packet2cf(Eigen::internal::pmul(x, y.v)); }
+  { return Packet2cf(Eigen::internal::pmul<Packet4f>(x, y.v)); }
 };
 
 template<> struct conj_helper<Packet2cf, Packet4f, false,false>
@@ -223,7 +244,7 @@ template<> struct conj_helper<Packet2cf, Packet4f, false,false>
   { return padd(c, pmul(x,y)); }
 
   EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const
-  { return Packet2cf(Eigen::internal::pmul(x.v, y)); }
+  { return Packet2cf(Eigen::internal::pmul<Packet4f>(x.v, y)); }
 };
 
 template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
@@ -234,7 +255,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, con
   return Packet2cf(_mm_div_ps(res.v,_mm_add_ps(s,_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(s), 0xb1)))));
 }
 
-EIGEN_STRONG_INLINE Packet2cf pcplxflip/*<Packet2cf>*/(const Packet2cf& x)
+EIGEN_STRONG_INLINE Packet2cf pcplxflip/* <Packet2cf> */(const Packet2cf& x)
 {
   return Packet2cf(vec4f_swizzle1(x.v, 1, 0, 3, 2));
 }
@@ -248,13 +269,18 @@ struct Packet1cd
   __m128d  v;
 };
 
+// Use the packet_traits defined in AVX/PacketMath.h instead if we're going
+// to leverage AVX instructions.
+#ifndef EIGEN_VECTORIZE_AVX
 template<> struct packet_traits<std::complex<double> >  : default_packet_traits
 {
   typedef Packet1cd type;
+  typedef Packet1cd half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 0,
     size = 1,
+    HasHalfPacket = 0,
 
     HasAdd    = 1,
     HasSub    = 1,
@@ -268,12 +294,13 @@ template<> struct packet_traits<std::complex<double> >  : default_packet_traits
     HasSetLinear = 0
   };
 };
+#endif
 
-template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1}; };
+template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; };
 
 template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(a.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
 template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a)
 {
   const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
@@ -282,9 +309,8 @@ template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a)
 
 template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
 {
-  // TODO optimize it for SSE3 and 4
   #ifdef EIGEN_VECTORIZE_SSE3
-  return Packet1cd(_mm_addsub_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v),
+  return Packet1cd(_mm_addsub_pd(_mm_mul_pd(_mm_movedup_pd(a.v), b.v),
                                  _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
                                             vec2d_swizzle1(b.v, 1, 0))));
   #else
@@ -311,8 +337,8 @@ template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<dou
 template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) { return pset1<Packet1cd>(*from); }
 
 // FIXME force unaligned store, this is a temporary fix
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v)); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v)); }
 
 template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> *   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
 
@@ -410,7 +436,7 @@ template<> struct conj_helper<Packet2d, Packet1cd, false,false>
   { return padd(c, pmul(x,y)); }
 
   EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const
-  { return Packet1cd(Eigen::internal::pmul(x, y.v)); }
+  { return Packet1cd(Eigen::internal::pmul<Packet2d>(x, y.v)); }
 };
 
 template<> struct conj_helper<Packet1cd, Packet2d, false,false>
@@ -419,7 +445,7 @@ template<> struct conj_helper<Packet1cd, Packet2d, false,false>
   { return padd(c, pmul(x,y)); }
 
   EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const
-  { return Packet1cd(Eigen::internal::pmul(x.v, y)); }
+  { return Packet1cd(Eigen::internal::pmul<Packet2d>(x.v, y)); }
 };
 
 template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
@@ -430,9 +456,44 @@ template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, con
   return Packet1cd(_mm_div_pd(res.v, _mm_add_pd(s,_mm_shuffle_pd(s, s, 0x1))));
 }
 
-EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
+EIGEN_STRONG_INLINE Packet1cd pcplxflip/* <Packet1cd> */(const Packet1cd& x)
+{
+  return Packet1cd(preverse(Packet2d(x.v)));
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet2cf,2>& kernel) {
+  __m128d w1 = _mm_castps_pd(kernel.packet[0].v);
+  __m128d w2 = _mm_castps_pd(kernel.packet[1].v);
+
+  __m128 tmp = _mm_castpd_ps(_mm_unpackhi_pd(w1, w2));
+  kernel.packet[0].v = _mm_castpd_ps(_mm_unpacklo_pd(w1, w2));
+  kernel.packet[1].v = tmp;
+}
+
+template<>  EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
+  __m128d result = pblend<Packet2d>(ifPacket, _mm_castps_pd(thenPacket.v), _mm_castps_pd(elsePacket.v));
+  return Packet2cf(_mm_castpd_ps(result));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf pinsertfirst(const Packet2cf& a, std::complex<float> b)
+{
+  return Packet2cf(_mm_loadl_pi(a.v, reinterpret_cast<const __m64*>(&b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd pinsertfirst(const Packet1cd&, std::complex<double> b)
+{
+  return pset1<Packet1cd>(b);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf pinsertlast(const Packet2cf& a, std::complex<float> b)
+{
+  return Packet2cf(_mm_loadh_pi(a.v, reinterpret_cast<const __m64*>(&b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd pinsertlast(const Packet1cd&, std::complex<double> b)
 {
-  return Packet1cd(preverse(x.v));
+  return pset1<Packet1cd>(b);
 }
 
 } // end namespace internal
diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h
index d16f30bb0..7b5f948e1 100644
--- a/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ b/Eigen/src/Core/arch/SSE/MathFunctions.h
@@ -32,7 +32,7 @@ Packet4f plog<Packet4f>(const Packet4f& _x)
   /* the smallest non denormalized float number */
   _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos,  0x00800000);
   _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf,     0xff800000);//-1.f/0.f);
-  
+
   /* natural logarithm computed for 4 simultaneous float
     return NaN for x <= 0
   */
@@ -63,7 +63,7 @@ Packet4f plog<Packet4f>(const Packet4f& _x)
   x = _mm_or_ps(x, p4f_half);
 
   emm0 = _mm_sub_epi32(emm0, p4i_0x7f);
-  Packet4f e = padd(_mm_cvtepi32_ps(emm0), p4f_1);
+  Packet4f e = padd(Packet4f(_mm_cvtepi32_ps(emm0)), p4f_1);
 
   /* part2:
      if( x < SQRTHF ) {
@@ -72,9 +72,9 @@ Packet4f plog<Packet4f>(const Packet4f& _x)
      } else { x = x - 1.0; }
   */
   Packet4f mask = _mm_cmplt_ps(x, p4f_cephes_SQRTHF);
-  Packet4f tmp = _mm_and_ps(x, mask);
+  Packet4f tmp = pand(x, mask);
   x = psub(x, p4f_1);
-  e = psub(e, _mm_and_ps(p4f_1, mask));
+  e = psub(e, pand(p4f_1, mask));
   x = padd(x, tmp);
 
   Packet4f x2 = pmul(x,x);
@@ -126,7 +126,7 @@ Packet4f pexp<Packet4f>(const Packet4f& _x)
   _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
   _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
 
-  Packet4f tmp = _mm_setzero_ps(), fx;
+  Packet4f tmp, fx;
   Packet4i emm0;
 
   // clamp x
@@ -195,7 +195,7 @@ Packet2d pexp<Packet2d>(const Packet2d& _x)
   _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
   static const __m128i p4i_1023_0 = _mm_setr_epi32(1023, 1023, 0, 0);
 
-  Packet2d tmp = _mm_setzero_pd(), fx;
+  Packet2d tmp, fx;
   Packet4i emm0;
 
   // clamp x
@@ -279,7 +279,7 @@ Packet4f psin<Packet4f>(const Packet4f& _x)
   _EIGEN_DECLARE_CONST_Packet4f(coscof_p2,  4.166664568298827E-002f);
   _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI
 
-  Packet4f xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
+  Packet4f xmm1, xmm2, xmm3, sign_bit, y;
 
   Packet4i emm0, emm2;
   sign_bit = x;
@@ -378,7 +378,7 @@ Packet4f pcos<Packet4f>(const Packet4f& _x)
   _EIGEN_DECLARE_CONST_Packet4f(coscof_p2,  4.166664568298827E-002f);
   _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI
 
-  Packet4f xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
+  Packet4f xmm1, xmm2, xmm3, y;
   Packet4i emm0, emm2;
 
   x = pabs(x);
@@ -444,32 +444,119 @@ Packet4f pcos<Packet4f>(const Packet4f& _x)
 
 #if EIGEN_FAST_MATH
 
-// This is based on Quake3's fast inverse square root.
+// Functions for sqrt.
+// The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step
+// of Newton's method, at a cost of 1-2 bits of precision as opposed to the
+// exact solution. It does not handle +inf, or denormalized numbers correctly.
+// The main advantage of this approach is not just speed, but also the fact that
+// it can be inlined and pipelined with other computations, further reducing its
+// effective latency. This is similar to Quake3's fast inverse square root.
 // For detail see here: http://www.beyond3d.com/content/articles/8/
-// It lacks 1 (or 2 bits in some rare cases) of precision, and does not handle negative, +inf, or denormalized numbers correctly.
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f psqrt<Packet4f>(const Packet4f& _x)
 {
   Packet4f half = pmul(_x, pset1<Packet4f>(.5f));
+  Packet4f denormal_mask = _mm_and_ps(
+      _mm_cmpge_ps(_x, _mm_setzero_ps()),
+      _mm_cmplt_ps(_x, pset1<Packet4f>((std::numeric_limits<float>::min)())));
 
-  /* select only the inverse sqrt of non-zero inputs */
-  Packet4f non_zero_mask = _mm_cmpge_ps(_x, pset1<Packet4f>((std::numeric_limits<float>::min)()));
-  Packet4f x = _mm_and_ps(non_zero_mask, _mm_rsqrt_ps(_x));
-
+  // Compute approximate reciprocal sqrt.
+  Packet4f x = _mm_rsqrt_ps(_x);
+  // Do a single step of Newton's iteration.
   x = pmul(x, psub(pset1<Packet4f>(1.5f), pmul(half, pmul(x,x))));
-  return pmul(_x,x);
+  // Flush results for denormals to zero.
+  return _mm_andnot_ps(denormal_mask, pmul(_x,x));
 }
 
 #else
 
-template<> EIGEN_STRONG_INLINE Packet4f psqrt<Packet4f>(const Packet4f& x) { return _mm_sqrt_ps(x); }
+template<>EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f psqrt<Packet4f>(const Packet4f& x) { return _mm_sqrt_ps(x); }
+
+#endif
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet2d psqrt<Packet2d>(const Packet2d& x) { return _mm_sqrt_pd(x); }
+
+#if EIGEN_FAST_MATH
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f prsqrt<Packet4f>(const Packet4f& _x) {
+  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000);
+  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(nan, 0x7fc00000);
+  _EIGEN_DECLARE_CONST_Packet4f(one_point_five, 1.5f);
+  _EIGEN_DECLARE_CONST_Packet4f(minus_half, -0.5f);
+  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(flt_min, 0x00800000);
+
+  Packet4f neg_half = pmul(_x, p4f_minus_half);
+
+  // select only the inverse sqrt of positive normal inputs (denormals are
+  // flushed to zero and cause infs as well).
+  Packet4f le_zero_mask = _mm_cmple_ps(_x, p4f_flt_min);
+  Packet4f x = _mm_andnot_ps(le_zero_mask, _mm_rsqrt_ps(_x));
+
+  // Fill in NaNs and Infs for the negative/zero entries.
+  Packet4f neg_mask = _mm_cmplt_ps(_x, _mm_setzero_ps());
+  Packet4f zero_mask = _mm_andnot_ps(neg_mask, le_zero_mask);
+  Packet4f infs_and_nans = _mm_or_ps(_mm_and_ps(neg_mask, p4f_nan),
+                                     _mm_and_ps(zero_mask, p4f_inf));
+
+  // Do a single step of Newton's iteration.
+  x = pmul(x, pmadd(neg_half, pmul(x, x), p4f_one_point_five));
+
+  // Insert NaNs and Infs in all the right places.
+  return _mm_or_ps(x, infs_and_nans);
+}
+
+#else
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f prsqrt<Packet4f>(const Packet4f& x) {
+  // Unfortunately we can't use the much faster mm_rqsrt_ps since it only provides an approximation.
+  return _mm_div_ps(pset1<Packet4f>(1.0f), _mm_sqrt_ps(x));
+}
 
 #endif
 
-template<> EIGEN_STRONG_INLINE Packet2d psqrt<Packet2d>(const Packet2d& x) { return _mm_sqrt_pd(x); }
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet2d prsqrt<Packet2d>(const Packet2d& x) {
+  // Unfortunately we can't use the much faster mm_rqsrt_pd since it only provides an approximation.
+  return _mm_div_pd(pset1<Packet2d>(1.0), _mm_sqrt_pd(x));
+}
+
+// Hyperbolic Tangent function.
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
+ptanh<Packet4f>(const Packet4f& x) {
+  return internal::generic_fast_tanh_float(x);
+}
 
 } // end namespace internal
 
+namespace numext {
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float sqrt(const float &x)
+{
+  return internal::pfirst(internal::Packet4f(_mm_sqrt_ss(_mm_set_ss(x))));
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double sqrt(const double &x)
+{
+#if EIGEN_COMP_GNUC_STRICT
+  // This works around a GCC bug generating poor code for _mm_sqrt_pd
+  // See https://bitbucket.org/eigen/eigen/commits/14f468dba4d350d7c19c9b93072e19f7b3df563b
+  return internal::pfirst(internal::Packet2d(__builtin_ia32_sqrtsd(_mm_set_sd(x))));
+#else
+  return internal::pfirst(internal::Packet2d(_mm_sqrt_pd(_mm_set_sd(x))));
+#endif
+}
+
+} // end namespace numex
+
 } // end namespace Eigen
 
 #endif // EIGEN_MATH_FUNCTIONS_SSE_H
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 32caaa26c..3832de147 100644..100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -22,9 +22,40 @@ namespace internal {
 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
 #endif
 
+#ifdef __FMA__
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1
+#endif
+#endif
+
+#if (defined EIGEN_VECTORIZE_AVX) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_MINGW) && (__GXX_ABI_VERSION < 1004)
+// With GCC's default ABI version, a __m128 or __m256 are the same types and therefore we cannot
+// have overloads for both types without linking error.
+// One solution is to increase ABI version using -fabi-version=4 (or greater).
+// Otherwise, we workaround this inconvenience by wrapping 128bit types into the following helper
+// structure:
+template<typename T>
+struct eigen_packet_wrapper
+{
+  EIGEN_ALWAYS_INLINE operator T&() { return m_val; }
+  EIGEN_ALWAYS_INLINE operator const T&() const { return m_val; }
+  EIGEN_ALWAYS_INLINE eigen_packet_wrapper() {}
+  EIGEN_ALWAYS_INLINE eigen_packet_wrapper(const T &v) : m_val(v) {}
+  EIGEN_ALWAYS_INLINE eigen_packet_wrapper& operator=(const T &v) {
+    m_val = v;
+    return *this;
+  }
+  
+  T m_val;
+};
+typedef eigen_packet_wrapper<__m128>  Packet4f;
+typedef eigen_packet_wrapper<__m128i> Packet4i;
+typedef eigen_packet_wrapper<__m128d> Packet2d;
+#else
 typedef __m128  Packet4f;
 typedef __m128i Packet4i;
 typedef __m128d Packet2d;
+#endif
 
 template<> struct is_arithmetic<__m128>  { enum { value = true }; };
 template<> struct is_arithmetic<__m128i> { enum { value = true }; };
@@ -58,51 +89,85 @@ template<> struct is_arithmetic<__m128d> { enum { value = true }; };
   const Packet4i p4i_##NAME = pset1<Packet4i>(X)
 
 
+// Use the packet_traits defined in AVX/PacketMath.h instead if we're going
+// to leverage AVX instructions.
+#ifndef EIGEN_VECTORIZE_AVX
 template<> struct packet_traits<float>  : default_packet_traits
 {
   typedef Packet4f type;
+  typedef Packet4f half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size=4,
+    HasHalfPacket = 0,
 
     HasDiv  = 1,
     HasSin  = EIGEN_FAST_MATH,
     HasCos  = EIGEN_FAST_MATH,
     HasLog  = 1,
     HasExp  = 1,
-    HasSqrt = 1
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasTanh  = EIGEN_FAST_MATH,
+    HasBlend = 1
+
+#ifdef EIGEN_VECTORIZE_SSE4_1
+    ,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1
+#endif
   };
 };
 template<> struct packet_traits<double> : default_packet_traits
 {
   typedef Packet2d type;
+  typedef Packet2d half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size=2,
+    HasHalfPacket = 0,
 
     HasDiv  = 1,
     HasExp  = 1,
-    HasSqrt = 1
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasBlend = 1
+
+#ifdef EIGEN_VECTORIZE_SSE4_1
+    ,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1
+#endif
   };
 };
+#endif
 template<> struct packet_traits<int>    : default_packet_traits
 {
   typedef Packet4i type;
+  typedef Packet4i half;
   enum {
-    // FIXME check the Has*
     Vectorizable = 1,
     AlignedOnScalar = 1,
-    size=4
+    size=4,
+
+    HasBlend = 1
   };
 };
 
-template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4}; };
-template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2}; };
-template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4}; };
+template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
+template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
+template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
 
-#if defined(_MSC_VER) && (_MSC_VER==1500)
+#ifndef EIGEN_VECTORIZE_AVX
+template<> struct scalar_div_cost<float,true> { enum { value = 7 }; };
+template<> struct scalar_div_cost<double,true> { enum { value = 8 }; };
+#endif
+
+#if EIGEN_COMP_MSVC==1500
 // Workaround MSVC 9 internal compiler error.
 // TODO: It has been detected with win64 builds (amd64), so let's check whether it also happens in 32bits+SSE mode
 // TODO: let's check whether there does not exist a better fix, like adding a pset0() function. (it crashed on pset1(0)).
@@ -110,14 +175,25 @@ template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) { re
 template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set_pd(from,from); }
 template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from) { return _mm_set_epi32(from,from,from,from); }
 #else
-template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) { return _mm_set1_ps(from); }
+template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) { return _mm_set_ps1(from); }
 template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set1_pd(from); }
 template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from) { return _mm_set1_epi32(from); }
 #endif
 
-template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) { return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); }
-template<> EIGEN_STRONG_INLINE Packet2d plset<double>(const double& a) { return _mm_add_pd(pset1<Packet2d>(a),_mm_set_pd(1,0)); }
-template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a) { return _mm_add_epi32(pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); }
+// GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction.
+// However, using inrinsics for pset1 makes gcc to generate crappy code in some cases (see bug 203)
+// Using inline assembly is also not an option because then gcc fails to reorder properly the instructions.
+// Therefore, we introduced the pload1 functions to be used in product kernels for which bug 203 does not apply.
+// Also note that with AVX, we want it to generate a vbroadcastss.
+#if EIGEN_COMP_GNUC_STRICT && (!defined __AVX__)
+template<> EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float *from) {
+  return vec4f_swizzle1(_mm_load_ss(from),0,0,0,0);
+}
+#endif
+  
+template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); }
+template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return _mm_add_pd(pset1<Packet2d>(a),_mm_set_pd(1,0)); }
+template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) { return _mm_add_epi32(pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); }
 
 template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_add_pd(a,b); }
@@ -139,7 +215,7 @@ template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a)
 }
 template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a)
 {
-  return psub(_mm_setr_epi32(0,0,0,0), a);
+  return psub(Packet4i(_mm_setr_epi32(0,0,0,0)), a);
 }
 
 template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
@@ -166,13 +242,13 @@ template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const
 
 template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_div_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_div_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
-{ eigen_assert(false && "packet integer division are not supported by SSE");
-  return pset1<Packet4i>(0);
-}
 
 // for some weird raisons, it has to be overloaded for packet of integers
 template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
+#ifdef __FMA__
+template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); }
+template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); }
+#endif
 
 template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_min_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_min_pd(a,b); }
@@ -200,6 +276,17 @@ template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const
 #endif
 }
 
+#ifdef EIGEN_VECTORIZE_SSE4_1
+template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) { return _mm_round_ps(a, 0); }
+template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return _mm_round_pd(a, 0); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) { return _mm_ceil_ps(a); }
+template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return _mm_ceil_pd(a); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return _mm_floor_ps(a); }
+template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return _mm_floor_pd(a); }
+#endif
+
 template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); }
@@ -216,42 +303,16 @@ template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, con
 template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(a,b); }
 
-template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float*   from) {
-  EIGEN_DEBUG_ALIGNED_LOAD
-#ifdef EIGEN_ANDROID_SSE_WR
-// Workaround for X86 on Android crash on aligned operation.
-  return _mm_loadu_ps(from);
-#else
-  return _mm_load_ps(from);
-#endif
-  }
-template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double*  from) {
-  EIGEN_DEBUG_ALIGNED_LOAD
-#ifdef EIGEN_ANDROID_SSE_WR
-// Workaround for X86 on Android crash on aligned operation.
-  return _mm_loadu_pd(from);
-#else
-  return _mm_load_pd(from);
-#endif
-  }
-template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from) {
-  EIGEN_DEBUG_ALIGNED_LOAD
-#ifdef EIGEN_ANDROID_SSE_WR
-// Workaround for X86 on Android crash on aligned operation.
-  return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from));
-#else
-  return _mm_load_si128(reinterpret_cast<const Packet4i*>(from));
-#endif
-  }
+template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float*   from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); }
+template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double*  from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
+template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }
 
-#if defined(_MSC_VER)
+#if EIGEN_COMP_MSVC
   template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float*  from) {
     EIGEN_DEBUG_UNALIGNED_LOAD
-    #if (_MSC_VER==1600)
+    #if (EIGEN_COMP_MSVC==1600)
     // NOTE Some version of MSVC10 generates bad code when using _mm_loadu_ps
     // (i.e., it does not generate an unaligned load!!
-    // TODO On most architectures this version should also be faster than a single _mm_loadu_ps
-    // so we could also enable it for MSVC08 but first we have to make this later does not generate crap when doing so...
     __m128 res = _mm_loadl_pi(_mm_set1_ps(0.0f), (const __m64*)(from));
     res = _mm_loadh_pi(res, (const __m64*)(from+2));
     return res;
@@ -259,78 +320,27 @@ template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from) {
     return _mm_loadu_ps(from);
     #endif
   }
-  template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); }
-  template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int*    from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from)); }
 #else
-// Fast unaligned loads. Note that here we cannot directly use intrinsics: this would
-// require pointer casting to incompatible pointer types and leads to invalid code
-// because of the strict aliasing rule. The "dummy" stuff are required to enforce
-// a correct instruction dependency.
-// TODO: do the same for MSVC (ICC is compatible)
 // NOTE: with the code below, MSVC's compiler crashes!
 
-#if defined(__GNUC__) && defined(__i386__)
-  // bug 195: gcc/i386 emits weird x87 fldl/fstpl instructions for _mm_load_sd
-  #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1
-#elif defined(__clang__)
-  // bug 201: Segfaults in __mm_loadh_pd with clang 2.8
-  #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1
-#else
-  #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 0
-#endif
-
 template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
 {
   EIGEN_DEBUG_UNALIGNED_LOAD
-#if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS
   return _mm_loadu_ps(from);
-#else
-  __m128d res;
-#ifdef EIGEN_ANDROID_SSE_WR
-// Workaround for X86 on Android crash on aligned operation.
-  res =  _mm_loadu_sd((const double*)(from)) ;
-#else
-  res =  _mm_load_sd((const double*)(from)) ;
-#endif
-  res =  _mm_loadh_pd(res, (const double*)(from+2)) ;
-  return _mm_castpd_ps(res);
-#endif
 }
+#endif
+
 template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
 {
   EIGEN_DEBUG_UNALIGNED_LOAD
-#if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS
   return _mm_loadu_pd(from);
-#else
-  __m128d res;
-#ifdef EIGEN_ANDROID_SSE_WR
-// Workaround for X86 on Android crash on aligned operation.
-  res = _mm_loadu_sd(from) ;
-#else
-  res = _mm_load_sd(from) ;
-#endif
-  res = _mm_loadh_pd(res,from+1);
-  return res;
-#endif
 }
 template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
 {
   EIGEN_DEBUG_UNALIGNED_LOAD
-#if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS
-  return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from));
-#else
-  __m128d res;
-#ifdef EIGEN_ANDROID_SSE_WR
-// Workaround for X86 on Android crash on aligned operation.
-  res =  _mm_loadu_sd((const double*)(from)) ;
-#else
-  res =  _mm_load_sd((const double*)(from)) ;
-#endif
-  res =  _mm_loadh_pd(res, (const double*)(from+2)) ;
-  return _mm_castpd_si128(res);
-#endif
+  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
 }
-#endif
+
 
 template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float*   from)
 {
@@ -341,70 +351,77 @@ template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double*  from)
 template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int*     from)
 {
   Packet4i tmp;
-  tmp = _mm_loadl_epi64(reinterpret_cast<const Packet4i*>(from));
+  tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(from));
   return vec4i_swizzle1(tmp, 0, 0, 1, 1);
 }
 
-template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from) {
-  EIGEN_DEBUG_ALIGNED_STORE
-#ifdef EIGEN_ANDROID_SSE_WR
-// Workaround for X86 on Android crash on aligned operation.
-  _mm_storeu_ps(to, from);
-#else
-  _mm_store_ps(to, from);
-#endif
-  }
-template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
-  EIGEN_DEBUG_ALIGNED_STORE
-#ifdef EIGEN_ANDROID_SSE_WR
-// Workaround for X86 on Android crash on aligned operation.
-  _mm_storeu_pd(to, from);
-#else
-  _mm_store_pd(to, from);
-#endif
-  }
-template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from) {
-  EIGEN_DEBUG_ALIGNED_STORE
-#ifdef EIGEN_ANDROID_SSE_WR
-// Workaround for X86 on Android crash on aligned operation.
-  _mm_storeu_si128(reinterpret_cast<Packet4i*>(to), from);
-#else
-  _mm_store_si128(reinterpret_cast<Packet4i*>(to), from);
-#endif
-  }
+template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); }
+template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); }
+template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }
+
+template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_pd(to, from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ps(to, from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); }
 
-template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE
-  _mm_storel_pd((to), from);
-  _mm_storeh_pd((to+1), from);
+template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
+{
+ return _mm_set_ps(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
+}
+template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
+{
+ return _mm_set_pd(from[1*stride], from[0*stride]);
+}
+template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
+{
+ return _mm_set_epi32(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
+ }
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
+{
+  to[stride*0] = _mm_cvtss_f32(from);
+  to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 1));
+  to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 2));
+  to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 3));
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
+{
+  to[stride*0] = _mm_cvtsd_f64(from);
+  to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(from, from, 1));
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
+{
+  to[stride*0] = _mm_cvtsi128_si32(from);
+  to[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
+  to[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
+  to[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
 }
-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*  to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), _mm_castps_pd(from)); }
-template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*      to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), _mm_castsi128_pd(from)); }
 
 // some compilers might be tempted to perform multiple moves instead of using a vector path.
 template<> EIGEN_STRONG_INLINE void pstore1<Packet4f>(float* to, const float& a)
 {
   Packet4f pa = _mm_set_ss(a);
-  pstore(to, vec4f_swizzle1(pa,0,0,0,0));
+  pstore(to, Packet4f(vec4f_swizzle1(pa,0,0,0,0)));
 }
 // some compilers might be tempted to perform multiple moves instead of using a vector path.
 template<> EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double& a)
 {
   Packet2d pa = _mm_set_sd(a);
-  pstore(to, vec2d_swizzle1(pa,0,0));
+  pstore(to, Packet2d(vec2d_swizzle1(pa,0,0)));
 }
 
+#ifndef EIGEN_VECTORIZE_AVX
 template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
 template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
 template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+#endif
 
-#if defined(_MSC_VER) && defined(_WIN64) && !defined(__INTEL_COMPILER)
+#if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64
 // The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
 // Direct of the struct members fixed bug #62.
 template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { return a.m128_f32[0]; }
 template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return a.m128d_f64[0]; }
 template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }
-#elif defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+#elif EIGEN_COMP_MSVC_STRICT
 // The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
 template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float x = _mm_cvtss_f32(a); return x; }
 template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double x = _mm_cvtsd_f64(a); return x; }
@@ -422,7 +439,6 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
 template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
 { return _mm_shuffle_epi32(a,0x1B); }
 
-
 template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a)
 {
   const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
@@ -443,6 +459,38 @@ template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a)
   #endif
 }
 
+// with AVX, the default implementations based on pload1 are faster
+#ifndef __AVX__
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet4f>(const float *a,
+                      Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
+{
+  a3 = pload<Packet4f>(a);
+  a0 = vec4f_swizzle1(a3, 0,0,0,0);
+  a1 = vec4f_swizzle1(a3, 1,1,1,1);
+  a2 = vec4f_swizzle1(a3, 2,2,2,2);
+  a3 = vec4f_swizzle1(a3, 3,3,3,3);
+}
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet2d>(const double *a,
+                      Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
+{
+#ifdef EIGEN_VECTORIZE_SSE3
+  a0 = _mm_loaddup_pd(a+0);
+  a1 = _mm_loaddup_pd(a+1);
+  a2 = _mm_loaddup_pd(a+2);
+  a3 = _mm_loaddup_pd(a+3);
+#else
+  a1 = pload<Packet2d>(a);
+  a0 = vec2d_swizzle1(a1, 0,0);
+  a1 = vec2d_swizzle1(a1, 1,1);
+  a3 = pload<Packet2d>(a+2);
+  a2 = vec2d_swizzle1(a3, 0,0);
+  a3 = vec2d_swizzle1(a3, 1,1);
+#endif
+}
+#endif
+
 EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs)
 {
   vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55));
@@ -452,47 +500,17 @@ EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs)
 }
 
 #ifdef EIGEN_VECTORIZE_SSE3
-// TODO implement SSE2 versions as well as integer versions
 template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
 {
   return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3]));
 }
+
 template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
 {
   return _mm_hadd_pd(vecs[0], vecs[1]);
 }
-// SSSE3 version:
-// EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs)
-// {
-//   return _mm_hadd_epi32(_mm_hadd_epi32(vecs[0], vecs[1]),_mm_hadd_epi32(vecs[2], vecs[3]));
-// }
-
-template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
-{
-  Packet4f tmp0 = _mm_hadd_ps(a,a);
-  return pfirst(_mm_hadd_ps(tmp0, tmp0));
-}
-
-template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return pfirst(_mm_hadd_pd(a, a)); }
 
-// SSSE3 version:
-// EIGEN_STRONG_INLINE float predux(const Packet4i& a)
-// {
-//   Packet4i tmp0 = _mm_hadd_epi32(a,a);
-//   return pfirst(_mm_hadd_epi32(tmp0, tmp0));
-// }
 #else
-// SSE2 versions
-template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
-{
-  Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a));
-  return pfirst(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
-}
-template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
-{
-  return pfirst(_mm_add_sd(a, _mm_unpackhi_pd(a,a)));
-}
-
 template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
 {
   Packet4f tmp0, tmp1, tmp2;
@@ -513,10 +531,45 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
 }
 #endif  // SSE3
 
+template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
+{
+  // Disable SSE3 _mm_hadd_pd that is extremely slow on all existing Intel's architectures
+  // (from Nehalem to Haswell)
+// #ifdef EIGEN_VECTORIZE_SSE3
+//   Packet4f tmp = _mm_add_ps(a, vec4f_swizzle1(a,2,3,2,3));
+//   return pfirst<Packet4f>(_mm_hadd_ps(tmp, tmp));
+// #else
+  Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a));
+  return pfirst<Packet4f>(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+// #endif
+}
+
+template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
+{
+  // Disable SSE3 _mm_hadd_pd that is extremely slow on all existing Intel's architectures
+  // (from Nehalem to Haswell)
+// #ifdef EIGEN_VECTORIZE_SSE3
+//   return pfirst<Packet2d>(_mm_hadd_pd(a, a));
+// #else
+  return pfirst<Packet2d>(_mm_add_sd(a, _mm_unpackhi_pd(a,a)));
+// #endif
+}
+
+#ifdef EIGEN_VECTORIZE_SSSE3
+template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
+{
+  return _mm_hadd_epi32(_mm_hadd_epi32(vecs[0], vecs[1]),_mm_hadd_epi32(vecs[2], vecs[3]));
+}
+template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
+{
+  Packet4i tmp0 = _mm_hadd_epi32(a,a);
+  return pfirst<Packet4i>(_mm_hadd_epi32(tmp0,tmp0));
+}
+#else
 template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
 {
   Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a));
-  return pfirst(tmp) + pfirst(_mm_shuffle_epi32(tmp, 1));
+  return pfirst(tmp) + pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1));
 }
 
 template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
@@ -532,18 +585,18 @@ template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
   tmp0 = _mm_unpackhi_epi64(tmp0, tmp1);
   return _mm_add_epi32(tmp0, tmp2);
 }
-
+#endif
 // Other reduction functions:
 
 // mul
 template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
 {
   Packet4f tmp = _mm_mul_ps(a, _mm_movehl_ps(a,a));
-  return pfirst(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+  return pfirst<Packet4f>(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
 }
 template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
 {
-  return pfirst(_mm_mul_sd(a, _mm_unpackhi_pd(a,a)));
+  return pfirst<Packet2d>(_mm_mul_sd(a, _mm_unpackhi_pd(a,a)));
 }
 template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
 {
@@ -559,14 +612,18 @@ template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
 template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
 {
   Packet4f tmp = _mm_min_ps(a, _mm_movehl_ps(a,a));
-  return pfirst(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+  return pfirst<Packet4f>(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
 }
 template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
 {
-  return pfirst(_mm_min_sd(a, _mm_unpackhi_pd(a,a)));
+  return pfirst<Packet2d>(_mm_min_sd(a, _mm_unpackhi_pd(a,a)));
 }
 template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
 {
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  Packet4i tmp = _mm_min_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
+  return pfirst<Packet4i>(_mm_min_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
+#else
   // after some experiments, it is seems this is the fastest way to implement it
   // for GCC (eg., it does not like using std::min after the pstore !!)
   EIGEN_ALIGN16 int aux[4];
@@ -574,20 +631,25 @@ template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
   int aux0 = aux[0]<aux[1] ? aux[0] : aux[1];
   int aux2 = aux[2]<aux[3] ? aux[2] : aux[3];
   return aux0<aux2 ? aux0 : aux2;
+#endif // EIGEN_VECTORIZE_SSE4_1
 }
 
 // max
 template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
 {
   Packet4f tmp = _mm_max_ps(a, _mm_movehl_ps(a,a));
-  return pfirst(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+  return pfirst<Packet4f>(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
 }
 template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
 {
-  return pfirst(_mm_max_sd(a, _mm_unpackhi_pd(a,a)));
+  return pfirst<Packet2d>(_mm_max_sd(a, _mm_unpackhi_pd(a,a)));
 }
 template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
 {
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  Packet4i tmp = _mm_max_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
+  return pfirst<Packet4i>(_mm_max_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
+#else
   // after some experiments, it is seems this is the fastest way to implement it
   // for GCC (eg., it does not like using std::min after the pstore !!)
   EIGEN_ALIGN16 int aux[4];
@@ -595,9 +657,10 @@ template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
   int aux0 = aux[0]>aux[1] ? aux[0] : aux[1];
   int aux2 = aux[2]>aux[3] ? aux[2] : aux[3];
   return aux0>aux2 ? aux0 : aux2;
+#endif // EIGEN_VECTORIZE_SSE4_1
 }
 
-#if (defined __GNUC__)
+#if EIGEN_COMP_GNUC
 // template <> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f&  a, const Packet4f&  b, const Packet4f&  c)
 // {
 //   Packet4f res = b;
@@ -705,6 +768,110 @@ struct palign_impl<Offset,Packet2d>
 };
 #endif
 
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4f,4>& kernel) {
+  _MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet2d,2>& kernel) {
+  __m128d tmp = _mm_unpackhi_pd(kernel.packet[0], kernel.packet[1]);
+  kernel.packet[0] = _mm_unpacklo_pd(kernel.packet[0], kernel.packet[1]);
+  kernel.packet[1] = tmp;
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4i,4>& kernel) {
+  __m128i T0 = _mm_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);
+  __m128i T1 = _mm_unpacklo_epi32(kernel.packet[2], kernel.packet[3]);
+  __m128i T2 = _mm_unpackhi_epi32(kernel.packet[0], kernel.packet[1]);
+  __m128i T3 = _mm_unpackhi_epi32(kernel.packet[2], kernel.packet[3]);
+
+  kernel.packet[0] = _mm_unpacklo_epi64(T0, T1);
+  kernel.packet[1] = _mm_unpackhi_epi64(T0, T1);
+  kernel.packet[2] = _mm_unpacklo_epi64(T2, T3);
+  kernel.packet[3] = _mm_unpackhi_epi64(T2, T3);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
+  __m128i false_mask = _mm_cmpeq_epi32(select, zero);
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return _mm_blendv_epi8(thenPacket, elsePacket, false_mask);
+#else
+  return _mm_or_si128(_mm_andnot_si128(false_mask, thenPacket), _mm_and_si128(false_mask, elsePacket));
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
+  const __m128 zero = _mm_setzero_ps();
+  const __m128 select = _mm_set_ps(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
+  __m128 false_mask = _mm_cmpeq_ps(select, zero);
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return _mm_blendv_ps(thenPacket, elsePacket, false_mask);
+#else
+  return _mm_or_ps(_mm_andnot_ps(false_mask, thenPacket), _mm_and_ps(false_mask, elsePacket));
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
+  const __m128d zero = _mm_setzero_pd();
+  const __m128d select = _mm_set_pd(ifPacket.select[1], ifPacket.select[0]);
+  __m128d false_mask = _mm_cmpeq_pd(select, zero);
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return _mm_blendv_pd(thenPacket, elsePacket, false_mask);
+#else
+  return _mm_or_pd(_mm_andnot_pd(false_mask, thenPacket), _mm_and_pd(false_mask, elsePacket));
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pinsertfirst(const Packet4f& a, float b)
+{
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return _mm_blend_ps(a,pset1<Packet4f>(b),1);
+#else
+  return _mm_move_ss(a, _mm_load_ss(&b));
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pinsertfirst(const Packet2d& a, double b)
+{
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return _mm_blend_pd(a,pset1<Packet2d>(b),1);
+#else
+  return _mm_move_sd(a, _mm_load_sd(&b));
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pinsertlast(const Packet4f& a, float b)
+{
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return _mm_blend_ps(a,pset1<Packet4f>(b),(1<<3));
+#else
+  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x0,0x0,0x0,0xFFFFFFFF));
+  return _mm_or_ps(_mm_andnot_ps(mask, a), _mm_and_ps(mask, pset1<Packet4f>(b)));
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pinsertlast(const Packet2d& a, double b)
+{
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return _mm_blend_pd(a,pset1<Packet2d>(b),(1<<1));
+#else
+  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x0,0xFFFFFFFF,0xFFFFFFFF));
+  return _mm_or_pd(_mm_andnot_pd(mask, a), _mm_and_pd(mask, pset1<Packet2d>(b)));
+#endif
+}
+
+// Scalar path for pmadd with FMA to ensure consistency with vectorized path.
+#ifdef __FMA__
+template<> EIGEN_STRONG_INLINE float pmadd(const float& a, const float& b, const float& c) {
+  return ::fmaf(a,b,c);
+}
+template<> EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, const double& c) {
+  return ::fma(a,b,c);
+}
+#endif
+
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/Eigen/src/Core/arch/SSE/TypeCasting.h b/Eigen/src/Core/arch/SSE/TypeCasting.h
new file mode 100644
index 000000000..c84893230
--- /dev/null
+++ b/Eigen/src/Core/arch/SSE/TypeCasting.h
@@ -0,0 +1,77 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TYPE_CASTING_SSE_H
+#define EIGEN_TYPE_CASTING_SSE_H
+
+namespace Eigen {
+
+namespace internal {
+
+template <>
+struct type_casting_traits<float, int> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
+  return _mm_cvttps_epi32(a);
+}
+
+
+template <>
+struct type_casting_traits<int, float> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
+  return _mm_cvtepi32_ps(a);
+}
+
+
+template <>
+struct type_casting_traits<double, float> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 2,
+    TgtCoeffRatio = 1
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet2d, Packet4f>(const Packet2d& a, const Packet2d& b) {
+  return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6));
+}
+
+template <>
+struct type_casting_traits<float, double> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 2
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet2d pcast<Packet4f, Packet2d>(const Packet4f& a) {
+  // Simply discard the second half of the input
+  return _mm_cvtps_pd(a);
+}
+
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TYPE_CASTING_SSE_H
diff --git a/Eigen/src/Core/arch/ZVector/Complex.h b/Eigen/src/Core/arch/ZVector/Complex.h
new file mode 100644
index 000000000..d39d2d105
--- /dev/null
+++ b/Eigen/src/Core/arch/ZVector/Complex.h
@@ -0,0 +1,394 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2016 Konstantinos Margaritis <markos@freevec.org>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPLEX32_ALTIVEC_H
+#define EIGEN_COMPLEX32_ALTIVEC_H
+
+namespace Eigen {
+
+namespace internal {
+
+static Packet2ul  p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_ZERO_, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
+static Packet2ul  p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO,  (Packet4ui) p2d_ZERO_, 8);//{ 0x8000000000000000, 0x0000000000000000 };
+
+struct Packet1cd
+{
+  EIGEN_STRONG_INLINE Packet1cd() {}
+  EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {}
+  Packet2d v;
+};
+
+struct Packet2cf
+{
+  EIGEN_STRONG_INLINE Packet2cf() {}
+  EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
+  union {
+    Packet4f v;
+    Packet1cd cd[2];
+  };
+};
+
+template<> struct packet_traits<std::complex<float> >  : default_packet_traits
+{
+  typedef Packet2cf type;
+  typedef Packet2cf half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
+    HasHalfPacket = 0,
+
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 1,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasBlend  = 1,
+    HasSetLinear = 0
+  };
+};
+
+
+template<> struct packet_traits<std::complex<double> >  : default_packet_traits
+{
+  typedef Packet1cd type;
+  typedef Packet1cd half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 1,
+    HasHalfPacket = 0,
+
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 1,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasSetLinear = 0
+  };
+};
+
+template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float>  type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; };
+template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; };
+
+/* Forward declaration */
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel);
+
+template<> EIGEN_STRONG_INLINE Packet2cf pload <Packet2cf>(const std::complex<float>* from)  { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>((const float*)from)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from)  { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>((const float*)from)); }
+template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *     to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *     to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>&  from)
+{ /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
+
+template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
+{
+  Packet2cf res;
+  res.cd[0] = Packet1cd(vec_ld2f((const float *)&from));
+  res.cd[1] = res.cd[0];
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
+{
+  std::complex<float> EIGEN_ALIGN16 af[2];
+  af[0] = from[0*stride];
+  af[1] = from[1*stride];
+  return pload<Packet2cf>(af);
+}
+template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride EIGEN_UNUSED)
+{
+  return pload<Packet1cd>(from);
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
+{
+  std::complex<float> EIGEN_ALIGN16 af[2];
+  pstore<std::complex<float> >((std::complex<float> *) af, from);
+  to[0*stride] = af[0];
+  to[1*stride] = af[1];
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index stride EIGEN_UNUSED)
+{
+  pstore<std::complex<double> >(to, from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(padd<Packet4f>(a.v, b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); }
+template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub<Packet4f>(a.v, b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v - b.v); }
+template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(Packet4f(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd((Packet2d)vec_xor((Packet2d)a.v, (Packet2d)p2ul_CONJ_XOR2)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
+{
+  Packet2cf res;
+  res.v.v4f[0] = pconj(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[0]))).v;
+  res.v.v4f[1] = pconj(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[1]))).v;
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{
+  Packet2d a_re, a_im, v1, v2;
+
+  // Permute and multiply the real parts of a and b
+  a_re = vec_perm(a.v, a.v, p16uc_PSET64_HI);
+  // Get the imaginary parts of a
+  a_im = vec_perm(a.v, a.v, p16uc_PSET64_LO);
+  // multiply a_re * b
+  v1 = vec_madd(a_re, b.v, p2d_ZERO);
+  // multiply a_im * b and get the conjugate result
+  v2 = vec_madd(a_im, b.v, p2d_ZERO);
+  v2 = (Packet2d) vec_sld((Packet4ui)v2, (Packet4ui)v2, 8);
+  v2 = (Packet2d) vec_xor((Packet2d)v2, (Packet2d) p2ul_CONJ_XOR1);
+
+  return Packet1cd(v1 + v2);
+}
+template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  Packet2cf res;
+  res.v.v4f[0] = pmul(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[0])), Packet1cd(reinterpret_cast<Packet2d>(b.v.v4f[0]))).v;
+  res.v.v4f[1] = pmul(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[1])), Packet1cd(reinterpret_cast<Packet2d>(b.v.v4f[1]))).v;
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd pand   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand<Packet4f>(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd por    <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_or(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por<Packet4f>(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pxor   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_xor(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pxor<Packet4f>(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v, vec_nor(b.v,b.v))); }
+template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pandnot<Packet4f>(a.v,b.v)); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>*     from) {  return pset1<Packet1cd>(*from); }
+template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>*      from) {  return pset1<Packet2cf>(*from); }
+
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *     addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> *   addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
+
+template<> EIGEN_STRONG_INLINE std::complex<double>  pfirst<Packet1cd>(const Packet1cd& a)
+{
+  std::complex<double> EIGEN_ALIGN16 res;
+  pstore<std::complex<double> >(&res, a);
+
+  return res;
+}
+template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Packet2cf& a)
+{
+  std::complex<float> EIGEN_ALIGN16 res[2];
+  pstore<std::complex<float> >(res, a);
+
+  return res[0];
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
+{
+  Packet2cf res;
+  res.cd[0] = a.cd[1];
+  res.cd[1] = a.cd[0];
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a)
+{
+  return pfirst(a);
+}
+template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
+{
+  std::complex<float> res;
+  Packet1cd b = padd<Packet1cd>(a.cd[0], a.cd[1]);
+  vec_st2f(b.v, (float*)&res);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs)
+{
+  return vecs[0];
+}
+template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs)
+{
+  PacketBlock<Packet2cf,2> transpose;
+  transpose.packet[0] = vecs[0];
+  transpose.packet[1] = vecs[1];
+  ptranspose(transpose);
+
+  return padd<Packet2cf>(transpose.packet[0], transpose.packet[1]);
+} 
+
+template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a)
+{
+  return pfirst(a);
+}
+template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
+{
+  std::complex<float> res;
+  Packet1cd b = pmul<Packet1cd>(a.cd[0], a.cd[1]);
+  vec_st2f(b.v, (float*)&res);
+  return res;
+}
+
+template<int Offset>
+struct palign_impl<Offset,Packet1cd>
+{
+  static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/)
+  {
+    // FIXME is it sure we never have to align a Packet1cd?
+    // Even though a std::complex<double> has 16 bytes, it is not necessarily aligned on a 16 bytes boundary...
+  }
+};
+
+template<int Offset>
+struct palign_impl<Offset,Packet2cf>
+{
+  static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second)
+  {
+    if (Offset == 1) {
+      first.cd[0] = first.cd[1];
+      first.cd[1] = second.cd[0];
+    }
+  }
+};
+
+template<> struct conj_helper<Packet1cd, Packet1cd, false,true>
+{
+  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
+  {
+    return internal::pmul(a, pconj(b));
+  }
+};
+
+template<> struct conj_helper<Packet1cd, Packet1cd, true,false>
+{
+  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
+  {
+    return internal::pmul(pconj(a), b);
+  }
+};
+
+template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
+{
+  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
+  {
+    return pconj(internal::pmul(a, b));
+  }
+};
+
+template<> struct conj_helper<Packet2cf, Packet2cf, false,true>
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
+  {
+    return internal::pmul(a, pconj(b));
+  }
+};
+
+template<> struct conj_helper<Packet2cf, Packet2cf, true,false>
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
+  {
+    return internal::pmul(pconj(a), b);
+  }
+};
+
+template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
+  {
+    return pconj(internal::pmul(a, b));
+  }
+};
+
+template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{
+  // TODO optimize it for AltiVec
+  Packet1cd res = conj_helper<Packet1cd,Packet1cd,false,true>().pmul(a,b);
+  Packet2d s = vec_madd(b.v, b.v, p2d_ZERO_);
+  return Packet1cd(pdiv(res.v, s + vec_perm(s, s, p16uc_REVERSE64)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  // TODO optimize it for AltiVec
+  Packet2cf res;
+  res.cd[0] = pdiv<Packet1cd>(a.cd[0], b.cd[0]);
+  res.cd[1] = pdiv<Packet1cd>(a.cd[1], b.cd[1]);
+  return res;
+}
+
+EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
+{
+  return Packet1cd(preverse(Packet2d(x.v)));
+}
+
+EIGEN_STRONG_INLINE Packet2cf pcplxflip/*<Packet2cf>*/(const Packet2cf& x)
+{
+  Packet2cf res;
+  res.cd[0] = pcplxflip(x.cd[0]);
+  res.cd[1] = pcplxflip(x.cd[1]);
+  return res;
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd,2>& kernel)
+{
+  Packet2d tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI);
+  kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO);
+  kernel.packet[0].v = tmp;
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel)
+{
+  Packet1cd tmp = kernel.packet[0].cd[1];
+  kernel.packet[0].cd[1] = kernel.packet[1].cd[0];
+  kernel.packet[1].cd[0] = tmp;
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
+  Packet2cf result;
+  const Selector<4> ifPacket4 = { ifPacket.select[0], ifPacket.select[0], ifPacket.select[1], ifPacket.select[1] };
+  result.v = pblend<Packet4f>(ifPacket4, thenPacket.v, elsePacket.v);
+  return result;
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_COMPLEX32_ALTIVEC_H
diff --git a/Eigen/src/Core/arch/ZVector/MathFunctions.h b/Eigen/src/Core/arch/ZVector/MathFunctions.h
new file mode 100644
index 000000000..5c7aa7256
--- /dev/null
+++ b/Eigen/src/Core/arch/ZVector/MathFunctions.h
@@ -0,0 +1,137 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2007 Julien Pommier
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2016 Konstantinos Margaritis <markos@freevec.org>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/* The sin, cos, exp, and log functions of this file come from
+ * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
+ */
+
+#ifndef EIGEN_MATH_FUNCTIONS_ALTIVEC_H
+#define EIGEN_MATH_FUNCTIONS_ALTIVEC_H
+
+namespace Eigen {
+
+namespace internal {
+
+static _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
+static _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
+static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
+
+static _EIGEN_DECLARE_CONST_Packet2d(exp_hi,  709.437);
+static _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303);
+
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
+
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
+
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
+
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet2d pexp<Packet2d>(const Packet2d& _x)
+{
+  Packet2d x = _x;
+
+  Packet2d tmp, fx;
+  Packet2l emm0;
+
+  // clamp x
+  x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo);
+  /* express exp(x) as exp(g + n*log(2)) */
+  fx = pmadd(p2d_cephes_LOG2EF, x, p2d_half);
+
+  fx = vec_floor(fx);
+
+  tmp = pmul(fx, p2d_cephes_exp_C1);
+  Packet2d z = pmul(fx, p2d_cephes_exp_C2);
+  x = psub(x, tmp);
+  x = psub(x, z);
+
+  Packet2d x2 = pmul(x,x);
+
+  Packet2d px = p2d_cephes_exp_p0;
+  px = pmadd(px, x2, p2d_cephes_exp_p1);
+  px = pmadd(px, x2, p2d_cephes_exp_p2);
+  px = pmul (px, x);
+
+  Packet2d qx = p2d_cephes_exp_q0;
+  qx = pmadd(qx, x2, p2d_cephes_exp_q1);
+  qx = pmadd(qx, x2, p2d_cephes_exp_q2);
+  qx = pmadd(qx, x2, p2d_cephes_exp_q3);
+
+  x = pdiv(px,psub(qx,px));
+  x = pmadd(p2d_2,x,p2d_1);
+
+  // build 2^n
+  emm0 = vec_ctsl(fx, 0);
+
+  static const Packet2l p2l_1023 = { 1023, 1023 };
+  static const Packet2ul p2ul_52 = { 52, 52 };
+
+  emm0 = emm0 + p2l_1023;
+  emm0 = emm0 << reinterpret_cast<Packet2l>(p2ul_52);
+
+  // Altivec's max & min operators just drop silent NaNs. Check NaNs in 
+  // inputs and return them unmodified.
+  Packet2ul isnumber_mask = reinterpret_cast<Packet2ul>(vec_cmpeq(_x, _x));
+  return vec_sel(_x, pmax(pmul(x, reinterpret_cast<Packet2d>(emm0)), _x),
+                 isnumber_mask);
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f pexp<Packet4f>(const Packet4f& x)
+{
+  Packet4f res;
+  res.v4f[0] = pexp<Packet2d>(x.v4f[0]);
+  res.v4f[1] = pexp<Packet2d>(x.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet2d psqrt<Packet2d>(const Packet2d& x)
+{
+  return  __builtin_s390_vfsqdb(x);
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f psqrt<Packet4f>(const Packet4f& x)
+{
+  Packet4f res;
+  res.v4f[0] = psqrt<Packet2d>(x.v4f[0]);
+  res.v4f[1] = psqrt<Packet2d>(x.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet2d prsqrt<Packet2d>(const Packet2d& x) {
+  // Unfortunately we can't use the much faster mm_rqsrt_pd since it only provides an approximation.
+  return pset1<Packet2d>(1.0) / psqrt<Packet2d>(x);
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f prsqrt<Packet4f>(const Packet4f& x) {
+  Packet4f res;
+  res.v4f[0] = prsqrt<Packet2d>(x.v4f[0]);
+  res.v4f[1] = prsqrt<Packet2d>(x.v4f[1]);
+  return res;
+}
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_MATH_FUNCTIONS_ALTIVEC_H
diff --git a/Eigen/src/Core/arch/ZVector/PacketMath.h b/Eigen/src/Core/arch/ZVector/PacketMath.h
new file mode 100755
index 000000000..57b01fc63
--- /dev/null
+++ b/Eigen/src/Core/arch/ZVector/PacketMath.h
@@ -0,0 +1,945 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Konstantinos Margaritis <markos@freevec.org>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_ZVECTOR_H
+#define EIGEN_PACKET_MATH_ZVECTOR_H
+
+#include <stdint.h>
+
+namespace Eigen {
+
+namespace internal {
+
+#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4
+#endif
+
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#endif
+
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
+#endif
+
+#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS  16
+#endif
+
+typedef __vector int                 Packet4i;
+typedef __vector unsigned int        Packet4ui;
+typedef __vector __bool int          Packet4bi;
+typedef __vector short int           Packet8i;
+typedef __vector unsigned char       Packet16uc;
+typedef __vector double              Packet2d;
+typedef __vector unsigned long long  Packet2ul;
+typedef __vector long long           Packet2l;
+
+typedef struct {
+	Packet2d  v4f[2];
+} Packet4f;
+
+typedef union {
+  int32_t   i[4];
+  uint32_t ui[4];
+  int64_t   l[2];
+  uint64_t ul[2];
+  double    d[2];
+  Packet4i  v4i;
+  Packet4ui v4ui;
+  Packet2l  v2l;
+  Packet2ul v2ul;
+  Packet2d  v2d;
+} Packet;
+
+// We don't want to write the same code all the time, but we need to reuse the constants
+// and it doesn't really work to declare them global, so we define macros instead
+
+#define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \
+  Packet4i p4i_##NAME = reinterpret_cast<Packet4i>(vec_splat_s32(X))
+
+#define _EIGEN_DECLARE_CONST_FAST_Packet2d(NAME,X) \
+  Packet2d p2d_##NAME = reinterpret_cast<Packet2d>(vec_splat_s64(X))
+
+#define _EIGEN_DECLARE_CONST_FAST_Packet2l(NAME,X) \
+  Packet2l p2l_##NAME = reinterpret_cast<Packet2l>(vec_splat_s64(X))
+
+#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
+  Packet4i p4i_##NAME = pset1<Packet4i>(X)
+
+#define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \
+  Packet2d p2d_##NAME = pset1<Packet2d>(X)
+
+#define _EIGEN_DECLARE_CONST_Packet2l(NAME,X) \
+  Packet2l p2l_##NAME = pset1<Packet2l>(X)
+
+// These constants are endian-agnostic
+//static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
+static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE, 1); //{ 1, 1, 1, 1}
+
+static _EIGEN_DECLARE_CONST_FAST_Packet2d(ZERO, 0);
+static _EIGEN_DECLARE_CONST_FAST_Packet2l(ZERO, 0);
+static _EIGEN_DECLARE_CONST_FAST_Packet2l(ONE, 1);
+
+static Packet2d p2d_ONE = { 1.0, 1.0 }; 
+static Packet2d p2d_ZERO_ = { -0.0, -0.0 };
+
+static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 };
+static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 };
+static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet16uc>(p2d_ZERO), reinterpret_cast<Packet16uc>(p2d_ONE), 8));
+
+static Packet16uc p16uc_PSET64_HI = { 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
+static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 };
+
+// Mask alignment
+#define _EIGEN_MASK_ALIGNMENT	0xfffffffffffffff0
+
+#define _EIGEN_ALIGNED_PTR(x)	((std::ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT)
+
+// Handle endianness properly while loading constants
+// Define global static constants:
+
+static Packet16uc p16uc_FORWARD =   { 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15 };
+static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 };
+static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
+
+static Packet16uc p16uc_PSET32_WODD   = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
+static Packet16uc p16uc_PSET32_WEVEN  = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
+/*static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8);      //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
+
+static Packet16uc p16uc_PSET64_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN);     //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };*/
+static Packet16uc p16uc_PSET64_LO = (Packet16uc) vec_mergel((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN);     //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
+/*static Packet16uc p16uc_TRANSPOSE64_HI = vec_add(p16uc_PSET64_HI, p16uc_HALF64_0_16);                                         //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
+static Packet16uc p16uc_TRANSPOSE64_LO = vec_add(p16uc_PSET64_LO, p16uc_HALF64_0_16);                                         //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};*/
+static Packet16uc p16uc_TRANSPOSE64_HI = { 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
+static Packet16uc p16uc_TRANSPOSE64_LO = { 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};
+
+//static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8);                                         //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
+
+//static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8);                                            //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
+
+
+#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
+  #define EIGEN_ZVECTOR_PREFETCH(ADDR) __builtin_prefetch(ADDR);
+#else
+  #define EIGEN_ZVECTOR_PREFETCH(ADDR) asm( "   pfd [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
+#endif
+
+template<> struct packet_traits<int>    : default_packet_traits
+{
+  typedef Packet4i type;
+  typedef Packet4i half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+    HasHalfPacket = 0,
+
+    HasAdd  = 1,
+    HasSub  = 1,
+    HasMul  = 1,
+    HasDiv  = 1,
+    HasBlend = 1
+  };
+};
+
+template<> struct packet_traits<float> : default_packet_traits
+{
+  typedef Packet4f type;
+  typedef Packet4f half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=4,
+    HasHalfPacket = 0,
+
+    HasAdd  = 1,
+    HasSub  = 1,
+    HasMul  = 1,
+    HasDiv  = 1,
+    HasMin  = 1,
+    HasMax  = 1,
+    HasAbs  = 1,
+    HasSin  = 0,
+    HasCos  = 0,
+    HasLog  = 0,
+    HasExp  = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasNegate = 1,
+    HasBlend = 1
+  };
+};
+
+template<> struct packet_traits<double> : default_packet_traits
+{
+  typedef Packet2d type;
+  typedef Packet2d half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=2,
+    HasHalfPacket = 1,
+
+    HasAdd  = 1,
+    HasSub  = 1,
+    HasMul  = 1,
+    HasDiv  = 1,
+    HasMin  = 1,
+    HasMax  = 1,
+    HasAbs  = 1,
+    HasSin  = 0,
+    HasCos  = 0,
+    HasLog  = 0,
+    HasExp  = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasNegate = 1,
+    HasBlend = 1
+  };
+};
+
+template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
+template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
+template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
+
+/* Forward declaration */
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f,4>& kernel);
+ 
+inline std::ostream & operator <<(std::ostream & s, const Packet4i & v)
+{
+  Packet vt;
+  vt.v4i = v;
+  s << vt.i[0] << ", " << vt.i[1] << ", " << vt.i[2] << ", " << vt.i[3];
+  return s;
+}
+
+inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v)
+{
+  Packet vt;
+  vt.v4ui = v;
+  s << vt.ui[0] << ", " << vt.ui[1] << ", " << vt.ui[2] << ", " << vt.ui[3];
+  return s;
+}
+
+inline std::ostream & operator <<(std::ostream & s, const Packet2l & v)
+{
+  Packet vt;
+  vt.v2l = v;
+  s << vt.l[0] << ", " << vt.l[1];
+  return s;
+}
+
+inline std::ostream & operator <<(std::ostream & s, const Packet2ul & v)
+{
+  Packet vt;
+  vt.v2ul = v;
+  s << vt.ul[0] << ", " << vt.ul[1] ;
+  return s;
+}
+
+inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
+{
+  Packet vt;
+  vt.v2d = v;
+  s << vt.d[0] << ", " << vt.d[1];
+  return s;
+}
+
+/* Helper function to simulate a vec_splat_packet4f
+ */
+template<int element> EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Packet4f&   from)
+{
+  Packet4f splat;
+  switch (element) {
+  case 0:
+    splat.v4f[0] = vec_splat(from.v4f[0], 0);
+    splat.v4f[1] = splat.v4f[0];
+    break;
+  case 1:
+    splat.v4f[0] = vec_splat(from.v4f[0], 1);
+    splat.v4f[1] = splat.v4f[0];
+    break;
+  case 2:
+    splat.v4f[0] = vec_splat(from.v4f[1], 0);
+    splat.v4f[1] = splat.v4f[0];
+    break;
+  case 3:
+    splat.v4f[0] = vec_splat(from.v4f[1], 1);
+    splat.v4f[1] = splat.v4f[0];
+    break;
+  }
+  return splat;
+}
+
+template<int Offset>
+struct palign_impl<Offset,Packet4i>
+{
+  static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
+  {
+    switch (Offset % 4) {
+    case 1:
+      first = vec_sld(first, second, 4); break;
+    case 2:
+      first = vec_sld(first, second, 8); break;
+    case 3:
+      first = vec_sld(first, second, 12); break;
+    }
+  }
+};
+
+/* This is a tricky one, we have to translate float alignment to vector elements of sizeof double
+ */
+template<int Offset>
+struct palign_impl<Offset,Packet4f>
+{
+  static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
+  {
+    switch (Offset % 4) {
+    case 1:
+      first.v4f[0] = vec_sld(first.v4f[0], first.v4f[1], 8);
+      first.v4f[1] = vec_sld(first.v4f[1], second.v4f[0], 8);
+      break;
+    case 2:
+      first.v4f[0] = first.v4f[1];
+      first.v4f[1] = second.v4f[0];
+      break;
+    case 3:
+      first.v4f[0] = vec_sld(first.v4f[1],  second.v4f[0], 8);
+      first.v4f[1] = vec_sld(second.v4f[0], second.v4f[1], 8);
+      break;
+    }
+  }
+};
+
+
+template<int Offset>
+struct palign_impl<Offset,Packet2d>
+{
+  static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
+  {
+    if (Offset == 1)
+      first = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(first), reinterpret_cast<Packet4i>(second), 8));
+  }
+};
+
+template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from)
+{
+  // FIXME: No intrinsic yet
+  EIGEN_DEBUG_ALIGNED_LOAD
+  Packet *vfrom;
+  vfrom = (Packet *) from;
+  return vfrom->v4i;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float*   from)
+{
+  // FIXME: No intrinsic yet
+  EIGEN_DEBUG_ALIGNED_LOAD
+  Packet4f vfrom;
+  vfrom.v4f[0] = vec_ld2f(&from[0]);
+  vfrom.v4f[1] = vec_ld2f(&from[2]);
+  return vfrom;
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
+{
+  // FIXME: No intrinsic yet
+  EIGEN_DEBUG_ALIGNED_LOAD
+  Packet *vfrom;
+  vfrom = (Packet *) from;
+  return vfrom->v2d;
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from)
+{
+  // FIXME: No intrinsic yet
+  EIGEN_DEBUG_ALIGNED_STORE
+  Packet *vto;
+  vto = (Packet *) to;
+  vto->v4i = from;
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from)
+{
+  // FIXME: No intrinsic yet
+  EIGEN_DEBUG_ALIGNED_STORE
+  vec_st2f(from.v4f[0], &to[0]);
+  vec_st2f(from.v4f[1], &to[2]);
+}
+
+
+template<> EIGEN_STRONG_INLINE void pstore<double>(double*   to, const Packet2d& from)
+{
+  // FIXME: No intrinsic yet
+  EIGEN_DEBUG_ALIGNED_STORE
+  Packet *vto;
+  vto = (Packet *) to;
+  vto->v2d = from;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from)
+{
+  return vec_splats(from);
+}
+template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
+  return vec_splats(from);
+}
+template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&    from)
+{
+  Packet4f to;
+  to.v4f[0] = pset1<Packet2d>(static_cast<const double&>(from));
+  to.v4f[1] = to.v4f[0];
+  return to;
+}
+
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet4i>(const int *a,
+                      Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3)
+{
+  a3 = pload<Packet4i>(a);
+  a0 = vec_splat(a3, 0);
+  a1 = vec_splat(a3, 1);
+  a2 = vec_splat(a3, 2);
+  a3 = vec_splat(a3, 3);
+}
+
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet4f>(const float *a,
+                      Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
+{
+  a3 = pload<Packet4f>(a);
+  a0 = vec_splat_packet4f<0>(a3);
+  a1 = vec_splat_packet4f<1>(a3);
+  a2 = vec_splat_packet4f<2>(a3);
+  a3 = vec_splat_packet4f<3>(a3);
+}
+
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet2d>(const double *a,
+                      Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
+{
+  a1 = pload<Packet2d>(a);
+  a0 = vec_splat(a1, 0);
+  a1 = vec_splat(a1, 1);
+  a3 = pload<Packet2d>(a+2);
+  a2 = vec_splat(a3, 0);
+  a3 = vec_splat(a3, 1);
+}
+
+template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
+{
+  int EIGEN_ALIGN16 ai[4];
+  ai[0] = from[0*stride];
+  ai[1] = from[1*stride];
+  ai[2] = from[2*stride];
+  ai[3] = from[3*stride];
+ return pload<Packet4i>(ai);
+}
+
+template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
+{
+  float EIGEN_ALIGN16 ai[4];
+  ai[0] = from[0*stride];
+  ai[1] = from[1*stride];
+  ai[2] = from[2*stride];
+  ai[3] = from[3*stride];
+ return pload<Packet4f>(ai);
+}
+
+template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
+{
+  double EIGEN_ALIGN16 af[2];
+  af[0] = from[0*stride];
+  af[1] = from[1*stride];
+ return pload<Packet2d>(af);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
+{
+  int EIGEN_ALIGN16 ai[4];
+  pstore<int>((int *)ai, from);
+  to[0*stride] = ai[0];
+  to[1*stride] = ai[1];
+  to[2*stride] = ai[2];
+  to[3*stride] = ai[3];
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
+{
+  float EIGEN_ALIGN16 ai[4];
+  pstore<float>((float *)ai, from);
+  to[0*stride] = ai[0];
+  to[1*stride] = ai[1];
+  to[2*stride] = ai[2];
+  to[3*stride] = ai[3];
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
+{
+  double EIGEN_ALIGN16 af[2];
+  pstore<double>(af, from);
+  to[0*stride] = af[0];
+  to[1*stride] = af[1];
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a + b); }
+template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f c;
+  c.v4f[0] = a.v4f[0] + b.v4f[0];
+  c.v4f[1] = a.v4f[1] + b.v4f[1];
+  return c;
+}
+template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a + b); }
+
+template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a - b); }
+template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f c;
+  c.v4f[0] = a.v4f[0] - b.v4f[0];
+  c.v4f[1] = a.v4f[1] - b.v4f[1];
+  return c;
+}
+template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a - b); }
+
+template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a * b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f c;
+  c.v4f[0] = a.v4f[0] * b.v4f[0];
+  c.v4f[1] = a.v4f[1] * b.v4f[1];
+  return c;
+}
+template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a * b); }
+
+template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a / b); }
+template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f c;
+  c.v4f[0] = a.v4f[0] / b.v4f[0];
+  c.v4f[1] = a.v4f[1] / b.v4f[1];
+  return c;
+}
+template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a / b); }
+
+template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return (-a); }
+template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
+{
+  Packet4f c;
+  c.v4f[0] = -a.v4f[0];
+  c.v4f[1] = -a.v4f[1];
+  return c;
+}
+template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return (-a); }
+
+template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd<Packet4i>(pmul<Packet4i>(a, b), c); }
+template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
+{
+  Packet4f res;
+  res.v4f[0] = vec_madd(a.v4f[0], b.v4f[0], c.v4f[0]);
+  res.v4f[1] = vec_madd(a.v4f[1], b.v4f[1], c.v4f[1]);
+  return res;
+}
+template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); }
+
+template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a)    { return padd<Packet4i>(pset1<Packet4i>(a), p4i_COUNTDOWN); }
+template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a)  { return padd<Packet4f>(pset1<Packet4f>(a), p4f_COUNTDOWN); }
+template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return padd<Packet2d>(pset1<Packet2d>(a), p2d_COUNTDOWN); }
+
+template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pmin(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pmin(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pmax(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pmax(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return pand<Packet4i>(a, vec_nor(b, b)); }
+template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); }
+template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pandnot(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pandnot(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
+{
+  Packet4f res;
+  res.v4f[0] = vec_round(a.v4f[0]);
+  res.v4f[1] = vec_round(a.v4f[1]);
+  return res;
+}
+template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return vec_round(a); }
+template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const  Packet4f& a)
+{
+  Packet4f res;
+  res.v4f[0] = vec_ceil(a.v4f[0]);
+  res.v4f[1] = vec_ceil(a.v4f[1]);
+  return res;
+}
+template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const  Packet2d& a) { return vec_ceil(a); }
+template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
+{
+  Packet4f res;
+  res.v4f[0] = vec_floor(a.v4f[0]);
+  res.v4f[1] = vec_floor(a.v4f[1]);
+  return res;
+}
+template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return vec_floor(a); }
+
+template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int*       from) { return pload<Packet4i>(from); }
+template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float*     from) { return pload<Packet4f>(from); }
+template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double*    from) { return pload<Packet2d>(from); }
+
+
+template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int*     from)
+{
+  Packet4i p = pload<Packet4i>(from);
+  return vec_perm(p, p, p16uc_DUPLICATE32_HI);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float*    from)
+{
+  Packet4f p = pload<Packet4f>(from);
+  p.v4f[1] = vec_splat(p.v4f[0], 1);
+  p.v4f[0] = vec_splat(p.v4f[0], 0);
+  return p;
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double*   from)
+{
+  Packet2d p = pload<Packet2d>(from);
+  return vec_perm(p, p, p16uc_PSET64_HI);
+}
+
+template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*        to, const Packet4i& from) { pstore<int>(to, from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*    to, const Packet4f& from) { pstore<float>(to, from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<double>(double*  to, const Packet2d& from) { pstore<double>(to, from); }
+
+template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
+
+template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int    EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; }
+template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float  EIGEN_ALIGN16 x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; }
+template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; }
+
+template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
+{
+  return reinterpret_cast<Packet4i>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
+{
+  return reinterpret_cast<Packet2d>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE64));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
+{
+  Packet4f rev;
+  rev.v4f[0] = preverse<Packet2d>(a.v4f[1]);
+  rev.v4f[1] = preverse<Packet2d>(a.v4f[0]);
+  return rev;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i pabs<Packet4i>(const Packet4i& a) { return vec_abs(a); }
+template<> EIGEN_STRONG_INLINE Packet2d pabs<Packet2d>(const Packet2d& a) { return vec_abs(a); }
+template<> EIGEN_STRONG_INLINE Packet4f pabs<Packet4f>(const Packet4f& a)
+{
+  Packet4f res;
+  res.v4f[0] = pabs(a.v4f[0]);
+  res.v4f[1] = pabs(a.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
+{
+  Packet4i b, sum;
+  b   = vec_sld(a, a, 8);
+  sum = padd<Packet4i>(a, b);
+  b   = vec_sld(sum, sum, 4);
+  sum = padd<Packet4i>(sum, b);
+  return pfirst(sum);
+}
+
+template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
+{
+  Packet2d b, sum;
+  b   = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8));
+  sum = padd<Packet2d>(a, b);
+  return pfirst(sum);
+}
+template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
+{
+  Packet2d sum;
+  sum = padd<Packet2d>(a.v4f[0], a.v4f[1]);
+  double first = predux<Packet2d>(sum);
+  return static_cast<float>(first);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
+{
+  Packet4i v[4], sum[4];
+
+  // It's easier and faster to transpose then add as columns
+  // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation
+  // Do the transpose, first set of moves
+  v[0] = vec_mergeh(vecs[0], vecs[2]);
+  v[1] = vec_mergel(vecs[0], vecs[2]);
+  v[2] = vec_mergeh(vecs[1], vecs[3]);
+  v[3] = vec_mergel(vecs[1], vecs[3]);
+  // Get the resulting vectors
+  sum[0] = vec_mergeh(v[0], v[2]);
+  sum[1] = vec_mergel(v[0], v[2]);
+  sum[2] = vec_mergeh(v[1], v[3]);
+  sum[3] = vec_mergel(v[1], v[3]);
+
+  // Now do the summation:
+  // Lines 0+1
+  sum[0] = padd<Packet4i>(sum[0], sum[1]);
+  // Lines 2+3
+  sum[1] = padd<Packet4i>(sum[2], sum[3]);
+  // Add the results
+  sum[0] = padd<Packet4i>(sum[0], sum[1]);
+
+  return sum[0];
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
+{
+  Packet2d v[2], sum;
+  v[0] = padd<Packet2d>(vecs[0], reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(vecs[0]), reinterpret_cast<Packet4ui>(vecs[0]), 8)));
+  v[1] = padd<Packet2d>(vecs[1], reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(vecs[1]), reinterpret_cast<Packet4ui>(vecs[1]), 8)));
+ 
+  sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(v[0]), reinterpret_cast<Packet4ui>(v[1]), 8));
+
+  return sum;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
+{
+  PacketBlock<Packet4f,4> transpose;
+  transpose.packet[0] = vecs[0];
+  transpose.packet[1] = vecs[1];
+  transpose.packet[2] = vecs[2];
+  transpose.packet[3] = vecs[3];
+  ptranspose(transpose);
+
+  Packet4f sum = padd(transpose.packet[0], transpose.packet[1]);
+  sum = padd(sum, transpose.packet[2]);
+  sum = padd(sum, transpose.packet[3]);
+  return sum;
+}
+
+// Other reduction functions:
+// mul
+template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
+{
+  EIGEN_ALIGN16 int aux[4];
+  pstore(aux, a);
+  return aux[0] * aux[1] * aux[2] * aux[3];
+}
+
+template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
+{
+  return pfirst(pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
+}
+
+template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
+{
+  // Return predux_mul<Packet2d> of the subvectors product
+  return static_cast<float>(pfirst(predux_mul(pmul(a.v4f[0], a.v4f[1]))));
+}
+
+// min
+template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
+{
+  Packet4i b, res;
+  b   = pmin<Packet4i>(a, vec_sld(a, a, 8));
+  res = pmin<Packet4i>(b, vec_sld(b, b, 4));
+  return pfirst(res);
+}
+
+template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
+{
+  return pfirst(pmin<Packet2d>(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
+}
+
+template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
+{
+  Packet2d b, res;
+  b   = pmin<Packet2d>(a.v4f[0], a.v4f[1]);
+  res = pmin<Packet2d>(b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
+  return static_cast<float>(pfirst(res));
+}
+
+// max
+template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
+{
+  Packet4i b, res;
+  b = pmax<Packet4i>(a, vec_sld(a, a, 8));
+  res = pmax<Packet4i>(b, vec_sld(b, b, 4));
+  return pfirst(res);
+}
+
+// max
+template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
+{
+  return pfirst(pmax<Packet2d>(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
+}
+
+template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
+{
+  Packet2d b, res;
+  b   = pmax<Packet2d>(a.v4f[0], a.v4f[1]);
+  res = pmax<Packet2d>(b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
+  return static_cast<float>(pfirst(res));
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4i,4>& kernel) {
+  Packet4i t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
+  Packet4i t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
+  Packet4i t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
+  Packet4i t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
+  kernel.packet[0] = vec_mergeh(t0, t2);
+  kernel.packet[1] = vec_mergel(t0, t2);
+  kernel.packet[2] = vec_mergeh(t1, t3);
+  kernel.packet[3] = vec_mergel(t1, t3);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet2d,2>& kernel) {
+  Packet2d t0 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_HI);
+  Packet2d t1 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_LO);
+  kernel.packet[0] = t0;
+  kernel.packet[1] = t1;
+}
+
+/* Split the Packet4f PacketBlock into 4 Packet2d PacketBlocks and transpose each one
+ */
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4f,4>& kernel) {
+  PacketBlock<Packet2d,2> t0,t1,t2,t3;
+  // copy top-left 2x2 Packet2d block
+  t0.packet[0] = kernel.packet[0].v4f[0];
+  t0.packet[1] = kernel.packet[1].v4f[0];
+
+  // copy top-right 2x2 Packet2d block
+  t1.packet[0] = kernel.packet[0].v4f[1];
+  t1.packet[1] = kernel.packet[1].v4f[1];
+
+  // copy bottom-left 2x2 Packet2d block
+  t2.packet[0] = kernel.packet[2].v4f[0];
+  t2.packet[1] = kernel.packet[3].v4f[0];
+
+  // copy bottom-right 2x2 Packet2d block
+  t3.packet[0] = kernel.packet[2].v4f[1];
+  t3.packet[1] = kernel.packet[3].v4f[1];
+
+  // Transpose all 2x2 blocks
+  ptranspose(t0);
+  ptranspose(t1);
+  ptranspose(t2);
+  ptranspose(t3);
+
+  // Copy back transposed blocks, but exchange t1 and t2 due to transposition
+  kernel.packet[0].v4f[0] = t0.packet[0];
+  kernel.packet[0].v4f[1] = t2.packet[0];
+  kernel.packet[1].v4f[0] = t0.packet[1];
+  kernel.packet[1].v4f[1] = t2.packet[1];
+  kernel.packet[2].v4f[0] = t1.packet[0];
+  kernel.packet[2].v4f[1] = t3.packet[0];
+  kernel.packet[3].v4f[0] = t1.packet[1];
+  kernel.packet[3].v4f[1] = t3.packet[1];
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
+  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
+  Packet4ui mask = vec_cmpeq(select, reinterpret_cast<Packet4ui>(p4i_ONE));
+  return vec_sel(elsePacket, thenPacket, mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
+  Packet2ul select_hi = { ifPacket.select[0], ifPacket.select[1] };
+  Packet2ul select_lo = { ifPacket.select[2], ifPacket.select[3] };
+  Packet2ul mask_hi = vec_cmpeq(select_hi, reinterpret_cast<Packet2ul>(p2l_ONE));
+  Packet2ul mask_lo = vec_cmpeq(select_lo, reinterpret_cast<Packet2ul>(p2l_ONE));
+  Packet4f result;
+  result.v4f[0] = vec_sel(elsePacket.v4f[0], thenPacket.v4f[0], mask_hi);
+  result.v4f[1] = vec_sel(elsePacket.v4f[1], thenPacket.v4f[1], mask_lo);
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
+  Packet2ul select = { ifPacket.select[0], ifPacket.select[1] };
+  Packet2ul mask = vec_cmpeq(select, reinterpret_cast<Packet2ul>(p2l_ONE));
+  return vec_sel(elsePacket, thenPacket, mask);
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_PACKET_MATH_ZVECTOR_H
diff --git a/Eigen/src/Core/functors/AssignmentFunctors.h b/Eigen/src/Core/functors/AssignmentFunctors.h
new file mode 100644
index 000000000..4153b877c
--- /dev/null
+++ b/Eigen/src/Core/functors/AssignmentFunctors.h
@@ -0,0 +1,168 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ASSIGNMENT_FUNCTORS_H
+#define EIGEN_ASSIGNMENT_FUNCTORS_H
+
+namespace Eigen {
+
+namespace internal {
+  
+/** \internal
+  * \brief Template functor for scalar/packet assignment
+  *
+  */
+template<typename DstScalar,typename SrcScalar> struct assign_op {
+
+  EIGEN_EMPTY_STRUCT_CTOR(assign_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a = b; }
+  
+  template<int Alignment, typename Packet>
+  EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const
+  { internal::pstoret<DstScalar,Packet,Alignment>(a,b); }
+};
+
+// Empty overload for void type (used by PermutationMatrix)
+template<typename DstScalar> struct assign_op<DstScalar,void> {};
+
+template<typename DstScalar,typename SrcScalar>
+struct functor_traits<assign_op<DstScalar,SrcScalar> > {
+  enum {
+    Cost = NumTraits<DstScalar>::ReadCost,
+    PacketAccess = is_same<DstScalar,SrcScalar>::value && packet_traits<DstScalar>::Vectorizable && packet_traits<SrcScalar>::Vectorizable
+  };
+};
+
+/** \internal
+  * \brief Template functor for scalar/packet assignment with addition
+  *
+  */
+template<typename DstScalar,typename SrcScalar> struct add_assign_op {
+
+  EIGEN_EMPTY_STRUCT_CTOR(add_assign_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a += b; }
+  
+  template<int Alignment, typename Packet>
+  EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const
+  { internal::pstoret<DstScalar,Packet,Alignment>(a,internal::padd(internal::ploadt<Packet,Alignment>(a),b)); }
+};
+template<typename DstScalar,typename SrcScalar>
+struct functor_traits<add_assign_op<DstScalar,SrcScalar> > {
+  enum {
+    Cost = NumTraits<DstScalar>::ReadCost + NumTraits<DstScalar>::AddCost,
+    PacketAccess = is_same<DstScalar,SrcScalar>::value && packet_traits<DstScalar>::HasAdd
+  };
+};
+
+/** \internal
+  * \brief Template functor for scalar/packet assignment with subtraction
+  *
+  */
+template<typename DstScalar,typename SrcScalar> struct sub_assign_op {
+
+  EIGEN_EMPTY_STRUCT_CTOR(sub_assign_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a -= b; }
+  
+  template<int Alignment, typename Packet>
+  EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const
+  { internal::pstoret<DstScalar,Packet,Alignment>(a,internal::psub(internal::ploadt<Packet,Alignment>(a),b)); }
+};
+template<typename DstScalar,typename SrcScalar>
+struct functor_traits<sub_assign_op<DstScalar,SrcScalar> > {
+  enum {
+    Cost = NumTraits<DstScalar>::ReadCost + NumTraits<DstScalar>::AddCost,
+    PacketAccess = is_same<DstScalar,SrcScalar>::value && packet_traits<DstScalar>::HasSub
+  };
+};
+
+/** \internal
+  * \brief Template functor for scalar/packet assignment with multiplication
+  *
+  */
+template<typename DstScalar, typename SrcScalar=DstScalar>
+struct mul_assign_op {
+
+  EIGEN_EMPTY_STRUCT_CTOR(mul_assign_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a *= b; }
+  
+  template<int Alignment, typename Packet>
+  EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const
+  { internal::pstoret<DstScalar,Packet,Alignment>(a,internal::pmul(internal::ploadt<Packet,Alignment>(a),b)); }
+};
+template<typename DstScalar, typename SrcScalar>
+struct functor_traits<mul_assign_op<DstScalar,SrcScalar> > {
+  enum {
+    Cost = NumTraits<DstScalar>::ReadCost + NumTraits<DstScalar>::MulCost,
+    PacketAccess = is_same<DstScalar,SrcScalar>::value && packet_traits<DstScalar>::HasMul
+  };
+};
+
+/** \internal
+  * \brief Template functor for scalar/packet assignment with diviving
+  *
+  */
+template<typename DstScalar, typename SrcScalar=DstScalar> struct div_assign_op {
+
+  EIGEN_EMPTY_STRUCT_CTOR(div_assign_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a /= b; }
+  
+  template<int Alignment, typename Packet>
+  EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const
+  { internal::pstoret<DstScalar,Packet,Alignment>(a,internal::pdiv(internal::ploadt<Packet,Alignment>(a),b)); }
+};
+template<typename DstScalar, typename SrcScalar>
+struct functor_traits<div_assign_op<DstScalar,SrcScalar> > {
+  enum {
+    Cost = NumTraits<DstScalar>::ReadCost + NumTraits<DstScalar>::MulCost,
+    PacketAccess = is_same<DstScalar,SrcScalar>::value && packet_traits<DstScalar>::HasDiv
+  };
+};
+
+/** \internal
+  * \brief Template functor for scalar/packet assignment with swapping
+  *
+  * It works as follow. For a non-vectorized evaluation loop, we have:
+  *   for(i) func(A.coeffRef(i), B.coeff(i));
+  * where B is a SwapWrapper expression. The trick is to make SwapWrapper::coeff behaves like a non-const coeffRef.
+  * Actually, SwapWrapper might not even be needed since even if B is a plain expression, since it has to be writable
+  * B.coeff already returns a const reference to the underlying scalar value.
+  * 
+  * The case of a vectorized loop is more tricky:
+  *   for(i,j) func.assignPacket<A_Align>(&A.coeffRef(i,j), B.packet<B_Align>(i,j));
+  * Here, B must be a SwapWrapper whose packet function actually returns a proxy object holding a Scalar*,
+  * the actual alignment and Packet type.
+  *
+  */
+template<typename Scalar> struct swap_assign_op {
+
+  EIGEN_EMPTY_STRUCT_CTOR(swap_assign_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const
+  {
+#ifdef __CUDACC__
+    // FIXME is there some kind of cuda::swap?
+    Scalar t=b; const_cast<Scalar&>(b)=a; a=t;
+#else
+    using std::swap;
+    swap(a,const_cast<Scalar&>(b));
+#endif
+  }
+};
+template<typename Scalar>
+struct functor_traits<swap_assign_op<Scalar> > {
+  enum {
+    Cost = 3 * NumTraits<Scalar>::ReadCost,
+    PacketAccess = packet_traits<Scalar>::Vectorizable
+  };
+};
+
+} // namespace internal
+
+} // namespace Eigen
+
+#endif // EIGEN_ASSIGNMENT_FUNCTORS_H
diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h
new file mode 100644
index 000000000..96747bac7
--- /dev/null
+++ b/Eigen/src/Core/functors/BinaryFunctors.h
@@ -0,0 +1,482 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BINARY_FUNCTORS_H
+#define EIGEN_BINARY_FUNCTORS_H
+
+namespace Eigen {
+
+namespace internal {
+
+//---------- associative binary functors ----------
+
+template<typename Arg1, typename Arg2>
+struct binary_op_base
+{
+  typedef Arg1 first_argument_type;
+  typedef Arg2 second_argument_type;
+};
+
+/** \internal
+  * \brief Template functor to compute the sum of two scalars
+  *
+  * \sa class CwiseBinaryOp, MatrixBase::operator+, class VectorwiseOp, DenseBase::sum()
+  */
+template<typename LhsScalar,typename RhsScalar>
+struct scalar_sum_op : binary_op_base<LhsScalar,RhsScalar>
+{
+  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_sum_op>::ReturnType result_type;
+#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sum_op)
+#else
+  scalar_sum_op() {
+    EIGEN_SCALAR_BINARY_OP_PLUGIN
+  }
+#endif
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a + b; }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  { return internal::padd(a,b); }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const
+  { return internal::predux(a); }
+};
+template<typename LhsScalar,typename RhsScalar>
+struct functor_traits<scalar_sum_op<LhsScalar,RhsScalar> > {
+  enum {
+    Cost = (NumTraits<LhsScalar>::AddCost+NumTraits<RhsScalar>::AddCost)/2, // rough estimate!
+    PacketAccess = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasAdd && packet_traits<RhsScalar>::HasAdd
+    // TODO vectorize mixed sum
+  };
+};
+
+/** \internal
+  * \brief Template specialization to deprecate the summation of boolean expressions.
+  * This is required to solve Bug 426.
+  * \sa DenseBase::count(), DenseBase::any(), ArrayBase::cast(), MatrixBase::cast()
+  */
+template<> struct scalar_sum_op<bool,bool> : scalar_sum_op<int,int> {
+  EIGEN_DEPRECATED
+  scalar_sum_op() {}
+};
+
+
+/** \internal
+  * \brief Template functor to compute the product of two scalars
+  *
+  * \sa class CwiseBinaryOp, Cwise::operator*(), class VectorwiseOp, MatrixBase::redux()
+  */
+template<typename LhsScalar,typename RhsScalar>
+struct scalar_product_op  : binary_op_base<LhsScalar,RhsScalar>
+{
+  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_product_op>::ReturnType result_type;
+#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op)
+#else
+  scalar_product_op() {
+    EIGEN_SCALAR_BINARY_OP_PLUGIN
+  }
+#endif
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a * b; }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  { return internal::pmul(a,b); }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const
+  { return internal::predux_mul(a); }
+};
+template<typename LhsScalar,typename RhsScalar>
+struct functor_traits<scalar_product_op<LhsScalar,RhsScalar> > {
+  enum {
+    Cost = (NumTraits<LhsScalar>::MulCost + NumTraits<RhsScalar>::MulCost)/2, // rough estimate!
+    PacketAccess = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasMul && packet_traits<RhsScalar>::HasMul
+    // TODO vectorize mixed product
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the conjugate product of two scalars
+  *
+  * This is a short cut for conj(x) * y which is needed for optimization purpose; in Eigen2 support mode, this becomes x * conj(y)
+  */
+template<typename LhsScalar,typename RhsScalar>
+struct scalar_conj_product_op  : binary_op_base<LhsScalar,RhsScalar>
+{
+
+  enum {
+    Conj = NumTraits<LhsScalar>::IsComplex
+  };
+  
+  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_conj_product_op>::ReturnType result_type;
+  
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_conj_product_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const
+  { return conj_helper<LhsScalar,RhsScalar,Conj,false>().pmul(a,b); }
+  
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  { return conj_helper<Packet,Packet,Conj,false>().pmul(a,b); }
+};
+template<typename LhsScalar,typename RhsScalar>
+struct functor_traits<scalar_conj_product_op<LhsScalar,RhsScalar> > {
+  enum {
+    Cost = NumTraits<LhsScalar>::MulCost,
+    PacketAccess = internal::is_same<LhsScalar, RhsScalar>::value && packet_traits<LhsScalar>::HasMul
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the min of two scalars
+  *
+  * \sa class CwiseBinaryOp, MatrixBase::cwiseMin, class VectorwiseOp, MatrixBase::minCoeff()
+  */
+template<typename LhsScalar,typename RhsScalar>
+struct scalar_min_op : binary_op_base<LhsScalar,RhsScalar>
+{
+  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_min_op>::ReturnType result_type;
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_min_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return numext::mini(a, b); }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  { return internal::pmin(a,b); }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const
+  { return internal::predux_min(a); }
+};
+template<typename LhsScalar,typename RhsScalar>
+struct functor_traits<scalar_min_op<LhsScalar,RhsScalar> > {
+  enum {
+    Cost = (NumTraits<LhsScalar>::AddCost+NumTraits<RhsScalar>::AddCost)/2,
+    PacketAccess = internal::is_same<LhsScalar, RhsScalar>::value && packet_traits<LhsScalar>::HasMin
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the max of two scalars
+  *
+  * \sa class CwiseBinaryOp, MatrixBase::cwiseMax, class VectorwiseOp, MatrixBase::maxCoeff()
+  */
+template<typename LhsScalar,typename RhsScalar>
+struct scalar_max_op  : binary_op_base<LhsScalar,RhsScalar>
+{
+  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_max_op>::ReturnType result_type;
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_max_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return numext::maxi(a, b); }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  { return internal::pmax(a,b); }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const
+  { return internal::predux_max(a); }
+};
+template<typename LhsScalar,typename RhsScalar>
+struct functor_traits<scalar_max_op<LhsScalar,RhsScalar> > {
+  enum {
+    Cost = (NumTraits<LhsScalar>::AddCost+NumTraits<RhsScalar>::AddCost)/2,
+    PacketAccess = internal::is_same<LhsScalar, RhsScalar>::value && packet_traits<LhsScalar>::HasMax
+  };
+};
+
+/** \internal
+  * \brief Template functors for comparison of two scalars
+  * \todo Implement packet-comparisons
+  */
+template<typename LhsScalar, typename RhsScalar, ComparisonName cmp> struct scalar_cmp_op;
+
+template<typename LhsScalar, typename RhsScalar, ComparisonName cmp>
+struct functor_traits<scalar_cmp_op<LhsScalar,RhsScalar, cmp> > {
+  enum {
+    Cost = (NumTraits<LhsScalar>::AddCost+NumTraits<RhsScalar>::AddCost)/2,
+    PacketAccess = false
+  };
+};
+
+template<ComparisonName Cmp, typename LhsScalar, typename RhsScalar>
+struct result_of<scalar_cmp_op<LhsScalar, RhsScalar, Cmp>(LhsScalar,RhsScalar)> {
+  typedef bool type;
+};
+
+
+template<typename LhsScalar, typename RhsScalar>
+struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_EQ> : binary_op_base<LhsScalar,RhsScalar>
+{
+  typedef bool result_type;
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a==b;}
+};
+template<typename LhsScalar, typename RhsScalar>
+struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_LT> : binary_op_base<LhsScalar,RhsScalar>
+{
+  typedef bool result_type;
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a<b;}
+};
+template<typename LhsScalar, typename RhsScalar>
+struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_LE> : binary_op_base<LhsScalar,RhsScalar>
+{
+  typedef bool result_type;
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a<=b;}
+};
+template<typename LhsScalar, typename RhsScalar>
+struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_GT> : binary_op_base<LhsScalar,RhsScalar>
+{
+  typedef bool result_type;
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a>b;}
+};
+template<typename LhsScalar, typename RhsScalar>
+struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_GE> : binary_op_base<LhsScalar,RhsScalar>
+{
+  typedef bool result_type;
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a>=b;}
+};
+template<typename LhsScalar, typename RhsScalar>
+struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_UNORD> : binary_op_base<LhsScalar,RhsScalar>
+{
+  typedef bool result_type;
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return !(a<=b || b<=a);}
+};
+template<typename LhsScalar, typename RhsScalar>
+struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_NEQ> : binary_op_base<LhsScalar,RhsScalar>
+{
+  typedef bool result_type;
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a!=b;}
+};
+
+
+/** \internal
+  * \brief Template functor to compute the hypot of two scalars
+  *
+  * \sa MatrixBase::stableNorm(), class Redux
+  */
+template<typename Scalar>
+struct scalar_hypot_op<Scalar,Scalar> : binary_op_base<Scalar,Scalar>
+{
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_hypot_op)
+//   typedef typename NumTraits<Scalar>::Real result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& _x, const Scalar& _y) const
+  {
+    EIGEN_USING_STD_MATH(sqrt)
+    Scalar p, qp;
+    if(_x>_y)
+    {
+      p = _x;
+      qp = _y / p;
+    }
+    else
+    {
+      p = _y;
+      qp = _x / p;
+    }
+    return p * sqrt(Scalar(1) + qp*qp);
+  }
+};
+template<typename Scalar>
+struct functor_traits<scalar_hypot_op<Scalar,Scalar> > {
+  enum
+  {
+    Cost = 3 * NumTraits<Scalar>::AddCost +
+           2 * NumTraits<Scalar>::MulCost +
+           2 * scalar_div_cost<Scalar,false>::value,
+    PacketAccess = false
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the pow of two scalars
+  */
+template<typename Scalar, typename Exponent>
+struct scalar_pow_op  : binary_op_base<Scalar,Exponent>
+{
+  typedef typename ScalarBinaryOpTraits<Scalar,Exponent,scalar_pow_op>::ReturnType result_type;
+#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_pow_op)
+#else
+  scalar_pow_op() {
+    typedef Scalar LhsScalar;
+    typedef Exponent RhsScalar;
+    EIGEN_SCALAR_BINARY_OP_PLUGIN
+  }
+#endif
+  EIGEN_DEVICE_FUNC
+  inline result_type operator() (const Scalar& a, const Exponent& b) const { return numext::pow(a, b); }
+};
+template<typename Scalar, typename Exponent>
+struct functor_traits<scalar_pow_op<Scalar,Exponent> > {
+  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
+};
+
+
+
+//---------- non associative binary functors ----------
+
+/** \internal
+  * \brief Template functor to compute the difference of two scalars
+  *
+  * \sa class CwiseBinaryOp, MatrixBase::operator-
+  */
+template<typename LhsScalar,typename RhsScalar>
+struct scalar_difference_op : binary_op_base<LhsScalar,RhsScalar>
+{
+  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_difference_op>::ReturnType result_type;
+#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_difference_op)
+#else
+  scalar_difference_op() {
+    EIGEN_SCALAR_BINARY_OP_PLUGIN
+  }
+#endif
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a - b; }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  { return internal::psub(a,b); }
+};
+template<typename LhsScalar,typename RhsScalar>
+struct functor_traits<scalar_difference_op<LhsScalar,RhsScalar> > {
+  enum {
+    Cost = (NumTraits<LhsScalar>::AddCost+NumTraits<RhsScalar>::AddCost)/2,
+    PacketAccess = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasSub && packet_traits<RhsScalar>::HasSub
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the quotient of two scalars
+  *
+  * \sa class CwiseBinaryOp, Cwise::operator/()
+  */
+template<typename LhsScalar,typename RhsScalar>
+struct scalar_quotient_op  : binary_op_base<LhsScalar,RhsScalar>
+{
+  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_quotient_op>::ReturnType result_type;
+#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_quotient_op)
+#else
+  scalar_quotient_op() {
+    EIGEN_SCALAR_BINARY_OP_PLUGIN
+  }
+#endif
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a / b; }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  { return internal::pdiv(a,b); }
+};
+template<typename LhsScalar,typename RhsScalar>
+struct functor_traits<scalar_quotient_op<LhsScalar,RhsScalar> > {
+  typedef typename scalar_quotient_op<LhsScalar,RhsScalar>::result_type result_type;
+  enum {
+    PacketAccess = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasDiv && packet_traits<RhsScalar>::HasDiv,
+    Cost = scalar_div_cost<result_type,PacketAccess>::value
+  };
+};
+
+
+
+/** \internal
+  * \brief Template functor to compute the and of two booleans
+  *
+  * \sa class CwiseBinaryOp, ArrayBase::operator&&
+  */
+struct scalar_boolean_and_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_and_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a && b; }
+};
+template<> struct functor_traits<scalar_boolean_and_op> {
+  enum {
+    Cost = NumTraits<bool>::AddCost,
+    PacketAccess = false
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the or of two booleans
+  *
+  * \sa class CwiseBinaryOp, ArrayBase::operator||
+  */
+struct scalar_boolean_or_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_or_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a || b; }
+};
+template<> struct functor_traits<scalar_boolean_or_op> {
+  enum {
+    Cost = NumTraits<bool>::AddCost,
+    PacketAccess = false
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the xor of two booleans
+ *
+ * \sa class CwiseBinaryOp, ArrayBase::operator^
+ */
+struct scalar_boolean_xor_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_xor_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a ^ b; }
+};
+template<> struct functor_traits<scalar_boolean_xor_op> {
+  enum {
+    Cost = NumTraits<bool>::AddCost,
+    PacketAccess = false
+  };
+};
+
+
+
+//---------- binary functors bound to a constant, thus appearing as a unary functor ----------
+
+// The following two classes permits to turn any binary functor into a unary one with one argument bound to a constant value.
+// They are analogues to std::binder1st/binder2nd but with the following differences:
+//  - they are compatible with packetOp
+//  - they are portable across C++ versions (the std::binder* are deprecated in C++11)
+template<typename BinaryOp> struct bind1st_op : BinaryOp {
+
+  typedef typename BinaryOp::first_argument_type  first_argument_type;
+  typedef typename BinaryOp::second_argument_type second_argument_type;
+  typedef typename BinaryOp::result_type          result_type;
+
+  bind1st_op(const first_argument_type &val) : m_value(val) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const second_argument_type& b) const { return BinaryOp::operator()(m_value,b); }
+
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& b) const
+  { return BinaryOp::packetOp(internal::pset1<Packet>(m_value), b); }
+
+  first_argument_type m_value;
+};
+template<typename BinaryOp> struct functor_traits<bind1st_op<BinaryOp> > : functor_traits<BinaryOp> {};
+
+
+template<typename BinaryOp> struct bind2nd_op : BinaryOp {
+
+  typedef typename BinaryOp::first_argument_type  first_argument_type;
+  typedef typename BinaryOp::second_argument_type second_argument_type;
+  typedef typename BinaryOp::result_type          result_type;
+
+  bind2nd_op(const second_argument_type &val) : m_value(val) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const first_argument_type& a) const { return BinaryOp::operator()(a,m_value); }
+
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
+  { return BinaryOp::packetOp(a,internal::pset1<Packet>(m_value)); }
+
+  second_argument_type m_value;
+};
+template<typename BinaryOp> struct functor_traits<bind2nd_op<BinaryOp> > : functor_traits<BinaryOp> {};
+
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_BINARY_FUNCTORS_H
diff --git a/Eigen/src/Core/functors/NullaryFunctors.h b/Eigen/src/Core/functors/NullaryFunctors.h
new file mode 100644
index 000000000..6a30466fb
--- /dev/null
+++ b/Eigen/src/Core/functors/NullaryFunctors.h
@@ -0,0 +1,189 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_NULLARY_FUNCTORS_H
+#define EIGEN_NULLARY_FUNCTORS_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<typename Scalar>
+struct scalar_constant_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_constant_op(const scalar_constant_op& other) : m_other(other.m_other) { }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_constant_op(const Scalar& other) : m_other(other) { }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() () const { return m_other; }
+  template<typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packetOp() const { return internal::pset1<PacketType>(m_other); }
+  const Scalar m_other;
+};
+template<typename Scalar>
+struct functor_traits<scalar_constant_op<Scalar> >
+{ enum { Cost = 0 /* as the constant value should be loaded in register only once for the whole expression */,
+         PacketAccess = packet_traits<Scalar>::Vectorizable, IsRepeatable = true }; };
+
+template<typename Scalar> struct scalar_identity_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_identity_op)
+  template<typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType row, IndexType col) const { return row==col ? Scalar(1) : Scalar(0); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_identity_op<Scalar> >
+{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = false, IsRepeatable = true }; };
+
+template <typename Scalar, typename Packet, bool IsInteger> struct linspaced_op_impl;
+
+template <typename Scalar, typename Packet>
+struct linspaced_op_impl<Scalar,Packet,/*IsInteger*/false>
+{
+  linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) :
+    m_low(low), m_high(high), m_size1(num_steps==1 ? 1 : num_steps-1), m_step(num_steps==1 ? Scalar() : (high-low)/Scalar(num_steps-1)),
+    m_interPacket(plset<Packet>(0)),
+    m_flip(numext::abs(high)<numext::abs(low))
+  {}
+
+  template<typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType i) const {
+    if(m_flip)
+      return (i==0)? m_low : (m_high - (m_size1-i)*m_step);
+    else
+      return (i==m_size1)? m_high : (m_low + i*m_step);
+  }
+
+  template<typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const
+  {
+    // Principle:
+    // [low, ..., low] + ( [step, ..., step] * ( [i, ..., i] + [0, ..., size] ) )
+    if(m_flip)
+    {
+      Packet pi = padd(pset1<Packet>(Scalar(i-m_size1)),m_interPacket);
+      Packet res = padd(pset1<Packet>(m_high), pmul(pset1<Packet>(m_step), pi));
+      if(i==0)
+        res = pinsertfirst(res, m_low);
+      return res;
+    }
+    else
+    {
+      Packet pi = padd(pset1<Packet>(Scalar(i)),m_interPacket);
+      Packet res = padd(pset1<Packet>(m_low), pmul(pset1<Packet>(m_step), pi));
+      if(i==m_size1-unpacket_traits<Packet>::size+1)
+        res = pinsertlast(res, m_high);
+      return res;
+    }
+  }
+
+  const Scalar m_low;
+  const Scalar m_high;
+  const Index m_size1;
+  const Scalar m_step;
+  const Packet m_interPacket;
+  const bool m_flip;
+};
+
+template <typename Scalar, typename Packet>
+struct linspaced_op_impl<Scalar,Packet,/*IsInteger*/true>
+{
+  linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) :
+    m_low(low),
+    m_multiplier((high-low)/convert_index<Scalar>(num_steps<=1 ? 1 : num_steps-1)),
+    m_divisor(convert_index<Scalar>((high>=low?num_steps:-num_steps)+(high-low))/((numext::abs(high-low)+1)==0?1:(numext::abs(high-low)+1))),
+    m_use_divisor(num_steps>1 && (numext::abs(high-low)+1)<num_steps)
+  {}
+
+  template<typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const Scalar operator() (IndexType i) const
+  {
+    if(m_use_divisor) return m_low + convert_index<Scalar>(i)/m_divisor;
+    else              return m_low + convert_index<Scalar>(i)*m_multiplier;
+  }
+
+  const Scalar m_low;
+  const Scalar m_multiplier;
+  const Scalar m_divisor;
+  const bool m_use_divisor;
+};
+
+// ----- Linspace functor ----------------------------------------------------------------
+
+// Forward declaration (we default to random access which does not really give
+// us a speed gain when using packet access but it allows to use the functor in
+// nested expressions).
+template <typename Scalar, typename PacketType> struct linspaced_op;
+template <typename Scalar, typename PacketType> struct functor_traits< linspaced_op<Scalar,PacketType> >
+{
+  enum
+  {
+    Cost = 1,
+    PacketAccess =   (!NumTraits<Scalar>::IsInteger) && packet_traits<Scalar>::HasSetLinear && packet_traits<Scalar>::HasBlend,
+                  /*&& ((!NumTraits<Scalar>::IsInteger) || packet_traits<Scalar>::HasDiv),*/ // <- vectorization for integer is currently disabled
+    IsRepeatable = true
+  };
+};
+template <typename Scalar, typename PacketType> struct linspaced_op
+{
+  linspaced_op(const Scalar& low, const Scalar& high, Index num_steps)
+    : impl((num_steps==1 ? high : low),high,num_steps)
+  {}
+
+  template<typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType i) const { return impl(i); }
+
+  template<typename Packet,typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const { return impl.packetOp(i); }
+
+  // This proxy object handles the actual required temporaries and the different
+  // implementations (integer vs. floating point).
+  const linspaced_op_impl<Scalar,PacketType,NumTraits<Scalar>::IsInteger> impl;
+};
+
+// Linear access is automatically determined from the operator() prototypes available for the given functor.
+// If it exposes an operator()(i,j), then we assume the i and j coefficients are required independently
+// and linear access is not possible. In all other cases, linear access is enabled.
+// Users should not have to deal with this structure.
+template<typename Functor> struct functor_has_linear_access { enum { ret = !has_binary_operator<Functor>::value }; };
+
+// For unreliable compilers, let's specialize the has_*ary_operator
+// helpers so that at least built-in nullary functors work fine.
+#if !( (EIGEN_COMP_MSVC>1600) || (EIGEN_GNUC_AT_LEAST(4,8)) || (EIGEN_COMP_ICC>=1600))
+template<typename Scalar,typename IndexType>
+struct has_nullary_operator<scalar_constant_op<Scalar>,IndexType> { enum { value = 1}; };
+template<typename Scalar,typename IndexType>
+struct has_unary_operator<scalar_constant_op<Scalar>,IndexType> { enum { value = 0}; };
+template<typename Scalar,typename IndexType>
+struct has_binary_operator<scalar_constant_op<Scalar>,IndexType> { enum { value = 0}; };
+
+template<typename Scalar,typename IndexType>
+struct has_nullary_operator<scalar_identity_op<Scalar>,IndexType> { enum { value = 0}; };
+template<typename Scalar,typename IndexType>
+struct has_unary_operator<scalar_identity_op<Scalar>,IndexType> { enum { value = 0}; };
+template<typename Scalar,typename IndexType>
+struct has_binary_operator<scalar_identity_op<Scalar>,IndexType> { enum { value = 1}; };
+
+template<typename Scalar, typename PacketType,typename IndexType>
+struct has_nullary_operator<linspaced_op<Scalar,PacketType>,IndexType> { enum { value = 0}; };
+template<typename Scalar, typename PacketType,typename IndexType>
+struct has_unary_operator<linspaced_op<Scalar,PacketType>,IndexType> { enum { value = 1}; };
+template<typename Scalar, typename PacketType,typename IndexType>
+struct has_binary_operator<linspaced_op<Scalar,PacketType>,IndexType> { enum { value = 0}; };
+
+template<typename Scalar,typename IndexType>
+struct has_nullary_operator<scalar_random_op<Scalar>,IndexType> { enum { value = 1}; };
+template<typename Scalar,typename IndexType>
+struct has_unary_operator<scalar_random_op<Scalar>,IndexType> { enum { value = 0}; };
+template<typename Scalar,typename IndexType>
+struct has_binary_operator<scalar_random_op<Scalar>,IndexType> { enum { value = 0}; };
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_NULLARY_FUNCTORS_H
diff --git a/Eigen/src/Core/functors/StlFunctors.h b/Eigen/src/Core/functors/StlFunctors.h
new file mode 100644
index 000000000..6df3fa501
--- /dev/null
+++ b/Eigen/src/Core/functors/StlFunctors.h
@@ -0,0 +1,132 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_STL_FUNCTORS_H
+#define EIGEN_STL_FUNCTORS_H
+
+namespace Eigen {
+
+namespace internal {
+
+// default functor traits for STL functors:
+
+template<typename T>
+struct functor_traits<std::multiplies<T> >
+{ enum { Cost = NumTraits<T>::MulCost, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::divides<T> >
+{ enum { Cost = NumTraits<T>::MulCost, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::plus<T> >
+{ enum { Cost = NumTraits<T>::AddCost, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::minus<T> >
+{ enum { Cost = NumTraits<T>::AddCost, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::negate<T> >
+{ enum { Cost = NumTraits<T>::AddCost, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::logical_or<T> >
+{ enum { Cost = 1, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::logical_and<T> >
+{ enum { Cost = 1, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::logical_not<T> >
+{ enum { Cost = 1, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::greater<T> >
+{ enum { Cost = 1, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::less<T> >
+{ enum { Cost = 1, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::greater_equal<T> >
+{ enum { Cost = 1, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::less_equal<T> >
+{ enum { Cost = 1, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::equal_to<T> >
+{ enum { Cost = 1, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::not_equal_to<T> >
+{ enum { Cost = 1, PacketAccess = false }; };
+
+#if (__cplusplus < 201103L) && (EIGEN_COMP_MSVC <= 1900)
+// std::binder* are deprecated since c++11 and will be removed in c++17
+template<typename T>
+struct functor_traits<std::binder2nd<T> >
+{ enum { Cost = functor_traits<T>::Cost, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::binder1st<T> >
+{ enum { Cost = functor_traits<T>::Cost, PacketAccess = false }; };
+#endif
+
+template<typename T>
+struct functor_traits<std::unary_negate<T> >
+{ enum { Cost = 1 + functor_traits<T>::Cost, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::binary_negate<T> >
+{ enum { Cost = 1 + functor_traits<T>::Cost, PacketAccess = false }; };
+
+#ifdef EIGEN_STDEXT_SUPPORT
+
+template<typename T0,typename T1>
+struct functor_traits<std::project1st<T0,T1> >
+{ enum { Cost = 0, PacketAccess = false }; };
+
+template<typename T0,typename T1>
+struct functor_traits<std::project2nd<T0,T1> >
+{ enum { Cost = 0, PacketAccess = false }; };
+
+template<typename T0,typename T1>
+struct functor_traits<std::select2nd<std::pair<T0,T1> > >
+{ enum { Cost = 0, PacketAccess = false }; };
+
+template<typename T0,typename T1>
+struct functor_traits<std::select1st<std::pair<T0,T1> > >
+{ enum { Cost = 0, PacketAccess = false }; };
+
+template<typename T0,typename T1>
+struct functor_traits<std::unary_compose<T0,T1> >
+{ enum { Cost = functor_traits<T0>::Cost + functor_traits<T1>::Cost, PacketAccess = false }; };
+
+template<typename T0,typename T1,typename T2>
+struct functor_traits<std::binary_compose<T0,T1,T2> >
+{ enum { Cost = functor_traits<T0>::Cost + functor_traits<T1>::Cost + functor_traits<T2>::Cost, PacketAccess = false }; };
+
+#endif // EIGEN_STDEXT_SUPPORT
+
+// allow to add new functors and specializations of functor_traits from outside Eigen.
+// this macro is really needed because functor_traits must be specialized after it is declared but before it is used...
+#ifdef EIGEN_FUNCTORS_PLUGIN
+#include EIGEN_FUNCTORS_PLUGIN
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_STL_FUNCTORS_H
diff --git a/Eigen/src/Core/functors/TernaryFunctors.h b/Eigen/src/Core/functors/TernaryFunctors.h
new file mode 100644
index 000000000..b254e96c6
--- /dev/null
+++ b/Eigen/src/Core/functors/TernaryFunctors.h
@@ -0,0 +1,25 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Eugene Brevdo <ebrevdo@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TERNARY_FUNCTORS_H
+#define EIGEN_TERNARY_FUNCTORS_H
+
+namespace Eigen {
+
+namespace internal {
+
+//---------- associative ternary functors ----------
+
+
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TERNARY_FUNCTORS_H
diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h
new file mode 100644
index 000000000..2e6a00ffd
--- /dev/null
+++ b/Eigen/src/Core/functors/UnaryFunctors.h
@@ -0,0 +1,792 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_UNARY_FUNCTORS_H
+#define EIGEN_UNARY_FUNCTORS_H
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal
+  * \brief Template functor to compute the opposite of a scalar
+  *
+  * \sa class CwiseUnaryOp, MatrixBase::operator-
+  */
+template<typename Scalar> struct scalar_opposite_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_opposite_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return -a; }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
+  { return internal::pnegate(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_opposite_op<Scalar> >
+{ enum {
+    Cost = NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasNegate };
+};
+
+/** \internal
+  * \brief Template functor to compute the absolute value of a scalar
+  *
+  * \sa class CwiseUnaryOp, Cwise::abs
+  */
+template<typename Scalar> struct scalar_abs_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_abs_op)
+  typedef typename NumTraits<Scalar>::Real result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { return numext::abs(a); }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
+  { return internal::pabs(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_abs_op<Scalar> >
+{
+  enum {
+    Cost = NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasAbs
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the score of a scalar, to chose a pivot
+  *
+  * \sa class CwiseUnaryOp
+  */
+template<typename Scalar> struct scalar_score_coeff_op : scalar_abs_op<Scalar>
+{
+  typedef void Score_is_abs;
+};
+template<typename Scalar>
+struct functor_traits<scalar_score_coeff_op<Scalar> > : functor_traits<scalar_abs_op<Scalar> > {};
+
+/* Avoid recomputing abs when we know the score and they are the same. Not a true Eigen functor.  */
+template<typename Scalar, typename=void> struct abs_knowing_score
+{
+  EIGEN_EMPTY_STRUCT_CTOR(abs_knowing_score)
+  typedef typename NumTraits<Scalar>::Real result_type;
+  template<typename Score>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a, const Score&) const { return numext::abs(a); }
+};
+template<typename Scalar> struct abs_knowing_score<Scalar, typename scalar_score_coeff_op<Scalar>::Score_is_abs>
+{
+  EIGEN_EMPTY_STRUCT_CTOR(abs_knowing_score)
+  typedef typename NumTraits<Scalar>::Real result_type;
+  template<typename Scal>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scal&, const result_type& a) const { return a; }
+};
+
+/** \internal
+  * \brief Template functor to compute the squared absolute value of a scalar
+  *
+  * \sa class CwiseUnaryOp, Cwise::abs2
+  */
+template<typename Scalar> struct scalar_abs2_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_abs2_op)
+  typedef typename NumTraits<Scalar>::Real result_type;
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { return numext::abs2(a); }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
+  { return internal::pmul(a,a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_abs2_op<Scalar> >
+{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasAbs2 }; };
+
+/** \internal
+  * \brief Template functor to compute the conjugate of a complex value
+  *
+  * \sa class CwiseUnaryOp, MatrixBase::conjugate()
+  */
+template<typename Scalar> struct scalar_conjugate_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_conjugate_op)
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { using numext::conj; return conj(a); }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return internal::pconj(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_conjugate_op<Scalar> >
+{
+  enum {
+    Cost = NumTraits<Scalar>::IsComplex ? NumTraits<Scalar>::AddCost : 0,
+    PacketAccess = packet_traits<Scalar>::HasConj
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the phase angle of a complex
+  *
+  * \sa class CwiseUnaryOp, Cwise::arg
+  */
+template<typename Scalar> struct scalar_arg_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_arg_op)
+  typedef typename NumTraits<Scalar>::Real result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { using numext::arg; return arg(a); }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
+  { return internal::parg(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_arg_op<Scalar> >
+{
+  enum {
+    Cost = NumTraits<Scalar>::IsComplex ? 5 * NumTraits<Scalar>::MulCost : NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasArg
+  };
+};
+/** \internal
+  * \brief Template functor to cast a scalar to another type
+  *
+  * \sa class CwiseUnaryOp, MatrixBase::cast()
+  */
+template<typename Scalar, typename NewType>
+struct scalar_cast_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
+  typedef NewType result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const NewType operator() (const Scalar& a) const { return cast<Scalar, NewType>(a); }
+};
+template<typename Scalar, typename NewType>
+struct functor_traits<scalar_cast_op<Scalar,NewType> >
+{ enum { Cost = is_same<Scalar, NewType>::value ? 0 : NumTraits<NewType>::AddCost, PacketAccess = false }; };
+
+/** \internal
+  * \brief Template functor to extract the real part of a complex
+  *
+  * \sa class CwiseUnaryOp, MatrixBase::real()
+  */
+template<typename Scalar>
+struct scalar_real_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_real_op)
+  typedef typename NumTraits<Scalar>::Real result_type;
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return numext::real(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_real_op<Scalar> >
+{ enum { Cost = 0, PacketAccess = false }; };
+
+/** \internal
+  * \brief Template functor to extract the imaginary part of a complex
+  *
+  * \sa class CwiseUnaryOp, MatrixBase::imag()
+  */
+template<typename Scalar>
+struct scalar_imag_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_imag_op)
+  typedef typename NumTraits<Scalar>::Real result_type;
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return numext::imag(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_imag_op<Scalar> >
+{ enum { Cost = 0, PacketAccess = false }; };
+
+/** \internal
+  * \brief Template functor to extract the real part of a complex as a reference
+  *
+  * \sa class CwiseUnaryOp, MatrixBase::real()
+  */
+template<typename Scalar>
+struct scalar_real_ref_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_real_ref_op)
+  typedef typename NumTraits<Scalar>::Real result_type;
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE result_type& operator() (const Scalar& a) const { return numext::real_ref(*const_cast<Scalar*>(&a)); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_real_ref_op<Scalar> >
+{ enum { Cost = 0, PacketAccess = false }; };
+
+/** \internal
+  * \brief Template functor to extract the imaginary part of a complex as a reference
+  *
+  * \sa class CwiseUnaryOp, MatrixBase::imag()
+  */
+template<typename Scalar>
+struct scalar_imag_ref_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_imag_ref_op)
+  typedef typename NumTraits<Scalar>::Real result_type;
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE result_type& operator() (const Scalar& a) const { return numext::imag_ref(*const_cast<Scalar*>(&a)); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_imag_ref_op<Scalar> >
+{ enum { Cost = 0, PacketAccess = false }; };
+
+/** \internal
+  *
+  * \brief Template functor to compute the exponential of a scalar
+  *
+  * \sa class CwiseUnaryOp, Cwise::exp()
+  */
+template<typename Scalar> struct scalar_exp_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_exp_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::exp(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pexp(a); }
+};
+template <typename Scalar>
+struct functor_traits<scalar_exp_op<Scalar> > {
+  enum {
+    PacketAccess = packet_traits<Scalar>::HasExp,
+    // The following numbers are based on the AVX implementation.
+#ifdef EIGEN_VECTORIZE_FMA
+    // Haswell can issue 2 add/mul/madd per cycle.
+    Cost =
+    (sizeof(Scalar) == 4
+     // float: 8 pmadd, 4 pmul, 2 padd/psub, 6 other
+     ? (8 * NumTraits<Scalar>::AddCost + 6 * NumTraits<Scalar>::MulCost)
+     // double: 7 pmadd, 5 pmul, 3 padd/psub, 1 div,  13 other
+     : (14 * NumTraits<Scalar>::AddCost +
+        6 * NumTraits<Scalar>::MulCost +
+        scalar_div_cost<Scalar,packet_traits<Scalar>::HasDiv>::value))
+#else
+    Cost =
+    (sizeof(Scalar) == 4
+     // float: 7 pmadd, 6 pmul, 4 padd/psub, 10 other
+     ? (21 * NumTraits<Scalar>::AddCost + 13 * NumTraits<Scalar>::MulCost)
+     // double: 7 pmadd, 5 pmul, 3 padd/psub, 1 div,  13 other
+     : (23 * NumTraits<Scalar>::AddCost +
+        12 * NumTraits<Scalar>::MulCost +
+        scalar_div_cost<Scalar,packet_traits<Scalar>::HasDiv>::value))
+#endif
+  };
+};
+
+/** \internal
+  *
+  * \brief Template functor to compute the logarithm of a scalar
+  *
+  * \sa class CwiseUnaryOp, ArrayBase::log()
+  */
+template<typename Scalar> struct scalar_log_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_log_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::log(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plog(a); }
+};
+template <typename Scalar>
+struct functor_traits<scalar_log_op<Scalar> > {
+  enum {
+    PacketAccess = packet_traits<Scalar>::HasLog,
+    Cost =
+    (PacketAccess
+     // The following numbers are based on the AVX implementation.
+#ifdef EIGEN_VECTORIZE_FMA
+     // 8 pmadd, 6 pmul, 8 padd/psub, 16 other, can issue 2 add/mul/madd per cycle.
+     ? (20 * NumTraits<Scalar>::AddCost + 7 * NumTraits<Scalar>::MulCost)
+#else
+     // 8 pmadd, 6 pmul, 8 padd/psub, 20 other
+     ? (36 * NumTraits<Scalar>::AddCost + 14 * NumTraits<Scalar>::MulCost)
+#endif
+     // Measured cost of std::log.
+     : sizeof(Scalar)==4 ? 40 : 85)
+  };
+};
+
+/** \internal
+  *
+  * \brief Template functor to compute the logarithm of 1 plus a scalar value
+  *
+  * \sa class CwiseUnaryOp, ArrayBase::log1p()
+  */
+template<typename Scalar> struct scalar_log1p_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_log1p_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::log1p(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plog1p(a); }
+};
+template <typename Scalar>
+struct functor_traits<scalar_log1p_op<Scalar> > {
+  enum {
+    PacketAccess = packet_traits<Scalar>::HasLog1p,
+    Cost = functor_traits<scalar_log_op<Scalar> >::Cost // TODO measure cost of log1p
+  };
+};
+
+/** \internal
+  *
+  * \brief Template functor to compute the base-10 logarithm of a scalar
+  *
+  * \sa class CwiseUnaryOp, Cwise::log10()
+  */
+template<typename Scalar> struct scalar_log10_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_log10_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { EIGEN_USING_STD_MATH(log10) return log10(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plog10(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_log10_op<Scalar> >
+{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasLog10 }; };
+
+/** \internal
+  * \brief Template functor to compute the square root of a scalar
+  * \sa class CwiseUnaryOp, Cwise::sqrt()
+  */
+template<typename Scalar> struct scalar_sqrt_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sqrt_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::sqrt(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psqrt(a); }
+};
+template <typename Scalar>
+struct functor_traits<scalar_sqrt_op<Scalar> > {
+  enum {
+#if EIGEN_FAST_MATH
+    // The following numbers are based on the AVX implementation.
+    Cost = (sizeof(Scalar) == 8 ? 28
+                                // 4 pmul, 1 pmadd, 3 other
+                                : (3 * NumTraits<Scalar>::AddCost +
+                                   5 * NumTraits<Scalar>::MulCost)),
+#else
+    // The following numbers are based on min VSQRT throughput on Haswell.
+    Cost = (sizeof(Scalar) == 8 ? 28 : 14),
+#endif
+    PacketAccess = packet_traits<Scalar>::HasSqrt
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the reciprocal square root of a scalar
+  * \sa class CwiseUnaryOp, Cwise::rsqrt()
+  */
+template<typename Scalar> struct scalar_rsqrt_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_rsqrt_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return Scalar(1)/numext::sqrt(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::prsqrt(a); }
+};
+
+template<typename Scalar>
+struct functor_traits<scalar_rsqrt_op<Scalar> >
+{ enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasRsqrt
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the cosine of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::cos()
+  */
+template<typename Scalar> struct scalar_cos_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cos_op)
+  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return numext::cos(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pcos(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_cos_op<Scalar> >
+{
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasCos
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the sine of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::sin()
+  */
+template<typename Scalar> struct scalar_sin_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sin_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::sin(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psin(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_sin_op<Scalar> >
+{
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasSin
+  };
+};
+
+
+/** \internal
+  * \brief Template functor to compute the tan of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::tan()
+  */
+template<typename Scalar> struct scalar_tan_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_tan_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::tan(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::ptan(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_tan_op<Scalar> >
+{
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasTan
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the arc cosine of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::acos()
+  */
+template<typename Scalar> struct scalar_acos_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_acos_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::acos(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pacos(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_acos_op<Scalar> >
+{
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasACos
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the arc sine of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::asin()
+  */
+template<typename Scalar> struct scalar_asin_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_asin_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::asin(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pasin(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_asin_op<Scalar> >
+{
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasASin
+  };
+};
+
+
+/** \internal
+  * \brief Template functor to compute the atan of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::atan()
+  */
+template<typename Scalar> struct scalar_atan_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_atan_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::atan(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::patan(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_atan_op<Scalar> >
+{
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasATan
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the tanh of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::tanh()
+  */
+template <typename Scalar>
+struct scalar_tanh_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_tanh_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::tanh(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& x) const { return ptanh(x); }
+};
+
+template <typename Scalar>
+struct functor_traits<scalar_tanh_op<Scalar> > {
+  enum {
+    PacketAccess = packet_traits<Scalar>::HasTanh,
+    Cost = ( (EIGEN_FAST_MATH && is_same<Scalar,float>::value)
+// The following numbers are based on the AVX implementation,
+#ifdef EIGEN_VECTORIZE_FMA
+                // Haswell can issue 2 add/mul/madd per cycle.
+                // 9 pmadd, 2 pmul, 1 div, 2 other
+                ? (2 * NumTraits<Scalar>::AddCost +
+                   6 * NumTraits<Scalar>::MulCost +
+                   scalar_div_cost<Scalar,packet_traits<Scalar>::HasDiv>::value)
+#else
+                ? (11 * NumTraits<Scalar>::AddCost +
+                   11 * NumTraits<Scalar>::MulCost +
+                   scalar_div_cost<Scalar,packet_traits<Scalar>::HasDiv>::value)
+#endif
+                // This number assumes a naive implementation of tanh
+                : (6 * NumTraits<Scalar>::AddCost +
+                   3 * NumTraits<Scalar>::MulCost +
+                   2 * scalar_div_cost<Scalar,packet_traits<Scalar>::HasDiv>::value +
+                   functor_traits<scalar_exp_op<Scalar> >::Cost))
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the sinh of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::sinh()
+  */
+template<typename Scalar> struct scalar_sinh_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sinh_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::sinh(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psinh(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_sinh_op<Scalar> >
+{
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasSinh
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the cosh of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::cosh()
+  */
+template<typename Scalar> struct scalar_cosh_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cosh_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::cosh(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pcosh(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_cosh_op<Scalar> >
+{
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasCosh
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the inverse of a scalar
+  * \sa class CwiseUnaryOp, Cwise::inverse()
+  */
+template<typename Scalar>
+struct scalar_inverse_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_inverse_op)
+  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return Scalar(1)/a; }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const
+  { return internal::pdiv(pset1<Packet>(Scalar(1)),a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_inverse_op<Scalar> >
+{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasDiv }; };
+
+/** \internal
+  * \brief Template functor to compute the square of a scalar
+  * \sa class CwiseUnaryOp, Cwise::square()
+  */
+template<typename Scalar>
+struct scalar_square_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_square_op)
+  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a*a; }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const
+  { return internal::pmul(a,a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_square_op<Scalar> >
+{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasMul }; };
+
+/** \internal
+  * \brief Template functor to compute the cube of a scalar
+  * \sa class CwiseUnaryOp, Cwise::cube()
+  */
+template<typename Scalar>
+struct scalar_cube_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cube_op)
+  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a*a*a; }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const
+  { return internal::pmul(a,pmul(a,a)); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_cube_op<Scalar> >
+{ enum { Cost = 2*NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasMul }; };
+
+/** \internal
+  * \brief Template functor to compute the rounded value of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::round()
+  */
+template<typename Scalar> struct scalar_round_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_round_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return numext::round(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pround(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_round_op<Scalar> >
+{
+  enum {
+    Cost = NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasRound
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the floor of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::floor()
+  */
+template<typename Scalar> struct scalar_floor_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_floor_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return numext::floor(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pfloor(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_floor_op<Scalar> >
+{
+  enum {
+    Cost = NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasFloor
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the ceil of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::ceil()
+  */
+template<typename Scalar> struct scalar_ceil_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_ceil_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return numext::ceil(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pceil(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_ceil_op<Scalar> >
+{
+  enum {
+    Cost = NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasCeil
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute whether a scalar is NaN
+  * \sa class CwiseUnaryOp, ArrayBase::isnan()
+  */
+template<typename Scalar> struct scalar_isnan_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_isnan_op)
+  typedef bool result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return (numext::isnan)(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_isnan_op<Scalar> >
+{
+  enum {
+    Cost = NumTraits<Scalar>::MulCost,
+    PacketAccess = false
+  };
+};
+
+/** \internal
+  * \brief Template functor to check whether a scalar is +/-inf
+  * \sa class CwiseUnaryOp, ArrayBase::isinf()
+  */
+template<typename Scalar> struct scalar_isinf_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_isinf_op)
+  typedef bool result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return (numext::isinf)(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_isinf_op<Scalar> >
+{
+  enum {
+    Cost = NumTraits<Scalar>::MulCost,
+    PacketAccess = false
+  };
+};
+
+/** \internal
+  * \brief Template functor to check whether a scalar has a finite value
+  * \sa class CwiseUnaryOp, ArrayBase::isfinite()
+  */
+template<typename Scalar> struct scalar_isfinite_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_isfinite_op)
+  typedef bool result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return (numext::isfinite)(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_isfinite_op<Scalar> >
+{
+  enum {
+    Cost = NumTraits<Scalar>::MulCost,
+    PacketAccess = false
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the logical not of a boolean
+  *
+  * \sa class CwiseUnaryOp, ArrayBase::operator!
+  */
+template<typename Scalar> struct scalar_boolean_not_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_not_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a) const { return !a; }
+};
+template<typename Scalar>
+struct functor_traits<scalar_boolean_not_op<Scalar> > {
+  enum {
+    Cost = NumTraits<bool>::AddCost,
+    PacketAccess = false
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the signum of a scalar
+  * \sa class CwiseUnaryOp, Cwise::sign()
+  */
+template<typename Scalar,bool iscpx=(NumTraits<Scalar>::IsComplex!=0) > struct scalar_sign_op;
+template<typename Scalar>
+struct scalar_sign_op<Scalar,false> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sign_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const
+  {
+      return Scalar( (a>Scalar(0)) - (a<Scalar(0)) );
+  }
+  //TODO
+  //template <typename Packet>
+  //EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psign(a); }
+};
+template<typename Scalar>
+struct scalar_sign_op<Scalar,true> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sign_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const
+  {
+    typedef typename NumTraits<Scalar>::Real real_type;
+    real_type aa = numext::abs(a);
+    if (aa==real_type(0))
+      return Scalar(0);
+    aa = real_type(1)/aa;
+    return Scalar(real(a)*aa, imag(a)*aa );
+  }
+  //TODO
+  //template <typename Packet>
+  //EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psign(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_sign_op<Scalar> >
+{ enum {
+    Cost = 
+        NumTraits<Scalar>::IsComplex
+        ? ( 8*NumTraits<Scalar>::MulCost  ) // roughly
+        : ( 3*NumTraits<Scalar>::AddCost),
+    PacketAccess = packet_traits<Scalar>::HasSign
+  };
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_FUNCTORS_H
diff --git a/Eigen/src/Core/products/CMakeLists.txt b/Eigen/src/Core/products/CMakeLists.txt
deleted file mode 100644
index 21fc94ae3..000000000
--- a/Eigen/src/Core/products/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_Core_Product_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_Core_Product_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/products COMPONENT Devel
-  )
diff --git a/Eigen/src/Core/products/CoeffBasedProduct.h b/Eigen/src/Core/products/CoeffBasedProduct.h
deleted file mode 100644
index 2a9d65b94..000000000
--- a/Eigen/src/Core/products/CoeffBasedProduct.h
+++ /dev/null
@@ -1,476 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_COEFFBASED_PRODUCT_H
-#define EIGEN_COEFFBASED_PRODUCT_H
-
-namespace Eigen { 
-
-namespace internal {
-
-/*********************************************************************************
-*  Coefficient based product implementation.
-*  It is designed for the following use cases:
-*  - small fixed sizes
-*  - lazy products
-*********************************************************************************/
-
-/* Since the all the dimensions of the product are small, here we can rely
- * on the generic Assign mechanism to evaluate the product per coeff (or packet).
- *
- * Note that here the inner-loops should always be unrolled.
- */
-
-template<int Traversal, int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
-struct product_coeff_impl;
-
-template<int StorageOrder, int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
-struct product_packet_impl;
-
-template<typename LhsNested, typename RhsNested, int NestingFlags>
-struct traits<CoeffBasedProduct<LhsNested,RhsNested,NestingFlags> >
-{
-  typedef MatrixXpr XprKind;
-  typedef typename remove_all<LhsNested>::type _LhsNested;
-  typedef typename remove_all<RhsNested>::type _RhsNested;
-  typedef typename scalar_product_traits<typename _LhsNested::Scalar, typename _RhsNested::Scalar>::ReturnType Scalar;
-  typedef typename promote_storage_type<typename traits<_LhsNested>::StorageKind,
-                                           typename traits<_RhsNested>::StorageKind>::ret StorageKind;
-  typedef typename promote_index_type<typename traits<_LhsNested>::Index,
-                                         typename traits<_RhsNested>::Index>::type Index;
-
-  enum {
-      LhsCoeffReadCost = _LhsNested::CoeffReadCost,
-      RhsCoeffReadCost = _RhsNested::CoeffReadCost,
-      LhsFlags = _LhsNested::Flags,
-      RhsFlags = _RhsNested::Flags,
-
-      RowsAtCompileTime = _LhsNested::RowsAtCompileTime,
-      ColsAtCompileTime = _RhsNested::ColsAtCompileTime,
-      InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(_LhsNested::ColsAtCompileTime, _RhsNested::RowsAtCompileTime),
-
-      MaxRowsAtCompileTime = _LhsNested::MaxRowsAtCompileTime,
-      MaxColsAtCompileTime = _RhsNested::MaxColsAtCompileTime,
-
-      LhsRowMajor = LhsFlags & RowMajorBit,
-      RhsRowMajor = RhsFlags & RowMajorBit,
-
-      SameType = is_same<typename _LhsNested::Scalar,typename _RhsNested::Scalar>::value,
-
-      CanVectorizeRhs = RhsRowMajor && (RhsFlags & PacketAccessBit)
-                      && (ColsAtCompileTime == Dynamic
-                          || ( (ColsAtCompileTime % packet_traits<Scalar>::size) == 0
-                              && (RhsFlags&AlignedBit)
-                             )
-                         ),
-
-      CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit)
-                      && (RowsAtCompileTime == Dynamic
-                          || ( (RowsAtCompileTime % packet_traits<Scalar>::size) == 0
-                              && (LhsFlags&AlignedBit)
-                             )
-                         ),
-
-      EvalToRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1
-                     : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0
-                     : (RhsRowMajor && !CanVectorizeLhs),
-
-      Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & ~RowMajorBit)
-            | (EvalToRowMajor ? RowMajorBit : 0)
-            | NestingFlags
-            | (LhsFlags & RhsFlags & AlignedBit)
-            // TODO enable vectorization for mixed types
-            | (SameType && (CanVectorizeLhs || CanVectorizeRhs) ? PacketAccessBit : 0),
-
-      CoeffReadCost = InnerSize == Dynamic ? Dynamic
-                    : InnerSize == 0 ? 0
-                    : InnerSize * (NumTraits<Scalar>::MulCost + LhsCoeffReadCost + RhsCoeffReadCost)
-                      + (InnerSize - 1) * NumTraits<Scalar>::AddCost,
-
-      /* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside
-      * of Product. If the Product itself is not a packet-access expression, there is still a chance that the inner
-      * loop of the product might be vectorized. This is the meaning of CanVectorizeInner. Since it doesn't affect
-      * the Flags, it is safe to make this value depend on ActualPacketAccessBit, that doesn't affect the ABI.
-      */
-      CanVectorizeInner =    SameType
-                          && LhsRowMajor
-                          && (!RhsRowMajor)
-                          && (LhsFlags & RhsFlags & ActualPacketAccessBit)
-                          && (LhsFlags & RhsFlags & AlignedBit)
-                          && (InnerSize % packet_traits<Scalar>::size == 0)
-    };
-};
-
-} // end namespace internal
-
-template<typename LhsNested, typename RhsNested, int NestingFlags>
-class CoeffBasedProduct
-  : internal::no_assignment_operator,
-    public MatrixBase<CoeffBasedProduct<LhsNested, RhsNested, NestingFlags> >
-{
-  public:
-
-    typedef MatrixBase<CoeffBasedProduct> Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(CoeffBasedProduct)
-    typedef typename Base::PlainObject PlainObject;
-
-  private:
-
-    typedef typename internal::traits<CoeffBasedProduct>::_LhsNested _LhsNested;
-    typedef typename internal::traits<CoeffBasedProduct>::_RhsNested _RhsNested;
-
-    enum {
-      PacketSize = internal::packet_traits<Scalar>::size,
-      InnerSize  = internal::traits<CoeffBasedProduct>::InnerSize,
-      Unroll = CoeffReadCost != Dynamic && CoeffReadCost <= EIGEN_UNROLLING_LIMIT,
-      CanVectorizeInner = internal::traits<CoeffBasedProduct>::CanVectorizeInner
-    };
-
-    typedef internal::product_coeff_impl<CanVectorizeInner ? InnerVectorizedTraversal : DefaultTraversal,
-                                   Unroll ? InnerSize : Dynamic,
-                                   _LhsNested, _RhsNested, Scalar> ScalarCoeffImpl;
-
-    typedef CoeffBasedProduct<LhsNested,RhsNested,NestByRefBit> LazyCoeffBasedProductType;
-
-  public:
-
-    inline CoeffBasedProduct(const CoeffBasedProduct& other)
-      : Base(), m_lhs(other.m_lhs), m_rhs(other.m_rhs)
-    {}
-
-    template<typename Lhs, typename Rhs>
-    inline CoeffBasedProduct(const Lhs& lhs, const Rhs& rhs)
-      : m_lhs(lhs), m_rhs(rhs)
-    {
-      // we don't allow taking products of matrices of different real types, as that wouldn't be vectorizable.
-      // We still allow to mix T and complex<T>.
-      EIGEN_STATIC_ASSERT((internal::scalar_product_traits<typename Lhs::RealScalar, typename Rhs::RealScalar>::Defined),
-        YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
-      eigen_assert(lhs.cols() == rhs.rows()
-        && "invalid matrix product"
-        && "if you wanted a coeff-wise or a dot product use the respective explicit functions");
-    }
-
-    EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); }
-    EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); }
-
-    EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const
-    {
-      Scalar res;
-      ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res);
-      return res;
-    }
-
-    /* Allow index-based non-packet access. It is impossible though to allow index-based packed access,
-     * which is why we don't set the LinearAccessBit.
-     */
-    EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
-    {
-      Scalar res;
-      const Index row = RowsAtCompileTime == 1 ? 0 : index;
-      const Index col = RowsAtCompileTime == 1 ? index : 0;
-      ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res);
-      return res;
-    }
-
-    template<int LoadMode>
-    EIGEN_STRONG_INLINE const PacketScalar packet(Index row, Index col) const
-    {
-      PacketScalar res;
-      internal::product_packet_impl<Flags&RowMajorBit ? RowMajor : ColMajor,
-                              Unroll ? InnerSize : Dynamic,
-                              _LhsNested, _RhsNested, PacketScalar, LoadMode>
-        ::run(row, col, m_lhs, m_rhs, res);
-      return res;
-    }
-
-    // Implicit conversion to the nested type (trigger the evaluation of the product)
-    EIGEN_STRONG_INLINE operator const PlainObject& () const
-    {
-      m_result.lazyAssign(*this);
-      return m_result;
-    }
-
-    const _LhsNested& lhs() const { return m_lhs; }
-    const _RhsNested& rhs() const { return m_rhs; }
-
-    const Diagonal<const LazyCoeffBasedProductType,0> diagonal() const
-    { return reinterpret_cast<const LazyCoeffBasedProductType&>(*this); }
-
-    template<int DiagonalIndex>
-    const Diagonal<const LazyCoeffBasedProductType,DiagonalIndex> diagonal() const
-    { return reinterpret_cast<const LazyCoeffBasedProductType&>(*this); }
-
-    const Diagonal<const LazyCoeffBasedProductType,Dynamic> diagonal(Index index) const
-    { return reinterpret_cast<const LazyCoeffBasedProductType&>(*this).diagonal(index); }
-
-  protected:
-    typename internal::add_const_on_value_type<LhsNested>::type m_lhs;
-    typename internal::add_const_on_value_type<RhsNested>::type m_rhs;
-
-    mutable PlainObject m_result;
-};
-
-namespace internal {
-
-// here we need to overload the nested rule for products
-// such that the nested type is a const reference to a plain matrix
-template<typename Lhs, typename Rhs, int N, typename PlainObject>
-struct nested<CoeffBasedProduct<Lhs,Rhs,EvalBeforeNestingBit|EvalBeforeAssigningBit>, N, PlainObject>
-{
-  typedef PlainObject const& type;
-};
-
-/***************************************************************************
-* Normal product .coeff() implementation (with meta-unrolling)
-***************************************************************************/
-
-/**************************************
-*** Scalar path  - no vectorization ***
-**************************************/
-
-template<int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
-struct product_coeff_impl<DefaultTraversal, UnrollingIndex, Lhs, Rhs, RetScalar>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
-  {
-    product_coeff_impl<DefaultTraversal, UnrollingIndex-1, Lhs, Rhs, RetScalar>::run(row, col, lhs, rhs, res);
-    res += lhs.coeff(row, UnrollingIndex-1) * rhs.coeff(UnrollingIndex-1, col);
-  }
-};
-
-template<typename Lhs, typename Rhs, typename RetScalar>
-struct product_coeff_impl<DefaultTraversal, 1, Lhs, Rhs, RetScalar>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
-  {
-    res = lhs.coeff(row, 0) * rhs.coeff(0, col);
-  }
-};
-
-template<typename Lhs, typename Rhs, typename RetScalar>
-struct product_coeff_impl<DefaultTraversal, 0, Lhs, Rhs, RetScalar>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, RetScalar &res)
-  {
-    res = RetScalar(0);
-  }
-};
-
-template<typename Lhs, typename Rhs, typename RetScalar>
-struct product_coeff_impl<DefaultTraversal, Dynamic, Lhs, Rhs, RetScalar>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar& res)
-  {
-    res = (lhs.row(row).transpose().cwiseProduct( rhs.col(col) )).sum();
-  }
-};
-
-/*******************************************
-*** Scalar path with inner vectorization ***
-*******************************************/
-
-template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet>
-struct product_coeff_vectorized_unroller
-{
-  typedef typename Lhs::Index Index;
-  enum { PacketSize = packet_traits<typename Lhs::Scalar>::size };
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres)
-  {
-    product_coeff_vectorized_unroller<UnrollingIndex-PacketSize, Lhs, Rhs, Packet>::run(row, col, lhs, rhs, pres);
-    pres = padd(pres, pmul( lhs.template packet<Aligned>(row, UnrollingIndex) , rhs.template packet<Aligned>(UnrollingIndex, col) ));
-  }
-};
-
-template<typename Lhs, typename Rhs, typename Packet>
-struct product_coeff_vectorized_unroller<0, Lhs, Rhs, Packet>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres)
-  {
-    pres = pmul(lhs.template packet<Aligned>(row, 0) , rhs.template packet<Aligned>(0, col));
-  }
-};
-
-template<typename Lhs, typename Rhs, typename RetScalar>
-struct product_coeff_impl<InnerVectorizedTraversal, 0, Lhs, Rhs, RetScalar>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, RetScalar &res)
-  {
-    res = 0;
-  }
-};
-
-template<int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
-struct product_coeff_impl<InnerVectorizedTraversal, UnrollingIndex, Lhs, Rhs, RetScalar>
-{
-  typedef typename Lhs::PacketScalar Packet;
-  typedef typename Lhs::Index Index;
-  enum { PacketSize = packet_traits<typename Lhs::Scalar>::size };
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
-  {
-    Packet pres;
-    product_coeff_vectorized_unroller<UnrollingIndex-PacketSize, Lhs, Rhs, Packet>::run(row, col, lhs, rhs, pres);
-    res = predux(pres);
-  }
-};
-
-template<typename Lhs, typename Rhs, int LhsRows = Lhs::RowsAtCompileTime, int RhsCols = Rhs::ColsAtCompileTime>
-struct product_coeff_vectorized_dyn_selector
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
-  {
-    res = lhs.row(row).transpose().cwiseProduct(rhs.col(col)).sum();
-  }
-};
-
-// NOTE the 3 following specializations are because taking .col(0) on a vector is a bit slower
-// NOTE maybe they are now useless since we have a specialization for Block<Matrix>
-template<typename Lhs, typename Rhs, int RhsCols>
-struct product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,RhsCols>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index /*row*/, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
-  {
-    res = lhs.transpose().cwiseProduct(rhs.col(col)).sum();
-  }
-};
-
-template<typename Lhs, typename Rhs, int LhsRows>
-struct product_coeff_vectorized_dyn_selector<Lhs,Rhs,LhsRows,1>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
-  {
-    res = lhs.row(row).transpose().cwiseProduct(rhs).sum();
-  }
-};
-
-template<typename Lhs, typename Rhs>
-struct product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,1>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
-  {
-    res = lhs.transpose().cwiseProduct(rhs).sum();
-  }
-};
-
-template<typename Lhs, typename Rhs, typename RetScalar>
-struct product_coeff_impl<InnerVectorizedTraversal, Dynamic, Lhs, Rhs, RetScalar>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
-  {
-    product_coeff_vectorized_dyn_selector<Lhs,Rhs>::run(row, col, lhs, rhs, res);
-  }
-};
-
-/*******************
-*** Packet path  ***
-*******************/
-
-template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
-struct product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res)
-  {
-    product_packet_impl<RowMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, res);
-    res =  pmadd(pset1<Packet>(lhs.coeff(row, UnrollingIndex-1)), rhs.template packet<LoadMode>(UnrollingIndex-1, col), res);
-  }
-};
-
-template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
-struct product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res)
-  {
-    product_packet_impl<ColMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, res);
-    res =  pmadd(lhs.template packet<LoadMode>(row, UnrollingIndex-1), pset1<Packet>(rhs.coeff(UnrollingIndex-1, col)), res);
-  }
-};
-
-template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
-struct product_packet_impl<RowMajor, 1, Lhs, Rhs, Packet, LoadMode>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res)
-  {
-    res = pmul(pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
-  }
-};
-
-template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
-struct product_packet_impl<ColMajor, 1, Lhs, Rhs, Packet, LoadMode>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res)
-  {
-    res = pmul(lhs.template packet<LoadMode>(row, 0), pset1<Packet>(rhs.coeff(0, col)));
-  }
-};
-
-template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
-struct product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Packet &res)
-  {
-    res = pset1<Packet>(0);
-  }
-};
-
-template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
-struct product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Packet &res)
-  {
-    res = pset1<Packet>(0);
-  }
-};
-
-template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
-struct product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet& res)
-  {
-    res = pset1<Packet>(0);
-    for(Index i = 0; i < lhs.cols(); ++i)
-      res =  pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode>(i, col), res);
-  }
-};
-
-template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
-struct product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet& res)
-  {
-    res = pset1<Packet>(0);
-    for(Index i = 0; i < lhs.cols(); ++i)
-      res =  pmadd(lhs.template packet<LoadMode>(row, i), pset1<Packet>(rhs.coeff(i, col)), res);
-  }
-};
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_COEFFBASED_PRODUCT_H
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index bcdca5b0d..45230bce5 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -10,8 +10,9 @@
 #ifndef EIGEN_GENERAL_BLOCK_PANEL_H
 #define EIGEN_GENERAL_BLOCK_PANEL_H
 
-namespace Eigen { 
-  
+
+namespace Eigen {
+
 namespace internal {
 
 template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false>
@@ -24,29 +25,51 @@ inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff
   return a<=0 ? b : a;
 }
 
+#if EIGEN_ARCH_i386_OR_x86_64
+const std::ptrdiff_t defaultL1CacheSize = 32*1024;
+const std::ptrdiff_t defaultL2CacheSize = 256*1024;
+const std::ptrdiff_t defaultL3CacheSize = 2*1024*1024;
+#else
+const std::ptrdiff_t defaultL1CacheSize = 16*1024;
+const std::ptrdiff_t defaultL2CacheSize = 512*1024;
+const std::ptrdiff_t defaultL3CacheSize = 512*1024;
+#endif
+
 /** \internal */
-inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdiff_t* l2=0)
-{
-  static std::ptrdiff_t m_l1CacheSize = 0;
-  static std::ptrdiff_t m_l2CacheSize = 0;
-  if(m_l2CacheSize==0)
-  {
-    m_l1CacheSize = manage_caching_sizes_helper(queryL1CacheSize(),8 * 1024);
-    m_l2CacheSize = manage_caching_sizes_helper(queryTopLevelCacheSize(),1*1024*1024);
+struct CacheSizes {
+  CacheSizes(): m_l1(-1),m_l2(-1),m_l3(-1) {
+    int l1CacheSize, l2CacheSize, l3CacheSize;
+    queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize);
+    m_l1 = manage_caching_sizes_helper(l1CacheSize, defaultL1CacheSize);
+    m_l2 = manage_caching_sizes_helper(l2CacheSize, defaultL2CacheSize);
+    m_l3 = manage_caching_sizes_helper(l3CacheSize, defaultL3CacheSize);
   }
-  
+
+  std::ptrdiff_t m_l1;
+  std::ptrdiff_t m_l2;
+  std::ptrdiff_t m_l3;
+};
+
+
+/** \internal */
+inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3)
+{
+  static CacheSizes m_cacheSizes;
+
   if(action==SetAction)
   {
     // set the cpu cache size and cache all block sizes from a global cache size in byte
     eigen_internal_assert(l1!=0 && l2!=0);
-    m_l1CacheSize = *l1;
-    m_l2CacheSize = *l2;
+    m_cacheSizes.m_l1 = *l1;
+    m_cacheSizes.m_l2 = *l2;
+    m_cacheSizes.m_l3 = *l3;
   }
   else if(action==GetAction)
   {
     eigen_internal_assert(l1!=0 && l2!=0);
-    *l1 = m_l1CacheSize;
-    *l2 = m_l2CacheSize;
+    *l1 = m_cacheSizes.m_l1;
+    *l2 = m_cacheSizes.m_l2;
+    *l3 = m_cacheSizes.m_l3;
   }
   else
   {
@@ -54,6 +77,206 @@ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdi
   }
 }
 
+/* Helper for computeProductBlockingSizes.
+ *
+ * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
+ * this function computes the blocking size parameters along the respective dimensions
+ * for matrix products and related algorithms. The blocking sizes depends on various
+ * parameters:
+ * - the L1 and L2 cache sizes,
+ * - the register level blocking sizes defined by gebp_traits,
+ * - the number of scalars that fit into a packet (when vectorization is enabled).
+ *
+ * \sa setCpuCacheSizes */
+
+template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
+void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1)
+{
+  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+
+  // Explanations:
+  // Let's recall that the product algorithms form mc x kc vertical panels A' on the lhs and
+  // kc x nc blocks B' on the rhs. B' has to fit into L2/L3 cache. Moreover, A' is processed
+  // per mr x kc horizontal small panels where mr is the blocking size along the m dimension
+  // at the register level. This small horizontal panel has to stay within L1 cache.
+  std::ptrdiff_t l1, l2, l3;
+  manage_caching_sizes(GetAction, &l1, &l2, &l3);
+
+  if (num_threads > 1) {
+    typedef typename Traits::ResScalar ResScalar;
+    enum {
+      kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
+      ksub = Traits::mr * Traits::nr * sizeof(ResScalar),
+      kr = 8,
+      mr = Traits::mr,
+      nr = Traits::nr
+    };
+    // Increasing k gives us more time to prefetch the content of the "C"
+    // registers. However once the latency is hidden there is no point in
+    // increasing the value of k, so we'll cap it at 320 (value determined
+    // experimentally).
+    const Index k_cache = (numext::mini<Index>)((l1-ksub)/kdiv, 320);
+    if (k_cache < k) {
+      k = k_cache - (k_cache % kr);
+      eigen_internal_assert(k > 0);
+    }
+
+    const Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
+    const Index n_per_thread = numext::div_ceil(n, num_threads);
+    if (n_cache <= n_per_thread) {
+      // Don't exceed the capacity of the l2 cache.
+      eigen_internal_assert(n_cache >= static_cast<Index>(nr));
+      n = n_cache - (n_cache % nr);
+      eigen_internal_assert(n > 0);
+    } else {
+      n = (numext::mini<Index>)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr));
+    }
+
+    if (l3 > l2) {
+      // l3 is shared between all cores, so we'll give each thread its own chunk of l3.
+      const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
+      const Index m_per_thread = numext::div_ceil(m, num_threads);
+      if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
+        m = m_cache - (m_cache % mr);
+        eigen_internal_assert(m > 0);
+      } else {
+        m = (numext::mini<Index>)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr));
+      }
+    }
+  }
+  else {
+    // In unit tests we do not want to use extra large matrices,
+    // so we reduce the cache size to check the blocking strategy is not flawed
+#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
+    l1 = 9*1024;
+    l2 = 32*1024;
+    l3 = 512*1024;
+#endif
+
+    // Early return for small problems because the computation below are time consuming for small problems.
+    // Perhaps it would make more sense to consider k*n*m??
+    // Note that for very tiny problem, this function should be bypassed anyway
+    // because we use the coefficient-based implementation for them.
+    if((numext::maxi)(k,(numext::maxi)(m,n))<48)
+      return;
+
+    typedef typename Traits::ResScalar ResScalar;
+    enum {
+      k_peeling = 8,
+      k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
+      k_sub = Traits::mr * Traits::nr * sizeof(ResScalar)
+    };
+
+    // ---- 1st level of blocking on L1, yields kc ----
+
+    // Blocking on the third dimension (i.e., k) is chosen so that an horizontal panel
+    // of size mr x kc of the lhs plus a vertical panel of kc x nr of the rhs both fits within L1 cache.
+    // We also include a register-level block of the result (mx x nr).
+    // (In an ideal world only the lhs panel would stay in L1)
+    // Moreover, kc has to be a multiple of 8 to be compatible with loop peeling, leading to a maximum blocking size of:
+    const Index max_kc = numext::maxi<Index>(((l1-k_sub)/k_div) & (~(k_peeling-1)),1);
+    const Index old_k = k;
+    if(k>max_kc)
+    {
+      // We are really blocking on the third dimension:
+      // -> reduce blocking size to make sure the last block is as large as possible
+      //    while keeping the same number of sweeps over the result.
+      k = (k%max_kc)==0 ? max_kc
+                        : max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1)));
+
+      eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same");
+    }
+
+    // ---- 2nd level of blocking on max(L2,L3), yields nc ----
+
+    // TODO find a reliable way to get the actual amount of cache per core to use for 2nd level blocking, that is:
+    //      actual_l2 = max(l2, l3/nb_core_sharing_l3)
+    // The number below is quite conservative: it is better to underestimate the cache size rather than overestimating it)
+    // For instance, it corresponds to 6MB of L3 shared among 4 cores.
+    #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
+    const Index actual_l2 = l3;
+    #else
+    const Index actual_l2 = 1572864; // == 1.5 MB
+    #endif
+
+    // Here, nc is chosen such that a block of kc x nc of the rhs fit within half of L2.
+    // The second half is implicitly reserved to access the result and lhs coefficients.
+    // When k<max_kc, then nc can arbitrarily growth. In practice, it seems to be fruitful
+    // to limit this growth: we bound nc to growth by a factor x1.5.
+    // However, if the entire lhs block fit within L1, then we are not going to block on the rows at all,
+    // and it becomes fruitful to keep the packed rhs blocks in L1 if there is enough remaining space.
+    Index max_nc;
+    const Index lhs_bytes = m * k * sizeof(LhsScalar);
+    const Index remaining_l1 = l1- k_sub - lhs_bytes;
+    if(remaining_l1 >= Index(Traits::nr*sizeof(RhsScalar))*k)
+    {
+      // L1 blocking
+      max_nc = remaining_l1 / (k*sizeof(RhsScalar));
+    }
+    else
+    {
+      // L2 blocking
+      max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar));
+    }
+    // WARNING Below, we assume that Traits::nr is a power of two.
+    Index nc = numext::mini<Index>(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1));
+    if(n>nc)
+    {
+      // We are really blocking over the columns:
+      // -> reduce blocking size to make sure the last block is as large as possible
+      //    while keeping the same number of sweeps over the packed lhs.
+      //    Here we allow one more sweep if this gives us a perfect match, thus the commented "-1"
+      n = (n%nc)==0 ? nc
+                    : (nc - Traits::nr * ((nc/*-1*/-(n%nc))/(Traits::nr*(n/nc+1))));
+    }
+    else if(old_k==k)
+    {
+      // So far, no blocking at all, i.e., kc==k, and nc==n.
+      // In this case, let's perform a blocking over the rows such that the packed lhs data is kept in cache L1/L2
+      // TODO: part of this blocking strategy is now implemented within the kernel itself, so the L1-based heuristic here should be obsolete.
+      Index problem_size = k*n*sizeof(LhsScalar);
+      Index actual_lm = actual_l2;
+      Index max_mc = m;
+      if(problem_size<=1024)
+      {
+        // problem is small enough to keep in L1
+        // Let's choose m such that lhs's block fit in 1/3 of L1
+        actual_lm = l1;
+      }
+      else if(l3!=0 && problem_size<=32768)
+      {
+        // we have both L2 and L3, and problem is small enough to be kept in L2
+        // Let's choose m such that lhs's block fit in 1/3 of L2
+        actual_lm = l2;
+        max_mc = (numext::mini<Index>)(576,max_mc);
+      }
+      Index mc = (numext::mini<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc);
+      if (mc > Traits::mr) mc -= mc % Traits::mr;
+      else if (mc==0) return;
+      m = (m%mc)==0 ? mc
+                    : (mc - Traits::mr * ((mc/*-1*/-(m%mc))/(Traits::mr*(m/mc+1))));
+    }
+  }
+}
+
+template <typename Index>
+inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n)
+{
+#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
+  if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
+    k = numext::mini<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
+    m = numext::mini<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
+    n = numext::mini<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
+    return true;
+  }
+#else
+  EIGEN_UNUSED_VARIABLE(k)
+  EIGEN_UNUSED_VARIABLE(m)
+  EIGEN_UNUSED_VARIABLE(n)
+#endif
+  return false;
+}
+
 /** \brief Computes the blocking parameters for a m x k times k x n matrix product
   *
   * \param[in,out] k Input: the third dimension of the product. Output: the blocking size along the same dimension.
@@ -62,48 +285,30 @@ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdi
   *
   * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
   * this function computes the blocking size parameters along the respective dimensions
-  * for matrix products and related algorithms. The blocking sizes depends on various
-  * parameters:
-  * - the L1 and L2 cache sizes,
-  * - the register level blocking sizes defined by gebp_traits,
-  * - the number of scalars that fit into a packet (when vectorization is enabled).
+  * for matrix products and related algorithms.
+  *
+  * The blocking size parameters may be evaluated:
+  *   - either by a heuristic based on cache sizes;
+  *   - or using fixed prescribed values (for testing purposes).
   *
   * \sa setCpuCacheSizes */
-template<typename LhsScalar, typename RhsScalar, int KcFactor, typename SizeType>
-void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n)
-{
-  EIGEN_UNUSED_VARIABLE(n);
-  // Explanations:
-  // Let's recall the product algorithms form kc x nc horizontal panels B' on the rhs and
-  // mc x kc blocks A' on the lhs. A' has to fit into L2 cache. Moreover, B' is processed
-  // per kc x nr vertical small panels where nr is the blocking size along the n dimension
-  // at the register level. For vectorization purpose, these small vertical panels are unpacked,
-  // e.g., each coefficient is replicated to fit a packet. This small vertical panel has to
-  // stay in L1 cache.
-  std::ptrdiff_t l1, l2;
 
-  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
-  enum {
-    kdiv = KcFactor * 2 * Traits::nr
-         * Traits::RhsProgress * sizeof(RhsScalar),
-    mr = gebp_traits<LhsScalar,RhsScalar>::mr,
-    mr_mask = (0xffffffff/mr)*mr
-  };
-
-  manage_caching_sizes(GetAction, &l1, &l2);
-  k = std::min<SizeType>(k, l1/kdiv);
-  SizeType _m = k>0 ? l2/(4 * sizeof(LhsScalar) * k) : 0;
-  if(_m<m) m = _m & mr_mask;
+template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
+void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
+{
+  if (!useSpecificBlockingSizes(k, m, n)) {
+    evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor, Index>(k, m, n, num_threads);
+  }
 }
 
-template<typename LhsScalar, typename RhsScalar, typename SizeType>
-inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n)
+template<typename LhsScalar, typename RhsScalar, typename Index>
+inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
 {
-  computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n);
+  computeProductBlockingSizes<LhsScalar,RhsScalar,1,Index>(k, m, n, num_threads);
 }
 
-#ifdef EIGEN_HAS_FUSE_CJMADD
-  #define MADD(CJ,A,B,C,T)  C = CJ.pmadd(A,B,C);
+#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
+  #define CJMADD(CJ,A,B,C,T)  C = CJ.pmadd(A,B,C);
 #else
 
   // FIXME (a bit overkill maybe ?)
@@ -128,8 +333,8 @@ inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n)
     gebp_madd_selector<CJ,A,B,C,T>::run(cj,a,b,c,t);
   }
 
-  #define MADD(CJ,A,B,C,T)  gebp_madd(CJ,A,B,C,T);
-//   #define MADD(CJ,A,B,C,T)  T = B; T = CJ.pmul(A,T); C = padd(C,T);
+  #define CJMADD(CJ,A,B,C,T)  gebp_madd(CJ,A,B,C,T);
+//   #define CJMADD(CJ,A,B,C,T)  T = B; T = CJ.pmul(A,T); C = padd(C,T);
 #endif
 
 /* Vectorization logic
@@ -148,7 +353,7 @@ class gebp_traits
 public:
   typedef _LhsScalar LhsScalar;
   typedef _RhsScalar RhsScalar;
-  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
 
   enum {
     ConjLhs = _ConjLhs,
@@ -160,16 +365,22 @@ public:
     
     NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
 
-    // register block size along the N direction (must be either 2 or 4)
-    nr = NumberOfRegisters/4,
+    // register block size along the N direction must be 1 or 4
+    nr = 4,
 
     // register block size along the M direction (currently, this one cannot be modified)
-    mr = 2 * LhsPacketSize,
+    default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
+#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
+    // we assume 16 registers
+    // See bug 992, if the scalar type is not vectorizable but that EIGEN_HAS_SINGLE_INSTRUCTION_MADD is defined,
+    // then using 3*LhsPacketSize triggers non-implemented paths in syrk.
+    mr = Vectorizable ? 3*LhsPacketSize : default_mr,
+#else
+    mr = default_mr,
+#endif
     
-    WorkSpaceFactor = nr * RhsPacketSize,
-
     LhsProgress = LhsPacketSize,
-    RhsProgress = RhsPacketSize
+    RhsProgress = 1
   };
 
   typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
@@ -186,36 +397,67 @@ public:
   {
     p = pset1<ResPacket>(ResScalar(0));
   }
-
-  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
+  
+  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
+  {
+    pbroadcast4(b, b0, b1, b2, b3);
+  }
+  
+//   EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
+//   {
+//     pbroadcast2(b, b0, b1);
+//   }
+  
+  template<typename RhsPacketType>
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
+  {
+    dest = pset1<RhsPacketType>(*b);
+  }
+  
+  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
   {
-    for(DenseIndex k=0; k<n; k++)
-      pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
+    dest = ploadquad<RhsPacket>(b);
   }
 
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
+  template<typename LhsPacketType>
+  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacketType& dest) const
   {
-    dest = pload<RhsPacket>(b);
+    dest = pload<LhsPacketType>(a);
   }
 
-  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
+  template<typename LhsPacketType>
+  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
   {
-    dest = pload<LhsPacket>(a);
+    dest = ploadu<LhsPacketType>(a);
   }
 
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, AccPacket& tmp) const
+  template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, AccPacketType& tmp) const
   {
-    tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);
+    conj_helper<LhsPacketType,RhsPacketType,ConjLhs,ConjRhs> cj;
+    // It would be a lot cleaner to call pmadd all the time. Unfortunately if we
+    // let gcc allocate the register in which to store the result of the pmul
+    // (in the case where there is no FMA) gcc fails to figure out how to avoid
+    // spilling register.
+#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+    EIGEN_UNUSED_VARIABLE(tmp);
+    c = cj.pmadd(a,b,c);
+#else
+    tmp = b; tmp = cj.pmul(a,tmp); c = padd(c,tmp);
+#endif
   }
 
   EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
   {
     r = pmadd(c,alpha,r);
   }
+  
+  template<typename ResPacketHalf>
+  EIGEN_STRONG_INLINE void acc(const ResPacketHalf& c, const ResPacketHalf& alpha, ResPacketHalf& r) const
+  {
+    r = pmadd(c,alpha,r);
+  }
 
-protected:
-//   conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
-//   conj_helper<LhsPacket,RhsPacket,ConjLhs,ConjRhs> pcj;
 };
 
 template<typename RealScalar, bool _ConjLhs>
@@ -224,7 +466,7 @@ class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false>
 public:
   typedef std::complex<RealScalar> LhsScalar;
   typedef RealScalar RhsScalar;
-  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
 
   enum {
     ConjLhs = _ConjLhs,
@@ -235,12 +477,16 @@ public:
     ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
     
     NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
-    nr = NumberOfRegisters/4,
-    mr = 2 * LhsPacketSize,
-    WorkSpaceFactor = nr*RhsPacketSize,
+    nr = 4,
+#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
+    // we assume 16 registers
+    mr = 3*LhsPacketSize,
+#else
+    mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
+#endif
 
     LhsProgress = LhsPacketSize,
-    RhsProgress = RhsPacketSize
+    RhsProgress = 1
   };
 
   typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
@@ -258,15 +504,14 @@ public:
     p = pset1<ResPacket>(ResScalar(0));
   }
 
-  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
   {
-    for(DenseIndex k=0; k<n; k++)
-      pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
+    dest = pset1<RhsPacket>(*b);
   }
-
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
+  
+  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
   {
-    dest = pload<RhsPacket>(b);
+    dest = pset1<RhsPacket>(*b);
   }
 
   EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
@@ -274,6 +519,21 @@ public:
     dest = pload<LhsPacket>(a);
   }
 
+  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
+  {
+    dest = ploadu<LhsPacket>(a);
+  }
+
+  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
+  {
+    pbroadcast4(b, b0, b1, b2, b3);
+  }
+  
+//   EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
+//   {
+//     pbroadcast2(b, b0, b1);
+//   }
+
   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
   {
     madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
@@ -281,7 +541,12 @@ public:
 
   EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
   {
+#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+    EIGEN_UNUSED_VARIABLE(tmp);
+    c.v = pmadd(a.v,b,c.v);
+#else
     tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp);
+#endif
   }
 
   EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
@@ -298,6 +563,38 @@ protected:
   conj_helper<ResPacket,ResPacket,ConjLhs,false> cj;
 };
 
+template<typename Packet>
+struct DoublePacket
+{
+  Packet first;
+  Packet second;
+};
+
+template<typename Packet>
+DoublePacket<Packet> padd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
+{
+  DoublePacket<Packet> res;
+  res.first  = padd(a.first, b.first);
+  res.second = padd(a.second,b.second);
+  return res;
+}
+
+template<typename Packet>
+const DoublePacket<Packet>& predux_downto4(const DoublePacket<Packet> &a)
+{
+  return a;
+}
+
+template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > { typedef DoublePacket<Packet> half; };
+// template<typename Packet>
+// DoublePacket<Packet> pmadd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
+// {
+//   DoublePacket<Packet> res;
+//   res.first  = padd(a.first, b.first);
+//   res.second = padd(a.second,b.second);
+//   return res;
+// }
+
 template<typename RealScalar, bool _ConjLhs, bool _ConjRhs>
 class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs >
 {
@@ -314,60 +611,80 @@ public:
                 && packet_traits<Scalar>::Vectorizable,
     RealPacketSize  = Vectorizable ? packet_traits<RealScalar>::size : 1,
     ResPacketSize   = Vectorizable ? packet_traits<ResScalar>::size : 1,
-    
-    nr = 2,
-    mr = 2 * ResPacketSize,
-    WorkSpaceFactor = Vectorizable ? 2*nr*RealPacketSize : nr,
+    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
+    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
+
+    // FIXME: should depend on NumberOfRegisters
+    nr = 4,
+    mr = ResPacketSize,
 
     LhsProgress = ResPacketSize,
-    RhsProgress = Vectorizable ? 2*ResPacketSize : 1
+    RhsProgress = 1
   };
   
   typedef typename packet_traits<RealScalar>::type RealPacket;
   typedef typename packet_traits<Scalar>::type     ScalarPacket;
-  struct DoublePacket
-  {
-    RealPacket first;
-    RealPacket second;
-  };
+  typedef DoublePacket<RealPacket> DoublePacketType;
 
   typedef typename conditional<Vectorizable,RealPacket,  Scalar>::type LhsPacket;
-  typedef typename conditional<Vectorizable,DoublePacket,Scalar>::type RhsPacket;
+  typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type RhsPacket;
   typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket;
-  typedef typename conditional<Vectorizable,DoublePacket,Scalar>::type AccPacket;
+  typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type AccPacket;
   
   EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }
 
-  EIGEN_STRONG_INLINE void initAcc(DoublePacket& p)
+  EIGEN_STRONG_INLINE void initAcc(DoublePacketType& p)
   {
     p.first   = pset1<RealPacket>(RealScalar(0));
     p.second  = pset1<RealPacket>(RealScalar(0));
   }
 
-  /* Unpack the rhs coeff such that each complex coefficient is spread into
-   * two packects containing respectively the real and imaginary coefficient
-   * duplicated as many time as needed: (x+iy) => [x, ..., x] [y, ..., y]
-   */
-  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const Scalar* rhs, Scalar* b)
+  // Scalar path
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const
   {
-    for(DenseIndex k=0; k<n; k++)
-    {
-      if(Vectorizable)
-      {
-        pstore1<RealPacket>((RealScalar*)&b[k*ResPacketSize*2+0],             real(rhs[k]));
-        pstore1<RealPacket>((RealScalar*)&b[k*ResPacketSize*2+ResPacketSize], imag(rhs[k]));
-      }
-      else
-        b[k] = rhs[k];
-    }
+    dest = pset1<ResPacket>(*b);
   }
 
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const { dest = *b; }
-
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket& dest) const
+  // Vectorized path
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacketType& dest) const
+  {
+    dest.first  = pset1<RealPacket>(real(*b));
+    dest.second = pset1<RealPacket>(imag(*b));
+  }
+  
+  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const
+  {
+    loadRhs(b,dest);
+  }
+  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const
+  {
+    eigen_internal_assert(unpacket_traits<ScalarPacket>::size<=4);
+    loadRhs(b,dest);
+  }
+  
+  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
   {
-    dest.first  = pload<RealPacket>((const RealScalar*)b);
-    dest.second = pload<RealPacket>((const RealScalar*)(b+ResPacketSize));
+    // FIXME not sure that's the best way to implement it!
+    loadRhs(b+0, b0);
+    loadRhs(b+1, b1);
+    loadRhs(b+2, b2);
+    loadRhs(b+3, b3);
+  }
+  
+  // Vectorized path
+  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacketType& b0, DoublePacketType& b1)
+  {
+    // FIXME not sure that's the best way to implement it!
+    loadRhs(b+0, b0);
+    loadRhs(b+1, b1);
+  }
+  
+  // Scalar path
+  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsScalar& b0, RhsScalar& b1)
+  {
+    // FIXME not sure that's the best way to implement it!
+    loadRhs(b+0, b0);
+    loadRhs(b+1, b1);
   }
 
   // nothing special here
@@ -376,7 +693,12 @@ public:
     dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
   }
 
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacket& c, RhsPacket& /*tmp*/) const
+  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
+  {
+    dest = ploadu<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
+  }
+
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacketType& c, RhsPacket& /*tmp*/) const
   {
     c.first   = padd(pmul(a,b.first), c.first);
     c.second  = padd(pmul(a,b.second),c.second);
@@ -389,7 +711,7 @@ public:
   
   EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }
   
-  EIGEN_STRONG_INLINE void acc(const DoublePacket& c, const ResPacket& alpha, ResPacket& r) const
+  EIGEN_STRONG_INLINE void acc(const DoublePacketType& c, const ResPacket& alpha, ResPacket& r) const
   {
     // assemble c
     ResPacket tmp;
@@ -440,12 +762,12 @@ public:
     ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
     
     NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
+    // FIXME: should depend on NumberOfRegisters
     nr = 4,
-    mr = 2*ResPacketSize,
-    WorkSpaceFactor = nr*RhsPacketSize,
+    mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*ResPacketSize,
 
     LhsProgress = ResPacketSize,
-    RhsProgress = ResPacketSize
+    RhsProgress = 1
   };
 
   typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
@@ -463,21 +785,38 @@ public:
     p = pset1<ResPacket>(ResScalar(0));
   }
 
-  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
   {
-    for(DenseIndex k=0; k<n; k++)
-      pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
+    dest = pset1<RhsPacket>(*b);
   }
-
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
+  
+  void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
   {
-    dest = pload<RhsPacket>(b);
+    pbroadcast4(b, b0, b1, b2, b3);
   }
+  
+//   EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
+//   {
+//     // FIXME not sure that's the best way to implement it!
+//     b0 = pload1<RhsPacket>(b+0);
+//     b1 = pload1<RhsPacket>(b+1);
+//   }
 
   EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
   {
     dest = ploaddup<LhsPacket>(a);
   }
+  
+  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
+  {
+    eigen_internal_assert(unpacket_traits<RhsPacket>::size<=4);
+    loadRhs(b,dest);
+  }
+
+  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
+  {
+    dest = ploaddup<LhsPacket>(a);
+  }
 
   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
   {
@@ -486,7 +825,13 @@ public:
 
   EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
   {
+#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+    EIGEN_UNUSED_VARIABLE(tmp);
+    c.v = pmadd(a,b.v,c.v);
+#else
     tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp);
+#endif
+    
   }
 
   EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
@@ -510,7 +855,7 @@ protected:
  *  |real |cplx | no vectorization yet, would require to pack A with duplication
  *  |cplx |real | easy vectorization
  */
-template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
 struct gebp_kernel
 {
   typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
@@ -520,6 +865,15 @@ struct gebp_kernel
   typedef typename Traits::ResPacket ResPacket;
   typedef typename Traits::AccPacket AccPacket;
 
+  typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> SwappedTraits;
+  typedef typename SwappedTraits::ResScalar SResScalar;
+  typedef typename SwappedTraits::LhsPacket SLhsPacket;
+  typedef typename SwappedTraits::RhsPacket SRhsPacket;
+  typedef typename SwappedTraits::ResPacket SResPacket;
+  typedef typename SwappedTraits::AccPacket SAccPacket;
+
+  typedef typename DataMapper::LinearMapper LinearMapper;
+
   enum {
     Vectorizable  = Traits::Vectorizable,
     LhsProgress   = Traits::LhsProgress,
@@ -528,571 +882,788 @@ struct gebp_kernel
   };
 
   EIGEN_DONT_INLINE
-  void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha,
-                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, RhsScalar* unpackedB=0);
+  void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
+                  Index rows, Index depth, Index cols, ResScalar alpha,
+                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
 };
 
-template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
 EIGEN_DONT_INLINE
-void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
-  ::operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha,
-               Index strideA, Index strideB, Index offsetA, Index offsetB, RhsScalar* unpackedB)
+void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,ConjugateRhs>
+  ::operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
+               Index rows, Index depth, Index cols, ResScalar alpha,
+               Index strideA, Index strideB, Index offsetA, Index offsetB)
   {
     Traits traits;
+    SwappedTraits straits;
     
     if(strideA==-1) strideA = depth;
     if(strideB==-1) strideB = depth;
     conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
-//     conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
-    Index packet_cols = (cols/nr) * nr;
-    const Index peeled_mc = (rows/mr)*mr;
-    // FIXME:
-    const Index peeled_mc2 = peeled_mc + (rows-peeled_mc >= LhsProgress ? LhsProgress : 0);
-    const Index peeled_kc = (depth/4)*4;
-
-    if(unpackedB==0)
-      unpackedB = const_cast<RhsScalar*>(blockB - strideB * nr * RhsProgress);
-
-    // loops on each micro vertical panel of rhs (depth x nr)
-    for(Index j2=0; j2<packet_cols; j2+=nr)
+    Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
+    const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0;
+    const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0;
+    const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? (rows/(1*LhsProgress))*(1*LhsProgress) : 0;
+    enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell)
+    const Index peeled_kc  = depth & ~(pk-1);
+    const Index prefetch_res_offset = 32/sizeof(ResScalar);    
+//     const Index depth2     = depth & ~1;
+
+    //---------- Process 3 * LhsProgress rows at once ----------
+    // This corresponds to 3*LhsProgress x nr register blocks.
+    // Usually, make sense only with FMA
+    if(mr>=3*Traits::LhsProgress)
     {
-      traits.unpackRhs(depth*nr,&blockB[j2*strideB+offsetB*nr],unpackedB); 
-
-      // loops on each largest micro horizontal panel of lhs (mr x depth)
-      // => we select a mr x nr micro block of res which is entirely
-      //    stored into mr/packet_size x nr registers.
-      for(Index i=0; i<peeled_mc; i+=mr)
+      // Here, the general idea is to loop on each largest micro horizontal panel of the lhs (3*Traits::LhsProgress x depth)
+      // and on each largest micro vertical panel of the rhs (depth * nr).
+      // Blocking sizes, i.e., 'depth' has been computed so that the micro horizontal panel of the lhs fit in L1.
+      // However, if depth is too small, we can extend the number of rows of these horizontal panels.
+      // This actual number of rows is computed as follow:
+      const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function.
+      // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
+      // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess),
+      // or because we are testing specific blocking sizes.
+      const Index actual_panel_rows = (3*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 3*LhsProgress) ));
+      for(Index i1=0; i1<peeled_mc3; i1+=actual_panel_rows)
       {
-        const LhsScalar* blA = &blockA[i*strideA+offsetA*mr];
-        prefetch(&blA[0]);
-
-        // gets res block as register
-        AccPacket C0, C1, C2, C3, C4, C5, C6, C7;
-                  traits.initAcc(C0);
-                  traits.initAcc(C1);
-        if(nr==4) traits.initAcc(C2);
-        if(nr==4) traits.initAcc(C3);
-                  traits.initAcc(C4);
-                  traits.initAcc(C5);
-        if(nr==4) traits.initAcc(C6);
-        if(nr==4) traits.initAcc(C7);
-
-        ResScalar* r0 = &res[(j2+0)*resStride + i];
-        ResScalar* r1 = r0 + resStride;
-        ResScalar* r2 = r1 + resStride;
-        ResScalar* r3 = r2 + resStride;
-
-        prefetch(r0+16);
-        prefetch(r1+16);
-        prefetch(r2+16);
-        prefetch(r3+16);
-
-        // performs "inner" product
-        // TODO let's check wether the folowing peeled loop could not be
-        //      optimized via optimal prefetching from one loop to the other
-        const RhsScalar* blB = unpackedB;
-        for(Index k=0; k<peeled_kc; k+=4)
+        const Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc3);
+        for(Index j2=0; j2<packet_cols4; j2+=nr)
         {
-          if(nr==2)
-          {
-            LhsPacket A0, A1;
-            RhsPacket B_0;
-            RhsPacket T0;
-            
-EIGEN_ASM_COMMENT("mybegin2");
-            traits.loadLhs(&blA[0*LhsProgress], A0);
-            traits.loadLhs(&blA[1*LhsProgress], A1);
-            traits.loadRhs(&blB[0*RhsProgress], B_0);
-            traits.madd(A0,B_0,C0,T0);
-            traits.madd(A1,B_0,C4,B_0);
-            traits.loadRhs(&blB[1*RhsProgress], B_0);
-            traits.madd(A0,B_0,C1,T0);
-            traits.madd(A1,B_0,C5,B_0);
-
-            traits.loadLhs(&blA[2*LhsProgress], A0);
-            traits.loadLhs(&blA[3*LhsProgress], A1);
-            traits.loadRhs(&blB[2*RhsProgress], B_0);
-            traits.madd(A0,B_0,C0,T0);
-            traits.madd(A1,B_0,C4,B_0);
-            traits.loadRhs(&blB[3*RhsProgress], B_0);
-            traits.madd(A0,B_0,C1,T0);
-            traits.madd(A1,B_0,C5,B_0);
-
-            traits.loadLhs(&blA[4*LhsProgress], A0);
-            traits.loadLhs(&blA[5*LhsProgress], A1);
-            traits.loadRhs(&blB[4*RhsProgress], B_0);
-            traits.madd(A0,B_0,C0,T0);
-            traits.madd(A1,B_0,C4,B_0);
-            traits.loadRhs(&blB[5*RhsProgress], B_0);
-            traits.madd(A0,B_0,C1,T0);
-            traits.madd(A1,B_0,C5,B_0);
-
-            traits.loadLhs(&blA[6*LhsProgress], A0);
-            traits.loadLhs(&blA[7*LhsProgress], A1);
-            traits.loadRhs(&blB[6*RhsProgress], B_0);
-            traits.madd(A0,B_0,C0,T0);
-            traits.madd(A1,B_0,C4,B_0);
-            traits.loadRhs(&blB[7*RhsProgress], B_0);
-            traits.madd(A0,B_0,C1,T0);
-            traits.madd(A1,B_0,C5,B_0);
-EIGEN_ASM_COMMENT("myend");
-          }
-          else
+          for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
           {
-EIGEN_ASM_COMMENT("mybegin4");
-            LhsPacket A0, A1;
-            RhsPacket B_0, B1, B2, B3;
-            RhsPacket T0;
-            
-            traits.loadLhs(&blA[0*LhsProgress], A0);
-            traits.loadLhs(&blA[1*LhsProgress], A1);
-            traits.loadRhs(&blB[0*RhsProgress], B_0);
-            traits.loadRhs(&blB[1*RhsProgress], B1);
-
-            traits.madd(A0,B_0,C0,T0);
-            traits.loadRhs(&blB[2*RhsProgress], B2);
-            traits.madd(A1,B_0,C4,B_0);
-            traits.loadRhs(&blB[3*RhsProgress], B3);
-            traits.loadRhs(&blB[4*RhsProgress], B_0);
-            traits.madd(A0,B1,C1,T0);
-            traits.madd(A1,B1,C5,B1);
-            traits.loadRhs(&blB[5*RhsProgress], B1);
-            traits.madd(A0,B2,C2,T0);
-            traits.madd(A1,B2,C6,B2);
-            traits.loadRhs(&blB[6*RhsProgress], B2);
-            traits.madd(A0,B3,C3,T0);
-            traits.loadLhs(&blA[2*LhsProgress], A0);
-            traits.madd(A1,B3,C7,B3);
-            traits.loadLhs(&blA[3*LhsProgress], A1);
-            traits.loadRhs(&blB[7*RhsProgress], B3);
-            traits.madd(A0,B_0,C0,T0);
-            traits.madd(A1,B_0,C4,B_0);
-            traits.loadRhs(&blB[8*RhsProgress], B_0);
-            traits.madd(A0,B1,C1,T0);
-            traits.madd(A1,B1,C5,B1);
-            traits.loadRhs(&blB[9*RhsProgress], B1);
-            traits.madd(A0,B2,C2,T0);
-            traits.madd(A1,B2,C6,B2);
-            traits.loadRhs(&blB[10*RhsProgress], B2);
-            traits.madd(A0,B3,C3,T0);
-            traits.loadLhs(&blA[4*LhsProgress], A0);
-            traits.madd(A1,B3,C7,B3);
-            traits.loadLhs(&blA[5*LhsProgress], A1);
-            traits.loadRhs(&blB[11*RhsProgress], B3);
-
-            traits.madd(A0,B_0,C0,T0);
-            traits.madd(A1,B_0,C4,B_0);
-            traits.loadRhs(&blB[12*RhsProgress], B_0);
-            traits.madd(A0,B1,C1,T0);
-            traits.madd(A1,B1,C5,B1);
-            traits.loadRhs(&blB[13*RhsProgress], B1);
-            traits.madd(A0,B2,C2,T0);
-            traits.madd(A1,B2,C6,B2);
-            traits.loadRhs(&blB[14*RhsProgress], B2);
-            traits.madd(A0,B3,C3,T0);
-            traits.loadLhs(&blA[6*LhsProgress], A0);
-            traits.madd(A1,B3,C7,B3);
-            traits.loadLhs(&blA[7*LhsProgress], A1);
-            traits.loadRhs(&blB[15*RhsProgress], B3);
-            traits.madd(A0,B_0,C0,T0);
-            traits.madd(A1,B_0,C4,B_0);
-            traits.madd(A0,B1,C1,T0);
-            traits.madd(A1,B1,C5,B1);
-            traits.madd(A0,B2,C2,T0);
-            traits.madd(A1,B2,C6,B2);
-            traits.madd(A0,B3,C3,T0);
-            traits.madd(A1,B3,C7,B3);
-          }
+          
+          // We selected a 3*Traits::LhsProgress x nr micro block of res which is entirely
+          // stored into 3 x nr registers.
+          
+          const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*LhsProgress)];
+          prefetch(&blA[0]);
+
+          // gets res block as register
+          AccPacket C0, C1, C2,  C3,
+                    C4, C5, C6,  C7,
+                    C8, C9, C10, C11;
+          traits.initAcc(C0);  traits.initAcc(C1);  traits.initAcc(C2);  traits.initAcc(C3);
+          traits.initAcc(C4);  traits.initAcc(C5);  traits.initAcc(C6);  traits.initAcc(C7);
+          traits.initAcc(C8);  traits.initAcc(C9);  traits.initAcc(C10); traits.initAcc(C11);
+
+          LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
+          LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
+          LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
+          LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
+
+          r0.prefetch(0);
+          r1.prefetch(0);
+          r2.prefetch(0);
+          r3.prefetch(0);
+
+          // performs "inner" products
+          const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
+          prefetch(&blB[0]);
+          LhsPacket A0, A1;
 
-          blB += 4*nr*RhsProgress;
-          blA += 4*mr;
-        }
-        // process remaining peeled loop
-        for(Index k=peeled_kc; k<depth; k++)
-        {
-          if(nr==2)
+          for(Index k=0; k<peeled_kc; k+=pk)
           {
-            LhsPacket A0, A1;
-            RhsPacket B_0;
-            RhsPacket T0;
-
-            traits.loadLhs(&blA[0*LhsProgress], A0);
-            traits.loadLhs(&blA[1*LhsProgress], A1);
-            traits.loadRhs(&blB[0*RhsProgress], B_0);
-            traits.madd(A0,B_0,C0,T0);
-            traits.madd(A1,B_0,C4,B_0);
-            traits.loadRhs(&blB[1*RhsProgress], B_0);
-            traits.madd(A0,B_0,C1,T0);
-            traits.madd(A1,B_0,C5,B_0);
+            EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
+            RhsPacket B_0, T0;
+            LhsPacket A2;
+
+#define EIGEN_GEBP_ONESTEP(K) \
+            do { \
+              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
+              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+              internal::prefetch(blA+(3*K+16)*LhsProgress); \
+              if (EIGEN_ARCH_ARM) { internal::prefetch(blB+(4*K+16)*RhsProgress); } /* Bug 953 */ \
+              traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0);  \
+              traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1);  \
+              traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2);  \
+              traits.loadRhs(blB + (0+4*K)*Traits::RhsProgress, B_0); \
+              traits.madd(A0, B_0, C0, T0); \
+              traits.madd(A1, B_0, C4, T0); \
+              traits.madd(A2, B_0, C8, B_0); \
+              traits.loadRhs(blB + (1+4*K)*Traits::RhsProgress, B_0); \
+              traits.madd(A0, B_0, C1, T0); \
+              traits.madd(A1, B_0, C5, T0); \
+              traits.madd(A2, B_0, C9, B_0); \
+              traits.loadRhs(blB + (2+4*K)*Traits::RhsProgress, B_0); \
+              traits.madd(A0, B_0, C2,  T0); \
+              traits.madd(A1, B_0, C6,  T0); \
+              traits.madd(A2, B_0, C10, B_0); \
+              traits.loadRhs(blB + (3+4*K)*Traits::RhsProgress, B_0); \
+              traits.madd(A0, B_0, C3 , T0); \
+              traits.madd(A1, B_0, C7,  T0); \
+              traits.madd(A2, B_0, C11, B_0); \
+              EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
+            } while(false)
+
+            internal::prefetch(blB);
+            EIGEN_GEBP_ONESTEP(0);
+            EIGEN_GEBP_ONESTEP(1);
+            EIGEN_GEBP_ONESTEP(2);
+            EIGEN_GEBP_ONESTEP(3);
+            EIGEN_GEBP_ONESTEP(4);
+            EIGEN_GEBP_ONESTEP(5);
+            EIGEN_GEBP_ONESTEP(6);
+            EIGEN_GEBP_ONESTEP(7);
+
+            blB += pk*4*RhsProgress;
+            blA += pk*3*Traits::LhsProgress;
+
+            EIGEN_ASM_COMMENT("end gebp micro kernel 3pX4");
           }
-          else
+          // process remaining peeled loop
+          for(Index k=peeled_kc; k<depth; k++)
           {
-            LhsPacket A0, A1;
-            RhsPacket B_0, B1, B2, B3;
-            RhsPacket T0;
-
-            traits.loadLhs(&blA[0*LhsProgress], A0);
-            traits.loadLhs(&blA[1*LhsProgress], A1);
-            traits.loadRhs(&blB[0*RhsProgress], B_0);
-            traits.loadRhs(&blB[1*RhsProgress], B1);
-
-            traits.madd(A0,B_0,C0,T0);
-            traits.loadRhs(&blB[2*RhsProgress], B2);
-            traits.madd(A1,B_0,C4,B_0);
-            traits.loadRhs(&blB[3*RhsProgress], B3);
-            traits.madd(A0,B1,C1,T0);
-            traits.madd(A1,B1,C5,B1);
-            traits.madd(A0,B2,C2,T0);
-            traits.madd(A1,B2,C6,B2);
-            traits.madd(A0,B3,C3,T0);
-            traits.madd(A1,B3,C7,B3);
+            RhsPacket B_0, T0;
+            LhsPacket A2;
+            EIGEN_GEBP_ONESTEP(0);
+            blB += 4*RhsProgress;
+            blA += 3*Traits::LhsProgress;
           }
 
-          blB += nr*RhsProgress;
-          blA += mr;
-        }
+#undef EIGEN_GEBP_ONESTEP
 
-        if(nr==4)
-        {
-          ResPacket R0, R1, R2, R3, R4, R5, R6;
+          ResPacket R0, R1, R2;
           ResPacket alphav = pset1<ResPacket>(alpha);
 
-          R0 = ploadu<ResPacket>(r0);
-          R1 = ploadu<ResPacket>(r1);
-          R2 = ploadu<ResPacket>(r2);
-          R3 = ploadu<ResPacket>(r3);
-          R4 = ploadu<ResPacket>(r0 + ResPacketSize);
-          R5 = ploadu<ResPacket>(r1 + ResPacketSize);
-          R6 = ploadu<ResPacket>(r2 + ResPacketSize);
+          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
+          R2 = r0.loadPacket(2 * Traits::ResPacketSize);
           traits.acc(C0, alphav, R0);
-          pstoreu(r0, R0);
-          R0 = ploadu<ResPacket>(r3 + ResPacketSize);
-
-          traits.acc(C1, alphav, R1);
-          traits.acc(C2, alphav, R2);
-          traits.acc(C3, alphav, R3);
-          traits.acc(C4, alphav, R4);
-          traits.acc(C5, alphav, R5);
-          traits.acc(C6, alphav, R6);
-          traits.acc(C7, alphav, R0);
-          
-          pstoreu(r1, R1);
-          pstoreu(r2, R2);
-          pstoreu(r3, R3);
-          pstoreu(r0 + ResPacketSize, R4);
-          pstoreu(r1 + ResPacketSize, R5);
-          pstoreu(r2 + ResPacketSize, R6);
-          pstoreu(r3 + ResPacketSize, R0);
+          traits.acc(C4, alphav, R1);
+          traits.acc(C8, alphav, R2);
+          r0.storePacket(0 * Traits::ResPacketSize, R0);
+          r0.storePacket(1 * Traits::ResPacketSize, R1);
+          r0.storePacket(2 * Traits::ResPacketSize, R2);
+
+          R0 = r1.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r1.loadPacket(1 * Traits::ResPacketSize);
+          R2 = r1.loadPacket(2 * Traits::ResPacketSize);
+          traits.acc(C1, alphav, R0);
+          traits.acc(C5, alphav, R1);
+          traits.acc(C9, alphav, R2);
+          r1.storePacket(0 * Traits::ResPacketSize, R0);
+          r1.storePacket(1 * Traits::ResPacketSize, R1);
+          r1.storePacket(2 * Traits::ResPacketSize, R2);
+
+          R0 = r2.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r2.loadPacket(1 * Traits::ResPacketSize);
+          R2 = r2.loadPacket(2 * Traits::ResPacketSize);
+          traits.acc(C2, alphav, R0);
+          traits.acc(C6, alphav, R1);
+          traits.acc(C10, alphav, R2);
+          r2.storePacket(0 * Traits::ResPacketSize, R0);
+          r2.storePacket(1 * Traits::ResPacketSize, R1);
+          r2.storePacket(2 * Traits::ResPacketSize, R2);
+
+          R0 = r3.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r3.loadPacket(1 * Traits::ResPacketSize);
+          R2 = r3.loadPacket(2 * Traits::ResPacketSize);
+          traits.acc(C3, alphav, R0);
+          traits.acc(C7, alphav, R1);
+          traits.acc(C11, alphav, R2);
+          r3.storePacket(0 * Traits::ResPacketSize, R0);
+          r3.storePacket(1 * Traits::ResPacketSize, R1);
+          r3.storePacket(2 * Traits::ResPacketSize, R2);          
+          }
         }
-        else
+
+        // Deal with remaining columns of the rhs
+        for(Index j2=packet_cols4; j2<cols; j2++)
         {
-          ResPacket R0, R1, R4;
+          for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
+          {
+          // One column at a time
+          const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)];
+          prefetch(&blA[0]);
+
+          // gets res block as register
+          AccPacket C0, C4, C8;
+          traits.initAcc(C0);
+          traits.initAcc(C4);
+          traits.initAcc(C8);
+
+          LinearMapper r0 = res.getLinearMapper(i, j2);
+          r0.prefetch(0);
+
+          // performs "inner" products
+          const RhsScalar* blB = &blockB[j2*strideB+offsetB];
+          LhsPacket A0, A1, A2;
+          
+          for(Index k=0; k<peeled_kc; k+=pk)
+          {
+            EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1");
+            RhsPacket B_0;
+#define EIGEN_GEBGP_ONESTEP(K) \
+            do { \
+              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
+              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+              traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0);  \
+              traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1);  \
+              traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2);  \
+              traits.loadRhs(&blB[(0+K)*RhsProgress], B_0);   \
+              traits.madd(A0, B_0, C0, B_0); \
+              traits.madd(A1, B_0, C4, B_0); \
+              traits.madd(A2, B_0, C8, B_0); \
+              EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
+            } while(false)
+        
+            EIGEN_GEBGP_ONESTEP(0);
+            EIGEN_GEBGP_ONESTEP(1);
+            EIGEN_GEBGP_ONESTEP(2);
+            EIGEN_GEBGP_ONESTEP(3);
+            EIGEN_GEBGP_ONESTEP(4);
+            EIGEN_GEBGP_ONESTEP(5);
+            EIGEN_GEBGP_ONESTEP(6);
+            EIGEN_GEBGP_ONESTEP(7);
+
+            blB += pk*RhsProgress;
+            blA += pk*3*Traits::LhsProgress;
+
+            EIGEN_ASM_COMMENT("end gebp micro kernel 3pX1");
+          }
+
+          // process remaining peeled loop
+          for(Index k=peeled_kc; k<depth; k++)
+          {
+            RhsPacket B_0;
+            EIGEN_GEBGP_ONESTEP(0);
+            blB += RhsProgress;
+            blA += 3*Traits::LhsProgress;
+          }
+#undef EIGEN_GEBGP_ONESTEP
+          ResPacket R0, R1, R2;
           ResPacket alphav = pset1<ResPacket>(alpha);
 
-          R0 = ploadu<ResPacket>(r0);
-          R1 = ploadu<ResPacket>(r1);
-          R4 = ploadu<ResPacket>(r0 + ResPacketSize);
+          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
+          R2 = r0.loadPacket(2 * Traits::ResPacketSize);
           traits.acc(C0, alphav, R0);
-          pstoreu(r0, R0);
-          R0 = ploadu<ResPacket>(r1 + ResPacketSize);
-          traits.acc(C1, alphav, R1);
-          traits.acc(C4, alphav, R4);
-          traits.acc(C5, alphav, R0);
-          pstoreu(r1, R1);
-          pstoreu(r0 + ResPacketSize, R4);
-          pstoreu(r1 + ResPacketSize, R0);
+          traits.acc(C4, alphav, R1);
+          traits.acc(C8, alphav, R2);
+          r0.storePacket(0 * Traits::ResPacketSize, R0);
+          r0.storePacket(1 * Traits::ResPacketSize, R1);
+          r0.storePacket(2 * Traits::ResPacketSize, R2);          
+          }
         }
-        
       }
-      
-      if(rows-peeled_mc>=LhsProgress)
+    }
+
+    //---------- Process 2 * LhsProgress rows at once ----------
+    if(mr>=2*Traits::LhsProgress)
+    {
+      const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function.
+      // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
+      // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess),
+      // or because we are testing specific blocking sizes.
+      Index actual_panel_rows = (2*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 2*LhsProgress) ));
+
+      for(Index i1=peeled_mc3; i1<peeled_mc2; i1+=actual_panel_rows)
       {
-        Index i = peeled_mc;
-        const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress];
-        prefetch(&blA[0]);
-
-        // gets res block as register
-        AccPacket C0, C1, C2, C3;
-                  traits.initAcc(C0);
-                  traits.initAcc(C1);
-        if(nr==4) traits.initAcc(C2);
-        if(nr==4) traits.initAcc(C3);
-
-        // performs "inner" product
-        const RhsScalar* blB = unpackedB;
-        for(Index k=0; k<peeled_kc; k+=4)
+        Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc2);
+        for(Index j2=0; j2<packet_cols4; j2+=nr)
         {
-          if(nr==2)
+          for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
           {
-            LhsPacket A0;
-            RhsPacket B_0, B1;
+          
+          // We selected a 2*Traits::LhsProgress x nr micro block of res which is entirely
+          // stored into 2 x nr registers.
+          
+          const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
+          prefetch(&blA[0]);
+
+          // gets res block as register
+          AccPacket C0, C1, C2, C3,
+                    C4, C5, C6, C7;
+          traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
+          traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
+
+          LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
+          LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
+          LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
+          LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
+
+          r0.prefetch(prefetch_res_offset);
+          r1.prefetch(prefetch_res_offset);
+          r2.prefetch(prefetch_res_offset);
+          r3.prefetch(prefetch_res_offset);
+
+          // performs "inner" products
+          const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
+          prefetch(&blB[0]);
+          LhsPacket A0, A1;
 
-            traits.loadLhs(&blA[0*LhsProgress], A0);
-            traits.loadRhs(&blB[0*RhsProgress], B_0);
-            traits.loadRhs(&blB[1*RhsProgress], B1);
-            traits.madd(A0,B_0,C0,B_0);
-            traits.loadRhs(&blB[2*RhsProgress], B_0);
-            traits.madd(A0,B1,C1,B1);
-            traits.loadLhs(&blA[1*LhsProgress], A0);
-            traits.loadRhs(&blB[3*RhsProgress], B1);
-            traits.madd(A0,B_0,C0,B_0);
-            traits.loadRhs(&blB[4*RhsProgress], B_0);
-            traits.madd(A0,B1,C1,B1);
-            traits.loadLhs(&blA[2*LhsProgress], A0);
-            traits.loadRhs(&blB[5*RhsProgress], B1);
-            traits.madd(A0,B_0,C0,B_0);
-            traits.loadRhs(&blB[6*RhsProgress], B_0);
-            traits.madd(A0,B1,C1,B1);
-            traits.loadLhs(&blA[3*LhsProgress], A0);
-            traits.loadRhs(&blB[7*RhsProgress], B1);
-            traits.madd(A0,B_0,C0,B_0);
-            traits.madd(A0,B1,C1,B1);
-          }
-          else
+          for(Index k=0; k<peeled_kc; k+=pk)
           {
-            LhsPacket A0;
-            RhsPacket B_0, B1, B2, B3;
-
-            traits.loadLhs(&blA[0*LhsProgress], A0);
-            traits.loadRhs(&blB[0*RhsProgress], B_0);
-            traits.loadRhs(&blB[1*RhsProgress], B1);
-
-            traits.madd(A0,B_0,C0,B_0);
-            traits.loadRhs(&blB[2*RhsProgress], B2);
-            traits.loadRhs(&blB[3*RhsProgress], B3);
-            traits.loadRhs(&blB[4*RhsProgress], B_0);
-            traits.madd(A0,B1,C1,B1);
-            traits.loadRhs(&blB[5*RhsProgress], B1);
-            traits.madd(A0,B2,C2,B2);
-            traits.loadRhs(&blB[6*RhsProgress], B2);
-            traits.madd(A0,B3,C3,B3);
-            traits.loadLhs(&blA[1*LhsProgress], A0);
-            traits.loadRhs(&blB[7*RhsProgress], B3);
-            traits.madd(A0,B_0,C0,B_0);
-            traits.loadRhs(&blB[8*RhsProgress], B_0);
-            traits.madd(A0,B1,C1,B1);
-            traits.loadRhs(&blB[9*RhsProgress], B1);
-            traits.madd(A0,B2,C2,B2);
-            traits.loadRhs(&blB[10*RhsProgress], B2);
-            traits.madd(A0,B3,C3,B3);
-            traits.loadLhs(&blA[2*LhsProgress], A0);
-            traits.loadRhs(&blB[11*RhsProgress], B3);
-
-            traits.madd(A0,B_0,C0,B_0);
-            traits.loadRhs(&blB[12*RhsProgress], B_0);
-            traits.madd(A0,B1,C1,B1);
-            traits.loadRhs(&blB[13*RhsProgress], B1);
-            traits.madd(A0,B2,C2,B2);
-            traits.loadRhs(&blB[14*RhsProgress], B2);
-            traits.madd(A0,B3,C3,B3);
-
-            traits.loadLhs(&blA[3*LhsProgress], A0);
-            traits.loadRhs(&blB[15*RhsProgress], B3);
-            traits.madd(A0,B_0,C0,B_0);
-            traits.madd(A0,B1,C1,B1);
-            traits.madd(A0,B2,C2,B2);
-            traits.madd(A0,B3,C3,B3);
+            EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4");
+            RhsPacket B_0, B1, B2, B3, T0;
+
+   #define EIGEN_GEBGP_ONESTEP(K) \
+            do {                                                                \
+              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4");        \
+              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+              traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0);                    \
+              traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1);                    \
+              traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3);  \
+              traits.madd(A0, B_0, C0, T0);                                     \
+              traits.madd(A1, B_0, C4, B_0);                                    \
+              traits.madd(A0, B1,  C1, T0);                                     \
+              traits.madd(A1, B1,  C5, B1);                                     \
+              traits.madd(A0, B2,  C2, T0);                                     \
+              traits.madd(A1, B2,  C6, B2);                                     \
+              traits.madd(A0, B3,  C3, T0);                                     \
+              traits.madd(A1, B3,  C7, B3);                                     \
+              EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4");          \
+            } while(false)
+            
+            internal::prefetch(blB+(48+0));
+            EIGEN_GEBGP_ONESTEP(0);
+            EIGEN_GEBGP_ONESTEP(1);
+            EIGEN_GEBGP_ONESTEP(2);
+            EIGEN_GEBGP_ONESTEP(3);
+            internal::prefetch(blB+(48+16));
+            EIGEN_GEBGP_ONESTEP(4);
+            EIGEN_GEBGP_ONESTEP(5);
+            EIGEN_GEBGP_ONESTEP(6);
+            EIGEN_GEBGP_ONESTEP(7);
+
+            blB += pk*4*RhsProgress;
+            blA += pk*(2*Traits::LhsProgress);
+
+            EIGEN_ASM_COMMENT("end gebp micro kernel 2pX4");
           }
-
-          blB += nr*4*RhsProgress;
-          blA += 4*LhsProgress;
-        }
-        // process remaining peeled loop
-        for(Index k=peeled_kc; k<depth; k++)
-        {
-          if(nr==2)
+          // process remaining peeled loop
+          for(Index k=peeled_kc; k<depth; k++)
           {
-            LhsPacket A0;
-            RhsPacket B_0, B1;
-
-            traits.loadLhs(&blA[0*LhsProgress], A0);
-            traits.loadRhs(&blB[0*RhsProgress], B_0);
-            traits.loadRhs(&blB[1*RhsProgress], B1);
-            traits.madd(A0,B_0,C0,B_0);
-            traits.madd(A0,B1,C1,B1);
+            RhsPacket B_0, B1, B2, B3, T0;
+            EIGEN_GEBGP_ONESTEP(0);
+            blB += 4*RhsProgress;
+            blA += 2*Traits::LhsProgress;
           }
-          else
-          {
-            LhsPacket A0;
-            RhsPacket B_0, B1, B2, B3;
+#undef EIGEN_GEBGP_ONESTEP
 
-            traits.loadLhs(&blA[0*LhsProgress], A0);
-            traits.loadRhs(&blB[0*RhsProgress], B_0);
-            traits.loadRhs(&blB[1*RhsProgress], B1);
-            traits.loadRhs(&blB[2*RhsProgress], B2);
-            traits.loadRhs(&blB[3*RhsProgress], B3);
+          ResPacket R0, R1, R2, R3;
+          ResPacket alphav = pset1<ResPacket>(alpha);
 
-            traits.madd(A0,B_0,C0,B_0);
-            traits.madd(A0,B1,C1,B1);
-            traits.madd(A0,B2,C2,B2);
-            traits.madd(A0,B3,C3,B3);
+          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
+          R2 = r1.loadPacket(0 * Traits::ResPacketSize);
+          R3 = r1.loadPacket(1 * Traits::ResPacketSize);
+          traits.acc(C0, alphav, R0);
+          traits.acc(C4, alphav, R1);
+          traits.acc(C1, alphav, R2);
+          traits.acc(C5, alphav, R3);
+          r0.storePacket(0 * Traits::ResPacketSize, R0);
+          r0.storePacket(1 * Traits::ResPacketSize, R1);
+          r1.storePacket(0 * Traits::ResPacketSize, R2);
+          r1.storePacket(1 * Traits::ResPacketSize, R3);
+
+          R0 = r2.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r2.loadPacket(1 * Traits::ResPacketSize);
+          R2 = r3.loadPacket(0 * Traits::ResPacketSize);
+          R3 = r3.loadPacket(1 * Traits::ResPacketSize);
+          traits.acc(C2,  alphav, R0);
+          traits.acc(C6,  alphav, R1);
+          traits.acc(C3,  alphav, R2);
+          traits.acc(C7,  alphav, R3);
+          r2.storePacket(0 * Traits::ResPacketSize, R0);
+          r2.storePacket(1 * Traits::ResPacketSize, R1);
+          r3.storePacket(0 * Traits::ResPacketSize, R2);
+          r3.storePacket(1 * Traits::ResPacketSize, R3);
           }
-
-          blB += nr*RhsProgress;
-          blA += LhsProgress;
         }
+      
+        // Deal with remaining columns of the rhs
+        for(Index j2=packet_cols4; j2<cols; j2++)
+        {
+          for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
+          {
+          // One column at a time
+          const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
+          prefetch(&blA[0]);
 
-        ResPacket R0, R1, R2, R3;
-        ResPacket alphav = pset1<ResPacket>(alpha);
-
-        ResScalar* r0 = &res[(j2+0)*resStride + i];
-        ResScalar* r1 = r0 + resStride;
-        ResScalar* r2 = r1 + resStride;
-        ResScalar* r3 = r2 + resStride;
+          // gets res block as register
+          AccPacket C0, C4;
+          traits.initAcc(C0);
+          traits.initAcc(C4);
 
-                  R0 = ploadu<ResPacket>(r0);
-                  R1 = ploadu<ResPacket>(r1);
-        if(nr==4) R2 = ploadu<ResPacket>(r2);
-        if(nr==4) R3 = ploadu<ResPacket>(r3);
+          LinearMapper r0 = res.getLinearMapper(i, j2);
+          r0.prefetch(prefetch_res_offset);
 
-                  traits.acc(C0, alphav, R0);
-                  traits.acc(C1, alphav, R1);
-        if(nr==4) traits.acc(C2, alphav, R2);
-        if(nr==4) traits.acc(C3, alphav, R3);
+          // performs "inner" products
+          const RhsScalar* blB = &blockB[j2*strideB+offsetB];
+          LhsPacket A0, A1;
 
-                  pstoreu(r0, R0);
-                  pstoreu(r1, R1);
-        if(nr==4) pstoreu(r2, R2);
-        if(nr==4) pstoreu(r3, R3);
-      }
-      for(Index i=peeled_mc2; i<rows; i++)
-      {
-        const LhsScalar* blA = &blockA[i*strideA+offsetA];
-        prefetch(&blA[0]);
-
-        // gets a 1 x nr res block as registers
-        ResScalar C0(0), C1(0), C2(0), C3(0);
-        // TODO directly use blockB ???
-        const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
-        for(Index k=0; k<depth; k++)
-        {
-          if(nr==2)
+          for(Index k=0; k<peeled_kc; k+=pk)
           {
-            LhsScalar A0;
-            RhsScalar B_0, B1;
-
-            A0 = blA[k];
-            B_0 = blB[0];
-            B1 = blB[1];
-            MADD(cj,A0,B_0,C0,B_0);
-            MADD(cj,A0,B1,C1,B1);
+            EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX1");
+            RhsPacket B_0, B1;
+        
+#define EIGEN_GEBGP_ONESTEP(K) \
+            do {                                                                  \
+              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1");          \
+              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+              traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0);                      \
+              traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1);                      \
+              traits.loadRhs(&blB[(0+K)*RhsProgress], B_0);                       \
+              traits.madd(A0, B_0, C0, B1);                                       \
+              traits.madd(A1, B_0, C4, B_0);                                      \
+              EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1");            \
+            } while(false)
+        
+            EIGEN_GEBGP_ONESTEP(0);
+            EIGEN_GEBGP_ONESTEP(1);
+            EIGEN_GEBGP_ONESTEP(2);
+            EIGEN_GEBGP_ONESTEP(3);
+            EIGEN_GEBGP_ONESTEP(4);
+            EIGEN_GEBGP_ONESTEP(5);
+            EIGEN_GEBGP_ONESTEP(6);
+            EIGEN_GEBGP_ONESTEP(7);
+
+            blB += pk*RhsProgress;
+            blA += pk*2*Traits::LhsProgress;
+
+            EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1");
           }
-          else
+
+          // process remaining peeled loop
+          for(Index k=peeled_kc; k<depth; k++)
           {
-            LhsScalar A0;
-            RhsScalar B_0, B1, B2, B3;
-
-            A0 = blA[k];
-            B_0 = blB[0];
-            B1 = blB[1];
-            B2 = blB[2];
-            B3 = blB[3];
-
-            MADD(cj,A0,B_0,C0,B_0);
-            MADD(cj,A0,B1,C1,B1);
-            MADD(cj,A0,B2,C2,B2);
-            MADD(cj,A0,B3,C3,B3);
+            RhsPacket B_0, B1;
+            EIGEN_GEBGP_ONESTEP(0);
+            blB += RhsProgress;
+            blA += 2*Traits::LhsProgress;
           }
+#undef EIGEN_GEBGP_ONESTEP
+          ResPacket R0, R1;
+          ResPacket alphav = pset1<ResPacket>(alpha);
 
-          blB += nr;
+          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
+          traits.acc(C0, alphav, R0);
+          traits.acc(C4, alphav, R1);
+          r0.storePacket(0 * Traits::ResPacketSize, R0);
+          r0.storePacket(1 * Traits::ResPacketSize, R1);
+          }
         }
-                  res[(j2+0)*resStride + i] += alpha*C0;
-                  res[(j2+1)*resStride + i] += alpha*C1;
-        if(nr==4) res[(j2+2)*resStride + i] += alpha*C2;
-        if(nr==4) res[(j2+3)*resStride + i] += alpha*C3;
       }
     }
-    // process remaining rhs/res columns one at a time
-    // => do the same but with nr==1
-    for(Index j2=packet_cols; j2<cols; j2++)
+    //---------- Process 1 * LhsProgress rows at once ----------
+    if(mr>=1*Traits::LhsProgress)
     {
-      // unpack B
-      traits.unpackRhs(depth, &blockB[j2*strideB+offsetB], unpackedB);
-
-      for(Index i=0; i<peeled_mc; i+=mr)
+      // loops on each largest micro horizontal panel of lhs (1*LhsProgress x depth)
+      for(Index i=peeled_mc2; i<peeled_mc1; i+=1*LhsProgress)
       {
-        const LhsScalar* blA = &blockA[i*strideA+offsetA*mr];
-        prefetch(&blA[0]);
+        // loops on each largest micro vertical panel of rhs (depth * nr)
+        for(Index j2=0; j2<packet_cols4; j2+=nr)
+        {
+          // We select a 1*Traits::LhsProgress x nr micro block of res which is entirely
+          // stored into 1 x nr registers.
+          
+          const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
+          prefetch(&blA[0]);
+
+          // gets res block as register
+          AccPacket C0, C1, C2, C3;
+          traits.initAcc(C0);
+          traits.initAcc(C1);
+          traits.initAcc(C2);
+          traits.initAcc(C3);
+
+          LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
+          LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
+          LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
+          LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
+
+          r0.prefetch(prefetch_res_offset);
+          r1.prefetch(prefetch_res_offset);
+          r2.prefetch(prefetch_res_offset);
+          r3.prefetch(prefetch_res_offset);
+
+          // performs "inner" products
+          const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
+          prefetch(&blB[0]);
+          LhsPacket A0;
+
+          for(Index k=0; k<peeled_kc; k+=pk)
+          {
+            EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX4");
+            RhsPacket B_0, B1, B2, B3;
+               
+#define EIGEN_GEBGP_ONESTEP(K) \
+            do {                                                                \
+              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX4");        \
+              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+              traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0);                    \
+              traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3);  \
+              traits.madd(A0, B_0, C0, B_0);                                    \
+              traits.madd(A0, B1,  C1, B1);                                     \
+              traits.madd(A0, B2,  C2, B2);                                     \
+              traits.madd(A0, B3,  C3, B3);                                     \
+              EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX4");          \
+            } while(false)
+            
+            internal::prefetch(blB+(48+0));
+            EIGEN_GEBGP_ONESTEP(0);
+            EIGEN_GEBGP_ONESTEP(1);
+            EIGEN_GEBGP_ONESTEP(2);
+            EIGEN_GEBGP_ONESTEP(3);
+            internal::prefetch(blB+(48+16));
+            EIGEN_GEBGP_ONESTEP(4);
+            EIGEN_GEBGP_ONESTEP(5);
+            EIGEN_GEBGP_ONESTEP(6);
+            EIGEN_GEBGP_ONESTEP(7);
+
+            blB += pk*4*RhsProgress;
+            blA += pk*1*LhsProgress;
+
+            EIGEN_ASM_COMMENT("end gebp micro kernel 1pX4");
+          }
+          // process remaining peeled loop
+          for(Index k=peeled_kc; k<depth; k++)
+          {
+            RhsPacket B_0, B1, B2, B3;
+            EIGEN_GEBGP_ONESTEP(0);
+            blB += 4*RhsProgress;
+            blA += 1*LhsProgress;
+          }
+#undef EIGEN_GEBGP_ONESTEP
 
-        // TODO move the res loads to the stores
+          ResPacket R0, R1;
+          ResPacket alphav = pset1<ResPacket>(alpha);
 
-        // get res block as registers
-        AccPacket C0, C4;
-        traits.initAcc(C0);
-        traits.initAcc(C4);
+          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r1.loadPacket(0 * Traits::ResPacketSize);
+          traits.acc(C0, alphav, R0);
+          traits.acc(C1,  alphav, R1);
+          r0.storePacket(0 * Traits::ResPacketSize, R0);
+          r1.storePacket(0 * Traits::ResPacketSize, R1);
+
+          R0 = r2.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r3.loadPacket(0 * Traits::ResPacketSize);
+          traits.acc(C2,  alphav, R0);
+          traits.acc(C3,  alphav, R1);
+          r2.storePacket(0 * Traits::ResPacketSize, R0);
+          r3.storePacket(0 * Traits::ResPacketSize, R1);
+        }
 
-        const RhsScalar* blB = unpackedB;
-        for(Index k=0; k<depth; k++)
+        // Deal with remaining columns of the rhs
+        for(Index j2=packet_cols4; j2<cols; j2++)
         {
-          LhsPacket A0, A1;
-          RhsPacket B_0;
-          RhsPacket T0;
-
-          traits.loadLhs(&blA[0*LhsProgress], A0);
-          traits.loadLhs(&blA[1*LhsProgress], A1);
-          traits.loadRhs(&blB[0*RhsProgress], B_0);
-          traits.madd(A0,B_0,C0,T0);
-          traits.madd(A1,B_0,C4,B_0);
+          // One column at a time
+          const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
+          prefetch(&blA[0]);
 
-          blB += RhsProgress;
-          blA += 2*LhsProgress;
-        }
-        ResPacket R0, R4;
-        ResPacket alphav = pset1<ResPacket>(alpha);
+          // gets res block as register
+          AccPacket C0;
+          traits.initAcc(C0);
 
-        ResScalar* r0 = &res[(j2+0)*resStride + i];
+          LinearMapper r0 = res.getLinearMapper(i, j2);
 
-        R0 = ploadu<ResPacket>(r0);
-        R4 = ploadu<ResPacket>(r0+ResPacketSize);
+          // performs "inner" products
+          const RhsScalar* blB = &blockB[j2*strideB+offsetB];
+          LhsPacket A0;
 
-        traits.acc(C0, alphav, R0);
-        traits.acc(C4, alphav, R4);
+          for(Index k=0; k<peeled_kc; k+=pk)
+          {
+            EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX1");
+            RhsPacket B_0;
+        
+#define EIGEN_GEBGP_ONESTEP(K) \
+            do {                                                                \
+              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX1");        \
+              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+              traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0);                    \
+              traits.loadRhs(&blB[(0+K)*RhsProgress], B_0);                     \
+              traits.madd(A0, B_0, C0, B_0);                                    \
+              EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX1");          \
+            } while(false);
+
+            EIGEN_GEBGP_ONESTEP(0);
+            EIGEN_GEBGP_ONESTEP(1);
+            EIGEN_GEBGP_ONESTEP(2);
+            EIGEN_GEBGP_ONESTEP(3);
+            EIGEN_GEBGP_ONESTEP(4);
+            EIGEN_GEBGP_ONESTEP(5);
+            EIGEN_GEBGP_ONESTEP(6);
+            EIGEN_GEBGP_ONESTEP(7);
+
+            blB += pk*RhsProgress;
+            blA += pk*1*Traits::LhsProgress;
+
+            EIGEN_ASM_COMMENT("end gebp micro kernel 1pX1");
+          }
 
-        pstoreu(r0,               R0);
-        pstoreu(r0+ResPacketSize, R4);
+          // process remaining peeled loop
+          for(Index k=peeled_kc; k<depth; k++)
+          {
+            RhsPacket B_0;
+            EIGEN_GEBGP_ONESTEP(0);
+            blB += RhsProgress;
+            blA += 1*Traits::LhsProgress;
+          }
+#undef EIGEN_GEBGP_ONESTEP
+          ResPacket R0;
+          ResPacket alphav = pset1<ResPacket>(alpha);
+          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
+          traits.acc(C0, alphav, R0);
+          r0.storePacket(0 * Traits::ResPacketSize, R0);
+        }
       }
-      if(rows-peeled_mc>=LhsProgress)
+    }
+    //---------- Process remaining rows, 1 at once ----------
+    if(peeled_mc1<rows)
+    {
+      // loop on each panel of the rhs
+      for(Index j2=0; j2<packet_cols4; j2+=nr)
       {
-        Index i = peeled_mc;
-        const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress];
-        prefetch(&blA[0]);
-
-        AccPacket C0;
-        traits.initAcc(C0);
-
-        const RhsScalar* blB = unpackedB;
-        for(Index k=0; k<depth; k++)
+        // loop on each row of the lhs (1*LhsProgress x depth)
+        for(Index i=peeled_mc1; i<rows; i+=1)
         {
-          LhsPacket A0;
-          RhsPacket B_0;
-          traits.loadLhs(blA, A0);
-          traits.loadRhs(blB, B_0);
-          traits.madd(A0, B_0, C0, B_0);
-          blB += RhsProgress;
-          blA += LhsProgress;
+          const LhsScalar* blA = &blockA[i*strideA+offsetA];
+          prefetch(&blA[0]);
+          const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
+
+          // The following piece of code wont work for 512 bit registers
+          // Moreover, if LhsProgress==8 it assumes that there is a half packet of the same size
+          // as nr (which is currently 4) for the return type.
+          typedef typename unpacket_traits<SResPacket>::half SResPacketHalf;
+          if ((SwappedTraits::LhsProgress % 4) == 0 &&
+              (SwappedTraits::LhsProgress <= 8) &&
+              (SwappedTraits::LhsProgress!=8 || unpacket_traits<SResPacketHalf>::size==nr))
+          {
+            SAccPacket C0, C1, C2, C3;
+            straits.initAcc(C0);
+            straits.initAcc(C1);
+            straits.initAcc(C2);
+            straits.initAcc(C3);
+
+            const Index spk   = (std::max)(1,SwappedTraits::LhsProgress/4);
+            const Index endk  = (depth/spk)*spk;
+            const Index endk4 = (depth/(spk*4))*(spk*4);
+
+            Index k=0;
+            for(; k<endk4; k+=4*spk)
+            {
+              SLhsPacket A0,A1;
+              SRhsPacket B_0,B_1;
+
+              straits.loadLhsUnaligned(blB+0*SwappedTraits::LhsProgress, A0);
+              straits.loadLhsUnaligned(blB+1*SwappedTraits::LhsProgress, A1);
+
+              straits.loadRhsQuad(blA+0*spk, B_0);
+              straits.loadRhsQuad(blA+1*spk, B_1);
+              straits.madd(A0,B_0,C0,B_0);
+              straits.madd(A1,B_1,C1,B_1);
+
+              straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0);
+              straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1);
+              straits.loadRhsQuad(blA+2*spk, B_0);
+              straits.loadRhsQuad(blA+3*spk, B_1);
+              straits.madd(A0,B_0,C2,B_0);
+              straits.madd(A1,B_1,C3,B_1);
+
+              blB += 4*SwappedTraits::LhsProgress;
+              blA += 4*spk;
+            }
+            C0 = padd(padd(C0,C1),padd(C2,C3));
+            for(; k<endk; k+=spk)
+            {
+              SLhsPacket A0;
+              SRhsPacket B_0;
+
+              straits.loadLhsUnaligned(blB, A0);
+              straits.loadRhsQuad(blA, B_0);
+              straits.madd(A0,B_0,C0,B_0);
+
+              blB += SwappedTraits::LhsProgress;
+              blA += spk;
+            }
+            if(SwappedTraits::LhsProgress==8)
+            {
+              // Special case where we have to first reduce the accumulation register C0
+              typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SResPacket>::half,SResPacket>::type SResPacketHalf;
+              typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf;
+              typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SLhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
+              typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf;
+
+              SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
+              SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);
+
+              if(depth-endk>0)
+              {
+                // We have to handle the last row of the rhs which corresponds to a half-packet
+                SLhsPacketHalf a0;
+                SRhsPacketHalf b0;
+                straits.loadLhsUnaligned(blB, a0);
+                straits.loadRhs(blA, b0);
+                SAccPacketHalf c0 = predux_downto4(C0);
+                straits.madd(a0,b0,c0,b0);
+                straits.acc(c0, alphav, R);
+              }
+              else
+              {
+                straits.acc(predux_downto4(C0), alphav, R);
+              }
+              res.scatterPacket(i, j2, R);
+            }
+            else
+            {
+              SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
+              SResPacket alphav = pset1<SResPacket>(alpha);
+              straits.acc(C0, alphav, R);
+              res.scatterPacket(i, j2, R);
+            }
+          }
+          else // scalar path
+          {
+            // get a 1 x 4 res block as registers
+            ResScalar C0(0), C1(0), C2(0), C3(0);
+
+            for(Index k=0; k<depth; k++)
+            {
+              LhsScalar A0;
+              RhsScalar B_0, B_1;
+
+              A0 = blA[k];
+
+              B_0 = blB[0];
+              B_1 = blB[1];
+              CJMADD(cj,A0,B_0,C0,  B_0);
+              CJMADD(cj,A0,B_1,C1,  B_1);
+              
+              B_0 = blB[2];
+              B_1 = blB[3];
+              CJMADD(cj,A0,B_0,C2,  B_0);
+              CJMADD(cj,A0,B_1,C3,  B_1);
+              
+              blB += 4;
+            }
+            res(i, j2 + 0) += alpha * C0;
+            res(i, j2 + 1) += alpha * C1;
+            res(i, j2 + 2) += alpha * C2;
+            res(i, j2 + 3) += alpha * C3;
+          }
         }
-
-        ResPacket alphav = pset1<ResPacket>(alpha);
-        ResPacket R0 = ploadu<ResPacket>(&res[(j2+0)*resStride + i]);
-        traits.acc(C0, alphav, R0);
-        pstoreu(&res[(j2+0)*resStride + i], R0);
       }
-      for(Index i=peeled_mc2; i<rows; i++)
+      // remaining columns
+      for(Index j2=packet_cols4; j2<cols; j2++)
       {
-        const LhsScalar* blA = &blockA[i*strideA+offsetA];
-        prefetch(&blA[0]);
-
-        // gets a 1 x 1 res block as registers
-        ResScalar C0(0);
-        // FIXME directly use blockB ??
-        const RhsScalar* blB = &blockB[j2*strideB+offsetB];
-        for(Index k=0; k<depth; k++)
+        // loop on each row of the lhs (1*LhsProgress x depth)
+        for(Index i=peeled_mc1; i<rows; i+=1)
         {
-          LhsScalar A0 = blA[k];
-          RhsScalar B_0 = blB[k];
-          MADD(cj, A0, B_0, C0, B_0);
+          const LhsScalar* blA = &blockA[i*strideA+offsetA];
+          prefetch(&blA[0]);
+          // gets a 1 x 1 res block as registers
+          ResScalar C0(0);
+          const RhsScalar* blB = &blockB[j2*strideB+offsetB];
+          for(Index k=0; k<depth; k++)
+          {
+            LhsScalar A0 = blA[k];
+            RhsScalar B_0 = blB[k];
+            CJMADD(cj, A0, B_0, C0, B_0);
+          }
+          res(i, j2) += alpha * C0;
         }
-        res[(j2+0)*resStride + i] += alpha*C0;
       }
     }
   }
@@ -1114,81 +1685,193 @@ EIGEN_ASM_COMMENT("mybegin4");
 //
 //  32 33 34 35 ...
 //  36 36 38 39 ...
-template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
 {
-  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride=0, Index offset=0);
+  typedef typename DataMapper::LinearMapper LinearMapper;
+  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
 };
 
-template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate, bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, StorageOrder, Conjugate, PanelMode>
-  ::operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride, Index offset)
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
+  ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
 {
   typedef typename packet_traits<Scalar>::type Packet;
   enum { PacketSize = packet_traits<Scalar>::size };
 
   EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
-  EIGEN_UNUSED_VARIABLE(stride)
-  EIGEN_UNUSED_VARIABLE(offset)
+  EIGEN_UNUSED_VARIABLE(stride);
+  EIGEN_UNUSED_VARIABLE(offset);
   eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
-  eigen_assert( (StorageOrder==RowMajor) || ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) );
+  eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) );
   conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
-  const_blas_data_mapper<Scalar, Index, StorageOrder> lhs(_lhs,lhsStride);
   Index count = 0;
-  Index peeled_mc = (rows/Pack1)*Pack1;
-  for(Index i=0; i<peeled_mc; i+=Pack1)
+
+  const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
+  const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
+  const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
+  const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1
+                         : Pack2>1             ? (rows/Pack2)*Pack2 : 0;
+
+  Index i=0;
+
+  // Pack 3 packets
+  if(Pack1>=3*PacketSize)
   {
-    if(PanelMode) count += Pack1 * offset;
+    for(; i<peeled_mc3; i+=3*PacketSize)
+    {
+      if(PanelMode) count += (3*PacketSize) * offset;
 
-    if(StorageOrder==ColMajor)
+      for(Index k=0; k<depth; k++)
+      {
+        Packet A, B, C;
+        A = lhs.loadPacket(i+0*PacketSize, k);
+        B = lhs.loadPacket(i+1*PacketSize, k);
+        C = lhs.loadPacket(i+2*PacketSize, k);
+        pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
+        pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
+        pstore(blockA+count, cj.pconj(C)); count+=PacketSize;
+      }
+      if(PanelMode) count += (3*PacketSize) * (stride-offset-depth);
+    }
+  }
+  // Pack 2 packets
+  if(Pack1>=2*PacketSize)
+  {
+    for(; i<peeled_mc2; i+=2*PacketSize)
     {
+      if(PanelMode) count += (2*PacketSize) * offset;
+
       for(Index k=0; k<depth; k++)
       {
-        Packet A, B, C, D;
-        if(Pack1>=1*PacketSize) A = ploadu<Packet>(&lhs(i+0*PacketSize, k));
-        if(Pack1>=2*PacketSize) B = ploadu<Packet>(&lhs(i+1*PacketSize, k));
-        if(Pack1>=3*PacketSize) C = ploadu<Packet>(&lhs(i+2*PacketSize, k));
-        if(Pack1>=4*PacketSize) D = ploadu<Packet>(&lhs(i+3*PacketSize, k));
-        if(Pack1>=1*PacketSize) { pstore(blockA+count, cj.pconj(A)); count+=PacketSize; }
-        if(Pack1>=2*PacketSize) { pstore(blockA+count, cj.pconj(B)); count+=PacketSize; }
-        if(Pack1>=3*PacketSize) { pstore(blockA+count, cj.pconj(C)); count+=PacketSize; }
-        if(Pack1>=4*PacketSize) { pstore(blockA+count, cj.pconj(D)); count+=PacketSize; }
+        Packet A, B;
+        A = lhs.loadPacket(i+0*PacketSize, k);
+        B = lhs.loadPacket(i+1*PacketSize, k);
+        pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
+        pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
       }
+      if(PanelMode) count += (2*PacketSize) * (stride-offset-depth);
     }
-    else
+  }
+  // Pack 1 packets
+  if(Pack1>=1*PacketSize)
+  {
+    for(; i<peeled_mc1; i+=1*PacketSize)
     {
+      if(PanelMode) count += (1*PacketSize) * offset;
+
+      for(Index k=0; k<depth; k++)
+      {
+        Packet A;
+        A = lhs.loadPacket(i+0*PacketSize, k);
+        pstore(blockA+count, cj.pconj(A));
+        count+=PacketSize;
+      }
+      if(PanelMode) count += (1*PacketSize) * (stride-offset-depth);
+    }
+  }
+  // Pack scalars
+  if(Pack2<PacketSize && Pack2>1)
+  {
+    for(; i<peeled_mc0; i+=Pack2)
+    {
+      if(PanelMode) count += Pack2 * offset;
+
       for(Index k=0; k<depth; k++)
+        for(Index w=0; w<Pack2; w++)
+          blockA[count++] = cj(lhs(i+w, k));
+
+      if(PanelMode) count += Pack2 * (stride-offset-depth);
+    }
+  }
+  for(; i<rows; i++)
+  {
+    if(PanelMode) count += offset;
+    for(Index k=0; k<depth; k++)
+      blockA[count++] = cj(lhs(i, k));
+    if(PanelMode) count += (stride-offset-depth);
+  }
+}
+
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
+{
+  typedef typename DataMapper::LinearMapper LinearMapper;
+  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+};
+
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
+  ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
+{
+  typedef typename packet_traits<Scalar>::type Packet;
+  enum { PacketSize = packet_traits<Scalar>::size };
+
+  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
+  EIGEN_UNUSED_VARIABLE(stride);
+  EIGEN_UNUSED_VARIABLE(offset);
+  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
+  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
+  Index count = 0;
+
+//   const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
+//   const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
+//   const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
+
+  int pack = Pack1;
+  Index i = 0;
+  while(pack>0)
+  {
+    Index remaining_rows = rows-i;
+    Index peeled_mc = i+(remaining_rows/pack)*pack;
+    for(; i<peeled_mc; i+=pack)
+    {
+      if(PanelMode) count += pack * offset;
+
+      const Index peeled_k = (depth/PacketSize)*PacketSize;
+      Index k=0;
+      if(pack>=PacketSize)
+      {
+        for(; k<peeled_k; k+=PacketSize)
+        {
+          for (Index m = 0; m < pack; m += PacketSize)
+          {
+            PacketBlock<Packet> kernel;
+            for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.loadPacket(i+p+m, k);
+            ptranspose(kernel);
+            for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
+          }
+          count += PacketSize*pack;
+        }
+      }
+      for(; k<depth; k++)
       {
-        // TODO add a vectorized transpose here
         Index w=0;
-        for(; w<Pack1-3; w+=4)
+        for(; w<pack-3; w+=4)
         {
           Scalar a(cj(lhs(i+w+0, k))),
-                  b(cj(lhs(i+w+1, k))),
-                  c(cj(lhs(i+w+2, k))),
-                  d(cj(lhs(i+w+3, k)));
+                 b(cj(lhs(i+w+1, k))),
+                 c(cj(lhs(i+w+2, k))),
+                 d(cj(lhs(i+w+3, k)));
           blockA[count++] = a;
           blockA[count++] = b;
           blockA[count++] = c;
           blockA[count++] = d;
         }
-        if(Pack1%4)
-          for(;w<Pack1;++w)
+        if(pack%4)
+          for(;w<pack;++w)
             blockA[count++] = cj(lhs(i+w, k));
       }
+
+      if(PanelMode) count += pack * (stride-offset-depth);
     }
-    if(PanelMode) count += Pack1 * (stride-offset-depth);
-  }
-  if(rows-peeled_mc>=Pack2)
-  {
-    if(PanelMode) count += Pack2*offset;
-    for(Index k=0; k<depth; k++)
-      for(Index w=0; w<Pack2; w++)
-        blockA[count++] = cj(lhs(peeled_mc+w, k));
-    if(PanelMode) count += Pack2 * (stride-offset-depth);
-    peeled_mc += Pack2;
+
+    pack -= PacketSize;
+    if(pack<Pack2 && (pack+PacketSize)!=Pack2)
+      pack = Pack2;
   }
-  for(Index i=peeled_mc; i<rows; i++)
+
+  for(; i<rows; i++)
   {
     if(PanelMode) count += offset;
     for(Index k=0; k<depth; k++)
@@ -1204,53 +1887,123 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, StorageOrder,
 //  4  5  6  7   16 17 18 19   25 28
 //  8  9 10 11   20 21 22 23   26 29
 //  .  .  .  .    .  .  .  .    .  .
-template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
-struct gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode>
+template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
 {
   typedef typename packet_traits<Scalar>::type Packet;
+  typedef typename DataMapper::LinearMapper LinearMapper;
   enum { PacketSize = packet_traits<Scalar>::size };
-  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0);
+  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
 };
 
-template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode>
-  ::operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset)
+template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
+  ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
 {
   EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
-  EIGEN_UNUSED_VARIABLE(stride)
-  EIGEN_UNUSED_VARIABLE(offset)
+  EIGEN_UNUSED_VARIABLE(stride);
+  EIGEN_UNUSED_VARIABLE(offset);
   eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
   conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
-  Index packet_cols = (cols/nr) * nr;
+  Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
+  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
   Index count = 0;
-  for(Index j2=0; j2<packet_cols; j2+=nr)
+  const Index peeled_k = (depth/PacketSize)*PacketSize;
+//   if(nr>=8)
+//   {
+//     for(Index j2=0; j2<packet_cols8; j2+=8)
+//     {
+//       // skip what we have before
+//       if(PanelMode) count += 8 * offset;
+//       const Scalar* b0 = &rhs[(j2+0)*rhsStride];
+//       const Scalar* b1 = &rhs[(j2+1)*rhsStride];
+//       const Scalar* b2 = &rhs[(j2+2)*rhsStride];
+//       const Scalar* b3 = &rhs[(j2+3)*rhsStride];
+//       const Scalar* b4 = &rhs[(j2+4)*rhsStride];
+//       const Scalar* b5 = &rhs[(j2+5)*rhsStride];
+//       const Scalar* b6 = &rhs[(j2+6)*rhsStride];
+//       const Scalar* b7 = &rhs[(j2+7)*rhsStride];
+//       Index k=0;
+//       if(PacketSize==8) // TODO enbale vectorized transposition for PacketSize==4
+//       {
+//         for(; k<peeled_k; k+=PacketSize) {
+//           PacketBlock<Packet> kernel;
+//           for (int p = 0; p < PacketSize; ++p) {
+//             kernel.packet[p] = ploadu<Packet>(&rhs[(j2+p)*rhsStride+k]);
+//           }
+//           ptranspose(kernel);
+//           for (int p = 0; p < PacketSize; ++p) {
+//             pstoreu(blockB+count, cj.pconj(kernel.packet[p]));
+//             count+=PacketSize;
+//           }
+//         }
+//       }
+//       for(; k<depth; k++)
+//       {
+//         blockB[count+0] = cj(b0[k]);
+//         blockB[count+1] = cj(b1[k]);
+//         blockB[count+2] = cj(b2[k]);
+//         blockB[count+3] = cj(b3[k]);
+//         blockB[count+4] = cj(b4[k]);
+//         blockB[count+5] = cj(b5[k]);
+//         blockB[count+6] = cj(b6[k]);
+//         blockB[count+7] = cj(b7[k]);
+//         count += 8;
+//       }
+//       // skip what we have after
+//       if(PanelMode) count += 8 * (stride-offset-depth);
+//     }
+//   }
+
+  if(nr>=4)
   {
-    // skip what we have before
-    if(PanelMode) count += nr * offset;
-    const Scalar* b0 = &rhs[(j2+0)*rhsStride];
-    const Scalar* b1 = &rhs[(j2+1)*rhsStride];
-    const Scalar* b2 = &rhs[(j2+2)*rhsStride];
-    const Scalar* b3 = &rhs[(j2+3)*rhsStride];
-    for(Index k=0; k<depth; k++)
+    for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
     {
-                blockB[count+0] = cj(b0[k]);
-                blockB[count+1] = cj(b1[k]);
-      if(nr==4) blockB[count+2] = cj(b2[k]);
-      if(nr==4) blockB[count+3] = cj(b3[k]);
-      count += nr;
+      // skip what we have before
+      if(PanelMode) count += 4 * offset;
+      const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
+      const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
+      const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
+      const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
+
+      Index k=0;
+      if((PacketSize%4)==0) // TODO enable vectorized transposition for PacketSize==2 ??
+      {
+        for(; k<peeled_k; k+=PacketSize) {
+          PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel;
+          kernel.packet[0] = dm0.loadPacket(k);
+          kernel.packet[1%PacketSize] = dm1.loadPacket(k);
+          kernel.packet[2%PacketSize] = dm2.loadPacket(k);
+          kernel.packet[3%PacketSize] = dm3.loadPacket(k);
+          ptranspose(kernel);
+          pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
+          pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize]));
+          pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel.packet[2%PacketSize]));
+          pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel.packet[3%PacketSize]));
+          count+=4*PacketSize;
+        }
+      }
+      for(; k<depth; k++)
+      {
+        blockB[count+0] = cj(dm0(k));
+        blockB[count+1] = cj(dm1(k));
+        blockB[count+2] = cj(dm2(k));
+        blockB[count+3] = cj(dm3(k));
+        count += 4;
+      }
+      // skip what we have after
+      if(PanelMode) count += 4 * (stride-offset-depth);
     }
-    // skip what we have after
-    if(PanelMode) count += nr * (stride-offset-depth);
   }
 
   // copy the remaining columns one at a time (nr==1)
-  for(Index j2=packet_cols; j2<cols; ++j2)
+  for(Index j2=packet_cols4; j2<cols; ++j2)
   {
     if(PanelMode) count += offset;
-    const Scalar* b0 = &rhs[(j2+0)*rhsStride];
+    const LinearMapper dm0 = rhs.getLinearMapper(0, j2);
     for(Index k=0; k<depth; k++)
     {
-      blockB[count] = cj(b0[k]);
+      blockB[count] = cj(dm0(k));
       count += 1;
     }
     if(PanelMode) count += (stride-offset-depth);
@@ -1258,48 +2011,93 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, Pan
 }
 
 // this version is optimized for row major matrices
-template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
-struct gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode>
+template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
 {
+  typedef typename packet_traits<Scalar>::type Packet;
+  typedef typename DataMapper::LinearMapper LinearMapper;
   enum { PacketSize = packet_traits<Scalar>::size };
-  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0);
+  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
 };
 
-template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode>
-  ::operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset)
+template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
+  ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
 {
   EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
-  EIGEN_UNUSED_VARIABLE(stride)
-  EIGEN_UNUSED_VARIABLE(offset)
+  EIGEN_UNUSED_VARIABLE(stride);
+  EIGEN_UNUSED_VARIABLE(offset);
   eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
   conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
-  Index packet_cols = (cols/nr) * nr;
+  Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
+  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
   Index count = 0;
-  for(Index j2=0; j2<packet_cols; j2+=nr)
+
+//   if(nr>=8)
+//   {
+//     for(Index j2=0; j2<packet_cols8; j2+=8)
+//     {
+//       // skip what we have before
+//       if(PanelMode) count += 8 * offset;
+//       for(Index k=0; k<depth; k++)
+//       {
+//         if (PacketSize==8) {
+//           Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
+//           pstoreu(blockB+count, cj.pconj(A));
+//         } else if (PacketSize==4) {
+//           Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
+//           Packet B = ploadu<Packet>(&rhs[k*rhsStride + j2 + PacketSize]);
+//           pstoreu(blockB+count, cj.pconj(A));
+//           pstoreu(blockB+count+PacketSize, cj.pconj(B));
+//         } else {
+//           const Scalar* b0 = &rhs[k*rhsStride + j2];
+//           blockB[count+0] = cj(b0[0]);
+//           blockB[count+1] = cj(b0[1]);
+//           blockB[count+2] = cj(b0[2]);
+//           blockB[count+3] = cj(b0[3]);
+//           blockB[count+4] = cj(b0[4]);
+//           blockB[count+5] = cj(b0[5]);
+//           blockB[count+6] = cj(b0[6]);
+//           blockB[count+7] = cj(b0[7]);
+//         }
+//         count += 8;
+//       }
+//       // skip what we have after
+//       if(PanelMode) count += 8 * (stride-offset-depth);
+//     }
+//   }
+  if(nr>=4)
   {
-    // skip what we have before
-    if(PanelMode) count += nr * offset;
-    for(Index k=0; k<depth; k++)
+    for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
     {
-      const Scalar* b0 = &rhs[k*rhsStride + j2];
-                blockB[count+0] = cj(b0[0]);
-                blockB[count+1] = cj(b0[1]);
-      if(nr==4) blockB[count+2] = cj(b0[2]);
-      if(nr==4) blockB[count+3] = cj(b0[3]);
-      count += nr;
+      // skip what we have before
+      if(PanelMode) count += 4 * offset;
+      for(Index k=0; k<depth; k++)
+      {
+        if (PacketSize==4) {
+          Packet A = rhs.loadPacket(k, j2);
+          pstoreu(blockB+count, cj.pconj(A));
+          count += PacketSize;
+        } else {
+          const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
+          blockB[count+0] = cj(dm0(0));
+          blockB[count+1] = cj(dm0(1));
+          blockB[count+2] = cj(dm0(2));
+          blockB[count+3] = cj(dm0(3));
+          count += 4;
+        }
+      }
+      // skip what we have after
+      if(PanelMode) count += 4 * (stride-offset-depth);
     }
-    // skip what we have after
-    if(PanelMode) count += nr * (stride-offset-depth);
   }
   // copy the remaining columns one at a time (nr==1)
-  for(Index j2=packet_cols; j2<cols; ++j2)
+  for(Index j2=packet_cols4; j2<cols; ++j2)
   {
     if(PanelMode) count += offset;
-    const Scalar* b0 = &rhs[j2];
     for(Index k=0; k<depth; k++)
     {
-      blockB[count] = cj(b0[k*rhsStride]);
+      blockB[count] = cj(rhs(k, j2));
       count += 1;
     }
     if(PanelMode) count += stride-offset-depth;
@@ -1312,8 +2110,8 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, Pan
   * \sa setCpuCacheSize */
 inline std::ptrdiff_t l1CacheSize()
 {
-  std::ptrdiff_t l1, l2;
-  internal::manage_caching_sizes(GetAction, &l1, &l2);
+  std::ptrdiff_t l1, l2, l3;
+  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
   return l1;
 }
 
@@ -1321,19 +2119,29 @@ inline std::ptrdiff_t l1CacheSize()
   * \sa setCpuCacheSize */
 inline std::ptrdiff_t l2CacheSize()
 {
-  std::ptrdiff_t l1, l2;
-  internal::manage_caching_sizes(GetAction, &l1, &l2);
+  std::ptrdiff_t l1, l2, l3;
+  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
   return l2;
 }
 
+/** \returns the currently set level 3 cpu cache size (in bytes) used to estimate the ideal blocking size paramete\
+rs.                                                                                                                
+* \sa setCpuCacheSize */
+inline std::ptrdiff_t l3CacheSize()
+{
+  std::ptrdiff_t l1, l2, l3;
+  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
+  return l3;
+}
+
 /** Set the cpu L1 and L2 cache sizes (in bytes).
   * These values are use to adjust the size of the blocks
   * for the algorithms working per blocks.
   *
   * \sa computeProductBlockingSizes */
-inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2)
+inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2, std::ptrdiff_t l3)
 {
-  internal::manage_caching_sizes(SetAction, &l1, &l2);
+  internal::manage_caching_sizes(SetAction, &l1, &l2, &l3);
 }
 
 } // end namespace Eigen
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
index 3f5ffcf51..6440e1d09 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_GENERAL_MATRIX_MATRIX_H
 #define EIGEN_GENERAL_MATRIX_MATRIX_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
 
@@ -23,7 +23,9 @@ template<
   typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs>
 struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor>
 {
-  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  typedef gebp_traits<RhsScalar,LhsScalar> Traits;
+
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
   static EIGEN_STRONG_INLINE void run(
     Index rows, Index cols, Index depth,
     const LhsScalar* lhs, Index lhsStride,
@@ -51,42 +53,44 @@ template<
 struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor>
 {
 
-typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+
+typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
 static void run(Index rows, Index cols, Index depth,
   const LhsScalar* _lhs, Index lhsStride,
   const RhsScalar* _rhs, Index rhsStride,
-  ResScalar* res, Index resStride,
+  ResScalar* _res, Index resStride,
   ResScalar alpha,
   level3_blocking<LhsScalar,RhsScalar>& blocking,
   GemmParallelInfo<Index>* info = 0)
 {
-  const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
-  const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
-
-  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+  typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper;
+  typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper;
+  typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
+  LhsMapper lhs(_lhs,lhsStride);
+  RhsMapper rhs(_rhs,rhsStride);
+  ResMapper res(_res, resStride);
 
   Index kc = blocking.kc();                   // cache block size along the K direction
   Index mc = (std::min)(rows,blocking.mc());  // cache block size along the M direction
-  //Index nc = blocking.nc(); // cache block size along the N direction
+  Index nc = (std::min)(cols,blocking.nc());  // cache block size along the N direction
 
-  gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
-  gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
-  gebp_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
+  gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+  gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
+  gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
 
 #ifdef EIGEN_HAS_OPENMP
   if(info)
   {
     // this is the parallel version!
-    Index tid = omp_get_thread_num();
-    Index threads = omp_get_num_threads();
-    
-    std::size_t sizeA = kc*mc;
-    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
-    ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, 0);
-    ei_declare_aligned_stack_constructed_variable(RhsScalar, w, sizeW, 0);
-    
-    RhsScalar* blockB = blocking.blockB();
-    eigen_internal_assert(blockB!=0);
+    int tid = omp_get_thread_num();
+    int threads = omp_get_num_threads();
+
+    LhsScalar* blockA = blocking.blockA();
+    eigen_internal_assert(blockA!=0);
+
+    std::size_t sizeB = kc*nc;
+    ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, 0);
 
     // For each horizontal panel of the rhs, and corresponding vertical panel of the lhs...
     for(Index k=0; k<depth; k+=kc)
@@ -94,54 +98,56 @@ static void run(Index rows, Index cols, Index depth,
       const Index actual_kc = (std::min)(k+kc,depth)-k; // => rows of B', and cols of the A'
 
       // In order to reduce the chance that a thread has to wait for the other,
-      // let's start by packing A'.
-      pack_lhs(blockA, &lhs(0,k), lhsStride, actual_kc, mc);
+      // let's start by packing B'.
+      pack_rhs(blockB, rhs.getSubMapper(k,0), actual_kc, nc);
 
-      // Pack B_k to B' in a parallel fashion:
-      // each thread packs the sub block B_k,j to B'_j where j is the thread id.
+      // Pack A_k to A' in a parallel fashion:
+      // each thread packs the sub block A_k,i to A'_i where i is the thread id.
 
-      // However, before copying to B'_j, we have to make sure that no other thread is still using it,
+      // However, before copying to A'_i, we have to make sure that no other thread is still using it,
       // i.e., we test that info[tid].users equals 0.
       // Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it.
       while(info[tid].users!=0) {}
       info[tid].users += threads;
 
-      pack_rhs(blockB+info[tid].rhs_start*actual_kc, &rhs(k,info[tid].rhs_start), rhsStride, actual_kc, info[tid].rhs_length);
+      pack_lhs(blockA+info[tid].lhs_start*actual_kc, lhs.getSubMapper(info[tid].lhs_start,k), actual_kc, info[tid].lhs_length);
 
-      // Notify the other threads that the part B'_j is ready to go.
+      // Notify the other threads that the part A'_i is ready to go.
       info[tid].sync = k;
 
-      // Computes C_i += A' * B' per B'_j
-      for(Index shift=0; shift<threads; ++shift)
+      // Computes C_i += A' * B' per A'_i
+      for(int shift=0; shift<threads; ++shift)
       {
-        Index j = (tid+shift)%threads;
+        int i = (tid+shift)%threads;
 
-        // At this point we have to make sure that B'_j has been updated by the thread j,
+        // At this point we have to make sure that A'_i has been updated by the thread i,
         // we use testAndSetOrdered to mimic a volatile access.
         // However, no need to wait for the B' part which has been updated by the current thread!
-        if(shift>0)
-          while(info[j].sync!=k) {}
+        if (shift>0) {
+          while(info[i].sync!=k) {
+          }
+        }
 
-        gebp(res+info[j].rhs_start*resStride, resStride, blockA, blockB+info[j].rhs_start*actual_kc, mc, actual_kc, info[j].rhs_length, alpha, -1,-1,0,0, w);
+        gebp(res.getSubMapper(info[i].lhs_start, 0), blockA+info[i].lhs_start*actual_kc, blockB, info[i].lhs_length, actual_kc, nc, alpha);
       }
 
-      // Then keep going as usual with the remaining A'
-      for(Index i=mc; i<rows; i+=mc)
+      // Then keep going as usual with the remaining B'
+      for(Index j=nc; j<cols; j+=nc)
       {
-        const Index actual_mc = (std::min)(i+mc,rows)-i;
+        const Index actual_nc = (std::min)(j+nc,cols)-j;
 
-        // pack A_i,k to A'
-        pack_lhs(blockA, &lhs(i,k), lhsStride, actual_kc, actual_mc);
+        // pack B_k,j to B'
+        pack_rhs(blockB, rhs.getSubMapper(k,j), actual_kc, actual_nc);
 
-        // C_i += A' * B'
-        gebp(res+i, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1,-1,0,0, w);
+        // C_j += A' * B'
+        gebp(res.getSubMapper(0, j), blockA, blockB, rows, actual_kc, actual_nc, alpha);
       }
 
-      // Release all the sub blocks B'_j of B' for the current thread,
+      // Release all the sub blocks A'_i of A' for the current thread,
       // i.e., we simply decrement the number of users by 1
-      for(Index j=0; j<threads; ++j)
+      for(Index i=0; i<threads; ++i)
         #pragma omp atomic
-        --(info[j].users);
+        info[i].users -= 1;
     }
   }
   else
@@ -151,38 +157,42 @@ static void run(Index rows, Index cols, Index depth,
 
     // this is the sequential version!
     std::size_t sizeA = kc*mc;
-    std::size_t sizeB = kc*cols;
-    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
+    std::size_t sizeB = kc*nc;
 
     ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA());
     ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());
-    ei_declare_aligned_stack_constructed_variable(RhsScalar, blockW, sizeW, blocking.blockW());
+
+    const bool pack_rhs_once = mc!=rows && kc==depth && nc==cols;
 
     // For each horizontal panel of the rhs, and corresponding panel of the lhs...
-    // (==GEMM_VAR1)
-    for(Index k2=0; k2<depth; k2+=kc)
+    for(Index i2=0; i2<rows; i2+=mc)
     {
-      const Index actual_kc = (std::min)(k2+kc,depth)-k2;
+      const Index actual_mc = (std::min)(i2+mc,rows)-i2;
 
-      // OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
-      // => Pack rhs's panel into a sequential chunk of memory (L2 caching)
-      // Note that this panel will be read as many times as the number of blocks in the lhs's
-      // vertical panel which is, in practice, a very low number.
-      pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, cols);
-
-      // For each mc x kc block of the lhs's vertical panel...
-      // (==GEPP_VAR1)
-      for(Index i2=0; i2<rows; i2+=mc)
+      for(Index k2=0; k2<depth; k2+=kc)
       {
-        const Index actual_mc = (std::min)(i2+mc,rows)-i2;
-
-        // We pack the lhs's block into a sequential chunk of memory (L1 caching)
-        // Note that this block will be read a very high number of times, which is equal to the number of
-        // micro vertical panel of the large rhs's panel (e.g., cols/4 times).
-        pack_lhs(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc);
-
-        // Everything is packed, we can now call the block * panel kernel:
-        gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0, blockW);
+        const Index actual_kc = (std::min)(k2+kc,depth)-k2;
+
+        // OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
+        // => Pack lhs's panel into a sequential chunk of memory (L2/L3 caching)
+        // Note that this panel will be read as many times as the number of blocks in the rhs's
+        // horizontal panel which is, in practice, a very low number.
+        pack_lhs(blockA, lhs.getSubMapper(i2,k2), actual_kc, actual_mc);
+
+        // For each kc x nc block of the rhs's horizontal panel...
+        for(Index j2=0; j2<cols; j2+=nc)
+        {
+          const Index actual_nc = (std::min)(j2+nc,cols)-j2;
+
+          // We pack the rhs's block into a sequential chunk of memory (L2 caching)
+          // Note that this block will be read a very high number of times, which is equal to the number of
+          // micro horizontal panel of the large rhs's panel (e.g., rows/12 times).
+          if((!pack_rhs_once) || i2==0)
+            pack_rhs(blockB, rhs.getSubMapper(k2,j2), actual_kc, actual_nc);
+
+          // Everything is packed, we can now call the panel * block kernel:
+          gebp(res.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, alpha);
+        }
       }
     }
   }
@@ -191,26 +201,21 @@ static void run(Index rows, Index cols, Index depth,
 };
 
 /*********************************************************************************
-*  Specialization of GeneralProduct<> for "large" GEMM, i.e.,
+*  Specialization of generic_product_impl for "large" GEMM, i.e.,
 *  implementation of the high level wrapper to general_matrix_matrix_product
 **********************************************************************************/
 
-template<typename Lhs, typename Rhs>
-struct traits<GeneralProduct<Lhs,Rhs,GemmProduct> >
- : traits<ProductBase<GeneralProduct<Lhs,Rhs,GemmProduct>, Lhs, Rhs> >
-{};
-
 template<typename Scalar, typename Index, typename Gemm, typename Lhs, typename Rhs, typename Dest, typename BlockingType>
 struct gemm_functor
 {
-  gemm_functor(const Lhs& lhs, const Rhs& rhs, Dest& dest, const Scalar& actualAlpha,
-                  BlockingType& blocking)
+  gemm_functor(const Lhs& lhs, const Rhs& rhs, Dest& dest, const Scalar& actualAlpha, BlockingType& blocking)
     : m_lhs(lhs), m_rhs(rhs), m_dest(dest), m_actualAlpha(actualAlpha), m_blocking(blocking)
   {}
 
-  void initParallelSession() const
+  void initParallelSession(Index num_threads) const
   {
-    m_blocking.allocateB();
+    m_blocking.initParallel(m_lhs.rows(), m_rhs.cols(), m_lhs.cols(), num_threads);
+    m_blocking.allocateA();
   }
 
   void operator() (Index row, Index rows, Index col=0, Index cols=-1, GemmParallelInfo<Index>* info=0) const
@@ -219,12 +224,14 @@ struct gemm_functor
       cols = m_rhs.cols();
 
     Gemm::run(rows, cols, m_lhs.cols(),
-              /*(const Scalar*)*/&m_lhs.coeffRef(row,0), m_lhs.outerStride(),
-              /*(const Scalar*)*/&m_rhs.coeffRef(0,col), m_rhs.outerStride(),
+              &m_lhs.coeffRef(row,0), m_lhs.outerStride(),
+              &m_rhs.coeffRef(0,col), m_rhs.outerStride(),
               (Scalar*)&(m_dest.coeffRef(row,col)), m_dest.outerStride(),
               m_actualAlpha, m_blocking, info);
   }
 
+  typedef typename Gemm::Traits Traits;
+
   protected:
     const Lhs& m_lhs;
     const Rhs& m_rhs;
@@ -245,29 +252,27 @@ class level3_blocking
   protected:
     LhsScalar* m_blockA;
     RhsScalar* m_blockB;
-    RhsScalar* m_blockW;
 
-    DenseIndex m_mc;
-    DenseIndex m_nc;
-    DenseIndex m_kc;
+    Index m_mc;
+    Index m_nc;
+    Index m_kc;
 
   public:
 
     level3_blocking()
-      : m_blockA(0), m_blockB(0), m_blockW(0), m_mc(0), m_nc(0), m_kc(0)
+      : m_blockA(0), m_blockB(0), m_mc(0), m_nc(0), m_kc(0)
     {}
 
-    inline DenseIndex mc() const { return m_mc; }
-    inline DenseIndex nc() const { return m_nc; }
-    inline DenseIndex kc() const { return m_kc; }
+    inline Index mc() const { return m_mc; }
+    inline Index nc() const { return m_nc; }
+    inline Index kc() const { return m_kc; }
 
     inline LhsScalar* blockA() { return m_blockA; }
     inline RhsScalar* blockB() { return m_blockB; }
-    inline RhsScalar* blockW() { return m_blockW; }
 };
 
 template<int StorageOrder, typename _LhsScalar, typename _RhsScalar, int MaxRows, int MaxCols, int MaxDepth, int KcFactor>
-class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, MaxDepth, KcFactor, true>
+class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, MaxDepth, KcFactor, true /* == FiniteAtCompileTime */>
   : public level3_blocking<
       typename conditional<StorageOrder==RowMajor,_RhsScalar,_LhsScalar>::type,
       typename conditional<StorageOrder==RowMajor,_LhsScalar,_RhsScalar>::type>
@@ -282,29 +287,38 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
     typedef gebp_traits<LhsScalar,RhsScalar> Traits;
     enum {
       SizeA = ActualRows * MaxDepth,
-      SizeB = ActualCols * MaxDepth,
-      SizeW = MaxDepth * Traits::WorkSpaceFactor
+      SizeB = ActualCols * MaxDepth
     };
 
-    EIGEN_ALIGN16 LhsScalar m_staticA[SizeA];
-    EIGEN_ALIGN16 RhsScalar m_staticB[SizeB];
-    EIGEN_ALIGN16 RhsScalar m_staticW[SizeW];
+#if EIGEN_MAX_STATIC_ALIGN_BYTES >= EIGEN_DEFAULT_ALIGN_BYTES
+    EIGEN_ALIGN_MAX LhsScalar m_staticA[SizeA];
+    EIGEN_ALIGN_MAX RhsScalar m_staticB[SizeB];
+#else
+    EIGEN_ALIGN_MAX char m_staticA[SizeA * sizeof(LhsScalar) + EIGEN_DEFAULT_ALIGN_BYTES-1];
+    EIGEN_ALIGN_MAX char m_staticB[SizeB * sizeof(RhsScalar) + EIGEN_DEFAULT_ALIGN_BYTES-1];
+#endif
 
   public:
 
-    gemm_blocking_space(DenseIndex /*rows*/, DenseIndex /*cols*/, DenseIndex /*depth*/)
+    gemm_blocking_space(Index /*rows*/, Index /*cols*/, Index /*depth*/, Index /*num_threads*/, bool /*full_rows = false*/)
     {
       this->m_mc = ActualRows;
       this->m_nc = ActualCols;
       this->m_kc = MaxDepth;
+#if EIGEN_MAX_STATIC_ALIGN_BYTES >= EIGEN_DEFAULT_ALIGN_BYTES
       this->m_blockA = m_staticA;
       this->m_blockB = m_staticB;
-      this->m_blockW = m_staticW;
+#else
+      this->m_blockA = reinterpret_cast<LhsScalar*>((internal::UIntPtr(m_staticA) + (EIGEN_DEFAULT_ALIGN_BYTES-1)) & ~std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1));
+      this->m_blockB = reinterpret_cast<RhsScalar*>((internal::UIntPtr(m_staticB) + (EIGEN_DEFAULT_ALIGN_BYTES-1)) & ~std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1));
+#endif
     }
 
+    void initParallel(Index, Index, Index, Index)
+    {}
+
     inline void allocateA() {}
     inline void allocateB() {}
-    inline void allocateW() {}
     inline void allocateAll() {}
 };
 
@@ -321,22 +335,42 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
     typedef typename conditional<Transpose,_LhsScalar,_RhsScalar>::type RhsScalar;
     typedef gebp_traits<LhsScalar,RhsScalar> Traits;
 
-    DenseIndex m_sizeA;
-    DenseIndex m_sizeB;
-    DenseIndex m_sizeW;
+    Index m_sizeA;
+    Index m_sizeB;
 
   public:
 
-    gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth)
+    gemm_blocking_space(Index rows, Index cols, Index depth, Index num_threads, bool l3_blocking)
     {
       this->m_mc = Transpose ? cols : rows;
       this->m_nc = Transpose ? rows : cols;
       this->m_kc = depth;
 
-      computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, this->m_nc);
+      if(l3_blocking)
+      {
+        computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, this->m_nc, num_threads);
+      }
+      else  // no l3 blocking
+      {
+        Index n = this->m_nc;
+        computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, n, num_threads);
+      }
+
+      m_sizeA = this->m_mc * this->m_kc;
+      m_sizeB = this->m_kc * this->m_nc;
+    }
+
+    void initParallel(Index rows, Index cols, Index depth, Index num_threads)
+    {
+      this->m_mc = Transpose ? cols : rows;
+      this->m_nc = Transpose ? rows : cols;
+      this->m_kc = depth;
+
+      eigen_internal_assert(this->m_blockA==0 && this->m_blockB==0);
+      Index m = this->m_mc;
+      computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, this->m_nc, num_threads);
       m_sizeA = this->m_mc * this->m_kc;
       m_sizeB = this->m_kc * this->m_nc;
-      m_sizeW = this->m_kc*Traits::WorkSpaceFactor;
     }
 
     void allocateA()
@@ -351,77 +385,108 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
         this->m_blockB = aligned_new<RhsScalar>(m_sizeB);
     }
 
-    void allocateW()
-    {
-      if(this->m_blockW==0)
-        this->m_blockW = aligned_new<RhsScalar>(m_sizeW);
-    }
-
     void allocateAll()
     {
       allocateA();
       allocateB();
-      allocateW();
     }
 
     ~gemm_blocking_space()
     {
       aligned_delete(this->m_blockA, m_sizeA);
       aligned_delete(this->m_blockB, m_sizeB);
-      aligned_delete(this->m_blockW, m_sizeW);
     }
 };
 
 } // end namespace internal
 
+namespace internal {
+
 template<typename Lhs, typename Rhs>
-class GeneralProduct<Lhs, Rhs, GemmProduct>
-  : public ProductBase<GeneralProduct<Lhs,Rhs,GemmProduct>, Lhs, Rhs>
+struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
+  : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct> >
 {
-    enum {
-      MaxDepthAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(Lhs::MaxColsAtCompileTime,Rhs::MaxRowsAtCompileTime)
-    };
-  public:
-    EIGEN_PRODUCT_PUBLIC_INTERFACE(GeneralProduct)
-    
-    typedef typename  Lhs::Scalar LhsScalar;
-    typedef typename  Rhs::Scalar RhsScalar;
-    typedef           Scalar      ResScalar;
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+  typedef typename Lhs::Scalar LhsScalar;
+  typedef typename Rhs::Scalar RhsScalar;
 
-    GeneralProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs)
-    {
-      typedef internal::scalar_product_op<LhsScalar,RhsScalar> BinOp;
-      EIGEN_CHECK_BINARY_COMPATIBILIY(BinOp,LhsScalar,RhsScalar);
-    }
+  typedef internal::blas_traits<Lhs> LhsBlasTraits;
+  typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+  typedef typename internal::remove_all<ActualLhsType>::type ActualLhsTypeCleaned;
 
-    template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
-    {
-      eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());
+  typedef internal::blas_traits<Rhs> RhsBlasTraits;
+  typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+  typedef typename internal::remove_all<ActualRhsType>::type ActualRhsTypeCleaned;
 
-      typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(m_lhs);
-      typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(m_rhs);
+  enum {
+    MaxDepthAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(Lhs::MaxColsAtCompileTime,Rhs::MaxRowsAtCompileTime)
+  };
 
-      Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
-                                 * RhsBlasTraits::extractScalarFactor(m_rhs);
+  typedef generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode> lazyproduct;
 
-      typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,LhsScalar,RhsScalar,
-              Dest::MaxRowsAtCompileTime,Dest::MaxColsAtCompileTime,MaxDepthAtCompileTime> BlockingType;
+  template<typename Dst>
+  static void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0)
+      lazyproduct::evalTo(dst, lhs, rhs);
+    else
+    {
+      dst.setZero();
+      scaleAndAddTo(dst, lhs, rhs, Scalar(1));
+    }
+  }
 
-      typedef internal::gemm_functor<
-        Scalar, Index,
-        internal::general_matrix_matrix_product<
-          Index,
-          LhsScalar, (_ActualLhsType::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(LhsBlasTraits::NeedToConjugate),
-          RhsScalar, (_ActualRhsType::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(RhsBlasTraits::NeedToConjugate),
-          (Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>,
-        _ActualLhsType, _ActualRhsType, Dest, BlockingType> GemmFunctor;
+  template<typename Dst>
+  static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0)
+      lazyproduct::addTo(dst, lhs, rhs);
+    else
+      scaleAndAddTo(dst,lhs, rhs, Scalar(1));
+  }
 
-      BlockingType blocking(dst.rows(), dst.cols(), lhs.cols());
+  template<typename Dst>
+  static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0)
+      lazyproduct::subTo(dst, lhs, rhs);
+    else
+      scaleAndAddTo(dst, lhs, rhs, Scalar(-1));
+  }
 
-      internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), this->rows(), this->cols(), Dest::Flags&RowMajorBit);
-    }
+  template<typename Dest>
+  static void scaleAndAddTo(Dest& dst, const Lhs& a_lhs, const Rhs& a_rhs, const Scalar& alpha)
+  {
+    eigen_assert(dst.rows()==a_lhs.rows() && dst.cols()==a_rhs.cols());
+    if(a_lhs.cols()==0 || a_lhs.rows()==0 || a_rhs.cols()==0)
+      return;
+
+    typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs);
+    typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs);
+
+    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs)
+                               * RhsBlasTraits::extractScalarFactor(a_rhs);
+
+    typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,LhsScalar,RhsScalar,
+            Dest::MaxRowsAtCompileTime,Dest::MaxColsAtCompileTime,MaxDepthAtCompileTime> BlockingType;
+
+    typedef internal::gemm_functor<
+      Scalar, Index,
+      internal::general_matrix_matrix_product<
+        Index,
+        LhsScalar, (ActualLhsTypeCleaned::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(LhsBlasTraits::NeedToConjugate),
+        RhsScalar, (ActualRhsTypeCleaned::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(RhsBlasTraits::NeedToConjugate),
+        (Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>,
+      ActualLhsTypeCleaned, ActualRhsTypeCleaned, Dest, BlockingType> GemmFunctor;
+
+    BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true);
+    internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>
+        (GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), a_lhs.rows(), a_rhs.cols(), a_lhs.cols(), Dest::Flags&RowMajorBit);
+  }
 };
 
+} // end namespace internal
+
 } // end namespace Eigen
 
 #endif // EIGEN_GENERAL_MATRIX_MATRIX_H
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
index 5c3763909..7122efa60 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
@@ -20,7 +20,7 @@ namespace internal {
 /**********************************************************************
 * This file implements a general A * B product while
 * evaluating only one triangular part of the product.
-* This is more general version of self adjoint product (C += A A^T)
+* This is a more general version of self adjoint product (C += A A^T)
 * as the level 3 SYRK Blas routine.
 **********************************************************************/
 
@@ -40,15 +40,16 @@ template <typename Index, typename LhsScalar, int LhsStorageOrder, bool Conjugat
                           typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, int  UpLo, int Version>
 struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor,UpLo,Version>
 {
-  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
   static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* lhs, Index lhsStride,
-                                      const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resStride, const ResScalar& alpha)
+                                      const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resStride,
+                                      const ResScalar& alpha, level3_blocking<RhsScalar,LhsScalar>& blocking)
   {
     general_matrix_matrix_triangular_product<Index,
         RhsScalar, RhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateRhs,
         LhsScalar, LhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateLhs,
         ColMajor, UpLo==Lower?Upper:Lower>
-      ::run(size,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha);
+      ::run(size,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha,blocking);
   }
 };
 
@@ -56,32 +57,36 @@ template <typename Index, typename LhsScalar, int LhsStorageOrder, bool Conjugat
                           typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, int  UpLo, int Version>
 struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor,UpLo,Version>
 {
-  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
   static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride,
-                                      const RhsScalar* _rhs, Index rhsStride, ResScalar* res, Index resStride, const ResScalar& alpha)
+                                      const RhsScalar* _rhs, Index rhsStride, ResScalar* _res, Index resStride,
+                                      const ResScalar& alpha, level3_blocking<LhsScalar,RhsScalar>& blocking)
   {
-    const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
-    const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
-
     typedef gebp_traits<LhsScalar,RhsScalar> Traits;
 
-    Index kc = depth; // cache block size along the K direction
-    Index mc = size;  // cache block size along the M direction
-    Index nc = size;  // cache block size along the N direction
-    computeProductBlockingSizes<LhsScalar,RhsScalar>(kc, mc, nc);
+    typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper;
+    typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper;
+    typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
+    LhsMapper lhs(_lhs,lhsStride);
+    RhsMapper rhs(_rhs,rhsStride);
+    ResMapper res(_res, resStride);
+
+    Index kc = blocking.kc();
+    Index mc = (std::min)(size,blocking.mc());
+
     // !!! mc must be a multiple of nr:
     if(mc > Traits::nr)
       mc = (mc/Traits::nr)*Traits::nr;
 
-    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
-    std::size_t sizeB = sizeW + kc*size;
-    ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, kc*mc, 0);
-    ei_declare_aligned_stack_constructed_variable(RhsScalar, allocatedBlockB, sizeB, 0);
-    RhsScalar* blockB = allocatedBlockB + sizeW;
-    
-    gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
-    gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
-    gebp_kernel <LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
+    std::size_t sizeA = kc*mc;
+    std::size_t sizeB = kc*size;
+
+    ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA());
+    ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());
+
+    gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+    gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
+    gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
     tribb_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs, UpLo> sybb;
 
     for(Index k2=0; k2<depth; k2+=kc)
@@ -89,29 +94,30 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
       const Index actual_kc = (std::min)(k2+kc,depth)-k2;
 
       // note that the actual rhs is the transpose/adjoint of mat
-      pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, size);
+      pack_rhs(blockB, rhs.getSubMapper(k2,0), actual_kc, size);
 
       for(Index i2=0; i2<size; i2+=mc)
       {
         const Index actual_mc = (std::min)(i2+mc,size)-i2;
 
-        pack_lhs(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);
+        pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
 
         // the selected actual_mc * size panel of res is split into three different part:
         //  1 - before the diagonal => processed with gebp or skipped
         //  2 - the actual_mc x actual_mc symmetric block => processed with a special kernel
         //  3 - after the diagonal => processed with gebp or skipped
         if (UpLo==Lower)
-          gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, (std::min)(size,i2), alpha,
-               -1, -1, 0, 0, allocatedBlockB);
+          gebp(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc,
+               (std::min)(size,i2), alpha, -1, -1, 0, 0);
+
 
-        sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha, allocatedBlockB);
+        sybb(_res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha);
 
         if (UpLo==Upper)
         {
           Index j2 = i2+actual_mc;
-          gebp(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, actual_mc, actual_kc, (std::max)(Index(0), size-j2), alpha,
-               -1, -1, 0, 0, allocatedBlockB);
+          gebp(res.getSubMapper(i2, j2), blockA, blockB+actual_kc*j2, actual_mc,
+               actual_kc, (std::max)(Index(0), size-j2), alpha, -1, -1, 0, 0);
         }
       }
     }
@@ -132,14 +138,17 @@ struct tribb_kernel
 {
   typedef gebp_traits<LhsScalar,RhsScalar,ConjLhs,ConjRhs> Traits;
   typedef typename Traits::ResScalar ResScalar;
-  
+
   enum {
-    BlockSize  = EIGEN_PLAIN_ENUM_MAX(mr,nr)
+    BlockSize  = meta_least_common_multiple<EIGEN_PLAIN_ENUM_MAX(mr,nr),EIGEN_PLAIN_ENUM_MIN(mr,nr)>::ret
   };
-  void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha, RhsScalar* workspace)
+  void operator()(ResScalar* _res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha)
   {
-    gebp_kernel<LhsScalar, RhsScalar, Index, mr, nr, ConjLhs, ConjRhs> gebp_kernel;
-    Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer;
+    typedef blas_data_mapper<ResScalar, Index, ColMajor> ResMapper;
+    ResMapper res(_res, resStride);
+    gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel;
+
+    Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer((internal::constructor_without_unaligned_array_assert()));
 
     // let's process the block per panel of actual_mc x BlockSize,
     // again, each is split into three parts, etc.
@@ -149,20 +158,20 @@ struct tribb_kernel
       const RhsScalar* actual_b = blockB+j*depth;
 
       if(UpLo==Upper)
-        gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize, alpha,
-                    -1, -1, 0, 0, workspace);
+        gebp_kernel(res.getSubMapper(0, j), blockA, actual_b, j, depth, actualBlockSize, alpha,
+                    -1, -1, 0, 0);
 
       // selfadjoint micro block
       {
         Index i = j;
         buffer.setZero();
         // 1 - apply the kernel on the temporary buffer
-        gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha,
-                    -1, -1, 0, 0, workspace);
+        gebp_kernel(ResMapper(buffer.data(), BlockSize), blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha,
+                    -1, -1, 0, 0);
         // 2 - triangular accumulation
         for(Index j1=0; j1<actualBlockSize; ++j1)
         {
-          ResScalar* r = res + (j+j1)*resStride + i;
+          ResScalar* r = &res(i, j + j1);
           for(Index i1=UpLo==Lower ? j1 : 0;
               UpLo==Lower ? i1<actualBlockSize : i1<=j1; ++i1)
             r[i1] += buffer(i1,j1);
@@ -172,8 +181,8 @@ struct tribb_kernel
       if(UpLo==Lower)
       {
         Index i = j+actualBlockSize;
-        gebp_kernel(res+j*resStride+i, resStride, blockA+depth*i, actual_b, size-i, depth, actualBlockSize, alpha,
-                    -1, -1, 0, 0, workspace);
+        gebp_kernel(res.getSubMapper(i, j), blockA+depth*i, actual_b, size-i, 
+                    depth, actualBlockSize, alpha, -1, -1, 0, 0);
       }
     }
   }
@@ -190,10 +199,9 @@ struct general_product_to_triangular_selector;
 template<typename MatrixType, typename ProductType, int UpLo>
 struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,true>
 {
-  static void run(MatrixType& mat, const ProductType& prod, const typename MatrixType::Scalar& alpha)
+  static void run(MatrixType& mat, const ProductType& prod, const typename MatrixType::Scalar& alpha, bool beta)
   {
     typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::Index Index;
     
     typedef typename internal::remove_all<typename ProductType::LhsNested>::type Lhs;
     typedef internal::blas_traits<Lhs> LhsBlasTraits;
@@ -209,6 +217,9 @@ struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,true>
 
     Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs().derived()) * RhsBlasTraits::extractScalarFactor(prod.rhs().derived());
 
+    if(!beta)
+      mat.template triangularView<UpLo>().setZero();
+
     enum {
       StorageOrder = (internal::traits<MatrixType>::Flags&RowMajorBit) ? RowMajor : ColMajor,
       UseLhsDirectly = _ActualLhs::InnerStrideAtCompileTime==1,
@@ -236,10 +247,8 @@ struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,true>
 template<typename MatrixType, typename ProductType, int UpLo>
 struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,false>
 {
-  static void run(MatrixType& mat, const ProductType& prod, const typename MatrixType::Scalar& alpha)
+  static void run(MatrixType& mat, const ProductType& prod, const typename MatrixType::Scalar& alpha, bool beta)
   {
-    typedef typename MatrixType::Index Index;
-    
     typedef typename internal::remove_all<typename ProductType::LhsNested>::type Lhs;
     typedef internal::blas_traits<Lhs> LhsBlasTraits;
     typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhs;
@@ -254,23 +263,42 @@ struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,false>
 
     typename ProductType::Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs().derived()) * RhsBlasTraits::extractScalarFactor(prod.rhs().derived());
 
+    if(!beta)
+      mat.template triangularView<UpLo>().setZero();
+
+    enum {
+      IsRowMajor = (internal::traits<MatrixType>::Flags&RowMajorBit) ? 1 : 0,
+      LhsIsRowMajor = _ActualLhs::Flags&RowMajorBit ? 1 : 0,
+      RhsIsRowMajor = _ActualRhs::Flags&RowMajorBit ? 1 : 0
+    };
+
+    Index size = mat.cols();
+    Index depth = actualLhs.cols();
+
+    typedef internal::gemm_blocking_space<IsRowMajor ? RowMajor : ColMajor,typename Lhs::Scalar,typename Rhs::Scalar,
+          MatrixType::MaxColsAtCompileTime, MatrixType::MaxColsAtCompileTime, _ActualRhs::MaxColsAtCompileTime> BlockingType;
+
+    BlockingType blocking(size, size, depth, 1, false);
+
     internal::general_matrix_matrix_triangular_product<Index,
-      typename Lhs::Scalar, _ActualLhs::Flags&RowMajorBit ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate,
-      typename Rhs::Scalar, _ActualRhs::Flags&RowMajorBit ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate,
-      MatrixType::Flags&RowMajorBit ? RowMajor : ColMajor, UpLo>
-      ::run(mat.cols(), actualLhs.cols(),
+      typename Lhs::Scalar, LhsIsRowMajor ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate,
+      typename Rhs::Scalar, RhsIsRowMajor ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate,
+      IsRowMajor ? RowMajor : ColMajor, UpLo>
+      ::run(size, depth,
             &actualLhs.coeffRef(0,0), actualLhs.outerStride(), &actualRhs.coeffRef(0,0), actualRhs.outerStride(),
-            mat.data(), mat.outerStride(), actualAlpha);
+            mat.data(), mat.outerStride(), actualAlpha, blocking);
   }
 };
 
 template<typename MatrixType, unsigned int UpLo>
-template<typename ProductDerived, typename _Lhs, typename _Rhs>
-TriangularView<MatrixType,UpLo>& TriangularView<MatrixType,UpLo>::assignProduct(const ProductBase<ProductDerived, _Lhs,_Rhs>& prod, const Scalar& alpha)
+template<typename ProductType>
+TriangularView<MatrixType,UpLo>& TriangularViewImpl<MatrixType,UpLo,Dense>::_assignProduct(const ProductType& prod, const Scalar& alpha, bool beta)
 {
-  general_product_to_triangular_selector<MatrixType, ProductDerived, UpLo, (_Lhs::ColsAtCompileTime==1) || (_Rhs::RowsAtCompileTime==1)>::run(m_matrix.const_cast_derived(), prod.derived(), alpha);
+  eigen_assert(derived().nestedExpression().rows() == prod.rows() && derived().cols() == prod.cols());
+  
+  general_product_to_triangular_selector<MatrixType, ProductType, UpLo, internal::traits<ProductType>::InnerSize==1>::run(derived().nestedExpression().const_cast_derived(), prod, alpha, beta);
   
-  return *this;
+  return derived();
 }
 
 } // end namespace Eigen
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h
index 3deed068e..5b7c15cca 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h
@@ -25,15 +25,15 @@
  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
  ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
+ *   Content : Eigen bindings to BLAS F77
  *   Level 3 BLAS SYRK/HERK implementation.
  ********************************************************************************
 */
 
-#ifndef EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_MKL_H
-#define EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_MKL_H
+#ifndef EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_BLAS_H
+#define EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_BLAS_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
 
@@ -44,34 +44,35 @@ struct general_matrix_matrix_rankupdate :
 
 
 // try to go to BLAS specialization
-#define EIGEN_MKL_RANKUPDATE_SPECIALIZE(Scalar) \
+#define EIGEN_BLAS_RANKUPDATE_SPECIALIZE(Scalar) \
 template <typename Index, int LhsStorageOrder, bool ConjugateLhs, \
                           int RhsStorageOrder, bool ConjugateRhs, int  UpLo> \
 struct general_matrix_matrix_triangular_product<Index,Scalar,LhsStorageOrder,ConjugateLhs, \
                Scalar,RhsStorageOrder,ConjugateRhs,ColMajor,UpLo,Specialized> { \
   static EIGEN_STRONG_INLINE void run(Index size, Index depth,const Scalar* lhs, Index lhsStride, \
-                          const Scalar* rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha) \
+                          const Scalar* rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha, level3_blocking<Scalar, Scalar>& blocking) \
   { \
     if (lhs==rhs) { \
       general_matrix_matrix_rankupdate<Index,Scalar,LhsStorageOrder,ConjugateLhs,ColMajor,UpLo> \
-      ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha); \
+      ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha,blocking); \
     } else { \
       general_matrix_matrix_triangular_product<Index, \
         Scalar, LhsStorageOrder, ConjugateLhs, \
         Scalar, RhsStorageOrder, ConjugateRhs, \
         ColMajor, UpLo, BuiltIn> \
-      ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha); \
+      ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha,blocking); \
     } \
   } \
 };
 
-EIGEN_MKL_RANKUPDATE_SPECIALIZE(double)
-//EIGEN_MKL_RANKUPDATE_SPECIALIZE(dcomplex)
-EIGEN_MKL_RANKUPDATE_SPECIALIZE(float)
-//EIGEN_MKL_RANKUPDATE_SPECIALIZE(scomplex)
+EIGEN_BLAS_RANKUPDATE_SPECIALIZE(double)
+EIGEN_BLAS_RANKUPDATE_SPECIALIZE(float)
+// TODO handle complex cases
+// EIGEN_BLAS_RANKUPDATE_SPECIALIZE(dcomplex)
+// EIGEN_BLAS_RANKUPDATE_SPECIALIZE(scomplex)
 
 // SYRK for float/double
-#define EIGEN_MKL_RANKUPDATE_R(EIGTYPE, MKLTYPE, MKLFUNC) \
+#define EIGEN_BLAS_RANKUPDATE_R(EIGTYPE, BLASTYPE, BLASFUNC) \
 template <typename Index, int AStorageOrder, bool ConjugateA, int  UpLo> \
 struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,ColMajor,UpLo> { \
   enum { \
@@ -80,23 +81,19 @@ struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,C
     conjA = ((AStorageOrder==ColMajor) && ConjugateA) ? 1 : 0 \
   }; \
   static EIGEN_STRONG_INLINE void run(Index size, Index depth,const EIGTYPE* lhs, Index lhsStride, \
-                          const EIGTYPE* rhs, Index rhsStride, EIGTYPE* res, Index resStride, EIGTYPE alpha) \
+                          const EIGTYPE* /*rhs*/, Index /*rhsStride*/, EIGTYPE* res, Index resStride, EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
   { \
   /* typedef Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> MatrixRhs;*/ \
 \
-   MKL_INT lda=lhsStride, ldc=resStride, n=size, k=depth; \
-   char uplo=(IsLower) ? 'L' : 'U', trans=(AStorageOrder==RowMajor) ? 'T':'N'; \
-   MKLTYPE alpha_, beta_; \
-\
-/* Set alpha_ & beta_ */ \
-   assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); \
-   assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(beta_, EIGTYPE(1)); \
-   MKLFUNC(&uplo, &trans, &n, &k, &alpha_, lhs, &lda, &beta_, res, &ldc); \
+   BlasIndex lda=convert_index<BlasIndex>(lhsStride), ldc=convert_index<BlasIndex>(resStride), n=convert_index<BlasIndex>(size), k=convert_index<BlasIndex>(depth); \
+   char uplo=((IsLower) ? 'L' : 'U'), trans=((AStorageOrder==RowMajor) ? 'T':'N'); \
+   EIGTYPE beta(1); \
+   BLASFUNC(&uplo, &trans, &n, &k, &numext::real_ref(alpha), lhs, &lda, &numext::real_ref(beta), res, &ldc); \
   } \
 };
 
 // HERK for complex data
-#define EIGEN_MKL_RANKUPDATE_C(EIGTYPE, MKLTYPE, RTYPE, MKLFUNC) \
+#define EIGEN_BLAS_RANKUPDATE_C(EIGTYPE, BLASTYPE, RTYPE, BLASFUNC) \
 template <typename Index, int AStorageOrder, bool ConjugateA, int  UpLo> \
 struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,ColMajor,UpLo> { \
   enum { \
@@ -105,18 +102,15 @@ struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,C
     conjA = (((AStorageOrder==ColMajor) && ConjugateA) || ((AStorageOrder==RowMajor) && !ConjugateA)) ? 1 : 0 \
   }; \
   static EIGEN_STRONG_INLINE void run(Index size, Index depth,const EIGTYPE* lhs, Index lhsStride, \
-                          const EIGTYPE* rhs, Index rhsStride, EIGTYPE* res, Index resStride, EIGTYPE alpha) \
+                          const EIGTYPE* /*rhs*/, Index /*rhsStride*/, EIGTYPE* res, Index resStride, EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
   { \
    typedef Matrix<EIGTYPE, Dynamic, Dynamic, AStorageOrder> MatrixType; \
 \
-   MKL_INT lda=lhsStride, ldc=resStride, n=size, k=depth; \
-   char uplo=(IsLower) ? 'L' : 'U', trans=(AStorageOrder==RowMajor) ? 'C':'N'; \
+   BlasIndex lda=convert_index<BlasIndex>(lhsStride), ldc=convert_index<BlasIndex>(resStride), n=convert_index<BlasIndex>(size), k=convert_index<BlasIndex>(depth); \
+   char uplo=((IsLower) ? 'L' : 'U'), trans=((AStorageOrder==RowMajor) ? 'C':'N'); \
    RTYPE alpha_, beta_; \
    const EIGTYPE* a_ptr; \
 \
-/* Set alpha_ & beta_ */ \
-/*   assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); */\
-/*   assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(beta_, EIGTYPE(1));*/ \
    alpha_ = alpha.real(); \
    beta_ = 1.0; \
 /* Copy with conjugation in some cases*/ \
@@ -127,20 +121,21 @@ struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,C
      lda = a.outerStride(); \
      a_ptr = a.data(); \
    } else a_ptr=lhs; \
-   MKLFUNC(&uplo, &trans, &n, &k, &alpha_, (MKLTYPE*)a_ptr, &lda, &beta_, (MKLTYPE*)res, &ldc); \
+   BLASFUNC(&uplo, &trans, &n, &k, &alpha_, (BLASTYPE*)a_ptr, &lda, &beta_, (BLASTYPE*)res, &ldc); \
   } \
 };
 
 
-EIGEN_MKL_RANKUPDATE_R(double, double, dsyrk)
-EIGEN_MKL_RANKUPDATE_R(float,  float,  ssyrk)
+EIGEN_BLAS_RANKUPDATE_R(double, double, dsyrk_)
+EIGEN_BLAS_RANKUPDATE_R(float,  float,  ssyrk_)
 
-//EIGEN_MKL_RANKUPDATE_C(dcomplex, MKL_Complex16, double, zherk)
-//EIGEN_MKL_RANKUPDATE_C(scomplex, MKL_Complex8,  double, cherk)
+// TODO hanlde complex cases
+// EIGEN_BLAS_RANKUPDATE_C(dcomplex, double, double, zherk_)
+// EIGEN_BLAS_RANKUPDATE_C(scomplex, float,  float, cherk_)
 
 
 } // end namespace internal
 
 } // end namespace Eigen
 
-#endif // EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_MKL_H
+#endif // EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_BLAS_H
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h b/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h
index 060af328e..7a3bdbf20 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h
@@ -25,13 +25,13 @@
  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
  ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
+ *   Content : Eigen bindings to BLAS F77
  *   General matrix-matrix product functionality based on ?GEMM.
  ********************************************************************************
 */
 
-#ifndef EIGEN_GENERAL_MATRIX_MATRIX_MKL_H
-#define EIGEN_GENERAL_MATRIX_MATRIX_MKL_H
+#ifndef EIGEN_GENERAL_MATRIX_MATRIX_BLAS_H
+#define EIGEN_GENERAL_MATRIX_MATRIX_BLAS_H
 
 namespace Eigen { 
 
@@ -46,13 +46,15 @@ namespace internal {
 
 // gemm specialization
 
-#define GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, MKLTYPE, MKLPREFIX) \
+#define GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, BLASTYPE, BLASPREFIX) \
 template< \
   typename Index, \
   int LhsStorageOrder, bool ConjugateLhs, \
   int RhsStorageOrder, bool ConjugateRhs> \
 struct general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor> \
 { \
+typedef gebp_traits<EIGTYPE,EIGTYPE> Traits; \
+\
 static void run(Index rows, Index cols, Index depth, \
   const EIGTYPE* _lhs, Index lhsStride, \
   const EIGTYPE* _rhs, Index rhsStride, \
@@ -64,55 +66,50 @@ static void run(Index rows, Index cols, Index depth, \
   using std::conj; \
 \
   char transa, transb; \
-  MKL_INT m, n, k, lda, ldb, ldc; \
+  BlasIndex m, n, k, lda, ldb, ldc; \
   const EIGTYPE *a, *b; \
-  MKLTYPE alpha_, beta_; \
+  EIGTYPE beta(1); \
   MatrixX##EIGPREFIX a_tmp, b_tmp; \
-  EIGTYPE myone(1);\
 \
 /* Set transpose options */ \
   transa = (LhsStorageOrder==RowMajor) ? ((ConjugateLhs) ? 'C' : 'T') : 'N'; \
   transb = (RhsStorageOrder==RowMajor) ? ((ConjugateRhs) ? 'C' : 'T') : 'N'; \
 \
 /* Set m, n, k */ \
-  m = (MKL_INT)rows;  \
-  n = (MKL_INT)cols;  \
-  k = (MKL_INT)depth; \
-\
-/* Set alpha_ & beta_ */ \
-  assign_scalar_eig2mkl(alpha_, alpha); \
-  assign_scalar_eig2mkl(beta_, myone); \
+  m = convert_index<BlasIndex>(rows);  \
+  n = convert_index<BlasIndex>(cols);  \
+  k = convert_index<BlasIndex>(depth); \
 \
 /* Set lda, ldb, ldc */ \
-  lda = (MKL_INT)lhsStride; \
-  ldb = (MKL_INT)rhsStride; \
-  ldc = (MKL_INT)resStride; \
+  lda = convert_index<BlasIndex>(lhsStride); \
+  ldb = convert_index<BlasIndex>(rhsStride); \
+  ldc = convert_index<BlasIndex>(resStride); \
 \
 /* Set a, b, c */ \
   if ((LhsStorageOrder==ColMajor) && (ConjugateLhs)) { \
     Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,m,k,OuterStride<>(lhsStride)); \
     a_tmp = lhs.conjugate(); \
     a = a_tmp.data(); \
-    lda = a_tmp.outerStride(); \
+    lda = convert_index<BlasIndex>(a_tmp.outerStride()); \
   } else a = _lhs; \
 \
   if ((RhsStorageOrder==ColMajor) && (ConjugateRhs)) { \
     Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,k,n,OuterStride<>(rhsStride)); \
     b_tmp = rhs.conjugate(); \
     b = b_tmp.data(); \
-    ldb = b_tmp.outerStride(); \
+    ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
   } else b = _rhs; \
 \
-  MKLPREFIX##gemm(&transa, &transb, &m, &n, &k, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \
+  BLASPREFIX##gemm_(&transa, &transb, &m, &n, &k, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
 }};
 
-GEMM_SPECIALIZATION(double,   d,  double,        d)
-GEMM_SPECIALIZATION(float,    f,  float,         s)
-GEMM_SPECIALIZATION(dcomplex, cd, MKL_Complex16, z)
-GEMM_SPECIALIZATION(scomplex, cf, MKL_Complex8,  c)
+GEMM_SPECIALIZATION(double,   d,  double, d)
+GEMM_SPECIALIZATION(float,    f,  float,  s)
+GEMM_SPECIALIZATION(dcomplex, cd, double, z)
+GEMM_SPECIALIZATION(scomplex, cf, float,  c)
 
 } // end namespase internal
 
 } // end namespace Eigen
 
-#endif // EIGEN_GENERAL_MATRIX_MATRIX_MKL_H
+#endif // EIGEN_GENERAL_MATRIX_MATRIX_BLAS_H
diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h
index 09387703e..3c1a7fc40 100644
--- a/Eigen/src/Core/products/GeneralMatrixVector.h
+++ b/Eigen/src/Core/products/GeneralMatrixVector.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
 #define EIGEN_GENERAL_MATRIX_VECTOR_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
 
@@ -26,11 +26,39 @@ namespace internal {
  *  |real |cplx |real | alpha is converted to a cplx when calling the run function, no vectorization
  *  |cplx |real |cplx | invalid, the caller has to do tmp: = A * B; C += alpha*tmp
  *  |cplx |real |real | optimal case, vectorization possible via real-cplx mul
+ *
+ * Accesses to the matrix coefficients follow the following logic:
+ *
+ * - if all columns have the same alignment then
+ *   - if the columns have the same alignment as the result vector, then easy! (-> AllAligned case)
+ *   - otherwise perform unaligned loads only (-> NoneAligned case)
+ * - otherwise
+ *   - if even columns have the same alignment then
+ *     // odd columns are guaranteed to have the same alignment too
+ *     - if even or odd columns have the same alignment as the result, then
+ *       // for a register size of 2 scalars, this is guarantee to be the case (e.g., SSE with double)
+ *       - perform half aligned and half unaligned loads (-> EvenAligned case)
+ *     - otherwise perform unaligned loads only (-> NoneAligned case)
+ *   - otherwise, if the register size is 4 scalars (e.g., SSE with float) then
+ *     - one over 4 consecutive columns is guaranteed to be aligned with the result vector,
+ *       perform simple aligned loads for this column and aligned loads plus re-alignment for the other. (-> FirstAligned case)
+ *       // this re-alignment is done by the palign function implemented for SSE in Eigen/src/Core/arch/SSE/PacketMath.h
+ *   - otherwise,
+ *     // if we get here, this means the register size is greater than 4 (e.g., AVX with floats),
+ *     // we currently fall back to the NoneAligned case
+ *
+ * The same reasoning apply for the transposed case.
+ *
+ * The last case (PacketSize>4) could probably be improved by generalizing the FirstAligned case, but since we do not support AVX yet...
+ * One might also wonder why in the EvenAligned case we perform unaligned loads instead of using the aligned-loads plus re-alignment
+ * strategy as in the FirstAligned case. The reason is that we observed that unaligned loads on a 8 byte boundary are not too slow
+ * compared to unaligned loads on a 4 byte boundary.
+ *
  */
-template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
-struct general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
+template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
+struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
 {
-typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
 
 enum {
   Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
@@ -50,31 +78,35 @@ typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
 
 EIGEN_DONT_INLINE static void run(
   Index rows, Index cols,
-  const LhsScalar* lhs, Index lhsStride,
-  const RhsScalar* rhs, Index rhsIncr,
-  ResScalar* res, Index resIncr, RhsScalar alpha);
+  const LhsMapper& lhs,
+  const RhsMapper& rhs,
+        ResScalar* res, Index resIncr,
+  RhsScalar alpha);
 };
 
-template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
-EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>::run(
+template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
+EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
   Index rows, Index cols,
-  const LhsScalar* lhs, Index lhsStride,
-  const RhsScalar* rhs, Index rhsIncr,
-  ResScalar* res, Index resIncr, RhsScalar alpha)
+  const LhsMapper& lhs,
+  const RhsMapper& rhs,
+        ResScalar* res, Index resIncr,
+  RhsScalar alpha)
 {
-  EIGEN_UNUSED_VARIABLE(resIncr)
+  EIGEN_UNUSED_VARIABLE(resIncr);
   eigen_internal_assert(resIncr==1);
   #ifdef _EIGEN_ACCUMULATE_PACKETS
   #error _EIGEN_ACCUMULATE_PACKETS has already been defined
   #endif
-  #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) \
+  #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) \
     pstore(&res[j], \
       padd(pload<ResPacket>(&res[j]), \
         padd( \
-          padd(pcj.pmul(EIGEN_CAT(ploa , A0)<LhsPacket>(&lhs0[j]),    ptmp0), \
-                  pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs1[j]),   ptmp1)), \
-          padd(pcj.pmul(EIGEN_CAT(ploa , A2)<LhsPacket>(&lhs2[j]),    ptmp2), \
-                  pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs3[j]),   ptmp3)) )))
+      padd(pcj.pmul(lhs0.template load<LhsPacket, Alignment0>(j),    ptmp0), \
+      pcj.pmul(lhs1.template load<LhsPacket, Alignment13>(j),   ptmp1)),   \
+      padd(pcj.pmul(lhs2.template load<LhsPacket, Alignment2>(j),    ptmp2), \
+      pcj.pmul(lhs3.template load<LhsPacket, Alignment13>(j),   ptmp3)) )))
+
+  typedef typename LhsMapper::VectorMapper LhsScalars;
 
   conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
   conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
@@ -88,10 +120,12 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
   const Index ResPacketAlignedMask = ResPacketSize-1;
 //  const Index PeelAlignedMask = ResPacketSize*peels-1;
   const Index size = rows;
-  
+
+  const Index lhsStride = lhs.stride();
+
   // How many coeffs of the result do we have to skip to be aligned.
   // Here we assume data are at least aligned on the base scalar type.
-  Index alignedStart = internal::first_aligned(res,size);
+  Index alignedStart = internal::first_default_aligned(res,size);
   Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
   const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
 
@@ -101,19 +135,26 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
                        : FirstAligned;
 
   // we cannot assume the first element is aligned because of sub-matrices
-  const Index lhsAlignmentOffset = internal::first_aligned(lhs,size);
+  const Index lhsAlignmentOffset = lhs.firstAligned(size);
 
   // find how many columns do we have to skip to be aligned with the result (if possible)
   Index skipColumns = 0;
   // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
-  if( (size_t(lhs)%sizeof(LhsScalar)) || (size_t(res)%sizeof(ResScalar)) )
+  if( (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == size) || (UIntPtr(res)%sizeof(ResScalar)) )
   {
     alignedSize = 0;
     alignedStart = 0;
+    alignmentPattern = NoneAligned;
+  }
+  else if(LhsPacketSize > 4)
+  {
+    // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
+    // Currently, it seems to be better to perform unaligned loads anyway
+    alignmentPattern = NoneAligned;
   }
   else if (LhsPacketSize>1)
   {
-    eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);
+  //    eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);
 
     while (skipColumns<LhsPacketSize &&
           alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize))
@@ -130,10 +171,10 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
       // note that the skiped columns are processed later.
     }
 
-    eigen_internal_assert(  (alignmentPattern==NoneAligned)
+    /*    eigen_internal_assert(  (alignmentPattern==NoneAligned)
                       || (skipColumns + columnsAtOnce >= cols)
                       || LhsPacketSize > size
-                      || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);
+                      || (size_t(firstLhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);*/
   }
   else if(Vectorizable)
   {
@@ -142,20 +183,20 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
     alignmentPattern = AllAligned;
   }
 
-  Index offset1 = (FirstAligned && alignmentStep==1?3:1);
-  Index offset3 = (FirstAligned && alignmentStep==1?1:3);
+  const Index offset1 = (FirstAligned && alignmentStep==1)?3:1;
+  const Index offset3 = (FirstAligned && alignmentStep==1)?1:3;
 
   Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
   for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
   {
-    RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[i*rhsIncr]),
-              ptmp1 = pset1<RhsPacket>(alpha*rhs[(i+offset1)*rhsIncr]),
-              ptmp2 = pset1<RhsPacket>(alpha*rhs[(i+2)*rhsIncr]),
-              ptmp3 = pset1<RhsPacket>(alpha*rhs[(i+offset3)*rhsIncr]);
+    RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(i, 0)),
+              ptmp1 = pset1<RhsPacket>(alpha*rhs(i+offset1, 0)),
+              ptmp2 = pset1<RhsPacket>(alpha*rhs(i+2, 0)),
+              ptmp3 = pset1<RhsPacket>(alpha*rhs(i+offset3, 0));
 
     // this helps a lot generating better binary code
-    const LhsScalar *lhs0 = lhs + i*lhsStride,     *lhs1 = lhs + (i+offset1)*lhsStride,
-                    *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
+    const LhsScalars lhs0 = lhs.getVectorMapper(0, i+0),   lhs1 = lhs.getVectorMapper(0, i+offset1),
+                     lhs2 = lhs.getVectorMapper(0, i+2),   lhs3 = lhs.getVectorMapper(0, i+offset3);
 
     if (Vectorizable)
     {
@@ -163,10 +204,10 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
       // process initial unaligned coeffs
       for (Index j=0; j<alignedStart; ++j)
       {
-        res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
-        res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
-        res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
-        res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
+        res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]);
+        res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]);
+        res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]);
+        res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]);
       }
 
       if (alignedSize>alignedStart)
@@ -175,11 +216,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
         {
           case AllAligned:
             for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(d,d,d);
+              _EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned);
             break;
           case EvenAligned:
             for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(d,du,d);
+              _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned);
             break;
           case FirstAligned:
           {
@@ -189,28 +230,28 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
               LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
               ResPacket T0, T1;
 
-              A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
-              A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
-              A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);
+              A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1);
+              A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2);
+              A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3);
 
               for (; j<peeledSize; j+=peels*ResPacketSize)
               {
-                A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]);  palign<1>(A01,A11);
-                A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]);  palign<2>(A02,A12);
-                A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]);  palign<3>(A03,A13);
+                A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize);  palign<1>(A01,A11);
+                A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize);  palign<2>(A02,A12);
+                A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize);  palign<3>(A03,A13);
 
-                A00 = pload<LhsPacket>(&lhs0[j]);
-                A10 = pload<LhsPacket>(&lhs0[j+LhsPacketSize]);
+                A00 = lhs0.template load<LhsPacket, Aligned>(j);
+                A10 = lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize);
                 T0  = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
                 T1  = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));
 
                 T0  = pcj.pmadd(A01, ptmp1, T0);
-                A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]);  palign<1>(A11,A01);
+                A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize);  palign<1>(A11,A01);
                 T0  = pcj.pmadd(A02, ptmp2, T0);
-                A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]);  palign<2>(A12,A02);
+                A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize);  palign<2>(A12,A02);
                 T0  = pcj.pmadd(A03, ptmp3, T0);
                 pstore(&res[j],T0);
-                A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]);  palign<3>(A13,A03);
+                A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize);  palign<3>(A13,A03);
                 T1  = pcj.pmadd(A11, ptmp1, T1);
                 T1  = pcj.pmadd(A12, ptmp2, T1);
                 T1  = pcj.pmadd(A13, ptmp3, T1);
@@ -218,12 +259,12 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
               }
             }
             for (; j<alignedSize; j+=ResPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(d,du,du);
+              _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned);
             break;
           }
           default:
             for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(du,du,du);
+              _EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned);
             break;
         }
       }
@@ -232,10 +273,10 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
     /* process remaining coeffs (or all if there is no explicit vectorization) */
     for (Index j=alignedSize; j<size; ++j)
     {
-      res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
-      res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
-      res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
-      res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
+      res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]);
+      res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]);
+      res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]);
+      res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]);
     }
   }
 
@@ -246,27 +287,27 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
   {
     for (Index k=start; k<end; ++k)
     {
-      RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[k*rhsIncr]);
-      const LhsScalar* lhs0 = lhs + k*lhsStride;
+      RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(k, 0));
+      const LhsScalars lhs0 = lhs.getVectorMapper(0, k);
 
       if (Vectorizable)
       {
         /* explicit vectorization */
         // process first unaligned result's coeffs
         for (Index j=0; j<alignedStart; ++j)
-          res[j] += cj.pmul(lhs0[j], pfirst(ptmp0));
+          res[j] += cj.pmul(lhs0(j), pfirst(ptmp0));
         // process aligned result's coeffs
-        if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
+        if (lhs0.template aligned<LhsPacket>(alignedStart))
           for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
-            pstore(&res[i], pcj.pmadd(pload<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
+            pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(i), ptmp0, pload<ResPacket>(&res[i])));
         else
           for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
-            pstore(&res[i], pcj.pmadd(ploadu<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
+            pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(i), ptmp0, pload<ResPacket>(&res[i])));
       }
 
       // process remaining scalars (or all if no explicit vectorization)
       for (Index i=alignedSize; i<size; ++i)
-        res[i] += cj.pmul(lhs0[i], pfirst(ptmp0));
+        res[i] += cj.pmul(lhs0(i), pfirst(ptmp0));
     }
     if (skipColumns)
     {
@@ -290,10 +331,10 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
  *  - alpha is always a complex (or converted to a complex)
  *  - no vectorization
  */
-template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
-struct general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
+template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
+struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
 {
-typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
 
 enum {
   Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
@@ -310,73 +351,84 @@ typedef typename packet_traits<ResScalar>::type  _ResPacket;
 typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
 typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
 typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
-  
+
 EIGEN_DONT_INLINE static void run(
   Index rows, Index cols,
-  const LhsScalar* lhs, Index lhsStride,
-  const RhsScalar* rhs, Index rhsIncr,
-  ResScalar* res, Index resIncr,
+  const LhsMapper& lhs,
+  const RhsMapper& rhs,
+        ResScalar* res, Index resIncr,
   ResScalar alpha);
 };
 
-template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
-EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>::run(
+template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
+EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
   Index rows, Index cols,
-  const LhsScalar* lhs, Index lhsStride,
-  const RhsScalar* rhs, Index rhsIncr,
+  const LhsMapper& lhs,
+  const RhsMapper& rhs,
   ResScalar* res, Index resIncr,
   ResScalar alpha)
 {
-  EIGEN_UNUSED_VARIABLE(rhsIncr);
-  eigen_internal_assert(rhsIncr==1);
+  eigen_internal_assert(rhs.stride()==1);
+
   #ifdef _EIGEN_ACCUMULATE_PACKETS
   #error _EIGEN_ACCUMULATE_PACKETS has already been defined
   #endif
 
-  #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\
-    RhsPacket b = pload<RhsPacket>(&rhs[j]); \
-    ptmp0 = pcj.pmadd(EIGEN_CAT(ploa,A0) <LhsPacket>(&lhs0[j]), b, ptmp0); \
-    ptmp1 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs1[j]), b, ptmp1); \
-    ptmp2 = pcj.pmadd(EIGEN_CAT(ploa,A2) <LhsPacket>(&lhs2[j]), b, ptmp2); \
-    ptmp3 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs3[j]), b, ptmp3); }
+  #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) {\
+    RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0);  \
+    ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Alignment0>(j), b, ptmp0); \
+    ptmp1 = pcj.pmadd(lhs1.template load<LhsPacket, Alignment13>(j), b, ptmp1); \
+    ptmp2 = pcj.pmadd(lhs2.template load<LhsPacket, Alignment2>(j), b, ptmp2); \
+    ptmp3 = pcj.pmadd(lhs3.template load<LhsPacket, Alignment13>(j), b, ptmp3); }
 
   conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
   conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
 
+  typedef typename LhsMapper::VectorMapper LhsScalars;
+
   enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
   const Index rowsAtOnce = 4;
   const Index peels = 2;
   const Index RhsPacketAlignedMask = RhsPacketSize-1;
   const Index LhsPacketAlignedMask = LhsPacketSize-1;
-//   const Index PeelAlignedMask = RhsPacketSize*peels-1;
   const Index depth = cols;
+  const Index lhsStride = lhs.stride();
 
   // How many coeffs of the result do we have to skip to be aligned.
   // Here we assume data are at least aligned on the base scalar type
   // if that's not the case then vectorization is discarded, see below.
-  Index alignedStart = internal::first_aligned(rhs, depth);
+  Index alignedStart = rhs.firstAligned(depth);
   Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0;
   const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
 
   const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
   Index alignmentPattern = alignmentStep==0 ? AllAligned
-                         : alignmentStep==(LhsPacketSize/2) ? EvenAligned
-                         : FirstAligned;
+                           : alignmentStep==(LhsPacketSize/2) ? EvenAligned
+                           : FirstAligned;
 
   // we cannot assume the first element is aligned because of sub-matrices
-  const Index lhsAlignmentOffset = internal::first_aligned(lhs,depth);
+  const Index lhsAlignmentOffset = lhs.firstAligned(depth);
+  const Index rhsAlignmentOffset = rhs.firstAligned(rows);
 
   // find how many rows do we have to skip to be aligned with rhs (if possible)
   Index skipRows = 0;
   // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
-  if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) )
+  if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) ||
+      (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == depth) ||
+      (rhsAlignmentOffset < 0) || (rhsAlignmentOffset == rows) )
   {
     alignedSize = 0;
     alignedStart = 0;
+    alignmentPattern = NoneAligned;
+  }
+  else if(LhsPacketSize > 4)
+  {
+    // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
+    alignmentPattern = NoneAligned;
   }
   else if (LhsPacketSize>1)
   {
-    eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0  || depth<LhsPacketSize);
+  //    eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0  || depth<LhsPacketSize);
 
     while (skipRows<LhsPacketSize &&
            alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize))
@@ -392,11 +444,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
       skipRows = (std::min)(skipRows,Index(rows));
       // note that the skiped columns are processed later.
     }
-    eigen_internal_assert(  alignmentPattern==NoneAligned
+    /*    eigen_internal_assert(  alignmentPattern==NoneAligned
                       || LhsPacketSize==1
                       || (skipRows + rowsAtOnce >= rows)
                       || LhsPacketSize > depth
-                      || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);
+                      || (size_t(firstLhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);*/
   }
   else if(Vectorizable)
   {
@@ -405,18 +457,19 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
     alignmentPattern = AllAligned;
   }
 
-  Index offset1 = (FirstAligned && alignmentStep==1?3:1);
-  Index offset3 = (FirstAligned && alignmentStep==1?1:3);
+  const Index offset1 = (FirstAligned && alignmentStep==1)?3:1;
+  const Index offset3 = (FirstAligned && alignmentStep==1)?1:3;
 
   Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
   for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
   {
-    EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
+    // FIXME: what is the purpose of this EIGEN_ALIGN_DEFAULT ??
+    EIGEN_ALIGN_MAX ResScalar tmp0 = ResScalar(0);
     ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);
 
     // this helps the compiler generating good binary code
-    const LhsScalar *lhs0 = lhs + i*lhsStride,     *lhs1 = lhs + (i+offset1)*lhsStride,
-                    *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
+    const LhsScalars lhs0 = lhs.getVectorMapper(i+0, 0),    lhs1 = lhs.getVectorMapper(i+offset1, 0),
+                     lhs2 = lhs.getVectorMapper(i+2, 0),    lhs3 = lhs.getVectorMapper(i+offset3, 0);
 
     if (Vectorizable)
     {
@@ -428,9 +481,9 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
       // FIXME this loop get vectorized by the compiler !
       for (Index j=0; j<alignedStart; ++j)
       {
-        RhsScalar b = rhs[j];
-        tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
-        tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
+        RhsScalar b = rhs(j, 0);
+        tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b);
+        tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b);
       }
 
       if (alignedSize>alignedStart)
@@ -439,11 +492,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
         {
           case AllAligned:
             for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(d,d,d);
+              _EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned);
             break;
           case EvenAligned:
             for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(d,du,d);
+              _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned);
             break;
           case FirstAligned:
           {
@@ -457,39 +510,39 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
                * than basic unaligned loads.
                */
               LhsPacket A01, A02, A03, A11, A12, A13;
-              A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
-              A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
-              A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);
+              A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1);
+              A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2);
+              A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3);
 
               for (; j<peeledSize; j+=peels*RhsPacketSize)
               {
-                RhsPacket b = pload<RhsPacket>(&rhs[j]);
-                A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]);  palign<1>(A01,A11);
-                A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]);  palign<2>(A02,A12);
-                A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]);  palign<3>(A03,A13);
+                RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0);
+                A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize);  palign<1>(A01,A11);
+                A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize);  palign<2>(A02,A12);
+                A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize);  palign<3>(A03,A13);
 
-                ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), b, ptmp0);
+                ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), b, ptmp0);
                 ptmp1 = pcj.pmadd(A01, b, ptmp1);
-                A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]);  palign<1>(A11,A01);
+                A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize);  palign<1>(A11,A01);
                 ptmp2 = pcj.pmadd(A02, b, ptmp2);
-                A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]);  palign<2>(A12,A02);
+                A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize);  palign<2>(A12,A02);
                 ptmp3 = pcj.pmadd(A03, b, ptmp3);
-                A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]);  palign<3>(A13,A03);
+                A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize);  palign<3>(A13,A03);
 
-                b = pload<RhsPacket>(&rhs[j+RhsPacketSize]);
-                ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j+LhsPacketSize]), b, ptmp0);
+                b = rhs.getVectorMapper(j+RhsPacketSize, 0).template load<RhsPacket, Aligned>(0);
+                ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize), b, ptmp0);
                 ptmp1 = pcj.pmadd(A11, b, ptmp1);
                 ptmp2 = pcj.pmadd(A12, b, ptmp2);
                 ptmp3 = pcj.pmadd(A13, b, ptmp3);
               }
             }
             for (; j<alignedSize; j+=RhsPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(d,du,du);
+              _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned);
             break;
           }
           default:
             for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(du,du,du);
+              _EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned);
             break;
         }
         tmp0 += predux(ptmp0);
@@ -503,9 +556,9 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
     // FIXME this loop get vectorized by the compiler !
     for (Index j=alignedSize; j<depth; ++j)
     {
-      RhsScalar b = rhs[j];
-      tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
-      tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
+      RhsScalar b = rhs(j, 0);
+      tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b);
+      tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b);
     }
     res[i*resIncr]            += alpha*tmp0;
     res[(i+offset1)*resIncr]  += alpha*tmp1;
@@ -520,30 +573,30 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
   {
     for (Index i=start; i<end; ++i)
     {
-      EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
+      EIGEN_ALIGN_MAX ResScalar tmp0 = ResScalar(0);
       ResPacket ptmp0 = pset1<ResPacket>(tmp0);
-      const LhsScalar* lhs0 = lhs + i*lhsStride;
+      const LhsScalars lhs0 = lhs.getVectorMapper(i, 0);
       // process first unaligned result's coeffs
       // FIXME this loop get vectorized by the compiler !
       for (Index j=0; j<alignedStart; ++j)
-        tmp0 += cj.pmul(lhs0[j], rhs[j]);
+        tmp0 += cj.pmul(lhs0(j), rhs(j, 0));
 
       if (alignedSize>alignedStart)
       {
         // process aligned rhs coeffs
-        if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
+        if (lhs0.template aligned<LhsPacket>(alignedStart))
           for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
-            ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
+            ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0);
         else
           for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
-            ptmp0 = pcj.pmadd(ploadu<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
+            ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0);
         tmp0 += predux(ptmp0);
       }
 
       // process remaining scalars
       // FIXME this loop get vectorized by the compiler !
       for (Index j=alignedSize; j<depth; ++j)
-        tmp0 += cj.pmul(lhs0[j], rhs[j]);
+        tmp0 += cj.pmul(lhs0(j), rhs(j, 0));
       res[i*resIncr] += alpha*tmp0;
     }
     if (skipRows)
diff --git a/Eigen/src/Core/products/GeneralMatrixVector_MKL.h b/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h
index 1cb9fe6b5..e3a5d5892 100644
--- a/Eigen/src/Core/products/GeneralMatrixVector_MKL.h
+++ b/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h
@@ -25,13 +25,13 @@
  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
  ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
+ *   Content : Eigen bindings to BLAS F77
  *   General matrix-vector product functionality based on ?GEMV.
  ********************************************************************************
 */
 
-#ifndef EIGEN_GENERAL_MATRIX_VECTOR_MKL_H
-#define EIGEN_GENERAL_MATRIX_VECTOR_MKL_H
+#ifndef EIGEN_GENERAL_MATRIX_VECTOR_BLAS_H
+#define EIGEN_GENERAL_MATRIX_VECTOR_BLAS_H
 
 namespace Eigen { 
 
@@ -46,47 +46,46 @@ namespace internal {
 
 // gemv specialization
 
-template<typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs>
-struct general_matrix_vector_product_gemv :
-  general_matrix_vector_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,ConjugateRhs,BuiltIn> {};
+template<typename Index, typename LhsScalar, int StorageOrder, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs>
+struct general_matrix_vector_product_gemv;
 
-#define EIGEN_MKL_GEMV_SPECIALIZE(Scalar) \
+#define EIGEN_BLAS_GEMV_SPECIALIZE(Scalar) \
 template<typename Index, bool ConjugateLhs, bool ConjugateRhs> \
-struct general_matrix_vector_product<Index,Scalar,ColMajor,ConjugateLhs,Scalar,ConjugateRhs,Specialized> { \
+struct general_matrix_vector_product<Index,Scalar,const_blas_data_mapper<Scalar,Index,ColMajor>,ColMajor,ConjugateLhs,Scalar,const_blas_data_mapper<Scalar,Index,RowMajor>,ConjugateRhs,Specialized> { \
 static void run( \
   Index rows, Index cols, \
-  const Scalar* lhs, Index lhsStride, \
-  const Scalar* rhs, Index rhsIncr, \
+  const const_blas_data_mapper<Scalar,Index,ColMajor> &lhs, \
+  const const_blas_data_mapper<Scalar,Index,RowMajor> &rhs, \
   Scalar* res, Index resIncr, Scalar alpha) \
 { \
   if (ConjugateLhs) { \
-    general_matrix_vector_product<Index,Scalar,ColMajor,ConjugateLhs,Scalar,ConjugateRhs,BuiltIn>::run( \
-      rows, cols, lhs, lhsStride, rhs, rhsIncr, res, resIncr, alpha); \
+    general_matrix_vector_product<Index,Scalar,const_blas_data_mapper<Scalar,Index,ColMajor>,ColMajor,ConjugateLhs,Scalar,const_blas_data_mapper<Scalar,Index,RowMajor>,ConjugateRhs,BuiltIn>::run( \
+      rows, cols, lhs, rhs, res, resIncr, alpha); \
   } else { \
     general_matrix_vector_product_gemv<Index,Scalar,ColMajor,ConjugateLhs,Scalar,ConjugateRhs>::run( \
-      rows, cols, lhs, lhsStride, rhs, rhsIncr, res, resIncr, alpha); \
+      rows, cols, lhs.data(), lhs.stride(), rhs.data(), rhs.stride(), res, resIncr, alpha); \
   } \
 } \
 }; \
 template<typename Index, bool ConjugateLhs, bool ConjugateRhs> \
-struct general_matrix_vector_product<Index,Scalar,RowMajor,ConjugateLhs,Scalar,ConjugateRhs,Specialized> { \
+struct general_matrix_vector_product<Index,Scalar,const_blas_data_mapper<Scalar,Index,RowMajor>,RowMajor,ConjugateLhs,Scalar,const_blas_data_mapper<Scalar,Index,ColMajor>,ConjugateRhs,Specialized> { \
 static void run( \
   Index rows, Index cols, \
-  const Scalar* lhs, Index lhsStride, \
-  const Scalar* rhs, Index rhsIncr, \
+  const const_blas_data_mapper<Scalar,Index,RowMajor> &lhs, \
+  const const_blas_data_mapper<Scalar,Index,ColMajor> &rhs, \
   Scalar* res, Index resIncr, Scalar alpha) \
 { \
     general_matrix_vector_product_gemv<Index,Scalar,RowMajor,ConjugateLhs,Scalar,ConjugateRhs>::run( \
-      rows, cols, lhs, lhsStride, rhs, rhsIncr, res, resIncr, alpha); \
+      rows, cols, lhs.data(), lhs.stride(), rhs.data(), rhs.stride(), res, resIncr, alpha); \
 } \
 }; \
 
-EIGEN_MKL_GEMV_SPECIALIZE(double)
-EIGEN_MKL_GEMV_SPECIALIZE(float)
-EIGEN_MKL_GEMV_SPECIALIZE(dcomplex)
-EIGEN_MKL_GEMV_SPECIALIZE(scomplex)
+EIGEN_BLAS_GEMV_SPECIALIZE(double)
+EIGEN_BLAS_GEMV_SPECIALIZE(float)
+EIGEN_BLAS_GEMV_SPECIALIZE(dcomplex)
+EIGEN_BLAS_GEMV_SPECIALIZE(scomplex)
 
-#define EIGEN_MKL_GEMV_SPECIALIZATION(EIGTYPE,MKLTYPE,MKLPREFIX) \
+#define EIGEN_BLAS_GEMV_SPECIALIZATION(EIGTYPE,BLASTYPE,BLASPREFIX) \
 template<typename Index, int LhsStorageOrder, bool ConjugateLhs, bool ConjugateRhs> \
 struct general_matrix_vector_product_gemv<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,ConjugateRhs> \
 { \
@@ -98,16 +97,15 @@ static void run( \
   const EIGTYPE* rhs, Index rhsIncr, \
   EIGTYPE* res, Index resIncr, EIGTYPE alpha) \
 { \
-  MKL_INT m=rows, n=cols, lda=lhsStride, incx=rhsIncr, incy=resIncr; \
-  MKLTYPE alpha_, beta_; \
-  const EIGTYPE *x_ptr, myone(1); \
+  BlasIndex m=convert_index<BlasIndex>(rows), n=convert_index<BlasIndex>(cols), \
+            lda=convert_index<BlasIndex>(lhsStride), incx=convert_index<BlasIndex>(rhsIncr), incy=convert_index<BlasIndex>(resIncr); \
+  const EIGTYPE beta(1); \
+  const EIGTYPE *x_ptr; \
   char trans=(LhsStorageOrder==ColMajor) ? 'N' : (ConjugateLhs) ? 'C' : 'T'; \
   if (LhsStorageOrder==RowMajor) { \
-    m=cols; \
-    n=rows; \
+    m = convert_index<BlasIndex>(cols); \
+    n = convert_index<BlasIndex>(rows); \
   }\
-  assign_scalar_eig2mkl(alpha_, alpha); \
-  assign_scalar_eig2mkl(beta_, myone); \
   GEMVVector x_tmp; \
   if (ConjugateRhs) { \
     Map<const GEMVVector, 0, InnerStride<> > map_x(rhs,cols,1,InnerStride<>(incx)); \
@@ -115,17 +113,17 @@ static void run( \
     x_ptr=x_tmp.data(); \
     incx=1; \
   } else x_ptr=rhs; \
-  MKLPREFIX##gemv(&trans, &m, &n, &alpha_, (const MKLTYPE*)lhs, &lda, (const MKLTYPE*)x_ptr, &incx, &beta_, (MKLTYPE*)res, &incy); \
+  BLASPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, &numext::real_ref(beta), (BLASTYPE*)res, &incy); \
 }\
 };
 
-EIGEN_MKL_GEMV_SPECIALIZATION(double,   double,        d)
-EIGEN_MKL_GEMV_SPECIALIZATION(float,    float,         s)
-EIGEN_MKL_GEMV_SPECIALIZATION(dcomplex, MKL_Complex16, z)
-EIGEN_MKL_GEMV_SPECIALIZATION(scomplex, MKL_Complex8,  c)
+EIGEN_BLAS_GEMV_SPECIALIZATION(double,   double, d)
+EIGEN_BLAS_GEMV_SPECIALIZATION(float,    float,  s)
+EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, double, z)
+EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, float,  c)
 
 } // end namespase internal
 
 } // end namespace Eigen
 
-#endif // EIGEN_GENERAL_MATRIX_VECTOR_MKL_H
+#endif // EIGEN_GENERAL_MATRIX_VECTOR_BLAS_H
diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h
index 6937ee332..c2f084c82 100644
--- a/Eigen/src/Core/products/Parallelizer.h
+++ b/Eigen/src/Core/products/Parallelizer.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_PARALLELIZER_H
 #define EIGEN_PARALLELIZER_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
 
@@ -49,8 +49,8 @@ inline void initParallel()
 {
   int nbt;
   internal::manage_multi_threading(GetAction, &nbt);
-  std::ptrdiff_t l1, l2;
-  internal::manage_caching_sizes(GetAction, &l1, &l2);
+  std::ptrdiff_t l1, l2, l3;
+  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
 }
 
 /** \returns the max number of threads reserved for Eigen
@@ -73,17 +73,17 @@ namespace internal {
 
 template<typename Index> struct GemmParallelInfo
 {
-  GemmParallelInfo() : sync(-1), users(0), rhs_start(0), rhs_length(0) {}
+  GemmParallelInfo() : sync(-1), users(0), lhs_start(0), lhs_length(0) {}
 
-  int volatile sync;
+  Index volatile sync;
   int volatile users;
 
-  Index rhs_start;
-  Index rhs_length;
+  Index lhs_start;
+  Index lhs_length;
 };
 
 template<bool Condition, typename Functor, typename Index>
-void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpose)
+void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth, bool transpose)
 {
   // TODO when EIGEN_USE_BLAS is defined,
   // we should still enable OMP for other scalar types
@@ -92,6 +92,7 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos
   // the matrix product when multithreading is enabled. This is a temporary
   // fix to support row-major destination matrices. This whole
   // parallelizer mechanism has to be redisigned anyway.
+  EIGEN_UNUSED_VARIABLE(depth);
   EIGEN_UNUSED_VARIABLE(transpose);
   func(0,rows, 0,cols);
 #else
@@ -102,56 +103,56 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos
   // - we are not already in a parallel code
   // - the sizes are large enough
 
-  // 1- are we already in a parallel session?
-  // FIXME omp_get_num_threads()>1 only works for openmp, what if the user does not use openmp?
-  if((!Condition) || (omp_get_num_threads()>1))
-    return func(0,rows, 0,cols);
+  // compute the maximal number of threads from the size of the product:
+  // This first heuristic takes into account that the product kernel is fully optimized when working with nr columns at once.
+  Index size = transpose ? rows : cols;
+  Index pb_max_threads = std::max<Index>(1,size / Functor::Traits::nr);
 
-  Index size = transpose ? cols : rows;
+  // compute the maximal number of threads from the total amount of work:
+  double work = static_cast<double>(rows) * static_cast<double>(cols) *
+      static_cast<double>(depth);
+  double kMinTaskSize = 50000;  // FIXME improve this heuristic.
+  pb_max_threads = std::max<Index>(1, std::min<Index>(pb_max_threads, work / kMinTaskSize));
 
-  // 2- compute the maximal number of threads from the size of the product:
-  // FIXME this has to be fine tuned
-  Index max_threads = std::max<Index>(1,size / 32);
+  // compute the number of threads we are going to use
+  Index threads = std::min<Index>(nbThreads(), pb_max_threads);
 
-  // 3 - compute the number of threads we are going to use
-  Index threads = std::min<Index>(nbThreads(), max_threads);
-
-  if(threads==1)
+  // if multi-threading is explicitely disabled, not useful, or if we already are in a parallel session,
+  // then abort multi-threading
+  // FIXME omp_get_num_threads()>1 only works for openmp, what if the user does not use openmp?
+  if((!Condition) || (threads==1) || (omp_get_num_threads()>1))
     return func(0,rows, 0,cols);
 
   Eigen::initParallel();
-  func.initParallelSession();
+  func.initParallelSession(threads);
 
   if(transpose)
     std::swap(rows,cols);
 
-  GemmParallelInfo<Index>* info = new GemmParallelInfo<Index>[threads];
+  ei_declare_aligned_stack_constructed_variable(GemmParallelInfo<Index>,info,threads,0);
 
   #pragma omp parallel num_threads(threads)
   {
     Index i = omp_get_thread_num();
     // Note that the actual number of threads might be lower than the number of request ones.
     Index actual_threads = omp_get_num_threads();
-    
+
     Index blockCols = (cols / actual_threads) & ~Index(0x3);
-    Index blockRows = (rows / actual_threads) & ~Index(0x7);
-    
+    Index blockRows = (rows / actual_threads);
+    blockRows = (blockRows/Functor::Traits::mr)*Functor::Traits::mr;
+
     Index r0 = i*blockRows;
     Index actualBlockRows = (i+1==actual_threads) ? rows-r0 : blockRows;
 
     Index c0 = i*blockCols;
     Index actualBlockCols = (i+1==actual_threads) ? cols-c0 : blockCols;
 
-    info[i].rhs_start = c0;
-    info[i].rhs_length = actualBlockCols;
+    info[i].lhs_start = r0;
+    info[i].lhs_length = actualBlockRows;
 
-    if(transpose)
-      func(0, cols, r0, actualBlockRows, info);
-    else
-      func(r0, actualBlockRows, 0,cols, info);
+    if(transpose) func(c0, actualBlockCols, 0, rows, info);
+    else          func(0, rows, c0, actualBlockCols, info);
   }
-
-  delete[] info;
 #endif
 }
 
diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
index 99cf9e0ae..da6f82abc 100644
--- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
@@ -15,7 +15,7 @@ namespace Eigen {
 namespace internal {
 
 // pack a selfadjoint block diagonal for use with the gebp_kernel
-template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder>
+template<typename Scalar, typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
 struct symm_pack_lhs
 {
   template<int BlockRows> inline
@@ -45,25 +45,32 @@ struct symm_pack_lhs
   }
   void operator()(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows)
   {
+    enum { PacketSize = packet_traits<Scalar>::size };
     const_blas_data_mapper<Scalar,Index,StorageOrder> lhs(_lhs,lhsStride);
     Index count = 0;
-    Index peeled_mc = (rows/Pack1)*Pack1;
-    for(Index i=0; i<peeled_mc; i+=Pack1)
-    {
-      pack<Pack1>(blockA, lhs, cols, i, count);
-    }
-
-    if(rows-peeled_mc>=Pack2)
-    {
-      pack<Pack2>(blockA, lhs, cols, peeled_mc, count);
-      peeled_mc += Pack2;
-    }
+    //Index peeled_mc3 = (rows/Pack1)*Pack1;
+    
+    const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
+    const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
+    const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
+    
+    if(Pack1>=3*PacketSize)
+      for(Index i=0; i<peeled_mc3; i+=3*PacketSize)
+        pack<3*PacketSize>(blockA, lhs, cols, i, count);
+    
+    if(Pack1>=2*PacketSize)
+      for(Index i=peeled_mc3; i<peeled_mc2; i+=2*PacketSize)
+        pack<2*PacketSize>(blockA, lhs, cols, i, count);
+    
+    if(Pack1>=1*PacketSize)
+      for(Index i=peeled_mc2; i<peeled_mc1; i+=1*PacketSize)
+        pack<1*PacketSize>(blockA, lhs, cols, i, count);
 
     // do the same with mr==1
-    for(Index i=peeled_mc; i<rows; i++)
+    for(Index i=peeled_mc1; i<rows; i++)
     {
       for(Index k=0; k<i; k++)
-        blockA[count++] = lhs(i, k);              // normal
+        blockA[count++] = lhs(i, k);                   // normal
 
       blockA[count++] = numext::real(lhs(i, i));       // real (diagonal)
 
@@ -82,7 +89,8 @@ struct symm_pack_rhs
     Index end_k = k2 + rows;
     Index count = 0;
     const_blas_data_mapper<Scalar,Index,StorageOrder> rhs(_rhs,rhsStride);
-    Index packet_cols = (cols/nr)*nr;
+    Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
+    Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
 
     // first part: normal case
     for(Index j2=0; j2<k2; j2+=nr)
@@ -91,79 +99,151 @@ struct symm_pack_rhs
       {
         blockB[count+0] = rhs(k,j2+0);
         blockB[count+1] = rhs(k,j2+1);
-        if (nr==4)
+        if (nr>=4)
         {
           blockB[count+2] = rhs(k,j2+2);
           blockB[count+3] = rhs(k,j2+3);
         }
+        if (nr>=8)
+        {
+          blockB[count+4] = rhs(k,j2+4);
+          blockB[count+5] = rhs(k,j2+5);
+          blockB[count+6] = rhs(k,j2+6);
+          blockB[count+7] = rhs(k,j2+7);
+        }
         count += nr;
       }
     }
 
     // second part: diagonal block
-    for(Index j2=k2; j2<(std::min)(k2+rows,packet_cols); j2+=nr)
+    Index end8 = nr>=8 ? (std::min)(k2+rows,packet_cols8) : k2;
+    if(nr>=8)
     {
-      // again we can split vertically in three different parts (transpose, symmetric, normal)
-      // transpose
-      for(Index k=k2; k<j2; k++)
+      for(Index j2=k2; j2<end8; j2+=8)
       {
-        blockB[count+0] = numext::conj(rhs(j2+0,k));
-        blockB[count+1] = numext::conj(rhs(j2+1,k));
-        if (nr==4)
+        // again we can split vertically in three different parts (transpose, symmetric, normal)
+        // transpose
+        for(Index k=k2; k<j2; k++)
         {
+          blockB[count+0] = numext::conj(rhs(j2+0,k));
+          blockB[count+1] = numext::conj(rhs(j2+1,k));
           blockB[count+2] = numext::conj(rhs(j2+2,k));
           blockB[count+3] = numext::conj(rhs(j2+3,k));
+          blockB[count+4] = numext::conj(rhs(j2+4,k));
+          blockB[count+5] = numext::conj(rhs(j2+5,k));
+          blockB[count+6] = numext::conj(rhs(j2+6,k));
+          blockB[count+7] = numext::conj(rhs(j2+7,k));
+          count += 8;
         }
-        count += nr;
-      }
-      // symmetric
-      Index h = 0;
-      for(Index k=j2; k<j2+nr; k++)
-      {
-        // normal
-        for (Index w=0 ; w<h; ++w)
-          blockB[count+w] = rhs(k,j2+w);
+        // symmetric
+        Index h = 0;
+        for(Index k=j2; k<j2+8; k++)
+        {
+          // normal
+          for (Index w=0 ; w<h; ++w)
+            blockB[count+w] = rhs(k,j2+w);
 
-        blockB[count+h] = numext::real(rhs(k,k));
+          blockB[count+h] = numext::real(rhs(k,k));
 
-        // transpose
-        for (Index w=h+1 ; w<nr; ++w)
-          blockB[count+w] = numext::conj(rhs(j2+w,k));
-        count += nr;
-        ++h;
+          // transpose
+          for (Index w=h+1 ; w<8; ++w)
+            blockB[count+w] = numext::conj(rhs(j2+w,k));
+          count += 8;
+          ++h;
+        }
+        // normal
+        for(Index k=j2+8; k<end_k; k++)
+        {
+          blockB[count+0] = rhs(k,j2+0);
+          blockB[count+1] = rhs(k,j2+1);
+          blockB[count+2] = rhs(k,j2+2);
+          blockB[count+3] = rhs(k,j2+3);
+          blockB[count+4] = rhs(k,j2+4);
+          blockB[count+5] = rhs(k,j2+5);
+          blockB[count+6] = rhs(k,j2+6);
+          blockB[count+7] = rhs(k,j2+7);
+          count += 8;
+        }
       }
-      // normal
-      for(Index k=j2+nr; k<end_k; k++)
+    }
+    if(nr>=4)
+    {
+      for(Index j2=end8; j2<(std::min)(k2+rows,packet_cols4); j2+=4)
       {
-        blockB[count+0] = rhs(k,j2+0);
-        blockB[count+1] = rhs(k,j2+1);
-        if (nr==4)
+        // again we can split vertically in three different parts (transpose, symmetric, normal)
+        // transpose
+        for(Index k=k2; k<j2; k++)
+        {
+          blockB[count+0] = numext::conj(rhs(j2+0,k));
+          blockB[count+1] = numext::conj(rhs(j2+1,k));
+          blockB[count+2] = numext::conj(rhs(j2+2,k));
+          blockB[count+3] = numext::conj(rhs(j2+3,k));
+          count += 4;
+        }
+        // symmetric
+        Index h = 0;
+        for(Index k=j2; k<j2+4; k++)
         {
+          // normal
+          for (Index w=0 ; w<h; ++w)
+            blockB[count+w] = rhs(k,j2+w);
+
+          blockB[count+h] = numext::real(rhs(k,k));
+
+          // transpose
+          for (Index w=h+1 ; w<4; ++w)
+            blockB[count+w] = numext::conj(rhs(j2+w,k));
+          count += 4;
+          ++h;
+        }
+        // normal
+        for(Index k=j2+4; k<end_k; k++)
+        {
+          blockB[count+0] = rhs(k,j2+0);
+          blockB[count+1] = rhs(k,j2+1);
           blockB[count+2] = rhs(k,j2+2);
           blockB[count+3] = rhs(k,j2+3);
+          count += 4;
         }
-        count += nr;
       }
     }
 
     // third part: transposed
-    for(Index j2=k2+rows; j2<packet_cols; j2+=nr)
+    if(nr>=8)
     {
-      for(Index k=k2; k<end_k; k++)
+      for(Index j2=k2+rows; j2<packet_cols8; j2+=8)
       {
-        blockB[count+0] = numext::conj(rhs(j2+0,k));
-        blockB[count+1] = numext::conj(rhs(j2+1,k));
-        if (nr==4)
+        for(Index k=k2; k<end_k; k++)
         {
+          blockB[count+0] = numext::conj(rhs(j2+0,k));
+          blockB[count+1] = numext::conj(rhs(j2+1,k));
           blockB[count+2] = numext::conj(rhs(j2+2,k));
           blockB[count+3] = numext::conj(rhs(j2+3,k));
+          blockB[count+4] = numext::conj(rhs(j2+4,k));
+          blockB[count+5] = numext::conj(rhs(j2+5,k));
+          blockB[count+6] = numext::conj(rhs(j2+6,k));
+          blockB[count+7] = numext::conj(rhs(j2+7,k));
+          count += 8;
+        }
+      }
+    }
+    if(nr>=4)
+    {
+      for(Index j2=(std::max)(packet_cols8,k2+rows); j2<packet_cols4; j2+=4)
+      {
+        for(Index k=k2; k<end_k; k++)
+        {
+          blockB[count+0] = numext::conj(rhs(j2+0,k));
+          blockB[count+1] = numext::conj(rhs(j2+1,k));
+          blockB[count+2] = numext::conj(rhs(j2+2,k));
+          blockB[count+3] = numext::conj(rhs(j2+3,k));
+          count += 4;
         }
-        count += nr;
       }
     }
 
     // copy the remaining columns one at a time (=> the same with nr==1)
-    for(Index j2=packet_cols; j2<cols; ++j2)
+    for(Index j2=packet_cols4; j2<cols; ++j2)
     {
       // transpose
       Index half = (std::min)(end_k,j2);
@@ -211,7 +291,7 @@ struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,LhsSelfAdjoint,Co
     const Scalar* lhs, Index lhsStride,
     const Scalar* rhs, Index rhsStride,
     Scalar* res,       Index resStride,
-    const Scalar& alpha)
+    const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
   {
     product_selfadjoint_matrix<Scalar, Index,
       EIGEN_LOGICAL_XOR(RhsSelfAdjoint,RhsStorageOrder==RowMajor) ? ColMajor : RowMajor,
@@ -219,7 +299,7 @@ struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,LhsSelfAdjoint,Co
       EIGEN_LOGICAL_XOR(LhsSelfAdjoint,LhsStorageOrder==RowMajor) ? ColMajor : RowMajor,
       LhsSelfAdjoint, NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsSelfAdjoint,ConjugateLhs),
       ColMajor>
-      ::run(cols, rows,  rhs, rhsStride,  lhs, lhsStride,  res, resStride,  alpha);
+      ::run(cols, rows,  rhs, rhsStride,  lhs, lhsStride,  res, resStride,  alpha, blocking);
   }
 };
 
@@ -234,7 +314,7 @@ struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs
     const Scalar* _lhs, Index lhsStride,
     const Scalar* _rhs, Index rhsStride,
     Scalar* res,        Index resStride,
-    const Scalar& alpha);
+    const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
 };
 
 template <typename Scalar, typename Index,
@@ -244,33 +324,35 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
     Index rows, Index cols,
     const Scalar* _lhs, Index lhsStride,
     const Scalar* _rhs, Index rhsStride,
-    Scalar* res,        Index resStride,
-    const Scalar& alpha)
+    Scalar* _res,        Index resStride,
+    const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
   {
     Index size = rows;
 
-    const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
-    const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
-
     typedef gebp_traits<Scalar,Scalar> Traits;
 
-    Index kc = size;  // cache block size along the K direction
-    Index mc = rows;  // cache block size along the M direction
-    Index nc = cols;  // cache block size along the N direction
-    computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
-    // kc must smaller than mc
+    typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
+    typedef const_blas_data_mapper<Scalar, Index, (LhsStorageOrder == RowMajor) ? ColMajor : RowMajor> LhsTransposeMapper;
+    typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
+    typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
+    LhsMapper lhs(_lhs,lhsStride);
+    LhsTransposeMapper lhs_transpose(_lhs,lhsStride);
+    RhsMapper rhs(_rhs,rhsStride);
+    ResMapper res(_res, resStride);
+
+    Index kc = blocking.kc();                   // cache block size along the K direction
+    Index mc = (std::min)(rows,blocking.mc());  // cache block size along the M direction
+    // kc must be smaller than mc
     kc = (std::min)(kc,mc);
+    std::size_t sizeA = kc*mc;
+    std::size_t sizeB = kc*cols;
+    ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
+    ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
 
-    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
-    std::size_t sizeB = sizeW + kc*cols;
-    ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0);
-    ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
-    Scalar* blockB = allocatedBlockB + sizeW;
-
-    gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
+    gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
     symm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
-    gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
-    gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;
+    gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
+    gemm_pack_lhs<Scalar, Index, LhsTransposeMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;
 
     for(Index k2=0; k2<size; k2+=kc)
     {
@@ -279,7 +361,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
       // we have selected one row panel of rhs and one column panel of lhs
       // pack rhs's panel into a sequential chunk of memory
       // and expand each coeff to a constant packet for further reuse
-      pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, cols);
+      pack_rhs(blockB, rhs.getSubMapper(k2,0), actual_kc, cols);
 
       // the select lhs's panel has to be split in three different parts:
       //  1 - the transposed panel above the diagonal block => transposed packed copy
@@ -289,9 +371,9 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
       {
         const Index actual_mc = (std::min)(i2+mc,k2)-i2;
         // transposed packed copy
-        pack_lhs_transposed(blockA, &lhs(k2, i2), lhsStride, actual_kc, actual_mc);
+        pack_lhs_transposed(blockA, lhs_transpose.getSubMapper(i2, k2), actual_kc, actual_mc);
 
-        gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
+        gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
       }
       // the block diagonal
       {
@@ -299,16 +381,16 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
         // symmetric packed copy
         pack_lhs(blockA, &lhs(k2,k2), lhsStride, actual_kc, actual_mc);
 
-        gebp_kernel(res+k2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
+        gebp_kernel(res.getSubMapper(k2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
       }
 
       for(Index i2=k2+kc; i2<size; i2+=mc)
       {
         const Index actual_mc = (std::min)(i2+mc,size)-i2;
-        gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder,false>()
-          (blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);
+        gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder,false>()
+          (blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
 
-        gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
+        gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
       }
     }
   }
@@ -325,7 +407,7 @@ struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLh
     const Scalar* _lhs, Index lhsStride,
     const Scalar* _rhs, Index rhsStride,
     Scalar* res,        Index resStride,
-    const Scalar& alpha);
+    const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
 };
 
 template <typename Scalar, typename Index,
@@ -335,27 +417,27 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f
     Index rows, Index cols,
     const Scalar* _lhs, Index lhsStride,
     const Scalar* _rhs, Index rhsStride,
-    Scalar* res,        Index resStride,
-    const Scalar& alpha)
+    Scalar* _res,        Index resStride,
+    const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
   {
     Index size = cols;
 
-    const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
-
     typedef gebp_traits<Scalar,Scalar> Traits;
 
-    Index kc = size; // cache block size along the K direction
-    Index mc = rows;  // cache block size along the M direction
-    Index nc = cols;  // cache block size along the N direction
-    computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
-    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
-    std::size_t sizeB = sizeW + kc*cols;
-    ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0);
-    ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
-    Scalar* blockB = allocatedBlockB + sizeW;
-
-    gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
-    gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+    typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
+    typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
+    LhsMapper lhs(_lhs,lhsStride);
+    ResMapper res(_res,resStride);
+
+    Index kc = blocking.kc();                   // cache block size along the K direction
+    Index mc = (std::min)(rows,blocking.mc());  // cache block size along the M direction
+    std::size_t sizeA = kc*mc;
+    std::size_t sizeB = kc*cols;
+    ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
+    ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
+
+    gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
+    gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
     symm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
 
     for(Index k2=0; k2<size; k2+=kc)
@@ -368,9 +450,9 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f
       for(Index i2=0; i2<rows; i2+=mc)
       {
         const Index actual_mc = (std::min)(i2+mc,rows)-i2;
-        pack_lhs(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);
+        pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
 
-        gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
+        gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
       }
     }
   }
@@ -382,55 +464,58 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f
 ***************************************************************************/
 
 namespace internal {
+  
 template<typename Lhs, int LhsMode, typename Rhs, int RhsMode>
-struct traits<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false> >
-  : traits<ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>, Lhs, Rhs> >
-{};
-}
-
-template<typename Lhs, int LhsMode, typename Rhs, int RhsMode>
-struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>
-  : public ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>, Lhs, Rhs >
+struct selfadjoint_product_impl<Lhs,LhsMode,false,Rhs,RhsMode,false>
 {
-  EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix)
-
-  SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
-
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+  
+  typedef internal::blas_traits<Lhs> LhsBlasTraits;
+  typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+  typedef internal::blas_traits<Rhs> RhsBlasTraits;
+  typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+  
   enum {
     LhsIsUpper = (LhsMode&(Upper|Lower))==Upper,
     LhsIsSelfAdjoint = (LhsMode&SelfAdjoint)==SelfAdjoint,
     RhsIsUpper = (RhsMode&(Upper|Lower))==Upper,
     RhsIsSelfAdjoint = (RhsMode&SelfAdjoint)==SelfAdjoint
   };
-
-  template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
+  
+  template<typename Dest>
+  static void run(Dest &dst, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha)
   {
-    eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());
+    eigen_assert(dst.rows()==a_lhs.rows() && dst.cols()==a_rhs.cols());
 
-    typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(m_lhs);
-    typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(m_rhs);
+    typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs);
+    typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs);
 
-    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
-                               * RhsBlasTraits::extractScalarFactor(m_rhs);
+    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs)
+                               * RhsBlasTraits::extractScalarFactor(a_rhs);
+
+    typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar,
+              Lhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxColsAtCompileTime,1> BlockingType;
+
+    BlockingType blocking(lhs.rows(), rhs.cols(), lhs.cols(), 1, false);
 
     internal::product_selfadjoint_matrix<Scalar, Index,
-      EIGEN_LOGICAL_XOR(LhsIsUpper,
-                        internal::traits<Lhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, LhsIsSelfAdjoint,
+      EIGEN_LOGICAL_XOR(LhsIsUpper,internal::traits<Lhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, LhsIsSelfAdjoint,
       NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsIsUpper,bool(LhsBlasTraits::NeedToConjugate)),
-      EIGEN_LOGICAL_XOR(RhsIsUpper,
-                        internal::traits<Rhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, RhsIsSelfAdjoint,
+      EIGEN_LOGICAL_XOR(RhsIsUpper,internal::traits<Rhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, RhsIsSelfAdjoint,
       NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsIsUpper,bool(RhsBlasTraits::NeedToConjugate)),
       internal::traits<Dest>::Flags&RowMajorBit  ? RowMajor : ColMajor>
       ::run(
         lhs.rows(), rhs.cols(),                 // sizes
-        &lhs.coeffRef(0,0),    lhs.outerStride(),  // lhs info
-        &rhs.coeffRef(0,0),    rhs.outerStride(),  // rhs info
+        &lhs.coeffRef(0,0), lhs.outerStride(),  // lhs info
+        &rhs.coeffRef(0,0), rhs.outerStride(),  // rhs info
         &dst.coeffRef(0,0), dst.outerStride(),  // result info
-        actualAlpha                             // alpha
+        actualAlpha, blocking                   // alpha
       );
   }
 };
 
+} // end namespace internal
+
 } // end namespace Eigen
 
 #endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_H
diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h
index dfa687fef..a45238d69 100644
--- a/Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h
@@ -25,13 +25,13 @@
  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
  ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
+ *   Content : Eigen bindings to BLAS F77
  *   Self adjoint matrix * matrix product functionality based on ?SYMM/?HEMM.
  ********************************************************************************
 */
 
-#ifndef EIGEN_SELFADJOINT_MATRIX_MATRIX_MKL_H
-#define EIGEN_SELFADJOINT_MATRIX_MATRIX_MKL_H
+#ifndef EIGEN_SELFADJOINT_MATRIX_MATRIX_BLAS_H
+#define EIGEN_SELFADJOINT_MATRIX_MATRIX_BLAS_H
 
 namespace Eigen { 
 
@@ -40,7 +40,7 @@ namespace internal {
 
 /* Optimized selfadjoint matrix * matrix (?SYMM/?HEMM) product */
 
-#define EIGEN_MKL_SYMM_L(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
+#define EIGEN_BLAS_SYMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
 template <typename Index, \
           int LhsStorageOrder, bool ConjugateLhs, \
           int RhsStorageOrder, bool ConjugateRhs> \
@@ -52,28 +52,23 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLh
     const EIGTYPE* _lhs, Index lhsStride, \
     const EIGTYPE* _rhs, Index rhsStride, \
     EIGTYPE* res,        Index resStride, \
-    EIGTYPE alpha) \
+    EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
   { \
     char side='L', uplo='L'; \
-    MKL_INT m, n, lda, ldb, ldc; \
+    BlasIndex m, n, lda, ldb, ldc; \
     const EIGTYPE *a, *b; \
-    MKLTYPE alpha_, beta_; \
+    EIGTYPE beta(1); \
     MatrixX##EIGPREFIX b_tmp; \
-    EIGTYPE myone(1);\
 \
 /* Set transpose options */ \
 /* Set m, n, k */ \
-    m = (MKL_INT)rows;  \
-    n = (MKL_INT)cols;  \
-\
-/* Set alpha_ & beta_ */ \
-    assign_scalar_eig2mkl(alpha_, alpha); \
-    assign_scalar_eig2mkl(beta_, myone); \
+    m = convert_index<BlasIndex>(rows);  \
+    n = convert_index<BlasIndex>(cols);  \
 \
 /* Set lda, ldb, ldc */ \
-    lda = (MKL_INT)lhsStride; \
-    ldb = (MKL_INT)rhsStride; \
-    ldc = (MKL_INT)resStride; \
+    lda = convert_index<BlasIndex>(lhsStride); \
+    ldb = convert_index<BlasIndex>(rhsStride); \
+    ldc = convert_index<BlasIndex>(resStride); \
 \
 /* Set a, b, c */ \
     if (LhsStorageOrder==RowMajor) uplo='U'; \
@@ -83,16 +78,16 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLh
       Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,n,m,OuterStride<>(rhsStride)); \
       b_tmp = rhs.adjoint(); \
       b = b_tmp.data(); \
-      ldb = b_tmp.outerStride(); \
+      ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
     } else b = _rhs; \
 \
-    MKLPREFIX##symm(&side, &uplo, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \
+    BLASPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
 \
   } \
 };
 
 
-#define EIGEN_MKL_HEMM_L(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
+#define EIGEN_BLAS_HEMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
 template <typename Index, \
           int LhsStorageOrder, bool ConjugateLhs, \
           int RhsStorageOrder, bool ConjugateRhs> \
@@ -103,36 +98,31 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLh
     const EIGTYPE* _lhs, Index lhsStride, \
     const EIGTYPE* _rhs, Index rhsStride, \
     EIGTYPE* res,        Index resStride, \
-    EIGTYPE alpha) \
+    EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
   { \
     char side='L', uplo='L'; \
-    MKL_INT m, n, lda, ldb, ldc; \
+    BlasIndex m, n, lda, ldb, ldc; \
     const EIGTYPE *a, *b; \
-    MKLTYPE alpha_, beta_; \
+    EIGTYPE beta(1); \
     MatrixX##EIGPREFIX b_tmp; \
     Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder> a_tmp; \
-    EIGTYPE myone(1); \
 \
 /* Set transpose options */ \
 /* Set m, n, k */ \
-    m = (MKL_INT)rows; \
-    n = (MKL_INT)cols; \
-\
-/* Set alpha_ & beta_ */ \
-    assign_scalar_eig2mkl(alpha_, alpha); \
-    assign_scalar_eig2mkl(beta_, myone); \
+    m = convert_index<BlasIndex>(rows); \
+    n = convert_index<BlasIndex>(cols); \
 \
 /* Set lda, ldb, ldc */ \
-    lda = (MKL_INT)lhsStride; \
-    ldb = (MKL_INT)rhsStride; \
-    ldc = (MKL_INT)resStride; \
+    lda = convert_index<BlasIndex>(lhsStride); \
+    ldb = convert_index<BlasIndex>(rhsStride); \
+    ldc = convert_index<BlasIndex>(resStride); \
 \
 /* Set a, b, c */ \
     if (((LhsStorageOrder==ColMajor) && ConjugateLhs) || ((LhsStorageOrder==RowMajor) && (!ConjugateLhs))) { \
       Map<const Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder>, 0, OuterStride<> > lhs(_lhs,m,m,OuterStride<>(lhsStride)); \
       a_tmp = lhs.conjugate(); \
       a = a_tmp.data(); \
-      lda = a_tmp.outerStride(); \
+      lda = convert_index<BlasIndex>(a_tmp.outerStride()); \
     } else a = _lhs; \
     if (LhsStorageOrder==RowMajor) uplo='U'; \
 \
@@ -151,23 +141,23 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLh
         b_tmp = rhs.transpose(); \
       } \
       b = b_tmp.data(); \
-      ldb = b_tmp.outerStride(); \
+      ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
     } \
 \
-    MKLPREFIX##hemm(&side, &uplo, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \
+    BLASPREFIX##hemm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
 \
   } \
 };
 
-EIGEN_MKL_SYMM_L(double, double, d, d)
-EIGEN_MKL_SYMM_L(float, float, f, s)
-EIGEN_MKL_HEMM_L(dcomplex, MKL_Complex16, cd, z)
-EIGEN_MKL_HEMM_L(scomplex, MKL_Complex8, cf, c)
+EIGEN_BLAS_SYMM_L(double, double, d, d)
+EIGEN_BLAS_SYMM_L(float, float, f, s)
+EIGEN_BLAS_HEMM_L(dcomplex, double, cd, z)
+EIGEN_BLAS_HEMM_L(scomplex, float, cf, c)
 
 
 /* Optimized matrix * selfadjoint matrix (?SYMM/?HEMM) product */
 
-#define EIGEN_MKL_SYMM_R(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
+#define EIGEN_BLAS_SYMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
 template <typename Index, \
           int LhsStorageOrder, bool ConjugateLhs, \
           int RhsStorageOrder, bool ConjugateRhs> \
@@ -179,27 +169,22 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateL
     const EIGTYPE* _lhs, Index lhsStride, \
     const EIGTYPE* _rhs, Index rhsStride, \
     EIGTYPE* res,        Index resStride, \
-    EIGTYPE alpha) \
+    EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
   { \
     char side='R', uplo='L'; \
-    MKL_INT m, n, lda, ldb, ldc; \
+    BlasIndex m, n, lda, ldb, ldc; \
     const EIGTYPE *a, *b; \
-    MKLTYPE alpha_, beta_; \
+    EIGTYPE beta(1); \
     MatrixX##EIGPREFIX b_tmp; \
-    EIGTYPE myone(1);\
 \
 /* Set m, n, k */ \
-    m = (MKL_INT)rows;  \
-    n = (MKL_INT)cols;  \
-\
-/* Set alpha_ & beta_ */ \
-    assign_scalar_eig2mkl(alpha_, alpha); \
-    assign_scalar_eig2mkl(beta_, myone); \
+    m = convert_index<BlasIndex>(rows);  \
+    n = convert_index<BlasIndex>(cols);  \
 \
 /* Set lda, ldb, ldc */ \
-    lda = (MKL_INT)rhsStride; \
-    ldb = (MKL_INT)lhsStride; \
-    ldc = (MKL_INT)resStride; \
+    lda = convert_index<BlasIndex>(rhsStride); \
+    ldb = convert_index<BlasIndex>(lhsStride); \
+    ldc = convert_index<BlasIndex>(resStride); \
 \
 /* Set a, b, c */ \
     if (RhsStorageOrder==RowMajor) uplo='U'; \
@@ -209,16 +194,16 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateL
       Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,n,m,OuterStride<>(rhsStride)); \
       b_tmp = lhs.adjoint(); \
       b = b_tmp.data(); \
-      ldb = b_tmp.outerStride(); \
+      ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
     } else b = _lhs; \
 \
-    MKLPREFIX##symm(&side, &uplo, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \
+    BLASPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
 \
   } \
 };
 
 
-#define EIGEN_MKL_HEMM_R(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
+#define EIGEN_BLAS_HEMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
 template <typename Index, \
           int LhsStorageOrder, bool ConjugateLhs, \
           int RhsStorageOrder, bool ConjugateRhs> \
@@ -229,35 +214,30 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateL
     const EIGTYPE* _lhs, Index lhsStride, \
     const EIGTYPE* _rhs, Index rhsStride, \
     EIGTYPE* res,        Index resStride, \
-    EIGTYPE alpha) \
+    EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
   { \
     char side='R', uplo='L'; \
-    MKL_INT m, n, lda, ldb, ldc; \
+    BlasIndex m, n, lda, ldb, ldc; \
     const EIGTYPE *a, *b; \
-    MKLTYPE alpha_, beta_; \
+    EIGTYPE beta(1); \
     MatrixX##EIGPREFIX b_tmp; \
     Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> a_tmp; \
-    EIGTYPE myone(1); \
 \
 /* Set m, n, k */ \
-    m = (MKL_INT)rows; \
-    n = (MKL_INT)cols; \
-\
-/* Set alpha_ & beta_ */ \
-    assign_scalar_eig2mkl(alpha_, alpha); \
-    assign_scalar_eig2mkl(beta_, myone); \
+    m = convert_index<BlasIndex>(rows); \
+    n = convert_index<BlasIndex>(cols); \
 \
 /* Set lda, ldb, ldc */ \
-    lda = (MKL_INT)rhsStride; \
-    ldb = (MKL_INT)lhsStride; \
-    ldc = (MKL_INT)resStride; \
+    lda = convert_index<BlasIndex>(rhsStride); \
+    ldb = convert_index<BlasIndex>(lhsStride); \
+    ldc = convert_index<BlasIndex>(resStride); \
 \
 /* Set a, b, c */ \
     if (((RhsStorageOrder==ColMajor) && ConjugateRhs) || ((RhsStorageOrder==RowMajor) && (!ConjugateRhs))) { \
       Map<const Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder>, 0, OuterStride<> > rhs(_rhs,n,n,OuterStride<>(rhsStride)); \
       a_tmp = rhs.conjugate(); \
       a = a_tmp.data(); \
-      lda = a_tmp.outerStride(); \
+      lda = convert_index<BlasIndex>(a_tmp.outerStride()); \
     } else a = _rhs; \
     if (RhsStorageOrder==RowMajor) uplo='U'; \
 \
@@ -276,20 +256,20 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateL
         b_tmp = lhs.transpose(); \
       } \
       b = b_tmp.data(); \
-      ldb = b_tmp.outerStride(); \
+      ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
     } \
 \
-    MKLPREFIX##hemm(&side, &uplo, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \
+    BLASPREFIX##hemm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
   } \
 };
 
-EIGEN_MKL_SYMM_R(double, double, d, d)
-EIGEN_MKL_SYMM_R(float, float, f, s)
-EIGEN_MKL_HEMM_R(dcomplex, MKL_Complex16, cd, z)
-EIGEN_MKL_HEMM_R(scomplex, MKL_Complex8, cf, c)
+EIGEN_BLAS_SYMM_R(double, double, d, d)
+EIGEN_BLAS_SYMM_R(float, float, f, s)
+EIGEN_BLAS_HEMM_R(dcomplex, double, cd, z)
+EIGEN_BLAS_HEMM_R(scomplex, float, cf, c)
 
 } // end namespace internal
 
 } // end namespace Eigen
 
-#endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_MKL_H
+#endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_BLAS_H
diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector.h b/Eigen/src/Core/products/SelfadjointMatrixVector.h
index f698f67f9..3fd180e6c 100644
--- a/Eigen/src/Core/products/SelfadjointMatrixVector.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixVector.h
@@ -30,7 +30,7 @@ struct selfadjoint_matrix_vector_product
 static EIGEN_DONT_INLINE void run(
   Index size,
   const Scalar*  lhs, Index lhsStride,
-  const Scalar* _rhs, Index rhsIncr,
+  const Scalar*  rhs,
   Scalar* res,
   Scalar alpha);
 };
@@ -39,11 +39,12 @@ template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool Conju
 EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,Version>::run(
   Index size,
   const Scalar*  lhs, Index lhsStride,
-  const Scalar* _rhs, Index rhsIncr,
+  const Scalar*  rhs,
   Scalar* res,
   Scalar alpha)
 {
   typedef typename packet_traits<Scalar>::type Packet;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
   const Index PacketSize = sizeof(Packet)/sizeof(Scalar);
 
   enum {
@@ -54,23 +55,13 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd
 
   conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs,  IsRowMajor), ConjugateRhs> cj0;
   conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> cj1;
-  conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex, ConjugateRhs> cjd;
+  conj_helper<RealScalar,Scalar,false, ConjugateRhs> cjd;
 
   conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs,  IsRowMajor), ConjugateRhs> pcj0;
   conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> pcj1;
 
   Scalar cjAlpha = ConjugateRhs ? numext::conj(alpha) : alpha;
 
-  // FIXME this copy is now handled outside product_selfadjoint_vector, so it could probably be removed.
-  // if the rhs is not sequentially stored in memory we copy it to a temporary buffer,
-  // this is because we need to extract packets
-  ei_declare_aligned_stack_constructed_variable(Scalar,rhs,size,rhsIncr==1 ? const_cast<Scalar*>(_rhs) : 0);  
-  if (rhsIncr!=1)
-  {
-    const Scalar* it = _rhs;
-    for (Index i=0; i<size; ++i, it+=rhsIncr)
-      rhs[i] = *it;
-  }
 
   Index bound = (std::max)(Index(0),size-8) & 0xfffffffe;
   if (FirstTriangular)
@@ -92,12 +83,11 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd
     Scalar t3(0);
     Packet ptmp3 = pset1<Packet>(t3);
 
-    size_t starti = FirstTriangular ? 0 : j+2;
-    size_t endi   = FirstTriangular ? j : size;
-    size_t alignedStart = (starti) + internal::first_aligned(&res[starti], endi-starti);
-    size_t alignedEnd = alignedStart + ((endi-alignedStart)/(PacketSize))*(PacketSize);
+    Index starti = FirstTriangular ? 0 : j+2;
+    Index endi   = FirstTriangular ? j : size;
+    Index alignedStart = (starti) + internal::first_default_aligned(&res[starti], endi-starti);
+    Index alignedEnd = alignedStart + ((endi-alignedStart)/(PacketSize))*(PacketSize);
 
-    // TODO make sure this product is a real * complex and that the rhs is properly conjugated if needed
     res[j]   += cjd.pmul(numext::real(A0[j]), t0);
     res[j+1] += cjd.pmul(numext::real(A1[j+1]), t1);
     if(FirstTriangular)
@@ -111,11 +101,11 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd
       t2 += cj1.pmul(A0[j+1], rhs[j+1]);
     }
 
-    for (size_t i=starti; i<alignedStart; ++i)
+    for (Index i=starti; i<alignedStart; ++i)
     {
-      res[i] += t0 * A0[i] + t1 * A1[i];
-      t2 += numext::conj(A0[i]) * rhs[i];
-      t3 += numext::conj(A1[i]) * rhs[i];
+      res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i],t1);
+      t2 += cj1.pmul(A0[i], rhs[i]);
+      t3 += cj1.pmul(A1[i], rhs[i]);
     }
     // Yes this an optimization for gcc 4.3 and 4.4 (=> huge speed up)
     // gcc 4.2 does this optimization automatically.
@@ -123,7 +113,7 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd
     const Scalar* EIGEN_RESTRICT a1It  = A1  + alignedStart;
     const Scalar* EIGEN_RESTRICT rhsIt = rhs + alignedStart;
           Scalar* EIGEN_RESTRICT resIt = res + alignedStart;
-    for (size_t i=alignedStart; i<alignedEnd; i+=PacketSize)
+    for (Index i=alignedStart; i<alignedEnd; i+=PacketSize)
     {
       Packet A0i = ploadu<Packet>(a0It);  a0It  += PacketSize;
       Packet A1i = ploadu<Packet>(a1It);  a1It  += PacketSize;
@@ -135,7 +125,7 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd
       ptmp3 = pcj1.pmadd(A1i,  Bi, ptmp3);
       pstore(resIt,Xi); resIt += PacketSize;
     }
-    for (size_t i=alignedEnd; i<endi; i++)
+    for (Index i=alignedEnd; i<endi; i++)
     {
       res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i],t1);
       t2 += cj1.pmul(A0[i], rhs[i]);
@@ -151,7 +141,6 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd
 
     Scalar t1 = cjAlpha * rhs[j];
     Scalar t2(0);
-    // TODO make sure this product is a real * complex and that the rhs is properly conjugated if needed
     res[j] += cjd.pmul(numext::real(A0[j]), t1);
     for (Index i=FirstTriangular ? 0 : j+1; i<(FirstTriangular ? j : size); i++)
     {
@@ -169,45 +158,44 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd
 ***************************************************************************/
 
 namespace internal {
-template<typename Lhs, int LhsMode, typename Rhs>
-struct traits<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true> >
-  : traits<ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>, Lhs, Rhs> >
-{};
-}
 
 template<typename Lhs, int LhsMode, typename Rhs>
-struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>
-  : public ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>, Lhs, Rhs >
+struct selfadjoint_product_impl<Lhs,LhsMode,false,Rhs,0,true>
 {
-  EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix)
-
-  enum {
-    LhsUpLo = LhsMode&(Upper|Lower)
-  };
-
-  SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
-
-  template<typename Dest> void scaleAndAddTo(Dest& dest, const Scalar& alpha) const
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+  
+  typedef internal::blas_traits<Lhs> LhsBlasTraits;
+  typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+  typedef typename internal::remove_all<ActualLhsType>::type ActualLhsTypeCleaned;
+  
+  typedef internal::blas_traits<Rhs> RhsBlasTraits;
+  typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+  typedef typename internal::remove_all<ActualRhsType>::type ActualRhsTypeCleaned;
+
+  enum { LhsUpLo = LhsMode&(Upper|Lower) };
+
+  template<typename Dest>
+  static void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha)
   {
     typedef typename Dest::Scalar ResScalar;
-    typedef typename Base::RhsScalar RhsScalar;
-    typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
+    typedef typename Rhs::Scalar RhsScalar;
+    typedef Map<Matrix<ResScalar,Dynamic,1>, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits<ResScalar>::size)> MappedDest;
     
-    eigen_assert(dest.rows()==m_lhs.rows() && dest.cols()==m_rhs.cols());
+    eigen_assert(dest.rows()==a_lhs.rows() && dest.cols()==a_rhs.cols());
 
-    typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(m_lhs);
-    typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(m_rhs);
+    typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs);
+    typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs);
 
-    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
-                               * RhsBlasTraits::extractScalarFactor(m_rhs);
+    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs)
+                               * RhsBlasTraits::extractScalarFactor(a_rhs);
 
     enum {
       EvalToDest = (Dest::InnerStrideAtCompileTime==1),
-      UseRhs = (_ActualRhsType::InnerStrideAtCompileTime==1)
+      UseRhs = (ActualRhsTypeCleaned::InnerStrideAtCompileTime==1)
     };
     
     internal::gemv_static_vector_if<ResScalar,Dest::SizeAtCompileTime,Dest::MaxSizeAtCompileTime,!EvalToDest> static_dest;
-    internal::gemv_static_vector_if<RhsScalar,_ActualRhsType::SizeAtCompileTime,_ActualRhsType::MaxSizeAtCompileTime,!UseRhs> static_rhs;
+    internal::gemv_static_vector_if<RhsScalar,ActualRhsTypeCleaned::SizeAtCompileTime,ActualRhsTypeCleaned::MaxSizeAtCompileTime,!UseRhs> static_rhs;
 
     ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(),
                                                   EvalToDest ? dest.data() : static_dest.data());
@@ -218,7 +206,7 @@ struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>
     if(!EvalToDest)
     {
       #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-      int size = dest.size();
+      Index size = dest.size();
       EIGEN_DENSE_STORAGE_CTOR_PLUGIN
       #endif
       MappedDest(actualDestPtr, dest.size()) = dest;
@@ -227,18 +215,19 @@ struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>
     if(!UseRhs)
     {
       #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-      int size = rhs.size();
+      Index size = rhs.size();
       EIGEN_DENSE_STORAGE_CTOR_PLUGIN
       #endif
-      Map<typename _ActualRhsType::PlainObject>(actualRhsPtr, rhs.size()) = rhs;
+      Map<typename ActualRhsTypeCleaned::PlainObject>(actualRhsPtr, rhs.size()) = rhs;
     }
       
       
-    internal::selfadjoint_matrix_vector_product<Scalar, Index, (internal::traits<_ActualLhsType>::Flags&RowMajorBit) ? RowMajor : ColMajor, int(LhsUpLo), bool(LhsBlasTraits::NeedToConjugate), bool(RhsBlasTraits::NeedToConjugate)>::run
+    internal::selfadjoint_matrix_vector_product<Scalar, Index, (internal::traits<ActualLhsTypeCleaned>::Flags&RowMajorBit) ? RowMajor : ColMajor,
+                                                int(LhsUpLo), bool(LhsBlasTraits::NeedToConjugate), bool(RhsBlasTraits::NeedToConjugate)>::run
       (
         lhs.rows(),                             // size
         &lhs.coeffRef(0,0),  lhs.outerStride(), // lhs info
-        actualRhsPtr, 1,                        // rhs info
+        actualRhsPtr,                           // rhs info
         actualDestPtr,                          // result info
         actualAlpha                             // scale factor
       );
@@ -248,34 +237,24 @@ struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>
   }
 };
 
-namespace internal {
-template<typename Lhs, typename Rhs, int RhsMode>
-struct traits<SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false> >
-  : traits<ProductBase<SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false>, Lhs, Rhs> >
-{};
-}
-
 template<typename Lhs, typename Rhs, int RhsMode>
-struct SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false>
-  : public ProductBase<SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false>, Lhs, Rhs >
+struct selfadjoint_product_impl<Lhs,0,true,Rhs,RhsMode,false>
 {
-  EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix)
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+  enum { RhsUpLo = RhsMode&(Upper|Lower)  };
 
-  enum {
-    RhsUpLo = RhsMode&(Upper|Lower)
-  };
-
-  SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
-
-  template<typename Dest> void scaleAndAddTo(Dest& dest, const Scalar& alpha) const
+  template<typename Dest>
+  static void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha)
   {
     // let's simply transpose the product
     Transpose<Dest> destT(dest);
-    SelfadjointProductMatrix<Transpose<const Rhs>, int(RhsUpLo)==Upper ? Lower : Upper, false,
-                             Transpose<const Lhs>, 0, true>(m_rhs.transpose(), m_lhs.transpose()).scaleAndAddTo(destT, alpha);
+    selfadjoint_product_impl<Transpose<const Rhs>, int(RhsUpLo)==Upper ? Lower : Upper, false,
+                             Transpose<const Lhs>, 0, true>::run(destT, a_rhs.transpose(), a_lhs.transpose(), alpha);
   }
 };
 
+} // end namespace internal
+
 } // end namespace Eigen
 
 #endif // EIGEN_SELFADJOINT_MATRIX_VECTOR_H
diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h b/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h
index 86684b66d..38f23accf 100644
--- a/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h
@@ -25,13 +25,13 @@
  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
  ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
+ *   Content : Eigen bindings to BLAS F77
  *   Selfadjoint matrix-vector product functionality based on ?SYMV/HEMV.
  ********************************************************************************
 */
 
-#ifndef EIGEN_SELFADJOINT_MATRIX_VECTOR_MKL_H
-#define EIGEN_SELFADJOINT_MATRIX_VECTOR_MKL_H
+#ifndef EIGEN_SELFADJOINT_MATRIX_VECTOR_BLAS_H
+#define EIGEN_SELFADJOINT_MATRIX_VECTOR_BLAS_H
 
 namespace Eigen { 
 
@@ -47,31 +47,31 @@ template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool Conju
 struct selfadjoint_matrix_vector_product_symv :
   selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,BuiltIn> {};
 
-#define EIGEN_MKL_SYMV_SPECIALIZE(Scalar) \
+#define EIGEN_BLAS_SYMV_SPECIALIZE(Scalar) \
 template<typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs> \
 struct selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,Specialized> { \
 static void run( \
   Index size, const Scalar*  lhs, Index lhsStride, \
-  const Scalar* _rhs, Index rhsIncr, Scalar* res, Scalar alpha) { \
+  const Scalar* _rhs, Scalar* res, Scalar alpha) { \
     enum {\
       IsColMajor = StorageOrder==ColMajor \
     }; \
     if (IsColMajor == ConjugateLhs) {\
       selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,BuiltIn>::run( \
-        size, lhs, lhsStride, _rhs, rhsIncr, res, alpha);  \
+        size, lhs, lhsStride, _rhs, res, alpha);  \
     } else {\
       selfadjoint_matrix_vector_product_symv<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs>::run( \
-        size, lhs, lhsStride, _rhs, rhsIncr, res, alpha);  \
+        size, lhs, lhsStride, _rhs, res, alpha);  \
     }\
   } \
 }; \
 
-EIGEN_MKL_SYMV_SPECIALIZE(double)
-EIGEN_MKL_SYMV_SPECIALIZE(float)
-EIGEN_MKL_SYMV_SPECIALIZE(dcomplex)
-EIGEN_MKL_SYMV_SPECIALIZE(scomplex)
+EIGEN_BLAS_SYMV_SPECIALIZE(double)
+EIGEN_BLAS_SYMV_SPECIALIZE(float)
+EIGEN_BLAS_SYMV_SPECIALIZE(dcomplex)
+EIGEN_BLAS_SYMV_SPECIALIZE(scomplex)
 
-#define EIGEN_MKL_SYMV_SPECIALIZATION(EIGTYPE,MKLTYPE,MKLFUNC) \
+#define EIGEN_BLAS_SYMV_SPECIALIZATION(EIGTYPE,BLASTYPE,BLASFUNC) \
 template<typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs> \
 struct selfadjoint_matrix_vector_product_symv<EIGTYPE,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs> \
 { \
@@ -79,36 +79,33 @@ typedef Matrix<EIGTYPE,Dynamic,1,ColMajor> SYMVVector;\
 \
 static void run( \
 Index size, const EIGTYPE*  lhs, Index lhsStride, \
-const EIGTYPE* _rhs, Index rhsIncr, EIGTYPE* res, EIGTYPE alpha) \
+const EIGTYPE* _rhs, EIGTYPE* res, EIGTYPE alpha) \
 { \
   enum {\
     IsRowMajor = StorageOrder==RowMajor ? 1 : 0, \
     IsLower = UpLo == Lower ? 1 : 0 \
   }; \
-  MKL_INT n=size, lda=lhsStride, incx=rhsIncr, incy=1; \
-  MKLTYPE alpha_, beta_; \
-  const EIGTYPE *x_ptr, myone(1); \
+  BlasIndex n=convert_index<BlasIndex>(size), lda=convert_index<BlasIndex>(lhsStride), incx=1, incy=1; \
+  EIGTYPE beta(1); \
+  const EIGTYPE *x_ptr; \
   char uplo=(IsRowMajor) ? (IsLower ? 'U' : 'L') : (IsLower ? 'L' : 'U'); \
-  assign_scalar_eig2mkl(alpha_, alpha); \
-  assign_scalar_eig2mkl(beta_, myone); \
   SYMVVector x_tmp; \
   if (ConjugateRhs) { \
-    Map<const SYMVVector, 0, InnerStride<> > map_x(_rhs,size,1,InnerStride<>(incx)); \
+    Map<const SYMVVector, 0 > map_x(_rhs,size,1); \
     x_tmp=map_x.conjugate(); \
     x_ptr=x_tmp.data(); \
-    incx=1; \
   } else x_ptr=_rhs; \
-  MKLFUNC(&uplo, &n, &alpha_, (const MKLTYPE*)lhs, &lda, (const MKLTYPE*)x_ptr, &incx, &beta_, (MKLTYPE*)res, &incy); \
+  BLASFUNC(&uplo, &n, &numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, &numext::real_ref(beta), (BLASTYPE*)res, &incy); \
 }\
 };
 
-EIGEN_MKL_SYMV_SPECIALIZATION(double,   double,        dsymv)
-EIGEN_MKL_SYMV_SPECIALIZATION(float,    float,         ssymv)
-EIGEN_MKL_SYMV_SPECIALIZATION(dcomplex, MKL_Complex16, zhemv)
-EIGEN_MKL_SYMV_SPECIALIZATION(scomplex, MKL_Complex8,  chemv)
+EIGEN_BLAS_SYMV_SPECIALIZATION(double,   double, dsymv_)
+EIGEN_BLAS_SYMV_SPECIALIZATION(float,    float,  ssymv_)
+EIGEN_BLAS_SYMV_SPECIALIZATION(dcomplex, double, zhemv_)
+EIGEN_BLAS_SYMV_SPECIALIZATION(scomplex, float,  chemv_)
 
 } // end namespace internal
 
 } // end namespace Eigen
 
-#endif // EIGEN_SELFADJOINT_MATRIX_VECTOR_MKL_H
+#endif // EIGEN_SELFADJOINT_MATRIX_VECTOR_BLAS_H
diff --git a/Eigen/src/Core/products/SelfadjointProduct.h b/Eigen/src/Core/products/SelfadjointProduct.h
index 6ca4ae6c0..f038d686f 100644
--- a/Eigen/src/Core/products/SelfadjointProduct.h
+++ b/Eigen/src/Core/products/SelfadjointProduct.h
@@ -53,7 +53,6 @@ struct selfadjoint_product_selector<MatrixType,OtherType,UpLo,true>
   static void run(MatrixType& mat, const OtherType& other, const typename MatrixType::Scalar& alpha)
   {
     typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::Index Index;
     typedef internal::blas_traits<OtherType> OtherBlasTraits;
     typedef typename OtherBlasTraits::DirectLinearAccessType ActualOtherType;
     typedef typename internal::remove_all<ActualOtherType>::type _ActualOtherType;
@@ -86,7 +85,6 @@ struct selfadjoint_product_selector<MatrixType,OtherType,UpLo,false>
   static void run(MatrixType& mat, const OtherType& other, const typename MatrixType::Scalar& alpha)
   {
     typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::Index Index;
     typedef internal::blas_traits<OtherType> OtherBlasTraits;
     typedef typename OtherBlasTraits::DirectLinearAccessType ActualOtherType;
     typedef typename internal::remove_all<ActualOtherType>::type _ActualOtherType;
@@ -94,15 +92,27 @@ struct selfadjoint_product_selector<MatrixType,OtherType,UpLo,false>
 
     Scalar actualAlpha = alpha * OtherBlasTraits::extractScalarFactor(other.derived());
 
-    enum { IsRowMajor = (internal::traits<MatrixType>::Flags&RowMajorBit) ? 1 : 0 };
+    enum {
+      IsRowMajor = (internal::traits<MatrixType>::Flags&RowMajorBit) ? 1 : 0,
+      OtherIsRowMajor = _ActualOtherType::Flags&RowMajorBit ? 1 : 0
+    };
+
+    Index size = mat.cols();
+    Index depth = actualOther.cols();
+
+    typedef internal::gemm_blocking_space<IsRowMajor ? RowMajor : ColMajor,Scalar,Scalar,
+              MatrixType::MaxColsAtCompileTime, MatrixType::MaxColsAtCompileTime, _ActualOtherType::MaxColsAtCompileTime> BlockingType;
+
+    BlockingType blocking(size, size, depth, 1, false);
+
 
     internal::general_matrix_matrix_triangular_product<Index,
-      Scalar, _ActualOtherType::Flags&RowMajorBit ? RowMajor : ColMajor,   OtherBlasTraits::NeedToConjugate  && NumTraits<Scalar>::IsComplex,
-      Scalar, _ActualOtherType::Flags&RowMajorBit ? ColMajor : RowMajor, (!OtherBlasTraits::NeedToConjugate) && NumTraits<Scalar>::IsComplex,
-      MatrixType::Flags&RowMajorBit ? RowMajor : ColMajor, UpLo>
-      ::run(mat.cols(), actualOther.cols(),
+      Scalar, OtherIsRowMajor ? RowMajor : ColMajor,   OtherBlasTraits::NeedToConjugate  && NumTraits<Scalar>::IsComplex,
+      Scalar, OtherIsRowMajor ? ColMajor : RowMajor, (!OtherBlasTraits::NeedToConjugate) && NumTraits<Scalar>::IsComplex,
+      IsRowMajor ? RowMajor : ColMajor, UpLo>
+      ::run(size, depth,
             &actualOther.coeffRef(0,0), actualOther.outerStride(), &actualOther.coeffRef(0,0), actualOther.outerStride(),
-            mat.data(), mat.outerStride(), actualAlpha);
+            mat.data(), mat.outerStride(), actualAlpha, blocking);
   }
 };
 
diff --git a/Eigen/src/Core/products/SelfadjointRank2Update.h b/Eigen/src/Core/products/SelfadjointRank2Update.h
index 8594a97ce..2ae364111 100644
--- a/Eigen/src/Core/products/SelfadjointRank2Update.h
+++ b/Eigen/src/Core/products/SelfadjointRank2Update.h
@@ -79,11 +79,11 @@ SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
   if (IsRowMajor)
     actualAlpha = numext::conj(actualAlpha);
 
-  internal::selfadjoint_rank2_update_selector<Scalar, Index,
-    typename internal::remove_all<typename internal::conj_expr_if<IsRowMajor ^ UBlasTraits::NeedToConjugate,_ActualUType>::type>::type,
-    typename internal::remove_all<typename internal::conj_expr_if<IsRowMajor ^ VBlasTraits::NeedToConjugate,_ActualVType>::type>::type,
+  typedef typename internal::remove_all<typename internal::conj_expr_if<IsRowMajor ^ UBlasTraits::NeedToConjugate,_ActualUType>::type>::type UType;
+  typedef typename internal::remove_all<typename internal::conj_expr_if<IsRowMajor ^ VBlasTraits::NeedToConjugate,_ActualVType>::type>::type VType;
+  internal::selfadjoint_rank2_update_selector<Scalar, Index, UType, VType,
     (IsRowMajor ? int(UpLo==Upper ? Lower : Upper) : UpLo)>
-    ::run(_expression().const_cast_derived().data(),_expression().outerStride(),actualU,actualV,actualAlpha);
+    ::run(_expression().const_cast_derived().data(),_expression().outerStride(),UType(actualU),VType(actualV),actualAlpha);
 
   return *this;
 }
diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h
index 8110507b5..6ec5a8a0b 100644
--- a/Eigen/src/Core/products/TriangularMatrixMatrix.h
+++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h
@@ -108,7 +108,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
     Index _rows, Index _cols, Index _depth,
     const Scalar* _lhs, Index lhsStride,
     const Scalar* _rhs, Index rhsStride,
-    Scalar* res,        Index resStride,
+    Scalar* _res,        Index resStride,
     const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
   {
     // strip zeros
@@ -117,30 +117,36 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
     Index depth     = IsLower ? diagSize : _depth;
     Index cols      = _cols;
     
-    const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
-    const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
+    typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
+    typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
+    typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
+    LhsMapper lhs(_lhs,lhsStride);
+    RhsMapper rhs(_rhs,rhsStride);
+    ResMapper res(_res, resStride);
 
     Index kc = blocking.kc();                   // cache block size along the K direction
     Index mc = (std::min)(rows,blocking.mc());  // cache block size along the M direction
+    // The small panel size must not be larger than blocking size.
+    // Usually this should never be the case because SmallPanelWidth^2 is very small
+    // compared to L2 cache size, but let's be safe:
+    Index panelWidth = (std::min)(Index(SmallPanelWidth),(std::min)(kc,mc));
 
     std::size_t sizeA = kc*mc;
     std::size_t sizeB = kc*cols;
-    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
 
     ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
     ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
-    ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW());
 
-    Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer;
+    Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer((internal::constructor_without_unaligned_array_assert()));
     triangularBuffer.setZero();
     if((Mode&ZeroDiag)==ZeroDiag)
       triangularBuffer.diagonal().setZero();
     else
       triangularBuffer.diagonal().setOnes();
 
-    gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
-    gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
-    gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
+    gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
+    gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+    gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
 
     for(Index k2=IsLower ? depth : 0;
         IsLower ? k2>0 : k2<depth;
@@ -156,7 +162,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
         k2 = k2+actual_kc-kc;
       }
 
-      pack_rhs(blockB, &rhs(actual_k2,0), rhsStride, actual_kc, cols);
+      pack_rhs(blockB, rhs.getSubMapper(actual_k2,0), actual_kc, cols);
 
       // the selected lhs's panel has to be split in three different parts:
       //  1 - the part which is zero => skip it
@@ -167,9 +173,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
       if(IsLower || actual_k2<rows)
       {
         // for each small vertical panels of lhs
-        for (Index k1=0; k1<actual_kc; k1+=SmallPanelWidth)
+        for (Index k1=0; k1<actual_kc; k1+=panelWidth)
         {
-          Index actualPanelWidth = std::min<Index>(actual_kc-k1, SmallPanelWidth);
+          Index actualPanelWidth = std::min<Index>(actual_kc-k1, panelWidth);
           Index lengthTarget = IsLower ? actual_kc-k1-actualPanelWidth : k1;
           Index startBlock   = actual_k2+k1;
           Index blockBOffset = k1;
@@ -184,20 +190,22 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
             for (Index i=IsLower ? k+1 : 0; IsLower ? i<actualPanelWidth : i<k; ++i)
               triangularBuffer.coeffRef(i,k) = lhs(startBlock+i,startBlock+k);
           }
-          pack_lhs(blockA, triangularBuffer.data(), triangularBuffer.outerStride(), actualPanelWidth, actualPanelWidth);
+          pack_lhs(blockA, LhsMapper(triangularBuffer.data(), triangularBuffer.outerStride()), actualPanelWidth, actualPanelWidth);
 
-          gebp_kernel(res+startBlock, resStride, blockA, blockB, actualPanelWidth, actualPanelWidth, cols, alpha,
-                      actualPanelWidth, actual_kc, 0, blockBOffset, blockW);
+          gebp_kernel(res.getSubMapper(startBlock, 0), blockA, blockB,
+                      actualPanelWidth, actualPanelWidth, cols, alpha,
+                      actualPanelWidth, actual_kc, 0, blockBOffset);
 
           // GEBP with remaining micro panel
           if (lengthTarget>0)
           {
             Index startTarget  = IsLower ? actual_k2+k1+actualPanelWidth : actual_k2;
 
-            pack_lhs(blockA, &lhs(startTarget,startBlock), lhsStride, actualPanelWidth, lengthTarget);
+            pack_lhs(blockA, lhs.getSubMapper(startTarget,startBlock), actualPanelWidth, lengthTarget);
 
-            gebp_kernel(res+startTarget, resStride, blockA, blockB, lengthTarget, actualPanelWidth, cols, alpha,
-                        actualPanelWidth, actual_kc, 0, blockBOffset, blockW);
+            gebp_kernel(res.getSubMapper(startTarget, 0), blockA, blockB,
+                        lengthTarget, actualPanelWidth, cols, alpha,
+                        actualPanelWidth, actual_kc, 0, blockBOffset);
           }
         }
       }
@@ -208,10 +216,11 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
         for(Index i2=start; i2<end; i2+=mc)
         {
           const Index actual_mc = (std::min)(i2+mc,end)-i2;
-          gemm_pack_lhs<Scalar, Index, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>()
-            (blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc);
+          gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>()
+            (blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc);
 
-          gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0, blockW);
+          gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc,
+                      actual_kc, cols, alpha, -1, -1, 0, 0);
         }
       }
     }
@@ -249,40 +258,43 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
     Index _rows, Index _cols, Index _depth,
     const Scalar* _lhs, Index lhsStride,
     const Scalar* _rhs, Index rhsStride,
-    Scalar* res,        Index resStride,
+    Scalar* _res,        Index resStride,
     const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
   {
+    const Index PacketBytes = packet_traits<Scalar>::size*sizeof(Scalar);
     // strip zeros
     Index diagSize  = (std::min)(_cols,_depth);
     Index rows      = _rows;
     Index depth     = IsLower ? _depth : diagSize;
     Index cols      = IsLower ? diagSize : _cols;
     
-    const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
-    const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
+    typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
+    typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
+    typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
+    LhsMapper lhs(_lhs,lhsStride);
+    RhsMapper rhs(_rhs,rhsStride);
+    ResMapper res(_res, resStride);
 
     Index kc = blocking.kc();                   // cache block size along the K direction
     Index mc = (std::min)(rows,blocking.mc());  // cache block size along the M direction
 
     std::size_t sizeA = kc*mc;
-    std::size_t sizeB = kc*cols;
-    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
+    std::size_t sizeB = kc*cols+EIGEN_MAX_ALIGN_BYTES/sizeof(Scalar);
 
     ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
     ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
-    ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW());
 
-    Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer;
+    Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer((internal::constructor_without_unaligned_array_assert()));
     triangularBuffer.setZero();
     if((Mode&ZeroDiag)==ZeroDiag)
       triangularBuffer.diagonal().setZero();
     else
       triangularBuffer.diagonal().setOnes();
 
-    gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
-    gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
-    gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
-    gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;
+    gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
+    gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+    gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
+    gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;
 
     for(Index k2=IsLower ? 0 : depth;
         IsLower ? k2<depth  : k2>0;
@@ -304,8 +316,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
       Index ts = (IsLower && actual_k2>=cols) ? 0 : actual_kc;
 
       Scalar* geb = blockB+ts*ts;
+      geb = geb + internal::first_aligned<PacketBytes>(geb,PacketBytes/sizeof(Scalar));
 
-      pack_rhs(geb, &rhs(actual_k2,IsLower ? 0 : k2), rhsStride, actual_kc, rs);
+      pack_rhs(geb, rhs.getSubMapper(actual_k2,IsLower ? 0 : k2), actual_kc, rs);
 
       // pack the triangular part of the rhs padding the unrolled blocks with zeros
       if(ts>0)
@@ -318,7 +331,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
           Index panelLength = IsLower ? actual_kc-j2-actualPanelWidth : j2;
           // general part
           pack_rhs_panel(blockB+j2*actual_kc,
-                         &rhs(actual_k2+panelOffset, actual_j2), rhsStride,
+                         rhs.getSubMapper(actual_k2+panelOffset, actual_j2),
                          panelLength, actualPanelWidth,
                          actual_kc, panelOffset);
 
@@ -332,7 +345,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
           }
 
           pack_rhs_panel(blockB+j2*actual_kc,
-                         triangularBuffer.data(), triangularBuffer.outerStride(),
+                         RhsMapper(triangularBuffer.data(), triangularBuffer.outerStride()),
                          actualPanelWidth, actualPanelWidth,
                          actual_kc, j2);
         }
@@ -341,7 +354,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
       for (Index i2=0; i2<rows; i2+=mc)
       {
         const Index actual_mc = (std::min)(mc,rows-i2);
-        pack_lhs(blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc);
+        pack_lhs(blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc);
 
         // triangular kernel
         if(ts>0)
@@ -352,19 +365,18 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
             Index panelLength = IsLower ? actual_kc-j2 : j2+actualPanelWidth;
             Index blockOffset = IsLower ? j2 : 0;
 
-            gebp_kernel(res+i2+(actual_k2+j2)*resStride, resStride,
+            gebp_kernel(res.getSubMapper(i2, actual_k2 + j2),
                         blockA, blockB+j2*actual_kc,
                         actual_mc, panelLength, actualPanelWidth,
                         alpha,
                         actual_kc, actual_kc,  // strides
-                        blockOffset, blockOffset,// offsets
-                        blockW); // workspace
+                        blockOffset, blockOffset);// offsets
           }
         }
-        gebp_kernel(res+i2+(IsLower ? 0 : k2)*resStride, resStride,
+        gebp_kernel(res.getSubMapper(i2, IsLower ? 0 : k2),
                     blockA, geb, actual_mc, actual_kc, rs,
                     alpha,
-                    -1, -1, 0, 0, blockW);
+                    -1, -1, 0, 0);
       }
     }
   }
@@ -373,28 +385,28 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
 * Wrapper to product_triangular_matrix_matrix
 ***************************************************************************/
 
-template<int Mode, bool LhsIsTriangular, typename Lhs, typename Rhs>
-struct traits<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false> >
-  : traits<ProductBase<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false>, Lhs, Rhs> >
-{};
-
 } // end namespace internal
 
+namespace internal {
 template<int Mode, bool LhsIsTriangular, typename Lhs, typename Rhs>
-struct TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false>
-  : public ProductBase<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false>, Lhs, Rhs >
+struct triangular_product_impl<Mode,LhsIsTriangular,Lhs,false,Rhs,false>
 {
-  EIGEN_PRODUCT_PUBLIC_INTERFACE(TriangularProduct)
-
-  TriangularProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
-
-  template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
+  template<typename Dest> static void run(Dest& dst, const Lhs &a_lhs, const Rhs &a_rhs, const typename Dest::Scalar& alpha)
   {
-    typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(m_lhs);
-    typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(m_rhs);
+    typedef typename Dest::Scalar     Scalar;
+    
+    typedef internal::blas_traits<Lhs> LhsBlasTraits;
+    typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+    typedef typename internal::remove_all<ActualLhsType>::type ActualLhsTypeCleaned;
+    typedef internal::blas_traits<Rhs> RhsBlasTraits;
+    typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+    typedef typename internal::remove_all<ActualRhsType>::type ActualRhsTypeCleaned;
+    
+    typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs);
+    typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs);
 
-    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
-                               * RhsBlasTraits::extractScalarFactor(m_rhs);
+    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs)
+                               * RhsBlasTraits::extractScalarFactor(a_rhs);
 
     typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar,
               Lhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxColsAtCompileTime,4> BlockingType;
@@ -405,23 +417,25 @@ struct TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false>
     Index stripedDepth = LhsIsTriangular ? ((!IsLower) ? lhs.cols() : (std::min)(lhs.cols(),lhs.rows()))
                                          : ((IsLower)  ? rhs.rows() : (std::min)(rhs.rows(),rhs.cols()));
 
-    BlockingType blocking(stripedRows, stripedCols, stripedDepth);
+    BlockingType blocking(stripedRows, stripedCols, stripedDepth, 1, false);
 
     internal::product_triangular_matrix_matrix<Scalar, Index,
       Mode, LhsIsTriangular,
-      (internal::traits<_ActualLhsType>::Flags&RowMajorBit) ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate,
-      (internal::traits<_ActualRhsType>::Flags&RowMajorBit) ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate,
+      (internal::traits<ActualLhsTypeCleaned>::Flags&RowMajorBit) ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate,
+      (internal::traits<ActualRhsTypeCleaned>::Flags&RowMajorBit) ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate,
       (internal::traits<Dest          >::Flags&RowMajorBit) ? RowMajor : ColMajor>
       ::run(
         stripedRows, stripedCols, stripedDepth,   // sizes
-        &lhs.coeffRef(0,0),    lhs.outerStride(), // lhs info
-        &rhs.coeffRef(0,0),    rhs.outerStride(), // rhs info
+        &lhs.coeffRef(0,0), lhs.outerStride(),    // lhs info
+        &rhs.coeffRef(0,0), rhs.outerStride(),    // rhs info
         &dst.coeffRef(0,0), dst.outerStride(),    // result info
         actualAlpha, blocking
       );
   }
 };
 
+} // end namespace internal
+
 } // end namespace Eigen
 
 #endif // EIGEN_TRIANGULAR_MATRIX_MATRIX_H
diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h b/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h
index ba41a1c99..aecded6bb 100644
--- a/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h
+++ b/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h
@@ -25,13 +25,13 @@
  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
  ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
+ *   Content : Eigen bindings to BLAS F77
  *   Triangular matrix * matrix product functionality based on ?TRMM.
  ********************************************************************************
 */
 
-#ifndef EIGEN_TRIANGULAR_MATRIX_MATRIX_MKL_H
-#define EIGEN_TRIANGULAR_MATRIX_MATRIX_MKL_H
+#ifndef EIGEN_TRIANGULAR_MATRIX_MATRIX_BLAS_H
+#define EIGEN_TRIANGULAR_MATRIX_MATRIX_BLAS_H
 
 namespace Eigen { 
 
@@ -50,7 +50,7 @@ struct product_triangular_matrix_matrix_trmm :
 
 
 // try to go to BLAS specialization
-#define EIGEN_MKL_TRMM_SPECIALIZE(Scalar, LhsIsTriangular) \
+#define EIGEN_BLAS_TRMM_SPECIALIZE(Scalar, LhsIsTriangular) \
 template <typename Index, int Mode, \
           int LhsStorageOrder, bool ConjugateLhs, \
           int RhsStorageOrder, bool ConjugateRhs> \
@@ -65,17 +65,17 @@ struct product_triangular_matrix_matrix<Scalar,Index, Mode, LhsIsTriangular, \
   } \
 };
 
-EIGEN_MKL_TRMM_SPECIALIZE(double, true)
-EIGEN_MKL_TRMM_SPECIALIZE(double, false)
-EIGEN_MKL_TRMM_SPECIALIZE(dcomplex, true)
-EIGEN_MKL_TRMM_SPECIALIZE(dcomplex, false)
-EIGEN_MKL_TRMM_SPECIALIZE(float, true)
-EIGEN_MKL_TRMM_SPECIALIZE(float, false)
-EIGEN_MKL_TRMM_SPECIALIZE(scomplex, true)
-EIGEN_MKL_TRMM_SPECIALIZE(scomplex, false)
+EIGEN_BLAS_TRMM_SPECIALIZE(double, true)
+EIGEN_BLAS_TRMM_SPECIALIZE(double, false)
+EIGEN_BLAS_TRMM_SPECIALIZE(dcomplex, true)
+EIGEN_BLAS_TRMM_SPECIALIZE(dcomplex, false)
+EIGEN_BLAS_TRMM_SPECIALIZE(float, true)
+EIGEN_BLAS_TRMM_SPECIALIZE(float, false)
+EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, true)
+EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, false)
 
 // implements col-major += alpha * op(triangular) * op(general)
-#define EIGEN_MKL_TRMM_L(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
+#define EIGEN_BLAS_TRMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
 template <typename Index, int Mode, \
           int LhsStorageOrder, bool ConjugateLhs, \
           int RhsStorageOrder, bool ConjugateRhs> \
@@ -106,13 +106,14 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \
    typedef Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder> MatrixLhs; \
    typedef Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> MatrixRhs; \
 \
-/* Non-square case - doesn't fit to MKL ?TRMM. Fall to default triangular product or call MKL ?GEMM*/ \
+/* Non-square case - doesn't fit to BLAS ?TRMM. Fall to default triangular product or call BLAS ?GEMM*/ \
    if (rows != depth) { \
 \
-     int nthr = mkl_domain_get_max_threads(MKL_BLAS); \
+     /* FIXME handle mkl_domain_get_max_threads */ \
+     /*int nthr = mkl_domain_get_max_threads(EIGEN_BLAS_DOMAIN_BLAS);*/ int nthr = 1;\
 \
      if (((nthr==1) && (((std::max)(rows,depth)-diagSize)/(double)diagSize < 0.5))) { \
-     /* Most likely no benefit to call TRMM or GEMM from MKL*/ \
+     /* Most likely no benefit to call TRMM or GEMM from BLAS */ \
        product_triangular_matrix_matrix<EIGTYPE,Index,Mode,true, \
        LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, BuiltIn>::run( \
            _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \
@@ -121,27 +122,23 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \
      /* Make sense to call GEMM */ \
        Map<const MatrixLhs, 0, OuterStride<> > lhsMap(_lhs,rows,depth,OuterStride<>(lhsStride)); \
        MatrixLhs aa_tmp=lhsMap.template triangularView<Mode>(); \
-       MKL_INT aStride = aa_tmp.outerStride(); \
-       gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth); \
+       BlasIndex aStride = convert_index<BlasIndex>(aa_tmp.outerStride()); \
+       gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth, 1, true); \
        general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor>::run( \
        rows, cols, depth, aa_tmp.data(), aStride, _rhs, rhsStride, res, resStride, alpha, gemm_blocking, 0); \
 \
-     /*std::cout << "TRMM_L: A is not square! Go to MKL GEMM implementation! " << nthr<<" \n";*/ \
+     /*std::cout << "TRMM_L: A is not square! Go to BLAS GEMM implementation! " << nthr<<" \n";*/ \
      } \
      return; \
    } \
    char side = 'L', transa, uplo, diag = 'N'; \
    EIGTYPE *b; \
    const EIGTYPE *a; \
-   MKL_INT m, n, lda, ldb; \
-   MKLTYPE alpha_; \
-\
-/* Set alpha_*/ \
-   assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); \
+   BlasIndex m, n, lda, ldb; \
 \
 /* Set m, n */ \
-   m = (MKL_INT)diagSize; \
-   n = (MKL_INT)cols; \
+   m = convert_index<BlasIndex>(diagSize); \
+   n = convert_index<BlasIndex>(cols); \
 \
 /* Set trans */ \
    transa = (LhsStorageOrder==RowMajor) ? ((ConjugateLhs) ? 'C' : 'T') : 'N'; \
@@ -152,7 +149,7 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \
 \
    if (ConjugateRhs) b_tmp = rhs.conjugate(); else b_tmp = rhs; \
    b = b_tmp.data(); \
-   ldb = b_tmp.outerStride(); \
+   ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
 \
 /* Set uplo */ \
    uplo = IsLower ? 'L' : 'U'; \
@@ -168,14 +165,14 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \
      else if (IsUnitDiag) \
        a_tmp.diagonal().setOnes();\
      a = a_tmp.data(); \
-     lda = a_tmp.outerStride(); \
+     lda = convert_index<BlasIndex>(a_tmp.outerStride()); \
    } else { \
      a = _lhs; \
-     lda = lhsStride; \
+     lda = convert_index<BlasIndex>(lhsStride); \
    } \
-   /*std::cout << "TRMM_L: A is square! Go to MKL TRMM implementation! \n";*/ \
+   /*std::cout << "TRMM_L: A is square! Go to BLAS TRMM implementation! \n";*/ \
 /* call ?trmm*/ \
-   MKLPREFIX##trmm(&side, &uplo, &transa, &diag, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (MKLTYPE*)b, &ldb); \
+   BLASPREFIX##trmm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \
 \
 /* Add op(a_triangular)*b into res*/ \
    Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res,rows,cols,OuterStride<>(resStride)); \
@@ -183,13 +180,13 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \
   } \
 };
 
-EIGEN_MKL_TRMM_L(double, double, d, d)
-EIGEN_MKL_TRMM_L(dcomplex, MKL_Complex16, cd, z)
-EIGEN_MKL_TRMM_L(float, float, f, s)
-EIGEN_MKL_TRMM_L(scomplex, MKL_Complex8, cf, c)
+EIGEN_BLAS_TRMM_L(double, double, d, d)
+EIGEN_BLAS_TRMM_L(dcomplex, double, cd, z)
+EIGEN_BLAS_TRMM_L(float, float, f, s)
+EIGEN_BLAS_TRMM_L(scomplex, float, cf, c)
 
 // implements col-major += alpha * op(general) * op(triangular)
-#define EIGEN_MKL_TRMM_R(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
+#define EIGEN_BLAS_TRMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
 template <typename Index, int Mode, \
           int LhsStorageOrder, bool ConjugateLhs, \
           int RhsStorageOrder, bool ConjugateRhs> \
@@ -220,13 +217,13 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \
    typedef Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder> MatrixLhs; \
    typedef Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> MatrixRhs; \
 \
-/* Non-square case - doesn't fit to MKL ?TRMM. Fall to default triangular product or call MKL ?GEMM*/ \
+/* Non-square case - doesn't fit to BLAS ?TRMM. Fall to default triangular product or call BLAS ?GEMM*/ \
    if (cols != depth) { \
 \
-     int nthr = mkl_domain_get_max_threads(MKL_BLAS); \
+     int nthr = 1 /*mkl_domain_get_max_threads(EIGEN_BLAS_DOMAIN_BLAS)*/; \
 \
      if ((nthr==1) && (((std::max)(cols,depth)-diagSize)/(double)diagSize < 0.5)) { \
-     /* Most likely no benefit to call TRMM or GEMM from MKL*/ \
+     /* Most likely no benefit to call TRMM or GEMM from BLAS*/ \
        product_triangular_matrix_matrix<EIGTYPE,Index,Mode,false, \
        LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, BuiltIn>::run( \
            _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \
@@ -235,27 +232,23 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \
      /* Make sense to call GEMM */ \
        Map<const MatrixRhs, 0, OuterStride<> > rhsMap(_rhs,depth,cols, OuterStride<>(rhsStride)); \
        MatrixRhs aa_tmp=rhsMap.template triangularView<Mode>(); \
-       MKL_INT aStride = aa_tmp.outerStride(); \
-       gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth); \
+       BlasIndex aStride = convert_index<BlasIndex>(aa_tmp.outerStride()); \
+       gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth, 1, true); \
        general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor>::run( \
        rows, cols, depth, _lhs, lhsStride, aa_tmp.data(), aStride, res, resStride, alpha, gemm_blocking, 0); \
 \
-     /*std::cout << "TRMM_R: A is not square! Go to MKL GEMM implementation! " << nthr<<" \n";*/ \
+     /*std::cout << "TRMM_R: A is not square! Go to BLAS GEMM implementation! " << nthr<<" \n";*/ \
      } \
      return; \
    } \
    char side = 'R', transa, uplo, diag = 'N'; \
    EIGTYPE *b; \
    const EIGTYPE *a; \
-   MKL_INT m, n, lda, ldb; \
-   MKLTYPE alpha_; \
-\
-/* Set alpha_*/ \
-   assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); \
+   BlasIndex m, n, lda, ldb; \
 \
 /* Set m, n */ \
-   m = (MKL_INT)rows; \
-   n = (MKL_INT)diagSize; \
+   m = convert_index<BlasIndex>(rows); \
+   n = convert_index<BlasIndex>(diagSize); \
 \
 /* Set trans */ \
    transa = (RhsStorageOrder==RowMajor) ? ((ConjugateRhs) ? 'C' : 'T') : 'N'; \
@@ -266,7 +259,7 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \
 \
    if (ConjugateLhs) b_tmp = lhs.conjugate(); else b_tmp = lhs; \
    b = b_tmp.data(); \
-   ldb = b_tmp.outerStride(); \
+   ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
 \
 /* Set uplo */ \
    uplo = IsLower ? 'L' : 'U'; \
@@ -282,14 +275,14 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \
      else if (IsUnitDiag) \
        a_tmp.diagonal().setOnes();\
      a = a_tmp.data(); \
-     lda = a_tmp.outerStride(); \
+     lda = convert_index<BlasIndex>(a_tmp.outerStride()); \
    } else { \
      a = _rhs; \
-     lda = rhsStride; \
+     lda = convert_index<BlasIndex>(rhsStride); \
    } \
-   /*std::cout << "TRMM_R: A is square! Go to MKL TRMM implementation! \n";*/ \
+   /*std::cout << "TRMM_R: A is square! Go to BLAS TRMM implementation! \n";*/ \
 /* call ?trmm*/ \
-   MKLPREFIX##trmm(&side, &uplo, &transa, &diag, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (MKLTYPE*)b, &ldb); \
+   BLASPREFIX##trmm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \
 \
 /* Add op(a_triangular)*b into res*/ \
    Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res,rows,cols,OuterStride<>(resStride)); \
@@ -297,13 +290,13 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \
   } \
 };
 
-EIGEN_MKL_TRMM_R(double, double, d, d)
-EIGEN_MKL_TRMM_R(dcomplex, MKL_Complex16, cd, z)
-EIGEN_MKL_TRMM_R(float, float, f, s)
-EIGEN_MKL_TRMM_R(scomplex, MKL_Complex8, cf, c)
+EIGEN_BLAS_TRMM_R(double, double, d, d)
+EIGEN_BLAS_TRMM_R(dcomplex, double, cd, z)
+EIGEN_BLAS_TRMM_R(float, float, f, s)
+EIGEN_BLAS_TRMM_R(scomplex, float, cf, c)
 
 } // end namespace internal
 
 } // end namespace Eigen
 
-#endif // EIGEN_TRIANGULAR_MATRIX_MATRIX_MKL_H
+#endif // EIGEN_TRIANGULAR_MATRIX_MATRIX_BLAS_H
diff --git a/Eigen/src/Core/products/TriangularMatrixVector.h b/Eigen/src/Core/products/TriangularMatrixVector.h
index 6117d5a82..4b292e74d 100644
--- a/Eigen/src/Core/products/TriangularMatrixVector.h
+++ b/Eigen/src/Core/products/TriangularMatrixVector.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_TRIANGULARMATRIXVECTOR_H
 #define EIGEN_TRIANGULARMATRIXVECTOR_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
 
@@ -20,20 +20,20 @@ struct triangular_matrix_vector_product;
 template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int Version>
 struct triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,ColMajor,Version>
 {
-  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
   enum {
     IsLower = ((Mode&Lower)==Lower),
     HasUnitDiag = (Mode & UnitDiag)==UnitDiag,
     HasZeroDiag = (Mode & ZeroDiag)==ZeroDiag
   };
   static EIGEN_DONT_INLINE  void run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride,
-                                     const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha);
+                                     const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const RhsScalar& alpha);
 };
 
 template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int Version>
 EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,ColMajor,Version>
   ::run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride,
-        const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha)
+        const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const RhsScalar& alpha)
   {
     static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH;
     Index size = (std::min)(_rows,_cols);
@@ -43,7 +43,7 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con
     typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> > LhsMap;
     const LhsMap lhs(_lhs,rows,cols,OuterStride<>(lhsStride));
     typename conj_expr_if<ConjLhs,LhsMap>::type cjLhs(lhs);
-    
+
     typedef Map<const Matrix<RhsScalar,Dynamic,1>, 0, InnerStride<> > RhsMap;
     const RhsMap rhs(_rhs,cols,InnerStride<>(rhsIncr));
     typename conj_expr_if<ConjRhs,RhsMap>::type cjRhs(rhs);
@@ -51,6 +51,9 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con
     typedef Map<Matrix<ResScalar,Dynamic,1> > ResMap;
     ResMap res(_res,rows);
 
+    typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper;
+    typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper;
+
     for (Index pi=0; pi<size; pi+=PanelWidth)
     {
       Index actualPanelWidth = (std::min)(PanelWidth, size-pi);
@@ -68,19 +71,19 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con
       if (r>0)
       {
         Index s = IsLower ? pi+actualPanelWidth : 0;
-        general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjLhs,RhsScalar,ConjRhs,BuiltIn>::run(
+        general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs,BuiltIn>::run(
             r, actualPanelWidth,
-            &lhs.coeffRef(s,pi), lhsStride,
-            &rhs.coeffRef(pi), rhsIncr,
+            LhsMapper(&lhs.coeffRef(s,pi), lhsStride),
+            RhsMapper(&rhs.coeffRef(pi), rhsIncr),
             &res.coeffRef(s), resIncr, alpha);
       }
     }
     if((!IsLower) && cols>size)
     {
-      general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjLhs,RhsScalar,ConjRhs>::run(
+      general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs>::run(
           rows, cols-size,
-          &lhs.coeffRef(0,size), lhsStride,
-          &rhs.coeffRef(size), rhsIncr,
+          LhsMapper(&lhs.coeffRef(0,size), lhsStride),
+          RhsMapper(&rhs.coeffRef(size), rhsIncr),
           _res, resIncr, alpha);
     }
   }
@@ -88,7 +91,7 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con
 template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs,int Version>
 struct triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,RowMajor,Version>
 {
-  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
   enum {
     IsLower = ((Mode&Lower)==Lower),
     HasUnitDiag = (Mode & UnitDiag)==UnitDiag,
@@ -118,7 +121,10 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con
 
     typedef Map<Matrix<ResScalar,Dynamic,1>, 0, InnerStride<> > ResMap;
     ResMap res(_res,rows,InnerStride<>(resIncr));
-    
+
+    typedef const_blas_data_mapper<LhsScalar,Index,RowMajor> LhsMapper;
+    typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper;
+
     for (Index pi=0; pi<diagSize; pi+=PanelWidth)
     {
       Index actualPanelWidth = (std::min)(PanelWidth, diagSize-pi);
@@ -136,19 +142,19 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con
       if (r>0)
       {
         Index s = IsLower ? 0 : pi + actualPanelWidth;
-        general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjLhs,RhsScalar,ConjRhs,BuiltIn>::run(
+        general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs,BuiltIn>::run(
             actualPanelWidth, r,
-            &lhs.coeffRef(pi,s), lhsStride,
-            &rhs.coeffRef(s), rhsIncr,
+            LhsMapper(&lhs.coeffRef(pi,s), lhsStride),
+            RhsMapper(&rhs.coeffRef(s), rhsIncr),
             &res.coeffRef(pi), resIncr, alpha);
       }
     }
     if(IsLower && rows>diagSize)
     {
-      general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjLhs,RhsScalar,ConjRhs>::run(
+      general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs>::run(
             rows-diagSize, cols,
-            &lhs.coeffRef(diagSize,0), lhsStride,
-            &rhs.coeffRef(0), rhsIncr,
+            LhsMapper(&lhs.coeffRef(diagSize,0), lhsStride),
+            RhsMapper(&rhs.coeffRef(0), rhsIncr),
             &res.coeffRef(diagSize), resIncr, alpha);
     }
   }
@@ -157,83 +163,66 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con
 * Wrapper to product_triangular_vector
 ***************************************************************************/
 
-template<int Mode, bool LhsIsTriangular, typename Lhs, typename Rhs>
-struct traits<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,true> >
- : traits<ProductBase<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,true>, Lhs, Rhs> >
-{};
-
-template<int Mode, bool LhsIsTriangular, typename Lhs, typename Rhs>
-struct traits<TriangularProduct<Mode,LhsIsTriangular,Lhs,true,Rhs,false> >
- : traits<ProductBase<TriangularProduct<Mode,LhsIsTriangular,Lhs,true,Rhs,false>, Lhs, Rhs> >
-{};
-
-
-template<int StorageOrder>
+template<int Mode,int StorageOrder>
 struct trmv_selector;
 
 } // end namespace internal
 
+namespace internal {
+
 template<int Mode, typename Lhs, typename Rhs>
-struct TriangularProduct<Mode,true,Lhs,false,Rhs,true>
-  : public ProductBase<TriangularProduct<Mode,true,Lhs,false,Rhs,true>, Lhs, Rhs >
+struct triangular_product_impl<Mode,true,Lhs,false,Rhs,true>
 {
-  EIGEN_PRODUCT_PUBLIC_INTERFACE(TriangularProduct)
-
-  TriangularProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
-
-  template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
+  template<typename Dest> static void run(Dest& dst, const Lhs &lhs, const Rhs &rhs, const typename Dest::Scalar& alpha)
   {
-    eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());
+    eigen_assert(dst.rows()==lhs.rows() && dst.cols()==rhs.cols());
   
-    internal::trmv_selector<(int(internal::traits<Lhs>::Flags)&RowMajorBit) ? RowMajor : ColMajor>::run(*this, dst, alpha);
+    internal::trmv_selector<Mode,(int(internal::traits<Lhs>::Flags)&RowMajorBit) ? RowMajor : ColMajor>::run(lhs, rhs, dst, alpha);
   }
 };
 
 template<int Mode, typename Lhs, typename Rhs>
-struct TriangularProduct<Mode,false,Lhs,true,Rhs,false>
-  : public ProductBase<TriangularProduct<Mode,false,Lhs,true,Rhs,false>, Lhs, Rhs >
+struct triangular_product_impl<Mode,false,Lhs,true,Rhs,false>
 {
-  EIGEN_PRODUCT_PUBLIC_INTERFACE(TriangularProduct)
-
-  TriangularProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
-
-  template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
+  template<typename Dest> static void run(Dest& dst, const Lhs &lhs, const Rhs &rhs, const typename Dest::Scalar& alpha)
   {
-    eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());
+    eigen_assert(dst.rows()==lhs.rows() && dst.cols()==rhs.cols());
 
-    typedef TriangularProduct<(Mode & (UnitDiag|ZeroDiag)) | ((Mode & Lower) ? Upper : Lower),true,Transpose<const Rhs>,false,Transpose<const Lhs>,true> TriangularProductTranspose;
     Transpose<Dest> dstT(dst);
-    internal::trmv_selector<(int(internal::traits<Rhs>::Flags)&RowMajorBit) ? ColMajor : RowMajor>::run(
-      TriangularProductTranspose(m_rhs.transpose(),m_lhs.transpose()), dstT, alpha);
+    internal::trmv_selector<(Mode & (UnitDiag|ZeroDiag)) | ((Mode & Lower) ? Upper : Lower),
+                            (int(internal::traits<Rhs>::Flags)&RowMajorBit) ? ColMajor : RowMajor>
+            ::run(rhs.transpose(),lhs.transpose(), dstT, alpha);
   }
 };
 
+} // end namespace internal
+
 namespace internal {
 
 // TODO: find a way to factorize this piece of code with gemv_selector since the logic is exactly the same.
   
-template<> struct trmv_selector<ColMajor>
+template<int Mode> struct trmv_selector<Mode,ColMajor>
 {
-  template<int Mode, typename Lhs, typename Rhs, typename Dest>
-  static void run(const TriangularProduct<Mode,true,Lhs,false,Rhs,true>& prod, Dest& dest, const typename TriangularProduct<Mode,true,Lhs,false,Rhs,true>::Scalar& alpha)
+  template<typename Lhs, typename Rhs, typename Dest>
+  static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
   {
-    typedef TriangularProduct<Mode,true,Lhs,false,Rhs,true> ProductType;
-    typedef typename ProductType::Index Index;
-    typedef typename ProductType::LhsScalar   LhsScalar;
-    typedef typename ProductType::RhsScalar   RhsScalar;
-    typedef typename ProductType::Scalar      ResScalar;
-    typedef typename ProductType::RealScalar  RealScalar;
-    typedef typename ProductType::ActualLhsType ActualLhsType;
-    typedef typename ProductType::ActualRhsType ActualRhsType;
-    typedef typename ProductType::LhsBlasTraits LhsBlasTraits;
-    typedef typename ProductType::RhsBlasTraits RhsBlasTraits;
-    typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
-
-    typename internal::add_const_on_value_type<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(prod.lhs());
-    typename internal::add_const_on_value_type<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(prod.rhs());
-
-    ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs())
-                                  * RhsBlasTraits::extractScalarFactor(prod.rhs());
+    typedef typename Lhs::Scalar      LhsScalar;
+    typedef typename Rhs::Scalar      RhsScalar;
+    typedef typename Dest::Scalar     ResScalar;
+    typedef typename Dest::RealScalar RealScalar;
+    
+    typedef internal::blas_traits<Lhs> LhsBlasTraits;
+    typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+    typedef internal::blas_traits<Rhs> RhsBlasTraits;
+    typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+    
+    typedef Map<Matrix<ResScalar,Dynamic,1>, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits<ResScalar>::size)> MappedDest;
+
+    typename internal::add_const_on_value_type<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(lhs);
+    typename internal::add_const_on_value_type<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(rhs);
+
+    ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs)
+                                  * RhsBlasTraits::extractScalarFactor(rhs);
 
     enum {
       // FIXME find a way to allow an inner stride on the result if packet_traits<Scalar>::size==1
@@ -247,7 +236,7 @@ template<> struct trmv_selector<ColMajor>
 
     bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0));
     bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible;
-    
+
     RhsScalar compatibleAlpha = get_factor<ResScalar,RhsScalar>::run(actualAlpha);
 
     ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(),
@@ -267,7 +256,7 @@ template<> struct trmv_selector<ColMajor>
       else
         MappedDest(actualDestPtr, dest.size()) = dest;
     }
-    
+
     internal::triangular_matrix_vector_product
       <Index,Mode,
        LhsScalar, LhsBlasTraits::NeedToConjugate,
@@ -288,33 +277,32 @@ template<> struct trmv_selector<ColMajor>
   }
 };
 
-template<> struct trmv_selector<RowMajor>
+template<int Mode> struct trmv_selector<Mode,RowMajor>
 {
-  template<int Mode, typename Lhs, typename Rhs, typename Dest>
-  static void run(const TriangularProduct<Mode,true,Lhs,false,Rhs,true>& prod, Dest& dest, const typename TriangularProduct<Mode,true,Lhs,false,Rhs,true>::Scalar& alpha)
+  template<typename Lhs, typename Rhs, typename Dest>
+  static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
   {
-    typedef TriangularProduct<Mode,true,Lhs,false,Rhs,true> ProductType;
-    typedef typename ProductType::LhsScalar LhsScalar;
-    typedef typename ProductType::RhsScalar RhsScalar;
-    typedef typename ProductType::Scalar    ResScalar;
-    typedef typename ProductType::Index Index;
-    typedef typename ProductType::ActualLhsType ActualLhsType;
-    typedef typename ProductType::ActualRhsType ActualRhsType;
-    typedef typename ProductType::_ActualRhsType _ActualRhsType;
-    typedef typename ProductType::LhsBlasTraits LhsBlasTraits;
-    typedef typename ProductType::RhsBlasTraits RhsBlasTraits;
-
-    typename add_const<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(prod.lhs());
-    typename add_const<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(prod.rhs());
-
-    ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs())
-                                  * RhsBlasTraits::extractScalarFactor(prod.rhs());
+    typedef typename Lhs::Scalar      LhsScalar;
+    typedef typename Rhs::Scalar      RhsScalar;
+    typedef typename Dest::Scalar     ResScalar;
+    
+    typedef internal::blas_traits<Lhs> LhsBlasTraits;
+    typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+    typedef internal::blas_traits<Rhs> RhsBlasTraits;
+    typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+    typedef typename internal::remove_all<ActualRhsType>::type ActualRhsTypeCleaned;
+
+    typename add_const<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(lhs);
+    typename add_const<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(rhs);
+
+    ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs)
+                                  * RhsBlasTraits::extractScalarFactor(rhs);
 
     enum {
-      DirectlyUseRhs = _ActualRhsType::InnerStrideAtCompileTime==1
+      DirectlyUseRhs = ActualRhsTypeCleaned::InnerStrideAtCompileTime==1
     };
 
-    gemv_static_vector_if<RhsScalar,_ActualRhsType::SizeAtCompileTime,_ActualRhsType::MaxSizeAtCompileTime,!DirectlyUseRhs> static_rhs;
+    gemv_static_vector_if<RhsScalar,ActualRhsTypeCleaned::SizeAtCompileTime,ActualRhsTypeCleaned::MaxSizeAtCompileTime,!DirectlyUseRhs> static_rhs;
 
     ei_declare_aligned_stack_constructed_variable(RhsScalar,actualRhsPtr,actualRhs.size(),
         DirectlyUseRhs ? const_cast<RhsScalar*>(actualRhs.data()) : static_rhs.data());
@@ -322,12 +310,12 @@ template<> struct trmv_selector<RowMajor>
     if(!DirectlyUseRhs)
     {
       #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-      int size = actualRhs.size();
+      Index size = actualRhs.size();
       EIGEN_DENSE_STORAGE_CTOR_PLUGIN
       #endif
-      Map<typename _ActualRhsType::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs;
+      Map<typename ActualRhsTypeCleaned::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs;
     }
-    
+
     internal::triangular_matrix_vector_product
       <Index,Mode,
        LhsScalar, LhsBlasTraits::NeedToConjugate,
diff --git a/Eigen/src/Core/products/TriangularMatrixVector_MKL.h b/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h
index 09f110da7..07bf26ce5 100644
--- a/Eigen/src/Core/products/TriangularMatrixVector_MKL.h
+++ b/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h
@@ -25,13 +25,13 @@
  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
  ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
+ *   Content : Eigen bindings to BLAS F77
  *   Triangular matrix-vector product functionality based on ?TRMV.
  ********************************************************************************
 */
 
-#ifndef EIGEN_TRIANGULAR_MATRIX_VECTOR_MKL_H
-#define EIGEN_TRIANGULAR_MATRIX_VECTOR_MKL_H
+#ifndef EIGEN_TRIANGULAR_MATRIX_VECTOR_BLAS_H
+#define EIGEN_TRIANGULAR_MATRIX_VECTOR_BLAS_H
 
 namespace Eigen { 
 
@@ -47,7 +47,7 @@ template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename Rh
 struct triangular_matrix_vector_product_trmv :
   triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,StorageOrder,BuiltIn> {};
 
-#define EIGEN_MKL_TRMV_SPECIALIZE(Scalar) \
+#define EIGEN_BLAS_TRMV_SPECIALIZE(Scalar) \
 template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
 struct triangular_matrix_vector_product<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs,ColMajor,Specialized> { \
  static void run(Index _rows, Index _cols, const Scalar* _lhs, Index lhsStride, \
@@ -65,13 +65,13 @@ struct triangular_matrix_vector_product<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs
   } \
 };
 
-EIGEN_MKL_TRMV_SPECIALIZE(double)
-EIGEN_MKL_TRMV_SPECIALIZE(float)
-EIGEN_MKL_TRMV_SPECIALIZE(dcomplex)
-EIGEN_MKL_TRMV_SPECIALIZE(scomplex)
+EIGEN_BLAS_TRMV_SPECIALIZE(double)
+EIGEN_BLAS_TRMV_SPECIALIZE(float)
+EIGEN_BLAS_TRMV_SPECIALIZE(dcomplex)
+EIGEN_BLAS_TRMV_SPECIALIZE(scomplex)
 
 // implements col-major: res += alpha * op(triangular) * vector
-#define EIGEN_MKL_TRMV_CM(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
+#define EIGEN_BLAS_TRMV_CM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
 template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
 struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,ColMajor> { \
   enum { \
@@ -105,17 +105,15 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
 /* Square part handling */\
 \
    char trans, uplo, diag; \
-   MKL_INT m, n, lda, incx, incy; \
+   BlasIndex m, n, lda, incx, incy; \
    EIGTYPE const *a; \
-   MKLTYPE alpha_, beta_; \
-   assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); \
-   assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(beta_, EIGTYPE(1)); \
+   EIGTYPE beta(1); \
 \
 /* Set m, n */ \
-   n = (MKL_INT)size; \
-   lda = lhsStride; \
+   n = convert_index<BlasIndex>(size); \
+   lda = convert_index<BlasIndex>(lhsStride); \
    incx = 1; \
-   incy = resIncr; \
+   incy = convert_index<BlasIndex>(resIncr); \
 \
 /* Set uplo, trans and diag*/ \
    trans = 'N'; \
@@ -123,40 +121,39 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
    diag = IsUnitDiag ? 'U' : 'N'; \
 \
 /* call ?TRMV*/ \
-   MKLPREFIX##trmv(&uplo, &trans, &diag, &n, (const MKLTYPE*)_lhs, &lda, (MKLTYPE*)x, &incx); \
+   BLASPREFIX##trmv_(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \
 \
 /* Add op(a_tr)rhs into res*/ \
-   MKLPREFIX##axpy(&n, &alpha_,(const MKLTYPE*)x, &incx, (MKLTYPE*)_res, &incy); \
-/* Non-square case - doesn't fit to MKL ?TRMV. Fall to default triangular product*/ \
+   BLASPREFIX##axpy_(&n, &numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \
+/* Non-square case - doesn't fit to BLAS ?TRMV. Fall to default triangular product*/ \
    if (size<(std::max)(rows,cols)) { \
-     typedef Matrix<EIGTYPE, Dynamic, Dynamic> MatrixLhs; \
      if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
      x = x_tmp.data(); \
      if (size<rows) { \
        y = _res + size*resIncr; \
        a = _lhs + size; \
-       m = rows-size; \
-       n = size; \
+       m = convert_index<BlasIndex>(rows-size); \
+       n = convert_index<BlasIndex>(size); \
      } \
      else { \
        x += size; \
        y = _res; \
        a = _lhs + size*lda; \
-       m = size; \
-       n = cols-size; \
+       m = convert_index<BlasIndex>(size); \
+       n = convert_index<BlasIndex>(cols-size); \
      } \
-     MKLPREFIX##gemv(&trans, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)x, &incx, &beta_, (MKLTYPE*)y, &incy); \
+     BLASPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, &numext::real_ref(beta), (BLASTYPE*)y, &incy); \
    } \
   } \
 };
 
-EIGEN_MKL_TRMV_CM(double, double, d, d)
-EIGEN_MKL_TRMV_CM(dcomplex, MKL_Complex16, cd, z)
-EIGEN_MKL_TRMV_CM(float, float, f, s)
-EIGEN_MKL_TRMV_CM(scomplex, MKL_Complex8, cf, c)
+EIGEN_BLAS_TRMV_CM(double,   double, d,  d)
+EIGEN_BLAS_TRMV_CM(dcomplex, double, cd, z)
+EIGEN_BLAS_TRMV_CM(float,    float,  f,  s)
+EIGEN_BLAS_TRMV_CM(scomplex, float,  cf, c)
 
 // implements row-major: res += alpha * op(triangular) * vector
-#define EIGEN_MKL_TRMV_RM(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
+#define EIGEN_BLAS_TRMV_RM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
 template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
 struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,RowMajor> { \
   enum { \
@@ -190,17 +187,15 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
 /* Square part handling */\
 \
    char trans, uplo, diag; \
-   MKL_INT m, n, lda, incx, incy; \
+   BlasIndex m, n, lda, incx, incy; \
    EIGTYPE const *a; \
-   MKLTYPE alpha_, beta_; \
-   assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); \
-   assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(beta_, EIGTYPE(1)); \
+   EIGTYPE beta(1); \
 \
 /* Set m, n */ \
-   n = (MKL_INT)size; \
-   lda = lhsStride; \
+   n = convert_index<BlasIndex>(size); \
+   lda = convert_index<BlasIndex>(lhsStride); \
    incx = 1; \
-   incy = resIncr; \
+   incy = convert_index<BlasIndex>(resIncr); \
 \
 /* Set uplo, trans and diag*/ \
    trans = ConjLhs ? 'C' : 'T'; \
@@ -208,40 +203,39 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
    diag = IsUnitDiag ? 'U' : 'N'; \
 \
 /* call ?TRMV*/ \
-   MKLPREFIX##trmv(&uplo, &trans, &diag, &n, (const MKLTYPE*)_lhs, &lda, (MKLTYPE*)x, &incx); \
+   BLASPREFIX##trmv_(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \
 \
 /* Add op(a_tr)rhs into res*/ \
-   MKLPREFIX##axpy(&n, &alpha_,(const MKLTYPE*)x, &incx, (MKLTYPE*)_res, &incy); \
-/* Non-square case - doesn't fit to MKL ?TRMV. Fall to default triangular product*/ \
+   BLASPREFIX##axpy_(&n, &numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \
+/* Non-square case - doesn't fit to BLAS ?TRMV. Fall to default triangular product*/ \
    if (size<(std::max)(rows,cols)) { \
-     typedef Matrix<EIGTYPE, Dynamic, Dynamic> MatrixLhs; \
      if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
      x = x_tmp.data(); \
      if (size<rows) { \
        y = _res + size*resIncr; \
        a = _lhs + size*lda; \
-       m = rows-size; \
-       n = size; \
+       m = convert_index<BlasIndex>(rows-size); \
+       n = convert_index<BlasIndex>(size); \
      } \
      else { \
        x += size; \
        y = _res; \
        a = _lhs + size; \
-       m = size; \
-       n = cols-size; \
+       m = convert_index<BlasIndex>(size); \
+       n = convert_index<BlasIndex>(cols-size); \
      } \
-     MKLPREFIX##gemv(&trans, &n, &m, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)x, &incx, &beta_, (MKLTYPE*)y, &incy); \
+     BLASPREFIX##gemv_(&trans, &n, &m, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, &numext::real_ref(beta), (BLASTYPE*)y, &incy); \
    } \
   } \
 };
 
-EIGEN_MKL_TRMV_RM(double, double, d, d)
-EIGEN_MKL_TRMV_RM(dcomplex, MKL_Complex16, cd, z)
-EIGEN_MKL_TRMV_RM(float, float, f, s)
-EIGEN_MKL_TRMV_RM(scomplex, MKL_Complex8, cf, c)
+EIGEN_BLAS_TRMV_RM(double,   double, d,  d)
+EIGEN_BLAS_TRMV_RM(dcomplex, double, cd, z)
+EIGEN_BLAS_TRMV_RM(float,    float,  f,  s)
+EIGEN_BLAS_TRMV_RM(scomplex, float,  cf, c)
 
 } // end namespase internal
 
 } // end namespace Eigen
 
-#endif // EIGEN_TRIANGULAR_MATRIX_VECTOR_MKL_H
+#endif // EIGEN_TRIANGULAR_MATRIX_VECTOR_BLAS_H
diff --git a/Eigen/src/Core/products/TriangularSolverMatrix.h b/Eigen/src/Core/products/TriangularSolverMatrix.h
index f103eae72..223c38b86 100644
--- a/Eigen/src/Core/products/TriangularSolverMatrix.h
+++ b/Eigen/src/Core/products/TriangularSolverMatrix.h
@@ -52,10 +52,14 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
     level3_blocking<Scalar,Scalar>& blocking)
   {
     Index cols = otherSize;
-    const_blas_data_mapper<Scalar, Index, TriStorageOrder> tri(_tri,triStride);
-    blas_data_mapper<Scalar, Index, ColMajor> other(_other,otherStride);
+
+    typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> TriMapper;
+    typedef blas_data_mapper<Scalar, Index, ColMajor> OtherMapper;
+    TriMapper tri(_tri, triStride);
+    OtherMapper other(_other, otherStride);
 
     typedef gebp_traits<Scalar,Scalar> Traits;
+
     enum {
       SmallPanelWidth   = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
       IsLower = (Mode&Lower) == Lower
@@ -66,22 +70,20 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
 
     std::size_t sizeA = kc*mc;
     std::size_t sizeB = kc*cols;
-    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
 
     ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
     ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
-    ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW());
 
     conj_if<Conjugate> conj;
-    gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel;
-    gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, TriStorageOrder> pack_lhs;
-    gemm_pack_rhs<Scalar, Index, Traits::nr, ColMajor, false, true> pack_rhs;
+    gebp_kernel<Scalar, Scalar, Index, OtherMapper, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel;
+    gemm_pack_lhs<Scalar, Index, TriMapper, Traits::mr, Traits::LhsProgress, TriStorageOrder> pack_lhs;
+    gemm_pack_rhs<Scalar, Index, OtherMapper, Traits::nr, ColMajor, false, true> pack_rhs;
 
     // the goal here is to subdivise the Rhs panels such that we keep some cache
     // coherence when accessing the rhs elements
-    std::ptrdiff_t l1, l2;
-    manage_caching_sizes(GetAction, &l1, &l2);
-    Index subcols = cols>0 ? l2/(4 * sizeof(Scalar) * otherStride) : 0;
+    std::ptrdiff_t l1, l2, l3;
+    manage_caching_sizes(GetAction, &l1, &l2, &l3);
+    Index subcols = cols>0 ? l2/(4 * sizeof(Scalar) * std::max<Index>(otherStride,size)) : 0;
     subcols = std::max<Index>((subcols/Traits::nr)*Traits::nr, Traits::nr);
 
     for(Index k2=IsLower ? 0 : size;
@@ -115,8 +117,9 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
           {
             // TODO write a small kernel handling this (can be shared with trsv)
             Index i  = IsLower ? k2+k1+k : k2-k1-k-1;
-            Index s  = IsLower ? k2+k1 : i+1;
             Index rs = actualPanelWidth - k - 1; // remaining size
+            Index s  = TriStorageOrder==RowMajor ? (IsLower ? k2+k1 : i+1)
+                                                 :  IsLower ? i+1 : i-rs;
 
             Scalar a = (Mode & UnitDiag) ? Scalar(1) : Scalar(1)/conj(tri(i,i));
             for (Index j=j2; j<j2+actual_cols; ++j)
@@ -133,7 +136,6 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
               }
               else
               {
-                Index s = IsLower ? i+1 : i-rs;
                 Scalar b = (other(i,j) *= a);
                 Scalar* r = &other(s,j);
                 const Scalar* l = &tri(s,i);
@@ -148,17 +150,17 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
           Index blockBOffset = IsLower ? k1 : lengthTarget;
 
           // update the respective rows of B from other
-          pack_rhs(blockB+actual_kc*j2, &other(startBlock,j2), otherStride, actualPanelWidth, actual_cols, actual_kc, blockBOffset);
+          pack_rhs(blockB+actual_kc*j2, other.getSubMapper(startBlock,j2), actualPanelWidth, actual_cols, actual_kc, blockBOffset);
 
           // GEBP
           if (lengthTarget>0)
           {
             Index startTarget  = IsLower ? k2+k1+actualPanelWidth : k2-actual_kc;
 
-            pack_lhs(blockA, &tri(startTarget,startBlock), triStride, actualPanelWidth, lengthTarget);
+            pack_lhs(blockA, tri.getSubMapper(startTarget,startBlock), actualPanelWidth, lengthTarget);
 
-            gebp_kernel(&other(startTarget,j2), otherStride, blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1),
-                        actualPanelWidth, actual_kc, 0, blockBOffset, blockW);
+            gebp_kernel(other.getSubMapper(startTarget,j2), blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1),
+                        actualPanelWidth, actual_kc, 0, blockBOffset);
           }
         }
       }
@@ -172,16 +174,16 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
           const Index actual_mc = (std::min)(mc,end-i2);
           if (actual_mc>0)
           {
-            pack_lhs(blockA, &tri(i2, IsLower ? k2 : k2-kc), triStride, actual_kc, actual_mc);
+            pack_lhs(blockA, tri.getSubMapper(i2, IsLower ? k2 : k2-kc), actual_kc, actual_mc);
 
-            gebp_kernel(_other+i2, otherStride, blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0, blockW);
+            gebp_kernel(other.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0);
           }
         }
       }
     }
   }
 
-/* Optimized triangular solver with multiple left hand sides and the trinagular matrix on the right
+/* Optimized triangular solver with multiple left hand sides and the triangular matrix on the right
  */
 template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder>
 struct triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor>
@@ -200,8 +202,12 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
     level3_blocking<Scalar,Scalar>& blocking)
   {
     Index rows = otherSize;
-    const_blas_data_mapper<Scalar, Index, TriStorageOrder> rhs(_tri,triStride);
-    blas_data_mapper<Scalar, Index, ColMajor> lhs(_other,otherStride);
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+
+    typedef blas_data_mapper<Scalar, Index, ColMajor> LhsMapper;
+    typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> RhsMapper;
+    LhsMapper lhs(_other, otherStride);
+    RhsMapper rhs(_tri, triStride);
 
     typedef gebp_traits<Scalar,Scalar> Traits;
     enum {
@@ -215,17 +221,15 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
 
     std::size_t sizeA = kc*mc;
     std::size_t sizeB = kc*size;
-    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
 
     ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
     ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
-    ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW());
 
     conj_if<Conjugate> conj;
-    gebp_kernel<Scalar,Scalar, Index, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel;
-    gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
-    gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;
-    gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, ColMajor, false, true> pack_lhs_panel;
+    gebp_kernel<Scalar, Scalar, Index, LhsMapper, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel;
+    gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
+    gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder,false,true> pack_rhs_panel;
+    gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, ColMajor, false, true> pack_lhs_panel;
 
     for(Index k2=IsLower ? size : 0;
         IsLower ? k2>0 : k2<size;
@@ -238,7 +242,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
       Index rs = IsLower ? actual_k2 : size - actual_k2 - actual_kc;
       Scalar* geb = blockB+actual_kc*actual_kc;
 
-      if (rs>0) pack_rhs(geb, &rhs(actual_k2,startPanel), triStride, actual_kc, rs);
+      if (rs>0) pack_rhs(geb, rhs.getSubMapper(actual_k2,startPanel), actual_kc, rs);
 
       // triangular packing (we only pack the panels off the diagonal,
       // neglecting the blocks overlapping the diagonal
@@ -252,7 +256,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
 
           if (panelLength>0)
           pack_rhs_panel(blockB+j2*actual_kc,
-                         &rhs(actual_k2+panelOffset, actual_j2), triStride,
+                         rhs.getSubMapper(actual_k2+panelOffset, actual_j2),
                          panelLength, actualPanelWidth,
                          actual_kc, panelOffset);
         }
@@ -280,13 +284,12 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
             // GEBP
             if(panelLength>0)
             {
-              gebp_kernel(&lhs(i2,absolute_j2), otherStride,
+              gebp_kernel(lhs.getSubMapper(i2,absolute_j2),
                           blockA, blockB+j2*actual_kc,
                           actual_mc, panelLength, actualPanelWidth,
                           Scalar(-1),
                           actual_kc, actual_kc, // strides
-                          panelOffset, panelOffset, // offsets
-                          blockW);  // workspace
+                          panelOffset, panelOffset); // offsets
             }
 
             // unblocked triangular solve
@@ -302,22 +305,25 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
                 for (Index i=0; i<actual_mc; ++i)
                   r[i] -= a[i] * b;
               }
-              Scalar b = (Mode & UnitDiag) ? Scalar(1) : Scalar(1)/conj(rhs(j,j));
-              for (Index i=0; i<actual_mc; ++i)
-                r[i] *= b;
+              if((Mode & UnitDiag)==0)
+              {
+                Scalar inv_rjj = RealScalar(1)/conj(rhs(j,j));
+                for (Index i=0; i<actual_mc; ++i)
+                  r[i] *= inv_rjj;
+              }
             }
 
             // pack the just computed part of lhs to A
-            pack_lhs_panel(blockA, _other+absolute_j2*otherStride+i2, otherStride,
+            pack_lhs_panel(blockA, LhsMapper(_other+absolute_j2*otherStride+i2, otherStride),
                            actualPanelWidth, actual_mc,
                            actual_kc, j2);
           }
         }
 
         if (rs>0)
-          gebp_kernel(_other+i2+startPanel*otherStride, otherStride, blockA, geb,
+          gebp_kernel(lhs.getSubMapper(i2, startPanel), blockA, geb,
                       actual_mc, actual_kc, rs, Scalar(-1),
-                      -1, -1, 0, 0, blockW);
+                      -1, -1, 0, 0);
       }
     }
   }
diff --git a/Eigen/src/Core/products/TriangularSolverMatrix_MKL.h b/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h
index 6a0bb8339..88c0fb794 100644
--- a/Eigen/src/Core/products/TriangularSolverMatrix_MKL.h
+++ b/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h
@@ -25,20 +25,20 @@
  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
  ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
+ *   Content : Eigen bindings to BLAS F77
  *   Triangular matrix * matrix product functionality based on ?TRMM.
  ********************************************************************************
 */
 
-#ifndef EIGEN_TRIANGULAR_SOLVER_MATRIX_MKL_H
-#define EIGEN_TRIANGULAR_SOLVER_MATRIX_MKL_H
+#ifndef EIGEN_TRIANGULAR_SOLVER_MATRIX_BLAS_H
+#define EIGEN_TRIANGULAR_SOLVER_MATRIX_BLAS_H
 
 namespace Eigen {
 
 namespace internal {
 
 // implements LeftSide op(triangular)^-1 * general
-#define EIGEN_MKL_TRSM_L(EIGTYPE, MKLTYPE, MKLPREFIX) \
+#define EIGEN_BLAS_TRSM_L(EIGTYPE, BLASTYPE, BLASPREFIX) \
 template <typename Index, int Mode, bool Conjugate, int TriStorageOrder> \
 struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor> \
 { \
@@ -53,13 +53,11 @@ struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorage
       const EIGTYPE* _tri, Index triStride, \
       EIGTYPE* _other, Index otherStride, level3_blocking<EIGTYPE,EIGTYPE>& /*blocking*/) \
   { \
-   MKL_INT m = size, n = otherSize, lda, ldb; \
+   BlasIndex m = convert_index<BlasIndex>(size), n = convert_index<BlasIndex>(otherSize), lda, ldb; \
    char side = 'L', uplo, diag='N', transa; \
    /* Set alpha_ */ \
-   MKLTYPE alpha; \
-   EIGTYPE myone(1); \
-   assign_scalar_eig2mkl(alpha, myone); \
-   ldb = otherStride;\
+   EIGTYPE alpha(1); \
+   ldb = convert_index<BlasIndex>(otherStride);\
 \
    const EIGTYPE *a; \
 /* Set trans */ \
@@ -75,25 +73,25 @@ struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorage
    if (conjA) { \
      a_tmp = tri.conjugate(); \
      a = a_tmp.data(); \
-     lda = a_tmp.outerStride(); \
+     lda = convert_index<BlasIndex>(a_tmp.outerStride()); \
    } else { \
      a = _tri; \
-     lda = triStride; \
+     lda = convert_index<BlasIndex>(triStride); \
    } \
    if (IsUnitDiag) diag='U'; \
 /* call ?trsm*/ \
-   MKLPREFIX##trsm(&side, &uplo, &transa, &diag, &m, &n, &alpha, (const MKLTYPE*)a, &lda, (MKLTYPE*)_other, &ldb); \
+   BLASPREFIX##trsm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \
  } \
 };
 
-EIGEN_MKL_TRSM_L(double, double, d)
-EIGEN_MKL_TRSM_L(dcomplex, MKL_Complex16, z)
-EIGEN_MKL_TRSM_L(float, float, s)
-EIGEN_MKL_TRSM_L(scomplex, MKL_Complex8, c)
+EIGEN_BLAS_TRSM_L(double,   double, d)
+EIGEN_BLAS_TRSM_L(dcomplex, double, z)
+EIGEN_BLAS_TRSM_L(float,    float,  s)
+EIGEN_BLAS_TRSM_L(scomplex, float,  c)
 
 
 // implements RightSide general * op(triangular)^-1
-#define EIGEN_MKL_TRSM_R(EIGTYPE, MKLTYPE, MKLPREFIX) \
+#define EIGEN_BLAS_TRSM_R(EIGTYPE, BLASTYPE, BLASPREFIX) \
 template <typename Index, int Mode, bool Conjugate, int TriStorageOrder> \
 struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor> \
 { \
@@ -108,13 +106,11 @@ struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorag
       const EIGTYPE* _tri, Index triStride, \
       EIGTYPE* _other, Index otherStride, level3_blocking<EIGTYPE,EIGTYPE>& /*blocking*/) \
   { \
-   MKL_INT m = otherSize, n = size, lda, ldb; \
+   BlasIndex m = convert_index<BlasIndex>(otherSize), n = convert_index<BlasIndex>(size), lda, ldb; \
    char side = 'R', uplo, diag='N', transa; \
    /* Set alpha_ */ \
-   MKLTYPE alpha; \
-   EIGTYPE myone(1); \
-   assign_scalar_eig2mkl(alpha, myone); \
-   ldb = otherStride;\
+   EIGTYPE alpha(1); \
+   ldb = convert_index<BlasIndex>(otherStride);\
 \
    const EIGTYPE *a; \
 /* Set trans */ \
@@ -130,26 +126,26 @@ struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorag
    if (conjA) { \
      a_tmp = tri.conjugate(); \
      a = a_tmp.data(); \
-     lda = a_tmp.outerStride(); \
+     lda = convert_index<BlasIndex>(a_tmp.outerStride()); \
    } else { \
      a = _tri; \
-     lda = triStride; \
+     lda = convert_index<BlasIndex>(triStride); \
    } \
    if (IsUnitDiag) diag='U'; \
 /* call ?trsm*/ \
-   MKLPREFIX##trsm(&side, &uplo, &transa, &diag, &m, &n, &alpha, (const MKLTYPE*)a, &lda, (MKLTYPE*)_other, &ldb); \
+   BLASPREFIX##trsm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \
    /*std::cout << "TRMS_L specialization!\n";*/ \
  } \
 };
 
-EIGEN_MKL_TRSM_R(double, double, d)
-EIGEN_MKL_TRSM_R(dcomplex, MKL_Complex16, z)
-EIGEN_MKL_TRSM_R(float, float, s)
-EIGEN_MKL_TRSM_R(scomplex, MKL_Complex8, c)
+EIGEN_BLAS_TRSM_R(double,   double, d)
+EIGEN_BLAS_TRSM_R(dcomplex, double, z)
+EIGEN_BLAS_TRSM_R(float,    float,  s)
+EIGEN_BLAS_TRSM_R(scomplex, float,  c)
 
 
 } // end namespace internal
 
 } // end namespace Eigen
 
-#endif // EIGEN_TRIANGULAR_SOLVER_MATRIX_MKL_H
+#endif // EIGEN_TRIANGULAR_SOLVER_MATRIX_BLAS_H
diff --git a/Eigen/src/Core/products/TriangularSolverVector.h b/Eigen/src/Core/products/TriangularSolverVector.h
index ce4d10088..b994759b2 100644
--- a/Eigen/src/Core/products/TriangularSolverVector.h
+++ b/Eigen/src/Core/products/TriangularSolverVector.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_TRIANGULAR_SOLVER_VECTOR_H
 #define EIGEN_TRIANGULAR_SOLVER_VECTOR_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
 
@@ -25,7 +25,7 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheRight, Mode, Co
       >::run(size, _lhs, lhsStride, rhs);
   }
 };
-    
+
 // forward and backward substitution, row-major, rhs is a vector
 template<typename LhsScalar, typename RhsScalar, typename Index, int Mode, bool Conjugate>
 struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Conjugate, RowMajor>
@@ -37,6 +37,10 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
   {
     typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,RowMajor>, 0, OuterStride<> > LhsMap;
     const LhsMap lhs(_lhs,size,size,OuterStride<>(lhsStride));
+
+    typedef const_blas_data_mapper<LhsScalar,Index,RowMajor> LhsMapper;
+    typedef const_blas_data_mapper<RhsScalar,Index,ColMajor> RhsMapper;
+
     typename internal::conditional<
                           Conjugate,
                           const CwiseUnaryOp<typename internal::scalar_conjugate_op<LhsScalar>,LhsMap>,
@@ -58,10 +62,10 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
         Index startRow = IsLower ? pi : pi-actualPanelWidth;
         Index startCol = IsLower ? 0 : pi;
 
-        general_matrix_vector_product<Index,LhsScalar,RowMajor,Conjugate,RhsScalar,false>::run(
+        general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,Conjugate,RhsScalar,RhsMapper,false>::run(
           actualPanelWidth, r,
-          &lhs.coeffRef(startRow,startCol), lhsStride,
-          rhs + startCol, 1,
+          LhsMapper(&lhs.coeffRef(startRow,startCol), lhsStride),
+          RhsMapper(rhs + startCol, 1),
           rhs + startRow, 1,
           RhsScalar(-1));
       }
@@ -72,7 +76,7 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
         Index s = IsLower ? pi   : i+1;
         if (k>0)
           rhs[i] -= (cjLhs.row(i).segment(s,k).transpose().cwiseProduct(Map<const Matrix<RhsScalar,Dynamic,1> >(rhs+s,k))).sum();
-        
+
         if(!(Mode & UnitDiag))
           rhs[i] /= cjLhs(i,i);
       }
@@ -91,6 +95,8 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
   {
     typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> > LhsMap;
     const LhsMap lhs(_lhs,size,size,OuterStride<>(lhsStride));
+    typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper;
+    typedef const_blas_data_mapper<RhsScalar,Index,ColMajor> RhsMapper;
     typename internal::conditional<Conjugate,
                                    const CwiseUnaryOp<typename internal::scalar_conjugate_op<LhsScalar>,LhsMap>,
                                    const LhsMap&
@@ -122,10 +128,10 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
         // let's directly call the low level product function because:
         // 1 - it is faster to compile
         // 2 - it is slighlty faster at runtime
-        general_matrix_vector_product<Index,LhsScalar,ColMajor,Conjugate,RhsScalar,false>::run(
+        general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,Conjugate,RhsScalar,RhsMapper,false>::run(
             r, actualPanelWidth,
-            &lhs.coeffRef(endBlock,startBlock), lhsStride,
-            rhs+startBlock, 1,
+            LhsMapper(&lhs.coeffRef(endBlock,startBlock), lhsStride),
+            RhsMapper(rhs+startBlock, 1),
             rhs+endBlock, 1, RhsScalar(-1));
       }
     }
diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h
index a28f16fa0..6e6ee119b 100644..100755
--- a/Eigen/src/Core/util/BlasUtil.h
+++ b/Eigen/src/Core/util/BlasUtil.h
@@ -18,13 +18,13 @@ namespace Eigen {
 namespace internal {
 
 // forward declarations
-template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs=false, bool ConjugateRhs=false>
+template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs=false, bool ConjugateRhs=false>
 struct gebp_kernel;
 
-template<typename Scalar, typename Index, int nr, int StorageOrder, bool Conjugate = false, bool PanelMode=false>
+template<typename Scalar, typename Index, typename DataMapper, int nr, int StorageOrder, bool Conjugate = false, bool PanelMode=false>
 struct gemm_pack_rhs;
 
-template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate = false, bool PanelMode = false>
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, int StorageOrder, bool Conjugate = false, bool PanelMode = false>
 struct gemm_pack_lhs;
 
 template<
@@ -34,7 +34,9 @@ template<
   int ResStorageOrder>
 struct general_matrix_matrix_product;
 
-template<typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version=Specialized>
+template<typename Index,
+         typename LhsScalar, typename LhsMapper, int LhsStorageOrder, bool ConjugateLhs,
+         typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version=Specialized>
 struct general_matrix_vector_product;
 
 
@@ -42,22 +44,35 @@ template<bool Conjugate> struct conj_if;
 
 template<> struct conj_if<true> {
   template<typename T>
-  inline T operator()(const T& x) { return numext::conj(x); }
+  inline T operator()(const T& x) const { return numext::conj(x); }
   template<typename T>
-  inline T pconj(const T& x) { return internal::pconj(x); }
+  inline T pconj(const T& x) const { return internal::pconj(x); }
 };
 
 template<> struct conj_if<false> {
   template<typename T>
-  inline const T& operator()(const T& x) { return x; }
+  inline const T& operator()(const T& x) const { return x; }
   template<typename T>
-  inline const T& pconj(const T& x) { return x; }
+  inline const T& pconj(const T& x) const { return x; }
+};
+
+// Generic implementation for custom complex types.
+template<typename LhsScalar, typename RhsScalar, bool ConjLhs, bool ConjRhs>
+struct conj_helper
+{
+  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar>::ReturnType Scalar;
+
+  EIGEN_STRONG_INLINE Scalar pmadd(const LhsScalar& x, const RhsScalar& y, const Scalar& c) const
+  { return padd(c, pmul(x,y)); }
+
+  EIGEN_STRONG_INLINE Scalar pmul(const LhsScalar& x, const RhsScalar& y) const
+  { return conj_if<ConjLhs>()(x) *  conj_if<ConjRhs>()(y); }
 };
 
 template<typename Scalar> struct conj_helper<Scalar,Scalar,false,false>
 {
-  EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const { return internal::pmadd(x,y,c); }
-  EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const { return internal::pmul(x,y); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const { return internal::pmadd(x,y,c); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const { return internal::pmul(x,y); }
 };
 
 template<typename RealScalar> struct conj_helper<std::complex<RealScalar>, std::complex<RealScalar>, false,true>
@@ -109,39 +124,142 @@ template<typename RealScalar,bool Conj> struct conj_helper<RealScalar, std::comp
 };
 
 template<typename From,typename To> struct get_factor {
-  static EIGEN_STRONG_INLINE To run(const From& x) { return x; }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE To run(const From& x) { return To(x); }
 };
 
 template<typename Scalar> struct get_factor<Scalar,typename NumTraits<Scalar>::Real> {
+  EIGEN_DEVICE_FUNC
   static EIGEN_STRONG_INLINE typename NumTraits<Scalar>::Real run(const Scalar& x) { return numext::real(x); }
 };
 
+
+template<typename Scalar, typename Index>
+class BlasVectorMapper {
+  public:
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasVectorMapper(Scalar *data) : m_data(data) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const {
+    return m_data[i];
+  }
+  template <typename Packet, int AlignmentType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet load(Index i) const {
+    return ploadt<Packet, AlignmentType>(m_data + i);
+  }
+
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC bool aligned(Index i) const {
+    return (UIntPtr(m_data+i)%sizeof(Packet))==0;
+  }
+
+  protected:
+  Scalar* m_data;
+};
+
+template<typename Scalar, typename Index, int AlignmentType>
+class BlasLinearMapper {
+  public:
+  typedef typename packet_traits<Scalar>::type Packet;
+  typedef typename packet_traits<Scalar>::half HalfPacket;
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data) : m_data(data) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(int i) const {
+    internal::prefetch(&operator()(i));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar& operator()(Index i) const {
+    return m_data[i];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const {
+    return ploadt<Packet, AlignmentType>(m_data + i);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const {
+    return ploadt<HalfPacket, AlignmentType>(m_data + i);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const Packet &p) const {
+    pstoret<Scalar, Packet, AlignmentType>(m_data + i, p);
+  }
+
+  protected:
+  Scalar *m_data;
+};
+
 // Lightweight helper class to access matrix coefficients.
-// Yes, this is somehow redundant with Map<>, but this version is much much lighter,
-// and so I hope better compilation performance (time and code quality).
-template<typename Scalar, typename Index, int StorageOrder>
-class blas_data_mapper
-{
+template<typename Scalar, typename Index, int StorageOrder, int AlignmentType = Unaligned>
+class blas_data_mapper {
   public:
-    blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {}
-    EIGEN_STRONG_INLINE Scalar& operator()(Index i, Index j)
-    { return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride]; }
+  typedef typename packet_traits<Scalar>::type Packet;
+  typedef typename packet_traits<Scalar>::half HalfPacket;
+
+  typedef BlasLinearMapper<Scalar, Index, AlignmentType> LinearMapper;
+  typedef BlasVectorMapper<Scalar, Index> VectorMapper;
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {}
+
+  EIGEN_DEVICE_FUNC  EIGEN_ALWAYS_INLINE blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType>
+  getSubMapper(Index i, Index j) const {
+    return blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType>(&operator()(i, j), m_stride);
+  }
+
+  EIGEN_DEVICE_FUNC  EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
+    return LinearMapper(&operator()(i, j));
+  }
+
+  EIGEN_DEVICE_FUNC  EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const {
+    return VectorMapper(&operator()(i, j));
+  }
+
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Scalar& operator()(Index i, Index j) const {
+    return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const {
+    return ploadt<Packet, AlignmentType>(&operator()(i, j));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i, Index j) const {
+    return ploadt<HalfPacket, AlignmentType>(&operator()(i, j));
+  }
+
+  template<typename SubPacket>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const {
+    pscatter<Scalar, SubPacket>(&operator()(i, j), p, m_stride);
+  }
+
+  template<typename SubPacket>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE SubPacket gatherPacket(Index i, Index j) const {
+    return pgather<Scalar, SubPacket>(&operator()(i, j), m_stride);
+  }
+
+  EIGEN_DEVICE_FUNC const Index stride() const { return m_stride; }
+  EIGEN_DEVICE_FUNC const Scalar* data() const { return m_data; }
+
+  EIGEN_DEVICE_FUNC Index firstAligned(Index size) const {
+    if (UIntPtr(m_data)%sizeof(Scalar)) {
+      return -1;
+    }
+    return internal::first_default_aligned(m_data, size);
+  }
+
   protected:
-    Scalar* EIGEN_RESTRICT m_data;
-    Index m_stride;
+  Scalar* EIGEN_RESTRICT m_data;
+  const Index m_stride;
 };
 
 // lightweight helper class to access matrix coefficients (const version)
 template<typename Scalar, typename Index, int StorageOrder>
-class const_blas_data_mapper
-{
+class const_blas_data_mapper : public blas_data_mapper<const Scalar, Index, StorageOrder> {
   public:
-    const_blas_data_mapper(const Scalar* data, Index stride) : m_data(data), m_stride(stride) {}
-    EIGEN_STRONG_INLINE const Scalar& operator()(Index i, Index j) const
-    { return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride]; }
-  protected:
-    const Scalar* EIGEN_RESTRICT m_data;
-    Index m_stride;
+  EIGEN_ALWAYS_INLINE const_blas_data_mapper(const Scalar *data, Index stride) : blas_data_mapper<const Scalar, Index, StorageOrder>(data, stride) {}
+
+  EIGEN_ALWAYS_INLINE const_blas_data_mapper<Scalar, Index, StorageOrder> getSubMapper(Index i, Index j) const {
+    return const_blas_data_mapper<Scalar, Index, StorageOrder>(&(this->operator()(i, j)), this->m_stride);
+  }
 };
 
 
@@ -188,17 +306,33 @@ struct blas_traits<CwiseUnaryOp<scalar_conjugate_op<Scalar>, NestedXpr> >
 };
 
 // pop scalar multiple
-template<typename Scalar, typename NestedXpr>
-struct blas_traits<CwiseUnaryOp<scalar_multiple_op<Scalar>, NestedXpr> >
+template<typename Scalar, typename NestedXpr, typename Plain>
+struct blas_traits<CwiseBinaryOp<scalar_product_op<Scalar>, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain>, NestedXpr> >
  : blas_traits<NestedXpr>
 {
   typedef blas_traits<NestedXpr> Base;
-  typedef CwiseUnaryOp<scalar_multiple_op<Scalar>, NestedXpr> XprType;
+  typedef CwiseBinaryOp<scalar_product_op<Scalar>, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain>, NestedXpr> XprType;
   typedef typename Base::ExtractType ExtractType;
-  static inline ExtractType extract(const XprType& x) { return Base::extract(x.nestedExpression()); }
+  static inline ExtractType extract(const XprType& x) { return Base::extract(x.rhs()); }
   static inline Scalar extractScalarFactor(const XprType& x)
-  { return x.functor().m_other * Base::extractScalarFactor(x.nestedExpression()); }
+  { return x.lhs().functor().m_other * Base::extractScalarFactor(x.rhs()); }
 };
+template<typename Scalar, typename NestedXpr, typename Plain>
+struct blas_traits<CwiseBinaryOp<scalar_product_op<Scalar>, NestedXpr, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain> > >
+ : blas_traits<NestedXpr>
+{
+  typedef blas_traits<NestedXpr> Base;
+  typedef CwiseBinaryOp<scalar_product_op<Scalar>, NestedXpr, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain> > XprType;
+  typedef typename Base::ExtractType ExtractType;
+  static inline ExtractType extract(const XprType& x) { return Base::extract(x.lhs()); }
+  static inline Scalar extractScalarFactor(const XprType& x)
+  { return Base::extractScalarFactor(x.lhs()) * x.rhs().functor().m_other; }
+};
+template<typename Scalar, typename Plain1, typename Plain2>
+struct blas_traits<CwiseBinaryOp<scalar_product_op<Scalar>, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain1>,
+                                                            const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain2> > >
+ : blas_traits<CwiseNullaryOp<scalar_constant_op<Scalar>,Plain1> >
+{};
 
 // pop opposite
 template<typename Scalar, typename NestedXpr>
@@ -230,7 +364,7 @@ struct blas_traits<Transpose<NestedXpr> >
   enum {
     IsTransposed = Base::IsTransposed ? 0 : 1
   };
-  static inline ExtractType extract(const XprType& x) { return Base::extract(x.nestedExpression()); }
+  static inline ExtractType extract(const XprType& x) { return ExtractType(Base::extract(x.nestedExpression())); }
   static inline Scalar extractScalarFactor(const XprType& x) { return Base::extractScalarFactor(x.nestedExpression()); }
 };
 
diff --git a/Eigen/src/Core/util/CMakeLists.txt b/Eigen/src/Core/util/CMakeLists.txt
deleted file mode 100644
index a1e2e521f..000000000
--- a/Eigen/src/Core/util/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_Core_util_SRCS "*.h")
-
-INSTALL(FILES 
-  ${Eigen_Core_util_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/util COMPONENT Devel
-  )
diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h
index 1e6277c4f..7587d6842 100644
--- a/Eigen/src/Core/util/Constants.h
+++ b/Eigen/src/Core/util/Constants.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2007-2009 Benoit Jacob <jacob.benoit.1@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -30,6 +30,14 @@ const int DynamicIndex = 0xffffff;
   */
 const int Infinity = -1;
 
+/** This value means that the cost to evaluate an expression coefficient is either very expensive or
+  * cannot be known at compile time.
+  *
+  * This value has to be positive to (1) simplify cost computation, and (2) allow to distinguish between a very expensive and very very expensive expressions.
+  * It thus must also be large enough to make sure unrolling won't happen and that sub expressions will be evaluated, but not too large to avoid overflow.
+  */
+const int HugeCost = 10000;
+
 /** \defgroup flags Flags
   * \ingroup Core_Module
   *
@@ -48,19 +56,19 @@ const int Infinity = -1;
   * for a matrix, this means that the storage order is row-major.
   * If this bit is not set, the storage order is column-major.
   * For an expression, this determines the storage order of
-  * the matrix created by evaluation of that expression. 
-  * \sa \ref TopicStorageOrders */
+  * the matrix created by evaluation of that expression.
+  * \sa \blank  \ref TopicStorageOrders */
 const unsigned int RowMajorBit = 0x1;
 
 /** \ingroup flags
-  *
   * means the expression should be evaluated by the calling expression */
 const unsigned int EvalBeforeNestingBit = 0x2;
 
 /** \ingroup flags
-  *
+  * \deprecated
   * means the expression should be evaluated before any assignment */
-const unsigned int EvalBeforeAssigningBit = 0x4;
+EIGEN_DEPRECATED
+const unsigned int EvalBeforeAssigningBit = 0x4; // FIXME deprecated
 
 /** \ingroup flags
   *
@@ -141,17 +149,46 @@ const unsigned int LvalueBit = 0x20;
   */
 const unsigned int DirectAccessBit = 0x40;
 
-/** \ingroup flags
+/** \deprecated \ingroup flags
   *
-  * means the first coefficient packet is guaranteed to be aligned */
-const unsigned int AlignedBit = 0x80;
+  * means the first coefficient packet is guaranteed to be aligned.
+  * An expression cannot has the AlignedBit without the PacketAccessBit flag.
+  * In other words, this means we are allow to perform an aligned packet access to the first element regardless
+  * of the expression kind:
+  * \code
+  * expression.packet<Aligned>(0);
+  * \endcode
+  */
+EIGEN_DEPRECATED const unsigned int AlignedBit = 0x80;
 
 const unsigned int NestByRefBit = 0x100;
 
+/** \ingroup flags
+  *
+  * for an expression, this means that the storage order
+  * can be either row-major or column-major.
+  * The precise choice will be decided at evaluation time or when
+  * combined with other expressions.
+  * \sa \blank  \ref RowMajorBit, \ref TopicStorageOrders */
+const unsigned int NoPreferredStorageOrderBit = 0x200;
+
+/** \ingroup flags
+  *
+  * Means that the underlying coefficients can be accessed through pointers to the sparse (un)compressed storage format,
+  * that is, the expression provides:
+  * \code
+    inline const Scalar* valuePtr() const;
+    inline const Index* innerIndexPtr() const;
+    inline const Index* outerIndexPtr() const;
+    inline const Index* innerNonZeroPtr() const;
+    \endcode
+  */
+const unsigned int CompressedAccessBit = 0x400;
+
+
 // list of flags that are inherited by default
 const unsigned int HereditaryBits = RowMajorBit
-                                  | EvalBeforeNestingBit
-                                  | EvalBeforeAssigningBit;
+                                  | EvalBeforeNestingBit;
 
 /** \defgroup enums Enumerations
   * \ingroup Core_Module
@@ -160,9 +197,9 @@ const unsigned int HereditaryBits = RowMajorBit
   */
 
 /** \ingroup enums
-  * Enum containing possible values for the \p Mode parameter of 
-  * MatrixBase::selfadjointView() and MatrixBase::triangularView(). */
-enum {
+  * Enum containing possible values for the \c Mode or \c UpLo parameter of
+  * MatrixBase::selfadjointView() and MatrixBase::triangularView(), and selfadjoint solvers. */
+enum UpLoType {
   /** View matrix as a lower triangular matrix. */
   Lower=0x1,                      
   /** View matrix as an upper triangular matrix. */
@@ -186,12 +223,31 @@ enum {
 };
 
 /** \ingroup enums
-  * Enum for indicating whether an object is aligned or not. */
-enum { 
-  /** Object is not correctly aligned for vectorization. */
-  Unaligned=0, 
-  /** Object is aligned for vectorization. */
-  Aligned=1 
+  * Enum for indicating whether a buffer is aligned or not. */
+enum AlignmentType {
+  Unaligned=0,        /**< Data pointer has no specific alignment. */
+  Aligned8=8,         /**< Data pointer is aligned on a 8 bytes boundary. */
+  Aligned16=16,       /**< Data pointer is aligned on a 16 bytes boundary. */
+  Aligned32=32,       /**< Data pointer is aligned on a 32 bytes boundary. */
+  Aligned64=64,       /**< Data pointer is aligned on a 64 bytes boundary. */
+  Aligned128=128,     /**< Data pointer is aligned on a 128 bytes boundary. */
+  AlignedMask=255,
+  Aligned=16,         /**< \deprecated Synonym for Aligned16. */
+#if EIGEN_MAX_ALIGN_BYTES==128
+  AlignedMax = Aligned128
+#elif EIGEN_MAX_ALIGN_BYTES==64
+  AlignedMax = Aligned64
+#elif EIGEN_MAX_ALIGN_BYTES==32
+  AlignedMax = Aligned32
+#elif EIGEN_MAX_ALIGN_BYTES==16
+  AlignedMax = Aligned16
+#elif EIGEN_MAX_ALIGN_BYTES==8
+  AlignedMax = Aligned8
+#elif EIGEN_MAX_ALIGN_BYTES==0
+  AlignedMax = Unaligned
+#else
+#error Invalid value for EIGEN_MAX_ALIGN_BYTES
+#endif
 };
 
 /** \ingroup enums
@@ -217,7 +273,7 @@ enum DirectionType {
 
 /** \internal \ingroup enums
   * Enum to specify how to traverse the entries of a matrix. */
-enum {
+enum TraversalType {
   /** \internal Default traversal, no vectorization, no index-based access */
   DefaultTraversal,
   /** \internal No vectorization, use index-based access to have only one for loop instead of 2 nested loops */
@@ -239,7 +295,7 @@ enum {
 
 /** \internal \ingroup enums
   * Enum to specify whether to unroll loops when traversing over the entries of a matrix. */
-enum {
+enum UnrollingType {
   /** \internal Do not unroll loops. */
   NoUnrolling,
   /** \internal Unroll only the inner loop, but not the outer loop. */
@@ -251,7 +307,7 @@ enum {
 
 /** \internal \ingroup enums
   * Enum to specify whether to use the default (built-in) implementation or the specialization. */
-enum {
+enum SpecializedType {
   Specialized,
   BuiltIn
 };
@@ -259,7 +315,7 @@ enum {
 /** \ingroup enums
   * Enum containing possible values for the \p _Options template parameter of
   * Matrix, Array and BandMatrix. */
-enum {
+enum StorageOptions {
   /** Storage order is column major (see \ref TopicStorageOrders). */
   ColMajor = 0,
   /** Storage order is row major (see \ref TopicStorageOrders). */
@@ -272,7 +328,7 @@ enum {
 
 /** \ingroup enums
   * Enum for specifying whether to apply or solve on the left or right. */
-enum {
+enum SideType {
   /** Apply transformation on the left. */
   OnTheLeft = 1,  
   /** Apply transformation on the right. */
@@ -297,7 +353,7 @@ enum Default_t    { Default };
 
 /** \internal \ingroup enums
   * Used in AmbiVector. */
-enum {
+enum AmbiVectorMode {
   IsDense         = 0,
   IsSparse
 };
@@ -406,10 +462,16 @@ namespace Architecture
     Generic = 0x0,
     SSE = 0x1,
     AltiVec = 0x2,
+    VSX = 0x3,
+    NEON = 0x4,
 #if defined EIGEN_VECTORIZE_SSE
     Target = SSE
 #elif defined EIGEN_VECTORIZE_ALTIVEC
     Target = AltiVec
+#elif defined EIGEN_VECTORIZE_VSX
+    Target = VSX
+#elif defined EIGEN_VECTORIZE_NEON
+    Target = NEON
 #else
     Target = Generic
 #endif
@@ -417,8 +479,9 @@ namespace Architecture
 }
 
 /** \internal \ingroup enums
-  * Enum used as template parameter in GeneralProduct. */
-enum { CoeffBasedProductMode, LazyCoeffBasedProductMode, OuterProduct, InnerProduct, GemvProduct, GemmProduct };
+  * Enum used as template parameter in Product and product evaluators. */
+enum ProductImplType
+{ DefaultProduct=0, LazyProduct, AliasFreeProduct, CoeffBasedProductMode, LazyCoeffBasedProductMode, OuterProduct, InnerProduct, GemvProduct, GemmProduct };
 
 /** \internal \ingroup enums
   * Enum used in experimental parallel implementation. */
@@ -427,24 +490,57 @@ enum Action {GetAction, SetAction};
 /** The type used to identify a dense storage. */
 struct Dense {};
 
+/** The type used to identify a general sparse storage. */
+struct Sparse {};
+
+/** The type used to identify a general solver (factored) storage. */
+struct SolverStorage {};
+
+/** The type used to identify a permutation storage. */
+struct PermutationStorage {};
+
+/** The type used to identify a permutation storage. */
+struct TranspositionsStorage {};
+
 /** The type used to identify a matrix expression */
 struct MatrixXpr {};
 
 /** The type used to identify an array expression */
 struct ArrayXpr {};
 
+// An evaluator must define its shape. By default, it can be one of the following:
+struct DenseShape             { static std::string debugName() { return "DenseShape"; } };
+struct SolverShape            { static std::string debugName() { return "SolverShape"; } };
+struct HomogeneousShape       { static std::string debugName() { return "HomogeneousShape"; } };
+struct DiagonalShape          { static std::string debugName() { return "DiagonalShape"; } };
+struct BandShape              { static std::string debugName() { return "BandShape"; } };
+struct TriangularShape        { static std::string debugName() { return "TriangularShape"; } };
+struct SelfAdjointShape       { static std::string debugName() { return "SelfAdjointShape"; } };
+struct PermutationShape       { static std::string debugName() { return "PermutationShape"; } };
+struct TranspositionsShape    { static std::string debugName() { return "TranspositionsShape"; } };
+struct SparseShape            { static std::string debugName() { return "SparseShape"; } };
+
 namespace internal {
-  /** \internal
-  * Constants for comparison functors
-  */
-  enum ComparisonName {
-    cmp_EQ = 0,
-    cmp_LT = 1,
-    cmp_LE = 2,
-    cmp_UNORD = 3,
-    cmp_NEQ = 4
-  };
-}
+
+  // random access iterators based on coeff*() accessors.
+struct IndexBased {};
+
+// evaluator based on iterators to access coefficients. 
+struct IteratorBased {};
+
+/** \internal
+ * Constants for comparison functors
+ */
+enum ComparisonName {
+  cmp_EQ = 0,
+  cmp_LT = 1,
+  cmp_LE = 2,
+  cmp_UNORD = 3,
+  cmp_NEQ = 4,
+  cmp_GT = 5,
+  cmp_GE = 6
+};
+} // end namespace internal
 
 } // end namespace Eigen
 
diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h
index 6a0bf0629..7559e129c 100644..100755
--- a/Eigen/src/Core/util/DisableStupidWarnings.h
+++ b/Eigen/src/Core/util/DisableStupidWarnings.h
@@ -10,24 +10,31 @@
   // 4244 - 'argument' : conversion from 'type1' to 'type2', possible loss of data
   // 4273 - QtAlignedMalloc, inconsistent DLL linkage
   // 4324 - structure was padded due to declspec(align())
+  // 4503 - decorated name length exceeded, name was truncated
   // 4512 - assignment operator could not be generated
   // 4522 - 'class' : multiple assignment operators specified
   // 4700 - uninitialized local variable 'xyz' used
+  // 4714 - function marked as __forceinline not inlined
   // 4717 - 'function' : recursive on all control paths, function will cause runtime stack overflow
+  // 4800 - 'type' : forcing value to bool 'true' or 'false' (performance warning)
   #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
     #pragma warning( push )
   #endif
-  #pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4512 4522 4700 4717 )
+  #pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800)
+
 #elif defined __INTEL_COMPILER
   // 2196 - routine is both "inline" and "noinline" ("noinline" assumed)
   //        ICC 12 generates this warning even without any inline keyword, when defining class methods 'inline' i.e. inside of class body
   //        typedef that may be a reference type.
   // 279  - controlling expression is constant
   //        ICC 12 generates this warning on assert(constant_expression_depending_on_template_params) and frankly this is a legitimate use case.
+  // 1684 - conversion from pointer to same-sized integral type (potential portability problem)
+  // 2259 - non-pointer conversion from "Eigen::Index={ptrdiff_t={long}}" to "int" may lose significant bits
   #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
     #pragma warning push
   #endif
-  #pragma warning disable 2196 279
+  #pragma warning disable 2196 279 1684 2259
+
 #elif defined __clang__
   // -Wconstant-logical-operand - warning: use of logical && with constant operand; switch to bitwise & or remove constant
   //     this is really a stupid warning as it warns on compile-time expressions involving enums
@@ -35,6 +42,34 @@
     #pragma clang diagnostic push
   #endif
   #pragma clang diagnostic ignored "-Wconstant-logical-operand"
+
+#elif defined __GNUC__ && __GNUC__>=6
+
+  #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
+    #pragma GCC diagnostic push
+  #endif
+  #pragma GCC diagnostic ignored "-Wignored-attributes"
+
+#endif
+
+#if defined __NVCC__
+  // Disable the "statement is unreachable" message
+  #pragma diag_suppress code_is_unreachable
+  // Disable the "dynamic initialization in unreachable code" message
+  #pragma diag_suppress initialization_not_reachable
+  // Disable the "invalid error number" message that we get with older versions of nvcc
+  #pragma diag_suppress 1222
+  // Disable the "calling a __host__ function from a __host__ __device__ function is not allowed" messages (yes, there are many of them and they seem to change with every version of the compiler)
+  #pragma diag_suppress 2527
+  #pragma diag_suppress 2529
+  #pragma diag_suppress 2651
+  #pragma diag_suppress 2653
+  #pragma diag_suppress 2668
+  #pragma diag_suppress 2669
+  #pragma diag_suppress 2670
+  #pragma diag_suppress 2671
+  #pragma diag_suppress 2735
+  #pragma diag_suppress 2737
 #endif
 
 #endif // not EIGEN_WARNINGS_DISABLED
diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h
index d6a814586..ea107393a 100644
--- a/Eigen/src/Core/util/ForwardDeclarations.h
+++ b/Eigen/src/Core/util/ForwardDeclarations.h
@@ -36,6 +36,10 @@ template<typename Derived> struct accessors_level
   };
 };
 
+template<typename T> struct evaluator_traits;
+
+template< typename T> struct evaluator;
+
 } // end namespace internal
 
 template<typename T> struct NumTraits;
@@ -51,18 +55,18 @@ class DenseCoeffsBase;
 
 template<typename _Scalar, int _Rows, int _Cols,
          int _Options = AutoAlign |
-#if defined(__GNUC__) && __GNUC__==3 && __GNUC_MINOR__==4
+#if EIGEN_GNUC_AT(3,4)
     // workaround a bug in at least gcc 3.4.6
     // the innermost ?: ternary operator is misparsed. We write it slightly
     // differently and this makes gcc 3.4.6 happy, but it's ugly.
     // The error would only show up with EIGEN_DEFAULT_TO_ROW_MAJOR is defined
     // (when EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION is RowMajor)
-                          ( (_Rows==1 && _Cols!=1) ? RowMajor
+                          ( (_Rows==1 && _Cols!=1) ? Eigen::RowMajor
                           : !(_Cols==1 && _Rows!=1) ?  EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION
-                          : ColMajor ),
+                          : Eigen::ColMajor ),
 #else
-                          ( (_Rows==1 && _Cols!=1) ? RowMajor
-                          : (_Cols==1 && _Rows!=1) ? ColMajor
+                          ( (_Rows==1 && _Cols!=1) ? Eigen::RowMajor
+                          : (_Cols==1 && _Rows!=1) ? Eigen::ColMajor
                           : EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION ),
 #endif
          int _MaxRows = _Rows,
@@ -87,10 +91,11 @@ template<typename NullaryOp, typename MatrixType>         class CwiseNullaryOp;
 template<typename UnaryOp,   typename MatrixType>         class CwiseUnaryOp;
 template<typename ViewOp,    typename MatrixType>         class CwiseUnaryView;
 template<typename BinaryOp,  typename Lhs, typename Rhs>  class CwiseBinaryOp;
-template<typename BinOp,     typename Lhs, typename Rhs>  class SelfCwiseBinaryOp;
-template<typename Derived,   typename Lhs, typename Rhs>  class ProductBase;
-template<typename Lhs, typename Rhs, int Mode>            class GeneralProduct;
-template<typename Lhs, typename Rhs, int NestingFlags>    class CoeffBasedProduct;
+template<typename TernaryOp, typename Arg1, typename Arg2, typename Arg3>  class CwiseTernaryOp;
+template<typename Decomposition, typename Rhstype>        class Solve;
+template<typename XprType>                                class Inverse;
+
+template<typename Lhs, typename Rhs, int Option = DefaultProduct> class Product;
 
 template<typename Derived> class DiagonalBase;
 template<typename _DiagonalVectorType> class DiagonalWrapper;
@@ -108,7 +113,12 @@ template<typename Derived,
          int Level = internal::accessors_level<Derived>::has_write_access ? WriteAccessors : ReadOnlyAccessors
 > class MapBase;
 template<int InnerStrideAtCompileTime, int OuterStrideAtCompileTime> class Stride;
+template<int Value = Dynamic> class InnerStride;
+template<int Value = Dynamic> class OuterStride;
 template<typename MatrixType, int MapOptions=Unaligned, typename StrideType = Stride<0,0> > class Map;
+template<typename Derived> class RefBase;
+template<typename PlainObjectType, int Options = 0,
+         typename StrideType = typename internal::conditional<PlainObjectType::IsVectorAtCompileTime,InnerStride<1>,OuterStride<> >::type > class Ref;
 
 template<typename Derived> class TriangularBase;
 template<typename MatrixType, unsigned int Mode> class TriangularView;
@@ -119,10 +129,10 @@ template<typename MatrixType> struct CommaInitializer;
 template<typename Derived> class ReturnByValue;
 template<typename ExpressionType> class ArrayWrapper;
 template<typename ExpressionType> class MatrixWrapper;
+template<typename Derived> class SolverBase;
+template<typename XprType> class InnerIterator;
 
 namespace internal {
-template<typename DecompositionType, typename Rhs> struct solve_retval_base;
-template<typename DecompositionType, typename Rhs> struct solve_retval;
 template<typename DecompositionType> struct kernel_retval_base;
 template<typename DecompositionType> struct kernel_retval;
 template<typename DecompositionType> struct image_retval_base;
@@ -135,6 +145,21 @@ template<typename _Scalar, int Rows=Dynamic, int Cols=Dynamic, int Supers=Dynami
 
 namespace internal {
 template<typename Lhs, typename Rhs> struct product_type;
+
+template<bool> struct EnableIf;
+
+/** \internal
+  * \class product_evaluator
+  * Products need their own evaluator with more template arguments allowing for
+  * easier partial template specializations.
+  */
+template< typename T,
+          int ProductTag = internal::product_type<typename T::Lhs,typename T::Rhs>::ret,
+          typename LhsShape = typename evaluator_traits<typename T::Lhs>::Shape,
+          typename RhsShape = typename evaluator_traits<typename T::Rhs>::Shape,
+          typename LhsScalar = typename traits<typename T::Lhs>::Scalar,
+          typename RhsScalar = typename traits<typename T::Rhs>::Scalar
+        > struct product_evaluator;
 }
 
 template<typename Lhs, typename Rhs,
@@ -150,9 +175,11 @@ namespace internal {
 // with optional conjugation of the arguments.
 template<typename LhsScalar, typename RhsScalar, bool ConjLhs=false, bool ConjRhs=false> struct conj_helper;
 
-template<typename Scalar> struct scalar_sum_op;
-template<typename Scalar> struct scalar_difference_op;
-template<typename LhsScalar,typename RhsScalar> struct scalar_conj_product_op;
+template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_sum_op;
+template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_difference_op;
+template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_conj_product_op;
+template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_min_op;
+template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_max_op;
 template<typename Scalar> struct scalar_opposite_op;
 template<typename Scalar> struct scalar_conjugate_op;
 template<typename Scalar> struct scalar_real_op;
@@ -160,6 +187,7 @@ template<typename Scalar> struct scalar_imag_op;
 template<typename Scalar> struct scalar_abs_op;
 template<typename Scalar> struct scalar_abs2_op;
 template<typename Scalar> struct scalar_sqrt_op;
+template<typename Scalar> struct scalar_rsqrt_op;
 template<typename Scalar> struct scalar_exp_op;
 template<typename Scalar> struct scalar_log_op;
 template<typename Scalar> struct scalar_cos_op;
@@ -167,24 +195,29 @@ template<typename Scalar> struct scalar_sin_op;
 template<typename Scalar> struct scalar_acos_op;
 template<typename Scalar> struct scalar_asin_op;
 template<typename Scalar> struct scalar_tan_op;
-template<typename Scalar> struct scalar_pow_op;
 template<typename Scalar> struct scalar_inverse_op;
 template<typename Scalar> struct scalar_square_op;
 template<typename Scalar> struct scalar_cube_op;
 template<typename Scalar, typename NewType> struct scalar_cast_op;
-template<typename Scalar> struct scalar_multiple_op;
-template<typename Scalar> struct scalar_quotient1_op;
-template<typename Scalar> struct scalar_min_op;
-template<typename Scalar> struct scalar_max_op;
 template<typename Scalar> struct scalar_random_op;
-template<typename Scalar> struct scalar_add_op;
 template<typename Scalar> struct scalar_constant_op;
 template<typename Scalar> struct scalar_identity_op;
-
+template<typename Scalar,bool iscpx> struct scalar_sign_op;
+template<typename Scalar,typename ScalarExponent> struct scalar_pow_op;
+template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_hypot_op;
 template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_product_op;
-template<typename LhsScalar,typename RhsScalar> struct scalar_multiple2_op;
 template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_quotient_op;
 
+// SpecialFunctions module
+template<typename Scalar> struct scalar_lgamma_op;
+template<typename Scalar> struct scalar_digamma_op;
+template<typename Scalar> struct scalar_erf_op;
+template<typename Scalar> struct scalar_erfc_op;
+template<typename Scalar> struct scalar_igamma_op;
+template<typename Scalar> struct scalar_igammac_op;
+template<typename Scalar> struct scalar_zeta_op;
+template<typename Scalar> struct scalar_betainc_op;
+
 } // end namespace internal
 
 struct IOFormat;
@@ -192,18 +225,18 @@ struct IOFormat;
 // Array module
 template<typename _Scalar, int _Rows, int _Cols,
          int _Options = AutoAlign |
-#if defined(__GNUC__) && __GNUC__==3 && __GNUC_MINOR__==4
+#if EIGEN_GNUC_AT(3,4)
     // workaround a bug in at least gcc 3.4.6
     // the innermost ?: ternary operator is misparsed. We write it slightly
     // differently and this makes gcc 3.4.6 happy, but it's ugly.
     // The error would only show up with EIGEN_DEFAULT_TO_ROW_MAJOR is defined
     // (when EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION is RowMajor)
-                          ( (_Rows==1 && _Cols!=1) ? RowMajor
+                          ( (_Rows==1 && _Cols!=1) ? Eigen::RowMajor
                           : !(_Cols==1 && _Rows!=1) ?  EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION
-                          : ColMajor ),
+                          : Eigen::ColMajor ),
 #else
-                          ( (_Rows==1 && _Cols!=1) ? RowMajor
-                          : (_Cols==1 && _Rows!=1) ? ColMajor
+                          ( (_Rows==1 && _Cols!=1) ? Eigen::RowMajor
+                          : (_Cols==1 && _Rows!=1) ? Eigen::ColMajor
                           : EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION ),
 #endif
          int _MaxRows = _Rows, int _MaxCols = _Cols> class Array;
@@ -221,7 +254,9 @@ template<typename MatrixType> struct inverse_impl;
 template<typename MatrixType> class HouseholderQR;
 template<typename MatrixType> class ColPivHouseholderQR;
 template<typename MatrixType> class FullPivHouseholderQR;
+template<typename MatrixType> class CompleteOrthogonalDecomposition;
 template<typename MatrixType, int QRPreconditioner = ColPivHouseholderQRPreconditioner> class JacobiSVD;
+template<typename MatrixType> class BDCSVD;
 template<typename MatrixType, int UpLo = Lower> class LLT;
 template<typename MatrixType, int UpLo = Lower> class LDLT;
 template<typename VectorsType, typename CoeffsType, int Side=OnTheLeft> class HouseholderSequence;
@@ -234,36 +269,16 @@ template<typename Derived> class QuaternionBase;
 template<typename Scalar> class Rotation2D;
 template<typename Scalar> class AngleAxis;
 template<typename Scalar,int Dim> class Translation;
-
-#ifdef EIGEN2_SUPPORT
-template<typename Derived, int _Dim> class eigen2_RotationBase;
-template<typename Lhs, typename Rhs> class eigen2_Cross;
-template<typename Scalar> class eigen2_Quaternion;
-template<typename Scalar> class eigen2_Rotation2D;
-template<typename Scalar> class eigen2_AngleAxis;
-template<typename Scalar,int Dim> class eigen2_Transform;
-template <typename _Scalar, int _AmbientDim> class eigen2_ParametrizedLine;
-template <typename _Scalar, int _AmbientDim> class eigen2_Hyperplane;
-template<typename Scalar,int Dim> class eigen2_Translation;
-template<typename Scalar,int Dim> class eigen2_Scaling;
-#endif
-
-#if EIGEN2_SUPPORT_STAGE < STAGE20_RESOLVE_API_CONFLICTS
-template<typename Scalar> class Quaternion;
-template<typename Scalar,int Dim> class Transform;
-template <typename _Scalar, int _AmbientDim> class ParametrizedLine;
-template <typename _Scalar, int _AmbientDim> class Hyperplane;
-template<typename Scalar,int Dim> class Scaling;
-#endif
-
-#if EIGEN2_SUPPORT_STAGE > STAGE20_RESOLVE_API_CONFLICTS
+template<typename Scalar,int Dim> class AlignedBox;
 template<typename Scalar, int Options = AutoAlign> class Quaternion;
 template<typename Scalar,int Dim,int Mode,int _Options=AutoAlign> class Transform;
 template <typename _Scalar, int _AmbientDim, int Options=AutoAlign> class ParametrizedLine;
 template <typename _Scalar, int _AmbientDim, int Options=AutoAlign> class Hyperplane;
 template<typename Scalar> class UniformScaling;
 template<typename MatrixType,int Direction> class Homogeneous;
-#endif
+
+// Sparse module:
+template<typename Derived> class SparseMatrixBase;
 
 // MatrixFunctions module
 template<typename Derived> struct MatrixExponentialReturnValue;
@@ -271,7 +286,7 @@ template<typename Derived> class MatrixFunctionReturnValue;
 template<typename Derived> class MatrixSquareRootReturnValue;
 template<typename Derived> class MatrixLogarithmReturnValue;
 template<typename Derived> class MatrixPowerReturnValue;
-template<typename Derived, typename Lhs, typename Rhs> class MatrixPowerProduct;
+template<typename Derived> class MatrixComplexPowerReturnValue;
 
 namespace internal {
 template <typename Scalar>
@@ -282,18 +297,6 @@ struct stem_function
 };
 }
 
-
-#ifdef EIGEN2_SUPPORT
-template<typename ExpressionType> class Cwise;
-template<typename MatrixType> class Minor;
-template<typename MatrixType> class LU;
-template<typename MatrixType> class QR;
-template<typename MatrixType> class SVD;
-namespace internal {
-template<typename MatrixType, unsigned int Mode> struct eigen2_part_return_type;
-}
-#endif
-
 } // end namespace Eigen
 
 #endif // EIGEN_FORWARDDECLARATIONS_H
diff --git a/Eigen/src/Core/util/MKL_support.h b/Eigen/src/Core/util/MKL_support.h
index 8acca9c8c..26b59669e 100644..100755
--- a/Eigen/src/Core/util/MKL_support.h
+++ b/Eigen/src/Core/util/MKL_support.h
@@ -49,7 +49,7 @@
   #define EIGEN_USE_LAPACKE
 #endif
 
-#if defined(EIGEN_USE_BLAS) || defined(EIGEN_USE_LAPACKE) || defined(EIGEN_USE_MKL_VML)
+#if defined(EIGEN_USE_MKL_VML)
   #define EIGEN_USE_MKL
 #endif
 
@@ -64,7 +64,6 @@
 #   ifndef EIGEN_USE_MKL
     /*If the MKL version is too old, undef everything*/
 #       undef   EIGEN_USE_MKL_ALL
-#       undef   EIGEN_USE_BLAS
 #       undef   EIGEN_USE_LAPACKE
 #       undef   EIGEN_USE_MKL_VML
 #       undef   EIGEN_USE_LAPACKE_STRICT
@@ -73,54 +72,57 @@
 #endif
 
 #if defined EIGEN_USE_MKL
-#include <mkl_lapacke.h>
-#define EIGEN_MKL_VML_THRESHOLD 128
-
-namespace Eigen {
 
-typedef std::complex<double> dcomplex;
-typedef std::complex<float>  scomplex;
+#define EIGEN_MKL_VML_THRESHOLD 128
 
-namespace internal {
+/* MKL_DOMAIN_BLAS, etc are defined only in 10.3 update 7 */
+/* MKL_BLAS, etc are not defined in 11.2 */
+#ifdef MKL_DOMAIN_ALL
+#define EIGEN_MKL_DOMAIN_ALL MKL_DOMAIN_ALL
+#else
+#define EIGEN_MKL_DOMAIN_ALL MKL_ALL
+#endif
 
-template<typename MKLType, typename EigenType>
-static inline void assign_scalar_eig2mkl(MKLType& mklScalar, const EigenType& eigenScalar) {
-  mklScalar=eigenScalar;
-}
+#ifdef MKL_DOMAIN_BLAS
+#define EIGEN_MKL_DOMAIN_BLAS MKL_DOMAIN_BLAS
+#else
+#define EIGEN_MKL_DOMAIN_BLAS MKL_BLAS
+#endif
 
-template<typename MKLType, typename EigenType>
-static inline void assign_conj_scalar_eig2mkl(MKLType& mklScalar, const EigenType& eigenScalar) {
-  mklScalar=eigenScalar;
-}
+#ifdef MKL_DOMAIN_FFT
+#define EIGEN_MKL_DOMAIN_FFT MKL_DOMAIN_FFT
+#else
+#define EIGEN_MKL_DOMAIN_FFT MKL_FFT
+#endif
 
-template <>
-inline void assign_scalar_eig2mkl<MKL_Complex16,dcomplex>(MKL_Complex16& mklScalar, const dcomplex& eigenScalar) {
-  mklScalar.real=eigenScalar.real();
-  mklScalar.imag=eigenScalar.imag();
-}
+#ifdef MKL_DOMAIN_VML
+#define EIGEN_MKL_DOMAIN_VML MKL_DOMAIN_VML
+#else
+#define EIGEN_MKL_DOMAIN_VML MKL_VML
+#endif
 
-template <>
-inline void assign_scalar_eig2mkl<MKL_Complex8,scomplex>(MKL_Complex8& mklScalar, const scomplex& eigenScalar) {
-  mklScalar.real=eigenScalar.real();
-  mklScalar.imag=eigenScalar.imag();
-}
+#ifdef MKL_DOMAIN_PARDISO
+#define EIGEN_MKL_DOMAIN_PARDISO MKL_DOMAIN_PARDISO
+#else
+#define EIGEN_MKL_DOMAIN_PARDISO MKL_PARDISO
+#endif
+#endif
 
-template <>
-inline void assign_conj_scalar_eig2mkl<MKL_Complex16,dcomplex>(MKL_Complex16& mklScalar, const dcomplex& eigenScalar) {
-  mklScalar.real=eigenScalar.real();
-  mklScalar.imag=-eigenScalar.imag();
-}
+namespace Eigen {
 
-template <>
-inline void assign_conj_scalar_eig2mkl<MKL_Complex8,scomplex>(MKL_Complex8& mklScalar, const scomplex& eigenScalar) {
-  mklScalar.real=eigenScalar.real();
-  mklScalar.imag=-eigenScalar.imag();
-}
+typedef std::complex<double> dcomplex;
+typedef std::complex<float>  scomplex;
 
-} // end namespace internal
+#if defined(EIGEN_USE_MKL)
+typedef MKL_INT BlasIndex;
+#else
+typedef int BlasIndex;
+#endif
 
 } // end namespace Eigen
 
+#if defined(EIGEN_USE_BLAS)
+#include "../../misc/blas.h"
 #endif
 
 #endif // EIGEN_MKL_SUPPORT_H
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index f69970f05..427d3cd6b 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -12,84 +12,324 @@
 #define EIGEN_MACROS_H
 
 #define EIGEN_WORLD_VERSION 3
-#define EIGEN_MAJOR_VERSION 2
-#define EIGEN_MINOR_VERSION 5
+#define EIGEN_MAJOR_VERSION 3
+#define EIGEN_MINOR_VERSION 3
 
 #define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \
                                       (EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \
                                                                  EIGEN_MINOR_VERSION>=z))))
+
+// Compiler identification, EIGEN_COMP_*
+
+/// \internal EIGEN_COMP_GNUC set to 1 for all compilers compatible with GCC
 #ifdef __GNUC__
-  #define EIGEN_GNUC_AT_LEAST(x,y) ((__GNUC__==x && __GNUC_MINOR__>=y) || __GNUC__>x)
+  #define EIGEN_COMP_GNUC 1
 #else
-  #define EIGEN_GNUC_AT_LEAST(x,y) 0
+  #define EIGEN_COMP_GNUC 0
 #endif
- 
-#ifdef __GNUC__
-  #define EIGEN_GNUC_AT_MOST(x,y) ((__GNUC__==x && __GNUC_MINOR__<=y) || __GNUC__<x)
+
+/// \internal EIGEN_COMP_CLANG set to major+minor version (e.g., 307 for clang 3.7) if the compiler is clang
+#if defined(__clang__)
+  #define EIGEN_COMP_CLANG (__clang_major__*100+__clang_minor__)
 #else
-  #define EIGEN_GNUC_AT_MOST(x,y) 0
+  #define EIGEN_COMP_CLANG 0
 #endif
 
-#if EIGEN_GNUC_AT_MOST(4,3) && !defined(__clang__)
-  // see bug 89
-  #define EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO 0
+
+/// \internal EIGEN_COMP_LLVM set to 1 if the compiler backend is llvm
+#if defined(__llvm__)
+  #define EIGEN_COMP_LLVM 1
 #else
-  #define EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO 1
+  #define EIGEN_COMP_LLVM 0
+#endif
+
+/// \internal EIGEN_COMP_ICC set to __INTEL_COMPILER if the compiler is Intel compiler, 0 otherwise
+#if defined(__INTEL_COMPILER)
+  #define EIGEN_COMP_ICC __INTEL_COMPILER
+#else
+  #define EIGEN_COMP_ICC 0
+#endif
+
+/// \internal EIGEN_COMP_MINGW set to 1 if the compiler is mingw
+#if defined(__MINGW32__)
+  #define EIGEN_COMP_MINGW 1
+#else
+  #define EIGEN_COMP_MINGW 0
+#endif
+
+/// \internal EIGEN_COMP_SUNCC set to 1 if the compiler is Solaris Studio
+#if defined(__SUNPRO_CC)
+  #define EIGEN_COMP_SUNCC 1
+#else
+  #define EIGEN_COMP_SUNCC 0
+#endif
+
+/// \internal EIGEN_COMP_MSVC set to _MSC_VER if the compiler is Microsoft Visual C++, 0 otherwise.
+#if defined(_MSC_VER)
+  #define EIGEN_COMP_MSVC _MSC_VER
+#else
+  #define EIGEN_COMP_MSVC 0
+#endif
+
+// For the record, here is a table summarizing the possible values for EIGEN_COMP_MSVC:
+//  name  ver   MSC_VER
+//  2008    9      1500
+//  2010   10      1600
+//  2012   11      1700
+//  2013   12      1800
+//  2015   14      1900
+//  "15"   15      1900
+
+/// \internal EIGEN_COMP_MSVC_STRICT set to 1 if the compiler is really Microsoft Visual C++ and not ,e.g., ICC or clang-cl
+#if EIGEN_COMP_MSVC && !(EIGEN_COMP_ICC || EIGEN_COMP_LLVM || EIGEN_COMP_CLANG)
+  #define EIGEN_COMP_MSVC_STRICT _MSC_VER
+#else
+  #define EIGEN_COMP_MSVC_STRICT 0
+#endif
+
+/// \internal EIGEN_COMP_IBM set to 1 if the compiler is IBM XL C++
+#if defined(__IBMCPP__) || defined(__xlc__)
+  #define EIGEN_COMP_IBM 1
+#else
+  #define EIGEN_COMP_IBM 0
+#endif
+
+/// \internal EIGEN_COMP_PGI set to 1 if the compiler is Portland Group Compiler
+#if defined(__PGI)
+  #define EIGEN_COMP_PGI 1
+#else
+  #define EIGEN_COMP_PGI 0
+#endif
+
+/// \internal EIGEN_COMP_ARM set to 1 if the compiler is ARM Compiler
+#if defined(__CC_ARM) || defined(__ARMCC_VERSION)
+  #define EIGEN_COMP_ARM 1
+#else
+  #define EIGEN_COMP_ARM 0
+#endif
+
+/// \internal EIGEN_COMP_ARM set to 1 if the compiler is ARM Compiler
+#if defined(__EMSCRIPTEN__)
+  #define EIGEN_COMP_EMSCRIPTEN 1
+#else
+  #define EIGEN_COMP_EMSCRIPTEN 0
+#endif
+
+
+/// \internal EIGEN_GNUC_STRICT set to 1 if the compiler is really GCC and not a compatible compiler (e.g., ICC, clang, mingw, etc.)
+#if EIGEN_COMP_GNUC && !(EIGEN_COMP_CLANG || EIGEN_COMP_ICC || EIGEN_COMP_MINGW || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM || EIGEN_COMP_EMSCRIPTEN)
+  #define EIGEN_COMP_GNUC_STRICT 1
+#else
+  #define EIGEN_COMP_GNUC_STRICT 0
 #endif
 
-#if defined(__GNUC__) && (__GNUC__ <= 3)
+
+#if EIGEN_COMP_GNUC
+  #define EIGEN_GNUC_AT_LEAST(x,y) ((__GNUC__==x && __GNUC_MINOR__>=y) || __GNUC__>x)
+  #define EIGEN_GNUC_AT_MOST(x,y)  ((__GNUC__==x && __GNUC_MINOR__<=y) || __GNUC__<x)
+  #define EIGEN_GNUC_AT(x,y)       ( __GNUC__==x && __GNUC_MINOR__==y )
+#else
+  #define EIGEN_GNUC_AT_LEAST(x,y) 0
+  #define EIGEN_GNUC_AT_MOST(x,y)  0
+  #define EIGEN_GNUC_AT(x,y)       0
+#endif
+
+// FIXME: could probably be removed as we do not support gcc 3.x anymore
+#if EIGEN_COMP_GNUC && (__GNUC__ <= 3)
 #define EIGEN_GCC3_OR_OLDER 1
 #else
 #define EIGEN_GCC3_OR_OLDER 0
 #endif
 
-// 16 byte alignment is only useful for vectorization. Since it affects the ABI, we need to enable
-// 16 byte alignment on all platforms where vectorization might be enabled. In theory we could always
-// enable alignment, but it can be a cause of problems on some platforms, so we just disable it in
-// certain common platform (compiler+architecture combinations) to avoid these problems.
-// Only static alignment is really problematic (relies on nonstandard compiler extensions that don't
-// work everywhere, for example don't work on GCC/ARM), try to keep heap alignment even
-// when we have to disable static alignment.
-#if defined(__GNUC__) && !(defined(__i386__) || defined(__x86_64__) || defined(__powerpc__) || defined(__ppc__) || defined(__ia64__))
-#define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
+
+// Architecture identification, EIGEN_ARCH_*
+
+#if defined(__x86_64__) || defined(_M_X64) || defined(__amd64)
+  #define EIGEN_ARCH_x86_64 1
 #else
-#define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0
+  #define EIGEN_ARCH_x86_64 0
 #endif
 
-// static alignment is completely disabled with GCC 3, Sun Studio, and QCC/QNX
-#if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT \
- && !EIGEN_GCC3_OR_OLDER \
- && !defined(__SUNPRO_CC) \
- && !defined(__QNXNTO__)
-  #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 1
+#if defined(__i386__) || defined(_M_IX86) || defined(_X86_) || defined(__i386)
+  #define EIGEN_ARCH_i386 1
 #else
-  #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0
+  #define EIGEN_ARCH_i386 0
 #endif
 
-#ifdef EIGEN_DONT_ALIGN
-  #ifndef EIGEN_DONT_ALIGN_STATICALLY
-    #define EIGEN_DONT_ALIGN_STATICALLY
-  #endif
-  #define EIGEN_ALIGN 0
+#if EIGEN_ARCH_x86_64 || EIGEN_ARCH_i386
+  #define EIGEN_ARCH_i386_OR_x86_64 1
 #else
-  #define EIGEN_ALIGN 1
+  #define EIGEN_ARCH_i386_OR_x86_64 0
 #endif
 
-// EIGEN_ALIGN_STATICALLY is the true test whether we want to align arrays on the stack or not. It takes into account both the user choice to explicitly disable
-// alignment (EIGEN_DONT_ALIGN_STATICALLY) and the architecture config (EIGEN_ARCH_WANTS_STACK_ALIGNMENT). Henceforth, only EIGEN_ALIGN_STATICALLY should be used.
-#if EIGEN_ARCH_WANTS_STACK_ALIGNMENT && !defined(EIGEN_DONT_ALIGN_STATICALLY)
-  #define EIGEN_ALIGN_STATICALLY 1
+/// \internal EIGEN_ARCH_ARM set to 1 if the architecture is ARM
+#if defined(__arm__)
+  #define EIGEN_ARCH_ARM 1
 #else
-  #define EIGEN_ALIGN_STATICALLY 0
-  #ifndef EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT
-    #define EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT
-  #endif
+  #define EIGEN_ARCH_ARM 0
+#endif
+
+/// \internal EIGEN_ARCH_ARM64 set to 1 if the architecture is ARM64
+#if defined(__aarch64__)
+  #define EIGEN_ARCH_ARM64 1
+#else
+  #define EIGEN_ARCH_ARM64 0
+#endif
+
+#if EIGEN_ARCH_ARM || EIGEN_ARCH_ARM64
+  #define EIGEN_ARCH_ARM_OR_ARM64 1
+#else
+  #define EIGEN_ARCH_ARM_OR_ARM64 0
+#endif
+
+/// \internal EIGEN_ARCH_MIPS set to 1 if the architecture is MIPS
+#if defined(__mips__) || defined(__mips)
+  #define EIGEN_ARCH_MIPS 1
+#else
+  #define EIGEN_ARCH_MIPS 0
 #endif
 
+/// \internal EIGEN_ARCH_SPARC set to 1 if the architecture is SPARC
+#if defined(__sparc__) || defined(__sparc)
+  #define EIGEN_ARCH_SPARC 1
+#else
+  #define EIGEN_ARCH_SPARC 0
+#endif
+
+/// \internal EIGEN_ARCH_IA64 set to 1 if the architecture is Intel Itanium
+#if defined(__ia64__)
+  #define EIGEN_ARCH_IA64 1
+#else
+  #define EIGEN_ARCH_IA64 0
+#endif
+
+/// \internal EIGEN_ARCH_PPC set to 1 if the architecture is PowerPC
+#if defined(__powerpc__) || defined(__ppc__) || defined(_M_PPC)
+  #define EIGEN_ARCH_PPC 1
+#else
+  #define EIGEN_ARCH_PPC 0
+#endif
+
+
+
+// Operating system identification, EIGEN_OS_*
+
+/// \internal EIGEN_OS_UNIX set to 1 if the OS is a unix variant
+#if defined(__unix__) || defined(__unix)
+  #define EIGEN_OS_UNIX 1
+#else
+  #define EIGEN_OS_UNIX 0
+#endif
+
+/// \internal EIGEN_OS_LINUX set to 1 if the OS is based on Linux kernel
+#if defined(__linux__)
+  #define EIGEN_OS_LINUX 1
+#else
+  #define EIGEN_OS_LINUX 0
+#endif
+
+/// \internal EIGEN_OS_ANDROID set to 1 if the OS is Android
+// note: ANDROID is defined when using ndk_build, __ANDROID__ is defined when using a standalone toolchain.
+#if defined(__ANDROID__) || defined(ANDROID)
+  #define EIGEN_OS_ANDROID 1
+#else
+  #define EIGEN_OS_ANDROID 0
+#endif
+
+/// \internal EIGEN_OS_GNULINUX set to 1 if the OS is GNU Linux and not Linux-based OS (e.g., not android)
+#if defined(__gnu_linux__) && !(EIGEN_OS_ANDROID)
+  #define EIGEN_OS_GNULINUX 1
+#else
+  #define EIGEN_OS_GNULINUX 0
+#endif
+
+/// \internal EIGEN_OS_BSD set to 1 if the OS is a BSD variant
+#if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__bsdi__) || defined(__DragonFly__)
+  #define EIGEN_OS_BSD 1
+#else
+  #define EIGEN_OS_BSD 0
+#endif
+
+/// \internal EIGEN_OS_MAC set to 1 if the OS is MacOS
+#if defined(__APPLE__)
+  #define EIGEN_OS_MAC 1
+#else
+  #define EIGEN_OS_MAC 0
+#endif
+
+/// \internal EIGEN_OS_QNX set to 1 if the OS is QNX
+#if defined(__QNX__)
+  #define EIGEN_OS_QNX 1
+#else
+  #define EIGEN_OS_QNX 0
+#endif
+
+/// \internal EIGEN_OS_WIN set to 1 if the OS is Windows based
+#if defined(_WIN32)
+  #define EIGEN_OS_WIN 1
+#else
+  #define EIGEN_OS_WIN 0
+#endif
+
+/// \internal EIGEN_OS_WIN64 set to 1 if the OS is Windows 64bits
+#if defined(_WIN64)
+  #define EIGEN_OS_WIN64 1
+#else
+  #define EIGEN_OS_WIN64 0
+#endif
+
+/// \internal EIGEN_OS_WINCE set to 1 if the OS is Windows CE
+#if defined(_WIN32_WCE)
+  #define EIGEN_OS_WINCE 1
+#else
+  #define EIGEN_OS_WINCE 0
+#endif
+
+/// \internal EIGEN_OS_CYGWIN set to 1 if the OS is Windows/Cygwin
+#if defined(__CYGWIN__)
+  #define EIGEN_OS_CYGWIN 1
+#else
+  #define EIGEN_OS_CYGWIN 0
+#endif
+
+/// \internal EIGEN_OS_WIN_STRICT set to 1 if the OS is really Windows and not some variants
+#if EIGEN_OS_WIN && !( EIGEN_OS_WINCE || EIGEN_OS_CYGWIN )
+  #define EIGEN_OS_WIN_STRICT 1
+#else
+  #define EIGEN_OS_WIN_STRICT 0
+#endif
+
+/// \internal EIGEN_OS_SUN set to 1 if the OS is SUN
+#if (defined(sun) || defined(__sun)) && !(defined(__SVR4) || defined(__svr4__))
+  #define EIGEN_OS_SUN 1
+#else
+  #define EIGEN_OS_SUN 0
+#endif
+
+/// \internal EIGEN_OS_SOLARIS set to 1 if the OS is Solaris
+#if (defined(sun) || defined(__sun)) && (defined(__SVR4) || defined(__svr4__))
+  #define EIGEN_OS_SOLARIS 1
+#else
+  #define EIGEN_OS_SOLARIS 0
+#endif
+
+
+
+#if EIGEN_GNUC_AT_MOST(4,3) && !EIGEN_COMP_CLANG
+  // see bug 89
+  #define EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO 0
+#else
+  #define EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO 1
+#endif
+
+// This macro can be used to prevent from macro expansion, e.g.:
+//   std::max EIGEN_NOT_A_MACRO(a,b)
+#define EIGEN_NOT_A_MACRO
+
 #ifdef EIGEN_DEFAULT_TO_ROW_MAJOR
-#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION RowMajor
+#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::RowMajor
 #else
-#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION ColMajor
+#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::ColMajor
 #endif
 
 #ifndef EIGEN_DEFAULT_DENSE_INDEX_TYPE
@@ -103,10 +343,130 @@
 #  define EIGEN_HAS_BUILTIN(x) 0
 #endif
 
+// A Clang feature extension to determine compiler features.
+// We use it to determine 'cxx_rvalue_references'
+#ifndef __has_feature
+# define __has_feature(x) 0
+#endif
+
+// Upperbound on the C++ version to use.
+// Expected values are 03, 11, 14, 17, etc.
+// By default, let's use an arbitrarily large C++ version.
+#ifndef EIGEN_MAX_CPP_VER
+#define EIGEN_MAX_CPP_VER 99
+#endif
+
+#if EIGEN_MAX_CPP_VER>=11 && (defined(__cplusplus) && (__cplusplus >= 201103L) || EIGEN_COMP_MSVC >= 1900)
+#define EIGEN_HAS_CXX11 1
+#else
+#define EIGEN_HAS_CXX11 0
+#endif
+
+
+// Do we support r-value references?
+#ifndef EIGEN_HAS_RVALUE_REFERENCES
+#if EIGEN_MAX_CPP_VER>=11 && \
+    (__has_feature(cxx_rvalue_references) || \
+    (defined(__cplusplus) && __cplusplus >= 201103L) || \
+    (EIGEN_COMP_MSVC >= 1600))
+  #define EIGEN_HAS_RVALUE_REFERENCES 1
+#else
+  #define EIGEN_HAS_RVALUE_REFERENCES 0
+#endif
+#endif
+
+// Does the compiler support C99?
+#ifndef EIGEN_HAS_C99_MATH
+#if EIGEN_MAX_CPP_VER>=11 && \
+    ((defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901))       \
+  || (defined(__GNUC__) && defined(_GLIBCXX_USE_C99)) \
+  || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER)))
+  #define EIGEN_HAS_C99_MATH 1
+#else
+  #define EIGEN_HAS_C99_MATH 0
+#endif
+#endif
+
+// Does the compiler support result_of?
+#ifndef EIGEN_HAS_STD_RESULT_OF
+#if EIGEN_MAX_CPP_VER>=11 && ((__has_feature(cxx_lambdas) || (defined(__cplusplus) && __cplusplus >= 201103L)))
+#define EIGEN_HAS_STD_RESULT_OF 1
+#else
+#define EIGEN_HAS_STD_RESULT_OF 0
+#endif
+#endif
+
+// Does the compiler support variadic templates?
+#ifndef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_MAX_CPP_VER>=11 && (__cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900) \
+  && ( !defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000) )
+    // ^^ Disable the use of variadic templates when compiling with versions of nvcc older than 8.0 on ARM devices:
+    //    this prevents nvcc from crashing when compiling Eigen on Tegra X1
+#define EIGEN_HAS_VARIADIC_TEMPLATES 1
+#else
+#define EIGEN_HAS_VARIADIC_TEMPLATES 0
+#endif
+#endif
+
+// Does the compiler fully support const expressions? (as in c++14)
+#ifndef EIGEN_HAS_CONSTEXPR
+
+#ifdef __CUDACC__
+// Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above
+#if EIGEN_MAX_CPP_VER>=14 && (__cplusplus > 199711L && defined(__CUDACC_VER__) && (EIGEN_COMP_CLANG || __CUDACC_VER__ >= 70500))
+  #define EIGEN_HAS_CONSTEXPR 1
+#endif
+#elif EIGEN_MAX_CPP_VER>=14 && (__has_feature(cxx_relaxed_constexpr) || (defined(__cplusplus) && __cplusplus >= 201402L) || \
+  (EIGEN_GNUC_AT_LEAST(4,8) && (__cplusplus > 199711L)))
+#define EIGEN_HAS_CONSTEXPR 1
+#endif
+
+#ifndef EIGEN_HAS_CONSTEXPR
+#define EIGEN_HAS_CONSTEXPR 0
+#endif
+
+#endif
+
+// Does the compiler support C++11 math?
+// Let's be conservative and enable the default C++11 implementation only if we are sure it exists
+#ifndef EIGEN_HAS_CXX11_MATH
+  #if EIGEN_MAX_CPP_VER>=11 && ((__cplusplus > 201103L) || (__cplusplus >= 201103L) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC || EIGEN_COMP_ICC)  \
+      && (EIGEN_ARCH_i386_OR_x86_64) && (EIGEN_OS_GNULINUX || EIGEN_OS_WIN_STRICT || EIGEN_OS_MAC))
+    #define EIGEN_HAS_CXX11_MATH 1
+  #else
+    #define EIGEN_HAS_CXX11_MATH 0
+  #endif
+#endif
+
+// Does the compiler support proper C++11 containers?
+#ifndef EIGEN_HAS_CXX11_CONTAINERS
+  #if    EIGEN_MAX_CPP_VER>=11 && \
+         ((__cplusplus > 201103L) \
+      || ((__cplusplus >= 201103L) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_ICC>=1400)) \
+      || EIGEN_COMP_MSVC >= 1900)
+    #define EIGEN_HAS_CXX11_CONTAINERS 1
+  #else
+    #define EIGEN_HAS_CXX11_CONTAINERS 0
+  #endif
+#endif
+
+// Does the compiler support C++11 noexcept?
+#ifndef EIGEN_HAS_CXX11_NOEXCEPT
+  #if    EIGEN_MAX_CPP_VER>=11 && \
+         (__has_feature(cxx_noexcept) \
+      || (__cplusplus > 201103L) \
+      || ((__cplusplus >= 201103L) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_ICC>=1400)) \
+      || EIGEN_COMP_MSVC >= 1900)
+    #define EIGEN_HAS_CXX11_NOEXCEPT 1
+  #else
+    #define EIGEN_HAS_CXX11_NOEXCEPT 0
+  #endif
+#endif
+
 /** Allows to disable some optimizations which might affect the accuracy of the result.
   * Such optimization are enabled by default, and set EIGEN_FAST_MATH to 0 to disable them.
   * They currently include:
-  *   - single precision Cwise::sin() and Cwise::cos() when SSE vectorization is enabled.
+  *   - single precision ArrayBase::sin() and ArrayBase::cos() for SSE and AVX vectorization.
   */
 #ifndef EIGEN_FAST_MATH
 #define EIGEN_FAST_MATH 1
@@ -118,6 +478,8 @@
 #define EIGEN_CAT2(a,b) a ## b
 #define EIGEN_CAT(a,b) EIGEN_CAT2(a,b)
 
+#define EIGEN_COMMA ,
+
 // convert a token to a string
 #define EIGEN_MAKESTRING2(a) #a
 #define EIGEN_MAKESTRING(a) EIGEN_MAKESTRING2(a)
@@ -125,7 +487,7 @@
 // EIGEN_STRONG_INLINE is a stronger version of the inline, using __forceinline on MSVC,
 // but it still doesn't use GCC's always_inline. This is useful in (common) situations where MSVC needs forceinline
 // but GCC is still doing fine with just inline.
-#if (defined _MSC_VER) || (defined __INTEL_COMPILER)
+#if EIGEN_COMP_MSVC || EIGEN_COMP_ICC
 #define EIGEN_STRONG_INLINE __forceinline
 #else
 #define EIGEN_STRONG_INLINE inline
@@ -135,24 +497,25 @@
 // attribute to maximize inlining. This should only be used when really necessary: in particular,
 // it uses __attribute__((always_inline)) on GCC, which most of the time is useless and can severely harm compile times.
 // FIXME with the always_inline attribute,
-// gcc 3.4.x reports the following compilation error:
+// gcc 3.4.x and 4.1 reports the following compilation error:
 //   Eval.h:91: sorry, unimplemented: inlining failed in call to 'const Eigen::Eval<Derived> Eigen::MatrixBase<Scalar, Derived>::eval() const'
 //    : function body not available
-#if EIGEN_GNUC_AT_LEAST(4,0)
+//   See also bug 1367
+#if EIGEN_GNUC_AT_LEAST(4,2)
 #define EIGEN_ALWAYS_INLINE __attribute__((always_inline)) inline
 #else
 #define EIGEN_ALWAYS_INLINE EIGEN_STRONG_INLINE
 #endif
 
-#if (defined __GNUC__)
+#if EIGEN_COMP_GNUC
 #define EIGEN_DONT_INLINE __attribute__((noinline))
-#elif (defined _MSC_VER)
+#elif EIGEN_COMP_MSVC
 #define EIGEN_DONT_INLINE __declspec(noinline)
 #else
 #define EIGEN_DONT_INLINE
 #endif
 
-#if (defined __GNUC__)
+#if EIGEN_COMP_GNUC
 #define EIGEN_PERMISSIVE_EXPR __extension__
 #else
 #define EIGEN_PERMISSIVE_EXPR
@@ -221,15 +584,15 @@
 #endif
 
 #ifdef EIGEN_NO_DEBUG
-#define EIGEN_ONLY_USED_FOR_DEBUG(x) (void)x
+#define EIGEN_ONLY_USED_FOR_DEBUG(x) EIGEN_UNUSED_VARIABLE(x)
 #else
 #define EIGEN_ONLY_USED_FOR_DEBUG(x)
 #endif
 
 #ifndef EIGEN_NO_DEPRECATED_WARNING
-  #if (defined __GNUC__)
+  #if EIGEN_COMP_GNUC
     #define EIGEN_DEPRECATED __attribute__((deprecated))
-  #elif (defined _MSC_VER)
+  #elif EIGEN_COMP_MSVC
     #define EIGEN_DEPRECATED __declspec(deprecated)
   #else
     #define EIGEN_DEPRECATED
@@ -238,7 +601,7 @@
   #define EIGEN_DEPRECATED
 #endif
 
-#if (defined __GNUC__)
+#if EIGEN_COMP_GNUC
 #define EIGEN_UNUSED __attribute__((unused))
 #else
 #define EIGEN_UNUSED
@@ -247,19 +610,33 @@
 // Suppresses 'unused variable' warnings.
 namespace Eigen {
   namespace internal {
-    template<typename T> void ignore_unused_variable(const T&) {}
+    template<typename T> EIGEN_DEVICE_FUNC void ignore_unused_variable(const T&) {}
   }
 }
 #define EIGEN_UNUSED_VARIABLE(var) Eigen::internal::ignore_unused_variable(var);
 
 #if !defined(EIGEN_ASM_COMMENT)
-  #if (defined __GNUC__) && ( defined(__i386__) || defined(__x86_64__) )
+  #if EIGEN_COMP_GNUC && (EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64)
     #define EIGEN_ASM_COMMENT(X)  __asm__("#" X)
   #else
     #define EIGEN_ASM_COMMENT(X)
   #endif
 #endif
 
+
+//------------------------------------------------------------------------------------------
+// Static and dynamic alignment control
+//
+// The main purpose of this section is to define EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES
+// as the maximal boundary in bytes on which dynamically and statically allocated data may be alignment respectively.
+// The values of EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES can be specified by the user. If not,
+// a default value is automatically computed based on architecture, compiler, and OS.
+//
+// This section also defines macros EIGEN_ALIGN_TO_BOUNDARY(N) and the shortcuts EIGEN_ALIGN{8,16,32,_MAX}
+// to be used to declare statically aligned buffers.
+//------------------------------------------------------------------------------------------
+
+
 /* EIGEN_ALIGN_TO_BOUNDARY(n) forces data to be n-byte aligned. This is used to satisfy SIMD requirements.
  * However, we do that EVEN if vectorization (EIGEN_VECTORIZE) is disabled,
  * so that vectorization doesn't affect binary compatibility.
@@ -267,28 +644,149 @@ namespace Eigen {
  * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link
  * vectorized and non-vectorized code.
  */
-#if (defined __GNUC__) || (defined __PGI) || (defined __IBMCPP__) || (defined __ARMCC_VERSION)
+#if (defined __CUDACC__)
+  #define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n)
+#elif EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM
   #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
-#elif (defined _MSC_VER)
+#elif EIGEN_COMP_MSVC
   #define EIGEN_ALIGN_TO_BOUNDARY(n) __declspec(align(n))
-#elif (defined __SUNPRO_CC)
+#elif EIGEN_COMP_SUNCC
   // FIXME not sure about this one:
   #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
 #else
   #error Please tell me what is the equivalent of __attribute__((aligned(n))) for your compiler
 #endif
 
+// If the user explicitly disable vectorization, then we also disable alignment
+#if defined(EIGEN_DONT_VECTORIZE)
+  #define EIGEN_IDEAL_MAX_ALIGN_BYTES 0
+#elif defined(EIGEN_VECTORIZE_AVX512)
+  // 64 bytes static alignmeent is preferred only if really required
+  #define EIGEN_IDEAL_MAX_ALIGN_BYTES 64
+#elif defined(__AVX__)
+  // 32 bytes static alignmeent is preferred only if really required
+  #define EIGEN_IDEAL_MAX_ALIGN_BYTES 32
+#else
+  #define EIGEN_IDEAL_MAX_ALIGN_BYTES 16
+#endif
+
+
+// EIGEN_MIN_ALIGN_BYTES defines the minimal value for which the notion of explicit alignment makes sense
+#define EIGEN_MIN_ALIGN_BYTES 16
+
+// Defined the boundary (in bytes) on which the data needs to be aligned. Note
+// that unless EIGEN_ALIGN is defined and not equal to 0, the data may not be
+// aligned at all regardless of the value of this #define.
+
+#if (defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN))  && defined(EIGEN_MAX_STATIC_ALIGN_BYTES) && EIGEN_MAX_STATIC_ALIGN_BYTES>0
+#error EIGEN_MAX_STATIC_ALIGN_BYTES and EIGEN_DONT_ALIGN[_STATICALLY] are both defined with EIGEN_MAX_STATIC_ALIGN_BYTES!=0. Use EIGEN_MAX_STATIC_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN_STATICALLY.
+#endif
+
+// EIGEN_DONT_ALIGN_STATICALLY and EIGEN_DONT_ALIGN are deprectated
+// They imply EIGEN_MAX_STATIC_ALIGN_BYTES=0
+#if defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN)
+  #ifdef EIGEN_MAX_STATIC_ALIGN_BYTES
+    #undef EIGEN_MAX_STATIC_ALIGN_BYTES
+  #endif
+  #define EIGEN_MAX_STATIC_ALIGN_BYTES 0
+#endif
+
+#ifndef EIGEN_MAX_STATIC_ALIGN_BYTES
+
+  // Try to automatically guess what is the best default value for EIGEN_MAX_STATIC_ALIGN_BYTES
+
+  // 16 byte alignment is only useful for vectorization. Since it affects the ABI, we need to enable
+  // 16 byte alignment on all platforms where vectorization might be enabled. In theory we could always
+  // enable alignment, but it can be a cause of problems on some platforms, so we just disable it in
+  // certain common platform (compiler+architecture combinations) to avoid these problems.
+  // Only static alignment is really problematic (relies on nonstandard compiler extensions),
+  // try to keep heap alignment even when we have to disable static alignment.
+  #if EIGEN_COMP_GNUC && !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64)
+  #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
+  #elif EIGEN_ARCH_ARM_OR_ARM64 && EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_MOST(4, 6)
+  // Old versions of GCC on ARM, at least 4.4, were once seen to have buggy static alignment support.
+  // Not sure which version fixed it, hopefully it doesn't affect 4.7, which is still somewhat in use.
+  // 4.8 and newer seem definitely unaffected.
+  #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
+  #else
+  #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0
+  #endif
+
+  // static alignment is completely disabled with GCC 3, Sun Studio, and QCC/QNX
+  #if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT \
+  && !EIGEN_GCC3_OR_OLDER \
+  && !EIGEN_COMP_SUNCC \
+  && !EIGEN_OS_QNX
+    #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 1
+  #else
+    #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0
+  #endif
+
+  #if EIGEN_ARCH_WANTS_STACK_ALIGNMENT
+    #define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
+  #else
+    #define EIGEN_MAX_STATIC_ALIGN_BYTES 0
+  #endif
+
+#endif
+
+// If EIGEN_MAX_ALIGN_BYTES is defined, then it is considered as an upper bound for EIGEN_MAX_ALIGN_BYTES
+#if defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES<EIGEN_MAX_STATIC_ALIGN_BYTES
+#undef EIGEN_MAX_STATIC_ALIGN_BYTES
+#define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES
+#endif
+
+#if EIGEN_MAX_STATIC_ALIGN_BYTES==0 && !defined(EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT)
+  #define EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT
+#endif
+
+// At this stage, EIGEN_MAX_STATIC_ALIGN_BYTES>0 is the true test whether we want to align arrays on the stack or not.
+// It takes into account both the user choice to explicitly enable/disable alignment (by settting EIGEN_MAX_STATIC_ALIGN_BYTES)
+// and the architecture config (EIGEN_ARCH_WANTS_STACK_ALIGNMENT).
+// Henceforth, only EIGEN_MAX_STATIC_ALIGN_BYTES should be used.
+
+
+// Shortcuts to EIGEN_ALIGN_TO_BOUNDARY
 #define EIGEN_ALIGN8  EIGEN_ALIGN_TO_BOUNDARY(8)
 #define EIGEN_ALIGN16 EIGEN_ALIGN_TO_BOUNDARY(16)
+#define EIGEN_ALIGN32 EIGEN_ALIGN_TO_BOUNDARY(32)
+#define EIGEN_ALIGN64 EIGEN_ALIGN_TO_BOUNDARY(64)
+#if EIGEN_MAX_STATIC_ALIGN_BYTES>0
+#define EIGEN_ALIGN_MAX EIGEN_ALIGN_TO_BOUNDARY(EIGEN_MAX_STATIC_ALIGN_BYTES)
+#else
+#define EIGEN_ALIGN_MAX
+#endif
+
 
-#if EIGEN_ALIGN_STATICALLY
-#define EIGEN_USER_ALIGN_TO_BOUNDARY(n) EIGEN_ALIGN_TO_BOUNDARY(n)
-#define EIGEN_USER_ALIGN16 EIGEN_ALIGN16
+// Dynamic alignment control
+
+#if defined(EIGEN_DONT_ALIGN) && defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES>0
+#error EIGEN_MAX_ALIGN_BYTES and EIGEN_DONT_ALIGN are both defined with EIGEN_MAX_ALIGN_BYTES!=0. Use EIGEN_MAX_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN.
+#endif
+
+#ifdef EIGEN_DONT_ALIGN
+  #ifdef EIGEN_MAX_ALIGN_BYTES
+    #undef EIGEN_MAX_ALIGN_BYTES
+  #endif
+  #define EIGEN_MAX_ALIGN_BYTES 0
+#elif !defined(EIGEN_MAX_ALIGN_BYTES)
+  #define EIGEN_MAX_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
+#endif
+
+#if EIGEN_IDEAL_MAX_ALIGN_BYTES > EIGEN_MAX_ALIGN_BYTES
+#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
 #else
-#define EIGEN_USER_ALIGN_TO_BOUNDARY(n)
-#define EIGEN_USER_ALIGN16
+#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES
+#endif
+
+
+#ifndef EIGEN_UNALIGNED_VECTORIZE
+#define EIGEN_UNALIGNED_VECTORIZE 1
 #endif
 
+//----------------------------------------------------------------------
+
+
 #ifdef EIGEN_DONT_USE_RESTRICT_KEYWORD
   #define EIGEN_RESTRICT
 #endif
@@ -314,25 +812,26 @@ namespace Eigen {
 // just an empty macro !
 #define EIGEN_EMPTY
 
-#if defined(_MSC_VER) && (_MSC_VER < 1800) && (!defined(__INTEL_COMPILER))
-#define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
-  using Base::operator =;
-#elif defined(__clang__) // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653)
-#define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
-  using Base::operator =; \
-  EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) { Base::operator=(other); return *this; } \
-  template <typename OtherDerived> \
-  EIGEN_STRONG_INLINE Derived& operator=(const DenseBase<OtherDerived>& other) { Base::operator=(other.derived()); return *this; }
-#else
-#define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
-  using Base::operator =; \
-  EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) \
-  { \
-    Base::operator=(other); \
-    return *this; \
-  }
+#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 ||  defined(__CUDACC_VER__)) // for older MSVC versions, as well as 1900 && CUDA 8, using the base operator is sufficient (cf Bugs 1000, 1324)
+  #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
+    using Base::operator =;
+#elif EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653)
+  #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
+    using Base::operator =; \
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) { Base::operator=(other); return *this; } \
+    template <typename OtherDerived> \
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const DenseBase<OtherDerived>& other) { Base::operator=(other.derived()); return *this; }
+#else
+  #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
+    using Base::operator =; \
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) \
+    { \
+      Base::operator=(other); \
+      return *this; \
+    }
 #endif
 
+
 /** \internal
  * \brief Macro to manually inherit assignment operators.
  * This is necessary, because the implicitly defined assignment operator gets deleted when a custom operator= is defined.
@@ -351,32 +850,12 @@ namespace Eigen {
   typedef typename Eigen::internal::traits<Derived>::Scalar Scalar; /*!< \brief Numeric type, e.g. float, double, int or std::complex<float>. */ \
   typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; /*!< \brief The underlying numeric type for composed scalar types. \details In cases where Scalar is e.g. std::complex<T>, T were corresponding to RealScalar. */ \
   typedef typename Base::CoeffReturnType CoeffReturnType; /*!< \brief The return type for coefficient access. \details Depending on whether the object allows direct coefficient access (e.g. for a MatrixXd), this type is either 'const Scalar&' or simply 'Scalar' for objects that do not allow direct coefficient access. */ \
-  typedef typename Eigen::internal::nested<Derived>::type Nested; \
+  typedef typename Eigen::internal::ref_selector<Derived>::type Nested; \
   typedef typename Eigen::internal::traits<Derived>::StorageKind StorageKind; \
-  typedef typename Eigen::internal::traits<Derived>::Index Index; \
+  typedef typename Eigen::internal::traits<Derived>::StorageIndex StorageIndex; \
   enum { RowsAtCompileTime = Eigen::internal::traits<Derived>::RowsAtCompileTime, \
         ColsAtCompileTime = Eigen::internal::traits<Derived>::ColsAtCompileTime, \
         Flags = Eigen::internal::traits<Derived>::Flags, \
-        CoeffReadCost = Eigen::internal::traits<Derived>::CoeffReadCost, \
-        SizeAtCompileTime = Base::SizeAtCompileTime, \
-        MaxSizeAtCompileTime = Base::MaxSizeAtCompileTime, \
-        IsVectorAtCompileTime = Base::IsVectorAtCompileTime };
-
-
-#define EIGEN_DENSE_PUBLIC_INTERFACE(Derived) \
-  typedef typename Eigen::internal::traits<Derived>::Scalar Scalar; /*!< \brief Numeric type, e.g. float, double, int or std::complex<float>. */ \
-  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; /*!< \brief The underlying numeric type for composed scalar types. \details In cases where Scalar is e.g. std::complex<T>, T were corresponding to RealScalar. */ \
-  typedef typename Base::PacketScalar PacketScalar; \
-  typedef typename Base::CoeffReturnType CoeffReturnType; /*!< \brief The return type for coefficient access. \details Depending on whether the object allows direct coefficient access (e.g. for a MatrixXd), this type is either 'const Scalar&' or simply 'Scalar' for objects that do not allow direct coefficient access. */ \
-  typedef typename Eigen::internal::nested<Derived>::type Nested; \
-  typedef typename Eigen::internal::traits<Derived>::StorageKind StorageKind; \
-  typedef typename Eigen::internal::traits<Derived>::Index Index; \
-  enum { RowsAtCompileTime = Eigen::internal::traits<Derived>::RowsAtCompileTime, \
-        ColsAtCompileTime = Eigen::internal::traits<Derived>::ColsAtCompileTime, \
-        MaxRowsAtCompileTime = Eigen::internal::traits<Derived>::MaxRowsAtCompileTime, \
-        MaxColsAtCompileTime = Eigen::internal::traits<Derived>::MaxColsAtCompileTime, \
-        Flags = Eigen::internal::traits<Derived>::Flags, \
-        CoeffReadCost = Eigen::internal::traits<Derived>::CoeffReadCost, \
         SizeAtCompileTime = Base::SizeAtCompileTime, \
         MaxSizeAtCompileTime = Base::MaxSizeAtCompileTime, \
         IsVectorAtCompileTime = Base::IsVectorAtCompileTime }; \
@@ -384,6 +863,12 @@ namespace Eigen {
   using Base::const_cast_derived;
 
 
+// FIXME Maybe the EIGEN_DENSE_PUBLIC_INTERFACE could be removed as importing PacketScalar is rarely needed
+#define EIGEN_DENSE_PUBLIC_INTERFACE(Derived) \
+  EIGEN_GENERIC_PUBLIC_INTERFACE(Derived) \
+  typedef typename Base::PacketScalar PacketScalar;
+
+
 #define EIGEN_PLAIN_ENUM_MIN(a,b) (((int)a <= (int)b) ? (int)a : (int)b)
 #define EIGEN_PLAIN_ENUM_MAX(a,b) (((int)a >= (int)b) ? (int)a : (int)b)
 
@@ -413,18 +898,10 @@ namespace Eigen {
 
 #define EIGEN_IMPLIES(a,b) (!(a) || (b))
 
-#define EIGEN_MAKE_CWISE_BINARY_OP(METHOD,FUNCTOR) \
-  template<typename OtherDerived> \
-  EIGEN_STRONG_INLINE const CwiseBinaryOp<FUNCTOR<Scalar>, const Derived, const OtherDerived> \
-  (METHOD)(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const \
-  { \
-    return CwiseBinaryOp<FUNCTOR<Scalar>, const Derived, const OtherDerived>(derived(), other.derived()); \
-  }
-
-// the expression type of a cwise product
-#define EIGEN_CWISE_PRODUCT_RETURN_TYPE(LHS,RHS) \
+// the expression type of a standard coefficient wise binary operation
+#define EIGEN_CWISE_BINARY_RETURN_TYPE(LHS,RHS,OPNAME) \
     CwiseBinaryOp< \
-      internal::scalar_product_op< \
+      EIGEN_CAT(EIGEN_CAT(internal::scalar_,OPNAME),_op)< \
           typename internal::traits<LHS>::Scalar, \
           typename internal::traits<RHS>::Scalar \
       >, \
@@ -432,4 +909,84 @@ namespace Eigen {
       const RHS \
     >
 
+#define EIGEN_MAKE_CWISE_BINARY_OP(METHOD,OPNAME) \
+  template<typename OtherDerived> \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const EIGEN_CWISE_BINARY_RETURN_TYPE(Derived,OtherDerived,OPNAME) \
+  (METHOD)(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const \
+  { \
+    return EIGEN_CWISE_BINARY_RETURN_TYPE(Derived,OtherDerived,OPNAME)(derived(), other.derived()); \
+  }
+
+#define EIGEN_SCALAR_BINARY_SUPPORTED(OPNAME,TYPEA,TYPEB) \
+  (Eigen::internal::has_ReturnType<Eigen::ScalarBinaryOpTraits<TYPEA,TYPEB,EIGEN_CAT(EIGEN_CAT(Eigen::internal::scalar_,OPNAME),_op)<TYPEA,TYPEB>  > >::value)
+
+#define EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(EXPR,SCALAR,OPNAME) \
+  CwiseBinaryOp<EIGEN_CAT(EIGEN_CAT(internal::scalar_,OPNAME),_op)<typename internal::traits<EXPR>::Scalar,SCALAR>, const EXPR, \
+                const typename internal::plain_constant_type<EXPR,SCALAR>::type>
+
+#define EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(SCALAR,EXPR,OPNAME) \
+  CwiseBinaryOp<EIGEN_CAT(EIGEN_CAT(internal::scalar_,OPNAME),_op)<SCALAR,typename internal::traits<EXPR>::Scalar>, \
+                const typename internal::plain_constant_type<EXPR,SCALAR>::type, const EXPR>
+
+// Workaround for MSVC 2010 (see ML thread "patch with compile for for MSVC 2010")
+#if EIGEN_COMP_MSVC_STRICT<=1600
+#define EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(X) typename internal::enable_if<true,X>::type
+#else
+#define EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(X) X
+#endif
+
+#define EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT(METHOD,OPNAME) \
+  template <typename T> EIGEN_DEVICE_FUNC inline \
+  EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename internal::promote_scalar_arg<Scalar EIGEN_COMMA T EIGEN_COMMA EIGEN_SCALAR_BINARY_SUPPORTED(OPNAME,Scalar,T)>::type,OPNAME))\
+  (METHOD)(const T& scalar) const { \
+    typedef typename internal::promote_scalar_arg<Scalar,T,EIGEN_SCALAR_BINARY_SUPPORTED(OPNAME,Scalar,T)>::type PromotedT; \
+    return EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,PromotedT,OPNAME)(derived(), \
+           typename internal::plain_constant_type<Derived,PromotedT>::type(derived().rows(), derived().cols(), internal::scalar_constant_op<PromotedT>(scalar))); \
+  }
+
+#define EIGEN_MAKE_SCALAR_BINARY_OP_ONTHELEFT(METHOD,OPNAME) \
+  template <typename T> EIGEN_DEVICE_FUNC inline friend \
+  EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename internal::promote_scalar_arg<Scalar EIGEN_COMMA T EIGEN_COMMA EIGEN_SCALAR_BINARY_SUPPORTED(OPNAME,T,Scalar)>::type,Derived,OPNAME)) \
+  (METHOD)(const T& scalar, const StorageBaseType& matrix) { \
+    typedef typename internal::promote_scalar_arg<Scalar,T,EIGEN_SCALAR_BINARY_SUPPORTED(OPNAME,T,Scalar)>::type PromotedT; \
+    return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(PromotedT,Derived,OPNAME)( \
+           typename internal::plain_constant_type<Derived,PromotedT>::type(matrix.derived().rows(), matrix.derived().cols(), internal::scalar_constant_op<PromotedT>(scalar)), matrix.derived()); \
+  }
+
+#define EIGEN_MAKE_SCALAR_BINARY_OP(METHOD,OPNAME) \
+  EIGEN_MAKE_SCALAR_BINARY_OP_ONTHELEFT(METHOD,OPNAME) \
+  EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT(METHOD,OPNAME)
+
+
+#ifdef EIGEN_EXCEPTIONS
+#  define EIGEN_THROW_X(X) throw X
+#  define EIGEN_THROW throw
+#  define EIGEN_TRY try
+#  define EIGEN_CATCH(X) catch (X)
+#else
+#  ifdef __CUDA_ARCH__
+#    define EIGEN_THROW_X(X) asm("trap;")
+#    define EIGEN_THROW asm("trap;")
+#  else
+#    define EIGEN_THROW_X(X) std::abort()
+#    define EIGEN_THROW std::abort()
+#  endif
+#  define EIGEN_TRY if (true)
+#  define EIGEN_CATCH(X) else
+#endif
+
+
+#if EIGEN_HAS_CXX11_NOEXCEPT
+#   define EIGEN_INCLUDE_TYPE_TRAITS
+#   define EIGEN_NOEXCEPT noexcept
+#   define EIGEN_NOEXCEPT_IF(x) noexcept(x)
+#   define EIGEN_NO_THROW noexcept(true)
+#   define EIGEN_EXCEPTION_SPEC(X) noexcept(false)
+#else
+#   define EIGEN_NOEXCEPT
+#   define EIGEN_NOEXCEPT_IF(x)
+#   define EIGEN_NO_THROW throw()
+#   define EIGEN_EXCEPTION_SPEC(X) throw(X)
+#endif
+
 #endif // EIGEN_MACROS_H
diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index 779640237..c634d7ea0 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -1,11 +1,12 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2008-2009 Benoit Jacob <jacob.benoit.1@gmail.com>
 // Copyright (C) 2009 Kenneth Riddile <kfriddile@yahoo.com>
 // Copyright (C) 2010 Hauke Heibel <hauke.heibel@gmail.com>
 // Copyright (C) 2010 Thomas Capricelli <orzel@freehackers.org>
+// Copyright (C) 2013 Pavel Holoborodko <pavel@holoborodko.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -31,7 +32,7 @@
 // page 114, "[The] LP64 model [...] is used by all 64-bit UNIX ports" so it's indeed
 // quite safe, at least within the context of glibc, to equate 64-bit with LP64.
 #if defined(__GLIBC__) && ((__GLIBC__>=2 && __GLIBC_MINOR__ >= 8) || __GLIBC__>2) \
- && defined(__LP64__) && ! defined( __SANITIZE_ADDRESS__ )
+ && defined(__LP64__) && ! defined( __SANITIZE_ADDRESS__ ) && (EIGEN_DEFAULT_ALIGN_BYTES == 16)
   #define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 1
 #else
   #define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 0
@@ -41,15 +42,15 @@
 //   See http://svn.freebsd.org/viewvc/base/stable/6/lib/libc/stdlib/malloc.c?view=markup
 // FreeBSD 7 seems to have 16-byte aligned malloc except on ARM and MIPS architectures
 //   See http://svn.freebsd.org/viewvc/base/stable/7/lib/libc/stdlib/malloc.c?view=markup
-#if defined(__FreeBSD__) && !defined(__arm__) && !defined(__mips__)
+#if defined(__FreeBSD__) && !(EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) && (EIGEN_DEFAULT_ALIGN_BYTES == 16)
   #define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 1
 #else
   #define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 0
 #endif
 
-#if defined(__APPLE__) \
- || defined(_WIN64) \
- || EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED \
+#if (EIGEN_OS_MAC && (EIGEN_DEFAULT_ALIGN_BYTES == 16))     \
+ || (EIGEN_OS_WIN64 && (EIGEN_DEFAULT_ALIGN_BYTES == 16))   \
+ || EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED              \
  || EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED
   #define EIGEN_MALLOC_ALREADY_ALIGNED 1
 #else
@@ -58,36 +59,17 @@
 
 #endif
 
-// See bug 554 (http://eigen.tuxfamily.org/bz/show_bug.cgi?id=554)
-// It seems to be unsafe to check _POSIX_ADVISORY_INFO without including unistd.h first.
-// Currently, let's include it only on unix systems:
-#if defined(__unix__) || defined(__unix)
-  #include <unistd.h>
-  #if ((defined __QNXNTO__) || (defined _GNU_SOURCE) || (defined __PGI) || ((defined _XOPEN_SOURCE) && (_XOPEN_SOURCE >= 600))) && (defined _POSIX_ADVISORY_INFO) && (_POSIX_ADVISORY_INFO > 0) && !defined(EIGEN_ANDROID_POSIX_MEMALIGN_WR)
-    #define EIGEN_HAS_POSIX_MEMALIGN 1
-  #endif
-#endif
-
-#ifndef EIGEN_HAS_POSIX_MEMALIGN
-  #define EIGEN_HAS_POSIX_MEMALIGN 0
-#endif
-
-#if defined(EIGEN_VECTORIZE_SSE) && !defined(EIGEN_ANDROID_POSIX_MEMALIGN_WR)
-  #define EIGEN_HAS_MM_MALLOC 1
-#else
-  #define EIGEN_HAS_MM_MALLOC 0
-#endif
-
 namespace Eigen {
 
 namespace internal {
 
+EIGEN_DEVICE_FUNC 
 inline void throw_std_bad_alloc()
 {
   #ifdef EIGEN_EXCEPTIONS
     throw std::bad_alloc();
   #else
-    std::size_t huge = -1;
+    std::size_t huge = static_cast<std::size_t>(-1);
     new int[huge];
   #endif
 }
@@ -103,9 +85,9 @@ inline void throw_std_bad_alloc()
   */
 inline void* handmade_aligned_malloc(std::size_t size)
 {
-  void *original = std::malloc(size+16);
+  void *original = std::malloc(size+EIGEN_DEFAULT_ALIGN_BYTES);
   if (original == 0) return 0;
-  void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(15))) + 16);
+  void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1))) + EIGEN_DEFAULT_ALIGN_BYTES);
   *(reinterpret_cast<void**>(aligned) - 1) = original;
   return aligned;
 }
@@ -118,7 +100,7 @@ inline void handmade_aligned_free(void *ptr)
 
 /** \internal
   * \brief Reallocates aligned memory.
-  * Since we know that our handmade version is based on std::realloc
+  * Since we know that our handmade version is based on std::malloc
   * we can use std::realloc to implement efficient reallocation.
   */
 inline void* handmade_aligned_realloc(void* ptr, std::size_t size, std::size_t = 0)
@@ -126,9 +108,9 @@ inline void* handmade_aligned_realloc(void* ptr, std::size_t size, std::size_t =
   if (ptr == 0) return handmade_aligned_malloc(size);
   void *original = *(reinterpret_cast<void**>(ptr) - 1);
   std::ptrdiff_t previous_offset = static_cast<char *>(ptr)-static_cast<char *>(original);
-  original = std::realloc(original,size+16);
+  original = std::realloc(original,size+EIGEN_DEFAULT_ALIGN_BYTES);
   if (original == 0) return 0;
-  void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(15))) + 16);
+  void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1))) + EIGEN_DEFAULT_ALIGN_BYTES);
   void *previous_aligned = static_cast<char *>(original)+previous_offset;
   if(aligned!=previous_aligned)
     std::memmove(aligned, previous_aligned, size);
@@ -138,92 +120,46 @@ inline void* handmade_aligned_realloc(void* ptr, std::size_t size, std::size_t =
 }
 
 /*****************************************************************************
-*** Implementation of generic aligned realloc (when no realloc can be used)***
-*****************************************************************************/
-
-void* aligned_malloc(std::size_t size);
-void  aligned_free(void *ptr);
-
-/** \internal
-  * \brief Reallocates aligned memory.
-  * Allows reallocation with aligned ptr types. This implementation will
-  * always create a new memory chunk and copy the old data.
-  */
-inline void* generic_aligned_realloc(void* ptr, size_t size, size_t old_size)
-{
-  if (ptr==0)
-    return aligned_malloc(size);
-
-  if (size==0)
-  {
-    aligned_free(ptr);
-    return 0;
-  }
-
-  void* newptr = aligned_malloc(size);
-  if (newptr == 0)
-  {
-    #ifdef EIGEN_HAS_ERRNO
-    errno = ENOMEM; // according to the standard
-    #endif
-    return 0;
-  }
-
-  if (ptr != 0)
-  {
-    std::memcpy(newptr, ptr, (std::min)(size,old_size));
-    aligned_free(ptr);
-  }
-
-  return newptr;
-}
-
-/*****************************************************************************
 *** Implementation of portable aligned versions of malloc/free/realloc     ***
 *****************************************************************************/
 
 #ifdef EIGEN_NO_MALLOC
-inline void check_that_malloc_is_allowed()
+EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed()
 {
   eigen_assert(false && "heap allocation is forbidden (EIGEN_NO_MALLOC is defined)");
 }
 #elif defined EIGEN_RUNTIME_NO_MALLOC
-inline bool is_malloc_allowed_impl(bool update, bool new_value = false)
+EIGEN_DEVICE_FUNC inline bool is_malloc_allowed_impl(bool update, bool new_value = false)
 {
   static bool value = true;
   if (update == 1)
     value = new_value;
   return value;
 }
-inline bool is_malloc_allowed() { return is_malloc_allowed_impl(false); }
-inline bool set_is_malloc_allowed(bool new_value) { return is_malloc_allowed_impl(true, new_value); }
-inline void check_that_malloc_is_allowed()
+EIGEN_DEVICE_FUNC inline bool is_malloc_allowed() { return is_malloc_allowed_impl(false); }
+EIGEN_DEVICE_FUNC inline bool set_is_malloc_allowed(bool new_value) { return is_malloc_allowed_impl(true, new_value); }
+EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed()
 {
   eigen_assert(is_malloc_allowed() && "heap allocation is forbidden (EIGEN_RUNTIME_NO_MALLOC is defined and g_is_malloc_allowed is false)");
 }
 #else 
-inline void check_that_malloc_is_allowed()
+EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed()
 {}
 #endif
 
-/** \internal Allocates \a size bytes. The returned pointer is guaranteed to have 16 bytes alignment.
+/** \internal Allocates \a size bytes. The returned pointer is guaranteed to have 16 or 32 bytes alignment depending on the requirements.
   * On allocation error, the returned pointer is null, and std::bad_alloc is thrown.
   */
-inline void* aligned_malloc(size_t size)
+EIGEN_DEVICE_FUNC inline void* aligned_malloc(std::size_t size)
 {
   check_that_malloc_is_allowed();
 
   void *result;
-  #if !EIGEN_ALIGN
+  #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED
     result = std::malloc(size);
-  #elif EIGEN_MALLOC_ALREADY_ALIGNED
-    result = std::malloc(size);
-  #elif EIGEN_HAS_POSIX_MEMALIGN
-    if(posix_memalign(&result, 16, size)) result = 0;
-  #elif EIGEN_HAS_MM_MALLOC
-    result = _mm_malloc(size, 16);
-  #elif defined(_MSC_VER) && (!defined(_WIN32_WCE))
-    result = _aligned_malloc(size, 16);
+    #if EIGEN_DEFAULT_ALIGN_BYTES==16
+    eigen_assert((size<16 || (std::size_t(result)%16)==0) && "System's malloc returned an unaligned pointer. Compile with EIGEN_MALLOC_ALREADY_ALIGNED=0 to fallback to handmade alignd memory allocator.");
+    #endif
   #else
     result = handmade_aligned_malloc(size);
   #endif
@@ -235,50 +171,27 @@ inline void* aligned_malloc(size_t size)
 }
 
 /** \internal Frees memory allocated with aligned_malloc. */
-inline void aligned_free(void *ptr)
+EIGEN_DEVICE_FUNC inline void aligned_free(void *ptr)
 {
-  #if !EIGEN_ALIGN
-    std::free(ptr);
-  #elif EIGEN_MALLOC_ALREADY_ALIGNED
+  #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED
     std::free(ptr);
-  #elif EIGEN_HAS_POSIX_MEMALIGN
-    std::free(ptr);
-  #elif EIGEN_HAS_MM_MALLOC
-    _mm_free(ptr);
-  #elif defined(_MSC_VER) && (!defined(_WIN32_WCE))
-    _aligned_free(ptr);
   #else
     handmade_aligned_free(ptr);
   #endif
 }
 
 /**
-* \internal
-* \brief Reallocates an aligned block of memory.
-* \throws std::bad_alloc on allocation failure
-**/
-inline void* aligned_realloc(void *ptr, size_t new_size, size_t old_size)
+  * \internal
+  * \brief Reallocates an aligned block of memory.
+  * \throws std::bad_alloc on allocation failure
+  */
+inline void* aligned_realloc(void *ptr, std::size_t new_size, std::size_t old_size)
 {
   EIGEN_UNUSED_VARIABLE(old_size);
 
   void *result;
-#if !EIGEN_ALIGN
+#if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED
   result = std::realloc(ptr,new_size);
-#elif EIGEN_MALLOC_ALREADY_ALIGNED
-  result = std::realloc(ptr,new_size);
-#elif EIGEN_HAS_POSIX_MEMALIGN
-  result = generic_aligned_realloc(ptr,new_size,old_size);
-#elif EIGEN_HAS_MM_MALLOC
-  // The defined(_mm_free) is just here to verify that this MSVC version
-  // implements _mm_malloc/_mm_free based on the corresponding _aligned_
-  // functions. This may not always be the case and we just try to be safe.
-  #if defined(_MSC_VER) && (!defined(_WIN32_WCE)) && defined(_mm_free)
-    result = _aligned_realloc(ptr,new_size,16);
-  #else
-    result = generic_aligned_realloc(ptr,new_size,old_size);
-  #endif
-#elif defined(_MSC_VER) && (!defined(_WIN32_WCE))
-  result = _aligned_realloc(ptr,new_size,16);
 #else
   result = handmade_aligned_realloc(ptr,new_size,old_size);
 #endif
@@ -296,12 +209,12 @@ inline void* aligned_realloc(void *ptr, size_t new_size, size_t old_size)
 /** \internal Allocates \a size bytes. If Align is true, then the returned ptr is 16-byte-aligned.
   * On allocation error, the returned pointer is null, and a std::bad_alloc is thrown.
   */
-template<bool Align> inline void* conditional_aligned_malloc(size_t size)
+template<bool Align> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc(std::size_t size)
 {
   return aligned_malloc(size);
 }
 
-template<> inline void* conditional_aligned_malloc<false>(size_t size)
+template<> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc<false>(std::size_t size)
 {
   check_that_malloc_is_allowed();
 
@@ -312,22 +225,22 @@ template<> inline void* conditional_aligned_malloc<false>(size_t size)
 }
 
 /** \internal Frees memory allocated with conditional_aligned_malloc */
-template<bool Align> inline void conditional_aligned_free(void *ptr)
+template<bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void *ptr)
 {
   aligned_free(ptr);
 }
 
-template<> inline void conditional_aligned_free<false>(void *ptr)
+template<> EIGEN_DEVICE_FUNC inline void conditional_aligned_free<false>(void *ptr)
 {
   std::free(ptr);
 }
 
-template<bool Align> inline void* conditional_aligned_realloc(void* ptr, size_t new_size, size_t old_size)
+template<bool Align> inline void* conditional_aligned_realloc(void* ptr, std::size_t new_size, std::size_t old_size)
 {
   return aligned_realloc(ptr, new_size, old_size);
 }
 
-template<> inline void* conditional_aligned_realloc<false>(void* ptr, size_t new_size, size_t)
+template<> inline void* conditional_aligned_realloc<false>(void* ptr, std::size_t new_size, std::size_t)
 {
   return std::realloc(ptr, new_size);
 }
@@ -336,33 +249,43 @@ template<> inline void* conditional_aligned_realloc<false>(void* ptr, size_t new
 *** Construction/destruction of array elements                             ***
 *****************************************************************************/
 
-/** \internal Constructs the elements of an array.
-  * The \a size parameter tells on how many objects to call the constructor of T.
-  */
-template<typename T> inline T* construct_elements_of_array(T *ptr, size_t size)
-{
-  for (size_t i=0; i < size; ++i) ::new (ptr + i) T;
-  return ptr;
-}
-
 /** \internal Destructs the elements of an array.
   * The \a size parameters tells on how many objects to call the destructor of T.
   */
-template<typename T> inline void destruct_elements_of_array(T *ptr, size_t size)
+template<typename T> EIGEN_DEVICE_FUNC inline void destruct_elements_of_array(T *ptr, std::size_t size)
 {
   // always destruct an array starting from the end.
   if(ptr)
     while(size) ptr[--size].~T();
 }
 
+/** \internal Constructs the elements of an array.
+  * The \a size parameter tells on how many objects to call the constructor of T.
+  */
+template<typename T> EIGEN_DEVICE_FUNC inline T* construct_elements_of_array(T *ptr, std::size_t size)
+{
+  std::size_t i;
+  EIGEN_TRY
+  {
+      for (i = 0; i < size; ++i) ::new (ptr + i) T;
+      return ptr;
+  }
+  EIGEN_CATCH(...)
+  {
+    destruct_elements_of_array(ptr, i);
+    EIGEN_THROW;
+  }
+  return NULL;
+}
+
 /*****************************************************************************
 *** Implementation of aligned new/delete-like functions                    ***
 *****************************************************************************/
 
 template<typename T>
-EIGEN_ALWAYS_INLINE void check_size_for_overflow(size_t size)
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void check_size_for_overflow(std::size_t size)
 {
-  if(size > size_t(-1) / sizeof(T))
+  if(size > std::size_t(-1) / sizeof(T))
     throw_std_bad_alloc();
 }
 
@@ -370,24 +293,42 @@ EIGEN_ALWAYS_INLINE void check_size_for_overflow(size_t size)
   * On allocation error, the returned pointer is undefined, but a std::bad_alloc is thrown.
   * The default constructor of T is called.
   */
-template<typename T> inline T* aligned_new(size_t size)
+template<typename T> EIGEN_DEVICE_FUNC inline T* aligned_new(std::size_t size)
 {
   check_size_for_overflow<T>(size);
   T *result = reinterpret_cast<T*>(aligned_malloc(sizeof(T)*size));
-  return construct_elements_of_array(result, size);
+  EIGEN_TRY
+  {
+    return construct_elements_of_array(result, size);
+  }
+  EIGEN_CATCH(...)
+  {
+    aligned_free(result);
+    EIGEN_THROW;
+  }
+  return result;
 }
 
-template<typename T, bool Align> inline T* conditional_aligned_new(size_t size)
+template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_new(std::size_t size)
 {
   check_size_for_overflow<T>(size);
   T *result = reinterpret_cast<T*>(conditional_aligned_malloc<Align>(sizeof(T)*size));
-  return construct_elements_of_array(result, size);
+  EIGEN_TRY
+  {
+    return construct_elements_of_array(result, size);
+  }
+  EIGEN_CATCH(...)
+  {
+    conditional_aligned_free<Align>(result);
+    EIGEN_THROW;
+  }
+  return result;
 }
 
 /** \internal Deletes objects constructed with aligned_new
   * The \a size parameters tells on how many objects to call the destructor of T.
   */
-template<typename T> inline void aligned_delete(T *ptr, size_t size)
+template<typename T> EIGEN_DEVICE_FUNC inline void aligned_delete(T *ptr, std::size_t size)
 {
   destruct_elements_of_array<T>(ptr, size);
   aligned_free(ptr);
@@ -396,13 +337,13 @@ template<typename T> inline void aligned_delete(T *ptr, size_t size)
 /** \internal Deletes objects constructed with conditional_aligned_new
   * The \a size parameters tells on how many objects to call the destructor of T.
   */
-template<typename T, bool Align> inline void conditional_aligned_delete(T *ptr, size_t size)
+template<typename T, bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_delete(T *ptr, std::size_t size)
 {
   destruct_elements_of_array<T>(ptr, size);
   conditional_aligned_free<Align>(ptr);
 }
 
-template<typename T, bool Align> inline T* conditional_aligned_realloc_new(T* pts, size_t new_size, size_t old_size)
+template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_realloc_new(T* pts, std::size_t new_size, std::size_t old_size)
 {
   check_size_for_overflow<T>(new_size);
   check_size_for_overflow<T>(old_size);
@@ -410,23 +351,43 @@ template<typename T, bool Align> inline T* conditional_aligned_realloc_new(T* pt
     destruct_elements_of_array(pts+new_size, old_size-new_size);
   T *result = reinterpret_cast<T*>(conditional_aligned_realloc<Align>(reinterpret_cast<void*>(pts), sizeof(T)*new_size, sizeof(T)*old_size));
   if(new_size > old_size)
-    construct_elements_of_array(result+old_size, new_size-old_size);
+  {
+    EIGEN_TRY
+    {
+      construct_elements_of_array(result+old_size, new_size-old_size);
+    }
+    EIGEN_CATCH(...)
+    {
+      conditional_aligned_free<Align>(result);
+      EIGEN_THROW;
+    }
+  }
   return result;
 }
 
 
-template<typename T, bool Align> inline T* conditional_aligned_new_auto(size_t size)
+template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_new_auto(std::size_t size)
 {
   if(size==0)
     return 0; // short-cut. Also fixes Bug 884
   check_size_for_overflow<T>(size);
   T *result = reinterpret_cast<T*>(conditional_aligned_malloc<Align>(sizeof(T)*size));
   if(NumTraits<T>::RequireInitialization)
-    construct_elements_of_array(result, size);
+  {
+    EIGEN_TRY
+    {
+      construct_elements_of_array(result, size);
+    }
+    EIGEN_CATCH(...)
+    {
+      conditional_aligned_free<Align>(result);
+      EIGEN_THROW;
+    }
+  }
   return result;
 }
 
-template<typename T, bool Align> inline T* conditional_aligned_realloc_new_auto(T* pts, size_t new_size, size_t old_size)
+template<typename T, bool Align> inline T* conditional_aligned_realloc_new_auto(T* pts, std::size_t new_size, std::size_t old_size)
 {
   check_size_for_overflow<T>(new_size);
   check_size_for_overflow<T>(old_size);
@@ -434,11 +395,21 @@ template<typename T, bool Align> inline T* conditional_aligned_realloc_new_auto(
     destruct_elements_of_array(pts+new_size, old_size-new_size);
   T *result = reinterpret_cast<T*>(conditional_aligned_realloc<Align>(reinterpret_cast<void*>(pts), sizeof(T)*new_size, sizeof(T)*old_size));
   if(NumTraits<T>::RequireInitialization && (new_size > old_size))
-    construct_elements_of_array(result+old_size, new_size-old_size);
+  {
+    EIGEN_TRY
+    {
+      construct_elements_of_array(result+old_size, new_size-old_size);
+    }
+    EIGEN_CATCH(...)
+    {
+      conditional_aligned_free<Align>(result);
+      EIGEN_THROW;
+    }
+  }
   return result;
 }
 
-template<typename T, bool Align> inline void conditional_aligned_delete_auto(T *ptr, size_t size)
+template<typename T, bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_delete_auto(T *ptr, std::size_t size)
 {
   if(NumTraits<T>::RequireInitialization)
     destruct_elements_of_array<T>(ptr, size);
@@ -447,51 +418,62 @@ template<typename T, bool Align> inline void conditional_aligned_delete_auto(T *
 
 /****************************************************************************/
 
-/** \internal Returns the index of the first element of the array that is well aligned for vectorization.
+/** \internal Returns the index of the first element of the array that is well aligned with respect to the requested \a Alignment.
   *
+  * \tparam Alignment requested alignment in Bytes.
   * \param array the address of the start of the array
   * \param size the size of the array
   *
-  * \note If no element of the array is well aligned, the size of the array is returned. Typically,
-  * for example with SSE, "well aligned" means 16-byte-aligned. If vectorization is disabled or if the
+  * \note If no element of the array is well aligned or the requested alignment is not a multiple of a scalar,
+  * the size of the array is returned. For example with SSE, the requested alignment is typically 16-bytes. If
   * packet size for the given scalar type is 1, then everything is considered well-aligned.
   *
-  * \note If the scalar type is vectorizable, we rely on the following assumptions: sizeof(Scalar) is a
-  * power of 2, the packet size in bytes is also a power of 2, and is a multiple of sizeof(Scalar). On the
-  * other hand, we do not assume that the array address is a multiple of sizeof(Scalar), as that fails for
+  * \note Otherwise, if the Alignment is larger that the scalar size, we rely on the assumptions that sizeof(Scalar) is a
+  * power of 2. On the other hand, we do not assume that the array address is a multiple of sizeof(Scalar), as that fails for
   * example with Scalar=double on certain 32-bit platforms, see bug #79.
   *
   * There is also the variant first_aligned(const MatrixBase&) defined in DenseCoeffsBase.h.
+  * \sa first_default_aligned()
   */
-template<typename Scalar, typename Index>
-static inline Index first_aligned(const Scalar* array, Index size)
+template<int Alignment, typename Scalar, typename Index>
+EIGEN_DEVICE_FUNC inline Index first_aligned(const Scalar* array, Index size)
 {
-  static const Index PacketSize = packet_traits<Scalar>::size;
-  static const Index PacketAlignedMask = PacketSize-1;
+  const Index ScalarSize = sizeof(Scalar);
+  const Index AlignmentSize = Alignment / ScalarSize;
+  const Index AlignmentMask = AlignmentSize-1;
 
-  if(PacketSize==1)
+  if(AlignmentSize<=1)
   {
-    // Either there is no vectorization, or a packet consists of exactly 1 scalar so that all elements
-    // of the array have the same alignment.
+    // Either the requested alignment if smaller than a scalar, or it exactly match a 1 scalar
+    // so that all elements of the array have the same alignment.
     return 0;
   }
-  else if(size_t(array) & (sizeof(Scalar)-1))
+  else if( (UIntPtr(array) & (sizeof(Scalar)-1)) || (Alignment%ScalarSize)!=0)
   {
-    // There is vectorization for this scalar type, but the array is not aligned to the size of a single scalar.
+    // The array is not aligned to the size of a single scalar, or the requested alignment is not a multiple of the scalar size.
     // Consequently, no element of the array is well aligned.
     return size;
   }
   else
   {
-    return std::min<Index>( (PacketSize - (Index((size_t(array)/sizeof(Scalar))) & PacketAlignedMask))
-                           & PacketAlignedMask, size);
+    Index first = (AlignmentSize - (Index((UIntPtr(array)/sizeof(Scalar))) & AlignmentMask)) & AlignmentMask;
+    return (first < size) ? first : size;
   }
 }
 
+/** \internal Returns the index of the first element of the array that is well aligned with respect the largest packet requirement.
+   * \sa first_aligned(Scalar*,Index) and first_default_aligned(DenseBase<Derived>) */
+template<typename Scalar, typename Index>
+EIGEN_DEVICE_FUNC inline Index first_default_aligned(const Scalar* array, Index size)
+{
+  typedef typename packet_traits<Scalar>::type DefaultPacketType;
+  return first_aligned<unpacket_traits<DefaultPacketType>::alignment>(array, size);
+}
+
 /** \internal Returns the smallest integer multiple of \a base and greater or equal to \a size
   */ 
 template<typename Index> 
-inline static Index first_multiple(Index size, Index base)
+inline Index first_multiple(Index size, Index base)
 {
   return ((size+base-1)/base)*base;
 }
@@ -500,21 +482,59 @@ inline static Index first_multiple(Index size, Index base)
 // use memcpy on trivial types, i.e., on types that does not require an initialization ctor.
 template<typename T, bool UseMemcpy> struct smart_copy_helper;
 
-template<typename T> void smart_copy(const T* start, const T* end, T* target)
+template<typename T> EIGEN_DEVICE_FUNC void smart_copy(const T* start, const T* end, T* target)
 {
   smart_copy_helper<T,!NumTraits<T>::RequireInitialization>::run(start, end, target);
 }
 
 template<typename T> struct smart_copy_helper<T,true> {
-  static inline void run(const T* start, const T* end, T* target)
-  { memcpy(target, start, std::ptrdiff_t(end)-std::ptrdiff_t(start)); }
+  EIGEN_DEVICE_FUNC static inline void run(const T* start, const T* end, T* target)
+  {
+    IntPtr size = IntPtr(end)-IntPtr(start);
+    if(size==0) return;
+    eigen_internal_assert(start!=0 && end!=0 && target!=0);
+    memcpy(target, start, size);
+  }
 };
 
 template<typename T> struct smart_copy_helper<T,false> {
-  static inline void run(const T* start, const T* end, T* target)
+  EIGEN_DEVICE_FUNC static inline void run(const T* start, const T* end, T* target)
   { std::copy(start, end, target); }
 };
 
+// intelligent memmove. falls back to std::memmove for POD types, uses std::copy otherwise. 
+template<typename T, bool UseMemmove> struct smart_memmove_helper;
+
+template<typename T> void smart_memmove(const T* start, const T* end, T* target)
+{
+  smart_memmove_helper<T,!NumTraits<T>::RequireInitialization>::run(start, end, target);
+}
+
+template<typename T> struct smart_memmove_helper<T,true> {
+  static inline void run(const T* start, const T* end, T* target)
+  {
+    IntPtr size = IntPtr(end)-IntPtr(start);
+    if(size==0) return;
+    eigen_internal_assert(start!=0 && end!=0 && target!=0);
+    std::memmove(target, start, size);
+  }
+};
+
+template<typename T> struct smart_memmove_helper<T,false> {
+  static inline void run(const T* start, const T* end, T* target)
+  { 
+    if (UIntPtr(target) < UIntPtr(start))
+    {
+      std::copy(start, end, target);
+    }
+    else                                 
+    {
+      std::ptrdiff_t count = (std::ptrdiff_t(end)-std::ptrdiff_t(start)) / sizeof(T);
+      std::copy_backward(start, end, target + count); 
+    }
+  }
+};
+
 
 /*****************************************************************************
 *** Implementation of runtime stack allocation (falling back to malloc)    ***
@@ -523,16 +543,16 @@ template<typename T> struct smart_copy_helper<T,false> {
 // you can overwrite Eigen's default behavior regarding alloca by defining EIGEN_ALLOCA
 // to the appropriate stack allocation function
 #ifndef EIGEN_ALLOCA
-  #if (defined __linux__) || (defined __APPLE__) || (defined alloca)
+  #if EIGEN_OS_LINUX || EIGEN_OS_MAC || (defined alloca)
     #define EIGEN_ALLOCA alloca
-  #elif defined(_MSC_VER)
+  #elif EIGEN_COMP_MSVC
     #define EIGEN_ALLOCA _alloca
   #endif
 #endif
 
 // This helper class construct the allocated memory, and takes care of destructing and freeing the handled data
 // at destruction time. In practice this helper class is mainly useful to avoid memory leak in case of exceptions.
-template<typename T> class aligned_stack_memory_handler
+template<typename T> class aligned_stack_memory_handler : noncopyable
 {
   public:
     /* Creates a stack_memory_handler responsible for the buffer \a ptr of size \a size.
@@ -541,7 +561,7 @@ template<typename T> class aligned_stack_memory_handler
      * In this case, the buffer elements will also be destructed when this handler will be destructed.
      * Finally, if \a dealloc is true, then the pointer \a ptr is freed.
      **/
-    aligned_stack_memory_handler(T* ptr, size_t size, bool dealloc)
+    aligned_stack_memory_handler(T* ptr, std::size_t size, bool dealloc)
       : m_ptr(ptr), m_size(size), m_deallocate(dealloc)
     {
       if(NumTraits<T>::RequireInitialization && m_ptr)
@@ -556,10 +576,34 @@ template<typename T> class aligned_stack_memory_handler
     }
   protected:
     T* m_ptr;
-    size_t m_size;
+    std::size_t m_size;
     bool m_deallocate;
 };
 
+template<typename T> class scoped_array : noncopyable
+{
+  T* m_ptr;
+public:
+  explicit scoped_array(std::ptrdiff_t size)
+  {
+    m_ptr = new T[size];
+  }
+  ~scoped_array()
+  {
+    delete[] m_ptr;
+  }
+  T& operator[](std::ptrdiff_t i) { return m_ptr[i]; }
+  const T& operator[](std::ptrdiff_t i) const { return m_ptr[i]; }
+  T* &ptr() { return m_ptr; }
+  const T* ptr() const { return m_ptr; }
+  operator const T*() const { return m_ptr; }
+};
+
+template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
+{
+  std::swap(a.ptr(),b.ptr());
+}
+    
 } // end namespace internal
 
 /** \internal
@@ -578,11 +622,13 @@ template<typename T> class aligned_stack_memory_handler
   * The underlying stack allocation function can controlled with the EIGEN_ALLOCA preprocessor token.
   */
 #ifdef EIGEN_ALLOCA
-
-  #if defined(__arm__) || defined(_WIN32)
-    #define EIGEN_ALIGNED_ALLOCA(SIZE) reinterpret_cast<void*>((reinterpret_cast<size_t>(EIGEN_ALLOCA(SIZE+16)) & ~(size_t(15))) + 16)
+  
+  #if EIGEN_DEFAULT_ALIGN_BYTES>0
+    // We always manually re-align the result of EIGEN_ALLOCA.
+    // If alloca is already aligned, the compiler should be smart enough to optimize away the re-alignment.
+    #define EIGEN_ALIGNED_ALLOCA(SIZE) reinterpret_cast<void*>((internal::UIntPtr(EIGEN_ALLOCA(SIZE+EIGEN_DEFAULT_ALIGN_BYTES-1)) + EIGEN_DEFAULT_ALIGN_BYTES-1) & ~(std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1)))
   #else
-    #define EIGEN_ALIGNED_ALLOCA EIGEN_ALLOCA
+    #define EIGEN_ALIGNED_ALLOCA(SIZE) EIGEN_ALLOCA(SIZE)
   #endif
 
   #define ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) \
@@ -607,39 +653,33 @@ template<typename T> class aligned_stack_memory_handler
 *** Implementation of EIGEN_MAKE_ALIGNED_OPERATOR_NEW [_IF]                ***
 *****************************************************************************/
 
-#if EIGEN_ALIGN
-  #ifdef EIGEN_EXCEPTIONS
-    #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \
-      void* operator new(size_t size, const std::nothrow_t&) throw() { \
-        try { return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); } \
-        catch (...) { return 0; } \
+#if EIGEN_MAX_ALIGN_BYTES!=0
+  #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \
+      void* operator new(std::size_t size, const std::nothrow_t&) EIGEN_NO_THROW { \
+        EIGEN_TRY { return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); } \
+        EIGEN_CATCH (...) { return 0; } \
       }
-  #else
-    #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \
-      void* operator new(size_t size, const std::nothrow_t&) throw() { \
-        return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); \
-      }
-  #endif
-
   #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign) \
-      void *operator new(size_t size) { \
+      void *operator new(std::size_t size) { \
         return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); \
       } \
-      void *operator new[](size_t size) { \
+      void *operator new[](std::size_t size) { \
         return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); \
       } \
-      void operator delete(void * ptr) throw() { Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); } \
-      void operator delete[](void * ptr) throw() { Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); } \
+      void operator delete(void * ptr) EIGEN_NO_THROW { Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); } \
+      void operator delete[](void * ptr) EIGEN_NO_THROW { Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); } \
+      void operator delete(void * ptr, std::size_t /* sz */) EIGEN_NO_THROW { Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); } \
+      void operator delete[](void * ptr, std::size_t /* sz */) EIGEN_NO_THROW { Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); } \
       /* in-place new and delete. since (at least afaik) there is no actual   */ \
       /* memory allocated we can safely let the default implementation handle */ \
       /* this particular case. */ \
-      static void *operator new(size_t size, void *ptr) { return ::operator new(size,ptr); } \
-      static void *operator new[](size_t size, void* ptr) { return ::operator new[](size,ptr); } \
-      void operator delete(void * memory, void *ptr) throw() { return ::operator delete(memory,ptr); } \
-      void operator delete[](void * memory, void *ptr) throw() { return ::operator delete[](memory,ptr); } \
+      static void *operator new(std::size_t size, void *ptr) { return ::operator new(size,ptr); } \
+      static void *operator new[](std::size_t size, void* ptr) { return ::operator new[](size,ptr); } \
+      void operator delete(void * memory, void *ptr) EIGEN_NO_THROW { return ::operator delete(memory,ptr); } \
+      void operator delete[](void * memory, void *ptr) EIGEN_NO_THROW { return ::operator delete[](memory,ptr); } \
       /* nothrow-new (returns zero instead of std::bad_alloc) */ \
       EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \
-      void operator delete(void *ptr, const std::nothrow_t&) throw() { \
+      void operator delete(void *ptr, const std::nothrow_t&) EIGEN_NO_THROW { \
         Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); \
       } \
       typedef void eigen_aligned_operator_new_marker_type;
@@ -649,7 +689,7 @@ template<typename T> class aligned_stack_memory_handler
 
 #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(true)
 #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size) \
-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(((Size)!=Eigen::Dynamic) && ((sizeof(Scalar)*(Size))%16==0)))
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(((Size)!=Eigen::Dynamic) && ((sizeof(Scalar)*(Size))%EIGEN_MAX_ALIGN_BYTES==0)))
 
 /****************************************************************************/
 
@@ -667,96 +707,56 @@ template<typename T> class aligned_stack_memory_handler
 * std::map< int, Vector3f > my_map_vec3;
 * \endcode
 *
-* \sa \ref TopicStlContainers.
+* \sa \blank \ref TopicStlContainers.
 */
 template<class T>
-class aligned_allocator
+class aligned_allocator : public std::allocator<T>
 {
 public:
-    typedef size_t    size_type;
-    typedef std::ptrdiff_t difference_type;
-    typedef T*        pointer;
-    typedef const T*  const_pointer;
-    typedef T&        reference;
-    typedef const T&  const_reference;
-    typedef T         value_type;
-
-    template<class U>
-    struct rebind
-    {
-        typedef aligned_allocator<U> other;
-    };
-
-    pointer address( reference value ) const
-    {
-        return &value;
-    }
-
-    const_pointer address( const_reference value ) const
-    {
-        return &value;
-    }
-
-    aligned_allocator()
-    {
-    }
-
-    aligned_allocator( const aligned_allocator& )
-    {
-    }
-
-    template<class U>
-    aligned_allocator( const aligned_allocator<U>& )
-    {
-    }
-
-    ~aligned_allocator()
-    {
-    }
-
-    size_type max_size() const
-    {
-        return (std::numeric_limits<size_type>::max)();
-    }
+  typedef std::size_t     size_type;
+  typedef std::ptrdiff_t  difference_type;
+  typedef T*              pointer;
+  typedef const T*        const_pointer;
+  typedef T&              reference;
+  typedef const T&        const_reference;
+  typedef T               value_type;
+
+  template<class U>
+  struct rebind
+  {
+    typedef aligned_allocator<U> other;
+  };
 
-    pointer allocate( size_type num, const void* hint = 0 )
-    {
-        EIGEN_UNUSED_VARIABLE(hint);
-        internal::check_size_for_overflow<T>(num);
-        return static_cast<pointer>( internal::aligned_malloc( num * sizeof(T) ) );
-    }
+  aligned_allocator() : std::allocator<T>() {}
 
-    void construct( pointer p, const T& value )
-    {
-        ::new( p ) T( value );
-    }
+  aligned_allocator(const aligned_allocator& other) : std::allocator<T>(other) {}
 
-    void destroy( pointer p )
-    {
-        p->~T();
-    }
+  template<class U>
+  aligned_allocator(const aligned_allocator<U>& other) : std::allocator<T>(other) {}
 
-    void deallocate( pointer p, size_type /*num*/ )
-    {
-        internal::aligned_free( p );
-    }
+  ~aligned_allocator() {}
 
-    bool operator!=(const aligned_allocator<T>& ) const
-    { return false; }
+  pointer allocate(size_type num, const void* /*hint*/ = 0)
+  {
+    internal::check_size_for_overflow<T>(num);
+    return static_cast<pointer>( internal::aligned_malloc(num * sizeof(T)) );
+  }
 
-    bool operator==(const aligned_allocator<T>& ) const
-    { return true; }
+  void deallocate(pointer p, size_type /*num*/)
+  {
+    internal::aligned_free(p);
+  }
 };
 
 //---------- Cache sizes ----------
 
 #if !defined(EIGEN_NO_CPUID)
-#  if defined(__GNUC__) && ( defined(__i386__) || defined(__x86_64__) )
-#    if defined(__PIC__) && defined(__i386__)
+#  if EIGEN_COMP_GNUC && EIGEN_ARCH_i386_OR_x86_64
+#    if defined(__PIC__) && EIGEN_ARCH_i386
        // Case for x86 with PIC
 #      define EIGEN_CPUID(abcd,func,id) \
          __asm__ __volatile__ ("xchgl %%ebx, %k1;cpuid; xchgl %%ebx,%k1": "=a" (abcd[0]), "=&r" (abcd[1]), "=c" (abcd[2]), "=d" (abcd[3]) : "a" (func), "c" (id));
-#    elif defined(__PIC__) && defined(__x86_64__)
+#    elif defined(__PIC__) && EIGEN_ARCH_x86_64
        // Case for x64 with PIC. In theory this is only a problem with recent gcc and with medium or large code model, not with the default small code model.
        // However, we cannot detect which code model is used, and the xchg overhead is negligible anyway.
 #      define EIGEN_CPUID(abcd,func,id) \
@@ -766,8 +766,8 @@ public:
 #      define EIGEN_CPUID(abcd,func,id) \
          __asm__ __volatile__ ("cpuid": "=a" (abcd[0]), "=b" (abcd[1]), "=c" (abcd[2]), "=d" (abcd[3]) : "0" (func), "2" (id) );
 #    endif
-#  elif defined(_MSC_VER)
-#    if (_MSC_VER > 1500) && ( defined(_M_IX86) || defined(_M_X64) )
+#  elif EIGEN_COMP_MSVC
+#    if (EIGEN_COMP_MSVC > 1500) && EIGEN_ARCH_i386_OR_x86_64
 #      define EIGEN_CPUID(abcd,func,id) __cpuidex((int*)abcd,func,id)
 #    endif
 #  endif
diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h
index 71d587108..7f6370755 100644..100755
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -11,8 +11,27 @@
 #ifndef EIGEN_META_H
 #define EIGEN_META_H
 
+#if defined(__CUDA_ARCH__)
+#include <cfloat>
+#include <math_constants.h>
+#endif
+
+#if EIGEN_COMP_ICC>=1600 &&  __cplusplus >= 201103L
+#include <cstdint>
+#endif
+
 namespace Eigen {
 
+typedef EIGEN_DEFAULT_DENSE_INDEX_TYPE DenseIndex;
+
+/**
+ * \brief The Index type as used for the API.
+ * \details To change this, \c \#define the preprocessor symbol \c EIGEN_DEFAULT_DENSE_INDEX_TYPE.
+ * \sa \blank \ref TopicPreprocessorDirectives, StorageIndex.
+ */
+
+typedef EIGEN_DEFAULT_DENSE_INDEX_TYPE Index;
+
 namespace internal {
 
 /** \internal
@@ -22,6 +41,16 @@ namespace internal {
   * we however don't want to add a dependency to Boost.
   */
 
+// Only recent versions of ICC complain about using ptrdiff_t to hold pointers,
+// and older versions do not provide *intptr_t types.
+#if EIGEN_COMP_ICC>=1600 &&  __cplusplus >= 201103L
+typedef std::intptr_t  IntPtr;
+typedef std::uintptr_t UIntPtr;
+#else
+typedef std::ptrdiff_t IntPtr;
+typedef std::size_t UIntPtr;
+#endif
+
 struct true_type {  enum { value = 1 }; };
 struct false_type { enum { value = 0 }; };
 
@@ -68,6 +97,18 @@ template<> struct is_arithmetic<unsigned int>  { enum { value = true }; };
 template<> struct is_arithmetic<signed long>   { enum { value = true }; };
 template<> struct is_arithmetic<unsigned long> { enum { value = true }; };
 
+template<typename T> struct is_integral        { enum { value = false }; };
+template<> struct is_integral<bool>            { enum { value = true }; };
+template<> struct is_integral<char>            { enum { value = true }; };
+template<> struct is_integral<signed char>     { enum { value = true }; };
+template<> struct is_integral<unsigned char>   { enum { value = true }; };
+template<> struct is_integral<signed short>    { enum { value = true }; };
+template<> struct is_integral<unsigned short>  { enum { value = true }; };
+template<> struct is_integral<signed int>      { enum { value = true }; };
+template<> struct is_integral<unsigned int>    { enum { value = true }; };
+template<> struct is_integral<signed long>     { enum { value = true }; };
+template<> struct is_integral<unsigned long>   { enum { value = true }; };
+
 template <typename T> struct add_const { typedef const T type; };
 template <typename T> struct add_const<T&> { typedef T& type; };
 
@@ -80,29 +121,163 @@ template<typename T> struct add_const_on_value_type<T*>        { typedef T const
 template<typename T> struct add_const_on_value_type<T* const>  { typedef T const* const type; };
 template<typename T> struct add_const_on_value_type<T const* const>  { typedef T const* const type; };
 
+
+template<typename From, typename To>
+struct is_convertible_impl
+{
+private:
+  struct any_conversion
+  {
+    template <typename T> any_conversion(const volatile T&);
+    template <typename T> any_conversion(T&);
+  };
+  struct yes {int a[1];};
+  struct no  {int a[2];};
+
+  static yes test(const To&, int);
+  static no  test(any_conversion, ...);
+
+public:
+  static From ms_from;
+#ifdef __INTEL_COMPILER
+  #pragma warning push
+  #pragma warning ( disable : 2259 )
+#endif
+  enum { value = sizeof(test(ms_from, 0))==sizeof(yes) };
+#ifdef __INTEL_COMPILER
+  #pragma warning pop
+#endif
+};
+
+template<typename From, typename To>
+struct is_convertible
+{
+  enum { value = is_convertible_impl<typename remove_all<From>::type,
+                                     typename remove_all<To  >::type>::value };
+};
+
 /** \internal Allows to enable/disable an overload
   * according to a compile time condition.
   */
-template<bool Condition, typename T> struct enable_if;
+template<bool Condition, typename T=void> struct enable_if;
 
 template<typename T> struct enable_if<true,T>
 { typedef T type; };
 
+#if defined(__CUDA_ARCH__)
+#if !defined(__FLT_EPSILON__)
+#define __FLT_EPSILON__ FLT_EPSILON
+#define __DBL_EPSILON__ DBL_EPSILON
+#endif
 
+namespace device {
+
+template<typename T> struct numeric_limits
+{
+  EIGEN_DEVICE_FUNC
+  static T epsilon() { return 0; }
+  static T (max)() { assert(false && "Highest not supported for this type"); }
+  static T (min)() { assert(false && "Lowest not supported for this type"); }
+  static T infinity() { assert(false && "Infinity not supported for this type"); }
+  static T quiet_NaN() { assert(false && "quiet_NaN not supported for this type"); }
+};
+template<> struct numeric_limits<float>
+{
+  EIGEN_DEVICE_FUNC
+  static float epsilon() { return __FLT_EPSILON__; }
+  EIGEN_DEVICE_FUNC
+  static float (max)() { return CUDART_MAX_NORMAL_F; }
+  EIGEN_DEVICE_FUNC
+  static float (min)() { return FLT_MIN; }
+  EIGEN_DEVICE_FUNC
+  static float infinity() { return CUDART_INF_F; }
+  EIGEN_DEVICE_FUNC
+  static float quiet_NaN() { return CUDART_NAN_F; }
+};
+template<> struct numeric_limits<double>
+{
+  EIGEN_DEVICE_FUNC
+  static double epsilon() { return __DBL_EPSILON__; }
+  EIGEN_DEVICE_FUNC
+  static double (max)() { return DBL_MAX; }
+  EIGEN_DEVICE_FUNC
+  static double (min)() { return DBL_MIN; }
+  EIGEN_DEVICE_FUNC
+  static double infinity() { return CUDART_INF; }
+  EIGEN_DEVICE_FUNC
+  static double quiet_NaN() { return CUDART_NAN; }
+};
+template<> struct numeric_limits<int>
+{
+  EIGEN_DEVICE_FUNC
+  static int epsilon() { return 0; }
+  EIGEN_DEVICE_FUNC
+  static int (max)() { return INT_MAX; }
+  EIGEN_DEVICE_FUNC
+  static int (min)() { return INT_MIN; }
+};
+template<> struct numeric_limits<unsigned int>
+{
+  EIGEN_DEVICE_FUNC
+  static unsigned int epsilon() { return 0; }
+  EIGEN_DEVICE_FUNC
+  static unsigned int (max)() { return UINT_MAX; }
+  EIGEN_DEVICE_FUNC
+  static unsigned int (min)() { return 0; }
+};
+template<> struct numeric_limits<long>
+{
+  EIGEN_DEVICE_FUNC
+  static long epsilon() { return 0; }
+  EIGEN_DEVICE_FUNC
+  static long (max)() { return LONG_MAX; }
+  EIGEN_DEVICE_FUNC
+  static long (min)() { return LONG_MIN; }
+};
+template<> struct numeric_limits<unsigned long>
+{
+  EIGEN_DEVICE_FUNC
+  static unsigned long epsilon() { return 0; }
+  EIGEN_DEVICE_FUNC
+  static unsigned long (max)() { return ULONG_MAX; }
+  EIGEN_DEVICE_FUNC
+  static unsigned long (min)() { return 0; }
+};
+template<> struct numeric_limits<long long>
+{
+  EIGEN_DEVICE_FUNC
+  static long long epsilon() { return 0; }
+  EIGEN_DEVICE_FUNC
+  static long long (max)() { return LLONG_MAX; }
+  EIGEN_DEVICE_FUNC
+  static long long (min)() { return LLONG_MIN; }
+};
+template<> struct numeric_limits<unsigned long long>
+{
+  EIGEN_DEVICE_FUNC
+  static unsigned long long epsilon() { return 0; }
+  EIGEN_DEVICE_FUNC
+  static unsigned long long (max)() { return ULLONG_MAX; }
+  EIGEN_DEVICE_FUNC
+  static unsigned long long (min)() { return 0; }
+};
+
+}
+
+#endif
 
 /** \internal
   * A base class do disable default copy ctor and copy assignement operator.
   */
 class noncopyable
 {
-  noncopyable(const noncopyable&);
-  const noncopyable& operator=(const noncopyable&);
+  EIGEN_DEVICE_FUNC noncopyable(const noncopyable&);
+  EIGEN_DEVICE_FUNC const noncopyable& operator=(const noncopyable&);
 protected:
-  noncopyable() {}
-  ~noncopyable() {}
+  EIGEN_DEVICE_FUNC noncopyable() {}
+  EIGEN_DEVICE_FUNC ~noncopyable() {}
 };
 
-
 /** \internal
   * Convenient struct to get the result type of a unary or binary functor.
   *
@@ -110,14 +285,20 @@ protected:
   * upcoming next STL generation (using a templated result member).
   * If none of these members is provided, then the type of the first argument is returned. FIXME, that behavior is a pretty bad hack.
   */
-template<typename T> struct result_of {};
+#if EIGEN_HAS_STD_RESULT_OF
+template<typename T> struct result_of {
+  typedef typename std::result_of<T>::type type1;
+  typedef typename remove_all<type1>::type type;
+};
+#else
+template<typename T> struct result_of { };
 
 struct has_none {int a[1];};
 struct has_std_result_type {int a[2];};
 struct has_tr1_result {int a[3];};
 
 template<typename Func, typename ArgType, int SizeOf=sizeof(has_none)>
-struct unary_result_of_select {typedef ArgType type;};
+struct unary_result_of_select {typedef typename internal::remove_all<ArgType>::type type;};
 
 template<typename Func, typename ArgType>
 struct unary_result_of_select<Func, ArgType, sizeof(has_std_result_type)> {typedef typename Func::result_type type;};
@@ -128,10 +309,10 @@ struct unary_result_of_select<Func, ArgType, sizeof(has_tr1_result)> {typedef ty
 template<typename Func, typename ArgType>
 struct result_of<Func(ArgType)> {
     template<typename T>
-    static has_std_result_type testFunctor(T const *, typename T::result_type const * = 0);
+    static has_std_result_type    testFunctor(T const *, typename T::result_type const * = 0);
     template<typename T>
-    static has_tr1_result      testFunctor(T const *, typename T::template result<T(ArgType)>::type const * = 0);
-    static has_none            testFunctor(...);
+    static has_tr1_result         testFunctor(T const *, typename T::template result<T(ArgType)>::type const * = 0);
+    static has_none               testFunctor(...);
 
     // note that the following indirection is needed for gcc-3.3
     enum {FunctorType = sizeof(testFunctor(static_cast<Func*>(0)))};
@@ -139,7 +320,7 @@ struct result_of<Func(ArgType)> {
 };
 
 template<typename Func, typename ArgType0, typename ArgType1, int SizeOf=sizeof(has_none)>
-struct binary_result_of_select {typedef ArgType0 type;};
+struct binary_result_of_select {typedef typename internal::remove_all<ArgType0>::type type;};
 
 template<typename Func, typename ArgType0, typename ArgType1>
 struct binary_result_of_select<Func, ArgType0, ArgType1, sizeof(has_std_result_type)>
@@ -152,16 +333,83 @@ struct binary_result_of_select<Func, ArgType0, ArgType1, sizeof(has_tr1_result)>
 template<typename Func, typename ArgType0, typename ArgType1>
 struct result_of<Func(ArgType0,ArgType1)> {
     template<typename T>
-    static has_std_result_type testFunctor(T const *, typename T::result_type const * = 0);
+    static has_std_result_type    testFunctor(T const *, typename T::result_type const * = 0);
     template<typename T>
-    static has_tr1_result      testFunctor(T const *, typename T::template result<T(ArgType0,ArgType1)>::type const * = 0);
-    static has_none            testFunctor(...);
+    static has_tr1_result         testFunctor(T const *, typename T::template result<T(ArgType0,ArgType1)>::type const * = 0);
+    static has_none               testFunctor(...);
 
     // note that the following indirection is needed for gcc-3.3
     enum {FunctorType = sizeof(testFunctor(static_cast<Func*>(0)))};
     typedef typename binary_result_of_select<Func, ArgType0, ArgType1, FunctorType>::type type;
 };
 
+template<typename Func, typename ArgType0, typename ArgType1, typename ArgType2, int SizeOf=sizeof(has_none)>
+struct ternary_result_of_select {typedef typename internal::remove_all<ArgType0>::type type;};
+
+template<typename Func, typename ArgType0, typename ArgType1, typename ArgType2>
+struct ternary_result_of_select<Func, ArgType0, ArgType1, ArgType2, sizeof(has_std_result_type)>
+{typedef typename Func::result_type type;};
+
+template<typename Func, typename ArgType0, typename ArgType1, typename ArgType2>
+struct ternary_result_of_select<Func, ArgType0, ArgType1, ArgType2, sizeof(has_tr1_result)>
+{typedef typename Func::template result<Func(ArgType0,ArgType1,ArgType2)>::type type;};
+
+template<typename Func, typename ArgType0, typename ArgType1, typename ArgType2>
+struct result_of<Func(ArgType0,ArgType1,ArgType2)> {
+    template<typename T>
+    static has_std_result_type    testFunctor(T const *, typename T::result_type const * = 0);
+    template<typename T>
+    static has_tr1_result         testFunctor(T const *, typename T::template result<T(ArgType0,ArgType1,ArgType2)>::type const * = 0);
+    static has_none               testFunctor(...);
+
+    // note that the following indirection is needed for gcc-3.3
+    enum {FunctorType = sizeof(testFunctor(static_cast<Func*>(0)))};
+    typedef typename ternary_result_of_select<Func, ArgType0, ArgType1, ArgType2, FunctorType>::type type;
+};
+#endif
+
+struct meta_yes { char a[1]; };
+struct meta_no  { char a[2]; };
+
+// Check whether T::ReturnType does exist
+template <typename T>
+struct has_ReturnType
+{
+  template <typename C> static meta_yes testFunctor(typename C::ReturnType const *);
+  template <typename C> static meta_no testFunctor(...);
+
+  enum { value = sizeof(testFunctor<T>(0)) == sizeof(meta_yes) };
+};
+
+template<typename T> const T* return_ptr();
+
+template <typename T, typename IndexType=Index>
+struct has_nullary_operator
+{
+  template <typename C> static meta_yes testFunctor(C const *,typename enable_if<(sizeof(return_ptr<C>()->operator()())>0)>::type * = 0);
+  static meta_no testFunctor(...);
+
+  enum { value = sizeof(testFunctor(static_cast<T*>(0))) == sizeof(meta_yes) };
+};
+
+template <typename T, typename IndexType=Index>
+struct has_unary_operator
+{
+  template <typename C> static meta_yes testFunctor(C const *,typename enable_if<(sizeof(return_ptr<C>()->operator()(IndexType(0)))>0)>::type * = 0);
+  static meta_no testFunctor(...);
+
+  enum { value = sizeof(testFunctor(static_cast<T*>(0))) == sizeof(meta_yes) };
+};
+
+template <typename T, typename IndexType=Index>
+struct has_binary_operator
+{
+  template <typename C> static meta_yes testFunctor(C const *,typename enable_if<(sizeof(return_ptr<C>()->operator()(IndexType(0),IndexType(0)))>0)>::type * = 0);
+  static meta_no testFunctor(...);
+
+  enum { value = sizeof(testFunctor(static_cast<T*>(0))) == sizeof(meta_yes) };
+};
+
 /** \internal In short, it computes int(sqrt(\a Y)) with \a Y an integer.
   * Usage example: \code meta_sqrt<1023>::ret \endcode
   */
@@ -185,37 +433,26 @@ class meta_sqrt
 template<int Y, int InfX, int SupX>
 class meta_sqrt<Y, InfX, SupX, true> { public:  enum { ret = (SupX*SupX <= Y) ? SupX : InfX }; };
 
-/** \internal determines whether the product of two numeric types is allowed and what the return type is */
-template<typename T, typename U> struct scalar_product_traits
-{
-  enum { Defined = 0 };
-};
 
-template<typename T> struct scalar_product_traits<T,T>
+/** \internal Computes the least common multiple of two positive integer A and B
+  * at compile-time. It implements a naive algorithm testing all multiples of A.
+  * It thus works better if A>=B.
+  */
+template<int A, int B, int K=1, bool Done = ((A*K)%B)==0>
+struct meta_least_common_multiple
 {
-  enum {
-    // Cost = NumTraits<T>::MulCost,
-    Defined = 1
-  };
-  typedef T ReturnType;
+  enum { ret = meta_least_common_multiple<A,B,K+1>::ret };
 };
-
-template<typename T> struct scalar_product_traits<T,std::complex<T> >
+template<int A, int B, int K>
+struct meta_least_common_multiple<A,B,K,true>
 {
-  enum {
-    // Cost = 2*NumTraits<T>::MulCost,
-    Defined = 1
-  };
-  typedef std::complex<T> ReturnType;
+  enum { ret = A*K };
 };
 
-template<typename T> struct scalar_product_traits<std::complex<T>, T>
+/** \internal determines whether the product of two numeric types is allowed and what the return type is */
+template<typename T, typename U> struct scalar_product_traits
 {
-  enum {
-    // Cost = 2*NumTraits<T>::MulCost,
-    Defined = 1
-  };
-  typedef std::complex<T> ReturnType;
+  enum { Defined = 0 };
 };
 
 // FIXME quick workaround around current limitation of result_of
@@ -224,19 +461,31 @@ template<typename T> struct scalar_product_traits<std::complex<T>, T>
 // typedef typename scalar_product_traits<typename remove_all<ArgType0>::type, typename remove_all<ArgType1>::type>::ReturnType type;
 // };
 
-template<typename T> struct is_diagonal
-{ enum { ret = false }; };
-
-template<typename T> struct is_diagonal<DiagonalBase<T> >
-{ enum { ret = true }; };
-
-template<typename T> struct is_diagonal<DiagonalWrapper<T> >
-{ enum { ret = true }; };
+} // end namespace internal
 
-template<typename T, int S> struct is_diagonal<DiagonalMatrix<T,S> >
-{ enum { ret = true }; };
+namespace numext {
+  
+#if defined(__CUDA_ARCH__)
+template<typename T> EIGEN_DEVICE_FUNC   void swap(T &a, T &b) { T tmp = b; b = a; a = tmp; }
+#else
+template<typename T> EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b); }
+#endif
+
+#if defined(__CUDA_ARCH__)
+using internal::device::numeric_limits;
+#else
+using std::numeric_limits;
+#endif
+
+// Integer division with rounding up.
+// T is assumed to be an integer type with a>=0, and b>0
+template<typename T>
+T div_ceil(const T &a, const T &b)
+{
+  return (a+b-1) / b;
+}
 
-} // end namespace internal
+} // end namespace numext
 
 } // end namespace Eigen
 
diff --git a/Eigen/src/Core/util/ReenableStupidWarnings.h b/Eigen/src/Core/util/ReenableStupidWarnings.h
index 5ddfbd4aa..86b60f52f 100644
--- a/Eigen/src/Core/util/ReenableStupidWarnings.h
+++ b/Eigen/src/Core/util/ReenableStupidWarnings.h
@@ -8,7 +8,20 @@
     #pragma warning pop
   #elif defined __clang__
     #pragma clang diagnostic pop
+  #elif defined __GNUC__ && __GNUC__>=6
+    #pragma GCC diagnostic pop
   #endif
+
+  #if defined __NVCC__
+//    Don't reenable the diagnostic messages, as it turns out these messages need
+//    to be disabled at the point of the template instantiation (i.e the user code)
+//    otherwise they'll be triggered by nvcc.
+//    #pragma diag_default code_is_unreachable
+//    #pragma diag_default initialization_not_reachable
+//    #pragma diag_default 2651
+//    #pragma diag_default 2653
+  #endif
+
 #endif
 
 #endif // EIGEN_WARNINGS_DISABLED
diff --git a/Eigen/src/Core/util/StaticAssert.h b/Eigen/src/Core/util/StaticAssert.h
index bac5d9fe9..983361a45 100644
--- a/Eigen/src/Core/util/StaticAssert.h
+++ b/Eigen/src/Core/util/StaticAssert.h
@@ -26,7 +26,7 @@
 
 #ifndef EIGEN_NO_STATIC_ASSERT
 
-  #if defined(__GXX_EXPERIMENTAL_CXX0X__) || (defined(_MSC_VER) && (_MSC_VER >= 1600))
+  #if EIGEN_MAX_CPP_VER>=11 && (__has_feature(cxx_static_assert) || (defined(__cplusplus) && __cplusplus >= 201103L) || (EIGEN_COMP_MSVC >= 1600))
 
     // if native static_assert is enabled, let's use it
     #define EIGEN_STATIC_ASSERT(X,MSG) static_assert(X,#MSG);
@@ -50,6 +50,7 @@
         THIS_METHOD_IS_ONLY_FOR_VECTORS_OF_A_SPECIFIC_SIZE,
         THIS_METHOD_IS_ONLY_FOR_MATRICES_OF_A_SPECIFIC_SIZE,
         THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE,
+        OUT_OF_RANGE_ACCESS,
         YOU_MADE_A_PROGRAMMING_MISTAKE,
         EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT,
         EIGEN_INTERNAL_COMPILATION_ERROR_OR_YOU_MADE_A_PROGRAMMING_MISTAKE,
@@ -84,6 +85,7 @@
         THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY,
         YOU_ARE_TRYING_TO_USE_AN_INDEX_BASED_ACCESSOR_ON_AN_EXPRESSION_THAT_DOES_NOT_SUPPORT_THAT,
         THIS_METHOD_IS_ONLY_FOR_1x1_EXPRESSIONS,
+        THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS,
         THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL,
         THIS_METHOD_IS_ONLY_FOR_ARRAYS_NOT_MATRICES,
         YOU_PASSED_A_ROW_VECTOR_BUT_A_COLUMN_VECTOR_WAS_EXPECTED,
@@ -92,7 +94,14 @@
         THE_STORAGE_ORDER_OF_BOTH_SIDES_MUST_MATCH,
         OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG,
         IMPLICIT_CONVERSION_TO_SCALAR_IS_FOR_INNER_PRODUCT_ONLY,
-        STORAGE_LAYOUT_DOES_NOT_MATCH
+        STORAGE_LAYOUT_DOES_NOT_MATCH,
+        EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT__INVALID_COST_VALUE,
+        THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS,
+        MATRIX_FREE_CONJUGATE_GRADIENT_IS_COMPATIBLE_WITH_UPPER_UNION_LOWER_MODE_ONLY,
+        THIS_TYPE_IS_NOT_SUPPORTED,
+        STORAGE_KIND_MUST_MATCH,
+        STORAGE_INDEX_MUST_MATCH,
+        CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY
       };
     };
 
@@ -103,15 +112,15 @@
     // Specialized implementation for MSVC to avoid "conditional
     // expression is constant" warnings.  This implementation doesn't
     // appear to work under GCC, hence the multiple implementations.
-    #ifdef _MSC_VER
+    #if EIGEN_COMP_MSVC
 
       #define EIGEN_STATIC_ASSERT(CONDITION,MSG) \
         {Eigen::internal::static_assertion<bool(CONDITION)>::MSG;}
 
     #else
-
+      // In some cases clang interprets bool(CONDITION) as function declaration
       #define EIGEN_STATIC_ASSERT(CONDITION,MSG) \
-        if (Eigen::internal::static_assertion<bool(CONDITION)>::MSG) {}
+        if (Eigen::internal::static_assertion<static_cast<bool>(CONDITION)>::MSG) {}
 
     #endif
 
@@ -159,7 +168,7 @@
 
 #define EIGEN_PREDICATE_SAME_MATRIX_SIZE(TYPE0,TYPE1) \
      ( \
-        (int(TYPE0::SizeAtCompileTime)==0 && int(TYPE1::SizeAtCompileTime)==0) \
+        (int(Eigen::internal::size_of_xpr_at_compile_time<TYPE0>::ret)==0 && int(Eigen::internal::size_of_xpr_at_compile_time<TYPE1>::ret)==0) \
     || (\
           (int(TYPE0::RowsAtCompileTime)==Eigen::Dynamic \
         || int(TYPE1::RowsAtCompileTime)==Eigen::Dynamic \
@@ -170,13 +179,8 @@
        ) \
      )
 
-#ifdef EIGEN2_SUPPORT
-  #define EIGEN_STATIC_ASSERT_NON_INTEGER(TYPE) \
-    eigen_assert(!NumTraits<Scalar>::IsInteger);
-#else
-  #define EIGEN_STATIC_ASSERT_NON_INTEGER(TYPE) \
+#define EIGEN_STATIC_ASSERT_NON_INTEGER(TYPE) \
     EIGEN_STATIC_ASSERT(!NumTraits<TYPE>::IsInteger, THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES)
-#endif
 
 
 // static assertion failing if it is guaranteed at compile-time that the two matrix expression types have different sizes
@@ -191,18 +195,22 @@
                           THIS_METHOD_IS_ONLY_FOR_1x1_EXPRESSIONS)
 
 #define EIGEN_STATIC_ASSERT_LVALUE(Derived) \
-      EIGEN_STATIC_ASSERT(internal::is_lvalue<Derived>::value, \
+      EIGEN_STATIC_ASSERT(Eigen::internal::is_lvalue<Derived>::value, \
                           THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY)
 
 #define EIGEN_STATIC_ASSERT_ARRAYXPR(Derived) \
-      EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Derived>::XprKind, ArrayXpr>::value), \
+      EIGEN_STATIC_ASSERT((Eigen::internal::is_same<typename Eigen::internal::traits<Derived>::XprKind, ArrayXpr>::value), \
                           THIS_METHOD_IS_ONLY_FOR_ARRAYS_NOT_MATRICES)
 
 #define EIGEN_STATIC_ASSERT_SAME_XPR_KIND(Derived1, Derived2) \
-      EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Derived1>::XprKind, \
-                                             typename internal::traits<Derived2>::XprKind \
+      EIGEN_STATIC_ASSERT((Eigen::internal::is_same<typename Eigen::internal::traits<Derived1>::XprKind, \
+                                             typename Eigen::internal::traits<Derived2>::XprKind \
                                             >::value), \
                           YOU_CANNOT_MIX_ARRAYS_AND_MATRICES)
 
+// Check that a cost value is positive, and that is stay within a reasonable range
+// TODO this check could be enabled for internal debugging only
+#define EIGEN_INTERNAL_CHECK_COST_VALUE(C) \
+      EIGEN_STATIC_ASSERT((C)>=0 && (C)<=HugeCost*HugeCost, EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT__INVALID_COST_VALUE);
 
 #endif // EIGEN_STATIC_ASSERT_H
diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h
index 781965d2c..ba5bd186d 100644
--- a/Eigen/src/Core/util/XprHelper.h
+++ b/Eigen/src/Core/util/XprHelper.h
@@ -14,20 +14,77 @@
 // just a workaround because GCC seems to not really like empty structs
 // FIXME: gcc 4.3 generates bad code when strict-aliasing is enabled
 // so currently we simply disable this optimization for gcc 4.3
-#if (defined __GNUG__) && !((__GNUC__==4) && (__GNUC_MINOR__==3))
+#if EIGEN_COMP_GNUC && !EIGEN_GNUC_AT(4,3)
   #define EIGEN_EMPTY_STRUCT_CTOR(X) \
-    EIGEN_STRONG_INLINE X() {} \
-    EIGEN_STRONG_INLINE X(const X& ) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE X() {} \
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE X(const X& ) {}
 #else
   #define EIGEN_EMPTY_STRUCT_CTOR(X)
 #endif
 
 namespace Eigen {
 
-typedef EIGEN_DEFAULT_DENSE_INDEX_TYPE DenseIndex;
-
 namespace internal {
 
+template<typename IndexDest, typename IndexSrc>
+EIGEN_DEVICE_FUNC
+inline IndexDest convert_index(const IndexSrc& idx) {
+  // for sizeof(IndexDest)>=sizeof(IndexSrc) compilers should be able to optimize this away:
+  eigen_internal_assert(idx <= NumTraits<IndexDest>::highest() && "Index value to big for target type");
+  return IndexDest(idx);
+}
+
+
+// promote_scalar_arg is an helper used in operation between an expression and a scalar, like:
+//    expression * scalar
+// Its role is to determine how the type T of the scalar operand should be promoted given the scalar type ExprScalar of the given expression.
+// The IsSupported template parameter must be provided by the caller as: internal::has_ReturnType<ScalarBinaryOpTraits<ExprScalar,T,op> >::value using the proper order for ExprScalar and T.
+// Then the logic is as follows:
+//  - if the operation is natively supported as defined by IsSupported, then the scalar type is not promoted, and T is returned.
+//  - otherwise, NumTraits<ExprScalar>::Literal is returned if T is implicitly convertible to NumTraits<ExprScalar>::Literal AND that this does not imply a float to integer conversion.
+//  - otherwise, ExprScalar is returned if T is implicitly convertible to ExprScalar AND that this does not imply a float to integer conversion.
+//  - In all other cases, the promoted type is not defined, and the respective operation is thus invalid and not available (SFINAE).
+template<typename ExprScalar,typename T, bool IsSupported>
+struct promote_scalar_arg;
+
+template<typename S,typename T>
+struct promote_scalar_arg<S,T,true>
+{
+  typedef T type;
+};
+
+// Recursively check safe conversion to PromotedType, and then ExprScalar if they are different.
+template<typename ExprScalar,typename T,typename PromotedType,
+  bool ConvertibleToLiteral = internal::is_convertible<T,PromotedType>::value,
+  bool IsSafe = NumTraits<T>::IsInteger || !NumTraits<PromotedType>::IsInteger>
+struct promote_scalar_arg_unsupported;
+
+// Start recursion with NumTraits<ExprScalar>::Literal
+template<typename S,typename T>
+struct promote_scalar_arg<S,T,false> : promote_scalar_arg_unsupported<S,T,typename NumTraits<S>::Literal> {};
+
+// We found a match!
+template<typename S,typename T, typename PromotedType>
+struct promote_scalar_arg_unsupported<S,T,PromotedType,true,true>
+{
+  typedef PromotedType type;
+};
+
+// No match, but no real-to-integer issues, and ExprScalar and current PromotedType are different,
+// so let's try to promote to ExprScalar
+template<typename ExprScalar,typename T, typename PromotedType>
+struct promote_scalar_arg_unsupported<ExprScalar,T,PromotedType,false,true>
+   : promote_scalar_arg_unsupported<ExprScalar,T,ExprScalar>
+{};
+
+// Unsafe real-to-integer, let's stop.
+template<typename S,typename T, typename PromotedType, bool ConvertibleToLiteral>
+struct promote_scalar_arg_unsupported<S,T,PromotedType,ConvertibleToLiteral,false> {};
+
+// T is not even convertible to ExprScalar, let's stop.
+template<typename S,typename T>
+struct promote_scalar_arg_unsupported<S,T,S,false,true> {};
+
 //classes inheriting no_assignment_operator don't generate a default operator=.
 class no_assignment_operator
 {
@@ -50,19 +107,19 @@ template<typename T, int Value> class variable_if_dynamic
 {
   public:
     EIGEN_EMPTY_STRUCT_CTOR(variable_if_dynamic)
-    explicit variable_if_dynamic(T v) { EIGEN_ONLY_USED_FOR_DEBUG(v); assert(v == T(Value)); }
-    static T value() { return T(Value); }
-    void setValue(T) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamic(T v) { EIGEN_ONLY_USED_FOR_DEBUG(v); eigen_assert(v == T(Value)); }
+    EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T value() { return T(Value); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T) {}
 };
 
 template<typename T> class variable_if_dynamic<T, Dynamic>
 {
     T m_value;
-    variable_if_dynamic() { assert(false); }
+    EIGEN_DEVICE_FUNC variable_if_dynamic() { eigen_assert(false); }
   public:
-    explicit variable_if_dynamic(T value) : m_value(value) {}
-    T value() const { return m_value; }
-    void setValue(T value) { m_value = value; }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamic(T value) : m_value(value) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T value() const { return m_value; }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T value) { m_value = value; }
 };
 
 /** \internal like variable_if_dynamic but for DynamicIndex
@@ -71,19 +128,19 @@ template<typename T, int Value> class variable_if_dynamicindex
 {
   public:
     EIGEN_EMPTY_STRUCT_CTOR(variable_if_dynamicindex)
-    explicit variable_if_dynamicindex(T v) { EIGEN_ONLY_USED_FOR_DEBUG(v); assert(v == T(Value)); }
-    static T value() { return T(Value); }
-    void setValue(T) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamicindex(T v) { EIGEN_ONLY_USED_FOR_DEBUG(v); eigen_assert(v == T(Value)); }
+    EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T value() { return T(Value); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T) {}
 };
 
 template<typename T> class variable_if_dynamicindex<T, DynamicIndex>
 {
     T m_value;
-    variable_if_dynamicindex() { assert(false); }
+    EIGEN_DEVICE_FUNC variable_if_dynamicindex() { eigen_assert(false); }
   public:
-    explicit variable_if_dynamicindex(T value) : m_value(value) {}
-    T value() const { return m_value; }
-    void setValue(T value) { m_value = value; }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamicindex(T value) : m_value(value) {}
+    EIGEN_DEVICE_FUNC T EIGEN_STRONG_INLINE value() const { return m_value; }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T value) { m_value = value; }
 };
 
 template<typename T> struct functor_traits
@@ -101,7 +158,73 @@ template<typename T> struct packet_traits;
 template<typename T> struct unpacket_traits
 {
   typedef T type;
-  enum {size=1};
+  typedef T half;
+  enum
+  {
+    size = 1,
+    alignment = 1
+  };
+};
+
+template<int Size, typename PacketType,
+         bool Stop = Size==Dynamic || (Size%unpacket_traits<PacketType>::size)==0 || is_same<PacketType,typename unpacket_traits<PacketType>::half>::value>
+struct find_best_packet_helper;
+
+template< int Size, typename PacketType>
+struct find_best_packet_helper<Size,PacketType,true>
+{
+  typedef PacketType type;
+};
+
+template<int Size, typename PacketType>
+struct find_best_packet_helper<Size,PacketType,false>
+{
+  typedef typename find_best_packet_helper<Size,typename unpacket_traits<PacketType>::half>::type type;
+};
+
+template<typename T, int Size>
+struct find_best_packet
+{
+  typedef typename find_best_packet_helper<Size,typename packet_traits<T>::type>::type type;
+};
+
+#if EIGEN_MAX_STATIC_ALIGN_BYTES>0
+template<int ArrayBytes, int AlignmentBytes,
+         bool Match     =  bool((ArrayBytes%AlignmentBytes)==0),
+         bool TryHalf   =  bool(EIGEN_MIN_ALIGN_BYTES<AlignmentBytes) >
+struct compute_default_alignment_helper
+{
+  enum { value = 0 };
+};
+
+template<int ArrayBytes, int AlignmentBytes, bool TryHalf>
+struct compute_default_alignment_helper<ArrayBytes, AlignmentBytes, true, TryHalf> // Match
+{
+  enum { value = AlignmentBytes };
+};
+
+template<int ArrayBytes, int AlignmentBytes>
+struct compute_default_alignment_helper<ArrayBytes, AlignmentBytes, false, true> // Try-half
+{
+  // current packet too large, try with an half-packet
+  enum { value = compute_default_alignment_helper<ArrayBytes, AlignmentBytes/2>::value };
+};
+#else
+// If static alignment is disabled, no need to bother.
+// This also avoids a division by zero in "bool Match =  bool((ArrayBytes%AlignmentBytes)==0)"
+template<int ArrayBytes, int AlignmentBytes>
+struct compute_default_alignment_helper
+{
+  enum { value = 0 };
+};
+#endif
+
+template<typename T, int Size> struct compute_default_alignment {
+  enum { value = compute_default_alignment_helper<Size*sizeof(T),EIGEN_MAX_STATIC_ALIGN_BYTES>::value };
+};
+
+template<typename T> struct compute_default_alignment<T,Dynamic> {
+  enum { value = EIGEN_MAX_ALIGN_BYTES };
 };
 
 template<typename _Scalar, int _Rows, int _Cols,
@@ -127,35 +250,12 @@ template<typename _Scalar, int _Rows, int _Cols,
 template<typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
 class compute_matrix_flags
 {
-    enum {
-      row_major_bit = Options&RowMajor ? RowMajorBit : 0,
-      is_dynamic_size_storage = MaxRows==Dynamic || MaxCols==Dynamic,
-
-      aligned_bit =
-      (
-            ((Options&DontAlign)==0)
-        && (
-#if EIGEN_ALIGN_STATICALLY
-             ((!is_dynamic_size_storage) && (((MaxCols*MaxRows*int(sizeof(Scalar))) % 16) == 0))
-#else
-             0
-#endif
-
-          ||
-
-#if EIGEN_ALIGN
-             is_dynamic_size_storage
-#else
-             0
-#endif
-
-          )
-      ) ? AlignedBit : 0,
-      packet_access_bit = packet_traits<Scalar>::Vectorizable && aligned_bit ? PacketAccessBit : 0
-    };
-
+    enum { row_major_bit = Options&RowMajor ? RowMajorBit : 0 };
   public:
-    enum { ret = LinearAccessBit | LvalueBit | DirectAccessBit | NestByRefBit | packet_access_bit | row_major_bit | aligned_bit };
+    // FIXME currently we still have to handle DirectAccessBit at the expression level to handle DenseCoeffsBase<>
+    // and then propagate this information to the evaluator's flags.
+    // However, I (Gael) think that DirectAccessBit should only matter at the evaluation stage.
+    enum { ret = DirectAccessBit | LvalueBit | NestByRefBit | row_major_bit };
 };
 
 template<int _Rows, int _Cols> struct size_at_compile_time
@@ -163,34 +263,43 @@ template<int _Rows, int _Cols> struct size_at_compile_time
   enum { ret = (_Rows==Dynamic || _Cols==Dynamic) ? Dynamic : _Rows * _Cols };
 };
 
+template<typename XprType> struct size_of_xpr_at_compile_time
+{
+  enum { ret = size_at_compile_time<traits<XprType>::RowsAtCompileTime,traits<XprType>::ColsAtCompileTime>::ret };
+};
+
 /* plain_matrix_type : the difference from eval is that plain_matrix_type is always a plain matrix type,
  * whereas eval is a const reference in the case of a matrix
  */
 
 template<typename T, typename StorageKind = typename traits<T>::StorageKind> struct plain_matrix_type;
-template<typename T, typename BaseClassType> struct plain_matrix_type_dense;
+template<typename T, typename BaseClassType, int Flags> struct plain_matrix_type_dense;
 template<typename T> struct plain_matrix_type<T,Dense>
 {
-  typedef typename plain_matrix_type_dense<T,typename traits<T>::XprKind>::type type;
+  typedef typename plain_matrix_type_dense<T,typename traits<T>::XprKind, traits<T>::Flags>::type type;
+};
+template<typename T> struct plain_matrix_type<T,DiagonalShape>
+{
+  typedef typename T::PlainObject type;
 };
 
-template<typename T> struct plain_matrix_type_dense<T,MatrixXpr>
+template<typename T, int Flags> struct plain_matrix_type_dense<T,MatrixXpr,Flags>
 {
   typedef Matrix<typename traits<T>::Scalar,
                 traits<T>::RowsAtCompileTime,
                 traits<T>::ColsAtCompileTime,
-                AutoAlign | (traits<T>::Flags&RowMajorBit ? RowMajor : ColMajor),
+                AutoAlign | (Flags&RowMajorBit ? RowMajor : ColMajor),
                 traits<T>::MaxRowsAtCompileTime,
                 traits<T>::MaxColsAtCompileTime
           > type;
 };
 
-template<typename T> struct plain_matrix_type_dense<T,ArrayXpr>
+template<typename T, int Flags> struct plain_matrix_type_dense<T,ArrayXpr,Flags>
 {
   typedef Array<typename traits<T>::Scalar,
                 traits<T>::RowsAtCompileTime,
                 traits<T>::ColsAtCompileTime,
-                AutoAlign | (traits<T>::Flags&RowMajorBit ? RowMajor : ColMajor),
+                AutoAlign | (Flags&RowMajorBit ? RowMajor : ColMajor),
                 traits<T>::MaxRowsAtCompileTime,
                 traits<T>::MaxColsAtCompileTime
           > type;
@@ -215,6 +324,11 @@ template<typename T> struct eval<T,Dense>
 //           > type;
 };
 
+template<typename T> struct eval<T,DiagonalShape>
+{
+  typedef typename plain_matrix_type<T>::type type;
+};
+
 // for matrices, no need to evaluate, just use a const reference to avoid a useless copy
 template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
 struct eval<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>, Dense>
@@ -229,6 +343,15 @@ struct eval<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>, Dense>
 };
 
 
+/* similar to plain_matrix_type, but using the evaluator's Flags */
+template<typename T, typename StorageKind = typename traits<T>::StorageKind> struct plain_object_eval;
+
+template<typename T>
+struct plain_object_eval<T,Dense>
+{
+  typedef typename plain_matrix_type_dense<T,typename traits<T>::XprKind, evaluator<T>::Flags>::type type;
+};
+
 
 /* plain_matrix_type_column_major : same as plain_matrix_type but guaranteed to be column-major
  */
@@ -266,9 +389,6 @@ template<typename T> struct plain_matrix_type_row_major
           > type;
 };
 
-// we should be able to get rid of this one too
-template<typename T> struct must_nest_by_value { enum { ret = false }; };
-
 /** \internal The reference selector for template expressions. The idea is that we don't
   * need to use references for expressions since they are light weight proxy
   * objects which should generate no copying overhead. */
@@ -280,6 +400,12 @@ struct ref_selector
     T const&,
     const T
   >::type type;
+  
+  typedef typename conditional<
+    bool(traits<T>::Flags & NestByRefBit),
+    T &,
+    T
+  >::type non_const_type;
 };
 
 /** \internal Adds the const qualifier on the value-type of T2 if and only if T1 is a const type */
@@ -293,54 +419,41 @@ struct transfer_constness
   >::type type;
 };
 
-/** \internal Determines how a given expression should be nested into another one.
+
+// However, we still need a mechanism to detect whether an expression which is evaluated multiple time
+// has to be evaluated into a temporary.
+// That's the purpose of this new nested_eval helper:
+/** \internal Determines how a given expression should be nested when evaluated multiple times.
   * For example, when you do a * (b+c), Eigen will determine how the expression b+c should be
-  * nested into the bigger product expression. The choice is between nesting the expression b+c as-is, or
+  * evaluated into the bigger product expression. The choice is between nesting the expression b+c as-is, or
   * evaluating that expression b+c into a temporary variable d, and nest d so that the resulting expression is
   * a*d. Evaluating can be beneficial for example if every coefficient access in the resulting expression causes
   * many coefficient accesses in the nested expressions -- as is the case with matrix product for example.
   *
-  * \param T the type of the expression being nested
-  * \param n the number of coefficient accesses in the nested expression for each coefficient access in the bigger expression.
-  *
-  * Note that if no evaluation occur, then the constness of T is preserved.
-  *
-  * Example. Suppose that a, b, and c are of type Matrix3d. The user forms the expression a*(b+c).
-  * b+c is an expression "sum of matrices", which we will denote by S. In order to determine how to nest it,
-  * the Product expression uses: nested<S, 3>::ret, which turns out to be Matrix3d because the internal logic of
-  * nested determined that in this case it was better to evaluate the expression b+c into a temporary. On the other hand,
-  * since a is of type Matrix3d, the Product expression nests it as nested<Matrix3d, 3>::ret, which turns out to be
-  * const Matrix3d&, because the internal logic of nested determined that since a was already a matrix, there was no point
-  * in copying it into another matrix.
+  * \tparam T the type of the expression being nested.
+  * \tparam n the number of coefficient accesses in the nested expression for each coefficient access in the bigger expression.
+  * \tparam PlainObject the type of the temporary if needed.
   */
-template<typename T, int n=1, typename PlainObject = typename eval<T>::type> struct nested
+template<typename T, int n, typename PlainObject = typename plain_object_eval<T>::type> struct nested_eval
 {
   enum {
-    // for the purpose of this test, to keep it reasonably simple, we arbitrarily choose a value of Dynamic values.
-    // the choice of 10000 makes it larger than any practical fixed value and even most dynamic values.
-    // in extreme cases where these assumptions would be wrong, we would still at worst suffer performance issues
-    // (poor choice of temporaries).
-    // it's important that this value can still be squared without integer overflowing.
-    DynamicAsInteger = 10000,
     ScalarReadCost = NumTraits<typename traits<T>::Scalar>::ReadCost,
-    ScalarReadCostAsInteger = ScalarReadCost == Dynamic ? int(DynamicAsInteger) : int(ScalarReadCost),
-    CoeffReadCost = traits<T>::CoeffReadCost,
-    CoeffReadCostAsInteger = CoeffReadCost == Dynamic ? int(DynamicAsInteger) : int(CoeffReadCost),
-    NAsInteger = n == Dynamic ? int(DynamicAsInteger) : n,
-    CostEvalAsInteger   = (NAsInteger+1) * ScalarReadCostAsInteger + CoeffReadCostAsInteger,
-    CostNoEvalAsInteger = NAsInteger * CoeffReadCostAsInteger
+    CoeffReadCost = evaluator<T>::CoeffReadCost,  // NOTE What if an evaluator evaluate itself into a tempory?
+                                                  //      Then CoeffReadCost will be small (e.g., 1) but we still have to evaluate, especially if n>1.
+                                                  //      This situation is already taken care by the EvalBeforeNestingBit flag, which is turned ON
+                                                  //      for all evaluator creating a temporary. This flag is then propagated by the parent evaluators.
+                                                  //      Another solution could be to count the number of temps?
+    NAsInteger = n == Dynamic ? HugeCost : n,
+    CostEval   = (NAsInteger+1) * ScalarReadCost + CoeffReadCost,
+    CostNoEval = NAsInteger * CoeffReadCost,
+    Evaluate = (int(evaluator<T>::Flags) & EvalBeforeNestingBit) || (int(CostEval) < int(CostNoEval))
   };
 
-  typedef typename conditional<
-      ( (int(traits<T>::Flags) & EvalBeforeNestingBit) ||
-        int(CostEvalAsInteger) < int(CostNoEvalAsInteger)
-      ),
-      PlainObject,
-      typename ref_selector<T>::type
-  >::type type;
+  typedef typename conditional<Evaluate, PlainObject, typename ref_selector<T>::type>::type type;
 };
 
 template<typename T>
+EIGEN_DEVICE_FUNC
 inline T* const_cast_ptr(const T* ptr)
 {
   return const_cast<T*>(ptr);
@@ -364,30 +477,13 @@ struct dense_xpr_base<Derived, ArrayXpr>
   typedef ArrayBase<Derived> type;
 };
 
-/** \internal Helper base class to add a scalar multiple operator
-  * overloads for complex types */
-template<typename Derived,typename Scalar,typename OtherScalar,
-         bool EnableIt = !is_same<Scalar,OtherScalar>::value >
-struct special_scalar_op_base : public DenseCoeffsBase<Derived>
-{
-  // dummy operator* so that the
-  // "using special_scalar_op_base::operator*" compiles
-  void operator*() const;
-};
+template<typename Derived, typename XprKind = typename traits<Derived>::XprKind, typename StorageKind = typename traits<Derived>::StorageKind>
+struct generic_xpr_base;
 
-template<typename Derived,typename Scalar,typename OtherScalar>
-struct special_scalar_op_base<Derived,Scalar,OtherScalar,true>  : public DenseCoeffsBase<Derived>
+template<typename Derived, typename XprKind>
+struct generic_xpr_base<Derived, XprKind, Dense>
 {
-  const CwiseUnaryOp<scalar_multiple2_op<Scalar,OtherScalar>, Derived>
-  operator*(const OtherScalar& scalar) const
-  {
-    return CwiseUnaryOp<scalar_multiple2_op<Scalar,OtherScalar>, Derived>
-      (*static_cast<const Derived*>(this), scalar_multiple2_op<Scalar,OtherScalar>(scalar));
-  }
-
-  inline friend const CwiseUnaryOp<scalar_multiple2_op<Scalar,OtherScalar>, Derived>
-  operator*(const OtherScalar& scalar, const Derived& matrix)
-  { return static_cast<const special_scalar_op_base&>(matrix).operator*(scalar); }
+  typedef typename dense_xpr_base<Derived,XprKind>::type type;
 };
 
 template<typename XprType, typename CastType> struct cast_return_type
@@ -405,9 +501,79 @@ template <typename A> struct promote_storage_type<A,A>
 {
   typedef A ret;
 };
+template <typename A> struct promote_storage_type<A, const A>
+{
+  typedef A ret;
+};
+template <typename A> struct promote_storage_type<const A, A>
+{
+  typedef A ret;
+};
+
+/** \internal Specify the "storage kind" of applying a coefficient-wise
+  * binary operations between two expressions of kinds A and B respectively.
+  * The template parameter Functor permits to specialize the resulting storage kind wrt to
+  * the functor.
+  * The default rules are as follows:
+  * \code
+  * A      op A      -> A
+  * A      op dense  -> dense
+  * dense  op B      -> dense
+  * sparse op dense  -> sparse
+  * dense  op sparse -> sparse
+  * \endcode
+  */
+template <typename A, typename B, typename Functor> struct cwise_promote_storage_type;
+
+template <typename A, typename Functor>                   struct cwise_promote_storage_type<A,A,Functor>                                      { typedef A      ret; };
+template <typename Functor>                               struct cwise_promote_storage_type<Dense,Dense,Functor>                              { typedef Dense  ret; };
+template <typename A, typename Functor>                   struct cwise_promote_storage_type<A,Dense,Functor>                                  { typedef Dense  ret; };
+template <typename B, typename Functor>                   struct cwise_promote_storage_type<Dense,B,Functor>                                  { typedef Dense  ret; };
+template <typename Functor>                               struct cwise_promote_storage_type<Sparse,Dense,Functor>                             { typedef Sparse ret; };
+template <typename Functor>                               struct cwise_promote_storage_type<Dense,Sparse,Functor>                             { typedef Sparse ret; };
+
+template <typename LhsKind, typename RhsKind, int LhsOrder, int RhsOrder> struct cwise_promote_storage_order {
+  enum { value = LhsOrder };
+};
+
+template <typename LhsKind, int LhsOrder, int RhsOrder>   struct cwise_promote_storage_order<LhsKind,Sparse,LhsOrder,RhsOrder>                { enum { value = RhsOrder }; };
+template <typename RhsKind, int LhsOrder, int RhsOrder>   struct cwise_promote_storage_order<Sparse,RhsKind,LhsOrder,RhsOrder>                { enum { value = LhsOrder }; };
+template <int Order>                                      struct cwise_promote_storage_order<Sparse,Sparse,Order,Order>                       { enum { value = Order }; };
+
+
+/** \internal Specify the "storage kind" of multiplying an expression of kind A with kind B.
+  * The template parameter ProductTag permits to specialize the resulting storage kind wrt to
+  * some compile-time properties of the product: GemmProduct, GemvProduct, OuterProduct, InnerProduct.
+  * The default rules are as follows:
+  * \code
+  *  K * K            -> K
+  *  dense * K        -> dense
+  *  K * dense        -> dense
+  *  diag * K         -> K
+  *  K * diag         -> K
+  *  Perm * K         -> K
+  * K * Perm          -> K
+  * \endcode
+  */
+template <typename A, typename B, int ProductTag> struct product_promote_storage_type;
+
+template <typename A, int ProductTag> struct product_promote_storage_type<A,                  A,                  ProductTag> { typedef A     ret;};
+template <int ProductTag>             struct product_promote_storage_type<Dense,              Dense,              ProductTag> { typedef Dense ret;};
+template <typename A, int ProductTag> struct product_promote_storage_type<A,                  Dense,              ProductTag> { typedef Dense ret; };
+template <typename B, int ProductTag> struct product_promote_storage_type<Dense,              B,                  ProductTag> { typedef Dense ret; };
+
+template <typename A, int ProductTag> struct product_promote_storage_type<A,                  DiagonalShape,      ProductTag> { typedef A ret; };
+template <typename B, int ProductTag> struct product_promote_storage_type<DiagonalShape,      B,                  ProductTag> { typedef B ret; };
+template <int ProductTag>             struct product_promote_storage_type<Dense,              DiagonalShape,      ProductTag> { typedef Dense ret; };
+template <int ProductTag>             struct product_promote_storage_type<DiagonalShape,      Dense,              ProductTag> { typedef Dense ret; };
+
+template <typename A, int ProductTag> struct product_promote_storage_type<A,                  PermutationStorage, ProductTag> { typedef A ret; };
+template <typename B, int ProductTag> struct product_promote_storage_type<PermutationStorage, B,                  ProductTag> { typedef B ret; };
+template <int ProductTag>             struct product_promote_storage_type<Dense,              PermutationStorage, ProductTag> { typedef Dense ret; };
+template <int ProductTag>             struct product_promote_storage_type<PermutationStorage, Dense,              ProductTag> { typedef Dense ret; };
 
 /** \internal gives the plain matrix or array type to store a row/column/diagonal of a matrix type.
-  * \param Scalar optional parameter allowing to pass a different scalar type than the one of the MatrixType.
+  * \tparam Scalar optional parameter allowing to pass a different scalar type than the one of the MatrixType.
   */
 template<typename ExpressionType, typename Scalar = typename ExpressionType::Scalar>
 struct plain_row_type
@@ -455,15 +621,201 @@ struct plain_diag_type
   >::type type;
 };
 
+template<typename Expr,typename Scalar = typename Expr::Scalar>
+struct plain_constant_type
+{
+  enum { Options = (traits<Expr>::Flags&RowMajorBit)?RowMajor:0 };
+
+  typedef Array<Scalar,  traits<Expr>::RowsAtCompileTime,   traits<Expr>::ColsAtCompileTime,
+                Options, traits<Expr>::MaxRowsAtCompileTime,traits<Expr>::MaxColsAtCompileTime> array_type;
+
+  typedef Matrix<Scalar,  traits<Expr>::RowsAtCompileTime,   traits<Expr>::ColsAtCompileTime,
+                 Options, traits<Expr>::MaxRowsAtCompileTime,traits<Expr>::MaxColsAtCompileTime> matrix_type;
+
+  typedef CwiseNullaryOp<scalar_constant_op<Scalar>, const typename conditional<is_same< typename traits<Expr>::XprKind, MatrixXpr >::value, matrix_type, array_type>::type > type;
+};
+
 template<typename ExpressionType>
 struct is_lvalue
 {
-  enum { value = !bool(is_const<ExpressionType>::value) &&
+  enum { value = (!bool(is_const<ExpressionType>::value)) &&
                  bool(traits<ExpressionType>::Flags & LvalueBit) };
 };
 
+template<typename T> struct is_diagonal
+{ enum { ret = false }; };
+
+template<typename T> struct is_diagonal<DiagonalBase<T> >
+{ enum { ret = true }; };
+
+template<typename T> struct is_diagonal<DiagonalWrapper<T> >
+{ enum { ret = true }; };
+
+template<typename T, int S> struct is_diagonal<DiagonalMatrix<T,S> >
+{ enum { ret = true }; };
+
+template<typename S1, typename S2> struct glue_shapes;
+template<> struct glue_shapes<DenseShape,TriangularShape> { typedef TriangularShape type;  };
+
+template<typename T1, typename T2>
+bool is_same_dense(const T1 &mat1, const T2 &mat2, typename enable_if<has_direct_access<T1>::ret&&has_direct_access<T2>::ret, T1>::type * = 0)
+{
+  return (mat1.data()==mat2.data()) && (mat1.innerStride()==mat2.innerStride()) && (mat1.outerStride()==mat2.outerStride());
+}
+
+template<typename T1, typename T2>
+bool is_same_dense(const T1 &, const T2 &, typename enable_if<!(has_direct_access<T1>::ret&&has_direct_access<T2>::ret), T1>::type * = 0)
+{
+  return false;
+}
+
+// Internal helper defining the cost of a scalar division for the type T.
+// The default heuristic can be specialized for each scalar type and architecture.
+template<typename T,bool Vectorized=false,typename EnaleIf = void>
+struct scalar_div_cost {
+  enum { value = 8*NumTraits<T>::MulCost };
+};
+
+template<typename T,bool Vectorized>
+struct scalar_div_cost<std::complex<T>, Vectorized> {
+  enum { value = 2*scalar_div_cost<T>::value
+               + 6*NumTraits<T>::MulCost
+               + 3*NumTraits<T>::AddCost
+  };
+};
+
+
+template<bool Vectorized>
+struct scalar_div_cost<signed long,Vectorized,typename conditional<sizeof(long)==8,void,false_type>::type> { enum { value = 24 }; };
+template<bool Vectorized>
+struct scalar_div_cost<unsigned long,Vectorized,typename conditional<sizeof(long)==8,void,false_type>::type> { enum { value = 21 }; };
+
+
+#ifdef EIGEN_DEBUG_ASSIGN
+std::string demangle_traversal(int t)
+{
+  if(t==DefaultTraversal) return "DefaultTraversal";
+  if(t==LinearTraversal) return "LinearTraversal";
+  if(t==InnerVectorizedTraversal) return "InnerVectorizedTraversal";
+  if(t==LinearVectorizedTraversal) return "LinearVectorizedTraversal";
+  if(t==SliceVectorizedTraversal) return "SliceVectorizedTraversal";
+  return "?";
+}
+std::string demangle_unrolling(int t)
+{
+  if(t==NoUnrolling) return "NoUnrolling";
+  if(t==InnerUnrolling) return "InnerUnrolling";
+  if(t==CompleteUnrolling) return "CompleteUnrolling";
+  return "?";
+}
+std::string demangle_flags(int f)
+{
+  std::string res;
+  if(f&RowMajorBit)                 res += " | RowMajor";
+  if(f&PacketAccessBit)             res += " | Packet";
+  if(f&LinearAccessBit)             res += " | Linear";
+  if(f&LvalueBit)                   res += " | Lvalue";
+  if(f&DirectAccessBit)             res += " | Direct";
+  if(f&NestByRefBit)                res += " | NestByRef";
+  if(f&NoPreferredStorageOrderBit)  res += " | NoPreferredStorageOrderBit";
+  
+  return res;
+}
+#endif
+
 } // end namespace internal
 
+
+/** \class ScalarBinaryOpTraits
+  * \ingroup Core_Module
+  *
+  * \brief Determines whether the given binary operation of two numeric types is allowed and what the scalar return type is.
+  *
+  * This class permits to control the scalar return type of any binary operation performed on two different scalar types through (partial) template specializations.
+  *
+  * For instance, let \c U1, \c U2 and \c U3 be three user defined scalar types for which most operations between instances of \c U1 and \c U2 returns an \c U3.
+  * You can let %Eigen knows that by defining:
+    \code
+    template<typename BinaryOp>
+    struct ScalarBinaryOpTraits<U1,U2,BinaryOp> { typedef U3 ReturnType;  };
+    template<typename BinaryOp>
+    struct ScalarBinaryOpTraits<U2,U1,BinaryOp> { typedef U3 ReturnType;  };
+    \endcode
+  * You can then explicitly disable some particular operations to get more explicit error messages:
+    \code
+    template<>
+    struct ScalarBinaryOpTraits<U1,U2,internal::scalar_max_op<U1,U2> > {};
+    \endcode
+  * Or customize the return type for individual operation:
+    \code
+    template<>
+    struct ScalarBinaryOpTraits<U1,U2,internal::scalar_sum_op<U1,U2> > { typedef U1 ReturnType; };
+    \endcode
+  *
+  * By default, the following generic combinations are supported:
+  <table class="manual">
+  <tr><th>ScalarA</th><th>ScalarB</th><th>BinaryOp</th><th>ReturnType</th><th>Note</th></tr>
+  <tr            ><td>\c T </td><td>\c T </td><td>\c * </td><td>\c T </td><td></td></tr>
+  <tr class="alt"><td>\c NumTraits<T>::Real </td><td>\c T </td><td>\c * </td><td>\c T </td><td>Only if \c NumTraits<T>::IsComplex </td></tr>
+  <tr            ><td>\c T </td><td>\c NumTraits<T>::Real </td><td>\c * </td><td>\c T </td><td>Only if \c NumTraits<T>::IsComplex </td></tr>
+  </table>
+  *
+  * \sa CwiseBinaryOp
+  */
+template<typename ScalarA, typename ScalarB, typename BinaryOp=internal::scalar_product_op<ScalarA,ScalarB> >
+struct ScalarBinaryOpTraits
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  // for backward compatibility, use the hints given by the (deprecated) internal::scalar_product_traits class.
+  : internal::scalar_product_traits<ScalarA,ScalarB>
+#endif // EIGEN_PARSED_BY_DOXYGEN
+{};
+
+template<typename T, typename BinaryOp>
+struct ScalarBinaryOpTraits<T,T,BinaryOp>
+{
+  typedef T ReturnType;
+};
+
+template <typename T, typename BinaryOp>
+struct ScalarBinaryOpTraits<T, typename NumTraits<typename internal::enable_if<NumTraits<T>::IsComplex,T>::type>::Real, BinaryOp>
+{
+  typedef T ReturnType;
+};
+template <typename T, typename BinaryOp>
+struct ScalarBinaryOpTraits<typename NumTraits<typename internal::enable_if<NumTraits<T>::IsComplex,T>::type>::Real, T, BinaryOp>
+{
+  typedef T ReturnType;
+};
+
+// For Matrix * Permutation
+template<typename T, typename BinaryOp>
+struct ScalarBinaryOpTraits<T,void,BinaryOp>
+{
+  typedef T ReturnType;
+};
+
+// For Permutation * Matrix
+template<typename T, typename BinaryOp>
+struct ScalarBinaryOpTraits<void,T,BinaryOp>
+{
+  typedef T ReturnType;
+};
+
+// for Permutation*Permutation
+template<typename BinaryOp>
+struct ScalarBinaryOpTraits<void,void,BinaryOp>
+{
+  typedef void ReturnType;
+};
+
+// We require Lhs and Rhs to have "compatible" scalar types.
+// It is tempting to always allow mixing different types but remember that this is often impossible in the vectorized paths.
+// So allowing mixing different types gives very unexpected errors when enabling vectorization, when the user tries to
+// add together a float matrix and a double matrix.
+#define EIGEN_CHECK_BINARY_COMPATIBILIY(BINOP,LHS,RHS) \
+  EIGEN_STATIC_ASSERT((Eigen::internal::has_ReturnType<ScalarBinaryOpTraits<LHS, RHS,BINOP> >::value), \
+    YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+    
 } // end namespace Eigen
 
 #endif // EIGEN_XPRHELPER_H
diff --git a/Eigen/src/Eigen2Support/Block.h b/Eigen/src/Eigen2Support/Block.h
deleted file mode 100644
index 604456f40..000000000
--- a/Eigen/src/Eigen2Support/Block.h
+++ /dev/null
@@ -1,126 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
-// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_BLOCK2_H
-#define EIGEN_BLOCK2_H
-
-namespace Eigen { 
-
-/** \returns a dynamic-size expression of a corner of *this.
-  *
-  * \param type the type of corner. Can be \a Eigen::TopLeft, \a Eigen::TopRight,
-  * \a Eigen::BottomLeft, \a Eigen::BottomRight.
-  * \param cRows the number of rows in the corner
-  * \param cCols the number of columns in the corner
-  *
-  * Example: \include MatrixBase_corner_enum_int_int.cpp
-  * Output: \verbinclude MatrixBase_corner_enum_int_int.out
-  *
-  * \note Even though the returned expression has dynamic size, in the case
-  * when it is applied to a fixed-size matrix, it inherits a fixed maximal size,
-  * which means that evaluating it does not cause a dynamic memory allocation.
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
-template<typename Derived>
-inline Block<Derived> DenseBase<Derived>
-  ::corner(CornerType type, Index cRows, Index cCols)
-{
-  switch(type)
-  {
-    default:
-      eigen_assert(false && "Bad corner type.");
-    case TopLeft:
-      return Block<Derived>(derived(), 0, 0, cRows, cCols);
-    case TopRight:
-      return Block<Derived>(derived(), 0, cols() - cCols, cRows, cCols);
-    case BottomLeft:
-      return Block<Derived>(derived(), rows() - cRows, 0, cRows, cCols);
-    case BottomRight:
-      return Block<Derived>(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
-  }
-}
-
-/** This is the const version of corner(CornerType, Index, Index).*/
-template<typename Derived>
-inline const Block<Derived>
-DenseBase<Derived>::corner(CornerType type, Index cRows, Index cCols) const
-{
-  switch(type)
-  {
-    default:
-      eigen_assert(false && "Bad corner type.");
-    case TopLeft:
-      return Block<Derived>(derived(), 0, 0, cRows, cCols);
-    case TopRight:
-      return Block<Derived>(derived(), 0, cols() - cCols, cRows, cCols);
-    case BottomLeft:
-      return Block<Derived>(derived(), rows() - cRows, 0, cRows, cCols);
-    case BottomRight:
-      return Block<Derived>(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
-  }
-}
-
-/** \returns a fixed-size expression of a corner of *this.
-  *
-  * \param type the type of corner. Can be \a Eigen::TopLeft, \a Eigen::TopRight,
-  * \a Eigen::BottomLeft, \a Eigen::BottomRight.
-  *
-  * The template parameters CRows and CCols arethe number of rows and columns in the corner.
-  *
-  * Example: \include MatrixBase_template_int_int_corner_enum.cpp
-  * Output: \verbinclude MatrixBase_template_int_int_corner_enum.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
-template<typename Derived>
-template<int CRows, int CCols>
-inline Block<Derived, CRows, CCols>
-DenseBase<Derived>::corner(CornerType type)
-{
-  switch(type)
-  {
-    default:
-      eigen_assert(false && "Bad corner type.");
-    case TopLeft:
-      return Block<Derived, CRows, CCols>(derived(), 0, 0);
-    case TopRight:
-      return Block<Derived, CRows, CCols>(derived(), 0, cols() - CCols);
-    case BottomLeft:
-      return Block<Derived, CRows, CCols>(derived(), rows() - CRows, 0);
-    case BottomRight:
-      return Block<Derived, CRows, CCols>(derived(), rows() - CRows, cols() - CCols);
-  }
-}
-
-/** This is the const version of corner<int, int>(CornerType).*/
-template<typename Derived>
-template<int CRows, int CCols>
-inline const Block<Derived, CRows, CCols>
-DenseBase<Derived>::corner(CornerType type) const
-{
-  switch(type)
-  {
-    default:
-      eigen_assert(false && "Bad corner type.");
-    case TopLeft:
-      return Block<Derived, CRows, CCols>(derived(), 0, 0);
-    case TopRight:
-      return Block<Derived, CRows, CCols>(derived(), 0, cols() - CCols);
-    case BottomLeft:
-      return Block<Derived, CRows, CCols>(derived(), rows() - CRows, 0);
-    case BottomRight:
-      return Block<Derived, CRows, CCols>(derived(), rows() - CRows, cols() - CCols);
-  }
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN_BLOCK2_H
diff --git a/Eigen/src/Eigen2Support/CMakeLists.txt b/Eigen/src/Eigen2Support/CMakeLists.txt
deleted file mode 100644
index 7ae41b3cb..000000000
--- a/Eigen/src/Eigen2Support/CMakeLists.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-FILE(GLOB Eigen_Eigen2Support_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_Eigen2Support_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Eigen2Support COMPONENT Devel
-  )
-
-ADD_SUBDIRECTORY(Geometry)
-\ No newline at end of file
diff --git a/Eigen/src/Eigen2Support/Cwise.h b/Eigen/src/Eigen2Support/Cwise.h
deleted file mode 100644
index d95009b6e..000000000
--- a/Eigen/src/Eigen2Support/Cwise.h
+++ /dev/null
@@ -1,192 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
-// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CWISE_H
-#define EIGEN_CWISE_H
-
-namespace Eigen { 
-
-/** \internal
-  * convenient macro to defined the return type of a cwise binary operation */
-#define EIGEN_CWISE_BINOP_RETURN_TYPE(OP) \
-    CwiseBinaryOp<OP<typename internal::traits<ExpressionType>::Scalar>, ExpressionType, OtherDerived>
-
-/** \internal
-  * convenient macro to defined the return type of a cwise unary operation */
-#define EIGEN_CWISE_UNOP_RETURN_TYPE(OP) \
-    CwiseUnaryOp<OP<typename internal::traits<ExpressionType>::Scalar>, ExpressionType>
-
-/** \internal
-  * convenient macro to defined the return type of a cwise comparison to a scalar */
-#define EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(OP) \
-    CwiseBinaryOp<OP<typename internal::traits<ExpressionType>::Scalar>, ExpressionType, \
-        typename ExpressionType::ConstantReturnType >
-
-/** \class Cwise
-  *
-  * \brief Pseudo expression providing additional coefficient-wise operations
-  *
-  * \param ExpressionType the type of the object on which to do coefficient-wise operations
-  *
-  * This class represents an expression with additional coefficient-wise features.
-  * It is the return type of MatrixBase::cwise()
-  * and most of the time this is the only way it is used.
-  *
-  * Example: \include MatrixBase_cwise_const.cpp
-  * Output: \verbinclude MatrixBase_cwise_const.out
-  *
-  * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_CWISE_PLUGIN.
-  *
-  * \sa MatrixBase::cwise() const, MatrixBase::cwise()
-  */
-template<typename ExpressionType> class Cwise
-{
-  public:
-
-    typedef typename internal::traits<ExpressionType>::Scalar Scalar;
-    typedef typename internal::conditional<internal::must_nest_by_value<ExpressionType>::ret,
-        ExpressionType, const ExpressionType&>::type ExpressionTypeNested;
-    typedef CwiseUnaryOp<internal::scalar_add_op<Scalar>, ExpressionType> ScalarAddReturnType;
-
-    inline Cwise(const ExpressionType& matrix) : m_matrix(matrix) {}
-
-    /** \internal */
-    inline const ExpressionType& _expression() const { return m_matrix; }
-
-    template<typename OtherDerived>
-    const EIGEN_CWISE_PRODUCT_RETURN_TYPE(ExpressionType,OtherDerived)
-    operator*(const MatrixBase<OtherDerived> &other) const;
-
-    template<typename OtherDerived>
-    const EIGEN_CWISE_BINOP_RETURN_TYPE(internal::scalar_quotient_op)
-    operator/(const MatrixBase<OtherDerived> &other) const;
-
-    /** \deprecated ArrayBase::min() */
-    template<typename OtherDerived>
-    const EIGEN_CWISE_BINOP_RETURN_TYPE(internal::scalar_min_op)
-    (min)(const MatrixBase<OtherDerived> &other) const
-    { return EIGEN_CWISE_BINOP_RETURN_TYPE(internal::scalar_min_op)(_expression(), other.derived()); }
-
-    /** \deprecated ArrayBase::max() */
-    template<typename OtherDerived>
-    const EIGEN_CWISE_BINOP_RETURN_TYPE(internal::scalar_max_op)
-    (max)(const MatrixBase<OtherDerived> &other) const
-    { return EIGEN_CWISE_BINOP_RETURN_TYPE(internal::scalar_max_op)(_expression(), other.derived()); }
-
-    const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_abs_op)      abs() const;
-    const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_abs2_op)     abs2() const;
-    const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_square_op)   square() const;
-    const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_cube_op)     cube() const;
-    const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_inverse_op)  inverse() const;
-    const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_sqrt_op)     sqrt() const;
-    const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_exp_op)      exp() const;
-    const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_log_op)      log() const;
-    const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_cos_op)      cos() const;
-    const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_sin_op)      sin() const;
-    const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_pow_op)      pow(const Scalar& exponent) const;
-
-    const ScalarAddReturnType
-    operator+(const Scalar& scalar) const;
-
-    /** \relates Cwise */
-    friend const ScalarAddReturnType
-    operator+(const Scalar& scalar, const Cwise& mat)
-    { return mat + scalar; }
-
-    ExpressionType& operator+=(const Scalar& scalar);
-
-    const ScalarAddReturnType
-    operator-(const Scalar& scalar) const;
-
-    ExpressionType& operator-=(const Scalar& scalar);
-
-    template<typename OtherDerived>
-    inline ExpressionType& operator*=(const MatrixBase<OtherDerived> &other);
-
-    template<typename OtherDerived>
-    inline ExpressionType& operator/=(const MatrixBase<OtherDerived> &other);
-
-    template<typename OtherDerived> const EIGEN_CWISE_BINOP_RETURN_TYPE(std::less)
-    operator<(const MatrixBase<OtherDerived>& other) const;
-
-    template<typename OtherDerived> const EIGEN_CWISE_BINOP_RETURN_TYPE(std::less_equal)
-    operator<=(const MatrixBase<OtherDerived>& other) const;
-
-    template<typename OtherDerived> const EIGEN_CWISE_BINOP_RETURN_TYPE(std::greater)
-    operator>(const MatrixBase<OtherDerived>& other) const;
-
-    template<typename OtherDerived> const EIGEN_CWISE_BINOP_RETURN_TYPE(std::greater_equal)
-    operator>=(const MatrixBase<OtherDerived>& other) const;
-
-    template<typename OtherDerived> const EIGEN_CWISE_BINOP_RETURN_TYPE(std::equal_to)
-    operator==(const MatrixBase<OtherDerived>& other) const;
-
-    template<typename OtherDerived> const EIGEN_CWISE_BINOP_RETURN_TYPE(std::not_equal_to)
-    operator!=(const MatrixBase<OtherDerived>& other) const;
-
-    // comparisons to a scalar value
-    const EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::less)
-    operator<(Scalar s) const;
-
-    const EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::less_equal)
-    operator<=(Scalar s) const;
-
-    const EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::greater)
-    operator>(Scalar s) const;
-
-    const EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::greater_equal)
-    operator>=(Scalar s) const;
-
-    const EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::equal_to)
-    operator==(Scalar s) const;
-
-    const EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::not_equal_to)
-    operator!=(Scalar s) const;
-
-    // allow to extend Cwise outside Eigen
-    #ifdef EIGEN_CWISE_PLUGIN
-    #include EIGEN_CWISE_PLUGIN
-    #endif
-
-  protected:
-    ExpressionTypeNested m_matrix;
-};
-
-
-/** \returns a Cwise wrapper of *this providing additional coefficient-wise operations
-  *
-  * Example: \include MatrixBase_cwise_const.cpp
-  * Output: \verbinclude MatrixBase_cwise_const.out
-  *
-  * \sa class Cwise, cwise()
-  */
-template<typename Derived>
-inline const Cwise<Derived> MatrixBase<Derived>::cwise() const
-{
-  return derived();
-}
-
-/** \returns a Cwise wrapper of *this providing additional coefficient-wise operations
-  *
-  * Example: \include MatrixBase_cwise.cpp
-  * Output: \verbinclude MatrixBase_cwise.out
-  *
-  * \sa class Cwise, cwise() const
-  */
-template<typename Derived>
-inline Cwise<Derived> MatrixBase<Derived>::cwise()
-{
-  return derived();
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN_CWISE_H
diff --git a/Eigen/src/Eigen2Support/CwiseOperators.h b/Eigen/src/Eigen2Support/CwiseOperators.h
deleted file mode 100644
index 482f30648..000000000
--- a/Eigen/src/Eigen2Support/CwiseOperators.h
+++ /dev/null
@@ -1,298 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_ARRAY_CWISE_OPERATORS_H
-#define EIGEN_ARRAY_CWISE_OPERATORS_H
-
-namespace Eigen { 
-
-/***************************************************************************
-* The following functions were defined in Core
-***************************************************************************/
-
-
-/** \deprecated ArrayBase::abs() */
-template<typename ExpressionType>
-EIGEN_STRONG_INLINE const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_abs_op)
-Cwise<ExpressionType>::abs() const
-{
-  return _expression();
-}
-
-/** \deprecated ArrayBase::abs2() */
-template<typename ExpressionType>
-EIGEN_STRONG_INLINE const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_abs2_op)
-Cwise<ExpressionType>::abs2() const
-{
-  return _expression();
-}
-
-/** \deprecated ArrayBase::exp() */
-template<typename ExpressionType>
-inline const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_exp_op)
-Cwise<ExpressionType>::exp() const
-{
-  return _expression();
-}
-
-/** \deprecated ArrayBase::log() */
-template<typename ExpressionType>
-inline const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_log_op)
-Cwise<ExpressionType>::log() const
-{
-  return _expression();
-}
-
-/** \deprecated ArrayBase::operator*() */
-template<typename ExpressionType>
-template<typename OtherDerived>
-EIGEN_STRONG_INLINE const EIGEN_CWISE_PRODUCT_RETURN_TYPE(ExpressionType,OtherDerived)
-Cwise<ExpressionType>::operator*(const MatrixBase<OtherDerived> &other) const
-{
-  return EIGEN_CWISE_PRODUCT_RETURN_TYPE(ExpressionType,OtherDerived)(_expression(), other.derived());
-}
-
-/** \deprecated ArrayBase::operator/() */
-template<typename ExpressionType>
-template<typename OtherDerived>
-EIGEN_STRONG_INLINE const EIGEN_CWISE_BINOP_RETURN_TYPE(internal::scalar_quotient_op)
-Cwise<ExpressionType>::operator/(const MatrixBase<OtherDerived> &other) const
-{
-  return EIGEN_CWISE_BINOP_RETURN_TYPE(internal::scalar_quotient_op)(_expression(), other.derived());
-}
-
-/** \deprecated ArrayBase::operator*=() */
-template<typename ExpressionType>
-template<typename OtherDerived>
-inline ExpressionType& Cwise<ExpressionType>::operator*=(const MatrixBase<OtherDerived> &other)
-{
-  return m_matrix.const_cast_derived() = *this * other;
-}
-
-/** \deprecated ArrayBase::operator/=() */
-template<typename ExpressionType>
-template<typename OtherDerived>
-inline ExpressionType& Cwise<ExpressionType>::operator/=(const MatrixBase<OtherDerived> &other)
-{
-  return m_matrix.const_cast_derived() = *this / other;
-}
-
-/***************************************************************************
-* The following functions were defined in Array
-***************************************************************************/
-
-// -- unary operators --
-
-/** \deprecated ArrayBase::sqrt() */
-template<typename ExpressionType>
-inline const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_sqrt_op)
-Cwise<ExpressionType>::sqrt() const
-{
-  return _expression();
-}
-
-/** \deprecated ArrayBase::cos() */
-template<typename ExpressionType>
-inline const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_cos_op)
-Cwise<ExpressionType>::cos() const
-{
-  return _expression();
-}
-
-
-/** \deprecated ArrayBase::sin() */
-template<typename ExpressionType>
-inline const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_sin_op)
-Cwise<ExpressionType>::sin() const
-{
-  return _expression();
-}
-
-
-/** \deprecated ArrayBase::log() */
-template<typename ExpressionType>
-inline const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_pow_op)
-Cwise<ExpressionType>::pow(const Scalar& exponent) const
-{
-  return EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_pow_op)(_expression(), internal::scalar_pow_op<Scalar>(exponent));
-}
-
-
-/** \deprecated ArrayBase::inverse() */
-template<typename ExpressionType>
-inline const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_inverse_op)
-Cwise<ExpressionType>::inverse() const
-{
-  return _expression();
-}
-
-/** \deprecated ArrayBase::square() */
-template<typename ExpressionType>
-inline const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_square_op)
-Cwise<ExpressionType>::square() const
-{
-  return _expression();
-}
-
-/** \deprecated ArrayBase::cube() */
-template<typename ExpressionType>
-inline const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_cube_op)
-Cwise<ExpressionType>::cube() const
-{
-  return _expression();
-}
-
-
-// -- binary operators --
-
-/** \deprecated ArrayBase::operator<() */
-template<typename ExpressionType>
-template<typename OtherDerived>
-inline const EIGEN_CWISE_BINOP_RETURN_TYPE(std::less)
-Cwise<ExpressionType>::operator<(const MatrixBase<OtherDerived> &other) const
-{
-  return EIGEN_CWISE_BINOP_RETURN_TYPE(std::less)(_expression(), other.derived());
-}
-
-/** \deprecated ArrayBase::<=() */
-template<typename ExpressionType>
-template<typename OtherDerived>
-inline const EIGEN_CWISE_BINOP_RETURN_TYPE(std::less_equal)
-Cwise<ExpressionType>::operator<=(const MatrixBase<OtherDerived> &other) const
-{
-  return EIGEN_CWISE_BINOP_RETURN_TYPE(std::less_equal)(_expression(), other.derived());
-}
-
-/** \deprecated ArrayBase::operator>() */
-template<typename ExpressionType>
-template<typename OtherDerived>
-inline const EIGEN_CWISE_BINOP_RETURN_TYPE(std::greater)
-Cwise<ExpressionType>::operator>(const MatrixBase<OtherDerived> &other) const
-{
-  return EIGEN_CWISE_BINOP_RETURN_TYPE(std::greater)(_expression(), other.derived());
-}
-
-/** \deprecated ArrayBase::operator>=() */
-template<typename ExpressionType>
-template<typename OtherDerived>
-inline const EIGEN_CWISE_BINOP_RETURN_TYPE(std::greater_equal)
-Cwise<ExpressionType>::operator>=(const MatrixBase<OtherDerived> &other) const
-{
-  return EIGEN_CWISE_BINOP_RETURN_TYPE(std::greater_equal)(_expression(), other.derived());
-}
-
-/** \deprecated ArrayBase::operator==() */
-template<typename ExpressionType>
-template<typename OtherDerived>
-inline const EIGEN_CWISE_BINOP_RETURN_TYPE(std::equal_to)
-Cwise<ExpressionType>::operator==(const MatrixBase<OtherDerived> &other) const
-{
-  return EIGEN_CWISE_BINOP_RETURN_TYPE(std::equal_to)(_expression(), other.derived());
-}
-
-/** \deprecated ArrayBase::operator!=() */
-template<typename ExpressionType>
-template<typename OtherDerived>
-inline const EIGEN_CWISE_BINOP_RETURN_TYPE(std::not_equal_to)
-Cwise<ExpressionType>::operator!=(const MatrixBase<OtherDerived> &other) const
-{
-  return EIGEN_CWISE_BINOP_RETURN_TYPE(std::not_equal_to)(_expression(), other.derived());
-}
-
-// comparisons to scalar value
-
-/** \deprecated ArrayBase::operator<(Scalar) */
-template<typename ExpressionType>
-inline const EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::less)
-Cwise<ExpressionType>::operator<(Scalar s) const
-{
-  return EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::less)(_expression(),
-            typename ExpressionType::ConstantReturnType(_expression().rows(), _expression().cols(), s));
-}
-
-/** \deprecated ArrayBase::operator<=(Scalar) */
-template<typename ExpressionType>
-inline const EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::less_equal)
-Cwise<ExpressionType>::operator<=(Scalar s) const
-{
-  return EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::less_equal)(_expression(),
-            typename ExpressionType::ConstantReturnType(_expression().rows(), _expression().cols(), s));
-}
-
-/** \deprecated ArrayBase::operator>(Scalar) */
-template<typename ExpressionType>
-inline const EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::greater)
-Cwise<ExpressionType>::operator>(Scalar s) const
-{
-  return EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::greater)(_expression(),
-            typename ExpressionType::ConstantReturnType(_expression().rows(), _expression().cols(), s));
-}
-
-/** \deprecated ArrayBase::operator>=(Scalar) */
-template<typename ExpressionType>
-inline const EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::greater_equal)
-Cwise<ExpressionType>::operator>=(Scalar s) const
-{
-  return EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::greater_equal)(_expression(),
-            typename ExpressionType::ConstantReturnType(_expression().rows(), _expression().cols(), s));
-}
-
-/** \deprecated ArrayBase::operator==(Scalar) */
-template<typename ExpressionType>
-inline const EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::equal_to)
-Cwise<ExpressionType>::operator==(Scalar s) const
-{
-  return EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::equal_to)(_expression(),
-            typename ExpressionType::ConstantReturnType(_expression().rows(), _expression().cols(), s));
-}
-
-/** \deprecated ArrayBase::operator!=(Scalar) */
-template<typename ExpressionType>
-inline const EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::not_equal_to)
-Cwise<ExpressionType>::operator!=(Scalar s) const
-{
-  return EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::not_equal_to)(_expression(),
-            typename ExpressionType::ConstantReturnType(_expression().rows(), _expression().cols(), s));
-}
-
-// scalar addition
-
-/** \deprecated ArrayBase::operator+(Scalar) */
-template<typename ExpressionType>
-inline const typename Cwise<ExpressionType>::ScalarAddReturnType
-Cwise<ExpressionType>::operator+(const Scalar& scalar) const
-{
-  return typename Cwise<ExpressionType>::ScalarAddReturnType(m_matrix, internal::scalar_add_op<Scalar>(scalar));
-}
-
-/** \deprecated ArrayBase::operator+=(Scalar) */
-template<typename ExpressionType>
-inline ExpressionType& Cwise<ExpressionType>::operator+=(const Scalar& scalar)
-{
-  return m_matrix.const_cast_derived() = *this + scalar;
-}
-
-/** \deprecated ArrayBase::operator-(Scalar) */
-template<typename ExpressionType>
-inline const typename Cwise<ExpressionType>::ScalarAddReturnType
-Cwise<ExpressionType>::operator-(const Scalar& scalar) const
-{
-  return *this + (-scalar);
-}
-
-/** \deprecated ArrayBase::operator-=(Scalar) */
-template<typename ExpressionType>
-inline ExpressionType& Cwise<ExpressionType>::operator-=(const Scalar& scalar)
-{
-  return m_matrix.const_cast_derived() = *this - scalar;
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN_ARRAY_CWISE_OPERATORS_H
diff --git a/Eigen/src/Eigen2Support/Geometry/AlignedBox.h b/Eigen/src/Eigen2Support/Geometry/AlignedBox.h
deleted file mode 100644
index 2e4309dd9..000000000
--- a/Eigen/src/Eigen2Support/Geometry/AlignedBox.h
+++ /dev/null
@@ -1,159 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// no include guard, we'll include this twice from All.h from Eigen2Support, and it's internal anyway
-
-namespace Eigen { 
-
-/** \geometry_module \ingroup Geometry_Module
-  * \nonstableyet
-  *
-  * \class AlignedBox
-  *
-  * \brief An axis aligned box
-  *
-  * \param _Scalar the type of the scalar coefficients
-  * \param _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic.
-  *
-  * This class represents an axis aligned box as a pair of the minimal and maximal corners.
-  */
-template <typename _Scalar, int _AmbientDim>
-class AlignedBox
-{
-public:
-EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim==Dynamic ? Dynamic : _AmbientDim+1)
-  enum { AmbientDimAtCompileTime = _AmbientDim };
-  typedef _Scalar Scalar;
-  typedef typename NumTraits<Scalar>::Real RealScalar;
-  typedef Matrix<Scalar,AmbientDimAtCompileTime,1> VectorType;
-
-  /** Default constructor initializing a null box. */
-  inline AlignedBox()
-  { if (AmbientDimAtCompileTime!=Dynamic) setNull(); }
-
-  /** Constructs a null box with \a _dim the dimension of the ambient space. */
-  inline explicit AlignedBox(int _dim) : m_min(_dim), m_max(_dim)
-  { setNull(); }
-
-  /** Constructs a box with extremities \a _min and \a _max. */
-  inline AlignedBox(const VectorType& _min, const VectorType& _max) : m_min(_min), m_max(_max) {}
-
-  /** Constructs a box containing a single point \a p. */
-  inline explicit AlignedBox(const VectorType& p) : m_min(p), m_max(p) {}
-
-  ~AlignedBox() {}
-
-  /** \returns the dimension in which the box holds */
-  inline int dim() const { return AmbientDimAtCompileTime==Dynamic ? m_min.size()-1 : AmbientDimAtCompileTime; }
-
-  /** \returns true if the box is null, i.e, empty. */
-  inline bool isNull() const { return (m_min.cwise() > m_max).any(); }
-
-  /** Makes \c *this a null/empty box. */
-  inline void setNull()
-  {
-    m_min.setConstant( (std::numeric_limits<Scalar>::max)());
-    m_max.setConstant(-(std::numeric_limits<Scalar>::max)());
-  }
-
-  /** \returns the minimal corner */
-  inline const VectorType& (min)() const { return m_min; }
-  /** \returns a non const reference to the minimal corner */
-  inline VectorType& (min)() { return m_min; }
-  /** \returns the maximal corner */
-  inline const VectorType& (max)() const { return m_max; }
-  /** \returns a non const reference to the maximal corner */
-  inline VectorType& (max)() { return m_max; }
-
-  /** \returns true if the point \a p is inside the box \c *this. */
-  inline bool contains(const VectorType& p) const
-  { return (m_min.cwise()<=p).all() && (p.cwise()<=m_max).all(); }
-
-  /** \returns true if the box \a b is entirely inside the box \c *this. */
-  inline bool contains(const AlignedBox& b) const
-  { return (m_min.cwise()<=(b.min)()).all() && ((b.max)().cwise()<=m_max).all(); }
-
-  /** Extends \c *this such that it contains the point \a p and returns a reference to \c *this. */
-  inline AlignedBox& extend(const VectorType& p)
-  { m_min = (m_min.cwise().min)(p); m_max = (m_max.cwise().max)(p); return *this; }
-
-  /** Extends \c *this such that it contains the box \a b and returns a reference to \c *this. */
-  inline AlignedBox& extend(const AlignedBox& b)
-  { m_min = (m_min.cwise().min)(b.m_min); m_max = (m_max.cwise().max)(b.m_max); return *this; }
-
-  /** Clamps \c *this by the box \a b and returns a reference to \c *this. */
-  inline AlignedBox& clamp(const AlignedBox& b)
-  { m_min = (m_min.cwise().max)(b.m_min); m_max = (m_max.cwise().min)(b.m_max); return *this; }
-
-  /** Translate \c *this by the vector \a t and returns a reference to \c *this. */
-  inline AlignedBox& translate(const VectorType& t)
-  { m_min += t; m_max += t; return *this; }
-
-  /** \returns the squared distance between the point \a p and the box \c *this,
-    * and zero if \a p is inside the box.
-    * \sa exteriorDistance()
-    */
-  inline Scalar squaredExteriorDistance(const VectorType& p) const;
-
-  /** \returns the distance between the point \a p and the box \c *this,
-    * and zero if \a p is inside the box.
-    * \sa squaredExteriorDistance()
-    */
-  inline Scalar exteriorDistance(const VectorType& p) const
-  { return ei_sqrt(squaredExteriorDistance(p)); }
-
-  /** \returns \c *this with scalar type casted to \a NewScalarType
-    *
-    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
-    * then this function smartly returns a const reference to \c *this.
-    */
-  template<typename NewScalarType>
-  inline typename internal::cast_return_type<AlignedBox,
-           AlignedBox<NewScalarType,AmbientDimAtCompileTime> >::type cast() const
-  {
-    return typename internal::cast_return_type<AlignedBox,
-                    AlignedBox<NewScalarType,AmbientDimAtCompileTime> >::type(*this);
-  }
-
-  /** Copy constructor with scalar type conversion */
-  template<typename OtherScalarType>
-  inline explicit AlignedBox(const AlignedBox<OtherScalarType,AmbientDimAtCompileTime>& other)
-  {
-    m_min = (other.min)().template cast<Scalar>();
-    m_max = (other.max)().template cast<Scalar>();
-  }
-
-  /** \returns \c true if \c *this is approximately equal to \a other, within the precision
-    * determined by \a prec.
-    *
-    * \sa MatrixBase::isApprox() */
-  bool isApprox(const AlignedBox& other, typename NumTraits<Scalar>::Real prec = precision<Scalar>()) const
-  { return m_min.isApprox(other.m_min, prec) && m_max.isApprox(other.m_max, prec); }
-
-protected:
-
-  VectorType m_min, m_max;
-};
-
-template<typename Scalar,int AmbiantDim>
-inline Scalar AlignedBox<Scalar,AmbiantDim>::squaredExteriorDistance(const VectorType& p) const
-{
-  Scalar dist2(0);
-  Scalar aux;
-  for (int k=0; k<dim(); ++k)
-  {
-    if ((aux = (p[k]-m_min[k]))<Scalar(0))
-      dist2 += aux*aux;
-    else if ( (aux = (m_max[k]-p[k]))<Scalar(0))
-      dist2 += aux*aux;
-  }
-  return dist2;
-}
-
-} // end namespace Eigen
diff --git a/Eigen/src/Eigen2Support/Geometry/All.h b/Eigen/src/Eigen2Support/Geometry/All.h
deleted file mode 100644
index e0b00fccc..000000000
--- a/Eigen/src/Eigen2Support/Geometry/All.h
+++ /dev/null
@@ -1,115 +0,0 @@
-#ifndef EIGEN2_GEOMETRY_MODULE_H
-#define EIGEN2_GEOMETRY_MODULE_H
-
-#include <limits>
-
-#ifndef M_PI
-#define M_PI 3.14159265358979323846
-#endif
-
-#if EIGEN2_SUPPORT_STAGE < STAGE20_RESOLVE_API_CONFLICTS
-#include "RotationBase.h"
-#include "Rotation2D.h"
-#include "Quaternion.h"
-#include "AngleAxis.h"
-#include "Transform.h"
-#include "Translation.h"
-#include "Scaling.h"
-#include "AlignedBox.h"
-#include "Hyperplane.h"
-#include "ParametrizedLine.h"
-#endif
-
-
-#define RotationBase eigen2_RotationBase
-#define Rotation2D eigen2_Rotation2D
-#define Rotation2Df eigen2_Rotation2Df
-#define Rotation2Dd eigen2_Rotation2Dd
-
-#define Quaternion  eigen2_Quaternion
-#define Quaternionf eigen2_Quaternionf
-#define Quaterniond eigen2_Quaterniond
-
-#define AngleAxis eigen2_AngleAxis
-#define AngleAxisf eigen2_AngleAxisf
-#define AngleAxisd eigen2_AngleAxisd
-
-#define Transform   eigen2_Transform
-#define Transform2f eigen2_Transform2f
-#define Transform2d eigen2_Transform2d
-#define Transform3f eigen2_Transform3f
-#define Transform3d eigen2_Transform3d
-
-#define Translation eigen2_Translation
-#define Translation2f eigen2_Translation2f
-#define Translation2d eigen2_Translation2d
-#define Translation3f eigen2_Translation3f
-#define Translation3d eigen2_Translation3d
-
-#define Scaling eigen2_Scaling
-#define Scaling2f eigen2_Scaling2f
-#define Scaling2d eigen2_Scaling2d
-#define Scaling3f eigen2_Scaling3f
-#define Scaling3d eigen2_Scaling3d
-
-#define AlignedBox eigen2_AlignedBox
-
-#define Hyperplane eigen2_Hyperplane
-#define ParametrizedLine eigen2_ParametrizedLine
-
-#define ei_toRotationMatrix eigen2_ei_toRotationMatrix
-#define ei_quaternion_assign_impl eigen2_ei_quaternion_assign_impl
-#define ei_transform_product_impl eigen2_ei_transform_product_impl
-
-#include "RotationBase.h"
-#include "Rotation2D.h"
-#include "Quaternion.h"
-#include "AngleAxis.h"
-#include "Transform.h"
-#include "Translation.h"
-#include "Scaling.h"
-#include "AlignedBox.h"
-#include "Hyperplane.h"
-#include "ParametrizedLine.h"
-
-#undef ei_toRotationMatrix
-#undef ei_quaternion_assign_impl
-#undef ei_transform_product_impl
-
-#undef RotationBase
-#undef Rotation2D
-#undef Rotation2Df
-#undef Rotation2Dd
-
-#undef Quaternion
-#undef Quaternionf
-#undef Quaterniond
-
-#undef AngleAxis
-#undef AngleAxisf
-#undef AngleAxisd
-
-#undef Transform
-#undef Transform2f
-#undef Transform2d
-#undef Transform3f
-#undef Transform3d
-
-#undef Translation
-#undef Translation2f
-#undef Translation2d
-#undef Translation3f
-#undef Translation3d
-
-#undef Scaling
-#undef Scaling2f
-#undef Scaling2d
-#undef Scaling3f
-#undef Scaling3d
-
-#undef AlignedBox
-
-#undef Hyperplane
-#undef ParametrizedLine
-
-#endif // EIGEN2_GEOMETRY_MODULE_H
diff --git a/Eigen/src/Eigen2Support/Geometry/AngleAxis.h b/Eigen/src/Eigen2Support/Geometry/AngleAxis.h
deleted file mode 100644
index af598a403..000000000
--- a/Eigen/src/Eigen2Support/Geometry/AngleAxis.h
+++ /dev/null
@@ -1,214 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// no include guard, we'll include this twice from All.h from Eigen2Support, and it's internal anyway
-
-namespace Eigen { 
-
-/** \geometry_module \ingroup Geometry_Module
-  *
-  * \class AngleAxis
-  *
-  * \brief Represents a 3D rotation as a rotation angle around an arbitrary 3D axis
-  *
-  * \param _Scalar the scalar type, i.e., the type of the coefficients.
-  *
-  * The following two typedefs are provided for convenience:
-  * \li \c AngleAxisf for \c float
-  * \li \c AngleAxisd for \c double
-  *
-  * \addexample AngleAxisForEuler \label How to define a rotation from Euler-angles
-  *
-  * Combined with MatrixBase::Unit{X,Y,Z}, AngleAxis can be used to easily
-  * mimic Euler-angles. Here is an example:
-  * \include AngleAxis_mimic_euler.cpp
-  * Output: \verbinclude AngleAxis_mimic_euler.out
-  *
-  * \note This class is not aimed to be used to store a rotation transformation,
-  * but rather to make easier the creation of other rotation (Quaternion, rotation Matrix)
-  * and transformation objects.
-  *
-  * \sa class Quaternion, class Transform, MatrixBase::UnitX()
-  */
-
-template<typename _Scalar> struct ei_traits<AngleAxis<_Scalar> >
-{
-  typedef _Scalar Scalar;
-};
-
-template<typename _Scalar>
-class AngleAxis : public RotationBase<AngleAxis<_Scalar>,3>
-{
-  typedef RotationBase<AngleAxis<_Scalar>,3> Base;
-
-public:
-
-  using Base::operator*;
-
-  enum { Dim = 3 };
-  /** the scalar type of the coefficients */
-  typedef _Scalar Scalar;
-  typedef Matrix<Scalar,3,3> Matrix3;
-  typedef Matrix<Scalar,3,1> Vector3;
-  typedef Quaternion<Scalar> QuaternionType;
-
-protected:
-
-  Vector3 m_axis;
-  Scalar m_angle;
-
-public:
-
-  /** Default constructor without initialization. */
-  AngleAxis() {}
-  /** Constructs and initialize the angle-axis rotation from an \a angle in radian
-    * and an \a axis which must be normalized. */
-  template<typename Derived>
-  inline AngleAxis(Scalar angle, const MatrixBase<Derived>& axis) : m_axis(axis), m_angle(angle) {}
-  /** Constructs and initialize the angle-axis rotation from a quaternion \a q. */
-  inline AngleAxis(const QuaternionType& q) { *this = q; }
-  /** Constructs and initialize the angle-axis rotation from a 3x3 rotation matrix. */
-  template<typename Derived>
-  inline explicit AngleAxis(const MatrixBase<Derived>& m) { *this = m; }
-
-  Scalar angle() const { return m_angle; }
-  Scalar& angle() { return m_angle; }
-
-  const Vector3& axis() const { return m_axis; }
-  Vector3& axis() { return m_axis; }
-
-  /** Concatenates two rotations */
-  inline QuaternionType operator* (const AngleAxis& other) const
-  { return QuaternionType(*this) * QuaternionType(other); }
-
-  /** Concatenates two rotations */
-  inline QuaternionType operator* (const QuaternionType& other) const
-  { return QuaternionType(*this) * other; }
-
-  /** Concatenates two rotations */
-  friend inline QuaternionType operator* (const QuaternionType& a, const AngleAxis& b)
-  { return a * QuaternionType(b); }
-
-  /** Concatenates two rotations */
-  inline Matrix3 operator* (const Matrix3& other) const
-  { return toRotationMatrix() * other; }
-
-  /** Concatenates two rotations */
-  inline friend Matrix3 operator* (const Matrix3& a, const AngleAxis& b)
-  { return a * b.toRotationMatrix(); }
-
-  /** Applies rotation to vector */
-  inline Vector3 operator* (const Vector3& other) const
-  { return toRotationMatrix() * other; }
-
-  /** \returns the inverse rotation, i.e., an angle-axis with opposite rotation angle */
-  AngleAxis inverse() const
-  { return AngleAxis(-m_angle, m_axis); }
-
-  AngleAxis& operator=(const QuaternionType& q);
-  template<typename Derived>
-  AngleAxis& operator=(const MatrixBase<Derived>& m);
-
-  template<typename Derived>
-  AngleAxis& fromRotationMatrix(const MatrixBase<Derived>& m);
-  Matrix3 toRotationMatrix(void) const;
-
-  /** \returns \c *this with scalar type casted to \a NewScalarType
-    *
-    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
-    * then this function smartly returns a const reference to \c *this.
-    */
-  template<typename NewScalarType>
-  inline typename internal::cast_return_type<AngleAxis,AngleAxis<NewScalarType> >::type cast() const
-  { return typename internal::cast_return_type<AngleAxis,AngleAxis<NewScalarType> >::type(*this); }
-
-  /** Copy constructor with scalar type conversion */
-  template<typename OtherScalarType>
-  inline explicit AngleAxis(const AngleAxis<OtherScalarType>& other)
-  {
-    m_axis = other.axis().template cast<Scalar>();
-    m_angle = Scalar(other.angle());
-  }
-
-  /** \returns \c true if \c *this is approximately equal to \a other, within the precision
-    * determined by \a prec.
-    *
-    * \sa MatrixBase::isApprox() */
-  bool isApprox(const AngleAxis& other, typename NumTraits<Scalar>::Real prec = precision<Scalar>()) const
-  { return m_axis.isApprox(other.m_axis, prec) && ei_isApprox(m_angle,other.m_angle, prec); }
-};
-
-/** \ingroup Geometry_Module
-  * single precision angle-axis type */
-typedef AngleAxis<float> AngleAxisf;
-/** \ingroup Geometry_Module
-  * double precision angle-axis type */
-typedef AngleAxis<double> AngleAxisd;
-
-/** Set \c *this from a quaternion.
-  * The axis is normalized.
-  */
-template<typename Scalar>
-AngleAxis<Scalar>& AngleAxis<Scalar>::operator=(const QuaternionType& q)
-{
-  Scalar n2 = q.vec().squaredNorm();
-  if (n2 < precision<Scalar>()*precision<Scalar>())
-  {
-    m_angle = 0;
-    m_axis << 1, 0, 0;
-  }
-  else
-  {
-    m_angle = 2*std::acos(q.w());
-    m_axis = q.vec() / ei_sqrt(n2);
-  }
-  return *this;
-}
-
-/** Set \c *this from a 3x3 rotation matrix \a mat.
-  */
-template<typename Scalar>
-template<typename Derived>
-AngleAxis<Scalar>& AngleAxis<Scalar>::operator=(const MatrixBase<Derived>& mat)
-{
-  // Since a direct conversion would not be really faster,
-  // let's use the robust Quaternion implementation:
-  return *this = QuaternionType(mat);
-}
-
-/** Constructs and \returns an equivalent 3x3 rotation matrix.
-  */
-template<typename Scalar>
-typename AngleAxis<Scalar>::Matrix3
-AngleAxis<Scalar>::toRotationMatrix(void) const
-{
-  Matrix3 res;
-  Vector3 sin_axis  = ei_sin(m_angle) * m_axis;
-  Scalar c = ei_cos(m_angle);
-  Vector3 cos1_axis = (Scalar(1)-c) * m_axis;
-
-  Scalar tmp;
-  tmp = cos1_axis.x() * m_axis.y();
-  res.coeffRef(0,1) = tmp - sin_axis.z();
-  res.coeffRef(1,0) = tmp + sin_axis.z();
-
-  tmp = cos1_axis.x() * m_axis.z();
-  res.coeffRef(0,2) = tmp + sin_axis.y();
-  res.coeffRef(2,0) = tmp - sin_axis.y();
-
-  tmp = cos1_axis.y() * m_axis.z();
-  res.coeffRef(1,2) = tmp - sin_axis.x();
-  res.coeffRef(2,1) = tmp + sin_axis.x();
-
-  res.diagonal() = (cos1_axis.cwise() * m_axis).cwise() + c;
-
-  return res;
-}
-
-} // end namespace Eigen
diff --git a/Eigen/src/Eigen2Support/Geometry/CMakeLists.txt b/Eigen/src/Eigen2Support/Geometry/CMakeLists.txt
deleted file mode 100644
index c347a8f26..000000000
--- a/Eigen/src/Eigen2Support/Geometry/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_Eigen2Support_Geometry_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_Eigen2Support_Geometry_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Eigen2Support/Geometry
-  )
diff --git a/Eigen/src/Eigen2Support/Geometry/Hyperplane.h b/Eigen/src/Eigen2Support/Geometry/Hyperplane.h
deleted file mode 100644
index b95bf00ec..000000000
--- a/Eigen/src/Eigen2Support/Geometry/Hyperplane.h
+++ /dev/null
@@ -1,254 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// no include guard, we'll include this twice from All.h from Eigen2Support, and it's internal anyway
-
-namespace Eigen { 
-
-/** \geometry_module \ingroup Geometry_Module
-  *
-  * \class Hyperplane
-  *
-  * \brief A hyperplane
-  *
-  * A hyperplane is an affine subspace of dimension n-1 in a space of dimension n.
-  * For example, a hyperplane in a plane is a line; a hyperplane in 3-space is a plane.
-  *
-  * \param _Scalar the scalar type, i.e., the type of the coefficients
-  * \param _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic.
-  *             Notice that the dimension of the hyperplane is _AmbientDim-1.
-  *
-  * This class represents an hyperplane as the zero set of the implicit equation
-  * \f$ n \cdot x + d = 0 \f$ where \f$ n \f$ is a unit normal vector of the plane (linear part)
-  * and \f$ d \f$ is the distance (offset) to the origin.
-  */
-template <typename _Scalar, int _AmbientDim>
-class Hyperplane
-{
-public:
-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim==Dynamic ? Dynamic : _AmbientDim+1)
-  enum { AmbientDimAtCompileTime = _AmbientDim };
-  typedef _Scalar Scalar;
-  typedef typename NumTraits<Scalar>::Real RealScalar;
-  typedef Matrix<Scalar,AmbientDimAtCompileTime,1> VectorType;
-  typedef Matrix<Scalar,int(AmbientDimAtCompileTime)==Dynamic
-                        ? Dynamic
-                        : int(AmbientDimAtCompileTime)+1,1> Coefficients;
-  typedef Block<Coefficients,AmbientDimAtCompileTime,1> NormalReturnType;
-
-  /** Default constructor without initialization */
-  inline Hyperplane() {}
-
-  /** Constructs a dynamic-size hyperplane with \a _dim the dimension
-    * of the ambient space */
-  inline explicit Hyperplane(int _dim) : m_coeffs(_dim+1) {}
-
-  /** Construct a plane from its normal \a n and a point \a e onto the plane.
-    * \warning the vector normal is assumed to be normalized.
-    */
-  inline Hyperplane(const VectorType& n, const VectorType& e)
-    : m_coeffs(n.size()+1)
-  {
-    normal() = n;
-    offset() = -e.eigen2_dot(n);
-  }
-
-  /** Constructs a plane from its normal \a n and distance to the origin \a d
-    * such that the algebraic equation of the plane is \f$ n \cdot x + d = 0 \f$.
-    * \warning the vector normal is assumed to be normalized.
-    */
-  inline Hyperplane(const VectorType& n, Scalar d)
-    : m_coeffs(n.size()+1)
-  {
-    normal() = n;
-    offset() = d;
-  }
-
-  /** Constructs a hyperplane passing through the two points. If the dimension of the ambient space
-    * is greater than 2, then there isn't uniqueness, so an arbitrary choice is made.
-    */
-  static inline Hyperplane Through(const VectorType& p0, const VectorType& p1)
-  {
-    Hyperplane result(p0.size());
-    result.normal() = (p1 - p0).unitOrthogonal();
-    result.offset() = -result.normal().eigen2_dot(p0);
-    return result;
-  }
-
-  /** Constructs a hyperplane passing through the three points. The dimension of the ambient space
-    * is required to be exactly 3.
-    */
-  static inline Hyperplane Through(const VectorType& p0, const VectorType& p1, const VectorType& p2)
-  {
-    EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(VectorType, 3)
-    Hyperplane result(p0.size());
-    result.normal() = (p2 - p0).cross(p1 - p0).normalized();
-    result.offset() = -result.normal().eigen2_dot(p0);
-    return result;
-  }
-
-  /** Constructs a hyperplane passing through the parametrized line \a parametrized.
-    * If the dimension of the ambient space is greater than 2, then there isn't uniqueness,
-    * so an arbitrary choice is made.
-    */
-  // FIXME to be consitent with the rest this could be implemented as a static Through function ??
-  explicit Hyperplane(const ParametrizedLine<Scalar, AmbientDimAtCompileTime>& parametrized)
-  {
-    normal() = parametrized.direction().unitOrthogonal();
-    offset() = -normal().eigen2_dot(parametrized.origin());
-  }
-
-  ~Hyperplane() {}
-
-  /** \returns the dimension in which the plane holds */
-  inline int dim() const { return int(AmbientDimAtCompileTime)==Dynamic ? m_coeffs.size()-1 : int(AmbientDimAtCompileTime); }
-
-  /** normalizes \c *this */
-  void normalize(void)
-  {
-    m_coeffs /= normal().norm();
-  }
-
-  /** \returns the signed distance between the plane \c *this and a point \a p.
-    * \sa absDistance()
-    */
-  inline Scalar signedDistance(const VectorType& p) const { return p.eigen2_dot(normal()) + offset(); }
-
-  /** \returns the absolute distance between the plane \c *this and a point \a p.
-    * \sa signedDistance()
-    */
-  inline Scalar absDistance(const VectorType& p) const { return ei_abs(signedDistance(p)); }
-
-  /** \returns the projection of a point \a p onto the plane \c *this.
-    */
-  inline VectorType projection(const VectorType& p) const { return p - signedDistance(p) * normal(); }
-
-  /** \returns a constant reference to the unit normal vector of the plane, which corresponds
-    * to the linear part of the implicit equation.
-    */
-  inline const NormalReturnType normal() const { return NormalReturnType(*const_cast<Coefficients*>(&m_coeffs),0,0,dim(),1); }
-
-  /** \returns a non-constant reference to the unit normal vector of the plane, which corresponds
-    * to the linear part of the implicit equation.
-    */
-  inline NormalReturnType normal() { return NormalReturnType(m_coeffs,0,0,dim(),1); }
-
-  /** \returns the distance to the origin, which is also the "constant term" of the implicit equation
-    * \warning the vector normal is assumed to be normalized.
-    */
-  inline const Scalar& offset() const { return m_coeffs.coeff(dim()); }
-
-  /** \returns a non-constant reference to the distance to the origin, which is also the constant part
-    * of the implicit equation */
-  inline Scalar& offset() { return m_coeffs(dim()); }
-
-  /** \returns a constant reference to the coefficients c_i of the plane equation:
-    * \f$ c_0*x_0 + ... + c_{d-1}*x_{d-1} + c_d = 0 \f$
-    */
-  inline const Coefficients& coeffs() const { return m_coeffs; }
-
-  /** \returns a non-constant reference to the coefficients c_i of the plane equation:
-    * \f$ c_0*x_0 + ... + c_{d-1}*x_{d-1} + c_d = 0 \f$
-    */
-  inline Coefficients& coeffs() { return m_coeffs; }
-
-  /** \returns the intersection of *this with \a other.
-    *
-    * \warning The ambient space must be a plane, i.e. have dimension 2, so that \c *this and \a other are lines.
-    *
-    * \note If \a other is approximately parallel to *this, this method will return any point on *this.
-    */
-  VectorType intersection(const Hyperplane& other)
-  {
-    EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(VectorType, 2)
-    Scalar det = coeffs().coeff(0) * other.coeffs().coeff(1) - coeffs().coeff(1) * other.coeffs().coeff(0);
-    // since the line equations ax+by=c are normalized with a^2+b^2=1, the following tests
-    // whether the two lines are approximately parallel.
-    if(ei_isMuchSmallerThan(det, Scalar(1)))
-    {   // special case where the two lines are approximately parallel. Pick any point on the first line.
-        if(ei_abs(coeffs().coeff(1))>ei_abs(coeffs().coeff(0)))
-            return VectorType(coeffs().coeff(1), -coeffs().coeff(2)/coeffs().coeff(1)-coeffs().coeff(0));
-        else
-            return VectorType(-coeffs().coeff(2)/coeffs().coeff(0)-coeffs().coeff(1), coeffs().coeff(0));
-    }
-    else
-    {   // general case
-        Scalar invdet = Scalar(1) / det;
-        return VectorType(invdet*(coeffs().coeff(1)*other.coeffs().coeff(2)-other.coeffs().coeff(1)*coeffs().coeff(2)),
-                          invdet*(other.coeffs().coeff(0)*coeffs().coeff(2)-coeffs().coeff(0)*other.coeffs().coeff(2)));
-    }
-  }
-
-  /** Applies the transformation matrix \a mat to \c *this and returns a reference to \c *this.
-    *
-    * \param mat the Dim x Dim transformation matrix
-    * \param traits specifies whether the matrix \a mat represents an Isometry
-    *               or a more generic Affine transformation. The default is Affine.
-    */
-  template<typename XprType>
-  inline Hyperplane& transform(const MatrixBase<XprType>& mat, TransformTraits traits = Affine)
-  {
-    if (traits==Affine)
-      normal() = mat.inverse().transpose() * normal();
-    else if (traits==Isometry)
-      normal() = mat * normal();
-    else
-    {
-      ei_assert("invalid traits value in Hyperplane::transform()");
-    }
-    return *this;
-  }
-
-  /** Applies the transformation \a t to \c *this and returns a reference to \c *this.
-    *
-    * \param t the transformation of dimension Dim
-    * \param traits specifies whether the transformation \a t represents an Isometry
-    *               or a more generic Affine transformation. The default is Affine.
-    *               Other kind of transformations are not supported.
-    */
-  inline Hyperplane& transform(const Transform<Scalar,AmbientDimAtCompileTime>& t,
-                                TransformTraits traits = Affine)
-  {
-    transform(t.linear(), traits);
-    offset() -= t.translation().eigen2_dot(normal());
-    return *this;
-  }
-
-  /** \returns \c *this with scalar type casted to \a NewScalarType
-    *
-    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
-    * then this function smartly returns a const reference to \c *this.
-    */
-  template<typename NewScalarType>
-  inline typename internal::cast_return_type<Hyperplane,
-           Hyperplane<NewScalarType,AmbientDimAtCompileTime> >::type cast() const
-  {
-    return typename internal::cast_return_type<Hyperplane,
-                    Hyperplane<NewScalarType,AmbientDimAtCompileTime> >::type(*this);
-  }
-
-  /** Copy constructor with scalar type conversion */
-  template<typename OtherScalarType>
-  inline explicit Hyperplane(const Hyperplane<OtherScalarType,AmbientDimAtCompileTime>& other)
-  { m_coeffs = other.coeffs().template cast<Scalar>(); }
-
-  /** \returns \c true if \c *this is approximately equal to \a other, within the precision
-    * determined by \a prec.
-    *
-    * \sa MatrixBase::isApprox() */
-  bool isApprox(const Hyperplane& other, typename NumTraits<Scalar>::Real prec = precision<Scalar>()) const
-  { return m_coeffs.isApprox(other.m_coeffs, prec); }
-
-protected:
-
-  Coefficients m_coeffs;
-};
-
-} // end namespace Eigen
diff --git a/Eigen/src/Eigen2Support/Geometry/ParametrizedLine.h b/Eigen/src/Eigen2Support/Geometry/ParametrizedLine.h
deleted file mode 100644
index 9b57b7e0b..000000000
--- a/Eigen/src/Eigen2Support/Geometry/ParametrizedLine.h
+++ /dev/null
@@ -1,141 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// no include guard, we'll include this twice from All.h from Eigen2Support, and it's internal anyway
-
-namespace Eigen { 
-
-/** \geometry_module \ingroup Geometry_Module
-  *
-  * \class ParametrizedLine
-  *
-  * \brief A parametrized line
-  *
-  * A parametrized line is defined by an origin point \f$ \mathbf{o} \f$ and a unit
-  * direction vector \f$ \mathbf{d} \f$ such that the line corresponds to
-  * the set \f$ l(t) = \mathbf{o} + t \mathbf{d} \f$, \f$ l \in \mathbf{R} \f$.
-  *
-  * \param _Scalar the scalar type, i.e., the type of the coefficients
-  * \param _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic.
-  */
-template <typename _Scalar, int _AmbientDim>
-class ParametrizedLine
-{
-public:
-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
-  enum { AmbientDimAtCompileTime = _AmbientDim };
-  typedef _Scalar Scalar;
-  typedef typename NumTraits<Scalar>::Real RealScalar;
-  typedef Matrix<Scalar,AmbientDimAtCompileTime,1> VectorType;
-
-  /** Default constructor without initialization */
-  inline ParametrizedLine() {}
-
-  /** Constructs a dynamic-size line with \a _dim the dimension
-    * of the ambient space */
-  inline explicit ParametrizedLine(int _dim) : m_origin(_dim), m_direction(_dim) {}
-
-  /** Initializes a parametrized line of direction \a direction and origin \a origin.
-    * \warning the vector direction is assumed to be normalized.
-    */
-  ParametrizedLine(const VectorType& origin, const VectorType& direction)
-    : m_origin(origin), m_direction(direction) {}
-
-  explicit ParametrizedLine(const Hyperplane<_Scalar, _AmbientDim>& hyperplane);
-
-  /** Constructs a parametrized line going from \a p0 to \a p1. */
-  static inline ParametrizedLine Through(const VectorType& p0, const VectorType& p1)
-  { return ParametrizedLine(p0, (p1-p0).normalized()); }
-
-  ~ParametrizedLine() {}
-
-  /** \returns the dimension in which the line holds */
-  inline int dim() const { return m_direction.size(); }
-
-  const VectorType& origin() const { return m_origin; }
-  VectorType& origin() { return m_origin; }
-
-  const VectorType& direction() const { return m_direction; }
-  VectorType& direction() { return m_direction; }
-
-  /** \returns the squared distance of a point \a p to its projection onto the line \c *this.
-    * \sa distance()
-    */
-  RealScalar squaredDistance(const VectorType& p) const
-  {
-    VectorType diff = p-origin();
-    return (diff - diff.eigen2_dot(direction())* direction()).squaredNorm();
-  }
-  /** \returns the distance of a point \a p to its projection onto the line \c *this.
-    * \sa squaredDistance()
-    */
-  RealScalar distance(const VectorType& p) const { return ei_sqrt(squaredDistance(p)); }
-
-  /** \returns the projection of a point \a p onto the line \c *this. */
-  VectorType projection(const VectorType& p) const
-  { return origin() + (p-origin()).eigen2_dot(direction()) * direction(); }
-
-  Scalar intersection(const Hyperplane<_Scalar, _AmbientDim>& hyperplane);
-
-  /** \returns \c *this with scalar type casted to \a NewScalarType
-    *
-    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
-    * then this function smartly returns a const reference to \c *this.
-    */
-  template<typename NewScalarType>
-  inline typename internal::cast_return_type<ParametrizedLine,
-           ParametrizedLine<NewScalarType,AmbientDimAtCompileTime> >::type cast() const
-  {
-    return typename internal::cast_return_type<ParametrizedLine,
-                    ParametrizedLine<NewScalarType,AmbientDimAtCompileTime> >::type(*this);
-  }
-
-  /** Copy constructor with scalar type conversion */
-  template<typename OtherScalarType>
-  inline explicit ParametrizedLine(const ParametrizedLine<OtherScalarType,AmbientDimAtCompileTime>& other)
-  {
-    m_origin = other.origin().template cast<Scalar>();
-    m_direction = other.direction().template cast<Scalar>();
-  }
-
-  /** \returns \c true if \c *this is approximately equal to \a other, within the precision
-    * determined by \a prec.
-    *
-    * \sa MatrixBase::isApprox() */
-  bool isApprox(const ParametrizedLine& other, typename NumTraits<Scalar>::Real prec = precision<Scalar>()) const
-  { return m_origin.isApprox(other.m_origin, prec) && m_direction.isApprox(other.m_direction, prec); }
-
-protected:
-
-  VectorType m_origin, m_direction;
-};
-
-/** Constructs a parametrized line from a 2D hyperplane
-  *
-  * \warning the ambient space must have dimension 2 such that the hyperplane actually describes a line
-  */
-template <typename _Scalar, int _AmbientDim>
-inline ParametrizedLine<_Scalar, _AmbientDim>::ParametrizedLine(const Hyperplane<_Scalar, _AmbientDim>& hyperplane)
-{
-  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(VectorType, 2)
-  direction() = hyperplane.normal().unitOrthogonal();
-  origin() = -hyperplane.normal()*hyperplane.offset();
-}
-
-/** \returns the parameter value of the intersection between \c *this and the given hyperplane
-  */
-template <typename _Scalar, int _AmbientDim>
-inline _Scalar ParametrizedLine<_Scalar, _AmbientDim>::intersection(const Hyperplane<_Scalar, _AmbientDim>& hyperplane)
-{
-  return -(hyperplane.offset()+origin().eigen2_dot(hyperplane.normal()))
-          /(direction().eigen2_dot(hyperplane.normal()));
-}
-
-} // end namespace Eigen
diff --git a/Eigen/src/Eigen2Support/Geometry/Quaternion.h b/Eigen/src/Eigen2Support/Geometry/Quaternion.h
deleted file mode 100644
index 4b6390cf1..000000000
--- a/Eigen/src/Eigen2Support/Geometry/Quaternion.h
+++ /dev/null
@@ -1,495 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// no include guard, we'll include this twice from All.h from Eigen2Support, and it's internal anyway
-
-namespace Eigen { 
-
-template<typename Other,
-         int OtherRows=Other::RowsAtCompileTime,
-         int OtherCols=Other::ColsAtCompileTime>
-struct ei_quaternion_assign_impl;
-
-/** \geometry_module \ingroup Geometry_Module
-  *
-  * \class Quaternion
-  *
-  * \brief The quaternion class used to represent 3D orientations and rotations
-  *
-  * \param _Scalar the scalar type, i.e., the type of the coefficients
-  *
-  * This class represents a quaternion \f$ w+xi+yj+zk \f$ that is a convenient representation of
-  * orientations and rotations of objects in three dimensions. Compared to other representations
-  * like Euler angles or 3x3 matrices, quatertions offer the following advantages:
-  * \li \b compact storage (4 scalars)
-  * \li \b efficient to compose (28 flops),
-  * \li \b stable spherical interpolation
-  *
-  * The following two typedefs are provided for convenience:
-  * \li \c Quaternionf for \c float
-  * \li \c Quaterniond for \c double
-  *
-  * \sa  class AngleAxis, class Transform
-  */
-
-template<typename _Scalar> struct ei_traits<Quaternion<_Scalar> >
-{
-  typedef _Scalar Scalar;
-};
-
-template<typename _Scalar>
-class Quaternion : public RotationBase<Quaternion<_Scalar>,3>
-{
-  typedef RotationBase<Quaternion<_Scalar>,3> Base;
-
-public:
-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,4)
-
-  using Base::operator*;
-
-  /** the scalar type of the coefficients */
-  typedef _Scalar Scalar;
-
-  /** the type of the Coefficients 4-vector */
-  typedef Matrix<Scalar, 4, 1> Coefficients;
-  /** the type of a 3D vector */
-  typedef Matrix<Scalar,3,1> Vector3;
-  /** the equivalent rotation matrix type */
-  typedef Matrix<Scalar,3,3> Matrix3;
-  /** the equivalent angle-axis type */
-  typedef AngleAxis<Scalar> AngleAxisType;
-
-  /** \returns the \c x coefficient */
-  inline Scalar x() const { return m_coeffs.coeff(0); }
-  /** \returns the \c y coefficient */
-  inline Scalar y() const { return m_coeffs.coeff(1); }
-  /** \returns the \c z coefficient */
-  inline Scalar z() const { return m_coeffs.coeff(2); }
-  /** \returns the \c w coefficient */
-  inline Scalar w() const { return m_coeffs.coeff(3); }
-
-  /** \returns a reference to the \c x coefficient */
-  inline Scalar& x() { return m_coeffs.coeffRef(0); }
-  /** \returns a reference to the \c y coefficient */
-  inline Scalar& y() { return m_coeffs.coeffRef(1); }
-  /** \returns a reference to the \c z coefficient */
-  inline Scalar& z() { return m_coeffs.coeffRef(2); }
-  /** \returns a reference to the \c w coefficient */
-  inline Scalar& w() { return m_coeffs.coeffRef(3); }
-
-  /** \returns a read-only vector expression of the imaginary part (x,y,z) */
-  inline const Block<const Coefficients,3,1> vec() const { return m_coeffs.template start<3>(); }
-
-  /** \returns a vector expression of the imaginary part (x,y,z) */
-  inline Block<Coefficients,3,1> vec() { return m_coeffs.template start<3>(); }
-
-  /** \returns a read-only vector expression of the coefficients (x,y,z,w) */
-  inline const Coefficients& coeffs() const { return m_coeffs; }
-
-  /** \returns a vector expression of the coefficients (x,y,z,w) */
-  inline Coefficients& coeffs() { return m_coeffs; }
-
-  /** Default constructor leaving the quaternion uninitialized. */
-  inline Quaternion() {}
-
-  /** Constructs and initializes the quaternion \f$ w+xi+yj+zk \f$ from
-    * its four coefficients \a w, \a x, \a y and \a z.
-    *
-    * \warning Note the order of the arguments: the real \a w coefficient first,
-    * while internally the coefficients are stored in the following order:
-    * [\c x, \c y, \c z, \c w]
-    */
-  inline Quaternion(Scalar w, Scalar x, Scalar y, Scalar z)
-  { m_coeffs << x, y, z, w; }
-
-  /** Copy constructor */
-  inline Quaternion(const Quaternion& other) { m_coeffs = other.m_coeffs; }
-
-  /** Constructs and initializes a quaternion from the angle-axis \a aa */
-  explicit inline Quaternion(const AngleAxisType& aa) { *this = aa; }
-
-  /** Constructs and initializes a quaternion from either:
-    *  - a rotation matrix expression,
-    *  - a 4D vector expression representing quaternion coefficients.
-    * \sa operator=(MatrixBase<Derived>)
-    */
-  template<typename Derived>
-  explicit inline Quaternion(const MatrixBase<Derived>& other) { *this = other; }
-
-  Quaternion& operator=(const Quaternion& other);
-  Quaternion& operator=(const AngleAxisType& aa);
-  template<typename Derived>
-  Quaternion& operator=(const MatrixBase<Derived>& m);
-
-  /** \returns a quaternion representing an identity rotation
-    * \sa MatrixBase::Identity()
-    */
-  static inline Quaternion Identity() { return Quaternion(1, 0, 0, 0); }
-
-  /** \sa Quaternion::Identity(), MatrixBase::setIdentity()
-    */
-  inline Quaternion& setIdentity() { m_coeffs << 0, 0, 0, 1; return *this; }
-
-  /** \returns the squared norm of the quaternion's coefficients
-    * \sa Quaternion::norm(), MatrixBase::squaredNorm()
-    */
-  inline Scalar squaredNorm() const { return m_coeffs.squaredNorm(); }
-
-  /** \returns the norm of the quaternion's coefficients
-    * \sa Quaternion::squaredNorm(), MatrixBase::norm()
-    */
-  inline Scalar norm() const { return m_coeffs.norm(); }
-
-  /** Normalizes the quaternion \c *this
-    * \sa normalized(), MatrixBase::normalize() */
-  inline void normalize() { m_coeffs.normalize(); }
-  /** \returns a normalized version of \c *this
-    * \sa normalize(), MatrixBase::normalized() */
-  inline Quaternion normalized() const { return Quaternion(m_coeffs.normalized()); }
-
-  /** \returns the dot product of \c *this and \a other
-    * Geometrically speaking, the dot product of two unit quaternions
-    * corresponds to the cosine of half the angle between the two rotations.
-    * \sa angularDistance()
-    */
-  inline Scalar eigen2_dot(const Quaternion& other) const { return m_coeffs.eigen2_dot(other.m_coeffs); }
-
-  inline Scalar angularDistance(const Quaternion& other) const;
-
-  Matrix3 toRotationMatrix(void) const;
-
-  template<typename Derived1, typename Derived2>
-  Quaternion& setFromTwoVectors(const MatrixBase<Derived1>& a, const MatrixBase<Derived2>& b);
-
-  inline Quaternion operator* (const Quaternion& q) const;
-  inline Quaternion& operator*= (const Quaternion& q);
-
-  Quaternion inverse(void) const;
-  Quaternion conjugate(void) const;
-
-  Quaternion slerp(Scalar t, const Quaternion& other) const;
-
-  template<typename Derived>
-  Vector3 operator* (const MatrixBase<Derived>& vec) const;
-
-  /** \returns \c *this with scalar type casted to \a NewScalarType
-    *
-    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
-    * then this function smartly returns a const reference to \c *this.
-    */
-  template<typename NewScalarType>
-  inline typename internal::cast_return_type<Quaternion,Quaternion<NewScalarType> >::type cast() const
-  { return typename internal::cast_return_type<Quaternion,Quaternion<NewScalarType> >::type(*this); }
-
-  /** Copy constructor with scalar type conversion */
-  template<typename OtherScalarType>
-  inline explicit Quaternion(const Quaternion<OtherScalarType>& other)
-  { m_coeffs = other.coeffs().template cast<Scalar>(); }
-
-  /** \returns \c true if \c *this is approximately equal to \a other, within the precision
-    * determined by \a prec.
-    *
-    * \sa MatrixBase::isApprox() */
-  bool isApprox(const Quaternion& other, typename NumTraits<Scalar>::Real prec = precision<Scalar>()) const
-  { return m_coeffs.isApprox(other.m_coeffs, prec); }
-
-protected:
-  Coefficients m_coeffs;
-};
-
-/** \ingroup Geometry_Module
-  * single precision quaternion type */
-typedef Quaternion<float> Quaternionf;
-/** \ingroup Geometry_Module
-  * double precision quaternion type */
-typedef Quaternion<double> Quaterniond;
-
-// Generic Quaternion * Quaternion product
-template<typename Scalar> inline Quaternion<Scalar>
-ei_quaternion_product(const Quaternion<Scalar>& a, const Quaternion<Scalar>& b)
-{
-  return Quaternion<Scalar>
-  (
-    a.w() * b.w() - a.x() * b.x() - a.y() * b.y() - a.z() * b.z(),
-    a.w() * b.x() + a.x() * b.w() + a.y() * b.z() - a.z() * b.y(),
-    a.w() * b.y() + a.y() * b.w() + a.z() * b.x() - a.x() * b.z(),
-    a.w() * b.z() + a.z() * b.w() + a.x() * b.y() - a.y() * b.x()
-  );
-}
-
-/** \returns the concatenation of two rotations as a quaternion-quaternion product */
-template <typename Scalar>
-inline Quaternion<Scalar> Quaternion<Scalar>::operator* (const Quaternion& other) const
-{
-  return ei_quaternion_product(*this,other);
-}
-
-/** \sa operator*(Quaternion) */
-template <typename Scalar>
-inline Quaternion<Scalar>& Quaternion<Scalar>::operator*= (const Quaternion& other)
-{
-  return (*this = *this * other);
-}
-
-/** Rotation of a vector by a quaternion.
-  * \remarks If the quaternion is used to rotate several points (>1)
-  * then it is much more efficient to first convert it to a 3x3 Matrix.
-  * Comparison of the operation cost for n transformations:
-  *   - Quaternion:    30n
-  *   - Via a Matrix3: 24 + 15n
-  */
-template <typename Scalar>
-template<typename Derived>
-inline typename Quaternion<Scalar>::Vector3
-Quaternion<Scalar>::operator* (const MatrixBase<Derived>& v) const
-{
-    // Note that this algorithm comes from the optimization by hand
-    // of the conversion to a Matrix followed by a Matrix/Vector product.
-    // It appears to be much faster than the common algorithm found
-    // in the litterature (30 versus 39 flops). It also requires two
-    // Vector3 as temporaries.
-    Vector3 uv;
-    uv = 2 * this->vec().cross(v);
-    return v + this->w() * uv + this->vec().cross(uv);
-}
-
-template<typename Scalar>
-inline Quaternion<Scalar>& Quaternion<Scalar>::operator=(const Quaternion& other)
-{
-  m_coeffs = other.m_coeffs;
-  return *this;
-}
-
-/** Set \c *this from an angle-axis \a aa and returns a reference to \c *this
-  */
-template<typename Scalar>
-inline Quaternion<Scalar>& Quaternion<Scalar>::operator=(const AngleAxisType& aa)
-{
-  Scalar ha = Scalar(0.5)*aa.angle(); // Scalar(0.5) to suppress precision loss warnings
-  this->w() = ei_cos(ha);
-  this->vec() = ei_sin(ha) * aa.axis();
-  return *this;
-}
-
-/** Set \c *this from the expression \a xpr:
-  *   - if \a xpr is a 4x1 vector, then \a xpr is assumed to be a quaternion
-  *   - if \a xpr is a 3x3 matrix, then \a xpr is assumed to be rotation matrix
-  *     and \a xpr is converted to a quaternion
-  */
-template<typename Scalar>
-template<typename Derived>
-inline Quaternion<Scalar>& Quaternion<Scalar>::operator=(const MatrixBase<Derived>& xpr)
-{
-  ei_quaternion_assign_impl<Derived>::run(*this, xpr.derived());
-  return *this;
-}
-
-/** Convert the quaternion to a 3x3 rotation matrix */
-template<typename Scalar>
-inline typename Quaternion<Scalar>::Matrix3
-Quaternion<Scalar>::toRotationMatrix(void) const
-{
-  // NOTE if inlined, then gcc 4.2 and 4.4 get rid of the temporary (not gcc 4.3 !!)
-  // if not inlined then the cost of the return by value is huge ~ +35%,
-  // however, not inlining this function is an order of magnitude slower, so
-  // it has to be inlined, and so the return by value is not an issue
-  Matrix3 res;
-
-  const Scalar tx  = Scalar(2)*this->x();
-  const Scalar ty  = Scalar(2)*this->y();
-  const Scalar tz  = Scalar(2)*this->z();
-  const Scalar twx = tx*this->w();
-  const Scalar twy = ty*this->w();
-  const Scalar twz = tz*this->w();
-  const Scalar txx = tx*this->x();
-  const Scalar txy = ty*this->x();
-  const Scalar txz = tz*this->x();
-  const Scalar tyy = ty*this->y();
-  const Scalar tyz = tz*this->y();
-  const Scalar tzz = tz*this->z();
-
-  res.coeffRef(0,0) = Scalar(1)-(tyy+tzz);
-  res.coeffRef(0,1) = txy-twz;
-  res.coeffRef(0,2) = txz+twy;
-  res.coeffRef(1,0) = txy+twz;
-  res.coeffRef(1,1) = Scalar(1)-(txx+tzz);
-  res.coeffRef(1,2) = tyz-twx;
-  res.coeffRef(2,0) = txz-twy;
-  res.coeffRef(2,1) = tyz+twx;
-  res.coeffRef(2,2) = Scalar(1)-(txx+tyy);
-
-  return res;
-}
-
-/** Sets *this to be a quaternion representing a rotation sending the vector \a a to the vector \a b.
-  *
-  * \returns a reference to *this.
-  *
-  * Note that the two input vectors do \b not have to be normalized.
-  */
-template<typename Scalar>
-template<typename Derived1, typename Derived2>
-inline Quaternion<Scalar>& Quaternion<Scalar>::setFromTwoVectors(const MatrixBase<Derived1>& a, const MatrixBase<Derived2>& b)
-{
-  Vector3 v0 = a.normalized();
-  Vector3 v1 = b.normalized();
-  Scalar c = v0.eigen2_dot(v1);
-
-  // if dot == 1, vectors are the same
-  if (ei_isApprox(c,Scalar(1)))
-  {
-    // set to identity
-    this->w() = 1; this->vec().setZero();
-    return *this;
-  }
-  // if dot == -1, vectors are opposites
-  if (ei_isApprox(c,Scalar(-1)))
-  {
-    this->vec() = v0.unitOrthogonal();
-    this->w() = 0;
-    return *this;
-  }
-
-  Vector3 axis = v0.cross(v1);
-  Scalar s = ei_sqrt((Scalar(1)+c)*Scalar(2));
-  Scalar invs = Scalar(1)/s;
-  this->vec() = axis * invs;
-  this->w() = s * Scalar(0.5);
-
-  return *this;
-}
-
-/** \returns the multiplicative inverse of \c *this
-  * Note that in most cases, i.e., if you simply want the opposite rotation,
-  * and/or the quaternion is normalized, then it is enough to use the conjugate.
-  *
-  * \sa Quaternion::conjugate()
-  */
-template <typename Scalar>
-inline Quaternion<Scalar> Quaternion<Scalar>::inverse() const
-{
-  // FIXME should this function be called multiplicativeInverse and conjugate() be called inverse() or opposite()  ??
-  Scalar n2 = this->squaredNorm();
-  if (n2 > 0)
-    return Quaternion(conjugate().coeffs() / n2);
-  else
-  {
-    // return an invalid result to flag the error
-    return Quaternion(Coefficients::Zero());
-  }
-}
-
-/** \returns the conjugate of the \c *this which is equal to the multiplicative inverse
-  * if the quaternion is normalized.
-  * The conjugate of a quaternion represents the opposite rotation.
-  *
-  * \sa Quaternion::inverse()
-  */
-template <typename Scalar>
-inline Quaternion<Scalar> Quaternion<Scalar>::conjugate() const
-{
-  return Quaternion(this->w(),-this->x(),-this->y(),-this->z());
-}
-
-/** \returns the angle (in radian) between two rotations
-  * \sa eigen2_dot()
-  */
-template <typename Scalar>
-inline Scalar Quaternion<Scalar>::angularDistance(const Quaternion& other) const
-{
-  double d = ei_abs(this->eigen2_dot(other));
-  if (d>=1.0)
-    return 0;
-  return Scalar(2) * std::acos(d);
-}
-
-/** \returns the spherical linear interpolation between the two quaternions
-  * \c *this and \a other at the parameter \a t
-  */
-template <typename Scalar>
-Quaternion<Scalar> Quaternion<Scalar>::slerp(Scalar t, const Quaternion& other) const
-{
-  static const Scalar one = Scalar(1) - machine_epsilon<Scalar>();
-  Scalar d = this->eigen2_dot(other);
-  Scalar absD = ei_abs(d);
-
-  Scalar scale0;
-  Scalar scale1;
-
-  if (absD>=one)
-  {
-    scale0 = Scalar(1) - t;
-    scale1 = t;
-  }
-  else
-  {
-    // theta is the angle between the 2 quaternions
-    Scalar theta = std::acos(absD);
-    Scalar sinTheta = ei_sin(theta);
-
-    scale0 = ei_sin( ( Scalar(1) - t ) * theta) / sinTheta;
-    scale1 = ei_sin( ( t * theta) ) / sinTheta;
-    if (d<0)
-      scale1 = -scale1;
-  }
-
-  return Quaternion<Scalar>(scale0 * coeffs() + scale1 * other.coeffs());
-}
-
-// set from a rotation matrix
-template<typename Other>
-struct ei_quaternion_assign_impl<Other,3,3>
-{
-  typedef typename Other::Scalar Scalar;
-  static inline void run(Quaternion<Scalar>& q, const Other& mat)
-  {
-    // This algorithm comes from  "Quaternion Calculus and Fast Animation",
-    // Ken Shoemake, 1987 SIGGRAPH course notes
-    Scalar t = mat.trace();
-    if (t > 0)
-    {
-      t = ei_sqrt(t + Scalar(1.0));
-      q.w() = Scalar(0.5)*t;
-      t = Scalar(0.5)/t;
-      q.x() = (mat.coeff(2,1) - mat.coeff(1,2)) * t;
-      q.y() = (mat.coeff(0,2) - mat.coeff(2,0)) * t;
-      q.z() = (mat.coeff(1,0) - mat.coeff(0,1)) * t;
-    }
-    else
-    {
-      int i = 0;
-      if (mat.coeff(1,1) > mat.coeff(0,0))
-        i = 1;
-      if (mat.coeff(2,2) > mat.coeff(i,i))
-        i = 2;
-      int j = (i+1)%3;
-      int k = (j+1)%3;
-
-      t = ei_sqrt(mat.coeff(i,i)-mat.coeff(j,j)-mat.coeff(k,k) + Scalar(1.0));
-      q.coeffs().coeffRef(i) = Scalar(0.5) * t;
-      t = Scalar(0.5)/t;
-      q.w() = (mat.coeff(k,j)-mat.coeff(j,k))*t;
-      q.coeffs().coeffRef(j) = (mat.coeff(j,i)+mat.coeff(i,j))*t;
-      q.coeffs().coeffRef(k) = (mat.coeff(k,i)+mat.coeff(i,k))*t;
-    }
-  }
-};
-
-// set from a vector of coefficients assumed to be a quaternion
-template<typename Other>
-struct ei_quaternion_assign_impl<Other,4,1>
-{
-  typedef typename Other::Scalar Scalar;
-  static inline void run(Quaternion<Scalar>& q, const Other& vec)
-  {
-    q.coeffs() = vec;
-  }
-};
-
-} // end namespace Eigen
diff --git a/Eigen/src/Eigen2Support/Geometry/Rotation2D.h b/Eigen/src/Eigen2Support/Geometry/Rotation2D.h
deleted file mode 100644
index 19b8582a1..000000000
--- a/Eigen/src/Eigen2Support/Geometry/Rotation2D.h
+++ /dev/null
@@ -1,145 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// no include guard, we'll include this twice from All.h from Eigen2Support, and it's internal anyway
-
-namespace Eigen { 
-
-/** \geometry_module \ingroup Geometry_Module
-  *
-  * \class Rotation2D
-  *
-  * \brief Represents a rotation/orientation in a 2 dimensional space.
-  *
-  * \param _Scalar the scalar type, i.e., the type of the coefficients
-  *
-  * This class is equivalent to a single scalar representing a counter clock wise rotation
-  * as a single angle in radian. It provides some additional features such as the automatic
-  * conversion from/to a 2x2 rotation matrix. Moreover this class aims to provide a similar
-  * interface to Quaternion in order to facilitate the writing of generic algorithms
-  * dealing with rotations.
-  *
-  * \sa class Quaternion, class Transform
-  */
-template<typename _Scalar> struct ei_traits<Rotation2D<_Scalar> >
-{
-  typedef _Scalar Scalar;
-};
-
-template<typename _Scalar>
-class Rotation2D : public RotationBase<Rotation2D<_Scalar>,2>
-{
-  typedef RotationBase<Rotation2D<_Scalar>,2> Base;
-
-public:
-
-  using Base::operator*;
-
-  enum { Dim = 2 };
-  /** the scalar type of the coefficients */
-  typedef _Scalar Scalar;
-  typedef Matrix<Scalar,2,1> Vector2;
-  typedef Matrix<Scalar,2,2> Matrix2;
-
-protected:
-
-  Scalar m_angle;
-
-public:
-
-  /** Construct a 2D counter clock wise rotation from the angle \a a in radian. */
-  inline Rotation2D(Scalar a) : m_angle(a) {}
-
-  /** \returns the rotation angle */
-  inline Scalar angle() const { return m_angle; }
-
-  /** \returns a read-write reference to the rotation angle */
-  inline Scalar& angle() { return m_angle; }
-
-  /** \returns the inverse rotation */
-  inline Rotation2D inverse() const { return -m_angle; }
-
-  /** Concatenates two rotations */
-  inline Rotation2D operator*(const Rotation2D& other) const
-  { return m_angle + other.m_angle; }
-
-  /** Concatenates two rotations */
-  inline Rotation2D& operator*=(const Rotation2D& other)
-  { return m_angle += other.m_angle; return *this; }
-
-  /** Applies the rotation to a 2D vector */
-  Vector2 operator* (const Vector2& vec) const
-  { return toRotationMatrix() * vec; }
-
-  template<typename Derived>
-  Rotation2D& fromRotationMatrix(const MatrixBase<Derived>& m);
-  Matrix2 toRotationMatrix(void) const;
-
-  /** \returns the spherical interpolation between \c *this and \a other using
-    * parameter \a t. It is in fact equivalent to a linear interpolation.
-    */
-  inline Rotation2D slerp(Scalar t, const Rotation2D& other) const
-  { return m_angle * (1-t) + other.angle() * t; }
-
-  /** \returns \c *this with scalar type casted to \a NewScalarType
-    *
-    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
-    * then this function smartly returns a const reference to \c *this.
-    */
-  template<typename NewScalarType>
-  inline typename internal::cast_return_type<Rotation2D,Rotation2D<NewScalarType> >::type cast() const
-  { return typename internal::cast_return_type<Rotation2D,Rotation2D<NewScalarType> >::type(*this); }
-
-  /** Copy constructor with scalar type conversion */
-  template<typename OtherScalarType>
-  inline explicit Rotation2D(const Rotation2D<OtherScalarType>& other)
-  {
-    m_angle = Scalar(other.angle());
-  }
-
-  /** \returns \c true if \c *this is approximately equal to \a other, within the precision
-    * determined by \a prec.
-    *
-    * \sa MatrixBase::isApprox() */
-  bool isApprox(const Rotation2D& other, typename NumTraits<Scalar>::Real prec = precision<Scalar>()) const
-  { return ei_isApprox(m_angle,other.m_angle, prec); }
-};
-
-/** \ingroup Geometry_Module
-  * single precision 2D rotation type */
-typedef Rotation2D<float> Rotation2Df;
-/** \ingroup Geometry_Module
-  * double precision 2D rotation type */
-typedef Rotation2D<double> Rotation2Dd;
-
-/** Set \c *this from a 2x2 rotation matrix \a mat.
-  * In other words, this function extract the rotation angle
-  * from the rotation matrix.
-  */
-template<typename Scalar>
-template<typename Derived>
-Rotation2D<Scalar>& Rotation2D<Scalar>::fromRotationMatrix(const MatrixBase<Derived>& mat)
-{
-  EIGEN_STATIC_ASSERT(Derived::RowsAtCompileTime==2 && Derived::ColsAtCompileTime==2,YOU_MADE_A_PROGRAMMING_MISTAKE)
-  m_angle = ei_atan2(mat.coeff(1,0), mat.coeff(0,0));
-  return *this;
-}
-
-/** Constructs and \returns an equivalent 2x2 rotation matrix.
-  */
-template<typename Scalar>
-typename Rotation2D<Scalar>::Matrix2
-Rotation2D<Scalar>::toRotationMatrix(void) const
-{
-  Scalar sinA = ei_sin(m_angle);
-  Scalar cosA = ei_cos(m_angle);
-  return (Matrix2() << cosA, -sinA, sinA, cosA).finished();
-}
-
-} // end namespace Eigen
diff --git a/Eigen/src/Eigen2Support/Geometry/RotationBase.h b/Eigen/src/Eigen2Support/Geometry/RotationBase.h
deleted file mode 100644
index b1c8f38da..000000000
--- a/Eigen/src/Eigen2Support/Geometry/RotationBase.h
+++ /dev/null
@@ -1,123 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// no include guard, we'll include this twice from All.h from Eigen2Support, and it's internal anyway
-
-namespace Eigen { 
-
-// this file aims to contains the various representations of rotation/orientation
-// in 2D and 3D space excepted Matrix and Quaternion.
-
-/** \class RotationBase
-  *
-  * \brief Common base class for compact rotation representations
-  *
-  * \param Derived is the derived type, i.e., a rotation type
-  * \param _Dim the dimension of the space
-  */
-template<typename Derived, int _Dim>
-class RotationBase
-{
-  public:
-    enum { Dim = _Dim };
-    /** the scalar type of the coefficients */
-    typedef typename ei_traits<Derived>::Scalar Scalar;
-    
-    /** corresponding linear transformation matrix type */
-    typedef Matrix<Scalar,Dim,Dim> RotationMatrixType;
-
-    inline const Derived& derived() const { return *static_cast<const Derived*>(this); }
-    inline Derived& derived() { return *static_cast<Derived*>(this); }
-
-    /** \returns an equivalent rotation matrix */
-    inline RotationMatrixType toRotationMatrix() const { return derived().toRotationMatrix(); }
-
-    /** \returns the inverse rotation */
-    inline Derived inverse() const { return derived().inverse(); }
-
-    /** \returns the concatenation of the rotation \c *this with a translation \a t */
-    inline Transform<Scalar,Dim> operator*(const Translation<Scalar,Dim>& t) const
-    { return toRotationMatrix() * t; }
-
-    /** \returns the concatenation of the rotation \c *this with a scaling \a s */
-    inline RotationMatrixType operator*(const Scaling<Scalar,Dim>& s) const
-    { return toRotationMatrix() * s; }
-
-    /** \returns the concatenation of the rotation \c *this with an affine transformation \a t */
-    inline Transform<Scalar,Dim> operator*(const Transform<Scalar,Dim>& t) const
-    { return toRotationMatrix() * t; }
-};
-
-/** \geometry_module
-  *
-  * Constructs a Dim x Dim rotation matrix from the rotation \a r
-  */
-template<typename _Scalar, int _Rows, int _Cols, int _Storage, int _MaxRows, int _MaxCols>
-template<typename OtherDerived>
-Matrix<_Scalar, _Rows, _Cols, _Storage, _MaxRows, _MaxCols>
-::Matrix(const RotationBase<OtherDerived,ColsAtCompileTime>& r)
-{
-  EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(Matrix,int(OtherDerived::Dim),int(OtherDerived::Dim))
-  *this = r.toRotationMatrix();
-}
-
-/** \geometry_module
-  *
-  * Set a Dim x Dim rotation matrix from the rotation \a r
-  */
-template<typename _Scalar, int _Rows, int _Cols, int _Storage, int _MaxRows, int _MaxCols>
-template<typename OtherDerived>
-Matrix<_Scalar, _Rows, _Cols, _Storage, _MaxRows, _MaxCols>&
-Matrix<_Scalar, _Rows, _Cols, _Storage, _MaxRows, _MaxCols>
-::operator=(const RotationBase<OtherDerived,ColsAtCompileTime>& r)
-{
-  EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(Matrix,int(OtherDerived::Dim),int(OtherDerived::Dim))
-  return *this = r.toRotationMatrix();
-}
-
-/** \internal
-  *
-  * Helper function to return an arbitrary rotation object to a rotation matrix.
-  *
-  * \param Scalar the numeric type of the matrix coefficients
-  * \param Dim the dimension of the current space
-  *
-  * It returns a Dim x Dim fixed size matrix.
-  *
-  * Default specializations are provided for:
-  *   - any scalar type (2D),
-  *   - any matrix expression,
-  *   - any type based on RotationBase (e.g., Quaternion, AngleAxis, Rotation2D)
-  *
-  * Currently ei_toRotationMatrix is only used by Transform.
-  *
-  * \sa class Transform, class Rotation2D, class Quaternion, class AngleAxis
-  */
-template<typename Scalar, int Dim>
-static inline Matrix<Scalar,2,2> ei_toRotationMatrix(const Scalar& s)
-{
-  EIGEN_STATIC_ASSERT(Dim==2,YOU_MADE_A_PROGRAMMING_MISTAKE)
-  return Rotation2D<Scalar>(s).toRotationMatrix();
-}
-
-template<typename Scalar, int Dim, typename OtherDerived>
-static inline Matrix<Scalar,Dim,Dim> ei_toRotationMatrix(const RotationBase<OtherDerived,Dim>& r)
-{
-  return r.toRotationMatrix();
-}
-
-template<typename Scalar, int Dim, typename OtherDerived>
-static inline const MatrixBase<OtherDerived>& ei_toRotationMatrix(const MatrixBase<OtherDerived>& mat)
-{
-  EIGEN_STATIC_ASSERT(OtherDerived::RowsAtCompileTime==Dim && OtherDerived::ColsAtCompileTime==Dim,
-    YOU_MADE_A_PROGRAMMING_MISTAKE)
-  return mat;
-}
-
-} // end namespace Eigen
diff --git a/Eigen/src/Eigen2Support/Geometry/Scaling.h b/Eigen/src/Eigen2Support/Geometry/Scaling.h
deleted file mode 100644
index b8fa6cd3f..000000000
--- a/Eigen/src/Eigen2Support/Geometry/Scaling.h
+++ /dev/null
@@ -1,167 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// no include guard, we'll include this twice from All.h from Eigen2Support, and it's internal anyway
-
-namespace Eigen { 
-
-/** \geometry_module \ingroup Geometry_Module
-  *
-  * \class Scaling
-  *
-  * \brief Represents a possibly non uniform scaling transformation
-  *
-  * \param _Scalar the scalar type, i.e., the type of the coefficients.
-  * \param _Dim the  dimension of the space, can be a compile time value or Dynamic
-  *
-  * \note This class is not aimed to be used to store a scaling transformation,
-  * but rather to make easier the constructions and updates of Transform objects.
-  *
-  * \sa class Translation, class Transform
-  */
-template<typename _Scalar, int _Dim>
-class Scaling
-{
-public:
-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_Dim)
-  /** dimension of the space */
-  enum { Dim = _Dim };
-  /** the scalar type of the coefficients */
-  typedef _Scalar Scalar;
-  /** corresponding vector type */
-  typedef Matrix<Scalar,Dim,1> VectorType;
-  /** corresponding linear transformation matrix type */
-  typedef Matrix<Scalar,Dim,Dim> LinearMatrixType;
-  /** corresponding translation type */
-  typedef Translation<Scalar,Dim> TranslationType;
-  /** corresponding affine transformation type */
-  typedef Transform<Scalar,Dim> TransformType;
-
-protected:
-
-  VectorType m_coeffs;
-
-public:
-
-  /** Default constructor without initialization. */
-  Scaling() {}
-  /** Constructs and initialize a uniform scaling transformation */
-  explicit inline Scaling(const Scalar& s) { m_coeffs.setConstant(s); }
-  /** 2D only */
-  inline Scaling(const Scalar& sx, const Scalar& sy)
-  {
-    ei_assert(Dim==2);
-    m_coeffs.x() = sx;
-    m_coeffs.y() = sy;
-  }
-  /** 3D only */
-  inline Scaling(const Scalar& sx, const Scalar& sy, const Scalar& sz)
-  {
-    ei_assert(Dim==3);
-    m_coeffs.x() = sx;
-    m_coeffs.y() = sy;
-    m_coeffs.z() = sz;
-  }
-  /** Constructs and initialize the scaling transformation from a vector of scaling coefficients */
-  explicit inline Scaling(const VectorType& coeffs) : m_coeffs(coeffs) {}
-
-  const VectorType& coeffs() const { return m_coeffs; }
-  VectorType& coeffs() { return m_coeffs; }
-
-  /** Concatenates two scaling */
-  inline Scaling operator* (const Scaling& other) const
-  { return Scaling(coeffs().cwise() * other.coeffs()); }
-
-  /** Concatenates a scaling and a translation */
-  inline TransformType operator* (const TranslationType& t) const;
-
-  /** Concatenates a scaling and an affine transformation */
-  inline TransformType operator* (const TransformType& t) const;
-
-  /** Concatenates a scaling and a linear transformation matrix */
-  // TODO returns an expression
-  inline LinearMatrixType operator* (const LinearMatrixType& other) const
-  { return coeffs().asDiagonal() * other; }
-
-  /** Concatenates a linear transformation matrix and a scaling */
-  // TODO returns an expression
-  friend inline LinearMatrixType operator* (const LinearMatrixType& other, const Scaling& s)
-  { return other * s.coeffs().asDiagonal(); }
-
-  template<typename Derived>
-  inline LinearMatrixType operator*(const RotationBase<Derived,Dim>& r) const
-  { return *this * r.toRotationMatrix(); }
-
-  /** Applies scaling to vector */
-  inline VectorType operator* (const VectorType& other) const
-  { return coeffs().asDiagonal() * other; }
-
-  /** \returns the inverse scaling */
-  inline Scaling inverse() const
-  { return Scaling(coeffs().cwise().inverse()); }
-
-  inline Scaling& operator=(const Scaling& other)
-  {
-    m_coeffs = other.m_coeffs;
-    return *this;
-  }
-
-  /** \returns \c *this with scalar type casted to \a NewScalarType
-    *
-    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
-    * then this function smartly returns a const reference to \c *this.
-    */
-  template<typename NewScalarType>
-  inline typename internal::cast_return_type<Scaling,Scaling<NewScalarType,Dim> >::type cast() const
-  { return typename internal::cast_return_type<Scaling,Scaling<NewScalarType,Dim> >::type(*this); }
-
-  /** Copy constructor with scalar type conversion */
-  template<typename OtherScalarType>
-  inline explicit Scaling(const Scaling<OtherScalarType,Dim>& other)
-  { m_coeffs = other.coeffs().template cast<Scalar>(); }
-
-  /** \returns \c true if \c *this is approximately equal to \a other, within the precision
-    * determined by \a prec.
-    *
-    * \sa MatrixBase::isApprox() */
-  bool isApprox(const Scaling& other, typename NumTraits<Scalar>::Real prec = precision<Scalar>()) const
-  { return m_coeffs.isApprox(other.m_coeffs, prec); }
-
-};
-
-/** \addtogroup Geometry_Module */
-//@{
-typedef Scaling<float, 2> Scaling2f;
-typedef Scaling<double,2> Scaling2d;
-typedef Scaling<float, 3> Scaling3f;
-typedef Scaling<double,3> Scaling3d;
-//@}
-
-template<typename Scalar, int Dim>
-inline typename Scaling<Scalar,Dim>::TransformType
-Scaling<Scalar,Dim>::operator* (const TranslationType& t) const
-{
-  TransformType res;
-  res.matrix().setZero();
-  res.linear().diagonal() = coeffs();
-  res.translation() = m_coeffs.cwise() * t.vector();
-  res(Dim,Dim) = Scalar(1);
-  return res;
-}
-
-template<typename Scalar, int Dim>
-inline typename Scaling<Scalar,Dim>::TransformType
-Scaling<Scalar,Dim>::operator* (const TransformType& t) const
-{
-  TransformType res = t;
-  res.prescale(m_coeffs);
-  return res;
-}
-
-} // end namespace Eigen
diff --git a/Eigen/src/Eigen2Support/Geometry/Transform.h b/Eigen/src/Eigen2Support/Geometry/Transform.h
deleted file mode 100644
index fab60b251..000000000
--- a/Eigen/src/Eigen2Support/Geometry/Transform.h
+++ /dev/null
@@ -1,786 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// no include guard, we'll include this twice from All.h from Eigen2Support, and it's internal anyway
-
-namespace Eigen { 
-
-// Note that we have to pass Dim and HDim because it is not allowed to use a template
-// parameter to define a template specialization. To be more precise, in the following
-// specializations, it is not allowed to use Dim+1 instead of HDim.
-template< typename Other,
-          int Dim,
-          int HDim,
-          int OtherRows=Other::RowsAtCompileTime,
-          int OtherCols=Other::ColsAtCompileTime>
-struct ei_transform_product_impl;
-
-/** \geometry_module \ingroup Geometry_Module
-  *
-  * \class Transform
-  *
-  * \brief Represents an homogeneous transformation in a N dimensional space
-  *
-  * \param _Scalar the scalar type, i.e., the type of the coefficients
-  * \param _Dim the dimension of the space
-  *
-  * The homography is internally represented and stored as a (Dim+1)^2 matrix which
-  * is available through the matrix() method.
-  *
-  * Conversion methods from/to Qt's QMatrix and QTransform are available if the
-  * preprocessor token EIGEN_QT_SUPPORT is defined.
-  *
-  * \sa class Matrix, class Quaternion
-  */
-template<typename _Scalar, int _Dim>
-class Transform
-{
-public:
-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_Dim==Dynamic ? Dynamic : (_Dim+1)*(_Dim+1))
-  enum {
-    Dim = _Dim,     ///< space dimension in which the transformation holds
-    HDim = _Dim+1   ///< size of a respective homogeneous vector
-  };
-  /** the scalar type of the coefficients */
-  typedef _Scalar Scalar;
-  /** type of the matrix used to represent the transformation */
-  typedef Matrix<Scalar,HDim,HDim> MatrixType;
-  /** type of the matrix used to represent the linear part of the transformation */
-  typedef Matrix<Scalar,Dim,Dim> LinearMatrixType;
-  /** type of read/write reference to the linear part of the transformation */
-  typedef Block<MatrixType,Dim,Dim> LinearPart;
-  /** type of read/write reference to the linear part of the transformation */
-  typedef const Block<const MatrixType,Dim,Dim> ConstLinearPart;
-  /** type of a vector */
-  typedef Matrix<Scalar,Dim,1> VectorType;
-  /** type of a read/write reference to the translation part of the rotation */
-  typedef Block<MatrixType,Dim,1> TranslationPart;
-  /** type of a read/write reference to the translation part of the rotation */
-  typedef const Block<const MatrixType,Dim,1> ConstTranslationPart;
-  /** corresponding translation type */
-  typedef Translation<Scalar,Dim> TranslationType;
-  /** corresponding scaling transformation type */
-  typedef Scaling<Scalar,Dim> ScalingType;
-
-protected:
-
-  MatrixType m_matrix;
-
-public:
-
-  /** Default constructor without initialization of the coefficients. */
-  inline Transform() { }
-
-  inline Transform(const Transform& other)
-  {
-    m_matrix = other.m_matrix;
-  }
-
-  inline explicit Transform(const TranslationType& t) { *this = t; }
-  inline explicit Transform(const ScalingType& s) { *this = s; }
-  template<typename Derived>
-  inline explicit Transform(const RotationBase<Derived, Dim>& r) { *this = r; }
-
-  inline Transform& operator=(const Transform& other)
-  { m_matrix = other.m_matrix; return *this; }
-
-  template<typename OtherDerived, bool BigMatrix> // MSVC 2005 will commit suicide if BigMatrix has a default value
-  struct construct_from_matrix
-  {
-    static inline void run(Transform *transform, const MatrixBase<OtherDerived>& other)
-    {
-      transform->matrix() = other;
-    }
-  };
-
-  template<typename OtherDerived> struct construct_from_matrix<OtherDerived, true>
-  {
-    static inline void run(Transform *transform, const MatrixBase<OtherDerived>& other)
-    {
-      transform->linear() = other;
-      transform->translation().setZero();
-      transform->matrix()(Dim,Dim) = Scalar(1);
-      transform->matrix().template block<1,Dim>(Dim,0).setZero();
-    }
-  };
-
-  /** Constructs and initializes a transformation from a Dim^2 or a (Dim+1)^2 matrix. */
-  template<typename OtherDerived>
-  inline explicit Transform(const MatrixBase<OtherDerived>& other)
-  {
-    construct_from_matrix<OtherDerived, int(OtherDerived::RowsAtCompileTime) == Dim>::run(this, other);
-  }
-
-  /** Set \c *this from a (Dim+1)^2 matrix. */
-  template<typename OtherDerived>
-  inline Transform& operator=(const MatrixBase<OtherDerived>& other)
-  { m_matrix = other; return *this; }
-
-  #ifdef EIGEN_QT_SUPPORT
-  inline Transform(const QMatrix& other);
-  inline Transform& operator=(const QMatrix& other);
-  inline QMatrix toQMatrix(void) const;
-  inline Transform(const QTransform& other);
-  inline Transform& operator=(const QTransform& other);
-  inline QTransform toQTransform(void) const;
-  #endif
-
-  /** shortcut for m_matrix(row,col);
-    * \sa MatrixBase::operaror(int,int) const */
-  inline Scalar operator() (int row, int col) const { return m_matrix(row,col); }
-  /** shortcut for m_matrix(row,col);
-    * \sa MatrixBase::operaror(int,int) */
-  inline Scalar& operator() (int row, int col) { return m_matrix(row,col); }
-
-  /** \returns a read-only expression of the transformation matrix */
-  inline const MatrixType& matrix() const { return m_matrix; }
-  /** \returns a writable expression of the transformation matrix */
-  inline MatrixType& matrix() { return m_matrix; }
-
-  /** \returns a read-only expression of the linear (linear) part of the transformation */
-  inline ConstLinearPart linear() const { return m_matrix.template block<Dim,Dim>(0,0); }
-  /** \returns a writable expression of the linear (linear) part of the transformation */
-  inline LinearPart linear() { return m_matrix.template block<Dim,Dim>(0,0); }
-
-  /** \returns a read-only expression of the translation vector of the transformation */
-  inline ConstTranslationPart translation() const { return m_matrix.template block<Dim,1>(0,Dim); }
-  /** \returns a writable expression of the translation vector of the transformation */
-  inline TranslationPart translation() { return m_matrix.template block<Dim,1>(0,Dim); }
-
-  /** \returns an expression of the product between the transform \c *this and a matrix expression \a other
-  *
-  * The right hand side \a other might be either:
-  * \li a vector of size Dim,
-  * \li an homogeneous vector of size Dim+1,
-  * \li a transformation matrix of size Dim+1 x Dim+1.
-  */
-  // note: this function is defined here because some compilers cannot find the respective declaration
-  template<typename OtherDerived>
-  inline const typename ei_transform_product_impl<OtherDerived,_Dim,_Dim+1>::ResultType
-  operator * (const MatrixBase<OtherDerived> &other) const
-  { return ei_transform_product_impl<OtherDerived,Dim,HDim>::run(*this,other.derived()); }
-
-  /** \returns the product expression of a transformation matrix \a a times a transform \a b
-    * The transformation matrix \a a must have a Dim+1 x Dim+1 sizes. */
-  template<typename OtherDerived>
-  friend inline const typename ProductReturnType<OtherDerived,MatrixType>::Type
-  operator * (const MatrixBase<OtherDerived> &a, const Transform &b)
-  { return a.derived() * b.matrix(); }
-
-  /** Contatenates two transformations */
-  inline const Transform
-  operator * (const Transform& other) const
-  { return Transform(m_matrix * other.matrix()); }
-
-  /** \sa MatrixBase::setIdentity() */
-  void setIdentity() { m_matrix.setIdentity(); }
-  static const typename MatrixType::IdentityReturnType Identity()
-  {
-    return MatrixType::Identity();
-  }
-
-  template<typename OtherDerived>
-  inline Transform& scale(const MatrixBase<OtherDerived> &other);
-
-  template<typename OtherDerived>
-  inline Transform& prescale(const MatrixBase<OtherDerived> &other);
-
-  inline Transform& scale(Scalar s);
-  inline Transform& prescale(Scalar s);
-
-  template<typename OtherDerived>
-  inline Transform& translate(const MatrixBase<OtherDerived> &other);
-
-  template<typename OtherDerived>
-  inline Transform& pretranslate(const MatrixBase<OtherDerived> &other);
-
-  template<typename RotationType>
-  inline Transform& rotate(const RotationType& rotation);
-
-  template<typename RotationType>
-  inline Transform& prerotate(const RotationType& rotation);
-
-  Transform& shear(Scalar sx, Scalar sy);
-  Transform& preshear(Scalar sx, Scalar sy);
-
-  inline Transform& operator=(const TranslationType& t);
-  inline Transform& operator*=(const TranslationType& t) { return translate(t.vector()); }
-  inline Transform operator*(const TranslationType& t) const;
-
-  inline Transform& operator=(const ScalingType& t);
-  inline Transform& operator*=(const ScalingType& s) { return scale(s.coeffs()); }
-  inline Transform operator*(const ScalingType& s) const;
-  friend inline Transform operator*(const LinearMatrixType& mat, const Transform& t)
-  {
-    Transform res = t;
-    res.matrix().row(Dim) = t.matrix().row(Dim);
-    res.matrix().template block<Dim,HDim>(0,0) = (mat * t.matrix().template block<Dim,HDim>(0,0)).lazy();
-    return res;
-  }
-
-  template<typename Derived>
-  inline Transform& operator=(const RotationBase<Derived,Dim>& r);
-  template<typename Derived>
-  inline Transform& operator*=(const RotationBase<Derived,Dim>& r) { return rotate(r.toRotationMatrix()); }
-  template<typename Derived>
-  inline Transform operator*(const RotationBase<Derived,Dim>& r) const;
-
-  LinearMatrixType rotation() const;
-  template<typename RotationMatrixType, typename ScalingMatrixType>
-  void computeRotationScaling(RotationMatrixType *rotation, ScalingMatrixType *scaling) const;
-  template<typename ScalingMatrixType, typename RotationMatrixType>
-  void computeScalingRotation(ScalingMatrixType *scaling, RotationMatrixType *rotation) const;
-
-  template<typename PositionDerived, typename OrientationType, typename ScaleDerived>
-  Transform& fromPositionOrientationScale(const MatrixBase<PositionDerived> &position,
-    const OrientationType& orientation, const MatrixBase<ScaleDerived> &scale);
-
-  inline const MatrixType inverse(TransformTraits traits = Affine) const;
-
-  /** \returns a const pointer to the column major internal matrix */
-  const Scalar* data() const { return m_matrix.data(); }
-  /** \returns a non-const pointer to the column major internal matrix */
-  Scalar* data() { return m_matrix.data(); }
-
-  /** \returns \c *this with scalar type casted to \a NewScalarType
-    *
-    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
-    * then this function smartly returns a const reference to \c *this.
-    */
-  template<typename NewScalarType>
-  inline typename internal::cast_return_type<Transform,Transform<NewScalarType,Dim> >::type cast() const
-  { return typename internal::cast_return_type<Transform,Transform<NewScalarType,Dim> >::type(*this); }
-
-  /** Copy constructor with scalar type conversion */
-  template<typename OtherScalarType>
-  inline explicit Transform(const Transform<OtherScalarType,Dim>& other)
-  { m_matrix = other.matrix().template cast<Scalar>(); }
-
-  /** \returns \c true if \c *this is approximately equal to \a other, within the precision
-    * determined by \a prec.
-    *
-    * \sa MatrixBase::isApprox() */
-  bool isApprox(const Transform& other, typename NumTraits<Scalar>::Real prec = precision<Scalar>()) const
-  { return m_matrix.isApprox(other.m_matrix, prec); }
-
-  #ifdef EIGEN_TRANSFORM_PLUGIN
-  #include EIGEN_TRANSFORM_PLUGIN
-  #endif
-
-protected:
-
-};
-
-/** \ingroup Geometry_Module */
-typedef Transform<float,2> Transform2f;
-/** \ingroup Geometry_Module */
-typedef Transform<float,3> Transform3f;
-/** \ingroup Geometry_Module */
-typedef Transform<double,2> Transform2d;
-/** \ingroup Geometry_Module */
-typedef Transform<double,3> Transform3d;
-
-/**************************
-*** Optional QT support ***
-**************************/
-
-#ifdef EIGEN_QT_SUPPORT
-/** Initialises \c *this from a QMatrix assuming the dimension is 2.
-  *
-  * This function is available only if the token EIGEN_QT_SUPPORT is defined.
-  */
-template<typename Scalar, int Dim>
-Transform<Scalar,Dim>::Transform(const QMatrix& other)
-{
-  *this = other;
-}
-
-/** Set \c *this from a QMatrix assuming the dimension is 2.
-  *
-  * This function is available only if the token EIGEN_QT_SUPPORT is defined.
-  */
-template<typename Scalar, int Dim>
-Transform<Scalar,Dim>& Transform<Scalar,Dim>::operator=(const QMatrix& other)
-{
-  EIGEN_STATIC_ASSERT(Dim==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
-  m_matrix << other.m11(), other.m21(), other.dx(),
-              other.m12(), other.m22(), other.dy(),
-              0, 0, 1;
-   return *this;
-}
-
-/** \returns a QMatrix from \c *this assuming the dimension is 2.
-  *
-  * \warning this convertion might loss data if \c *this is not affine
-  *
-  * This function is available only if the token EIGEN_QT_SUPPORT is defined.
-  */
-template<typename Scalar, int Dim>
-QMatrix Transform<Scalar,Dim>::toQMatrix(void) const
-{
-  EIGEN_STATIC_ASSERT(Dim==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
-  return QMatrix(m_matrix.coeff(0,0), m_matrix.coeff(1,0),
-                 m_matrix.coeff(0,1), m_matrix.coeff(1,1),
-                 m_matrix.coeff(0,2), m_matrix.coeff(1,2));
-}
-
-/** Initialises \c *this from a QTransform assuming the dimension is 2.
-  *
-  * This function is available only if the token EIGEN_QT_SUPPORT is defined.
-  */
-template<typename Scalar, int Dim>
-Transform<Scalar,Dim>::Transform(const QTransform& other)
-{
-  *this = other;
-}
-
-/** Set \c *this from a QTransform assuming the dimension is 2.
-  *
-  * This function is available only if the token EIGEN_QT_SUPPORT is defined.
-  */
-template<typename Scalar, int Dim>
-Transform<Scalar,Dim>& Transform<Scalar,Dim>::operator=(const QTransform& other)
-{
-  EIGEN_STATIC_ASSERT(Dim==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
-  m_matrix << other.m11(), other.m21(), other.dx(),
-              other.m12(), other.m22(), other.dy(),
-              other.m13(), other.m23(), other.m33();
-   return *this;
-}
-
-/** \returns a QTransform from \c *this assuming the dimension is 2.
-  *
-  * This function is available only if the token EIGEN_QT_SUPPORT is defined.
-  */
-template<typename Scalar, int Dim>
-QTransform Transform<Scalar,Dim>::toQTransform(void) const
-{
-  EIGEN_STATIC_ASSERT(Dim==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
-  return QTransform(m_matrix.coeff(0,0), m_matrix.coeff(1,0), m_matrix.coeff(2,0),
-                    m_matrix.coeff(0,1), m_matrix.coeff(1,1), m_matrix.coeff(2,1),
-                    m_matrix.coeff(0,2), m_matrix.coeff(1,2), m_matrix.coeff(2,2));
-}
-#endif
-
-/*********************
-*** Procedural API ***
-*********************/
-
-/** Applies on the right the non uniform scale transformation represented
-  * by the vector \a other to \c *this and returns a reference to \c *this.
-  * \sa prescale()
-  */
-template<typename Scalar, int Dim>
-template<typename OtherDerived>
-Transform<Scalar,Dim>&
-Transform<Scalar,Dim>::scale(const MatrixBase<OtherDerived> &other)
-{
-  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,int(Dim))
-  linear() = (linear() * other.asDiagonal()).lazy();
-  return *this;
-}
-
-/** Applies on the right a uniform scale of a factor \a c to \c *this
-  * and returns a reference to \c *this.
-  * \sa prescale(Scalar)
-  */
-template<typename Scalar, int Dim>
-inline Transform<Scalar,Dim>& Transform<Scalar,Dim>::scale(Scalar s)
-{
-  linear() *= s;
-  return *this;
-}
-
-/** Applies on the left the non uniform scale transformation represented
-  * by the vector \a other to \c *this and returns a reference to \c *this.
-  * \sa scale()
-  */
-template<typename Scalar, int Dim>
-template<typename OtherDerived>
-Transform<Scalar,Dim>&
-Transform<Scalar,Dim>::prescale(const MatrixBase<OtherDerived> &other)
-{
-  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,int(Dim))
-  m_matrix.template block<Dim,HDim>(0,0) = (other.asDiagonal() * m_matrix.template block<Dim,HDim>(0,0)).lazy();
-  return *this;
-}
-
-/** Applies on the left a uniform scale of a factor \a c to \c *this
-  * and returns a reference to \c *this.
-  * \sa scale(Scalar)
-  */
-template<typename Scalar, int Dim>
-inline Transform<Scalar,Dim>& Transform<Scalar,Dim>::prescale(Scalar s)
-{
-  m_matrix.template corner<Dim,HDim>(TopLeft) *= s;
-  return *this;
-}
-
-/** Applies on the right the translation matrix represented by the vector \a other
-  * to \c *this and returns a reference to \c *this.
-  * \sa pretranslate()
-  */
-template<typename Scalar, int Dim>
-template<typename OtherDerived>
-Transform<Scalar,Dim>&
-Transform<Scalar,Dim>::translate(const MatrixBase<OtherDerived> &other)
-{
-  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,int(Dim))
-  translation() += linear() * other;
-  return *this;
-}
-
-/** Applies on the left the translation matrix represented by the vector \a other
-  * to \c *this and returns a reference to \c *this.
-  * \sa translate()
-  */
-template<typename Scalar, int Dim>
-template<typename OtherDerived>
-Transform<Scalar,Dim>&
-Transform<Scalar,Dim>::pretranslate(const MatrixBase<OtherDerived> &other)
-{
-  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,int(Dim))
-  translation() += other;
-  return *this;
-}
-
-/** Applies on the right the rotation represented by the rotation \a rotation
-  * to \c *this and returns a reference to \c *this.
-  *
-  * The template parameter \a RotationType is the type of the rotation which
-  * must be known by ei_toRotationMatrix<>.
-  *
-  * Natively supported types includes:
-  *   - any scalar (2D),
-  *   - a Dim x Dim matrix expression,
-  *   - a Quaternion (3D),
-  *   - a AngleAxis (3D)
-  *
-  * This mechanism is easily extendable to support user types such as Euler angles,
-  * or a pair of Quaternion for 4D rotations.
-  *
-  * \sa rotate(Scalar), class Quaternion, class AngleAxis, prerotate(RotationType)
-  */
-template<typename Scalar, int Dim>
-template<typename RotationType>
-Transform<Scalar,Dim>&
-Transform<Scalar,Dim>::rotate(const RotationType& rotation)
-{
-  linear() *= ei_toRotationMatrix<Scalar,Dim>(rotation);
-  return *this;
-}
-
-/** Applies on the left the rotation represented by the rotation \a rotation
-  * to \c *this and returns a reference to \c *this.
-  *
-  * See rotate() for further details.
-  *
-  * \sa rotate()
-  */
-template<typename Scalar, int Dim>
-template<typename RotationType>
-Transform<Scalar,Dim>&
-Transform<Scalar,Dim>::prerotate(const RotationType& rotation)
-{
-  m_matrix.template block<Dim,HDim>(0,0) = ei_toRotationMatrix<Scalar,Dim>(rotation)
-                                         * m_matrix.template block<Dim,HDim>(0,0);
-  return *this;
-}
-
-/** Applies on the right the shear transformation represented
-  * by the vector \a other to \c *this and returns a reference to \c *this.
-  * \warning 2D only.
-  * \sa preshear()
-  */
-template<typename Scalar, int Dim>
-Transform<Scalar,Dim>&
-Transform<Scalar,Dim>::shear(Scalar sx, Scalar sy)
-{
-  EIGEN_STATIC_ASSERT(int(Dim)==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
-  VectorType tmp = linear().col(0)*sy + linear().col(1);
-  linear() << linear().col(0) + linear().col(1)*sx, tmp;
-  return *this;
-}
-
-/** Applies on the left the shear transformation represented
-  * by the vector \a other to \c *this and returns a reference to \c *this.
-  * \warning 2D only.
-  * \sa shear()
-  */
-template<typename Scalar, int Dim>
-Transform<Scalar,Dim>&
-Transform<Scalar,Dim>::preshear(Scalar sx, Scalar sy)
-{
-  EIGEN_STATIC_ASSERT(int(Dim)==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
-  m_matrix.template block<Dim,HDim>(0,0) = LinearMatrixType(1, sx, sy, 1) * m_matrix.template block<Dim,HDim>(0,0);
-  return *this;
-}
-
-/******************************************************
-*** Scaling, Translation and Rotation compatibility ***
-******************************************************/
-
-template<typename Scalar, int Dim>
-inline Transform<Scalar,Dim>& Transform<Scalar,Dim>::operator=(const TranslationType& t)
-{
-  linear().setIdentity();
-  translation() = t.vector();
-  m_matrix.template block<1,Dim>(Dim,0).setZero();
-  m_matrix(Dim,Dim) = Scalar(1);
-  return *this;
-}
-
-template<typename Scalar, int Dim>
-inline Transform<Scalar,Dim> Transform<Scalar,Dim>::operator*(const TranslationType& t) const
-{
-  Transform res = *this;
-  res.translate(t.vector());
-  return res;
-}
-
-template<typename Scalar, int Dim>
-inline Transform<Scalar,Dim>& Transform<Scalar,Dim>::operator=(const ScalingType& s)
-{
-  m_matrix.setZero();
-  linear().diagonal() = s.coeffs();
-  m_matrix.coeffRef(Dim,Dim) = Scalar(1);
-  return *this;
-}
-
-template<typename Scalar, int Dim>
-inline Transform<Scalar,Dim> Transform<Scalar,Dim>::operator*(const ScalingType& s) const
-{
-  Transform res = *this;
-  res.scale(s.coeffs());
-  return res;
-}
-
-template<typename Scalar, int Dim>
-template<typename Derived>
-inline Transform<Scalar,Dim>& Transform<Scalar,Dim>::operator=(const RotationBase<Derived,Dim>& r)
-{
-  linear() = ei_toRotationMatrix<Scalar,Dim>(r);
-  translation().setZero();
-  m_matrix.template block<1,Dim>(Dim,0).setZero();
-  m_matrix.coeffRef(Dim,Dim) = Scalar(1);
-  return *this;
-}
-
-template<typename Scalar, int Dim>
-template<typename Derived>
-inline Transform<Scalar,Dim> Transform<Scalar,Dim>::operator*(const RotationBase<Derived,Dim>& r) const
-{
-  Transform res = *this;
-  res.rotate(r.derived());
-  return res;
-}
-
-/************************
-*** Special functions ***
-************************/
-
-/** \returns the rotation part of the transformation
-  * \nonstableyet
-  *
-  * \svd_module
-  *
-  * \sa computeRotationScaling(), computeScalingRotation(), class SVD
-  */
-template<typename Scalar, int Dim>
-typename Transform<Scalar,Dim>::LinearMatrixType
-Transform<Scalar,Dim>::rotation() const
-{
-  LinearMatrixType result;
-  computeRotationScaling(&result, (LinearMatrixType*)0);
-  return result;
-}
-
-
-/** decomposes the linear part of the transformation as a product rotation x scaling, the scaling being
-  * not necessarily positive.
-  *
-  * If either pointer is zero, the corresponding computation is skipped.
-  *
-  * \nonstableyet
-  *
-  * \svd_module
-  *
-  * \sa computeScalingRotation(), rotation(), class SVD
-  */
-template<typename Scalar, int Dim>
-template<typename RotationMatrixType, typename ScalingMatrixType>
-void Transform<Scalar,Dim>::computeRotationScaling(RotationMatrixType *rotation, ScalingMatrixType *scaling) const
-{
-  JacobiSVD<LinearMatrixType> svd(linear(), ComputeFullU|ComputeFullV);
-  Scalar x = (svd.matrixU() * svd.matrixV().adjoint()).determinant(); // so x has absolute value 1
-  Matrix<Scalar, Dim, 1> sv(svd.singularValues());
-  sv.coeffRef(0) *= x;
-  if(scaling)
-  {
-    scaling->noalias() = svd.matrixV() * sv.asDiagonal() * svd.matrixV().adjoint();
-  }
-  if(rotation)
-  {
-    LinearMatrixType m(svd.matrixU());
-    m.col(0) /= x;
-    rotation->noalias() = m * svd.matrixV().adjoint();
-  }
-}
-
-/** decomposes the linear part of the transformation as a product rotation x scaling, the scaling being
-  * not necessarily positive.
-  *
-  * If either pointer is zero, the corresponding computation is skipped.
-  *
-  * \nonstableyet
-  *
-  * \svd_module
-  *
-  * \sa computeRotationScaling(), rotation(), class SVD
-  */
-template<typename Scalar, int Dim>
-template<typename ScalingMatrixType, typename RotationMatrixType>
-void Transform<Scalar,Dim>::computeScalingRotation(ScalingMatrixType *scaling, RotationMatrixType *rotation) const
-{
-  JacobiSVD<LinearMatrixType> svd(linear(), ComputeFullU|ComputeFullV);
-  Scalar x = (svd.matrixU() * svd.matrixV().adjoint()).determinant(); // so x has absolute value 1
-  Matrix<Scalar, Dim, 1> sv(svd.singularValues());
-  sv.coeffRef(0) *= x;
-  if(scaling)
-  {
-    scaling->noalias() = svd.matrixU() * sv.asDiagonal() * svd.matrixU().adjoint();
-  }
-  if(rotation)
-  {
-    LinearMatrixType m(svd.matrixU());
-    m.col(0) /= x;
-    rotation->noalias() = m * svd.matrixV().adjoint();
-  }
-}
-
-/** Convenient method to set \c *this from a position, orientation and scale
-  * of a 3D object.
-  */
-template<typename Scalar, int Dim>
-template<typename PositionDerived, typename OrientationType, typename ScaleDerived>
-Transform<Scalar,Dim>&
-Transform<Scalar,Dim>::fromPositionOrientationScale(const MatrixBase<PositionDerived> &position,
-  const OrientationType& orientation, const MatrixBase<ScaleDerived> &scale)
-{
-  linear() = ei_toRotationMatrix<Scalar,Dim>(orientation);
-  linear() *= scale.asDiagonal();
-  translation() = position;
-  m_matrix.template block<1,Dim>(Dim,0).setZero();
-  m_matrix(Dim,Dim) = Scalar(1);
-  return *this;
-}
-
-/** \nonstableyet
-  *
-  * \returns the inverse transformation matrix according to some given knowledge
-  * on \c *this.
-  *
-  * \param traits allows to optimize the inversion process when the transformion
-  * is known to be not a general transformation. The possible values are:
-  *  - Projective if the transformation is not necessarily affine, i.e., if the
-  *    last row is not guaranteed to be [0 ... 0 1]
-  *  - Affine is the default, the last row is assumed to be [0 ... 0 1]
-  *  - Isometry if the transformation is only a concatenations of translations
-  *    and rotations.
-  *
-  * \warning unless \a traits is always set to NoShear or NoScaling, this function
-  * requires the generic inverse method of MatrixBase defined in the LU module. If
-  * you forget to include this module, then you will get hard to debug linking errors.
-  *
-  * \sa MatrixBase::inverse()
-  */
-template<typename Scalar, int Dim>
-inline const typename Transform<Scalar,Dim>::MatrixType
-Transform<Scalar,Dim>::inverse(TransformTraits traits) const
-{
-  if (traits == Projective)
-  {
-    return m_matrix.inverse();
-  }
-  else
-  {
-    MatrixType res;
-    if (traits == Affine)
-    {
-      res.template corner<Dim,Dim>(TopLeft) = linear().inverse();
-    }
-    else if (traits == Isometry)
-    {
-      res.template corner<Dim,Dim>(TopLeft) = linear().transpose();
-    }
-    else
-    {
-      ei_assert("invalid traits value in Transform::inverse()");
-    }
-    // translation and remaining parts
-    res.template corner<Dim,1>(TopRight) = - res.template corner<Dim,Dim>(TopLeft) * translation();
-    res.template corner<1,Dim>(BottomLeft).setZero();
-    res.coeffRef(Dim,Dim) = Scalar(1);
-    return res;
-  }
-}
-
-/*****************************************************
-*** Specializations of operator* with a MatrixBase ***
-*****************************************************/
-
-template<typename Other, int Dim, int HDim>
-struct ei_transform_product_impl<Other,Dim,HDim, HDim,HDim>
-{
-  typedef Transform<typename Other::Scalar,Dim> TransformType;
-  typedef typename TransformType::MatrixType MatrixType;
-  typedef typename ProductReturnType<MatrixType,Other>::Type ResultType;
-  static ResultType run(const TransformType& tr, const Other& other)
-  { return tr.matrix() * other; }
-};
-
-template<typename Other, int Dim, int HDim>
-struct ei_transform_product_impl<Other,Dim,HDim, Dim,Dim>
-{
-  typedef Transform<typename Other::Scalar,Dim> TransformType;
-  typedef typename TransformType::MatrixType MatrixType;
-  typedef TransformType ResultType;
-  static ResultType run(const TransformType& tr, const Other& other)
-  {
-    TransformType res;
-    res.translation() = tr.translation();
-    res.matrix().row(Dim) = tr.matrix().row(Dim);
-    res.linear() = (tr.linear() * other).lazy();
-    return res;
-  }
-};
-
-template<typename Other, int Dim, int HDim>
-struct ei_transform_product_impl<Other,Dim,HDim, HDim,1>
-{
-  typedef Transform<typename Other::Scalar,Dim> TransformType;
-  typedef typename TransformType::MatrixType MatrixType;
-  typedef typename ProductReturnType<MatrixType,Other>::Type ResultType;
-  static ResultType run(const TransformType& tr, const Other& other)
-  { return tr.matrix() * other; }
-};
-
-template<typename Other, int Dim, int HDim>
-struct ei_transform_product_impl<Other,Dim,HDim, Dim,1>
-{
-  typedef typename Other::Scalar Scalar;
-  typedef Transform<Scalar,Dim> TransformType;
-  typedef Matrix<Scalar,Dim,1> ResultType;
-  static ResultType run(const TransformType& tr, const Other& other)
-  { return ((tr.linear() * other) + tr.translation())
-          * (Scalar(1) / ( (tr.matrix().template block<1,Dim>(Dim,0) * other).coeff(0) + tr.matrix().coeff(Dim,Dim))); }
-};
-
-} // end namespace Eigen
diff --git a/Eigen/src/Eigen2Support/Geometry/Translation.h b/Eigen/src/Eigen2Support/Geometry/Translation.h
deleted file mode 100644
index 2b9859f6f..000000000
--- a/Eigen/src/Eigen2Support/Geometry/Translation.h
+++ /dev/null
@@ -1,184 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// no include guard, we'll include this twice from All.h from Eigen2Support, and it's internal anyway
-
-namespace Eigen { 
-
-/** \geometry_module \ingroup Geometry_Module
-  *
-  * \class Translation
-  *
-  * \brief Represents a translation transformation
-  *
-  * \param _Scalar the scalar type, i.e., the type of the coefficients.
-  * \param _Dim the  dimension of the space, can be a compile time value or Dynamic
-  *
-  * \note This class is not aimed to be used to store a translation transformation,
-  * but rather to make easier the constructions and updates of Transform objects.
-  *
-  * \sa class Scaling, class Transform
-  */
-template<typename _Scalar, int _Dim>
-class Translation
-{
-public:
-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_Dim)
-  /** dimension of the space */
-  enum { Dim = _Dim };
-  /** the scalar type of the coefficients */
-  typedef _Scalar Scalar;
-  /** corresponding vector type */
-  typedef Matrix<Scalar,Dim,1> VectorType;
-  /** corresponding linear transformation matrix type */
-  typedef Matrix<Scalar,Dim,Dim> LinearMatrixType;
-  /** corresponding scaling transformation type */
-  typedef Scaling<Scalar,Dim> ScalingType;
-  /** corresponding affine transformation type */
-  typedef Transform<Scalar,Dim> TransformType;
-
-protected:
-
-  VectorType m_coeffs;
-
-public:
-
-  /** Default constructor without initialization. */
-  Translation() {}
-  /**  */
-  inline Translation(const Scalar& sx, const Scalar& sy)
-  {
-    ei_assert(Dim==2);
-    m_coeffs.x() = sx;
-    m_coeffs.y() = sy;
-  }
-  /**  */
-  inline Translation(const Scalar& sx, const Scalar& sy, const Scalar& sz)
-  {
-    ei_assert(Dim==3);
-    m_coeffs.x() = sx;
-    m_coeffs.y() = sy;
-    m_coeffs.z() = sz;
-  }
-  /** Constructs and initialize the scaling transformation from a vector of scaling coefficients */
-  explicit inline Translation(const VectorType& vector) : m_coeffs(vector) {}
-
-  const VectorType& vector() const { return m_coeffs; }
-  VectorType& vector() { return m_coeffs; }
-
-  /** Concatenates two translation */
-  inline Translation operator* (const Translation& other) const
-  { return Translation(m_coeffs + other.m_coeffs); }
-
-  /** Concatenates a translation and a scaling */
-  inline TransformType operator* (const ScalingType& other) const;
-
-  /** Concatenates a translation and a linear transformation */
-  inline TransformType operator* (const LinearMatrixType& linear) const;
-
-  template<typename Derived>
-  inline TransformType operator*(const RotationBase<Derived,Dim>& r) const
-  { return *this * r.toRotationMatrix(); }
-
-  /** Concatenates a linear transformation and a translation */
-  // its a nightmare to define a templated friend function outside its declaration
-  friend inline TransformType operator* (const LinearMatrixType& linear, const Translation& t)
-  {
-    TransformType res;
-    res.matrix().setZero();
-    res.linear() = linear;
-    res.translation() = linear * t.m_coeffs;
-    res.matrix().row(Dim).setZero();
-    res(Dim,Dim) = Scalar(1);
-    return res;
-  }
-
-  /** Concatenates a translation and an affine transformation */
-  inline TransformType operator* (const TransformType& t) const;
-
-  /** Applies translation to vector */
-  inline VectorType operator* (const VectorType& other) const
-  { return m_coeffs + other; }
-
-  /** \returns the inverse translation (opposite) */
-  Translation inverse() const { return Translation(-m_coeffs); }
-
-  Translation& operator=(const Translation& other)
-  {
-    m_coeffs = other.m_coeffs;
-    return *this;
-  }
-
-  /** \returns \c *this with scalar type casted to \a NewScalarType
-    *
-    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
-    * then this function smartly returns a const reference to \c *this.
-    */
-  template<typename NewScalarType>
-  inline typename internal::cast_return_type<Translation,Translation<NewScalarType,Dim> >::type cast() const
-  { return typename internal::cast_return_type<Translation,Translation<NewScalarType,Dim> >::type(*this); }
-
-  /** Copy constructor with scalar type conversion */
-  template<typename OtherScalarType>
-  inline explicit Translation(const Translation<OtherScalarType,Dim>& other)
-  { m_coeffs = other.vector().template cast<Scalar>(); }
-
-  /** \returns \c true if \c *this is approximately equal to \a other, within the precision
-    * determined by \a prec.
-    *
-    * \sa MatrixBase::isApprox() */
-  bool isApprox(const Translation& other, typename NumTraits<Scalar>::Real prec = precision<Scalar>()) const
-  { return m_coeffs.isApprox(other.m_coeffs, prec); }
-
-};
-
-/** \addtogroup Geometry_Module */
-//@{
-typedef Translation<float, 2> Translation2f;
-typedef Translation<double,2> Translation2d;
-typedef Translation<float, 3> Translation3f;
-typedef Translation<double,3> Translation3d;
-//@}
-
-
-template<typename Scalar, int Dim>
-inline typename Translation<Scalar,Dim>::TransformType
-Translation<Scalar,Dim>::operator* (const ScalingType& other) const
-{
-  TransformType res;
-  res.matrix().setZero();
-  res.linear().diagonal() = other.coeffs();
-  res.translation() = m_coeffs;
-  res(Dim,Dim) = Scalar(1);
-  return res;
-}
-
-template<typename Scalar, int Dim>
-inline typename Translation<Scalar,Dim>::TransformType
-Translation<Scalar,Dim>::operator* (const LinearMatrixType& linear) const
-{
-  TransformType res;
-  res.matrix().setZero();
-  res.linear() = linear;
-  res.translation() = m_coeffs;
-  res.matrix().row(Dim).setZero();
-  res(Dim,Dim) = Scalar(1);
-  return res;
-}
-
-template<typename Scalar, int Dim>
-inline typename Translation<Scalar,Dim>::TransformType
-Translation<Scalar,Dim>::operator* (const TransformType& t) const
-{
-  TransformType res = t;
-  res.pretranslate(m_coeffs);
-  return res;
-}
-
-} // end namespace Eigen
diff --git a/Eigen/src/Eigen2Support/LU.h b/Eigen/src/Eigen2Support/LU.h
deleted file mode 100644
index 49f19ad76..000000000
--- a/Eigen/src/Eigen2Support/LU.h
+++ /dev/null
@@ -1,120 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2011 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN2_LU_H
-#define EIGEN2_LU_H
-
-namespace Eigen { 
-
-template<typename MatrixType>
-class LU : public FullPivLU<MatrixType>
-{
-  public:
-
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
-    typedef Matrix<int, 1, MatrixType::ColsAtCompileTime, MatrixType::Options, 1, MatrixType::MaxColsAtCompileTime> IntRowVectorType;
-    typedef Matrix<int, MatrixType::RowsAtCompileTime, 1, MatrixType::Options, MatrixType::MaxRowsAtCompileTime, 1> IntColVectorType;
-    typedef Matrix<Scalar, 1, MatrixType::ColsAtCompileTime, MatrixType::Options, 1, MatrixType::MaxColsAtCompileTime> RowVectorType;
-    typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1, MatrixType::Options, MatrixType::MaxRowsAtCompileTime, 1> ColVectorType;
-
-    typedef Matrix<typename MatrixType::Scalar,
-                  MatrixType::ColsAtCompileTime, // the number of rows in the "kernel matrix" is the number of cols of the original matrix
-                                                 // so that the product "matrix * kernel = zero" makes sense
-                  Dynamic,                       // we don't know at compile-time the dimension of the kernel
-                  MatrixType::Options,
-                  MatrixType::MaxColsAtCompileTime, // see explanation for 2nd template parameter
-                  MatrixType::MaxColsAtCompileTime // the kernel is a subspace of the domain space, whose dimension is the number
-                                                   // of columns of the original matrix
-    > KernelResultType;
-
-    typedef Matrix<typename MatrixType::Scalar,
-                   MatrixType::RowsAtCompileTime, // the image is a subspace of the destination space, whose dimension is the number
-                                                  // of rows of the original matrix
-                   Dynamic,                       // we don't know at compile time the dimension of the image (the rank)
-                   MatrixType::Options,
-                   MatrixType::MaxRowsAtCompileTime, // the image matrix will consist of columns from the original matrix,
-                   MatrixType::MaxColsAtCompileTime  // so it has the same number of rows and at most as many columns.
-    > ImageResultType;
-
-    typedef FullPivLU<MatrixType> Base;
-
-    template<typename T>
-    explicit LU(const T& t) : Base(t), m_originalMatrix(t) {}
-
-    template<typename OtherDerived, typename ResultType>
-    bool solve(const MatrixBase<OtherDerived>& b, ResultType *result) const
-    {
-      *result = static_cast<const Base*>(this)->solve(b);
-      return true;
-    }
-
-    template<typename ResultType>
-    inline void computeInverse(ResultType *result) const
-    {
-      solve(MatrixType::Identity(this->rows(), this->cols()), result);
-    }
-    
-    template<typename KernelMatrixType>
-    void computeKernel(KernelMatrixType *result) const
-    {
-      *result = static_cast<const Base*>(this)->kernel();
-    }
-    
-    template<typename ImageMatrixType>
-    void computeImage(ImageMatrixType *result) const
-    {
-      *result = static_cast<const Base*>(this)->image(m_originalMatrix);
-    }
-    
-    const ImageResultType image() const
-    {
-      return static_cast<const Base*>(this)->image(m_originalMatrix);
-    }
-    
-    const MatrixType& m_originalMatrix;
-};
-
-#if EIGEN2_SUPPORT_STAGE < STAGE20_RESOLVE_API_CONFLICTS
-/** \lu_module
-  *
-  * Synonym of partialPivLu().
-  *
-  * \return the partial-pivoting LU decomposition of \c *this.
-  *
-  * \sa class PartialPivLU
-  */
-template<typename Derived>
-inline const LU<typename MatrixBase<Derived>::PlainObject>
-MatrixBase<Derived>::lu() const
-{
-  return LU<PlainObject>(eval());
-}
-#endif
-
-#ifdef EIGEN2_SUPPORT
-/** \lu_module
-  *
-  * Synonym of partialPivLu().
-  *
-  * \return the partial-pivoting LU decomposition of \c *this.
-  *
-  * \sa class PartialPivLU
-  */
-template<typename Derived>
-inline const LU<typename MatrixBase<Derived>::PlainObject>
-MatrixBase<Derived>::eigen2_lu() const
-{
-  return LU<PlainObject>(eval());
-}
-#endif
-
-} // end namespace Eigen
-
-#endif // EIGEN2_LU_H
diff --git a/Eigen/src/Eigen2Support/Lazy.h b/Eigen/src/Eigen2Support/Lazy.h
deleted file mode 100644
index 593fc78e6..000000000
--- a/Eigen/src/Eigen2Support/Lazy.h
+++ /dev/null
@@ -1,71 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_LAZY_H
-#define EIGEN_LAZY_H
-
-namespace Eigen { 
-
-/** \deprecated it is only used by lazy() which is deprecated
-  *
-  * \returns an expression of *this with added flags
-  *
-  * Example: \include MatrixBase_marked.cpp
-  * Output: \verbinclude MatrixBase_marked.out
-  *
-  * \sa class Flagged, extract(), part()
-  */
-template<typename Derived>
-template<unsigned int Added>
-inline const Flagged<Derived, Added, 0>
-MatrixBase<Derived>::marked() const
-{
-  return derived();
-}
-
-/** \deprecated use MatrixBase::noalias()
-  *
-  * \returns an expression of *this with the EvalBeforeAssigningBit flag removed.
-  *
-  * Example: \include MatrixBase_lazy.cpp
-  * Output: \verbinclude MatrixBase_lazy.out
-  *
-  * \sa class Flagged, marked()
-  */
-template<typename Derived>
-inline const Flagged<Derived, 0, EvalBeforeAssigningBit>
-MatrixBase<Derived>::lazy() const
-{
-  return derived();
-}
-
-
-/** \internal
-  * Overloaded to perform an efficient C += (A*B).lazy() */
-template<typename Derived>
-template<typename ProductDerived, typename Lhs, typename Rhs>
-Derived& MatrixBase<Derived>::operator+=(const Flagged<ProductBase<ProductDerived, Lhs,Rhs>, 0,
-                                                       EvalBeforeAssigningBit>& other)
-{
-  other._expression().derived().addTo(derived()); return derived();
-}
-
-/** \internal
-  * Overloaded to perform an efficient C -= (A*B).lazy() */
-template<typename Derived>
-template<typename ProductDerived, typename Lhs, typename Rhs>
-Derived& MatrixBase<Derived>::operator-=(const Flagged<ProductBase<ProductDerived, Lhs,Rhs>, 0,
-                                                       EvalBeforeAssigningBit>& other)
-{
-  other._expression().derived().subTo(derived()); return derived();
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN_LAZY_H
diff --git a/Eigen/src/Eigen2Support/LeastSquares.h b/Eigen/src/Eigen2Support/LeastSquares.h
deleted file mode 100644
index 7992d4944..000000000
--- a/Eigen/src/Eigen2Support/LeastSquares.h
+++ /dev/null
@@ -1,169 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2006-2009 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN2_LEASTSQUARES_H
-#define EIGEN2_LEASTSQUARES_H
-
-namespace Eigen { 
-
-/** \ingroup LeastSquares_Module
-  *
-  * \leastsquares_module
-  *
-  * For a set of points, this function tries to express
-  * one of the coords as a linear (affine) function of the other coords.
-  *
-  * This is best explained by an example. This function works in full
-  * generality, for points in a space of arbitrary dimension, and also over
-  * the complex numbers, but for this example we will work in dimension 3
-  * over the real numbers (doubles).
-  *
-  * So let us work with the following set of 5 points given by their
-  * \f$(x,y,z)\f$ coordinates:
-  * @code
-    Vector3d points[5];
-    points[0] = Vector3d( 3.02, 6.89, -4.32 );
-    points[1] = Vector3d( 2.01, 5.39, -3.79 );
-    points[2] = Vector3d( 2.41, 6.01, -4.01 );
-    points[3] = Vector3d( 2.09, 5.55, -3.86 );
-    points[4] = Vector3d( 2.58, 6.32, -4.10 );
-  * @endcode
-  * Suppose that we want to express the second coordinate (\f$y\f$) as a linear
-  * expression in \f$x\f$ and \f$z\f$, that is,
-  * \f[ y=ax+bz+c \f]
-  * for some constants \f$a,b,c\f$. Thus, we want to find the best possible
-  * constants \f$a,b,c\f$ so that the plane of equation \f$y=ax+bz+c\f$ fits
-  * best the five above points. To do that, call this function as follows:
-  * @code
-    Vector3d coeffs; // will store the coefficients a, b, c
-    linearRegression(
-      5,
-      &points,
-      &coeffs,
-      1 // the coord to express as a function of
-        // the other ones. 0 means x, 1 means y, 2 means z.
-    );
-  * @endcode
-  * Now the vector \a coeffs is approximately
-  * \f$( 0.495 ,  -1.927 ,  -2.906 )\f$.
-  * Thus, we get \f$a=0.495, b = -1.927, c = -2.906\f$. Let us check for
-  * instance how near points[0] is from the plane of equation \f$y=ax+bz+c\f$.
-  * Looking at the coords of points[0], we see that:
-  * \f[ax+bz+c = 0.495 * 3.02 + (-1.927) * (-4.32) + (-2.906) = 6.91.\f]
-  * On the other hand, we have \f$y=6.89\f$. We see that the values
-  * \f$6.91\f$ and \f$6.89\f$
-  * are near, so points[0] is very near the plane of equation \f$y=ax+bz+c\f$.
-  *
-  * Let's now describe precisely the parameters:
-  * @param numPoints the number of points
-  * @param points the array of pointers to the points on which to perform the linear regression
-  * @param result pointer to the vector in which to store the result.
-                  This vector must be of the same type and size as the
-                  data points. The meaning of its coords is as follows.
-                  For brevity, let \f$n=Size\f$,
-                  \f$r_i=result[i]\f$,
-                  and \f$f=funcOfOthers\f$. Denote by
-                  \f$x_0,\ldots,x_{n-1}\f$
-                  the n coordinates in the n-dimensional space.
-                  Then the resulting equation is:
-                  \f[ x_f = r_0 x_0 + \cdots + r_{f-1}x_{f-1}
-                   + r_{f+1}x_{f+1} + \cdots + r_{n-1}x_{n-1} + r_n. \f]
-  * @param funcOfOthers Determines which coord to express as a function of the
-                        others. Coords are numbered starting from 0, so that a
-                        value of 0 means \f$x\f$, 1 means \f$y\f$,
-                        2 means \f$z\f$, ...
-  *
-  * \sa fitHyperplane()
-  */
-template<typename VectorType>
-void linearRegression(int numPoints,
-                      VectorType **points,
-                      VectorType *result,
-                      int funcOfOthers )
-{
-  typedef typename VectorType::Scalar Scalar;
-  typedef Hyperplane<Scalar, VectorType::SizeAtCompileTime> HyperplaneType;
-  const int size = points[0]->size();
-  result->resize(size);
-  HyperplaneType h(size);
-  fitHyperplane(numPoints, points, &h);
-  for(int i = 0; i < funcOfOthers; i++)
-    result->coeffRef(i) = - h.coeffs()[i] / h.coeffs()[funcOfOthers];
-  for(int i = funcOfOthers; i < size; i++)
-    result->coeffRef(i) = - h.coeffs()[i+1] / h.coeffs()[funcOfOthers];
-}
-
-/** \ingroup LeastSquares_Module
-  *
-  * \leastsquares_module
-  *
-  * This function is quite similar to linearRegression(), so we refer to the
-  * documentation of this function and only list here the differences.
-  *
-  * The main difference from linearRegression() is that this function doesn't
-  * take a \a funcOfOthers argument. Instead, it finds a general equation
-  * of the form
-  * \f[ r_0 x_0 + \cdots + r_{n-1}x_{n-1} + r_n = 0, \f]
-  * where \f$n=Size\f$, \f$r_i=retCoefficients[i]\f$, and we denote by
-  * \f$x_0,\ldots,x_{n-1}\f$ the n coordinates in the n-dimensional space.
-  *
-  * Thus, the vector \a retCoefficients has size \f$n+1\f$, which is another
-  * difference from linearRegression().
-  *
-  * In practice, this function performs an hyper-plane fit in a total least square sense
-  * via the following steps:
-  *  1 - center the data to the mean
-  *  2 - compute the covariance matrix
-  *  3 - pick the eigenvector corresponding to the smallest eigenvalue of the covariance matrix
-  * The ratio of the smallest eigenvalue and the second one gives us a hint about the relevance
-  * of the solution. This value is optionally returned in \a soundness.
-  *
-  * \sa linearRegression()
-  */
-template<typename VectorType, typename HyperplaneType>
-void fitHyperplane(int numPoints,
-                   VectorType **points,
-                   HyperplaneType *result,
-                   typename NumTraits<typename VectorType::Scalar>::Real* soundness = 0)
-{
-  typedef typename VectorType::Scalar Scalar;
-  typedef Matrix<Scalar,VectorType::SizeAtCompileTime,VectorType::SizeAtCompileTime> CovMatrixType;
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(VectorType)
-  ei_assert(numPoints >= 1);
-  int size = points[0]->size();
-  ei_assert(size+1 == result->coeffs().size());
-
-  // compute the mean of the data
-  VectorType mean = VectorType::Zero(size);
-  for(int i = 0; i < numPoints; ++i)
-    mean += *(points[i]);
-  mean /= numPoints;
-
-  // compute the covariance matrix
-  CovMatrixType covMat = CovMatrixType::Zero(size, size);
-  for(int i = 0; i < numPoints; ++i)
-  {
-    VectorType diff = (*(points[i]) - mean).conjugate();
-    covMat += diff * diff.adjoint();
-  }
-
-  // now we just have to pick the eigen vector with smallest eigen value
-  SelfAdjointEigenSolver<CovMatrixType> eig(covMat);
-  result->normal() = eig.eigenvectors().col(0);
-  if (soundness)
-    *soundness = eig.eigenvalues().coeff(0)/eig.eigenvalues().coeff(1);
-
-  // let's compute the constant coefficient such that the
-  // plane pass trough the mean point:
-  result->offset() = - (result->normal().cwise()* mean).sum();
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN2_LEASTSQUARES_H
diff --git a/Eigen/src/Eigen2Support/Macros.h b/Eigen/src/Eigen2Support/Macros.h
deleted file mode 100644
index 351c32afb..000000000
--- a/Eigen/src/Eigen2Support/Macros.h
+++ /dev/null
@@ -1,20 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2011 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN2_MACROS_H
-#define EIGEN2_MACROS_H
-
-#define ei_assert eigen_assert
-#define ei_internal_assert eigen_internal_assert
-
-#define EIGEN_ALIGN_128 EIGEN_ALIGN16
-
-#define EIGEN_ARCH_WANTS_ALIGNMENT EIGEN_ALIGN_STATICALLY
-
-#endif // EIGEN2_MACROS_H
diff --git a/Eigen/src/Eigen2Support/MathFunctions.h b/Eigen/src/Eigen2Support/MathFunctions.h
deleted file mode 100644
index 3544af253..000000000
--- a/Eigen/src/Eigen2Support/MathFunctions.h
+++ /dev/null
@@ -1,57 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN2_MATH_FUNCTIONS_H
-#define EIGEN2_MATH_FUNCTIONS_H
-
-namespace Eigen { 
-
-template<typename T> inline typename NumTraits<T>::Real ei_real(const T& x) { return numext::real(x); }
-template<typename T> inline typename NumTraits<T>::Real ei_imag(const T& x) { return numext::imag(x); }
-template<typename T> inline T ei_conj(const T& x) { return numext::conj(x); }
-template<typename T> inline typename NumTraits<T>::Real ei_abs (const T& x) { using std::abs; return abs(x); }
-template<typename T> inline typename NumTraits<T>::Real ei_abs2(const T& x) { return numext::abs2(x); }
-template<typename T> inline T ei_sqrt(const T& x) { using std::sqrt; return sqrt(x); }
-template<typename T> inline T ei_exp (const T& x) { using std::exp;  return exp(x); }
-template<typename T> inline T ei_log (const T& x) { using std::log;  return log(x); }
-template<typename T> inline T ei_sin (const T& x) { using std::sin;  return sin(x); }
-template<typename T> inline T ei_cos (const T& x) { using std::cos;  return cos(x); }
-template<typename T> inline T ei_atan2(const T& x,const T& y) { using std::atan2; return atan2(x,y); }
-template<typename T> inline T ei_pow (const T& x,const T& y) { return numext::pow(x,y); }
-template<typename T> inline T ei_random () { return internal::random<T>(); }
-template<typename T> inline T ei_random (const T& x, const T& y) { return internal::random(x, y); }
-
-template<typename T> inline T precision () { return NumTraits<T>::dummy_precision(); }
-template<typename T> inline T machine_epsilon () { return NumTraits<T>::epsilon(); }
-
-
-template<typename Scalar, typename OtherScalar>
-inline bool ei_isMuchSmallerThan(const Scalar& x, const OtherScalar& y,
-                                   typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision())
-{
-  return internal::isMuchSmallerThan(x, y, precision);
-}
-
-template<typename Scalar>
-inline bool ei_isApprox(const Scalar& x, const Scalar& y,
-                          typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision())
-{
-  return internal::isApprox(x, y, precision);
-}
-
-template<typename Scalar>
-inline bool ei_isApproxOrLessThan(const Scalar& x, const Scalar& y,
-                                    typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision())
-{
-  return internal::isApproxOrLessThan(x, y, precision);
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN2_MATH_FUNCTIONS_H
diff --git a/Eigen/src/Eigen2Support/Memory.h b/Eigen/src/Eigen2Support/Memory.h
deleted file mode 100644
index f86372b6b..000000000
--- a/Eigen/src/Eigen2Support/Memory.h
+++ /dev/null
@@ -1,45 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2011 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN2_MEMORY_H
-#define EIGEN2_MEMORY_H
-
-namespace Eigen { 
-
-inline void* ei_aligned_malloc(size_t size) { return internal::aligned_malloc(size); }
-inline void  ei_aligned_free(void *ptr) { internal::aligned_free(ptr); }
-inline void* ei_aligned_realloc(void *ptr, size_t new_size, size_t old_size) { return internal::aligned_realloc(ptr, new_size, old_size); }
-inline void* ei_handmade_aligned_malloc(size_t size) { return internal::handmade_aligned_malloc(size); }
-inline void  ei_handmade_aligned_free(void *ptr) { internal::handmade_aligned_free(ptr); }
-
-template<bool Align> inline void* ei_conditional_aligned_malloc(size_t size)
-{
-  return internal::conditional_aligned_malloc<Align>(size);
-}
-template<bool Align> inline void ei_conditional_aligned_free(void *ptr)
-{
-  internal::conditional_aligned_free<Align>(ptr);
-}
-template<bool Align> inline void* ei_conditional_aligned_realloc(void* ptr, size_t new_size, size_t old_size)
-{
-  return internal::conditional_aligned_realloc<Align>(ptr, new_size, old_size);
-}
-
-template<typename T> inline T* ei_aligned_new(size_t size)
-{
-  return internal::aligned_new<T>(size);
-}
-template<typename T> inline void ei_aligned_delete(T *ptr, size_t size)
-{
-  return internal::aligned_delete(ptr, size);
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN2_MACROS_H
diff --git a/Eigen/src/Eigen2Support/Meta.h b/Eigen/src/Eigen2Support/Meta.h
deleted file mode 100644
index fa37cfc96..000000000
--- a/Eigen/src/Eigen2Support/Meta.h
+++ /dev/null
@@ -1,75 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2011 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN2_META_H
-#define EIGEN2_META_H
-
-namespace Eigen { 
-
-template<typename T>
-struct ei_traits : internal::traits<T>
-{};
-
-struct ei_meta_true {  enum { ret = 1 }; };
-struct ei_meta_false { enum { ret = 0 }; };
-
-template<bool Condition, typename Then, typename Else>
-struct ei_meta_if { typedef Then ret; };
-
-template<typename Then, typename Else>
-struct ei_meta_if <false, Then, Else> { typedef Else ret; };
-
-template<typename T, typename U> struct ei_is_same_type { enum { ret = 0 }; };
-template<typename T> struct ei_is_same_type<T,T> { enum { ret = 1 }; };
-
-template<typename T> struct ei_unref { typedef T type; };
-template<typename T> struct ei_unref<T&> { typedef T type; };
-
-template<typename T> struct ei_unpointer { typedef T type; };
-template<typename T> struct ei_unpointer<T*> { typedef T type; };
-template<typename T> struct ei_unpointer<T*const> { typedef T type; };
-
-template<typename T> struct ei_unconst { typedef T type; };
-template<typename T> struct ei_unconst<const T> { typedef T type; };
-template<typename T> struct ei_unconst<T const &> { typedef T & type; };
-template<typename T> struct ei_unconst<T const *> { typedef T * type; };
-
-template<typename T> struct ei_cleantype { typedef T type; };
-template<typename T> struct ei_cleantype<const T>   { typedef typename ei_cleantype<T>::type type; };
-template<typename T> struct ei_cleantype<const T&>  { typedef typename ei_cleantype<T>::type type; };
-template<typename T> struct ei_cleantype<T&>        { typedef typename ei_cleantype<T>::type type; };
-template<typename T> struct ei_cleantype<const T*>  { typedef typename ei_cleantype<T>::type type; };
-template<typename T> struct ei_cleantype<T*>        { typedef typename ei_cleantype<T>::type type; };
-
-/** \internal In short, it computes int(sqrt(\a Y)) with \a Y an integer.
-  * Usage example: \code ei_meta_sqrt<1023>::ret \endcode
-  */
-template<int Y,
-         int InfX = 0,
-         int SupX = ((Y==1) ? 1 : Y/2),
-         bool Done = ((SupX-InfX)<=1 ? true : ((SupX*SupX <= Y) && ((SupX+1)*(SupX+1) > Y))) >
-                                // use ?: instead of || just to shut up a stupid gcc 4.3 warning
-class ei_meta_sqrt
-{
-    enum {
-      MidX = (InfX+SupX)/2,
-      TakeInf = MidX*MidX > Y ? 1 : 0,
-      NewInf = int(TakeInf) ? InfX : int(MidX),
-      NewSup = int(TakeInf) ? int(MidX) : SupX
-    };
-  public:
-    enum { ret = ei_meta_sqrt<Y,NewInf,NewSup>::ret };
-};
-
-template<int Y, int InfX, int SupX>
-class ei_meta_sqrt<Y, InfX, SupX, true> { public:  enum { ret = (SupX*SupX <= Y) ? SupX : InfX }; };
-
-} // end namespace Eigen
-
-#endif // EIGEN2_META_H
diff --git a/Eigen/src/Eigen2Support/Minor.h b/Eigen/src/Eigen2Support/Minor.h
deleted file mode 100644
index 4cded5734..000000000
--- a/Eigen/src/Eigen2Support/Minor.h
+++ /dev/null
@@ -1,117 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2006-2009 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_MINOR_H
-#define EIGEN_MINOR_H
-
-namespace Eigen { 
-
-/**
-  * \class Minor
-  *
-  * \brief Expression of a minor
-  *
-  * \param MatrixType the type of the object in which we are taking a minor
-  *
-  * This class represents an expression of a minor. It is the return
-  * type of MatrixBase::minor() and most of the time this is the only way it
-  * is used.
-  *
-  * \sa MatrixBase::minor()
-  */
-
-namespace internal {
-template<typename MatrixType>
-struct traits<Minor<MatrixType> >
- : traits<MatrixType>
-{
-  typedef typename nested<MatrixType>::type MatrixTypeNested;
-  typedef typename remove_reference<MatrixTypeNested>::type _MatrixTypeNested;
-  typedef typename MatrixType::StorageKind StorageKind;
-  enum {
-    RowsAtCompileTime = (MatrixType::RowsAtCompileTime != Dynamic) ?
-                          int(MatrixType::RowsAtCompileTime) - 1 : Dynamic,
-    ColsAtCompileTime = (MatrixType::ColsAtCompileTime != Dynamic) ?
-                          int(MatrixType::ColsAtCompileTime) - 1 : Dynamic,
-    MaxRowsAtCompileTime = (MatrixType::MaxRowsAtCompileTime != Dynamic) ?
-                             int(MatrixType::MaxRowsAtCompileTime) - 1 : Dynamic,
-    MaxColsAtCompileTime = (MatrixType::MaxColsAtCompileTime != Dynamic) ?
-                             int(MatrixType::MaxColsAtCompileTime) - 1 : Dynamic,
-    Flags = _MatrixTypeNested::Flags & (HereditaryBits | LvalueBit),
-    CoeffReadCost = _MatrixTypeNested::CoeffReadCost // minor is used typically on tiny matrices,
-      // where loops are unrolled and the 'if' evaluates at compile time
-  };
-};
-}
-
-template<typename MatrixType> class Minor
-  : public MatrixBase<Minor<MatrixType> >
-{
-  public:
-
-    typedef MatrixBase<Minor> Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(Minor)
-
-    inline Minor(const MatrixType& matrix,
-                       Index row, Index col)
-      : m_matrix(matrix), m_row(row), m_col(col)
-    {
-      eigen_assert(row >= 0 && row < matrix.rows()
-          && col >= 0 && col < matrix.cols());
-    }
-
-    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Minor)
-
-    inline Index rows() const { return m_matrix.rows() - 1; }
-    inline Index cols() const { return m_matrix.cols() - 1; }
-
-    inline Scalar& coeffRef(Index row, Index col)
-    {
-      return m_matrix.const_cast_derived().coeffRef(row + (row >= m_row), col + (col >= m_col));
-    }
-
-    inline const Scalar coeff(Index row, Index col) const
-    {
-      return m_matrix.coeff(row + (row >= m_row), col + (col >= m_col));
-    }
-
-  protected:
-    const typename MatrixType::Nested m_matrix;
-    const Index m_row, m_col;
-};
-
-/**
-  * \return an expression of the (\a row, \a col)-minor of *this,
-  * i.e. an expression constructed from *this by removing the specified
-  * row and column.
-  *
-  * Example: \include MatrixBase_minor.cpp
-  * Output: \verbinclude MatrixBase_minor.out
-  *
-  * \sa class Minor
-  */
-template<typename Derived>
-inline Minor<Derived>
-MatrixBase<Derived>::minor(Index row, Index col)
-{
-  return Minor<Derived>(derived(), row, col);
-}
-
-/**
-  * This is the const version of minor(). */
-template<typename Derived>
-inline const Minor<Derived>
-MatrixBase<Derived>::minor(Index row, Index col) const
-{
-  return Minor<Derived>(derived(), row, col);
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN_MINOR_H
diff --git a/Eigen/src/Eigen2Support/QR.h b/Eigen/src/Eigen2Support/QR.h
deleted file mode 100644
index 2042c9851..000000000
--- a/Eigen/src/Eigen2Support/QR.h
+++ /dev/null
@@ -1,67 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-// Copyright (C) 2011 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN2_QR_H
-#define EIGEN2_QR_H
-
-namespace Eigen { 
-
-template<typename MatrixType>
-class QR : public HouseholderQR<MatrixType>
-{
-  public:
-
-    typedef HouseholderQR<MatrixType> Base;
-    typedef Block<const MatrixType, MatrixType::ColsAtCompileTime, MatrixType::ColsAtCompileTime> MatrixRBlockType;
-
-    QR() : Base() {}
-
-    template<typename T>
-    explicit QR(const T& t) : Base(t) {}
-
-    template<typename OtherDerived, typename ResultType>
-    bool solve(const MatrixBase<OtherDerived>& b, ResultType *result) const
-    {
-      *result = static_cast<const Base*>(this)->solve(b);
-      return true;
-    }
-
-    MatrixType matrixQ(void) const {
-      MatrixType ret = MatrixType::Identity(this->rows(), this->cols());
-      ret = this->householderQ() * ret;
-      return ret;
-    }
-
-    bool isFullRank() const {
-      return true;
-    }
-    
-    const TriangularView<MatrixRBlockType, UpperTriangular>
-    matrixR(void) const
-    {
-      int cols = this->cols();
-      return MatrixRBlockType(this->matrixQR(), 0, 0, cols, cols).template triangularView<UpperTriangular>();
-    }
-};
-
-/** \return the QR decomposition of \c *this.
-  *
-  * \sa class QR
-  */
-template<typename Derived>
-const QR<typename MatrixBase<Derived>::PlainObject>
-MatrixBase<Derived>::qr() const
-{
-  return QR<PlainObject>(eval());
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN2_QR_H
diff --git a/Eigen/src/Eigen2Support/SVD.h b/Eigen/src/Eigen2Support/SVD.h
deleted file mode 100644
index 3d03d2288..000000000
--- a/Eigen/src/Eigen2Support/SVD.h
+++ /dev/null
@@ -1,637 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN2_SVD_H
-#define EIGEN2_SVD_H
-
-namespace Eigen {
-
-/** \ingroup SVD_Module
-  * \nonstableyet
-  *
-  * \class SVD
-  *
-  * \brief Standard SVD decomposition of a matrix and associated features
-  *
-  * \param MatrixType the type of the matrix of which we are computing the SVD decomposition
-  *
-  * This class performs a standard SVD decomposition of a real matrix A of size \c M x \c N
-  * with \c M \>= \c N.
-  *
-  *
-  * \sa MatrixBase::SVD()
-  */
-template<typename MatrixType> class SVD
-{
-  private:
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
-
-    enum {
-      PacketSize = internal::packet_traits<Scalar>::size,
-      AlignmentMask = int(PacketSize)-1,
-      MinSize = EIGEN_SIZE_MIN_PREFER_DYNAMIC(MatrixType::RowsAtCompileTime, MatrixType::ColsAtCompileTime)
-    };
-
-    typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> ColVector;
-    typedef Matrix<Scalar, MatrixType::ColsAtCompileTime, 1> RowVector;
-
-    typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, MinSize> MatrixUType;
-    typedef Matrix<Scalar, MatrixType::ColsAtCompileTime, MatrixType::ColsAtCompileTime> MatrixVType;
-    typedef Matrix<Scalar, MinSize, 1> SingularValuesType;
-
-  public:
-
-    SVD() {} // a user who relied on compiler-generated default compiler reported problems with MSVC in 2.0.7
-    
-    SVD(const MatrixType& matrix)
-      : m_matU(matrix.rows(), (std::min)(matrix.rows(), matrix.cols())),
-        m_matV(matrix.cols(),matrix.cols()),
-        m_sigma((std::min)(matrix.rows(),matrix.cols()))
-    {
-      compute(matrix);
-    }
-
-    template<typename OtherDerived, typename ResultType>
-    bool solve(const MatrixBase<OtherDerived> &b, ResultType* result) const;
-
-    const MatrixUType& matrixU() const { return m_matU; }
-    const SingularValuesType& singularValues() const { return m_sigma; }
-    const MatrixVType& matrixV() const { return m_matV; }
-
-    void compute(const MatrixType& matrix);
-    SVD& sort();
-
-    template<typename UnitaryType, typename PositiveType>
-    void computeUnitaryPositive(UnitaryType *unitary, PositiveType *positive) const;
-    template<typename PositiveType, typename UnitaryType>
-    void computePositiveUnitary(PositiveType *positive, UnitaryType *unitary) const;
-    template<typename RotationType, typename ScalingType>
-    void computeRotationScaling(RotationType *unitary, ScalingType *positive) const;
-    template<typename ScalingType, typename RotationType>
-    void computeScalingRotation(ScalingType *positive, RotationType *unitary) const;
-
-  protected:
-    /** \internal */
-    MatrixUType m_matU;
-    /** \internal */
-    MatrixVType m_matV;
-    /** \internal */
-    SingularValuesType m_sigma;
-};
-
-/** Computes / recomputes the SVD decomposition A = U S V^* of \a matrix
-  *
-  * \note this code has been adapted from JAMA (public domain)
-  */
-template<typename MatrixType>
-void SVD<MatrixType>::compute(const MatrixType& matrix)
-{
-  const int m = matrix.rows();
-  const int n = matrix.cols();
-  const int nu = (std::min)(m,n);
-  ei_assert(m>=n && "In Eigen 2.0, SVD only works for MxN matrices with M>=N. Sorry!");
-  ei_assert(m>1 && "In Eigen 2.0, SVD doesn't work on 1x1 matrices");
-
-  m_matU.resize(m, nu);
-  m_matU.setZero();
-  m_sigma.resize((std::min)(m,n));
-  m_matV.resize(n,n);
-
-  RowVector e(n);
-  ColVector work(m);
-  MatrixType matA(matrix);
-  const bool wantu = true;
-  const bool wantv = true;
-  int i=0, j=0, k=0;
-
-  // Reduce A to bidiagonal form, storing the diagonal elements
-  // in s and the super-diagonal elements in e.
-  int nct = (std::min)(m-1,n);
-  int nrt = (std::max)(0,(std::min)(n-2,m));
-  for (k = 0; k < (std::max)(nct,nrt); ++k)
-  {
-    if (k < nct)
-    {
-      // Compute the transformation for the k-th column and
-      // place the k-th diagonal in m_sigma[k].
-      m_sigma[k] = matA.col(k).end(m-k).norm();
-      if (m_sigma[k] != 0.0) // FIXME
-      {
-        if (matA(k,k) < 0.0)
-          m_sigma[k] = -m_sigma[k];
-        matA.col(k).end(m-k) /= m_sigma[k];
-        matA(k,k) += 1.0;
-      }
-      m_sigma[k] = -m_sigma[k];
-    }
-
-    for (j = k+1; j < n; ++j)
-    {
-      if ((k < nct) && (m_sigma[k] != 0.0))
-      {
-        // Apply the transformation.
-        Scalar t = matA.col(k).end(m-k).eigen2_dot(matA.col(j).end(m-k)); // FIXME dot product or cwise prod + .sum() ??
-        t = -t/matA(k,k);
-        matA.col(j).end(m-k) += t * matA.col(k).end(m-k);
-      }
-
-      // Place the k-th row of A into e for the
-      // subsequent calculation of the row transformation.
-      e[j] = matA(k,j);
-    }
-
-    // Place the transformation in U for subsequent back multiplication.
-    if (wantu & (k < nct))
-      m_matU.col(k).end(m-k) = matA.col(k).end(m-k);
-
-    if (k < nrt)
-    {
-      // Compute the k-th row transformation and place the
-      // k-th super-diagonal in e[k].
-      e[k] = e.end(n-k-1).norm();
-      if (e[k] != 0.0)
-      {
-          if (e[k+1] < 0.0)
-            e[k] = -e[k];
-          e.end(n-k-1) /= e[k];
-          e[k+1] += 1.0;
-      }
-      e[k] = -e[k];
-      if ((k+1 < m) & (e[k] != 0.0))
-      {
-        // Apply the transformation.
-        work.end(m-k-1) = matA.corner(BottomRight,m-k-1,n-k-1) * e.end(n-k-1);
-        for (j = k+1; j < n; ++j)
-          matA.col(j).end(m-k-1) += (-e[j]/e[k+1]) * work.end(m-k-1);
-      }
-
-      // Place the transformation in V for subsequent back multiplication.
-      if (wantv)
-        m_matV.col(k).end(n-k-1) = e.end(n-k-1);
-    }
-  }
-
-
-  // Set up the final bidiagonal matrix or order p.
-  int p = (std::min)(n,m+1);
-  if (nct < n)
-    m_sigma[nct] = matA(nct,nct);
-  if (m < p)
-    m_sigma[p-1] = 0.0;
-  if (nrt+1 < p)
-    e[nrt] = matA(nrt,p-1);
-  e[p-1] = 0.0;
-
-  // If required, generate U.
-  if (wantu)
-  {
-    for (j = nct; j < nu; ++j)
-    {
-      m_matU.col(j).setZero();
-      m_matU(j,j) = 1.0;
-    }
-    for (k = nct-1; k >= 0; k--)
-    {
-      if (m_sigma[k] != 0.0)
-      {
-        for (j = k+1; j < nu; ++j)
-        {
-          Scalar t = m_matU.col(k).end(m-k).eigen2_dot(m_matU.col(j).end(m-k)); // FIXME is it really a dot product we want ?
-          t = -t/m_matU(k,k);
-          m_matU.col(j).end(m-k) += t * m_matU.col(k).end(m-k);
-        }
-        m_matU.col(k).end(m-k) = - m_matU.col(k).end(m-k);
-        m_matU(k,k) = Scalar(1) + m_matU(k,k);
-        if (k-1>0)
-          m_matU.col(k).start(k-1).setZero();
-      }
-      else
-      {
-        m_matU.col(k).setZero();
-        m_matU(k,k) = 1.0;
-      }
-    }
-  }
-
-  // If required, generate V.
-  if (wantv)
-  {
-    for (k = n-1; k >= 0; k--)
-    {
-      if ((k < nrt) & (e[k] != 0.0))
-      {
-        for (j = k+1; j < nu; ++j)
-        {
-          Scalar t = m_matV.col(k).end(n-k-1).eigen2_dot(m_matV.col(j).end(n-k-1)); // FIXME is it really a dot product we want ?
-          t = -t/m_matV(k+1,k);
-          m_matV.col(j).end(n-k-1) += t * m_matV.col(k).end(n-k-1);
-        }
-      }
-      m_matV.col(k).setZero();
-      m_matV(k,k) = 1.0;
-    }
-  }
-
-  // Main iteration loop for the singular values.
-  int pp = p-1;
-  int iter = 0;
-  Scalar eps = ei_pow(Scalar(2),ei_is_same_type<Scalar,float>::ret ? Scalar(-23) : Scalar(-52));
-  while (p > 0)
-  {
-    int k=0;
-    int kase=0;
-
-    // Here is where a test for too many iterations would go.
-
-    // This section of the program inspects for
-    // negligible elements in the s and e arrays.  On
-    // completion the variables kase and k are set as follows.
-
-    // kase = 1     if s(p) and e[k-1] are negligible and k<p
-    // kase = 2     if s(k) is negligible and k<p
-    // kase = 3     if e[k-1] is negligible, k<p, and
-    //              s(k), ..., s(p) are not negligible (qr step).
-    // kase = 4     if e(p-1) is negligible (convergence).
-
-    for (k = p-2; k >= -1; --k)
-    {
-      if (k == -1)
-          break;
-      if (ei_abs(e[k]) <= eps*(ei_abs(m_sigma[k]) + ei_abs(m_sigma[k+1])))
-      {
-          e[k] = 0.0;
-          break;
-      }
-    }
-    if (k == p-2)
-    {
-      kase = 4;
-    }
-    else
-    {
-      int ks;
-      for (ks = p-1; ks >= k; --ks)
-      {
-        if (ks == k)
-          break;
-        Scalar t = (ks != p ? ei_abs(e[ks]) : Scalar(0)) + (ks != k+1 ? ei_abs(e[ks-1]) : Scalar(0));
-        if (ei_abs(m_sigma[ks]) <= eps*t)
-        {
-          m_sigma[ks] = 0.0;
-          break;
-        }
-      }
-      if (ks == k)
-      {
-        kase = 3;
-      }
-      else if (ks == p-1)
-      {
-        kase = 1;
-      }
-      else
-      {
-        kase = 2;
-        k = ks;
-      }
-    }
-    ++k;
-
-    // Perform the task indicated by kase.
-    switch (kase)
-    {
-
-      // Deflate negligible s(p).
-      case 1:
-      {
-        Scalar f(e[p-2]);
-        e[p-2] = 0.0;
-        for (j = p-2; j >= k; --j)
-        {
-          Scalar t(numext::hypot(m_sigma[j],f));
-          Scalar cs(m_sigma[j]/t);
-          Scalar sn(f/t);
-          m_sigma[j] = t;
-          if (j != k)
-          {
-            f = -sn*e[j-1];
-            e[j-1] = cs*e[j-1];
-          }
-          if (wantv)
-          {
-            for (i = 0; i < n; ++i)
-            {
-              t = cs*m_matV(i,j) + sn*m_matV(i,p-1);
-              m_matV(i,p-1) = -sn*m_matV(i,j) + cs*m_matV(i,p-1);
-              m_matV(i,j) = t;
-            }
-          }
-        }
-      }
-      break;
-
-      // Split at negligible s(k).
-      case 2:
-      {
-        Scalar f(e[k-1]);
-        e[k-1] = 0.0;
-        for (j = k; j < p; ++j)
-        {
-          Scalar t(numext::hypot(m_sigma[j],f));
-          Scalar cs( m_sigma[j]/t);
-          Scalar sn(f/t);
-          m_sigma[j] = t;
-          f = -sn*e[j];
-          e[j] = cs*e[j];
-          if (wantu)
-          {
-            for (i = 0; i < m; ++i)
-            {
-              t = cs*m_matU(i,j) + sn*m_matU(i,k-1);
-              m_matU(i,k-1) = -sn*m_matU(i,j) + cs*m_matU(i,k-1);
-              m_matU(i,j) = t;
-            }
-          }
-        }
-      }
-      break;
-
-      // Perform one qr step.
-      case 3:
-      {
-        // Calculate the shift.
-        Scalar scale = (std::max)((std::max)((std::max)((std::max)(
-                        ei_abs(m_sigma[p-1]),ei_abs(m_sigma[p-2])),ei_abs(e[p-2])),
-                        ei_abs(m_sigma[k])),ei_abs(e[k]));
-        Scalar sp = m_sigma[p-1]/scale;
-        Scalar spm1 = m_sigma[p-2]/scale;
-        Scalar epm1 = e[p-2]/scale;
-        Scalar sk = m_sigma[k]/scale;
-        Scalar ek = e[k]/scale;
-        Scalar b = ((spm1 + sp)*(spm1 - sp) + epm1*epm1)/Scalar(2);
-        Scalar c = (sp*epm1)*(sp*epm1);
-        Scalar shift(0);
-        if ((b != 0.0) || (c != 0.0))
-        {
-          shift = ei_sqrt(b*b + c);
-          if (b < 0.0)
-            shift = -shift;
-          shift = c/(b + shift);
-        }
-        Scalar f = (sk + sp)*(sk - sp) + shift;
-        Scalar g = sk*ek;
-
-        // Chase zeros.
-
-        for (j = k; j < p-1; ++j)
-        {
-          Scalar t = numext::hypot(f,g);
-          Scalar cs = f/t;
-          Scalar sn = g/t;
-          if (j != k)
-            e[j-1] = t;
-          f = cs*m_sigma[j] + sn*e[j];
-          e[j] = cs*e[j] - sn*m_sigma[j];
-          g = sn*m_sigma[j+1];
-          m_sigma[j+1] = cs*m_sigma[j+1];
-          if (wantv)
-          {
-            for (i = 0; i < n; ++i)
-            {
-              t = cs*m_matV(i,j) + sn*m_matV(i,j+1);
-              m_matV(i,j+1) = -sn*m_matV(i,j) + cs*m_matV(i,j+1);
-              m_matV(i,j) = t;
-            }
-          }
-          t = numext::hypot(f,g);
-          cs = f/t;
-          sn = g/t;
-          m_sigma[j] = t;
-          f = cs*e[j] + sn*m_sigma[j+1];
-          m_sigma[j+1] = -sn*e[j] + cs*m_sigma[j+1];
-          g = sn*e[j+1];
-          e[j+1] = cs*e[j+1];
-          if (wantu && (j < m-1))
-          {
-            for (i = 0; i < m; ++i)
-            {
-              t = cs*m_matU(i,j) + sn*m_matU(i,j+1);
-              m_matU(i,j+1) = -sn*m_matU(i,j) + cs*m_matU(i,j+1);
-              m_matU(i,j) = t;
-            }
-          }
-        }
-        e[p-2] = f;
-        iter = iter + 1;
-      }
-      break;
-
-      // Convergence.
-      case 4:
-      {
-        // Make the singular values positive.
-        if (m_sigma[k] <= 0.0)
-        {
-          m_sigma[k] = m_sigma[k] < Scalar(0) ? -m_sigma[k] : Scalar(0);
-          if (wantv)
-            m_matV.col(k).start(pp+1) = -m_matV.col(k).start(pp+1);
-        }
-
-        // Order the singular values.
-        while (k < pp)
-        {
-          if (m_sigma[k] >= m_sigma[k+1])
-            break;
-          Scalar t = m_sigma[k];
-          m_sigma[k] = m_sigma[k+1];
-          m_sigma[k+1] = t;
-          if (wantv && (k < n-1))
-            m_matV.col(k).swap(m_matV.col(k+1));
-          if (wantu && (k < m-1))
-            m_matU.col(k).swap(m_matU.col(k+1));
-          ++k;
-        }
-        iter = 0;
-        p--;
-      }
-      break;
-    } // end big switch
-  } // end iterations
-}
-
-template<typename MatrixType>
-SVD<MatrixType>& SVD<MatrixType>::sort()
-{
-  int mu = m_matU.rows();
-  int mv = m_matV.rows();
-  int n  = m_matU.cols();
-
-  for (int i=0; i<n; ++i)
-  {
-    int  k = i;
-    Scalar p = m_sigma.coeff(i);
-
-    for (int j=i+1; j<n; ++j)
-    {
-      if (m_sigma.coeff(j) > p)
-      {
-        k = j;
-        p = m_sigma.coeff(j);
-      }
-    }
-    if (k != i)
-    {
-      m_sigma.coeffRef(k) = m_sigma.coeff(i);  // i.e.
-      m_sigma.coeffRef(i) = p;                 // swaps the i-th and the k-th elements
-
-      int j = mu;
-      for(int s=0; j!=0; ++s, --j)
-        std::swap(m_matU.coeffRef(s,i), m_matU.coeffRef(s,k));
-
-      j = mv;
-      for (int s=0; j!=0; ++s, --j)
-        std::swap(m_matV.coeffRef(s,i), m_matV.coeffRef(s,k));
-    }
-  }
-  return *this;
-}
-
-/** \returns the solution of \f$ A x = b \f$ using the current SVD decomposition of A.
-  * The parts of the solution corresponding to zero singular values are ignored.
-  *
-  * \sa MatrixBase::svd(), LU::solve(), LLT::solve()
-  */
-template<typename MatrixType>
-template<typename OtherDerived, typename ResultType>
-bool SVD<MatrixType>::solve(const MatrixBase<OtherDerived> &b, ResultType* result) const
-{
-  ei_assert(b.rows() == m_matU.rows());
-
-  Scalar maxVal = m_sigma.cwise().abs().maxCoeff();
-  for (int j=0; j<b.cols(); ++j)
-  {
-    Matrix<Scalar,MatrixUType::RowsAtCompileTime,1> aux = m_matU.transpose() * b.col(j);
-
-    for (int i = 0; i <m_matU.cols(); ++i)
-    {
-      Scalar si = m_sigma.coeff(i);
-      if (ei_isMuchSmallerThan(ei_abs(si),maxVal))
-        aux.coeffRef(i) = 0;
-      else
-        aux.coeffRef(i) /= si;
-    }
-
-    result->col(j) = m_matV * aux;
-  }
-  return true;
-}
-
-/** Computes the polar decomposition of the matrix, as a product unitary x positive.
-  *
-  * If either pointer is zero, the corresponding computation is skipped.
-  *
-  * Only for square matrices.
-  *
-  * \sa computePositiveUnitary(), computeRotationScaling()
-  */
-template<typename MatrixType>
-template<typename UnitaryType, typename PositiveType>
-void SVD<MatrixType>::computeUnitaryPositive(UnitaryType *unitary,
-                                             PositiveType *positive) const
-{
-  ei_assert(m_matU.cols() == m_matV.cols() && "Polar decomposition is only for square matrices");
-  if(unitary) *unitary = m_matU * m_matV.adjoint();
-  if(positive) *positive = m_matV * m_sigma.asDiagonal() * m_matV.adjoint();
-}
-
-/** Computes the polar decomposition of the matrix, as a product positive x unitary.
-  *
-  * If either pointer is zero, the corresponding computation is skipped.
-  *
-  * Only for square matrices.
-  *
-  * \sa computeUnitaryPositive(), computeRotationScaling()
-  */
-template<typename MatrixType>
-template<typename UnitaryType, typename PositiveType>
-void SVD<MatrixType>::computePositiveUnitary(UnitaryType *positive,
-                                             PositiveType *unitary) const
-{
-  ei_assert(m_matU.rows() == m_matV.rows() && "Polar decomposition is only for square matrices");
-  if(unitary) *unitary = m_matU * m_matV.adjoint();
-  if(positive) *positive = m_matU * m_sigma.asDiagonal() * m_matU.adjoint();
-}
-
-/** decomposes the matrix as a product rotation x scaling, the scaling being
-  * not necessarily positive.
-  *
-  * If either pointer is zero, the corresponding computation is skipped.
-  *
-  * This method requires the Geometry module.
-  *
-  * \sa computeScalingRotation(), computeUnitaryPositive()
-  */
-template<typename MatrixType>
-template<typename RotationType, typename ScalingType>
-void SVD<MatrixType>::computeRotationScaling(RotationType *rotation, ScalingType *scaling) const
-{
-  ei_assert(m_matU.rows() == m_matV.rows() && "Polar decomposition is only for square matrices");
-  Scalar x = (m_matU * m_matV.adjoint()).determinant(); // so x has absolute value 1
-  Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> sv(m_sigma);
-  sv.coeffRef(0) *= x;
-  if(scaling) scaling->lazyAssign(m_matV * sv.asDiagonal() * m_matV.adjoint());
-  if(rotation)
-  {
-    MatrixType m(m_matU);
-    m.col(0) /= x;
-    rotation->lazyAssign(m * m_matV.adjoint());
-  }
-}
-
-/** decomposes the matrix as a product scaling x rotation, the scaling being
-  * not necessarily positive.
-  *
-  * If either pointer is zero, the corresponding computation is skipped.
-  *
-  * This method requires the Geometry module.
-  *
-  * \sa computeRotationScaling(), computeUnitaryPositive()
-  */
-template<typename MatrixType>
-template<typename ScalingType, typename RotationType>
-void SVD<MatrixType>::computeScalingRotation(ScalingType *scaling, RotationType *rotation) const
-{
-  ei_assert(m_matU.rows() == m_matV.rows() && "Polar decomposition is only for square matrices");
-  Scalar x = (m_matU * m_matV.adjoint()).determinant(); // so x has absolute value 1
-  Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> sv(m_sigma);
-  sv.coeffRef(0) *= x;
-  if(scaling) scaling->lazyAssign(m_matU * sv.asDiagonal() * m_matU.adjoint());
-  if(rotation)
-  {
-    MatrixType m(m_matU);
-    m.col(0) /= x;
-    rotation->lazyAssign(m * m_matV.adjoint());
-  }
-}
-
-
-/** \svd_module
-  * \returns the SVD decomposition of \c *this
-  */
-template<typename Derived>
-inline SVD<typename MatrixBase<Derived>::PlainObject>
-MatrixBase<Derived>::svd() const
-{
-  return SVD<PlainObject>(derived());
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN2_SVD_H
diff --git a/Eigen/src/Eigen2Support/TriangularSolver.h b/Eigen/src/Eigen2Support/TriangularSolver.h
deleted file mode 100644
index ebbeb3b49..000000000
--- a/Eigen/src/Eigen2Support/TriangularSolver.h
+++ /dev/null
@@ -1,42 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_TRIANGULAR_SOLVER2_H
-#define EIGEN_TRIANGULAR_SOLVER2_H
-
-namespace Eigen { 
-
-const unsigned int UnitDiagBit = UnitDiag;
-const unsigned int SelfAdjointBit = SelfAdjoint;
-const unsigned int UpperTriangularBit = Upper;
-const unsigned int LowerTriangularBit = Lower;
-
-const unsigned int UpperTriangular = Upper;
-const unsigned int LowerTriangular = Lower;
-const unsigned int UnitUpperTriangular = UnitUpper;
-const unsigned int UnitLowerTriangular = UnitLower;
-
-template<typename ExpressionType, unsigned int Added, unsigned int Removed>
-template<typename OtherDerived>
-typename ExpressionType::PlainObject
-Flagged<ExpressionType,Added,Removed>::solveTriangular(const MatrixBase<OtherDerived>& other) const
-{
-  return m_matrix.template triangularView<Added>().solve(other.derived());
-}
-
-template<typename ExpressionType, unsigned int Added, unsigned int Removed>
-template<typename OtherDerived>
-void Flagged<ExpressionType,Added,Removed>::solveTriangularInPlace(const MatrixBase<OtherDerived>& other) const
-{
-  m_matrix.template triangularView<Added>().solveInPlace(other.derived());
-}
-
-} // end namespace Eigen
-    
-#endif // EIGEN_TRIANGULAR_SOLVER2_H
diff --git a/Eigen/src/Eigen2Support/VectorBlock.h b/Eigen/src/Eigen2Support/VectorBlock.h
deleted file mode 100644
index 71a8080a9..000000000
--- a/Eigen/src/Eigen2Support/VectorBlock.h
+++ /dev/null
@@ -1,94 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
-// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN2_VECTORBLOCK_H
-#define EIGEN2_VECTORBLOCK_H
-
-namespace Eigen { 
-
-/** \deprecated use DenseMase::head(Index) */
-template<typename Derived>
-inline VectorBlock<Derived>
-MatrixBase<Derived>::start(Index size)
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return VectorBlock<Derived>(derived(), 0, size);
-}
-
-/** \deprecated use DenseMase::head(Index) */
-template<typename Derived>
-inline const VectorBlock<const Derived>
-MatrixBase<Derived>::start(Index size) const
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return VectorBlock<const Derived>(derived(), 0, size);
-}
-
-/** \deprecated use DenseMase::tail(Index) */
-template<typename Derived>
-inline VectorBlock<Derived>
-MatrixBase<Derived>::end(Index size)
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return VectorBlock<Derived>(derived(), this->size() - size, size);
-}
-
-/** \deprecated use DenseMase::tail(Index) */
-template<typename Derived>
-inline const VectorBlock<const Derived>
-MatrixBase<Derived>::end(Index size) const
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return VectorBlock<const Derived>(derived(), this->size() - size, size);
-}
-
-/** \deprecated use DenseMase::head() */
-template<typename Derived>
-template<int Size>
-inline VectorBlock<Derived,Size>
-MatrixBase<Derived>::start()
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return VectorBlock<Derived,Size>(derived(), 0);
-}
-
-/** \deprecated use DenseMase::head() */
-template<typename Derived>
-template<int Size>
-inline const VectorBlock<const Derived,Size>
-MatrixBase<Derived>::start() const
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return VectorBlock<const Derived,Size>(derived(), 0);
-}
-
-/** \deprecated use DenseMase::tail() */
-template<typename Derived>
-template<int Size>
-inline VectorBlock<Derived,Size>
-MatrixBase<Derived>::end()
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return VectorBlock<Derived, Size>(derived(), size() - Size);
-}
-
-/** \deprecated use DenseMase::tail() */
-template<typename Derived>
-template<int Size>
-inline const VectorBlock<const Derived,Size>
-MatrixBase<Derived>::end() const
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return VectorBlock<const Derived, Size>(derived(), size() - Size);
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN2_VECTORBLOCK_H
diff --git a/Eigen/src/Eigenvalues/CMakeLists.txt b/Eigen/src/Eigenvalues/CMakeLists.txt
deleted file mode 100644
index 193e02685..000000000
--- a/Eigen/src/Eigenvalues/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_EIGENVALUES_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_EIGENVALUES_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Eigenvalues COMPONENT Devel
-  )
diff --git a/Eigen/src/Eigenvalues/ComplexEigenSolver.h b/Eigen/src/Eigenvalues/ComplexEigenSolver.h
index 417c72944..dc5fae06a 100644
--- a/Eigen/src/Eigenvalues/ComplexEigenSolver.h
+++ b/Eigen/src/Eigenvalues/ComplexEigenSolver.h
@@ -60,7 +60,7 @@ template<typename _MatrixType> class ComplexEigenSolver
     /** \brief Scalar type for matrices of type #MatrixType. */
     typedef typename MatrixType::Scalar Scalar;
     typedef typename NumTraits<Scalar>::Real RealScalar;
-    typedef typename MatrixType::Index Index;
+    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
 
     /** \brief Complex scalar type for #MatrixType.
       *
@@ -104,7 +104,7 @@ template<typename _MatrixType> class ComplexEigenSolver
       * according to the specified problem \a size.
       * \sa ComplexEigenSolver()
       */
-    ComplexEigenSolver(Index size)
+    explicit ComplexEigenSolver(Index size)
             : m_eivec(size, size),
               m_eivalues(size),
               m_schur(size),
@@ -122,7 +122,8 @@ template<typename _MatrixType> class ComplexEigenSolver
       *
       * This constructor calls compute() to compute the eigendecomposition.
       */
-      ComplexEigenSolver(const MatrixType& matrix, bool computeEigenvectors = true)
+    template<typename InputType>
+    explicit ComplexEigenSolver(const EigenBase<InputType>& matrix, bool computeEigenvectors = true)
             : m_eivec(matrix.rows(),matrix.cols()),
               m_eivalues(matrix.cols()),
               m_schur(matrix.rows()),
@@ -130,7 +131,7 @@ template<typename _MatrixType> class ComplexEigenSolver
               m_eigenvectorsOk(false),
               m_matX(matrix.rows(),matrix.cols())
     {
-      compute(matrix, computeEigenvectors);
+      compute(matrix.derived(), computeEigenvectors);
     }
 
     /** \brief Returns the eigenvectors of given matrix.
@@ -208,7 +209,8 @@ template<typename _MatrixType> class ComplexEigenSolver
       * Example: \include ComplexEigenSolver_compute.cpp
       * Output: \verbinclude ComplexEigenSolver_compute.out
       */
-    ComplexEigenSolver& compute(const MatrixType& matrix, bool computeEigenvectors = true);
+    template<typename InputType>
+    ComplexEigenSolver& compute(const EigenBase<InputType>& matrix, bool computeEigenvectors = true);
 
     /** \brief Reports whether previous computation was successful.
       *
@@ -248,14 +250,15 @@ template<typename _MatrixType> class ComplexEigenSolver
     EigenvectorType m_matX;
 
   private:
-    void doComputeEigenvectors(const RealScalar& matrixnorm);
+    void doComputeEigenvectors(RealScalar matrixnorm);
     void sortEigenvalues(bool computeEigenvectors);
 };
 
 
 template<typename MatrixType>
+template<typename InputType>
 ComplexEigenSolver<MatrixType>& 
-ComplexEigenSolver<MatrixType>::compute(const MatrixType& matrix, bool computeEigenvectors)
+ComplexEigenSolver<MatrixType>::compute(const EigenBase<InputType>& matrix, bool computeEigenvectors)
 {
   check_template_parameters();
   
@@ -264,13 +267,13 @@ ComplexEigenSolver<MatrixType>::compute(const MatrixType& matrix, bool computeEi
 
   // Do a complex Schur decomposition, A = U T U^*
   // The eigenvalues are on the diagonal of T.
-  m_schur.compute(matrix, computeEigenvectors);
+  m_schur.compute(matrix.derived(), computeEigenvectors);
 
   if(m_schur.info() == Success)
   {
     m_eivalues = m_schur.matrixT().diagonal();
     if(computeEigenvectors)
-      doComputeEigenvectors(matrix.norm());
+      doComputeEigenvectors(m_schur.matrixT().norm());
     sortEigenvalues(computeEigenvectors);
   }
 
@@ -281,10 +284,12 @@ ComplexEigenSolver<MatrixType>::compute(const MatrixType& matrix, bool computeEi
 
 
 template<typename MatrixType>
-void ComplexEigenSolver<MatrixType>::doComputeEigenvectors(const RealScalar& matrixnorm)
+void ComplexEigenSolver<MatrixType>::doComputeEigenvectors(RealScalar matrixnorm)
 {
   const Index n = m_eivalues.size();
 
+  matrixnorm = numext::maxi(matrixnorm,(std::numeric_limits<RealScalar>::min)());
+
   // Compute X such that T = X D X^(-1), where D is the diagonal of T.
   // The matrix X is unit triangular.
   m_matX = EigenvectorType::Zero(n, n);
diff --git a/Eigen/src/Eigenvalues/ComplexSchur.h b/Eigen/src/Eigenvalues/ComplexSchur.h
index 89e6cade3..7f38919f7 100644
--- a/Eigen/src/Eigenvalues/ComplexSchur.h
+++ b/Eigen/src/Eigenvalues/ComplexSchur.h
@@ -63,7 +63,7 @@ template<typename _MatrixType> class ComplexSchur
     /** \brief Scalar type for matrices of type \p _MatrixType. */
     typedef typename MatrixType::Scalar Scalar;
     typedef typename NumTraits<Scalar>::Real RealScalar;
-    typedef typename MatrixType::Index Index;
+    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
 
     /** \brief Complex scalar type for \p _MatrixType. 
       *
@@ -91,7 +91,7 @@ template<typename _MatrixType> class ComplexSchur
       *
       * \sa compute() for an example.
       */
-    ComplexSchur(Index size = RowsAtCompileTime==Dynamic ? 1 : RowsAtCompileTime)
+    explicit ComplexSchur(Index size = RowsAtCompileTime==Dynamic ? 1 : RowsAtCompileTime)
       : m_matT(size,size),
         m_matU(size,size),
         m_hess(size),
@@ -109,7 +109,8 @@ template<typename _MatrixType> class ComplexSchur
       *
       * \sa matrixT() and matrixU() for examples.
       */
-    ComplexSchur(const MatrixType& matrix, bool computeU = true)
+    template<typename InputType>
+    explicit ComplexSchur(const EigenBase<InputType>& matrix, bool computeU = true)
       : m_matT(matrix.rows(),matrix.cols()),
         m_matU(matrix.rows(),matrix.cols()),
         m_hess(matrix.rows()),
@@ -117,7 +118,7 @@ template<typename _MatrixType> class ComplexSchur
         m_matUisUptodate(false),
         m_maxIters(-1)
     {
-      compute(matrix, computeU);
+      compute(matrix.derived(), computeU);
     }
 
     /** \brief Returns the unitary matrix in the Schur decomposition. 
@@ -186,7 +187,8 @@ template<typename _MatrixType> class ComplexSchur
       *
       * \sa compute(const MatrixType&, bool, Index)
       */
-    ComplexSchur& compute(const MatrixType& matrix, bool computeU = true);
+    template<typename InputType>
+    ComplexSchur& compute(const EigenBase<InputType>& matrix, bool computeU = true);
     
     /** \brief Compute Schur decomposition from a given Hessenberg matrix
      *  \param[in] matrixH Matrix in Hessenberg form H
@@ -313,14 +315,15 @@ typename ComplexSchur<MatrixType>::ComplexScalar ComplexSchur<MatrixType>::compu
 
 
 template<typename MatrixType>
-ComplexSchur<MatrixType>& ComplexSchur<MatrixType>::compute(const MatrixType& matrix, bool computeU)
+template<typename InputType>
+ComplexSchur<MatrixType>& ComplexSchur<MatrixType>::compute(const EigenBase<InputType>& matrix, bool computeU)
 {
   m_matUisUptodate = false;
   eigen_assert(matrix.cols() == matrix.rows());
 
   if(matrix.cols() == 1)
   {
-    m_matT = matrix.template cast<ComplexScalar>();
+    m_matT = matrix.derived().template cast<ComplexScalar>();
     if(computeU)  m_matU = ComplexMatrixType::Identity(1,1);
     m_info = Success;
     m_isInitialized = true;
@@ -328,7 +331,7 @@ ComplexSchur<MatrixType>& ComplexSchur<MatrixType>::compute(const MatrixType& ma
     return *this;
   }
 
-  internal::complex_schur_reduce_to_hessenberg<MatrixType, NumTraits<Scalar>::IsComplex>::run(*this, matrix, computeU);
+  internal::complex_schur_reduce_to_hessenberg<MatrixType, NumTraits<Scalar>::IsComplex>::run(*this, matrix.derived(), computeU);
   computeFromHessenberg(m_matT, m_matU, computeU);
   return *this;
 }
diff --git a/Eigen/src/Eigenvalues/ComplexSchur_MKL.h b/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h
index 91496ae5b..4980a3ede 100644
--- a/Eigen/src/Eigenvalues/ComplexSchur_MKL.h
+++ b/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h
@@ -25,27 +25,24 @@
  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
  ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
+ *   Content : Eigen bindings to LAPACKe
  *    Complex Schur needed to complex unsymmetrical eigenvalues/eigenvectors.
  ********************************************************************************
 */
 
-#ifndef EIGEN_COMPLEX_SCHUR_MKL_H
-#define EIGEN_COMPLEX_SCHUR_MKL_H
-
-#include "Eigen/src/Core/util/MKL_support.h"
+#ifndef EIGEN_COMPLEX_SCHUR_LAPACKE_H
+#define EIGEN_COMPLEX_SCHUR_LAPACKE_H
 
 namespace Eigen { 
 
-/** \internal Specialization for the data types supported by MKL */
+/** \internal Specialization for the data types supported by LAPACKe */
 
-#define EIGEN_MKL_SCHUR_COMPLEX(EIGTYPE, MKLTYPE, MKLPREFIX, MKLPREFIX_U, EIGCOLROW, MKLCOLROW) \
-template<> inline \
+#define EIGEN_LAPACKE_SCHUR_COMPLEX(EIGTYPE, LAPACKE_TYPE, LAPACKE_PREFIX, LAPACKE_PREFIX_U, EIGCOLROW, LAPACKE_COLROW) \
+template<> template<typename InputType> inline \
 ComplexSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >& \
-ComplexSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(const Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW>& matrix, bool computeU) \
+ComplexSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(const EigenBase<InputType>& matrix, bool computeU) \
 { \
   typedef Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> MatrixType; \
-  typedef MatrixType::Scalar Scalar; \
   typedef MatrixType::RealScalar RealScalar; \
   typedef std::complex<RealScalar> ComplexScalar; \
 \
@@ -54,25 +51,25 @@ ComplexSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(const Matri
   m_matUisUptodate = false; \
   if(matrix.cols() == 1) \
   { \
-    m_matT = matrix.cast<ComplexScalar>(); \
+    m_matT = matrix.derived().template cast<ComplexScalar>(); \
     if(computeU)  m_matU = ComplexMatrixType::Identity(1,1); \
       m_info = Success; \
       m_isInitialized = true; \
       m_matUisUptodate = computeU; \
       return *this; \
   } \
-  lapack_int n = matrix.cols(), sdim, info; \
-  lapack_int lda = matrix.outerStride(); \
-  lapack_int matrix_order = MKLCOLROW; \
+  lapack_int n = internal::convert_index<lapack_int>(matrix.cols()), sdim, info; \
+  lapack_int matrix_order = LAPACKE_COLROW; \
   char jobvs, sort='N'; \
-  LAPACK_##MKLPREFIX_U##_SELECT1 select = 0; \
+  LAPACK_##LAPACKE_PREFIX_U##_SELECT1 select = 0; \
   jobvs = (computeU) ? 'V' : 'N'; \
   m_matU.resize(n, n); \
-  lapack_int ldvs  = m_matU.outerStride(); \
+  lapack_int ldvs  = internal::convert_index<lapack_int>(m_matU.outerStride()); \
   m_matT = matrix; \
+  lapack_int lda = internal::convert_index<lapack_int>(m_matT.outerStride()); \
   Matrix<EIGTYPE, Dynamic, Dynamic> w; \
   w.resize(n, 1);\
-  info = LAPACKE_##MKLPREFIX##gees( matrix_order, jobvs, sort, select, n, (MKLTYPE*)m_matT.data(), lda, &sdim, (MKLTYPE*)w.data(), (MKLTYPE*)m_matU.data(), ldvs ); \
+  info = LAPACKE_##LAPACKE_PREFIX##gees( matrix_order, jobvs, sort, select, n, (LAPACKE_TYPE*)m_matT.data(), lda, &sdim, (LAPACKE_TYPE*)w.data(), (LAPACKE_TYPE*)m_matU.data(), ldvs ); \
   if(info == 0) \
     m_info = Success; \
   else \
@@ -84,11 +81,11 @@ ComplexSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(const Matri
 \
 }
 
-EIGEN_MKL_SCHUR_COMPLEX(dcomplex, MKL_Complex16, z, Z, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_SCHUR_COMPLEX(scomplex, MKL_Complex8,  c, C, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_SCHUR_COMPLEX(dcomplex, MKL_Complex16, z, Z, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_MKL_SCHUR_COMPLEX(scomplex, MKL_Complex8,  c, C, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_SCHUR_COMPLEX(dcomplex, lapack_complex_double, z, Z, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_SCHUR_COMPLEX(scomplex, lapack_complex_float,  c, C, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_SCHUR_COMPLEX(dcomplex, lapack_complex_double, z, Z, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_SCHUR_COMPLEX(scomplex, lapack_complex_float,  c, C, RowMajor, LAPACK_ROW_MAJOR)
 
 } // end namespace Eigen
 
-#endif // EIGEN_COMPLEX_SCHUR_MKL_H
+#endif // EIGEN_COMPLEX_SCHUR_LAPACKE_H
diff --git a/Eigen/src/Eigenvalues/EigenSolver.h b/Eigen/src/Eigenvalues/EigenSolver.h
index 20c59a7a2..f205b185d 100644
--- a/Eigen/src/Eigenvalues/EigenSolver.h
+++ b/Eigen/src/Eigenvalues/EigenSolver.h
@@ -79,7 +79,7 @@ template<typename _MatrixType> class EigenSolver
     /** \brief Scalar type for matrices of type #MatrixType. */
     typedef typename MatrixType::Scalar Scalar;
     typedef typename NumTraits<Scalar>::Real RealScalar;
-    typedef typename MatrixType::Index Index;
+    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
 
     /** \brief Complex scalar type for #MatrixType. 
       *
@@ -110,7 +110,7 @@ template<typename _MatrixType> class EigenSolver
       *
       * \sa compute() for an example.
       */
- EigenSolver() : m_eivec(), m_eivalues(), m_isInitialized(false), m_realSchur(), m_matT(), m_tmp() {}
+    EigenSolver() : m_eivec(), m_eivalues(), m_isInitialized(false), m_realSchur(), m_matT(), m_tmp() {}
 
     /** \brief Default constructor with memory preallocation
       *
@@ -118,7 +118,7 @@ template<typename _MatrixType> class EigenSolver
       * according to the specified problem \a size.
       * \sa EigenSolver()
       */
-    EigenSolver(Index size)
+    explicit EigenSolver(Index size)
       : m_eivec(size, size),
         m_eivalues(size),
         m_isInitialized(false),
@@ -143,7 +143,8 @@ template<typename _MatrixType> class EigenSolver
       *
       * \sa compute()
       */
-    EigenSolver(const MatrixType& matrix, bool computeEigenvectors = true)
+    template<typename InputType>
+    explicit EigenSolver(const EigenBase<InputType>& matrix, bool computeEigenvectors = true)
       : m_eivec(matrix.rows(), matrix.cols()),
         m_eivalues(matrix.cols()),
         m_isInitialized(false),
@@ -152,7 +153,7 @@ template<typename _MatrixType> class EigenSolver
         m_matT(matrix.rows(), matrix.cols()), 
         m_tmp(matrix.cols())
     {
-      compute(matrix, computeEigenvectors);
+      compute(matrix.derived(), computeEigenvectors);
     }
 
     /** \brief Returns the eigenvectors of given matrix. 
@@ -273,12 +274,14 @@ template<typename _MatrixType> class EigenSolver
       * Example: \include EigenSolver_compute.cpp
       * Output: \verbinclude EigenSolver_compute.out
       */
-    EigenSolver& compute(const MatrixType& matrix, bool computeEigenvectors = true);
+    template<typename InputType>
+    EigenSolver& compute(const EigenBase<InputType>& matrix, bool computeEigenvectors = true);
 
+    /** \returns NumericalIssue if the input contains INF or NaN values or overflow occured. Returns Success otherwise. */
     ComputationInfo info() const
     {
       eigen_assert(m_isInitialized && "EigenSolver is not initialized.");
-      return m_realSchur.info();
+      return m_info;
     }
 
     /** \brief Sets the maximum number of iterations allowed. */
@@ -309,6 +312,7 @@ template<typename _MatrixType> class EigenSolver
     EigenvalueType m_eivalues;
     bool m_isInitialized;
     bool m_eigenvectorsOk;
+    ComputationInfo m_info;
     RealSchur<MatrixType> m_realSchur;
     MatrixType m_matT;
 
@@ -320,11 +324,12 @@ template<typename MatrixType>
 MatrixType EigenSolver<MatrixType>::pseudoEigenvalueMatrix() const
 {
   eigen_assert(m_isInitialized && "EigenSolver is not initialized.");
+  const RealScalar precision = RealScalar(2)*NumTraits<RealScalar>::epsilon();
   Index n = m_eivalues.rows();
   MatrixType matD = MatrixType::Zero(n,n);
   for (Index i=0; i<n; ++i)
   {
-    if (internal::isMuchSmallerThan(numext::imag(m_eivalues.coeff(i)), numext::real(m_eivalues.coeff(i))))
+    if (internal::isMuchSmallerThan(numext::imag(m_eivalues.coeff(i)), numext::real(m_eivalues.coeff(i)), precision))
       matD.coeffRef(i,i) = numext::real(m_eivalues.coeff(i));
     else
     {
@@ -341,11 +346,12 @@ typename EigenSolver<MatrixType>::EigenvectorsType EigenSolver<MatrixType>::eige
 {
   eigen_assert(m_isInitialized && "EigenSolver is not initialized.");
   eigen_assert(m_eigenvectorsOk && "The eigenvectors have not been computed together with the eigenvalues.");
+  const RealScalar precision = RealScalar(2)*NumTraits<RealScalar>::epsilon();
   Index n = m_eivec.cols();
   EigenvectorsType matV(n,n);
   for (Index j=0; j<n; ++j)
   {
-    if (internal::isMuchSmallerThan(numext::imag(m_eivalues.coeff(j)), numext::real(m_eivalues.coeff(j))) || j+1==n)
+    if (internal::isMuchSmallerThan(numext::imag(m_eivalues.coeff(j)), numext::real(m_eivalues.coeff(j)), precision) || j+1==n)
     {
       // we have a real eigen value
       matV.col(j) = m_eivec.col(j).template cast<ComplexScalar>();
@@ -368,19 +374,23 @@ typename EigenSolver<MatrixType>::EigenvectorsType EigenSolver<MatrixType>::eige
 }
 
 template<typename MatrixType>
+template<typename InputType>
 EigenSolver<MatrixType>& 
-EigenSolver<MatrixType>::compute(const MatrixType& matrix, bool computeEigenvectors)
+EigenSolver<MatrixType>::compute(const EigenBase<InputType>& matrix, bool computeEigenvectors)
 {
   check_template_parameters();
   
   using std::sqrt;
   using std::abs;
+  using numext::isfinite;
   eigen_assert(matrix.cols() == matrix.rows());
 
   // Reduce to real Schur form.
-  m_realSchur.compute(matrix, computeEigenvectors);
+  m_realSchur.compute(matrix.derived(), computeEigenvectors);
+  
+  m_info = m_realSchur.info();
 
-  if (m_realSchur.info() == Success)
+  if (m_info == Success)
   {
     m_matT = m_realSchur.matrixT();
     if (computeEigenvectors)
@@ -394,14 +404,40 @@ EigenSolver<MatrixType>::compute(const MatrixType& matrix, bool computeEigenvect
       if (i == matrix.cols() - 1 || m_matT.coeff(i+1, i) == Scalar(0)) 
       {
         m_eivalues.coeffRef(i) = m_matT.coeff(i, i);
+        if(!(isfinite)(m_eivalues.coeffRef(i)))
+        {
+          m_isInitialized = true;
+          m_eigenvectorsOk = false;
+          m_info = NumericalIssue;
+          return *this;
+        }
         ++i;
       }
       else
       {
         Scalar p = Scalar(0.5) * (m_matT.coeff(i, i) - m_matT.coeff(i+1, i+1));
-        Scalar z = sqrt(abs(p * p + m_matT.coeff(i+1, i) * m_matT.coeff(i, i+1)));
+        Scalar z;
+        // Compute z = sqrt(abs(p * p + m_matT.coeff(i+1, i) * m_matT.coeff(i, i+1)));
+        // without overflow
+        {
+          Scalar t0 = m_matT.coeff(i+1, i);
+          Scalar t1 = m_matT.coeff(i, i+1);
+          Scalar maxval = numext::maxi<Scalar>(abs(p),numext::maxi<Scalar>(abs(t0),abs(t1)));
+          t0 /= maxval;
+          t1 /= maxval;
+          Scalar p0 = p/maxval;
+          z = maxval * sqrt(abs(p0 * p0 + t0 * t1));
+        }
+        
         m_eivalues.coeffRef(i)   = ComplexScalar(m_matT.coeff(i+1, i+1) + p, z);
         m_eivalues.coeffRef(i+1) = ComplexScalar(m_matT.coeff(i+1, i+1) + p, -z);
+        if(!((isfinite)(m_eivalues.coeffRef(i)) && (isfinite)(m_eivalues.coeffRef(i+1))))
+        {
+          m_isInitialized = true;
+          m_eigenvectorsOk = false;
+          m_info = NumericalIssue;
+          return *this;
+        }
         i += 2;
       }
     }
@@ -417,26 +453,6 @@ EigenSolver<MatrixType>::compute(const MatrixType& matrix, bool computeEigenvect
   return *this;
 }
 
-// Complex scalar division.
-template<typename Scalar>
-std::complex<Scalar> cdiv(const Scalar& xr, const Scalar& xi, const Scalar& yr, const Scalar& yi)
-{
-  using std::abs;
-  Scalar r,d;
-  if (abs(yr) > abs(yi))
-  {
-      r = yi/yr;
-      d = yr + r*yi;
-      return std::complex<Scalar>((xr + r*xi)/d, (xi - r*xr)/d);
-  }
-  else
-  {
-      r = yr/yi;
-      d = yi + r*yr;
-      return std::complex<Scalar>((r*xr + xi)/d, (r*xi - xr)/d);
-  }
-}
-
 
 template<typename MatrixType>
 void EigenSolver<MatrixType>::doComputeEigenvectors()
@@ -453,7 +469,7 @@ void EigenSolver<MatrixType>::doComputeEigenvectors()
   }
   
   // Backsubstitute to find vectors of upper triangular form
-  if (norm == 0.0)
+  if (norm == Scalar(0))
   {
     return;
   }
@@ -469,13 +485,13 @@ void EigenSolver<MatrixType>::doComputeEigenvectors()
       Scalar lastr(0), lastw(0);
       Index l = n;
 
-      m_matT.coeffRef(n,n) = 1.0;
+      m_matT.coeffRef(n,n) = Scalar(1);
       for (Index i = n-1; i >= 0; i--)
       {
         Scalar w = m_matT.coeff(i,i) - p;
         Scalar r = m_matT.row(i).segment(l,n-l+1).dot(m_matT.col(n).segment(l, n-l+1));
 
-        if (m_eivalues.coeff(i).imag() < 0.0)
+        if (m_eivalues.coeff(i).imag() < Scalar(0))
         {
           lastw = w;
           lastr = r;
@@ -483,9 +499,9 @@ void EigenSolver<MatrixType>::doComputeEigenvectors()
         else
         {
           l = i;
-          if (m_eivalues.coeff(i).imag() == 0.0)
+          if (m_eivalues.coeff(i).imag() == Scalar(0))
           {
-            if (w != 0.0)
+            if (w != Scalar(0))
               m_matT.coeffRef(i,n) = -r / w;
             else
               m_matT.coeffRef(i,n) = -r / (eps * norm);
@@ -523,19 +539,19 @@ void EigenSolver<MatrixType>::doComputeEigenvectors()
       }
       else
       {
-        std::complex<Scalar> cc = cdiv<Scalar>(0.0,-m_matT.coeff(n-1,n),m_matT.coeff(n-1,n-1)-p,q);
+        ComplexScalar cc = ComplexScalar(Scalar(0),-m_matT.coeff(n-1,n)) / ComplexScalar(m_matT.coeff(n-1,n-1)-p,q);
         m_matT.coeffRef(n-1,n-1) = numext::real(cc);
         m_matT.coeffRef(n-1,n) = numext::imag(cc);
       }
-      m_matT.coeffRef(n,n-1) = 0.0;
-      m_matT.coeffRef(n,n) = 1.0;
+      m_matT.coeffRef(n,n-1) = Scalar(0);
+      m_matT.coeffRef(n,n) = Scalar(1);
       for (Index i = n-2; i >= 0; i--)
       {
         Scalar ra = m_matT.row(i).segment(l, n-l+1).dot(m_matT.col(n-1).segment(l, n-l+1));
         Scalar sa = m_matT.row(i).segment(l, n-l+1).dot(m_matT.col(n).segment(l, n-l+1));
         Scalar w = m_matT.coeff(i,i) - p;
 
-        if (m_eivalues.coeff(i).imag() < 0.0)
+        if (m_eivalues.coeff(i).imag() < Scalar(0))
         {
           lastw = w;
           lastra = ra;
@@ -546,7 +562,7 @@ void EigenSolver<MatrixType>::doComputeEigenvectors()
           l = i;
           if (m_eivalues.coeff(i).imag() == RealScalar(0))
           {
-            std::complex<Scalar> cc = cdiv(-ra,-sa,w,q);
+            ComplexScalar cc = ComplexScalar(-ra,-sa) / ComplexScalar(w,q);
             m_matT.coeffRef(i,n-1) = numext::real(cc);
             m_matT.coeffRef(i,n) = numext::imag(cc);
           }
@@ -557,10 +573,10 @@ void EigenSolver<MatrixType>::doComputeEigenvectors()
             Scalar y = m_matT.coeff(i+1,i);
             Scalar vr = (m_eivalues.coeff(i).real() - p) * (m_eivalues.coeff(i).real() - p) + m_eivalues.coeff(i).imag() * m_eivalues.coeff(i).imag() - q * q;
             Scalar vi = (m_eivalues.coeff(i).real() - p) * Scalar(2) * q;
-            if ((vr == 0.0) && (vi == 0.0))
+            if ((vr == Scalar(0)) && (vi == Scalar(0)))
               vr = eps * norm * (abs(w) + abs(q) + abs(x) + abs(y) + abs(lastw));
 
-            std::complex<Scalar> cc = cdiv(x*lastra-lastw*ra+q*sa,x*lastsa-lastw*sa-q*ra,vr,vi);
+            ComplexScalar cc = ComplexScalar(x*lastra-lastw*ra+q*sa,x*lastsa-lastw*sa-q*ra) / ComplexScalar(vr,vi);
             m_matT.coeffRef(i,n-1) = numext::real(cc);
             m_matT.coeffRef(i,n) = numext::imag(cc);
             if (abs(x) > (abs(lastw) + abs(q)))
@@ -570,15 +586,14 @@ void EigenSolver<MatrixType>::doComputeEigenvectors()
             }
             else
             {
-              cc = cdiv(-lastra-y*m_matT.coeff(i,n-1),-lastsa-y*m_matT.coeff(i,n),lastw,q);
+              cc = ComplexScalar(-lastra-y*m_matT.coeff(i,n-1),-lastsa-y*m_matT.coeff(i,n)) / ComplexScalar(lastw,q);
               m_matT.coeffRef(i+1,n-1) = numext::real(cc);
               m_matT.coeffRef(i+1,n) = numext::imag(cc);
             }
           }
 
           // Overflow control
-          using std::max;
-          Scalar t = (max)(abs(m_matT.coeff(i,n-1)),abs(m_matT.coeff(i,n)));
+          Scalar t = numext::maxi<Scalar>(abs(m_matT.coeff(i,n-1)),abs(m_matT.coeff(i,n)));
           if ((eps * t) * t > Scalar(1))
             m_matT.block(i, n-1, size-i, 2) /= t;
 
@@ -590,7 +605,7 @@ void EigenSolver<MatrixType>::doComputeEigenvectors()
     }
     else
     {
-      eigen_assert(0 && "Internal bug in EigenSolver"); // this should not happen
+      eigen_assert(0 && "Internal bug in EigenSolver (INF or NaN has not been detected)"); // this should not happen
     }
   }
 
diff --git a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h
index 956e80d9e..36a91dffc 100644
--- a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h
+++ b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h
@@ -1,8 +1,9 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2012 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2012-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2010,2012 Jitse Niesen <jitse@maths.leeds.ac.uk>
+// Copyright (C) 2016 Tobias Wood <tobias@spinicist.org.uk>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -72,7 +73,7 @@ template<typename _MatrixType> class GeneralizedEigenSolver
     /** \brief Scalar type for matrices of type #MatrixType. */
     typedef typename MatrixType::Scalar Scalar;
     typedef typename NumTraits<Scalar>::Real RealScalar;
-    typedef typename MatrixType::Index Index;
+    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
 
     /** \brief Complex scalar type for #MatrixType. 
       *
@@ -89,7 +90,7 @@ template<typename _MatrixType> class GeneralizedEigenSolver
       */
     typedef Matrix<Scalar, ColsAtCompileTime, 1, Options & ~RowMajor, MaxColsAtCompileTime, 1> VectorType;
 
-    /** \brief Type for vector of complex scalar values eigenvalues as returned by betas().
+    /** \brief Type for vector of complex scalar values eigenvalues as returned by alphas().
       *
       * This is a column vector with entries of type #ComplexScalar.
       * The length of the vector is the size of #MatrixType.
@@ -114,7 +115,14 @@ template<typename _MatrixType> class GeneralizedEigenSolver
       *
       * \sa compute() for an example.
       */
-    GeneralizedEigenSolver() : m_eivec(), m_alphas(), m_betas(), m_isInitialized(false), m_realQZ(), m_matS(), m_tmp() {}
+    GeneralizedEigenSolver()
+      : m_eivec(),
+        m_alphas(),
+        m_betas(),
+        m_valuesOkay(false),
+        m_vectorsOkay(false),
+        m_realQZ()
+    {}
 
     /** \brief Default constructor with memory preallocation
       *
@@ -122,14 +130,13 @@ template<typename _MatrixType> class GeneralizedEigenSolver
       * according to the specified problem \a size.
       * \sa GeneralizedEigenSolver()
       */
-    GeneralizedEigenSolver(Index size)
+    explicit GeneralizedEigenSolver(Index size)
       : m_eivec(size, size),
         m_alphas(size),
         m_betas(size),
-        m_isInitialized(false),
-        m_eigenvectorsOk(false),
+        m_valuesOkay(false),
+        m_vectorsOkay(false),
         m_realQZ(size),
-        m_matS(size, size),
         m_tmp(size)
     {}
 
@@ -149,10 +156,9 @@ template<typename _MatrixType> class GeneralizedEigenSolver
       : m_eivec(A.rows(), A.cols()),
         m_alphas(A.cols()),
         m_betas(A.cols()),
-        m_isInitialized(false),
-        m_eigenvectorsOk(false),
+        m_valuesOkay(false),
+        m_vectorsOkay(false),
         m_realQZ(A.cols()),
-        m_matS(A.rows(), A.cols()),
         m_tmp(A.cols())
     {
       compute(A, B, computeEigenvectors);
@@ -160,22 +166,20 @@ template<typename _MatrixType> class GeneralizedEigenSolver
 
     /* \brief Returns the computed generalized eigenvectors.
       *
-      * \returns  %Matrix whose columns are the (possibly complex) eigenvectors.
+      * \returns  %Matrix whose columns are the (possibly complex) right eigenvectors.
+      * i.e. the eigenvectors that solve (A - l*B)x = 0. The ordering matches the eigenvalues.
       *
       * \pre Either the constructor 
       * GeneralizedEigenSolver(const MatrixType&,const MatrixType&, bool) or the member function
       * compute(const MatrixType&, const MatrixType& bool) has been called before, and
       * \p computeEigenvectors was set to true (the default).
       *
-      * Column \f$ k \f$ of the returned matrix is an eigenvector corresponding
-      * to eigenvalue number \f$ k \f$ as returned by eigenvalues().  The
-      * eigenvectors are normalized to have (Euclidean) norm equal to one. The
-      * matrix returned by this function is the matrix \f$ V \f$ in the
-      * generalized eigendecomposition \f$ A = B V D V^{-1} \f$, if it exists.
-      *
       * \sa eigenvalues()
       */
-//    EigenvectorsType eigenvectors() const;
+    EigenvectorsType eigenvectors() const {
+      eigen_assert(m_vectorsOkay && "Eigenvectors for GeneralizedEigenSolver were not calculated.");
+      return m_eivec;
+    }
 
     /** \brief Returns an expression of the computed generalized eigenvalues.
       *
@@ -197,7 +201,7 @@ template<typename _MatrixType> class GeneralizedEigenSolver
       */
     EigenvalueType eigenvalues() const
     {
-      eigen_assert(m_isInitialized && "GeneralizedEigenSolver is not initialized.");
+      eigen_assert(m_valuesOkay && "GeneralizedEigenSolver is not initialized.");
       return EigenvalueType(m_alphas,m_betas);
     }
 
@@ -208,7 +212,7 @@ template<typename _MatrixType> class GeneralizedEigenSolver
       * \sa betas(), eigenvalues() */
     ComplexVectorType alphas() const
     {
-      eigen_assert(m_isInitialized && "GeneralizedEigenSolver is not initialized.");
+      eigen_assert(m_valuesOkay && "GeneralizedEigenSolver is not initialized.");
       return m_alphas;
     }
 
@@ -219,7 +223,7 @@ template<typename _MatrixType> class GeneralizedEigenSolver
       * \sa alphas(), eigenvalues() */
     VectorType betas() const
     {
-      eigen_assert(m_isInitialized && "GeneralizedEigenSolver is not initialized.");
+      eigen_assert(m_valuesOkay && "GeneralizedEigenSolver is not initialized.");
       return m_betas;
     }
 
@@ -250,7 +254,7 @@ template<typename _MatrixType> class GeneralizedEigenSolver
 
     ComputationInfo info() const
     {
-      eigen_assert(m_isInitialized && "EigenSolver is not initialized.");
+      eigen_assert(m_valuesOkay && "EigenSolver is not initialized.");
       return m_realQZ.info();
     }
 
@@ -270,29 +274,14 @@ template<typename _MatrixType> class GeneralizedEigenSolver
       EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::IsComplex, NUMERIC_TYPE_MUST_BE_REAL);
     }
     
-    MatrixType m_eivec;
+    EigenvectorsType m_eivec;
     ComplexVectorType m_alphas;
     VectorType m_betas;
-    bool m_isInitialized;
-    bool m_eigenvectorsOk;
+    bool m_valuesOkay, m_vectorsOkay;
     RealQZ<MatrixType> m_realQZ;
-    MatrixType m_matS;
-
-    typedef Matrix<Scalar, ColsAtCompileTime, 1, Options & ~RowMajor, MaxColsAtCompileTime, 1> ColumnVectorType;
-    ColumnVectorType m_tmp;
+    ComplexVectorType m_tmp;
 };
 
-//template<typename MatrixType>
-//typename GeneralizedEigenSolver<MatrixType>::EigenvectorsType GeneralizedEigenSolver<MatrixType>::eigenvectors() const
-//{
-//  eigen_assert(m_isInitialized && "EigenSolver is not initialized.");
-//  eigen_assert(m_eigenvectorsOk && "The eigenvectors have not been computed together with the eigenvalues.");
-//  Index n = m_eivec.cols();
-//  EigenvectorsType matV(n,n);
-//  // TODO
-//  return matV;
-//}
-
 template<typename MatrixType>
 GeneralizedEigenSolver<MatrixType>&
 GeneralizedEigenSolver<MatrixType>::compute(const MatrixType& A, const MatrixType& B, bool computeEigenvectors)
@@ -302,46 +291,126 @@ GeneralizedEigenSolver<MatrixType>::compute(const MatrixType& A, const MatrixTyp
   using std::sqrt;
   using std::abs;
   eigen_assert(A.cols() == A.rows() && B.cols() == A.rows() && B.cols() == B.rows());
-
+  Index size = A.cols();
+  m_valuesOkay = false;
+  m_vectorsOkay = false;
   // Reduce to generalized real Schur form:
   // A = Q S Z and B = Q T Z
   m_realQZ.compute(A, B, computeEigenvectors);
-
   if (m_realQZ.info() == Success)
   {
-    m_matS = m_realQZ.matrixS();
+    // Resize storage
+    m_alphas.resize(size);
+    m_betas.resize(size);
     if (computeEigenvectors)
-      m_eivec = m_realQZ.matrixZ().transpose();
-  
-    // Compute eigenvalues from matS
-    m_alphas.resize(A.cols());
-    m_betas.resize(A.cols());
+    {
+      m_eivec.resize(size,size);
+      m_tmp.resize(size);
+    }
+
+    // Aliases:
+    Map<VectorType> v(reinterpret_cast<Scalar*>(m_tmp.data()), size);
+    ComplexVectorType &cv = m_tmp;
+    const MatrixType &mZ = m_realQZ.matrixZ();
+    const MatrixType &mS = m_realQZ.matrixS();
+    const MatrixType &mT = m_realQZ.matrixT();
+
     Index i = 0;
-    while (i < A.cols())
+    while (i < size)
     {
-      if (i == A.cols() - 1 || m_matS.coeff(i+1, i) == Scalar(0))
+      if (i == size - 1 || mS.coeff(i+1, i) == Scalar(0))
       {
-        m_alphas.coeffRef(i) = m_matS.coeff(i, i);
-        m_betas.coeffRef(i)  = m_realQZ.matrixT().coeff(i,i);
+        // Real eigenvalue
+        m_alphas.coeffRef(i) = mS.diagonal().coeff(i);
+        m_betas.coeffRef(i)  = mT.diagonal().coeff(i);
+        if (computeEigenvectors)
+        {
+          v.setConstant(Scalar(0.0));
+          v.coeffRef(i) = Scalar(1.0);
+          // For singular eigenvalues do nothing more
+          if(abs(m_betas.coeffRef(i)) >= (std::numeric_limits<RealScalar>::min)())
+          {
+            // Non-singular eigenvalue
+            const Scalar alpha = real(m_alphas.coeffRef(i));
+            const Scalar beta = m_betas.coeffRef(i);
+            for (Index j = i-1; j >= 0; j--)
+            {
+              const Index st = j+1;
+              const Index sz = i-j;
+              if (j > 0 && mS.coeff(j, j-1) != Scalar(0))
+              {
+                // 2x2 block
+                Matrix<Scalar, 2, 1> rhs = (alpha*mT.template block<2,Dynamic>(j-1,st,2,sz) - beta*mS.template block<2,Dynamic>(j-1,st,2,sz)) .lazyProduct( v.segment(st,sz) );
+                Matrix<Scalar, 2, 2> lhs = beta * mS.template block<2,2>(j-1,j-1) - alpha * mT.template block<2,2>(j-1,j-1);
+                v.template segment<2>(j-1) = lhs.partialPivLu().solve(rhs);
+                j--;
+              }
+              else
+              {
+                v.coeffRef(j) = -v.segment(st,sz).transpose().cwiseProduct(beta*mS.block(j,st,1,sz) - alpha*mT.block(j,st,1,sz)).sum() / (beta*mS.coeffRef(j,j) - alpha*mT.coeffRef(j,j));
+              }
+            }
+          }
+          m_eivec.col(i).real().noalias() = mZ.transpose() * v;
+          m_eivec.col(i).real().normalize();
+          m_eivec.col(i).imag().setConstant(0);
+        }
         ++i;
       }
       else
       {
-        Scalar p = Scalar(0.5) * (m_matS.coeff(i, i) - m_matS.coeff(i+1, i+1));
-        Scalar z = sqrt(abs(p * p + m_matS.coeff(i+1, i) * m_matS.coeff(i, i+1)));
-        m_alphas.coeffRef(i)   = ComplexScalar(m_matS.coeff(i+1, i+1) + p, z);
-        m_alphas.coeffRef(i+1) = ComplexScalar(m_matS.coeff(i+1, i+1) + p, -z);
-
-        m_betas.coeffRef(i)   = m_realQZ.matrixT().coeff(i,i);
-        m_betas.coeffRef(i+1) = m_realQZ.matrixT().coeff(i,i);
+        // We need to extract the generalized eigenvalues of the pair of a general 2x2 block S and a positive diagonal 2x2 block T
+        // Then taking beta=T_00*T_11, we can avoid any division, and alpha is the eigenvalues of A = (U^-1 * S * U) * diag(T_11,T_00):
+
+        // T =  [a 0]
+        //      [0 b]
+        RealScalar a = mT.diagonal().coeff(i),
+                   b = mT.diagonal().coeff(i+1);
+        const RealScalar beta = m_betas.coeffRef(i) = m_betas.coeffRef(i+1) = a*b;
+
+        // ^^ NOTE: using diagonal()(i) instead of coeff(i,i) workarounds a MSVC bug.
+        Matrix<RealScalar,2,2> S2 = mS.template block<2,2>(i,i) * Matrix<Scalar,2,1>(b,a).asDiagonal();
+
+        Scalar p = Scalar(0.5) * (S2.coeff(0,0) - S2.coeff(1,1));
+        Scalar z = sqrt(abs(p * p + S2.coeff(1,0) * S2.coeff(0,1)));
+        const ComplexScalar alpha = ComplexScalar(S2.coeff(1,1) + p, (beta > 0) ? z : -z);
+        m_alphas.coeffRef(i)   = conj(alpha);
+        m_alphas.coeffRef(i+1) = alpha;
+
+        if (computeEigenvectors) {
+          // Compute eigenvector in position (i+1) and then position (i) is just the conjugate
+          cv.setZero();
+          cv.coeffRef(i+1) = Scalar(1.0);
+          // here, the "static_cast" workaound expression template issues.
+          cv.coeffRef(i) = -(static_cast<Scalar>(beta*mS.coeffRef(i,i+1)) - alpha*mT.coeffRef(i,i+1))
+                          / (static_cast<Scalar>(beta*mS.coeffRef(i,i))   - alpha*mT.coeffRef(i,i));
+          for (Index j = i-1; j >= 0; j--)
+          {
+            const Index st = j+1;
+            const Index sz = i+1-j;
+            if (j > 0 && mS.coeff(j, j-1) != Scalar(0))
+            {
+              // 2x2 block
+              Matrix<ComplexScalar, 2, 1> rhs = (alpha*mT.template block<2,Dynamic>(j-1,st,2,sz) - beta*mS.template block<2,Dynamic>(j-1,st,2,sz)) .lazyProduct( cv.segment(st,sz) );
+              Matrix<ComplexScalar, 2, 2> lhs = beta * mS.template block<2,2>(j-1,j-1) - alpha * mT.template block<2,2>(j-1,j-1);
+              cv.template segment<2>(j-1) = lhs.partialPivLu().solve(rhs);
+              j--;
+            } else {
+              cv.coeffRef(j) =  cv.segment(st,sz).transpose().cwiseProduct(beta*mS.block(j,st,1,sz) - alpha*mT.block(j,st,1,sz)).sum()
+                              / (alpha*mT.coeffRef(j,j) - static_cast<Scalar>(beta*mS.coeffRef(j,j)));
+            }
+          }
+          m_eivec.col(i+1).noalias() = (mZ.transpose() * cv);
+          m_eivec.col(i+1).normalize();
+          m_eivec.col(i) = m_eivec.col(i+1).conjugate();
+        }
         i += 2;
       }
     }
-  }
-
-  m_isInitialized = true;
-  m_eigenvectorsOk = false;//computeEigenvectors;
 
+    m_valuesOkay = true;
+    m_vectorsOkay = computeEigenvectors;
+  }
   return *this;
 }
 
diff --git a/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h b/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h
index 07bf1ea09..5f6bb8289 100644
--- a/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h
+++ b/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h
@@ -50,7 +50,6 @@ class GeneralizedSelfAdjointEigenSolver : public SelfAdjointEigenSolver<_MatrixT
     typedef SelfAdjointEigenSolver<_MatrixType> Base;
   public:
 
-    typedef typename Base::Index Index;
     typedef _MatrixType MatrixType;
 
     /** \brief Default constructor for fixed-size matrices.
@@ -74,7 +73,7 @@ class GeneralizedSelfAdjointEigenSolver : public SelfAdjointEigenSolver<_MatrixT
       *
       * \sa compute() for an example
       */
-    GeneralizedSelfAdjointEigenSolver(Index size)
+    explicit GeneralizedSelfAdjointEigenSolver(Index size)
         : Base(size)
     {}
 
diff --git a/Eigen/src/Eigenvalues/HessenbergDecomposition.h b/Eigen/src/Eigenvalues/HessenbergDecomposition.h
index 3db0c0106..f647f69b0 100644
--- a/Eigen/src/Eigenvalues/HessenbergDecomposition.h
+++ b/Eigen/src/Eigenvalues/HessenbergDecomposition.h
@@ -71,7 +71,7 @@ template<typename _MatrixType> class HessenbergDecomposition
 
     /** \brief Scalar type for matrices of type #MatrixType. */
     typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::Index Index;
+    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
 
     /** \brief Type for vector of Householder coefficients.
       *
@@ -97,7 +97,7 @@ template<typename _MatrixType> class HessenbergDecomposition
       *
       * \sa compute() for an example.
       */
-    HessenbergDecomposition(Index size = Size==Dynamic ? 2 : Size)
+    explicit HessenbergDecomposition(Index size = Size==Dynamic ? 2 : Size)
       : m_matrix(size,size),
         m_temp(size),
         m_isInitialized(false)
@@ -115,8 +115,9 @@ template<typename _MatrixType> class HessenbergDecomposition
       *
       * \sa matrixH() for an example.
       */
-    HessenbergDecomposition(const MatrixType& matrix)
-      : m_matrix(matrix),
+    template<typename InputType>
+    explicit HessenbergDecomposition(const EigenBase<InputType>& matrix)
+      : m_matrix(matrix.derived()),
         m_temp(matrix.rows()),
         m_isInitialized(false)
     {
@@ -147,9 +148,10 @@ template<typename _MatrixType> class HessenbergDecomposition
       * Example: \include HessenbergDecomposition_compute.cpp
       * Output: \verbinclude HessenbergDecomposition_compute.out
       */
-    HessenbergDecomposition& compute(const MatrixType& matrix)
+    template<typename InputType>
+    HessenbergDecomposition& compute(const EigenBase<InputType>& matrix)
     {
-      m_matrix = matrix;
+      m_matrix = matrix.derived();
       if(matrix.rows()<2)
       {
         m_isInitialized = true;
@@ -337,7 +339,6 @@ namespace internal {
 template<typename MatrixType> struct HessenbergDecompositionMatrixHReturnType
 : public ReturnByValue<HessenbergDecompositionMatrixHReturnType<MatrixType> >
 {
-    typedef typename MatrixType::Index Index;
   public:
     /** \brief Constructor.
       *
diff --git a/Eigen/src/Eigenvalues/RealQZ.h b/Eigen/src/Eigenvalues/RealQZ.h
index fba6f1d77..b3a910dd9 100644
--- a/Eigen/src/Eigenvalues/RealQZ.h
+++ b/Eigen/src/Eigenvalues/RealQZ.h
@@ -67,7 +67,7 @@ namespace Eigen {
       };
       typedef typename MatrixType::Scalar Scalar;
       typedef std::complex<typename NumTraits<Scalar>::Real> ComplexScalar;
-      typedef typename MatrixType::Index Index;
+      typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
 
       typedef Matrix<ComplexScalar, ColsAtCompileTime, 1, Options & ~RowMajor, MaxColsAtCompileTime, 1> EigenvalueType;
       typedef Matrix<Scalar, ColsAtCompileTime, 1, Options & ~RowMajor, MaxColsAtCompileTime, 1> ColumnVectorType;
@@ -83,7 +83,7 @@ namespace Eigen {
        *
        * \sa compute() for an example.
        */
-      RealQZ(Index size = RowsAtCompileTime==Dynamic ? 1 : RowsAtCompileTime) : 
+      explicit RealQZ(Index size = RowsAtCompileTime==Dynamic ? 1 : RowsAtCompileTime) :
         m_S(size, size),
         m_T(size, size),
         m_Q(size, size),
@@ -244,7 +244,7 @@ namespace Eigen {
             if (m_computeQZ)
               m_Q.applyOnTheRight(i-1,i,G);
           }
-          // update Q
+          // kill T(i,i-1)
           if(m_T.coeff(i,i-1)!=Scalar(0))
           {
             G.makeGivens(m_T.coeff(i,i), m_T.coeff(i,i-1), &m_T.coeffRef(i,i));
@@ -276,7 +276,7 @@ namespace Eigen {
 
   /** \internal Look for single small sub-diagonal element S(res, res-1) and return res (or 0) */
   template<typename MatrixType>
-    inline typename MatrixType::Index RealQZ<MatrixType>::findSmallSubdiagEntry(Index iu)
+    inline Index RealQZ<MatrixType>::findSmallSubdiagEntry(Index iu)
     {
       using std::abs;
       Index res = iu;
@@ -294,7 +294,7 @@ namespace Eigen {
 
   /** \internal Look for single small diagonal element T(res, res) for res between f and l, and return res (or f-1)  */
   template<typename MatrixType>
-    inline typename MatrixType::Index RealQZ<MatrixType>::findSmallDiagEntry(Index f, Index l)
+    inline Index RealQZ<MatrixType>::findSmallDiagEntry(Index f, Index l)
     {
       using std::abs;
       Index res = l;
@@ -315,8 +315,8 @@ namespace Eigen {
       const Index dim=m_S.cols();
       if (abs(m_S.coeff(i+1,i))==Scalar(0))
         return;
-      Index z = findSmallDiagEntry(i,i+1);
-      if (z==i-1)
+      Index j = findSmallDiagEntry(i,i+1);
+      if (j==i-1)
       {
         // block of (S T^{-1})
         Matrix2s STi = m_T.template block<2,2>(i,i).template triangularView<Upper>().
@@ -352,7 +352,7 @@ namespace Eigen {
       }
       else
       {
-        pushDownZero(z,i,i+1);
+        pushDownZero(j,i,i+1);
       }
     }
 
@@ -552,7 +552,6 @@ namespace Eigen {
       m_T.coeffRef(l,l-1) = Scalar(0.0);
     }
 
-
   template<typename MatrixType>
     RealQZ<MatrixType>& RealQZ<MatrixType>::compute(const MatrixType& A_in, const MatrixType& B_in, bool computeQZ)
     {
@@ -616,6 +615,37 @@ namespace Eigen {
       }
       // check if we converged before reaching iterations limit
       m_info = (local_iter<m_maxIters) ? Success : NoConvergence;
+
+      // For each non triangular 2x2 diagonal block of S,
+      //    reduce the respective 2x2 diagonal block of T to positive diagonal form using 2x2 SVD.
+      // This step is not mandatory for QZ, but it does help further extraction of eigenvalues/eigenvectors,
+      // and is in par with Lapack/Matlab QZ.
+      if(m_info==Success)
+      {
+        for(Index i=0; i<dim-1; ++i)
+        {
+          if(m_S.coeff(i+1, i) != Scalar(0))
+          {
+            JacobiRotation<Scalar> j_left, j_right;
+            internal::real_2x2_jacobi_svd(m_T, i, i+1, &j_left, &j_right);
+
+            // Apply resulting Jacobi rotations
+            m_S.applyOnTheLeft(i,i+1,j_left);
+            m_S.applyOnTheRight(i,i+1,j_right);
+            m_T.applyOnTheLeft(i,i+1,j_left);
+            m_T.applyOnTheRight(i,i+1,j_right);
+            m_T(i+1,i) = m_T(i,i+1) = Scalar(0);
+
+            if(m_computeQZ) {
+              m_Q.applyOnTheRight(i,i+1,j_left.transpose());
+              m_Z.applyOnTheLeft(i,i+1,j_right.transpose());
+            }
+
+            i++;
+          }
+        }
+      }
+
       return *this;
     } // end compute
 
diff --git a/Eigen/src/Eigenvalues/RealSchur.h b/Eigen/src/Eigenvalues/RealSchur.h
index 16d387537..f5c86041d 100644
--- a/Eigen/src/Eigenvalues/RealSchur.h
+++ b/Eigen/src/Eigenvalues/RealSchur.h
@@ -64,7 +64,7 @@ template<typename _MatrixType> class RealSchur
     };
     typedef typename MatrixType::Scalar Scalar;
     typedef std::complex<typename NumTraits<Scalar>::Real> ComplexScalar;
-    typedef typename MatrixType::Index Index;
+    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
 
     typedef Matrix<ComplexScalar, ColsAtCompileTime, 1, Options & ~RowMajor, MaxColsAtCompileTime, 1> EigenvalueType;
     typedef Matrix<Scalar, ColsAtCompileTime, 1, Options & ~RowMajor, MaxColsAtCompileTime, 1> ColumnVectorType;
@@ -80,7 +80,7 @@ template<typename _MatrixType> class RealSchur
       *
       * \sa compute() for an example.
       */
-    RealSchur(Index size = RowsAtCompileTime==Dynamic ? 1 : RowsAtCompileTime)
+    explicit RealSchur(Index size = RowsAtCompileTime==Dynamic ? 1 : RowsAtCompileTime)
             : m_matT(size, size),
               m_matU(size, size),
               m_workspaceVector(size),
@@ -100,7 +100,8 @@ template<typename _MatrixType> class RealSchur
       * Example: \include RealSchur_RealSchur_MatrixType.cpp
       * Output: \verbinclude RealSchur_RealSchur_MatrixType.out
       */
-    RealSchur(const MatrixType& matrix, bool computeU = true)
+    template<typename InputType>
+    explicit RealSchur(const EigenBase<InputType>& matrix, bool computeU = true)
             : m_matT(matrix.rows(),matrix.cols()),
               m_matU(matrix.rows(),matrix.cols()),
               m_workspaceVector(matrix.rows()),
@@ -109,7 +110,7 @@ template<typename _MatrixType> class RealSchur
               m_matUisUptodate(false),
               m_maxIters(-1)
     {
-      compute(matrix, computeU);
+      compute(matrix.derived(), computeU);
     }
 
     /** \brief Returns the orthogonal matrix in the Schur decomposition. 
@@ -165,7 +166,8 @@ template<typename _MatrixType> class RealSchur
       *
       * \sa compute(const MatrixType&, bool, Index)
       */
-    RealSchur& compute(const MatrixType& matrix, bool computeU = true);
+    template<typename InputType>
+    RealSchur& compute(const EigenBase<InputType>& matrix, bool computeU = true);
 
     /** \brief Computes Schur decomposition of a Hessenberg matrix H = Z T Z^T
      *  \param[in] matrixH Matrix in Hessenberg form H
@@ -243,26 +245,45 @@ template<typename _MatrixType> class RealSchur
 
 
 template<typename MatrixType>
-RealSchur<MatrixType>& RealSchur<MatrixType>::compute(const MatrixType& matrix, bool computeU)
+template<typename InputType>
+RealSchur<MatrixType>& RealSchur<MatrixType>::compute(const EigenBase<InputType>& matrix, bool computeU)
 {
+  const Scalar considerAsZero = (std::numeric_limits<Scalar>::min)();
+
   eigen_assert(matrix.cols() == matrix.rows());
   Index maxIters = m_maxIters;
   if (maxIters == -1)
     maxIters = m_maxIterationsPerRow * matrix.rows();
 
+  Scalar scale = matrix.derived().cwiseAbs().maxCoeff();
+  if(scale<considerAsZero)
+  {
+    m_matT.setZero(matrix.rows(),matrix.cols());
+    if(computeU)
+      m_matU.setIdentity(matrix.rows(),matrix.cols());
+    m_info = Success;
+    m_isInitialized = true;
+    m_matUisUptodate = computeU;
+    return *this;
+  }
+
   // Step 1. Reduce to Hessenberg form
-  m_hess.compute(matrix);
+  m_hess.compute(matrix.derived()/scale);
 
   // Step 2. Reduce to real Schur form  
   computeFromHessenberg(m_hess.matrixH(), m_hess.matrixQ(), computeU);
+
+  m_matT *= scale;
   
   return *this;
 }
 template<typename MatrixType>
 template<typename HessMatrixType, typename OrthMatrixType>
 RealSchur<MatrixType>& RealSchur<MatrixType>::computeFromHessenberg(const HessMatrixType& matrixH, const OrthMatrixType& matrixQ,  bool computeU)
-{  
-  m_matT = matrixH; 
+{
+  using std::abs;
+
+  m_matT = matrixH;
   if(computeU)
     m_matU = matrixQ;
   
@@ -343,7 +364,7 @@ inline typename MatrixType::Scalar RealSchur<MatrixType>::computeNormOfT()
 
 /** \internal Look for single small sub-diagonal element and returns its index */
 template<typename MatrixType>
-inline typename MatrixType::Index RealSchur<MatrixType>::findSmallSubdiagEntry(Index iu)
+inline Index RealSchur<MatrixType>::findSmallSubdiagEntry(Index iu)
 {
   using std::abs;
   Index res = iu;
diff --git a/Eigen/src/Eigenvalues/RealSchur_MKL.h b/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h
index ad9736460..2c2251715 100644
--- a/Eigen/src/Eigenvalues/RealSchur_MKL.h
+++ b/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h
@@ -25,43 +25,37 @@
  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
  ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
+ *   Content : Eigen bindings to LAPACKe
  *    Real Schur needed to real unsymmetrical eigenvalues/eigenvectors.
  ********************************************************************************
 */
 
-#ifndef EIGEN_REAL_SCHUR_MKL_H
-#define EIGEN_REAL_SCHUR_MKL_H
-
-#include "Eigen/src/Core/util/MKL_support.h"
+#ifndef EIGEN_REAL_SCHUR_LAPACKE_H
+#define EIGEN_REAL_SCHUR_LAPACKE_H
 
 namespace Eigen { 
 
-/** \internal Specialization for the data types supported by MKL */
+/** \internal Specialization for the data types supported by LAPACKe */
 
-#define EIGEN_MKL_SCHUR_REAL(EIGTYPE, MKLTYPE, MKLPREFIX, MKLPREFIX_U, EIGCOLROW, MKLCOLROW) \
-template<> inline \
+#define EIGEN_LAPACKE_SCHUR_REAL(EIGTYPE, LAPACKE_TYPE, LAPACKE_PREFIX, LAPACKE_PREFIX_U, EIGCOLROW, LAPACKE_COLROW) \
+template<> template<typename InputType> inline \
 RealSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >& \
-RealSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(const Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW>& matrix, bool computeU) \
+RealSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(const EigenBase<InputType>& matrix, bool computeU) \
 { \
-  typedef Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> MatrixType; \
-  typedef MatrixType::Scalar Scalar; \
-  typedef MatrixType::RealScalar RealScalar; \
-\
   eigen_assert(matrix.cols() == matrix.rows()); \
 \
-  lapack_int n = matrix.cols(), sdim, info; \
-  lapack_int lda = matrix.outerStride(); \
-  lapack_int matrix_order = MKLCOLROW; \
+  lapack_int n = internal::convert_index<lapack_int>(matrix.cols()), sdim, info; \
+  lapack_int matrix_order = LAPACKE_COLROW; \
   char jobvs, sort='N'; \
-  LAPACK_##MKLPREFIX_U##_SELECT2 select = 0; \
+  LAPACK_##LAPACKE_PREFIX_U##_SELECT2 select = 0; \
   jobvs = (computeU) ? 'V' : 'N'; \
   m_matU.resize(n, n); \
-  lapack_int ldvs  = m_matU.outerStride(); \
+  lapack_int ldvs  = internal::convert_index<lapack_int>(m_matU.outerStride()); \
   m_matT = matrix; \
+  lapack_int lda = internal::convert_index<lapack_int>(m_matT.outerStride()); \
   Matrix<EIGTYPE, Dynamic, Dynamic> wr, wi; \
   wr.resize(n, 1); wi.resize(n, 1); \
-  info = LAPACKE_##MKLPREFIX##gees( matrix_order, jobvs, sort, select, n, (MKLTYPE*)m_matT.data(), lda, &sdim, (MKLTYPE*)wr.data(), (MKLTYPE*)wi.data(), (MKLTYPE*)m_matU.data(), ldvs ); \
+  info = LAPACKE_##LAPACKE_PREFIX##gees( matrix_order, jobvs, sort, select, n, (LAPACKE_TYPE*)m_matT.data(), lda, &sdim, (LAPACKE_TYPE*)wr.data(), (LAPACKE_TYPE*)wi.data(), (LAPACKE_TYPE*)m_matU.data(), ldvs ); \
   if(info == 0) \
     m_info = Success; \
   else \
@@ -73,11 +67,11 @@ RealSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(const Matrix<E
 \
 }
 
-EIGEN_MKL_SCHUR_REAL(double,   double, d, D, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_SCHUR_REAL(float,    float,  s, S, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_SCHUR_REAL(double,   double, d, D, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_MKL_SCHUR_REAL(float,    float,  s, S, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_SCHUR_REAL(double,   double, d, D, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_SCHUR_REAL(float,    float,  s, S, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_SCHUR_REAL(double,   double, d, D, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_SCHUR_REAL(float,    float,  s, S, RowMajor, LAPACK_ROW_MAJOR)
 
 } // end namespace Eigen
 
-#endif // EIGEN_REAL_SCHUR_MKL_H
+#endif // EIGEN_REAL_SCHUR_LAPACKE_H
diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
index c2e76c884..9ddd553f2 100644
--- a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
+++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
@@ -20,6 +20,8 @@ class GeneralizedSelfAdjointEigenSolver;
 
 namespace internal {
 template<typename SolverType,int Size,bool IsComplex> struct direct_selfadjoint_eigenvalues;
+template<typename MatrixType, typename DiagType, typename SubDiagType>
+ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag, const Index maxIterations, bool computeEigenvectors, MatrixType& eivec);
 }
 
 /** \eigenvalues_module \ingroup Eigenvalues_Module
@@ -79,7 +81,9 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
     
     /** \brief Scalar type for matrices of type \p _MatrixType. */
     typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::Index Index;
+    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
+    
+    typedef Matrix<Scalar,Size,Size,ColMajor,MaxColsAtCompileTime,MaxColsAtCompileTime> EigenvectorsType;
 
     /** \brief Real scalar type for \p _MatrixType.
       *
@@ -98,6 +102,7 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
       */
     typedef typename internal::plain_col_type<MatrixType, RealScalar>::type RealVectorType;
     typedef Tridiagonalization<MatrixType> TridiagonalizationType;
+    typedef typename TridiagonalizationType::SubDiagonalType SubDiagonalType;
 
     /** \brief Default constructor for fixed-size matrices.
       *
@@ -109,6 +114,7 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
       * Example: \include SelfAdjointEigenSolver_SelfAdjointEigenSolver.cpp
       * Output: \verbinclude SelfAdjointEigenSolver_SelfAdjointEigenSolver.out
       */
+    EIGEN_DEVICE_FUNC
     SelfAdjointEigenSolver()
         : m_eivec(),
           m_eivalues(),
@@ -128,7 +134,8 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
       *
       * \sa compute() for an example
       */
-    SelfAdjointEigenSolver(Index size)
+    EIGEN_DEVICE_FUNC
+    explicit SelfAdjointEigenSolver(Index size)
         : m_eivec(size, size),
           m_eivalues(size),
           m_subdiag(size > 1 ? size - 1 : 1),
@@ -150,13 +157,15 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
       *
       * \sa compute(const MatrixType&, int)
       */
-    SelfAdjointEigenSolver(const MatrixType& matrix, int options = ComputeEigenvectors)
+    template<typename InputType>
+    EIGEN_DEVICE_FUNC
+    explicit SelfAdjointEigenSolver(const EigenBase<InputType>& matrix, int options = ComputeEigenvectors)
       : m_eivec(matrix.rows(), matrix.cols()),
         m_eivalues(matrix.cols()),
         m_subdiag(matrix.rows() > 1 ? matrix.rows() - 1 : 1),
         m_isInitialized(false)
     {
-      compute(matrix, options);
+      compute(matrix.derived(), options);
     }
 
     /** \brief Computes eigendecomposition of given matrix.
@@ -189,24 +198,45 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
       *
       * \sa SelfAdjointEigenSolver(const MatrixType&, int)
       */
-    SelfAdjointEigenSolver& compute(const MatrixType& matrix, int options = ComputeEigenvectors);
+    template<typename InputType>
+    EIGEN_DEVICE_FUNC
+    SelfAdjointEigenSolver& compute(const EigenBase<InputType>& matrix, int options = ComputeEigenvectors);
     
-    /** \brief Computes eigendecomposition of given matrix using a direct algorithm
+    /** \brief Computes eigendecomposition of given matrix using a closed-form algorithm
       *
       * This is a variant of compute(const MatrixType&, int options) which
       * directly solves the underlying polynomial equation.
       * 
-      * Currently only 3x3 matrices for which the sizes are known at compile time are supported (e.g., Matrix3d).
+      * Currently only 2x2 and 3x3 matrices for which the sizes are known at compile time are supported (e.g., Matrix3d).
       * 
-      * This method is usually significantly faster than the QR algorithm
+      * This method is usually significantly faster than the QR iterative algorithm
       * but it might also be less accurate. It is also worth noting that
       * for 3x3 matrices it involves trigonometric operations which are
       * not necessarily available for all scalar types.
+      * 
+      * For the 3x3 case, we observed the following worst case relative error regarding the eigenvalues:
+      *   - double: 1e-8
+      *   - float:  1e-3
       *
       * \sa compute(const MatrixType&, int options)
       */
+    EIGEN_DEVICE_FUNC
     SelfAdjointEigenSolver& computeDirect(const MatrixType& matrix, int options = ComputeEigenvectors);
 
+    /**
+      *\brief Computes the eigen decomposition from a tridiagonal symmetric matrix
+      *
+      * \param[in] diag The vector containing the diagonal of the matrix.
+      * \param[in] subdiag The subdiagonal of the matrix.
+      * \param[in] options Can be #ComputeEigenvectors (default) or #EigenvaluesOnly.
+      * \returns Reference to \c *this
+      *
+      * This function assumes that the matrix has been reduced to tridiagonal form.
+      *
+      * \sa compute(const MatrixType&, int) for more information
+      */
+    SelfAdjointEigenSolver& computeFromTridiagonal(const RealVectorType& diag, const SubDiagonalType& subdiag , int options=ComputeEigenvectors);
+
     /** \brief Returns the eigenvectors of given matrix.
       *
       * \returns  A const reference to the matrix whose columns are the eigenvectors.
@@ -225,7 +255,8 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
       *
       * \sa eigenvalues()
       */
-    const MatrixType& eigenvectors() const
+    EIGEN_DEVICE_FUNC
+    const EigenvectorsType& eigenvectors() const
     {
       eigen_assert(m_isInitialized && "SelfAdjointEigenSolver is not initialized.");
       eigen_assert(m_eigenvectorsOk && "The eigenvectors have not been computed together with the eigenvalues.");
@@ -247,6 +278,7 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
       *
       * \sa eigenvectors(), MatrixBase::eigenvalues()
       */
+    EIGEN_DEVICE_FUNC
     const RealVectorType& eigenvalues() const
     {
       eigen_assert(m_isInitialized && "SelfAdjointEigenSolver is not initialized.");
@@ -268,9 +300,9 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
       * Example: \include SelfAdjointEigenSolver_operatorSqrt.cpp
       * Output: \verbinclude SelfAdjointEigenSolver_operatorSqrt.out
       *
-      * \sa operatorInverseSqrt(),
-      *     \ref MatrixFunctions_Module "MatrixFunctions Module"
+      * \sa operatorInverseSqrt(), <a href="unsupported/group__MatrixFunctions__Module.html">MatrixFunctions Module</a>
       */
+    EIGEN_DEVICE_FUNC
     MatrixType operatorSqrt() const
     {
       eigen_assert(m_isInitialized && "SelfAdjointEigenSolver is not initialized.");
@@ -293,9 +325,9 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
       * Example: \include SelfAdjointEigenSolver_operatorInverseSqrt.cpp
       * Output: \verbinclude SelfAdjointEigenSolver_operatorInverseSqrt.out
       *
-      * \sa operatorSqrt(), MatrixBase::inverse(),
-      *     \ref MatrixFunctions_Module "MatrixFunctions Module"
+      * \sa operatorSqrt(), MatrixBase::inverse(), <a href="unsupported/group__MatrixFunctions__Module.html">MatrixFunctions Module</a>
       */
+    EIGEN_DEVICE_FUNC
     MatrixType operatorInverseSqrt() const
     {
       eigen_assert(m_isInitialized && "SelfAdjointEigenSolver is not initialized.");
@@ -307,6 +339,7 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
       *
       * \returns \c Success if computation was succesful, \c NoConvergence otherwise.
       */
+    EIGEN_DEVICE_FUNC
     ComputationInfo info() const
     {
       eigen_assert(m_isInitialized && "SelfAdjointEigenSolver is not initialized.");
@@ -320,43 +353,13 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
       */
     static const int m_maxIterations = 30;
 
-    #ifdef EIGEN2_SUPPORT
-    SelfAdjointEigenSolver(const MatrixType& matrix, bool computeEigenvectors)
-      : m_eivec(matrix.rows(), matrix.cols()),
-        m_eivalues(matrix.cols()),
-        m_subdiag(matrix.rows() > 1 ? matrix.rows() - 1 : 1),
-        m_isInitialized(false)
-    {
-      compute(matrix, computeEigenvectors);
-    }
-    
-    SelfAdjointEigenSolver(const MatrixType& matA, const MatrixType& matB, bool computeEigenvectors = true)
-        : m_eivec(matA.cols(), matA.cols()),
-          m_eivalues(matA.cols()),
-          m_subdiag(matA.cols() > 1 ? matA.cols() - 1 : 1),
-          m_isInitialized(false)
-    {
-      static_cast<GeneralizedSelfAdjointEigenSolver<MatrixType>*>(this)->compute(matA, matB, computeEigenvectors ? ComputeEigenvectors : EigenvaluesOnly);
-    }
-    
-    void compute(const MatrixType& matrix, bool computeEigenvectors)
-    {
-      compute(matrix, computeEigenvectors ? ComputeEigenvectors : EigenvaluesOnly);
-    }
-
-    void compute(const MatrixType& matA, const MatrixType& matB, bool computeEigenvectors = true)
-    {
-      compute(matA, matB, computeEigenvectors ? ComputeEigenvectors : EigenvaluesOnly);
-    }
-    #endif // EIGEN2_SUPPORT
-
   protected:
     static void check_template_parameters()
     {
       EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
     }
     
-    MatrixType m_eivec;
+    EigenvectorsType m_eivec;
     RealVectorType m_eivalues;
     typename TridiagonalizationType::SubDiagonalType m_subdiag;
     ComputationInfo m_info;
@@ -364,6 +367,7 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
     bool m_eigenvectorsOk;
 };
 
+namespace internal {
 /** \internal
   *
   * \eigenvalues_module \ingroup Eigenvalues_Module
@@ -371,8 +375,12 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
   * Performs a QR step on a tridiagonal symmetric matrix represented as a
   * pair of two vectors \a diag and \a subdiag.
   *
-  * \param matA the input selfadjoint matrix
-  * \param hCoeffs returned Householder coefficients
+  * \param diag the diagonal part of the input selfadjoint tridiagonal matrix
+  * \param subdiag the sub-diagonal part of the input selfadjoint tridiagonal matrix
+  * \param start starting index of the submatrix to work on
+  * \param end last+1 index of the submatrix to work on
+  * \param matrixQ pointer to the column-major matrix holding the eigenvectors, can be 0
+  * \param n size of the input matrix
   *
   * For compilation efficiency reasons, this procedure does not use eigen expression
   * for its arguments.
@@ -380,17 +388,21 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
   * Implemented from Golub's "Matrix Computations", algorithm 8.3.2:
   * "implicit symmetric QR step with Wilkinson shift"
   */
-namespace internal {
 template<int StorageOrder,typename RealScalar, typename Scalar, typename Index>
+EIGEN_DEVICE_FUNC
 static void tridiagonal_qr_step(RealScalar* diag, RealScalar* subdiag, Index start, Index end, Scalar* matrixQ, Index n);
 }
 
 template<typename MatrixType>
+template<typename InputType>
+EIGEN_DEVICE_FUNC
 SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<MatrixType>
-::compute(const MatrixType& matrix, int options)
+::compute(const EigenBase<InputType>& a_matrix, int options)
 {
   check_template_parameters();
   
+  const InputType &matrix(a_matrix.derived());
+  
   using std::abs;
   eigen_assert(matrix.cols() == matrix.rows());
   eigen_assert((options&~(EigVecMask|GenEigMask))==0
@@ -402,7 +414,8 @@ SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<MatrixType>
 
   if(n==1)
   {
-    m_eivalues.coeffRef(0,0) = numext::real(matrix.coeff(0,0));
+    m_eivec = matrix;
+    m_eivalues.coeffRef(0,0) = numext::real(m_eivec.coeff(0,0));
     if(computeEigenvectors)
       m_eivec.setOnes(n,n);
     m_info = Success;
@@ -413,7 +426,7 @@ SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<MatrixType>
 
   // declare some aliases
   RealVectorType& diag = m_eivalues;
-  MatrixType& mat = m_eivec;
+  EigenvectorsType& mat = m_eivec;
 
   // map the matrix coefficients to [-1:1] to avoid over- and underflow.
   mat = matrix.template triangularView<Lower>();
@@ -422,19 +435,74 @@ SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<MatrixType>
   mat.template triangularView<Lower>() /= scale;
   m_subdiag.resize(n-1);
   internal::tridiagonalization_inplace(mat, diag, m_subdiag, computeEigenvectors);
+
+  m_info = internal::computeFromTridiagonal_impl(diag, m_subdiag, m_maxIterations, computeEigenvectors, m_eivec);
   
+  // scale back the eigen values
+  m_eivalues *= scale;
+
+  m_isInitialized = true;
+  m_eigenvectorsOk = computeEigenvectors;
+  return *this;
+}
+
+template<typename MatrixType>
+SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<MatrixType>
+::computeFromTridiagonal(const RealVectorType& diag, const SubDiagonalType& subdiag , int options)
+{
+  //TODO : Add an option to scale the values beforehand
+  bool computeEigenvectors = (options&ComputeEigenvectors)==ComputeEigenvectors;
+
+  m_eivalues = diag;
+  m_subdiag = subdiag;
+  if (computeEigenvectors)
+  {
+    m_eivec.setIdentity(diag.size(), diag.size());
+  }
+  m_info = internal::computeFromTridiagonal_impl(m_eivalues, m_subdiag, m_maxIterations, computeEigenvectors, m_eivec);
+
+  m_isInitialized = true;
+  m_eigenvectorsOk = computeEigenvectors;
+  return *this;
+}
+
+namespace internal {
+/**
+  * \internal
+  * \brief Compute the eigendecomposition from a tridiagonal matrix
+  *
+  * \param[in,out] diag : On input, the diagonal of the matrix, on output the eigenvalues
+  * \param[in,out] subdiag : The subdiagonal part of the matrix (entries are modified during the decomposition)
+  * \param[in] maxIterations : the maximum number of iterations
+  * \param[in] computeEigenvectors : whether the eigenvectors have to be computed or not
+  * \param[out] eivec : The matrix to store the eigenvectors if computeEigenvectors==true. Must be allocated on input.
+  * \returns \c Success or \c NoConvergence
+  */
+template<typename MatrixType, typename DiagType, typename SubDiagType>
+ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag, const Index maxIterations, bool computeEigenvectors, MatrixType& eivec)
+{
+  using std::abs;
+
+  ComputationInfo info;
+  typedef typename MatrixType::Scalar Scalar;
+
+  Index n = diag.size();
   Index end = n-1;
   Index start = 0;
   Index iter = 0; // total number of iterations
-
+  
+  typedef typename DiagType::RealScalar RealScalar;
+  const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
+  const RealScalar precision = RealScalar(2)*NumTraits<RealScalar>::epsilon();
+  
   while (end>0)
   {
     for (Index i = start; i<end; ++i)
-      if (internal::isMuchSmallerThan(abs(m_subdiag[i]),(abs(diag[i])+abs(diag[i+1]))))
-        m_subdiag[i] = 0;
+      if (internal::isMuchSmallerThan(abs(subdiag[i]),(abs(diag[i])+abs(diag[i+1])),precision) || abs(subdiag[i]) <= considerAsZero)
+        subdiag[i] = 0;
 
     // find the largest unreduced block
-    while (end>0 && m_subdiag[end-1]==0)
+    while (end>0 && subdiag[end-1]==RealScalar(0))
     {
       end--;
     }
@@ -443,51 +511,42 @@ SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<MatrixType>
 
     // if we spent too many iterations, we give up
     iter++;
-    if(iter > m_maxIterations * n) break;
+    if(iter > maxIterations * n) break;
 
     start = end - 1;
-    while (start>0 && m_subdiag[start-1]!=0)
+    while (start>0 && subdiag[start-1]!=0)
       start--;
 
-    internal::tridiagonal_qr_step<MatrixType::Flags&RowMajorBit ? RowMajor : ColMajor>(diag.data(), m_subdiag.data(), start, end, computeEigenvectors ? m_eivec.data() : (Scalar*)0, n);
+    internal::tridiagonal_qr_step<MatrixType::Flags&RowMajorBit ? RowMajor : ColMajor>(diag.data(), subdiag.data(), start, end, computeEigenvectors ? eivec.data() : (Scalar*)0, n);
   }
-
-  if (iter <= m_maxIterations * n)
-    m_info = Success;
+  if (iter <= maxIterations * n)
+    info = Success;
   else
-    m_info = NoConvergence;
+    info = NoConvergence;
 
   // Sort eigenvalues and corresponding vectors.
   // TODO make the sort optional ?
   // TODO use a better sort algorithm !!
-  if (m_info == Success)
+  if (info == Success)
   {
     for (Index i = 0; i < n-1; ++i)
     {
       Index k;
-      m_eivalues.segment(i,n-i).minCoeff(&k);
+      diag.segment(i,n-i).minCoeff(&k);
       if (k > 0)
       {
-        std::swap(m_eivalues[i], m_eivalues[k+i]);
+        std::swap(diag[i], diag[k+i]);
         if(computeEigenvectors)
-          m_eivec.col(i).swap(m_eivec.col(k+i));
+          eivec.col(i).swap(eivec.col(k+i));
       }
     }
   }
-  
-  // scale back the eigen values
-  m_eivalues *= scale;
-
-  m_isInitialized = true;
-  m_eigenvectorsOk = computeEigenvectors;
-  return *this;
+  return info;
 }
-
-
-namespace internal {
   
 template<typename SolverType,int Size,bool IsComplex> struct direct_selfadjoint_eigenvalues
 {
+  EIGEN_DEVICE_FUNC
   static inline void run(SolverType& eig, const typename SolverType::MatrixType& A, int options)
   { eig.compute(A,options); }
 };
@@ -497,20 +556,22 @@ template<typename SolverType> struct direct_selfadjoint_eigenvalues<SolverType,3
   typedef typename SolverType::MatrixType MatrixType;
   typedef typename SolverType::RealVectorType VectorType;
   typedef typename SolverType::Scalar Scalar;
-  typedef typename MatrixType::Index Index;
+  typedef typename SolverType::EigenvectorsType EigenvectorsType;
   
+
   /** \internal
    * Computes the roots of the characteristic polynomial of \a m.
    * For numerical stability m.trace() should be near zero and to avoid over- or underflow m should be normalized.
    */
+  EIGEN_DEVICE_FUNC
   static inline void computeRoots(const MatrixType& m, VectorType& roots)
   {
-    using std::sqrt;
-    using std::atan2;
-    using std::cos;
-    using std::sin;
-    const Scalar s_inv3 = Scalar(1.0)/Scalar(3.0);
-    const Scalar s_sqrt3 = sqrt(Scalar(3.0));
+    EIGEN_USING_STD_MATH(sqrt)
+    EIGEN_USING_STD_MATH(atan2)
+    EIGEN_USING_STD_MATH(cos)
+    EIGEN_USING_STD_MATH(sin)
+    const Scalar s_inv3 = Scalar(1)/Scalar(3);
+    const Scalar s_sqrt3 = sqrt(Scalar(3));
 
     // The characteristic equation is x^3 - c2*x^2 + c1*x - c0 = 0.  The
     // eigenvalues are the roots to this equation, all guaranteed to be
@@ -523,14 +584,12 @@ template<typename SolverType> struct direct_selfadjoint_eigenvalues<SolverType,3
     // and in solving the equation for the roots in closed form.
     Scalar c2_over_3 = c2*s_inv3;
     Scalar a_over_3 = (c2*c2_over_3 - c1)*s_inv3;
-    if(a_over_3<Scalar(0))
-      a_over_3 = Scalar(0);
+    a_over_3 = numext::maxi(a_over_3, Scalar(0));
 
     Scalar half_b = Scalar(0.5)*(c0 + c2_over_3*(Scalar(2)*c2_over_3*c2_over_3 - c1));
 
     Scalar q = a_over_3*a_over_3*a_over_3 - half_b*half_b;
-    if(q<Scalar(0))
-      q = Scalar(0);
+    q = numext::maxi(q, Scalar(0));
 
     // Compute the eigenvalues by solving for the roots of the polynomial.
     Scalar rho = sqrt(a_over_3);
@@ -543,6 +602,7 @@ template<typename SolverType> struct direct_selfadjoint_eigenvalues<SolverType,3
     roots(2) = c2_over_3 + Scalar(2)*rho*cos_theta;
   }
 
+  EIGEN_DEVICE_FUNC
   static inline bool extract_kernel(MatrixType& mat, Ref<VectorType> res, Ref<VectorType> representative)
   {
     using std::abs;
@@ -562,6 +622,7 @@ template<typename SolverType> struct direct_selfadjoint_eigenvalues<SolverType,3
     return true;
   }
 
+  EIGEN_DEVICE_FUNC
   static inline void run(SolverType& solver, const MatrixType& mat, int options)
   {
     eigen_assert(mat.cols() == 3 && mat.cols() == mat.rows());
@@ -570,7 +631,7 @@ template<typename SolverType> struct direct_selfadjoint_eigenvalues<SolverType,3
             && "invalid option parameter");
     bool computeEigenvectors = (options&ComputeEigenvectors)==ComputeEigenvectors;
     
-    MatrixType& eivecs = solver.m_eivec;
+    EigenvectorsType& eivecs = solver.m_eivec;
     VectorType& eivals = solver.m_eivalues;
   
     // Shift the matrix to the mean eigenvalue and map the matrix coefficients to [-1:1] to avoid over- and underflow.
@@ -603,7 +664,7 @@ template<typename SolverType> struct direct_selfadjoint_eigenvalues<SolverType,3
         Index k(0), l(2);
         if(d0 > d1)
         {
-          std::swap(k,l);
+          numext::swap(k,l);
           d0 = d1;
         }
 
@@ -647,12 +708,15 @@ template<typename SolverType> struct direct_selfadjoint_eigenvalues<SolverType,3
 };
 
 // 2x2 direct eigenvalues decomposition, code from Hauke Heibel
-template<typename SolverType> struct direct_selfadjoint_eigenvalues<SolverType,2,false>
+template<typename SolverType> 
+struct direct_selfadjoint_eigenvalues<SolverType,2,false>
 {
   typedef typename SolverType::MatrixType MatrixType;
   typedef typename SolverType::RealVectorType VectorType;
   typedef typename SolverType::Scalar Scalar;
+  typedef typename SolverType::EigenvectorsType EigenvectorsType;
   
+  EIGEN_DEVICE_FUNC
   static inline void computeRoots(const MatrixType& m, VectorType& roots)
   {
     using std::sqrt;
@@ -662,28 +726,33 @@ template<typename SolverType> struct direct_selfadjoint_eigenvalues<SolverType,2
     roots(1) = t1 + t0;
   }
   
+  EIGEN_DEVICE_FUNC
   static inline void run(SolverType& solver, const MatrixType& mat, int options)
   {
-    using std::sqrt;
-    using std::abs;
-
+    EIGEN_USING_STD_MATH(sqrt);
+    EIGEN_USING_STD_MATH(abs);
+    
     eigen_assert(mat.cols() == 2 && mat.cols() == mat.rows());
     eigen_assert((options&~(EigVecMask|GenEigMask))==0
             && (options&EigVecMask)!=EigVecMask
             && "invalid option parameter");
     bool computeEigenvectors = (options&ComputeEigenvectors)==ComputeEigenvectors;
     
-    MatrixType& eivecs = solver.m_eivec;
+    EigenvectorsType& eivecs = solver.m_eivec;
     VectorType& eivals = solver.m_eivalues;
   
-    // map the matrix coefficients to [-1:1] to avoid over- and underflow.
-    Scalar scale = mat.cwiseAbs().maxCoeff();
-    scale = (std::max)(scale,Scalar(1));
-    MatrixType scaledMat = mat / scale;
-    
+    // Shift the matrix to the mean eigenvalue and map the matrix coefficients to [-1:1] to avoid over- and underflow.
+    Scalar shift = mat.trace() / Scalar(2);
+    MatrixType scaledMat = mat;
+    scaledMat.coeffRef(0,1) = mat.coeff(1,0);
+    scaledMat.diagonal().array() -= shift;
+    Scalar scale = scaledMat.cwiseAbs().maxCoeff();
+    if(scale > Scalar(0))
+      scaledMat /= scale;
+
     // Compute the eigenvalues
     computeRoots(scaledMat,eivals);
-    
+
     // compute the eigen vectors
     if(computeEigenvectors)
     {
@@ -711,10 +780,11 @@ template<typename SolverType> struct direct_selfadjoint_eigenvalues<SolverType,2
         eivecs.col(0) << eivecs.col(1).unitOrthogonal();
       }
     }
-    
+
     // Rescale back to the original size.
     eivals *= scale;
-    
+    eivals.array() += shift;
+
     solver.m_info = Success;
     solver.m_isInitialized = true;
     solver.m_eigenvectorsOk = computeEigenvectors;
@@ -724,6 +794,7 @@ template<typename SolverType> struct direct_selfadjoint_eigenvalues<SolverType,2
 }
 
 template<typename MatrixType>
+EIGEN_DEVICE_FUNC
 SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<MatrixType>
 ::computeDirect(const MatrixType& matrix, int options)
 {
@@ -733,6 +804,7 @@ SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<MatrixType>
 
 namespace internal {
 template<int StorageOrder,typename RealScalar, typename Scalar, typename Index>
+EIGEN_DEVICE_FUNC
 static void tridiagonal_qr_step(RealScalar* diag, RealScalar* subdiag, Index start, Index end, Scalar* matrixQ, Index n)
 {
   using std::abs;
@@ -744,14 +816,14 @@ static void tridiagonal_qr_step(RealScalar* diag, RealScalar* subdiag, Index sta
 //   RealScalar mu = diag[end] - e2 / (td + (td>0 ? 1 : -1) * sqrt(td*td + e2));
   // This explain the following, somewhat more complicated, version:
   RealScalar mu = diag[end];
-  if(td==0)
+  if(td==RealScalar(0))
     mu -= abs(e);
   else
   {
     RealScalar e2 = numext::abs2(subdiag[end-1]);
     RealScalar h = numext::hypot(td,e);
-    if(e2==0)  mu -= (e / (td + (td>0 ? 1 : -1))) * (e / h);
-    else       mu -= e2 / (td + (td>0 ? h : -h));
+    if(e2==RealScalar(0)) mu -= (e / (td + (td>RealScalar(0) ? RealScalar(1) : RealScalar(-1)))) * (e / h);
+    else                  mu -= e2 / (td + (td>RealScalar(0) ? h : -h));
   }
   
   RealScalar x = diag[start] - mu;
diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h
index 17c0dadd2..3891cf883 100644
--- a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h
+++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h
@@ -25,38 +25,36 @@
  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
  ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
+ *   Content : Eigen bindings to LAPACKe
  *    Self-adjoint eigenvalues/eigenvectors.
  ********************************************************************************
 */
 
-#ifndef EIGEN_SAEIGENSOLVER_MKL_H
-#define EIGEN_SAEIGENSOLVER_MKL_H
-
-#include "Eigen/src/Core/util/MKL_support.h"
+#ifndef EIGEN_SAEIGENSOLVER_LAPACKE_H
+#define EIGEN_SAEIGENSOLVER_LAPACKE_H
 
 namespace Eigen { 
 
-/** \internal Specialization for the data types supported by MKL */
+/** \internal Specialization for the data types supported by LAPACKe */
 
-#define EIGEN_MKL_EIG_SELFADJ(EIGTYPE, MKLTYPE, MKLRTYPE, MKLNAME, EIGCOLROW, MKLCOLROW ) \
-template<> inline \
+#define EIGEN_LAPACKE_EIG_SELFADJ(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_NAME, EIGCOLROW, LAPACKE_COLROW ) \
+template<> template<typename InputType> inline \
 SelfAdjointEigenSolver<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >& \
-SelfAdjointEigenSolver<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(const Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW>& matrix, int options) \
+SelfAdjointEigenSolver<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(const EigenBase<InputType>& matrix, int options) \
 { \
   eigen_assert(matrix.cols() == matrix.rows()); \
   eigen_assert((options&~(EigVecMask|GenEigMask))==0 \
           && (options&EigVecMask)!=EigVecMask \
           && "invalid option parameter"); \
   bool computeEigenvectors = (options&ComputeEigenvectors)==ComputeEigenvectors; \
-  lapack_int n = matrix.cols(), lda, matrix_order, info; \
+  lapack_int n = internal::convert_index<lapack_int>(matrix.cols()), lda, matrix_order, info; \
   m_eivalues.resize(n,1); \
   m_subdiag.resize(n-1); \
   m_eivec = matrix; \
 \
   if(n==1) \
   { \
-    m_eivalues.coeffRef(0,0) = numext::real(matrix.coeff(0,0)); \
+    m_eivalues.coeffRef(0,0) = numext::real(m_eivec.coeff(0,0)); \
     if(computeEigenvectors) m_eivec.setOnes(n,n); \
     m_info = Success; \
     m_isInitialized = true; \
@@ -64,12 +62,12 @@ SelfAdjointEigenSolver<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(c
     return *this; \
   } \
 \
-  lda = matrix.outerStride(); \
-  matrix_order=MKLCOLROW; \
+  lda = internal::convert_index<lapack_int>(m_eivec.outerStride()); \
+  matrix_order=LAPACKE_COLROW; \
   char jobz, uplo='L'/*, range='A'*/; \
   jobz = computeEigenvectors ? 'V' : 'N'; \
 \
-  info = LAPACKE_##MKLNAME( matrix_order, jobz, uplo, n, (MKLTYPE*)m_eivec.data(), lda, (MKLRTYPE*)m_eivalues.data() ); \
+  info = LAPACKE_##LAPACKE_NAME( matrix_order, jobz, uplo, n, (LAPACKE_TYPE*)m_eivec.data(), lda, (LAPACKE_RTYPE*)m_eivalues.data() ); \
   m_info = (info==0) ? Success : NoConvergence; \
   m_isInitialized = true; \
   m_eigenvectorsOk = computeEigenvectors; \
@@ -77,15 +75,15 @@ SelfAdjointEigenSolver<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(c
 }
 
 
-EIGEN_MKL_EIG_SELFADJ(double,   double,        double, dsyev, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_EIG_SELFADJ(float,    float,         float,  ssyev, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_EIG_SELFADJ(dcomplex, MKL_Complex16, double, zheev, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_EIG_SELFADJ(scomplex, MKL_Complex8,  float,  cheev, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_EIG_SELFADJ(double,   double,                double, dsyev, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_EIG_SELFADJ(float,    float,                 float,  ssyev, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_EIG_SELFADJ(dcomplex, lapack_complex_double, double, zheev, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_EIG_SELFADJ(scomplex, lapack_complex_float,  float,  cheev, ColMajor, LAPACK_COL_MAJOR)
 
-EIGEN_MKL_EIG_SELFADJ(double,   double,        double, dsyev, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_MKL_EIG_SELFADJ(float,    float,         float,  ssyev, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_MKL_EIG_SELFADJ(dcomplex, MKL_Complex16, double, zheev, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_MKL_EIG_SELFADJ(scomplex, MKL_Complex8,  float,  cheev, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_EIG_SELFADJ(double,   double,                double, dsyev, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_EIG_SELFADJ(float,    float,                 float,  ssyev, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_EIG_SELFADJ(dcomplex, lapack_complex_double, double, zheev, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_EIG_SELFADJ(scomplex, lapack_complex_float,  float,  cheev, RowMajor, LAPACK_ROW_MAJOR)
 
 } // end namespace Eigen
 
diff --git a/Eigen/src/Eigenvalues/Tridiagonalization.h b/Eigen/src/Eigenvalues/Tridiagonalization.h
index 192278d68..1d102c17b 100644
--- a/Eigen/src/Eigenvalues/Tridiagonalization.h
+++ b/Eigen/src/Eigenvalues/Tridiagonalization.h
@@ -18,8 +18,10 @@ namespace internal {
 template<typename MatrixType> struct TridiagonalizationMatrixTReturnType;
 template<typename MatrixType>
 struct traits<TridiagonalizationMatrixTReturnType<MatrixType> >
+  : public traits<typename MatrixType::PlainObject>
 {
-  typedef typename MatrixType::PlainObject ReturnType;
+  typedef typename MatrixType::PlainObject ReturnType; // FIXME shall it be a BandMatrix?
+  enum { Flags = 0 };
 };
 
 template<typename MatrixType, typename CoeffVectorType>
@@ -67,7 +69,7 @@ template<typename _MatrixType> class Tridiagonalization
 
     typedef typename MatrixType::Scalar Scalar;
     typedef typename NumTraits<Scalar>::Real RealScalar;
-    typedef typename MatrixType::Index Index;
+    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
 
     enum {
       Size = MatrixType::RowsAtCompileTime,
@@ -89,10 +91,8 @@ template<typename _MatrixType> class Tridiagonalization
             >::type DiagonalReturnType;
 
     typedef typename internal::conditional<NumTraits<Scalar>::IsComplex,
-              typename internal::add_const_on_value_type<typename Diagonal<
-                Block<const MatrixType,SizeMinusOne,SizeMinusOne> >::RealReturnType>::type,
-              const Diagonal<
-                Block<const MatrixType,SizeMinusOne,SizeMinusOne> >
+              typename internal::add_const_on_value_type<typename Diagonal<const MatrixType, -1>::RealReturnType>::type,
+              const Diagonal<const MatrixType, -1>
             >::type SubDiagonalReturnType;
 
     /** \brief Return type of matrixQ() */
@@ -110,7 +110,7 @@ template<typename _MatrixType> class Tridiagonalization
       *
       * \sa compute() for an example.
       */
-    Tridiagonalization(Index size = Size==Dynamic ? 2 : Size)
+    explicit Tridiagonalization(Index size = Size==Dynamic ? 2 : Size)
       : m_matrix(size,size),
         m_hCoeffs(size > 1 ? size-1 : 1),
         m_isInitialized(false)
@@ -126,8 +126,9 @@ template<typename _MatrixType> class Tridiagonalization
       * Example: \include Tridiagonalization_Tridiagonalization_MatrixType.cpp
       * Output: \verbinclude Tridiagonalization_Tridiagonalization_MatrixType.out
       */
-    Tridiagonalization(const MatrixType& matrix)
-      : m_matrix(matrix),
+    template<typename InputType>
+    explicit Tridiagonalization(const EigenBase<InputType>& matrix)
+      : m_matrix(matrix.derived()),
         m_hCoeffs(matrix.cols() > 1 ? matrix.cols()-1 : 1),
         m_isInitialized(false)
     {
@@ -152,9 +153,10 @@ template<typename _MatrixType> class Tridiagonalization
       * Example: \include Tridiagonalization_compute.cpp
       * Output: \verbinclude Tridiagonalization_compute.out
       */
-    Tridiagonalization& compute(const MatrixType& matrix)
+    template<typename InputType>
+    Tridiagonalization& compute(const EigenBase<InputType>& matrix)
     {
-      m_matrix = matrix;
+      m_matrix = matrix.derived();
       m_hCoeffs.resize(matrix.rows()-1, 1);
       internal::tridiagonalization_inplace(m_matrix, m_hCoeffs);
       m_isInitialized = true;
@@ -305,7 +307,7 @@ typename Tridiagonalization<MatrixType>::DiagonalReturnType
 Tridiagonalization<MatrixType>::diagonal() const
 {
   eigen_assert(m_isInitialized && "Tridiagonalization is not initialized.");
-  return m_matrix.diagonal();
+  return m_matrix.diagonal().real();
 }
 
 template<typename MatrixType>
@@ -313,8 +315,7 @@ typename Tridiagonalization<MatrixType>::SubDiagonalReturnType
 Tridiagonalization<MatrixType>::subDiagonal() const
 {
   eigen_assert(m_isInitialized && "Tridiagonalization is not initialized.");
-  Index n = m_matrix.rows();
-  return Block<const MatrixType,SizeMinusOne,SizeMinusOne>(m_matrix, 1, 0, n-1,n-1).diagonal();
+  return m_matrix.template diagonal<-1>().real();
 }
 
 namespace internal {
@@ -346,7 +347,6 @@ template<typename MatrixType, typename CoeffVectorType>
 void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs)
 {
   using numext::conj;
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::Scalar Scalar;
   typedef typename MatrixType::RealScalar RealScalar;
   Index n = matA.rows();
@@ -367,10 +367,10 @@ void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs)
     hCoeffs.tail(n-i-1).noalias() = (matA.bottomRightCorner(remainingSize,remainingSize).template selfadjointView<Lower>()
                                   * (conj(h) * matA.col(i).tail(remainingSize)));
 
-    hCoeffs.tail(n-i-1) += (conj(h)*Scalar(-0.5)*(hCoeffs.tail(remainingSize).dot(matA.col(i).tail(remainingSize)))) * matA.col(i).tail(n-i-1);
+    hCoeffs.tail(n-i-1) += (conj(h)*RealScalar(-0.5)*(hCoeffs.tail(remainingSize).dot(matA.col(i).tail(remainingSize)))) * matA.col(i).tail(n-i-1);
 
     matA.bottomRightCorner(remainingSize, remainingSize).template selfadjointView<Lower>()
-      .rankUpdate(matA.col(i).tail(remainingSize), hCoeffs.tail(remainingSize), -1);
+      .rankUpdate(matA.col(i).tail(remainingSize), hCoeffs.tail(remainingSize), Scalar(-1));
 
     matA.col(i).coeffRef(i+1) = beta;
     hCoeffs.coeffRef(i) = h;
@@ -438,7 +438,6 @@ struct tridiagonalization_inplace_selector
 {
   typedef typename Tridiagonalization<MatrixType>::CoeffVectorType CoeffVectorType;
   typedef typename Tridiagonalization<MatrixType>::HouseholderSequenceType HouseholderSequenceType;
-  typedef typename MatrixType::Index Index;
   template<typename DiagonalType, typename SubDiagonalType>
   static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ)
   {
@@ -467,9 +466,10 @@ struct tridiagonalization_inplace_selector<MatrixType,3,false>
   static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ)
   {
     using std::sqrt;
+    const RealScalar tol = (std::numeric_limits<RealScalar>::min)();
     diag[0] = mat(0,0);
     RealScalar v1norm2 = numext::abs2(mat(2,0));
-    if(v1norm2 == RealScalar(0))
+    if(v1norm2 <= tol)
     {
       diag[1] = mat(1,1);
       diag[2] = mat(2,2);
@@ -526,7 +526,6 @@ struct tridiagonalization_inplace_selector<MatrixType,1,IsComplex>
 template<typename MatrixType> struct TridiagonalizationMatrixTReturnType
 : public ReturnByValue<TridiagonalizationMatrixTReturnType<MatrixType> >
 {
-    typedef typename MatrixType::Index Index;
   public:
     /** \brief Constructor.
       *
diff --git a/Eigen/src/Geometry/AlignedBox.h b/Eigen/src/Geometry/AlignedBox.h
index b226336de..066eae4f9 100644
--- a/Eigen/src/Geometry/AlignedBox.h
+++ b/Eigen/src/Geometry/AlignedBox.h
@@ -34,10 +34,11 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
   enum { AmbientDimAtCompileTime = _AmbientDim };
   typedef _Scalar                                   Scalar;
   typedef NumTraits<Scalar>                         ScalarTraits;
-  typedef DenseIndex                                Index;
+  typedef Eigen::Index                              Index; ///< \deprecated since Eigen 3.3
   typedef typename ScalarTraits::Real               RealScalar;
-  typedef typename ScalarTraits::NonInteger      NonInteger;
+  typedef typename ScalarTraits::NonInteger         NonInteger;
   typedef Matrix<Scalar,AmbientDimAtCompileTime,1>  VectorType;
+  typedef CwiseBinaryOp<internal::scalar_sum_op<Scalar>, const VectorType, const VectorType> VectorTypeSum;
 
   /** Define constants to name the corners of a 1D, 2D or 3D axis aligned bounding box */
   enum CornerType
@@ -61,77 +62,76 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
 
 
   /** Default constructor initializing a null box. */
-  inline AlignedBox()
+  EIGEN_DEVICE_FUNC inline AlignedBox()
   { if (AmbientDimAtCompileTime!=Dynamic) setEmpty(); }
 
   /** Constructs a null box with \a _dim the dimension of the ambient space. */
-  inline explicit AlignedBox(Index _dim) : m_min(_dim), m_max(_dim)
+  EIGEN_DEVICE_FUNC inline explicit AlignedBox(Index _dim) : m_min(_dim), m_max(_dim)
   { setEmpty(); }
 
   /** Constructs a box with extremities \a _min and \a _max.
    * \warning If either component of \a _min is larger than the same component of \a _max, the constructed box is empty. */
   template<typename OtherVectorType1, typename OtherVectorType2>
-  inline AlignedBox(const OtherVectorType1& _min, const OtherVectorType2& _max) : m_min(_min), m_max(_max) {}
+  EIGEN_DEVICE_FUNC inline AlignedBox(const OtherVectorType1& _min, const OtherVectorType2& _max) : m_min(_min), m_max(_max) {}
 
   /** Constructs a box containing a single point \a p. */
   template<typename Derived>
-  inline explicit AlignedBox(const MatrixBase<Derived>& p) : m_min(p), m_max(m_min)
+  EIGEN_DEVICE_FUNC inline explicit AlignedBox(const MatrixBase<Derived>& p) : m_min(p), m_max(m_min)
   { }
 
-  ~AlignedBox() {}
+  EIGEN_DEVICE_FUNC ~AlignedBox() {}
 
   /** \returns the dimension in which the box holds */
-  inline Index dim() const { return AmbientDimAtCompileTime==Dynamic ? m_min.size() : Index(AmbientDimAtCompileTime); }
+  EIGEN_DEVICE_FUNC inline Index dim() const { return AmbientDimAtCompileTime==Dynamic ? m_min.size() : Index(AmbientDimAtCompileTime); }
 
   /** \deprecated use isEmpty() */
-  inline bool isNull() const { return isEmpty(); }
+  EIGEN_DEVICE_FUNC inline bool isNull() const { return isEmpty(); }
 
   /** \deprecated use setEmpty() */
-  inline void setNull() { setEmpty(); }
+  EIGEN_DEVICE_FUNC inline void setNull() { setEmpty(); }
 
   /** \returns true if the box is empty.
    * \sa setEmpty */
-  inline bool isEmpty() const { return (m_min.array() > m_max.array()).any(); }
+  EIGEN_DEVICE_FUNC inline bool isEmpty() const { return (m_min.array() > m_max.array()).any(); }
 
   /** Makes \c *this an empty box.
    * \sa isEmpty */
-  inline void setEmpty()
+  EIGEN_DEVICE_FUNC inline void setEmpty()
   {
     m_min.setConstant( ScalarTraits::highest() );
     m_max.setConstant( ScalarTraits::lowest() );
   }
 
   /** \returns the minimal corner */
-  inline const VectorType& (min)() const { return m_min; }
+  EIGEN_DEVICE_FUNC inline const VectorType& (min)() const { return m_min; }
   /** \returns a non const reference to the minimal corner */
-  inline VectorType& (min)() { return m_min; }
+  EIGEN_DEVICE_FUNC inline VectorType& (min)() { return m_min; }
   /** \returns the maximal corner */
-  inline const VectorType& (max)() const { return m_max; }
+  EIGEN_DEVICE_FUNC inline const VectorType& (max)() const { return m_max; }
   /** \returns a non const reference to the maximal corner */
-  inline VectorType& (max)() { return m_max; }
+  EIGEN_DEVICE_FUNC inline VectorType& (max)() { return m_max; }
 
   /** \returns the center of the box */
-  inline const CwiseUnaryOp<internal::scalar_quotient1_op<Scalar>,
-                            const CwiseBinaryOp<internal::scalar_sum_op<Scalar>, const VectorType, const VectorType> >
+  EIGEN_DEVICE_FUNC inline const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(VectorTypeSum, RealScalar, quotient)
   center() const
-  { return (m_min+m_max)/2; }
+  { return (m_min+m_max)/RealScalar(2); }
 
   /** \returns the lengths of the sides of the bounding box.
     * Note that this function does not get the same
     * result for integral or floating scalar types: see
     */
-  inline const CwiseBinaryOp< internal::scalar_difference_op<Scalar>, const VectorType, const VectorType> sizes() const
+  EIGEN_DEVICE_FUNC inline const CwiseBinaryOp< internal::scalar_difference_op<Scalar,Scalar>, const VectorType, const VectorType> sizes() const
   { return m_max - m_min; }
 
   /** \returns the volume of the bounding box */
-  inline Scalar volume() const
+  EIGEN_DEVICE_FUNC inline Scalar volume() const
   { return sizes().prod(); }
 
   /** \returns an expression for the bounding box diagonal vector
     * if the length of the diagonal is needed: diagonal().norm()
     * will provide it.
     */
-  inline CwiseBinaryOp< internal::scalar_difference_op<Scalar>, const VectorType, const VectorType> diagonal() const
+  EIGEN_DEVICE_FUNC inline CwiseBinaryOp< internal::scalar_difference_op<Scalar,Scalar>, const VectorType, const VectorType> diagonal() const
   { return sizes(); }
 
   /** \returns the vertex of the bounding box at the corner defined by
@@ -143,7 +143,7 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
     * For 3D bounding boxes, the following names are added:
     * BottomLeftCeil, BottomRightCeil, TopLeftCeil, TopRightCeil.
     */
-  inline VectorType corner(CornerType corner) const
+  EIGEN_DEVICE_FUNC inline VectorType corner(CornerType corner) const
   {
     EIGEN_STATIC_ASSERT(_AmbientDim <= 3, THIS_METHOD_IS_ONLY_FOR_VECTORS_OF_A_SPECIFIC_SIZE);
 
@@ -161,9 +161,9 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
 
   /** \returns a random point inside the bounding box sampled with
    * a uniform distribution */
-  inline VectorType sample() const
+  EIGEN_DEVICE_FUNC inline VectorType sample() const
   {
-    VectorType r;
+    VectorType r(dim());
     for(Index d=0; d<dim(); ++d)
     {
       if(!ScalarTraits::IsInteger)
@@ -179,27 +179,27 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
 
   /** \returns true if the point \a p is inside the box \c *this. */
   template<typename Derived>
-  inline bool contains(const MatrixBase<Derived>& p) const
+  EIGEN_DEVICE_FUNC inline bool contains(const MatrixBase<Derived>& p) const
   {
-    typename internal::nested<Derived,2>::type p_n(p.derived());
+    typename internal::nested_eval<Derived,2>::type p_n(p.derived());
     return (m_min.array()<=p_n.array()).all() && (p_n.array()<=m_max.array()).all();
   }
 
   /** \returns true if the box \a b is entirely inside the box \c *this. */
-  inline bool contains(const AlignedBox& b) const
+  EIGEN_DEVICE_FUNC inline bool contains(const AlignedBox& b) const
   { return (m_min.array()<=(b.min)().array()).all() && ((b.max)().array()<=m_max.array()).all(); }
 
   /** \returns true if the box \a b is intersecting the box \c *this.
    * \sa intersection, clamp */
-  inline bool intersects(const AlignedBox& b) const
+  EIGEN_DEVICE_FUNC inline bool intersects(const AlignedBox& b) const
   { return (m_min.array()<=(b.max)().array()).all() && ((b.min)().array()<=m_max.array()).all(); }
 
   /** Extends \c *this such that it contains the point \a p and returns a reference to \c *this.
    * \sa extend(const AlignedBox&) */
   template<typename Derived>
-  inline AlignedBox& extend(const MatrixBase<Derived>& p)
+  EIGEN_DEVICE_FUNC inline AlignedBox& extend(const MatrixBase<Derived>& p)
   {
-    typename internal::nested<Derived,2>::type p_n(p.derived());
+    typename internal::nested_eval<Derived,2>::type p_n(p.derived());
     m_min = m_min.cwiseMin(p_n);
     m_max = m_max.cwiseMax(p_n);
     return *this;
@@ -207,7 +207,7 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
 
   /** Extends \c *this such that it contains the box \a b and returns a reference to \c *this.
    * \sa merged, extend(const MatrixBase&) */
-  inline AlignedBox& extend(const AlignedBox& b)
+  EIGEN_DEVICE_FUNC inline AlignedBox& extend(const AlignedBox& b)
   {
     m_min = m_min.cwiseMin(b.m_min);
     m_max = m_max.cwiseMax(b.m_max);
@@ -217,7 +217,7 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
   /** Clamps \c *this by the box \a b and returns a reference to \c *this.
    * \note If the boxes don't intersect, the resulting box is empty.
    * \sa intersection(), intersects() */
-  inline AlignedBox& clamp(const AlignedBox& b)
+  EIGEN_DEVICE_FUNC inline AlignedBox& clamp(const AlignedBox& b)
   {
     m_min = m_min.cwiseMax(b.m_min);
     m_max = m_max.cwiseMin(b.m_max);
@@ -227,20 +227,20 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
   /** Returns an AlignedBox that is the intersection of \a b and \c *this
    * \note If the boxes don't intersect, the resulting box is empty.
    * \sa intersects(), clamp, contains()  */
-  inline AlignedBox intersection(const AlignedBox& b) const
+  EIGEN_DEVICE_FUNC inline AlignedBox intersection(const AlignedBox& b) const
   {return AlignedBox(m_min.cwiseMax(b.m_min), m_max.cwiseMin(b.m_max)); }
 
   /** Returns an AlignedBox that is the union of \a b and \c *this.
    * \note Merging with an empty box may result in a box bigger than \c *this. 
    * \sa extend(const AlignedBox&) */
-  inline AlignedBox merged(const AlignedBox& b) const
+  EIGEN_DEVICE_FUNC inline AlignedBox merged(const AlignedBox& b) const
   { return AlignedBox(m_min.cwiseMin(b.m_min), m_max.cwiseMax(b.m_max)); }
 
   /** Translate \c *this by the vector \a t and returns a reference to \c *this. */
   template<typename Derived>
-  inline AlignedBox& translate(const MatrixBase<Derived>& a_t)
+  EIGEN_DEVICE_FUNC inline AlignedBox& translate(const MatrixBase<Derived>& a_t)
   {
-    const typename internal::nested<Derived,2>::type t(a_t.derived());
+    const typename internal::nested_eval<Derived,2>::type t(a_t.derived());
     m_min += t;
     m_max += t;
     return *this;
@@ -251,28 +251,28 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
     * \sa exteriorDistance(const MatrixBase&), squaredExteriorDistance(const AlignedBox&)
     */
   template<typename Derived>
-  inline Scalar squaredExteriorDistance(const MatrixBase<Derived>& p) const;
+  EIGEN_DEVICE_FUNC inline Scalar squaredExteriorDistance(const MatrixBase<Derived>& p) const;
 
   /** \returns the squared distance between the boxes \a b and \c *this,
     * and zero if the boxes intersect.
     * \sa exteriorDistance(const AlignedBox&), squaredExteriorDistance(const MatrixBase&)
     */
-  inline Scalar squaredExteriorDistance(const AlignedBox& b) const;
+  EIGEN_DEVICE_FUNC inline Scalar squaredExteriorDistance(const AlignedBox& b) const;
 
   /** \returns the distance between the point \a p and the box \c *this,
     * and zero if \a p is inside the box.
     * \sa squaredExteriorDistance(const MatrixBase&), exteriorDistance(const AlignedBox&)
     */
   template<typename Derived>
-  inline NonInteger exteriorDistance(const MatrixBase<Derived>& p) const
-  { using std::sqrt; return sqrt(NonInteger(squaredExteriorDistance(p))); }
+  EIGEN_DEVICE_FUNC inline NonInteger exteriorDistance(const MatrixBase<Derived>& p) const
+  { EIGEN_USING_STD_MATH(sqrt) return sqrt(NonInteger(squaredExteriorDistance(p))); }
 
   /** \returns the distance between the boxes \a b and \c *this,
     * and zero if the boxes intersect.
     * \sa squaredExteriorDistance(const AlignedBox&), exteriorDistance(const MatrixBase&)
     */
-  inline NonInteger exteriorDistance(const AlignedBox& b) const
-  { using std::sqrt; return sqrt(NonInteger(squaredExteriorDistance(b))); }
+  EIGEN_DEVICE_FUNC inline NonInteger exteriorDistance(const AlignedBox& b) const
+  { EIGEN_USING_STD_MATH(sqrt) return sqrt(NonInteger(squaredExteriorDistance(b))); }
 
   /** \returns \c *this with scalar type casted to \a NewScalarType
     *
@@ -280,7 +280,7 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
     * then this function smartly returns a const reference to \c *this.
     */
   template<typename NewScalarType>
-  inline typename internal::cast_return_type<AlignedBox,
+  EIGEN_DEVICE_FUNC inline typename internal::cast_return_type<AlignedBox,
            AlignedBox<NewScalarType,AmbientDimAtCompileTime> >::type cast() const
   {
     return typename internal::cast_return_type<AlignedBox,
@@ -289,7 +289,7 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
 
   /** Copy constructor with scalar type conversion */
   template<typename OtherScalarType>
-  inline explicit AlignedBox(const AlignedBox<OtherScalarType,AmbientDimAtCompileTime>& other)
+  EIGEN_DEVICE_FUNC inline explicit AlignedBox(const AlignedBox<OtherScalarType,AmbientDimAtCompileTime>& other)
   {
     m_min = (other.min)().template cast<Scalar>();
     m_max = (other.max)().template cast<Scalar>();
@@ -299,7 +299,7 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
     * determined by \a prec.
     *
     * \sa MatrixBase::isApprox() */
-  bool isApprox(const AlignedBox& other, const RealScalar& prec = ScalarTraits::dummy_precision()) const
+  EIGEN_DEVICE_FUNC bool isApprox(const AlignedBox& other, const RealScalar& prec = ScalarTraits::dummy_precision()) const
   { return m_min.isApprox(other.m_min, prec) && m_max.isApprox(other.m_max, prec); }
 
 protected:
@@ -311,9 +311,9 @@ protected:
 
 template<typename Scalar,int AmbientDim>
 template<typename Derived>
-inline Scalar AlignedBox<Scalar,AmbientDim>::squaredExteriorDistance(const MatrixBase<Derived>& a_p) const
+EIGEN_DEVICE_FUNC inline Scalar AlignedBox<Scalar,AmbientDim>::squaredExteriorDistance(const MatrixBase<Derived>& a_p) const
 {
-  typename internal::nested<Derived,2*AmbientDim>::type p(a_p.derived());
+  typename internal::nested_eval<Derived,2*AmbientDim>::type p(a_p.derived());
   Scalar dist2(0);
   Scalar aux;
   for (Index k=0; k<dim(); ++k)
@@ -333,7 +333,7 @@ inline Scalar AlignedBox<Scalar,AmbientDim>::squaredExteriorDistance(const Matri
 }
 
 template<typename Scalar,int AmbientDim>
-inline Scalar AlignedBox<Scalar,AmbientDim>::squaredExteriorDistance(const AlignedBox& b) const
+EIGEN_DEVICE_FUNC inline Scalar AlignedBox<Scalar,AmbientDim>::squaredExteriorDistance(const AlignedBox& b) const
 {
   Scalar dist2(0);
   Scalar aux;
diff --git a/Eigen/src/Geometry/AngleAxis.h b/Eigen/src/Geometry/AngleAxis.h
index 553d38c74..0af3c1b08 100644
--- a/Eigen/src/Geometry/AngleAxis.h
+++ b/Eigen/src/Geometry/AngleAxis.h
@@ -69,50 +69,61 @@ protected:
 public:
 
   /** Default constructor without initialization. */
-  AngleAxis() {}
+  EIGEN_DEVICE_FUNC AngleAxis() {}
   /** Constructs and initialize the angle-axis rotation from an \a angle in radian
     * and an \a axis which \b must \b be \b normalized.
     *
     * \warning If the \a axis vector is not normalized, then the angle-axis object
     *          represents an invalid rotation. */
   template<typename Derived>
+  EIGEN_DEVICE_FUNC 
   inline AngleAxis(const Scalar& angle, const MatrixBase<Derived>& axis) : m_axis(axis), m_angle(angle) {}
-  /** Constructs and initialize the angle-axis rotation from a quaternion \a q. */
-  template<typename QuatDerived> inline explicit AngleAxis(const QuaternionBase<QuatDerived>& q) { *this = q; }
+  /** Constructs and initialize the angle-axis rotation from a quaternion \a q.
+    * This function implicitly normalizes the quaternion \a q.
+    */
+  template<typename QuatDerived> 
+  EIGEN_DEVICE_FUNC inline explicit AngleAxis(const QuaternionBase<QuatDerived>& q) { *this = q; }
   /** Constructs and initialize the angle-axis rotation from a 3x3 rotation matrix. */
   template<typename Derived>
-  inline explicit AngleAxis(const MatrixBase<Derived>& m) { *this = m; }
+  EIGEN_DEVICE_FUNC inline explicit AngleAxis(const MatrixBase<Derived>& m) { *this = m; }
 
-  Scalar angle() const { return m_angle; }
-  Scalar& angle() { return m_angle; }
+  /** \returns the value of the rotation angle in radian */
+  EIGEN_DEVICE_FUNC Scalar angle() const { return m_angle; }
+  /** \returns a read-write reference to the stored angle in radian */
+  EIGEN_DEVICE_FUNC Scalar& angle() { return m_angle; }
 
-  const Vector3& axis() const { return m_axis; }
-  Vector3& axis() { return m_axis; }
+  /** \returns the rotation axis */
+  EIGEN_DEVICE_FUNC const Vector3& axis() const { return m_axis; }
+  /** \returns a read-write reference to the stored rotation axis.
+    *
+    * \warning The rotation axis must remain a \b unit vector.
+    */
+  EIGEN_DEVICE_FUNC Vector3& axis() { return m_axis; }
 
   /** Concatenates two rotations */
-  inline QuaternionType operator* (const AngleAxis& other) const
+  EIGEN_DEVICE_FUNC inline QuaternionType operator* (const AngleAxis& other) const
   { return QuaternionType(*this) * QuaternionType(other); }
 
   /** Concatenates two rotations */
-  inline QuaternionType operator* (const QuaternionType& other) const
+  EIGEN_DEVICE_FUNC inline QuaternionType operator* (const QuaternionType& other) const
   { return QuaternionType(*this) * other; }
 
   /** Concatenates two rotations */
-  friend inline QuaternionType operator* (const QuaternionType& a, const AngleAxis& b)
+  friend EIGEN_DEVICE_FUNC inline QuaternionType operator* (const QuaternionType& a, const AngleAxis& b)
   { return a * QuaternionType(b); }
 
   /** \returns the inverse rotation, i.e., an angle-axis with opposite rotation angle */
-  AngleAxis inverse() const
+  EIGEN_DEVICE_FUNC AngleAxis inverse() const
   { return AngleAxis(-m_angle, m_axis); }
 
   template<class QuatDerived>
-  AngleAxis& operator=(const QuaternionBase<QuatDerived>& q);
+  EIGEN_DEVICE_FUNC AngleAxis& operator=(const QuaternionBase<QuatDerived>& q);
   template<typename Derived>
-  AngleAxis& operator=(const MatrixBase<Derived>& m);
+  EIGEN_DEVICE_FUNC AngleAxis& operator=(const MatrixBase<Derived>& m);
 
   template<typename Derived>
-  AngleAxis& fromRotationMatrix(const MatrixBase<Derived>& m);
-  Matrix3 toRotationMatrix(void) const;
+  EIGEN_DEVICE_FUNC AngleAxis& fromRotationMatrix(const MatrixBase<Derived>& m);
+  EIGEN_DEVICE_FUNC Matrix3 toRotationMatrix(void) const;
 
   /** \returns \c *this with scalar type casted to \a NewScalarType
     *
@@ -120,24 +131,24 @@ public:
     * then this function smartly returns a const reference to \c *this.
     */
   template<typename NewScalarType>
-  inline typename internal::cast_return_type<AngleAxis,AngleAxis<NewScalarType> >::type cast() const
+  EIGEN_DEVICE_FUNC inline typename internal::cast_return_type<AngleAxis,AngleAxis<NewScalarType> >::type cast() const
   { return typename internal::cast_return_type<AngleAxis,AngleAxis<NewScalarType> >::type(*this); }
 
   /** Copy constructor with scalar type conversion */
   template<typename OtherScalarType>
-  inline explicit AngleAxis(const AngleAxis<OtherScalarType>& other)
+  EIGEN_DEVICE_FUNC inline explicit AngleAxis(const AngleAxis<OtherScalarType>& other)
   {
     m_axis = other.axis().template cast<Scalar>();
     m_angle = Scalar(other.angle());
   }
 
-  static inline const AngleAxis Identity() { return AngleAxis(0, Vector3::UnitX()); }
+  EIGEN_DEVICE_FUNC static inline const AngleAxis Identity() { return AngleAxis(Scalar(0), Vector3::UnitX()); }
 
   /** \returns \c true if \c *this is approximately equal to \a other, within the precision
     * determined by \a prec.
     *
     * \sa MatrixBase::isApprox() */
-  bool isApprox(const AngleAxis& other, const typename NumTraits<Scalar>::Real& prec = NumTraits<Scalar>::dummy_precision()) const
+  EIGEN_DEVICE_FUNC bool isApprox(const AngleAxis& other, const typename NumTraits<Scalar>::Real& prec = NumTraits<Scalar>::dummy_precision()) const
   { return m_axis.isApprox(other.m_axis, prec) && internal::isApprox(m_angle,other.m_angle, prec); }
 };
 
@@ -149,29 +160,32 @@ typedef AngleAxis<float> AngleAxisf;
 typedef AngleAxis<double> AngleAxisd;
 
 /** Set \c *this from a \b unit quaternion.
-  * The axis is normalized.
+  *
+  * The resulting axis is normalized, and the computed angle is in the [0,pi] range.
   * 
-  * \warning As any other method dealing with quaternion, if the input quaternion
-  *          is not normalized then the result is undefined.
+  * This function implicitly normalizes the quaternion \a q.
   */
 template<typename Scalar>
 template<typename QuatDerived>
-AngleAxis<Scalar>& AngleAxis<Scalar>::operator=(const QuaternionBase<QuatDerived>& q)
+EIGEN_DEVICE_FUNC AngleAxis<Scalar>& AngleAxis<Scalar>::operator=(const QuaternionBase<QuatDerived>& q)
 {
-  using std::acos;
-  using std::min;
-  using std::max;
-  using std::sqrt;
-  Scalar n2 = q.vec().squaredNorm();
-  if (n2 < NumTraits<Scalar>::dummy_precision()*NumTraits<Scalar>::dummy_precision())
+  EIGEN_USING_STD_MATH(atan2)
+  EIGEN_USING_STD_MATH(abs)
+  Scalar n = q.vec().norm();
+  if(n<NumTraits<Scalar>::epsilon())
+    n = q.vec().stableNorm();
+
+  if (n != Scalar(0))
   {
-    m_angle = 0;
-    m_axis << 1, 0, 0;
+    m_angle = Scalar(2)*atan2(n, abs(q.w()));
+    if(q.w() < 0)
+      n = -n;
+    m_axis  = q.vec() / n;
   }
   else
   {
-    m_angle = Scalar(2)*acos((min)((max)(Scalar(-1),q.w()),Scalar(1)));
-    m_axis = q.vec() / sqrt(n2);
+    m_angle = Scalar(0);
+    m_axis << Scalar(1), Scalar(0), Scalar(0);
   }
   return *this;
 }
@@ -180,7 +194,7 @@ AngleAxis<Scalar>& AngleAxis<Scalar>::operator=(const QuaternionBase<QuatDerived
   */
 template<typename Scalar>
 template<typename Derived>
-AngleAxis<Scalar>& AngleAxis<Scalar>::operator=(const MatrixBase<Derived>& mat)
+EIGEN_DEVICE_FUNC AngleAxis<Scalar>& AngleAxis<Scalar>::operator=(const MatrixBase<Derived>& mat)
 {
   // Since a direct conversion would not be really faster,
   // let's use the robust Quaternion implementation:
@@ -192,7 +206,7 @@ AngleAxis<Scalar>& AngleAxis<Scalar>::operator=(const MatrixBase<Derived>& mat)
 **/
 template<typename Scalar>
 template<typename Derived>
-AngleAxis<Scalar>& AngleAxis<Scalar>::fromRotationMatrix(const MatrixBase<Derived>& mat)
+EIGEN_DEVICE_FUNC AngleAxis<Scalar>& AngleAxis<Scalar>::fromRotationMatrix(const MatrixBase<Derived>& mat)
 {
   return *this = QuaternionType(mat);
 }
@@ -201,10 +215,10 @@ AngleAxis<Scalar>& AngleAxis<Scalar>::fromRotationMatrix(const MatrixBase<Derive
   */
 template<typename Scalar>
 typename AngleAxis<Scalar>::Matrix3
-AngleAxis<Scalar>::toRotationMatrix(void) const
+EIGEN_DEVICE_FUNC AngleAxis<Scalar>::toRotationMatrix(void) const
 {
-  using std::sin;
-  using std::cos;
+  EIGEN_USING_STD_MATH(sin)
+  EIGEN_USING_STD_MATH(cos)
   Matrix3 res;
   Vector3 sin_axis  = sin(m_angle) * m_axis;
   Scalar c = cos(m_angle);
diff --git a/Eigen/src/Geometry/CMakeLists.txt b/Eigen/src/Geometry/CMakeLists.txt
deleted file mode 100644
index f8f728b84..000000000
--- a/Eigen/src/Geometry/CMakeLists.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-FILE(GLOB Eigen_Geometry_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_Geometry_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Geometry COMPONENT Devel
-  )
-
-ADD_SUBDIRECTORY(arch)
diff --git a/Eigen/src/Geometry/EulerAngles.h b/Eigen/src/Geometry/EulerAngles.h
index 82802fb43..c633268af 100644
--- a/Eigen/src/Geometry/EulerAngles.h
+++ b/Eigen/src/Geometry/EulerAngles.h
@@ -33,12 +33,12 @@ namespace Eigen {
   * \sa class AngleAxis
   */
 template<typename Derived>
-inline Matrix<typename MatrixBase<Derived>::Scalar,3,1>
+EIGEN_DEVICE_FUNC inline Matrix<typename MatrixBase<Derived>::Scalar,3,1>
 MatrixBase<Derived>::eulerAngles(Index a0, Index a1, Index a2) const
 {
-  using std::atan2;
-  using std::sin;
-  using std::cos;
+  EIGEN_USING_STD_MATH(atan2)
+  EIGEN_USING_STD_MATH(sin)
+  EIGEN_USING_STD_MATH(cos)
   /* Implemented from Graphics Gems IV */
   EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(Derived,3,3)
 
@@ -55,7 +55,12 @@ MatrixBase<Derived>::eulerAngles(Index a0, Index a1, Index a2) const
     res[0] = atan2(coeff(j,i), coeff(k,i));
     if((odd && res[0]<Scalar(0)) || ((!odd) && res[0]>Scalar(0)))
     {
-      res[0] = (res[0] > Scalar(0)) ? res[0] - Scalar(M_PI) : res[0] + Scalar(M_PI);
+      if(res[0] > Scalar(0)) {
+        res[0] -= Scalar(EIGEN_PI);
+      }
+      else {
+        res[0] += Scalar(EIGEN_PI);
+      }
       Scalar s2 = Vector2(coeff(j,i), coeff(k,i)).norm();
       res[1] = -atan2(s2, coeff(i,i));
     }
@@ -84,7 +89,12 @@ MatrixBase<Derived>::eulerAngles(Index a0, Index a1, Index a2) const
     res[0] = atan2(coeff(j,k), coeff(k,k));
     Scalar c2 = Vector2(coeff(i,i), coeff(i,j)).norm();
     if((odd && res[0]<Scalar(0)) || ((!odd) && res[0]>Scalar(0))) {
-      res[0] = (res[0] > Scalar(0)) ? res[0] - Scalar(M_PI) : res[0] + Scalar(M_PI);
+      if(res[0] > Scalar(0)) {
+        res[0] -= Scalar(EIGEN_PI);
+      }
+      else {
+        res[0] += Scalar(EIGEN_PI);
+      }
       res[1] = atan2(-coeff(i,k), -c2);
     }
     else
diff --git a/Eigen/src/Geometry/Homogeneous.h b/Eigen/src/Geometry/Homogeneous.h
index 372e422b9..5f0da1a9e 100644
--- a/Eigen/src/Geometry/Homogeneous.h
+++ b/Eigen/src/Geometry/Homogeneous.h
@@ -34,7 +34,7 @@ struct traits<Homogeneous<MatrixType,Direction> >
  : traits<MatrixType>
 {
   typedef typename traits<MatrixType>::StorageKind StorageKind;
-  typedef typename nested<MatrixType>::type MatrixTypeNested;
+  typedef typename ref_selector<MatrixType>::type MatrixTypeNested;
   typedef typename remove_reference<MatrixTypeNested>::type _MatrixTypeNested;
   enum {
     RowsPlusOne = (MatrixType::RowsAtCompileTime != Dynamic) ?
@@ -48,8 +48,7 @@ struct traits<Homogeneous<MatrixType,Direction> >
     TmpFlags = _MatrixTypeNested::Flags & HereditaryBits,
     Flags = ColsAtCompileTime==1 ? (TmpFlags & ~RowMajorBit)
           : RowsAtCompileTime==1 ? (TmpFlags | RowMajorBit)
-          : TmpFlags,
-    CoeffReadCost = _MatrixTypeNested::CoeffReadCost
+          : TmpFlags
   };
 };
 
@@ -59,102 +58,117 @@ template<typename MatrixType,typename Rhs> struct homogeneous_right_product_impl
 } // end namespace internal
 
 template<typename MatrixType,int _Direction> class Homogeneous
-  : internal::no_assignment_operator, public MatrixBase<Homogeneous<MatrixType,_Direction> >
+  : public MatrixBase<Homogeneous<MatrixType,_Direction> >, internal::no_assignment_operator
 {
   public:
 
+    typedef MatrixType NestedExpression;
     enum { Direction = _Direction };
 
     typedef MatrixBase<Homogeneous> Base;
     EIGEN_DENSE_PUBLIC_INTERFACE(Homogeneous)
 
-    inline Homogeneous(const MatrixType& matrix)
+    EIGEN_DEVICE_FUNC explicit inline Homogeneous(const MatrixType& matrix)
       : m_matrix(matrix)
     {}
 
-    inline Index rows() const { return m_matrix.rows() + (int(Direction)==Vertical   ? 1 : 0); }
-    inline Index cols() const { return m_matrix.cols() + (int(Direction)==Horizontal ? 1 : 0); }
-
-    inline Scalar coeff(Index row, Index col) const
-    {
-      if(  (int(Direction)==Vertical   && row==m_matrix.rows())
-        || (int(Direction)==Horizontal && col==m_matrix.cols()))
-        return Scalar(1);
-      return m_matrix.coeff(row, col);
-    }
+    EIGEN_DEVICE_FUNC inline Index rows() const { return m_matrix.rows() + (int(Direction)==Vertical   ? 1 : 0); }
+    EIGEN_DEVICE_FUNC inline Index cols() const { return m_matrix.cols() + (int(Direction)==Horizontal ? 1 : 0); }
+    
+    EIGEN_DEVICE_FUNC const NestedExpression& nestedExpression() const { return m_matrix; }
 
     template<typename Rhs>
-    inline const internal::homogeneous_right_product_impl<Homogeneous,Rhs>
+    EIGEN_DEVICE_FUNC inline const Product<Homogeneous,Rhs>
     operator* (const MatrixBase<Rhs>& rhs) const
     {
       eigen_assert(int(Direction)==Horizontal);
-      return internal::homogeneous_right_product_impl<Homogeneous,Rhs>(m_matrix,rhs.derived());
+      return Product<Homogeneous,Rhs>(*this,rhs.derived());
     }
 
     template<typename Lhs> friend
-    inline const internal::homogeneous_left_product_impl<Homogeneous,Lhs>
+    EIGEN_DEVICE_FUNC inline const Product<Lhs,Homogeneous>
     operator* (const MatrixBase<Lhs>& lhs, const Homogeneous& rhs)
     {
       eigen_assert(int(Direction)==Vertical);
-      return internal::homogeneous_left_product_impl<Homogeneous,Lhs>(lhs.derived(),rhs.m_matrix);
+      return Product<Lhs,Homogeneous>(lhs.derived(),rhs);
     }
 
     template<typename Scalar, int Dim, int Mode, int Options> friend
-    inline const internal::homogeneous_left_product_impl<Homogeneous,Transform<Scalar,Dim,Mode,Options> >
+    EIGEN_DEVICE_FUNC inline const Product<Transform<Scalar,Dim,Mode,Options>, Homogeneous >
     operator* (const Transform<Scalar,Dim,Mode,Options>& lhs, const Homogeneous& rhs)
     {
       eigen_assert(int(Direction)==Vertical);
-      return internal::homogeneous_left_product_impl<Homogeneous,Transform<Scalar,Dim,Mode,Options> >(lhs,rhs.m_matrix);
+      return Product<Transform<Scalar,Dim,Mode,Options>, Homogeneous>(lhs,rhs);
+    }
+
+    template<typename Func>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::result_of<Func(Scalar,Scalar)>::type
+    redux(const Func& func) const
+    {
+      return func(m_matrix.redux(func), Scalar(1));
     }
 
   protected:
     typename MatrixType::Nested m_matrix;
 };
 
-/** \geometry_module
+/** \geometry_module \ingroup Geometry_Module
+  *
+  * \returns a vector expression that is one longer than the vector argument, with the value 1 symbolically appended as the last coefficient.
   *
-  * \return an expression of the equivalent homogeneous vector
+  * This can be used to convert affine coordinates to homogeneous coordinates.
   *
   * \only_for_vectors
   *
   * Example: \include MatrixBase_homogeneous.cpp
   * Output: \verbinclude MatrixBase_homogeneous.out
   *
-  * \sa class Homogeneous
+  * \sa VectorwiseOp::homogeneous(), class Homogeneous
   */
 template<typename Derived>
-inline typename MatrixBase<Derived>::HomogeneousReturnType
+EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::HomogeneousReturnType
 MatrixBase<Derived>::homogeneous() const
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
-  return derived();
+  return HomogeneousReturnType(derived());
 }
 
-/** \geometry_module
+/** \geometry_module \ingroup Geometry_Module
   *
-  * \returns a matrix expression of homogeneous column (or row) vectors
+  * \returns an expression where the value 1 is symbolically appended as the final coefficient to each column (or row) of the matrix.
+  *
+  * This can be used to convert affine coordinates to homogeneous coordinates.
   *
   * Example: \include VectorwiseOp_homogeneous.cpp
   * Output: \verbinclude VectorwiseOp_homogeneous.out
   *
-  * \sa MatrixBase::homogeneous() */
+  * \sa MatrixBase::homogeneous(), class Homogeneous */
 template<typename ExpressionType, int Direction>
-inline Homogeneous<ExpressionType,Direction>
+EIGEN_DEVICE_FUNC inline Homogeneous<ExpressionType,Direction>
 VectorwiseOp<ExpressionType,Direction>::homogeneous() const
 {
-  return _expression();
+  return HomogeneousReturnType(_expression());
 }
 
-/** \geometry_module
+/** \geometry_module \ingroup Geometry_Module
   *
-  * \returns an expression of the homogeneous normalized vector of \c *this
+  * \brief homogeneous normalization
+  *
+  * \returns a vector expression of the N-1 first coefficients of \c *this divided by that last coefficient.
+  *
+  * This can be used to convert homogeneous coordinates to affine coordinates.
+  *
+  * It is essentially a shortcut for:
+  * \code
+    this->head(this->size()-1)/this->coeff(this->size()-1);
+    \endcode
   *
   * Example: \include MatrixBase_hnormalized.cpp
   * Output: \verbinclude MatrixBase_hnormalized.out
   *
   * \sa VectorwiseOp::hnormalized() */
 template<typename Derived>
-inline const typename MatrixBase<Derived>::HNormalizedReturnType
+EIGEN_DEVICE_FUNC inline const typename MatrixBase<Derived>::HNormalizedReturnType
 MatrixBase<Derived>::hnormalized() const
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
@@ -163,16 +177,22 @@ MatrixBase<Derived>::hnormalized() const
     ColsAtCompileTime==1?1:size()-1) / coeff(size()-1);
 }
 
-/** \geometry_module
+/** \geometry_module \ingroup Geometry_Module
+  *
+  * \brief column or row-wise homogeneous normalization
   *
-  * \returns an expression of the homogeneous normalized vector of \c *this
+  * \returns an expression of the first N-1 coefficients of each column (or row) of \c *this divided by the last coefficient of each column (or row).
+  *
+  * This can be used to convert homogeneous coordinates to affine coordinates.
+  *
+  * It is conceptually equivalent to calling MatrixBase::hnormalized() to each column (or row) of \c *this.
   *
   * Example: \include DirectionWise_hnormalized.cpp
   * Output: \verbinclude DirectionWise_hnormalized.out
   *
   * \sa MatrixBase::hnormalized() */
 template<typename ExpressionType, int Direction>
-inline const typename VectorwiseOp<ExpressionType,Direction>::HNormalizedReturnType
+EIGEN_DEVICE_FUNC inline const typename VectorwiseOp<ExpressionType,Direction>::HNormalizedReturnType
 VectorwiseOp<ExpressionType,Direction>::hnormalized() const
 {
   return HNormalized_Block(_expression(),0,0,
@@ -196,7 +216,7 @@ template<typename MatrixOrTransformType>
 struct take_matrix_for_product
 {
   typedef MatrixOrTransformType type;
-  static const type& run(const type &x) { return x; }
+  EIGEN_DEVICE_FUNC static const type& run(const type &x) { return x; }
 };
 
 template<typename Scalar, int Dim, int Mode,int Options>
@@ -204,7 +224,7 @@ struct take_matrix_for_product<Transform<Scalar, Dim, Mode, Options> >
 {
   typedef Transform<Scalar, Dim, Mode, Options> TransformType;
   typedef typename internal::add_const<typename TransformType::ConstAffinePart>::type type;
-  static type run (const TransformType& x) { return x.affine(); }
+  EIGEN_DEVICE_FUNC static type run (const TransformType& x) { return x.affine(); }
 };
 
 template<typename Scalar, int Dim, int Options>
@@ -212,7 +232,7 @@ struct take_matrix_for_product<Transform<Scalar, Dim, Projective, Options> >
 {
   typedef Transform<Scalar, Dim, Projective, Options> TransformType;
   typedef typename TransformType::MatrixType type;
-  static const type& run (const TransformType& x) { return x.matrix(); }
+  EIGEN_DEVICE_FUNC static const type& run (const TransformType& x) { return x.matrix(); }
 };
 
 template<typename MatrixType,typename Lhs>
@@ -237,16 +257,15 @@ struct homogeneous_left_product_impl<Homogeneous<MatrixType,Vertical>,Lhs>
   typedef typename traits<homogeneous_left_product_impl>::LhsMatrixType LhsMatrixType;
   typedef typename remove_all<LhsMatrixType>::type LhsMatrixTypeCleaned;
   typedef typename remove_all<typename LhsMatrixTypeCleaned::Nested>::type LhsMatrixTypeNested;
-  typedef typename MatrixType::Index Index;
-  homogeneous_left_product_impl(const Lhs& lhs, const MatrixType& rhs)
+  EIGEN_DEVICE_FUNC homogeneous_left_product_impl(const Lhs& lhs, const MatrixType& rhs)
     : m_lhs(take_matrix_for_product<Lhs>::run(lhs)),
       m_rhs(rhs)
   {}
 
-  inline Index rows() const { return m_lhs.rows(); }
-  inline Index cols() const { return m_rhs.cols(); }
+  EIGEN_DEVICE_FUNC inline Index rows() const { return m_lhs.rows(); }
+  EIGEN_DEVICE_FUNC inline Index cols() const { return m_rhs.cols(); }
 
-  template<typename Dest> void evalTo(Dest& dst) const
+  template<typename Dest> EIGEN_DEVICE_FUNC void evalTo(Dest& dst) const
   {
     // FIXME investigate how to allow lazy evaluation of this product when possible
     dst = Block<const LhsMatrixTypeNested,
@@ -277,15 +296,14 @@ struct homogeneous_right_product_impl<Homogeneous<MatrixType,Horizontal>,Rhs>
   : public ReturnByValue<homogeneous_right_product_impl<Homogeneous<MatrixType,Horizontal>,Rhs> >
 {
   typedef typename remove_all<typename Rhs::Nested>::type RhsNested;
-  typedef typename MatrixType::Index Index;
-  homogeneous_right_product_impl(const MatrixType& lhs, const Rhs& rhs)
+  EIGEN_DEVICE_FUNC homogeneous_right_product_impl(const MatrixType& lhs, const Rhs& rhs)
     : m_lhs(lhs), m_rhs(rhs)
   {}
 
-  inline Index rows() const { return m_lhs.rows(); }
-  inline Index cols() const { return m_rhs.cols(); }
+  EIGEN_DEVICE_FUNC inline Index rows() const { return m_lhs.rows(); }
+  EIGEN_DEVICE_FUNC inline Index cols() const { return m_rhs.cols(); }
 
-  template<typename Dest> void evalTo(Dest& dst) const
+  template<typename Dest> EIGEN_DEVICE_FUNC void evalTo(Dest& dst) const
   {
     // FIXME investigate how to allow lazy evaluation of this product when possible
     dst = m_lhs * Block<const RhsNested,
@@ -300,6 +318,178 @@ struct homogeneous_right_product_impl<Homogeneous<MatrixType,Horizontal>,Rhs>
   typename Rhs::Nested m_rhs;
 };
 
+template<typename ArgType,int Direction>
+struct evaluator_traits<Homogeneous<ArgType,Direction> >
+{
+  typedef typename storage_kind_to_evaluator_kind<typename ArgType::StorageKind>::Kind Kind;
+  typedef HomogeneousShape Shape;  
+};
+
+template<> struct AssignmentKind<DenseShape,HomogeneousShape> { typedef Dense2Dense Kind; };
+
+
+template<typename ArgType,int Direction>
+struct unary_evaluator<Homogeneous<ArgType,Direction>, IndexBased>
+  : evaluator<typename Homogeneous<ArgType,Direction>::PlainObject >
+{
+  typedef Homogeneous<ArgType,Direction> XprType;
+  typedef typename XprType::PlainObject PlainObject;
+  typedef evaluator<PlainObject> Base;
+
+  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& op)
+    : Base(), m_temp(op)
+  {
+    ::new (static_cast<Base*>(this)) Base(m_temp);
+  }
+
+protected:
+  PlainObject m_temp;
+};
+
+// dense = homogeneous
+template< typename DstXprType, typename ArgType, typename Scalar>
+struct Assignment<DstXprType, Homogeneous<ArgType,Vertical>, internal::assign_op<Scalar,typename ArgType::Scalar>, Dense2Dense>
+{
+  typedef Homogeneous<ArgType,Vertical> SrcXprType;
+  EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,typename ArgType::Scalar> &)
+  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
+
+    dst.template topRows<ArgType::RowsAtCompileTime>(src.nestedExpression().rows()) = src.nestedExpression();
+    dst.row(dst.rows()-1).setOnes();
+  }
+};
+
+// dense = homogeneous
+template< typename DstXprType, typename ArgType, typename Scalar>
+struct Assignment<DstXprType, Homogeneous<ArgType,Horizontal>, internal::assign_op<Scalar,typename ArgType::Scalar>, Dense2Dense>
+{
+  typedef Homogeneous<ArgType,Horizontal> SrcXprType;
+  EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,typename ArgType::Scalar> &)
+  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
+
+    dst.template leftCols<ArgType::ColsAtCompileTime>(src.nestedExpression().cols()) = src.nestedExpression();
+    dst.col(dst.cols()-1).setOnes();
+  }
+};
+
+template<typename LhsArg, typename Rhs, int ProductTag>
+struct generic_product_impl<Homogeneous<LhsArg,Horizontal>, Rhs, HomogeneousShape, DenseShape, ProductTag>
+{
+  template<typename Dest>
+  EIGEN_DEVICE_FUNC static void evalTo(Dest& dst, const Homogeneous<LhsArg,Horizontal>& lhs, const Rhs& rhs)
+  {
+    homogeneous_right_product_impl<Homogeneous<LhsArg,Horizontal>, Rhs>(lhs.nestedExpression(), rhs).evalTo(dst);
+  }
+};
+
+template<typename Lhs,typename Rhs>
+struct homogeneous_right_product_refactoring_helper
+{
+  enum {
+    Dim  = Lhs::ColsAtCompileTime,
+    Rows = Lhs::RowsAtCompileTime
+  };
+  typedef typename Rhs::template ConstNRowsBlockXpr<Dim>::Type          LinearBlockConst;
+  typedef typename remove_const<LinearBlockConst>::type                 LinearBlock;
+  typedef typename Rhs::ConstRowXpr                                     ConstantColumn;
+  typedef Replicate<const ConstantColumn,Rows,1>                        ConstantBlock;
+  typedef Product<Lhs,LinearBlock,LazyProduct>                          LinearProduct;
+  typedef CwiseBinaryOp<internal::scalar_sum_op<typename Lhs::Scalar,typename Rhs::Scalar>, const LinearProduct, const ConstantBlock> Xpr;
+};
+
+template<typename Lhs, typename Rhs, int ProductTag>
+struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, HomogeneousShape, DenseShape>
+ : public evaluator<typename homogeneous_right_product_refactoring_helper<typename Lhs::NestedExpression,Rhs>::Xpr>
+{
+  typedef Product<Lhs, Rhs, LazyProduct> XprType;
+  typedef homogeneous_right_product_refactoring_helper<typename Lhs::NestedExpression,Rhs> helper;
+  typedef typename helper::ConstantBlock ConstantBlock;
+  typedef typename helper::Xpr RefactoredXpr;
+  typedef evaluator<RefactoredXpr> Base;
+  
+  EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
+    : Base(  xpr.lhs().nestedExpression() .lazyProduct(  xpr.rhs().template topRows<helper::Dim>(xpr.lhs().nestedExpression().cols()) )
+            + ConstantBlock(xpr.rhs().row(xpr.rhs().rows()-1),xpr.lhs().rows(), 1) )
+  {}
+};
+
+template<typename Lhs, typename RhsArg, int ProductTag>
+struct generic_product_impl<Lhs, Homogeneous<RhsArg,Vertical>, DenseShape, HomogeneousShape, ProductTag>
+{
+  template<typename Dest>
+  EIGEN_DEVICE_FUNC static void evalTo(Dest& dst, const Lhs& lhs, const Homogeneous<RhsArg,Vertical>& rhs)
+  {
+    homogeneous_left_product_impl<Homogeneous<RhsArg,Vertical>, Lhs>(lhs, rhs.nestedExpression()).evalTo(dst);
+  }
+};
+
+// TODO: the following specialization is to address a regression from 3.2 to 3.3
+// In the future, this path should be optimized.
+template<typename Lhs, typename RhsArg, int ProductTag>
+struct generic_product_impl<Lhs, Homogeneous<RhsArg,Vertical>, TriangularShape, HomogeneousShape, ProductTag>
+{
+  template<typename Dest>
+  static void evalTo(Dest& dst, const Lhs& lhs, const Homogeneous<RhsArg,Vertical>& rhs)
+  {
+    dst.noalias() = lhs * rhs.eval();
+  }
+};
+
+template<typename Lhs,typename Rhs>
+struct homogeneous_left_product_refactoring_helper
+{
+  enum {
+    Dim = Rhs::RowsAtCompileTime,
+    Cols = Rhs::ColsAtCompileTime
+  };
+  typedef typename Lhs::template ConstNColsBlockXpr<Dim>::Type          LinearBlockConst;
+  typedef typename remove_const<LinearBlockConst>::type                 LinearBlock;
+  typedef typename Lhs::ConstColXpr                                     ConstantColumn;
+  typedef Replicate<const ConstantColumn,1,Cols>                        ConstantBlock;
+  typedef Product<LinearBlock,Rhs,LazyProduct>                          LinearProduct;
+  typedef CwiseBinaryOp<internal::scalar_sum_op<typename Lhs::Scalar,typename Rhs::Scalar>, const LinearProduct, const ConstantBlock> Xpr;
+};
+
+template<typename Lhs, typename Rhs, int ProductTag>
+struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape, HomogeneousShape>
+ : public evaluator<typename homogeneous_left_product_refactoring_helper<Lhs,typename Rhs::NestedExpression>::Xpr>
+{
+  typedef Product<Lhs, Rhs, LazyProduct> XprType;
+  typedef homogeneous_left_product_refactoring_helper<Lhs,typename Rhs::NestedExpression> helper;
+  typedef typename helper::ConstantBlock ConstantBlock;
+  typedef typename helper::Xpr RefactoredXpr;
+  typedef evaluator<RefactoredXpr> Base;
+  
+  EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
+    : Base(   xpr.lhs().template leftCols<helper::Dim>(xpr.rhs().nestedExpression().rows()) .lazyProduct( xpr.rhs().nestedExpression() )
+            + ConstantBlock(xpr.lhs().col(xpr.lhs().cols()-1),1,xpr.rhs().cols()) )
+  {}
+};
+
+template<typename Scalar, int Dim, int Mode,int Options, typename RhsArg, int ProductTag>
+struct generic_product_impl<Transform<Scalar,Dim,Mode,Options>, Homogeneous<RhsArg,Vertical>, DenseShape, HomogeneousShape, ProductTag>
+{
+  typedef Transform<Scalar,Dim,Mode,Options> TransformType;
+  template<typename Dest>
+  EIGEN_DEVICE_FUNC static void evalTo(Dest& dst, const TransformType& lhs, const Homogeneous<RhsArg,Vertical>& rhs)
+  {
+    homogeneous_left_product_impl<Homogeneous<RhsArg,Vertical>, TransformType>(lhs, rhs.nestedExpression()).evalTo(dst);
+  }
+};
+
+template<typename ExpressionType, int Side, bool Transposed>
+struct permutation_matrix_product<ExpressionType, Side, Transposed, HomogeneousShape>
+  : public permutation_matrix_product<ExpressionType, Side, Transposed, DenseShape>
+{};
+
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/Eigen/src/Geometry/Hyperplane.h b/Eigen/src/Geometry/Hyperplane.h
index 00b7c4300..05929b299 100644
--- a/Eigen/src/Geometry/Hyperplane.h
+++ b/Eigen/src/Geometry/Hyperplane.h
@@ -22,8 +22,8 @@ namespace Eigen {
   * A hyperplane is an affine subspace of dimension n-1 in a space of dimension n.
   * For example, a hyperplane in a plane is a line; a hyperplane in 3-space is a plane.
   *
-  * \param _Scalar the scalar type, i.e., the type of the coefficients
-  * \param _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic.
+  * \tparam _Scalar the scalar type, i.e., the type of the coefficients
+  * \tparam _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic.
   *             Notice that the dimension of the hyperplane is _AmbientDim-1.
   *
   * This class represents an hyperplane as the zero set of the implicit equation
@@ -41,7 +41,7 @@ public:
   };
   typedef _Scalar Scalar;
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  typedef DenseIndex Index;
+  typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
   typedef Matrix<Scalar,AmbientDimAtCompileTime,1> VectorType;
   typedef Matrix<Scalar,Index(AmbientDimAtCompileTime)==Dynamic
                         ? Dynamic
@@ -50,21 +50,21 @@ public:
   typedef const Block<const Coefficients,AmbientDimAtCompileTime,1> ConstNormalReturnType;
 
   /** Default constructor without initialization */
-  inline Hyperplane() {}
+  EIGEN_DEVICE_FUNC inline Hyperplane() {}
   
   template<int OtherOptions>
-  Hyperplane(const Hyperplane<Scalar,AmbientDimAtCompileTime,OtherOptions>& other)
+  EIGEN_DEVICE_FUNC Hyperplane(const Hyperplane<Scalar,AmbientDimAtCompileTime,OtherOptions>& other)
    : m_coeffs(other.coeffs())
   {}
 
   /** Constructs a dynamic-size hyperplane with \a _dim the dimension
     * of the ambient space */
-  inline explicit Hyperplane(Index _dim) : m_coeffs(_dim+1) {}
+  EIGEN_DEVICE_FUNC inline explicit Hyperplane(Index _dim) : m_coeffs(_dim+1) {}
 
   /** Construct a plane from its normal \a n and a point \a e onto the plane.
     * \warning the vector normal is assumed to be normalized.
     */
-  inline Hyperplane(const VectorType& n, const VectorType& e)
+  EIGEN_DEVICE_FUNC inline Hyperplane(const VectorType& n, const VectorType& e)
     : m_coeffs(n.size()+1)
   {
     normal() = n;
@@ -75,7 +75,7 @@ public:
     * such that the algebraic equation of the plane is \f$ n \cdot x + d = 0 \f$.
     * \warning the vector normal is assumed to be normalized.
     */
-  inline Hyperplane(const VectorType& n, const Scalar& d)
+  EIGEN_DEVICE_FUNC inline Hyperplane(const VectorType& n, const Scalar& d)
     : m_coeffs(n.size()+1)
   {
     normal() = n;
@@ -85,7 +85,7 @@ public:
   /** Constructs a hyperplane passing through the two points. If the dimension of the ambient space
     * is greater than 2, then there isn't uniqueness, so an arbitrary choice is made.
     */
-  static inline Hyperplane Through(const VectorType& p0, const VectorType& p1)
+  EIGEN_DEVICE_FUNC static inline Hyperplane Through(const VectorType& p0, const VectorType& p1)
   {
     Hyperplane result(p0.size());
     result.normal() = (p1 - p0).unitOrthogonal();
@@ -96,7 +96,7 @@ public:
   /** Constructs a hyperplane passing through the three points. The dimension of the ambient space
     * is required to be exactly 3.
     */
-  static inline Hyperplane Through(const VectorType& p0, const VectorType& p1, const VectorType& p2)
+  EIGEN_DEVICE_FUNC static inline Hyperplane Through(const VectorType& p0, const VectorType& p1, const VectorType& p2)
   {
     EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(VectorType, 3)
     Hyperplane result(p0.size());
@@ -120,19 +120,19 @@ public:
     * so an arbitrary choice is made.
     */
   // FIXME to be consitent with the rest this could be implemented as a static Through function ??
-  explicit Hyperplane(const ParametrizedLine<Scalar, AmbientDimAtCompileTime>& parametrized)
+  EIGEN_DEVICE_FUNC explicit Hyperplane(const ParametrizedLine<Scalar, AmbientDimAtCompileTime>& parametrized)
   {
     normal() = parametrized.direction().unitOrthogonal();
     offset() = -parametrized.origin().dot(normal());
   }
 
-  ~Hyperplane() {}
+  EIGEN_DEVICE_FUNC ~Hyperplane() {}
 
   /** \returns the dimension in which the plane holds */
-  inline Index dim() const { return AmbientDimAtCompileTime==Dynamic ? m_coeffs.size()-1 : Index(AmbientDimAtCompileTime); }
+  EIGEN_DEVICE_FUNC inline Index dim() const { return AmbientDimAtCompileTime==Dynamic ? m_coeffs.size()-1 : Index(AmbientDimAtCompileTime); }
 
   /** normalizes \c *this */
-  void normalize(void)
+  EIGEN_DEVICE_FUNC void normalize(void)
   {
     m_coeffs /= normal().norm();
   }
@@ -140,45 +140,45 @@ public:
   /** \returns the signed distance between the plane \c *this and a point \a p.
     * \sa absDistance()
     */
-  inline Scalar signedDistance(const VectorType& p) const { return normal().dot(p) + offset(); }
+  EIGEN_DEVICE_FUNC inline Scalar signedDistance(const VectorType& p) const { return normal().dot(p) + offset(); }
 
   /** \returns the absolute distance between the plane \c *this and a point \a p.
     * \sa signedDistance()
     */
-  inline Scalar absDistance(const VectorType& p) const { using std::abs; return abs(signedDistance(p)); }
+  EIGEN_DEVICE_FUNC inline Scalar absDistance(const VectorType& p) const { return numext::abs(signedDistance(p)); }
 
   /** \returns the projection of a point \a p onto the plane \c *this.
     */
-  inline VectorType projection(const VectorType& p) const { return p - signedDistance(p) * normal(); }
+  EIGEN_DEVICE_FUNC inline VectorType projection(const VectorType& p) const { return p - signedDistance(p) * normal(); }
 
   /** \returns a constant reference to the unit normal vector of the plane, which corresponds
     * to the linear part of the implicit equation.
     */
-  inline ConstNormalReturnType normal() const { return ConstNormalReturnType(m_coeffs,0,0,dim(),1); }
+  EIGEN_DEVICE_FUNC inline ConstNormalReturnType normal() const { return ConstNormalReturnType(m_coeffs,0,0,dim(),1); }
 
   /** \returns a non-constant reference to the unit normal vector of the plane, which corresponds
     * to the linear part of the implicit equation.
     */
-  inline NormalReturnType normal() { return NormalReturnType(m_coeffs,0,0,dim(),1); }
+  EIGEN_DEVICE_FUNC inline NormalReturnType normal() { return NormalReturnType(m_coeffs,0,0,dim(),1); }
 
   /** \returns the distance to the origin, which is also the "constant term" of the implicit equation
     * \warning the vector normal is assumed to be normalized.
     */
-  inline const Scalar& offset() const { return m_coeffs.coeff(dim()); }
+  EIGEN_DEVICE_FUNC inline const Scalar& offset() const { return m_coeffs.coeff(dim()); }
 
   /** \returns a non-constant reference to the distance to the origin, which is also the constant part
     * of the implicit equation */
-  inline Scalar& offset() { return m_coeffs(dim()); }
+  EIGEN_DEVICE_FUNC inline Scalar& offset() { return m_coeffs(dim()); }
 
   /** \returns a constant reference to the coefficients c_i of the plane equation:
     * \f$ c_0*x_0 + ... + c_{d-1}*x_{d-1} + c_d = 0 \f$
     */
-  inline const Coefficients& coeffs() const { return m_coeffs; }
+  EIGEN_DEVICE_FUNC inline const Coefficients& coeffs() const { return m_coeffs; }
 
   /** \returns a non-constant reference to the coefficients c_i of the plane equation:
     * \f$ c_0*x_0 + ... + c_{d-1}*x_{d-1} + c_d = 0 \f$
     */
-  inline Coefficients& coeffs() { return m_coeffs; }
+  EIGEN_DEVICE_FUNC inline Coefficients& coeffs() { return m_coeffs; }
 
   /** \returns the intersection of *this with \a other.
     *
@@ -186,16 +186,15 @@ public:
     *
     * \note If \a other is approximately parallel to *this, this method will return any point on *this.
     */
-  VectorType intersection(const Hyperplane& other) const
+  EIGEN_DEVICE_FUNC VectorType intersection(const Hyperplane& other) const
   {
-    using std::abs;
     EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(VectorType, 2)
     Scalar det = coeffs().coeff(0) * other.coeffs().coeff(1) - coeffs().coeff(1) * other.coeffs().coeff(0);
     // since the line equations ax+by=c are normalized with a^2+b^2=1, the following tests
     // whether the two lines are approximately parallel.
     if(internal::isMuchSmallerThan(det, Scalar(1)))
     {   // special case where the two lines are approximately parallel. Pick any point on the first line.
-        if(abs(coeffs().coeff(1))>abs(coeffs().coeff(0)))
+        if(numext::abs(coeffs().coeff(1))>numext::abs(coeffs().coeff(0)))
             return VectorType(coeffs().coeff(1), -coeffs().coeff(2)/coeffs().coeff(1)-coeffs().coeff(0));
         else
             return VectorType(-coeffs().coeff(2)/coeffs().coeff(0)-coeffs().coeff(1), coeffs().coeff(0));
@@ -215,10 +214,13 @@ public:
     *               or a more generic #Affine transformation. The default is #Affine.
     */
   template<typename XprType>
-  inline Hyperplane& transform(const MatrixBase<XprType>& mat, TransformTraits traits = Affine)
+  EIGEN_DEVICE_FUNC inline Hyperplane& transform(const MatrixBase<XprType>& mat, TransformTraits traits = Affine)
   {
     if (traits==Affine)
+    {
       normal() = mat.inverse().transpose() * normal();
+      m_coeffs /= normal().norm();
+    }
     else if (traits==Isometry)
       normal() = mat * normal();
     else
@@ -236,7 +238,7 @@ public:
     *               Other kind of transformations are not supported.
     */
   template<int TrOptions>
-  inline Hyperplane& transform(const Transform<Scalar,AmbientDimAtCompileTime,Affine,TrOptions>& t,
+  EIGEN_DEVICE_FUNC inline Hyperplane& transform(const Transform<Scalar,AmbientDimAtCompileTime,Affine,TrOptions>& t,
                                 TransformTraits traits = Affine)
   {
     transform(t.linear(), traits);
@@ -250,7 +252,7 @@ public:
     * then this function smartly returns a const reference to \c *this.
     */
   template<typename NewScalarType>
-  inline typename internal::cast_return_type<Hyperplane,
+  EIGEN_DEVICE_FUNC inline typename internal::cast_return_type<Hyperplane,
            Hyperplane<NewScalarType,AmbientDimAtCompileTime,Options> >::type cast() const
   {
     return typename internal::cast_return_type<Hyperplane,
@@ -259,7 +261,7 @@ public:
 
   /** Copy constructor with scalar type conversion */
   template<typename OtherScalarType,int OtherOptions>
-  inline explicit Hyperplane(const Hyperplane<OtherScalarType,AmbientDimAtCompileTime,OtherOptions>& other)
+  EIGEN_DEVICE_FUNC inline explicit Hyperplane(const Hyperplane<OtherScalarType,AmbientDimAtCompileTime,OtherOptions>& other)
   { m_coeffs = other.coeffs().template cast<Scalar>(); }
 
   /** \returns \c true if \c *this is approximately equal to \a other, within the precision
@@ -267,7 +269,7 @@ public:
     *
     * \sa MatrixBase::isApprox() */
   template<int OtherOptions>
-  bool isApprox(const Hyperplane<Scalar,AmbientDimAtCompileTime,OtherOptions>& other, const typename NumTraits<Scalar>::Real& prec = NumTraits<Scalar>::dummy_precision()) const
+  EIGEN_DEVICE_FUNC bool isApprox(const Hyperplane<Scalar,AmbientDimAtCompileTime,OtherOptions>& other, const typename NumTraits<Scalar>::Real& prec = NumTraits<Scalar>::dummy_precision()) const
   { return m_coeffs.isApprox(other.m_coeffs, prec); }
 
 protected:
diff --git a/Eigen/src/Geometry/OrthoMethods.h b/Eigen/src/Geometry/OrthoMethods.h
index 556bc8160..a035e6310 100644
--- a/Eigen/src/Geometry/OrthoMethods.h
+++ b/Eigen/src/Geometry/OrthoMethods.h
@@ -13,16 +13,24 @@
 
 namespace Eigen { 
 
-/** \geometry_module
+/** \geometry_module \ingroup Geometry_Module
   *
   * \returns the cross product of \c *this and \a other
   *
   * Here is a very good explanation of cross-product: http://xkcd.com/199/
+  * 
+  * With complex numbers, the cross product is implemented as
+  * \f$ (\mathbf{a}+i\mathbf{b}) \times (\mathbf{c}+i\mathbf{d}) = (\mathbf{a} \times \mathbf{c} - \mathbf{b} \times \mathbf{d}) - i(\mathbf{a} \times \mathbf{d} - \mathbf{b} \times \mathbf{c})\f$
+  * 
   * \sa MatrixBase::cross3()
   */
 template<typename Derived>
 template<typename OtherDerived>
-inline typename MatrixBase<Derived>::template cross_product_return_type<OtherDerived>::type
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::template cross_product_return_type<OtherDerived>::type
+#else
+inline typename MatrixBase<Derived>::PlainObject
+#endif
 MatrixBase<Derived>::cross(const MatrixBase<OtherDerived>& other) const
 {
   EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Derived,3)
@@ -30,8 +38,8 @@ MatrixBase<Derived>::cross(const MatrixBase<OtherDerived>& other) const
 
   // Note that there is no need for an expression here since the compiler
   // optimize such a small temporary very well (even within a complex expression)
-  typename internal::nested<Derived,2>::type lhs(derived());
-  typename internal::nested<OtherDerived,2>::type rhs(other.derived());
+  typename internal::nested_eval<Derived,2>::type lhs(derived());
+  typename internal::nested_eval<OtherDerived,2>::type rhs(other.derived());
   return typename cross_product_return_type<OtherDerived>::type(
     numext::conj(lhs.coeff(1) * rhs.coeff(2) - lhs.coeff(2) * rhs.coeff(1)),
     numext::conj(lhs.coeff(2) * rhs.coeff(0) - lhs.coeff(0) * rhs.coeff(2)),
@@ -45,7 +53,7 @@ template< int Arch,typename VectorLhs,typename VectorRhs,
           typename Scalar = typename VectorLhs::Scalar,
           bool Vectorizable = bool((VectorLhs::Flags&VectorRhs::Flags)&PacketAccessBit)>
 struct cross3_impl {
-  static inline typename internal::plain_matrix_type<VectorLhs>::type
+  EIGEN_DEVICE_FUNC static inline typename internal::plain_matrix_type<VectorLhs>::type
   run(const VectorLhs& lhs, const VectorRhs& rhs)
   {
     return typename internal::plain_matrix_type<VectorLhs>::type(
@@ -59,7 +67,7 @@ struct cross3_impl {
 
 }
 
-/** \geometry_module
+/** \geometry_module \ingroup Geometry_Module
   *
   * \returns the cross product of \c *this and \a other using only the x, y, and z coefficients
   *
@@ -70,14 +78,14 @@ struct cross3_impl {
   */
 template<typename Derived>
 template<typename OtherDerived>
-inline typename MatrixBase<Derived>::PlainObject
+EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::PlainObject
 MatrixBase<Derived>::cross3(const MatrixBase<OtherDerived>& other) const
 {
   EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Derived,4)
   EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,4)
 
-  typedef typename internal::nested<Derived,2>::type DerivedNested;
-  typedef typename internal::nested<OtherDerived,2>::type OtherDerivedNested;
+  typedef typename internal::nested_eval<Derived,2>::type DerivedNested;
+  typedef typename internal::nested_eval<OtherDerived,2>::type OtherDerivedNested;
   DerivedNested lhs(derived());
   OtherDerivedNested rhs(other.derived());
 
@@ -86,38 +94,42 @@ MatrixBase<Derived>::cross3(const MatrixBase<OtherDerived>& other) const
                         typename internal::remove_all<OtherDerivedNested>::type>::run(lhs,rhs);
 }
 
-/** \returns a matrix expression of the cross product of each column or row
+/** \geometry_module \ingroup Geometry_Module
+  *
+  * \returns a matrix expression of the cross product of each column or row
   * of the referenced expression with the \a other vector.
   *
   * The referenced matrix must have one dimension equal to 3.
   * The result matrix has the same dimensions than the referenced one.
   *
-  * \geometry_module
-  *
   * \sa MatrixBase::cross() */
 template<typename ExpressionType, int Direction>
 template<typename OtherDerived>
+EIGEN_DEVICE_FUNC 
 const typename VectorwiseOp<ExpressionType,Direction>::CrossReturnType
 VectorwiseOp<ExpressionType,Direction>::cross(const MatrixBase<OtherDerived>& other) const
 {
   EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,3)
   EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename OtherDerived::Scalar>::value),
     YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+  
+  typename internal::nested_eval<ExpressionType,2>::type mat(_expression());
+  typename internal::nested_eval<OtherDerived,2>::type vec(other.derived());
 
   CrossReturnType res(_expression().rows(),_expression().cols());
   if(Direction==Vertical)
   {
     eigen_assert(CrossReturnType::RowsAtCompileTime==3 && "the matrix must have exactly 3 rows");
-    res.row(0) = (_expression().row(1) * other.coeff(2) - _expression().row(2) * other.coeff(1)).conjugate();
-    res.row(1) = (_expression().row(2) * other.coeff(0) - _expression().row(0) * other.coeff(2)).conjugate();
-    res.row(2) = (_expression().row(0) * other.coeff(1) - _expression().row(1) * other.coeff(0)).conjugate();
+    res.row(0) = (mat.row(1) * vec.coeff(2) - mat.row(2) * vec.coeff(1)).conjugate();
+    res.row(1) = (mat.row(2) * vec.coeff(0) - mat.row(0) * vec.coeff(2)).conjugate();
+    res.row(2) = (mat.row(0) * vec.coeff(1) - mat.row(1) * vec.coeff(0)).conjugate();
   }
   else
   {
     eigen_assert(CrossReturnType::ColsAtCompileTime==3 && "the matrix must have exactly 3 columns");
-    res.col(0) = (_expression().col(1) * other.coeff(2) - _expression().col(2) * other.coeff(1)).conjugate();
-    res.col(1) = (_expression().col(2) * other.coeff(0) - _expression().col(0) * other.coeff(2)).conjugate();
-    res.col(2) = (_expression().col(0) * other.coeff(1) - _expression().col(1) * other.coeff(0)).conjugate();
+    res.col(0) = (mat.col(1) * vec.coeff(2) - mat.col(2) * vec.coeff(1)).conjugate();
+    res.col(1) = (mat.col(2) * vec.coeff(0) - mat.col(0) * vec.coeff(2)).conjugate();
+    res.col(2) = (mat.col(0) * vec.coeff(1) - mat.col(1) * vec.coeff(0)).conjugate();
   }
   return res;
 }
@@ -130,8 +142,8 @@ struct unitOrthogonal_selector
   typedef typename plain_matrix_type<Derived>::type VectorType;
   typedef typename traits<Derived>::Scalar Scalar;
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  typedef typename Derived::Index Index;
   typedef Matrix<Scalar,2,1> Vector2;
+  EIGEN_DEVICE_FUNC
   static inline VectorType run(const Derived& src)
   {
     VectorType perp = VectorType::Zero(src.size());
@@ -154,6 +166,7 @@ struct unitOrthogonal_selector<Derived,3>
   typedef typename plain_matrix_type<Derived>::type VectorType;
   typedef typename traits<Derived>::Scalar Scalar;
   typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
   static inline VectorType run(const Derived& src)
   {
     VectorType perp;
@@ -192,13 +205,16 @@ template<typename Derived>
 struct unitOrthogonal_selector<Derived,2>
 {
   typedef typename plain_matrix_type<Derived>::type VectorType;
+  EIGEN_DEVICE_FUNC
   static inline VectorType run(const Derived& src)
   { return VectorType(-numext::conj(src.y()), numext::conj(src.x())).normalized(); }
 };
 
 } // end namespace internal
 
-/** \returns a unit vector which is orthogonal to \c *this
+/** \geometry_module \ingroup Geometry_Module
+  *
+  * \returns a unit vector which is orthogonal to \c *this
   *
   * The size of \c *this must be at least 2. If the size is exactly 2,
   * then the returned vector is a counter clock wise rotation of \c *this, i.e., (-y,x).normalized().
@@ -206,7 +222,7 @@ struct unitOrthogonal_selector<Derived,2>
   * \sa cross()
   */
 template<typename Derived>
-typename MatrixBase<Derived>::PlainObject
+EIGEN_DEVICE_FUNC typename MatrixBase<Derived>::PlainObject
 MatrixBase<Derived>::unitOrthogonal() const
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
diff --git a/Eigen/src/Geometry/ParametrizedLine.h b/Eigen/src/Geometry/ParametrizedLine.h
index 77fa228e6..1e985d8cd 100644
--- a/Eigen/src/Geometry/ParametrizedLine.h
+++ b/Eigen/src/Geometry/ParametrizedLine.h
@@ -23,8 +23,8 @@ namespace Eigen {
   * direction vector \f$ \mathbf{d} \f$ such that the line corresponds to
   * the set \f$ l(t) = \mathbf{o} + t \mathbf{d} \f$, \f$ t \in \mathbf{R} \f$.
   *
-  * \param _Scalar the scalar type, i.e., the type of the coefficients
-  * \param _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic.
+  * \tparam _Scalar the scalar type, i.e., the type of the coefficients
+  * \tparam _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic.
   */
 template <typename _Scalar, int _AmbientDim, int _Options>
 class ParametrizedLine
@@ -37,49 +37,49 @@ public:
   };
   typedef _Scalar Scalar;
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  typedef DenseIndex Index;
+  typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
   typedef Matrix<Scalar,AmbientDimAtCompileTime,1,Options> VectorType;
 
   /** Default constructor without initialization */
-  inline ParametrizedLine() {}
+  EIGEN_DEVICE_FUNC inline ParametrizedLine() {}
   
   template<int OtherOptions>
-  ParametrizedLine(const ParametrizedLine<Scalar,AmbientDimAtCompileTime,OtherOptions>& other)
+  EIGEN_DEVICE_FUNC ParametrizedLine(const ParametrizedLine<Scalar,AmbientDimAtCompileTime,OtherOptions>& other)
    : m_origin(other.origin()), m_direction(other.direction())
   {}
 
   /** Constructs a dynamic-size line with \a _dim the dimension
     * of the ambient space */
-  inline explicit ParametrizedLine(Index _dim) : m_origin(_dim), m_direction(_dim) {}
+  EIGEN_DEVICE_FUNC inline explicit ParametrizedLine(Index _dim) : m_origin(_dim), m_direction(_dim) {}
 
   /** Initializes a parametrized line of direction \a direction and origin \a origin.
     * \warning the vector direction is assumed to be normalized.
     */
-  ParametrizedLine(const VectorType& origin, const VectorType& direction)
+  EIGEN_DEVICE_FUNC ParametrizedLine(const VectorType& origin, const VectorType& direction)
     : m_origin(origin), m_direction(direction) {}
 
   template <int OtherOptions>
-  explicit ParametrizedLine(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane);
+  EIGEN_DEVICE_FUNC explicit ParametrizedLine(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane);
 
   /** Constructs a parametrized line going from \a p0 to \a p1. */
-  static inline ParametrizedLine Through(const VectorType& p0, const VectorType& p1)
+  EIGEN_DEVICE_FUNC static inline ParametrizedLine Through(const VectorType& p0, const VectorType& p1)
   { return ParametrizedLine(p0, (p1-p0).normalized()); }
 
-  ~ParametrizedLine() {}
+  EIGEN_DEVICE_FUNC ~ParametrizedLine() {}
 
   /** \returns the dimension in which the line holds */
-  inline Index dim() const { return m_direction.size(); }
+  EIGEN_DEVICE_FUNC inline Index dim() const { return m_direction.size(); }
 
-  const VectorType& origin() const { return m_origin; }
-  VectorType& origin() { return m_origin; }
+  EIGEN_DEVICE_FUNC const VectorType& origin() const { return m_origin; }
+  EIGEN_DEVICE_FUNC VectorType& origin() { return m_origin; }
 
-  const VectorType& direction() const { return m_direction; }
-  VectorType& direction() { return m_direction; }
+  EIGEN_DEVICE_FUNC const VectorType& direction() const { return m_direction; }
+  EIGEN_DEVICE_FUNC VectorType& direction() { return m_direction; }
 
   /** \returns the squared distance of a point \a p to its projection onto the line \c *this.
     * \sa distance()
     */
-  RealScalar squaredDistance(const VectorType& p) const
+  EIGEN_DEVICE_FUNC RealScalar squaredDistance(const VectorType& p) const
   {
     VectorType diff = p - origin();
     return (diff - direction().dot(diff) * direction()).squaredNorm();
@@ -87,22 +87,22 @@ public:
   /** \returns the distance of a point \a p to its projection onto the line \c *this.
     * \sa squaredDistance()
     */
-  RealScalar distance(const VectorType& p) const { using std::sqrt; return sqrt(squaredDistance(p)); }
+  EIGEN_DEVICE_FUNC RealScalar distance(const VectorType& p) const { EIGEN_USING_STD_MATH(sqrt) return sqrt(squaredDistance(p)); }
 
   /** \returns the projection of a point \a p onto the line \c *this. */
-  VectorType projection(const VectorType& p) const
+  EIGEN_DEVICE_FUNC VectorType projection(const VectorType& p) const
   { return origin() + direction().dot(p-origin()) * direction(); }
 
-  VectorType pointAt(const Scalar& t) const;
+  EIGEN_DEVICE_FUNC VectorType pointAt(const Scalar& t) const;
   
   template <int OtherOptions>
-  Scalar intersectionParameter(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane) const;
+  EIGEN_DEVICE_FUNC Scalar intersectionParameter(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane) const;
  
   template <int OtherOptions>
-  Scalar intersection(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane) const;
+  EIGEN_DEVICE_FUNC Scalar intersection(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane) const;
   
   template <int OtherOptions>
-  VectorType intersectionPoint(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane) const;
+  EIGEN_DEVICE_FUNC VectorType intersectionPoint(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane) const;
 
   /** \returns \c *this with scalar type casted to \a NewScalarType
     *
@@ -110,7 +110,7 @@ public:
     * then this function smartly returns a const reference to \c *this.
     */
   template<typename NewScalarType>
-  inline typename internal::cast_return_type<ParametrizedLine,
+  EIGEN_DEVICE_FUNC inline typename internal::cast_return_type<ParametrizedLine,
            ParametrizedLine<NewScalarType,AmbientDimAtCompileTime,Options> >::type cast() const
   {
     return typename internal::cast_return_type<ParametrizedLine,
@@ -119,7 +119,7 @@ public:
 
   /** Copy constructor with scalar type conversion */
   template<typename OtherScalarType,int OtherOptions>
-  inline explicit ParametrizedLine(const ParametrizedLine<OtherScalarType,AmbientDimAtCompileTime,OtherOptions>& other)
+  EIGEN_DEVICE_FUNC inline explicit ParametrizedLine(const ParametrizedLine<OtherScalarType,AmbientDimAtCompileTime,OtherOptions>& other)
   {
     m_origin = other.origin().template cast<Scalar>();
     m_direction = other.direction().template cast<Scalar>();
@@ -129,7 +129,7 @@ public:
     * determined by \a prec.
     *
     * \sa MatrixBase::isApprox() */
-  bool isApprox(const ParametrizedLine& other, typename NumTraits<Scalar>::Real prec = NumTraits<Scalar>::dummy_precision()) const
+  EIGEN_DEVICE_FUNC bool isApprox(const ParametrizedLine& other, const typename NumTraits<Scalar>::Real& prec = NumTraits<Scalar>::dummy_precision()) const
   { return m_origin.isApprox(other.m_origin, prec) && m_direction.isApprox(other.m_direction, prec); }
 
 protected:
@@ -143,7 +143,7 @@ protected:
   */
 template <typename _Scalar, int _AmbientDim, int _Options>
 template <int OtherOptions>
-inline ParametrizedLine<_Scalar, _AmbientDim,_Options>::ParametrizedLine(const Hyperplane<_Scalar, _AmbientDim,OtherOptions>& hyperplane)
+EIGEN_DEVICE_FUNC inline ParametrizedLine<_Scalar, _AmbientDim,_Options>::ParametrizedLine(const Hyperplane<_Scalar, _AmbientDim,OtherOptions>& hyperplane)
 {
   EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(VectorType, 2)
   direction() = hyperplane.normal().unitOrthogonal();
@@ -153,7 +153,7 @@ inline ParametrizedLine<_Scalar, _AmbientDim,_Options>::ParametrizedLine(const H
 /** \returns the point at \a t along this line
   */
 template <typename _Scalar, int _AmbientDim, int _Options>
-inline typename ParametrizedLine<_Scalar, _AmbientDim,_Options>::VectorType
+EIGEN_DEVICE_FUNC inline typename ParametrizedLine<_Scalar, _AmbientDim,_Options>::VectorType
 ParametrizedLine<_Scalar, _AmbientDim,_Options>::pointAt(const _Scalar& t) const
 {
   return origin() + (direction()*t); 
@@ -163,7 +163,7 @@ ParametrizedLine<_Scalar, _AmbientDim,_Options>::pointAt(const _Scalar& t) const
   */
 template <typename _Scalar, int _AmbientDim, int _Options>
 template <int OtherOptions>
-inline _Scalar ParametrizedLine<_Scalar, _AmbientDim,_Options>::intersectionParameter(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane) const
+EIGEN_DEVICE_FUNC inline _Scalar ParametrizedLine<_Scalar, _AmbientDim,_Options>::intersectionParameter(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane) const
 {
   return -(hyperplane.offset()+hyperplane.normal().dot(origin()))
           / hyperplane.normal().dot(direction());
@@ -175,7 +175,7 @@ inline _Scalar ParametrizedLine<_Scalar, _AmbientDim,_Options>::intersectionPara
   */
 template <typename _Scalar, int _AmbientDim, int _Options>
 template <int OtherOptions>
-inline _Scalar ParametrizedLine<_Scalar, _AmbientDim,_Options>::intersection(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane) const
+EIGEN_DEVICE_FUNC inline _Scalar ParametrizedLine<_Scalar, _AmbientDim,_Options>::intersection(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane) const
 {
   return intersectionParameter(hyperplane);
 }
@@ -184,7 +184,7 @@ inline _Scalar ParametrizedLine<_Scalar, _AmbientDim,_Options>::intersection(con
   */
 template <typename _Scalar, int _AmbientDim, int _Options>
 template <int OtherOptions>
-inline typename ParametrizedLine<_Scalar, _AmbientDim,_Options>::VectorType
+EIGEN_DEVICE_FUNC inline typename ParametrizedLine<_Scalar, _AmbientDim,_Options>::VectorType
 ParametrizedLine<_Scalar, _AmbientDim,_Options>::intersectionPoint(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane) const
 {
   return pointAt(intersectionParameter(hyperplane));
diff --git a/Eigen/src/Geometry/Quaternion.h b/Eigen/src/Geometry/Quaternion.h
index 93056f60d..f6ef1bcf6 100644
--- a/Eigen/src/Geometry/Quaternion.h
+++ b/Eigen/src/Geometry/Quaternion.h
@@ -34,8 +34,9 @@ struct quaternionbase_assign_impl;
 template<class Derived>
 class QuaternionBase : public RotationBase<Derived, 3>
 {
+ public:
   typedef RotationBase<Derived, 3> Base;
-public:
+
   using Base::operator*;
   using Base::derived;
 
@@ -57,37 +58,37 @@ public:
 
 
   /** \returns the \c x coefficient */
-  inline Scalar x() const { return this->derived().coeffs().coeff(0); }
+  EIGEN_DEVICE_FUNC inline Scalar x() const { return this->derived().coeffs().coeff(0); }
   /** \returns the \c y coefficient */
-  inline Scalar y() const { return this->derived().coeffs().coeff(1); }
+  EIGEN_DEVICE_FUNC inline Scalar y() const { return this->derived().coeffs().coeff(1); }
   /** \returns the \c z coefficient */
-  inline Scalar z() const { return this->derived().coeffs().coeff(2); }
+  EIGEN_DEVICE_FUNC inline Scalar z() const { return this->derived().coeffs().coeff(2); }
   /** \returns the \c w coefficient */
-  inline Scalar w() const { return this->derived().coeffs().coeff(3); }
+  EIGEN_DEVICE_FUNC inline Scalar w() const { return this->derived().coeffs().coeff(3); }
 
   /** \returns a reference to the \c x coefficient */
-  inline Scalar& x() { return this->derived().coeffs().coeffRef(0); }
+  EIGEN_DEVICE_FUNC inline Scalar& x() { return this->derived().coeffs().coeffRef(0); }
   /** \returns a reference to the \c y coefficient */
-  inline Scalar& y() { return this->derived().coeffs().coeffRef(1); }
+  EIGEN_DEVICE_FUNC inline Scalar& y() { return this->derived().coeffs().coeffRef(1); }
   /** \returns a reference to the \c z coefficient */
-  inline Scalar& z() { return this->derived().coeffs().coeffRef(2); }
+  EIGEN_DEVICE_FUNC inline Scalar& z() { return this->derived().coeffs().coeffRef(2); }
   /** \returns a reference to the \c w coefficient */
-  inline Scalar& w() { return this->derived().coeffs().coeffRef(3); }
+  EIGEN_DEVICE_FUNC inline Scalar& w() { return this->derived().coeffs().coeffRef(3); }
 
   /** \returns a read-only vector expression of the imaginary part (x,y,z) */
-  inline const VectorBlock<const Coefficients,3> vec() const { return coeffs().template head<3>(); }
+  EIGEN_DEVICE_FUNC inline const VectorBlock<const Coefficients,3> vec() const { return coeffs().template head<3>(); }
 
   /** \returns a vector expression of the imaginary part (x,y,z) */
-  inline VectorBlock<Coefficients,3> vec() { return coeffs().template head<3>(); }
+  EIGEN_DEVICE_FUNC inline VectorBlock<Coefficients,3> vec() { return coeffs().template head<3>(); }
 
   /** \returns a read-only vector expression of the coefficients (x,y,z,w) */
-  inline const typename internal::traits<Derived>::Coefficients& coeffs() const { return derived().coeffs(); }
+  EIGEN_DEVICE_FUNC inline const typename internal::traits<Derived>::Coefficients& coeffs() const { return derived().coeffs(); }
 
   /** \returns a vector expression of the coefficients (x,y,z,w) */
-  inline typename internal::traits<Derived>::Coefficients& coeffs() { return derived().coeffs(); }
+  EIGEN_DEVICE_FUNC inline typename internal::traits<Derived>::Coefficients& coeffs() { return derived().coeffs(); }
 
-  EIGEN_STRONG_INLINE QuaternionBase<Derived>& operator=(const QuaternionBase<Derived>& other);
-  template<class OtherDerived> EIGEN_STRONG_INLINE Derived& operator=(const QuaternionBase<OtherDerived>& other);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE QuaternionBase<Derived>& operator=(const QuaternionBase<Derived>& other);
+  template<class OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const QuaternionBase<OtherDerived>& other);
 
 // disabled this copy operator as it is giving very strange compilation errors when compiling
 // test_stdvector with GCC 4.4.2. This looks like a GCC bug though, so feel free to re-enable it if it's
@@ -96,72 +97,72 @@ public:
 //  Derived& operator=(const QuaternionBase& other)
 //  { return operator=<Derived>(other); }
 
-  Derived& operator=(const AngleAxisType& aa);
-  template<class OtherDerived> Derived& operator=(const MatrixBase<OtherDerived>& m);
+  EIGEN_DEVICE_FUNC Derived& operator=(const AngleAxisType& aa);
+  template<class OtherDerived> EIGEN_DEVICE_FUNC Derived& operator=(const MatrixBase<OtherDerived>& m);
 
   /** \returns a quaternion representing an identity rotation
     * \sa MatrixBase::Identity()
     */
-  static inline Quaternion<Scalar> Identity() { return Quaternion<Scalar>(1, 0, 0, 0); }
+  EIGEN_DEVICE_FUNC static inline Quaternion<Scalar> Identity() { return Quaternion<Scalar>(Scalar(1), Scalar(0), Scalar(0), Scalar(0)); }
 
   /** \sa QuaternionBase::Identity(), MatrixBase::setIdentity()
     */
-  inline QuaternionBase& setIdentity() { coeffs() << 0, 0, 0, 1; return *this; }
+  EIGEN_DEVICE_FUNC inline QuaternionBase& setIdentity() { coeffs() << Scalar(0), Scalar(0), Scalar(0), Scalar(1); return *this; }
 
   /** \returns the squared norm of the quaternion's coefficients
     * \sa QuaternionBase::norm(), MatrixBase::squaredNorm()
     */
-  inline Scalar squaredNorm() const { return coeffs().squaredNorm(); }
+  EIGEN_DEVICE_FUNC inline Scalar squaredNorm() const { return coeffs().squaredNorm(); }
 
   /** \returns the norm of the quaternion's coefficients
     * \sa QuaternionBase::squaredNorm(), MatrixBase::norm()
     */
-  inline Scalar norm() const { return coeffs().norm(); }
+  EIGEN_DEVICE_FUNC inline Scalar norm() const { return coeffs().norm(); }
 
   /** Normalizes the quaternion \c *this
     * \sa normalized(), MatrixBase::normalize() */
-  inline void normalize() { coeffs().normalize(); }
+  EIGEN_DEVICE_FUNC inline void normalize() { coeffs().normalize(); }
   /** \returns a normalized copy of \c *this
     * \sa normalize(), MatrixBase::normalized() */
-  inline Quaternion<Scalar> normalized() const { return Quaternion<Scalar>(coeffs().normalized()); }
+  EIGEN_DEVICE_FUNC inline Quaternion<Scalar> normalized() const { return Quaternion<Scalar>(coeffs().normalized()); }
 
     /** \returns the dot product of \c *this and \a other
     * Geometrically speaking, the dot product of two unit quaternions
     * corresponds to the cosine of half the angle between the two rotations.
     * \sa angularDistance()
     */
-  template<class OtherDerived> inline Scalar dot(const QuaternionBase<OtherDerived>& other) const { return coeffs().dot(other.coeffs()); }
+  template<class OtherDerived> EIGEN_DEVICE_FUNC inline Scalar dot(const QuaternionBase<OtherDerived>& other) const { return coeffs().dot(other.coeffs()); }
 
-  template<class OtherDerived> Scalar angularDistance(const QuaternionBase<OtherDerived>& other) const;
+  template<class OtherDerived> EIGEN_DEVICE_FUNC Scalar angularDistance(const QuaternionBase<OtherDerived>& other) const;
 
   /** \returns an equivalent 3x3 rotation matrix */
-  Matrix3 toRotationMatrix() const;
+  EIGEN_DEVICE_FUNC Matrix3 toRotationMatrix() const;
 
   /** \returns the quaternion which transform \a a into \a b through a rotation */
   template<typename Derived1, typename Derived2>
-  Derived& setFromTwoVectors(const MatrixBase<Derived1>& a, const MatrixBase<Derived2>& b);
+  EIGEN_DEVICE_FUNC Derived& setFromTwoVectors(const MatrixBase<Derived1>& a, const MatrixBase<Derived2>& b);
 
-  template<class OtherDerived> EIGEN_STRONG_INLINE Quaternion<Scalar> operator* (const QuaternionBase<OtherDerived>& q) const;
-  template<class OtherDerived> EIGEN_STRONG_INLINE Derived& operator*= (const QuaternionBase<OtherDerived>& q);
+  template<class OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Quaternion<Scalar> operator* (const QuaternionBase<OtherDerived>& q) const;
+  template<class OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator*= (const QuaternionBase<OtherDerived>& q);
 
   /** \returns the quaternion describing the inverse rotation */
-  Quaternion<Scalar> inverse() const;
+  EIGEN_DEVICE_FUNC Quaternion<Scalar> inverse() const;
 
   /** \returns the conjugated quaternion */
-  Quaternion<Scalar> conjugate() const;
+  EIGEN_DEVICE_FUNC Quaternion<Scalar> conjugate() const;
 
-  template<class OtherDerived> Quaternion<Scalar> slerp(const Scalar& t, const QuaternionBase<OtherDerived>& other) const;
+  template<class OtherDerived> EIGEN_DEVICE_FUNC Quaternion<Scalar> slerp(const Scalar& t, const QuaternionBase<OtherDerived>& other) const;
 
   /** \returns \c true if \c *this is approximately equal to \a other, within the precision
     * determined by \a prec.
     *
     * \sa MatrixBase::isApprox() */
   template<class OtherDerived>
-  bool isApprox(const QuaternionBase<OtherDerived>& other, const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const
+  EIGEN_DEVICE_FUNC bool isApprox(const QuaternionBase<OtherDerived>& other, const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const
   { return coeffs().isApprox(other.coeffs(), prec); }
 
-	/** return the result vector of \a v through the rotation*/
-  EIGEN_STRONG_INLINE Vector3 _transformVector(const Vector3& v) const;
+  /** return the result vector of \a v through the rotation*/
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Vector3 _transformVector(const Vector3& v) const;
 
   /** \returns \c *this with scalar type casted to \a NewScalarType
     *
@@ -169,7 +170,7 @@ public:
     * then this function smartly returns a const reference to \c *this.
     */
   template<typename NewScalarType>
-  inline typename internal::cast_return_type<Derived,Quaternion<NewScalarType> >::type cast() const
+  EIGEN_DEVICE_FUNC inline typename internal::cast_return_type<Derived,Quaternion<NewScalarType> >::type cast() const
   {
     return typename internal::cast_return_type<Derived,Quaternion<NewScalarType> >::type(derived());
   }
@@ -216,8 +217,8 @@ struct traits<Quaternion<_Scalar,_Options> >
   typedef _Scalar Scalar;
   typedef Matrix<_Scalar,4,1,_Options> Coefficients;
   enum{
-    IsAligned = internal::traits<Coefficients>::Flags & AlignedBit,
-    Flags = IsAligned ? (AlignedBit | LvalueBit) : LvalueBit
+    Alignment = internal::traits<Coefficients>::Alignment,
+    Flags = LvalueBit
   };
 };
 }
@@ -225,10 +226,10 @@ struct traits<Quaternion<_Scalar,_Options> >
 template<typename _Scalar, int _Options>
 class Quaternion : public QuaternionBase<Quaternion<_Scalar,_Options> >
 {
+public:
   typedef QuaternionBase<Quaternion<_Scalar,_Options> > Base;
-  enum { IsAligned = internal::traits<Quaternion>::IsAligned };
+  enum { NeedsAlignment = internal::traits<Quaternion>::Alignment>0 };
 
-public:
   typedef _Scalar Scalar;
 
   EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Quaternion)
@@ -238,7 +239,7 @@ public:
   typedef typename Base::AngleAxisType AngleAxisType;
 
   /** Default constructor leaving the quaternion uninitialized. */
-  inline Quaternion() {}
+  EIGEN_DEVICE_FUNC inline Quaternion() {}
 
   /** Constructs and initializes the quaternion \f$ w+xi+yj+zk \f$ from
     * its four coefficients \a w, \a x, \a y and \a z.
@@ -247,36 +248,42 @@ public:
     * while internally the coefficients are stored in the following order:
     * [\c x, \c y, \c z, \c w]
     */
-  inline Quaternion(const Scalar& w, const Scalar& x, const Scalar& y, const Scalar& z) : m_coeffs(x, y, z, w){}
+  EIGEN_DEVICE_FUNC inline Quaternion(const Scalar& w, const Scalar& x, const Scalar& y, const Scalar& z) : m_coeffs(x, y, z, w){}
 
   /** Constructs and initialize a quaternion from the array data */
-  inline Quaternion(const Scalar* data) : m_coeffs(data) {}
+  EIGEN_DEVICE_FUNC explicit inline Quaternion(const Scalar* data) : m_coeffs(data) {}
 
   /** Copy constructor */
-  template<class Derived> EIGEN_STRONG_INLINE Quaternion(const QuaternionBase<Derived>& other) { this->Base::operator=(other); }
+  template<class Derived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Quaternion(const QuaternionBase<Derived>& other) { this->Base::operator=(other); }
 
   /** Constructs and initializes a quaternion from the angle-axis \a aa */
-  explicit inline Quaternion(const AngleAxisType& aa) { *this = aa; }
+  EIGEN_DEVICE_FUNC explicit inline Quaternion(const AngleAxisType& aa) { *this = aa; }
 
   /** Constructs and initializes a quaternion from either:
     *  - a rotation matrix expression,
     *  - a 4D vector expression representing quaternion coefficients.
     */
   template<typename Derived>
-  explicit inline Quaternion(const MatrixBase<Derived>& other) { *this = other; }
+  EIGEN_DEVICE_FUNC explicit inline Quaternion(const MatrixBase<Derived>& other) { *this = other; }
 
   /** Explicit copy constructor with scalar conversion */
   template<typename OtherScalar, int OtherOptions>
-  explicit inline Quaternion(const Quaternion<OtherScalar, OtherOptions>& other)
+  EIGEN_DEVICE_FUNC explicit inline Quaternion(const Quaternion<OtherScalar, OtherOptions>& other)
   { m_coeffs = other.coeffs().template cast<Scalar>(); }
 
+  EIGEN_DEVICE_FUNC static Quaternion UnitRandom();
+
   template<typename Derived1, typename Derived2>
-  static Quaternion FromTwoVectors(const MatrixBase<Derived1>& a, const MatrixBase<Derived2>& b);
+  EIGEN_DEVICE_FUNC static Quaternion FromTwoVectors(const MatrixBase<Derived1>& a, const MatrixBase<Derived2>& b);
 
-  inline Coefficients& coeffs() { return m_coeffs;}
-  inline const Coefficients& coeffs() const { return m_coeffs;}
+  EIGEN_DEVICE_FUNC inline Coefficients& coeffs() { return m_coeffs;}
+  EIGEN_DEVICE_FUNC inline const Coefficients& coeffs() const { return m_coeffs;}
 
-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(IsAligned)
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(NeedsAlignment))
+  
+#ifdef EIGEN_QUATERNION_PLUGIN
+# include EIGEN_QUATERNION_PLUGIN
+#endif
 
 protected:
   Coefficients m_coeffs;
@@ -336,9 +343,9 @@ template<typename _Scalar, int _Options>
 class Map<const Quaternion<_Scalar>, _Options >
   : public QuaternionBase<Map<const Quaternion<_Scalar>, _Options> >
 {
+  public:
     typedef QuaternionBase<Map<const Quaternion<_Scalar>, _Options> > Base;
 
-  public:
     typedef _Scalar Scalar;
     typedef typename internal::traits<Map>::Coefficients Coefficients;
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Map)
@@ -350,9 +357,9 @@ class Map<const Quaternion<_Scalar>, _Options >
       * \code *coeffs == {x, y, z, w} \endcode
       *
       * If the template parameter _Options is set to #Aligned, then the pointer coeffs must be aligned. */
-    EIGEN_STRONG_INLINE Map(const Scalar* coeffs) : m_coeffs(coeffs) {}
+    EIGEN_DEVICE_FUNC explicit EIGEN_STRONG_INLINE Map(const Scalar* coeffs) : m_coeffs(coeffs) {}
 
-    inline const Coefficients& coeffs() const { return m_coeffs;}
+    EIGEN_DEVICE_FUNC inline const Coefficients& coeffs() const { return m_coeffs;}
 
   protected:
     const Coefficients m_coeffs;
@@ -373,9 +380,9 @@ template<typename _Scalar, int _Options>
 class Map<Quaternion<_Scalar>, _Options >
   : public QuaternionBase<Map<Quaternion<_Scalar>, _Options> >
 {
+  public:
     typedef QuaternionBase<Map<Quaternion<_Scalar>, _Options> > Base;
 
-  public:
     typedef _Scalar Scalar;
     typedef typename internal::traits<Map>::Coefficients Coefficients;
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Map)
@@ -387,10 +394,10 @@ class Map<Quaternion<_Scalar>, _Options >
       * \code *coeffs == {x, y, z, w} \endcode
       *
       * If the template parameter _Options is set to #Aligned, then the pointer coeffs must be aligned. */
-    EIGEN_STRONG_INLINE Map(Scalar* coeffs) : m_coeffs(coeffs) {}
+    EIGEN_DEVICE_FUNC explicit EIGEN_STRONG_INLINE Map(Scalar* coeffs) : m_coeffs(coeffs) {}
 
-    inline Coefficients& coeffs() { return m_coeffs; }
-    inline const Coefficients& coeffs() const { return m_coeffs; }
+    EIGEN_DEVICE_FUNC inline Coefficients& coeffs() { return m_coeffs; }
+    EIGEN_DEVICE_FUNC inline const Coefficients& coeffs() const { return m_coeffs; }
 
   protected:
     Coefficients m_coeffs;
@@ -418,7 +425,7 @@ typedef Map<Quaternion<double>, Aligned>  QuaternionMapAlignedd;
 namespace internal {
 template<int Arch, class Derived1, class Derived2, typename Scalar, int _Options> struct quat_product
 {
-  static EIGEN_STRONG_INLINE Quaternion<Scalar> run(const QuaternionBase<Derived1>& a, const QuaternionBase<Derived2>& b){
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Quaternion<Scalar> run(const QuaternionBase<Derived1>& a, const QuaternionBase<Derived2>& b){
     return Quaternion<Scalar>
     (
       a.w() * b.w() - a.x() * b.x() - a.y() * b.y() - a.z() * b.z(),
@@ -433,20 +440,20 @@ template<int Arch, class Derived1, class Derived2, typename Scalar, int _Options
 /** \returns the concatenation of two rotations as a quaternion-quaternion product */
 template <class Derived>
 template <class OtherDerived>
-EIGEN_STRONG_INLINE Quaternion<typename internal::traits<Derived>::Scalar>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Quaternion<typename internal::traits<Derived>::Scalar>
 QuaternionBase<Derived>::operator* (const QuaternionBase<OtherDerived>& other) const
 {
   EIGEN_STATIC_ASSERT((internal::is_same<typename Derived::Scalar, typename OtherDerived::Scalar>::value),
    YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
   return internal::quat_product<Architecture::Target, Derived, OtherDerived,
                          typename internal::traits<Derived>::Scalar,
-                         internal::traits<Derived>::IsAligned && internal::traits<OtherDerived>::IsAligned>::run(*this, other);
+                         EIGEN_PLAIN_ENUM_MIN(internal::traits<Derived>::Alignment, internal::traits<OtherDerived>::Alignment)>::run(*this, other);
 }
 
 /** \sa operator*(Quaternion) */
 template <class Derived>
 template <class OtherDerived>
-EIGEN_STRONG_INLINE Derived& QuaternionBase<Derived>::operator*= (const QuaternionBase<OtherDerived>& other)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& QuaternionBase<Derived>::operator*= (const QuaternionBase<OtherDerived>& other)
 {
   derived() = derived() * other.derived();
   return derived();
@@ -460,7 +467,7 @@ EIGEN_STRONG_INLINE Derived& QuaternionBase<Derived>::operator*= (const Quaterni
   *   - Via a Matrix3: 24 + 15n
   */
 template <class Derived>
-EIGEN_STRONG_INLINE typename QuaternionBase<Derived>::Vector3
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename QuaternionBase<Derived>::Vector3
 QuaternionBase<Derived>::_transformVector(const Vector3& v) const
 {
     // Note that this algorithm comes from the optimization by hand
@@ -474,7 +481,7 @@ QuaternionBase<Derived>::_transformVector(const Vector3& v) const
 }
 
 template<class Derived>
-EIGEN_STRONG_INLINE QuaternionBase<Derived>& QuaternionBase<Derived>::operator=(const QuaternionBase<Derived>& other)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE QuaternionBase<Derived>& QuaternionBase<Derived>::operator=(const QuaternionBase<Derived>& other)
 {
   coeffs() = other.coeffs();
   return derived();
@@ -482,7 +489,7 @@ EIGEN_STRONG_INLINE QuaternionBase<Derived>& QuaternionBase<Derived>::operator=(
 
 template<class Derived>
 template<class OtherDerived>
-EIGEN_STRONG_INLINE Derived& QuaternionBase<Derived>::operator=(const QuaternionBase<OtherDerived>& other)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& QuaternionBase<Derived>::operator=(const QuaternionBase<OtherDerived>& other)
 {
   coeffs() = other.coeffs();
   return derived();
@@ -491,10 +498,10 @@ EIGEN_STRONG_INLINE Derived& QuaternionBase<Derived>::operator=(const Quaternion
 /** Set \c *this from an angle-axis \a aa and returns a reference to \c *this
   */
 template<class Derived>
-EIGEN_STRONG_INLINE Derived& QuaternionBase<Derived>::operator=(const AngleAxisType& aa)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& QuaternionBase<Derived>::operator=(const AngleAxisType& aa)
 {
-  using std::cos;
-  using std::sin;
+  EIGEN_USING_STD_MATH(cos)
+  EIGEN_USING_STD_MATH(sin)
   Scalar ha = Scalar(0.5)*aa.angle(); // Scalar(0.5) to suppress precision loss warnings
   this->w() = cos(ha);
   this->vec() = sin(ha) * aa.axis();
@@ -509,7 +516,7 @@ EIGEN_STRONG_INLINE Derived& QuaternionBase<Derived>::operator=(const AngleAxisT
 
 template<class Derived>
 template<class MatrixDerived>
-inline Derived& QuaternionBase<Derived>::operator=(const MatrixBase<MatrixDerived>& xpr)
+EIGEN_DEVICE_FUNC inline Derived& QuaternionBase<Derived>::operator=(const MatrixBase<MatrixDerived>& xpr)
 {
   EIGEN_STATIC_ASSERT((internal::is_same<typename Derived::Scalar, typename MatrixDerived::Scalar>::value),
    YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
@@ -521,7 +528,7 @@ inline Derived& QuaternionBase<Derived>::operator=(const MatrixBase<MatrixDerive
   * be normalized, otherwise the result is undefined.
   */
 template<class Derived>
-inline typename QuaternionBase<Derived>::Matrix3
+EIGEN_DEVICE_FUNC inline typename QuaternionBase<Derived>::Matrix3
 QuaternionBase<Derived>::toRotationMatrix(void) const
 {
   // NOTE if inlined, then gcc 4.2 and 4.4 get rid of the temporary (not gcc 4.3 !!)
@@ -568,10 +575,9 @@ QuaternionBase<Derived>::toRotationMatrix(void) const
   */
 template<class Derived>
 template<typename Derived1, typename Derived2>
-inline Derived& QuaternionBase<Derived>::setFromTwoVectors(const MatrixBase<Derived1>& a, const MatrixBase<Derived2>& b)
+EIGEN_DEVICE_FUNC inline Derived& QuaternionBase<Derived>::setFromTwoVectors(const MatrixBase<Derived1>& a, const MatrixBase<Derived2>& b)
 {
-  using std::max;
-  using std::sqrt;
+  EIGEN_USING_STD_MATH(sqrt)
   Vector3 v0 = a.normalized();
   Vector3 v1 = b.normalized();
   Scalar c = v1.dot(v0);
@@ -586,7 +592,7 @@ inline Derived& QuaternionBase<Derived>::setFromTwoVectors(const MatrixBase<Deri
   //    which yields a singular value problem
   if (c < Scalar(-1)+NumTraits<Scalar>::dummy_precision())
   {
-    c = (max)(c,Scalar(-1));
+    c = numext::maxi(c,Scalar(-1));
     Matrix<Scalar,2,3> m; m << v0.transpose(), v1.transpose();
     JacobiSVD<Matrix<Scalar,2,3> > svd(m, ComputeFullV);
     Vector3 axis = svd.matrixV().col(2);
@@ -605,6 +611,24 @@ inline Derived& QuaternionBase<Derived>::setFromTwoVectors(const MatrixBase<Deri
   return derived();
 }
 
+/** \returns a random unit quaternion following a uniform distribution law on SO(3)
+  *
+  * \note The implementation is based on http://planning.cs.uiuc.edu/node198.html
+  */
+template<typename Scalar, int Options>
+EIGEN_DEVICE_FUNC Quaternion<Scalar,Options> Quaternion<Scalar,Options>::UnitRandom()
+{
+  EIGEN_USING_STD_MATH(sqrt)
+  EIGEN_USING_STD_MATH(sin)
+  EIGEN_USING_STD_MATH(cos)
+  const Scalar u1 = internal::random<Scalar>(0, 1),
+               u2 = internal::random<Scalar>(0, 2*EIGEN_PI),
+               u3 = internal::random<Scalar>(0, 2*EIGEN_PI);
+  const Scalar a = sqrt(1 - u1),
+               b = sqrt(u1);
+  return Quaternion (a * sin(u2), a * cos(u2), b * sin(u3), b * cos(u3));
+}
+
 
 /** Returns a quaternion representing a rotation between
   * the two arbitrary vectors \a a and \a b. In other words, the built
@@ -618,7 +642,7 @@ inline Derived& QuaternionBase<Derived>::setFromTwoVectors(const MatrixBase<Deri
   */
 template<typename Scalar, int Options>
 template<typename Derived1, typename Derived2>
-Quaternion<Scalar,Options> Quaternion<Scalar,Options>::FromTwoVectors(const MatrixBase<Derived1>& a, const MatrixBase<Derived2>& b)
+EIGEN_DEVICE_FUNC Quaternion<Scalar,Options> Quaternion<Scalar,Options>::FromTwoVectors(const MatrixBase<Derived1>& a, const MatrixBase<Derived2>& b)
 {
     Quaternion quat;
     quat.setFromTwoVectors(a, b);
@@ -633,7 +657,7 @@ Quaternion<Scalar,Options> Quaternion<Scalar,Options>::FromTwoVectors(const Matr
   * \sa QuaternionBase::conjugate()
   */
 template <class Derived>
-inline Quaternion<typename internal::traits<Derived>::Scalar> QuaternionBase<Derived>::inverse() const
+EIGEN_DEVICE_FUNC inline Quaternion<typename internal::traits<Derived>::Scalar> QuaternionBase<Derived>::inverse() const
 {
   // FIXME should this function be called multiplicativeInverse and conjugate() be called inverse() or opposite()  ??
   Scalar n2 = this->squaredNorm();
@@ -646,6 +670,16 @@ inline Quaternion<typename internal::traits<Derived>::Scalar> QuaternionBase<Der
   }
 }
 
+// Generic conjugate of a Quaternion
+namespace internal {
+template<int Arch, class Derived, typename Scalar, int _Options> struct quat_conj
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Quaternion<Scalar> run(const QuaternionBase<Derived>& q){
+    return Quaternion<Scalar>(q.w(),-q.x(),-q.y(),-q.z());
+  }
+};
+}
+                         
 /** \returns the conjugate of the \c *this which is equal to the multiplicative inverse
   * if the quaternion is normalized.
   * The conjugate of a quaternion represents the opposite rotation.
@@ -653,10 +687,13 @@ inline Quaternion<typename internal::traits<Derived>::Scalar> QuaternionBase<Der
   * \sa Quaternion2::inverse()
   */
 template <class Derived>
-inline Quaternion<typename internal::traits<Derived>::Scalar>
+EIGEN_DEVICE_FUNC inline Quaternion<typename internal::traits<Derived>::Scalar>
 QuaternionBase<Derived>::conjugate() const
 {
-  return Quaternion<Scalar>(this->w(),-this->x(),-this->y(),-this->z());
+  return internal::quat_conj<Architecture::Target, Derived,
+                         typename internal::traits<Derived>::Scalar,
+                         internal::traits<Derived>::Alignment>::run(*this);
+                         
 }
 
 /** \returns the angle (in radian) between two rotations
@@ -664,13 +701,12 @@ QuaternionBase<Derived>::conjugate() const
   */
 template <class Derived>
 template <class OtherDerived>
-inline typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC inline typename internal::traits<Derived>::Scalar
 QuaternionBase<Derived>::angularDistance(const QuaternionBase<OtherDerived>& other) const
 {
-  using std::atan2;
-  using std::abs;
+  EIGEN_USING_STD_MATH(atan2)
   Quaternion<Scalar> d = (*this) * other.conjugate();
-  return Scalar(2) * atan2( d.vec().norm(), abs(d.w()) );
+  return Scalar(2) * atan2( d.vec().norm(), numext::abs(d.w()) );
 }
 
  
@@ -683,15 +719,14 @@ QuaternionBase<Derived>::angularDistance(const QuaternionBase<OtherDerived>& oth
   */
 template <class Derived>
 template <class OtherDerived>
-Quaternion<typename internal::traits<Derived>::Scalar>
+EIGEN_DEVICE_FUNC Quaternion<typename internal::traits<Derived>::Scalar>
 QuaternionBase<Derived>::slerp(const Scalar& t, const QuaternionBase<OtherDerived>& other) const
 {
-  using std::acos;
-  using std::sin;
-  using std::abs;
-  static const Scalar one = Scalar(1) - NumTraits<Scalar>::epsilon();
+  EIGEN_USING_STD_MATH(acos)
+  EIGEN_USING_STD_MATH(sin)
+  const Scalar one = Scalar(1) - NumTraits<Scalar>::epsilon();
   Scalar d = this->dot(other);
-  Scalar absD = abs(d);
+  Scalar absD = numext::abs(d);
 
   Scalar scale0;
   Scalar scale1;
@@ -722,10 +757,10 @@ template<typename Other>
 struct quaternionbase_assign_impl<Other,3,3>
 {
   typedef typename Other::Scalar Scalar;
-  typedef DenseIndex Index;
-  template<class Derived> static inline void run(QuaternionBase<Derived>& q, const Other& mat)
+  template<class Derived> EIGEN_DEVICE_FUNC static inline void run(QuaternionBase<Derived>& q, const Other& a_mat)
   {
-    using std::sqrt;
+    const typename internal::nested_eval<Other,2>::type mat(a_mat);
+    EIGEN_USING_STD_MATH(sqrt)
     // This algorithm comes from  "Quaternion Calculus and Fast Animation",
     // Ken Shoemake, 1987 SIGGRAPH course notes
     Scalar t = mat.trace();
@@ -740,13 +775,13 @@ struct quaternionbase_assign_impl<Other,3,3>
     }
     else
     {
-      DenseIndex i = 0;
+      Index i = 0;
       if (mat.coeff(1,1) > mat.coeff(0,0))
         i = 1;
       if (mat.coeff(2,2) > mat.coeff(i,i))
         i = 2;
-      DenseIndex j = (i+1)%3;
-      DenseIndex k = (j+1)%3;
+      Index j = (i+1)%3;
+      Index k = (j+1)%3;
 
       t = sqrt(mat.coeff(i,i)-mat.coeff(j,j)-mat.coeff(k,k) + Scalar(1.0));
       q.coeffs().coeffRef(i) = Scalar(0.5) * t;
@@ -763,7 +798,7 @@ template<typename Other>
 struct quaternionbase_assign_impl<Other,4,1>
 {
   typedef typename Other::Scalar Scalar;
-  template<class Derived> static inline void run(QuaternionBase<Derived>& q, const Other& vec)
+  template<class Derived> EIGEN_DEVICE_FUNC static inline void run(QuaternionBase<Derived>& q, const Other& vec)
   {
     q.coeffs() = vec;
   }
diff --git a/Eigen/src/Geometry/Rotation2D.h b/Eigen/src/Geometry/Rotation2D.h
index a2d59fce1..884b7d0ee 100644
--- a/Eigen/src/Geometry/Rotation2D.h
+++ b/Eigen/src/Geometry/Rotation2D.h
@@ -18,7 +18,7 @@ namespace Eigen {
   *
   * \brief Represents a rotation/orientation in a 2 dimensional space.
   *
-  * \param _Scalar the scalar type, i.e., the type of the coefficients
+  * \tparam _Scalar the scalar type, i.e., the type of the coefficients
   *
   * This class is equivalent to a single scalar representing a counter clock wise rotation
   * as a single angle in radian. It provides some additional features such as the automatic
@@ -59,41 +59,79 @@ protected:
 public:
 
   /** Construct a 2D counter clock wise rotation from the angle \a a in radian. */
-  inline Rotation2D(const Scalar& a) : m_angle(a) {}
+  EIGEN_DEVICE_FUNC explicit inline Rotation2D(const Scalar& a) : m_angle(a) {}
   
   /** Default constructor wihtout initialization. The represented rotation is undefined. */
-  Rotation2D() {}
+  EIGEN_DEVICE_FUNC Rotation2D() {}
+
+  /** Construct a 2D rotation from a 2x2 rotation matrix \a mat.
+    *
+    * \sa fromRotationMatrix()
+    */
+  template<typename Derived>
+  EIGEN_DEVICE_FUNC explicit Rotation2D(const MatrixBase<Derived>& m)
+  {
+    fromRotationMatrix(m.derived());
+  }
 
   /** \returns the rotation angle */
-  inline Scalar angle() const { return m_angle; }
+  EIGEN_DEVICE_FUNC inline Scalar angle() const { return m_angle; }
 
   /** \returns a read-write reference to the rotation angle */
-  inline Scalar& angle() { return m_angle; }
+  EIGEN_DEVICE_FUNC inline Scalar& angle() { return m_angle; }
+  
+  /** \returns the rotation angle in [0,2pi] */
+  EIGEN_DEVICE_FUNC inline Scalar smallestPositiveAngle() const {
+    Scalar tmp = numext::fmod(m_angle,Scalar(2*EIGEN_PI));
+    return tmp<Scalar(0) ? tmp + Scalar(2*EIGEN_PI) : tmp;
+  }
+  
+  /** \returns the rotation angle in [-pi,pi] */
+  EIGEN_DEVICE_FUNC inline Scalar smallestAngle() const {
+    Scalar tmp = numext::fmod(m_angle,Scalar(2*EIGEN_PI));
+    if(tmp>Scalar(EIGEN_PI))       tmp -= Scalar(2*EIGEN_PI);
+    else if(tmp<-Scalar(EIGEN_PI)) tmp += Scalar(2*EIGEN_PI);
+    return tmp;
+  }
 
   /** \returns the inverse rotation */
-  inline Rotation2D inverse() const { return -m_angle; }
+  EIGEN_DEVICE_FUNC inline Rotation2D inverse() const { return Rotation2D(-m_angle); }
 
   /** Concatenates two rotations */
-  inline Rotation2D operator*(const Rotation2D& other) const
-  { return m_angle + other.m_angle; }
+  EIGEN_DEVICE_FUNC inline Rotation2D operator*(const Rotation2D& other) const
+  { return Rotation2D(m_angle + other.m_angle); }
 
   /** Concatenates two rotations */
-  inline Rotation2D& operator*=(const Rotation2D& other)
+  EIGEN_DEVICE_FUNC inline Rotation2D& operator*=(const Rotation2D& other)
   { m_angle += other.m_angle; return *this; }
 
   /** Applies the rotation to a 2D vector */
-  Vector2 operator* (const Vector2& vec) const
+  EIGEN_DEVICE_FUNC Vector2 operator* (const Vector2& vec) const
   { return toRotationMatrix() * vec; }
   
   template<typename Derived>
-  Rotation2D& fromRotationMatrix(const MatrixBase<Derived>& m);
-  Matrix2 toRotationMatrix() const;
+  EIGEN_DEVICE_FUNC Rotation2D& fromRotationMatrix(const MatrixBase<Derived>& m);
+  EIGEN_DEVICE_FUNC Matrix2 toRotationMatrix() const;
+
+  /** Set \c *this from a 2x2 rotation matrix \a mat.
+    * In other words, this function extract the rotation angle from the rotation matrix.
+    *
+    * This method is an alias for fromRotationMatrix()
+    *
+    * \sa fromRotationMatrix()
+    */
+  template<typename Derived>
+  EIGEN_DEVICE_FUNC Rotation2D& operator=(const  MatrixBase<Derived>& m)
+  { return fromRotationMatrix(m.derived()); }
 
   /** \returns the spherical interpolation between \c *this and \a other using
     * parameter \a t. It is in fact equivalent to a linear interpolation.
     */
-  inline Rotation2D slerp(const Scalar& t, const Rotation2D& other) const
-  { return m_angle * (1-t) + other.angle() * t; }
+  EIGEN_DEVICE_FUNC inline Rotation2D slerp(const Scalar& t, const Rotation2D& other) const
+  {
+    Scalar dist = Rotation2D(other.m_angle-m_angle).smallestAngle();
+    return Rotation2D(m_angle + dist*t);
+  }
 
   /** \returns \c *this with scalar type casted to \a NewScalarType
     *
@@ -101,24 +139,25 @@ public:
     * then this function smartly returns a const reference to \c *this.
     */
   template<typename NewScalarType>
-  inline typename internal::cast_return_type<Rotation2D,Rotation2D<NewScalarType> >::type cast() const
+  EIGEN_DEVICE_FUNC inline typename internal::cast_return_type<Rotation2D,Rotation2D<NewScalarType> >::type cast() const
   { return typename internal::cast_return_type<Rotation2D,Rotation2D<NewScalarType> >::type(*this); }
 
   /** Copy constructor with scalar type conversion */
   template<typename OtherScalarType>
-  inline explicit Rotation2D(const Rotation2D<OtherScalarType>& other)
+  EIGEN_DEVICE_FUNC inline explicit Rotation2D(const Rotation2D<OtherScalarType>& other)
   {
     m_angle = Scalar(other.angle());
   }
 
-  static inline Rotation2D Identity() { return Rotation2D(0); }
+  EIGEN_DEVICE_FUNC static inline Rotation2D Identity() { return Rotation2D(0); }
 
   /** \returns \c true if \c *this is approximately equal to \a other, within the precision
     * determined by \a prec.
     *
     * \sa MatrixBase::isApprox() */
-  bool isApprox(const Rotation2D& other, const typename NumTraits<Scalar>::Real& prec = NumTraits<Scalar>::dummy_precision()) const
+  EIGEN_DEVICE_FUNC bool isApprox(const Rotation2D& other, const typename NumTraits<Scalar>::Real& prec = NumTraits<Scalar>::dummy_precision()) const
   { return internal::isApprox(m_angle,other.m_angle, prec); }
+  
 };
 
 /** \ingroup Geometry_Module
@@ -134,9 +173,9 @@ typedef Rotation2D<double> Rotation2Dd;
   */
 template<typename Scalar>
 template<typename Derived>
-Rotation2D<Scalar>& Rotation2D<Scalar>::fromRotationMatrix(const MatrixBase<Derived>& mat)
+EIGEN_DEVICE_FUNC Rotation2D<Scalar>& Rotation2D<Scalar>::fromRotationMatrix(const MatrixBase<Derived>& mat)
 {
-  using std::atan2;
+  EIGEN_USING_STD_MATH(atan2)
   EIGEN_STATIC_ASSERT(Derived::RowsAtCompileTime==2 && Derived::ColsAtCompileTime==2,YOU_MADE_A_PROGRAMMING_MISTAKE)
   m_angle = atan2(mat.coeff(1,0), mat.coeff(0,0));
   return *this;
@@ -146,10 +185,10 @@ Rotation2D<Scalar>& Rotation2D<Scalar>::fromRotationMatrix(const MatrixBase<Deri
   */
 template<typename Scalar>
 typename Rotation2D<Scalar>::Matrix2
-Rotation2D<Scalar>::toRotationMatrix(void) const
+EIGEN_DEVICE_FUNC Rotation2D<Scalar>::toRotationMatrix(void) const
 {
-  using std::sin;
-  using std::cos;
+  EIGEN_USING_STD_MATH(sin)
+  EIGEN_USING_STD_MATH(cos)
   Scalar sinA = sin(m_angle);
   Scalar cosA = cos(m_angle);
   return (Matrix2() << cosA, -sinA, sinA, cosA).finished();
diff --git a/Eigen/src/Geometry/RotationBase.h b/Eigen/src/Geometry/RotationBase.h
index b88661de6..f0ee0bd03 100644
--- a/Eigen/src/Geometry/RotationBase.h
+++ b/Eigen/src/Geometry/RotationBase.h
@@ -22,8 +22,8 @@ struct rotation_base_generic_product_selector;
   *
   * \brief Common base class for compact rotation representations
   *
-  * \param Derived is the derived type, i.e., a rotation type
-  * \param _Dim the dimension of the space
+  * \tparam Derived is the derived type, i.e., a rotation type
+  * \tparam _Dim the dimension of the space
   */
 template<typename Derived, int _Dim>
 class RotationBase
@@ -38,26 +38,26 @@ class RotationBase
     typedef Matrix<Scalar,Dim,1> VectorType;
 
   public:
-    inline const Derived& derived() const { return *static_cast<const Derived*>(this); }
-    inline Derived& derived() { return *static_cast<Derived*>(this); }
+    EIGEN_DEVICE_FUNC inline const Derived& derived() const { return *static_cast<const Derived*>(this); }
+    EIGEN_DEVICE_FUNC inline Derived& derived() { return *static_cast<Derived*>(this); }
 
     /** \returns an equivalent rotation matrix */
-    inline RotationMatrixType toRotationMatrix() const { return derived().toRotationMatrix(); }
+    EIGEN_DEVICE_FUNC inline RotationMatrixType toRotationMatrix() const { return derived().toRotationMatrix(); }
 
     /** \returns an equivalent rotation matrix 
       * This function is added to be conform with the Transform class' naming scheme.
       */
-    inline RotationMatrixType matrix() const { return derived().toRotationMatrix(); }
+    EIGEN_DEVICE_FUNC inline RotationMatrixType matrix() const { return derived().toRotationMatrix(); }
 
     /** \returns the inverse rotation */
-    inline Derived inverse() const { return derived().inverse(); }
+    EIGEN_DEVICE_FUNC inline Derived inverse() const { return derived().inverse(); }
 
     /** \returns the concatenation of the rotation \c *this with a translation \a t */
-    inline Transform<Scalar,Dim,Isometry> operator*(const Translation<Scalar,Dim>& t) const
+    EIGEN_DEVICE_FUNC inline Transform<Scalar,Dim,Isometry> operator*(const Translation<Scalar,Dim>& t) const
     { return Transform<Scalar,Dim,Isometry>(*this) * t; }
 
     /** \returns the concatenation of the rotation \c *this with a uniform scaling \a s */
-    inline RotationMatrixType operator*(const UniformScaling<Scalar>& s) const
+    EIGEN_DEVICE_FUNC inline RotationMatrixType operator*(const UniformScaling<Scalar>& s) const
     { return toRotationMatrix() * s.factor(); }
 
     /** \returns the concatenation of the rotation \c *this with a generic expression \a e
@@ -67,17 +67,17 @@ class RotationBase
       *  - a vector of size Dim
       */
     template<typename OtherDerived>
-    EIGEN_STRONG_INLINE typename internal::rotation_base_generic_product_selector<Derived,OtherDerived,OtherDerived::IsVectorAtCompileTime>::ReturnType
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::rotation_base_generic_product_selector<Derived,OtherDerived,OtherDerived::IsVectorAtCompileTime>::ReturnType
     operator*(const EigenBase<OtherDerived>& e) const
     { return internal::rotation_base_generic_product_selector<Derived,OtherDerived>::run(derived(), e.derived()); }
 
     /** \returns the concatenation of a linear transformation \a l with the rotation \a r */
     template<typename OtherDerived> friend
-    inline RotationMatrixType operator*(const EigenBase<OtherDerived>& l, const Derived& r)
+    EIGEN_DEVICE_FUNC inline RotationMatrixType operator*(const EigenBase<OtherDerived>& l, const Derived& r)
     { return l.derived() * r.toRotationMatrix(); }
 
     /** \returns the concatenation of a scaling \a l with the rotation \a r */
-    friend inline Transform<Scalar,Dim,Affine> operator*(const DiagonalMatrix<Scalar,Dim>& l, const Derived& r)
+    EIGEN_DEVICE_FUNC friend inline Transform<Scalar,Dim,Affine> operator*(const DiagonalMatrix<Scalar,Dim>& l, const Derived& r)
     { 
       Transform<Scalar,Dim,Affine> res(r);
       res.linear().applyOnTheLeft(l);
@@ -86,11 +86,11 @@ class RotationBase
 
     /** \returns the concatenation of the rotation \c *this with a transformation \a t */
     template<int Mode, int Options>
-    inline Transform<Scalar,Dim,Mode> operator*(const Transform<Scalar,Dim,Mode,Options>& t) const
+    EIGEN_DEVICE_FUNC inline Transform<Scalar,Dim,Mode> operator*(const Transform<Scalar,Dim,Mode,Options>& t) const
     { return toRotationMatrix() * t; }
 
     template<typename OtherVectorType>
-    inline VectorType _transformVector(const OtherVectorType& v) const
+    EIGEN_DEVICE_FUNC inline VectorType _transformVector(const OtherVectorType& v) const
     { return toRotationMatrix() * v; }
 };
 
@@ -102,7 +102,7 @@ struct rotation_base_generic_product_selector<RotationDerived,MatrixType,false>
 {
   enum { Dim = RotationDerived::Dim };
   typedef Matrix<typename RotationDerived::Scalar,Dim,Dim> ReturnType;
-  static inline ReturnType run(const RotationDerived& r, const MatrixType& m)
+  EIGEN_DEVICE_FUNC static inline ReturnType run(const RotationDerived& r, const MatrixType& m)
   { return r.toRotationMatrix() * m; }
 };
 
@@ -110,7 +110,7 @@ template<typename RotationDerived, typename Scalar, int Dim, int MaxDim>
 struct rotation_base_generic_product_selector< RotationDerived, DiagonalMatrix<Scalar,Dim,MaxDim>, false >
 {
   typedef Transform<Scalar,Dim,Affine> ReturnType;
-  static inline ReturnType run(const RotationDerived& r, const DiagonalMatrix<Scalar,Dim,MaxDim>& m)
+  EIGEN_DEVICE_FUNC static inline ReturnType run(const RotationDerived& r, const DiagonalMatrix<Scalar,Dim,MaxDim>& m)
   {
     ReturnType res(r);
     res.linear() *= m;
@@ -123,7 +123,7 @@ struct rotation_base_generic_product_selector<RotationDerived,OtherVectorType,tr
 {
   enum { Dim = RotationDerived::Dim };
   typedef Matrix<typename RotationDerived::Scalar,Dim,1> ReturnType;
-  static EIGEN_STRONG_INLINE ReturnType run(const RotationDerived& r, const OtherVectorType& v)
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE ReturnType run(const RotationDerived& r, const OtherVectorType& v)
   {
     return r._transformVector(v);
   }
@@ -137,7 +137,7 @@ struct rotation_base_generic_product_selector<RotationDerived,OtherVectorType,tr
   */
 template<typename _Scalar, int _Rows, int _Cols, int _Storage, int _MaxRows, int _MaxCols>
 template<typename OtherDerived>
-Matrix<_Scalar, _Rows, _Cols, _Storage, _MaxRows, _MaxCols>
+EIGEN_DEVICE_FUNC Matrix<_Scalar, _Rows, _Cols, _Storage, _MaxRows, _MaxCols>
 ::Matrix(const RotationBase<OtherDerived,ColsAtCompileTime>& r)
 {
   EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(Matrix,int(OtherDerived::Dim),int(OtherDerived::Dim))
@@ -150,7 +150,7 @@ Matrix<_Scalar, _Rows, _Cols, _Storage, _MaxRows, _MaxCols>
   */
 template<typename _Scalar, int _Rows, int _Cols, int _Storage, int _MaxRows, int _MaxCols>
 template<typename OtherDerived>
-Matrix<_Scalar, _Rows, _Cols, _Storage, _MaxRows, _MaxCols>&
+EIGEN_DEVICE_FUNC Matrix<_Scalar, _Rows, _Cols, _Storage, _MaxRows, _MaxCols>&
 Matrix<_Scalar, _Rows, _Cols, _Storage, _MaxRows, _MaxCols>
 ::operator=(const RotationBase<OtherDerived,ColsAtCompileTime>& r)
 {
@@ -164,8 +164,8 @@ namespace internal {
   *
   * Helper function to return an arbitrary rotation object to a rotation matrix.
   *
-  * \param Scalar the numeric type of the matrix coefficients
-  * \param Dim the dimension of the current space
+  * \tparam Scalar the numeric type of the matrix coefficients
+  * \tparam Dim the dimension of the current space
   *
   * It returns a Dim x Dim fixed size matrix.
   *
@@ -179,20 +179,20 @@ namespace internal {
   * \sa class Transform, class Rotation2D, class Quaternion, class AngleAxis
   */
 template<typename Scalar, int Dim>
-static inline Matrix<Scalar,2,2> toRotationMatrix(const Scalar& s)
+EIGEN_DEVICE_FUNC static inline Matrix<Scalar,2,2> toRotationMatrix(const Scalar& s)
 {
   EIGEN_STATIC_ASSERT(Dim==2,YOU_MADE_A_PROGRAMMING_MISTAKE)
   return Rotation2D<Scalar>(s).toRotationMatrix();
 }
 
 template<typename Scalar, int Dim, typename OtherDerived>
-static inline Matrix<Scalar,Dim,Dim> toRotationMatrix(const RotationBase<OtherDerived,Dim>& r)
+EIGEN_DEVICE_FUNC static inline Matrix<Scalar,Dim,Dim> toRotationMatrix(const RotationBase<OtherDerived,Dim>& r)
 {
   return r.toRotationMatrix();
 }
 
 template<typename Scalar, int Dim, typename OtherDerived>
-static inline const MatrixBase<OtherDerived>& toRotationMatrix(const MatrixBase<OtherDerived>& mat)
+EIGEN_DEVICE_FUNC static inline const MatrixBase<OtherDerived>& toRotationMatrix(const MatrixBase<OtherDerived>& mat)
 {
   EIGEN_STATIC_ASSERT(OtherDerived::RowsAtCompileTime==Dim && OtherDerived::ColsAtCompileTime==Dim,
     YOU_MADE_A_PROGRAMMING_MISTAKE)
diff --git a/Eigen/src/Geometry/Scaling.h b/Eigen/src/Geometry/Scaling.h
index 1c25f36fe..f58ca03d9 100644..100755
--- a/Eigen/src/Geometry/Scaling.h
+++ b/Eigen/src/Geometry/Scaling.h
@@ -18,7 +18,7 @@ namespace Eigen {
   *
   * \brief Represents a generic uniform scaling transformation
   *
-  * \param _Scalar the scalar type, i.e., the type of the coefficients.
+  * \tparam _Scalar the scalar type, i.e., the type of the coefficients.
   *
   * This class represent a uniform scaling transformation. It is the return
   * type of Scaling(Scalar), and most of the time this is the only way it
@@ -62,10 +62,10 @@ public:
   template<int Dim, int Mode, int Options>
   inline Transform<Scalar,Dim,(int(Mode)==int(Isometry)?Affine:Mode)> operator* (const Transform<Scalar,Dim, Mode, Options>& t) const
   {
-   Transform<Scalar,Dim,(int(Mode)==int(Isometry)?Affine:Mode)> res = t;
-   res.prescale(factor());
-   return res;
-}
+    Transform<Scalar,Dim,(int(Mode)==int(Isometry)?Affine:Mode)> res = t;
+    res.prescale(factor());
+    return res;
+  }
 
   /** Concatenates a uniform scaling and a linear transformation matrix */
   // TODO returns an expression
@@ -104,40 +104,44 @@ public:
 
 };
 
-/** Concatenates a linear transformation matrix and a uniform scaling */
+/** \addtogroup Geometry_Module */
+//@{
+
+/** Concatenates a linear transformation matrix and a uniform scaling
+  * \relates UniformScaling
+  */
 // NOTE this operator is defiend in MatrixBase and not as a friend function
 // of UniformScaling to fix an internal crash of Intel's ICC
-template<typename Derived> typename MatrixBase<Derived>::ScalarMultipleReturnType
-MatrixBase<Derived>::operator*(const UniformScaling<Scalar>& s) const
-{ return derived() * s.factor(); }
+template<typename Derived,typename Scalar>
+EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,Scalar,product)
+operator*(const MatrixBase<Derived>& matrix, const UniformScaling<Scalar>& s)
+{ return matrix.derived() * s.factor(); }
 
 /** Constructs a uniform scaling from scale factor \a s */
-static inline UniformScaling<float> Scaling(float s) { return UniformScaling<float>(s); }
+inline UniformScaling<float> Scaling(float s) { return UniformScaling<float>(s); }
 /** Constructs a uniform scaling from scale factor \a s */
-static inline UniformScaling<double> Scaling(double s) { return UniformScaling<double>(s); }
+inline UniformScaling<double> Scaling(double s) { return UniformScaling<double>(s); }
 /** Constructs a uniform scaling from scale factor \a s */
 template<typename RealScalar>
-static inline UniformScaling<std::complex<RealScalar> > Scaling(const std::complex<RealScalar>& s)
+inline UniformScaling<std::complex<RealScalar> > Scaling(const std::complex<RealScalar>& s)
 { return UniformScaling<std::complex<RealScalar> >(s); }
 
 /** Constructs a 2D axis aligned scaling */
 template<typename Scalar>
-static inline DiagonalMatrix<Scalar,2> Scaling(const Scalar& sx, const Scalar& sy)
+inline DiagonalMatrix<Scalar,2> Scaling(const Scalar& sx, const Scalar& sy)
 { return DiagonalMatrix<Scalar,2>(sx, sy); }
 /** Constructs a 3D axis aligned scaling */
 template<typename Scalar>
-static inline DiagonalMatrix<Scalar,3> Scaling(const Scalar& sx, const Scalar& sy, const Scalar& sz)
+inline DiagonalMatrix<Scalar,3> Scaling(const Scalar& sx, const Scalar& sy, const Scalar& sz)
 { return DiagonalMatrix<Scalar,3>(sx, sy, sz); }
 
 /** Constructs an axis aligned scaling expression from vector expression \a coeffs
   * This is an alias for coeffs.asDiagonal()
   */
 template<typename Derived>
-static inline const DiagonalWrapper<const Derived> Scaling(const MatrixBase<Derived>& coeffs)
+inline const DiagonalWrapper<const Derived> Scaling(const MatrixBase<Derived>& coeffs)
 { return coeffs.asDiagonal(); }
 
-/** \addtogroup Geometry_Module */
-//@{
 /** \deprecated */
 typedef DiagonalMatrix<float, 2> AlignedScaling2f;
 /** \deprecated */
diff --git a/Eigen/src/Geometry/Transform.h b/Eigen/src/Geometry/Transform.h
index acee2d84c..3f31ee45d 100644
--- a/Eigen/src/Geometry/Transform.h
+++ b/Eigen/src/Geometry/Transform.h
@@ -32,7 +32,8 @@ template< typename TransformType,
           typename MatrixType,
           int Case = transform_traits<TransformType>::IsProjective ? 0
                    : int(MatrixType::RowsAtCompileTime) == int(transform_traits<TransformType>::HDim) ? 1
-                   : 2>
+                   : 2,
+          int RhsCols = MatrixType::ColsAtCompileTime>
 struct transform_right_product_impl;
 
 template< typename Other,
@@ -62,6 +63,22 @@ struct transform_construct_from_matrix;
 
 template<typename TransformType> struct transform_take_affine_part;
 
+template<typename _Scalar, int _Dim, int _Mode, int _Options>
+struct traits<Transform<_Scalar,_Dim,_Mode,_Options> >
+{
+  typedef _Scalar Scalar;
+  typedef Eigen::Index StorageIndex;
+  typedef Dense StorageKind;
+  enum {
+    Dim1 = _Dim==Dynamic ? _Dim : _Dim + 1,
+    RowsAtCompileTime = _Mode==Projective ? Dim1 : _Dim,
+    ColsAtCompileTime = Dim1,
+    MaxRowsAtCompileTime = RowsAtCompileTime,
+    MaxColsAtCompileTime = ColsAtCompileTime,
+    Flags = 0
+  };
+};
+
 template<int Mode> struct transform_make_affine;
 
 } // end namespace internal
@@ -102,15 +119,15 @@ template<int Mode> struct transform_make_affine;
   *
   * However, unlike a plain matrix, the Transform class provides many features
   * simplifying both its assembly and usage. In particular, it can be composed
-  * with any other transformations (Transform,Translation,RotationBase,Matrix)
+  * with any other transformations (Transform,Translation,RotationBase,DiagonalMatrix)
   * and can be directly used to transform implicit homogeneous vectors. All these
   * operations are handled via the operator*. For the composition of transformations,
   * its principle consists to first convert the right/left hand sides of the product
   * to a compatible (Dim+1)^2 matrix and then perform a pure matrix product.
   * Of course, internally, operator* tries to perform the minimal number of operations
   * according to the nature of each terms. Likewise, when applying the transform
-  * to non homogeneous vectors, the latters are automatically promoted to homogeneous
-  * one before doing the matrix product. The convertions to homogeneous representations
+  * to points, the latters are automatically promoted to homogeneous vectors
+  * before doing the matrix product. The conventions to homogeneous representations
   * are performed as follow:
   *
   * \b Translation t (Dim)x(1):
@@ -124,7 +141,7 @@ template<int Mode> struct transform_make_affine;
   * R & 0\\
   * 0\,...\,0 & 1
   * \end{array} \right) \f$
-  *
+  *<!--
   * \b Linear \b Matrix L (Dim)x(Dim):
   * \f$ \left( \begin{array}{cc}
   * L & 0\\
@@ -136,14 +153,20 @@ template<int Mode> struct transform_make_affine;
   * A\\
   * 0\,...\,0\,1
   * \end{array} \right) \f$
+  *-->
+  * \b Scaling \b DiagonalMatrix S (Dim)x(Dim):
+  * \f$ \left( \begin{array}{cc}
+  * S & 0\\
+  * 0\,...\,0 & 1
+  * \end{array} \right) \f$
   *
-  * \b Column \b vector v (Dim)x(1):
+  * \b Column \b point v (Dim)x(1):
   * \f$ \left( \begin{array}{c}
   * v\\
   * 1
   * \end{array} \right) \f$
   *
-  * \b Set \b of \b column \b vectors V1...Vn (Dim)x(n):
+  * \b Set \b of \b column \b points V1...Vn (Dim)x(n):
   * \f$ \left( \begin{array}{ccc}
   * v_1 & ... & v_n\\
   * 1 & ... & 1
@@ -170,7 +193,7 @@ template<int Mode> struct transform_make_affine;
   * preprocessor token EIGEN_QT_SUPPORT is defined.
   *
   * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_TRANSFORM_PLUGIN.
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_TRANSFORM_PLUGIN.
   *
   * \sa class Matrix, class Quaternion
   */
@@ -188,7 +211,8 @@ public:
   };
   /** the scalar type of the coefficients */
   typedef _Scalar Scalar;
-  typedef DenseIndex Index;
+  typedef Eigen::Index StorageIndex;
+  typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
   /** type of the matrix used to represent the transformation */
   typedef typename internal::make_proper_matrix_type<Scalar,Rows,HDim,Options>::type MatrixType;
   /** constified MatrixType */
@@ -210,9 +234,9 @@ public:
   /** type of a vector */
   typedef Matrix<Scalar,Dim,1> VectorType;
   /** type of a read/write reference to the translation part of the rotation */
-  typedef Block<MatrixType,Dim,1,int(Mode)==(AffineCompact)> TranslationPart;
+  typedef Block<MatrixType,Dim,1,!(internal::traits<MatrixType>::Flags & RowMajorBit)> TranslationPart;
   /** type of a read reference to the translation part of the rotation */
-  typedef const Block<ConstMatrixType,Dim,1,int(Mode)==(AffineCompact)> ConstTranslationPart;
+  typedef const Block<ConstMatrixType,Dim,1,!(internal::traits<MatrixType>::Flags & RowMajorBit)> ConstTranslationPart;
   /** corresponding translation type */
   typedef Translation<Scalar,Dim> TranslationType;
   
@@ -229,43 +253,43 @@ public:
 
   /** Default constructor without initialization of the meaningful coefficients.
     * If Mode==Affine, then the last row is set to [0 ... 0 1] */
-  inline Transform()
+  EIGEN_DEVICE_FUNC inline Transform()
   {
     check_template_params();
     internal::transform_make_affine<(int(Mode)==Affine) ? Affine : AffineCompact>::run(m_matrix);
   }
 
-  inline Transform(const Transform& other)
+  EIGEN_DEVICE_FUNC inline Transform(const Transform& other)
   {
     check_template_params();
     m_matrix = other.m_matrix;
   }
 
-  inline explicit Transform(const TranslationType& t)
+  EIGEN_DEVICE_FUNC inline explicit Transform(const TranslationType& t)
   {
     check_template_params();
     *this = t;
   }
-  inline explicit Transform(const UniformScaling<Scalar>& s)
+  EIGEN_DEVICE_FUNC inline explicit Transform(const UniformScaling<Scalar>& s)
   {
     check_template_params();
     *this = s;
   }
   template<typename Derived>
-  inline explicit Transform(const RotationBase<Derived, Dim>& r)
+  EIGEN_DEVICE_FUNC inline explicit Transform(const RotationBase<Derived, Dim>& r)
   {
     check_template_params();
     *this = r;
   }
 
-  inline Transform& operator=(const Transform& other)
+  EIGEN_DEVICE_FUNC inline Transform& operator=(const Transform& other)
   { m_matrix = other.m_matrix; return *this; }
 
   typedef internal::transform_take_affine_part<Transform> take_affine_part;
 
   /** Constructs and initializes a transformation from a Dim^2 or a (Dim+1)^2 matrix. */
   template<typename OtherDerived>
-  inline explicit Transform(const EigenBase<OtherDerived>& other)
+  EIGEN_DEVICE_FUNC inline explicit Transform(const EigenBase<OtherDerived>& other)
   {
     EIGEN_STATIC_ASSERT((internal::is_same<Scalar,typename OtherDerived::Scalar>::value),
       YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY);
@@ -276,7 +300,7 @@ public:
 
   /** Set \c *this from a Dim^2 or (Dim+1)^2 matrix. */
   template<typename OtherDerived>
-  inline Transform& operator=(const EigenBase<OtherDerived>& other)
+  EIGEN_DEVICE_FUNC inline Transform& operator=(const EigenBase<OtherDerived>& other)
   {
     EIGEN_STATIC_ASSERT((internal::is_same<Scalar,typename OtherDerived::Scalar>::value),
       YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY);
@@ -286,7 +310,7 @@ public:
   }
   
   template<int OtherOptions>
-  inline Transform(const Transform<Scalar,Dim,Mode,OtherOptions>& other)
+  EIGEN_DEVICE_FUNC inline Transform(const Transform<Scalar,Dim,Mode,OtherOptions>& other)
   {
     check_template_params();
     // only the options change, we can directly copy the matrices
@@ -294,7 +318,7 @@ public:
   }
 
   template<int OtherMode,int OtherOptions>
-  inline Transform(const Transform<Scalar,Dim,OtherMode,OtherOptions>& other)
+  EIGEN_DEVICE_FUNC inline Transform(const Transform<Scalar,Dim,OtherMode,OtherOptions>& other)
   {
     check_template_params();
     // prevent conversions as:
@@ -335,14 +359,14 @@ public:
   }
 
   template<typename OtherDerived>
-  Transform(const ReturnByValue<OtherDerived>& other)
+  EIGEN_DEVICE_FUNC Transform(const ReturnByValue<OtherDerived>& other)
   {
     check_template_params();
     other.evalTo(*this);
   }
 
   template<typename OtherDerived>
-  Transform& operator=(const ReturnByValue<OtherDerived>& other)
+  EIGEN_DEVICE_FUNC Transform& operator=(const ReturnByValue<OtherDerived>& other)
   {
     other.evalTo(*this);
     return *this;
@@ -356,60 +380,76 @@ public:
   inline Transform& operator=(const QTransform& other);
   inline QTransform toQTransform(void) const;
   #endif
+  
+  EIGEN_DEVICE_FUNC Index rows() const { return int(Mode)==int(Projective) ? m_matrix.cols() : (m_matrix.cols()-1); }
+  EIGEN_DEVICE_FUNC Index cols() const { return m_matrix.cols(); }
 
   /** shortcut for m_matrix(row,col);
     * \sa MatrixBase::operator(Index,Index) const */
-  inline Scalar operator() (Index row, Index col) const { return m_matrix(row,col); }
+  EIGEN_DEVICE_FUNC inline Scalar operator() (Index row, Index col) const { return m_matrix(row,col); }
   /** shortcut for m_matrix(row,col);
     * \sa MatrixBase::operator(Index,Index) */
-  inline Scalar& operator() (Index row, Index col) { return m_matrix(row,col); }
+  EIGEN_DEVICE_FUNC inline Scalar& operator() (Index row, Index col) { return m_matrix(row,col); }
 
   /** \returns a read-only expression of the transformation matrix */
-  inline const MatrixType& matrix() const { return m_matrix; }
+  EIGEN_DEVICE_FUNC inline const MatrixType& matrix() const { return m_matrix; }
   /** \returns a writable expression of the transformation matrix */
-  inline MatrixType& matrix() { return m_matrix; }
+  EIGEN_DEVICE_FUNC inline MatrixType& matrix() { return m_matrix; }
 
   /** \returns a read-only expression of the linear part of the transformation */
-  inline ConstLinearPart linear() const { return ConstLinearPart(m_matrix,0,0); }
+  EIGEN_DEVICE_FUNC inline ConstLinearPart linear() const { return ConstLinearPart(m_matrix,0,0); }
   /** \returns a writable expression of the linear part of the transformation */
-  inline LinearPart linear() { return LinearPart(m_matrix,0,0); }
+  EIGEN_DEVICE_FUNC inline LinearPart linear() { return LinearPart(m_matrix,0,0); }
 
   /** \returns a read-only expression of the Dim x HDim affine part of the transformation */
-  inline ConstAffinePart affine() const { return take_affine_part::run(m_matrix); }
+  EIGEN_DEVICE_FUNC inline ConstAffinePart affine() const { return take_affine_part::run(m_matrix); }
   /** \returns a writable expression of the Dim x HDim affine part of the transformation */
-  inline AffinePart affine() { return take_affine_part::run(m_matrix); }
+  EIGEN_DEVICE_FUNC inline AffinePart affine() { return take_affine_part::run(m_matrix); }
 
   /** \returns a read-only expression of the translation vector of the transformation */
-  inline ConstTranslationPart translation() const { return ConstTranslationPart(m_matrix,0,Dim); }
+  EIGEN_DEVICE_FUNC inline ConstTranslationPart translation() const { return ConstTranslationPart(m_matrix,0,Dim); }
   /** \returns a writable expression of the translation vector of the transformation */
-  inline TranslationPart translation() { return TranslationPart(m_matrix,0,Dim); }
+  EIGEN_DEVICE_FUNC inline TranslationPart translation() { return TranslationPart(m_matrix,0,Dim); }
 
-  /** \returns an expression of the product between the transform \c *this and a matrix expression \a other
+  /** \returns an expression of the product between the transform \c *this and a matrix expression \a other.
     *
-    * The right hand side \a other might be either:
-    * \li a vector of size Dim,
+    * The right-hand-side \a other can be either:
     * \li an homogeneous vector of size Dim+1,
-    * \li a set of vectors of size Dim x Dynamic,
-    * \li a set of homogeneous vectors of size Dim+1 x Dynamic,
-    * \li a linear transformation matrix of size Dim x Dim,
-    * \li an affine transformation matrix of size Dim x Dim+1,
+    * \li a set of homogeneous vectors of size Dim+1 x N,
     * \li a transformation matrix of size Dim+1 x Dim+1.
+    *
+    * Moreover, if \c *this represents an affine transformation (i.e., Mode!=Projective), then \a other can also be:
+    * \li a point of size Dim (computes: \code this->linear() * other + this->translation()\endcode),
+    * \li a set of N points as a Dim x N matrix (computes: \code (this->linear() * other).colwise() + this->translation()\endcode),
+    *
+    * In all cases, the return type is a matrix or vector of same sizes as the right-hand-side \a other.
+    *
+    * If you want to interpret \a other as a linear or affine transformation, then first convert it to a Transform<> type,
+    * or do your own cooking.
+    *
+    * Finally, if you want to apply Affine transformations to vectors, then explicitly apply the linear part only:
+    * \code
+    * Affine3f A;
+    * Vector3f v1, v2;
+    * v2 = A.linear() * v1;
+    * \endcode
+    *
     */
   // note: this function is defined here because some compilers cannot find the respective declaration
   template<typename OtherDerived>
-  EIGEN_STRONG_INLINE const typename internal::transform_right_product_impl<Transform, OtherDerived>::ResultType
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename internal::transform_right_product_impl<Transform, OtherDerived>::ResultType
   operator * (const EigenBase<OtherDerived> &other) const
   { return internal::transform_right_product_impl<Transform, OtherDerived>::run(*this,other.derived()); }
 
   /** \returns the product expression of a transformation matrix \a a times a transform \a b
     *
-    * The left hand side \a other might be either:
+    * The left hand side \a other can be either:
     * \li a linear transformation matrix of size Dim x Dim,
     * \li an affine transformation matrix of size Dim x Dim+1,
     * \li a general transformation matrix of size Dim+1 x Dim+1.
     */
   template<typename OtherDerived> friend
-  inline const typename internal::transform_left_product_impl<OtherDerived,Mode,Options,_Dim,_Dim+1>::ResultType
+  EIGEN_DEVICE_FUNC inline const typename internal::transform_left_product_impl<OtherDerived,Mode,Options,_Dim,_Dim+1>::ResultType
     operator * (const EigenBase<OtherDerived> &a, const Transform &b)
   { return internal::transform_left_product_impl<OtherDerived,Mode,Options,Dim,HDim>::run(a.derived(),b); }
 
@@ -420,7 +460,7 @@ public:
     * mode is no isometry. In that case, the returned transform is an affinity.
     */
   template<typename DiagonalDerived>
-  inline const TransformTimeDiagonalReturnType
+  EIGEN_DEVICE_FUNC inline const TransformTimeDiagonalReturnType
     operator * (const DiagonalBase<DiagonalDerived> &b) const
   {
     TransformTimeDiagonalReturnType res(*this);
@@ -435,7 +475,7 @@ public:
     * mode is no isometry. In that case, the returned transform is an affinity.
     */
   template<typename DiagonalDerived>
-  friend inline TransformTimeDiagonalReturnType
+  EIGEN_DEVICE_FUNC friend inline TransformTimeDiagonalReturnType
     operator * (const DiagonalBase<DiagonalDerived> &a, const Transform &b)
   {
     TransformTimeDiagonalReturnType res;
@@ -447,15 +487,15 @@ public:
   }
 
   template<typename OtherDerived>
-  inline Transform& operator*=(const EigenBase<OtherDerived>& other) { return *this = *this * other; }
+  EIGEN_DEVICE_FUNC inline Transform& operator*=(const EigenBase<OtherDerived>& other) { return *this = *this * other; }
 
   /** Concatenates two transformations */
-  inline const Transform operator * (const Transform& other) const
+  EIGEN_DEVICE_FUNC inline const Transform operator * (const Transform& other) const
   {
     return internal::transform_transform_product_impl<Transform,Transform>::run(*this,other);
   }
   
-  #ifdef __INTEL_COMPILER
+  #if EIGEN_COMP_ICC
 private:
   // this intermediate structure permits to workaround a bug in ICC 11:
   //   error: template instantiation resulted in unexpected function type of "Eigen::Transform<double, 3, 32, 0>
@@ -482,7 +522,7 @@ public:
   #else
   /** Concatenates two different transformations */
   template<int OtherMode,int OtherOptions>
-  inline typename internal::transform_transform_product_impl<Transform,Transform<Scalar,Dim,OtherMode,OtherOptions> >::ResultType
+  EIGEN_DEVICE_FUNC inline typename internal::transform_transform_product_impl<Transform,Transform<Scalar,Dim,OtherMode,OtherOptions> >::ResultType
     operator * (const Transform<Scalar,Dim,OtherMode,OtherOptions>& other) const
   {
     return internal::transform_transform_product_impl<Transform,Transform<Scalar,Dim,OtherMode,OtherOptions> >::run(*this,other);
@@ -490,79 +530,98 @@ public:
   #endif
 
   /** \sa MatrixBase::setIdentity() */
-  void setIdentity() { m_matrix.setIdentity(); }
+  EIGEN_DEVICE_FUNC void setIdentity() { m_matrix.setIdentity(); }
 
   /**
    * \brief Returns an identity transformation.
    * \todo In the future this function should be returning a Transform expression.
    */
-  static const Transform Identity()
+  EIGEN_DEVICE_FUNC static const Transform Identity()
   {
     return Transform(MatrixType::Identity());
   }
 
   template<typename OtherDerived>
+  EIGEN_DEVICE_FUNC 
   inline Transform& scale(const MatrixBase<OtherDerived> &other);
 
   template<typename OtherDerived>
+  EIGEN_DEVICE_FUNC
   inline Transform& prescale(const MatrixBase<OtherDerived> &other);
 
-  inline Transform& scale(const Scalar& s);
-  inline Transform& prescale(const Scalar& s);
+  EIGEN_DEVICE_FUNC inline Transform& scale(const Scalar& s);
+  EIGEN_DEVICE_FUNC inline Transform& prescale(const Scalar& s);
 
   template<typename OtherDerived>
+  EIGEN_DEVICE_FUNC
   inline Transform& translate(const MatrixBase<OtherDerived> &other);
 
   template<typename OtherDerived>
+  EIGEN_DEVICE_FUNC
   inline Transform& pretranslate(const MatrixBase<OtherDerived> &other);
 
   template<typename RotationType>
+  EIGEN_DEVICE_FUNC
   inline Transform& rotate(const RotationType& rotation);
 
   template<typename RotationType>
+  EIGEN_DEVICE_FUNC
   inline Transform& prerotate(const RotationType& rotation);
 
-  Transform& shear(const Scalar& sx, const Scalar& sy);
-  Transform& preshear(const Scalar& sx, const Scalar& sy);
+  EIGEN_DEVICE_FUNC Transform& shear(const Scalar& sx, const Scalar& sy);
+  EIGEN_DEVICE_FUNC Transform& preshear(const Scalar& sx, const Scalar& sy);
 
-  inline Transform& operator=(const TranslationType& t);
+  EIGEN_DEVICE_FUNC inline Transform& operator=(const TranslationType& t);
+  
+  EIGEN_DEVICE_FUNC
   inline Transform& operator*=(const TranslationType& t) { return translate(t.vector()); }
-  inline Transform operator*(const TranslationType& t) const;
+  
+  EIGEN_DEVICE_FUNC inline Transform operator*(const TranslationType& t) const;
 
+  EIGEN_DEVICE_FUNC 
   inline Transform& operator=(const UniformScaling<Scalar>& t);
+  
+  EIGEN_DEVICE_FUNC
   inline Transform& operator*=(const UniformScaling<Scalar>& s) { return scale(s.factor()); }
-  inline Transform<Scalar,Dim,(int(Mode)==int(Isometry)?int(Affine):int(Mode))> operator*(const UniformScaling<Scalar>& s) const
+  
+  EIGEN_DEVICE_FUNC
+  inline TransformTimeDiagonalReturnType operator*(const UniformScaling<Scalar>& s) const
   {
-    Transform<Scalar,Dim,(int(Mode)==int(Isometry)?int(Affine):int(Mode)),Options> res = *this;
+    TransformTimeDiagonalReturnType res = *this;
     res.scale(s.factor());
     return res;
   }
 
+  EIGEN_DEVICE_FUNC
   inline Transform& operator*=(const DiagonalMatrix<Scalar,Dim>& s) { linearExt() *= s; return *this; }
 
   template<typename Derived>
-  inline Transform& operator=(const RotationBase<Derived,Dim>& r);
+  EIGEN_DEVICE_FUNC inline Transform& operator=(const RotationBase<Derived,Dim>& r);
   template<typename Derived>
-  inline Transform& operator*=(const RotationBase<Derived,Dim>& r) { return rotate(r.toRotationMatrix()); }
+  EIGEN_DEVICE_FUNC inline Transform& operator*=(const RotationBase<Derived,Dim>& r) { return rotate(r.toRotationMatrix()); }
   template<typename Derived>
-  inline Transform operator*(const RotationBase<Derived,Dim>& r) const;
+  EIGEN_DEVICE_FUNC inline Transform operator*(const RotationBase<Derived,Dim>& r) const;
 
-  const LinearMatrixType rotation() const;
+  EIGEN_DEVICE_FUNC const LinearMatrixType rotation() const;
   template<typename RotationMatrixType, typename ScalingMatrixType>
+  EIGEN_DEVICE_FUNC
   void computeRotationScaling(RotationMatrixType *rotation, ScalingMatrixType *scaling) const;
   template<typename ScalingMatrixType, typename RotationMatrixType>
+  EIGEN_DEVICE_FUNC
   void computeScalingRotation(ScalingMatrixType *scaling, RotationMatrixType *rotation) const;
 
   template<typename PositionDerived, typename OrientationType, typename ScaleDerived>
+  EIGEN_DEVICE_FUNC
   Transform& fromPositionOrientationScale(const MatrixBase<PositionDerived> &position,
     const OrientationType& orientation, const MatrixBase<ScaleDerived> &scale);
 
+  EIGEN_DEVICE_FUNC
   inline Transform inverse(TransformTraits traits = (TransformTraits)Mode) const;
 
   /** \returns a const pointer to the column major internal matrix */
-  const Scalar* data() const { return m_matrix.data(); }
+  EIGEN_DEVICE_FUNC const Scalar* data() const { return m_matrix.data(); }
   /** \returns a non-const pointer to the column major internal matrix */
-  Scalar* data() { return m_matrix.data(); }
+  EIGEN_DEVICE_FUNC Scalar* data() { return m_matrix.data(); }
 
   /** \returns \c *this with scalar type casted to \a NewScalarType
     *
@@ -570,12 +629,12 @@ public:
     * then this function smartly returns a const reference to \c *this.
     */
   template<typename NewScalarType>
-  inline typename internal::cast_return_type<Transform,Transform<NewScalarType,Dim,Mode,Options> >::type cast() const
+  EIGEN_DEVICE_FUNC inline typename internal::cast_return_type<Transform,Transform<NewScalarType,Dim,Mode,Options> >::type cast() const
   { return typename internal::cast_return_type<Transform,Transform<NewScalarType,Dim,Mode,Options> >::type(*this); }
 
   /** Copy constructor with scalar type conversion */
   template<typename OtherScalarType>
-  inline explicit Transform(const Transform<OtherScalarType,Dim,Mode,Options>& other)
+  EIGEN_DEVICE_FUNC inline explicit Transform(const Transform<OtherScalarType,Dim,Mode,Options>& other)
   {
     check_template_params();
     m_matrix = other.matrix().template cast<Scalar>();
@@ -585,12 +644,12 @@ public:
     * determined by \a prec.
     *
     * \sa MatrixBase::isApprox() */
-  bool isApprox(const Transform& other, const typename NumTraits<Scalar>::Real& prec = NumTraits<Scalar>::dummy_precision()) const
+  EIGEN_DEVICE_FUNC bool isApprox(const Transform& other, const typename NumTraits<Scalar>::Real& prec = NumTraits<Scalar>::dummy_precision()) const
   { return m_matrix.isApprox(other.m_matrix, prec); }
 
   /** Sets the last row to [0 ... 0 1]
     */
-  void makeAffine()
+  EIGEN_DEVICE_FUNC void makeAffine()
   {
     internal::transform_make_affine<int(Mode)>::run(m_matrix);
   }
@@ -599,26 +658,26 @@ public:
     * \returns the Dim x Dim linear part if the transformation is affine,
     *          and the HDim x Dim part for projective transformations.
     */
-  inline Block<MatrixType,int(Mode)==int(Projective)?HDim:Dim,Dim> linearExt()
+  EIGEN_DEVICE_FUNC inline Block<MatrixType,int(Mode)==int(Projective)?HDim:Dim,Dim> linearExt()
   { return m_matrix.template block<int(Mode)==int(Projective)?HDim:Dim,Dim>(0,0); }
   /** \internal
     * \returns the Dim x Dim linear part if the transformation is affine,
     *          and the HDim x Dim part for projective transformations.
     */
-  inline const Block<MatrixType,int(Mode)==int(Projective)?HDim:Dim,Dim> linearExt() const
+  EIGEN_DEVICE_FUNC inline const Block<MatrixType,int(Mode)==int(Projective)?HDim:Dim,Dim> linearExt() const
   { return m_matrix.template block<int(Mode)==int(Projective)?HDim:Dim,Dim>(0,0); }
 
   /** \internal
     * \returns the translation part if the transformation is affine,
     *          and the last column for projective transformations.
     */
-  inline Block<MatrixType,int(Mode)==int(Projective)?HDim:Dim,1> translationExt()
+  EIGEN_DEVICE_FUNC inline Block<MatrixType,int(Mode)==int(Projective)?HDim:Dim,1> translationExt()
   { return m_matrix.template block<int(Mode)==int(Projective)?HDim:Dim,1>(0,Dim); }
   /** \internal
     * \returns the translation part if the transformation is affine,
     *          and the last column for projective transformations.
     */
-  inline const Block<MatrixType,int(Mode)==int(Projective)?HDim:Dim,1> translationExt() const
+  EIGEN_DEVICE_FUNC inline const Block<MatrixType,int(Mode)==int(Projective)?HDim:Dim,1> translationExt() const
   { return m_matrix.template block<int(Mode)==int(Projective)?HDim:Dim,1>(0,Dim); }
 
 
@@ -628,7 +687,7 @@ public:
   
 protected:
   #ifndef EIGEN_PARSED_BY_DOXYGEN
-    static EIGEN_STRONG_INLINE void check_template_params()
+    EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void check_template_params()
     {
       EIGEN_STATIC_ASSERT((Options & (DontAlign|RowMajor)) == Options, INVALID_MATRIX_TEMPLATE_PARAMETERS)
     }
@@ -696,9 +755,13 @@ template<typename Scalar, int Dim, int Mode,int Options>
 Transform<Scalar,Dim,Mode,Options>& Transform<Scalar,Dim,Mode,Options>::operator=(const QMatrix& other)
 {
   EIGEN_STATIC_ASSERT(Dim==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
-  m_matrix << other.m11(), other.m21(), other.dx(),
-              other.m12(), other.m22(), other.dy(),
-              0, 0, 1;
+  if (Mode == int(AffineCompact))
+    m_matrix << other.m11(), other.m21(), other.dx(),
+                other.m12(), other.m22(), other.dy();
+  else
+    m_matrix << other.m11(), other.m21(), other.dx(),
+                other.m12(), other.m22(), other.dy(),
+                0, 0, 1;
   return *this;
 }
 
@@ -777,7 +840,7 @@ QTransform Transform<Scalar,Dim,Mode,Options>::toQTransform(void) const
   */
 template<typename Scalar, int Dim, int Mode, int Options>
 template<typename OtherDerived>
-Transform<Scalar,Dim,Mode,Options>&
+EIGEN_DEVICE_FUNC Transform<Scalar,Dim,Mode,Options>&
 Transform<Scalar,Dim,Mode,Options>::scale(const MatrixBase<OtherDerived> &other)
 {
   EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,int(Dim))
@@ -791,7 +854,7 @@ Transform<Scalar,Dim,Mode,Options>::scale(const MatrixBase<OtherDerived> &other)
   * \sa prescale(Scalar)
   */
 template<typename Scalar, int Dim, int Mode, int Options>
-inline Transform<Scalar,Dim,Mode,Options>& Transform<Scalar,Dim,Mode,Options>::scale(const Scalar& s)
+EIGEN_DEVICE_FUNC inline Transform<Scalar,Dim,Mode,Options>& Transform<Scalar,Dim,Mode,Options>::scale(const Scalar& s)
 {
   EIGEN_STATIC_ASSERT(Mode!=int(Isometry), THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS)
   linearExt() *= s;
@@ -804,7 +867,7 @@ inline Transform<Scalar,Dim,Mode,Options>& Transform<Scalar,Dim,Mode,Options>::s
   */
 template<typename Scalar, int Dim, int Mode, int Options>
 template<typename OtherDerived>
-Transform<Scalar,Dim,Mode,Options>&
+EIGEN_DEVICE_FUNC Transform<Scalar,Dim,Mode,Options>&
 Transform<Scalar,Dim,Mode,Options>::prescale(const MatrixBase<OtherDerived> &other)
 {
   EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,int(Dim))
@@ -818,7 +881,7 @@ Transform<Scalar,Dim,Mode,Options>::prescale(const MatrixBase<OtherDerived> &oth
   * \sa scale(Scalar)
   */
 template<typename Scalar, int Dim, int Mode, int Options>
-inline Transform<Scalar,Dim,Mode,Options>& Transform<Scalar,Dim,Mode,Options>::prescale(const Scalar& s)
+EIGEN_DEVICE_FUNC inline Transform<Scalar,Dim,Mode,Options>& Transform<Scalar,Dim,Mode,Options>::prescale(const Scalar& s)
 {
   EIGEN_STATIC_ASSERT(Mode!=int(Isometry), THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS)
   m_matrix.template topRows<Dim>() *= s;
@@ -831,7 +894,7 @@ inline Transform<Scalar,Dim,Mode,Options>& Transform<Scalar,Dim,Mode,Options>::p
   */
 template<typename Scalar, int Dim, int Mode, int Options>
 template<typename OtherDerived>
-Transform<Scalar,Dim,Mode,Options>&
+EIGEN_DEVICE_FUNC Transform<Scalar,Dim,Mode,Options>&
 Transform<Scalar,Dim,Mode,Options>::translate(const MatrixBase<OtherDerived> &other)
 {
   EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,int(Dim))
@@ -845,7 +908,7 @@ Transform<Scalar,Dim,Mode,Options>::translate(const MatrixBase<OtherDerived> &ot
   */
 template<typename Scalar, int Dim, int Mode, int Options>
 template<typename OtherDerived>
-Transform<Scalar,Dim,Mode,Options>&
+EIGEN_DEVICE_FUNC Transform<Scalar,Dim,Mode,Options>&
 Transform<Scalar,Dim,Mode,Options>::pretranslate(const MatrixBase<OtherDerived> &other)
 {
   EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,int(Dim))
@@ -875,7 +938,7 @@ Transform<Scalar,Dim,Mode,Options>::pretranslate(const MatrixBase<OtherDerived>
   */
 template<typename Scalar, int Dim, int Mode, int Options>
 template<typename RotationType>
-Transform<Scalar,Dim,Mode,Options>&
+EIGEN_DEVICE_FUNC Transform<Scalar,Dim,Mode,Options>&
 Transform<Scalar,Dim,Mode,Options>::rotate(const RotationType& rotation)
 {
   linearExt() *= internal::toRotationMatrix<Scalar,Dim>(rotation);
@@ -891,7 +954,7 @@ Transform<Scalar,Dim,Mode,Options>::rotate(const RotationType& rotation)
   */
 template<typename Scalar, int Dim, int Mode, int Options>
 template<typename RotationType>
-Transform<Scalar,Dim,Mode,Options>&
+EIGEN_DEVICE_FUNC Transform<Scalar,Dim,Mode,Options>&
 Transform<Scalar,Dim,Mode,Options>::prerotate(const RotationType& rotation)
 {
   m_matrix.template block<Dim,HDim>(0,0) = internal::toRotationMatrix<Scalar,Dim>(rotation)
@@ -905,7 +968,7 @@ Transform<Scalar,Dim,Mode,Options>::prerotate(const RotationType& rotation)
   * \sa preshear()
   */
 template<typename Scalar, int Dim, int Mode, int Options>
-Transform<Scalar,Dim,Mode,Options>&
+EIGEN_DEVICE_FUNC Transform<Scalar,Dim,Mode,Options>&
 Transform<Scalar,Dim,Mode,Options>::shear(const Scalar& sx, const Scalar& sy)
 {
   EIGEN_STATIC_ASSERT(int(Dim)==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
@@ -921,7 +984,7 @@ Transform<Scalar,Dim,Mode,Options>::shear(const Scalar& sx, const Scalar& sy)
   * \sa shear()
   */
 template<typename Scalar, int Dim, int Mode, int Options>
-Transform<Scalar,Dim,Mode,Options>&
+EIGEN_DEVICE_FUNC Transform<Scalar,Dim,Mode,Options>&
 Transform<Scalar,Dim,Mode,Options>::preshear(const Scalar& sx, const Scalar& sy)
 {
   EIGEN_STATIC_ASSERT(int(Dim)==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
@@ -935,7 +998,7 @@ Transform<Scalar,Dim,Mode,Options>::preshear(const Scalar& sx, const Scalar& sy)
 ******************************************************/
 
 template<typename Scalar, int Dim, int Mode, int Options>
-inline Transform<Scalar,Dim,Mode,Options>& Transform<Scalar,Dim,Mode,Options>::operator=(const TranslationType& t)
+EIGEN_DEVICE_FUNC inline Transform<Scalar,Dim,Mode,Options>& Transform<Scalar,Dim,Mode,Options>::operator=(const TranslationType& t)
 {
   linear().setIdentity();
   translation() = t.vector();
@@ -944,7 +1007,7 @@ inline Transform<Scalar,Dim,Mode,Options>& Transform<Scalar,Dim,Mode,Options>::o
 }
 
 template<typename Scalar, int Dim, int Mode, int Options>
-inline Transform<Scalar,Dim,Mode,Options> Transform<Scalar,Dim,Mode,Options>::operator*(const TranslationType& t) const
+EIGEN_DEVICE_FUNC inline Transform<Scalar,Dim,Mode,Options> Transform<Scalar,Dim,Mode,Options>::operator*(const TranslationType& t) const
 {
   Transform res = *this;
   res.translate(t.vector());
@@ -952,7 +1015,7 @@ inline Transform<Scalar,Dim,Mode,Options> Transform<Scalar,Dim,Mode,Options>::op
 }
 
 template<typename Scalar, int Dim, int Mode, int Options>
-inline Transform<Scalar,Dim,Mode,Options>& Transform<Scalar,Dim,Mode,Options>::operator=(const UniformScaling<Scalar>& s)
+EIGEN_DEVICE_FUNC inline Transform<Scalar,Dim,Mode,Options>& Transform<Scalar,Dim,Mode,Options>::operator=(const UniformScaling<Scalar>& s)
 {
   m_matrix.setZero();
   linear().diagonal().fill(s.factor());
@@ -962,7 +1025,7 @@ inline Transform<Scalar,Dim,Mode,Options>& Transform<Scalar,Dim,Mode,Options>::o
 
 template<typename Scalar, int Dim, int Mode, int Options>
 template<typename Derived>
-inline Transform<Scalar,Dim,Mode,Options>& Transform<Scalar,Dim,Mode,Options>::operator=(const RotationBase<Derived,Dim>& r)
+EIGEN_DEVICE_FUNC inline Transform<Scalar,Dim,Mode,Options>& Transform<Scalar,Dim,Mode,Options>::operator=(const RotationBase<Derived,Dim>& r)
 {
   linear() = internal::toRotationMatrix<Scalar,Dim>(r);
   translation().setZero();
@@ -972,7 +1035,7 @@ inline Transform<Scalar,Dim,Mode,Options>& Transform<Scalar,Dim,Mode,Options>::o
 
 template<typename Scalar, int Dim, int Mode, int Options>
 template<typename Derived>
-inline Transform<Scalar,Dim,Mode,Options> Transform<Scalar,Dim,Mode,Options>::operator*(const RotationBase<Derived,Dim>& r) const
+EIGEN_DEVICE_FUNC inline Transform<Scalar,Dim,Mode,Options> Transform<Scalar,Dim,Mode,Options>::operator*(const RotationBase<Derived,Dim>& r) const
 {
   Transform res = *this;
   res.rotate(r.derived());
@@ -991,7 +1054,7 @@ inline Transform<Scalar,Dim,Mode,Options> Transform<Scalar,Dim,Mode,Options>::op
   * \sa computeRotationScaling(), computeScalingRotation(), class SVD
   */
 template<typename Scalar, int Dim, int Mode, int Options>
-const typename Transform<Scalar,Dim,Mode,Options>::LinearMatrixType
+EIGEN_DEVICE_FUNC const typename Transform<Scalar,Dim,Mode,Options>::LinearMatrixType
 Transform<Scalar,Dim,Mode,Options>::rotation() const
 {
   LinearMatrixType result;
@@ -1013,7 +1076,7 @@ Transform<Scalar,Dim,Mode,Options>::rotation() const
   */
 template<typename Scalar, int Dim, int Mode, int Options>
 template<typename RotationMatrixType, typename ScalingMatrixType>
-void Transform<Scalar,Dim,Mode,Options>::computeRotationScaling(RotationMatrixType *rotation, ScalingMatrixType *scaling) const
+EIGEN_DEVICE_FUNC void Transform<Scalar,Dim,Mode,Options>::computeRotationScaling(RotationMatrixType *rotation, ScalingMatrixType *scaling) const
 {
   JacobiSVD<LinearMatrixType> svd(linear(), ComputeFullU | ComputeFullV);
 
@@ -1029,7 +1092,7 @@ void Transform<Scalar,Dim,Mode,Options>::computeRotationScaling(RotationMatrixTy
   }
 }
 
-/** decomposes the linear part of the transformation as a product rotation x scaling, the scaling being
+/** decomposes the linear part of the transformation as a product scaling x rotation, the scaling being
   * not necessarily positive.
   *
   * If either pointer is zero, the corresponding computation is skipped.
@@ -1042,7 +1105,7 @@ void Transform<Scalar,Dim,Mode,Options>::computeRotationScaling(RotationMatrixTy
   */
 template<typename Scalar, int Dim, int Mode, int Options>
 template<typename ScalingMatrixType, typename RotationMatrixType>
-void Transform<Scalar,Dim,Mode,Options>::computeScalingRotation(ScalingMatrixType *scaling, RotationMatrixType *rotation) const
+EIGEN_DEVICE_FUNC void Transform<Scalar,Dim,Mode,Options>::computeScalingRotation(ScalingMatrixType *scaling, RotationMatrixType *rotation) const
 {
   JacobiSVD<LinearMatrixType> svd(linear(), ComputeFullU | ComputeFullV);
 
@@ -1063,7 +1126,7 @@ void Transform<Scalar,Dim,Mode,Options>::computeScalingRotation(ScalingMatrixTyp
   */
 template<typename Scalar, int Dim, int Mode, int Options>
 template<typename PositionDerived, typename OrientationType, typename ScaleDerived>
-Transform<Scalar,Dim,Mode,Options>&
+EIGEN_DEVICE_FUNC Transform<Scalar,Dim,Mode,Options>&
 Transform<Scalar,Dim,Mode,Options>::fromPositionOrientationScale(const MatrixBase<PositionDerived> &position,
   const OrientationType& orientation, const MatrixBase<ScaleDerived> &scale)
 {
@@ -1080,7 +1143,7 @@ template<int Mode>
 struct transform_make_affine
 {
   template<typename MatrixType>
-  static void run(MatrixType &mat)
+  EIGEN_DEVICE_FUNC static void run(MatrixType &mat)
   {
     static const int Dim = MatrixType::ColsAtCompileTime-1;
     mat.template block<1,Dim>(Dim,0).setZero();
@@ -1091,21 +1154,21 @@ struct transform_make_affine
 template<>
 struct transform_make_affine<AffineCompact>
 {
-  template<typename MatrixType> static void run(MatrixType &) { }
+  template<typename MatrixType> EIGEN_DEVICE_FUNC static void run(MatrixType &) { }
 };
     
 // selector needed to avoid taking the inverse of a 3x4 matrix
 template<typename TransformType, int Mode=TransformType::Mode>
 struct projective_transform_inverse
 {
-  static inline void run(const TransformType&, TransformType&)
+  EIGEN_DEVICE_FUNC static inline void run(const TransformType&, TransformType&)
   {}
 };
 
 template<typename TransformType>
 struct projective_transform_inverse<TransformType, Projective>
 {
-  static inline void run(const TransformType& m, TransformType& res)
+  EIGEN_DEVICE_FUNC static inline void run(const TransformType& m, TransformType& res)
   {
     res.matrix() = m.matrix().inverse();
   }
@@ -1135,7 +1198,7 @@ struct projective_transform_inverse<TransformType, Projective>
   * \sa MatrixBase::inverse()
   */
 template<typename Scalar, int Dim, int Mode, int Options>
-Transform<Scalar,Dim,Mode,Options>
+EIGEN_DEVICE_FUNC Transform<Scalar,Dim,Mode,Options>
 Transform<Scalar,Dim,Mode,Options>::inverse(TransformTraits hint) const
 {
   Transform res;
@@ -1244,8 +1307,8 @@ struct transform_product_result
   };
 };
 
-template< typename TransformType, typename MatrixType >
-struct transform_right_product_impl< TransformType, MatrixType, 0 >
+template< typename TransformType, typename MatrixType, int RhsCols>
+struct transform_right_product_impl< TransformType, MatrixType, 0, RhsCols>
 {
   typedef typename MatrixType::PlainObject ResultType;
 
@@ -1255,8 +1318,8 @@ struct transform_right_product_impl< TransformType, MatrixType, 0 >
   }
 };
 
-template< typename TransformType, typename MatrixType >
-struct transform_right_product_impl< TransformType, MatrixType, 1 >
+template< typename TransformType, typename MatrixType, int RhsCols>
+struct transform_right_product_impl< TransformType, MatrixType, 1, RhsCols>
 {
   enum { 
     Dim = TransformType::Dim, 
@@ -1281,8 +1344,8 @@ struct transform_right_product_impl< TransformType, MatrixType, 1 >
   }
 };
 
-template< typename TransformType, typename MatrixType >
-struct transform_right_product_impl< TransformType, MatrixType, 2 >
+template< typename TransformType, typename MatrixType, int RhsCols>
+struct transform_right_product_impl< TransformType, MatrixType, 2, RhsCols>
 {
   enum { 
     Dim = TransformType::Dim, 
@@ -1305,6 +1368,30 @@ struct transform_right_product_impl< TransformType, MatrixType, 2 >
   }
 };
 
+template< typename TransformType, typename MatrixType >
+struct transform_right_product_impl< TransformType, MatrixType, 2, 1> // rhs is a vector of size Dim
+{
+  typedef typename TransformType::MatrixType TransformMatrix;
+  enum {
+    Dim = TransformType::Dim,
+    HDim = TransformType::HDim,
+    OtherRows = MatrixType::RowsAtCompileTime,
+    WorkingRows = EIGEN_PLAIN_ENUM_MIN(TransformMatrix::RowsAtCompileTime,HDim)
+  };
+
+  typedef typename MatrixType::PlainObject ResultType;
+
+  static EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other)
+  {
+    EIGEN_STATIC_ASSERT(OtherRows==Dim, YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES);
+
+    Matrix<typename ResultType::Scalar, Dim+1, 1> rhs;
+    rhs.template head<Dim>() = other; rhs[Dim] = typename ResultType::Scalar(1);
+    Matrix<typename ResultType::Scalar, WorkingRows, 1> res(T.matrix() * rhs);
+    return res.template head<Dim>();
+  }
+};
+
 /**********************************************************
 ***   Specializations of operator* with lhs EigenBase   ***
 **********************************************************/
diff --git a/Eigen/src/Geometry/Translation.h b/Eigen/src/Geometry/Translation.h
index c8f53eace..51d9a82eb 100644
--- a/Eigen/src/Geometry/Translation.h
+++ b/Eigen/src/Geometry/Translation.h
@@ -18,8 +18,8 @@ namespace Eigen {
   *
   * \brief Represents a translation transformation
   *
-  * \param _Scalar the scalar type, i.e., the type of the coefficients.
-  * \param _Dim the  dimension of the space, can be a compile time value or Dynamic
+  * \tparam _Scalar the scalar type, i.e., the type of the coefficients.
+  * \tparam _Dim the  dimension of the space, can be a compile time value or Dynamic
   *
   * \note This class is not aimed to be used to store a translation transformation,
   * but rather to make easier the constructions and updates of Transform objects.
@@ -51,16 +51,16 @@ protected:
 public:
 
   /** Default constructor without initialization. */
-  Translation() {}
+  EIGEN_DEVICE_FUNC Translation() {}
   /**  */
-  inline Translation(const Scalar& sx, const Scalar& sy)
+  EIGEN_DEVICE_FUNC inline Translation(const Scalar& sx, const Scalar& sy)
   {
     eigen_assert(Dim==2);
     m_coeffs.x() = sx;
     m_coeffs.y() = sy;
   }
   /**  */
-  inline Translation(const Scalar& sx, const Scalar& sy, const Scalar& sz)
+  EIGEN_DEVICE_FUNC inline Translation(const Scalar& sx, const Scalar& sy, const Scalar& sz)
   {
     eigen_assert(Dim==3);
     m_coeffs.x() = sx;
@@ -68,48 +68,48 @@ public:
     m_coeffs.z() = sz;
   }
   /** Constructs and initialize the translation transformation from a vector of translation coefficients */
-  explicit inline Translation(const VectorType& vector) : m_coeffs(vector) {}
+  EIGEN_DEVICE_FUNC explicit inline Translation(const VectorType& vector) : m_coeffs(vector) {}
 
   /** \brief Retruns the x-translation by value. **/
-  inline Scalar x() const { return m_coeffs.x(); }
+  EIGEN_DEVICE_FUNC inline Scalar x() const { return m_coeffs.x(); }
   /** \brief Retruns the y-translation by value. **/
-  inline Scalar y() const { return m_coeffs.y(); }
+  EIGEN_DEVICE_FUNC inline Scalar y() const { return m_coeffs.y(); }
   /** \brief Retruns the z-translation by value. **/
-  inline Scalar z() const { return m_coeffs.z(); }
+  EIGEN_DEVICE_FUNC inline Scalar z() const { return m_coeffs.z(); }
 
   /** \brief Retruns the x-translation as a reference. **/
-  inline Scalar& x() { return m_coeffs.x(); }
+  EIGEN_DEVICE_FUNC inline Scalar& x() { return m_coeffs.x(); }
   /** \brief Retruns the y-translation as a reference. **/
-  inline Scalar& y() { return m_coeffs.y(); }
+  EIGEN_DEVICE_FUNC inline Scalar& y() { return m_coeffs.y(); }
   /** \brief Retruns the z-translation as a reference. **/
-  inline Scalar& z() { return m_coeffs.z(); }
+  EIGEN_DEVICE_FUNC inline Scalar& z() { return m_coeffs.z(); }
 
-  const VectorType& vector() const { return m_coeffs; }
-  VectorType& vector() { return m_coeffs; }
+  EIGEN_DEVICE_FUNC const VectorType& vector() const { return m_coeffs; }
+  EIGEN_DEVICE_FUNC VectorType& vector() { return m_coeffs; }
 
-  const VectorType& translation() const { return m_coeffs; }
-  VectorType& translation() { return m_coeffs; }
+  EIGEN_DEVICE_FUNC const VectorType& translation() const { return m_coeffs; }
+  EIGEN_DEVICE_FUNC VectorType& translation() { return m_coeffs; }
 
   /** Concatenates two translation */
-  inline Translation operator* (const Translation& other) const
+  EIGEN_DEVICE_FUNC inline Translation operator* (const Translation& other) const
   { return Translation(m_coeffs + other.m_coeffs); }
 
   /** Concatenates a translation and a uniform scaling */
-  inline AffineTransformType operator* (const UniformScaling<Scalar>& other) const;
+  EIGEN_DEVICE_FUNC inline AffineTransformType operator* (const UniformScaling<Scalar>& other) const;
 
   /** Concatenates a translation and a linear transformation */
   template<typename OtherDerived>
-  inline AffineTransformType operator* (const EigenBase<OtherDerived>& linear) const;
+  EIGEN_DEVICE_FUNC inline AffineTransformType operator* (const EigenBase<OtherDerived>& linear) const;
 
   /** Concatenates a translation and a rotation */
   template<typename Derived>
-  inline IsometryTransformType operator*(const RotationBase<Derived,Dim>& r) const
+  EIGEN_DEVICE_FUNC inline IsometryTransformType operator*(const RotationBase<Derived,Dim>& r) const
   { return *this * IsometryTransformType(r); }
 
   /** \returns the concatenation of a linear transformation \a l with the translation \a t */
   // its a nightmare to define a templated friend function outside its declaration
   template<typename OtherDerived> friend
-  inline AffineTransformType operator*(const EigenBase<OtherDerived>& linear, const Translation& t)
+  EIGEN_DEVICE_FUNC inline AffineTransformType operator*(const EigenBase<OtherDerived>& linear, const Translation& t)
   {
     AffineTransformType res;
     res.matrix().setZero();
@@ -122,7 +122,7 @@ public:
 
   /** Concatenates a translation and a transformation */
   template<int Mode, int Options>
-  inline Transform<Scalar,Dim,Mode> operator* (const Transform<Scalar,Dim,Mode,Options>& t) const
+  EIGEN_DEVICE_FUNC inline Transform<Scalar,Dim,Mode> operator* (const Transform<Scalar,Dim,Mode,Options>& t) const
   {
     Transform<Scalar,Dim,Mode> res = t;
     res.pretranslate(m_coeffs);
@@ -152,19 +152,19 @@ public:
     * then this function smartly returns a const reference to \c *this.
     */
   template<typename NewScalarType>
-  inline typename internal::cast_return_type<Translation,Translation<NewScalarType,Dim> >::type cast() const
+  EIGEN_DEVICE_FUNC inline typename internal::cast_return_type<Translation,Translation<NewScalarType,Dim> >::type cast() const
   { return typename internal::cast_return_type<Translation,Translation<NewScalarType,Dim> >::type(*this); }
 
   /** Copy constructor with scalar type conversion */
   template<typename OtherScalarType>
-  inline explicit Translation(const Translation<OtherScalarType,Dim>& other)
+  EIGEN_DEVICE_FUNC inline explicit Translation(const Translation<OtherScalarType,Dim>& other)
   { m_coeffs = other.vector().template cast<Scalar>(); }
 
   /** \returns \c true if \c *this is approximately equal to \a other, within the precision
     * determined by \a prec.
     *
     * \sa MatrixBase::isApprox() */
-  bool isApprox(const Translation& other, typename NumTraits<Scalar>::Real prec = NumTraits<Scalar>::dummy_precision()) const
+  EIGEN_DEVICE_FUNC bool isApprox(const Translation& other, const typename NumTraits<Scalar>::Real& prec = NumTraits<Scalar>::dummy_precision()) const
   { return m_coeffs.isApprox(other.m_coeffs, prec); }
 
 };
@@ -178,7 +178,7 @@ typedef Translation<double,3> Translation3d;
 //@}
 
 template<typename Scalar, int Dim>
-inline typename Translation<Scalar,Dim>::AffineTransformType
+EIGEN_DEVICE_FUNC inline typename Translation<Scalar,Dim>::AffineTransformType
 Translation<Scalar,Dim>::operator* (const UniformScaling<Scalar>& other) const
 {
   AffineTransformType res;
@@ -191,7 +191,7 @@ Translation<Scalar,Dim>::operator* (const UniformScaling<Scalar>& other) const
 
 template<typename Scalar, int Dim>
 template<typename OtherDerived>
-inline typename Translation<Scalar,Dim>::AffineTransformType
+EIGEN_DEVICE_FUNC inline typename Translation<Scalar,Dim>::AffineTransformType
 Translation<Scalar,Dim>::operator* (const EigenBase<OtherDerived>& linear) const
 {
   AffineTransformType res;
diff --git a/Eigen/src/Geometry/Umeyama.h b/Eigen/src/Geometry/Umeyama.h
index 5e20662f8..7e933fca1 100644
--- a/Eigen/src/Geometry/Umeyama.h
+++ b/Eigen/src/Geometry/Umeyama.h
@@ -97,7 +97,6 @@ umeyama(const MatrixBase<Derived>& src, const MatrixBase<OtherDerived>& dst, boo
   typedef typename internal::umeyama_transform_matrix_type<Derived, OtherDerived>::type TransformationMatrixType;
   typedef typename internal::traits<TransformationMatrixType>::Scalar Scalar;
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  typedef typename Derived::Index Index;
 
   EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::IsComplex, NUMERIC_TYPE_MUST_BE_REAL)
   EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename internal::traits<OtherDerived>::Scalar>::value),
@@ -136,22 +135,12 @@ umeyama(const MatrixBase<Derived>& src, const MatrixBase<OtherDerived>& dst, boo
 
   // Eq. (39)
   VectorType S = VectorType::Ones(m);
-  if (sigma.determinant()<Scalar(0)) S(m-1) = Scalar(-1);
+
+  if  ( svd.matrixU().determinant() * svd.matrixV().determinant() < 0 )
+    S(m-1) = -1;
 
   // Eq. (40) and (43)
-  const VectorType& d = svd.singularValues();
-  Index rank = 0; for (Index i=0; i<m; ++i) if (!internal::isMuchSmallerThan(d.coeff(i),d.coeff(0))) ++rank;
-  if (rank == m-1) {
-    if ( svd.matrixU().determinant() * svd.matrixV().determinant() > Scalar(0) ) {
-      Rt.block(0,0,m,m).noalias() = svd.matrixU()*svd.matrixV().transpose();
-    } else {
-      const Scalar s = S(m-1); S(m-1) = Scalar(-1);
-      Rt.block(0,0,m,m).noalias() = svd.matrixU() * S.asDiagonal() * svd.matrixV().transpose();
-      S(m-1) = s;
-    }
-  } else {
-    Rt.block(0,0,m,m).noalias() = svd.matrixU() * S.asDiagonal() * svd.matrixV().transpose();
-  }
+  Rt.block(0,0,m,m).noalias() = svd.matrixU() * S.asDiagonal() * svd.matrixV().transpose();
 
   if (with_scaling)
   {
diff --git a/Eigen/src/Geometry/arch/CMakeLists.txt b/Eigen/src/Geometry/arch/CMakeLists.txt
deleted file mode 100644
index 1267a79c7..000000000
--- a/Eigen/src/Geometry/arch/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_Geometry_arch_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_Geometry_arch_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Geometry/arch COMPONENT Devel
-  )
diff --git a/Eigen/src/Geometry/arch/Geometry_SSE.h b/Eigen/src/Geometry/arch/Geometry_SSE.h
index 3d8284f2d..1a86ff837 100644
--- a/Eigen/src/Geometry/arch/Geometry_SSE.h
+++ b/Eigen/src/Geometry/arch/Geometry_SSE.h
@@ -16,35 +16,47 @@ namespace Eigen {
 namespace internal {
 
 template<class Derived, class OtherDerived>
-struct quat_product<Architecture::SSE, Derived, OtherDerived, float, Aligned>
+struct quat_product<Architecture::SSE, Derived, OtherDerived, float, Aligned16>
 {
   static inline Quaternion<float> run(const QuaternionBase<Derived>& _a, const QuaternionBase<OtherDerived>& _b)
   {
-    const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0,0,0,0x80000000));
     Quaternion<float> res;
-    __m128 a = _a.coeffs().template packet<Aligned>(0);
-    __m128 b = _b.coeffs().template packet<Aligned>(0);
-    __m128 flip1 = _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a,1,2,0,2),
-                                         vec4f_swizzle1(b,2,0,1,2)),mask);
-    __m128 flip2 = _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a,3,3,3,1),
-                                         vec4f_swizzle1(b,0,1,2,1)),mask);
+    const __m128 mask = _mm_setr_ps(0.f,0.f,0.f,-0.f);
+    __m128 a = _a.coeffs().template packet<Aligned16>(0);
+    __m128 b = _b.coeffs().template packet<Aligned16>(0);
+    __m128 s1 = _mm_mul_ps(vec4f_swizzle1(a,1,2,0,2),vec4f_swizzle1(b,2,0,1,2));
+    __m128 s2 = _mm_mul_ps(vec4f_swizzle1(a,3,3,3,1),vec4f_swizzle1(b,0,1,2,1));
     pstore(&res.x(),
               _mm_add_ps(_mm_sub_ps(_mm_mul_ps(a,vec4f_swizzle1(b,3,3,3,3)),
                                     _mm_mul_ps(vec4f_swizzle1(a,2,0,1,0),
                                                vec4f_swizzle1(b,1,2,0,0))),
-                         _mm_add_ps(flip1,flip2)));
+                         _mm_xor_ps(mask,_mm_add_ps(s1,s2))));
+    
     return res;
   }
 };
 
+template<class Derived, int Alignment>
+struct quat_conj<Architecture::SSE, Derived, float, Alignment>
+{
+  static inline Quaternion<float> run(const QuaternionBase<Derived>& q)
+  {
+    Quaternion<float> res;
+    const __m128 mask = _mm_setr_ps(-0.f,-0.f,-0.f,0.f);
+    pstore(&res.x(), _mm_xor_ps(mask, q.coeffs().template packet<Alignment>(0)));
+    return res;
+  }
+};
+
+
 template<typename VectorLhs,typename VectorRhs>
 struct cross3_impl<Architecture::SSE,VectorLhs,VectorRhs,float,true>
 {
   static inline typename plain_matrix_type<VectorLhs>::type
   run(const VectorLhs& lhs, const VectorRhs& rhs)
   {
-    __m128 a = lhs.template packet<VectorLhs::Flags&AlignedBit ? Aligned : Unaligned>(0);
-    __m128 b = rhs.template packet<VectorRhs::Flags&AlignedBit ? Aligned : Unaligned>(0);
+    __m128 a = lhs.template packet<traits<VectorLhs>::Alignment>(0);
+    __m128 b = rhs.template packet<traits<VectorRhs>::Alignment>(0);
     __m128 mul1=_mm_mul_ps(vec4f_swizzle1(a,1,2,0,3),vec4f_swizzle1(b,2,0,1,3));
     __m128 mul2=_mm_mul_ps(vec4f_swizzle1(a,2,0,1,3),vec4f_swizzle1(b,1,2,0,3));
     typename plain_matrix_type<VectorLhs>::type res;
@@ -56,8 +68,8 @@ struct cross3_impl<Architecture::SSE,VectorLhs,VectorRhs,float,true>
 
 
 
-template<class Derived, class OtherDerived>
-struct quat_product<Architecture::SSE, Derived, OtherDerived, double, Aligned>
+template<class Derived, class OtherDerived, int Alignment>
+struct quat_product<Architecture::SSE, Derived, OtherDerived, double, Alignment>
 {
   static inline Quaternion<double> run(const QuaternionBase<Derived>& _a, const QuaternionBase<OtherDerived>& _b)
   {
@@ -66,8 +78,8 @@ struct quat_product<Architecture::SSE, Derived, OtherDerived, double, Aligned>
   Quaternion<double> res;
 
   const double* a = _a.coeffs().data();
-  Packet2d b_xy = _b.coeffs().template packet<Aligned>(0);
-  Packet2d b_zw = _b.coeffs().template packet<Aligned>(2);
+  Packet2d b_xy = _b.coeffs().template packet<Alignment>(0);
+  Packet2d b_zw = _b.coeffs().template packet<Alignment>(2);
   Packet2d a_xx = pset1<Packet2d>(a[0]);
   Packet2d a_yy = pset1<Packet2d>(a[1]);
   Packet2d a_zz = pset1<Packet2d>(a[2]);
@@ -108,6 +120,20 @@ struct quat_product<Architecture::SSE, Derived, OtherDerived, double, Aligned>
 }
 };
 
+template<class Derived, int Alignment>
+struct quat_conj<Architecture::SSE, Derived, double, Alignment>
+{
+  static inline Quaternion<double> run(const QuaternionBase<Derived>& q)
+  {
+    Quaternion<double> res;
+    const __m128d mask0 = _mm_setr_pd(-0.,-0.);
+    const __m128d mask2 = _mm_setr_pd(-0.,0.);
+    pstore(&res.x(), _mm_xor_pd(mask0, q.coeffs().template packet<Alignment>(0)));
+    pstore(&res.z(), _mm_xor_pd(mask2, q.coeffs().template packet<Alignment>(2)));
+    return res;
+  }
+};
+
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/Eigen/src/Householder/BlockHouseholder.h b/Eigen/src/Householder/BlockHouseholder.h
index 60dbea5f5..01a7ed188 100644
--- a/Eigen/src/Householder/BlockHouseholder.h
+++ b/Eigen/src/Householder/BlockHouseholder.h
@@ -16,48 +16,83 @@
 namespace Eigen { 
 
 namespace internal {
+  
+/** \internal */
+// template<typename TriangularFactorType,typename VectorsType,typename CoeffsType>
+// void make_block_householder_triangular_factor(TriangularFactorType& triFactor, const VectorsType& vectors, const CoeffsType& hCoeffs)
+// {
+//   typedef typename VectorsType::Scalar Scalar;
+//   const Index nbVecs = vectors.cols();
+//   eigen_assert(triFactor.rows() == nbVecs && triFactor.cols() == nbVecs && vectors.rows()>=nbVecs);
+// 
+//   for(Index i = 0; i < nbVecs; i++)
+//   {
+//     Index rs = vectors.rows() - i;
+//     // Warning, note that hCoeffs may alias with vectors.
+//     // It is then necessary to copy it before modifying vectors(i,i). 
+//     typename CoeffsType::Scalar h = hCoeffs(i);
+//     // This hack permits to pass trough nested Block<> and Transpose<> expressions.
+//     Scalar *Vii_ptr = const_cast<Scalar*>(vectors.data() + vectors.outerStride()*i + vectors.innerStride()*i);
+//     Scalar Vii = *Vii_ptr;
+//     *Vii_ptr = Scalar(1);
+//     triFactor.col(i).head(i).noalias() = -h * vectors.block(i, 0, rs, i).adjoint()
+//                                        * vectors.col(i).tail(rs);
+//     *Vii_ptr = Vii;
+//     // FIXME add .noalias() once the triangular product can work inplace
+//     triFactor.col(i).head(i) = triFactor.block(0,0,i,i).template triangularView<Upper>()
+//                              * triFactor.col(i).head(i);
+//     triFactor(i,i) = hCoeffs(i);
+//   }
+// }
 
 /** \internal */
+// This variant avoid modifications in vectors
 template<typename TriangularFactorType,typename VectorsType,typename CoeffsType>
 void make_block_householder_triangular_factor(TriangularFactorType& triFactor, const VectorsType& vectors, const CoeffsType& hCoeffs)
 {
-  typedef typename TriangularFactorType::Index Index;
-  typedef typename VectorsType::Scalar Scalar;
   const Index nbVecs = vectors.cols();
   eigen_assert(triFactor.rows() == nbVecs && triFactor.cols() == nbVecs && vectors.rows()>=nbVecs);
 
-  for(Index i = 0; i < nbVecs; i++)
+  for(Index i = nbVecs-1; i >=0 ; --i)
   {
-    Index rs = vectors.rows() - i;
-    Scalar Vii = vectors(i,i);
-    vectors.const_cast_derived().coeffRef(i,i) = Scalar(1);
-    triFactor.col(i).head(i).noalias() = -hCoeffs(i) * vectors.block(i, 0, rs, i).adjoint()
-                                       * vectors.col(i).tail(rs);
-    vectors.const_cast_derived().coeffRef(i, i) = Vii;
-    // FIXME add .noalias() once the triangular product can work inplace
-    triFactor.col(i).head(i) = triFactor.block(0,0,i,i).template triangularView<Upper>()
-                             * triFactor.col(i).head(i);
+    Index rs = vectors.rows() - i - 1;
+    Index rt = nbVecs-i-1;
+
+    if(rt>0)
+    {
+      triFactor.row(i).tail(rt).noalias() = -hCoeffs(i) * vectors.col(i).tail(rs).adjoint()
+                                                        * vectors.bottomRightCorner(rs, rt).template triangularView<UnitLower>();
+            
+      // FIXME add .noalias() once the triangular product can work inplace
+      triFactor.row(i).tail(rt) = triFactor.row(i).tail(rt) * triFactor.bottomRightCorner(rt,rt).template triangularView<Upper>();
+      
+    }
     triFactor(i,i) = hCoeffs(i);
   }
 }
 
-/** \internal */
+/** \internal
+  * if forward then perform   mat = H0 * H1 * H2 * mat
+  * otherwise perform         mat = H2 * H1 * H0 * mat
+  */
 template<typename MatrixType,typename VectorsType,typename CoeffsType>
-void apply_block_householder_on_the_left(MatrixType& mat, const VectorsType& vectors, const CoeffsType& hCoeffs)
+void apply_block_householder_on_the_left(MatrixType& mat, const VectorsType& vectors, const CoeffsType& hCoeffs, bool forward)
 {
-  typedef typename MatrixType::Index Index;
   enum { TFactorSize = MatrixType::ColsAtCompileTime };
   Index nbVecs = vectors.cols();
-  Matrix<typename MatrixType::Scalar, TFactorSize, TFactorSize, ColMajor> T(nbVecs,nbVecs);
-  make_block_householder_triangular_factor(T, vectors, hCoeffs);
-
-  const TriangularView<const VectorsType, UnitLower>& V(vectors);
+  Matrix<typename MatrixType::Scalar, TFactorSize, TFactorSize, RowMajor> T(nbVecs,nbVecs);
+  
+  if(forward) make_block_householder_triangular_factor(T, vectors, hCoeffs);
+  else        make_block_householder_triangular_factor(T, vectors, hCoeffs.conjugate());  
+  const TriangularView<const VectorsType, UnitLower> V(vectors);
 
   // A -= V T V^* A
-  Matrix<typename MatrixType::Scalar,VectorsType::ColsAtCompileTime,MatrixType::ColsAtCompileTime,0,
+  Matrix<typename MatrixType::Scalar,VectorsType::ColsAtCompileTime,MatrixType::ColsAtCompileTime,
+         (VectorsType::MaxColsAtCompileTime==1 && MatrixType::MaxColsAtCompileTime!=1)?RowMajor:ColMajor,
          VectorsType::MaxColsAtCompileTime,MatrixType::MaxColsAtCompileTime> tmp = V.adjoint() * mat;
   // FIXME add .noalias() once the triangular product can work inplace
-  tmp = T.template triangularView<Upper>().adjoint() * tmp;
+  if(forward) tmp = T.template triangularView<Upper>()           * tmp;
+  else        tmp = T.template triangularView<Upper>().adjoint() * tmp;
   mat.noalias() -= V * tmp;
 }
 
diff --git a/Eigen/src/Householder/CMakeLists.txt b/Eigen/src/Householder/CMakeLists.txt
deleted file mode 100644
index ce4937db0..000000000
--- a/Eigen/src/Householder/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_Householder_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_Householder_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Householder COMPONENT Devel
-  )
diff --git a/Eigen/src/Householder/Householder.h b/Eigen/src/Householder/Householder.h
index 32112af9b..80de2c305 100644
--- a/Eigen/src/Householder/Householder.h
+++ b/Eigen/src/Householder/Householder.h
@@ -75,8 +75,9 @@ void MatrixBase<Derived>::makeHouseholder(
   
   RealScalar tailSqNorm = size()==1 ? RealScalar(0) : tail.squaredNorm();
   Scalar c0 = coeff(0);
+  const RealScalar tol = (std::numeric_limits<RealScalar>::min)();
 
-  if(tailSqNorm == RealScalar(0) && numext::imag(c0)==RealScalar(0))
+  if(tailSqNorm <= tol && numext::abs2(numext::imag(c0))<=tol)
   {
     tau = RealScalar(0);
     beta = numext::real(c0);
@@ -118,7 +119,7 @@ void MatrixBase<Derived>::applyHouseholderOnTheLeft(
   {
     *this *= Scalar(1)-tau;
   }
-  else
+  else if(tau!=Scalar(0))
   {
     Map<typename internal::plain_row_type<PlainObject>::type> tmp(workspace,cols());
     Block<Derived, EssentialPart::SizeAtCompileTime, Derived::ColsAtCompileTime> bottom(derived(), 1, 0, rows()-1, cols());
@@ -155,7 +156,7 @@ void MatrixBase<Derived>::applyHouseholderOnTheRight(
   {
     *this *= Scalar(1)-tau;
   }
-  else
+  else if(tau!=Scalar(0))
   {
     Map<typename internal::plain_col_type<PlainObject>::type> tmp(workspace,rows());
     Block<Derived, Derived::RowsAtCompileTime, EssentialPart::SizeAtCompileTime> right(derived(), 0, 1, rows(), cols()-1);
diff --git a/Eigen/src/Householder/HouseholderSequence.h b/Eigen/src/Householder/HouseholderSequence.h
index d800ca1fa..3ce0a693d 100644
--- a/Eigen/src/Householder/HouseholderSequence.h
+++ b/Eigen/src/Householder/HouseholderSequence.h
@@ -60,7 +60,7 @@ template<typename VectorsType, typename CoeffsType, int Side>
 struct traits<HouseholderSequence<VectorsType,CoeffsType,Side> >
 {
   typedef typename VectorsType::Scalar Scalar;
-  typedef typename VectorsType::Index Index;
+  typedef typename VectorsType::StorageIndex StorageIndex;
   typedef typename VectorsType::StorageKind StorageKind;
   enum {
     RowsAtCompileTime = Side==OnTheLeft ? traits<VectorsType>::RowsAtCompileTime
@@ -73,12 +73,20 @@ struct traits<HouseholderSequence<VectorsType,CoeffsType,Side> >
   };
 };
 
+struct HouseholderSequenceShape {};
+
+template<typename VectorsType, typename CoeffsType, int Side>
+struct evaluator_traits<HouseholderSequence<VectorsType,CoeffsType,Side> >
+  : public evaluator_traits_base<HouseholderSequence<VectorsType,CoeffsType,Side> >
+{
+  typedef HouseholderSequenceShape Shape;
+};
+
 template<typename VectorsType, typename CoeffsType, int Side>
 struct hseq_side_dependent_impl
 {
   typedef Block<const VectorsType, Dynamic, 1> EssentialVectorType;
   typedef HouseholderSequence<VectorsType, CoeffsType, OnTheLeft> HouseholderSequenceType;
-  typedef typename VectorsType::Index Index;
   static inline const EssentialVectorType essentialVector(const HouseholderSequenceType& h, Index k)
   {
     Index start = k+1+h.m_shift;
@@ -91,7 +99,6 @@ struct hseq_side_dependent_impl<VectorsType, CoeffsType, OnTheRight>
 {
   typedef Transpose<Block<const VectorsType, 1, Dynamic> > EssentialVectorType;
   typedef HouseholderSequence<VectorsType, CoeffsType, OnTheRight> HouseholderSequenceType;
-  typedef typename VectorsType::Index Index;
   static inline const EssentialVectorType essentialVector(const HouseholderSequenceType& h, Index k)
   {
     Index start = k+1+h.m_shift;
@@ -101,7 +108,7 @@ struct hseq_side_dependent_impl<VectorsType, CoeffsType, OnTheRight>
 
 template<typename OtherScalarType, typename MatrixType> struct matrix_type_times_scalar_type
 {
-  typedef typename scalar_product_traits<OtherScalarType, typename MatrixType::Scalar>::ReturnType
+  typedef typename ScalarBinaryOpTraits<OtherScalarType, typename MatrixType::Scalar>::ReturnType
     ResultScalar;
   typedef Matrix<ResultScalar, MatrixType::RowsAtCompileTime, MatrixType::ColsAtCompileTime,
                  0, MatrixType::MaxRowsAtCompileTime, MatrixType::MaxColsAtCompileTime> Type;
@@ -122,7 +129,6 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
       MaxColsAtCompileTime = internal::traits<HouseholderSequence>::MaxColsAtCompileTime
     };
     typedef typename internal::traits<HouseholderSequence>::Scalar Scalar;
-    typedef typename VectorsType::Index Index;
 
     typedef HouseholderSequence<
       typename internal::conditional<NumTraits<Scalar>::IsComplex,
@@ -237,8 +243,7 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
     {
       workspace.resize(rows());
       Index vecs = m_length;
-      if(    internal::is_same<typename internal::remove_all<VectorsType>::type,Dest>::value
-          && internal::extract_data(dst) == internal::extract_data(m_vectors))
+      if(internal::is_same_dense(dst,m_vectors))
       {
         // in-place
         dst.diagonal().setOnes();
@@ -299,7 +304,7 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
     /** \internal */
     template<typename Dest> inline void applyThisOnTheLeft(Dest& dst) const
     {
-      Matrix<Scalar,1,Dest::ColsAtCompileTime,RowMajor,1,Dest::MaxColsAtCompileTime> workspace(dst.cols());
+      Matrix<Scalar,1,Dest::ColsAtCompileTime,RowMajor,1,Dest::MaxColsAtCompileTime> workspace;
       applyThisOnTheLeft(dst, workspace);
     }
 
@@ -307,12 +312,36 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
     template<typename Dest, typename Workspace>
     inline void applyThisOnTheLeft(Dest& dst, Workspace& workspace) const
     {
-      workspace.resize(dst.cols());
-      for(Index k = 0; k < m_length; ++k)
+      const Index BlockSize = 48;
+      // if the entries are large enough, then apply the reflectors by block
+      if(m_length>=BlockSize && dst.cols()>1)
       {
-        Index actual_k = m_trans ? k : m_length-k-1;
-        dst.bottomRows(rows()-m_shift-actual_k)
-           .applyHouseholderOnTheLeft(essentialVector(actual_k), m_coeffs.coeff(actual_k), workspace.data());
+        for(Index i = 0; i < m_length; i+=BlockSize)
+        {
+          Index end = m_trans ? (std::min)(m_length,i+BlockSize) : m_length-i;
+          Index k = m_trans ? i : (std::max)(Index(0),end-BlockSize);
+          Index bs = end-k;
+          Index start = k + m_shift;
+          
+          typedef Block<typename internal::remove_all<VectorsType>::type,Dynamic,Dynamic> SubVectorsType;
+          SubVectorsType sub_vecs1(m_vectors.const_cast_derived(), Side==OnTheRight ? k : start,
+                                                                   Side==OnTheRight ? start : k,
+                                                                   Side==OnTheRight ? bs : m_vectors.rows()-start,
+                                                                   Side==OnTheRight ? m_vectors.cols()-start : bs);
+          typename internal::conditional<Side==OnTheRight, Transpose<SubVectorsType>, SubVectorsType&>::type sub_vecs(sub_vecs1);
+          Block<Dest,Dynamic,Dynamic> sub_dst(dst,dst.rows()-rows()+m_shift+k,0, rows()-m_shift-k,dst.cols());
+          apply_block_householder_on_the_left(sub_dst, sub_vecs, m_coeffs.segment(k, bs), !m_trans);
+        }
+      }
+      else
+      {
+        workspace.resize(dst.cols());
+        for(Index k = 0; k < m_length; ++k)
+        {
+          Index actual_k = m_trans ? k : m_length-k-1;
+          dst.bottomRows(rows()-m_shift-actual_k)
+            .applyHouseholderOnTheLeft(essentialVector(actual_k), m_coeffs.coeff(actual_k), workspace.data());
+        }
       }
     }
 
diff --git a/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h b/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h
index 73ca9bfde..358444aff 100644
--- a/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h
+++ b/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2011-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -17,33 +17,37 @@ namespace Eigen {
   *
   * This class allows to approximately solve for A.x = b problems assuming A is a diagonal matrix.
   * In other words, this preconditioner neglects all off diagonal entries and, in Eigen's language, solves for:
-  * \code
-  * A.diagonal().asDiagonal() . x = b
-  * \endcode
+    \code
+    A.diagonal().asDiagonal() . x = b
+    \endcode
   *
   * \tparam _Scalar the type of the scalar.
   *
+  * \implsparsesolverconcept
+  *
   * This preconditioner is suitable for both selfadjoint and general problems.
   * The diagonal entries are pre-inverted and stored into a dense vector.
   *
   * \note A variant that has yet to be implemented would attempt to preserve the norm of each column.
   *
+  * \sa class LeastSquareDiagonalPreconditioner, class ConjugateGradient
   */
 template <typename _Scalar>
 class DiagonalPreconditioner
 {
     typedef _Scalar Scalar;
     typedef Matrix<Scalar,Dynamic,1> Vector;
-    typedef typename Vector::Index Index;
-
   public:
-    // this typedef is only to export the scalar type and compile-time dimensions to solve_retval
-    typedef Matrix<Scalar,Dynamic,Dynamic> MatrixType;
+    typedef typename Vector::StorageIndex StorageIndex;
+    enum {
+      ColsAtCompileTime = Dynamic,
+      MaxColsAtCompileTime = Dynamic
+    };
 
     DiagonalPreconditioner() : m_isInitialized(false) {}
 
     template<typename MatType>
-    DiagonalPreconditioner(const MatType& mat) : m_invdiag(mat.cols())
+    explicit DiagonalPreconditioner(const MatType& mat) : m_invdiag(mat.cols())
     {
       compute(mat);
     }
@@ -65,10 +69,10 @@ class DiagonalPreconditioner
       {
         typename MatType::InnerIterator it(mat,j);
         while(it && it.index()!=j) ++it;
-        if(it && it.index()==j)
+        if(it && it.index()==j && it.value()!=Scalar(0))
           m_invdiag(j) = Scalar(1)/it.value();
         else
-          m_invdiag(j) = 0;
+          m_invdiag(j) = Scalar(1);
       }
       m_isInitialized = true;
       return *this;
@@ -80,46 +84,102 @@ class DiagonalPreconditioner
       return factorize(mat);
     }
 
+    /** \internal */
     template<typename Rhs, typename Dest>
-    void _solve(const Rhs& b, Dest& x) const
+    void _solve_impl(const Rhs& b, Dest& x) const
     {
       x = m_invdiag.array() * b.array() ;
     }
 
-    template<typename Rhs> inline const internal::solve_retval<DiagonalPreconditioner, Rhs>
+    template<typename Rhs> inline const Solve<DiagonalPreconditioner, Rhs>
     solve(const MatrixBase<Rhs>& b) const
     {
       eigen_assert(m_isInitialized && "DiagonalPreconditioner is not initialized.");
       eigen_assert(m_invdiag.size()==b.rows()
                 && "DiagonalPreconditioner::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::solve_retval<DiagonalPreconditioner, Rhs>(*this, b.derived());
+      return Solve<DiagonalPreconditioner, Rhs>(*this, b.derived());
     }
+    
+    ComputationInfo info() { return Success; }
 
   protected:
     Vector m_invdiag;
     bool m_isInitialized;
 };
 
-namespace internal {
-
-template<typename _MatrixType, typename Rhs>
-struct solve_retval<DiagonalPreconditioner<_MatrixType>, Rhs>
-  : solve_retval_base<DiagonalPreconditioner<_MatrixType>, Rhs>
+/** \ingroup IterativeLinearSolvers_Module
+  * \brief Jacobi preconditioner for LeastSquaresConjugateGradient
+  *
+  * This class allows to approximately solve for A' A x  = A' b problems assuming A' A is a diagonal matrix.
+  * In other words, this preconditioner neglects all off diagonal entries and, in Eigen's language, solves for:
+    \code
+    (A.adjoint() * A).diagonal().asDiagonal() * x = b
+    \endcode
+  *
+  * \tparam _Scalar the type of the scalar.
+  *
+  * \implsparsesolverconcept
+  *
+  * The diagonal entries are pre-inverted and stored into a dense vector.
+  * 
+  * \sa class LeastSquaresConjugateGradient, class DiagonalPreconditioner
+  */
+template <typename _Scalar>
+class LeastSquareDiagonalPreconditioner : public DiagonalPreconditioner<_Scalar>
 {
-  typedef DiagonalPreconditioner<_MatrixType> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
+    typedef _Scalar Scalar;
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+    typedef DiagonalPreconditioner<_Scalar> Base;
+    using Base::m_invdiag;
+  public:
 
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
-  }
-};
+    LeastSquareDiagonalPreconditioner() : Base() {}
+
+    template<typename MatType>
+    explicit LeastSquareDiagonalPreconditioner(const MatType& mat) : Base()
+    {
+      compute(mat);
+    }
+
+    template<typename MatType>
+    LeastSquareDiagonalPreconditioner& analyzePattern(const MatType& )
+    {
+      return *this;
+    }
+    
+    template<typename MatType>
+    LeastSquareDiagonalPreconditioner& factorize(const MatType& mat)
+    {
+      // Compute the inverse squared-norm of each column of mat
+      m_invdiag.resize(mat.cols());
+      for(Index j=0; j<mat.outerSize(); ++j)
+      {
+        RealScalar sum = mat.innerVector(j).squaredNorm();
+        if(sum>0)
+          m_invdiag(j) = RealScalar(1)/sum;
+        else
+          m_invdiag(j) = RealScalar(1);
+      }
+      Base::m_isInitialized = true;
+      return *this;
+    }
+    
+    template<typename MatType>
+    LeastSquareDiagonalPreconditioner& compute(const MatType& mat)
+    {
+      return factorize(mat);
+    }
+    
+    ComputationInfo info() { return Success; }
 
-}
+  protected:
+};
 
 /** \ingroup IterativeLinearSolvers_Module
   * \brief A naive preconditioner which approximates any matrix as the identity matrix
   *
+  * \implsparsesolverconcept
+  *
   * \sa class DiagonalPreconditioner
   */
 class IdentityPreconditioner
@@ -129,7 +189,7 @@ class IdentityPreconditioner
     IdentityPreconditioner() {}
 
     template<typename MatrixType>
-    IdentityPreconditioner(const MatrixType& ) {}
+    explicit IdentityPreconditioner(const MatrixType& ) {}
     
     template<typename MatrixType>
     IdentityPreconditioner& analyzePattern(const MatrixType& ) { return *this; }
@@ -142,6 +202,8 @@ class IdentityPreconditioner
     
     template<typename Rhs>
     inline const Rhs& solve(const Rhs& b) const { return b; }
+    
+    ComputationInfo info() { return Success; }
 };
 
 } // end namespace Eigen
diff --git a/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h b/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h
index 2625c4dc3..454f46814 100644
--- a/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h
+++ b/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2011-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -27,7 +27,7 @@ namespace internal {
   */
 template<typename MatrixType, typename Rhs, typename Dest, typename Preconditioner>
 bool bicgstab(const MatrixType& mat, const Rhs& rhs, Dest& x,
-              const Preconditioner& precond, int& iters,
+              const Preconditioner& precond, Index& iters,
               typename Dest::RealScalar& tol_error)
 {
   using std::sqrt;
@@ -36,9 +36,9 @@ bool bicgstab(const MatrixType& mat, const Rhs& rhs, Dest& x,
   typedef typename Dest::Scalar Scalar;
   typedef Matrix<Scalar,Dynamic,1> VectorType;
   RealScalar tol = tol_error;
-  int maxIters = iters;
+  Index maxIters = iters;
 
-  int n = mat.cols();
+  Index n = mat.cols();
   VectorType r  = rhs - mat * x;
   VectorType r0 = r;
   
@@ -59,20 +59,21 @@ bool bicgstab(const MatrixType& mat, const Rhs& rhs, Dest& x,
 
   VectorType s(n), t(n);
 
-  RealScalar tol2 = tol*tol;
+  RealScalar tol2 = tol*tol*rhs_sqnorm;
   RealScalar eps2 = NumTraits<Scalar>::epsilon()*NumTraits<Scalar>::epsilon();
-  int i = 0;
-  int restarts = 0;
+  Index i = 0;
+  Index restarts = 0;
 
-  while ( r.squaredNorm()/rhs_sqnorm > tol2 && i<maxIters )
+  while ( r.squaredNorm() > tol2 && i<maxIters )
   {
     Scalar rho_old = rho;
 
     rho = r0.dot(r);
     if (abs(rho) < eps2*r0_sqnorm)
     {
-      // The new residual vector became too orthogonal to the arbitrarily choosen direction r0
+      // The new residual vector became too orthogonal to the arbitrarily chosen direction r0
       // Let's restart with a new r0:
+      r  = rhs - mat * x;
       r0 = r;
       rho = r0_sqnorm = r.squaredNorm();
       if(restarts++ == 0)
@@ -131,35 +132,33 @@ struct traits<BiCGSTAB<_MatrixType,_Preconditioner> >
   * \tparam _MatrixType the type of the sparse matrix A, can be a dense or a sparse matrix.
   * \tparam _Preconditioner the type of the preconditioner. Default is DiagonalPreconditioner
   *
+  * \implsparsesolverconcept
+  *
   * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations()
   * and setTolerance() methods. The defaults are the size of the problem for the maximal number of iterations
   * and NumTraits<Scalar>::epsilon() for the tolerance.
   * 
+  * The tolerance corresponds to the relative residual error: |Ax-b|/|b|
+  * 
+  * \b Performance: when using sparse matrices, best performance is achied for a row-major sparse matrix format.
+  * Moreover, in this case multi-threading can be exploited if the user code is compiled with OpenMP enabled.
+  * See \ref TopicMultiThreading for details.
+  * 
   * This class can be used as the direct solver classes. Here is a typical usage example:
-  * \code
-  * int n = 10000;
-  * VectorXd x(n), b(n);
-  * SparseMatrix<double> A(n,n);
-  * // fill A and b
-  * BiCGSTAB<SparseMatrix<double> > solver;
-  * solver.compute(A);
-  * x = solver.solve(b);
-  * std::cout << "#iterations:     " << solver.iterations() << std::endl;
-  * std::cout << "estimated error: " << solver.error()      << std::endl;
-  * // update b, and solve again
-  * x = solver.solve(b);
-  * \endcode
+  * \include BiCGSTAB_simple.cpp
   * 
   * By default the iterations start with x=0 as an initial guess of the solution.
   * One can control the start using the solveWithGuess() method.
   * 
+  * BiCGSTAB can also be used in a matrix-free context, see the following \link MatrixfreeSolverExample example \endlink.
+  *
   * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner
   */
 template< typename _MatrixType, typename _Preconditioner>
 class BiCGSTAB : public IterativeSolverBase<BiCGSTAB<_MatrixType,_Preconditioner> >
 {
   typedef IterativeSolverBase<BiCGSTAB> Base;
-  using Base::mp_matrix;
+  using Base::matrix;
   using Base::m_error;
   using Base::m_iterations;
   using Base::m_info;
@@ -167,7 +166,6 @@ class BiCGSTAB : public IterativeSolverBase<BiCGSTAB<_MatrixType,_Preconditioner
 public:
   typedef _MatrixType MatrixType;
   typedef typename MatrixType::Scalar Scalar;
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::RealScalar RealScalar;
   typedef _Preconditioner Preconditioner;
 
@@ -186,38 +184,23 @@ public:
     * this class becomes invalid. Call compute() to update it with the new
     * matrix A, or modify a copy of A.
     */
-  BiCGSTAB(const MatrixType& A) : Base(A) {}
+  template<typename MatrixDerived>
+  explicit BiCGSTAB(const EigenBase<MatrixDerived>& A) : Base(A.derived()) {}
 
   ~BiCGSTAB() {}
-  
-  /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A
-    * \a x0 as an initial solution.
-    *
-    * \sa compute()
-    */
-  template<typename Rhs,typename Guess>
-  inline const internal::solve_retval_with_guess<BiCGSTAB, Rhs, Guess>
-  solveWithGuess(const MatrixBase<Rhs>& b, const Guess& x0) const
-  {
-    eigen_assert(m_isInitialized && "BiCGSTAB is not initialized.");
-    eigen_assert(Base::rows()==b.rows()
-              && "BiCGSTAB::solve(): invalid number of rows of the right hand side matrix b");
-    return internal::solve_retval_with_guess
-            <BiCGSTAB, Rhs, Guess>(*this, b.derived(), x0);
-  }
-  
+
   /** \internal */
   template<typename Rhs,typename Dest>
-  void _solveWithGuess(const Rhs& b, Dest& x) const
+  void _solve_with_guess_impl(const Rhs& b, Dest& x) const
   {    
     bool failed = false;
-    for(int j=0; j<b.cols(); ++j)
+    for(Index j=0; j<b.cols(); ++j)
     {
       m_iterations = Base::maxIterations();
       m_error = Base::m_tolerance;
       
       typename Dest::ColXpr xj(x,j);
-      if(!internal::bicgstab(*mp_matrix, b.col(j), xj, Base::m_preconditioner, m_iterations, m_error))
+      if(!internal::bicgstab(matrix(), b.col(j), xj, Base::m_preconditioner, m_iterations, m_error))
         failed = true;
     }
     m_info = failed ? NumericalIssue
@@ -227,36 +210,19 @@ public:
   }
 
   /** \internal */
+  using Base::_solve_impl;
   template<typename Rhs,typename Dest>
-  void _solve(const Rhs& b, Dest& x) const
+  void _solve_impl(const MatrixBase<Rhs>& b, Dest& x) const
   {
-//     x.setZero();
-  x = b;
-    _solveWithGuess(b,x);
+    x.resize(this->rows(),b.cols());
+    x.setZero();
+    _solve_with_guess_impl(b,x);
   }
 
 protected:
 
 };
 
-
-namespace internal {
-
-  template<typename _MatrixType, typename _Preconditioner, typename Rhs>
-struct solve_retval<BiCGSTAB<_MatrixType, _Preconditioner>, Rhs>
-  : solve_retval_base<BiCGSTAB<_MatrixType, _Preconditioner>, Rhs>
-{
-  typedef BiCGSTAB<_MatrixType, _Preconditioner> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
-  }
-};
-
-} // end namespace internal
-
 } // end namespace Eigen
 
 #endif // EIGEN_BICGSTAB_H
diff --git a/Eigen/src/IterativeLinearSolvers/CMakeLists.txt b/Eigen/src/IterativeLinearSolvers/CMakeLists.txt
deleted file mode 100644
index 59ccc0072..000000000
--- a/Eigen/src/IterativeLinearSolvers/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_IterativeLinearSolvers_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_IterativeLinearSolvers_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/IterativeLinearSolvers COMPONENT Devel
-  )
diff --git a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h
index 8ba4a8dbe..395daa8e4 100644
--- a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h
+++ b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2011-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -26,7 +26,7 @@ namespace internal {
 template<typename MatrixType, typename Rhs, typename Dest, typename Preconditioner>
 EIGEN_DONT_INLINE
 void conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x,
-                        const Preconditioner& precond, int& iters,
+                        const Preconditioner& precond, Index& iters,
                         typename Dest::RealScalar& tol_error)
 {
   using std::sqrt;
@@ -36,9 +36,9 @@ void conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x,
   typedef Matrix<Scalar,Dynamic,1> VectorType;
   
   RealScalar tol = tol_error;
-  int maxIters = iters;
+  Index maxIters = iters;
   
-  int n = mat.cols();
+  Index n = mat.cols();
 
   VectorType residual = rhs - mat * x; //initial residual
 
@@ -60,29 +60,29 @@ void conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x,
   }
   
   VectorType p(n);
-  p = precond.solve(residual);      //initial search direction
+  p = precond.solve(residual);      // initial search direction
 
   VectorType z(n), tmp(n);
   RealScalar absNew = numext::real(residual.dot(p));  // the square of the absolute value of r scaled by invM
-  int i = 0;
+  Index i = 0;
   while(i < maxIters)
   {
-    tmp.noalias() = mat * p;              // the bottleneck of the algorithm
+    tmp.noalias() = mat * p;                    // the bottleneck of the algorithm
 
-    Scalar alpha = absNew / p.dot(tmp);   // the amount we travel on dir
-    x += alpha * p;                       // update solution
-    residual -= alpha * tmp;              // update residue
+    Scalar alpha = absNew / p.dot(tmp);         // the amount we travel on dir
+    x += alpha * p;                             // update solution
+    residual -= alpha * tmp;                    // update residual
     
     residualNorm2 = residual.squaredNorm();
     if(residualNorm2 < threshold)
       break;
     
-    z = precond.solve(residual);          // approximately solve for "A z = residual"
+    z = precond.solve(residual);                // approximately solve for "A z = residual"
 
     RealScalar absOld = absNew;
     absNew = numext::real(residual.dot(z));     // update the absolute value of r
-    RealScalar beta = absNew / absOld;            // calculate the Gram-Schmidt value used to create the new search direction
-    p = z + beta * p;                             // update search direction
+    RealScalar beta = absNew / absOld;          // calculate the Gram-Schmidt value used to create the new search direction
+    p = z + beta * p;                           // update search direction
     i++;
   }
   tol_error = sqrt(residualNorm2 / rhsNorm2);
@@ -107,45 +107,57 @@ struct traits<ConjugateGradient<_MatrixType,_UpLo,_Preconditioner> >
 }
 
 /** \ingroup IterativeLinearSolvers_Module
-  * \brief A conjugate gradient solver for sparse self-adjoint problems
+  * \brief A conjugate gradient solver for sparse (or dense) self-adjoint problems
   *
-  * This class allows to solve for A.x = b sparse linear problems using a conjugate gradient algorithm.
-  * The sparse matrix A must be selfadjoint. The vectors x and b can be either dense or sparse.
+  * This class allows to solve for A.x = b linear problems using an iterative conjugate gradient algorithm.
+  * The matrix A must be selfadjoint. The matrix A and the vectors x and b can be either dense or sparse.
   *
   * \tparam _MatrixType the type of the matrix A, can be a dense or a sparse matrix.
   * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower,
-  *               Upper, or Lower|Upper in which the full matrix entries will be considered. Default is Lower.
+  *               \c Upper, or \c Lower|Upper in which the full matrix entries will be considered.
+  *               Default is \c Lower, best performance is \c Lower|Upper.
   * \tparam _Preconditioner the type of the preconditioner. Default is DiagonalPreconditioner
   *
+  * \implsparsesolverconcept
+  *
   * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations()
   * and setTolerance() methods. The defaults are the size of the problem for the maximal number of iterations
   * and NumTraits<Scalar>::epsilon() for the tolerance.
   * 
+  * The tolerance corresponds to the relative residual error: |Ax-b|/|b|
+  * 
+  * \b Performance: Even though the default value of \c _UpLo is \c Lower, significantly higher performance is
+  * achieved when using a complete matrix and \b Lower|Upper as the \a _UpLo template parameter. Moreover, in this
+  * case multi-threading can be exploited if the user code is compiled with OpenMP enabled.
+  * See \ref TopicMultiThreading for details.
+  * 
   * This class can be used as the direct solver classes. Here is a typical usage example:
-  * \code
-  * int n = 10000;
-  * VectorXd x(n), b(n);
-  * SparseMatrix<double> A(n,n);
-  * // fill A and b
-  * ConjugateGradient<SparseMatrix<double> > cg;
-  * cg.compute(A);
-  * x = cg.solve(b);
-  * std::cout << "#iterations:     " << cg.iterations() << std::endl;
-  * std::cout << "estimated error: " << cg.error()      << std::endl;
-  * // update b, and solve again
-  * x = cg.solve(b);
-  * \endcode
+    \code
+    int n = 10000;
+    VectorXd x(n), b(n);
+    SparseMatrix<double> A(n,n);
+    // fill A and b
+    ConjugateGradient<SparseMatrix<double>, Lower|Upper> cg;
+    cg.compute(A);
+    x = cg.solve(b);
+    std::cout << "#iterations:     " << cg.iterations() << std::endl;
+    std::cout << "estimated error: " << cg.error()      << std::endl;
+    // update b, and solve again
+    x = cg.solve(b);
+    \endcode
   * 
   * By default the iterations start with x=0 as an initial guess of the solution.
   * One can control the start using the solveWithGuess() method.
   * 
-  * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner
+  * ConjugateGradient can also be used in a matrix-free context, see the following \link MatrixfreeSolverExample example \endlink.
+  *
+  * \sa class LeastSquaresConjugateGradient, class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner
   */
 template< typename _MatrixType, int _UpLo, typename _Preconditioner>
 class ConjugateGradient : public IterativeSolverBase<ConjugateGradient<_MatrixType,_UpLo,_Preconditioner> >
 {
   typedef IterativeSolverBase<ConjugateGradient> Base;
-  using Base::mp_matrix;
+  using Base::matrix;
   using Base::m_error;
   using Base::m_iterations;
   using Base::m_info;
@@ -153,7 +165,6 @@ class ConjugateGradient : public IterativeSolverBase<ConjugateGradient<_MatrixTy
 public:
   typedef _MatrixType MatrixType;
   typedef typename MatrixType::Scalar Scalar;
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::RealScalar RealScalar;
   typedef _Preconditioner Preconditioner;
 
@@ -176,44 +187,40 @@ public:
     * this class becomes invalid. Call compute() to update it with the new
     * matrix A, or modify a copy of A.
     */
-  ConjugateGradient(const MatrixType& A) : Base(A) {}
+  template<typename MatrixDerived>
+  explicit ConjugateGradient(const EigenBase<MatrixDerived>& A) : Base(A.derived()) {}
 
   ~ConjugateGradient() {}
-  
-  /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A
-    * \a x0 as an initial solution.
-    *
-    * \sa compute()
-    */
-  template<typename Rhs,typename Guess>
-  inline const internal::solve_retval_with_guess<ConjugateGradient, Rhs, Guess>
-  solveWithGuess(const MatrixBase<Rhs>& b, const Guess& x0) const
-  {
-    eigen_assert(m_isInitialized && "ConjugateGradient is not initialized.");
-    eigen_assert(Base::rows()==b.rows()
-              && "ConjugateGradient::solve(): invalid number of rows of the right hand side matrix b");
-    return internal::solve_retval_with_guess
-            <ConjugateGradient, Rhs, Guess>(*this, b.derived(), x0);
-  }
 
   /** \internal */
   template<typename Rhs,typename Dest>
-  void _solveWithGuess(const Rhs& b, Dest& x) const
+  void _solve_with_guess_impl(const Rhs& b, Dest& x) const
   {
+    typedef typename Base::MatrixWrapper MatrixWrapper;
+    typedef typename Base::ActualMatrixType ActualMatrixType;
+    enum {
+      TransposeInput  =   (!MatrixWrapper::MatrixFree)
+                      &&  (UpLo==(Lower|Upper))
+                      &&  (!MatrixType::IsRowMajor)
+                      &&  (!NumTraits<Scalar>::IsComplex)
+    };
+    typedef typename internal::conditional<TransposeInput,Transpose<const ActualMatrixType>, ActualMatrixType const&>::type RowMajorWrapper;
+    EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(MatrixWrapper::MatrixFree,UpLo==(Lower|Upper)),MATRIX_FREE_CONJUGATE_GRADIENT_IS_COMPATIBLE_WITH_UPPER_UNION_LOWER_MODE_ONLY);
     typedef typename internal::conditional<UpLo==(Lower|Upper),
-                                           const MatrixType&,
-                                           SparseSelfAdjointView<const MatrixType, UpLo>
-                                          >::type MatrixWrapperType;
+                                           RowMajorWrapper,
+                                           typename MatrixWrapper::template ConstSelfAdjointViewReturnType<UpLo>::Type
+                                          >::type SelfAdjointWrapper;
     m_iterations = Base::maxIterations();
     m_error = Base::m_tolerance;
 
-    for(int j=0; j<b.cols(); ++j)
+    for(Index j=0; j<b.cols(); ++j)
     {
       m_iterations = Base::maxIterations();
       m_error = Base::m_tolerance;
 
       typename Dest::ColXpr xj(x,j);
-      internal::conjugate_gradient(MatrixWrapperType(*mp_matrix), b.col(j), xj, Base::m_preconditioner, m_iterations, m_error);
+      RowMajorWrapper row_mat(matrix());
+      internal::conjugate_gradient(SelfAdjointWrapper(row_mat), b.col(j), xj, Base::m_preconditioner, m_iterations, m_error);
     }
 
     m_isInitialized = true;
@@ -221,35 +228,18 @@ public:
   }
   
   /** \internal */
+  using Base::_solve_impl;
   template<typename Rhs,typename Dest>
-  void _solve(const Rhs& b, Dest& x) const
+  void _solve_impl(const MatrixBase<Rhs>& b, Dest& x) const
   {
     x.setZero();
-    _solveWithGuess(b,x);
+    _solve_with_guess_impl(b.derived(),x);
   }
 
 protected:
 
 };
 
-
-namespace internal {
-
-template<typename _MatrixType, int _UpLo, typename _Preconditioner, typename Rhs>
-struct solve_retval<ConjugateGradient<_MatrixType,_UpLo,_Preconditioner>, Rhs>
-  : solve_retval_base<ConjugateGradient<_MatrixType,_UpLo,_Preconditioner>, Rhs>
-{
-  typedef ConjugateGradient<_MatrixType,_UpLo,_Preconditioner> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
-  }
-};
-
-} // end namespace internal
-
 } // end namespace Eigen
 
 #endif // EIGEN_CONJUGATE_GRADIENT_H
diff --git a/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h
new file mode 100644
index 000000000..e45c272b4
--- /dev/null
+++ b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h
@@ -0,0 +1,400 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam@inria.fr>
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_INCOMPLETE_CHOlESKY_H
+#define EIGEN_INCOMPLETE_CHOlESKY_H
+
+#include <vector>
+#include <list>
+
+namespace Eigen {  
+/** 
+  * \brief Modified Incomplete Cholesky with dual threshold
+  *
+  * References : C-J. Lin and J. J. Moré, Incomplete Cholesky Factorizations with
+  *              Limited memory, SIAM J. Sci. Comput.  21(1), pp. 24-45, 1999
+  *
+  * \tparam Scalar the scalar type of the input matrices
+  * \tparam _UpLo The triangular part that will be used for the computations. It can be Lower
+    *               or Upper. Default is Lower.
+  * \tparam _OrderingType The ordering method to use, either AMDOrdering<> or NaturalOrdering<>. Default is AMDOrdering<int>,
+  *                       unless EIGEN_MPL2_ONLY is defined, in which case the default is NaturalOrdering<int>.
+  *
+  * \implsparsesolverconcept
+  *
+  * It performs the following incomplete factorization: \f$ S P A P' S \approx L L' \f$
+  * where L is a lower triangular factor, S is a diagonal scaling matrix, and P is a
+  * fill-in reducing permutation as computed by the ordering method.
+  *
+  * \b Shifting \b strategy: Let \f$ B = S P A P' S \f$  be the scaled matrix on which the factorization is carried out,
+  * and \f$ \beta \f$ be the minimum value of the diagonal. If \f$ \beta > 0 \f$ then, the factorization is directly performed
+  * on the matrix B. Otherwise, the factorization is performed on the shifted matrix \f$ B + (\sigma+|\beta| I \f$ where
+  * \f$ \sigma \f$ is the initial shift value as returned and set by setInitialShift() method. The default value is \f$ \sigma = 10^{-3} \f$.
+  * If the factorization fails, then the shift in doubled until it succeed or a maximum of ten attempts. If it still fails, as returned by
+  * the info() method, then you can either increase the initial shift, or better use another preconditioning technique.
+  *
+  */
+template <typename Scalar, int _UpLo = Lower, typename _OrderingType =
+#ifndef EIGEN_MPL2_ONLY
+AMDOrdering<int>
+#else
+NaturalOrdering<int>
+#endif
+>
+class IncompleteCholesky : public SparseSolverBase<IncompleteCholesky<Scalar,_UpLo,_OrderingType> >
+{
+  protected:
+    typedef SparseSolverBase<IncompleteCholesky<Scalar,_UpLo,_OrderingType> > Base;
+    using Base::m_isInitialized;
+  public:
+    typedef typename NumTraits<Scalar>::Real RealScalar; 
+    typedef _OrderingType OrderingType;
+    typedef typename OrderingType::PermutationType PermutationType;
+    typedef typename PermutationType::StorageIndex StorageIndex; 
+    typedef SparseMatrix<Scalar,ColMajor,StorageIndex> FactorType;
+    typedef Matrix<Scalar,Dynamic,1> VectorSx;
+    typedef Matrix<RealScalar,Dynamic,1> VectorRx;
+    typedef Matrix<StorageIndex,Dynamic, 1> VectorIx;
+    typedef std::vector<std::list<StorageIndex> > VectorList; 
+    enum { UpLo = _UpLo };
+    enum {
+      ColsAtCompileTime = Dynamic,
+      MaxColsAtCompileTime = Dynamic
+    };
+  public:
+
+    /** Default constructor leaving the object in a partly non-initialized stage.
+      *
+      * You must call compute() or the pair analyzePattern()/factorize() to make it valid.
+      *
+      * \sa IncompleteCholesky(const MatrixType&)
+      */
+    IncompleteCholesky() : m_initialShift(1e-3),m_factorizationIsOk(false) {}
+    
+    /** Constructor computing the incomplete factorization for the given matrix \a matrix.
+      */
+    template<typename MatrixType>
+    IncompleteCholesky(const MatrixType& matrix) : m_initialShift(1e-3),m_factorizationIsOk(false)
+    {
+      compute(matrix);
+    }
+    
+    /** \returns number of rows of the factored matrix */
+    Index rows() const { return m_L.rows(); }
+    
+    /** \returns number of columns of the factored matrix */
+    Index cols() const { return m_L.cols(); }
+    
+
+    /** \brief Reports whether previous computation was successful.
+      *
+      * It triggers an assertion if \c *this has not been initialized through the respective constructor,
+      * or a call to compute() or analyzePattern().
+      *
+      * \returns \c Success if computation was successful,
+      *          \c NumericalIssue if the matrix appears to be negative.
+      */
+    ComputationInfo info() const
+    {
+      eigen_assert(m_isInitialized && "IncompleteCholesky is not initialized.");
+      return m_info;
+    }
+    
+    /** \brief Set the initial shift parameter \f$ \sigma \f$.
+      */
+    void setInitialShift(RealScalar shift) { m_initialShift = shift; }
+    
+    /** \brief Computes the fill reducing permutation vector using the sparsity pattern of \a mat
+      */
+    template<typename MatrixType>
+    void analyzePattern(const MatrixType& mat)
+    {
+      OrderingType ord; 
+      PermutationType pinv;
+      ord(mat.template selfadjointView<UpLo>(), pinv); 
+      if(pinv.size()>0) m_perm = pinv.inverse();
+      else              m_perm.resize(0);
+      m_L.resize(mat.rows(), mat.cols());
+      m_analysisIsOk = true;
+      m_isInitialized = true;
+      m_info = Success;
+    }
+    
+    /** \brief Performs the numerical factorization of the input matrix \a mat
+      *
+      * The method analyzePattern() or compute() must have been called beforehand
+      * with a matrix having the same pattern.
+      *
+      * \sa compute(), analyzePattern()
+      */
+    template<typename MatrixType>
+    void factorize(const MatrixType& mat);
+    
+    /** Computes or re-computes the incomplete Cholesky factorization of the input matrix \a mat
+      *
+      * It is a shortcut for a sequential call to the analyzePattern() and factorize() methods.
+      *
+      * \sa analyzePattern(), factorize()
+      */
+    template<typename MatrixType>
+    void compute(const MatrixType& mat)
+    {
+      analyzePattern(mat);
+      factorize(mat);
+    }
+    
+    // internal
+    template<typename Rhs, typename Dest>
+    void _solve_impl(const Rhs& b, Dest& x) const
+    {
+      eigen_assert(m_factorizationIsOk && "factorize() should be called first");
+      if (m_perm.rows() == b.rows())  x = m_perm * b;
+      else                            x = b;
+      x = m_scale.asDiagonal() * x;
+      x = m_L.template triangularView<Lower>().solve(x);
+      x = m_L.adjoint().template triangularView<Upper>().solve(x);
+      x = m_scale.asDiagonal() * x;
+      if (m_perm.rows() == b.rows())
+        x = m_perm.inverse() * x;
+    }
+
+    /** \returns the sparse lower triangular factor L */
+    const FactorType& matrixL() const { eigen_assert("m_factorizationIsOk"); return m_L; }
+
+    /** \returns a vector representing the scaling factor S */
+    const VectorRx& scalingS() const { eigen_assert("m_factorizationIsOk"); return m_scale; }
+
+    /** \returns the fill-in reducing permutation P (can be empty for a natural ordering) */
+    const PermutationType& permutationP() const { eigen_assert("m_analysisIsOk"); return m_perm; }
+
+  protected:
+    FactorType m_L;              // The lower part stored in CSC
+    VectorRx m_scale;            // The vector for scaling the matrix 
+    RealScalar m_initialShift;   // The initial shift parameter
+    bool m_analysisIsOk; 
+    bool m_factorizationIsOk; 
+    ComputationInfo m_info;
+    PermutationType m_perm; 
+
+  private:
+    inline void updateList(Ref<const VectorIx> colPtr, Ref<VectorIx> rowIdx, Ref<VectorSx> vals, const Index& col, const Index& jk, VectorIx& firstElt, VectorList& listCol); 
+}; 
+
+// Based on the following paper:
+//   C-J. Lin and J. J. Moré, Incomplete Cholesky Factorizations with
+//   Limited memory, SIAM J. Sci. Comput.  21(1), pp. 24-45, 1999
+//   http://ftp.mcs.anl.gov/pub/tech_reports/reports/P682.pdf
+template<typename Scalar, int _UpLo, typename OrderingType>
+template<typename _MatrixType>
+void IncompleteCholesky<Scalar,_UpLo, OrderingType>::factorize(const _MatrixType& mat)
+{
+  using std::sqrt;
+  eigen_assert(m_analysisIsOk && "analyzePattern() should be called first"); 
+    
+  // Dropping strategy : Keep only the p largest elements per column, where p is the number of elements in the column of the original matrix. Other strategies will be added
+  
+  // Apply the fill-reducing permutation computed in analyzePattern()
+  if (m_perm.rows() == mat.rows() ) // To detect the null permutation
+  {
+    // The temporary is needed to make sure that the diagonal entry is properly sorted
+    FactorType tmp(mat.rows(), mat.cols());
+    tmp = mat.template selfadjointView<_UpLo>().twistedBy(m_perm);
+    m_L.template selfadjointView<Lower>() = tmp.template selfadjointView<Lower>();
+  }
+  else
+  {
+    m_L.template selfadjointView<Lower>() = mat.template selfadjointView<_UpLo>();
+  }
+  
+  Index n = m_L.cols(); 
+  Index nnz = m_L.nonZeros();
+  Map<VectorSx> vals(m_L.valuePtr(), nnz);         //values
+  Map<VectorIx> rowIdx(m_L.innerIndexPtr(), nnz);  //Row indices
+  Map<VectorIx> colPtr( m_L.outerIndexPtr(), n+1); // Pointer to the beginning of each row
+  VectorIx firstElt(n-1); // for each j, points to the next entry in vals that will be used in the factorization
+  VectorList listCol(n);  // listCol(j) is a linked list of columns to update column j
+  VectorSx col_vals(n);   // Store a  nonzero values in each column
+  VectorIx col_irow(n);   // Row indices of nonzero elements in each column
+  VectorIx col_pattern(n);
+  col_pattern.fill(-1);
+  StorageIndex col_nnz;
+  
+  
+  // Computes the scaling factors 
+  m_scale.resize(n);
+  m_scale.setZero();
+  for (Index j = 0; j < n; j++)
+    for (Index k = colPtr[j]; k < colPtr[j+1]; k++)
+    {
+      m_scale(j) += numext::abs2(vals(k));
+      if(rowIdx[k]!=j)
+        m_scale(rowIdx[k]) += numext::abs2(vals(k));
+    }
+  
+  m_scale = m_scale.cwiseSqrt().cwiseSqrt();
+
+  for (Index j = 0; j < n; ++j)
+    if(m_scale(j)>(std::numeric_limits<RealScalar>::min)())
+      m_scale(j) = RealScalar(1)/m_scale(j);
+    else
+      m_scale(j) = 1;
+
+  // TODO disable scaling if not needed, i.e., if it is roughly uniform? (this will make solve() faster)
+  
+  // Scale and compute the shift for the matrix 
+  RealScalar mindiag = NumTraits<RealScalar>::highest();
+  for (Index j = 0; j < n; j++)
+  {
+    for (Index k = colPtr[j]; k < colPtr[j+1]; k++)
+      vals[k] *= (m_scale(j)*m_scale(rowIdx[k]));
+    eigen_internal_assert(rowIdx[colPtr[j]]==j && "IncompleteCholesky: only the lower triangular part must be stored");
+    mindiag = numext::mini(numext::real(vals[colPtr[j]]), mindiag);
+  }
+
+  FactorType L_save = m_L;
+  
+  RealScalar shift = 0;
+  if(mindiag <= RealScalar(0.))
+    shift = m_initialShift - mindiag;
+
+  m_info = NumericalIssue;
+
+  // Try to perform the incomplete factorization using the current shift
+  int iter = 0;
+  do
+  {
+    // Apply the shift to the diagonal elements of the matrix
+    for (Index j = 0; j < n; j++)
+      vals[colPtr[j]] += shift;
+
+    // jki version of the Cholesky factorization
+    Index j=0;
+    for (; j < n; ++j)
+    {
+      // Left-looking factorization of the j-th column
+      // First, load the j-th column into col_vals
+      Scalar diag = vals[colPtr[j]];  // It is assumed that only the lower part is stored
+      col_nnz = 0;
+      for (Index i = colPtr[j] + 1; i < colPtr[j+1]; i++)
+      {
+        StorageIndex l = rowIdx[i];
+        col_vals(col_nnz) = vals[i];
+        col_irow(col_nnz) = l;
+        col_pattern(l) = col_nnz;
+        col_nnz++;
+      }
+      {
+        typename std::list<StorageIndex>::iterator k;
+        // Browse all previous columns that will update column j
+        for(k = listCol[j].begin(); k != listCol[j].end(); k++)
+        {
+          Index jk = firstElt(*k); // First element to use in the column
+          eigen_internal_assert(rowIdx[jk]==j);
+          Scalar v_j_jk = numext::conj(vals[jk]);
+
+          jk += 1;
+          for (Index i = jk; i < colPtr[*k+1]; i++)
+          {
+            StorageIndex l = rowIdx[i];
+            if(col_pattern[l]<0)
+            {
+              col_vals(col_nnz) = vals[i] * v_j_jk;
+              col_irow[col_nnz] = l;
+              col_pattern(l) = col_nnz;
+              col_nnz++;
+            }
+            else
+              col_vals(col_pattern[l]) -= vals[i] * v_j_jk;
+          }
+          updateList(colPtr,rowIdx,vals, *k, jk, firstElt, listCol);
+        }
+      }
+
+      // Scale the current column
+      if(numext::real(diag) <= 0)
+      {
+        if(++iter>=10)
+          return;
+
+        // increase shift
+        shift = numext::maxi(m_initialShift,RealScalar(2)*shift);
+        // restore m_L, col_pattern, and listCol
+        vals = Map<const VectorSx>(L_save.valuePtr(), nnz);
+        rowIdx = Map<const VectorIx>(L_save.innerIndexPtr(), nnz);
+        colPtr = Map<const VectorIx>(L_save.outerIndexPtr(), n+1);
+        col_pattern.fill(-1);
+        for(Index i=0; i<n; ++i)
+          listCol[i].clear();
+
+        break;
+      }
+
+      RealScalar rdiag = sqrt(numext::real(diag));
+      vals[colPtr[j]] = rdiag;
+      for (Index k = 0; k<col_nnz; ++k)
+      {
+        Index i = col_irow[k];
+        //Scale
+        col_vals(k) /= rdiag;
+        //Update the remaining diagonals with col_vals
+        vals[colPtr[i]] -= numext::abs2(col_vals(k));
+      }
+      // Select the largest p elements
+      // p is the original number of elements in the column (without the diagonal)
+      Index p = colPtr[j+1] - colPtr[j] - 1 ;
+      Ref<VectorSx> cvals = col_vals.head(col_nnz);
+      Ref<VectorIx> cirow = col_irow.head(col_nnz);
+      internal::QuickSplit(cvals,cirow, p);
+      // Insert the largest p elements in the matrix
+      Index cpt = 0;
+      for (Index i = colPtr[j]+1; i < colPtr[j+1]; i++)
+      {
+        vals[i] = col_vals(cpt);
+        rowIdx[i] = col_irow(cpt);
+        // restore col_pattern:
+        col_pattern(col_irow(cpt)) = -1;
+        cpt++;
+      }
+      // Get the first smallest row index and put it after the diagonal element
+      Index jk = colPtr(j)+1;
+      updateList(colPtr,rowIdx,vals,j,jk,firstElt,listCol);
+    }
+
+    if(j==n)
+    {
+      m_factorizationIsOk = true;
+      m_info = Success;
+    }
+  } while(m_info!=Success);
+}
+
+template<typename Scalar, int _UpLo, typename OrderingType>
+inline void IncompleteCholesky<Scalar,_UpLo, OrderingType>::updateList(Ref<const VectorIx> colPtr, Ref<VectorIx> rowIdx, Ref<VectorSx> vals, const Index& col, const Index& jk, VectorIx& firstElt, VectorList& listCol)
+{
+  if (jk < colPtr(col+1) )
+  {
+    Index p = colPtr(col+1) - jk;
+    Index minpos; 
+    rowIdx.segment(jk,p).minCoeff(&minpos);
+    minpos += jk;
+    if (rowIdx(minpos) != rowIdx(jk))
+    {
+      //Swap
+      std::swap(rowIdx(jk),rowIdx(minpos));
+      std::swap(vals(jk),vals(minpos));
+    }
+    firstElt(col) = internal::convert_index<StorageIndex,Index>(jk);
+    listCol[rowIdx(jk)].push_back(internal::convert_index<StorageIndex,Index>(col));
+  }
+}
+
+} // end namespace Eigen 
+
+#endif
diff --git a/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h b/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
index 4c169aa60..338e6f10a 100644
--- a/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
+++ b/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
@@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam@inria.fr>
+// Copyright (C) 2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -24,7 +25,7 @@ namespace internal {
   * \param ind The array of index for the elements in @p row
   * \param ncut  The number of largest elements to keep
   **/ 
-template <typename VectorV, typename VectorI, typename Index>
+template <typename VectorV, typename VectorI>
 Index QuickSplit(VectorV &row, VectorI &ind, Index ncut)
 {
   typedef typename VectorV::RealScalar RealScalar;
@@ -66,6 +67,8 @@ Index QuickSplit(VectorV &row, VectorI &ind, Index ncut)
   * \class IncompleteLUT
   * \brief Incomplete LU factorization with dual-threshold strategy
   *
+  * \implsparsesolverconcept
+  *
   * During the numerical factorization, two dropping rules are used :
   *  1) any element whose magnitude is less than some tolerance is dropped.
   *    This tolerance is obtained by multiplying the input tolerance @p droptol 
@@ -92,28 +95,36 @@ Index QuickSplit(VectorV &row, VectorI &ind, Index ncut)
   * alternatively, on GMANE:
   *   http://comments.gmane.org/gmane.comp.lib.eigen/3302
   */
-template <typename _Scalar>
-class IncompleteLUT : internal::noncopyable
+template <typename _Scalar, typename _StorageIndex = int>
+class IncompleteLUT : public SparseSolverBase<IncompleteLUT<_Scalar, _StorageIndex> >
 {
+  protected:
+    typedef SparseSolverBase<IncompleteLUT> Base;
+    using Base::m_isInitialized;
+  public:
     typedef _Scalar Scalar;
+    typedef _StorageIndex StorageIndex;
     typedef typename NumTraits<Scalar>::Real RealScalar;
     typedef Matrix<Scalar,Dynamic,1> Vector;
-    typedef SparseMatrix<Scalar,RowMajor> FactorType;
-    typedef SparseMatrix<Scalar,ColMajor> PermutType;
-    typedef typename FactorType::Index Index;
+    typedef Matrix<StorageIndex,Dynamic,1> VectorI;
+    typedef SparseMatrix<Scalar,RowMajor,StorageIndex> FactorType;
+
+    enum {
+      ColsAtCompileTime = Dynamic,
+      MaxColsAtCompileTime = Dynamic
+    };
 
   public:
-    typedef Matrix<Scalar,Dynamic,Dynamic> MatrixType;
     
     IncompleteLUT()
       : m_droptol(NumTraits<Scalar>::dummy_precision()), m_fillfactor(10),
-        m_analysisIsOk(false), m_factorizationIsOk(false), m_isInitialized(false)
+        m_analysisIsOk(false), m_factorizationIsOk(false)
     {}
     
     template<typename MatrixType>
-    IncompleteLUT(const MatrixType& mat, const RealScalar& droptol=NumTraits<Scalar>::dummy_precision(), int fillfactor = 10)
+    explicit IncompleteLUT(const MatrixType& mat, const RealScalar& droptol=NumTraits<Scalar>::dummy_precision(), int fillfactor = 10)
       : m_droptol(droptol),m_fillfactor(fillfactor),
-        m_analysisIsOk(false),m_factorizationIsOk(false),m_isInitialized(false)
+        m_analysisIsOk(false),m_factorizationIsOk(false)
     {
       eigen_assert(fillfactor != 0);
       compute(mat); 
@@ -146,7 +157,7 @@ class IncompleteLUT : internal::noncopyable
       * 
       **/
     template<typename MatrixType>
-    IncompleteLUT<Scalar>& compute(const MatrixType& amat)
+    IncompleteLUT& compute(const MatrixType& amat)
     {
       analyzePattern(amat); 
       factorize(amat);
@@ -157,23 +168,14 @@ class IncompleteLUT : internal::noncopyable
     void setFillfactor(int fillfactor); 
     
     template<typename Rhs, typename Dest>
-    void _solve(const Rhs& b, Dest& x) const
+    void _solve_impl(const Rhs& b, Dest& x) const
     {
-      x = m_Pinv * b;  
+      x = m_Pinv * b;
       x = m_lu.template triangularView<UnitLower>().solve(x);
       x = m_lu.template triangularView<Upper>().solve(x);
       x = m_P * x; 
     }
 
-    template<typename Rhs> inline const internal::solve_retval<IncompleteLUT, Rhs>
-     solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "IncompleteLUT is not initialized.");
-      eigen_assert(cols()==b.rows()
-                && "IncompleteLUT::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::solve_retval<IncompleteLUT, Rhs>(*this, b.derived());
-    }
-
 protected:
 
     /** keeps off-diagonal entries; drops diagonal entries */
@@ -191,18 +193,17 @@ protected:
     int m_fillfactor;
     bool m_analysisIsOk;
     bool m_factorizationIsOk;
-    bool m_isInitialized;
     ComputationInfo m_info;
-    PermutationMatrix<Dynamic,Dynamic,Index> m_P;     // Fill-reducing permutation
-    PermutationMatrix<Dynamic,Dynamic,Index> m_Pinv;  // Inverse permutation
+    PermutationMatrix<Dynamic,Dynamic,StorageIndex> m_P;     // Fill-reducing permutation
+    PermutationMatrix<Dynamic,Dynamic,StorageIndex> m_Pinv;  // Inverse permutation
 };
 
 /**
  * Set control parameter droptol
  *  \param droptol   Drop any element whose magnitude is less than this tolerance 
  **/ 
-template<typename Scalar>
-void IncompleteLUT<Scalar>::setDroptol(const RealScalar& droptol)
+template<typename Scalar, typename StorageIndex>
+void IncompleteLUT<Scalar,StorageIndex>::setDroptol(const RealScalar& droptol)
 {
   this->m_droptol = droptol;   
 }
@@ -211,52 +212,62 @@ void IncompleteLUT<Scalar>::setDroptol(const RealScalar& droptol)
  * Set control parameter fillfactor
  * \param fillfactor  This is used to compute the  number @p fill_in of largest elements to keep on each row. 
  **/ 
-template<typename Scalar>
-void IncompleteLUT<Scalar>::setFillfactor(int fillfactor)
+template<typename Scalar, typename StorageIndex>
+void IncompleteLUT<Scalar,StorageIndex>::setFillfactor(int fillfactor)
 {
   this->m_fillfactor = fillfactor;   
 }
 
-template <typename Scalar>
+template <typename Scalar, typename StorageIndex>
 template<typename _MatrixType>
-void IncompleteLUT<Scalar>::analyzePattern(const _MatrixType& amat)
+void IncompleteLUT<Scalar,StorageIndex>::analyzePattern(const _MatrixType& amat)
 {
   // Compute the Fill-reducing permutation
-  SparseMatrix<Scalar,ColMajor, Index> mat1 = amat;
-  SparseMatrix<Scalar,ColMajor, Index> mat2 = amat.transpose();
-  // Symmetrize the pattern
+  // Since ILUT does not perform any numerical pivoting,
+  // it is highly preferable to keep the diagonal through symmetric permutations.
+#ifndef EIGEN_MPL2_ONLY
+  // To this end, let's symmetrize the pattern and perform AMD on it.
+  SparseMatrix<Scalar,ColMajor, StorageIndex> mat1 = amat;
+  SparseMatrix<Scalar,ColMajor, StorageIndex> mat2 = amat.transpose();
   // FIXME for a matrix with nearly symmetric pattern, mat2+mat1 is the appropriate choice.
   //       on the other hand for a really non-symmetric pattern, mat2*mat1 should be prefered...
-  SparseMatrix<Scalar,ColMajor, Index> AtA = mat2 + mat1;
-  AtA.prune(keep_diag());
-  internal::minimum_degree_ordering<Scalar, Index>(AtA, m_P);  // Then compute the AMD ordering...
-
-  m_Pinv  = m_P.inverse(); // ... and the inverse permutation
+  SparseMatrix<Scalar,ColMajor, StorageIndex> AtA = mat2 + mat1;
+  AMDOrdering<StorageIndex> ordering;
+  ordering(AtA,m_P);
+  m_Pinv  = m_P.inverse(); // cache the inverse permutation
+#else
+  // If AMD is not available, (MPL2-only), then let's use the slower COLAMD routine.
+  SparseMatrix<Scalar,ColMajor, StorageIndex> mat1 = amat;
+  COLAMDOrdering<StorageIndex> ordering;
+  ordering(mat1,m_Pinv);
+  m_P = m_Pinv.inverse();
+#endif
 
   m_analysisIsOk = true;
   m_factorizationIsOk = false;
-  m_isInitialized = false;
+  m_isInitialized = true;
 }
 
-template <typename Scalar>
+template <typename Scalar, typename StorageIndex>
 template<typename _MatrixType>
-void IncompleteLUT<Scalar>::factorize(const _MatrixType& amat)
+void IncompleteLUT<Scalar,StorageIndex>::factorize(const _MatrixType& amat)
 {
   using std::sqrt;
   using std::swap;
   using std::abs;
+  using internal::convert_index;
 
   eigen_assert((amat.rows() == amat.cols()) && "The factorization should be done on a square matrix");
   Index n = amat.cols();  // Size of the matrix
   m_lu.resize(n,n);
   // Declare Working vectors and variables
   Vector u(n) ;     // real values of the row -- maximum size is n --
-  VectorXi ju(n);   // column position of the values in u -- maximum size  is n
-  VectorXi jr(n);   // Indicate the position of the nonzero elements in the vector u -- A zero location is indicated by -1
+  VectorI ju(n);   // column position of the values in u -- maximum size  is n
+  VectorI jr(n);   // Indicate the position of the nonzero elements in the vector u -- A zero location is indicated by -1
 
   // Apply the fill-reducing permutation
   eigen_assert(m_analysisIsOk && "You must first call analyzePattern()");
-  SparseMatrix<Scalar,RowMajor, Index> mat;
+  SparseMatrix<Scalar,RowMajor, StorageIndex> mat;
   mat = amat.twistedBy(m_Pinv);
 
   // Initialization
@@ -265,7 +276,7 @@ void IncompleteLUT<Scalar>::factorize(const _MatrixType& amat)
   u.fill(0);
 
   // number of largest elements to keep in each row:
-  Index fill_in =   static_cast<Index> (amat.nonZeros()*m_fillfactor)/n+1;
+  Index fill_in = (amat.nonZeros()*m_fillfactor)/n + 1;
   if (fill_in > n) fill_in = n;
 
   // number of largest nonzero elements to keep in the L and the U part of the current row:
@@ -280,9 +291,9 @@ void IncompleteLUT<Scalar>::factorize(const _MatrixType& amat)
 
     Index sizeu = 1; // number of nonzero elements in the upper part of the current row
     Index sizel = 0; // number of nonzero elements in the lower part of the current row
-    ju(ii)    = ii;
+    ju(ii)    = convert_index<StorageIndex>(ii);
     u(ii)     = 0;
-    jr(ii)    = ii;
+    jr(ii)    = convert_index<StorageIndex>(ii);
     RealScalar rownorm = 0;
 
     typename FactorType::InnerIterator j_it(mat, ii); // Iterate through the current row ii
@@ -292,9 +303,9 @@ void IncompleteLUT<Scalar>::factorize(const _MatrixType& amat)
       if (k < ii)
       {
         // copy the lower part
-        ju(sizel) = k;
+        ju(sizel) = convert_index<StorageIndex>(k);
         u(sizel) = j_it.value();
-        jr(k) = sizel;
+        jr(k) = convert_index<StorageIndex>(sizel);
         ++sizel;
       }
       else if (k == ii)
@@ -305,9 +316,9 @@ void IncompleteLUT<Scalar>::factorize(const _MatrixType& amat)
       {
         // copy the upper part
         Index jpos = ii + sizeu;
-        ju(jpos) = k;
+        ju(jpos) = convert_index<StorageIndex>(k);
         u(jpos) = j_it.value();
-        jr(k) = jpos;
+        jr(k) = convert_index<StorageIndex>(jpos);
         ++sizeu;
       }
       rownorm += numext::abs2(j_it.value());
@@ -337,7 +348,8 @@ void IncompleteLUT<Scalar>::factorize(const _MatrixType& amat)
         // swap the two locations
         Index j = ju(jj);
         swap(ju(jj), ju(k));
-        jr(minrow) = jj;   jr(j) = k;
+        jr(minrow) = convert_index<StorageIndex>(jj);
+        jr(j) = convert_index<StorageIndex>(k);
         swap(u(jj), u(k));
       }
       // Reset this location
@@ -361,8 +373,8 @@ void IncompleteLUT<Scalar>::factorize(const _MatrixType& amat)
       for (; ki_it; ++ki_it)
       {
         Scalar prod = fact * ki_it.value();
-        Index j       = ki_it.index();
-        Index jpos    = jr(j);
+        Index j     = ki_it.index();
+        Index jpos  = jr(j);
         if (jpos == -1) // fill-in element
         {
           Index newpos;
@@ -378,16 +390,16 @@ void IncompleteLUT<Scalar>::factorize(const _MatrixType& amat)
             sizel++;
             eigen_internal_assert(sizel<=ii);
           }
-          ju(newpos) = j;
+          ju(newpos) = convert_index<StorageIndex>(j);
           u(newpos) = -prod;
-          jr(j) = newpos;
+          jr(j) = convert_index<StorageIndex>(newpos);
         }
         else
           u(jpos) -= prod;
       }
       // store the pivot element
-      u(len) = fact;
-      ju(len) = minrow;
+      u(len)  = fact;
+      ju(len) = convert_index<StorageIndex>(minrow);
       ++len;
 
       jj++;
@@ -402,7 +414,7 @@ void IncompleteLUT<Scalar>::factorize(const _MatrixType& amat)
     sizel = len;
     len = (std::min)(sizel, nnzL);
     typename Vector::SegmentReturnType ul(u.segment(0, sizel));
-    typename VectorXi::SegmentReturnType jul(ju.segment(0, sizel));
+    typename VectorI::SegmentReturnType jul(ju.segment(0, sizel));
     internal::QuickSplit(ul, jul, len);
 
     // store the largest m_fill elements of the L part
@@ -431,39 +443,20 @@ void IncompleteLUT<Scalar>::factorize(const _MatrixType& amat)
     sizeu = len + 1; // +1 to take into account the diagonal element
     len = (std::min)(sizeu, nnzU);
     typename Vector::SegmentReturnType uu(u.segment(ii+1, sizeu-1));
-    typename VectorXi::SegmentReturnType juu(ju.segment(ii+1, sizeu-1));
+    typename VectorI::SegmentReturnType juu(ju.segment(ii+1, sizeu-1));
     internal::QuickSplit(uu, juu, len);
 
     // store the largest elements of the U part
     for(Index k = ii + 1; k < ii + len; k++)
       m_lu.insertBackByOuterInnerUnordered(ii,ju(k)) = u(k);
   }
-
   m_lu.finalize();
   m_lu.makeCompressed();
 
   m_factorizationIsOk = true;
-  m_isInitialized = m_factorizationIsOk;
   m_info = Success;
 }
 
-namespace internal {
-
-template<typename _MatrixType, typename Rhs>
-struct solve_retval<IncompleteLUT<_MatrixType>, Rhs>
-  : solve_retval_base<IncompleteLUT<_MatrixType>, Rhs>
-{
-  typedef IncompleteLUT<_MatrixType> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
-  }
-};
-
-} // end namespace internal
-
 } // end namespace Eigen
 
 #endif // EIGEN_INCOMPLETE_LUT_H
diff --git a/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h b/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h
index 2036922d6..7c2326eb7 100644
--- a/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h
+++ b/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2011-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -12,29 +12,158 @@
 
 namespace Eigen { 
 
+namespace internal {
+
+template<typename MatrixType>
+struct is_ref_compatible_impl
+{
+private:
+  template <typename T0>
+  struct any_conversion
+  {
+    template <typename T> any_conversion(const volatile T&);
+    template <typename T> any_conversion(T&);
+  };
+  struct yes {int a[1];};
+  struct no  {int a[2];};
+
+  template<typename T>
+  static yes test(const Ref<const T>&, int);
+  template<typename T>
+  static no  test(any_conversion<T>, ...);
+
+public:
+  static MatrixType ms_from;
+  enum { value = sizeof(test<MatrixType>(ms_from, 0))==sizeof(yes) };
+};
+
+template<typename MatrixType>
+struct is_ref_compatible
+{
+  enum { value = is_ref_compatible_impl<typename remove_all<MatrixType>::type>::value };
+};
+
+template<typename MatrixType, bool MatrixFree = !internal::is_ref_compatible<MatrixType>::value>
+class generic_matrix_wrapper;
+
+// We have an explicit matrix at hand, compatible with Ref<>
+template<typename MatrixType>
+class generic_matrix_wrapper<MatrixType,false>
+{
+public:
+  typedef Ref<const MatrixType> ActualMatrixType;
+  template<int UpLo> struct ConstSelfAdjointViewReturnType {
+    typedef typename ActualMatrixType::template ConstSelfAdjointViewReturnType<UpLo>::Type Type;
+  };
+
+  enum {
+    MatrixFree = false
+  };
+
+  generic_matrix_wrapper()
+    : m_dummy(0,0), m_matrix(m_dummy)
+  {}
+
+  template<typename InputType>
+  generic_matrix_wrapper(const InputType &mat)
+    : m_matrix(mat)
+  {}
+
+  const ActualMatrixType& matrix() const
+  {
+    return m_matrix;
+  }
+
+  template<typename MatrixDerived>
+  void grab(const EigenBase<MatrixDerived> &mat)
+  {
+    m_matrix.~Ref<const MatrixType>();
+    ::new (&m_matrix) Ref<const MatrixType>(mat.derived());
+  }
+
+  void grab(const Ref<const MatrixType> &mat)
+  {
+    if(&(mat.derived()) != &m_matrix)
+    {
+      m_matrix.~Ref<const MatrixType>();
+      ::new (&m_matrix) Ref<const MatrixType>(mat);
+    }
+  }
+
+protected:
+  MatrixType m_dummy; // used to default initialize the Ref<> object
+  ActualMatrixType m_matrix;
+};
+
+// MatrixType is not compatible with Ref<> -> matrix-free wrapper
+template<typename MatrixType>
+class generic_matrix_wrapper<MatrixType,true>
+{
+public:
+  typedef MatrixType ActualMatrixType;
+  template<int UpLo> struct ConstSelfAdjointViewReturnType
+  {
+    typedef ActualMatrixType Type;
+  };
+
+  enum {
+    MatrixFree = true
+  };
+
+  generic_matrix_wrapper()
+    : mp_matrix(0)
+  {}
+
+  generic_matrix_wrapper(const MatrixType &mat)
+    : mp_matrix(&mat)
+  {}
+
+  const ActualMatrixType& matrix() const
+  {
+    return *mp_matrix;
+  }
+
+  void grab(const MatrixType &mat)
+  {
+    mp_matrix = &mat;
+  }
+
+protected:
+  const ActualMatrixType *mp_matrix;
+};
+
+}
+
 /** \ingroup IterativeLinearSolvers_Module
   * \brief Base class for linear iterative solvers
   *
   * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner
   */
 template< typename Derived>
-class IterativeSolverBase : internal::noncopyable
+class IterativeSolverBase : public SparseSolverBase<Derived>
 {
+protected:
+  typedef SparseSolverBase<Derived> Base;
+  using Base::m_isInitialized;
+  
 public:
   typedef typename internal::traits<Derived>::MatrixType MatrixType;
   typedef typename internal::traits<Derived>::Preconditioner Preconditioner;
   typedef typename MatrixType::Scalar Scalar;
-  typedef typename MatrixType::Index Index;
+  typedef typename MatrixType::StorageIndex StorageIndex;
   typedef typename MatrixType::RealScalar RealScalar;
 
+  enum {
+    ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+  };
+
 public:
 
-  Derived& derived() { return *static_cast<Derived*>(this); }
-  const Derived& derived() const { return *static_cast<const Derived*>(this); }
+  using Base::derived;
 
   /** Default constructor. */
   IterativeSolverBase()
-    : mp_matrix(0)
   {
     init();
   }
@@ -49,77 +178,90 @@ public:
     * this class becomes invalid. Call compute() to update it with the new
     * matrix A, or modify a copy of A.
     */
-  IterativeSolverBase(const MatrixType& A)
+  template<typename MatrixDerived>
+  explicit IterativeSolverBase(const EigenBase<MatrixDerived>& A)
+    : m_matrixWrapper(A.derived())
   {
     init();
-    compute(A);
+    compute(matrix());
   }
 
   ~IterativeSolverBase() {}
   
-  /** Initializes the iterative solver for the sparcity pattern of the matrix \a A for further solving \c Ax=b problems.
+  /** Initializes the iterative solver for the sparsity pattern of the matrix \a A for further solving \c Ax=b problems.
     *
-    * Currently, this function mostly call analyzePattern on the preconditioner. In the future
-    * we might, for instance, implement column reodering for faster matrix vector products.
+    * Currently, this function mostly calls analyzePattern on the preconditioner. In the future
+    * we might, for instance, implement column reordering for faster matrix vector products.
     */
-  Derived& analyzePattern(const MatrixType& A)
+  template<typename MatrixDerived>
+  Derived& analyzePattern(const EigenBase<MatrixDerived>& A)
   {
-    m_preconditioner.analyzePattern(A);
+    grab(A.derived());
+    m_preconditioner.analyzePattern(matrix());
     m_isInitialized = true;
     m_analysisIsOk = true;
-    m_info = Success;
+    m_info = m_preconditioner.info();
     return derived();
   }
   
   /** Initializes the iterative solver with the numerical values of the matrix \a A for further solving \c Ax=b problems.
     *
-    * Currently, this function mostly call factorize on the preconditioner.
+    * Currently, this function mostly calls factorize on the preconditioner.
     *
     * \warning this class stores a reference to the matrix A as well as some
     * precomputed values that depend on it. Therefore, if \a A is changed
     * this class becomes invalid. Call compute() to update it with the new
     * matrix A, or modify a copy of A.
     */
-  Derived& factorize(const MatrixType& A)
+  template<typename MatrixDerived>
+  Derived& factorize(const EigenBase<MatrixDerived>& A)
   {
     eigen_assert(m_analysisIsOk && "You must first call analyzePattern()"); 
-    mp_matrix = &A;
-    m_preconditioner.factorize(A);
+    grab(A.derived());
+    m_preconditioner.factorize(matrix());
     m_factorizationIsOk = true;
-    m_info = Success;
+    m_info = m_preconditioner.info();
     return derived();
   }
 
   /** Initializes the iterative solver with the matrix \a A for further solving \c Ax=b problems.
     *
-    * Currently, this function mostly initialized/compute the preconditioner. In the future
-    * we might, for instance, implement column reodering for faster matrix vector products.
+    * Currently, this function mostly initializes/computes the preconditioner. In the future
+    * we might, for instance, implement column reordering for faster matrix vector products.
     *
     * \warning this class stores a reference to the matrix A as well as some
     * precomputed values that depend on it. Therefore, if \a A is changed
     * this class becomes invalid. Call compute() to update it with the new
     * matrix A, or modify a copy of A.
     */
-  Derived& compute(const MatrixType& A)
+  template<typename MatrixDerived>
+  Derived& compute(const EigenBase<MatrixDerived>& A)
   {
-    mp_matrix = &A;
-    m_preconditioner.compute(A);
+    grab(A.derived());
+    m_preconditioner.compute(matrix());
     m_isInitialized = true;
     m_analysisIsOk = true;
     m_factorizationIsOk = true;
-    m_info = Success;
+    m_info = m_preconditioner.info();
     return derived();
   }
 
   /** \internal */
-  Index rows() const { return mp_matrix ? mp_matrix->rows() : 0; }
+  Index rows() const { return matrix().rows(); }
+
   /** \internal */
-  Index cols() const { return mp_matrix ? mp_matrix->cols() : 0; }
+  Index cols() const { return matrix().cols(); }
 
-  /** \returns the tolerance threshold used by the stopping criteria */
+  /** \returns the tolerance threshold used by the stopping criteria.
+    * \sa setTolerance()
+    */
   RealScalar tolerance() const { return m_tolerance; }
   
-  /** Sets the tolerance threshold used by the stopping criteria */
+  /** Sets the tolerance threshold used by the stopping criteria.
+    *
+    * This value is used as an upper bound to the relative residual error: |Ax-b|/|b|.
+    * The default value is the machine precision given by NumTraits<Scalar>::epsilon()
+    */
   Derived& setTolerance(const RealScalar& tolerance)
   {
     m_tolerance = tolerance;
@@ -132,58 +274,52 @@ public:
   /** \returns a read-only reference to the preconditioner. */
   const Preconditioner& preconditioner() const { return m_preconditioner; }
 
-  /** \returns the max number of iterations */
-  int maxIterations() const
+  /** \returns the max number of iterations.
+    * It is either the value setted by setMaxIterations or, by default,
+    * twice the number of columns of the matrix.
+    */
+  Index maxIterations() const
   {
-    return (mp_matrix && m_maxIterations<0) ? mp_matrix->cols() : m_maxIterations;
+    return (m_maxIterations<0) ? 2*matrix().cols() : m_maxIterations;
   }
   
-  /** Sets the max number of iterations */
-  Derived& setMaxIterations(int maxIters)
+  /** Sets the max number of iterations.
+    * Default is twice the number of columns of the matrix.
+    */
+  Derived& setMaxIterations(Index maxIters)
   {
     m_maxIterations = maxIters;
     return derived();
   }
 
   /** \returns the number of iterations performed during the last solve */
-  int iterations() const
+  Index iterations() const
   {
     eigen_assert(m_isInitialized && "ConjugateGradient is not initialized.");
     return m_iterations;
   }
 
-  /** \returns the tolerance error reached during the last solve */
+  /** \returns the tolerance error reached during the last solve.
+    * It is a close approximation of the true relative residual error |Ax-b|/|b|.
+    */
   RealScalar error() const
   {
     eigen_assert(m_isInitialized && "ConjugateGradient is not initialized.");
     return m_error;
   }
 
-  /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-    *
-    * \sa compute()
-    */
-  template<typename Rhs> inline const internal::solve_retval<Derived, Rhs>
-  solve(const MatrixBase<Rhs>& b) const
-  {
-    eigen_assert(m_isInitialized && "IterativeSolverBase is not initialized.");
-    eigen_assert(rows()==b.rows()
-              && "IterativeSolverBase::solve(): invalid number of rows of the right hand side matrix b");
-    return internal::solve_retval<Derived, Rhs>(derived(), b.derived());
-  }
-  
-  /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
+  /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A
+    * and \a x0 as an initial solution.
     *
-    * \sa compute()
+    * \sa solve(), compute()
     */
-  template<typename Rhs>
-  inline const internal::sparse_solve_retval<IterativeSolverBase, Rhs>
-  solve(const SparseMatrixBase<Rhs>& b) const
+  template<typename Rhs,typename Guess>
+  inline const SolveWithGuess<Derived, Rhs, Guess>
+  solveWithGuess(const MatrixBase<Rhs>& b, const Guess& x0) const
   {
-    eigen_assert(m_isInitialized && "IterativeSolverBase is not initialized.");
-    eigen_assert(rows()==b.rows()
-              && "IterativeSolverBase::solve(): invalid number of rows of the right hand side matrix b");
-    return internal::sparse_solve_retval<IterativeSolverBase, Rhs>(*this, b.derived());
+    eigen_assert(m_isInitialized && "Solver is not initialized.");
+    eigen_assert(derived().rows()==b.rows() && "solve(): invalid number of rows of the right hand side matrix b");
+    return SolveWithGuess<Derived, Rhs, Guess>(derived(), b.derived(), x0);
   }
 
   /** \returns Success if the iterations converged, and NoConvergence otherwise. */
@@ -194,21 +330,27 @@ public:
   }
   
   /** \internal */
-  template<typename Rhs, typename DestScalar, int DestOptions, typename DestIndex>
-  void _solve_sparse(const Rhs& b, SparseMatrix<DestScalar,DestOptions,DestIndex> &dest) const
+  template<typename Rhs, typename DestDerived>
+  void _solve_impl(const Rhs& b, SparseMatrixBase<DestDerived> &aDest) const
   {
     eigen_assert(rows()==b.rows());
     
-    int rhsCols = b.cols();
-    int size = b.rows();
+    Index rhsCols = b.cols();
+    Index size = b.rows();
+    DestDerived& dest(aDest.derived());
+    typedef typename DestDerived::Scalar DestScalar;
     Eigen::Matrix<DestScalar,Dynamic,1> tb(size);
-    Eigen::Matrix<DestScalar,Dynamic,1> tx(size);
-    for(int k=0; k<rhsCols; ++k)
+    Eigen::Matrix<DestScalar,Dynamic,1> tx(cols());
+    // We do not directly fill dest because sparse expressions have to be free of aliasing issue.
+    // For non square least-square problems, b and dest might not have the same size whereas they might alias each-other.
+    typename DestDerived::PlainObject tmp(cols(),rhsCols);
+    for(Index k=0; k<rhsCols; ++k)
     {
       tb = b.col(k);
       tx = derived().solve(tb);
-      dest.col(k) = tx.sparseView(0);
+      tmp.col(k) = tx.sparseView(0);
     }
+    dest.swap(tmp);
   }
 
 protected:
@@ -220,35 +362,33 @@ protected:
     m_maxIterations = -1;
     m_tolerance = NumTraits<Scalar>::epsilon();
   }
-  const MatrixType* mp_matrix;
+
+  typedef internal::generic_matrix_wrapper<MatrixType> MatrixWrapper;
+  typedef typename MatrixWrapper::ActualMatrixType ActualMatrixType;
+
+  const ActualMatrixType& matrix() const
+  {
+    return m_matrixWrapper.matrix();
+  }
+  
+  template<typename InputType>
+  void grab(const InputType &A)
+  {
+    m_matrixWrapper.grab(A);
+  }
+  
+  MatrixWrapper m_matrixWrapper;
   Preconditioner m_preconditioner;
 
-  int m_maxIterations;
+  Index m_maxIterations;
   RealScalar m_tolerance;
   
   mutable RealScalar m_error;
-  mutable int m_iterations;
+  mutable Index m_iterations;
   mutable ComputationInfo m_info;
-  mutable bool m_isInitialized, m_analysisIsOk, m_factorizationIsOk;
-};
-
-namespace internal {
- 
-template<typename Derived, typename Rhs>
-struct sparse_solve_retval<IterativeSolverBase<Derived>, Rhs>
-  : sparse_solve_retval_base<IterativeSolverBase<Derived>, Rhs>
-{
-  typedef IterativeSolverBase<Derived> Dec;
-  EIGEN_MAKE_SPARSE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec().derived()._solve_sparse(rhs(),dst);
-  }
+  mutable bool m_analysisIsOk, m_factorizationIsOk;
 };
 
-} // end namespace internal
-
 } // end namespace Eigen
 
 #endif // EIGEN_ITERATIVE_SOLVER_BASE_H
diff --git a/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h b/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h
new file mode 100644
index 000000000..0aea0e099
--- /dev/null
+++ b/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h
@@ -0,0 +1,216 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_LEAST_SQUARE_CONJUGATE_GRADIENT_H
+#define EIGEN_LEAST_SQUARE_CONJUGATE_GRADIENT_H
+
+namespace Eigen { 
+
+namespace internal {
+
+/** \internal Low-level conjugate gradient algorithm for least-square problems
+  * \param mat The matrix A
+  * \param rhs The right hand side vector b
+  * \param x On input and initial solution, on output the computed solution.
+  * \param precond A preconditioner being able to efficiently solve for an
+  *                approximation of A'Ax=b (regardless of b)
+  * \param iters On input the max number of iteration, on output the number of performed iterations.
+  * \param tol_error On input the tolerance error, on output an estimation of the relative error.
+  */
+template<typename MatrixType, typename Rhs, typename Dest, typename Preconditioner>
+EIGEN_DONT_INLINE
+void least_square_conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x,
+                                     const Preconditioner& precond, Index& iters,
+                                     typename Dest::RealScalar& tol_error)
+{
+  using std::sqrt;
+  using std::abs;
+  typedef typename Dest::RealScalar RealScalar;
+  typedef typename Dest::Scalar Scalar;
+  typedef Matrix<Scalar,Dynamic,1> VectorType;
+  
+  RealScalar tol = tol_error;
+  Index maxIters = iters;
+  
+  Index m = mat.rows(), n = mat.cols();
+
+  VectorType residual        = rhs - mat * x;
+  VectorType normal_residual = mat.adjoint() * residual;
+
+  RealScalar rhsNorm2 = (mat.adjoint()*rhs).squaredNorm();
+  if(rhsNorm2 == 0) 
+  {
+    x.setZero();
+    iters = 0;
+    tol_error = 0;
+    return;
+  }
+  RealScalar threshold = tol*tol*rhsNorm2;
+  RealScalar residualNorm2 = normal_residual.squaredNorm();
+  if (residualNorm2 < threshold)
+  {
+    iters = 0;
+    tol_error = sqrt(residualNorm2 / rhsNorm2);
+    return;
+  }
+  
+  VectorType p(n);
+  p = precond.solve(normal_residual);                         // initial search direction
+
+  VectorType z(n), tmp(m);
+  RealScalar absNew = numext::real(normal_residual.dot(p));  // the square of the absolute value of r scaled by invM
+  Index i = 0;
+  while(i < maxIters)
+  {
+    tmp.noalias() = mat * p;
+
+    Scalar alpha = absNew / tmp.squaredNorm();      // the amount we travel on dir
+    x += alpha * p;                                 // update solution
+    residual -= alpha * tmp;                        // update residual
+    normal_residual = mat.adjoint() * residual;     // update residual of the normal equation
+    
+    residualNorm2 = normal_residual.squaredNorm();
+    if(residualNorm2 < threshold)
+      break;
+    
+    z = precond.solve(normal_residual);             // approximately solve for "A'A z = normal_residual"
+
+    RealScalar absOld = absNew;
+    absNew = numext::real(normal_residual.dot(z));  // update the absolute value of r
+    RealScalar beta = absNew / absOld;              // calculate the Gram-Schmidt value used to create the new search direction
+    p = z + beta * p;                               // update search direction
+    i++;
+  }
+  tol_error = sqrt(residualNorm2 / rhsNorm2);
+  iters = i;
+}
+
+}
+
+template< typename _MatrixType,
+          typename _Preconditioner = LeastSquareDiagonalPreconditioner<typename _MatrixType::Scalar> >
+class LeastSquaresConjugateGradient;
+
+namespace internal {
+
+template< typename _MatrixType, typename _Preconditioner>
+struct traits<LeastSquaresConjugateGradient<_MatrixType,_Preconditioner> >
+{
+  typedef _MatrixType MatrixType;
+  typedef _Preconditioner Preconditioner;
+};
+
+}
+
+/** \ingroup IterativeLinearSolvers_Module
+  * \brief A conjugate gradient solver for sparse (or dense) least-square problems
+  *
+  * This class allows to solve for A x = b linear problems using an iterative conjugate gradient algorithm.
+  * The matrix A can be non symmetric and rectangular, but the matrix A' A should be positive-definite to guaranty stability.
+  * Otherwise, the SparseLU or SparseQR classes might be preferable.
+  * The matrix A and the vectors x and b can be either dense or sparse.
+  *
+  * \tparam _MatrixType the type of the matrix A, can be a dense or a sparse matrix.
+  * \tparam _Preconditioner the type of the preconditioner. Default is LeastSquareDiagonalPreconditioner
+  *
+  * \implsparsesolverconcept
+  * 
+  * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations()
+  * and setTolerance() methods. The defaults are the size of the problem for the maximal number of iterations
+  * and NumTraits<Scalar>::epsilon() for the tolerance.
+  * 
+  * This class can be used as the direct solver classes. Here is a typical usage example:
+    \code
+    int m=1000000, n = 10000;
+    VectorXd x(n), b(m);
+    SparseMatrix<double> A(m,n);
+    // fill A and b
+    LeastSquaresConjugateGradient<SparseMatrix<double> > lscg;
+    lscg.compute(A);
+    x = lscg.solve(b);
+    std::cout << "#iterations:     " << lscg.iterations() << std::endl;
+    std::cout << "estimated error: " << lscg.error()      << std::endl;
+    // update b, and solve again
+    x = lscg.solve(b);
+    \endcode
+  * 
+  * By default the iterations start with x=0 as an initial guess of the solution.
+  * One can control the start using the solveWithGuess() method.
+  * 
+  * \sa class ConjugateGradient, SparseLU, SparseQR
+  */
+template< typename _MatrixType, typename _Preconditioner>
+class LeastSquaresConjugateGradient : public IterativeSolverBase<LeastSquaresConjugateGradient<_MatrixType,_Preconditioner> >
+{
+  typedef IterativeSolverBase<LeastSquaresConjugateGradient> Base;
+  using Base::matrix;
+  using Base::m_error;
+  using Base::m_iterations;
+  using Base::m_info;
+  using Base::m_isInitialized;
+public:
+  typedef _MatrixType MatrixType;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef _Preconditioner Preconditioner;
+
+public:
+
+  /** Default constructor. */
+  LeastSquaresConjugateGradient() : Base() {}
+
+  /** Initialize the solver with matrix \a A for further \c Ax=b solving.
+    * 
+    * This constructor is a shortcut for the default constructor followed
+    * by a call to compute().
+    * 
+    * \warning this class stores a reference to the matrix A as well as some
+    * precomputed values that depend on it. Therefore, if \a A is changed
+    * this class becomes invalid. Call compute() to update it with the new
+    * matrix A, or modify a copy of A.
+    */
+  template<typename MatrixDerived>
+  explicit LeastSquaresConjugateGradient(const EigenBase<MatrixDerived>& A) : Base(A.derived()) {}
+
+  ~LeastSquaresConjugateGradient() {}
+
+  /** \internal */
+  template<typename Rhs,typename Dest>
+  void _solve_with_guess_impl(const Rhs& b, Dest& x) const
+  {
+    m_iterations = Base::maxIterations();
+    m_error = Base::m_tolerance;
+
+    for(Index j=0; j<b.cols(); ++j)
+    {
+      m_iterations = Base::maxIterations();
+      m_error = Base::m_tolerance;
+
+      typename Dest::ColXpr xj(x,j);
+      internal::least_square_conjugate_gradient(matrix(), b.col(j), xj, Base::m_preconditioner, m_iterations, m_error);
+    }
+
+    m_isInitialized = true;
+    m_info = m_error <= Base::m_tolerance ? Success : NoConvergence;
+  }
+  
+  /** \internal */
+  using Base::_solve_impl;
+  template<typename Rhs,typename Dest>
+  void _solve_impl(const MatrixBase<Rhs>& b, Dest& x) const
+  {
+    x.setZero();
+    _solve_with_guess_impl(b.derived(),x);
+  }
+
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_LEAST_SQUARE_CONJUGATE_GRADIENT_H
diff --git a/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h b/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h
new file mode 100644
index 000000000..0ace45177
--- /dev/null
+++ b/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h
@@ -0,0 +1,115 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SOLVEWITHGUESS_H
+#define EIGEN_SOLVEWITHGUESS_H
+
+namespace Eigen {
+
+template<typename Decomposition, typename RhsType, typename GuessType> class SolveWithGuess;
+  
+/** \class SolveWithGuess
+  * \ingroup IterativeLinearSolvers_Module
+  *
+  * \brief Pseudo expression representing a solving operation
+  *
+  * \tparam Decomposition the type of the matrix or decomposion object
+  * \tparam Rhstype the type of the right-hand side
+  *
+  * This class represents an expression of A.solve(B)
+  * and most of the time this is the only way it is used.
+  *
+  */
+namespace internal {
+
+
+template<typename Decomposition, typename RhsType, typename GuessType>
+struct traits<SolveWithGuess<Decomposition, RhsType, GuessType> >
+  : traits<Solve<Decomposition,RhsType> >
+{};
+
+}
+
+
+template<typename Decomposition, typename RhsType, typename GuessType>
+class SolveWithGuess : public internal::generic_xpr_base<SolveWithGuess<Decomposition,RhsType,GuessType>, MatrixXpr, typename internal::traits<RhsType>::StorageKind>::type
+{
+public:
+  typedef typename internal::traits<SolveWithGuess>::Scalar Scalar;
+  typedef typename internal::traits<SolveWithGuess>::PlainObject PlainObject;
+  typedef typename internal::generic_xpr_base<SolveWithGuess<Decomposition,RhsType,GuessType>, MatrixXpr, typename internal::traits<RhsType>::StorageKind>::type Base;
+  typedef typename internal::ref_selector<SolveWithGuess>::type Nested;
+  
+  SolveWithGuess(const Decomposition &dec, const RhsType &rhs, const GuessType &guess)
+    : m_dec(dec), m_rhs(rhs), m_guess(guess)
+  {}
+  
+  EIGEN_DEVICE_FUNC Index rows() const { return m_dec.cols(); }
+  EIGEN_DEVICE_FUNC Index cols() const { return m_rhs.cols(); }
+
+  EIGEN_DEVICE_FUNC const Decomposition& dec()   const { return m_dec; }
+  EIGEN_DEVICE_FUNC const RhsType&       rhs()   const { return m_rhs; }
+  EIGEN_DEVICE_FUNC const GuessType&     guess() const { return m_guess; }
+
+protected:
+  const Decomposition &m_dec;
+  const RhsType       &m_rhs;
+  const GuessType     &m_guess;
+  
+private:
+  Scalar coeff(Index row, Index col) const;
+  Scalar coeff(Index i) const;
+};
+
+namespace internal {
+
+// Evaluator of SolveWithGuess -> eval into a temporary
+template<typename Decomposition, typename RhsType, typename GuessType>
+struct evaluator<SolveWithGuess<Decomposition,RhsType, GuessType> >
+  : public evaluator<typename SolveWithGuess<Decomposition,RhsType,GuessType>::PlainObject>
+{
+  typedef SolveWithGuess<Decomposition,RhsType,GuessType> SolveType;
+  typedef typename SolveType::PlainObject PlainObject;
+  typedef evaluator<PlainObject> Base;
+
+  evaluator(const SolveType& solve)
+    : m_result(solve.rows(), solve.cols())
+  {
+    ::new (static_cast<Base*>(this)) Base(m_result);
+    m_result = solve.guess();
+    solve.dec()._solve_with_guess_impl(solve.rhs(), m_result);
+  }
+  
+protected:  
+  PlainObject m_result;
+};
+
+// Specialization for "dst = dec.solveWithGuess(rhs)"
+// NOTE we need to specialize it for Dense2Dense to avoid ambiguous specialization error and a Sparse2Sparse specialization must exist somewhere
+template<typename DstXprType, typename DecType, typename RhsType, typename GuessType, typename Scalar>
+struct Assignment<DstXprType, SolveWithGuess<DecType,RhsType,GuessType>, internal::assign_op<Scalar,Scalar>, Dense2Dense>
+{
+  typedef SolveWithGuess<DecType,RhsType,GuessType> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
+  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
+
+    dst = src.guess();
+    src.dec()._solve_with_guess_impl(src.rhs(), dst/*, src.guess()*/);
+  }
+};
+
+} // end namepsace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SOLVEWITHGUESS_H
diff --git a/Eigen/src/Jacobi/CMakeLists.txt b/Eigen/src/Jacobi/CMakeLists.txt
deleted file mode 100644
index 490dac626..000000000
--- a/Eigen/src/Jacobi/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_Jacobi_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_Jacobi_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Jacobi COMPONENT Devel
-  )
diff --git a/Eigen/src/Jacobi/Jacobi.h b/Eigen/src/Jacobi/Jacobi.h
index 956f72d57..d25af8e90 100644
--- a/Eigen/src/Jacobi/Jacobi.h
+++ b/Eigen/src/Jacobi/Jacobi.h
@@ -62,7 +62,7 @@ template<typename Scalar> class JacobiRotation
     JacobiRotation adjoint() const { using numext::conj; return JacobiRotation(conj(m_c), -m_s); }
 
     template<typename Derived>
-    bool makeJacobi(const MatrixBase<Derived>&, typename Derived::Index p, typename Derived::Index q);
+    bool makeJacobi(const MatrixBase<Derived>&, Index p, Index q);
     bool makeJacobi(const RealScalar& x, const Scalar& y, const RealScalar& z);
 
     void makeGivens(const Scalar& p, const Scalar& q, Scalar* z=0);
@@ -85,7 +85,8 @@ bool JacobiRotation<Scalar>::makeJacobi(const RealScalar& x, const Scalar& y, co
   using std::sqrt;
   using std::abs;
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  if(y == Scalar(0))
+  RealScalar deno = RealScalar(2)*abs(y);
+  if(deno < (std::numeric_limits<RealScalar>::min)())
   {
     m_c = Scalar(1);
     m_s = Scalar(0);
@@ -93,7 +94,7 @@ bool JacobiRotation<Scalar>::makeJacobi(const RealScalar& x, const Scalar& y, co
   }
   else
   {
-    RealScalar tau = (x-z)/(RealScalar(2)*abs(y));
+    RealScalar tau = (x-z)/deno;
     RealScalar w = sqrt(numext::abs2(tau) + RealScalar(1));
     RealScalar t;
     if(tau>RealScalar(0))
@@ -123,7 +124,7 @@ bool JacobiRotation<Scalar>::makeJacobi(const RealScalar& x, const Scalar& y, co
   */
 template<typename Scalar>
 template<typename Derived>
-inline bool JacobiRotation<Scalar>::makeJacobi(const MatrixBase<Derived>& m, typename Derived::Index p, typename Derived::Index q)
+inline bool JacobiRotation<Scalar>::makeJacobi(const MatrixBase<Derived>& m, Index p, Index q)
 {
   return makeJacobi(numext::real(m.coeff(p,p)), m.coeff(p,q), numext::real(m.coeff(q,q)));
 }
@@ -255,15 +256,15 @@ void JacobiRotation<Scalar>::makeGivens(const Scalar& p, const Scalar& q, Scalar
 *   Implementation of MatrixBase methods
 ****************************************************************************************/
 
+namespace internal {
 /** \jacobi_module
   * Applies the clock wise 2D rotation \a j to the set of 2D vectors of cordinates \a x and \a y:
   * \f$ \left ( \begin{array}{cc} x \\ y \end{array} \right )  =  J \left ( \begin{array}{cc} x \\ y \end{array} \right ) \f$
   *
   * \sa MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight()
   */
-namespace internal {
 template<typename VectorX, typename VectorY, typename OtherScalar>
-void apply_rotation_in_the_plane(VectorX& _x, VectorY& _y, const JacobiRotation<OtherScalar>& j);
+void apply_rotation_in_the_plane(DenseBase<VectorX>& xpr_x, DenseBase<VectorY>& xpr_y, const JacobiRotation<OtherScalar>& j);
 }
 
 /** \jacobi_module
@@ -298,19 +299,18 @@ inline void MatrixBase<Derived>::applyOnTheRight(Index p, Index q, const JacobiR
 
 namespace internal {
 template<typename VectorX, typename VectorY, typename OtherScalar>
-void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(VectorX& _x, VectorY& _y, const JacobiRotation<OtherScalar>& j)
+void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase<VectorX>& xpr_x, DenseBase<VectorY>& xpr_y, const JacobiRotation<OtherScalar>& j)
 {
-  typedef typename VectorX::Index Index;
   typedef typename VectorX::Scalar Scalar;
   enum { PacketSize = packet_traits<Scalar>::size };
   typedef typename packet_traits<Scalar>::type Packet;
-  eigen_assert(_x.size() == _y.size());
-  Index size = _x.size();
-  Index incrx = _x.innerStride();
-  Index incry = _y.innerStride();
+  eigen_assert(xpr_x.size() == xpr_y.size());
+  Index size = xpr_x.size();
+  Index incrx = xpr_x.derived().innerStride();
+  Index incry = xpr_y.derived().innerStride();
 
-  Scalar* EIGEN_RESTRICT x = &_x.coeffRef(0);
-  Scalar* EIGEN_RESTRICT y = &_y.coeffRef(0);
+  Scalar* EIGEN_RESTRICT x = &xpr_x.derived().coeffRef(0);
+  Scalar* EIGEN_RESTRICT y = &xpr_y.derived().coeffRef(0);
   
   OtherScalar c = j.c();
   OtherScalar s = j.s();
@@ -326,7 +326,7 @@ void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(VectorX& _x, VectorY& _y,
     // both vectors are sequentially stored in memory => vectorization
     enum { Peeling = 2 };
 
-    Index alignedStart = internal::first_aligned(y, size);
+    Index alignedStart = internal::first_default_aligned(y, size);
     Index alignedEnd = alignedStart + ((size-alignedStart)/PacketSize)*PacketSize;
 
     const Packet pc = pset1<Packet>(c);
@@ -344,7 +344,7 @@ void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(VectorX& _x, VectorY& _y,
     Scalar* EIGEN_RESTRICT px = x + alignedStart;
     Scalar* EIGEN_RESTRICT py = y + alignedStart;
 
-    if(internal::first_aligned(x, size)==alignedStart)
+    if(internal::first_default_aligned(x, size)==alignedStart)
     {
       for(Index i=alignedStart; i<alignedEnd; i+=PacketSize)
       {
@@ -393,7 +393,7 @@ void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(VectorX& _x, VectorY& _y,
   /*** fixed-size vectorized path ***/
   else if(VectorX::SizeAtCompileTime != Dynamic &&
           (VectorX::Flags & VectorY::Flags & PacketAccessBit) &&
-          (VectorX::Flags & VectorY::Flags & AlignedBit))
+          (EIGEN_PLAIN_ENUM_MIN(evaluator<VectorX>::Alignment, evaluator<VectorY>::Alignment)>0)) // FIXME should be compared to the required alignment
   {
     const Packet pc = pset1<Packet>(c);
     const Packet ps = pset1<Packet>(s);
diff --git a/Eigen/src/LU/CMakeLists.txt b/Eigen/src/LU/CMakeLists.txt
deleted file mode 100644
index e0d8d78c1..000000000
--- a/Eigen/src/LU/CMakeLists.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-FILE(GLOB Eigen_LU_SRCS "*.h")
-
-INSTALL(FILES 
-  ${Eigen_LU_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/LU COMPONENT Devel
-  )
-
-ADD_SUBDIRECTORY(arch)
diff --git a/Eigen/src/LU/Determinant.h b/Eigen/src/LU/Determinant.h
index bb8e78a8a..d6a3c1e5a 100644
--- a/Eigen/src/LU/Determinant.h
+++ b/Eigen/src/LU/Determinant.h
@@ -92,7 +92,7 @@ template<typename Derived>
 inline typename internal::traits<Derived>::Scalar MatrixBase<Derived>::determinant() const
 {
   eigen_assert(rows() == cols());
-  typedef typename internal::nested<Derived,Base::RowsAtCompileTime>::type Nested;
+  typedef typename internal::nested_eval<Derived,Base::RowsAtCompileTime>::type Nested;
   return internal::determinant_impl<typename internal::remove_all<Nested>::type>::run(derived());
 }
 
diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h
index 26bc71447..03b6af706 100644
--- a/Eigen/src/LU/FullPivLU.h
+++ b/Eigen/src/LU/FullPivLU.h
@@ -10,7 +10,18 @@
 #ifndef EIGEN_LU_H
 #define EIGEN_LU_H
 
-namespace Eigen { 
+namespace Eigen {
+
+namespace internal {
+template<typename _MatrixType> struct traits<FullPivLU<_MatrixType> >
+ : traits<_MatrixType>
+{
+  typedef MatrixXpr XprKind;
+  typedef SolverStorage StorageKind;
+  enum { Flags = 0 };
+};
+
+} // end namespace internal
 
 /** \ingroup LU_Module
   *
@@ -18,7 +29,7 @@ namespace Eigen {
   *
   * \brief LU decomposition of a matrix with complete pivoting, and related features
   *
-  * \param MatrixType the type of the matrix of which we are computing the LU decomposition
+  * \tparam _MatrixType the type of the matrix of which we are computing the LU decomposition
   *
   * This class represents a LU decomposition of any matrix, with complete pivoting: the matrix A is
   * decomposed as \f$ A = P^{-1} L U Q^{-1} \f$ where L is unit-lower-triangular, U is
@@ -41,27 +52,28 @@ namespace Eigen {
   * \include class_FullPivLU.cpp
   * Output: \verbinclude class_FullPivLU.out
   *
+  * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
+  * 
   * \sa MatrixBase::fullPivLu(), MatrixBase::determinant(), MatrixBase::inverse()
   */
 template<typename _MatrixType> class FullPivLU
+  : public SolverBase<FullPivLU<_MatrixType> >
 {
   public:
     typedef _MatrixType MatrixType;
+    typedef SolverBase<FullPivLU> Base;
+
+    EIGEN_GENERIC_PUBLIC_INTERFACE(FullPivLU)
+    // FIXME StorageIndex defined in EIGEN_GENERIC_PUBLIC_INTERFACE should be int
     enum {
-      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-      Options = MatrixType::Options,
       MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
       MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
     };
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
-    typedef typename internal::traits<MatrixType>::StorageKind StorageKind;
-    typedef typename MatrixType::Index Index;
-    typedef typename internal::plain_row_type<MatrixType, Index>::type IntRowVectorType;
-    typedef typename internal::plain_col_type<MatrixType, Index>::type IntColVectorType;
+    typedef typename internal::plain_row_type<MatrixType, StorageIndex>::type IntRowVectorType;
+    typedef typename internal::plain_col_type<MatrixType, StorageIndex>::type IntColVectorType;
     typedef PermutationMatrix<ColsAtCompileTime, MaxColsAtCompileTime> PermutationQType;
     typedef PermutationMatrix<RowsAtCompileTime, MaxRowsAtCompileTime> PermutationPType;
+    typedef typename MatrixType::PlainObject PlainObject;
 
     /**
       * \brief Default Constructor.
@@ -84,7 +96,17 @@ template<typename _MatrixType> class FullPivLU
       * \param matrix the matrix of which to compute the LU decomposition.
       *               It is required to be nonzero.
       */
-    FullPivLU(const MatrixType& matrix);
+    template<typename InputType>
+    explicit FullPivLU(const EigenBase<InputType>& matrix);
+
+    /** \brief Constructs a LU factorization from a given matrix
+      *
+      * This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when \c MatrixType is a Eigen::Ref.
+      *
+      * \sa FullPivLU(const EigenBase&)
+      */
+    template<typename InputType>
+    explicit FullPivLU(EigenBase<InputType>& matrix);
 
     /** Computes the LU decomposition of the given matrix.
       *
@@ -93,7 +115,12 @@ template<typename _MatrixType> class FullPivLU
       *
       * \returns a reference to *this
       */
-    FullPivLU& compute(const MatrixType& matrix);
+    template<typename InputType>
+    FullPivLU& compute(const EigenBase<InputType>& matrix) {
+      m_lu = matrix.derived();
+      computeInPlace();
+      return *this;
+    }
 
     /** \returns the LU decomposition matrix: the upper-triangular part is U, the
       * unit-lower-triangular part is L (at least for square matrices; in the non-square
@@ -129,7 +156,7 @@ template<typename _MatrixType> class FullPivLU
       *
       * \sa permutationQ()
       */
-    inline const PermutationPType& permutationP() const
+    EIGEN_DEVICE_FUNC inline const PermutationPType& permutationP() const
     {
       eigen_assert(m_isInitialized && "LU is not initialized.");
       return m_p;
@@ -166,7 +193,7 @@ template<typename _MatrixType> class FullPivLU
     }
 
     /** \returns the image of the matrix, also called its column-space. The columns of the returned matrix
-      * will form a basis of the kernel.
+      * will form a basis of the image (column-space).
       *
       * \param originalMatrix the original matrix, of which *this is the LU decomposition.
       *                       The reason why it is needed to pass it here, is that this allows
@@ -210,12 +237,22 @@ template<typename _MatrixType> class FullPivLU
       *
       * \sa TriangularView::solve(), kernel(), inverse()
       */
+    // FIXME this is a copy-paste of the base-class member to add the isInitialized assertion.
     template<typename Rhs>
-    inline const internal::solve_retval<FullPivLU, Rhs>
+    inline const Solve<FullPivLU, Rhs>
     solve(const MatrixBase<Rhs>& b) const
     {
       eigen_assert(m_isInitialized && "LU is not initialized.");
-      return internal::solve_retval<FullPivLU, Rhs>(*this, b.derived());
+      return Solve<FullPivLU, Rhs>(*this, b.derived());
+    }
+
+    /** \returns an estimate of the reciprocal condition number of the matrix of which \c *this is
+        the LU decomposition.
+      */
+    inline RealScalar rcond() const
+    {
+      eigen_assert(m_isInitialized && "PartialPivLU is not initialized.");
+      return internal::rcond_estimate_helper(m_l1_norm, *this);
     }
 
     /** \returns the determinant of the matrix of which
@@ -360,33 +397,46 @@ template<typename _MatrixType> class FullPivLU
       *
       * \sa MatrixBase::inverse()
       */
-    inline const internal::solve_retval<FullPivLU,typename MatrixType::IdentityReturnType> inverse() const
+    inline const Inverse<FullPivLU> inverse() const
     {
       eigen_assert(m_isInitialized && "LU is not initialized.");
       eigen_assert(m_lu.rows() == m_lu.cols() && "You can't take the inverse of a non-square matrix!");
-      return internal::solve_retval<FullPivLU,typename MatrixType::IdentityReturnType>
-               (*this, MatrixType::Identity(m_lu.rows(), m_lu.cols()));
+      return Inverse<FullPivLU>(*this);
     }
 
     MatrixType reconstructedMatrix() const;
 
-    inline Index rows() const { return m_lu.rows(); }
-    inline Index cols() const { return m_lu.cols(); }
+    EIGEN_DEVICE_FUNC inline Index rows() const { return m_lu.rows(); }
+    EIGEN_DEVICE_FUNC inline Index cols() const { return m_lu.cols(); }
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    template<typename RhsType, typename DstType>
+    EIGEN_DEVICE_FUNC
+    void _solve_impl(const RhsType &rhs, DstType &dst) const;
+
+    template<bool Conjugate, typename RhsType, typename DstType>
+    EIGEN_DEVICE_FUNC
+    void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const;
+    #endif
 
   protected:
-    
+
     static void check_template_parameters()
     {
       EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
     }
-    
+
+    void computeInPlace();
+
     MatrixType m_lu;
     PermutationPType m_p;
     PermutationQType m_q;
     IntColVectorType m_rowsTranspositions;
     IntRowVectorType m_colsTranspositions;
-    Index m_det_pq, m_nonzero_pivots;
+    Index m_nonzero_pivots;
+    RealScalar m_l1_norm;
     RealScalar m_maxpivot, m_prescribedThreshold;
+    signed char m_det_pq;
     bool m_isInitialized, m_usePrescribedThreshold;
 };
 
@@ -409,7 +459,8 @@ FullPivLU<MatrixType>::FullPivLU(Index rows, Index cols)
 }
 
 template<typename MatrixType>
-FullPivLU<MatrixType>::FullPivLU(const MatrixType& matrix)
+template<typename InputType>
+FullPivLU<MatrixType>::FullPivLU(const EigenBase<InputType>& matrix)
   : m_lu(matrix.rows(), matrix.cols()),
     m_p(matrix.rows()),
     m_q(matrix.cols()),
@@ -418,28 +469,41 @@ FullPivLU<MatrixType>::FullPivLU(const MatrixType& matrix)
     m_isInitialized(false),
     m_usePrescribedThreshold(false)
 {
-  compute(matrix);
+  compute(matrix.derived());
+}
+
+template<typename MatrixType>
+template<typename InputType>
+FullPivLU<MatrixType>::FullPivLU(EigenBase<InputType>& matrix)
+  : m_lu(matrix.derived()),
+    m_p(matrix.rows()),
+    m_q(matrix.cols()),
+    m_rowsTranspositions(matrix.rows()),
+    m_colsTranspositions(matrix.cols()),
+    m_isInitialized(false),
+    m_usePrescribedThreshold(false)
+{
+  computeInPlace();
 }
 
 template<typename MatrixType>
-FullPivLU<MatrixType>& FullPivLU<MatrixType>::compute(const MatrixType& matrix)
+void FullPivLU<MatrixType>::computeInPlace()
 {
   check_template_parameters();
-  
+
   // the permutations are stored as int indices, so just to be sure:
-  eigen_assert(matrix.rows()<=NumTraits<int>::highest() && matrix.cols()<=NumTraits<int>::highest());
-  
-  m_isInitialized = true;
-  m_lu = matrix;
+  eigen_assert(m_lu.rows()<=NumTraits<int>::highest() && m_lu.cols()<=NumTraits<int>::highest());
+
+  m_l1_norm = m_lu.cwiseAbs().colwise().sum().maxCoeff();
 
-  const Index size = matrix.diagonalSize();
-  const Index rows = matrix.rows();
-  const Index cols = matrix.cols();
+  const Index size = m_lu.diagonalSize();
+  const Index rows = m_lu.rows();
+  const Index cols = m_lu.cols();
 
   // will store the transpositions, before we accumulate them at the end.
   // can't accumulate on-the-fly because that will be done in reverse order for the rows.
-  m_rowsTranspositions.resize(matrix.rows());
-  m_colsTranspositions.resize(matrix.cols());
+  m_rowsTranspositions.resize(m_lu.rows());
+  m_colsTranspositions.resize(m_lu.cols());
   Index number_of_transpositions = 0; // number of NONTRIVIAL transpositions, i.e. m_rowsTranspositions[i]!=i
 
   m_nonzero_pivots = size; // the generic case is that in which all pivots are nonzero (invertible case)
@@ -451,14 +515,16 @@ FullPivLU<MatrixType>& FullPivLU<MatrixType>::compute(const MatrixType& matrix)
 
     // biggest coefficient in the remaining bottom-right corner (starting at row k, col k)
     Index row_of_biggest_in_corner, col_of_biggest_in_corner;
-    RealScalar biggest_in_corner;
+    typedef internal::scalar_score_coeff_op<Scalar> Scoring;
+    typedef typename Scoring::result_type Score;
+    Score biggest_in_corner;
     biggest_in_corner = m_lu.bottomRightCorner(rows-k, cols-k)
-                        .cwiseAbs()
+                        .unaryExpr(Scoring())
                         .maxCoeff(&row_of_biggest_in_corner, &col_of_biggest_in_corner);
     row_of_biggest_in_corner += k; // correct the values! since they were computed in the corner,
     col_of_biggest_in_corner += k; // need to add k to them.
 
-    if(biggest_in_corner==RealScalar(0))
+    if(biggest_in_corner==Score(0))
     {
       // before exiting, make sure to initialize the still uninitialized transpositions
       // in a sane state without destroying what we already have.
@@ -471,7 +537,8 @@ FullPivLU<MatrixType>& FullPivLU<MatrixType>::compute(const MatrixType& matrix)
       break;
     }
 
-    if(biggest_in_corner > m_maxpivot) m_maxpivot = biggest_in_corner;
+    RealScalar abs_pivot = internal::abs_knowing_score<Scalar>()(m_lu(row_of_biggest_in_corner, col_of_biggest_in_corner), biggest_in_corner);
+    if(abs_pivot > m_maxpivot) m_maxpivot = abs_pivot;
 
     // Now that we've found the pivot, we need to apply the row/col swaps to
     // bring it to the location (k,k).
@@ -508,7 +575,8 @@ FullPivLU<MatrixType>& FullPivLU<MatrixType>::compute(const MatrixType& matrix)
     m_q.applyTranspositionOnTheRight(k, m_colsTranspositions.coeff(k));
 
   m_det_pq = (number_of_transpositions%2) ? -1 : 1;
-  return *this;
+
+  m_isInitialized = true;
 }
 
 template<typename MatrixType>
@@ -671,64 +739,136 @@ struct image_retval<FullPivLU<_MatrixType> >
 
 /***** Implementation of solve() *****************************************************/
 
-template<typename _MatrixType, typename Rhs>
-struct solve_retval<FullPivLU<_MatrixType>, Rhs>
-  : solve_retval_base<FullPivLU<_MatrixType>, Rhs>
+} // end namespace internal
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template<typename _MatrixType>
+template<typename RhsType, typename DstType>
+void FullPivLU<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const
 {
-  EIGEN_MAKE_SOLVE_HELPERS(FullPivLU<_MatrixType>,Rhs)
+  /* The decomposition PAQ = LU can be rewritten as A = P^{-1} L U Q^{-1}.
+  * So we proceed as follows:
+  * Step 1: compute c = P * rhs.
+  * Step 2: replace c by the solution x to Lx = c. Exists because L is invertible.
+  * Step 3: replace c by the solution x to Ux = c. May or may not exist.
+  * Step 4: result = Q * c;
+  */
 
-  template<typename Dest> void evalTo(Dest& dst) const
+  const Index rows = this->rows(),
+              cols = this->cols(),
+              nonzero_pivots = this->rank();
+  eigen_assert(rhs.rows() == rows);
+  const Index smalldim = (std::min)(rows, cols);
+
+  if(nonzero_pivots == 0)
   {
-    /* The decomposition PAQ = LU can be rewritten as A = P^{-1} L U Q^{-1}.
-     * So we proceed as follows:
-     * Step 1: compute c = P * rhs.
-     * Step 2: replace c by the solution x to Lx = c. Exists because L is invertible.
-     * Step 3: replace c by the solution x to Ux = c. May or may not exist.
-     * Step 4: result = Q * c;
-     */
-
-    const Index rows = dec().rows(), cols = dec().cols(),
-              nonzero_pivots = dec().nonzeroPivots();
-    eigen_assert(rhs().rows() == rows);
-    const Index smalldim = (std::min)(rows, cols);
-
-    if(nonzero_pivots == 0)
-    {
-      dst.setZero();
-      return;
-    }
+    dst.setZero();
+    return;
+  }
+
+  typename RhsType::PlainObject c(rhs.rows(), rhs.cols());
+
+  // Step 1
+  c = permutationP() * rhs;
 
-    typename Rhs::PlainObject c(rhs().rows(), rhs().cols());
+  // Step 2
+  m_lu.topLeftCorner(smalldim,smalldim)
+      .template triangularView<UnitLower>()
+      .solveInPlace(c.topRows(smalldim));
+  if(rows>cols)
+    c.bottomRows(rows-cols) -= m_lu.bottomRows(rows-cols) * c.topRows(cols);
 
-    // Step 1
-    c = dec().permutationP() * rhs();
+  // Step 3
+  m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots)
+      .template triangularView<Upper>()
+      .solveInPlace(c.topRows(nonzero_pivots));
 
+  // Step 4
+  for(Index i = 0; i < nonzero_pivots; ++i)
+    dst.row(permutationQ().indices().coeff(i)) = c.row(i);
+  for(Index i = nonzero_pivots; i < m_lu.cols(); ++i)
+    dst.row(permutationQ().indices().coeff(i)).setZero();
+}
+
+template<typename _MatrixType>
+template<bool Conjugate, typename RhsType, typename DstType>
+void FullPivLU<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
+{
+  /* The decomposition PAQ = LU can be rewritten as A = P^{-1} L U Q^{-1},
+   * and since permutations are real and unitary, we can write this
+   * as   A^T = Q U^T L^T P,
+   * So we proceed as follows:
+   * Step 1: compute c = Q^T rhs.
+   * Step 2: replace c by the solution x to U^T x = c. May or may not exist.
+   * Step 3: replace c by the solution x to L^T x = c.
+   * Step 4: result = P^T c.
+   * If Conjugate is true, replace "^T" by "^*" above.
+   */
+
+  const Index rows = this->rows(), cols = this->cols(),
+    nonzero_pivots = this->rank();
+   eigen_assert(rhs.rows() == cols);
+  const Index smalldim = (std::min)(rows, cols);
+
+  if(nonzero_pivots == 0)
+  {
+    dst.setZero();
+    return;
+  }
+
+  typename RhsType::PlainObject c(rhs.rows(), rhs.cols());
+
+  // Step 1
+  c = permutationQ().inverse() * rhs;
+
+  if (Conjugate) {
     // Step 2
-    dec().matrixLU()
-        .topLeftCorner(smalldim,smalldim)
+    m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots)
+        .template triangularView<Upper>()
+        .adjoint()
+        .solveInPlace(c.topRows(nonzero_pivots));
+    // Step 3
+    m_lu.topLeftCorner(smalldim, smalldim)
         .template triangularView<UnitLower>()
+        .adjoint()
         .solveInPlace(c.topRows(smalldim));
-    if(rows>cols)
-    {
-      c.bottomRows(rows-cols)
-        -= dec().matrixLU().bottomRows(rows-cols)
-         * c.topRows(cols);
-    }
-
-    // Step 3
-    dec().matrixLU()
-        .topLeftCorner(nonzero_pivots, nonzero_pivots)
+  } else {
+    // Step 2
+    m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots)
         .template triangularView<Upper>()
+        .transpose()
         .solveInPlace(c.topRows(nonzero_pivots));
+    // Step 3
+    m_lu.topLeftCorner(smalldim, smalldim)
+        .template triangularView<UnitLower>()
+        .transpose()
+        .solveInPlace(c.topRows(smalldim));
+  }
+
+  // Step 4
+  PermutationPType invp = permutationP().inverse().eval();
+  for(Index i = 0; i < smalldim; ++i)
+    dst.row(invp.indices().coeff(i)) = c.row(i);
+  for(Index i = smalldim; i < rows; ++i)
+    dst.row(invp.indices().coeff(i)).setZero();
+}
+
+#endif
+
+namespace internal {
+
 
-    // Step 4
-    for(Index i = 0; i < nonzero_pivots; ++i)
-      dst.row(dec().permutationQ().indices().coeff(i)) = c.row(i);
-    for(Index i = nonzero_pivots; i < dec().matrixLU().cols(); ++i)
-      dst.row(dec().permutationQ().indices().coeff(i)).setZero();
+/***** Implementation of inverse() *****************************************************/
+template<typename DstXprType, typename MatrixType>
+struct Assignment<DstXprType, Inverse<FullPivLU<MatrixType> >, internal::assign_op<typename DstXprType::Scalar,typename FullPivLU<MatrixType>::Scalar>, Dense2Dense>
+{
+  typedef FullPivLU<MatrixType> LuType;
+  typedef Inverse<LuType> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename MatrixType::Scalar> &)
+  {
+    dst = src.nestedExpression().solve(MatrixType::Identity(src.rows(), src.cols()));
   }
 };
-
 } // end namespace internal
 
 /******* MatrixBase methods *****************************************************************/
diff --git a/Eigen/src/LU/Inverse.h b/Eigen/src/LU/InverseImpl.h
index 3cf887193..018f99b58 100644
--- a/Eigen/src/LU/Inverse.h
+++ b/Eigen/src/LU/InverseImpl.h
@@ -2,13 +2,14 @@
 // for linear algebra.
 //
 // Copyright (C) 2008-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-#ifndef EIGEN_INVERSE_H
-#define EIGEN_INVERSE_H
+#ifndef EIGEN_INVERSE_IMPL_H
+#define EIGEN_INVERSE_IMPL_H
 
 namespace Eigen { 
 
@@ -21,6 +22,7 @@ namespace internal {
 template<typename MatrixType, typename ResultType, int Size = MatrixType::RowsAtCompileTime>
 struct compute_inverse
 {
+  EIGEN_DEVICE_FUNC
   static inline void run(const MatrixType& matrix, ResultType& result)
   {
     result = matrix.partialPivLu().inverse();
@@ -37,16 +39,19 @@ struct compute_inverse_and_det_with_check { /* nothing! general case not support
 template<typename MatrixType, typename ResultType>
 struct compute_inverse<MatrixType, ResultType, 1>
 {
+  EIGEN_DEVICE_FUNC
   static inline void run(const MatrixType& matrix, ResultType& result)
   {
     typedef typename MatrixType::Scalar Scalar;
-    result.coeffRef(0,0) = Scalar(1) / matrix.coeff(0,0);
+    internal::evaluator<MatrixType> matrixEval(matrix);
+    result.coeffRef(0,0) = Scalar(1) / matrixEval.coeff(0,0);
   }
 };
 
 template<typename MatrixType, typename ResultType>
 struct compute_inverse_and_det_with_check<MatrixType, ResultType, 1>
 {
+  EIGEN_DEVICE_FUNC
   static inline void run(
     const MatrixType& matrix,
     const typename MatrixType::RealScalar& absDeterminantThreshold,
@@ -67,19 +72,21 @@ struct compute_inverse_and_det_with_check<MatrixType, ResultType, 1>
 ****************************/
 
 template<typename MatrixType, typename ResultType>
+EIGEN_DEVICE_FUNC 
 inline void compute_inverse_size2_helper(
     const MatrixType& matrix, const typename ResultType::Scalar& invdet,
     ResultType& result)
 {
-  result.coeffRef(0,0) = matrix.coeff(1,1) * invdet;
+  result.coeffRef(0,0) =  matrix.coeff(1,1) * invdet;
   result.coeffRef(1,0) = -matrix.coeff(1,0) * invdet;
   result.coeffRef(0,1) = -matrix.coeff(0,1) * invdet;
-  result.coeffRef(1,1) = matrix.coeff(0,0) * invdet;
+  result.coeffRef(1,1) =  matrix.coeff(0,0) * invdet;
 }
 
 template<typename MatrixType, typename ResultType>
 struct compute_inverse<MatrixType, ResultType, 2>
 {
+  EIGEN_DEVICE_FUNC
   static inline void run(const MatrixType& matrix, ResultType& result)
   {
     typedef typename ResultType::Scalar Scalar;
@@ -91,6 +98,7 @@ struct compute_inverse<MatrixType, ResultType, 2>
 template<typename MatrixType, typename ResultType>
 struct compute_inverse_and_det_with_check<MatrixType, ResultType, 2>
 {
+  EIGEN_DEVICE_FUNC
   static inline void run(
     const MatrixType& matrix,
     const typename MatrixType::RealScalar& absDeterminantThreshold,
@@ -114,6 +122,7 @@ struct compute_inverse_and_det_with_check<MatrixType, ResultType, 2>
 ****************************/
 
 template<typename MatrixType, int i, int j>
+EIGEN_DEVICE_FUNC 
 inline typename MatrixType::Scalar cofactor_3x3(const MatrixType& m)
 {
   enum {
@@ -127,6 +136,7 @@ inline typename MatrixType::Scalar cofactor_3x3(const MatrixType& m)
 }
 
 template<typename MatrixType, typename ResultType>
+EIGEN_DEVICE_FUNC
 inline void compute_inverse_size3_helper(
     const MatrixType& matrix,
     const typename ResultType::Scalar& invdet,
@@ -145,6 +155,7 @@ inline void compute_inverse_size3_helper(
 template<typename MatrixType, typename ResultType>
 struct compute_inverse<MatrixType, ResultType, 3>
 {
+  EIGEN_DEVICE_FUNC
   static inline void run(const MatrixType& matrix, ResultType& result)
   {
     typedef typename ResultType::Scalar Scalar;
@@ -161,6 +172,7 @@ struct compute_inverse<MatrixType, ResultType, 3>
 template<typename MatrixType, typename ResultType>
 struct compute_inverse_and_det_with_check<MatrixType, ResultType, 3>
 {
+  EIGEN_DEVICE_FUNC
   static inline void run(
     const MatrixType& matrix,
     const typename MatrixType::RealScalar& absDeterminantThreshold,
@@ -188,6 +200,7 @@ struct compute_inverse_and_det_with_check<MatrixType, ResultType, 3>
 ****************************/
 
 template<typename Derived>
+EIGEN_DEVICE_FUNC 
 inline const typename Derived::Scalar general_det3_helper
 (const MatrixBase<Derived>& matrix, int i1, int i2, int i3, int j1, int j2, int j3)
 {
@@ -196,6 +209,7 @@ inline const typename Derived::Scalar general_det3_helper
 }
 
 template<typename MatrixType, int i, int j>
+EIGEN_DEVICE_FUNC 
 inline typename MatrixType::Scalar cofactor_4x4(const MatrixType& matrix)
 {
   enum {
@@ -214,6 +228,7 @@ inline typename MatrixType::Scalar cofactor_4x4(const MatrixType& matrix)
 template<int Arch, typename Scalar, typename MatrixType, typename ResultType>
 struct compute_inverse_size4
 {
+  EIGEN_DEVICE_FUNC
   static void run(const MatrixType& matrix, ResultType& result)
   {
     result.coeffRef(0,0) =  cofactor_4x4<MatrixType,0,0>(matrix);
@@ -246,6 +261,7 @@ struct compute_inverse<MatrixType, ResultType, 4>
 template<typename MatrixType, typename ResultType>
 struct compute_inverse_and_det_with_check<MatrixType, ResultType, 4>
 {
+  EIGEN_DEVICE_FUNC
   static inline void run(
     const MatrixType& matrix,
     const typename MatrixType::RealScalar& absDeterminantThreshold,
@@ -265,38 +281,37 @@ struct compute_inverse_and_det_with_check<MatrixType, ResultType, 4>
 *** MatrixBase methods ***
 *************************/
 
-template<typename MatrixType>
-struct traits<inverse_impl<MatrixType> >
-{
-  typedef typename MatrixType::PlainObject ReturnType;
-};
-
-template<typename MatrixType>
-struct inverse_impl : public ReturnByValue<inverse_impl<MatrixType> >
-{
-  typedef typename MatrixType::Index Index;
-  typedef typename internal::eval<MatrixType>::type MatrixTypeNested;
-  typedef typename remove_all<MatrixTypeNested>::type MatrixTypeNestedCleaned;
-  MatrixTypeNested m_matrix;
-
-  inverse_impl(const MatrixType& matrix)
-    : m_matrix(matrix)
-  {}
+} // end namespace internal
 
-  inline Index rows() const { return m_matrix.rows(); }
-  inline Index cols() const { return m_matrix.cols(); }
+namespace internal {
 
-  template<typename Dest> inline void evalTo(Dest& dst) const
+// Specialization for "dense = dense_xpr.inverse()"
+template<typename DstXprType, typename XprType>
+struct Assignment<DstXprType, Inverse<XprType>, internal::assign_op<typename DstXprType::Scalar,typename XprType::Scalar>, Dense2Dense>
+{
+  typedef Inverse<XprType> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename XprType::Scalar> &)
   {
-    const int Size = EIGEN_PLAIN_ENUM_MIN(MatrixType::ColsAtCompileTime,Dest::ColsAtCompileTime);
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
+    
+    const int Size = EIGEN_PLAIN_ENUM_MIN(XprType::ColsAtCompileTime,DstXprType::ColsAtCompileTime);
     EIGEN_ONLY_USED_FOR_DEBUG(Size);
-    eigen_assert(( (Size<=1) || (Size>4) || (extract_data(m_matrix)!=extract_data(dst)))
+    eigen_assert(( (Size<=1) || (Size>4) || (extract_data(src.nestedExpression())!=extract_data(dst)))
               && "Aliasing problem detected in inverse(), you need to do inverse().eval() here.");
 
-    compute_inverse<MatrixTypeNestedCleaned, Dest>::run(m_matrix, dst);
+    typedef typename internal::nested_eval<XprType,XprType::ColsAtCompileTime>::type  ActualXprType;
+    typedef typename internal::remove_all<ActualXprType>::type                        ActualXprTypeCleanded;
+    
+    ActualXprType actual_xpr(src.nestedExpression());
+    
+    compute_inverse<ActualXprTypeCleanded, DstXprType>::run(actual_xpr, dst);
   }
 };
 
+  
 } // end namespace internal
 
 /** \lu_module
@@ -317,11 +332,11 @@ struct inverse_impl : public ReturnByValue<inverse_impl<MatrixType> >
   * \sa computeInverseAndDetWithCheck()
   */
 template<typename Derived>
-inline const internal::inverse_impl<Derived> MatrixBase<Derived>::inverse() const
+inline const Inverse<Derived> MatrixBase<Derived>::inverse() const
 {
   EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::IsInteger,THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES)
   eigen_assert(rows() == cols());
-  return internal::inverse_impl<Derived>(derived());
+  return Inverse<Derived>(derived());
 }
 
 /** \lu_module
@@ -357,7 +372,7 @@ inline void MatrixBase<Derived>::computeInverseAndDetWithCheck(
   // for larger sizes, evaluating has negligible cost and limits code size.
   typedef typename internal::conditional<
     RowsAtCompileTime == 2,
-    typename internal::remove_all<typename internal::nested<Derived, 2>::type>::type,
+    typename internal::remove_all<typename internal::nested_eval<Derived, 2>::type>::type,
     PlainObject
   >::type MatrixType;
   internal::compute_inverse_and_det_with_check<MatrixType, ResultType>::run
@@ -397,4 +412,4 @@ inline void MatrixBase<Derived>::computeInverseWithCheck(
 
 } // end namespace Eigen
 
-#endif // EIGEN_INVERSE_H
+#endif // EIGEN_INVERSE_IMPL_H
diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h
index 7d1db948c..d43961887 100644
--- a/Eigen/src/LU/PartialPivLU.h
+++ b/Eigen/src/LU/PartialPivLU.h
@@ -11,7 +11,33 @@
 #ifndef EIGEN_PARTIALLU_H
 #define EIGEN_PARTIALLU_H
 
-namespace Eigen { 
+namespace Eigen {
+
+namespace internal {
+template<typename _MatrixType> struct traits<PartialPivLU<_MatrixType> >
+ : traits<_MatrixType>
+{
+  typedef MatrixXpr XprKind;
+  typedef SolverStorage StorageKind;
+  typedef traits<_MatrixType> BaseTraits;
+  enum {
+    Flags = BaseTraits::Flags & RowMajorBit,
+    CoeffReadCost = Dynamic
+  };
+};
+
+template<typename T,typename Derived>
+struct enable_if_ref;
+// {
+//   typedef Derived type;
+// };
+
+template<typename T,typename Derived>
+struct enable_if_ref<Ref<T>,Derived> {
+  typedef Derived type;
+};
+
+} // end namespace internal
 
 /** \ingroup LU_Module
   *
@@ -19,7 +45,7 @@ namespace Eigen {
   *
   * \brief LU decomposition of a matrix with partial pivoting, and related features
   *
-  * \param MatrixType the type of the matrix of which we are computing the LU decomposition
+  * \tparam _MatrixType the type of the matrix of which we are computing the LU decomposition
   *
   * This class represents a LU decomposition of a \b square \b invertible matrix, with partial pivoting: the matrix A
   * is decomposed as A = PLU where L is unit-lower-triangular, U is upper-triangular, and P
@@ -42,34 +68,33 @@ namespace Eigen {
   *
   * The data of the LU decomposition can be directly accessed through the methods matrixLU(), permutationP().
   *
+  * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
+  * 
   * \sa MatrixBase::partialPivLu(), MatrixBase::determinant(), MatrixBase::inverse(), MatrixBase::computeInverse(), class FullPivLU
   */
 template<typename _MatrixType> class PartialPivLU
+  : public SolverBase<PartialPivLU<_MatrixType> >
 {
   public:
 
     typedef _MatrixType MatrixType;
+    typedef SolverBase<PartialPivLU> Base;
+    EIGEN_GENERIC_PUBLIC_INTERFACE(PartialPivLU)
+    // FIXME StorageIndex defined in EIGEN_GENERIC_PUBLIC_INTERFACE should be int
     enum {
-      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-      Options = MatrixType::Options,
       MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
       MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
     };
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
-    typedef typename internal::traits<MatrixType>::StorageKind StorageKind;
-    typedef typename MatrixType::Index Index;
     typedef PermutationMatrix<RowsAtCompileTime, MaxRowsAtCompileTime> PermutationType;
     typedef Transpositions<RowsAtCompileTime, MaxRowsAtCompileTime> TranspositionType;
-
+    typedef typename MatrixType::PlainObject PlainObject;
 
     /**
-    * \brief Default Constructor.
-    *
-    * The default constructor is useful in cases in which the user intends to
-    * perform decompositions via PartialPivLU::compute(const MatrixType&).
-    */
+      * \brief Default Constructor.
+      *
+      * The default constructor is useful in cases in which the user intends to
+      * perform decompositions via PartialPivLU::compute(const MatrixType&).
+      */
     PartialPivLU();
 
     /** \brief Default Constructor with memory preallocation
@@ -78,7 +103,7 @@ template<typename _MatrixType> class PartialPivLU
       * according to the specified problem \a size.
       * \sa PartialPivLU()
       */
-    PartialPivLU(Index size);
+    explicit PartialPivLU(Index size);
 
     /** Constructor.
       *
@@ -87,9 +112,25 @@ template<typename _MatrixType> class PartialPivLU
       * \warning The matrix should have full rank (e.g. if it's square, it should be invertible).
       * If you need to deal with non-full rank, use class FullPivLU instead.
       */
-    PartialPivLU(const MatrixType& matrix);
+    template<typename InputType>
+    explicit PartialPivLU(const EigenBase<InputType>& matrix);
 
-    PartialPivLU& compute(const MatrixType& matrix);
+    /** Constructor for \link InplaceDecomposition inplace decomposition \endlink
+      *
+      * \param matrix the matrix of which to compute the LU decomposition.
+      *
+      * \warning The matrix should have full rank (e.g. if it's square, it should be invertible).
+      * If you need to deal with non-full rank, use class FullPivLU instead.
+      */
+    template<typename InputType>
+    explicit PartialPivLU(EigenBase<InputType>& matrix);
+
+    template<typename InputType>
+    PartialPivLU& compute(const EigenBase<InputType>& matrix) {
+      m_lu = matrix.derived();
+      compute();
+      return *this;
+    }
 
     /** \returns the LU decomposition matrix: the upper-triangular part is U, the
       * unit-lower-triangular part is L (at least for square matrices; in the non-square
@@ -128,12 +169,22 @@ template<typename _MatrixType> class PartialPivLU
       *
       * \sa TriangularView::solve(), inverse(), computeInverse()
       */
+    // FIXME this is a copy-paste of the base-class member to add the isInitialized assertion.
     template<typename Rhs>
-    inline const internal::solve_retval<PartialPivLU, Rhs>
+    inline const Solve<PartialPivLU, Rhs>
     solve(const MatrixBase<Rhs>& b) const
     {
       eigen_assert(m_isInitialized && "PartialPivLU is not initialized.");
-      return internal::solve_retval<PartialPivLU, Rhs>(*this, b.derived());
+      return Solve<PartialPivLU, Rhs>(*this, b.derived());
+    }
+
+    /** \returns an estimate of the reciprocal condition number of the matrix of which \c *this is
+        the LU decomposition.
+      */
+    inline RealScalar rcond() const
+    {
+      eigen_assert(m_isInitialized && "PartialPivLU is not initialized.");
+      return internal::rcond_estimate_helper(m_l1_norm, *this);
     }
 
     /** \returns the inverse of the matrix of which *this is the LU decomposition.
@@ -143,11 +194,10 @@ template<typename _MatrixType> class PartialPivLU
       *
       * \sa MatrixBase::inverse(), LU::inverse()
       */
-    inline const internal::solve_retval<PartialPivLU,typename MatrixType::IdentityReturnType> inverse() const
+    inline const Inverse<PartialPivLU> inverse() const
     {
       eigen_assert(m_isInitialized && "PartialPivLU is not initialized.");
-      return internal::solve_retval<PartialPivLU,typename MatrixType::IdentityReturnType>
-               (*this, MatrixType::Identity(m_lu.rows(), m_lu.cols()));
+      return Inverse<PartialPivLU>(*this);
     }
 
     /** \returns the determinant of the matrix of which
@@ -163,24 +213,78 @@ template<typename _MatrixType> class PartialPivLU
       *
       * \sa MatrixBase::determinant()
       */
-    typename internal::traits<MatrixType>::Scalar determinant() const;
+    Scalar determinant() const;
 
     MatrixType reconstructedMatrix() const;
 
     inline Index rows() const { return m_lu.rows(); }
     inline Index cols() const { return m_lu.cols(); }
 
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    template<typename RhsType, typename DstType>
+    EIGEN_DEVICE_FUNC
+    void _solve_impl(const RhsType &rhs, DstType &dst) const {
+     /* The decomposition PA = LU can be rewritten as A = P^{-1} L U.
+      * So we proceed as follows:
+      * Step 1: compute c = Pb.
+      * Step 2: replace c by the solution x to Lx = c.
+      * Step 3: replace c by the solution x to Ux = c.
+      */
+
+      eigen_assert(rhs.rows() == m_lu.rows());
+
+      // Step 1
+      dst = permutationP() * rhs;
+
+      // Step 2
+      m_lu.template triangularView<UnitLower>().solveInPlace(dst);
+
+      // Step 3
+      m_lu.template triangularView<Upper>().solveInPlace(dst);
+    }
+
+    template<bool Conjugate, typename RhsType, typename DstType>
+    EIGEN_DEVICE_FUNC
+    void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const {
+     /* The decomposition PA = LU can be rewritten as A = P^{-1} L U.
+      * So we proceed as follows:
+      * Step 1: compute c = Pb.
+      * Step 2: replace c by the solution x to Lx = c.
+      * Step 3: replace c by the solution x to Ux = c.
+      */
+
+      eigen_assert(rhs.rows() == m_lu.cols());
+
+      if (Conjugate) {
+        // Step 1
+        dst = m_lu.template triangularView<Upper>().adjoint().solve(rhs);
+        // Step 2
+        m_lu.template triangularView<UnitLower>().adjoint().solveInPlace(dst);
+      } else {
+        // Step 1
+        dst = m_lu.template triangularView<Upper>().transpose().solve(rhs);
+        // Step 2
+        m_lu.template triangularView<UnitLower>().transpose().solveInPlace(dst);
+      }
+      // Step 3
+      dst = permutationP().transpose() * dst;
+    }
+    #endif
+
   protected:
-    
+
     static void check_template_parameters()
     {
       EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
     }
-    
+
+    void compute();
+
     MatrixType m_lu;
     PermutationType m_p;
     TranspositionType m_rowsTranspositions;
-    Index m_det_p;
+    RealScalar m_l1_norm;
+    signed char m_det_p;
     bool m_isInitialized;
 };
 
@@ -189,6 +293,7 @@ PartialPivLU<MatrixType>::PartialPivLU()
   : m_lu(),
     m_p(),
     m_rowsTranspositions(),
+    m_l1_norm(0),
     m_det_p(0),
     m_isInitialized(false)
 {
@@ -199,20 +304,36 @@ PartialPivLU<MatrixType>::PartialPivLU(Index size)
   : m_lu(size, size),
     m_p(size),
     m_rowsTranspositions(size),
+    m_l1_norm(0),
     m_det_p(0),
     m_isInitialized(false)
 {
 }
 
 template<typename MatrixType>
-PartialPivLU<MatrixType>::PartialPivLU(const MatrixType& matrix)
-  : m_lu(matrix.rows(), matrix.rows()),
+template<typename InputType>
+PartialPivLU<MatrixType>::PartialPivLU(const EigenBase<InputType>& matrix)
+  : m_lu(matrix.rows(),matrix.cols()),
     m_p(matrix.rows()),
     m_rowsTranspositions(matrix.rows()),
+    m_l1_norm(0),
     m_det_p(0),
     m_isInitialized(false)
 {
-  compute(matrix);
+  compute(matrix.derived());
+}
+
+template<typename MatrixType>
+template<typename InputType>
+PartialPivLU<MatrixType>::PartialPivLU(EigenBase<InputType>& matrix)
+  : m_lu(matrix.derived()),
+    m_p(matrix.rows()),
+    m_rowsTranspositions(matrix.rows()),
+    m_l1_norm(0),
+    m_det_p(0),
+    m_isInitialized(false)
+{
+  compute();
 }
 
 namespace internal {
@@ -230,7 +351,6 @@ struct partial_lu_impl
   typedef Block<MapLU, Dynamic, Dynamic> MatrixType;
   typedef Block<MatrixType,Dynamic,Dynamic> BlockType;
   typedef typename MatrixType::RealScalar RealScalar;
-  typedef typename MatrixType::Index Index;
 
   /** \internal performs the LU decomposition in-place of the matrix \a lu
     * using an unblocked algorithm.
@@ -244,6 +364,8 @@ struct partial_lu_impl
     */
   static Index unblocked_lu(MatrixType& lu, PivIndex* row_transpositions, PivIndex& nb_transpositions)
   {
+    typedef scalar_score_coeff_op<Scalar> Scoring;
+    typedef typename Scoring::result_type Score;
     const Index rows = lu.rows();
     const Index cols = lu.cols();
     const Index size = (std::min)(rows,cols);
@@ -253,15 +375,15 @@ struct partial_lu_impl
     {
       Index rrows = rows-k-1;
       Index rcols = cols-k-1;
-        
+
       Index row_of_biggest_in_col;
-      RealScalar biggest_in_corner
-        = lu.col(k).tail(rows-k).cwiseAbs().maxCoeff(&row_of_biggest_in_col);
+      Score biggest_in_corner
+        = lu.col(k).tail(rows-k).unaryExpr(Scoring()).maxCoeff(&row_of_biggest_in_col);
       row_of_biggest_in_col += k;
 
       row_transpositions[k] = PivIndex(row_of_biggest_in_col);
 
-      if(biggest_in_corner != RealScalar(0))
+      if(biggest_in_corner != Score(0))
       {
         if(k != row_of_biggest_in_col)
         {
@@ -354,7 +476,7 @@ struct partial_lu_impl
       // update permutations and apply them to A_0
       for(Index i=k; i<k+bs; ++i)
       {
-        Index piv = (row_transpositions[i] += k);
+        Index piv = (row_transpositions[i] += internal::convert_index<PivIndex>(k));
         A_0.row(i).swap(A_0.row(piv));
       }
 
@@ -377,45 +499,44 @@ struct partial_lu_impl
 /** \internal performs the LU decomposition with partial pivoting in-place.
   */
 template<typename MatrixType, typename TranspositionType>
-void partial_lu_inplace(MatrixType& lu, TranspositionType& row_transpositions, typename TranspositionType::Index& nb_transpositions)
+void partial_lu_inplace(MatrixType& lu, TranspositionType& row_transpositions, typename TranspositionType::StorageIndex& nb_transpositions)
 {
   eigen_assert(lu.cols() == row_transpositions.size());
   eigen_assert((&row_transpositions.coeffRef(1)-&row_transpositions.coeffRef(0)) == 1);
 
   partial_lu_impl
-    <typename MatrixType::Scalar, MatrixType::Flags&RowMajorBit?RowMajor:ColMajor, typename TranspositionType::Index>
+    <typename MatrixType::Scalar, MatrixType::Flags&RowMajorBit?RowMajor:ColMajor, typename TranspositionType::StorageIndex>
     ::blocked_lu(lu.rows(), lu.cols(), &lu.coeffRef(0,0), lu.outerStride(), &row_transpositions.coeffRef(0), nb_transpositions);
 }
 
 } // end namespace internal
 
 template<typename MatrixType>
-PartialPivLU<MatrixType>& PartialPivLU<MatrixType>::compute(const MatrixType& matrix)
+void PartialPivLU<MatrixType>::compute()
 {
   check_template_parameters();
-  
+
   // the row permutation is stored as int indices, so just to be sure:
-  eigen_assert(matrix.rows()<NumTraits<int>::highest());
-  
-  m_lu = matrix;
+  eigen_assert(m_lu.rows()<NumTraits<int>::highest());
+
+  m_l1_norm = m_lu.cwiseAbs().colwise().sum().maxCoeff();
 
-  eigen_assert(matrix.rows() == matrix.cols() && "PartialPivLU is only for square (and moreover invertible) matrices");
-  const Index size = matrix.rows();
+  eigen_assert(m_lu.rows() == m_lu.cols() && "PartialPivLU is only for square (and moreover invertible) matrices");
+  const Index size = m_lu.rows();
 
   m_rowsTranspositions.resize(size);
 
-  typename TranspositionType::Index nb_transpositions;
+  typename TranspositionType::StorageIndex nb_transpositions;
   internal::partial_lu_inplace(m_lu, m_rowsTranspositions, nb_transpositions);
   m_det_p = (nb_transpositions%2) ? -1 : 1;
 
   m_p = m_rowsTranspositions;
 
   m_isInitialized = true;
-  return *this;
 }
 
 template<typename MatrixType>
-typename internal::traits<MatrixType>::Scalar PartialPivLU<MatrixType>::determinant() const
+typename PartialPivLU<MatrixType>::Scalar PartialPivLU<MatrixType>::determinant() const
 {
   eigen_assert(m_isInitialized && "PartialPivLU is not initialized.");
   return Scalar(m_det_p) * m_lu.diagonal().prod();
@@ -438,38 +559,21 @@ MatrixType PartialPivLU<MatrixType>::reconstructedMatrix() const
   return res;
 }
 
-/***** Implementation of solve() *****************************************************/
+/***** Implementation details *****************************************************/
 
 namespace internal {
 
-template<typename _MatrixType, typename Rhs>
-struct solve_retval<PartialPivLU<_MatrixType>, Rhs>
-  : solve_retval_base<PartialPivLU<_MatrixType>, Rhs>
+/***** Implementation of inverse() *****************************************************/
+template<typename DstXprType, typename MatrixType>
+struct Assignment<DstXprType, Inverse<PartialPivLU<MatrixType> >, internal::assign_op<typename DstXprType::Scalar,typename PartialPivLU<MatrixType>::Scalar>, Dense2Dense>
 {
-  EIGEN_MAKE_SOLVE_HELPERS(PartialPivLU<_MatrixType>,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
+  typedef PartialPivLU<MatrixType> LuType;
+  typedef Inverse<LuType> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename LuType::Scalar> &)
   {
-    /* The decomposition PA = LU can be rewritten as A = P^{-1} L U.
-    * So we proceed as follows:
-    * Step 1: compute c = Pb.
-    * Step 2: replace c by the solution x to Lx = c.
-    * Step 3: replace c by the solution x to Ux = c.
-    */
-
-    eigen_assert(rhs().rows() == dec().matrixLU().rows());
-
-    // Step 1
-    dst = dec().permutationP() * rhs();
-
-    // Step 2
-    dec().matrixLU().template triangularView<UnitLower>().solveInPlace(dst);
-
-    // Step 3
-    dec().matrixLU().template triangularView<Upper>().solveInPlace(dst);
+    dst = src.nestedExpression().solve(MatrixType::Identity(src.rows(), src.cols()));
   }
 };
-
 } // end namespace internal
 
 /******** MatrixBase methods *******/
@@ -487,7 +591,6 @@ MatrixBase<Derived>::partialPivLu() const
   return PartialPivLU<PlainObject>(eval());
 }
 
-#if EIGEN2_SUPPORT_STAGE > STAGE20_RESOLVE_API_CONFLICTS
 /** \lu_module
   *
   * Synonym of partialPivLu().
@@ -502,7 +605,6 @@ MatrixBase<Derived>::lu() const
 {
   return PartialPivLU<PlainObject>(eval());
 }
-#endif
 
 } // end namespace Eigen
 
diff --git a/Eigen/src/LU/PartialPivLU_MKL.h b/Eigen/src/LU/PartialPivLU_LAPACKE.h
index 9035953c8..755168a94 100644
--- a/Eigen/src/LU/PartialPivLU_MKL.h
+++ b/Eigen/src/LU/PartialPivLU_LAPACKE.h
@@ -25,7 +25,7 @@
  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
  ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
+ *   Content : Eigen bindings to LAPACKe
  *     LU decomposition with partial pivoting based on LAPACKE_?getrf function.
  ********************************************************************************
 */
@@ -33,20 +33,18 @@
 #ifndef EIGEN_PARTIALLU_LAPACK_H
 #define EIGEN_PARTIALLU_LAPACK_H
 
-#include "Eigen/src/Core/util/MKL_support.h"
-
 namespace Eigen { 
 
 namespace internal {
 
-/** \internal Specialization for the data types supported by MKL */
+/** \internal Specialization for the data types supported by LAPACKe */
 
-#define EIGEN_MKL_LU_PARTPIV(EIGTYPE, MKLTYPE, MKLPREFIX) \
+#define EIGEN_LAPACKE_LU_PARTPIV(EIGTYPE, LAPACKE_TYPE, LAPACKE_PREFIX) \
 template<int StorageOrder> \
 struct partial_lu_impl<EIGTYPE, StorageOrder, lapack_int> \
 { \
   /* \internal performs the LU decomposition in-place of the matrix represented */ \
-  static lapack_int blocked_lu(lapack_int rows, lapack_int cols, EIGTYPE* lu_data, lapack_int luStride, lapack_int* row_transpositions, lapack_int& nb_transpositions, lapack_int maxBlockSize=256) \
+  static lapack_int blocked_lu(Index rows, Index cols, EIGTYPE* lu_data, Index luStride, lapack_int* row_transpositions, lapack_int& nb_transpositions, lapack_int maxBlockSize=256) \
   { \
     EIGEN_UNUSED_VARIABLE(maxBlockSize);\
     lapack_int matrix_order, first_zero_pivot; \
@@ -54,14 +52,14 @@ struct partial_lu_impl<EIGTYPE, StorageOrder, lapack_int> \
     EIGTYPE* a; \
 /* Set up parameters for ?getrf */ \
     matrix_order = StorageOrder==RowMajor ? LAPACK_ROW_MAJOR : LAPACK_COL_MAJOR; \
-    lda = luStride; \
+    lda = convert_index<lapack_int>(luStride); \
     a = lu_data; \
     ipiv = row_transpositions; \
-    m = rows; \
-    n = cols; \
+    m = convert_index<lapack_int>(rows); \
+    n = convert_index<lapack_int>(cols); \
     nb_transpositions = 0; \
 \
-    info = LAPACKE_##MKLPREFIX##getrf( matrix_order, m, n, (MKLTYPE*)a, lda, ipiv ); \
+    info = LAPACKE_##LAPACKE_PREFIX##getrf( matrix_order, m, n, (LAPACKE_TYPE*)a, lda, ipiv ); \
 \
     for(int i=0;i<m;i++) { ipiv[i]--; if (ipiv[i]!=i) nb_transpositions++; } \
 \
@@ -73,10 +71,10 @@ struct partial_lu_impl<EIGTYPE, StorageOrder, lapack_int> \
   } \
 };
 
-EIGEN_MKL_LU_PARTPIV(double, double, d)
-EIGEN_MKL_LU_PARTPIV(float, float, s)
-EIGEN_MKL_LU_PARTPIV(dcomplex, MKL_Complex16, z)
-EIGEN_MKL_LU_PARTPIV(scomplex, MKL_Complex8, c)
+EIGEN_LAPACKE_LU_PARTPIV(double, double, d)
+EIGEN_LAPACKE_LU_PARTPIV(float, float, s)
+EIGEN_LAPACKE_LU_PARTPIV(dcomplex, lapack_complex_double, z)
+EIGEN_LAPACKE_LU_PARTPIV(scomplex, lapack_complex_float,  c)
 
 } // end namespace internal
 
diff --git a/Eigen/src/LU/arch/CMakeLists.txt b/Eigen/src/LU/arch/CMakeLists.txt
deleted file mode 100644
index f6b7ed9ec..000000000
--- a/Eigen/src/LU/arch/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_LU_arch_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_LU_arch_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/LU/arch COMPONENT Devel
-  )
diff --git a/Eigen/src/LU/arch/Inverse_SSE.h b/Eigen/src/LU/arch/Inverse_SSE.h
index 60b7a2376..ebb64a62b 100644
--- a/Eigen/src/LU/arch/Inverse_SSE.h
+++ b/Eigen/src/LU/arch/Inverse_SSE.h
@@ -35,13 +35,15 @@ template<typename MatrixType, typename ResultType>
 struct compute_inverse_size4<Architecture::SSE, float, MatrixType, ResultType>
 {
   enum {
-    MatrixAlignment     = bool(MatrixType::Flags&AlignedBit),
-    ResultAlignment     = bool(ResultType::Flags&AlignedBit),
+    MatrixAlignment     = traits<MatrixType>::Alignment,
+    ResultAlignment     = traits<ResultType>::Alignment,
     StorageOrdersMatch  = (MatrixType::Flags&RowMajorBit) == (ResultType::Flags&RowMajorBit)
   };
+  typedef typename conditional<(MatrixType::Flags&LinearAccessBit),MatrixType const &,typename MatrixType::PlainObject>::type ActualMatrixType;
   
-  static void run(const MatrixType& matrix, ResultType& result)
+  static void run(const MatrixType& mat, ResultType& result)
   {
+    ActualMatrixType matrix(mat);
     EIGEN_ALIGN16 const unsigned int _Sign_PNNP[4] = { 0x00000000, 0x80000000, 0x80000000, 0x00000000 };
 
     // Load the full matrix into registers
@@ -151,10 +153,12 @@ struct compute_inverse_size4<Architecture::SSE, float, MatrixType, ResultType>
     iC = _mm_mul_ps(rd,iC);
     iD = _mm_mul_ps(rd,iD);
 
-    result.template writePacket<ResultAlignment>( 0, _mm_shuffle_ps(iA,iB,0x77));
-    result.template writePacket<ResultAlignment>( 4, _mm_shuffle_ps(iA,iB,0x22));
-    result.template writePacket<ResultAlignment>( 8, _mm_shuffle_ps(iC,iD,0x77));
-    result.template writePacket<ResultAlignment>(12, _mm_shuffle_ps(iC,iD,0x22));
+    Index res_stride = result.outerStride();
+    float* res = result.data();
+    pstoret<float, Packet4f, ResultAlignment>(res+0,            _mm_shuffle_ps(iA,iB,0x77));
+    pstoret<float, Packet4f, ResultAlignment>(res+res_stride,   _mm_shuffle_ps(iA,iB,0x22));
+    pstoret<float, Packet4f, ResultAlignment>(res+2*res_stride, _mm_shuffle_ps(iC,iD,0x77));
+    pstoret<float, Packet4f, ResultAlignment>(res+3*res_stride, _mm_shuffle_ps(iC,iD,0x22));
   }
 
 };
@@ -163,18 +167,21 @@ template<typename MatrixType, typename ResultType>
 struct compute_inverse_size4<Architecture::SSE, double, MatrixType, ResultType>
 {
   enum {
-    MatrixAlignment = bool(MatrixType::Flags&AlignedBit),
-    ResultAlignment = bool(ResultType::Flags&AlignedBit),
+    MatrixAlignment     = traits<MatrixType>::Alignment,
+    ResultAlignment     = traits<ResultType>::Alignment,
     StorageOrdersMatch  = (MatrixType::Flags&RowMajorBit) == (ResultType::Flags&RowMajorBit)
   };
-  static void run(const MatrixType& matrix, ResultType& result)
+  typedef typename conditional<(MatrixType::Flags&LinearAccessBit),MatrixType const &,typename MatrixType::PlainObject>::type ActualMatrixType;
+  
+  static void run(const MatrixType& mat, ResultType& result)
   {
+    ActualMatrixType matrix(mat);
     const __m128d _Sign_NP = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0));
     const __m128d _Sign_PN = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
 
     // The inverse is calculated using "Divide and Conquer" technique. The
     // original matrix is divide into four 2x2 sub-matrices. Since each
-    // register of the matrix holds two element, the smaller matrices are
+    // register of the matrix holds two elements, the smaller matrices are
     // consisted of two registers. Hence we get a better locality of the
     // calculations.
 
@@ -311,14 +318,16 @@ struct compute_inverse_size4<Architecture::SSE, double, MatrixType, ResultType>
     iC1 = _mm_sub_pd(_mm_mul_pd(B1, dC), iC1);
     iC2 = _mm_sub_pd(_mm_mul_pd(B2, dC), iC2);
 
-    result.template writePacket<ResultAlignment>( 0, _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 3), d1));     // iA# / det
-    result.template writePacket<ResultAlignment>( 4, _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 0), d2));
-    result.template writePacket<ResultAlignment>( 2, _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 3), d1));     // iB# / det
-    result.template writePacket<ResultAlignment>( 6, _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 0), d2));
-    result.template writePacket<ResultAlignment>( 8, _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 3), d1));     // iC# / det
-    result.template writePacket<ResultAlignment>(12, _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 0), d2));
-    result.template writePacket<ResultAlignment>(10, _mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 3), d1));     // iD# / det
-    result.template writePacket<ResultAlignment>(14, _mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 0), d2));
+    Index res_stride = result.outerStride();
+    double* res = result.data();
+    pstoret<double, Packet2d, ResultAlignment>(res+0,             _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 3), d1));
+    pstoret<double, Packet2d, ResultAlignment>(res+res_stride,    _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 0), d2));
+    pstoret<double, Packet2d, ResultAlignment>(res+2,             _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 3), d1));
+    pstoret<double, Packet2d, ResultAlignment>(res+res_stride+2,  _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 0), d2));
+    pstoret<double, Packet2d, ResultAlignment>(res+2*res_stride,  _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 3), d1));
+    pstoret<double, Packet2d, ResultAlignment>(res+3*res_stride,  _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 0), d2));
+    pstoret<double, Packet2d, ResultAlignment>(res+2*res_stride+2,_mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 3), d1));
+    pstoret<double, Packet2d, ResultAlignment>(res+3*res_stride+2,_mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 0), d2));
   }
 };
 
diff --git a/Eigen/src/MetisSupport/CMakeLists.txt b/Eigen/src/MetisSupport/CMakeLists.txt
deleted file mode 100644
index 2bad31416..000000000
--- a/Eigen/src/MetisSupport/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_MetisSupport_SRCS "*.h")
-
-INSTALL(FILES 
-  ${Eigen_MetisSupport_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/MetisSupport COMPONENT Devel
-  )
diff --git a/Eigen/src/MetisSupport/MetisSupport.h b/Eigen/src/MetisSupport/MetisSupport.h
index f2bbef20c..4c15304ad 100644
--- a/Eigen/src/MetisSupport/MetisSupport.h
+++ b/Eigen/src/MetisSupport/MetisSupport.h
@@ -18,12 +18,12 @@ namespace Eigen {
  * Row (column) i of A is the matperm(i) row (column) of Ap. 
  * WARNING: As computed by METIS, this corresponds to the vector iperm (instead of perm)
  */
-template <typename Index>
+template <typename StorageIndex>
 class MetisOrdering
 {
 public:
-  typedef PermutationMatrix<Dynamic,Dynamic,Index> PermutationType;
-  typedef Matrix<Index,Dynamic,1> IndexVector; 
+  typedef PermutationMatrix<Dynamic,Dynamic,StorageIndex> PermutationType;
+  typedef Matrix<StorageIndex,Dynamic,1> IndexVector; 
   
   template <typename MatrixType>
   void get_symmetrized_graph(const MatrixType& A)
@@ -36,7 +36,7 @@ public:
     Index TotNz = 0; 
     IndexVector visited(m); 
     visited.setConstant(-1); 
-    for (int j = 0; j < m; j++)
+    for (StorageIndex j = 0; j < m; j++)
     {
       // Compute the union structure of of A(j,:) and At(j,:)
       visited(j) = j; // Do not include the diagonal element
@@ -67,8 +67,8 @@ public:
 
     // Now compute the real adjacency list of each column/row 
     visited.setConstant(-1); 
-    Index CurNz = 0; 
-    for (int j = 0; j < m; j++)
+    StorageIndex CurNz = 0; 
+    for (StorageIndex j = 0; j < m; j++)
     {
       m_indexPtr(j) = CurNz; 
       
@@ -76,7 +76,7 @@ public:
       // Add the pattern of row/column j of A to A+At
       for (typename MatrixType::InnerIterator it(A,j); it; ++it)
       {
-        Index idx = it.index(); // Get the row index (for column major) or column index (for row major)
+        StorageIndex idx = it.index(); // Get the row index (for column major) or column index (for row major)
         if (visited(idx) != j ) 
         {
           visited(idx) = j; 
@@ -87,7 +87,7 @@ public:
       //Add the pattern of row/column j of At to A+At
       for (typename MatrixType::InnerIterator it(At, j); it; ++it)
       {
-        Index idx = it.index(); 
+        StorageIndex idx = it.index(); 
         if(visited(idx) != j)
         {
           visited(idx) = j; 
@@ -102,7 +102,7 @@ public:
   template <typename MatrixType>
   void operator() (const MatrixType& A, PermutationType& matperm)
   {
-     Index m = A.cols();
+     StorageIndex m = internal::convert_index<StorageIndex>(A.cols()); // must be StorageIndex, because it is passed by address to METIS
      IndexVector perm(m),iperm(m); 
     // First, symmetrize the matrix graph. 
      get_symmetrized_graph(A); 
diff --git a/Eigen/src/OrderingMethods/Amd.h b/Eigen/src/OrderingMethods/Amd.h
index 1c28bdf41..f91ecb24e 100644
--- a/Eigen/src/OrderingMethods/Amd.h
+++ b/Eigen/src/OrderingMethods/Amd.h
@@ -8,7 +8,7 @@
 NOTE: this routine has been adapted from the CSparse library:
 
 Copyright (c) 2006, Timothy A. Davis.
-http://www.cise.ufl.edu/research/sparse/CSparse
+http://www.suitesparse.com
 
 CSparse is free software; you can redistribute it and/or
 modify it under the terms of the GNU Lesser General Public
@@ -41,10 +41,10 @@ template<typename T0, typename T1> inline bool amd_marked(const T0* w, const T1&
 template<typename T0, typename T1> inline void amd_mark(const T0* w, const T1& j) { return w[j] = amd_flip(w[j]); }
 
 /* clear w */
-template<typename Index>
-static int cs_wclear (Index mark, Index lemax, Index *w, Index n)
+template<typename StorageIndex>
+static StorageIndex cs_wclear (StorageIndex mark, StorageIndex lemax, StorageIndex *w, StorageIndex n)
 {
-  Index k;
+  StorageIndex k;
   if(mark < 2 || (mark + lemax < 0))
   {
     for(k = 0; k < n; k++)
@@ -56,10 +56,10 @@ static int cs_wclear (Index mark, Index lemax, Index *w, Index n)
 }
 
 /* depth-first search and postorder of a tree rooted at node j */
-template<typename Index>
-Index cs_tdfs(Index j, Index k, Index *head, const Index *next, Index *post, Index *stack)
+template<typename StorageIndex>
+StorageIndex cs_tdfs(StorageIndex j, StorageIndex k, StorageIndex *head, const StorageIndex *next, StorageIndex *post, StorageIndex *stack)
 {
-  int i, p, top = 0;
+  StorageIndex i, p, top = 0;
   if(!head || !next || !post || !stack) return (-1);    /* check inputs */
   stack[0] = j;                 /* place j on the stack */
   while (top >= 0)                /* while (stack is not empty) */
@@ -84,42 +84,45 @@ Index cs_tdfs(Index j, Index k, Index *head, const Index *next, Index *post, Ind
 /** \internal
   * \ingroup OrderingMethods_Module 
   * Approximate minimum degree ordering algorithm.
-  * \returns the permutation P reducing the fill-in of the input matrix \a C
-  * The input matrix \a C must be a selfadjoint compressed column major SparseMatrix object. Both the upper and lower parts have to be stored, but the diagonal entries are optional.
+  *
+  * \param[in] C the input selfadjoint matrix stored in compressed column major format.
+  * \param[out] perm the permutation P reducing the fill-in of the input matrix \a C
+  *
+  * Note that the input matrix \a C must be complete, that is both the upper and lower parts have to be stored, as well as the diagonal entries.
   * On exit the values of C are destroyed */
-template<typename Scalar, typename Index>
-void minimum_degree_ordering(SparseMatrix<Scalar,ColMajor,Index>& C, PermutationMatrix<Dynamic,Dynamic,Index>& perm)
+template<typename Scalar, typename StorageIndex>
+void minimum_degree_ordering(SparseMatrix<Scalar,ColMajor,StorageIndex>& C, PermutationMatrix<Dynamic,Dynamic,StorageIndex>& perm)
 {
   using std::sqrt;
   
-  int d, dk, dext, lemax = 0, e, elenk, eln, i, j, k, k1,
-      k2, k3, jlast, ln, dense, nzmax, mindeg = 0, nvi, nvj, nvk, mark, wnvi,
-      ok, nel = 0, p, p1, p2, p3, p4, pj, pk, pk1, pk2, pn, q, t;
-  unsigned int h;
+  StorageIndex d, dk, dext, lemax = 0, e, elenk, eln, i, j, k, k1,
+                k2, k3, jlast, ln, dense, nzmax, mindeg = 0, nvi, nvj, nvk, mark, wnvi,
+                ok, nel = 0, p, p1, p2, p3, p4, pj, pk, pk1, pk2, pn, q, t, h;
   
-  Index n = C.cols();
-  dense = std::max<Index> (16, Index(10 * sqrt(double(n))));   /* find dense threshold */
-  dense = std::min<Index> (n-2, dense);
+  StorageIndex n = StorageIndex(C.cols());
+  dense = std::max<StorageIndex> (16, StorageIndex(10 * sqrt(double(n))));   /* find dense threshold */
+  dense = (std::min)(n-2, dense);
   
-  Index cnz = C.nonZeros();
+  StorageIndex cnz = StorageIndex(C.nonZeros());
   perm.resize(n+1);
   t = cnz + cnz/5 + 2*n;                 /* add elbow room to C */
   C.resizeNonZeros(t);
   
-  Index* W       = new Index[8*(n+1)]; /* get workspace */
-  Index* len     = W;
-  Index* nv      = W +   (n+1);
-  Index* next    = W + 2*(n+1);
-  Index* head    = W + 3*(n+1);
-  Index* elen    = W + 4*(n+1);
-  Index* degree  = W + 5*(n+1);
-  Index* w       = W + 6*(n+1);
-  Index* hhead   = W + 7*(n+1);
-  Index* last    = perm.indices().data();                              /* use P as workspace for last */
+  // get workspace
+  ei_declare_aligned_stack_constructed_variable(StorageIndex,W,8*(n+1),0);
+  StorageIndex* len     = W;
+  StorageIndex* nv      = W +   (n+1);
+  StorageIndex* next    = W + 2*(n+1);
+  StorageIndex* head    = W + 3*(n+1);
+  StorageIndex* elen    = W + 4*(n+1);
+  StorageIndex* degree  = W + 5*(n+1);
+  StorageIndex* w       = W + 6*(n+1);
+  StorageIndex* hhead   = W + 7*(n+1);
+  StorageIndex* last    = perm.indices().data();                              /* use P as workspace for last */
   
   /* --- Initialize quotient graph ---------------------------------------- */
-  Index* Cp = C.outerIndexPtr();
-  Index* Ci = C.innerIndexPtr();
+  StorageIndex* Cp = C.outerIndexPtr();
+  StorageIndex* Ci = C.innerIndexPtr();
   for(k = 0; k < n; k++)
     len[k] = Cp[k+1] - Cp[k];
   len[n] = 0;
@@ -136,10 +139,7 @@ void minimum_degree_ordering(SparseMatrix<Scalar,ColMajor,Index>& C, Permutation
     elen[i]   = 0;                      // Ek of node i is empty
     degree[i] = len[i];                 // degree of node i
   }
-  mark = internal::cs_wclear<Index>(0, 0, w, n);         /* clear w */
-  elen[n] = -2;                         /* n is a dead element */
-  Cp[n] = -1;                           /* n is a root of assembly tree */
-  w[n] = 0;                             /* n is a dead element */
+  mark = internal::cs_wclear<StorageIndex>(0, 0, w, n);         /* clear w */
   
   /* --- Initialize degree lists ------------------------------------------ */
   for(i = 0; i < n; i++)
@@ -153,7 +153,7 @@ void minimum_degree_ordering(SparseMatrix<Scalar,ColMajor,Index>& C, Permutation
       }
    
     d = degree[i];
-    if(d == 1)                      /* node i is empty */
+    if(d == 1 && has_diag)           /* node i is empty */
     {
       elen[i] = -2;                 /* element i is dead */
       nel++;
@@ -263,7 +263,7 @@ void minimum_degree_ordering(SparseMatrix<Scalar,ColMajor,Index>& C, Permutation
     elen[k] = -2;                     /* k is now an element */
     
     /* --- Find set differences ----------------------------------------- */
-    mark = internal::cs_wclear<Index>(mark, lemax, w, n);  /* clear w if necessary */
+    mark = internal::cs_wclear<StorageIndex>(mark, lemax, w, n);  /* clear w if necessary */
     for(pk = pk1; pk < pk2; pk++)    /* scan 1: find |Le\Lk| */
     {
       i = Ci[pk];
@@ -333,7 +333,7 @@ void minimum_degree_ordering(SparseMatrix<Scalar,ColMajor,Index>& C, Permutation
       }
       else
       {
-        degree[i] = std::min<Index> (degree[i], d);   /* update degree(i) */
+        degree[i] = std::min<StorageIndex> (degree[i], d);   /* update degree(i) */
         Ci[pn] = Ci[p3];         /* move first node to end */
         Ci[p3] = Ci[p1];         /* move 1st el. to end of Ei */
         Ci[p1] = k;               /* add k as 1st element in of Ei */
@@ -341,12 +341,12 @@ void minimum_degree_ordering(SparseMatrix<Scalar,ColMajor,Index>& C, Permutation
         h %= n;                    /* finalize hash of i */
         next[i] = hhead[h];      /* place i in hash bucket */
         hhead[h] = i;
-        last[i] = h;              /* save hash of i in last[i] */
+        last[i] = h;      /* save hash of i in last[i] */
       }
     }                                   /* scan2 is done */
     degree[k] = dk;                   /* finalize |Lk| */
-    lemax = std::max<Index>(lemax, dk);
-    mark = internal::cs_wclear<Index>(mark+lemax, lemax, w, n);    /* clear w */
+    lemax = std::max<StorageIndex>(lemax, dk);
+    mark = internal::cs_wclear<StorageIndex>(mark+lemax, lemax, w, n);    /* clear w */
     
     /* --- Supernode detection ------------------------------------------ */
     for(pk = pk1; pk < pk2; pk++)
@@ -394,12 +394,12 @@ void minimum_degree_ordering(SparseMatrix<Scalar,ColMajor,Index>& C, Permutation
       if((nvi = -nv[i]) <= 0) continue;/* skip if i is dead */
       nv[i] = nvi;                      /* restore nv[i] */
       d = degree[i] + dk - nvi;         /* compute external degree(i) */
-      d = std::min<Index> (d, n - nel - nvi);
+      d = std::min<StorageIndex> (d, n - nel - nvi);
       if(head[d] != -1) last[head[d]] = i;
       next[i] = head[d];               /* put i back in degree list */
       last[i] = -1;
       head[d] = i;
-      mindeg = std::min<Index> (mindeg, d);       /* find new minimum degree */
+      mindeg = std::min<StorageIndex> (mindeg, d);       /* find new minimum degree */
       degree[i] = d;
       Ci[p++] = i;                      /* place i in Lk */
     }
@@ -432,12 +432,10 @@ void minimum_degree_ordering(SparseMatrix<Scalar,ColMajor,Index>& C, Permutation
   }
   for(k = 0, i = 0; i <= n; i++)       /* postorder the assembly tree */
   {
-    if(Cp[i] == -1) k = internal::cs_tdfs<Index>(i, k, head, next, perm.indices().data(), w);
+    if(Cp[i] == -1) k = internal::cs_tdfs<StorageIndex>(i, k, head, next, perm.indices().data(), w);
   }
   
   perm.indices().conservativeResize(n);
-
-  delete[] W;
 }
 
 } // namespace internal
diff --git a/Eigen/src/OrderingMethods/CMakeLists.txt b/Eigen/src/OrderingMethods/CMakeLists.txt
deleted file mode 100644
index 9f4bb2758..000000000
--- a/Eigen/src/OrderingMethods/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_OrderingMethods_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_OrderingMethods_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/OrderingMethods COMPONENT Devel
-  )
diff --git a/Eigen/src/OrderingMethods/Eigen_Colamd.h b/Eigen/src/OrderingMethods/Eigen_Colamd.h
index 44548f660..933cd564b 100644
--- a/Eigen/src/OrderingMethods/Eigen_Colamd.h
+++ b/Eigen/src/OrderingMethods/Eigen_Colamd.h
@@ -41,12 +41,8 @@
 // 
 //   The colamd/symamd library is available at
 // 
-//       http://www.cise.ufl.edu/research/sparse/colamd/
+//       http://www.suitesparse.com
 
-//   This is the http://www.cise.ufl.edu/research/sparse/colamd/colamd.h
-//   file.  It is required by the colamd.c, colamdmex.c, and symamdmex.c
-//   files, and by any C code that calls the routines whose prototypes are
-//   listed below, or that uses the colamd/symamd definitions listed below.
   
 #ifndef EIGEN_COLAMD_H
 #define EIGEN_COLAMD_H
@@ -102,9 +98,6 @@ namespace internal {
 /* === Definitions ========================================================== */
 /* ========================================================================== */
 
-#define COLAMD_MAX(a,b) (((a) > (b)) ? (a) : (b))
-#define COLAMD_MIN(a,b) (((a) < (b)) ? (a) : (b))
-
 #define ONES_COMPLEMENT(r) (-(r)-1)
 
 /* -------------------------------------------------------------------------- */
@@ -135,54 +128,54 @@ namespace internal {
 /* ========================================================================== */
 
 // == Row and Column structures ==
-template <typename Index>
+template <typename IndexType>
 struct colamd_col
 {
-  Index start ;   /* index for A of first row in this column, or DEAD */
+  IndexType start ;   /* index for A of first row in this column, or DEAD */
   /* if column is dead */
-  Index length ;  /* number of rows in this column */
+  IndexType length ;  /* number of rows in this column */
   union
   {
-    Index thickness ; /* number of original columns represented by this */
+    IndexType thickness ; /* number of original columns represented by this */
     /* col, if the column is alive */
-    Index parent ;  /* parent in parent tree super-column structure, if */
+    IndexType parent ;  /* parent in parent tree super-column structure, if */
     /* the column is dead */
   } shared1 ;
   union
   {
-    Index score ; /* the score used to maintain heap, if col is alive */
-    Index order ; /* pivot ordering of this column, if col is dead */
+    IndexType score ; /* the score used to maintain heap, if col is alive */
+    IndexType order ; /* pivot ordering of this column, if col is dead */
   } shared2 ;
   union
   {
-    Index headhash ;  /* head of a hash bucket, if col is at the head of */
+    IndexType headhash ;  /* head of a hash bucket, if col is at the head of */
     /* a degree list */
-    Index hash ;  /* hash value, if col is not in a degree list */
-    Index prev ;  /* previous column in degree list, if col is in a */
+    IndexType hash ;  /* hash value, if col is not in a degree list */
+    IndexType prev ;  /* previous column in degree list, if col is in a */
     /* degree list (but not at the head of a degree list) */
   } shared3 ;
   union
   {
-    Index degree_next ; /* next column, if col is in a degree list */
-    Index hash_next ;   /* next column, if col is in a hash list */
+    IndexType degree_next ; /* next column, if col is in a degree list */
+    IndexType hash_next ;   /* next column, if col is in a hash list */
   } shared4 ;
   
 };
  
-template <typename Index>
+template <typename IndexType>
 struct Colamd_Row
 {
-  Index start ;   /* index for A of first col in this row */
-  Index length ;  /* number of principal columns in this row */
+  IndexType start ;   /* index for A of first col in this row */
+  IndexType length ;  /* number of principal columns in this row */
   union
   {
-    Index degree ;  /* number of principal & non-principal columns in row */
-    Index p ;   /* used as a row pointer in init_rows_cols () */
+    IndexType degree ;  /* number of principal & non-principal columns in row */
+    IndexType p ;   /* used as a row pointer in init_rows_cols () */
   } shared1 ;
   union
   {
-    Index mark ;  /* for computing set differences and marking dead rows*/
-    Index first_column ;/* first column in row (used in garbage collection) */
+    IndexType mark ;  /* for computing set differences and marking dead rows*/
+    IndexType first_column ;/* first column in row (used in garbage collection) */
   } shared2 ;
   
 };
@@ -202,38 +195,38 @@ struct Colamd_Row
   
   This macro is not needed when using symamd.
   
-  Explicit typecast to Index added Sept. 23, 2002, COLAMD version 2.2, to avoid
+  Explicit typecast to IndexType added Sept. 23, 2002, COLAMD version 2.2, to avoid
   gcc -pedantic warning messages.
 */
-template <typename Index>
-inline Index colamd_c(Index n_col) 
-{ return Index( ((n_col) + 1) * sizeof (colamd_col<Index>) / sizeof (Index) ) ; }
+template <typename IndexType>
+inline IndexType colamd_c(IndexType n_col) 
+{ return IndexType( ((n_col) + 1) * sizeof (colamd_col<IndexType>) / sizeof (IndexType) ) ; }
 
-template <typename Index>
-inline Index  colamd_r(Index n_row)
-{ return Index(((n_row) + 1) * sizeof (Colamd_Row<Index>) / sizeof (Index)); }
+template <typename IndexType>
+inline IndexType  colamd_r(IndexType n_row)
+{ return IndexType(((n_row) + 1) * sizeof (Colamd_Row<IndexType>) / sizeof (IndexType)); }
 
 // Prototypes of non-user callable routines
-template <typename Index>
-static Index init_rows_cols (Index n_row, Index n_col, Colamd_Row<Index> Row [], colamd_col<Index> col [], Index A [], Index p [], Index stats[COLAMD_STATS] ); 
+template <typename IndexType>
+static IndexType init_rows_cols (IndexType n_row, IndexType n_col, Colamd_Row<IndexType> Row [], colamd_col<IndexType> col [], IndexType A [], IndexType p [], IndexType stats[COLAMD_STATS] ); 
 
-template <typename Index>
-static void init_scoring (Index n_row, Index n_col, Colamd_Row<Index> Row [], colamd_col<Index> Col [], Index A [], Index head [], double knobs[COLAMD_KNOBS], Index *p_n_row2, Index *p_n_col2, Index *p_max_deg);
+template <typename IndexType>
+static void init_scoring (IndexType n_row, IndexType n_col, Colamd_Row<IndexType> Row [], colamd_col<IndexType> Col [], IndexType A [], IndexType head [], double knobs[COLAMD_KNOBS], IndexType *p_n_row2, IndexType *p_n_col2, IndexType *p_max_deg);
 
-template <typename Index>
-static Index find_ordering (Index n_row, Index n_col, Index Alen, Colamd_Row<Index> Row [], colamd_col<Index> Col [], Index A [], Index head [], Index n_col2, Index max_deg, Index pfree);
+template <typename IndexType>
+static IndexType find_ordering (IndexType n_row, IndexType n_col, IndexType Alen, Colamd_Row<IndexType> Row [], colamd_col<IndexType> Col [], IndexType A [], IndexType head [], IndexType n_col2, IndexType max_deg, IndexType pfree);
 
-template <typename Index>
-static void order_children (Index n_col, colamd_col<Index> Col [], Index p []);
+template <typename IndexType>
+static void order_children (IndexType n_col, colamd_col<IndexType> Col [], IndexType p []);
 
-template <typename Index>
-static void detect_super_cols (colamd_col<Index> Col [], Index A [], Index head [], Index row_start, Index row_length ) ;
+template <typename IndexType>
+static void detect_super_cols (colamd_col<IndexType> Col [], IndexType A [], IndexType head [], IndexType row_start, IndexType row_length ) ;
 
-template <typename Index>
-static Index garbage_collection (Index n_row, Index n_col, Colamd_Row<Index> Row [], colamd_col<Index> Col [], Index A [], Index *pfree) ;
+template <typename IndexType>
+static IndexType garbage_collection (IndexType n_row, IndexType n_col, Colamd_Row<IndexType> Row [], colamd_col<IndexType> Col [], IndexType A [], IndexType *pfree) ;
 
-template <typename Index>
-static inline  Index clear_mark (Index n_row, Colamd_Row<Index> Row [] ) ;
+template <typename IndexType>
+static inline  IndexType clear_mark (IndexType n_row, Colamd_Row<IndexType> Row [] ) ;
 
 /* === No debugging ========================================================= */
 
@@ -260,8 +253,8 @@ static inline  Index clear_mark (Index n_row, Colamd_Row<Index> Row [] ) ;
  * \param n_col number of columns in A
  * \return recommended value of Alen for use by colamd
  */
-template <typename Index>
-inline Index colamd_recommended ( Index nnz, Index n_row, Index n_col)
+template <typename IndexType>
+inline IndexType colamd_recommended ( IndexType nnz, IndexType n_row, IndexType n_col)
 {
   if ((nnz) < 0 || (n_row) < 0 || (n_col) < 0)
     return (-1);
@@ -325,22 +318,22 @@ static inline void colamd_set_defaults(double knobs[COLAMD_KNOBS])
  * \param knobs parameter settings for colamd
  * \param stats colamd output statistics and error codes
  */
-template <typename Index>
-static bool colamd(Index n_row, Index n_col, Index Alen, Index *A, Index *p, double knobs[COLAMD_KNOBS], Index stats[COLAMD_STATS])
+template <typename IndexType>
+static bool colamd(IndexType n_row, IndexType n_col, IndexType Alen, IndexType *A, IndexType *p, double knobs[COLAMD_KNOBS], IndexType stats[COLAMD_STATS])
 {
   /* === Local variables ================================================== */
   
-  Index i ;     /* loop index */
-  Index nnz ;     /* nonzeros in A */
-  Index Row_size ;    /* size of Row [], in integers */
-  Index Col_size ;    /* size of Col [], in integers */
-  Index need ;      /* minimum required length of A */
-  Colamd_Row<Index> *Row ;   /* pointer into A of Row [0..n_row] array */
-  colamd_col<Index> *Col ;   /* pointer into A of Col [0..n_col] array */
-  Index n_col2 ;    /* number of non-dense, non-empty columns */
-  Index n_row2 ;    /* number of non-dense, non-empty rows */
-  Index ngarbage ;    /* number of garbage collections performed */
-  Index max_deg ;   /* maximum row degree */
+  IndexType i ;     /* loop index */
+  IndexType nnz ;     /* nonzeros in A */
+  IndexType Row_size ;    /* size of Row [], in integers */
+  IndexType Col_size ;    /* size of Col [], in integers */
+  IndexType need ;      /* minimum required length of A */
+  Colamd_Row<IndexType> *Row ;   /* pointer into A of Row [0..n_row] array */
+  colamd_col<IndexType> *Col ;   /* pointer into A of Col [0..n_col] array */
+  IndexType n_col2 ;    /* number of non-dense, non-empty columns */
+  IndexType n_row2 ;    /* number of non-dense, non-empty rows */
+  IndexType ngarbage ;    /* number of garbage collections performed */
+  IndexType max_deg ;   /* maximum row degree */
   double default_knobs [COLAMD_KNOBS] ; /* default knobs array */
   
   
@@ -431,8 +424,8 @@ static bool colamd(Index n_row, Index n_col, Index Alen, Index *A, Index *p, dou
   }
   
   Alen -= Col_size + Row_size ;
-  Col = (colamd_col<Index> *) &A [Alen] ;
-  Row = (Colamd_Row<Index> *) &A [Alen + Col_size] ;
+  Col = (colamd_col<IndexType> *) &A [Alen] ;
+  Row = (Colamd_Row<IndexType> *) &A [Alen + Col_size] ;
 
   /* === Construct the row and column data structures ===================== */
   
@@ -485,29 +478,29 @@ static bool colamd(Index n_row, Index n_col, Index Alen, Index *A, Index *p, dou
   column form of the matrix.  Returns false if the matrix is invalid,
   true otherwise.  Not user-callable.
 */
-template <typename Index>
-static Index init_rows_cols  /* returns true if OK, or false otherwise */
+template <typename IndexType>
+static IndexType init_rows_cols  /* returns true if OK, or false otherwise */
   (
     /* === Parameters ======================================================= */
 
-    Index n_row,      /* number of rows of A */
-    Index n_col,      /* number of columns of A */
-    Colamd_Row<Index> Row [],    /* of size n_row+1 */
-    colamd_col<Index> Col [],    /* of size n_col+1 */
-    Index A [],     /* row indices of A, of size Alen */
-    Index p [],     /* pointers to columns in A, of size n_col+1 */
-    Index stats [COLAMD_STATS]  /* colamd statistics */ 
+    IndexType n_row,      /* number of rows of A */
+    IndexType n_col,      /* number of columns of A */
+    Colamd_Row<IndexType> Row [],    /* of size n_row+1 */
+    colamd_col<IndexType> Col [],    /* of size n_col+1 */
+    IndexType A [],     /* row indices of A, of size Alen */
+    IndexType p [],     /* pointers to columns in A, of size n_col+1 */
+    IndexType stats [COLAMD_STATS]  /* colamd statistics */ 
     )
 {
   /* === Local variables ================================================== */
 
-  Index col ;     /* a column index */
-  Index row ;     /* a row index */
-  Index *cp ;     /* a column pointer */
-  Index *cp_end ;   /* a pointer to the end of a column */
-  Index *rp ;     /* a row pointer */
-  Index *rp_end ;   /* a pointer to the end of a row */
-  Index last_row ;    /* previous row */
+  IndexType col ;     /* a column index */
+  IndexType row ;     /* a row index */
+  IndexType *cp ;     /* a column pointer */
+  IndexType *cp_end ;   /* a pointer to the end of a column */
+  IndexType *rp ;     /* a row pointer */
+  IndexType *rp_end ;   /* a pointer to the end of a row */
+  IndexType last_row ;    /* previous row */
 
   /* === Initialize columns, and check column pointers ==================== */
 
@@ -516,7 +509,7 @@ static Index init_rows_cols  /* returns true if OK, or false otherwise */
     Col [col].start = p [col] ;
     Col [col].length = p [col+1] - p [col] ;
 
-    if (Col [col].length < 0)
+    if ((Col [col].length) < 0) // extra parentheses to work-around gcc bug 10200
     {
       /* column pointers must be non-decreasing */
       stats [COLAMD_STATUS] = COLAMD_ERROR_col_length_negative ;
@@ -701,46 +694,46 @@ static Index init_rows_cols  /* returns true if OK, or false otherwise */
   Kills dense or empty columns and rows, calculates an initial score for
   each column, and places all columns in the degree lists.  Not user-callable.
 */
-template <typename Index>
+template <typename IndexType>
 static void init_scoring
   (
     /* === Parameters ======================================================= */
 
-    Index n_row,      /* number of rows of A */
-    Index n_col,      /* number of columns of A */
-    Colamd_Row<Index> Row [],    /* of size n_row+1 */
-    colamd_col<Index> Col [],    /* of size n_col+1 */
-    Index A [],     /* column form and row form of A */
-    Index head [],    /* of size n_col+1 */
+    IndexType n_row,      /* number of rows of A */
+    IndexType n_col,      /* number of columns of A */
+    Colamd_Row<IndexType> Row [],    /* of size n_row+1 */
+    colamd_col<IndexType> Col [],    /* of size n_col+1 */
+    IndexType A [],     /* column form and row form of A */
+    IndexType head [],    /* of size n_col+1 */
     double knobs [COLAMD_KNOBS],/* parameters */
-    Index *p_n_row2,    /* number of non-dense, non-empty rows */
-    Index *p_n_col2,    /* number of non-dense, non-empty columns */
-    Index *p_max_deg    /* maximum row degree */
+    IndexType *p_n_row2,    /* number of non-dense, non-empty rows */
+    IndexType *p_n_col2,    /* number of non-dense, non-empty columns */
+    IndexType *p_max_deg    /* maximum row degree */
     )
 {
   /* === Local variables ================================================== */
 
-  Index c ;     /* a column index */
-  Index r, row ;    /* a row index */
-  Index *cp ;     /* a column pointer */
-  Index deg ;     /* degree of a row or column */
-  Index *cp_end ;   /* a pointer to the end of a column */
-  Index *new_cp ;   /* new column pointer */
-  Index col_length ;    /* length of pruned column */
-  Index score ;     /* current column score */
-  Index n_col2 ;    /* number of non-dense, non-empty columns */
-  Index n_row2 ;    /* number of non-dense, non-empty rows */
-  Index dense_row_count ; /* remove rows with more entries than this */
-  Index dense_col_count ; /* remove cols with more entries than this */
-  Index min_score ;   /* smallest column score */
-  Index max_deg ;   /* maximum row degree */
-  Index next_col ;    /* Used to add to degree list.*/
+  IndexType c ;     /* a column index */
+  IndexType r, row ;    /* a row index */
+  IndexType *cp ;     /* a column pointer */
+  IndexType deg ;     /* degree of a row or column */
+  IndexType *cp_end ;   /* a pointer to the end of a column */
+  IndexType *new_cp ;   /* new column pointer */
+  IndexType col_length ;    /* length of pruned column */
+  IndexType score ;     /* current column score */
+  IndexType n_col2 ;    /* number of non-dense, non-empty columns */
+  IndexType n_row2 ;    /* number of non-dense, non-empty rows */
+  IndexType dense_row_count ; /* remove rows with more entries than this */
+  IndexType dense_col_count ; /* remove cols with more entries than this */
+  IndexType min_score ;   /* smallest column score */
+  IndexType max_deg ;   /* maximum row degree */
+  IndexType next_col ;    /* Used to add to degree list.*/
 
 
   /* === Extract knobs ==================================================== */
 
-  dense_row_count = COLAMD_MAX (0, COLAMD_MIN (knobs [COLAMD_DENSE_ROW] * n_col, n_col)) ;
-  dense_col_count = COLAMD_MAX (0, COLAMD_MIN (knobs [COLAMD_DENSE_COL] * n_row, n_row)) ;
+  dense_row_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [COLAMD_DENSE_ROW] * n_col), n_col)) ;
+  dense_col_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [COLAMD_DENSE_COL] * n_row), n_row)) ;
   COLAMD_DEBUG1 (("colamd: densecount: %d %d\n", dense_row_count, dense_col_count)) ;
   max_deg = 0 ;
   n_col2 = n_col ;
@@ -804,7 +797,7 @@ static void init_scoring
     else
     {
       /* keep track of max degree of remaining rows */
-      max_deg = COLAMD_MAX (max_deg, deg) ;
+      max_deg = numext::maxi(max_deg, deg) ;
     }
   }
   COLAMD_DEBUG1 (("colamd: Dense and null rows killed: %d\n", n_row - n_row2)) ;
@@ -842,10 +835,10 @@ static void init_scoring
       /* add row's external degree */
       score += Row [row].shared1.degree - 1 ;
       /* guard against integer overflow */
-      score = COLAMD_MIN (score, n_col) ;
+      score = numext::mini(score, n_col) ;
     }
     /* determine pruned column length */
-    col_length = (Index) (new_cp - &A [Col [c].start]) ;
+    col_length = (IndexType) (new_cp - &A [Col [c].start]) ;
     if (col_length == 0)
     {
       /* a newly-made null column (all rows in this col are "dense" */
@@ -914,7 +907,7 @@ static void init_scoring
       head [score] = c ;
 
       /* see if this score is less than current min */
-      min_score = COLAMD_MIN (min_score, score) ;
+      min_score = numext::mini(min_score, score) ;
 
 
     }
@@ -938,56 +931,56 @@ static void init_scoring
   (no supercolumns on input).  Uses a minimum approximate column minimum
   degree ordering method.  Not user-callable.
 */
-template <typename Index>
-static Index find_ordering /* return the number of garbage collections */
+template <typename IndexType>
+static IndexType find_ordering /* return the number of garbage collections */
   (
     /* === Parameters ======================================================= */
 
-    Index n_row,      /* number of rows of A */
-    Index n_col,      /* number of columns of A */
-    Index Alen,     /* size of A, 2*nnz + n_col or larger */
-    Colamd_Row<Index> Row [],    /* of size n_row+1 */
-    colamd_col<Index> Col [],    /* of size n_col+1 */
-    Index A [],     /* column form and row form of A */
-    Index head [],    /* of size n_col+1 */
-    Index n_col2,     /* Remaining columns to order */
-    Index max_deg,    /* Maximum row degree */
-    Index pfree     /* index of first free slot (2*nnz on entry) */
+    IndexType n_row,      /* number of rows of A */
+    IndexType n_col,      /* number of columns of A */
+    IndexType Alen,     /* size of A, 2*nnz + n_col or larger */
+    Colamd_Row<IndexType> Row [],    /* of size n_row+1 */
+    colamd_col<IndexType> Col [],    /* of size n_col+1 */
+    IndexType A [],     /* column form and row form of A */
+    IndexType head [],    /* of size n_col+1 */
+    IndexType n_col2,     /* Remaining columns to order */
+    IndexType max_deg,    /* Maximum row degree */
+    IndexType pfree     /* index of first free slot (2*nnz on entry) */
     )
 {
   /* === Local variables ================================================== */
 
-  Index k ;     /* current pivot ordering step */
-  Index pivot_col ;   /* current pivot column */
-  Index *cp ;     /* a column pointer */
-  Index *rp ;     /* a row pointer */
-  Index pivot_row ;   /* current pivot row */
-  Index *new_cp ;   /* modified column pointer */
-  Index *new_rp ;   /* modified row pointer */
-  Index pivot_row_start ; /* pointer to start of pivot row */
-  Index pivot_row_degree ;  /* number of columns in pivot row */
-  Index pivot_row_length ;  /* number of supercolumns in pivot row */
-  Index pivot_col_score ; /* score of pivot column */
-  Index needed_memory ;   /* free space needed for pivot row */
-  Index *cp_end ;   /* pointer to the end of a column */
-  Index *rp_end ;   /* pointer to the end of a row */
-  Index row ;     /* a row index */
-  Index col ;     /* a column index */
-  Index max_score ;   /* maximum possible score */
-  Index cur_score ;   /* score of current column */
+  IndexType k ;     /* current pivot ordering step */
+  IndexType pivot_col ;   /* current pivot column */
+  IndexType *cp ;     /* a column pointer */
+  IndexType *rp ;     /* a row pointer */
+  IndexType pivot_row ;   /* current pivot row */
+  IndexType *new_cp ;   /* modified column pointer */
+  IndexType *new_rp ;   /* modified row pointer */
+  IndexType pivot_row_start ; /* pointer to start of pivot row */
+  IndexType pivot_row_degree ;  /* number of columns in pivot row */
+  IndexType pivot_row_length ;  /* number of supercolumns in pivot row */
+  IndexType pivot_col_score ; /* score of pivot column */
+  IndexType needed_memory ;   /* free space needed for pivot row */
+  IndexType *cp_end ;   /* pointer to the end of a column */
+  IndexType *rp_end ;   /* pointer to the end of a row */
+  IndexType row ;     /* a row index */
+  IndexType col ;     /* a column index */
+  IndexType max_score ;   /* maximum possible score */
+  IndexType cur_score ;   /* score of current column */
   unsigned int hash ;   /* hash value for supernode detection */
-  Index head_column ;   /* head of hash bucket */
-  Index first_col ;   /* first column in hash bucket */
-  Index tag_mark ;    /* marker value for mark array */
-  Index row_mark ;    /* Row [row].shared2.mark */
-  Index set_difference ;  /* set difference size of row with pivot row */
-  Index min_score ;   /* smallest column score */
-  Index col_thickness ;   /* "thickness" (no. of columns in a supercol) */
-  Index max_mark ;    /* maximum value of tag_mark */
-  Index pivot_col_thickness ; /* number of columns represented by pivot col */
-  Index prev_col ;    /* Used by Dlist operations. */
-  Index next_col ;    /* Used by Dlist operations. */
-  Index ngarbage ;    /* number of garbage collections performed */
+  IndexType head_column ;   /* head of hash bucket */
+  IndexType first_col ;   /* first column in hash bucket */
+  IndexType tag_mark ;    /* marker value for mark array */
+  IndexType row_mark ;    /* Row [row].shared2.mark */
+  IndexType set_difference ;  /* set difference size of row with pivot row */
+  IndexType min_score ;   /* smallest column score */
+  IndexType col_thickness ;   /* "thickness" (no. of columns in a supercol) */
+  IndexType max_mark ;    /* maximum value of tag_mark */
+  IndexType pivot_col_thickness ; /* number of columns represented by pivot col */
+  IndexType prev_col ;    /* Used by Dlist operations. */
+  IndexType next_col ;    /* Used by Dlist operations. */
+  IndexType ngarbage ;    /* number of garbage collections performed */
 
 
   /* === Initialization and clear mark ==================================== */
@@ -1040,7 +1033,7 @@ static Index find_ordering /* return the number of garbage collections */
 
     /* === Garbage_collection, if necessary ============================= */
 
-    needed_memory = COLAMD_MIN (pivot_col_score, n_col - k) ;
+    needed_memory = numext::mini(pivot_col_score, n_col - k) ;
     if (pfree + needed_memory >= Alen)
     {
       pfree = Eigen::internal::garbage_collection (n_row, n_col, Row, Col, A, &A [pfree]) ;
@@ -1099,7 +1092,7 @@ static Index find_ordering /* return the number of garbage collections */
 
     /* clear tag on pivot column */
     Col [pivot_col].shared1.thickness = pivot_col_thickness ;
-    max_deg = COLAMD_MAX (max_deg, pivot_row_degree) ;
+    max_deg = numext::maxi(max_deg, pivot_row_degree) ;
 
 
     /* === Kill all rows used to construct pivot row ==================== */
@@ -1273,11 +1266,11 @@ static Index find_ordering /* return the number of garbage collections */
 	/* add set difference */
 	cur_score += row_mark - tag_mark ;
 	/* integer overflow... */
-	cur_score = COLAMD_MIN (cur_score, n_col) ;
+	cur_score = numext::mini(cur_score, n_col) ;
       }
 
       /* recompute the column's length */
-      Col [col].length = (Index) (new_cp - &A [Col [col].start]) ;
+      Col [col].length = (IndexType) (new_cp - &A [Col [col].start]) ;
 
       /* === Further mass elimination ================================= */
 
@@ -1325,7 +1318,7 @@ static Index find_ordering /* return the number of garbage collections */
 	Col [col].shared4.hash_next = first_col ;
 
 	/* save hash function in Col [col].shared3.hash */
-	Col [col].shared3.hash = (Index) hash ;
+	Col [col].shared3.hash = (IndexType) hash ;
 	COLAMD_ASSERT (COL_IS_ALIVE (col)) ;
       }
     }
@@ -1386,7 +1379,7 @@ static Index find_ordering /* return the number of garbage collections */
       cur_score -= Col [col].shared1.thickness ;
 
       /* make sure score is less or equal than the max score */
-      cur_score = COLAMD_MIN (cur_score, max_score) ;
+      cur_score = numext::mini(cur_score, max_score) ;
       COLAMD_ASSERT (cur_score >= 0) ;
 
       /* store updated score */
@@ -1409,7 +1402,7 @@ static Index find_ordering /* return the number of garbage collections */
       head [cur_score] = col ;
 
       /* see if this score is less than current min */
-      min_score = COLAMD_MIN (min_score, cur_score) ;
+      min_score = numext::mini(min_score, cur_score) ;
 
     }
 
@@ -1420,7 +1413,7 @@ static Index find_ordering /* return the number of garbage collections */
       /* update pivot row length to reflect any cols that were killed */
       /* during super-col detection and mass elimination */
       Row [pivot_row].start  = pivot_row_start ;
-      Row [pivot_row].length = (Index) (new_rp - &A[pivot_row_start]) ;
+      Row [pivot_row].length = (IndexType) (new_rp - &A[pivot_row_start]) ;
       Row [pivot_row].shared1.degree = pivot_row_degree ;
       Row [pivot_row].shared2.mark = 0 ;
       /* pivot row is no longer dead */
@@ -1449,22 +1442,22 @@ static Index find_ordering /* return the number of garbage collections */
   taken by this routine is O (n_col), that is, linear in the number of
   columns.  Not user-callable.
 */
-template <typename Index>
+template <typename IndexType>
 static inline  void order_children
 (
   /* === Parameters ======================================================= */
 
-  Index n_col,      /* number of columns of A */
-  colamd_col<Index> Col [],    /* of size n_col+1 */
-  Index p []      /* p [0 ... n_col-1] is the column permutation*/
+  IndexType n_col,      /* number of columns of A */
+  colamd_col<IndexType> Col [],    /* of size n_col+1 */
+  IndexType p []      /* p [0 ... n_col-1] is the column permutation*/
   )
 {
   /* === Local variables ================================================== */
 
-  Index i ;     /* loop counter for all columns */
-  Index c ;     /* column index */
-  Index parent ;    /* index of column's parent */
-  Index order ;     /* column's order */
+  IndexType i ;     /* loop counter for all columns */
+  IndexType c ;     /* column index */
+  IndexType parent ;    /* index of column's parent */
+  IndexType order ;     /* column's order */
 
   /* === Order each non-principal column ================================== */
 
@@ -1550,33 +1543,33 @@ static inline  void order_children
   just been computed in the approximate degree computation.
   Not user-callable.
 */
-template <typename Index>
+template <typename IndexType>
 static void detect_super_cols
 (
   /* === Parameters ======================================================= */
   
-  colamd_col<Index> Col [],    /* of size n_col+1 */
-  Index A [],     /* row indices of A */
-  Index head [],    /* head of degree lists and hash buckets */
-  Index row_start,    /* pointer to set of columns to check */
-  Index row_length    /* number of columns to check */
+  colamd_col<IndexType> Col [],    /* of size n_col+1 */
+  IndexType A [],     /* row indices of A */
+  IndexType head [],    /* head of degree lists and hash buckets */
+  IndexType row_start,    /* pointer to set of columns to check */
+  IndexType row_length    /* number of columns to check */
 )
 {
   /* === Local variables ================================================== */
 
-  Index hash ;      /* hash value for a column */
-  Index *rp ;     /* pointer to a row */
-  Index c ;     /* a column index */
-  Index super_c ;   /* column index of the column to absorb into */
-  Index *cp1 ;      /* column pointer for column super_c */
-  Index *cp2 ;      /* column pointer for column c */
-  Index length ;    /* length of column super_c */
-  Index prev_c ;    /* column preceding c in hash bucket */
-  Index i ;     /* loop counter */
-  Index *rp_end ;   /* pointer to the end of the row */
-  Index col ;     /* a column index in the row to check */
-  Index head_column ;   /* first column in hash bucket or degree list */
-  Index first_col ;   /* first column in hash bucket */
+  IndexType hash ;      /* hash value for a column */
+  IndexType *rp ;     /* pointer to a row */
+  IndexType c ;     /* a column index */
+  IndexType super_c ;   /* column index of the column to absorb into */
+  IndexType *cp1 ;      /* column pointer for column super_c */
+  IndexType *cp2 ;      /* column pointer for column c */
+  IndexType length ;    /* length of column super_c */
+  IndexType prev_c ;    /* column preceding c in hash bucket */
+  IndexType i ;     /* loop counter */
+  IndexType *rp_end ;   /* pointer to the end of the row */
+  IndexType col ;     /* a column index in the row to check */
+  IndexType head_column ;   /* first column in hash bucket or degree list */
+  IndexType first_col ;   /* first column in hash bucket */
 
   /* === Consider each column in the row ================================== */
 
@@ -1701,27 +1694,27 @@ static void detect_super_cols
   itself linear in the number of nonzeros in the input matrix.
   Not user-callable.
 */
-template <typename Index>
-static Index garbage_collection  /* returns the new value of pfree */
+template <typename IndexType>
+static IndexType garbage_collection  /* returns the new value of pfree */
   (
     /* === Parameters ======================================================= */
     
-    Index n_row,      /* number of rows */
-    Index n_col,      /* number of columns */
-    Colamd_Row<Index> Row [],    /* row info */
-    colamd_col<Index> Col [],    /* column info */
-    Index A [],     /* A [0 ... Alen-1] holds the matrix */
-    Index *pfree      /* &A [0] ... pfree is in use */
+    IndexType n_row,      /* number of rows */
+    IndexType n_col,      /* number of columns */
+    Colamd_Row<IndexType> Row [],    /* row info */
+    colamd_col<IndexType> Col [],    /* column info */
+    IndexType A [],     /* A [0 ... Alen-1] holds the matrix */
+    IndexType *pfree      /* &A [0] ... pfree is in use */
     )
 {
   /* === Local variables ================================================== */
 
-  Index *psrc ;     /* source pointer */
-  Index *pdest ;    /* destination pointer */
-  Index j ;     /* counter */
-  Index r ;     /* a row index */
-  Index c ;     /* a column index */
-  Index length ;    /* length of a row or column */
+  IndexType *psrc ;     /* source pointer */
+  IndexType *pdest ;    /* destination pointer */
+  IndexType j ;     /* counter */
+  IndexType r ;     /* a row index */
+  IndexType c ;     /* a column index */
+  IndexType length ;    /* length of a row or column */
 
   /* === Defragment the columns =========================================== */
 
@@ -1734,7 +1727,7 @@ static Index garbage_collection  /* returns the new value of pfree */
 
       /* move and compact the column */
       COLAMD_ASSERT (pdest <= psrc) ;
-      Col [c].start = (Index) (pdest - &A [0]) ;
+      Col [c].start = (IndexType) (pdest - &A [0]) ;
       length = Col [c].length ;
       for (j = 0 ; j < length ; j++)
       {
@@ -1744,7 +1737,7 @@ static Index garbage_collection  /* returns the new value of pfree */
 	  *pdest++ = r ;
 	}
       }
-      Col [c].length = (Index) (pdest - &A [Col [c].start]) ;
+      Col [c].length = (IndexType) (pdest - &A [Col [c].start]) ;
     }
   }
 
@@ -1791,7 +1784,7 @@ static Index garbage_collection  /* returns the new value of pfree */
 
       /* move and compact the row */
       COLAMD_ASSERT (pdest <= psrc) ;
-      Row [r].start = (Index) (pdest - &A [0]) ;
+      Row [r].start = (IndexType) (pdest - &A [0]) ;
       length = Row [r].length ;
       for (j = 0 ; j < length ; j++)
       {
@@ -1801,7 +1794,7 @@ static Index garbage_collection  /* returns the new value of pfree */
 	  *pdest++ = c ;
 	}
       }
-      Row [r].length = (Index) (pdest - &A [Row [r].start]) ;
+      Row [r].length = (IndexType) (pdest - &A [Row [r].start]) ;
 
     }
   }
@@ -1810,7 +1803,7 @@ static Index garbage_collection  /* returns the new value of pfree */
 
   /* === Return the new value of pfree ==================================== */
 
-  return ((Index) (pdest - &A [0])) ;
+  return ((IndexType) (pdest - &A [0])) ;
 }
 
 
@@ -1822,18 +1815,18 @@ static Index garbage_collection  /* returns the new value of pfree */
   Clears the Row [].shared2.mark array, and returns the new tag_mark.
   Return value is the new tag_mark.  Not user-callable.
 */
-template <typename Index>
-static inline  Index clear_mark  /* return the new value for tag_mark */
+template <typename IndexType>
+static inline  IndexType clear_mark  /* return the new value for tag_mark */
   (
       /* === Parameters ======================================================= */
 
-    Index n_row,    /* number of rows in A */
-    Colamd_Row<Index> Row [] /* Row [0 ... n_row-1].shared2.mark is set to zero */
+    IndexType n_row,    /* number of rows in A */
+    Colamd_Row<IndexType> Row [] /* Row [0 ... n_row-1].shared2.mark is set to zero */
     )
 {
   /* === Local variables ================================================== */
 
-  Index r ;
+  IndexType r ;
 
   for (r = 0 ; r < n_row ; r++)
   {
diff --git a/Eigen/src/OrderingMethods/Ordering.h b/Eigen/src/OrderingMethods/Ordering.h
index f3c31f9cb..7ea9b14d7 100644
--- a/Eigen/src/OrderingMethods/Ordering.h
+++ b/Eigen/src/OrderingMethods/Ordering.h
@@ -19,20 +19,21 @@ namespace internal {
     
 /** \internal
   * \ingroup OrderingMethods_Module
-  * \returns the symmetric pattern A^T+A from the input matrix A. 
+  * \param[in] A the input non-symmetric matrix
+  * \param[out] symmat the symmetric pattern A^T+A from the input matrix \a A.
   * FIXME: The values should not be considered here
   */
 template<typename MatrixType> 
-void ordering_helper_at_plus_a(const MatrixType& mat, MatrixType& symmat)
+void ordering_helper_at_plus_a(const MatrixType& A, MatrixType& symmat)
 {
   MatrixType C;
-  C = mat.transpose(); // NOTE: Could be  costly
+  C = A.transpose(); // NOTE: Could be  costly
   for (int i = 0; i < C.rows(); i++) 
   {
       for (typename MatrixType::InnerIterator it(C, i); it; ++it)
         it.valueRef() = 0.0;
   }
-  symmat = C + mat; 
+  symmat = C + A;
 }
     
 }
@@ -44,14 +45,14 @@ void ordering_helper_at_plus_a(const MatrixType& mat, MatrixType& symmat)
   *
   * Functor computing the \em approximate \em minimum \em degree ordering
   * If the matrix is not structurally symmetric, an ordering of A^T+A is computed
-  * \tparam  Index The type of indices of the matrix 
+  * \tparam  StorageIndex The type of indices of the matrix 
   * \sa COLAMDOrdering
   */
-template <typename Index>
+template <typename StorageIndex>
 class AMDOrdering
 {
   public:
-    typedef PermutationMatrix<Dynamic, Dynamic, Index> PermutationType;
+    typedef PermutationMatrix<Dynamic, Dynamic, StorageIndex> PermutationType;
     
     /** Compute the permutation vector from a sparse matrix
      * This routine is much faster if the input matrix is column-major     
@@ -60,7 +61,7 @@ class AMDOrdering
     void operator()(const MatrixType& mat, PermutationType& perm)
     {
       // Compute the symmetric pattern
-      SparseMatrix<typename MatrixType::Scalar, ColMajor, Index> symm;
+      SparseMatrix<typename MatrixType::Scalar, ColMajor, StorageIndex> symm;
       internal::ordering_helper_at_plus_a(mat,symm); 
     
       // Call the AMD routine 
@@ -72,7 +73,7 @@ class AMDOrdering
     template <typename SrcType, unsigned int SrcUpLo> 
     void operator()(const SparseSelfAdjointView<SrcType, SrcUpLo>& mat, PermutationType& perm)
     { 
-      SparseMatrix<typename SrcType::Scalar, ColMajor, Index> C; C = mat;
+      SparseMatrix<typename SrcType::Scalar, ColMajor, StorageIndex> C; C = mat;
       
       // Call the AMD routine 
       // m_mat.prune(keep_diag()); //Remove the diagonal elements 
@@ -88,13 +89,13 @@ class AMDOrdering
   * Functor computing the natural ordering (identity)
   * 
   * \note Returns an empty permutation matrix
-  * \tparam  Index The type of indices of the matrix 
+  * \tparam  StorageIndex The type of indices of the matrix 
   */
-template <typename Index>
+template <typename StorageIndex>
 class NaturalOrdering
 {
   public:
-    typedef PermutationMatrix<Dynamic, Dynamic, Index> PermutationType;
+    typedef PermutationMatrix<Dynamic, Dynamic, StorageIndex> PermutationType;
     
     /** Compute the permutation vector from a column-major sparse matrix */
     template <typename MatrixType>
@@ -108,15 +109,17 @@ class NaturalOrdering
 /** \ingroup OrderingMethods_Module
   * \class COLAMDOrdering
   *
+  * \tparam  StorageIndex The type of indices of the matrix 
+  * 
   * Functor computing the \em column \em approximate \em minimum \em degree ordering 
   * The matrix should be in column-major and \b compressed format (see SparseMatrix::makeCompressed()).
   */
-template<typename Index>
+template<typename StorageIndex>
 class COLAMDOrdering
 {
   public:
-    typedef PermutationMatrix<Dynamic, Dynamic, Index> PermutationType; 
-    typedef Matrix<Index, Dynamic, 1> IndexVector;
+    typedef PermutationMatrix<Dynamic, Dynamic, StorageIndex> PermutationType; 
+    typedef Matrix<StorageIndex, Dynamic, 1> IndexVector;
     
     /** Compute the permutation vector \a perm form the sparse matrix \a mat
       * \warning The input sparse matrix \a mat must be in compressed mode (see SparseMatrix::makeCompressed()).
@@ -126,26 +129,26 @@ class COLAMDOrdering
     {
       eigen_assert(mat.isCompressed() && "COLAMDOrdering requires a sparse matrix in compressed mode. Call .makeCompressed() before passing it to COLAMDOrdering");
       
-      Index m = mat.rows();
-      Index n = mat.cols();
-      Index nnz = mat.nonZeros();
+      StorageIndex m = StorageIndex(mat.rows());
+      StorageIndex n = StorageIndex(mat.cols());
+      StorageIndex nnz = StorageIndex(mat.nonZeros());
       // Get the recommended value of Alen to be used by colamd
-      Index Alen = internal::colamd_recommended(nnz, m, n); 
+      StorageIndex Alen = internal::colamd_recommended(nnz, m, n); 
       // Set the default parameters
       double knobs [COLAMD_KNOBS]; 
-      Index stats [COLAMD_STATS];
+      StorageIndex stats [COLAMD_STATS];
       internal::colamd_set_defaults(knobs);
       
       IndexVector p(n+1), A(Alen); 
-      for(Index i=0; i <= n; i++)   p(i) = mat.outerIndexPtr()[i];
-      for(Index i=0; i < nnz; i++)  A(i) = mat.innerIndexPtr()[i];
+      for(StorageIndex i=0; i <= n; i++)   p(i) = mat.outerIndexPtr()[i];
+      for(StorageIndex i=0; i < nnz; i++)  A(i) = mat.innerIndexPtr()[i];
       // Call Colamd routine to compute the ordering 
-      Index info = internal::colamd(m, n, Alen, A.data(), p.data(), knobs, stats); 
+      StorageIndex info = internal::colamd(m, n, Alen, A.data(), p.data(), knobs, stats); 
       EIGEN_UNUSED_VARIABLE(info);
       eigen_assert( info && "COLAMD failed " );
       
       perm.resize(n);
-      for (Index i = 0; i < n; i++) perm.indices()(p(i)) = i;
+      for (StorageIndex i = 0; i < n; i++) perm.indices()(p(i)) = i;
     }
 };
 
diff --git a/Eigen/src/PaStiXSupport/CMakeLists.txt b/Eigen/src/PaStiXSupport/CMakeLists.txt
deleted file mode 100644
index 28c657e9b..000000000
--- a/Eigen/src/PaStiXSupport/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_PastixSupport_SRCS "*.h")
-
-INSTALL(FILES 
-  ${Eigen_PastixSupport_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/PaStiXSupport COMPONENT Devel
-  )
diff --git a/Eigen/src/PaStiXSupport/PaStiXSupport.h b/Eigen/src/PaStiXSupport/PaStiXSupport.h
index a955287d1..d2ebfd7bb 100644
--- a/Eigen/src/PaStiXSupport/PaStiXSupport.h
+++ b/Eigen/src/PaStiXSupport/PaStiXSupport.h
@@ -12,6 +12,14 @@
 
 namespace Eigen { 
 
+#if defined(DCOMPLEX)
+  #define PASTIX_COMPLEX  COMPLEX
+  #define PASTIX_DCOMPLEX DCOMPLEX
+#else
+  #define PASTIX_COMPLEX  std::complex<float>
+  #define PASTIX_DCOMPLEX std::complex<double>
+#endif
+
 /** \ingroup PaStiXSupport_Module
   * \brief Interface to the PaStix solver
   * 
@@ -35,7 +43,7 @@ namespace internal
     typedef _MatrixType MatrixType;
     typedef typename _MatrixType::Scalar Scalar;
     typedef typename _MatrixType::RealScalar RealScalar;
-    typedef typename _MatrixType::Index Index;
+    typedef typename _MatrixType::StorageIndex StorageIndex;
   };
 
   template<typename _MatrixType, int Options>
@@ -44,7 +52,7 @@ namespace internal
     typedef _MatrixType MatrixType;
     typedef typename _MatrixType::Scalar Scalar;
     typedef typename _MatrixType::RealScalar RealScalar;
-    typedef typename _MatrixType::Index Index;
+    typedef typename _MatrixType::StorageIndex StorageIndex;
   };
 
   template<typename _MatrixType, int Options>
@@ -53,7 +61,7 @@ namespace internal
     typedef _MatrixType MatrixType;
     typedef typename _MatrixType::Scalar Scalar;
     typedef typename _MatrixType::RealScalar RealScalar;
-    typedef typename _MatrixType::Index Index;
+    typedef typename _MatrixType::StorageIndex StorageIndex;
   };
   
   void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, float *vals, int *perm, int * invp, float *x, int nbrhs, int *iparm, double *dparm)
@@ -74,14 +82,14 @@ namespace internal
   {
     if (n == 0) { ptr = NULL; idx = NULL; vals = NULL; }
     if (nbrhs == 0) {x = NULL; nbrhs=1;}
-    c_pastix(pastix_data, pastix_comm, n, ptr, idx, reinterpret_cast<COMPLEX*>(vals), perm, invp, reinterpret_cast<COMPLEX*>(x), nbrhs, iparm, dparm); 
+    c_pastix(pastix_data, pastix_comm, n, ptr, idx, reinterpret_cast<PASTIX_COMPLEX*>(vals), perm, invp, reinterpret_cast<PASTIX_COMPLEX*>(x), nbrhs, iparm, dparm); 
   }
   
   void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, std::complex<double> *vals, int *perm, int * invp, std::complex<double> *x, int nbrhs, int *iparm, double *dparm)
   {
     if (n == 0) { ptr = NULL; idx = NULL; vals = NULL; }
     if (nbrhs == 0) {x = NULL; nbrhs=1;}
-    z_pastix(pastix_data, pastix_comm, n, ptr, idx, reinterpret_cast<DCOMPLEX*>(vals), perm, invp, reinterpret_cast<DCOMPLEX*>(x), nbrhs, iparm, dparm); 
+    z_pastix(pastix_data, pastix_comm, n, ptr, idx, reinterpret_cast<PASTIX_DCOMPLEX*>(vals), perm, invp, reinterpret_cast<PASTIX_DCOMPLEX*>(x), nbrhs, iparm, dparm); 
   }
 
   // Convert the matrix  to Fortran-style Numbering
@@ -117,20 +125,30 @@ namespace internal
 // This is the base class to interface with PaStiX functions. 
 // Users should not used this class directly. 
 template <class Derived>
-class PastixBase : internal::noncopyable
+class PastixBase : public SparseSolverBase<Derived>
 {
+  protected:
+    typedef SparseSolverBase<Derived> Base;
+    using Base::derived;
+    using Base::m_isInitialized;
   public:
+    using Base::_solve_impl;
+    
     typedef typename internal::pastix_traits<Derived>::MatrixType _MatrixType;
     typedef _MatrixType MatrixType;
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
+    typedef typename MatrixType::StorageIndex StorageIndex;
     typedef Matrix<Scalar,Dynamic,1> Vector;
     typedef SparseMatrix<Scalar, ColMajor> ColSpMatrix;
+    enum {
+      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+    };
     
   public:
     
-    PastixBase() : m_initisOk(false), m_analysisIsOk(false), m_factorizationIsOk(false), m_isInitialized(false), m_pastixdata(0), m_size(0)
+    PastixBase() : m_initisOk(false), m_analysisIsOk(false), m_factorizationIsOk(false), m_pastixdata(0), m_size(0)
     {
       init();
     }
@@ -139,39 +157,16 @@ class PastixBase : internal::noncopyable
     {
       clean();
     }
-
-    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::solve_retval<PastixBase, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "Pastix solver is not initialized.");
-      eigen_assert(rows()==b.rows()
-                && "PastixBase::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::solve_retval<PastixBase, Rhs>(*this, b.derived());
-    }
     
     template<typename Rhs,typename Dest>
-    bool _solve (const MatrixBase<Rhs> &b, MatrixBase<Dest> &x) const;
+    bool _solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest> &x) const;
     
-    Derived& derived()
-    {
-      return *static_cast<Derived*>(this);
-    }
-    const Derived& derived() const
-    {
-      return *static_cast<const Derived*>(this);
-    }
-
     /** Returns a reference to the integer vector IPARM of PaStiX parameters
       * to modify the default parameters. 
       * The statistics related to the different phases of factorization and solve are saved here as well
       * \sa analyzePattern() factorize()
       */
-    Array<Index,IPARM_SIZE,1>& iparm()
+    Array<StorageIndex,IPARM_SIZE,1>& iparm()
     {
       return m_iparm; 
     }
@@ -189,7 +184,7 @@ class PastixBase : internal::noncopyable
       * The statistics related to the different phases of factorization and solve are saved here as well
       * \sa analyzePattern() factorize()
       */
-    Array<RealScalar,IPARM_SIZE,1>& dparm()
+    Array<double,DPARM_SIZE,1>& dparm()
     {
       return m_dparm; 
     }
@@ -220,20 +215,6 @@ class PastixBase : internal::noncopyable
       return m_info;
     }
     
-    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::sparse_solve_retval<PastixBase, Rhs>
-    solve(const SparseMatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "Pastix LU, LLT or LDLT is not initialized.");
-      eigen_assert(rows()==b.rows()
-                && "PastixBase::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::sparse_solve_retval<PastixBase, Rhs>(*this, b.derived());
-    }
-    
   protected:
 
     // Initialize the Pastix data structure, check the matrix
@@ -260,14 +241,13 @@ class PastixBase : internal::noncopyable
     int m_initisOk; 
     int m_analysisIsOk;
     int m_factorizationIsOk;
-    bool m_isInitialized;
     mutable ComputationInfo m_info; 
     mutable pastix_data_t *m_pastixdata; // Data structure for pastix
     mutable int m_comm; // The MPI communicator identifier
-    mutable Matrix<int,IPARM_SIZE,1> m_iparm; // integer vector for the input parameters
-    mutable Matrix<double,DPARM_SIZE,1> m_dparm; // Scalar vector for the input parameters
-    mutable Matrix<Index,Dynamic,1> m_perm;  // Permutation vector
-    mutable Matrix<Index,Dynamic,1> m_invp;  // Inverse permutation vector
+    mutable Array<int,IPARM_SIZE,1> m_iparm; // integer vector for the input parameters
+    mutable Array<double,DPARM_SIZE,1> m_dparm; // Scalar vector for the input parameters
+    mutable Matrix<StorageIndex,Dynamic,1> m_perm;  // Permutation vector
+    mutable Matrix<StorageIndex,Dynamic,1> m_invp;  // Inverse permutation vector
     mutable int m_size; // Size of the matrix 
 }; 
 
@@ -288,7 +268,7 @@ void PastixBase<Derived>::init()
          0, 0, 0, 1, m_iparm.data(), m_dparm.data());
   
   m_iparm[IPARM_MATRIX_VERIFICATION] = API_NO;
-  m_iparm[IPARM_VERBOSE]             = 2;
+  m_iparm[IPARM_VERBOSE]             = API_VERBOSE_NOT;
   m_iparm[IPARM_ORDERING]            = API_ORDER_SCOTCH;
   m_iparm[IPARM_INCOMPLETE]          = API_NO;
   m_iparm[IPARM_OOC_LIMIT]           = 2000;
@@ -320,7 +300,6 @@ void PastixBase<Derived>::compute(ColSpMatrix& mat)
   factorize(mat);
   
   m_iparm(IPARM_MATRIX_VERIFICATION) = API_NO;
-  m_isInitialized = m_factorizationIsOk;
 }
 
 
@@ -333,7 +312,7 @@ void PastixBase<Derived>::analyzePattern(ColSpMatrix& mat)
   if(m_size>0)
     clean();
   
-  m_size = mat.rows();
+  m_size = internal::convert_index<int>(mat.rows());
   m_perm.resize(m_size);
   m_invp.resize(m_size);
   
@@ -362,7 +341,7 @@ void PastixBase<Derived>::factorize(ColSpMatrix& mat)
   eigen_assert(m_analysisIsOk && "The analysis phase should be called before the factorization phase");
   m_iparm(IPARM_START_TASK) = API_TASK_NUMFACT;
   m_iparm(IPARM_END_TASK) = API_TASK_NUMFACT;
-  m_size = mat.rows();
+  m_size = internal::convert_index<int>(mat.rows());
   
   internal::eigen_pastix(&m_pastixdata, MPI_COMM_WORLD, m_size, mat.outerIndexPtr(), mat.innerIndexPtr(),
                mat.valuePtr(), m_perm.data(), m_invp.data(), 0, 0, m_iparm.data(), m_dparm.data());
@@ -385,7 +364,7 @@ void PastixBase<Derived>::factorize(ColSpMatrix& mat)
 /* Solve the system */
 template<typename Base>
 template<typename Rhs,typename Dest>
-bool PastixBase<Base>::_solve (const MatrixBase<Rhs> &b, MatrixBase<Dest> &x) const
+bool PastixBase<Base>::_solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest> &x) const
 {
   eigen_assert(m_isInitialized && "The matrix should be factorized first");
   EIGEN_STATIC_ASSERT((Dest::Flags&RowMajorBit)==0,
@@ -398,7 +377,7 @@ bool PastixBase<Base>::_solve (const MatrixBase<Rhs> &b, MatrixBase<Dest> &x) co
     m_iparm[IPARM_START_TASK]          = API_TASK_SOLVE;
     m_iparm[IPARM_END_TASK]            = API_TASK_REFINE;
   
-    internal::eigen_pastix(&m_pastixdata, MPI_COMM_WORLD, x.rows(), 0, 0, 0,
+    internal::eigen_pastix(&m_pastixdata, MPI_COMM_WORLD, internal::convert_index<int>(x.rows()), 0, 0, 0,
                            m_perm.data(), m_invp.data(), &x(0, i), rhs, m_iparm.data(), m_dparm.data());
   }
   
@@ -423,8 +402,10 @@ bool PastixBase<Base>::_solve (const MatrixBase<Rhs> &b, MatrixBase<Dest> &x) co
   * NOTE : Note that if the analysis and factorization phase are called separately, 
   * the input matrix will be symmetrized at each call, hence it is advised to 
   * symmetrize the matrix in a end-user program and set \p IsStrSym to true
-  * 
-  * \sa \ref TutorialSparseDirectSolvers
+  *
+  * \implsparsesolverconcept
+  *
+  * \sa \ref TutorialSparseSolverConcept, class SparseLU
   * 
   */
 template<typename _MatrixType, bool IsStrSym>
@@ -434,7 +415,7 @@ class PastixLU : public PastixBase< PastixLU<_MatrixType> >
     typedef _MatrixType MatrixType;
     typedef PastixBase<PastixLU<MatrixType> > Base;
     typedef typename Base::ColSpMatrix ColSpMatrix;
-    typedef typename MatrixType::Index Index;
+    typedef typename MatrixType::StorageIndex StorageIndex;
     
   public:
     PastixLU() : Base()
@@ -442,7 +423,7 @@ class PastixLU : public PastixBase< PastixLU<_MatrixType> >
       init();
     }
     
-    PastixLU(const MatrixType& matrix):Base()
+    explicit PastixLU(const MatrixType& matrix):Base()
     {
       init();
       compute(matrix);
@@ -534,8 +515,10 @@ class PastixLU : public PastixBase< PastixLU<_MatrixType> >
   * 
   * \tparam MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
   * \tparam UpLo The part of the matrix to use : Lower or Upper. The default is Lower as required by PaStiX
-  * 
-  * \sa \ref TutorialSparseDirectSolvers
+  *
+  * \implsparsesolverconcept
+  *
+  * \sa \ref TutorialSparseSolverConcept, class SimplicialLLT
   */
 template<typename _MatrixType, int _UpLo>
 class PastixLLT : public PastixBase< PastixLLT<_MatrixType, _UpLo> >
@@ -552,7 +535,7 @@ class PastixLLT : public PastixBase< PastixLLT<_MatrixType, _UpLo> >
       init();
     }
     
-    PastixLLT(const MatrixType& matrix):Base()
+    explicit PastixLLT(const MatrixType& matrix):Base()
     {
       init();
       compute(matrix);
@@ -598,6 +581,7 @@ class PastixLLT : public PastixBase< PastixLLT<_MatrixType, _UpLo> >
     
     void grabMatrix(const MatrixType& matrix, ColSpMatrix& out)
     {
+      out.resize(matrix.rows(), matrix.cols());
       // Pastix supports only lower, column-major matrices 
       out.template selfadjointView<Lower>() = matrix.template selfadjointView<UpLo>();
       internal::c_to_fortran_numbering(out);
@@ -615,8 +599,10 @@ class PastixLLT : public PastixBase< PastixLLT<_MatrixType, _UpLo> >
   * 
   * \tparam MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
   * \tparam UpLo The part of the matrix to use : Lower or Upper. The default is Lower as required by PaStiX
-  * 
-  * \sa \ref TutorialSparseDirectSolvers
+  *
+  * \implsparsesolverconcept
+  *
+  * \sa \ref TutorialSparseSolverConcept, class SimplicialLDLT
   */
 template<typename _MatrixType, int _UpLo>
 class PastixLDLT : public PastixBase< PastixLDLT<_MatrixType, _UpLo> >
@@ -633,7 +619,7 @@ class PastixLDLT : public PastixBase< PastixLDLT<_MatrixType, _UpLo> >
       init();
     }
     
-    PastixLDLT(const MatrixType& matrix):Base()
+    explicit PastixLDLT(const MatrixType& matrix):Base()
     {
       init();
       compute(matrix);
@@ -681,41 +667,12 @@ class PastixLDLT : public PastixBase< PastixLDLT<_MatrixType, _UpLo> >
     void grabMatrix(const MatrixType& matrix, ColSpMatrix& out)
     {
       // Pastix supports only lower, column-major matrices 
+      out.resize(matrix.rows(), matrix.cols());
       out.template selfadjointView<Lower>() = matrix.template selfadjointView<UpLo>();
       internal::c_to_fortran_numbering(out);
     }
 };
 
-namespace internal {
-
-template<typename _MatrixType, typename Rhs>
-struct solve_retval<PastixBase<_MatrixType>, Rhs>
-  : solve_retval_base<PastixBase<_MatrixType>, Rhs>
-{
-  typedef PastixBase<_MatrixType> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
-  }
-};
-
-template<typename _MatrixType, typename Rhs>
-struct sparse_solve_retval<PastixBase<_MatrixType>, Rhs>
-  : sparse_solve_retval_base<PastixBase<_MatrixType>, Rhs>
-{
-  typedef PastixBase<_MatrixType> Dec;
-  EIGEN_MAKE_SPARSE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    this->defaultEvalTo(dst);
-  }
-};
-
-} // end namespace internal
-
 } // end namespace Eigen
 
 #endif
diff --git a/Eigen/src/PardisoSupport/CMakeLists.txt b/Eigen/src/PardisoSupport/CMakeLists.txt
deleted file mode 100644
index a097ab401..000000000
--- a/Eigen/src/PardisoSupport/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_PardisoSupport_SRCS "*.h")
-
-INSTALL(FILES 
-  ${Eigen_PardisoSupport_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/PardisoSupport COMPONENT Devel
-  )
diff --git a/Eigen/src/PardisoSupport/PardisoSupport.h b/Eigen/src/PardisoSupport/PardisoSupport.h
index 18cd7d88a..091c3970e 100644
--- a/Eigen/src/PardisoSupport/PardisoSupport.h
+++ b/Eigen/src/PardisoSupport/PardisoSupport.h
@@ -40,13 +40,13 @@ template<typename _MatrixType, int Options=Upper> class PardisoLDLT;
 
 namespace internal
 {
-  template<typename Index>
+  template<typename IndexType>
   struct pardiso_run_selector
   {
-    static Index run( _MKL_DSS_HANDLE_t pt, Index maxfct, Index mnum, Index type, Index phase, Index n, void *a,
-                      Index *ia, Index *ja, Index *perm, Index nrhs, Index *iparm, Index msglvl, void *b, void *x)
+    static IndexType run( _MKL_DSS_HANDLE_t pt, IndexType maxfct, IndexType mnum, IndexType type, IndexType phase, IndexType n, void *a,
+                      IndexType *ia, IndexType *ja, IndexType *perm, IndexType nrhs, IndexType *iparm, IndexType msglvl, void *b, void *x)
     {
-      Index error = 0;
+      IndexType error = 0;
       ::pardiso(pt, &maxfct, &mnum, &type, &phase, &n, a, ia, ja, perm, &nrhs, iparm, &msglvl, b, x, &error);
       return error;
     }
@@ -54,11 +54,11 @@ namespace internal
   template<>
   struct pardiso_run_selector<long long int>
   {
-    typedef long long int Index;
-    static Index run( _MKL_DSS_HANDLE_t pt, Index maxfct, Index mnum, Index type, Index phase, Index n, void *a,
-                      Index *ia, Index *ja, Index *perm, Index nrhs, Index *iparm, Index msglvl, void *b, void *x)
+    typedef long long int IndexType;
+    static IndexType run( _MKL_DSS_HANDLE_t pt, IndexType maxfct, IndexType mnum, IndexType type, IndexType phase, IndexType n, void *a,
+                      IndexType *ia, IndexType *ja, IndexType *perm, IndexType nrhs, IndexType *iparm, IndexType msglvl, void *b, void *x)
     {
-      Index error = 0;
+      IndexType error = 0;
       ::pardiso_64(pt, &maxfct, &mnum, &type, &phase, &n, a, ia, ja, perm, &nrhs, iparm, &msglvl, b, x, &error);
       return error;
     }
@@ -72,7 +72,7 @@ namespace internal
     typedef _MatrixType MatrixType;
     typedef typename _MatrixType::Scalar Scalar;
     typedef typename _MatrixType::RealScalar RealScalar;
-    typedef typename _MatrixType::Index Index;
+    typedef typename _MatrixType::StorageIndex StorageIndex;
   };
 
   template<typename _MatrixType, int Options>
@@ -81,7 +81,7 @@ namespace internal
     typedef _MatrixType MatrixType;
     typedef typename _MatrixType::Scalar Scalar;
     typedef typename _MatrixType::RealScalar RealScalar;
-    typedef typename _MatrixType::Index Index;
+    typedef typename _MatrixType::StorageIndex StorageIndex;
   };
 
   template<typename _MatrixType, int Options>
@@ -90,35 +90,44 @@ namespace internal
     typedef _MatrixType MatrixType;
     typedef typename _MatrixType::Scalar Scalar;
     typedef typename _MatrixType::RealScalar RealScalar;
-    typedef typename _MatrixType::Index Index;    
+    typedef typename _MatrixType::StorageIndex StorageIndex;    
   };
 
-}
+} // end namespace internal
 
 template<class Derived>
-class PardisoImpl
+class PardisoImpl : public SparseSolverBase<Derived>
 {
+  protected:
+    typedef SparseSolverBase<Derived> Base;
+    using Base::derived;
+    using Base::m_isInitialized;
+    
     typedef internal::pardiso_traits<Derived> Traits;
   public:
+    using Base::_solve_impl;
+    
     typedef typename Traits::MatrixType MatrixType;
     typedef typename Traits::Scalar Scalar;
     typedef typename Traits::RealScalar RealScalar;
-    typedef typename Traits::Index Index;
-    typedef SparseMatrix<Scalar,RowMajor,Index> SparseMatrixType;
+    typedef typename Traits::StorageIndex StorageIndex;
+    typedef SparseMatrix<Scalar,RowMajor,StorageIndex> SparseMatrixType;
     typedef Matrix<Scalar,Dynamic,1> VectorType;
-    typedef Matrix<Index, 1, MatrixType::ColsAtCompileTime> IntRowVectorType;
-    typedef Matrix<Index, MatrixType::RowsAtCompileTime, 1> IntColVectorType;
-    typedef Array<Index,64,1,DontAlign> ParameterType;
+    typedef Matrix<StorageIndex, 1, MatrixType::ColsAtCompileTime> IntRowVectorType;
+    typedef Matrix<StorageIndex, MatrixType::RowsAtCompileTime, 1> IntColVectorType;
+    typedef Array<StorageIndex,64,1,DontAlign> ParameterType;
     enum {
-      ScalarIsComplex = NumTraits<Scalar>::IsComplex
+      ScalarIsComplex = NumTraits<Scalar>::IsComplex,
+      ColsAtCompileTime = Dynamic,
+      MaxColsAtCompileTime = Dynamic
     };
 
     PardisoImpl()
     {
-      eigen_assert((sizeof(Index) >= sizeof(_INTEGER_t) && sizeof(Index) <= 8) && "Non-supported index type");
+      eigen_assert((sizeof(StorageIndex) >= sizeof(_INTEGER_t) && sizeof(StorageIndex) <= 8) && "Non-supported index type");
       m_iparm.setZero();
       m_msglvl = 0; // No output
-      m_initialized = false;
+      m_isInitialized = false;
     }
 
     ~PardisoImpl()
@@ -136,7 +145,7 @@ class PardisoImpl
       */
     ComputationInfo info() const
     {
-      eigen_assert(m_initialized && "Decomposition is not initialized.");
+      eigen_assert(m_isInitialized && "Decomposition is not initialized.");
       return m_info;
     }
 
@@ -165,54 +174,18 @@ class PardisoImpl
     Derived& factorize(const MatrixType& matrix);
 
     Derived& compute(const MatrixType& matrix);
-    
-    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::solve_retval<PardisoImpl, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_initialized && "Pardiso solver is not initialized.");
-      eigen_assert(rows()==b.rows()
-                && "PardisoImpl::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::solve_retval<PardisoImpl, Rhs>(*this, b.derived());
-    }
 
-    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::sparse_solve_retval<PardisoImpl, Rhs>
-    solve(const SparseMatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_initialized && "Pardiso solver is not initialized.");
-      eigen_assert(rows()==b.rows()
-                && "PardisoImpl::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::sparse_solve_retval<PardisoImpl, Rhs>(*this, b.derived());
-    }
-
-    Derived& derived()
-    {
-      return *static_cast<Derived*>(this);
-    }
-    const Derived& derived() const
-    {
-      return *static_cast<const Derived*>(this);
-    }
-
-    template<typename BDerived, typename XDerived>
-    bool _solve(const MatrixBase<BDerived> &b, MatrixBase<XDerived>& x) const;
+    template<typename Rhs,typename Dest>
+    void _solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const;
 
   protected:
     void pardisoRelease()
     {
-      if(m_initialized) // Factorization ran at least once
+      if(m_isInitialized) // Factorization ran at least once
       {
-        internal::pardiso_run_selector<Index>::run(m_pt, 1, 1, m_type, -1, m_size, 0, 0, 0, m_perm.data(), 0,
-                                                   m_iparm.data(), m_msglvl, 0, 0);
+        internal::pardiso_run_selector<StorageIndex>::run(m_pt, 1, 1, m_type, -1, internal::convert_index<StorageIndex>(m_size),0, 0, 0, m_perm.data(), 0,
+                                                          m_iparm.data(), m_msglvl, NULL, NULL);
+        m_isInitialized = false;
       }
     }
 
@@ -221,11 +194,11 @@ class PardisoImpl
       m_type = type;
       bool symmetric = std::abs(m_type) < 10;
       m_iparm[0] = 1;   // No solver default
-      m_iparm[1] = 3;   // use Metis for the ordering
-      m_iparm[2] = 1;   // Numbers of processors, value of OMP_NUM_THREADS
+      m_iparm[1] = 2;   // use Metis for the ordering
+      m_iparm[2] = 0;   // Reserved. Set to zero. (??Numbers of processors, value of OMP_NUM_THREADS??)
       m_iparm[3] = 0;   // No iterative-direct algorithm
       m_iparm[4] = 0;   // No user fill-in reducing permutation
-      m_iparm[5] = 0;   // Write solution into x
+      m_iparm[5] = 0;   // Write solution into x, b is left unchanged
       m_iparm[6] = 0;   // Not in use
       m_iparm[7] = 2;   // Max numbers of iterative refinement steps
       m_iparm[8] = 0;   // Not in use
@@ -246,13 +219,16 @@ class PardisoImpl
       m_iparm[26] = 0;  // No matrix checker
       m_iparm[27] = (sizeof(RealScalar) == 4) ? 1 : 0;
       m_iparm[34] = 1;  // C indexing
-      m_iparm[59] = 1;  // Automatic switch between In-Core and Out-of-Core modes
+      m_iparm[36] = 0;  // CSR
+      m_iparm[59] = 0;  // 0 - In-Core ; 1 - Automatic switch between In-Core and Out-of-Core modes ; 2 - Out-of-Core
+      
+      memset(m_pt, 0, sizeof(m_pt));
     }
 
   protected:
     // cached data to reduce reallocation, etc.
     
-    void manageErrorCode(Index error)
+    void manageErrorCode(Index error) const
     {
       switch(error)
       {
@@ -269,16 +245,14 @@ class PardisoImpl
     }
 
     mutable SparseMatrixType m_matrix;
-    ComputationInfo m_info;
-    bool m_initialized, m_analysisIsOk, m_factorizationIsOk;
-    Index m_type, m_msglvl;
+    mutable ComputationInfo m_info;
+    bool m_analysisIsOk, m_factorizationIsOk;
+    StorageIndex m_type, m_msglvl;
     mutable void *m_pt[64];
     mutable ParameterType m_iparm;
     mutable IntColVectorType m_perm;
     Index m_size;
     
-  private:
-    PardisoImpl(PardisoImpl &) {}
 };
 
 template<class Derived>
@@ -288,19 +262,17 @@ Derived& PardisoImpl<Derived>::compute(const MatrixType& a)
   eigen_assert(a.rows() == a.cols());
 
   pardisoRelease();
-  memset(m_pt, 0, sizeof(m_pt));
   m_perm.setZero(m_size);
   derived().getMatrix(a);
   
   Index error;
-  error = internal::pardiso_run_selector<Index>::run(m_pt, 1, 1, m_type, 12, m_size,
-                                                     m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(),
-                                                     m_perm.data(), 0, m_iparm.data(), m_msglvl, NULL, NULL);
-
+  error = internal::pardiso_run_selector<StorageIndex>::run(m_pt, 1, 1, m_type, 12, internal::convert_index<StorageIndex>(m_size),
+                                                            m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(),
+                                                            m_perm.data(), 0, m_iparm.data(), m_msglvl, NULL, NULL);
   manageErrorCode(error);
   m_analysisIsOk = true;
   m_factorizationIsOk = true;
-  m_initialized = true;
+  m_isInitialized = true;
   return derived();
 }
 
@@ -311,19 +283,18 @@ Derived& PardisoImpl<Derived>::analyzePattern(const MatrixType& a)
   eigen_assert(m_size == a.cols());
 
   pardisoRelease();
-  memset(m_pt, 0, sizeof(m_pt));
   m_perm.setZero(m_size);
   derived().getMatrix(a);
   
   Index error;
-  error = internal::pardiso_run_selector<Index>::run(m_pt, 1, 1, m_type, 11, m_size,
-                                                     m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(),
-                                                     m_perm.data(), 0, m_iparm.data(), m_msglvl, NULL, NULL);
+  error = internal::pardiso_run_selector<StorageIndex>::run(m_pt, 1, 1, m_type, 11, internal::convert_index<StorageIndex>(m_size),
+                                                            m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(),
+                                                            m_perm.data(), 0, m_iparm.data(), m_msglvl, NULL, NULL);
   
   manageErrorCode(error);
   m_analysisIsOk = true;
   m_factorizationIsOk = false;
-  m_initialized = true;
+  m_isInitialized = true;
   return derived();
 }
 
@@ -335,22 +306,25 @@ Derived& PardisoImpl<Derived>::factorize(const MatrixType& a)
   
   derived().getMatrix(a);
 
-  Index error;  
-  error = internal::pardiso_run_selector<Index>::run(m_pt, 1, 1, m_type, 22, m_size,
-                                                     m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(),
-                                                     m_perm.data(), 0, m_iparm.data(), m_msglvl, NULL, NULL);
+  Index error;
+  error = internal::pardiso_run_selector<StorageIndex>::run(m_pt, 1, 1, m_type, 22, internal::convert_index<StorageIndex>(m_size),
+                                                            m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(),
+                                                            m_perm.data(), 0, m_iparm.data(), m_msglvl, NULL, NULL);
   
   manageErrorCode(error);
   m_factorizationIsOk = true;
   return derived();
 }
 
-template<class Base>
+template<class Derived>
 template<typename BDerived,typename XDerived>
-bool PardisoImpl<Base>::_solve(const MatrixBase<BDerived> &b, MatrixBase<XDerived>& x) const
+void PardisoImpl<Derived>::_solve_impl(const MatrixBase<BDerived> &b, MatrixBase<XDerived>& x) const
 {
   if(m_iparm[0] == 0) // Factorization was not computed
-    return false;
+  {
+    m_info = InvalidInput;
+    return;
+  }
 
   //Index n = m_matrix.rows();
   Index nrhs = Index(b.cols());
@@ -380,12 +354,12 @@ bool PardisoImpl<Base>::_solve(const MatrixBase<BDerived> &b, MatrixBase<XDerive
   }
   
   Index error;
-  error = internal::pardiso_run_selector<Index>::run(m_pt, 1, 1, m_type, 33, m_size,
-                                                     m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(),
-                                                     m_perm.data(), nrhs, m_iparm.data(), m_msglvl,
-                                                     rhs_ptr, x.derived().data());
+  error = internal::pardiso_run_selector<StorageIndex>::run(m_pt, 1, 1, m_type, 33, internal::convert_index<StorageIndex>(m_size),
+                                                            m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(),
+                                                            m_perm.data(), internal::convert_index<StorageIndex>(nrhs), m_iparm.data(), m_msglvl,
+                                                            rhs_ptr, x.derived().data());
 
-  return error==0;
+  manageErrorCode(error);
 }
 
 
@@ -397,15 +371,20 @@ bool PardisoImpl<Base>::_solve(const MatrixBase<BDerived> &b, MatrixBase<XDerive
   * using the Intel MKL PARDISO library. The sparse matrix A must be squared and invertible.
   * The vectors or matrices X and B can be either dense or sparse.
   *
+  * By default, it runs in in-core mode. To enable PARDISO's out-of-core feature, set:
+  * \code solver.pardisoParameterArray()[59] = 1; \endcode
+  *
   * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
   *
-  * \sa \ref TutorialSparseDirectSolvers
+  * \implsparsesolverconcept
+  *
+  * \sa \ref TutorialSparseSolverConcept, class SparseLU
   */
 template<typename MatrixType>
 class PardisoLU : public PardisoImpl< PardisoLU<MatrixType> >
 {
   protected:
-    typedef PardisoImpl< PardisoLU<MatrixType> > Base;
+    typedef PardisoImpl<PardisoLU> Base;
     typedef typename Base::Scalar Scalar;
     typedef typename Base::RealScalar RealScalar;
     using Base::pardisoInit;
@@ -423,7 +402,7 @@ class PardisoLU : public PardisoImpl< PardisoLU<MatrixType> >
       pardisoInit(Base::ScalarIsComplex ? 13 : 11);
     }
 
-    PardisoLU(const MatrixType& matrix)
+    explicit PardisoLU(const MatrixType& matrix)
       : Base()
     {
       pardisoInit(Base::ScalarIsComplex ? 13 : 11);
@@ -433,10 +412,8 @@ class PardisoLU : public PardisoImpl< PardisoLU<MatrixType> >
     void getMatrix(const MatrixType& matrix)
     {
       m_matrix = matrix;
+      m_matrix.makeCompressed();
     }
-    
-  private:
-    PardisoLU(PardisoLU& ) {}
 };
 
 /** \ingroup PardisoSupport_Module
@@ -447,11 +424,16 @@ class PardisoLU : public PardisoImpl< PardisoLU<MatrixType> >
   * using the Intel MKL PARDISO library. The sparse matrix A must be selfajoint and positive definite.
   * The vectors or matrices X and B can be either dense or sparse.
   *
+  * By default, it runs in in-core mode. To enable PARDISO's out-of-core feature, set:
+  * \code solver.pardisoParameterArray()[59] = 1; \endcode
+  *
   * \tparam MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
   * \tparam UpLo can be any bitwise combination of Upper, Lower. The default is Upper, meaning only the upper triangular part has to be used.
   *         Upper|Lower can be used to tell both triangular parts can be used as input.
   *
-  * \sa \ref TutorialSparseDirectSolvers
+  * \implsparsesolverconcept
+  *
+  * \sa \ref TutorialSparseSolverConcept, class SimplicialLLT
   */
 template<typename MatrixType, int _UpLo>
 class PardisoLLT : public PardisoImpl< PardisoLLT<MatrixType,_UpLo> >
@@ -459,7 +441,6 @@ class PardisoLLT : public PardisoImpl< PardisoLLT<MatrixType,_UpLo> >
   protected:
     typedef PardisoImpl< PardisoLLT<MatrixType,_UpLo> > Base;
     typedef typename Base::Scalar Scalar;
-    typedef typename Base::Index Index;
     typedef typename Base::RealScalar RealScalar;
     using Base::pardisoInit;
     using Base::m_matrix;
@@ -467,9 +448,9 @@ class PardisoLLT : public PardisoImpl< PardisoLLT<MatrixType,_UpLo> >
 
   public:
 
+    typedef typename Base::StorageIndex StorageIndex;
     enum { UpLo = _UpLo };
     using Base::compute;
-    using Base::solve;
 
     PardisoLLT()
       : Base()
@@ -477,7 +458,7 @@ class PardisoLLT : public PardisoImpl< PardisoLLT<MatrixType,_UpLo> >
       pardisoInit(Base::ScalarIsComplex ? 4 : 2);
     }
 
-    PardisoLLT(const MatrixType& matrix)
+    explicit PardisoLLT(const MatrixType& matrix)
       : Base()
     {
       pardisoInit(Base::ScalarIsComplex ? 4 : 2);
@@ -489,13 +470,11 @@ class PardisoLLT : public PardisoImpl< PardisoLLT<MatrixType,_UpLo> >
     void getMatrix(const MatrixType& matrix)
     {
       // PARDISO supports only upper, row-major matrices
-      PermutationMatrix<Dynamic,Dynamic,Index> p_null;
+      PermutationMatrix<Dynamic,Dynamic,StorageIndex> p_null;
       m_matrix.resize(matrix.rows(), matrix.cols());
       m_matrix.template selfadjointView<Upper>() = matrix.template selfadjointView<UpLo>().twistedBy(p_null);
+      m_matrix.makeCompressed();
     }
-    
-  private:
-    PardisoLLT(PardisoLLT& ) {}
 };
 
 /** \ingroup PardisoSupport_Module
@@ -507,12 +486,17 @@ class PardisoLLT : public PardisoImpl< PardisoLLT<MatrixType,_UpLo> >
   * For complex matrices, A can also be symmetric only, see the \a Options template parameter.
   * The vectors or matrices X and B can be either dense or sparse.
   *
+  * By default, it runs in in-core mode. To enable PARDISO's out-of-core feature, set:
+  * \code solver.pardisoParameterArray()[59] = 1; \endcode
+  *
   * \tparam MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
   * \tparam Options can be any bitwise combination of Upper, Lower, and Symmetric. The default is Upper, meaning only the upper triangular part has to be used.
   *         Symmetric can be used for symmetric, non-selfadjoint complex matrices, the default being to assume a selfadjoint matrix.
   *         Upper|Lower can be used to tell both triangular parts can be used as input.
   *
-  * \sa \ref TutorialSparseDirectSolvers
+  * \implsparsesolverconcept
+  *
+  * \sa \ref TutorialSparseSolverConcept, class SimplicialLDLT
   */
 template<typename MatrixType, int Options>
 class PardisoLDLT : public PardisoImpl< PardisoLDLT<MatrixType,Options> >
@@ -520,7 +504,6 @@ class PardisoLDLT : public PardisoImpl< PardisoLDLT<MatrixType,Options> >
   protected:
     typedef PardisoImpl< PardisoLDLT<MatrixType,Options> > Base;
     typedef typename Base::Scalar Scalar;
-    typedef typename Base::Index Index;
     typedef typename Base::RealScalar RealScalar;
     using Base::pardisoInit;
     using Base::m_matrix;
@@ -528,8 +511,8 @@ class PardisoLDLT : public PardisoImpl< PardisoLDLT<MatrixType,Options> >
 
   public:
 
+    typedef typename Base::StorageIndex StorageIndex;
     using Base::compute;
-    using Base::solve;
     enum { UpLo = Options&(Upper|Lower) };
 
     PardisoLDLT()
@@ -538,7 +521,7 @@ class PardisoLDLT : public PardisoImpl< PardisoLDLT<MatrixType,Options> >
       pardisoInit(Base::ScalarIsComplex ? ( bool(Options&Symmetric) ? 6 : -4 ) : -2);
     }
 
-    PardisoLDLT(const MatrixType& matrix)
+    explicit PardisoLDLT(const MatrixType& matrix)
       : Base()
     {
       pardisoInit(Base::ScalarIsComplex ? ( bool(Options&Symmetric) ? 6 : -4 ) : -2);
@@ -548,45 +531,13 @@ class PardisoLDLT : public PardisoImpl< PardisoLDLT<MatrixType,Options> >
     void getMatrix(const MatrixType& matrix)
     {
       // PARDISO supports only upper, row-major matrices
-      PermutationMatrix<Dynamic,Dynamic,Index> p_null;
+      PermutationMatrix<Dynamic,Dynamic,StorageIndex> p_null;
       m_matrix.resize(matrix.rows(), matrix.cols());
       m_matrix.template selfadjointView<Upper>() = matrix.template selfadjointView<UpLo>().twistedBy(p_null);
+      m_matrix.makeCompressed();
     }
-    
-  private:
-    PardisoLDLT(PardisoLDLT& ) {}
-};
-
-namespace internal {
-  
-template<typename _Derived, typename Rhs>
-struct solve_retval<PardisoImpl<_Derived>, Rhs>
-  : solve_retval_base<PardisoImpl<_Derived>, Rhs>
-{
-  typedef PardisoImpl<_Derived> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
-  }
 };
 
-template<typename Derived, typename Rhs>
-struct sparse_solve_retval<PardisoImpl<Derived>, Rhs>
-  : sparse_solve_retval_base<PardisoImpl<Derived>, Rhs>
-{
-  typedef PardisoImpl<Derived> Dec;
-  EIGEN_MAKE_SPARSE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    this->defaultEvalTo(dst);
-  }
-};
-
-} // end namespace internal
-
 } // end namespace Eigen
 
 #endif // EIGEN_PARDISOSUPPORT_H
diff --git a/Eigen/src/QR/CMakeLists.txt b/Eigen/src/QR/CMakeLists.txt
deleted file mode 100644
index 96f43d7f5..000000000
--- a/Eigen/src/QR/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_QR_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_QR_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/QR COMPONENT Devel
-  )
diff --git a/Eigen/src/QR/ColPivHouseholderQR.h b/Eigen/src/QR/ColPivHouseholderQR.h
index 567eab7cd..0e47c8332 100644
--- a/Eigen/src/QR/ColPivHouseholderQR.h
+++ b/Eigen/src/QR/ColPivHouseholderQR.h
@@ -11,7 +11,16 @@
 #ifndef EIGEN_COLPIVOTINGHOUSEHOLDERQR_H
 #define EIGEN_COLPIVOTINGHOUSEHOLDERQR_H
 
-namespace Eigen { 
+namespace Eigen {
+
+namespace internal {
+template<typename _MatrixType> struct traits<ColPivHouseholderQR<_MatrixType> >
+ : traits<_MatrixType>
+{
+  enum { Flags = 0 };
+};
+
+} // end namespace internal
 
 /** \ingroup QR_Module
   *
@@ -19,19 +28,21 @@ namespace Eigen {
   *
   * \brief Householder rank-revealing QR decomposition of a matrix with column-pivoting
   *
-  * \param MatrixType the type of the matrix of which we are computing the QR decomposition
+  * \tparam _MatrixType the type of the matrix of which we are computing the QR decomposition
   *
   * This class performs a rank-revealing QR decomposition of a matrix \b A into matrices \b P, \b Q and \b R
-  * such that 
+  * such that
   * \f[
   *  \mathbf{A} \, \mathbf{P} = \mathbf{Q} \, \mathbf{R}
   * \f]
-  * by using Householder transformations. Here, \b P is a permutation matrix, \b Q a unitary matrix and \b R an 
+  * by using Householder transformations. Here, \b P is a permutation matrix, \b Q a unitary matrix and \b R an
   * upper triangular matrix.
   *
   * This decomposition performs column pivoting in order to be rank-revealing and improve
   * numerical stability. It is slower than HouseholderQR, and faster than FullPivHouseholderQR.
   *
+  * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
+  * 
   * \sa MatrixBase::colPivHouseholderQr()
   */
 template<typename _MatrixType> class ColPivHouseholderQR
@@ -42,25 +53,25 @@ template<typename _MatrixType> class ColPivHouseholderQR
     enum {
       RowsAtCompileTime = MatrixType::RowsAtCompileTime,
       ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-      Options = MatrixType::Options,
       MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
       MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
     };
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
-    typedef Matrix<Scalar, RowsAtCompileTime, RowsAtCompileTime, Options, MaxRowsAtCompileTime, MaxRowsAtCompileTime> MatrixQType;
+    // FIXME should be int
+    typedef typename MatrixType::StorageIndex StorageIndex;
     typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
     typedef PermutationMatrix<ColsAtCompileTime, MaxColsAtCompileTime> PermutationType;
     typedef typename internal::plain_row_type<MatrixType, Index>::type IntRowVectorType;
     typedef typename internal::plain_row_type<MatrixType>::type RowVectorType;
     typedef typename internal::plain_row_type<MatrixType, RealScalar>::type RealRowVectorType;
     typedef HouseholderSequence<MatrixType,typename internal::remove_all<typename HCoeffsType::ConjugateReturnType>::type> HouseholderSequenceType;
-    
+    typedef typename MatrixType::PlainObject PlainObject;
+
   private:
-    
-    typedef typename PermutationType::Index PermIndexType;
-    
+
+    typedef typename PermutationType::StorageIndex PermIndexType;
+
   public:
 
     /**
@@ -75,7 +86,8 @@ template<typename _MatrixType> class ColPivHouseholderQR
         m_colsPermutation(),
         m_colsTranspositions(),
         m_temp(),
-        m_colSqNorms(),
+        m_colNormsUpdated(),
+        m_colNormsDirect(),
         m_isInitialized(false),
         m_usePrescribedThreshold(false) {}
 
@@ -91,7 +103,8 @@ template<typename _MatrixType> class ColPivHouseholderQR
         m_colsPermutation(PermIndexType(cols)),
         m_colsTranspositions(cols),
         m_temp(cols),
-        m_colSqNorms(cols),
+        m_colNormsUpdated(cols),
+        m_colNormsDirect(cols),
         m_isInitialized(false),
         m_usePrescribedThreshold(false) {}
 
@@ -99,25 +112,48 @@ template<typename _MatrixType> class ColPivHouseholderQR
       *
       * This constructor computes the QR factorization of the matrix \a matrix by calling
       * the method compute(). It is a short cut for:
-      * 
+      *
       * \code
       * ColPivHouseholderQR<MatrixType> qr(matrix.rows(), matrix.cols());
       * qr.compute(matrix);
       * \endcode
-      * 
+      *
       * \sa compute()
       */
-    ColPivHouseholderQR(const MatrixType& matrix)
+    template<typename InputType>
+    explicit ColPivHouseholderQR(const EigenBase<InputType>& matrix)
       : m_qr(matrix.rows(), matrix.cols()),
         m_hCoeffs((std::min)(matrix.rows(),matrix.cols())),
         m_colsPermutation(PermIndexType(matrix.cols())),
         m_colsTranspositions(matrix.cols()),
         m_temp(matrix.cols()),
-        m_colSqNorms(matrix.cols()),
+        m_colNormsUpdated(matrix.cols()),
+        m_colNormsDirect(matrix.cols()),
         m_isInitialized(false),
         m_usePrescribedThreshold(false)
     {
-      compute(matrix);
+      compute(matrix.derived());
+    }
+
+    /** \brief Constructs a QR factorization from a given matrix
+      *
+      * This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when \c MatrixType is a Eigen::Ref.
+      *
+      * \sa ColPivHouseholderQR(const EigenBase&)
+      */
+    template<typename InputType>
+    explicit ColPivHouseholderQR(EigenBase<InputType>& matrix)
+      : m_qr(matrix.derived()),
+        m_hCoeffs((std::min)(matrix.rows(),matrix.cols())),
+        m_colsPermutation(PermIndexType(matrix.cols())),
+        m_colsTranspositions(matrix.cols()),
+        m_temp(matrix.cols()),
+        m_colNormsUpdated(matrix.cols()),
+        m_colNormsDirect(matrix.cols()),
+        m_isInitialized(false),
+        m_usePrescribedThreshold(false)
+    {
+      computeInPlace();
     }
 
     /** This method finds a solution x to the equation Ax=b, where A is the matrix of which
@@ -127,9 +163,6 @@ template<typename _MatrixType> class ColPivHouseholderQR
       *
       * \returns a solution.
       *
-      * \note The case where b is a matrix is not yet implemented. Also, this
-      *       code is space inefficient.
-      *
       * \note_about_checking_solutions
       *
       * \note_about_arbitrary_choice_of_solution
@@ -138,17 +171,17 @@ template<typename _MatrixType> class ColPivHouseholderQR
       * Output: \verbinclude ColPivHouseholderQR_solve.out
       */
     template<typename Rhs>
-    inline const internal::solve_retval<ColPivHouseholderQR, Rhs>
+    inline const Solve<ColPivHouseholderQR, Rhs>
     solve(const MatrixBase<Rhs>& b) const
     {
       eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
-      return internal::solve_retval<ColPivHouseholderQR, Rhs>(*this, b.derived());
+      return Solve<ColPivHouseholderQR, Rhs>(*this, b.derived());
     }
 
-    HouseholderSequenceType householderQ(void) const;
-    HouseholderSequenceType matrixQ(void) const
+    HouseholderSequenceType householderQ() const;
+    HouseholderSequenceType matrixQ() const
     {
-      return householderQ(); 
+      return householderQ();
     }
 
     /** \returns a reference to the matrix where the Householder QR decomposition is stored
@@ -158,14 +191,14 @@ template<typename _MatrixType> class ColPivHouseholderQR
       eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
       return m_qr;
     }
-    
-    /** \returns a reference to the matrix where the result Householder QR is stored 
-     * \warning The strict lower part of this matrix contains internal values. 
+
+    /** \returns a reference to the matrix where the result Householder QR is stored
+     * \warning The strict lower part of this matrix contains internal values.
      * Only the upper triangular part should be referenced. To get it, use
      * \code matrixR().template triangularView<Upper>() \endcode
-     * For rank-deficient matrices, use 
-     * \code 
-     * matrixR().topLeftCorner(rank(), rank()).template triangularView<Upper>() 
+     * For rank-deficient matrices, use
+     * \code
+     * matrixR().topLeftCorner(rank(), rank()).template triangularView<Upper>()
      * \endcode
      */
     const MatrixType& matrixR() const
@@ -173,8 +206,9 @@ template<typename _MatrixType> class ColPivHouseholderQR
       eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
       return m_qr;
     }
-    
-    ColPivHouseholderQR& compute(const MatrixType& matrix);
+
+    template<typename InputType>
+    ColPivHouseholderQR& compute(const EigenBase<InputType>& matrix);
 
     /** \returns a const reference to the column permutation matrix */
     const PermutationType& colsPermutation() const
@@ -284,20 +318,17 @@ template<typename _MatrixType> class ColPivHouseholderQR
       * \note If this matrix is not invertible, the returned matrix has undefined coefficients.
       *       Use isInvertible() to first determine whether this matrix is invertible.
       */
-    inline const
-    internal::solve_retval<ColPivHouseholderQR, typename MatrixType::IdentityReturnType>
-    inverse() const
+    inline const Inverse<ColPivHouseholderQR> inverse() const
     {
       eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
-      return internal::solve_retval<ColPivHouseholderQR,typename MatrixType::IdentityReturnType>
-               (*this, MatrixType::Identity(m_qr.rows(), m_qr.cols()));
+      return Inverse<ColPivHouseholderQR>(*this);
     }
 
     inline Index rows() const { return m_qr.rows(); }
     inline Index cols() const { return m_qr.cols(); }
-    
+
     /** \returns a const reference to the vector of Householder coefficients used to represent the factor \c Q.
-      * 
+      *
       * For advanced uses only.
       */
     const HCoeffsType& hCoeffs() const { return m_hCoeffs; }
@@ -370,12 +401,12 @@ template<typename _MatrixType> class ColPivHouseholderQR
       *          diagonal coefficient of R.
       */
     RealScalar maxPivot() const { return m_maxpivot; }
-    
+
     /** \brief Reports whether the QR factorization was succesful.
       *
-      * \note This function always returns \c Success. It is provided for compatibility 
+      * \note This function always returns \c Success. It is provided for compatibility
       * with other factorization routines.
-      * \returns \c Success 
+      * \returns \c Success
       */
     ComputationInfo info() const
     {
@@ -383,19 +414,30 @@ template<typename _MatrixType> class ColPivHouseholderQR
       return Success;
     }
 
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    template<typename RhsType, typename DstType>
+    EIGEN_DEVICE_FUNC
+    void _solve_impl(const RhsType &rhs, DstType &dst) const;
+    #endif
+
   protected:
-    
+
+    friend class CompleteOrthogonalDecomposition<MatrixType>;
+
     static void check_template_parameters()
     {
       EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
     }
-    
+
+    void computeInPlace();
+
     MatrixType m_qr;
     HCoeffsType m_hCoeffs;
     PermutationType m_colsPermutation;
     IntRowVectorType m_colsTranspositions;
     RowVectorType m_temp;
-    RealRowVectorType m_colSqNorms;
+    RealRowVectorType m_colNormsUpdated;
+    RealRowVectorType m_colNormsDirect;
     bool m_isInitialized, m_usePrescribedThreshold;
     RealScalar m_prescribedThreshold, m_maxpivot;
     Index m_nonzero_pivots;
@@ -426,51 +468,57 @@ typename MatrixType::RealScalar ColPivHouseholderQR<MatrixType>::logAbsDetermina
   * \sa class ColPivHouseholderQR, ColPivHouseholderQR(const MatrixType&)
   */
 template<typename MatrixType>
-ColPivHouseholderQR<MatrixType>& ColPivHouseholderQR<MatrixType>::compute(const MatrixType& matrix)
+template<typename InputType>
+ColPivHouseholderQR<MatrixType>& ColPivHouseholderQR<MatrixType>::compute(const EigenBase<InputType>& matrix)
+{
+  m_qr = matrix.derived();
+  computeInPlace();
+  return *this;
+}
+
+template<typename MatrixType>
+void ColPivHouseholderQR<MatrixType>::computeInPlace()
 {
   check_template_parameters();
-  
-  using std::abs;
-  Index rows = matrix.rows();
-  Index cols = matrix.cols();
-  Index size = matrix.diagonalSize();
-  
+
   // the column permutation is stored as int indices, so just to be sure:
-  eigen_assert(cols<=NumTraits<int>::highest());
+  eigen_assert(m_qr.cols()<=NumTraits<int>::highest());
+
+  using std::abs;
+
+  Index rows = m_qr.rows();
+  Index cols = m_qr.cols();
+  Index size = m_qr.diagonalSize();
 
-  m_qr = matrix;
   m_hCoeffs.resize(size);
 
   m_temp.resize(cols);
 
-  m_colsTranspositions.resize(matrix.cols());
+  m_colsTranspositions.resize(m_qr.cols());
   Index number_of_transpositions = 0;
 
-  m_colSqNorms.resize(cols);
-  for(Index k = 0; k < cols; ++k)
-    m_colSqNorms.coeffRef(k) = m_qr.col(k).squaredNorm();
+  m_colNormsUpdated.resize(cols);
+  m_colNormsDirect.resize(cols);
+  for (Index k = 0; k < cols; ++k) {
+    // colNormsDirect(k) caches the most recent directly computed norm of
+    // column k.
+    m_colNormsDirect.coeffRef(k) = m_qr.col(k).norm();
+    m_colNormsUpdated.coeffRef(k) = m_colNormsDirect.coeffRef(k);
+  }
 
-  RealScalar threshold_helper = m_colSqNorms.maxCoeff() * numext::abs2(NumTraits<Scalar>::epsilon()) / RealScalar(rows);
+  RealScalar threshold_helper =  numext::abs2<Scalar>(m_colNormsUpdated.maxCoeff() * NumTraits<Scalar>::epsilon()) / RealScalar(rows);
+  RealScalar norm_downdate_threshold = numext::sqrt(NumTraits<Scalar>::epsilon());
 
   m_nonzero_pivots = size; // the generic case is that in which all pivots are nonzero (invertible case)
   m_maxpivot = RealScalar(0);
 
   for(Index k = 0; k < size; ++k)
   {
-    // first, we look up in our table m_colSqNorms which column has the biggest squared norm
+    // first, we look up in our table m_colNormsUpdated which column has the biggest norm
     Index biggest_col_index;
-    RealScalar biggest_col_sq_norm = m_colSqNorms.tail(cols-k).maxCoeff(&biggest_col_index);
+    RealScalar biggest_col_sq_norm = numext::abs2(m_colNormsUpdated.tail(cols-k).maxCoeff(&biggest_col_index));
     biggest_col_index += k;
 
-    // since our table m_colSqNorms accumulates imprecision at every step, we must now recompute
-    // the actual squared norm of the selected column.
-    // Note that not doing so does result in solve() sometimes returning inf/nan values
-    // when running the unit test with 1000 repetitions.
-    biggest_col_sq_norm = m_qr.col(biggest_col_index).tail(rows-k).squaredNorm();
-
-    // we store that back into our table: it can't hurt to correct our table.
-    m_colSqNorms.coeffRef(biggest_col_index) = biggest_col_sq_norm;
-
     // Track the number of meaningful pivots but do not stop the decomposition to make
     // sure that the initial matrix is properly reproduced. See bug 941.
     if(m_nonzero_pivots==size && biggest_col_sq_norm < threshold_helper * RealScalar(rows-k))
@@ -480,7 +528,8 @@ ColPivHouseholderQR<MatrixType>& ColPivHouseholderQR<MatrixType>::compute(const
     m_colsTranspositions.coeffRef(k) = biggest_col_index;
     if(k != biggest_col_index) {
       m_qr.col(k).swap(m_qr.col(biggest_col_index));
-      std::swap(m_colSqNorms.coeffRef(k), m_colSqNorms.coeffRef(biggest_col_index));
+      std::swap(m_colNormsUpdated.coeffRef(k), m_colNormsUpdated.coeffRef(biggest_col_index));
+      std::swap(m_colNormsDirect.coeffRef(k), m_colNormsDirect.coeffRef(biggest_col_index));
       ++number_of_transpositions;
     }
 
@@ -498,8 +547,28 @@ ColPivHouseholderQR<MatrixType>& ColPivHouseholderQR<MatrixType>::compute(const
     m_qr.bottomRightCorner(rows-k, cols-k-1)
         .applyHouseholderOnTheLeft(m_qr.col(k).tail(rows-k-1), m_hCoeffs.coeffRef(k), &m_temp.coeffRef(k+1));
 
-    // update our table of squared norms of the columns
-    m_colSqNorms.tail(cols-k-1) -= m_qr.row(k).tail(cols-k-1).cwiseAbs2();
+    // update our table of norms of the columns
+    for (Index j = k + 1; j < cols; ++j) {
+      // The following implements the stable norm downgrade step discussed in
+      // http://www.netlib.org/lapack/lawnspdf/lawn176.pdf
+      // and used in LAPACK routines xGEQPF and xGEQP3.
+      // See lines 278-297 in http://www.netlib.org/lapack/explore-html/dc/df4/sgeqpf_8f_source.html
+      if (m_colNormsUpdated.coeffRef(j) != 0) {
+        RealScalar temp = abs(m_qr.coeffRef(k, j)) / m_colNormsUpdated.coeffRef(j);
+        temp = (RealScalar(1) + temp) * (RealScalar(1) - temp);
+        temp = temp < 0 ? 0 : temp;
+        RealScalar temp2 = temp * numext::abs2<Scalar>(m_colNormsUpdated.coeffRef(j) /
+                                                       m_colNormsDirect.coeffRef(j));
+        if (temp2 <= norm_downdate_threshold) {
+          // The updated norm has become too inaccurate so re-compute the column
+          // norm directly.
+          m_colNormsDirect.coeffRef(j) = m_qr.col(j).tail(rows - k - 1).norm();
+          m_colNormsUpdated.coeffRef(j) = m_colNormsDirect.coeffRef(j);
+        } else {
+          m_colNormsUpdated.coeffRef(j) *= numext::sqrt(temp);
+        }
+      }
+    }
   }
 
   m_colsPermutation.setIdentity(PermIndexType(cols));
@@ -508,46 +577,50 @@ ColPivHouseholderQR<MatrixType>& ColPivHouseholderQR<MatrixType>::compute(const
 
   m_det_pq = (number_of_transpositions%2) ? -1 : 1;
   m_isInitialized = true;
-
-  return *this;
 }
 
-namespace internal {
-
-template<typename _MatrixType, typename Rhs>
-struct solve_retval<ColPivHouseholderQR<_MatrixType>, Rhs>
-  : solve_retval_base<ColPivHouseholderQR<_MatrixType>, Rhs>
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template<typename _MatrixType>
+template<typename RhsType, typename DstType>
+void ColPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const
 {
-  EIGEN_MAKE_SOLVE_HELPERS(ColPivHouseholderQR<_MatrixType>,Rhs)
+  eigen_assert(rhs.rows() == rows());
+
+  const Index nonzero_pivots = nonzeroPivots();
 
-  template<typename Dest> void evalTo(Dest& dst) const
+  if(nonzero_pivots == 0)
   {
-    eigen_assert(rhs().rows() == dec().rows());
+    dst.setZero();
+    return;
+  }
 
-    const Index cols = dec().cols(),
-				nonzero_pivots = dec().nonzeroPivots();
+  typename RhsType::PlainObject c(rhs);
 
-    if(nonzero_pivots == 0)
-    {
-      dst.setZero();
-      return;
-    }
+  // Note that the matrix Q = H_0^* H_1^*... so its inverse is Q^* = (H_0 H_1 ...)^T
+  c.applyOnTheLeft(householderSequence(m_qr, m_hCoeffs)
+                    .setLength(nonzero_pivots)
+                    .transpose()
+    );
 
-    typename Rhs::PlainObject c(rhs());
+  m_qr.topLeftCorner(nonzero_pivots, nonzero_pivots)
+      .template triangularView<Upper>()
+      .solveInPlace(c.topRows(nonzero_pivots));
 
-    // Note that the matrix Q = H_0^* H_1^*... so its inverse is Q^* = (H_0 H_1 ...)^T
-    c.applyOnTheLeft(householderSequence(dec().matrixQR(), dec().hCoeffs())
-                     .setLength(dec().nonzeroPivots())
-		     .transpose()
-      );
+  for(Index i = 0; i < nonzero_pivots; ++i) dst.row(m_colsPermutation.indices().coeff(i)) = c.row(i);
+  for(Index i = nonzero_pivots; i < cols(); ++i) dst.row(m_colsPermutation.indices().coeff(i)).setZero();
+}
+#endif
 
-    dec().matrixR()
-       .topLeftCorner(nonzero_pivots, nonzero_pivots)
-       .template triangularView<Upper>()
-       .solveInPlace(c.topRows(nonzero_pivots));
+namespace internal {
 
-    for(Index i = 0; i < nonzero_pivots; ++i) dst.row(dec().colsPermutation().indices().coeff(i)) = c.row(i);
-    for(Index i = nonzero_pivots; i < cols; ++i) dst.row(dec().colsPermutation().indices().coeff(i)).setZero();
+template<typename DstXprType, typename MatrixType>
+struct Assignment<DstXprType, Inverse<ColPivHouseholderQR<MatrixType> >, internal::assign_op<typename DstXprType::Scalar,typename ColPivHouseholderQR<MatrixType>::Scalar>, Dense2Dense>
+{
+  typedef ColPivHouseholderQR<MatrixType> QrType;
+  typedef Inverse<QrType> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename QrType::Scalar> &)
+  {
+    dst = src.nestedExpression().solve(MatrixType::Identity(src.rows(), src.cols()));
   }
 };
 
diff --git a/Eigen/src/QR/ColPivHouseholderQR_MKL.h b/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h
index b5b198326..4e9651f83 100644
--- a/Eigen/src/QR/ColPivHouseholderQR_MKL.h
+++ b/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h
@@ -25,37 +25,34 @@
  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
  ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
+ *   Content : Eigen bindings to LAPACKe
  *    Householder QR decomposition of a matrix with column pivoting based on
  *    LAPACKE_?geqp3 function.
  ********************************************************************************
 */
 
-#ifndef EIGEN_COLPIVOTINGHOUSEHOLDERQR_MKL_H
-#define EIGEN_COLPIVOTINGHOUSEHOLDERQR_MKL_H
-
-#include "Eigen/src/Core/util/MKL_support.h"
+#ifndef EIGEN_COLPIVOTINGHOUSEHOLDERQR_LAPACKE_H
+#define EIGEN_COLPIVOTINGHOUSEHOLDERQR_LAPACKE_H
 
 namespace Eigen { 
 
-/** \internal Specialization for the data types supported by MKL */
+/** \internal Specialization for the data types supported by LAPACKe */
 
-#define EIGEN_MKL_QR_COLPIV(EIGTYPE, MKLTYPE, MKLPREFIX, EIGCOLROW, MKLCOLROW) \
-template<> inline \
+#define EIGEN_LAPACKE_QR_COLPIV(EIGTYPE, LAPACKE_TYPE, LAPACKE_PREFIX, EIGCOLROW, LAPACKE_COLROW) \
+template<> template<typename InputType> inline \
 ColPivHouseholderQR<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic> >& \
 ColPivHouseholderQR<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic> >::compute( \
-              const Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>& matrix) \
+              const EigenBase<InputType>& matrix) \
 \
 { \
   using std::abs; \
   typedef Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic> MatrixType; \
-  typedef MatrixType::Scalar Scalar; \
   typedef MatrixType::RealScalar RealScalar; \
   Index rows = matrix.rows();\
   Index cols = matrix.cols();\
-  Index size = matrix.diagonalSize();\
 \
   m_qr = matrix;\
+  Index size = m_qr.diagonalSize();\
   m_hCoeffs.resize(size);\
 \
   m_colsTranspositions.resize(cols);\
@@ -66,34 +63,35 @@ ColPivHouseholderQR<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynami
   m_colsPermutation.resize(cols); \
   m_colsPermutation.indices().setZero(); \
 \
-  lapack_int lda = m_qr.outerStride(), i; \
-  lapack_int matrix_order = MKLCOLROW; \
-  LAPACKE_##MKLPREFIX##geqp3( matrix_order, rows, cols, (MKLTYPE*)m_qr.data(), lda, (lapack_int*)m_colsPermutation.indices().data(), (MKLTYPE*)m_hCoeffs.data()); \
+  lapack_int lda = internal::convert_index<lapack_int,Index>(m_qr.outerStride()); \
+  lapack_int matrix_order = LAPACKE_COLROW; \
+  LAPACKE_##LAPACKE_PREFIX##geqp3( matrix_order, internal::convert_index<lapack_int,Index>(rows), internal::convert_index<lapack_int,Index>(cols), \
+                              (LAPACKE_TYPE*)m_qr.data(), lda, (lapack_int*)m_colsPermutation.indices().data(), (LAPACKE_TYPE*)m_hCoeffs.data()); \
   m_isInitialized = true; \
   m_maxpivot=m_qr.diagonal().cwiseAbs().maxCoeff(); \
   m_hCoeffs.adjointInPlace(); \
   RealScalar premultiplied_threshold = abs(m_maxpivot) * threshold(); \
   lapack_int *perm = m_colsPermutation.indices().data(); \
-  for(i=0;i<size;i++) { \
+  for(Index i=0;i<size;i++) { \
     m_nonzero_pivots += (abs(m_qr.coeff(i,i)) > premultiplied_threshold);\
   } \
-  for(i=0;i<cols;i++) perm[i]--;\
+  for(Index i=0;i<cols;i++) perm[i]--;\
 \
   /*m_det_pq = (number_of_transpositions%2) ? -1 : 1;  // TODO: It's not needed now; fix upon availability in Eigen */ \
 \
   return *this; \
 }
 
-EIGEN_MKL_QR_COLPIV(double,   double,        d, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_QR_COLPIV(float,    float,         s, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_QR_COLPIV(dcomplex, MKL_Complex16, z, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_QR_COLPIV(scomplex, MKL_Complex8,  c, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_QR_COLPIV(double,   double,        d, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_QR_COLPIV(float,    float,         s, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_QR_COLPIV(dcomplex, lapack_complex_double, z, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_QR_COLPIV(scomplex, lapack_complex_float,  c, ColMajor, LAPACK_COL_MAJOR)
 
-EIGEN_MKL_QR_COLPIV(double,   double,        d, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_MKL_QR_COLPIV(float,    float,         s, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_MKL_QR_COLPIV(dcomplex, MKL_Complex16, z, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_MKL_QR_COLPIV(scomplex, MKL_Complex8,  c, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_QR_COLPIV(double,   double,        d, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_QR_COLPIV(float,    float,         s, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_QR_COLPIV(dcomplex, lapack_complex_double, z, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_QR_COLPIV(scomplex, lapack_complex_float,  c, RowMajor, LAPACK_ROW_MAJOR)
 
 } // end namespace Eigen
 
-#endif // EIGEN_COLPIVOTINGHOUSEHOLDERQR_MKL_H
+#endif // EIGEN_COLPIVOTINGHOUSEHOLDERQR_LAPACKE_H
diff --git a/Eigen/src/QR/CompleteOrthogonalDecomposition.h b/Eigen/src/QR/CompleteOrthogonalDecomposition.h
new file mode 100644
index 000000000..34c637b70
--- /dev/null
+++ b/Eigen/src/QR/CompleteOrthogonalDecomposition.h
@@ -0,0 +1,562 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Rasmus Munk Larsen <rmlarsen@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPLETEORTHOGONALDECOMPOSITION_H
+#define EIGEN_COMPLETEORTHOGONALDECOMPOSITION_H
+
+namespace Eigen {
+
+namespace internal {
+template <typename _MatrixType>
+struct traits<CompleteOrthogonalDecomposition<_MatrixType> >
+    : traits<_MatrixType> {
+  enum { Flags = 0 };
+};
+
+}  // end namespace internal
+
+/** \ingroup QR_Module
+  *
+  * \class CompleteOrthogonalDecomposition
+  *
+  * \brief Complete orthogonal decomposition (COD) of a matrix.
+  *
+  * \param MatrixType the type of the matrix of which we are computing the COD.
+  *
+  * This class performs a rank-revealing complete orthogonal decomposition of a
+  * matrix  \b A into matrices \b P, \b Q, \b T, and \b Z such that
+  * \f[
+  *  \mathbf{A} \, \mathbf{P} = \mathbf{Q} \,
+  *                     \begin{bmatrix} \mathbf{T} &  \mathbf{0} \\
+  *                                     \mathbf{0} & \mathbf{0} \end{bmatrix} \, \mathbf{Z}
+  * \f]
+  * by using Householder transformations. Here, \b P is a permutation matrix,
+  * \b Q and \b Z are unitary matrices and \b T an upper triangular matrix of
+  * size rank-by-rank. \b A may be rank deficient.
+  *
+  * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
+  * 
+  * \sa MatrixBase::completeOrthogonalDecomposition()
+  */
+template <typename _MatrixType>
+class CompleteOrthogonalDecomposition {
+ public:
+  typedef _MatrixType MatrixType;
+  enum {
+    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+    ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+  };
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef typename MatrixType::StorageIndex StorageIndex;
+  typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
+  typedef PermutationMatrix<ColsAtCompileTime, MaxColsAtCompileTime>
+      PermutationType;
+  typedef typename internal::plain_row_type<MatrixType, Index>::type
+      IntRowVectorType;
+  typedef typename internal::plain_row_type<MatrixType>::type RowVectorType;
+  typedef typename internal::plain_row_type<MatrixType, RealScalar>::type
+      RealRowVectorType;
+  typedef HouseholderSequence<
+      MatrixType, typename internal::remove_all<
+                      typename HCoeffsType::ConjugateReturnType>::type>
+      HouseholderSequenceType;
+  typedef typename MatrixType::PlainObject PlainObject;
+
+ private:
+  typedef typename PermutationType::Index PermIndexType;
+
+ public:
+  /**
+   * \brief Default Constructor.
+   *
+   * The default constructor is useful in cases in which the user intends to
+   * perform decompositions via
+   * \c CompleteOrthogonalDecomposition::compute(const* MatrixType&).
+   */
+  CompleteOrthogonalDecomposition() : m_cpqr(), m_zCoeffs(), m_temp() {}
+
+  /** \brief Default Constructor with memory preallocation
+   *
+   * Like the default constructor but with preallocation of the internal data
+   * according to the specified problem \a size.
+   * \sa CompleteOrthogonalDecomposition()
+   */
+  CompleteOrthogonalDecomposition(Index rows, Index cols)
+      : m_cpqr(rows, cols), m_zCoeffs((std::min)(rows, cols)), m_temp(cols) {}
+
+  /** \brief Constructs a complete orthogonal decomposition from a given
+   * matrix.
+   *
+   * This constructor computes the complete orthogonal decomposition of the
+   * matrix \a matrix by calling the method compute(). The default
+   * threshold for rank determination will be used. It is a short cut for:
+   *
+   * \code
+   * CompleteOrthogonalDecomposition<MatrixType> cod(matrix.rows(),
+   *                                                 matrix.cols());
+   * cod.setThreshold(Default);
+   * cod.compute(matrix);
+   * \endcode
+   *
+   * \sa compute()
+   */
+  template <typename InputType>
+  explicit CompleteOrthogonalDecomposition(const EigenBase<InputType>& matrix)
+      : m_cpqr(matrix.rows(), matrix.cols()),
+        m_zCoeffs((std::min)(matrix.rows(), matrix.cols())),
+        m_temp(matrix.cols())
+  {
+    compute(matrix.derived());
+  }
+
+  /** \brief Constructs a complete orthogonal decomposition from a given matrix
+    *
+    * This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when \c MatrixType is a Eigen::Ref.
+    *
+    * \sa CompleteOrthogonalDecomposition(const EigenBase&)
+    */
+  template<typename InputType>
+  explicit CompleteOrthogonalDecomposition(EigenBase<InputType>& matrix)
+    : m_cpqr(matrix.derived()),
+      m_zCoeffs((std::min)(matrix.rows(), matrix.cols())),
+      m_temp(matrix.cols())
+  {
+    computeInPlace();
+  }
+
+
+  /** This method computes the minimum-norm solution X to a least squares
+   * problem \f[\mathrm{minimize} \|A X - B\|, \f] where \b A is the matrix of
+   * which \c *this is the complete orthogonal decomposition.
+   *
+   * \param b the right-hand sides of the problem to solve.
+   *
+   * \returns a solution.
+   *
+   */
+  template <typename Rhs>
+  inline const Solve<CompleteOrthogonalDecomposition, Rhs> solve(
+      const MatrixBase<Rhs>& b) const {
+    eigen_assert(m_cpqr.m_isInitialized &&
+                 "CompleteOrthogonalDecomposition is not initialized.");
+    return Solve<CompleteOrthogonalDecomposition, Rhs>(*this, b.derived());
+  }
+
+  HouseholderSequenceType householderQ(void) const;
+  HouseholderSequenceType matrixQ(void) const { return m_cpqr.householderQ(); }
+
+  /** \returns the matrix \b Z.
+   */
+  MatrixType matrixZ() const {
+    MatrixType Z = MatrixType::Identity(m_cpqr.cols(), m_cpqr.cols());
+    applyZAdjointOnTheLeftInPlace(Z);
+    return Z.adjoint();
+  }
+
+  /** \returns a reference to the matrix where the complete orthogonal
+   * decomposition is stored
+   */
+  const MatrixType& matrixQTZ() const { return m_cpqr.matrixQR(); }
+
+  /** \returns a reference to the matrix where the complete orthogonal
+   * decomposition is stored.
+   * \warning The strict lower part and \code cols() - rank() \endcode right
+   * columns of this matrix contains internal values.
+   * Only the upper triangular part should be referenced. To get it, use
+   * \code matrixT().template triangularView<Upper>() \endcode
+   * For rank-deficient matrices, use
+   * \code
+   * matrixR().topLeftCorner(rank(), rank()).template triangularView<Upper>()
+   * \endcode
+   */
+  const MatrixType& matrixT() const { return m_cpqr.matrixQR(); }
+
+  template <typename InputType>
+  CompleteOrthogonalDecomposition& compute(const EigenBase<InputType>& matrix) {
+    // Compute the column pivoted QR factorization A P = Q R.
+    m_cpqr.compute(matrix);
+    computeInPlace();
+    return *this;
+  }
+
+  /** \returns a const reference to the column permutation matrix */
+  const PermutationType& colsPermutation() const {
+    return m_cpqr.colsPermutation();
+  }
+
+  /** \returns the absolute value of the determinant of the matrix of which
+   * *this is the complete orthogonal decomposition. It has only linear
+   * complexity (that is, O(n) where n is the dimension of the square matrix)
+   * as the complete orthogonal decomposition has already been computed.
+   *
+   * \note This is only for square matrices.
+   *
+   * \warning a determinant can be very big or small, so for matrices
+   * of large enough dimension, there is a risk of overflow/underflow.
+   * One way to work around that is to use logAbsDeterminant() instead.
+   *
+   * \sa logAbsDeterminant(), MatrixBase::determinant()
+   */
+  typename MatrixType::RealScalar absDeterminant() const;
+
+  /** \returns the natural log of the absolute value of the determinant of the
+   * matrix of which *this is the complete orthogonal decomposition. It has
+   * only linear complexity (that is, O(n) where n is the dimension of the
+   * square matrix) as the complete orthogonal decomposition has already been
+   * computed.
+   *
+   * \note This is only for square matrices.
+   *
+   * \note This method is useful to work around the risk of overflow/underflow
+   * that's inherent to determinant computation.
+   *
+   * \sa absDeterminant(), MatrixBase::determinant()
+   */
+  typename MatrixType::RealScalar logAbsDeterminant() const;
+
+  /** \returns the rank of the matrix of which *this is the complete orthogonal
+   * decomposition.
+   *
+   * \note This method has to determine which pivots should be considered
+   * nonzero. For that, it uses the threshold value that you can control by
+   * calling setThreshold(const RealScalar&).
+   */
+  inline Index rank() const { return m_cpqr.rank(); }
+
+  /** \returns the dimension of the kernel of the matrix of which *this is the
+   * complete orthogonal decomposition.
+   *
+   * \note This method has to determine which pivots should be considered
+   * nonzero. For that, it uses the threshold value that you can control by
+   * calling setThreshold(const RealScalar&).
+   */
+  inline Index dimensionOfKernel() const { return m_cpqr.dimensionOfKernel(); }
+
+  /** \returns true if the matrix of which *this is the decomposition represents
+   * an injective linear map, i.e. has trivial kernel; false otherwise.
+   *
+   * \note This method has to determine which pivots should be considered
+   * nonzero. For that, it uses the threshold value that you can control by
+   * calling setThreshold(const RealScalar&).
+   */
+  inline bool isInjective() const { return m_cpqr.isInjective(); }
+
+  /** \returns true if the matrix of which *this is the decomposition represents
+   * a surjective linear map; false otherwise.
+   *
+   * \note This method has to determine which pivots should be considered
+   * nonzero. For that, it uses the threshold value that you can control by
+   * calling setThreshold(const RealScalar&).
+   */
+  inline bool isSurjective() const { return m_cpqr.isSurjective(); }
+
+  /** \returns true if the matrix of which *this is the complete orthogonal
+   * decomposition is invertible.
+   *
+   * \note This method has to determine which pivots should be considered
+   * nonzero. For that, it uses the threshold value that you can control by
+   * calling setThreshold(const RealScalar&).
+   */
+  inline bool isInvertible() const { return m_cpqr.isInvertible(); }
+
+  /** \returns the pseudo-inverse of the matrix of which *this is the complete
+   * orthogonal decomposition.
+   * \warning: Do not compute \c this->pseudoInverse()*rhs to solve a linear systems.
+   * It is more efficient and numerically stable to call \c this->solve(rhs).
+   */
+  inline const Inverse<CompleteOrthogonalDecomposition> pseudoInverse() const
+  {
+    return Inverse<CompleteOrthogonalDecomposition>(*this);
+  }
+
+  inline Index rows() const { return m_cpqr.rows(); }
+  inline Index cols() const { return m_cpqr.cols(); }
+
+  /** \returns a const reference to the vector of Householder coefficients used
+   * to represent the factor \c Q.
+   *
+   * For advanced uses only.
+   */
+  inline const HCoeffsType& hCoeffs() const { return m_cpqr.hCoeffs(); }
+
+  /** \returns a const reference to the vector of Householder coefficients
+   * used to represent the factor \c Z.
+   *
+   * For advanced uses only.
+   */
+  const HCoeffsType& zCoeffs() const { return m_zCoeffs; }
+
+  /** Allows to prescribe a threshold to be used by certain methods, such as
+   * rank(), who need to determine when pivots are to be considered nonzero.
+   * Most be called before calling compute().
+   *
+   * When it needs to get the threshold value, Eigen calls threshold(). By
+   * default, this uses a formula to automatically determine a reasonable
+   * threshold. Once you have called the present method
+   * setThreshold(const RealScalar&), your value is used instead.
+   *
+   * \param threshold The new value to use as the threshold.
+   *
+   * A pivot will be considered nonzero if its absolute value is strictly
+   * greater than
+   *  \f$ \vert pivot \vert \leqslant threshold \times \vert maxpivot \vert \f$
+   * where maxpivot is the biggest pivot.
+   *
+   * If you want to come back to the default behavior, call
+   * setThreshold(Default_t)
+   */
+  CompleteOrthogonalDecomposition& setThreshold(const RealScalar& threshold) {
+    m_cpqr.setThreshold(threshold);
+    return *this;
+  }
+
+  /** Allows to come back to the default behavior, letting Eigen use its default
+   * formula for determining the threshold.
+   *
+   * You should pass the special object Eigen::Default as parameter here.
+   * \code qr.setThreshold(Eigen::Default); \endcode
+   *
+   * See the documentation of setThreshold(const RealScalar&).
+   */
+  CompleteOrthogonalDecomposition& setThreshold(Default_t) {
+    m_cpqr.setThreshold(Default);
+    return *this;
+  }
+
+  /** Returns the threshold that will be used by certain methods such as rank().
+   *
+   * See the documentation of setThreshold(const RealScalar&).
+   */
+  RealScalar threshold() const { return m_cpqr.threshold(); }
+
+  /** \returns the number of nonzero pivots in the complete orthogonal
+   * decomposition. Here nonzero is meant in the exact sense, not in a
+   * fuzzy sense. So that notion isn't really intrinsically interesting,
+   * but it is still useful when implementing algorithms.
+   *
+   * \sa rank()
+   */
+  inline Index nonzeroPivots() const { return m_cpqr.nonzeroPivots(); }
+
+  /** \returns the absolute value of the biggest pivot, i.e. the biggest
+   *          diagonal coefficient of R.
+   */
+  inline RealScalar maxPivot() const { return m_cpqr.maxPivot(); }
+
+  /** \brief Reports whether the complete orthogonal decomposition was
+   * succesful.
+   *
+   * \note This function always returns \c Success. It is provided for
+   * compatibility
+   * with other factorization routines.
+   * \returns \c Success
+   */
+  ComputationInfo info() const {
+    eigen_assert(m_cpqr.m_isInitialized && "Decomposition is not initialized.");
+    return Success;
+  }
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  template <typename RhsType, typename DstType>
+  EIGEN_DEVICE_FUNC void _solve_impl(const RhsType& rhs, DstType& dst) const;
+#endif
+
+ protected:
+  static void check_template_parameters() {
+    EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
+  }
+
+  void computeInPlace();
+
+  /** Overwrites \b rhs with \f$ \mathbf{Z}^* * \mathbf{rhs} \f$.
+   */
+  template <typename Rhs>
+  void applyZAdjointOnTheLeftInPlace(Rhs& rhs) const;
+
+  ColPivHouseholderQR<MatrixType> m_cpqr;
+  HCoeffsType m_zCoeffs;
+  RowVectorType m_temp;
+};
+
+template <typename MatrixType>
+typename MatrixType::RealScalar
+CompleteOrthogonalDecomposition<MatrixType>::absDeterminant() const {
+  return m_cpqr.absDeterminant();
+}
+
+template <typename MatrixType>
+typename MatrixType::RealScalar
+CompleteOrthogonalDecomposition<MatrixType>::logAbsDeterminant() const {
+  return m_cpqr.logAbsDeterminant();
+}
+
+/** Performs the complete orthogonal decomposition of the given matrix \a
+ * matrix. The result of the factorization is stored into \c *this, and a
+ * reference to \c *this is returned.
+ *
+ * \sa class CompleteOrthogonalDecomposition,
+ * CompleteOrthogonalDecomposition(const MatrixType&)
+ */
+template <typename MatrixType>
+void CompleteOrthogonalDecomposition<MatrixType>::computeInPlace()
+{
+  check_template_parameters();
+
+  // the column permutation is stored as int indices, so just to be sure:
+  eigen_assert(m_cpqr.cols() <= NumTraits<int>::highest());
+
+  const Index rank = m_cpqr.rank();
+  const Index cols = m_cpqr.cols();
+  const Index rows = m_cpqr.rows();
+  m_zCoeffs.resize((std::min)(rows, cols));
+  m_temp.resize(cols);
+
+  if (rank < cols) {
+    // We have reduced the (permuted) matrix to the form
+    //   [R11 R12]
+    //   [ 0  R22]
+    // where R11 is r-by-r (r = rank) upper triangular, R12 is
+    // r-by-(n-r), and R22 is empty or the norm of R22 is negligible.
+    // We now compute the complete orthogonal decomposition by applying
+    // Householder transformations from the right to the upper trapezoidal
+    // matrix X = [R11 R12] to zero out R12 and obtain the factorization
+    // [R11 R12] = [T11 0] * Z, where T11 is r-by-r upper triangular and
+    // Z = Z(0) * Z(1) ... Z(r-1) is an n-by-n orthogonal matrix.
+    // We store the data representing Z in R12 and m_zCoeffs.
+    for (Index k = rank - 1; k >= 0; --k) {
+      if (k != rank - 1) {
+        // Given the API for Householder reflectors, it is more convenient if
+        // we swap the leading parts of columns k and r-1 (zero-based) to form
+        // the matrix X_k = [X(0:k, k), X(0:k, r:n)]
+        m_cpqr.m_qr.col(k).head(k + 1).swap(
+            m_cpqr.m_qr.col(rank - 1).head(k + 1));
+      }
+      // Construct Householder reflector Z(k) to zero out the last row of X_k,
+      // i.e. choose Z(k) such that
+      // [X(k, k), X(k, r:n)] * Z(k) = [beta, 0, .., 0].
+      RealScalar beta;
+      m_cpqr.m_qr.row(k)
+          .tail(cols - rank + 1)
+          .makeHouseholderInPlace(m_zCoeffs(k), beta);
+      m_cpqr.m_qr(k, rank - 1) = beta;
+      if (k > 0) {
+        // Apply Z(k) to the first k rows of X_k
+        m_cpqr.m_qr.topRightCorner(k, cols - rank + 1)
+            .applyHouseholderOnTheRight(
+                m_cpqr.m_qr.row(k).tail(cols - rank).transpose(), m_zCoeffs(k),
+                &m_temp(0));
+      }
+      if (k != rank - 1) {
+        // Swap X(0:k,k) back to its proper location.
+        m_cpqr.m_qr.col(k).head(k + 1).swap(
+            m_cpqr.m_qr.col(rank - 1).head(k + 1));
+      }
+    }
+  }
+}
+
+template <typename MatrixType>
+template <typename Rhs>
+void CompleteOrthogonalDecomposition<MatrixType>::applyZAdjointOnTheLeftInPlace(
+    Rhs& rhs) const {
+  const Index cols = this->cols();
+  const Index nrhs = rhs.cols();
+  const Index rank = this->rank();
+  Matrix<typename MatrixType::Scalar, Dynamic, 1> temp((std::max)(cols, nrhs));
+  for (Index k = 0; k < rank; ++k) {
+    if (k != rank - 1) {
+      rhs.row(k).swap(rhs.row(rank - 1));
+    }
+    rhs.middleRows(rank - 1, cols - rank + 1)
+        .applyHouseholderOnTheLeft(
+            matrixQTZ().row(k).tail(cols - rank).adjoint(), zCoeffs()(k),
+            &temp(0));
+    if (k != rank - 1) {
+      rhs.row(k).swap(rhs.row(rank - 1));
+    }
+  }
+}
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template <typename _MatrixType>
+template <typename RhsType, typename DstType>
+void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl(
+    const RhsType& rhs, DstType& dst) const {
+  eigen_assert(rhs.rows() == this->rows());
+
+  const Index rank = this->rank();
+  if (rank == 0) {
+    dst.setZero();
+    return;
+  }
+
+  // Compute c = Q^* * rhs
+  // Note that the matrix Q = H_0^* H_1^*... so its inverse is
+  // Q^* = (H_0 H_1 ...)^T
+  typename RhsType::PlainObject c(rhs);
+  c.applyOnTheLeft(
+      householderSequence(matrixQTZ(), hCoeffs()).setLength(rank).transpose());
+
+  // Solve T z = c(1:rank, :)
+  dst.topRows(rank) = matrixT()
+                          .topLeftCorner(rank, rank)
+                          .template triangularView<Upper>()
+                          .solve(c.topRows(rank));
+
+  const Index cols = this->cols();
+  if (rank < cols) {
+    // Compute y = Z^* * [ z ]
+    //                   [ 0 ]
+    dst.bottomRows(cols - rank).setZero();
+    applyZAdjointOnTheLeftInPlace(dst);
+  }
+
+  // Undo permutation to get x = P^{-1} * y.
+  dst = colsPermutation() * dst;
+}
+#endif
+
+namespace internal {
+
+template<typename DstXprType, typename MatrixType>
+struct Assignment<DstXprType, Inverse<CompleteOrthogonalDecomposition<MatrixType> >, internal::assign_op<typename DstXprType::Scalar,typename CompleteOrthogonalDecomposition<MatrixType>::Scalar>, Dense2Dense>
+{
+  typedef CompleteOrthogonalDecomposition<MatrixType> CodType;
+  typedef Inverse<CodType> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename CodType::Scalar> &)
+  {
+    dst = src.nestedExpression().solve(MatrixType::Identity(src.rows(), src.rows()));
+  }
+};
+
+} // end namespace internal
+
+/** \returns the matrix Q as a sequence of householder transformations */
+template <typename MatrixType>
+typename CompleteOrthogonalDecomposition<MatrixType>::HouseholderSequenceType
+CompleteOrthogonalDecomposition<MatrixType>::householderQ() const {
+  return m_cpqr.householderQ();
+}
+
+/** \return the complete orthogonal decomposition of \c *this.
+  *
+  * \sa class CompleteOrthogonalDecomposition
+  */
+template <typename Derived>
+const CompleteOrthogonalDecomposition<typename MatrixBase<Derived>::PlainObject>
+MatrixBase<Derived>::completeOrthogonalDecomposition() const {
+  return CompleteOrthogonalDecomposition<PlainObject>(eval());
+}
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_COMPLETEORTHOGONALDECOMPOSITION_H
diff --git a/Eigen/src/QR/FullPivHouseholderQR.h b/Eigen/src/QR/FullPivHouseholderQR.h
index 0b39966e1..e489bddc2 100644
--- a/Eigen/src/QR/FullPivHouseholderQR.h
+++ b/Eigen/src/QR/FullPivHouseholderQR.h
@@ -15,6 +15,12 @@ namespace Eigen {
 
 namespace internal {
 
+template<typename _MatrixType> struct traits<FullPivHouseholderQR<_MatrixType> >
+ : traits<_MatrixType>
+{
+  enum { Flags = 0 };
+};
+
 template<typename MatrixType> struct FullPivHouseholderQRMatrixQReturnType;
 
 template<typename MatrixType>
@@ -23,7 +29,7 @@ struct traits<FullPivHouseholderQRMatrixQReturnType<MatrixType> >
   typedef typename MatrixType::PlainObject ReturnType;
 };
 
-}
+} // end namespace internal
 
 /** \ingroup QR_Module
   *
@@ -31,19 +37,21 @@ struct traits<FullPivHouseholderQRMatrixQReturnType<MatrixType> >
   *
   * \brief Householder rank-revealing QR decomposition of a matrix with full pivoting
   *
-  * \param MatrixType the type of the matrix of which we are computing the QR decomposition
+  * \tparam _MatrixType the type of the matrix of which we are computing the QR decomposition
   *
-  * This class performs a rank-revealing QR decomposition of a matrix \b A into matrices \b P, \b Q and \b R
+  * This class performs a rank-revealing QR decomposition of a matrix \b A into matrices \b P, \b P', \b Q and \b R
   * such that 
   * \f[
-  *  \mathbf{A} \, \mathbf{P} = \mathbf{Q} \, \mathbf{R}
+  *  \mathbf{P} \, \mathbf{A} \, \mathbf{P}' = \mathbf{Q} \, \mathbf{R}
   * \f]
-  * by using Householder transformations. Here, \b P is a permutation matrix, \b Q a unitary matrix and \b R an 
-  * upper triangular matrix.
+  * by using Householder transformations. Here, \b P and \b P' are permutation matrices, \b Q a unitary matrix 
+  * and \b R an upper triangular matrix.
   *
   * This decomposition performs a very prudent full pivoting in order to be rank-revealing and achieve optimal
   * numerical stability. The trade-off is that it is slower than HouseholderQR and ColPivHouseholderQR.
   *
+  * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
+  * 
   * \sa MatrixBase::fullPivHouseholderQr()
   */
 template<typename _MatrixType> class FullPivHouseholderQR
@@ -54,21 +62,22 @@ template<typename _MatrixType> class FullPivHouseholderQR
     enum {
       RowsAtCompileTime = MatrixType::RowsAtCompileTime,
       ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-      Options = MatrixType::Options,
       MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
       MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
     };
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
+    // FIXME should be int
+    typedef typename MatrixType::StorageIndex StorageIndex;
     typedef internal::FullPivHouseholderQRMatrixQReturnType<MatrixType> MatrixQReturnType;
     typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
-    typedef Matrix<Index, 1,
+    typedef Matrix<StorageIndex, 1,
                    EIGEN_SIZE_MIN_PREFER_DYNAMIC(ColsAtCompileTime,RowsAtCompileTime), RowMajor, 1,
                    EIGEN_SIZE_MIN_PREFER_FIXED(MaxColsAtCompileTime,MaxRowsAtCompileTime)> IntDiagSizeVectorType;
     typedef PermutationMatrix<ColsAtCompileTime, MaxColsAtCompileTime> PermutationType;
     typedef typename internal::plain_row_type<MatrixType>::type RowVectorType;
     typedef typename internal::plain_col_type<MatrixType>::type ColVectorType;
+    typedef typename MatrixType::PlainObject PlainObject;
 
     /** \brief Default Constructor.
       *
@@ -113,7 +122,8 @@ template<typename _MatrixType> class FullPivHouseholderQR
       * 
       * \sa compute()
       */
-    FullPivHouseholderQR(const MatrixType& matrix)
+    template<typename InputType>
+    explicit FullPivHouseholderQR(const EigenBase<InputType>& matrix)
       : m_qr(matrix.rows(), matrix.cols()),
         m_hCoeffs((std::min)(matrix.rows(), matrix.cols())),
         m_rows_transpositions((std::min)(matrix.rows(), matrix.cols())),
@@ -123,7 +133,27 @@ template<typename _MatrixType> class FullPivHouseholderQR
         m_isInitialized(false),
         m_usePrescribedThreshold(false)
     {
-      compute(matrix);
+      compute(matrix.derived());
+    }
+
+    /** \brief Constructs a QR factorization from a given matrix
+      *
+      * This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when \c MatrixType is a Eigen::Ref.
+      *
+      * \sa FullPivHouseholderQR(const EigenBase&)
+      */
+    template<typename InputType>
+    explicit FullPivHouseholderQR(EigenBase<InputType>& matrix)
+      : m_qr(matrix.derived()),
+        m_hCoeffs((std::min)(matrix.rows(), matrix.cols())),
+        m_rows_transpositions((std::min)(matrix.rows(), matrix.cols())),
+        m_cols_transpositions((std::min)(matrix.rows(), matrix.cols())),
+        m_cols_permutation(matrix.cols()),
+        m_temp(matrix.cols()),
+        m_isInitialized(false),
+        m_usePrescribedThreshold(false)
+    {
+      computeInPlace();
     }
 
     /** This method finds a solution x to the equation Ax=b, where A is the matrix of which
@@ -134,9 +164,6 @@ template<typename _MatrixType> class FullPivHouseholderQR
       * \returns the exact or least-square solution if the rank is greater or equal to the number of columns of A,
       * and an arbitrary solution otherwise.
       *
-      * \note The case where b is a matrix is not yet implemented. Also, this
-      *       code is space inefficient.
-      *
       * \note_about_checking_solutions
       *
       * \note_about_arbitrary_choice_of_solution
@@ -145,11 +172,11 @@ template<typename _MatrixType> class FullPivHouseholderQR
       * Output: \verbinclude FullPivHouseholderQR_solve.out
       */
     template<typename Rhs>
-    inline const internal::solve_retval<FullPivHouseholderQR, Rhs>
+    inline const Solve<FullPivHouseholderQR, Rhs>
     solve(const MatrixBase<Rhs>& b) const
     {
       eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
-      return internal::solve_retval<FullPivHouseholderQR, Rhs>(*this, b.derived());
+      return Solve<FullPivHouseholderQR, Rhs>(*this, b.derived());
     }
 
     /** \returns Expression object representing the matrix Q
@@ -164,7 +191,8 @@ template<typename _MatrixType> class FullPivHouseholderQR
       return m_qr;
     }
 
-    FullPivHouseholderQR& compute(const MatrixType& matrix);
+    template<typename InputType>
+    FullPivHouseholderQR& compute(const EigenBase<InputType>& matrix);
 
     /** \returns a const reference to the column permutation matrix */
     const PermutationType& colsPermutation() const
@@ -280,13 +308,11 @@ template<typename _MatrixType> class FullPivHouseholderQR
       *
       * \note If this matrix is not invertible, the returned matrix has undefined coefficients.
       *       Use isInvertible() to first determine whether this matrix is invertible.
-      */    inline const
-    internal::solve_retval<FullPivHouseholderQR, typename MatrixType::IdentityReturnType>
-    inverse() const
+      */
+    inline const Inverse<FullPivHouseholderQR> inverse() const
     {
       eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
-      return internal::solve_retval<FullPivHouseholderQR,typename MatrixType::IdentityReturnType>
-               (*this, MatrixType::Identity(m_qr.rows(), m_qr.cols()));
+      return Inverse<FullPivHouseholderQR>(*this);
     }
 
     inline Index rows() const { return m_qr.rows(); }
@@ -366,6 +392,12 @@ template<typename _MatrixType> class FullPivHouseholderQR
       *          diagonal coefficient of U.
       */
     RealScalar maxPivot() const { return m_maxpivot; }
+    
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    template<typename RhsType, typename DstType>
+    EIGEN_DEVICE_FUNC
+    void _solve_impl(const RhsType &rhs, DstType &dst) const;
+    #endif
 
   protected:
     
@@ -374,6 +406,8 @@ template<typename _MatrixType> class FullPivHouseholderQR
       EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
     }
     
+    void computeInPlace();
+    
     MatrixType m_qr;
     HCoeffsType m_hCoeffs;
     IntDiagSizeVectorType m_rows_transpositions;
@@ -411,16 +445,25 @@ typename MatrixType::RealScalar FullPivHouseholderQR<MatrixType>::logAbsDetermin
   * \sa class FullPivHouseholderQR, FullPivHouseholderQR(const MatrixType&)
   */
 template<typename MatrixType>
-FullPivHouseholderQR<MatrixType>& FullPivHouseholderQR<MatrixType>::compute(const MatrixType& matrix)
+template<typename InputType>
+FullPivHouseholderQR<MatrixType>& FullPivHouseholderQR<MatrixType>::compute(const EigenBase<InputType>& matrix)
+{
+  m_qr = matrix.derived();
+  computeInPlace();
+  return *this;
+}
+
+template<typename MatrixType>
+void FullPivHouseholderQR<MatrixType>::computeInPlace()
 {
   check_template_parameters();
-  
+
   using std::abs;
-  Index rows = matrix.rows();
-  Index cols = matrix.cols();
+  Index rows = m_qr.rows();
+  Index cols = m_qr.cols();
   Index size = (std::min)(rows,cols);
 
-  m_qr = matrix;
+  
   m_hCoeffs.resize(size);
 
   m_temp.resize(cols);
@@ -439,13 +482,15 @@ FullPivHouseholderQR<MatrixType>& FullPivHouseholderQR<MatrixType>::compute(cons
   for (Index k = 0; k < size; ++k)
   {
     Index row_of_biggest_in_corner, col_of_biggest_in_corner;
-    RealScalar biggest_in_corner;
+    typedef internal::scalar_score_coeff_op<Scalar> Scoring;
+    typedef typename Scoring::result_type Score;
 
-    biggest_in_corner = m_qr.bottomRightCorner(rows-k, cols-k)
-                            .cwiseAbs()
-                            .maxCoeff(&row_of_biggest_in_corner, &col_of_biggest_in_corner);
+    Score score = m_qr.bottomRightCorner(rows-k, cols-k)
+                      .unaryExpr(Scoring())
+                      .maxCoeff(&row_of_biggest_in_corner, &col_of_biggest_in_corner);
     row_of_biggest_in_corner += k;
     col_of_biggest_in_corner += k;
+    RealScalar biggest_in_corner = internal::abs_knowing_score<Scalar>()(m_qr(row_of_biggest_in_corner, col_of_biggest_in_corner), score);
     if(k==0) biggest = biggest_in_corner;
 
     // if the corner is negligible, then we have less than full rank, and we can finish early
@@ -489,50 +534,55 @@ FullPivHouseholderQR<MatrixType>& FullPivHouseholderQR<MatrixType>::compute(cons
 
   m_det_pq = (number_of_transpositions%2) ? -1 : 1;
   m_isInitialized = true;
-
-  return *this;
 }
 
-namespace internal {
-
-template<typename _MatrixType, typename Rhs>
-struct solve_retval<FullPivHouseholderQR<_MatrixType>, Rhs>
-  : solve_retval_base<FullPivHouseholderQR<_MatrixType>, Rhs>
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template<typename _MatrixType>
+template<typename RhsType, typename DstType>
+void FullPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const
 {
-  EIGEN_MAKE_SOLVE_HELPERS(FullPivHouseholderQR<_MatrixType>,Rhs)
+  eigen_assert(rhs.rows() == rows());
+  const Index l_rank = rank();
 
-  template<typename Dest> void evalTo(Dest& dst) const
+  // FIXME introduce nonzeroPivots() and use it here. and more generally,
+  // make the same improvements in this dec as in FullPivLU.
+  if(l_rank==0)
   {
-    const Index rows = dec().rows(), cols = dec().cols();
-    eigen_assert(rhs().rows() == rows);
+    dst.setZero();
+    return;
+  }
 
-    // FIXME introduce nonzeroPivots() and use it here. and more generally,
-    // make the same improvements in this dec as in FullPivLU.
-    if(dec().rank()==0)
-    {
-      dst.setZero();
-      return;
-    }
+  typename RhsType::PlainObject c(rhs);
 
-    typename Rhs::PlainObject c(rhs());
+  Matrix<Scalar,1,RhsType::ColsAtCompileTime> temp(rhs.cols());
+  for (Index k = 0; k < l_rank; ++k)
+  {
+    Index remainingSize = rows()-k;
+    c.row(k).swap(c.row(m_rows_transpositions.coeff(k)));
+    c.bottomRightCorner(remainingSize, rhs.cols())
+      .applyHouseholderOnTheLeft(m_qr.col(k).tail(remainingSize-1),
+                               m_hCoeffs.coeff(k), &temp.coeffRef(0));
+  }
 
-    Matrix<Scalar,1,Rhs::ColsAtCompileTime> temp(rhs().cols());
-    for (Index k = 0; k < dec().rank(); ++k)
-    {
-      Index remainingSize = rows-k;
-      c.row(k).swap(c.row(dec().rowsTranspositions().coeff(k)));
-      c.bottomRightCorner(remainingSize, rhs().cols())
-       .applyHouseholderOnTheLeft(dec().matrixQR().col(k).tail(remainingSize-1),
-                                  dec().hCoeffs().coeff(k), &temp.coeffRef(0));
-    }
+  m_qr.topLeftCorner(l_rank, l_rank)
+      .template triangularView<Upper>()
+      .solveInPlace(c.topRows(l_rank));
 
-    dec().matrixQR()
-       .topLeftCorner(dec().rank(), dec().rank())
-       .template triangularView<Upper>()
-       .solveInPlace(c.topRows(dec().rank()));
+  for(Index i = 0; i < l_rank; ++i) dst.row(m_cols_permutation.indices().coeff(i)) = c.row(i);
+  for(Index i = l_rank; i < cols(); ++i) dst.row(m_cols_permutation.indices().coeff(i)).setZero();
+}
+#endif
 
-    for(Index i = 0; i < dec().rank(); ++i) dst.row(dec().colsPermutation().indices().coeff(i)) = c.row(i);
-    for(Index i = dec().rank(); i < cols; ++i) dst.row(dec().colsPermutation().indices().coeff(i)).setZero();
+namespace internal {
+  
+template<typename DstXprType, typename MatrixType>
+struct Assignment<DstXprType, Inverse<FullPivHouseholderQR<MatrixType> >, internal::assign_op<typename DstXprType::Scalar,typename FullPivHouseholderQR<MatrixType>::Scalar>, Dense2Dense>
+{
+  typedef FullPivHouseholderQR<MatrixType> QrType;
+  typedef Inverse<QrType> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename QrType::Scalar> &)
+  {    
+    dst = src.nestedExpression().solve(MatrixType::Identity(src.rows(), src.cols()));
   }
 };
 
@@ -546,7 +596,6 @@ template<typename MatrixType> struct FullPivHouseholderQRMatrixQReturnType
   : public ReturnByValue<FullPivHouseholderQRMatrixQReturnType<MatrixType> >
 {
 public:
-  typedef typename MatrixType::Index Index;
   typedef typename FullPivHouseholderQR<MatrixType>::IntDiagSizeVectorType IntDiagSizeVectorType;
   typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
   typedef Matrix<typename MatrixType::Scalar, 1, MatrixType::RowsAtCompileTime, RowMajor, 1,
@@ -558,7 +607,7 @@ public:
     : m_qr(qr),
       m_hCoeffs(hCoeffs),
       m_rowsTranspositions(rowsTranspositions)
-      {}
+  {}
 
   template <typename ResultType>
   void evalTo(ResultType& result) const
@@ -588,8 +637,8 @@ public:
     }
   }
 
-    Index rows() const { return m_qr.rows(); }
-    Index cols() const { return m_qr.rows(); }
+  Index rows() const { return m_qr.rows(); }
+  Index cols() const { return m_qr.rows(); }
 
 protected:
   typename MatrixType::Nested m_qr;
@@ -597,6 +646,11 @@ protected:
   typename IntDiagSizeVectorType::Nested m_rowsTranspositions;
 };
 
+// template<typename MatrixType>
+// struct evaluator<FullPivHouseholderQRMatrixQReturnType<MatrixType> >
+//  : public evaluator<ReturnByValue<FullPivHouseholderQRMatrixQReturnType<MatrixType> > >
+// {};
+
 } // end namespace internal
 
 template<typename MatrixType>
diff --git a/Eigen/src/QR/HouseholderQR.h b/Eigen/src/QR/HouseholderQR.h
index 343a66499..3513d995c 100644
--- a/Eigen/src/QR/HouseholderQR.h
+++ b/Eigen/src/QR/HouseholderQR.h
@@ -21,7 +21,7 @@ namespace Eigen {
   *
   * \brief Householder QR decomposition of a matrix
   *
-  * \param MatrixType the type of the matrix of which we are computing the QR decomposition
+  * \tparam _MatrixType the type of the matrix of which we are computing the QR decomposition
   *
   * This class performs a QR decomposition of a matrix \b A into matrices \b Q and \b R
   * such that 
@@ -37,6 +37,8 @@ namespace Eigen {
   * This Householder QR decomposition is faster, but less numerically stable and less feature-full than
   * FullPivHouseholderQR or ColPivHouseholderQR.
   *
+  * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
+  *
   * \sa MatrixBase::householderQr()
   */
 template<typename _MatrixType> class HouseholderQR
@@ -47,13 +49,13 @@ template<typename _MatrixType> class HouseholderQR
     enum {
       RowsAtCompileTime = MatrixType::RowsAtCompileTime,
       ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-      Options = MatrixType::Options,
       MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
       MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
     };
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
+    // FIXME should be int
+    typedef typename MatrixType::StorageIndex StorageIndex;
     typedef Matrix<Scalar, RowsAtCompileTime, RowsAtCompileTime, (MatrixType::Flags&RowMajorBit) ? RowMajor : ColMajor, MaxRowsAtCompileTime, MaxRowsAtCompileTime> MatrixQType;
     typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
     typedef typename internal::plain_row_type<MatrixType>::type RowVectorType;
@@ -91,13 +93,32 @@ template<typename _MatrixType> class HouseholderQR
       * 
       * \sa compute()
       */
-    HouseholderQR(const MatrixType& matrix)
+    template<typename InputType>
+    explicit HouseholderQR(const EigenBase<InputType>& matrix)
       : m_qr(matrix.rows(), matrix.cols()),
         m_hCoeffs((std::min)(matrix.rows(),matrix.cols())),
         m_temp(matrix.cols()),
         m_isInitialized(false)
     {
-      compute(matrix);
+      compute(matrix.derived());
+    }
+
+
+    /** \brief Constructs a QR factorization from a given matrix
+      *
+      * This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when
+      * \c MatrixType is a Eigen::Ref.
+      *
+      * \sa HouseholderQR(const EigenBase&)
+      */
+    template<typename InputType>
+    explicit HouseholderQR(EigenBase<InputType>& matrix)
+      : m_qr(matrix.derived()),
+        m_hCoeffs((std::min)(matrix.rows(),matrix.cols())),
+        m_temp(matrix.cols()),
+        m_isInitialized(false)
+    {
+      computeInPlace();
     }
 
     /** This method finds a solution x to the equation Ax=b, where A is the matrix of which
@@ -107,9 +128,6 @@ template<typename _MatrixType> class HouseholderQR
       *
       * \returns a solution.
       *
-      * \note The case where b is a matrix is not yet implemented. Also, this
-      *       code is space inefficient.
-      *
       * \note_about_checking_solutions
       *
       * \note_about_arbitrary_choice_of_solution
@@ -118,11 +136,11 @@ template<typename _MatrixType> class HouseholderQR
       * Output: \verbinclude HouseholderQR_solve.out
       */
     template<typename Rhs>
-    inline const internal::solve_retval<HouseholderQR, Rhs>
+    inline const Solve<HouseholderQR, Rhs>
     solve(const MatrixBase<Rhs>& b) const
     {
       eigen_assert(m_isInitialized && "HouseholderQR is not initialized.");
-      return internal::solve_retval<HouseholderQR, Rhs>(*this, b.derived());
+      return Solve<HouseholderQR, Rhs>(*this, b.derived());
     }
 
     /** This method returns an expression of the unitary matrix Q as a sequence of Householder transformations.
@@ -148,7 +166,12 @@ template<typename _MatrixType> class HouseholderQR
         return m_qr;
     }
 
-    HouseholderQR& compute(const MatrixType& matrix);
+    template<typename InputType>
+    HouseholderQR& compute(const EigenBase<InputType>& matrix) {
+      m_qr = matrix.derived();
+      computeInPlace();
+      return *this;
+    }
 
     /** \returns the absolute value of the determinant of the matrix of which
       * *this is the QR decomposition. It has only linear complexity
@@ -187,6 +210,12 @@ template<typename _MatrixType> class HouseholderQR
       * For advanced uses only.
       */
     const HCoeffsType& hCoeffs() const { return m_hCoeffs; }
+    
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    template<typename RhsType, typename DstType>
+    EIGEN_DEVICE_FUNC
+    void _solve_impl(const RhsType &rhs, DstType &dst) const;
+    #endif
 
   protected:
     
@@ -194,6 +223,8 @@ template<typename _MatrixType> class HouseholderQR
     {
       EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
     }
+
+    void computeInPlace();
     
     MatrixType m_qr;
     HCoeffsType m_hCoeffs;
@@ -224,7 +255,6 @@ namespace internal {
 template<typename MatrixQR, typename HCoeffs>
 void householder_qr_inplace_unblocked(MatrixQR& mat, HCoeffs& hCoeffs, typename MatrixQR::Scalar* tempData = 0)
 {
-  typedef typename MatrixQR::Index Index;
   typedef typename MatrixQR::Scalar Scalar;
   typedef typename MatrixQR::RealScalar RealScalar;
   Index rows = mat.rows();
@@ -263,11 +293,9 @@ template<typename MatrixQR, typename HCoeffs,
 struct householder_qr_inplace_blocked
 {
   // This is specialized for MKL-supported Scalar types in HouseholderQR_MKL.h
-  static void run(MatrixQR& mat, HCoeffs& hCoeffs,
-      typename MatrixQR::Index maxBlockSize=32,
+  static void run(MatrixQR& mat, HCoeffs& hCoeffs, Index maxBlockSize=32,
       typename MatrixQR::Scalar* tempData = 0)
   {
-    typedef typename MatrixQR::Index Index;
     typedef typename MatrixQR::Scalar Scalar;
     typedef Block<MatrixQR,Dynamic,Dynamic> BlockType;
 
@@ -289,8 +317,8 @@ struct householder_qr_inplace_blocked
     for (k = 0; k < size; k += blockSize)
     {
       Index bs = (std::min)(size-k,blockSize);  // actual size of the block
-      Index tcols = cols - k - bs;            // trailing columns
-      Index brows = rows-k;                   // rows of the block
+      Index tcols = cols - k - bs;              // trailing columns
+      Index brows = rows-k;                     // rows of the block
 
       // partition the matrix:
       //        A00 | A01 | A02
@@ -308,43 +336,38 @@ struct householder_qr_inplace_blocked
       if(tcols)
       {
         BlockType A21_22 = mat.block(k,k+bs,brows,tcols);
-        apply_block_householder_on_the_left(A21_22,A11_21,hCoeffsSegment.adjoint());
+        apply_block_householder_on_the_left(A21_22,A11_21,hCoeffsSegment, false); // false == backward
       }
     }
   }
 };
 
-template<typename _MatrixType, typename Rhs>
-struct solve_retval<HouseholderQR<_MatrixType>, Rhs>
-  : solve_retval_base<HouseholderQR<_MatrixType>, Rhs>
-{
-  EIGEN_MAKE_SOLVE_HELPERS(HouseholderQR<_MatrixType>,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    const Index rows = dec().rows(), cols = dec().cols();
-    const Index rank = (std::min)(rows, cols);
-    eigen_assert(rhs().rows() == rows);
+} // end namespace internal
 
-    typename Rhs::PlainObject c(rhs());
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template<typename _MatrixType>
+template<typename RhsType, typename DstType>
+void HouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const
+{
+  const Index rank = (std::min)(rows(), cols());
+  eigen_assert(rhs.rows() == rows());
 
-    // Note that the matrix Q = H_0^* H_1^*... so its inverse is Q^* = (H_0 H_1 ...)^T
-    c.applyOnTheLeft(householderSequence(
-      dec().matrixQR().leftCols(rank),
-      dec().hCoeffs().head(rank)).transpose()
-    );
+  typename RhsType::PlainObject c(rhs);
 
-    dec().matrixQR()
-       .topLeftCorner(rank, rank)
-       .template triangularView<Upper>()
-       .solveInPlace(c.topRows(rank));
+  // Note that the matrix Q = H_0^* H_1^*... so its inverse is Q^* = (H_0 H_1 ...)^T
+  c.applyOnTheLeft(householderSequence(
+    m_qr.leftCols(rank),
+    m_hCoeffs.head(rank)).transpose()
+  );
 
-    dst.topRows(rank) = c.topRows(rank);
-    dst.bottomRows(cols-rank).setZero();
-  }
-};
+  m_qr.topLeftCorner(rank, rank)
+      .template triangularView<Upper>()
+      .solveInPlace(c.topRows(rank));
 
-} // end namespace internal
+  dst.topRows(rank) = c.topRows(rank);
+  dst.bottomRows(cols()-rank).setZero();
+}
+#endif
 
 /** Performs the QR factorization of the given matrix \a matrix. The result of
   * the factorization is stored into \c *this, and a reference to \c *this
@@ -353,15 +376,14 @@ struct solve_retval<HouseholderQR<_MatrixType>, Rhs>
   * \sa class HouseholderQR, HouseholderQR(const MatrixType&)
   */
 template<typename MatrixType>
-HouseholderQR<MatrixType>& HouseholderQR<MatrixType>::compute(const MatrixType& matrix)
+void HouseholderQR<MatrixType>::computeInPlace()
 {
   check_template_parameters();
   
-  Index rows = matrix.rows();
-  Index cols = matrix.cols();
+  Index rows = m_qr.rows();
+  Index cols = m_qr.cols();
   Index size = (std::min)(rows,cols);
 
-  m_qr = matrix;
   m_hCoeffs.resize(size);
 
   m_temp.resize(cols);
@@ -369,7 +391,6 @@ HouseholderQR<MatrixType>& HouseholderQR<MatrixType>::compute(const MatrixType&
   internal::householder_qr_inplace_blocked<MatrixType, HCoeffsType>::run(m_qr, m_hCoeffs, 48, m_temp.data());
 
   m_isInitialized = true;
-  return *this;
 }
 
 /** \return the Householder QR decomposition of \c *this.
diff --git a/Eigen/src/QR/HouseholderQR_MKL.h b/Eigen/src/QR/HouseholderQR_LAPACKE.h
index b80f1b48d..1dc7d5363 100644
--- a/Eigen/src/QR/HouseholderQR_MKL.h
+++ b/Eigen/src/QR/HouseholderQR_LAPACKE.h
@@ -25,47 +25,44 @@
  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
  ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
+ *   Content : Eigen bindings to LAPACKe
  *    Householder QR decomposition of a matrix w/o pivoting based on
  *    LAPACKE_?geqrf function.
  ********************************************************************************
 */
 
-#ifndef EIGEN_QR_MKL_H
-#define EIGEN_QR_MKL_H
-
-#include "../Core/util/MKL_support.h"
+#ifndef EIGEN_QR_LAPACKE_H
+#define EIGEN_QR_LAPACKE_H
 
 namespace Eigen { 
 
-  namespace internal {
+namespace internal {
 
-    /** \internal Specialization for the data types supported by MKL */
+/** \internal Specialization for the data types supported by LAPACKe */
 
-#define EIGEN_MKL_QR_NOPIV(EIGTYPE, MKLTYPE, MKLPREFIX) \
+#define EIGEN_LAPACKE_QR_NOPIV(EIGTYPE, LAPACKE_TYPE, LAPACKE_PREFIX) \
 template<typename MatrixQR, typename HCoeffs> \
 struct householder_qr_inplace_blocked<MatrixQR, HCoeffs, EIGTYPE, true> \
 { \
-  static void run(MatrixQR& mat, HCoeffs& hCoeffs, \
-      typename MatrixQR::Index = 32, \
+  static void run(MatrixQR& mat, HCoeffs& hCoeffs, Index = 32, \
       typename MatrixQR::Scalar* = 0) \
   { \
     lapack_int m = (lapack_int) mat.rows(); \
     lapack_int n = (lapack_int) mat.cols(); \
     lapack_int lda = (lapack_int) mat.outerStride(); \
     lapack_int matrix_order = (MatrixQR::IsRowMajor) ? LAPACK_ROW_MAJOR : LAPACK_COL_MAJOR; \
-    LAPACKE_##MKLPREFIX##geqrf( matrix_order, m, n, (MKLTYPE*)mat.data(), lda, (MKLTYPE*)hCoeffs.data()); \
+    LAPACKE_##LAPACKE_PREFIX##geqrf( matrix_order, m, n, (LAPACKE_TYPE*)mat.data(), lda, (LAPACKE_TYPE*)hCoeffs.data()); \
     hCoeffs.adjointInPlace(); \
   } \
 };
 
-EIGEN_MKL_QR_NOPIV(double, double, d)
-EIGEN_MKL_QR_NOPIV(float, float, s)
-EIGEN_MKL_QR_NOPIV(dcomplex, MKL_Complex16, z)
-EIGEN_MKL_QR_NOPIV(scomplex, MKL_Complex8, c)
+EIGEN_LAPACKE_QR_NOPIV(double, double, d)
+EIGEN_LAPACKE_QR_NOPIV(float, float, s)
+EIGEN_LAPACKE_QR_NOPIV(dcomplex, lapack_complex_double, z)
+EIGEN_LAPACKE_QR_NOPIV(scomplex, lapack_complex_float, c)
 
 } // end namespace internal
 
 } // end namespace Eigen
 
-#endif // EIGEN_QR_MKL_H
+#endif // EIGEN_QR_LAPACKE_H
diff --git a/Eigen/src/SPQRSupport/CMakeLists.txt b/Eigen/src/SPQRSupport/CMakeLists.txt
deleted file mode 100644
index 4968beaf2..000000000
--- a/Eigen/src/SPQRSupport/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_SPQRSupport_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_SPQRSupport_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/SPQRSupport/ COMPONENT Devel
-  )
diff --git a/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h b/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h
index de00877de..953d57c9d 100644
--- a/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h
+++ b/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h
@@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2012 Desire Nuentsa <desire.nuentsa_wakam@inria.fr>
+// Copyright (C) 2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -32,45 +33,54 @@ namespace Eigen {
   } // End namespace internal
   
 /**
- * \ingroup SPQRSupport_Module
- * \class SPQR
- * \brief Sparse QR factorization based on SuiteSparseQR library
- * 
- * This class is used to perform a multithreaded and multifrontal rank-revealing QR decomposition 
- * of sparse matrices. The result is then used to solve linear leasts_square systems.
- * Clearly, a QR factorization is returned such that A*P = Q*R where :
- * 
- * P is the column permutation. Use colsPermutation() to get it.
- * 
- * Q is the orthogonal matrix represented as Householder reflectors. 
- * Use matrixQ() to get an expression and matrixQ().transpose() to get the transpose.
- * You can then apply it to a vector.
- * 
- * R is the sparse triangular factor. Use matrixQR() to get it as SparseMatrix.
- * NOTE : The Index type of R is always UF_long. You can get it with SPQR::Index
- * 
- * \tparam _MatrixType The type of the sparse matrix A, must be a column-major SparseMatrix<>
- * NOTE 
- * 
- */
+  * \ingroup SPQRSupport_Module
+  * \class SPQR
+  * \brief Sparse QR factorization based on SuiteSparseQR library
+  *
+  * This class is used to perform a multithreaded and multifrontal rank-revealing QR decomposition
+  * of sparse matrices. The result is then used to solve linear leasts_square systems.
+  * Clearly, a QR factorization is returned such that A*P = Q*R where :
+  *
+  * P is the column permutation. Use colsPermutation() to get it.
+  *
+  * Q is the orthogonal matrix represented as Householder reflectors.
+  * Use matrixQ() to get an expression and matrixQ().transpose() to get the transpose.
+  * You can then apply it to a vector.
+  *
+  * R is the sparse triangular factor. Use matrixQR() to get it as SparseMatrix.
+  * NOTE : The Index type of R is always SuiteSparse_long. You can get it with SPQR::Index
+  *
+  * \tparam _MatrixType The type of the sparse matrix A, must be a column-major SparseMatrix<>
+  *
+  * \implsparsesolverconcept
+  *
+  *
+  */
 template<typename _MatrixType>
-class SPQR
+class SPQR : public SparseSolverBase<SPQR<_MatrixType> >
 {
+  protected:
+    typedef SparseSolverBase<SPQR<_MatrixType> > Base;
+    using Base::m_isInitialized;
   public:
     typedef typename _MatrixType::Scalar Scalar;
     typedef typename _MatrixType::RealScalar RealScalar;
-    typedef UF_long Index ; 
-    typedef SparseMatrix<Scalar, ColMajor, Index> MatrixType;
-    typedef PermutationMatrix<Dynamic, Dynamic> PermutationType;
+    typedef SuiteSparse_long StorageIndex ;
+    typedef SparseMatrix<Scalar, ColMajor, StorageIndex> MatrixType;
+    typedef Map<PermutationMatrix<Dynamic, Dynamic, StorageIndex> > PermutationType;
+    enum {
+      ColsAtCompileTime = Dynamic,
+      MaxColsAtCompileTime = Dynamic
+    };
   public:
     SPQR() 
-      : m_isInitialized(false), m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits<Scalar>::epsilon()), m_useDefaultThreshold(true)
+      : m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits<Scalar>::epsilon()), m_useDefaultThreshold(true)
     { 
       cholmod_l_start(&m_cc);
     }
     
-    SPQR(const _MatrixType& matrix)
-    : m_isInitialized(false), m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits<Scalar>::epsilon()), m_useDefaultThreshold(true)
+    explicit SPQR(const _MatrixType& matrix)
+    : m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits<Scalar>::epsilon()), m_useDefaultThreshold(true)
     {
       cholmod_l_start(&m_cc);
       compute(matrix);
@@ -103,23 +113,22 @@ class SPQR
       RealScalar pivotThreshold = m_tolerance;
       if(m_useDefaultThreshold) 
       {
-        using std::max;
         RealScalar max2Norm = 0.0;
-        for (int j = 0; j < mat.cols(); j++) max2Norm = (max)(max2Norm, mat.col(j).norm());
+        for (int j = 0; j < mat.cols(); j++) max2Norm = numext::maxi(max2Norm, mat.col(j).norm());
         if(max2Norm==RealScalar(0))
           max2Norm = RealScalar(1);
         pivotThreshold = 20 * (mat.rows() + mat.cols()) * max2Norm * NumTraits<RealScalar>::epsilon();
       }
-      
       cholmod_sparse A; 
       A = viewAsCholmod(mat);
+      m_rows = matrix.rows();
       Index col = matrix.cols();
       m_rank = SuiteSparseQR<Scalar>(m_ordering, pivotThreshold, col, &A, 
                              &m_cR, &m_E, &m_H, &m_HPinv, &m_HTau, &m_cc);
 
       if (!m_cR)
       {
-        m_info = NumericalIssue; 
+        m_info = NumericalIssue;
         m_isInitialized = false;
         return;
       }
@@ -130,28 +139,15 @@ class SPQR
     /** 
      * Get the number of rows of the input matrix and the Q matrix
      */
-    inline Index rows() const {return m_cR->nrow; }
+    inline Index rows() const {return m_rows; }
     
     /** 
      * Get the number of columns of the input matrix. 
      */
     inline Index cols() const { return m_cR->ncol; }
-   
-      /** \returns the solution X of \f$ A X = B \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::solve_retval<SPQR, Rhs> solve(const MatrixBase<Rhs>& B) const 
-    {
-      eigen_assert(m_isInitialized && " The QR factorization should be computed first, call compute()");
-      eigen_assert(this->rows()==B.rows()
-                    && "SPQR::solve(): invalid number of rows of the right hand side matrix B");
-          return internal::solve_retval<SPQR, Rhs>(*this, B.derived());
-    }
     
     template<typename Rhs, typename Dest>
-    void _solve(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const
+    void _solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const
     {
       eigen_assert(m_isInitialized && " The QR factorization should be computed first, call compute()");
       eigen_assert(b.cols()==1 && "This method is for vectors only");
@@ -184,7 +180,7 @@ class SPQR
     {
       eigen_assert(m_isInitialized && " The QR factorization should be computed first, call compute()");
       if(!m_isRUpToDate) {
-        m_R = viewAsEigen<Scalar,ColMajor, typename MatrixType::Index>(*m_cR);
+        m_R = viewAsEigen<Scalar,ColMajor, typename MatrixType::StorageIndex>(*m_cR);
         m_isRUpToDate = true;
       }
       return m_R;
@@ -198,11 +194,7 @@ class SPQR
     PermutationType colsPermutation() const
     { 
       eigen_assert(m_isInitialized && "Decomposition is not initialized.");
-      Index n = m_cR->ncol;
-      PermutationType colsPerm(n);
-      for(Index j = 0; j <n; j++) colsPerm.indices()(j) = m_E[j];
-      return colsPerm; 
-      
+      return PermutationType(m_E, m_cR->ncol);
     }
     /**
      * Gets the rank of the matrix. 
@@ -237,7 +229,6 @@ class SPQR
       return m_info;
     }
   protected:
-    bool m_isInitialized;
     bool m_analysisIsOk;
     bool m_factorizationIsOk;
     mutable bool m_isRUpToDate;
@@ -247,13 +238,14 @@ class SPQR
     RealScalar m_tolerance; // treat columns with 2-norm below this tolerance as zero
     mutable cholmod_sparse *m_cR; // The sparse R factor in cholmod format
     mutable MatrixType m_R; // The sparse matrix R in Eigen format
-    mutable Index *m_E; // The permutation applied to columns
+    mutable StorageIndex *m_E; // The permutation applied to columns
     mutable cholmod_sparse *m_H;  //The householder vectors
-    mutable Index *m_HPinv; // The row permutation of H
+    mutable StorageIndex *m_HPinv; // The row permutation of H
     mutable cholmod_dense *m_HTau; // The Householder coefficients
     mutable Index m_rank; // The rank of the matrix
     mutable cholmod_common m_cc; // Workspace and parameters
     bool m_useDefaultThreshold;     // Use default threshold
+    Index m_rows;
     template<typename ,typename > friend struct SPQR_QProduct;
 };
 
@@ -261,7 +253,7 @@ template <typename SPQRType, typename Derived>
 struct SPQR_QProduct : ReturnByValue<SPQR_QProduct<SPQRType,Derived> >
 {
   typedef typename SPQRType::Scalar Scalar;
-  typedef typename SPQRType::Index Index;
+  typedef typename SPQRType::StorageIndex StorageIndex;
   //Define the constructor to get reference to argument types
   SPQR_QProduct(const SPQRType& spqr, const Derived& other, bool transpose) : m_spqr(spqr),m_other(other),m_transpose(transpose) {}
   
@@ -317,22 +309,5 @@ struct SPQRMatrixQTransposeReturnType{
   const SPQRType& m_spqr;
 };
 
-namespace internal {
-  
-template<typename _MatrixType, typename Rhs>
-struct solve_retval<SPQR<_MatrixType>, Rhs>
-  : solve_retval_base<SPQR<_MatrixType>, Rhs>
-{
-  typedef SPQR<_MatrixType> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
-  }
-};
-
-} // end namespace internal
-
 }// End namespace Eigen
 #endif
diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h
new file mode 100644
index 000000000..25fca6f4d
--- /dev/null
+++ b/Eigen/src/SVD/BDCSVD.h
@@ -0,0 +1,1230 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+// 
+// We used the "A Divide-And-Conquer Algorithm for the Bidiagonal SVD"
+// research report written by Ming Gu and Stanley C.Eisenstat
+// The code variable names correspond to the names they used in their 
+// report
+//
+// Copyright (C) 2013 Gauthier Brun <brun.gauthier@gmail.com>
+// Copyright (C) 2013 Nicolas Carre <nicolas.carre@ensimag.fr>
+// Copyright (C) 2013 Jean Ceccato <jean.ceccato@ensimag.fr>
+// Copyright (C) 2013 Pierre Zoppitelli <pierre.zoppitelli@ensimag.fr>
+// Copyright (C) 2013 Jitse Niesen <jitse@maths.leeds.ac.uk>
+// Copyright (C) 2014-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BDCSVD_H
+#define EIGEN_BDCSVD_H
+// #define EIGEN_BDCSVD_DEBUG_VERBOSE
+// #define EIGEN_BDCSVD_SANITY_CHECKS
+
+namespace Eigen {
+
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+IOFormat bdcsvdfmt(8, 0, ", ", "\n", "  [", "]");
+#endif
+  
+template<typename _MatrixType> class BDCSVD;
+
+namespace internal {
+
+template<typename _MatrixType> 
+struct traits<BDCSVD<_MatrixType> >
+{
+  typedef _MatrixType MatrixType;
+};  
+
+} // end namespace internal
+  
+  
+/** \ingroup SVD_Module
+ *
+ *
+ * \class BDCSVD
+ *
+ * \brief class Bidiagonal Divide and Conquer SVD
+ *
+ * \tparam _MatrixType the type of the matrix of which we are computing the SVD decomposition
+ *
+ * This class first reduces the input matrix to bi-diagonal form using class UpperBidiagonalization,
+ * and then performs a divide-and-conquer diagonalization. Small blocks are diagonalized using class JacobiSVD.
+ * You can control the switching size with the setSwitchSize() method, default is 16.
+ * For small matrice (<16), it is thus preferable to directly use JacobiSVD. For larger ones, BDCSVD is highly
+ * recommended and can several order of magnitude faster.
+ *
+ * \warning this algorithm is unlikely to provide accurate result when compiled with unsafe math optimizations.
+ * For instance, this concerns Intel's compiler (ICC), which perfroms such optimization by default unless
+ * you compile with the \c -fp-model \c precise option. Likewise, the \c -ffast-math option of GCC or clang will
+ * significantly degrade the accuracy.
+ *
+ * \sa class JacobiSVD
+ */
+template<typename _MatrixType> 
+class BDCSVD : public SVDBase<BDCSVD<_MatrixType> >
+{
+  typedef SVDBase<BDCSVD> Base;
+    
+public:
+  using Base::rows;
+  using Base::cols;
+  using Base::computeU;
+  using Base::computeV;
+  
+  typedef _MatrixType MatrixType;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
+  enum {
+    RowsAtCompileTime = MatrixType::RowsAtCompileTime, 
+    ColsAtCompileTime = MatrixType::ColsAtCompileTime, 
+    DiagSizeAtCompileTime = EIGEN_SIZE_MIN_PREFER_DYNAMIC(RowsAtCompileTime, ColsAtCompileTime), 
+    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, 
+    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime, 
+    MaxDiagSizeAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(MaxRowsAtCompileTime, MaxColsAtCompileTime), 
+    MatrixOptions = MatrixType::Options
+  };
+
+  typedef typename Base::MatrixUType MatrixUType;
+  typedef typename Base::MatrixVType MatrixVType;
+  typedef typename Base::SingularValuesType SingularValuesType;
+  
+  typedef Matrix<Scalar, Dynamic, Dynamic, ColMajor> MatrixX;
+  typedef Matrix<RealScalar, Dynamic, Dynamic, ColMajor> MatrixXr;
+  typedef Matrix<RealScalar, Dynamic, 1> VectorType;
+  typedef Array<RealScalar, Dynamic, 1> ArrayXr;
+  typedef Array<Index,1,Dynamic> ArrayXi;
+  typedef Ref<ArrayXr> ArrayRef;
+  typedef Ref<ArrayXi> IndicesRef;
+
+  /** \brief Default Constructor.
+   *
+   * The default constructor is useful in cases in which the user intends to
+   * perform decompositions via BDCSVD::compute(const MatrixType&).
+   */
+  BDCSVD() : m_algoswap(16), m_numIters(0)
+  {}
+
+
+  /** \brief Default Constructor with memory preallocation
+   *
+   * Like the default constructor but with preallocation of the internal data
+   * according to the specified problem size.
+   * \sa BDCSVD()
+   */
+  BDCSVD(Index rows, Index cols, unsigned int computationOptions = 0)
+    : m_algoswap(16), m_numIters(0)
+  {
+    allocate(rows, cols, computationOptions);
+  }
+
+  /** \brief Constructor performing the decomposition of given matrix.
+   *
+   * \param matrix the matrix to decompose
+   * \param computationOptions optional parameter allowing to specify if you want full or thin U or V unitaries to be computed.
+   *                           By default, none is computed. This is a bit - field, the possible bits are #ComputeFullU, #ComputeThinU, 
+   *                           #ComputeFullV, #ComputeThinV.
+   *
+   * Thin unitaries are only available if your matrix type has a Dynamic number of columns (for example MatrixXf). They also are not
+   * available with the (non - default) FullPivHouseholderQR preconditioner.
+   */
+  BDCSVD(const MatrixType& matrix, unsigned int computationOptions = 0)
+    : m_algoswap(16), m_numIters(0)
+  {
+    compute(matrix, computationOptions);
+  }
+
+  ~BDCSVD() 
+  {
+  }
+  
+  /** \brief Method performing the decomposition of given matrix using custom options.
+   *
+   * \param matrix the matrix to decompose
+   * \param computationOptions optional parameter allowing to specify if you want full or thin U or V unitaries to be computed.
+   *                           By default, none is computed. This is a bit - field, the possible bits are #ComputeFullU, #ComputeThinU, 
+   *                           #ComputeFullV, #ComputeThinV.
+   *
+   * Thin unitaries are only available if your matrix type has a Dynamic number of columns (for example MatrixXf). They also are not
+   * available with the (non - default) FullPivHouseholderQR preconditioner.
+   */
+  BDCSVD& compute(const MatrixType& matrix, unsigned int computationOptions);
+
+  /** \brief Method performing the decomposition of given matrix using current options.
+   *
+   * \param matrix the matrix to decompose
+   *
+   * This method uses the current \a computationOptions, as already passed to the constructor or to compute(const MatrixType&, unsigned int).
+   */
+  BDCSVD& compute(const MatrixType& matrix)
+  {
+    return compute(matrix, this->m_computationOptions);
+  }
+
+  void setSwitchSize(int s) 
+  {
+    eigen_assert(s>3 && "BDCSVD the size of the algo switch has to be greater than 3");
+    m_algoswap = s;
+  }
+ 
+private:
+  void allocate(Index rows, Index cols, unsigned int computationOptions);
+  void divide(Index firstCol, Index lastCol, Index firstRowW, Index firstColW, Index shift);
+  void computeSVDofM(Index firstCol, Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V);
+  void computeSingVals(const ArrayRef& col0, const ArrayRef& diag, const IndicesRef& perm, VectorType& singVals, ArrayRef shifts, ArrayRef mus);
+  void perturbCol0(const ArrayRef& col0, const ArrayRef& diag, const IndicesRef& perm, const VectorType& singVals, const ArrayRef& shifts, const ArrayRef& mus, ArrayRef zhat);
+  void computeSingVecs(const ArrayRef& zhat, const ArrayRef& diag, const IndicesRef& perm, const VectorType& singVals, const ArrayRef& shifts, const ArrayRef& mus, MatrixXr& U, MatrixXr& V);
+  void deflation43(Index firstCol, Index shift, Index i, Index size);
+  void deflation44(Index firstColu , Index firstColm, Index firstRowW, Index firstColW, Index i, Index j, Index size);
+  void deflation(Index firstCol, Index lastCol, Index k, Index firstRowW, Index firstColW, Index shift);
+  template<typename HouseholderU, typename HouseholderV, typename NaiveU, typename NaiveV>
+  void copyUV(const HouseholderU &householderU, const HouseholderV &householderV, const NaiveU &naiveU, const NaiveV &naivev);
+  void structured_update(Block<MatrixXr,Dynamic,Dynamic> A, const MatrixXr &B, Index n1);
+  static RealScalar secularEq(RealScalar x, const ArrayRef& col0, const ArrayRef& diag, const IndicesRef &perm, const ArrayRef& diagShifted, RealScalar shift);
+
+protected:
+  MatrixXr m_naiveU, m_naiveV;
+  MatrixXr m_computed;
+  Index m_nRec;
+  ArrayXr m_workspace;
+  ArrayXi m_workspaceI;
+  int m_algoswap;
+  bool m_isTranspose, m_compU, m_compV;
+  
+  using Base::m_singularValues;
+  using Base::m_diagSize;
+  using Base::m_computeFullU;
+  using Base::m_computeFullV;
+  using Base::m_computeThinU;
+  using Base::m_computeThinV;
+  using Base::m_matrixU;
+  using Base::m_matrixV;
+  using Base::m_isInitialized;
+  using Base::m_nonzeroSingularValues;
+
+public:  
+  int m_numIters;
+}; //end class BDCSVD
+
+
+// Method to allocate and initialize matrix and attributes
+template<typename MatrixType>
+void BDCSVD<MatrixType>::allocate(Index rows, Index cols, unsigned int computationOptions)
+{
+  m_isTranspose = (cols > rows);
+
+  if (Base::allocate(rows, cols, computationOptions))
+    return;
+  
+  m_computed = MatrixXr::Zero(m_diagSize + 1, m_diagSize );
+  m_compU = computeV();
+  m_compV = computeU();
+  if (m_isTranspose)
+    std::swap(m_compU, m_compV);
+  
+  if (m_compU) m_naiveU = MatrixXr::Zero(m_diagSize + 1, m_diagSize + 1 );
+  else         m_naiveU = MatrixXr::Zero(2, m_diagSize + 1 );
+  
+  if (m_compV) m_naiveV = MatrixXr::Zero(m_diagSize, m_diagSize);
+  
+  m_workspace.resize((m_diagSize+1)*(m_diagSize+1)*3);
+  m_workspaceI.resize(3*m_diagSize);
+}// end allocate
+
+template<typename MatrixType>
+BDCSVD<MatrixType>& BDCSVD<MatrixType>::compute(const MatrixType& matrix, unsigned int computationOptions) 
+{
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+  std::cout << "\n\n\n======================================================================================================================\n\n\n";
+#endif
+  allocate(matrix.rows(), matrix.cols(), computationOptions);
+  using std::abs;
+
+  const RealScalar considerZero = (std::numeric_limits<RealScalar>::min)();
+  
+  //**** step -1 - If the problem is too small, directly falls back to JacobiSVD and return
+  if(matrix.cols() < m_algoswap)
+  {
+    // FIXME this line involves temporaries
+    JacobiSVD<MatrixType> jsvd(matrix,computationOptions);
+    if(computeU()) m_matrixU = jsvd.matrixU();
+    if(computeV()) m_matrixV = jsvd.matrixV();
+    m_singularValues = jsvd.singularValues();
+    m_nonzeroSingularValues = jsvd.nonzeroSingularValues();
+    m_isInitialized = true;
+    return *this;
+  }
+  
+  //**** step 0 - Copy the input matrix and apply scaling to reduce over/under-flows
+  RealScalar scale = matrix.cwiseAbs().maxCoeff();
+  if(scale==RealScalar(0)) scale = RealScalar(1);
+  MatrixX copy;
+  if (m_isTranspose) copy = matrix.adjoint()/scale;
+  else               copy = matrix/scale;
+  
+  //**** step 1 - Bidiagonalization
+  // FIXME this line involves temporaries
+  internal::UpperBidiagonalization<MatrixX> bid(copy);
+
+  //**** step 2 - Divide & Conquer
+  m_naiveU.setZero();
+  m_naiveV.setZero();
+  // FIXME this line involves a temporary matrix
+  m_computed.topRows(m_diagSize) = bid.bidiagonal().toDenseMatrix().transpose();
+  m_computed.template bottomRows<1>().setZero();
+  divide(0, m_diagSize - 1, 0, 0, 0);
+
+  //**** step 3 - Copy singular values and vectors
+  for (int i=0; i<m_diagSize; i++)
+  {
+    RealScalar a = abs(m_computed.coeff(i, i));
+    m_singularValues.coeffRef(i) = a * scale;
+    if (a<considerZero)
+    {
+      m_nonzeroSingularValues = i;
+      m_singularValues.tail(m_diagSize - i - 1).setZero();
+      break;
+    }
+    else if (i == m_diagSize - 1)
+    {
+      m_nonzeroSingularValues = i + 1;
+      break;
+    }
+  }
+
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+//   std::cout << "m_naiveU\n" << m_naiveU << "\n\n";
+//   std::cout << "m_naiveV\n" << m_naiveV << "\n\n";
+#endif
+  if(m_isTranspose) copyUV(bid.householderV(), bid.householderU(), m_naiveV, m_naiveU);
+  else              copyUV(bid.householderU(), bid.householderV(), m_naiveU, m_naiveV);
+
+  m_isInitialized = true;
+  return *this;
+}// end compute
+
+
+template<typename MatrixType>
+template<typename HouseholderU, typename HouseholderV, typename NaiveU, typename NaiveV>
+void BDCSVD<MatrixType>::copyUV(const HouseholderU &householderU, const HouseholderV &householderV, const NaiveU &naiveU, const NaiveV &naiveV)
+{
+  // Note exchange of U and V: m_matrixU is set from m_naiveV and vice versa
+  if (computeU())
+  {
+    Index Ucols = m_computeThinU ? m_diagSize : householderU.cols();
+    m_matrixU = MatrixX::Identity(householderU.cols(), Ucols);
+    m_matrixU.topLeftCorner(m_diagSize, m_diagSize) = naiveV.template cast<Scalar>().topLeftCorner(m_diagSize, m_diagSize);
+    householderU.applyThisOnTheLeft(m_matrixU); // FIXME this line involves a temporary buffer
+  }
+  if (computeV())
+  {
+    Index Vcols = m_computeThinV ? m_diagSize : householderV.cols();
+    m_matrixV = MatrixX::Identity(householderV.cols(), Vcols);
+    m_matrixV.topLeftCorner(m_diagSize, m_diagSize) = naiveU.template cast<Scalar>().topLeftCorner(m_diagSize, m_diagSize);
+    householderV.applyThisOnTheLeft(m_matrixV); // FIXME this line involves a temporary buffer
+  }
+}
+
+/** \internal
+  * Performs A = A * B exploiting the special structure of the matrix A. Splitting A as:
+  *  A = [A1]
+  *      [A2]
+  * such that A1.rows()==n1, then we assume that at least half of the columns of A1 and A2 are zeros.
+  * We can thus pack them prior to the the matrix product. However, this is only worth the effort if the matrix is large
+  * enough.
+  */
+template<typename MatrixType>
+void BDCSVD<MatrixType>::structured_update(Block<MatrixXr,Dynamic,Dynamic> A, const MatrixXr &B, Index n1)
+{
+  Index n = A.rows();
+  if(n>100)
+  {
+    // If the matrices are large enough, let's exploit the sparse structure of A by
+    // splitting it in half (wrt n1), and packing the non-zero columns.
+    Index n2 = n - n1;
+    Map<MatrixXr> A1(m_workspace.data()      , n1, n);
+    Map<MatrixXr> A2(m_workspace.data()+ n1*n, n2, n);
+    Map<MatrixXr> B1(m_workspace.data()+  n*n, n,  n);
+    Map<MatrixXr> B2(m_workspace.data()+2*n*n, n,  n);
+    Index k1=0, k2=0;
+    for(Index j=0; j<n; ++j)
+    {
+      if( (A.col(j).head(n1).array()!=0).any() )
+      {
+        A1.col(k1) = A.col(j).head(n1);
+        B1.row(k1) = B.row(j);
+        ++k1;
+      }
+      if( (A.col(j).tail(n2).array()!=0).any() )
+      {
+        A2.col(k2) = A.col(j).tail(n2);
+        B2.row(k2) = B.row(j);
+        ++k2;
+      }
+    }
+  
+    A.topRows(n1).noalias()    = A1.leftCols(k1) * B1.topRows(k1);
+    A.bottomRows(n2).noalias() = A2.leftCols(k2) * B2.topRows(k2);
+  }
+  else
+  {
+    Map<MatrixXr,Aligned> tmp(m_workspace.data(),n,n);
+    tmp.noalias() = A*B;
+    A = tmp;
+  }
+}
+
+// The divide algorithm is done "in place", we are always working on subsets of the same matrix. The divide methods takes as argument the 
+// place of the submatrix we are currently working on.
+
+//@param firstCol : The Index of the first column of the submatrix of m_computed and for m_naiveU;
+//@param lastCol : The Index of the last column of the submatrix of m_computed and for m_naiveU; 
+// lastCol + 1 - firstCol is the size of the submatrix.
+//@param firstRowW : The Index of the first row of the matrix W that we are to change. (see the reference paper section 1 for more information on W)
+//@param firstRowW : Same as firstRowW with the column.
+//@param shift : Each time one takes the left submatrix, one must add 1 to the shift. Why? Because! We actually want the last column of the U submatrix 
+// to become the first column (*coeff) and to shift all the other columns to the right. There are more details on the reference paper.
+template<typename MatrixType>
+void BDCSVD<MatrixType>::divide (Index firstCol, Index lastCol, Index firstRowW, Index firstColW, Index shift)
+{
+  // requires rows = cols + 1;
+  using std::pow;
+  using std::sqrt;
+  using std::abs;
+  const Index n = lastCol - firstCol + 1;
+  const Index k = n/2;
+  const RealScalar considerZero = (std::numeric_limits<RealScalar>::min)();
+  RealScalar alphaK;
+  RealScalar betaK; 
+  RealScalar r0; 
+  RealScalar lambda, phi, c0, s0;
+  VectorType l, f;
+  // We use the other algorithm which is more efficient for small 
+  // matrices.
+  if (n < m_algoswap)
+  {
+    // FIXME this line involves temporaries
+    JacobiSVD<MatrixXr> b(m_computed.block(firstCol, firstCol, n + 1, n), ComputeFullU | (m_compV ? ComputeFullV : 0));
+    if (m_compU)
+      m_naiveU.block(firstCol, firstCol, n + 1, n + 1).real() = b.matrixU();
+    else 
+    {
+      m_naiveU.row(0).segment(firstCol, n + 1).real() = b.matrixU().row(0);
+      m_naiveU.row(1).segment(firstCol, n + 1).real() = b.matrixU().row(n);
+    }
+    if (m_compV) m_naiveV.block(firstRowW, firstColW, n, n).real() = b.matrixV();
+    m_computed.block(firstCol + shift, firstCol + shift, n + 1, n).setZero();
+    m_computed.diagonal().segment(firstCol + shift, n) = b.singularValues().head(n);
+    return;
+  }
+  // We use the divide and conquer algorithm
+  alphaK =  m_computed(firstCol + k, firstCol + k);
+  betaK = m_computed(firstCol + k + 1, firstCol + k);
+  // The divide must be done in that order in order to have good results. Divide change the data inside the submatrices
+  // and the divide of the right submatrice reads one column of the left submatrice. That's why we need to treat the 
+  // right submatrix before the left one. 
+  divide(k + 1 + firstCol, lastCol, k + 1 + firstRowW, k + 1 + firstColW, shift);
+  divide(firstCol, k - 1 + firstCol, firstRowW, firstColW + 1, shift + 1);
+
+  if (m_compU)
+  {
+    lambda = m_naiveU(firstCol + k, firstCol + k);
+    phi = m_naiveU(firstCol + k + 1, lastCol + 1);
+  } 
+  else 
+  {
+    lambda = m_naiveU(1, firstCol + k);
+    phi = m_naiveU(0, lastCol + 1);
+  }
+  r0 = sqrt((abs(alphaK * lambda) * abs(alphaK * lambda)) + abs(betaK * phi) * abs(betaK * phi));
+  if (m_compU)
+  {
+    l = m_naiveU.row(firstCol + k).segment(firstCol, k);
+    f = m_naiveU.row(firstCol + k + 1).segment(firstCol + k + 1, n - k - 1);
+  } 
+  else 
+  {
+    l = m_naiveU.row(1).segment(firstCol, k);
+    f = m_naiveU.row(0).segment(firstCol + k + 1, n - k - 1);
+  }
+  if (m_compV) m_naiveV(firstRowW+k, firstColW) = 1;
+  if (r0<considerZero)
+  {
+    c0 = 1;
+    s0 = 0;
+  }
+  else
+  {
+    c0 = alphaK * lambda / r0;
+    s0 = betaK * phi / r0;
+  }
+  
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+  assert(m_naiveU.allFinite());
+  assert(m_naiveV.allFinite());
+  assert(m_computed.allFinite());
+#endif
+  
+  if (m_compU)
+  {
+    MatrixXr q1 (m_naiveU.col(firstCol + k).segment(firstCol, k + 1));     
+    // we shiftW Q1 to the right
+    for (Index i = firstCol + k - 1; i >= firstCol; i--) 
+      m_naiveU.col(i + 1).segment(firstCol, k + 1) = m_naiveU.col(i).segment(firstCol, k + 1);
+    // we shift q1 at the left with a factor c0
+    m_naiveU.col(firstCol).segment( firstCol, k + 1) = (q1 * c0);
+    // last column = q1 * - s0
+    m_naiveU.col(lastCol + 1).segment(firstCol, k + 1) = (q1 * ( - s0));
+    // first column = q2 * s0
+    m_naiveU.col(firstCol).segment(firstCol + k + 1, n - k) = m_naiveU.col(lastCol + 1).segment(firstCol + k + 1, n - k) * s0; 
+    // q2 *= c0
+    m_naiveU.col(lastCol + 1).segment(firstCol + k + 1, n - k) *= c0;
+  } 
+  else 
+  {
+    RealScalar q1 = m_naiveU(0, firstCol + k);
+    // we shift Q1 to the right
+    for (Index i = firstCol + k - 1; i >= firstCol; i--) 
+      m_naiveU(0, i + 1) = m_naiveU(0, i);
+    // we shift q1 at the left with a factor c0
+    m_naiveU(0, firstCol) = (q1 * c0);
+    // last column = q1 * - s0
+    m_naiveU(0, lastCol + 1) = (q1 * ( - s0));
+    // first column = q2 * s0
+    m_naiveU(1, firstCol) = m_naiveU(1, lastCol + 1) *s0; 
+    // q2 *= c0
+    m_naiveU(1, lastCol + 1) *= c0;
+    m_naiveU.row(1).segment(firstCol + 1, k).setZero();
+    m_naiveU.row(0).segment(firstCol + k + 1, n - k - 1).setZero();
+  }
+  
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+  assert(m_naiveU.allFinite());
+  assert(m_naiveV.allFinite());
+  assert(m_computed.allFinite());
+#endif
+  
+  m_computed(firstCol + shift, firstCol + shift) = r0;
+  m_computed.col(firstCol + shift).segment(firstCol + shift + 1, k) = alphaK * l.transpose().real();
+  m_computed.col(firstCol + shift).segment(firstCol + shift + k + 1, n - k - 1) = betaK * f.transpose().real();
+
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+  ArrayXr tmp1 = (m_computed.block(firstCol+shift, firstCol+shift, n, n)).jacobiSvd().singularValues();
+#endif
+  // Second part: try to deflate singular values in combined matrix
+  deflation(firstCol, lastCol, k, firstRowW, firstColW, shift);
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+  ArrayXr tmp2 = (m_computed.block(firstCol+shift, firstCol+shift, n, n)).jacobiSvd().singularValues();
+  std::cout << "\n\nj1 = " << tmp1.transpose().format(bdcsvdfmt) << "\n";
+  std::cout << "j2 = " << tmp2.transpose().format(bdcsvdfmt) << "\n\n";
+  std::cout << "err:      " << ((tmp1-tmp2).abs()>1e-12*tmp2.abs()).transpose() << "\n";
+  static int count = 0;
+  std::cout << "# " << ++count << "\n\n";
+  assert((tmp1-tmp2).matrix().norm() < 1e-14*tmp2.matrix().norm());
+//   assert(count<681);
+//   assert(((tmp1-tmp2).abs()<1e-13*tmp2.abs()).all());
+#endif
+  
+  // Third part: compute SVD of combined matrix
+  MatrixXr UofSVD, VofSVD;
+  VectorType singVals;
+  computeSVDofM(firstCol + shift, n, UofSVD, singVals, VofSVD);
+  
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+  assert(UofSVD.allFinite());
+  assert(VofSVD.allFinite());
+#endif
+  
+  if (m_compU)
+    structured_update(m_naiveU.block(firstCol, firstCol, n + 1, n + 1), UofSVD, (n+2)/2);
+  else
+  {
+    Map<Matrix<RealScalar,2,Dynamic>,Aligned> tmp(m_workspace.data(),2,n+1);
+    tmp.noalias() = m_naiveU.middleCols(firstCol, n+1) * UofSVD;
+    m_naiveU.middleCols(firstCol, n + 1) = tmp;
+  }
+  
+  if (m_compV)  structured_update(m_naiveV.block(firstRowW, firstColW, n, n), VofSVD, (n+1)/2);
+  
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+  assert(m_naiveU.allFinite());
+  assert(m_naiveV.allFinite());
+  assert(m_computed.allFinite());
+#endif
+  
+  m_computed.block(firstCol + shift, firstCol + shift, n, n).setZero();
+  m_computed.block(firstCol + shift, firstCol + shift, n, n).diagonal() = singVals;
+}// end divide
+
+// Compute SVD of m_computed.block(firstCol, firstCol, n + 1, n); this block only has non-zeros in
+// the first column and on the diagonal and has undergone deflation, so diagonal is in increasing
+// order except for possibly the (0,0) entry. The computed SVD is stored U, singVals and V, except
+// that if m_compV is false, then V is not computed. Singular values are sorted in decreasing order.
+//
+// TODO Opportunities for optimization: better root finding algo, better stopping criterion, better
+// handling of round-off errors, be consistent in ordering
+// For instance, to solve the secular equation using FMM, see http://www.stat.uchicago.edu/~lekheng/courses/302/classics/greengard-rokhlin.pdf
+template <typename MatrixType>
+void BDCSVD<MatrixType>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V)
+{
+  const RealScalar considerZero = (std::numeric_limits<RealScalar>::min)();
+  using std::abs;
+  ArrayRef col0 = m_computed.col(firstCol).segment(firstCol, n);
+  m_workspace.head(n) =  m_computed.block(firstCol, firstCol, n, n).diagonal();
+  ArrayRef diag = m_workspace.head(n);
+  diag(0) = 0;
+
+  // Allocate space for singular values and vectors
+  singVals.resize(n);
+  U.resize(n+1, n+1);
+  if (m_compV) V.resize(n, n);
+
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+  if (col0.hasNaN() || diag.hasNaN())
+    std::cout << "\n\nHAS NAN\n\n";
+#endif
+  
+  // Many singular values might have been deflated, the zero ones have been moved to the end,
+  // but others are interleaved and we must ignore them at this stage.
+  // To this end, let's compute a permutation skipping them:
+  Index actual_n = n;
+  while(actual_n>1 && diag(actual_n-1)==0) --actual_n;
+  Index m = 0; // size of the deflated problem
+  for(Index k=0;k<actual_n;++k)
+    if(abs(col0(k))>considerZero)
+      m_workspaceI(m++) = k;
+  Map<ArrayXi> perm(m_workspaceI.data(),m);
+  
+  Map<ArrayXr> shifts(m_workspace.data()+1*n, n);
+  Map<ArrayXr> mus(m_workspace.data()+2*n, n);
+  Map<ArrayXr> zhat(m_workspace.data()+3*n, n);
+
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+  std::cout << "computeSVDofM using:\n";
+  std::cout << "  z: " << col0.transpose() << "\n";
+  std::cout << "  d: " << diag.transpose() << "\n";
+#endif
+  
+  // Compute singVals, shifts, and mus
+  computeSingVals(col0, diag, perm, singVals, shifts, mus);
+  
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+  std::cout << "  j:        " << (m_computed.block(firstCol, firstCol, n, n)).jacobiSvd().singularValues().transpose().reverse() << "\n\n";
+  std::cout << "  sing-val: " << singVals.transpose() << "\n";
+  std::cout << "  mu:       " << mus.transpose() << "\n";
+  std::cout << "  shift:    " << shifts.transpose() << "\n";
+  
+  {
+    Index actual_n = n;
+    while(actual_n>1 && abs(col0(actual_n-1))<considerZero) --actual_n;
+    std::cout << "\n\n    mus:    " << mus.head(actual_n).transpose() << "\n\n";
+    std::cout << "    check1 (expect0) : " << ((singVals.array()-(shifts+mus)) / singVals.array()).head(actual_n).transpose() << "\n\n";
+    std::cout << "    check2 (>0)      : " << ((singVals.array()-diag) / singVals.array()).head(actual_n).transpose() << "\n\n";
+    std::cout << "    check3 (>0)      : " << ((diag.segment(1,actual_n-1)-singVals.head(actual_n-1).array()) / singVals.head(actual_n-1).array()).transpose() << "\n\n\n";
+    std::cout << "    check4 (>0)      : " << ((singVals.segment(1,actual_n-1)-singVals.head(actual_n-1))).transpose() << "\n\n\n";
+  }
+#endif
+  
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+  assert(singVals.allFinite());
+  assert(mus.allFinite());
+  assert(shifts.allFinite());
+#endif
+  
+  // Compute zhat
+  perturbCol0(col0, diag, perm, singVals, shifts, mus, zhat);
+#ifdef  EIGEN_BDCSVD_DEBUG_VERBOSE
+  std::cout << "  zhat: " << zhat.transpose() << "\n";
+#endif
+  
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+  assert(zhat.allFinite());
+#endif
+  
+  computeSingVecs(zhat, diag, perm, singVals, shifts, mus, U, V);
+  
+#ifdef  EIGEN_BDCSVD_DEBUG_VERBOSE
+  std::cout << "U^T U: " << (U.transpose() * U - MatrixXr(MatrixXr::Identity(U.cols(),U.cols()))).norm() << "\n";
+  std::cout << "V^T V: " << (V.transpose() * V - MatrixXr(MatrixXr::Identity(V.cols(),V.cols()))).norm() << "\n";
+#endif
+  
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+  assert(U.allFinite());
+  assert(V.allFinite());
+  assert((U.transpose() * U - MatrixXr(MatrixXr::Identity(U.cols(),U.cols()))).norm() < 1e-14 * n);
+  assert((V.transpose() * V - MatrixXr(MatrixXr::Identity(V.cols(),V.cols()))).norm() < 1e-14 * n);
+  assert(m_naiveU.allFinite());
+  assert(m_naiveV.allFinite());
+  assert(m_computed.allFinite());
+#endif
+  
+  // Because of deflation, the singular values might not be completely sorted.
+  // Fortunately, reordering them is a O(n) problem
+  for(Index i=0; i<actual_n-1; ++i)
+  {
+    if(singVals(i)>singVals(i+1))
+    {
+      using std::swap;
+      swap(singVals(i),singVals(i+1));
+      U.col(i).swap(U.col(i+1));
+      if(m_compV) V.col(i).swap(V.col(i+1));
+    }
+  }
+  
+  // Reverse order so that singular values in increased order
+  // Because of deflation, the zeros singular-values are already at the end
+  singVals.head(actual_n).reverseInPlace();
+  U.leftCols(actual_n).rowwise().reverseInPlace();
+  if (m_compV) V.leftCols(actual_n).rowwise().reverseInPlace();
+  
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+  JacobiSVD<MatrixXr> jsvd(m_computed.block(firstCol, firstCol, n, n) );
+  std::cout << "  * j:        " << jsvd.singularValues().transpose() << "\n\n";
+  std::cout << "  * sing-val: " << singVals.transpose() << "\n";
+//   std::cout << "  * err:      " << ((jsvd.singularValues()-singVals)>1e-13*singVals.norm()).transpose() << "\n";
+#endif
+}
+
+template <typename MatrixType>
+typename BDCSVD<MatrixType>::RealScalar BDCSVD<MatrixType>::secularEq(RealScalar mu, const ArrayRef& col0, const ArrayRef& diag, const IndicesRef &perm, const ArrayRef& diagShifted, RealScalar shift)
+{
+  Index m = perm.size();
+  RealScalar res = 1;
+  for(Index i=0; i<m; ++i)
+  {
+    Index j = perm(i);
+    res += numext::abs2(col0(j)) / ((diagShifted(j) - mu) * (diag(j) + shift + mu));
+  }
+  return res;
+
+}
+
+template <typename MatrixType>
+void BDCSVD<MatrixType>::computeSingVals(const ArrayRef& col0, const ArrayRef& diag, const IndicesRef &perm,
+                                         VectorType& singVals, ArrayRef shifts, ArrayRef mus)
+{
+  using std::abs;
+  using std::swap;
+
+  Index n = col0.size();
+  Index actual_n = n;
+  while(actual_n>1 && col0(actual_n-1)==0) --actual_n;
+
+  for (Index k = 0; k < n; ++k)
+  {
+    if (col0(k) == 0 || actual_n==1)
+    {
+      // if col0(k) == 0, then entry is deflated, so singular value is on diagonal
+      // if actual_n==1, then the deflated problem is already diagonalized
+      singVals(k) = k==0 ? col0(0) : diag(k);
+      mus(k) = 0;
+      shifts(k) = k==0 ? col0(0) : diag(k);
+      continue;
+    } 
+
+    // otherwise, use secular equation to find singular value
+    RealScalar left = diag(k);
+    RealScalar right; // was: = (k != actual_n-1) ? diag(k+1) : (diag(actual_n-1) + col0.matrix().norm());
+    if(k==actual_n-1)
+      right = (diag(actual_n-1) + col0.matrix().norm());
+    else
+    {
+      // Skip deflated singular values
+      Index l = k+1;
+      while(col0(l)==0) { ++l; eigen_internal_assert(l<actual_n); }
+      right = diag(l);
+    }
+
+    // first decide whether it's closer to the left end or the right end
+    RealScalar mid = left + (right-left) / 2;
+    RealScalar fMid = secularEq(mid, col0, diag, perm, diag, 0);
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+    std::cout << right-left << "\n";
+    std::cout << "fMid = " << fMid << " " << secularEq(mid-left, col0, diag, perm, diag-left, left) << " " << secularEq(mid-right, col0, diag, perm, diag-right, right)   << "\n";
+    std::cout << "     = " << secularEq(0.1*(left+right), col0, diag, perm, diag, 0)
+              << " "       << secularEq(0.2*(left+right), col0, diag, perm, diag, 0)
+              << " "       << secularEq(0.3*(left+right), col0, diag, perm, diag, 0)
+              << " "       << secularEq(0.4*(left+right), col0, diag, perm, diag, 0)
+              << " "       << secularEq(0.49*(left+right), col0, diag, perm, diag, 0)
+              << " "       << secularEq(0.5*(left+right), col0, diag, perm, diag, 0)
+              << " "       << secularEq(0.51*(left+right), col0, diag, perm, diag, 0)
+              << " "       << secularEq(0.6*(left+right), col0, diag, perm, diag, 0)
+              << " "       << secularEq(0.7*(left+right), col0, diag, perm, diag, 0)
+              << " "       << secularEq(0.8*(left+right), col0, diag, perm, diag, 0)
+              << " "       << secularEq(0.9*(left+right), col0, diag, perm, diag, 0) << "\n";
+#endif
+    RealScalar shift = (k == actual_n-1 || fMid > 0) ? left : right;
+    
+    // measure everything relative to shift
+    Map<ArrayXr> diagShifted(m_workspace.data()+4*n, n);
+    diagShifted = diag - shift;
+    
+    // initial guess
+    RealScalar muPrev, muCur;
+    if (shift == left)
+    {
+      muPrev = (right - left) * RealScalar(0.1);
+      if (k == actual_n-1) muCur = right - left;
+      else                 muCur = (right - left) * RealScalar(0.5);
+    }
+    else
+    {
+      muPrev = -(right - left) * RealScalar(0.1);
+      muCur = -(right - left) * RealScalar(0.5);
+    }
+
+    RealScalar fPrev = secularEq(muPrev, col0, diag, perm, diagShifted, shift);
+    RealScalar fCur = secularEq(muCur, col0, diag, perm, diagShifted, shift);
+    if (abs(fPrev) < abs(fCur))
+    {
+      swap(fPrev, fCur);
+      swap(muPrev, muCur);
+    }
+
+    // rational interpolation: fit a function of the form a / mu + b through the two previous
+    // iterates and use its zero to compute the next iterate
+    bool useBisection = fPrev*fCur>0;
+    while (fCur!=0 && abs(muCur - muPrev) > 8 * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(abs(muCur), abs(muPrev)) && abs(fCur - fPrev)>NumTraits<RealScalar>::epsilon() && !useBisection)
+    {
+      ++m_numIters;
+
+      // Find a and b such that the function f(mu) = a / mu + b matches the current and previous samples.
+      RealScalar a = (fCur - fPrev) / (1/muCur - 1/muPrev);
+      RealScalar b = fCur - a / muCur;
+      // And find mu such that f(mu)==0:
+      RealScalar muZero = -a/b;
+      RealScalar fZero = secularEq(muZero, col0, diag, perm, diagShifted, shift);
+      
+      muPrev = muCur;
+      fPrev = fCur;
+      muCur = muZero;
+      fCur = fZero;
+      
+      
+      if (shift == left  && (muCur < 0 || muCur > right - left)) useBisection = true;
+      if (shift == right && (muCur < -(right - left) || muCur > 0)) useBisection = true;
+      if (abs(fCur)>abs(fPrev)) useBisection = true;
+    }
+
+    // fall back on bisection method if rational interpolation did not work
+    if (useBisection)
+    {
+#ifdef  EIGEN_BDCSVD_DEBUG_VERBOSE
+      std::cout << "useBisection for k = " << k << ", actual_n = " << actual_n << "\n";
+#endif
+      RealScalar leftShifted, rightShifted;
+      if (shift == left)
+      {
+        leftShifted = (std::numeric_limits<RealScalar>::min)();
+        // I don't understand why the case k==0 would be special there:
+        // if (k == 0) rightShifted = right - left; else 
+        rightShifted = (k==actual_n-1) ? right : ((right - left) * RealScalar(0.6)); // theoretically we can take 0.5, but let's be safe
+      }
+      else
+      {
+        leftShifted = -(right - left) * RealScalar(0.6);
+        rightShifted = -(std::numeric_limits<RealScalar>::min)();
+      }
+      
+      RealScalar fLeft = secularEq(leftShifted, col0, diag, perm, diagShifted, shift);
+
+#if defined EIGEN_INTERNAL_DEBUGGING || defined EIGEN_BDCSVD_DEBUG_VERBOSE
+      RealScalar fRight = secularEq(rightShifted, col0, diag, perm, diagShifted, shift);
+#endif
+
+#ifdef  EIGEN_BDCSVD_DEBUG_VERBOSE
+      if(!(fLeft * fRight<0))
+      {
+        std::cout << "fLeft: " << leftShifted << " - " << diagShifted.head(10).transpose()  << "\n ; " << bool(left==shift) << " " << (left-shift) << "\n";
+        std::cout << k << " : " <<  fLeft << " * " << fRight << " == " << fLeft * fRight << "  ;  " << left << " - " << right << " -> " <<  leftShifted << " " << rightShifted << "   shift=" << shift << "\n";
+      }
+#endif
+      eigen_internal_assert(fLeft * fRight < 0);
+      
+      while (rightShifted - leftShifted > 2 * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(abs(leftShifted), abs(rightShifted)))
+      {
+        RealScalar midShifted = (leftShifted + rightShifted) / 2;
+        fMid = secularEq(midShifted, col0, diag, perm, diagShifted, shift);
+        if (fLeft * fMid < 0)
+        {
+          rightShifted = midShifted;
+        }
+        else
+        {
+          leftShifted = midShifted;
+          fLeft = fMid;
+        }
+      }
+
+      muCur = (leftShifted + rightShifted) / 2;
+    }
+      
+    singVals[k] = shift + muCur;
+    shifts[k] = shift;
+    mus[k] = muCur;
+
+    // perturb singular value slightly if it equals diagonal entry to avoid division by zero later
+    // (deflation is supposed to avoid this from happening)
+    // - this does no seem to be necessary anymore -
+//     if (singVals[k] == left) singVals[k] *= 1 + NumTraits<RealScalar>::epsilon();
+//     if (singVals[k] == right) singVals[k] *= 1 - NumTraits<RealScalar>::epsilon();
+  }
+}
+
+
+// zhat is perturbation of col0 for which singular vectors can be computed stably (see Section 3.1)
+template <typename MatrixType>
+void BDCSVD<MatrixType>::perturbCol0
+   (const ArrayRef& col0, const ArrayRef& diag, const IndicesRef &perm, const VectorType& singVals,
+    const ArrayRef& shifts, const ArrayRef& mus, ArrayRef zhat)
+{
+  using std::sqrt;
+  Index n = col0.size();
+  Index m = perm.size();
+  if(m==0)
+  {
+    zhat.setZero();
+    return;
+  }
+  Index last = perm(m-1);
+  // The offset permits to skip deflated entries while computing zhat
+  for (Index k = 0; k < n; ++k)
+  {
+    if (col0(k) == 0) // deflated
+      zhat(k) = 0;
+    else
+    {
+      // see equation (3.6)
+      RealScalar dk = diag(k);
+      RealScalar prod = (singVals(last) + dk) * (mus(last) + (shifts(last) - dk));
+
+      for(Index l = 0; l<m; ++l)
+      {
+        Index i = perm(l);
+        if(i!=k)
+        {
+          Index j = i<k ? i : perm(l-1);
+          prod *= ((singVals(j)+dk) / ((diag(i)+dk))) * ((mus(j)+(shifts(j)-dk)) / ((diag(i)-dk)));
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+          if(i!=k && std::abs(((singVals(j)+dk)*(mus(j)+(shifts(j)-dk)))/((diag(i)+dk)*(diag(i)-dk)) - 1) > 0.9 )
+            std::cout << "     " << ((singVals(j)+dk)*(mus(j)+(shifts(j)-dk)))/((diag(i)+dk)*(diag(i)-dk)) << " == (" << (singVals(j)+dk) << " * " << (mus(j)+(shifts(j)-dk))
+                       << ") / (" << (diag(i)+dk) << " * " << (diag(i)-dk) << ")\n";
+#endif
+        }
+      }
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+      std::cout << "zhat(" << k << ") =  sqrt( " << prod << ")  ;  " << (singVals(last) + dk) << " * " << mus(last) + shifts(last) << " - " << dk << "\n";
+#endif
+      RealScalar tmp = sqrt(prod);
+      zhat(k) = col0(k) > 0 ? tmp : -tmp;
+    }
+  }
+}
+
+// compute singular vectors
+template <typename MatrixType>
+void BDCSVD<MatrixType>::computeSingVecs
+   (const ArrayRef& zhat, const ArrayRef& diag, const IndicesRef &perm, const VectorType& singVals,
+    const ArrayRef& shifts, const ArrayRef& mus, MatrixXr& U, MatrixXr& V)
+{
+  Index n = zhat.size();
+  Index m = perm.size();
+  
+  for (Index k = 0; k < n; ++k)
+  {
+    if (zhat(k) == 0)
+    {
+      U.col(k) = VectorType::Unit(n+1, k);
+      if (m_compV) V.col(k) = VectorType::Unit(n, k);
+    }
+    else
+    {
+      U.col(k).setZero();
+      for(Index l=0;l<m;++l)
+      {
+        Index i = perm(l);
+        U(i,k) = zhat(i)/(((diag(i) - shifts(k)) - mus(k)) )/( (diag(i) + singVals[k]));
+      }
+      U(n,k) = 0;      
+      U.col(k).normalize();
+    
+      if (m_compV)
+      {
+        V.col(k).setZero();
+        for(Index l=1;l<m;++l)
+        {
+          Index i = perm(l);
+          V(i,k) = diag(i) * zhat(i) / (((diag(i) - shifts(k)) - mus(k)) )/( (diag(i) + singVals[k]));
+        }
+        V(0,k) = -1;
+        V.col(k).normalize();
+      }
+    }
+  }
+  U.col(n) = VectorType::Unit(n+1, n);
+}
+
+
+// page 12_13
+// i >= 1, di almost null and zi non null.
+// We use a rotation to zero out zi applied to the left of M
+template <typename MatrixType>
+void BDCSVD<MatrixType>::deflation43(Index firstCol, Index shift, Index i, Index size)
+{
+  using std::abs;
+  using std::sqrt;
+  using std::pow;
+  Index start = firstCol + shift;
+  RealScalar c = m_computed(start, start);
+  RealScalar s = m_computed(start+i, start);
+  RealScalar r = sqrt(numext::abs2(c) + numext::abs2(s));
+  if (r == 0)
+  {
+    m_computed(start+i, start+i) = 0;
+    return;
+  }
+  m_computed(start,start) = r;  
+  m_computed(start+i, start) = 0;
+  m_computed(start+i, start+i) = 0;
+  
+  JacobiRotation<RealScalar> J(c/r,-s/r);
+  if (m_compU)  m_naiveU.middleRows(firstCol, size+1).applyOnTheRight(firstCol, firstCol+i, J);
+  else          m_naiveU.applyOnTheRight(firstCol, firstCol+i, J);
+}// end deflation 43
+
+
+// page 13
+// i,j >= 1, i!=j and |di - dj| < epsilon * norm2(M)
+// We apply two rotations to have zj = 0;
+// TODO deflation44 is still broken and not properly tested
+template <typename MatrixType>
+void BDCSVD<MatrixType>::deflation44(Index firstColu , Index firstColm, Index firstRowW, Index firstColW, Index i, Index j, Index size)
+{
+  using std::abs;
+  using std::sqrt;
+  using std::conj;
+  using std::pow;
+  RealScalar c = m_computed(firstColm+i, firstColm);
+  RealScalar s = m_computed(firstColm+j, firstColm);
+  RealScalar r = sqrt(numext::abs2(c) + numext::abs2(s));
+#ifdef  EIGEN_BDCSVD_DEBUG_VERBOSE
+  std::cout << "deflation 4.4: " << i << "," << j << " -> " << c << " " << s << " " << r << " ; "
+    << m_computed(firstColm + i-1, firstColm)  << " "
+    << m_computed(firstColm + i, firstColm)  << " "
+    << m_computed(firstColm + i+1, firstColm) << " "
+    << m_computed(firstColm + i+2, firstColm) << "\n";
+  std::cout << m_computed(firstColm + i-1, firstColm + i-1)  << " "
+    << m_computed(firstColm + i, firstColm+i)  << " "
+    << m_computed(firstColm + i+1, firstColm+i+1) << " "
+    << m_computed(firstColm + i+2, firstColm+i+2) << "\n";
+#endif
+  if (r==0)
+  {
+    m_computed(firstColm + i, firstColm + i) = m_computed(firstColm + j, firstColm + j);
+    return;
+  }
+  c/=r;
+  s/=r;
+  m_computed(firstColm + i, firstColm) = r;  
+  m_computed(firstColm + j, firstColm + j) = m_computed(firstColm + i, firstColm + i);
+  m_computed(firstColm + j, firstColm) = 0;
+
+  JacobiRotation<RealScalar> J(c,-s);
+  if (m_compU)  m_naiveU.middleRows(firstColu, size+1).applyOnTheRight(firstColu + i, firstColu + j, J);
+  else          m_naiveU.applyOnTheRight(firstColu+i, firstColu+j, J);
+  if (m_compV)  m_naiveV.middleRows(firstRowW, size).applyOnTheRight(firstColW + i, firstColW + j, J);
+}// end deflation 44
+
+
+// acts on block from (firstCol+shift, firstCol+shift) to (lastCol+shift, lastCol+shift) [inclusive]
+template <typename MatrixType>
+void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index firstRowW, Index firstColW, Index shift)
+{
+  using std::sqrt;
+  using std::abs;
+  const Index length = lastCol + 1 - firstCol;
+  
+  Block<MatrixXr,Dynamic,1> col0(m_computed, firstCol+shift, firstCol+shift, length, 1);
+  Diagonal<MatrixXr> fulldiag(m_computed);
+  VectorBlock<Diagonal<MatrixXr>,Dynamic> diag(fulldiag, firstCol+shift, length);
+  
+  const RealScalar considerZero = (std::numeric_limits<RealScalar>::min)();
+  RealScalar maxDiag = diag.tail((std::max)(Index(1),length-1)).cwiseAbs().maxCoeff();
+  RealScalar epsilon_strict = numext::maxi<RealScalar>(considerZero,NumTraits<RealScalar>::epsilon() * maxDiag);
+  RealScalar epsilon_coarse = 8 * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(col0.cwiseAbs().maxCoeff(), maxDiag);
+  
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+  assert(m_naiveU.allFinite());
+  assert(m_naiveV.allFinite());
+  assert(m_computed.allFinite());
+#endif
+
+#ifdef  EIGEN_BDCSVD_DEBUG_VERBOSE  
+  std::cout << "\ndeflate:" << diag.head(k+1).transpose() << "  |  " << diag.segment(k+1,length-k-1).transpose() << "\n";
+#endif
+  
+  //condition 4.1
+  if (diag(0) < epsilon_coarse)
+  { 
+#ifdef  EIGEN_BDCSVD_DEBUG_VERBOSE
+    std::cout << "deflation 4.1, because " << diag(0) << " < " << epsilon_coarse << "\n";
+#endif
+    diag(0) = epsilon_coarse;
+  }
+
+  //condition 4.2
+  for (Index i=1;i<length;++i)
+    if (abs(col0(i)) < epsilon_strict)
+    {
+#ifdef  EIGEN_BDCSVD_DEBUG_VERBOSE
+      std::cout << "deflation 4.2, set z(" << i << ") to zero because " << abs(col0(i)) << " < " << epsilon_strict << "  (diag(" << i << ")=" << diag(i) << ")\n";
+#endif
+      col0(i) = 0;
+    }
+
+  //condition 4.3
+  for (Index i=1;i<length; i++)
+    if (diag(i) < epsilon_coarse)
+    {
+#ifdef  EIGEN_BDCSVD_DEBUG_VERBOSE
+      std::cout << "deflation 4.3, cancel z(" << i << ")=" << col0(i) << " because diag(" << i << ")=" << diag(i) << " < " << epsilon_coarse << "\n";
+#endif
+      deflation43(firstCol, shift, i, length);
+    }
+
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+  assert(m_naiveU.allFinite());
+  assert(m_naiveV.allFinite());
+  assert(m_computed.allFinite());
+#endif
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+  std::cout << "to be sorted: " << diag.transpose() << "\n\n";
+#endif
+  {
+    // Check for total deflation
+    // If we have a total deflation, then we have to consider col0(0)==diag(0) as a singular value during sorting
+    bool total_deflation = (col0.tail(length-1).array()<considerZero).all();
+    
+    // Sort the diagonal entries, since diag(1:k-1) and diag(k:length) are already sorted, let's do a sorted merge.
+    // First, compute the respective permutation.
+    Index *permutation = m_workspaceI.data();
+    {
+      permutation[0] = 0;
+      Index p = 1;
+      
+      // Move deflated diagonal entries at the end.
+      for(Index i=1; i<length; ++i)
+        if(abs(diag(i))<considerZero)
+          permutation[p++] = i;
+        
+      Index i=1, j=k+1;
+      for( ; p < length; ++p)
+      {
+             if (i > k)             permutation[p] = j++;
+        else if (j >= length)       permutation[p] = i++;
+        else if (diag(i) < diag(j)) permutation[p] = j++;
+        else                        permutation[p] = i++;
+      }
+    }
+    
+    // If we have a total deflation, then we have to insert diag(0) at the right place
+    if(total_deflation)
+    {
+      for(Index i=1; i<length; ++i)
+      {
+        Index pi = permutation[i];
+        if(abs(diag(pi))<considerZero || diag(0)<diag(pi))
+          permutation[i-1] = permutation[i];
+        else
+        {
+          permutation[i-1] = 0;
+          break;
+        }
+      }
+    }
+    
+    // Current index of each col, and current column of each index
+    Index *realInd = m_workspaceI.data()+length;
+    Index *realCol = m_workspaceI.data()+2*length;
+    
+    for(int pos = 0; pos< length; pos++)
+    {
+      realCol[pos] = pos;
+      realInd[pos] = pos;
+    }
+    
+    for(Index i = total_deflation?0:1; i < length; i++)
+    {
+      const Index pi = permutation[length - (total_deflation ? i+1 : i)];
+      const Index J = realCol[pi];
+      
+      using std::swap;
+      // swap diagonal and first column entries:
+      swap(diag(i), diag(J));
+      if(i!=0 && J!=0) swap(col0(i), col0(J));
+
+      // change columns
+      if (m_compU) m_naiveU.col(firstCol+i).segment(firstCol, length + 1).swap(m_naiveU.col(firstCol+J).segment(firstCol, length + 1));
+      else         m_naiveU.col(firstCol+i).segment(0, 2)                .swap(m_naiveU.col(firstCol+J).segment(0, 2));
+      if (m_compV) m_naiveV.col(firstColW + i).segment(firstRowW, length).swap(m_naiveV.col(firstColW + J).segment(firstRowW, length));
+
+      //update real pos
+      const Index realI = realInd[i];
+      realCol[realI] = J;
+      realCol[pi] = i;
+      realInd[J] = realI;
+      realInd[i] = pi;
+    }
+  }
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+  std::cout << "sorted: " << diag.transpose().format(bdcsvdfmt) << "\n";
+  std::cout << "      : " << col0.transpose() << "\n\n";
+#endif
+    
+  //condition 4.4
+  {
+    Index i = length-1;
+    while(i>0 && (abs(diag(i))<considerZero || abs(col0(i))<considerZero)) --i;
+    for(; i>1;--i)
+       if( (diag(i) - diag(i-1)) < NumTraits<RealScalar>::epsilon()*maxDiag )
+      {
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+        std::cout << "deflation 4.4 with i = " << i << " because " << (diag(i) - diag(i-1)) << " < " << NumTraits<RealScalar>::epsilon()*diag(i) << "\n";
+#endif
+        eigen_internal_assert(abs(diag(i) - diag(i-1))<epsilon_coarse && " diagonal entries are not properly sorted");
+        deflation44(firstCol, firstCol + shift, firstRowW, firstColW, i-1, i, length);
+      }
+  }
+  
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+  for(Index j=2;j<length;++j)
+    assert(diag(j-1)<=diag(j) || abs(diag(j))<considerZero);
+#endif
+  
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+  assert(m_naiveU.allFinite());
+  assert(m_naiveV.allFinite());
+  assert(m_computed.allFinite());
+#endif
+}//end deflation
+
+#ifndef __CUDACC__
+/** \svd_module
+  *
+  * \return the singular value decomposition of \c *this computed by Divide & Conquer algorithm
+  *
+  * \sa class BDCSVD
+  */
+template<typename Derived>
+BDCSVD<typename MatrixBase<Derived>::PlainObject>
+MatrixBase<Derived>::bdcSvd(unsigned int computationOptions) const
+{
+  return BDCSVD<PlainObject>(*this, computationOptions);
+}
+#endif
+
+} // end namespace Eigen
+
+#endif
diff --git a/Eigen/src/SVD/CMakeLists.txt b/Eigen/src/SVD/CMakeLists.txt
deleted file mode 100644
index 55efc44b1..000000000
--- a/Eigen/src/SVD/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_SVD_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_SVD_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/SVD COMPONENT Devel
-  )
diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h
index 1b2977419..43488b1e0 100644
--- a/Eigen/src/SVD/JacobiSVD.h
+++ b/Eigen/src/SVD/JacobiSVD.h
@@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2009-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2013-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -51,7 +52,6 @@ template<typename MatrixType, int QRPreconditioner, int Case>
 class qr_preconditioner_impl<MatrixType, QRPreconditioner, Case, false>
 {
 public:
-  typedef typename MatrixType::Index Index;
   void allocate(const JacobiSVD<MatrixType, QRPreconditioner>&) {}
   bool run(JacobiSVD<MatrixType, QRPreconditioner>&, const MatrixType&)
   {
@@ -65,7 +65,6 @@ template<typename MatrixType>
 class qr_preconditioner_impl<MatrixType, FullPivHouseholderQRPreconditioner, PreconditionIfMoreRowsThanCols, true>
 {
 public:
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::Scalar Scalar;
   enum
   {
@@ -106,7 +105,6 @@ template<typename MatrixType>
 class qr_preconditioner_impl<MatrixType, FullPivHouseholderQRPreconditioner, PreconditionIfMoreColsThanRows, true>
 {
 public:
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::Scalar Scalar;
   enum
   {
@@ -114,9 +112,11 @@ public:
     ColsAtCompileTime = MatrixType::ColsAtCompileTime,
     MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
-    Options = MatrixType::Options
+    TrOptions = RowsAtCompileTime==1 ? (MatrixType::Options & ~(RowMajor))
+              : ColsAtCompileTime==1 ? (MatrixType::Options |   RowMajor)
+              : MatrixType::Options
   };
-  typedef Matrix<Scalar, ColsAtCompileTime, RowsAtCompileTime, Options, MaxColsAtCompileTime, MaxRowsAtCompileTime>
+  typedef Matrix<Scalar, ColsAtCompileTime, RowsAtCompileTime, TrOptions, MaxColsAtCompileTime, MaxRowsAtCompileTime>
           TransposeTypeWithSameStorageOrder;
 
   void allocate(const JacobiSVD<MatrixType, FullPivHouseholderQRPreconditioner>& svd)
@@ -156,8 +156,6 @@ template<typename MatrixType>
 class qr_preconditioner_impl<MatrixType, ColPivHouseholderQRPreconditioner, PreconditionIfMoreRowsThanCols, true>
 {
 public:
-  typedef typename MatrixType::Index Index;
-
   void allocate(const JacobiSVD<MatrixType, ColPivHouseholderQRPreconditioner>& svd)
   {
     if (svd.rows() != m_qr.rows() || svd.cols() != m_qr.cols())
@@ -197,7 +195,6 @@ template<typename MatrixType>
 class qr_preconditioner_impl<MatrixType, ColPivHouseholderQRPreconditioner, PreconditionIfMoreColsThanRows, true>
 {
 public:
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::Scalar Scalar;
   enum
   {
@@ -205,10 +202,12 @@ public:
     ColsAtCompileTime = MatrixType::ColsAtCompileTime,
     MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
-    Options = MatrixType::Options
+    TrOptions = RowsAtCompileTime==1 ? (MatrixType::Options & ~(RowMajor))
+              : ColsAtCompileTime==1 ? (MatrixType::Options |   RowMajor)
+              : MatrixType::Options
   };
 
-  typedef Matrix<Scalar, ColsAtCompileTime, RowsAtCompileTime, Options, MaxColsAtCompileTime, MaxRowsAtCompileTime>
+  typedef Matrix<Scalar, ColsAtCompileTime, RowsAtCompileTime, TrOptions, MaxColsAtCompileTime, MaxRowsAtCompileTime>
           TransposeTypeWithSameStorageOrder;
 
   void allocate(const JacobiSVD<MatrixType, ColPivHouseholderQRPreconditioner>& svd)
@@ -256,8 +255,6 @@ template<typename MatrixType>
 class qr_preconditioner_impl<MatrixType, HouseholderQRPreconditioner, PreconditionIfMoreRowsThanCols, true>
 {
 public:
-  typedef typename MatrixType::Index Index;
-
   void allocate(const JacobiSVD<MatrixType, HouseholderQRPreconditioner>& svd)
   {
     if (svd.rows() != m_qr.rows() || svd.cols() != m_qr.cols())
@@ -296,7 +293,6 @@ template<typename MatrixType>
 class qr_preconditioner_impl<MatrixType, HouseholderQRPreconditioner, PreconditionIfMoreColsThanRows, true>
 {
 public:
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::Scalar Scalar;
   enum
   {
@@ -358,8 +354,8 @@ template<typename MatrixType, int QRPreconditioner>
 struct svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner, false>
 {
   typedef JacobiSVD<MatrixType, QRPreconditioner> SVD;
-  typedef typename SVD::Index Index;
-  static void run(typename SVD::WorkMatrixType&, SVD&, Index, Index) {}
+  typedef typename MatrixType::RealScalar RealScalar;
+  static bool run(typename SVD::WorkMatrixType&, SVD&, Index, Index, RealScalar&) { return true; }
 };
 
 template<typename MatrixType, int QRPreconditioner>
@@ -368,20 +364,30 @@ struct svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner, true>
   typedef JacobiSVD<MatrixType, QRPreconditioner> SVD;
   typedef typename MatrixType::Scalar Scalar;
   typedef typename MatrixType::RealScalar RealScalar;
-  typedef typename SVD::Index Index;
-  static void run(typename SVD::WorkMatrixType& work_matrix, SVD& svd, Index p, Index q)
+  static bool run(typename SVD::WorkMatrixType& work_matrix, SVD& svd, Index p, Index q, RealScalar& maxDiagEntry)
   {
     using std::sqrt;
+    using std::abs;
     Scalar z;
     JacobiRotation<Scalar> rot;
     RealScalar n = sqrt(numext::abs2(work_matrix.coeff(p,p)) + numext::abs2(work_matrix.coeff(q,p)));
-    
+
+    const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
+    const RealScalar precision = NumTraits<Scalar>::epsilon();
+
     if(n==0)
     {
-      z = abs(work_matrix.coeff(p,q)) / work_matrix.coeff(p,q);
-      work_matrix.row(p) *= z;
-      if(svd.computeU()) svd.m_matrixU.col(p) *= conj(z);
-      if(work_matrix.coeff(q,q)!=Scalar(0))
+      // make sure first column is zero
+      work_matrix.coeffRef(p,p) = work_matrix.coeffRef(q,p) = Scalar(0);
+
+      if(abs(numext::imag(work_matrix.coeff(p,q)))>considerAsZero)
+      {
+        // work_matrix.coeff(p,q) can be zero if work_matrix.coeff(q,p) is not zero but small enough to underflow when computing n
+        z = abs(work_matrix.coeff(p,q)) / work_matrix.coeff(p,q);
+        work_matrix.row(p) *= z;
+        if(svd.computeU()) svd.m_matrixU.col(p) *= conj(z);
+      }
+      if(abs(numext::imag(work_matrix.coeff(q,q)))>considerAsZero)
       {
         z = abs(work_matrix.coeff(q,q)) / work_matrix.coeff(q,q);
         work_matrix.row(q) *= z;
@@ -395,52 +401,33 @@ struct svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner, true>
       rot.s() = work_matrix.coeff(q,p) / n;
       work_matrix.applyOnTheLeft(p,q,rot);
       if(svd.computeU()) svd.m_matrixU.applyOnTheRight(p,q,rot.adjoint());
-      if(work_matrix.coeff(p,q) != Scalar(0))
+      if(abs(numext::imag(work_matrix.coeff(p,q)))>considerAsZero)
       {
-        Scalar z = abs(work_matrix.coeff(p,q)) / work_matrix.coeff(p,q);
+        z = abs(work_matrix.coeff(p,q)) / work_matrix.coeff(p,q);
         work_matrix.col(q) *= z;
         if(svd.computeV()) svd.m_matrixV.col(q) *= z;
       }
-      if(work_matrix.coeff(q,q) != Scalar(0))
+      if(abs(numext::imag(work_matrix.coeff(q,q)))>considerAsZero)
       {
         z = abs(work_matrix.coeff(q,q)) / work_matrix.coeff(q,q);
         work_matrix.row(q) *= z;
         if(svd.computeU()) svd.m_matrixU.col(q) *= conj(z);
       }
     }
+
+    // update largest diagonal entry
+    maxDiagEntry = numext::maxi<RealScalar>(maxDiagEntry,numext::maxi<RealScalar>(abs(work_matrix.coeff(p,p)), abs(work_matrix.coeff(q,q))));
+    // and check whether the 2x2 block is already diagonal
+    RealScalar threshold = numext::maxi<RealScalar>(considerAsZero, precision * maxDiagEntry);
+    return abs(work_matrix.coeff(p,q))>threshold || abs(work_matrix.coeff(q,p)) > threshold;
   }
 };
 
-template<typename MatrixType, typename RealScalar, typename Index>
-void real_2x2_jacobi_svd(const MatrixType& matrix, Index p, Index q,
-                            JacobiRotation<RealScalar> *j_left,
-                            JacobiRotation<RealScalar> *j_right)
+template<typename _MatrixType, int QRPreconditioner> 
+struct traits<JacobiSVD<_MatrixType,QRPreconditioner> >
 {
-  using std::sqrt;
-  using std::abs;
-  Matrix<RealScalar,2,2> m;
-  m << numext::real(matrix.coeff(p,p)), numext::real(matrix.coeff(p,q)),
-       numext::real(matrix.coeff(q,p)), numext::real(matrix.coeff(q,q));
-  JacobiRotation<RealScalar> rot1;
-  RealScalar t = m.coeff(0,0) + m.coeff(1,1);
-  RealScalar d = m.coeff(1,0) - m.coeff(0,1);
-  if(t == RealScalar(0))
-  {
-    rot1.c() = RealScalar(0);
-    rot1.s() = d > RealScalar(0) ? RealScalar(1) : RealScalar(-1);
-  }
-  else
-  {
-    RealScalar t2d2 = numext::hypot(t,d);
-    rot1.c() = abs(t)/t2d2;
-    rot1.s() = d/t2d2;
-    if(t<RealScalar(0))
-      rot1.s() = -rot1.s();
-  }
-  m.applyOnTheLeft(0,1,rot1);
-  j_right->makeJacobi(m,0,1);
-  *j_left  = rot1 * j_right->transpose();
-}
+  typedef _MatrixType MatrixType;
+};
 
 } // end namespace internal
 
@@ -451,8 +438,8 @@ void real_2x2_jacobi_svd(const MatrixType& matrix, Index p, Index q,
   *
   * \brief Two-sided Jacobi SVD decomposition of a rectangular matrix
   *
-  * \param MatrixType the type of the matrix of which we are computing the SVD decomposition
-  * \param QRPreconditioner this optional parameter allows to specify the type of QR decomposition that will be used internally
+  * \tparam _MatrixType the type of the matrix of which we are computing the SVD decomposition
+  * \tparam QRPreconditioner this optional parameter allows to specify the type of QR decomposition that will be used internally
   *                        for the R-SVD step for non-square matrices. See discussion of possible values below.
   *
   * SVD decomposition consists in decomposing any n-by-p matrix \a A as a product
@@ -498,13 +485,14 @@ void real_2x2_jacobi_svd(const MatrixType& matrix, Index p, Index q,
   * \sa MatrixBase::jacobiSvd()
   */
 template<typename _MatrixType, int QRPreconditioner> class JacobiSVD
+ : public SVDBase<JacobiSVD<_MatrixType,QRPreconditioner> >
 {
+    typedef SVDBase<JacobiSVD> Base;
   public:
 
     typedef _MatrixType MatrixType;
     typedef typename MatrixType::Scalar Scalar;
     typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
-    typedef typename MatrixType::Index Index;
     enum {
       RowsAtCompileTime = MatrixType::RowsAtCompileTime,
       ColsAtCompileTime = MatrixType::ColsAtCompileTime,
@@ -515,13 +503,10 @@ template<typename _MatrixType, int QRPreconditioner> class JacobiSVD
       MatrixOptions = MatrixType::Options
     };
 
-    typedef Matrix<Scalar, RowsAtCompileTime, RowsAtCompileTime,
-                   MatrixOptions, MaxRowsAtCompileTime, MaxRowsAtCompileTime>
-            MatrixUType;
-    typedef Matrix<Scalar, ColsAtCompileTime, ColsAtCompileTime,
-                   MatrixOptions, MaxColsAtCompileTime, MaxColsAtCompileTime>
-            MatrixVType;
-    typedef typename internal::plain_diag_type<MatrixType, RealScalar>::type SingularValuesType;
+    typedef typename Base::MatrixUType MatrixUType;
+    typedef typename Base::MatrixVType MatrixVType;
+    typedef typename Base::SingularValuesType SingularValuesType;
+    
     typedef typename internal::plain_row_type<MatrixType>::type RowType;
     typedef typename internal::plain_col_type<MatrixType>::type ColType;
     typedef Matrix<Scalar, DiagSizeAtCompileTime, DiagSizeAtCompileTime,
@@ -534,11 +519,6 @@ template<typename _MatrixType, int QRPreconditioner> class JacobiSVD
       * perform decompositions via JacobiSVD::compute(const MatrixType&).
       */
     JacobiSVD()
-      : m_isInitialized(false),
-        m_isAllocated(false),
-        m_usePrescribedThreshold(false),
-        m_computationOptions(0),
-        m_rows(-1), m_cols(-1), m_diagSize(0)
     {}
 
 
@@ -549,11 +529,6 @@ template<typename _MatrixType, int QRPreconditioner> class JacobiSVD
       * \sa JacobiSVD()
       */
     JacobiSVD(Index rows, Index cols, unsigned int computationOptions = 0)
-      : m_isInitialized(false),
-        m_isAllocated(false),
-        m_usePrescribedThreshold(false),
-        m_computationOptions(0),
-        m_rows(-1), m_cols(-1)
     {
       allocate(rows, cols, computationOptions);
     }
@@ -568,12 +543,7 @@ template<typename _MatrixType, int QRPreconditioner> class JacobiSVD
      * Thin unitaries are only available if your matrix type has a Dynamic number of columns (for example MatrixXf). They also are not
      * available with the (non-default) FullPivHouseholderQR preconditioner.
      */
-    JacobiSVD(const MatrixType& matrix, unsigned int computationOptions = 0)
-      : m_isInitialized(false),
-        m_isAllocated(false),
-        m_usePrescribedThreshold(false),
-        m_computationOptions(0),
-        m_rows(-1), m_cols(-1)
+    explicit JacobiSVD(const MatrixType& matrix, unsigned int computationOptions = 0)
     {
       compute(matrix, computationOptions);
     }
@@ -601,164 +571,33 @@ template<typename _MatrixType, int QRPreconditioner> class JacobiSVD
       return compute(matrix, m_computationOptions);
     }
 
-    /** \returns the \a U matrix.
-     *
-     * For the SVD decomposition of a n-by-p matrix, letting \a m be the minimum of \a n and \a p,
-     * the U matrix is n-by-n if you asked for #ComputeFullU, and is n-by-m if you asked for #ComputeThinU.
-     *
-     * The \a m first columns of \a U are the left singular vectors of the matrix being decomposed.
-     *
-     * This method asserts that you asked for \a U to be computed.
-     */
-    const MatrixUType& matrixU() const
-    {
-      eigen_assert(m_isInitialized && "JacobiSVD is not initialized.");
-      eigen_assert(computeU() && "This JacobiSVD decomposition didn't compute U. Did you ask for it?");
-      return m_matrixU;
-    }
-
-    /** \returns the \a V matrix.
-     *
-     * For the SVD decomposition of a n-by-p matrix, letting \a m be the minimum of \a n and \a p,
-     * the V matrix is p-by-p if you asked for #ComputeFullV, and is p-by-m if you asked for ComputeThinV.
-     *
-     * The \a m first columns of \a V are the right singular vectors of the matrix being decomposed.
-     *
-     * This method asserts that you asked for \a V to be computed.
-     */
-    const MatrixVType& matrixV() const
-    {
-      eigen_assert(m_isInitialized && "JacobiSVD is not initialized.");
-      eigen_assert(computeV() && "This JacobiSVD decomposition didn't compute V. Did you ask for it?");
-      return m_matrixV;
-    }
-
-    /** \returns the vector of singular values.
-     *
-     * For the SVD decomposition of a n-by-p matrix, letting \a m be the minimum of \a n and \a p, the
-     * returned vector has size \a m.  Singular values are always sorted in decreasing order.
-     */
-    const SingularValuesType& singularValues() const
-    {
-      eigen_assert(m_isInitialized && "JacobiSVD is not initialized.");
-      return m_singularValues;
-    }
-
-    /** \returns true if \a U (full or thin) is asked for in this SVD decomposition */
-    inline bool computeU() const { return m_computeFullU || m_computeThinU; }
-    /** \returns true if \a V (full or thin) is asked for in this SVD decomposition */
-    inline bool computeV() const { return m_computeFullV || m_computeThinV; }
-
-    /** \returns a (least squares) solution of \f$ A x = b \f$ using the current SVD decomposition of A.
-      *
-      * \param b the right-hand-side of the equation to solve.
-      *
-      * \note Solving requires both U and V to be computed. Thin U and V are enough, there is no need for full U or V.
-      *
-      * \note SVD solving is implicitly least-squares. Thus, this method serves both purposes of exact solving and least-squares solving.
-      * In other words, the returned solution is guaranteed to minimize the Euclidean norm \f$ \Vert A x - b \Vert \f$.
-      */
-    template<typename Rhs>
-    inline const internal::solve_retval<JacobiSVD, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "JacobiSVD is not initialized.");
-      eigen_assert(computeU() && computeV() && "JacobiSVD::solve() requires both unitaries U and V to be computed (thin unitaries suffice).");
-      return internal::solve_retval<JacobiSVD, Rhs>(*this, b.derived());
-    }
-
-    /** \returns the number of singular values that are not exactly 0 */
-    Index nonzeroSingularValues() const
-    {
-      eigen_assert(m_isInitialized && "JacobiSVD is not initialized.");
-      return m_nonzeroSingularValues;
-    }
-    
-    /** \returns the rank of the matrix of which \c *this is the SVD.
-      *
-      * \note This method has to determine which singular values should be considered nonzero.
-      *       For that, it uses the threshold value that you can control by calling
-      *       setThreshold(const RealScalar&).
-      */
-    inline Index rank() const
-    {
-      using std::abs;
-      eigen_assert(m_isInitialized && "JacobiSVD is not initialized.");
-      if(m_singularValues.size()==0) return 0;
-      RealScalar premultiplied_threshold = m_singularValues.coeff(0) * threshold();
-      Index i = m_nonzeroSingularValues-1;
-      while(i>=0 && m_singularValues.coeff(i) < premultiplied_threshold) --i;
-      return i+1;
-    }
-    
-    /** Allows to prescribe a threshold to be used by certain methods, such as rank() and solve(),
-      * which need to determine when singular values are to be considered nonzero.
-      * This is not used for the SVD decomposition itself.
-      *
-      * When it needs to get the threshold value, Eigen calls threshold().
-      * The default is \c NumTraits<Scalar>::epsilon()
-      *
-      * \param threshold The new value to use as the threshold.
-      *
-      * A singular value will be considered nonzero if its value is strictly greater than
-      *  \f$ \vert singular value \vert \leqslant threshold \times \vert max singular value \vert \f$.
-      *
-      * If you want to come back to the default behavior, call setThreshold(Default_t)
-      */
-    JacobiSVD& setThreshold(const RealScalar& threshold)
-    {
-      m_usePrescribedThreshold = true;
-      m_prescribedThreshold = threshold;
-      return *this;
-    }
-
-    /** Allows to come back to the default behavior, letting Eigen use its default formula for
-      * determining the threshold.
-      *
-      * You should pass the special object Eigen::Default as parameter here.
-      * \code svd.setThreshold(Eigen::Default); \endcode
-      *
-      * See the documentation of setThreshold(const RealScalar&).
-      */
-    JacobiSVD& setThreshold(Default_t)
-    {
-      m_usePrescribedThreshold = false;
-      return *this;
-    }
-
-    /** Returns the threshold that will be used by certain methods such as rank().
-      *
-      * See the documentation of setThreshold(const RealScalar&).
-      */
-    RealScalar threshold() const
-    {
-      eigen_assert(m_isInitialized || m_usePrescribedThreshold);
-      return m_usePrescribedThreshold ? m_prescribedThreshold
-                                      : (std::max<Index>)(1,m_diagSize)*NumTraits<Scalar>::epsilon();
-    }
-
-    inline Index rows() const { return m_rows; }
-    inline Index cols() const { return m_cols; }
+    using Base::computeU;
+    using Base::computeV;
+    using Base::rows;
+    using Base::cols;
+    using Base::rank;
 
   private:
     void allocate(Index rows, Index cols, unsigned int computationOptions);
-    
-    static void check_template_parameters()
-    {
-      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
-    }
 
   protected:
-    MatrixUType m_matrixU;
-    MatrixVType m_matrixV;
-    SingularValuesType m_singularValues;
+    using Base::m_matrixU;
+    using Base::m_matrixV;
+    using Base::m_singularValues;
+    using Base::m_isInitialized;
+    using Base::m_isAllocated;
+    using Base::m_usePrescribedThreshold;
+    using Base::m_computeFullU;
+    using Base::m_computeThinU;
+    using Base::m_computeFullV;
+    using Base::m_computeThinV;
+    using Base::m_computationOptions;
+    using Base::m_nonzeroSingularValues;
+    using Base::m_rows;
+    using Base::m_cols;
+    using Base::m_diagSize;
+    using Base::m_prescribedThreshold;
     WorkMatrixType m_workMatrix;
-    bool m_isInitialized, m_isAllocated, m_usePrescribedThreshold;
-    bool m_computeFullU, m_computeThinU;
-    bool m_computeFullV, m_computeThinV;
-    unsigned int m_computationOptions;
-    Index m_nonzeroSingularValues, m_rows, m_cols, m_diagSize;
-    RealScalar m_prescribedThreshold;
 
     template<typename __MatrixType, int _QRPreconditioner, bool _IsComplex>
     friend struct internal::svd_precondition_2x2_block_to_be_real;
@@ -816,15 +655,13 @@ void JacobiSVD<MatrixType, QRPreconditioner>::allocate(Index rows, Index cols, u
   
   if(m_cols>m_rows)   m_qr_precond_morecols.allocate(*this);
   if(m_rows>m_cols)   m_qr_precond_morerows.allocate(*this);
-  if(m_cols!=m_cols)  m_scaledMatrix.resize(rows,cols);
+  if(m_rows!=m_cols)  m_scaledMatrix.resize(rows,cols);
 }
 
 template<typename MatrixType, int QRPreconditioner>
 JacobiSVD<MatrixType, QRPreconditioner>&
 JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsigned int computationOptions)
 {
-  check_template_parameters();
-  
   using std::abs;
   allocate(matrix.rows(), matrix.cols(), computationOptions);
 
@@ -832,8 +669,8 @@ JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsig
   // only worsening the precision of U and V as we accumulate more rotations
   const RealScalar precision = RealScalar(2) * NumTraits<Scalar>::epsilon();
 
-  // limit for very small denormal numbers to be considered zero in order to avoid infinite loops (see bug 286)
-  const RealScalar considerAsZero = RealScalar(2) * std::numeric_limits<RealScalar>::denorm_min();
+  // limit for denormal numbers to be considered zero in order to avoid infinite loops (see bug 286)
+  const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
 
   // Scaling factor to reduce over/under-flows
   RealScalar scale = matrix.cwiseAbs().maxCoeff();
@@ -857,6 +694,7 @@ JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsig
   }
 
   /*** step 2. The main Jacobi SVD iteration. ***/
+  RealScalar maxDiagEntry = m_workMatrix.cwiseAbs().diagonal().maxCoeff();
 
   bool finished = false;
   while(!finished)
@@ -872,25 +710,27 @@ JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsig
         // if this 2x2 sub-matrix is not diagonal already...
         // notice that this comparison will evaluate to false if any NaN is involved, ensuring that NaN's don't
         // keep us iterating forever. Similarly, small denormal numbers are considered zero.
-        using std::max;
-        RealScalar threshold = (max)(considerAsZero, precision * (max)(abs(m_workMatrix.coeff(p,p)),
-                                                                       abs(m_workMatrix.coeff(q,q))));
-        // We compare both values to threshold instead of calling max to be robust to NaN (See bug 791)
+        RealScalar threshold = numext::maxi<RealScalar>(considerAsZero, precision * maxDiagEntry);
         if(abs(m_workMatrix.coeff(p,q))>threshold || abs(m_workMatrix.coeff(q,p)) > threshold)
         {
           finished = false;
-
           // perform SVD decomposition of 2x2 sub-matrix corresponding to indices p,q to make it diagonal
-          internal::svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner>::run(m_workMatrix, *this, p, q);
-          JacobiRotation<RealScalar> j_left, j_right;
-          internal::real_2x2_jacobi_svd(m_workMatrix, p, q, &j_left, &j_right);
-
-          // accumulate resulting Jacobi rotations
-          m_workMatrix.applyOnTheLeft(p,q,j_left);
-          if(computeU()) m_matrixU.applyOnTheRight(p,q,j_left.transpose());
-
-          m_workMatrix.applyOnTheRight(p,q,j_right);
-          if(computeV()) m_matrixV.applyOnTheRight(p,q,j_right);
+          // the complex to real operation returns true if the updated 2x2 block is not already diagonal
+          if(internal::svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner>::run(m_workMatrix, *this, p, q, maxDiagEntry))
+          {
+            JacobiRotation<RealScalar> j_left, j_right;
+            internal::real_2x2_jacobi_svd(m_workMatrix, p, q, &j_left, &j_right);
+
+            // accumulate resulting Jacobi rotations
+            m_workMatrix.applyOnTheLeft(p,q,j_left);
+            if(computeU()) m_matrixU.applyOnTheRight(p,q,j_left.transpose());
+
+            m_workMatrix.applyOnTheRight(p,q,j_right);
+            if(computeV()) m_matrixV.applyOnTheRight(p,q,j_right);
+
+            // keep track of the largest diagonal coefficient
+            maxDiagEntry = numext::maxi<RealScalar>(maxDiagEntry,numext::maxi<RealScalar>(abs(m_workMatrix.coeff(p,p)), abs(m_workMatrix.coeff(q,q))));
+          }
         }
       }
     }
@@ -900,10 +740,25 @@ JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsig
 
   for(Index i = 0; i < m_diagSize; ++i)
   {
-    RealScalar a = abs(m_workMatrix.coeff(i,i));
-    m_singularValues.coeffRef(i) = a;
-    if(computeU() && (a!=RealScalar(0))) m_matrixU.col(i) *= m_workMatrix.coeff(i,i)/a;
+    // For a complex matrix, some diagonal coefficients might note have been
+    // treated by svd_precondition_2x2_block_to_be_real, and the imaginary part
+    // of some diagonal entry might not be null.
+    if(NumTraits<Scalar>::IsComplex && abs(numext::imag(m_workMatrix.coeff(i,i)))>considerAsZero)
+    {
+      RealScalar a = abs(m_workMatrix.coeff(i,i));
+      m_singularValues.coeffRef(i) = abs(a);
+      if(computeU()) m_matrixU.col(i) *= m_workMatrix.coeff(i,i)/a;
+    }
+    else
+    {
+      // m_workMatrix.coeff(i,i) is already real, no difficulty:
+      RealScalar a = numext::real(m_workMatrix.coeff(i,i));
+      m_singularValues.coeffRef(i) = abs(a);
+      if(computeU() && (a<RealScalar(0))) m_matrixU.col(i) = -m_matrixU.col(i);
+    }
   }
+  
+  m_singularValues *= scale;
 
   /*** step 4. Sort singular values in descending order and compute the number of nonzero singular values ***/
 
@@ -925,38 +780,11 @@ JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsig
       if(computeV()) m_matrixV.col(pos).swap(m_matrixV.col(i));
     }
   }
-  
-  m_singularValues *= scale;
 
   m_isInitialized = true;
   return *this;
 }
 
-namespace internal {
-template<typename _MatrixType, int QRPreconditioner, typename Rhs>
-struct solve_retval<JacobiSVD<_MatrixType, QRPreconditioner>, Rhs>
-  : solve_retval_base<JacobiSVD<_MatrixType, QRPreconditioner>, Rhs>
-{
-  typedef JacobiSVD<_MatrixType, QRPreconditioner> JacobiSVDType;
-  EIGEN_MAKE_SOLVE_HELPERS(JacobiSVDType,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    eigen_assert(rhs().rows() == dec().rows());
-
-    // A = U S V^*
-    // So A^{-1} = V S^{-1} U^*
-
-    Matrix<Scalar, Dynamic, Rhs::ColsAtCompileTime, 0, _MatrixType::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime> tmp;
-    Index rank = dec().rank();
-    
-    tmp.noalias() = dec().matrixU().leftCols(rank).adjoint() * rhs();
-    tmp = dec().singularValues().head(rank).asDiagonal().inverse() * tmp;
-    dst = dec().matrixV().leftCols(rank) * tmp;
-  }
-};
-} // end namespace internal
-
 /** \svd_module
   *
   * \return the singular value decomposition of \c *this computed by two-sided
diff --git a/Eigen/src/SVD/JacobiSVD_MKL.h b/Eigen/src/SVD/JacobiSVD_LAPACKE.h
index decda7540..50272154f 100644
--- a/Eigen/src/SVD/JacobiSVD_MKL.h
+++ b/Eigen/src/SVD/JacobiSVD_LAPACKE.h
@@ -25,68 +25,66 @@
  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
  ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
+ *   Content : Eigen bindings to LAPACKe
  *    Singular Value Decomposition - SVD.
  ********************************************************************************
 */
 
-#ifndef EIGEN_JACOBISVD_MKL_H
-#define EIGEN_JACOBISVD_MKL_H
-
-#include "Eigen/src/Core/util/MKL_support.h"
+#ifndef EIGEN_JACOBISVD_LAPACKE_H
+#define EIGEN_JACOBISVD_LAPACKE_H
 
 namespace Eigen { 
 
-/** \internal Specialization for the data types supported by MKL */
+/** \internal Specialization for the data types supported by LAPACKe */
 
-#define EIGEN_MKL_SVD(EIGTYPE, MKLTYPE, MKLRTYPE, MKLPREFIX, EIGCOLROW, MKLCOLROW) \
+#define EIGEN_LAPACKE_SVD(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_PREFIX, EIGCOLROW, LAPACKE_COLROW) \
 template<> inline \
 JacobiSVD<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>, ColPivHouseholderQRPreconditioner>& \
 JacobiSVD<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>, ColPivHouseholderQRPreconditioner>::compute(const Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>& matrix, unsigned int computationOptions) \
 { \
   typedef Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic> MatrixType; \
-  typedef MatrixType::Scalar Scalar; \
-  typedef MatrixType::RealScalar RealScalar; \
+  /*typedef MatrixType::Scalar Scalar;*/ \
+  /*typedef MatrixType::RealScalar RealScalar;*/ \
   allocate(matrix.rows(), matrix.cols(), computationOptions); \
 \
   /*const RealScalar precision = RealScalar(2) * NumTraits<Scalar>::epsilon();*/ \
   m_nonzeroSingularValues = m_diagSize; \
 \
-  lapack_int lda = matrix.outerStride(), ldu, ldvt; \
-  lapack_int matrix_order = MKLCOLROW; \
+  lapack_int lda = internal::convert_index<lapack_int>(matrix.outerStride()), ldu, ldvt; \
+  lapack_int matrix_order = LAPACKE_COLROW; \
   char jobu, jobvt; \
-  MKLTYPE *u, *vt, dummy; \
+  LAPACKE_TYPE *u, *vt, dummy; \
   jobu  = (m_computeFullU) ? 'A' : (m_computeThinU) ? 'S' : 'N'; \
   jobvt = (m_computeFullV) ? 'A' : (m_computeThinV) ? 'S' : 'N'; \
   if (computeU()) { \
-    ldu  = m_matrixU.outerStride(); \
-    u    = (MKLTYPE*)m_matrixU.data(); \
+    ldu  = internal::convert_index<lapack_int>(m_matrixU.outerStride()); \
+    u    = (LAPACKE_TYPE*)m_matrixU.data(); \
   } else { ldu=1; u=&dummy; }\
   MatrixType localV; \
-  ldvt = (m_computeFullV) ? m_cols : (m_computeThinV) ? m_diagSize : 1; \
+  ldvt = (m_computeFullV) ? internal::convert_index<lapack_int>(m_cols) : (m_computeThinV) ? internal::convert_index<lapack_int>(m_diagSize) : 1; \
   if (computeV()) { \
     localV.resize(ldvt, m_cols); \
-    vt   = (MKLTYPE*)localV.data(); \
+    vt   = (LAPACKE_TYPE*)localV.data(); \
   } else { ldvt=1; vt=&dummy; }\
-  Matrix<MKLRTYPE, Dynamic, Dynamic> superb; superb.resize(m_diagSize, 1); \
+  Matrix<LAPACKE_RTYPE, Dynamic, Dynamic> superb; superb.resize(m_diagSize, 1); \
   MatrixType m_temp; m_temp = matrix; \
-  LAPACKE_##MKLPREFIX##gesvd( matrix_order, jobu, jobvt, m_rows, m_cols, (MKLTYPE*)m_temp.data(), lda, (MKLRTYPE*)m_singularValues.data(), u, ldu, vt, ldvt, superb.data()); \
+  LAPACKE_##LAPACKE_PREFIX##gesvd( matrix_order, jobu, jobvt, internal::convert_index<lapack_int>(m_rows), internal::convert_index<lapack_int>(m_cols), (LAPACKE_TYPE*)m_temp.data(), lda, (LAPACKE_RTYPE*)m_singularValues.data(), u, ldu, vt, ldvt, superb.data()); \
   if (computeV()) m_matrixV = localV.adjoint(); \
  /* for(int i=0;i<m_diagSize;i++) if (m_singularValues.coeffRef(i) < precision) { m_nonzeroSingularValues--; m_singularValues.coeffRef(i)=RealScalar(0);}*/ \
   m_isInitialized = true; \
   return *this; \
 }
 
-EIGEN_MKL_SVD(double,   double,        double, d, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_SVD(float,    float,         float , s, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_SVD(dcomplex, MKL_Complex16, double, z, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_SVD(scomplex, MKL_Complex8,  float , c, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_SVD(double,   double,                double, d, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_SVD(float,    float,                 float , s, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_SVD(dcomplex, lapack_complex_double, double, z, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_SVD(scomplex, lapack_complex_float,  float , c, ColMajor, LAPACK_COL_MAJOR)
 
-EIGEN_MKL_SVD(double,   double,        double, d, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_MKL_SVD(float,    float,         float , s, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_MKL_SVD(dcomplex, MKL_Complex16, double, z, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_MKL_SVD(scomplex, MKL_Complex8,  float , c, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_SVD(double,   double,                double, d, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_SVD(float,    float,                 float , s, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_SVD(dcomplex, lapack_complex_double, double, z, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_SVD(scomplex, lapack_complex_float,  float , c, RowMajor, LAPACK_ROW_MAJOR)
 
 } // end namespace Eigen
 
-#endif // EIGEN_JACOBISVD_MKL_H
+#endif // EIGEN_JACOBISVD_LAPACKE_H
diff --git a/unsupported/Eigen/src/SVD/SVDBase.h b/Eigen/src/SVD/SVDBase.h
index fd8af3b8c..cc90a3b75 100644
--- a/unsupported/Eigen/src/SVD/SVDBase.h
+++ b/Eigen/src/SVD/SVDBase.h
@@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2009-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // Copyright (C) 2013 Gauthier Brun <brun.gauthier@gmail.com>
 // Copyright (C) 2013 Nicolas Carre <nicolas.carre@ensimag.fr>
@@ -12,8 +13,8 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-#ifndef EIGEN_SVD_H
-#define EIGEN_SVD_H
+#ifndef EIGEN_SVDBASE_H
+#define EIGEN_SVDBASE_H
 
 namespace Eigen {
 /** \ingroup SVD_Module
@@ -21,9 +22,10 @@ namespace Eigen {
  *
  * \class SVDBase
  *
- * \brief Mother class of SVD classes algorithms
+ * \brief Base class of SVD algorithms
+ *
+ * \tparam Derived the type of the actual SVD decomposition
  *
- * \param MatrixType the type of the matrix of which we are computing the SVD decomposition
  * SVD decomposition consists in decomposing any n-by-p matrix \a A as a product
  *   \f[ A = U S V^* \f]
  * where \a U is a n-by-n unitary, \a V is a p-by-p unitary, and \a S is a n-by-p real positive matrix which is zero outside of its main diagonal;
@@ -40,17 +42,18 @@ namespace Eigen {
  *  
  * If the input matrix has inf or nan coefficients, the result of the computation is undefined, but the computation is guaranteed to
  * terminate in finite (and reasonable) time.
- * \sa MatrixBase::genericSvd()
+ * \sa class BDCSVD, class JacobiSVD
  */
-template<typename _MatrixType> 
+template<typename Derived>
 class SVDBase
 {
 
 public:
-  typedef _MatrixType MatrixType;
+  typedef typename internal::traits<Derived>::MatrixType MatrixType;
   typedef typename MatrixType::Scalar Scalar;
   typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
-  typedef typename MatrixType::Index Index;
+  typedef typename MatrixType::StorageIndex StorageIndex;
+  typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
   enum {
     RowsAtCompileTime = MatrixType::RowsAtCompileTime,
     ColsAtCompileTime = MatrixType::ColsAtCompileTime,
@@ -61,47 +64,17 @@ public:
     MatrixOptions = MatrixType::Options
   };
 
-  typedef Matrix<Scalar, RowsAtCompileTime, RowsAtCompileTime,
-		 MatrixOptions, MaxRowsAtCompileTime, MaxRowsAtCompileTime>
-  MatrixUType;
-  typedef Matrix<Scalar, ColsAtCompileTime, ColsAtCompileTime,
-		 MatrixOptions, MaxColsAtCompileTime, MaxColsAtCompileTime>
-  MatrixVType;
+  typedef Matrix<Scalar, RowsAtCompileTime, RowsAtCompileTime, MatrixOptions, MaxRowsAtCompileTime, MaxRowsAtCompileTime> MatrixUType;
+  typedef Matrix<Scalar, ColsAtCompileTime, ColsAtCompileTime, MatrixOptions, MaxColsAtCompileTime, MaxColsAtCompileTime> MatrixVType;
   typedef typename internal::plain_diag_type<MatrixType, RealScalar>::type SingularValuesType;
-  typedef typename internal::plain_row_type<MatrixType>::type RowType;
-  typedef typename internal::plain_col_type<MatrixType>::type ColType;
-  typedef Matrix<Scalar, DiagSizeAtCompileTime, DiagSizeAtCompileTime,
-		 MatrixOptions, MaxDiagSizeAtCompileTime, MaxDiagSizeAtCompileTime>
-  WorkMatrixType;
-	
-
-
-
-  /** \brief Method performing the decomposition of given matrix using custom options.
-   *
-   * \param matrix the matrix to decompose
-   * \param computationOptions optional parameter allowing to specify if you want full or thin U or V unitaries to be computed.
-   *                           By default, none is computed. This is a bit-field, the possible bits are #ComputeFullU, #ComputeThinU,
-   *                           #ComputeFullV, #ComputeThinV.
-   *
-   * Thin unitaries are only available if your matrix type has a Dynamic number of columns (for example MatrixXf). They also are not
-   * available with the (non-default) FullPivHouseholderQR preconditioner.
-   */
-  SVDBase& compute(const MatrixType& matrix, unsigned int computationOptions);
-
-  /** \brief Method performing the decomposition of given matrix using current options.
-   *
-   * \param matrix the matrix to decompose
-   *
-   * This method uses the current \a computationOptions, as already passed to the constructor or to compute(const MatrixType&, unsigned int).
-   */
-  //virtual SVDBase& compute(const MatrixType& matrix) = 0;
-  SVDBase& compute(const MatrixType& matrix);
+  
+  Derived& derived() { return *static_cast<Derived*>(this); }
+  const Derived& derived() const { return *static_cast<const Derived*>(this); }
 
   /** \returns the \a U matrix.
    *
-   * For the SVDBase decomposition of a n-by-p matrix, letting \a m be the minimum of \a n and \a p,
-   * the U matrix is n-by-n if you asked for #ComputeFullU, and is n-by-m if you asked for #ComputeThinU.
+   * For the SVD decomposition of a n-by-p matrix, letting \a m be the minimum of \a n and \a p,
+   * the U matrix is n-by-n if you asked for \link Eigen::ComputeFullU ComputeFullU \endlink, and is n-by-m if you asked for \link Eigen::ComputeThinU ComputeThinU \endlink.
    *
    * The \a m first columns of \a U are the left singular vectors of the matrix being decomposed.
    *
@@ -117,7 +90,7 @@ public:
   /** \returns the \a V matrix.
    *
    * For the SVD decomposition of a n-by-p matrix, letting \a m be the minimum of \a n and \a p,
-   * the V matrix is p-by-p if you asked for #ComputeFullV, and is p-by-m if you asked for ComputeThinV.
+   * the V matrix is p-by-p if you asked for \link Eigen::ComputeFullV ComputeFullV \endlink, and is p-by-m if you asked for \link Eigen::ComputeThinV ComputeThinV \endlink.
    *
    * The \a m first columns of \a V are the right singular vectors of the matrix being decomposed.
    *
@@ -141,39 +114,127 @@ public:
     return m_singularValues;
   }
 
-  
-
   /** \returns the number of singular values that are not exactly 0 */
   Index nonzeroSingularValues() const
   {
     eigen_assert(m_isInitialized && "SVD is not initialized.");
     return m_nonzeroSingularValues;
   }
+  
+  /** \returns the rank of the matrix of which \c *this is the SVD.
+    *
+    * \note This method has to determine which singular values should be considered nonzero.
+    *       For that, it uses the threshold value that you can control by calling
+    *       setThreshold(const RealScalar&).
+    */
+  inline Index rank() const
+  {
+    using std::abs;
+    eigen_assert(m_isInitialized && "JacobiSVD is not initialized.");
+    if(m_singularValues.size()==0) return 0;
+    RealScalar premultiplied_threshold = numext::maxi<RealScalar>(m_singularValues.coeff(0) * threshold(), (std::numeric_limits<RealScalar>::min)());
+    Index i = m_nonzeroSingularValues-1;
+    while(i>=0 && m_singularValues.coeff(i) < premultiplied_threshold) --i;
+    return i+1;
+  }
+  
+  /** Allows to prescribe a threshold to be used by certain methods, such as rank() and solve(),
+    * which need to determine when singular values are to be considered nonzero.
+    * This is not used for the SVD decomposition itself.
+    *
+    * When it needs to get the threshold value, Eigen calls threshold().
+    * The default is \c NumTraits<Scalar>::epsilon()
+    *
+    * \param threshold The new value to use as the threshold.
+    *
+    * A singular value will be considered nonzero if its value is strictly greater than
+    *  \f$ \vert singular value \vert \leqslant threshold \times \vert max singular value \vert \f$.
+    *
+    * If you want to come back to the default behavior, call setThreshold(Default_t)
+    */
+  Derived& setThreshold(const RealScalar& threshold)
+  {
+    m_usePrescribedThreshold = true;
+    m_prescribedThreshold = threshold;
+    return derived();
+  }
+
+  /** Allows to come back to the default behavior, letting Eigen use its default formula for
+    * determining the threshold.
+    *
+    * You should pass the special object Eigen::Default as parameter here.
+    * \code svd.setThreshold(Eigen::Default); \endcode
+    *
+    * See the documentation of setThreshold(const RealScalar&).
+    */
+  Derived& setThreshold(Default_t)
+  {
+    m_usePrescribedThreshold = false;
+    return derived();
+  }
 
+  /** Returns the threshold that will be used by certain methods such as rank().
+    *
+    * See the documentation of setThreshold(const RealScalar&).
+    */
+  RealScalar threshold() const
+  {
+    eigen_assert(m_isInitialized || m_usePrescribedThreshold);
+    return m_usePrescribedThreshold ? m_prescribedThreshold
+                                    : (std::max<Index>)(1,m_diagSize)*NumTraits<Scalar>::epsilon();
+  }
 
   /** \returns true if \a U (full or thin) is asked for in this SVD decomposition */
   inline bool computeU() const { return m_computeFullU || m_computeThinU; }
   /** \returns true if \a V (full or thin) is asked for in this SVD decomposition */
   inline bool computeV() const { return m_computeFullV || m_computeThinV; }
 
-
   inline Index rows() const { return m_rows; }
   inline Index cols() const { return m_cols; }
-
+  
+  /** \returns a (least squares) solution of \f$ A x = b \f$ using the current SVD decomposition of A.
+    *
+    * \param b the right-hand-side of the equation to solve.
+    *
+    * \note Solving requires both U and V to be computed. Thin U and V are enough, there is no need for full U or V.
+    *
+    * \note SVD solving is implicitly least-squares. Thus, this method serves both purposes of exact solving and least-squares solving.
+    * In other words, the returned solution is guaranteed to minimize the Euclidean norm \f$ \Vert A x - b \Vert \f$.
+    */
+  template<typename Rhs>
+  inline const Solve<Derived, Rhs>
+  solve(const MatrixBase<Rhs>& b) const
+  {
+    eigen_assert(m_isInitialized && "SVD is not initialized.");
+    eigen_assert(computeU() && computeV() && "SVD::solve() requires both unitaries U and V to be computed (thin unitaries suffice).");
+    return Solve<Derived, Rhs>(derived(), b.derived());
+  }
+  
+  #ifndef EIGEN_PARSED_BY_DOXYGEN
+  template<typename RhsType, typename DstType>
+  EIGEN_DEVICE_FUNC
+  void _solve_impl(const RhsType &rhs, DstType &dst) const;
+  #endif
 
 protected:
+  
+  static void check_template_parameters()
+  {
+    EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
+  }
+  
   // return true if already allocated
   bool allocate(Index rows, Index cols, unsigned int computationOptions) ;
 
   MatrixUType m_matrixU;
   MatrixVType m_matrixV;
   SingularValuesType m_singularValues;
-  bool m_isInitialized, m_isAllocated;
+  bool m_isInitialized, m_isAllocated, m_usePrescribedThreshold;
   bool m_computeFullU, m_computeThinU;
   bool m_computeFullV, m_computeThinV;
   unsigned int m_computationOptions;
   Index m_nonzeroSingularValues, m_rows, m_cols, m_diagSize;
-
+  RealScalar m_prescribedThreshold;
 
   /** \brief Default Constructor.
    *
@@ -182,13 +243,33 @@ protected:
   SVDBase()
     : m_isInitialized(false),
       m_isAllocated(false),
+      m_usePrescribedThreshold(false),
       m_computationOptions(0),
-      m_rows(-1), m_cols(-1)
-  {}
+      m_rows(-1), m_cols(-1), m_diagSize(0)
+  {
+    check_template_parameters();
+  }
 
 
 };
 
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template<typename Derived>
+template<typename RhsType, typename DstType>
+void SVDBase<Derived>::_solve_impl(const RhsType &rhs, DstType &dst) const
+{
+  eigen_assert(rhs.rows() == rows());
+
+  // A = U S V^*
+  // So A^{-1} = V S^{-1} U^*
+
+  Matrix<Scalar, Dynamic, RhsType::ColsAtCompileTime, 0, MatrixType::MaxRowsAtCompileTime, RhsType::MaxColsAtCompileTime> tmp;
+  Index l_rank = rank();
+  tmp.noalias() =  m_matrixU.leftCols(l_rank).adjoint() * rhs;
+  tmp = m_singularValues.head(l_rank).asDiagonal().inverse() * tmp;
+  dst = m_matrixV.leftCols(l_rank) * tmp;
+}
+#endif
 
 template<typename MatrixType>
 bool SVDBase<MatrixType>::allocate(Index rows, Index cols, unsigned int computationOptions)
@@ -220,17 +301,13 @@ bool SVDBase<MatrixType>::allocate(Index rows, Index cols, unsigned int computat
   m_diagSize = (std::min)(m_rows, m_cols);
   m_singularValues.resize(m_diagSize);
   if(RowsAtCompileTime==Dynamic)
-    m_matrixU.resize(m_rows, m_computeFullU ? m_rows
-		     : m_computeThinU ? m_diagSize
-		     : 0);
+    m_matrixU.resize(m_rows, m_computeFullU ? m_rows : m_computeThinU ? m_diagSize : 0);
   if(ColsAtCompileTime==Dynamic)
-    m_matrixV.resize(m_cols, m_computeFullV ? m_cols
-		     : m_computeThinV ? m_diagSize
-		     : 0);
+    m_matrixV.resize(m_cols, m_computeFullV ? m_cols : m_computeThinV ? m_diagSize : 0);
 
   return false;
 }
 
 }// end namespace
 
-#endif // EIGEN_SVD_H
+#endif // EIGEN_SVDBASE_H
diff --git a/Eigen/src/SVD/UpperBidiagonalization.h b/Eigen/src/SVD/UpperBidiagonalization.h
index 587de37a5..0b1460894 100644
--- a/Eigen/src/SVD/UpperBidiagonalization.h
+++ b/Eigen/src/SVD/UpperBidiagonalization.h
@@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2010 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2013-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -28,15 +29,15 @@ template<typename _MatrixType> class UpperBidiagonalization
     };
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
+    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
     typedef Matrix<Scalar, 1, ColsAtCompileTime> RowVectorType;
     typedef Matrix<Scalar, RowsAtCompileTime, 1> ColVectorType;
-    typedef BandMatrix<RealScalar, ColsAtCompileTime, ColsAtCompileTime, 1, 0> BidiagonalType;
+    typedef BandMatrix<RealScalar, ColsAtCompileTime, ColsAtCompileTime, 1, 0, RowMajor> BidiagonalType;
     typedef Matrix<Scalar, ColsAtCompileTime, 1> DiagVectorType;
     typedef Matrix<Scalar, ColsAtCompileTimeMinusOne, 1> SuperDiagVectorType;
     typedef HouseholderSequence<
               const MatrixType,
-              CwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, const Diagonal<const MatrixType,0> >
+              const typename internal::remove_all<typename Diagonal<const MatrixType,0>::ConjugateReturnType>::type
             > HouseholderUSequenceType;
     typedef HouseholderSequence<
               const typename internal::remove_all<typename MatrixType::ConjugateReturnType>::type,
@@ -52,7 +53,7 @@ template<typename _MatrixType> class UpperBidiagonalization
     */
     UpperBidiagonalization() : m_householder(), m_bidiagonal(), m_isInitialized(false) {}
 
-    UpperBidiagonalization(const MatrixType& matrix)
+    explicit UpperBidiagonalization(const MatrixType& matrix)
       : m_householder(matrix.rows(), matrix.cols()),
         m_bidiagonal(matrix.cols(), matrix.cols()),
         m_isInitialized(false)
@@ -61,6 +62,7 @@ template<typename _MatrixType> class UpperBidiagonalization
     }
     
     UpperBidiagonalization& compute(const MatrixType& matrix);
+    UpperBidiagonalization& computeUnblocked(const MatrixType& matrix);
     
     const MatrixType& householder() const { return m_householder; }
     const BidiagonalType& bidiagonal() const { return m_bidiagonal; }
@@ -85,45 +87,307 @@ template<typename _MatrixType> class UpperBidiagonalization
     bool m_isInitialized;
 };
 
-template<typename _MatrixType>
-UpperBidiagonalization<_MatrixType>& UpperBidiagonalization<_MatrixType>::compute(const _MatrixType& matrix)
+// Standard upper bidiagonalization without fancy optimizations
+// This version should be faster for small matrix size
+template<typename MatrixType>
+void upperbidiagonalization_inplace_unblocked(MatrixType& mat,
+                                              typename MatrixType::RealScalar *diagonal,
+                                              typename MatrixType::RealScalar *upper_diagonal,
+                                              typename MatrixType::Scalar* tempData = 0)
 {
-  Index rows = matrix.rows();
-  Index cols = matrix.cols();
-  
-  eigen_assert(rows >= cols && "UpperBidiagonalization is only for matrices satisfying rows>=cols.");
-  
-  m_householder = matrix;
+  typedef typename MatrixType::Scalar Scalar;
 
-  ColVectorType temp(rows);
+  Index rows = mat.rows();
+  Index cols = mat.cols();
+
+  typedef Matrix<Scalar,Dynamic,1,ColMajor,MatrixType::MaxRowsAtCompileTime,1> TempType;
+  TempType tempVector;
+  if(tempData==0)
+  {
+    tempVector.resize(rows);
+    tempData = tempVector.data();
+  }
 
   for (Index k = 0; /* breaks at k==cols-1 below */ ; ++k)
   {
     Index remainingRows = rows - k;
     Index remainingCols = cols - k - 1;
 
-    // construct left householder transform in-place in m_householder
-    m_householder.col(k).tail(remainingRows)
-                 .makeHouseholderInPlace(m_householder.coeffRef(k,k),
-                                         m_bidiagonal.template diagonal<0>().coeffRef(k));
-    // apply householder transform to remaining part of m_householder on the left
-    m_householder.bottomRightCorner(remainingRows, remainingCols)
-                 .applyHouseholderOnTheLeft(m_householder.col(k).tail(remainingRows-1),
-                                            m_householder.coeff(k,k),
-                                            temp.data());
+    // construct left householder transform in-place in A
+    mat.col(k).tail(remainingRows)
+       .makeHouseholderInPlace(mat.coeffRef(k,k), diagonal[k]);
+    // apply householder transform to remaining part of A on the left
+    mat.bottomRightCorner(remainingRows, remainingCols)
+       .applyHouseholderOnTheLeft(mat.col(k).tail(remainingRows-1), mat.coeff(k,k), tempData);
 
     if(k == cols-1) break;
+
+    // construct right householder transform in-place in mat
+    mat.row(k).tail(remainingCols)
+       .makeHouseholderInPlace(mat.coeffRef(k,k+1), upper_diagonal[k]);
+    // apply householder transform to remaining part of mat on the left
+    mat.bottomRightCorner(remainingRows-1, remainingCols)
+       .applyHouseholderOnTheRight(mat.row(k).tail(remainingCols-1).transpose(), mat.coeff(k,k+1), tempData);
+  }
+}
+
+/** \internal
+  * Helper routine for the block reduction to upper bidiagonal form.
+  *
+  * Let's partition the matrix A:
+  * 
+  *      | A00 A01 |
+  *  A = |         |
+  *      | A10 A11 |
+  *
+  * This function reduces to bidiagonal form the left \c rows x \a blockSize vertical panel [A00/A10]
+  * and the \a blockSize x \c cols horizontal panel [A00 A01] of the matrix \a A. The bottom-right block A11
+  * is updated using matrix-matrix products:
+  *   A22 -= V * Y^T - X * U^T
+  * where V and U contains the left and right Householder vectors. U and V are stored in A10, and A01
+  * respectively, and the update matrices X and Y are computed during the reduction.
+  * 
+  */
+template<typename MatrixType>
+void upperbidiagonalization_blocked_helper(MatrixType& A,
+                                           typename MatrixType::RealScalar *diagonal,
+                                           typename MatrixType::RealScalar *upper_diagonal,
+                                           Index bs,
+                                           Ref<Matrix<typename MatrixType::Scalar, Dynamic, Dynamic,
+                                                      traits<MatrixType>::Flags & RowMajorBit> > X,
+                                           Ref<Matrix<typename MatrixType::Scalar, Dynamic, Dynamic,
+                                                      traits<MatrixType>::Flags & RowMajorBit> > Y)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  enum { StorageOrder = traits<MatrixType>::Flags & RowMajorBit };
+  typedef InnerStride<int(StorageOrder) == int(ColMajor) ? 1 : Dynamic> ColInnerStride;
+  typedef InnerStride<int(StorageOrder) == int(ColMajor) ? Dynamic : 1> RowInnerStride;
+  typedef Ref<Matrix<Scalar, Dynamic, 1>, 0, ColInnerStride>    SubColumnType;
+  typedef Ref<Matrix<Scalar, 1, Dynamic>, 0, RowInnerStride>    SubRowType;
+  typedef Ref<Matrix<Scalar, Dynamic, Dynamic, StorageOrder > > SubMatType;
+  
+  Index brows = A.rows();
+  Index bcols = A.cols();
+
+  Scalar tau_u, tau_u_prev(0), tau_v;
+
+  for(Index k = 0; k < bs; ++k)
+  {
+    Index remainingRows = brows - k;
+    Index remainingCols = bcols - k - 1;
+
+    SubMatType X_k1( X.block(k,0, remainingRows,k) );
+    SubMatType V_k1( A.block(k,0, remainingRows,k) );
+
+    // 1 - update the k-th column of A
+    SubColumnType v_k = A.col(k).tail(remainingRows);
+          v_k -= V_k1 * Y.row(k).head(k).adjoint();
+    if(k) v_k -= X_k1 * A.col(k).head(k);
+    
+    // 2 - construct left Householder transform in-place
+    v_k.makeHouseholderInPlace(tau_v, diagonal[k]);
+       
+    if(k+1<bcols)
+    {
+      SubMatType Y_k  ( Y.block(k+1,0, remainingCols, k+1) );
+      SubMatType U_k1 ( A.block(0,k+1, k,remainingCols) );
+      
+      // this eases the application of Householder transforAions
+      // A(k,k) will store tau_v later
+      A(k,k) = Scalar(1);
+
+      // 3 - Compute y_k^T = tau_v * ( A^T*v_k - Y_k-1*V_k-1^T*v_k - U_k-1*X_k-1^T*v_k )
+      {
+        SubColumnType y_k( Y.col(k).tail(remainingCols) );
+        
+        // let's use the begining of column k of Y as a temporary vector
+        SubColumnType tmp( Y.col(k).head(k) );
+        y_k.noalias()  = A.block(k,k+1, remainingRows,remainingCols).adjoint() * v_k; // bottleneck
+        tmp.noalias()  = V_k1.adjoint()  * v_k;
+        y_k.noalias() -= Y_k.leftCols(k) * tmp;
+        tmp.noalias()  = X_k1.adjoint()  * v_k;
+        y_k.noalias() -= U_k1.adjoint()  * tmp;
+        y_k *= numext::conj(tau_v);
+      }
+
+      // 4 - update k-th row of A (it will become u_k)
+      SubRowType u_k( A.row(k).tail(remainingCols) );
+      u_k = u_k.conjugate();
+      {
+        u_k -= Y_k * A.row(k).head(k+1).adjoint();
+        if(k) u_k -= U_k1.adjoint() * X.row(k).head(k).adjoint();
+      }
+
+      // 5 - construct right Householder transform in-place
+      u_k.makeHouseholderInPlace(tau_u, upper_diagonal[k]);
+
+      // this eases the application of Householder transformations
+      // A(k,k+1) will store tau_u later
+      A(k,k+1) = Scalar(1);
+
+      // 6 - Compute x_k = tau_u * ( A*u_k - X_k-1*U_k-1^T*u_k - V_k*Y_k^T*u_k )
+      {
+        SubColumnType x_k ( X.col(k).tail(remainingRows-1) );
+        
+        // let's use the begining of column k of X as a temporary vectors
+        // note that tmp0 and tmp1 overlaps
+        SubColumnType tmp0 ( X.col(k).head(k) ),
+                      tmp1 ( X.col(k).head(k+1) );
+                    
+        x_k.noalias()   = A.block(k+1,k+1, remainingRows-1,remainingCols) * u_k.transpose(); // bottleneck
+        tmp0.noalias()  = U_k1 * u_k.transpose();
+        x_k.noalias()  -= X_k1.bottomRows(remainingRows-1) * tmp0;
+        tmp1.noalias()  = Y_k.adjoint() * u_k.transpose();
+        x_k.noalias()  -= A.block(k+1,0, remainingRows-1,k+1) * tmp1;
+        x_k *= numext::conj(tau_u);
+        tau_u = numext::conj(tau_u);
+        u_k = u_k.conjugate();
+      }
+
+      if(k>0) A.coeffRef(k-1,k) = tau_u_prev;
+      tau_u_prev = tau_u;
+    }
+    else
+      A.coeffRef(k-1,k) = tau_u_prev;
+
+    A.coeffRef(k,k) = tau_v;
+  }
+  
+  if(bs<bcols)
+    A.coeffRef(bs-1,bs) = tau_u_prev;
+
+  // update A22
+  if(bcols>bs && brows>bs)
+  {
+    SubMatType A11( A.bottomRightCorner(brows-bs,bcols-bs) );
+    SubMatType A10( A.block(bs,0, brows-bs,bs) );
+    SubMatType A01( A.block(0,bs, bs,bcols-bs) );
+    Scalar tmp = A01(bs-1,0);
+    A01(bs-1,0) = 1;
+    A11.noalias() -= A10 * Y.topLeftCorner(bcols,bs).bottomRows(bcols-bs).adjoint();
+    A11.noalias() -= X.topLeftCorner(brows,bs).bottomRows(brows-bs) * A01;
+    A01(bs-1,0) = tmp;
+  }
+}
+
+/** \internal
+  *
+  * Implementation of a block-bidiagonal reduction.
+  * It is based on the following paper:
+  *   The Design of a Parallel Dense Linear Algebra Software Library: Reduction to Hessenberg, Tridiagonal, and Bidiagonal Form.
+  *   by Jaeyoung Choi, Jack J. Dongarra, David W. Walker. (1995)
+  *   section 3.3
+  */
+template<typename MatrixType, typename BidiagType>
+void upperbidiagonalization_inplace_blocked(MatrixType& A, BidiagType& bidiagonal,
+                                            Index maxBlockSize=32,
+                                            typename MatrixType::Scalar* /*tempData*/ = 0)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef Block<MatrixType,Dynamic,Dynamic> BlockType;
+
+  Index rows = A.rows();
+  Index cols = A.cols();
+  Index size = (std::min)(rows, cols);
+
+  // X and Y are work space
+  enum { StorageOrder = traits<MatrixType>::Flags & RowMajorBit };
+  Matrix<Scalar,
+         MatrixType::RowsAtCompileTime,
+         Dynamic,
+         StorageOrder,
+         MatrixType::MaxRowsAtCompileTime> X(rows,maxBlockSize);
+  Matrix<Scalar,
+         MatrixType::ColsAtCompileTime,
+         Dynamic,
+         StorageOrder,
+         MatrixType::MaxColsAtCompileTime> Y(cols,maxBlockSize);
+  Index blockSize = (std::min)(maxBlockSize,size);
+
+  Index k = 0;
+  for(k = 0; k < size; k += blockSize)
+  {
+    Index bs = (std::min)(size-k,blockSize);  // actual size of the block
+    Index brows = rows - k;                   // rows of the block
+    Index bcols = cols - k;                   // columns of the block
+
+    // partition the matrix A:
+    // 
+    //      | A00 A01 A02 |
+    //      |             |
+    // A  = | A10 A11 A12 |
+    //      |             |
+    //      | A20 A21 A22 |
+    //
+    // where A11 is a bs x bs diagonal block,
+    // and let:
+    //      | A11 A12 |
+    //  B = |         |
+    //      | A21 A22 |
+
+    BlockType B = A.block(k,k,brows,bcols);
     
-    // construct right householder transform in-place in m_householder
-    m_householder.row(k).tail(remainingCols)
-                 .makeHouseholderInPlace(m_householder.coeffRef(k,k+1),
-                                         m_bidiagonal.template diagonal<1>().coeffRef(k));
-    // apply householder transform to remaining part of m_householder on the left
-    m_householder.bottomRightCorner(remainingRows-1, remainingCols)
-                 .applyHouseholderOnTheRight(m_householder.row(k).tail(remainingCols-1).transpose(),
-                                             m_householder.coeff(k,k+1),
-                                             temp.data());
+    // This stage performs the bidiagonalization of A11, A21, A12, and updating of A22.
+    // Finally, the algorithm continue on the updated A22.
+    //
+    // However, if B is too small, or A22 empty, then let's use an unblocked strategy
+    if(k+bs==cols || bcols<48) // somewhat arbitrary threshold
+    {
+      upperbidiagonalization_inplace_unblocked(B,
+                                               &(bidiagonal.template diagonal<0>().coeffRef(k)),
+                                               &(bidiagonal.template diagonal<1>().coeffRef(k)),
+                                               X.data()
+                                              );
+      break; // We're done
+    }
+    else
+    {
+      upperbidiagonalization_blocked_helper<BlockType>( B,
+                                                        &(bidiagonal.template diagonal<0>().coeffRef(k)),
+                                                        &(bidiagonal.template diagonal<1>().coeffRef(k)),
+                                                        bs,
+                                                        X.topLeftCorner(brows,bs),
+                                                        Y.topLeftCorner(bcols,bs)
+                                                      );
+    }
   }
+}
+
+template<typename _MatrixType>
+UpperBidiagonalization<_MatrixType>& UpperBidiagonalization<_MatrixType>::computeUnblocked(const _MatrixType& matrix)
+{
+  Index rows = matrix.rows();
+  Index cols = matrix.cols();
+  EIGEN_ONLY_USED_FOR_DEBUG(cols);
+
+  eigen_assert(rows >= cols && "UpperBidiagonalization is only for Arices satisfying rows>=cols.");
+
+  m_householder = matrix;
+
+  ColVectorType temp(rows);
+
+  upperbidiagonalization_inplace_unblocked(m_householder,
+                                           &(m_bidiagonal.template diagonal<0>().coeffRef(0)),
+                                           &(m_bidiagonal.template diagonal<1>().coeffRef(0)),
+                                           temp.data());
+
+  m_isInitialized = true;
+  return *this;
+}
+
+template<typename _MatrixType>
+UpperBidiagonalization<_MatrixType>& UpperBidiagonalization<_MatrixType>::compute(const _MatrixType& matrix)
+{
+  Index rows = matrix.rows();
+  Index cols = matrix.cols();
+  EIGEN_ONLY_USED_FOR_DEBUG(rows);
+  EIGEN_ONLY_USED_FOR_DEBUG(cols);
+
+  eigen_assert(rows >= cols && "UpperBidiagonalization is only for Arices satisfying rows>=cols.");
+
+  m_householder = matrix;
+  upperbidiagonalization_inplace_blocked(m_householder, m_bidiagonal);
+            
   m_isInitialized = true;
   return *this;
 }
diff --git a/Eigen/src/SparseCholesky/CMakeLists.txt b/Eigen/src/SparseCholesky/CMakeLists.txt
deleted file mode 100644
index 375a59d7a..000000000
--- a/Eigen/src/SparseCholesky/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_SparseCholesky_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_SparseCholesky_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/SparseCholesky COMPONENT Devel
-  )
diff --git a/Eigen/src/SparseCholesky/SimplicialCholesky.h b/Eigen/src/SparseCholesky/SimplicialCholesky.h
index e1f96ba5a..2907f6529 100644
--- a/Eigen/src/SparseCholesky/SimplicialCholesky.h
+++ b/Eigen/src/SparseCholesky/SimplicialCholesky.h
@@ -17,43 +17,74 @@ enum SimplicialCholeskyMode {
   SimplicialCholeskyLDLT
 };
 
+namespace internal {
+  template<typename CholMatrixType, typename InputMatrixType>
+  struct simplicial_cholesky_grab_input {
+    typedef CholMatrixType const * ConstCholMatrixPtr;
+    static void run(const InputMatrixType& input, ConstCholMatrixPtr &pmat, CholMatrixType &tmp)
+    {
+      tmp = input;
+      pmat = &tmp;
+    }
+  };
+  
+  template<typename MatrixType>
+  struct simplicial_cholesky_grab_input<MatrixType,MatrixType> {
+    typedef MatrixType const * ConstMatrixPtr;
+    static void run(const MatrixType& input, ConstMatrixPtr &pmat, MatrixType &/*tmp*/)
+    {
+      pmat = &input;
+    }
+  };
+} // end namespace internal
+
 /** \ingroup SparseCholesky_Module
-  * \brief A direct sparse Cholesky factorizations
+  * \brief A base class for direct sparse Cholesky factorizations
   *
-  * These classes provide LL^T and LDL^T Cholesky factorizations of sparse matrices that are
-  * selfadjoint and positive definite. The factorization allows for solving A.X = B where
+  * This is a base class for LL^T and LDL^T Cholesky factorizations of sparse matrices that are
+  * selfadjoint and positive definite. These factorizations allow for solving A.X = B where
   * X and B can be either dense or sparse.
   * 
   * In order to reduce the fill-in, a symmetric permutation P is applied prior to the factorization
   * such that the factorized matrix is P A P^-1.
   *
-  * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
-  * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower
-  *               or Upper. Default is Lower.
+  * \tparam Derived the type of the derived class, that is the actual factorization type.
   *
   */
 template<typename Derived>
-class SimplicialCholeskyBase : internal::noncopyable
+class SimplicialCholeskyBase : public SparseSolverBase<Derived>
 {
+    typedef SparseSolverBase<Derived> Base;
+    using Base::m_isInitialized;
+    
   public:
     typedef typename internal::traits<Derived>::MatrixType MatrixType;
     typedef typename internal::traits<Derived>::OrderingType OrderingType;
     enum { UpLo = internal::traits<Derived>::UpLo };
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
-    typedef SparseMatrix<Scalar,ColMajor,Index> CholMatrixType;
+    typedef typename MatrixType::StorageIndex StorageIndex;
+    typedef SparseMatrix<Scalar,ColMajor,StorageIndex> CholMatrixType;
+    typedef CholMatrixType const * ConstCholMatrixPtr;
     typedef Matrix<Scalar,Dynamic,1> VectorType;
+    typedef Matrix<StorageIndex,Dynamic,1> VectorI;
+
+    enum {
+      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+    };
 
   public:
+    
+    using Base::derived;
 
     /** Default constructor */
     SimplicialCholeskyBase()
-      : m_info(Success), m_isInitialized(false), m_shiftOffset(0), m_shiftScale(1)
+      : m_info(Success), m_shiftOffset(0), m_shiftScale(1)
     {}
 
-    SimplicialCholeskyBase(const MatrixType& matrix)
-      : m_info(Success), m_isInitialized(false), m_shiftOffset(0), m_shiftScale(1)
+    explicit SimplicialCholeskyBase(const MatrixType& matrix)
+      : m_info(Success), m_shiftOffset(0), m_shiftScale(1)
     {
       derived().compute(matrix);
     }
@@ -79,42 +110,14 @@ class SimplicialCholeskyBase : internal::noncopyable
       return m_info;
     }
     
-    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::solve_retval<SimplicialCholeskyBase, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "Simplicial LLT or LDLT is not initialized.");
-      eigen_assert(rows()==b.rows()
-                && "SimplicialCholeskyBase::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::solve_retval<SimplicialCholeskyBase, Rhs>(*this, b.derived());
-    }
-    
-    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::sparse_solve_retval<SimplicialCholeskyBase, Rhs>
-    solve(const SparseMatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "Simplicial LLT or LDLT is not initialized.");
-      eigen_assert(rows()==b.rows()
-                && "SimplicialCholesky::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::sparse_solve_retval<SimplicialCholeskyBase, Rhs>(*this, b.derived());
-    }
-    
     /** \returns the permutation P
       * \sa permutationPinv() */
-    const PermutationMatrix<Dynamic,Dynamic,Index>& permutationP() const
+    const PermutationMatrix<Dynamic,Dynamic,StorageIndex>& permutationP() const
     { return m_P; }
     
     /** \returns the inverse P^-1 of the permutation P
       * \sa permutationP() */
-    const PermutationMatrix<Dynamic,Dynamic,Index>& permutationPinv() const
+    const PermutationMatrix<Dynamic,Dynamic,StorageIndex>& permutationPinv() const
     { return m_Pinv; }
 
     /** Sets the shift parameters that will be used to adjust the diagonal coefficients during the numerical factorization.
@@ -150,7 +153,7 @@ class SimplicialCholeskyBase : internal::noncopyable
 
     /** \internal */
     template<typename Rhs,typename Dest>
-    void _solve(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const
+    void _solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const
     {
       eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or symbolic()/numeric()");
       eigen_assert(m_matrix.rows()==b.rows());
@@ -175,6 +178,12 @@ class SimplicialCholeskyBase : internal::noncopyable
       if(m_P.size()>0)
         dest = m_Pinv * dest;
     }
+    
+    template<typename Rhs,typename Dest>
+    void _solve_impl(const SparseMatrixBase<Rhs> &b, SparseMatrixBase<Dest> &dest) const
+    {
+      internal::solve_sparse_through_dense_panels(derived(), b, dest);
+    }
 
 #endif // EIGEN_PARSED_BY_DOXYGEN
 
@@ -186,20 +195,33 @@ class SimplicialCholeskyBase : internal::noncopyable
     {
       eigen_assert(matrix.rows()==matrix.cols());
       Index size = matrix.cols();
-      CholMatrixType ap(size,size);
-      ordering(matrix, ap);
-      analyzePattern_preordered(ap, DoLDLT);
-      factorize_preordered<DoLDLT>(ap);
+      CholMatrixType tmp(size,size);
+      ConstCholMatrixPtr pmat;
+      ordering(matrix, pmat, tmp);
+      analyzePattern_preordered(*pmat, DoLDLT);
+      factorize_preordered<DoLDLT>(*pmat);
     }
     
     template<bool DoLDLT>
     void factorize(const MatrixType& a)
     {
       eigen_assert(a.rows()==a.cols());
-      int size = a.cols();
-      CholMatrixType ap(size,size);
-      ap.template selfadjointView<Upper>() = a.template selfadjointView<UpLo>().twistedBy(m_P);
-      factorize_preordered<DoLDLT>(ap);
+      Index size = a.cols();
+      CholMatrixType tmp(size,size);
+      ConstCholMatrixPtr pmat;
+      
+      if(m_P.size()==0 && (UpLo&Upper)==Upper)
+      {
+        // If there is no ordering, try to directly use the input matrix without any copy
+        internal::simplicial_cholesky_grab_input<CholMatrixType,MatrixType>::run(a, pmat, tmp);
+      }
+      else
+      {
+        tmp.template selfadjointView<Upper>() = a.template selfadjointView<UpLo>().twistedBy(m_P);
+        pmat = &tmp;
+      }
+      
+      factorize_preordered<DoLDLT>(*pmat);
     }
 
     template<bool DoLDLT>
@@ -208,14 +230,15 @@ class SimplicialCholeskyBase : internal::noncopyable
     void analyzePattern(const MatrixType& a, bool doLDLT)
     {
       eigen_assert(a.rows()==a.cols());
-      int size = a.cols();
-      CholMatrixType ap(size,size);
-      ordering(a, ap);
-      analyzePattern_preordered(ap,doLDLT);
+      Index size = a.cols();
+      CholMatrixType tmp(size,size);
+      ConstCholMatrixPtr pmat;
+      ordering(a, pmat, tmp);
+      analyzePattern_preordered(*pmat,doLDLT);
     }
     void analyzePattern_preordered(const CholMatrixType& a, bool doLDLT);
     
-    void ordering(const MatrixType& a, CholMatrixType& ap);
+    void ordering(const MatrixType& a, ConstCholMatrixPtr &pmat, CholMatrixType& ap);
 
     /** keeps off-diagonal entries; drops diagonal entries */
     struct keep_diag {
@@ -226,24 +249,23 @@ class SimplicialCholeskyBase : internal::noncopyable
     };
 
     mutable ComputationInfo m_info;
-    bool m_isInitialized;
     bool m_factorizationIsOk;
     bool m_analysisIsOk;
     
     CholMatrixType m_matrix;
     VectorType m_diag;                                // the diagonal coefficients (LDLT mode)
-    VectorXi m_parent;                                // elimination tree
-    VectorXi m_nonZerosPerCol;
-    PermutationMatrix<Dynamic,Dynamic,Index> m_P;     // the permutation
-    PermutationMatrix<Dynamic,Dynamic,Index> m_Pinv;  // the inverse permutation
+    VectorI m_parent;                                 // elimination tree
+    VectorI m_nonZerosPerCol;
+    PermutationMatrix<Dynamic,Dynamic,StorageIndex> m_P;     // the permutation
+    PermutationMatrix<Dynamic,Dynamic,StorageIndex> m_Pinv;  // the inverse permutation
 
     RealScalar m_shiftOffset;
     RealScalar m_shiftScale;
 };
 
-template<typename _MatrixType, int _UpLo = Lower, typename _Ordering = AMDOrdering<typename _MatrixType::Index> > class SimplicialLLT;
-template<typename _MatrixType, int _UpLo = Lower, typename _Ordering = AMDOrdering<typename _MatrixType::Index> > class SimplicialLDLT;
-template<typename _MatrixType, int _UpLo = Lower, typename _Ordering = AMDOrdering<typename _MatrixType::Index> > class SimplicialCholesky;
+template<typename _MatrixType, int _UpLo = Lower, typename _Ordering = AMDOrdering<typename _MatrixType::StorageIndex> > class SimplicialLLT;
+template<typename _MatrixType, int _UpLo = Lower, typename _Ordering = AMDOrdering<typename _MatrixType::StorageIndex> > class SimplicialLDLT;
+template<typename _MatrixType, int _UpLo = Lower, typename _Ordering = AMDOrdering<typename _MatrixType::StorageIndex> > class SimplicialCholesky;
 
 namespace internal {
 
@@ -253,12 +275,12 @@ template<typename _MatrixType, int _UpLo, typename _Ordering> struct traits<Simp
   typedef _Ordering OrderingType;
   enum { UpLo = _UpLo };
   typedef typename MatrixType::Scalar                         Scalar;
-  typedef typename MatrixType::Index                          Index;
-  typedef SparseMatrix<Scalar, ColMajor, Index>               CholMatrixType;
-  typedef SparseTriangularView<CholMatrixType, Eigen::Lower>  MatrixL;
-  typedef SparseTriangularView<typename CholMatrixType::AdjointReturnType, Eigen::Upper>   MatrixU;
-  static inline MatrixL getL(const MatrixType& m) { return m; }
-  static inline MatrixU getU(const MatrixType& m) { return m.adjoint(); }
+  typedef typename MatrixType::StorageIndex                   StorageIndex;
+  typedef SparseMatrix<Scalar, ColMajor, StorageIndex>        CholMatrixType;
+  typedef TriangularView<const CholMatrixType, Eigen::Lower>  MatrixL;
+  typedef TriangularView<const typename CholMatrixType::AdjointReturnType, Eigen::Upper>   MatrixU;
+  static inline MatrixL getL(const MatrixType& m) { return MatrixL(m); }
+  static inline MatrixU getU(const MatrixType& m) { return MatrixU(m.adjoint()); }
 };
 
 template<typename _MatrixType,int _UpLo, typename _Ordering> struct traits<SimplicialLDLT<_MatrixType,_UpLo,_Ordering> >
@@ -267,12 +289,12 @@ template<typename _MatrixType,int _UpLo, typename _Ordering> struct traits<Simpl
   typedef _Ordering OrderingType;
   enum { UpLo = _UpLo };
   typedef typename MatrixType::Scalar                             Scalar;
-  typedef typename MatrixType::Index                              Index;
-  typedef SparseMatrix<Scalar, ColMajor, Index>                   CholMatrixType;
-  typedef SparseTriangularView<CholMatrixType, Eigen::UnitLower>  MatrixL;
-  typedef SparseTriangularView<typename CholMatrixType::AdjointReturnType, Eigen::UnitUpper> MatrixU;
-  static inline MatrixL getL(const MatrixType& m) { return m; }
-  static inline MatrixU getU(const MatrixType& m) { return m.adjoint(); }
+  typedef typename MatrixType::StorageIndex                       StorageIndex;
+  typedef SparseMatrix<Scalar, ColMajor, StorageIndex>            CholMatrixType;
+  typedef TriangularView<const CholMatrixType, Eigen::UnitLower>  MatrixL;
+  typedef TriangularView<const typename CholMatrixType::AdjointReturnType, Eigen::UnitUpper> MatrixU;
+  static inline MatrixL getL(const MatrixType& m) { return MatrixL(m); }
+  static inline MatrixU getU(const MatrixType& m) { return MatrixU(m.adjoint()); }
 };
 
 template<typename _MatrixType, int _UpLo, typename _Ordering> struct traits<SimplicialCholesky<_MatrixType,_UpLo,_Ordering> >
@@ -300,6 +322,8 @@ template<typename _MatrixType, int _UpLo, typename _Ordering> struct traits<Simp
   *               or Upper. Default is Lower.
   * \tparam _Ordering The ordering method to use, either AMDOrdering<> or NaturalOrdering<>. Default is AMDOrdering<>
   *
+  * \implsparsesolverconcept
+  *
   * \sa class SimplicialLDLT, class AMDOrdering, class NaturalOrdering
   */
 template<typename _MatrixType, int _UpLo, typename _Ordering>
@@ -311,7 +335,7 @@ public:
     typedef SimplicialCholeskyBase<SimplicialLLT> Base;
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
+    typedef typename MatrixType::StorageIndex StorageIndex;
     typedef SparseMatrix<Scalar,ColMajor,Index> CholMatrixType;
     typedef Matrix<Scalar,Dynamic,1> VectorType;
     typedef internal::traits<SimplicialLLT> Traits;
@@ -321,7 +345,7 @@ public:
     /** Default constructor */
     SimplicialLLT() : Base() {}
     /** Constructs and performs the LLT factorization of \a matrix */
-    SimplicialLLT(const MatrixType& matrix)
+    explicit SimplicialLLT(const MatrixType& matrix)
         : Base(matrix) {}
 
     /** \returns an expression of the factor L */
@@ -389,6 +413,8 @@ public:
   *               or Upper. Default is Lower.
   * \tparam _Ordering The ordering method to use, either AMDOrdering<> or NaturalOrdering<>. Default is AMDOrdering<>
   *
+  * \implsparsesolverconcept
+  *
   * \sa class SimplicialLLT, class AMDOrdering, class NaturalOrdering
   */
 template<typename _MatrixType, int _UpLo, typename _Ordering>
@@ -400,8 +426,8 @@ public:
     typedef SimplicialCholeskyBase<SimplicialLDLT> Base;
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
-    typedef SparseMatrix<Scalar,ColMajor,Index> CholMatrixType;
+    typedef typename MatrixType::StorageIndex StorageIndex;
+    typedef SparseMatrix<Scalar,ColMajor,StorageIndex> CholMatrixType;
     typedef Matrix<Scalar,Dynamic,1> VectorType;
     typedef internal::traits<SimplicialLDLT> Traits;
     typedef typename Traits::MatrixL  MatrixL;
@@ -411,7 +437,7 @@ public:
     SimplicialLDLT() : Base() {}
 
     /** Constructs and performs the LLT factorization of \a matrix */
-    SimplicialLDLT(const MatrixType& matrix)
+    explicit SimplicialLDLT(const MatrixType& matrix)
         : Base(matrix) {}
 
     /** \returns a vector expression of the diagonal D */
@@ -482,8 +508,8 @@ public:
     typedef SimplicialCholeskyBase<SimplicialCholesky> Base;
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
-    typedef SparseMatrix<Scalar,ColMajor,Index> CholMatrixType;
+    typedef typename MatrixType::StorageIndex StorageIndex;
+    typedef SparseMatrix<Scalar,ColMajor,StorageIndex> CholMatrixType;
     typedef Matrix<Scalar,Dynamic,1> VectorType;
     typedef internal::traits<SimplicialCholesky> Traits;
     typedef internal::traits<SimplicialLDLT<MatrixType,UpLo> > LDLTTraits;
@@ -491,7 +517,7 @@ public:
   public:
     SimplicialCholesky() : Base(), m_LDLT(true) {}
 
-    SimplicialCholesky(const MatrixType& matrix)
+    explicit SimplicialCholesky(const MatrixType& matrix)
       : Base(), m_LDLT(true)
     {
       compute(matrix);
@@ -560,7 +586,7 @@ public:
 
     /** \internal */
     template<typename Rhs,typename Dest>
-    void _solve(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const
+    void _solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const
     {
       eigen_assert(Base::m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or symbolic()/numeric()");
       eigen_assert(Base::m_matrix.rows()==b.rows());
@@ -596,6 +622,13 @@ public:
         dest = Base::m_Pinv * dest;
     }
     
+    /** \internal */
+    template<typename Rhs,typename Dest>
+    void _solve_impl(const SparseMatrixBase<Rhs> &b, SparseMatrixBase<Dest> &dest) const
+    {
+      internal::solve_sparse_through_dense_panels(*this, b, dest);
+    }
+    
     Scalar determinant() const
     {
       if(m_LDLT)
@@ -614,58 +647,43 @@ public:
 };
 
 template<typename Derived>
-void SimplicialCholeskyBase<Derived>::ordering(const MatrixType& a, CholMatrixType& ap)
+void SimplicialCholeskyBase<Derived>::ordering(const MatrixType& a, ConstCholMatrixPtr &pmat, CholMatrixType& ap)
 {
   eigen_assert(a.rows()==a.cols());
   const Index size = a.rows();
-  // Note that amd compute the inverse permutation
+  pmat = &ap;
+  // Note that ordering methods compute the inverse permutation
+  if(!internal::is_same<OrderingType,NaturalOrdering<Index> >::value)
   {
-    CholMatrixType C;
-    C = a.template selfadjointView<UpLo>();
+    {
+      CholMatrixType C;
+      C = a.template selfadjointView<UpLo>();
+      
+      OrderingType ordering;
+      ordering(C,m_Pinv);
+    }
+
+    if(m_Pinv.size()>0) m_P = m_Pinv.inverse();
+    else                m_P.resize(0);
     
-    OrderingType ordering;
-    ordering(C,m_Pinv);
+    ap.resize(size,size);
+    ap.template selfadjointView<Upper>() = a.template selfadjointView<UpLo>().twistedBy(m_P);
   }
-
-  if(m_Pinv.size()>0)
-    m_P = m_Pinv.inverse();
   else
+  {
+    m_Pinv.resize(0);
     m_P.resize(0);
-
-  ap.resize(size,size);
-  ap.template selfadjointView<Upper>() = a.template selfadjointView<UpLo>().twistedBy(m_P);
+    if(int(UpLo)==int(Lower) || MatrixType::IsRowMajor)
+    {
+      // we have to transpose the lower part to to the upper one
+      ap.resize(size,size);
+      ap.template selfadjointView<Upper>() = a.template selfadjointView<UpLo>();
+    }
+    else
+      internal::simplicial_cholesky_grab_input<CholMatrixType,MatrixType>::run(a, pmat, ap);
+  }  
 }
 
-namespace internal {
-  
-template<typename Derived, typename Rhs>
-struct solve_retval<SimplicialCholeskyBase<Derived>, Rhs>
-  : solve_retval_base<SimplicialCholeskyBase<Derived>, Rhs>
-{
-  typedef SimplicialCholeskyBase<Derived> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec().derived()._solve(rhs(),dst);
-  }
-};
-
-template<typename Derived, typename Rhs>
-struct sparse_solve_retval<SimplicialCholeskyBase<Derived>, Rhs>
-  : sparse_solve_retval_base<SimplicialCholeskyBase<Derived>, Rhs>
-{
-  typedef SimplicialCholeskyBase<Derived> Dec;
-  EIGEN_MAKE_SPARSE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    this->defaultEvalTo(dst);
-  }
-};
-
-} // end namespace internal
-
 } // end namespace Eigen
 
 #endif // EIGEN_SIMPLICIAL_CHOLESKY_H
diff --git a/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h b/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h
index 7aaf702be..31e06995b 100644
--- a/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h
+++ b/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h
@@ -50,14 +50,14 @@ namespace Eigen {
 template<typename Derived>
 void SimplicialCholeskyBase<Derived>::analyzePattern_preordered(const CholMatrixType& ap, bool doLDLT)
 {
-  const Index size = ap.rows();
+  const StorageIndex size = StorageIndex(ap.rows());
   m_matrix.resize(size, size);
   m_parent.resize(size);
   m_nonZerosPerCol.resize(size);
 
-  ei_declare_aligned_stack_constructed_variable(Index, tags, size, 0);
+  ei_declare_aligned_stack_constructed_variable(StorageIndex, tags, size, 0);
 
-  for(Index k = 0; k < size; ++k)
+  for(StorageIndex k = 0; k < size; ++k)
   {
     /* L(k,:) pattern: all nodes reachable in etree from nz in A(0:k-1,k) */
     m_parent[k] = -1;             /* parent of k is not yet known */
@@ -65,7 +65,7 @@ void SimplicialCholeskyBase<Derived>::analyzePattern_preordered(const CholMatrix
     m_nonZerosPerCol[k] = 0;      /* count of nonzeros in column k of L */
     for(typename CholMatrixType::InnerIterator it(ap,k); it; ++it)
     {
-      Index i = it.index();
+      StorageIndex i = it.index();
       if(i < k)
       {
         /* follow path from i to root of etree, stop at flagged node */
@@ -82,9 +82,9 @@ void SimplicialCholeskyBase<Derived>::analyzePattern_preordered(const CholMatrix
   }
 
   /* construct Lp index array from m_nonZerosPerCol column counts */
-  Index* Lp = m_matrix.outerIndexPtr();
+  StorageIndex* Lp = m_matrix.outerIndexPtr();
   Lp[0] = 0;
-  for(Index k = 0; k < size; ++k)
+  for(StorageIndex k = 0; k < size; ++k)
     Lp[k+1] = Lp[k] + m_nonZerosPerCol[k] + (doLDLT ? 0 : 1);
 
   m_matrix.resizeNonZeros(Lp[size]);
@@ -104,31 +104,31 @@ void SimplicialCholeskyBase<Derived>::factorize_preordered(const CholMatrixType&
 
   eigen_assert(m_analysisIsOk && "You must first call analyzePattern()");
   eigen_assert(ap.rows()==ap.cols());
-  const Index size = ap.rows();
-  eigen_assert(m_parent.size()==size);
-  eigen_assert(m_nonZerosPerCol.size()==size);
+  eigen_assert(m_parent.size()==ap.rows());
+  eigen_assert(m_nonZerosPerCol.size()==ap.rows());
 
-  const Index* Lp = m_matrix.outerIndexPtr();
-  Index* Li = m_matrix.innerIndexPtr();
+  const StorageIndex size = StorageIndex(ap.rows());
+  const StorageIndex* Lp = m_matrix.outerIndexPtr();
+  StorageIndex* Li = m_matrix.innerIndexPtr();
   Scalar* Lx = m_matrix.valuePtr();
 
   ei_declare_aligned_stack_constructed_variable(Scalar, y, size, 0);
-  ei_declare_aligned_stack_constructed_variable(Index,  pattern, size, 0);
-  ei_declare_aligned_stack_constructed_variable(Index,  tags, size, 0);
+  ei_declare_aligned_stack_constructed_variable(StorageIndex,  pattern, size, 0);
+  ei_declare_aligned_stack_constructed_variable(StorageIndex,  tags, size, 0);
 
   bool ok = true;
   m_diag.resize(DoLDLT ? size : 0);
 
-  for(Index k = 0; k < size; ++k)
+  for(StorageIndex k = 0; k < size; ++k)
   {
     // compute nonzero pattern of kth row of L, in topological order
     y[k] = 0.0;                     // Y(0:k) is now all zero
-    Index top = size;               // stack for pattern is empty
+    StorageIndex top = size;               // stack for pattern is empty
     tags[k] = k;                    // mark node k as visited
     m_nonZerosPerCol[k] = 0;        // count of nonzeros in column k of L
-    for(typename MatrixType::InnerIterator it(ap,k); it; ++it)
+    for(typename CholMatrixType::InnerIterator it(ap,k); it; ++it)
     {
-      Index i = it.index();
+      StorageIndex i = it.index();
       if(i <= k)
       {
         y[i] += numext::conj(it.value());            /* scatter A(i,k) into Y (sum duplicates) */
diff --git a/Eigen/src/SparseCore/AmbiVector.h b/Eigen/src/SparseCore/AmbiVector.h
index 220c6451c..8a5cc91f2 100644
--- a/Eigen/src/SparseCore/AmbiVector.h
+++ b/Eigen/src/SparseCore/AmbiVector.h
@@ -19,15 +19,15 @@ namespace internal {
   *
   * See BasicSparseLLT and SparseProduct for usage examples.
   */
-template<typename _Scalar, typename _Index>
+template<typename _Scalar, typename _StorageIndex>
 class AmbiVector
 {
   public:
     typedef _Scalar Scalar;
-    typedef _Index Index;
+    typedef _StorageIndex StorageIndex;
     typedef typename NumTraits<Scalar>::Real RealScalar;
 
-    AmbiVector(Index size)
+    explicit AmbiVector(Index size)
       : m_buffer(0), m_zero(0), m_size(0), m_allocatedSize(0), m_allocatedElements(0), m_mode(-1)
     {
       resize(size);
@@ -39,7 +39,7 @@ class AmbiVector
     Index nonZeros() const;
 
     /** Specifies a sub-vector to work on */
-    void setBounds(Index start, Index end) { m_start = start; m_end = end; }
+    void setBounds(Index start, Index end) { m_start = convert_index(start); m_end = convert_index(end); }
 
     void setZero();
 
@@ -55,12 +55,16 @@ class AmbiVector
     {
       if (m_allocatedSize < size)
         reallocate(size);
-      m_size = size;
+      m_size = convert_index(size);
     }
 
-    Index size() const { return m_size; }
+    StorageIndex size() const { return m_size; }
 
   protected:
+    StorageIndex convert_index(Index idx)
+    {
+      return internal::convert_index<StorageIndex>(idx);
+    }
 
     void reallocate(Index size)
     {
@@ -70,15 +74,15 @@ class AmbiVector
       if (size<1000)
       {
         Index allocSize = (size * sizeof(ListEl) + sizeof(Scalar) - 1)/sizeof(Scalar);
-        m_allocatedElements = (allocSize*sizeof(Scalar))/sizeof(ListEl);
+        m_allocatedElements = convert_index((allocSize*sizeof(Scalar))/sizeof(ListEl));
         m_buffer = new Scalar[allocSize];
       }
       else
       {
-        m_allocatedElements = (size*sizeof(Scalar))/sizeof(ListEl);
+        m_allocatedElements = convert_index((size*sizeof(Scalar))/sizeof(ListEl));
         m_buffer = new Scalar[size];
       }
-      m_size = size;
+      m_size = convert_index(size);
       m_start = 0;
       m_end = m_size;
     }
@@ -86,7 +90,7 @@ class AmbiVector
     void reallocateSparse()
     {
       Index copyElements = m_allocatedElements;
-      m_allocatedElements = (std::min)(Index(m_allocatedElements*1.5),m_size);
+      m_allocatedElements = (std::min)(StorageIndex(m_allocatedElements*1.5),m_size);
       Index allocSize = m_allocatedElements * sizeof(ListEl);
       allocSize = (allocSize + sizeof(Scalar) - 1)/sizeof(Scalar);
       Scalar* newBuffer = new Scalar[allocSize];
@@ -99,30 +103,30 @@ class AmbiVector
     // element type of the linked list
     struct ListEl
     {
-      Index next;
-      Index index;
+      StorageIndex next;
+      StorageIndex index;
       Scalar value;
     };
 
     // used to store data in both mode
     Scalar* m_buffer;
     Scalar m_zero;
-    Index m_size;
-    Index m_start;
-    Index m_end;
-    Index m_allocatedSize;
-    Index m_allocatedElements;
-    Index m_mode;
+    StorageIndex m_size;
+    StorageIndex m_start;
+    StorageIndex m_end;
+    StorageIndex m_allocatedSize;
+    StorageIndex m_allocatedElements;
+    StorageIndex m_mode;
 
     // linked list mode
-    Index m_llStart;
-    Index m_llCurrent;
-    Index m_llSize;
+    StorageIndex m_llStart;
+    StorageIndex m_llCurrent;
+    StorageIndex m_llSize;
 };
 
 /** \returns the number of non zeros in the current sub vector */
-template<typename _Scalar,typename _Index>
-_Index AmbiVector<_Scalar,_Index>::nonZeros() const
+template<typename _Scalar,typename _StorageIndex>
+Index AmbiVector<_Scalar,_StorageIndex>::nonZeros() const
 {
   if (m_mode==IsSparse)
     return m_llSize;
@@ -130,8 +134,8 @@ _Index AmbiVector<_Scalar,_Index>::nonZeros() const
     return m_end - m_start;
 }
 
-template<typename _Scalar,typename _Index>
-void AmbiVector<_Scalar,_Index>::init(double estimatedDensity)
+template<typename _Scalar,typename _StorageIndex>
+void AmbiVector<_Scalar,_StorageIndex>::init(double estimatedDensity)
 {
   if (estimatedDensity>0.1)
     init(IsDense);
@@ -139,8 +143,8 @@ void AmbiVector<_Scalar,_Index>::init(double estimatedDensity)
     init(IsSparse);
 }
 
-template<typename _Scalar,typename _Index>
-void AmbiVector<_Scalar,_Index>::init(int mode)
+template<typename _Scalar,typename _StorageIndex>
+void AmbiVector<_Scalar,_StorageIndex>::init(int mode)
 {
   m_mode = mode;
   if (m_mode==IsSparse)
@@ -155,15 +159,15 @@ void AmbiVector<_Scalar,_Index>::init(int mode)
   *
   * Don't worry, this function is extremely cheap.
   */
-template<typename _Scalar,typename _Index>
-void AmbiVector<_Scalar,_Index>::restart()
+template<typename _Scalar,typename _StorageIndex>
+void AmbiVector<_Scalar,_StorageIndex>::restart()
 {
   m_llCurrent = m_llStart;
 }
 
 /** Set all coefficients of current subvector to zero */
-template<typename _Scalar,typename _Index>
-void AmbiVector<_Scalar,_Index>::setZero()
+template<typename _Scalar,typename _StorageIndex>
+void AmbiVector<_Scalar,_StorageIndex>::setZero()
 {
   if (m_mode==IsDense)
   {
@@ -178,8 +182,8 @@ void AmbiVector<_Scalar,_Index>::setZero()
   }
 }
 
-template<typename _Scalar,typename _Index>
-_Scalar& AmbiVector<_Scalar,_Index>::coeffRef(_Index i)
+template<typename _Scalar,typename _StorageIndex>
+_Scalar& AmbiVector<_Scalar,_StorageIndex>::coeffRef(Index i)
 {
   if (m_mode==IsDense)
     return m_buffer[i];
@@ -195,7 +199,7 @@ _Scalar& AmbiVector<_Scalar,_Index>::coeffRef(_Index i)
       m_llCurrent = 0;
       ++m_llSize;
       llElements[0].value = Scalar(0);
-      llElements[0].index = i;
+      llElements[0].index = convert_index(i);
       llElements[0].next = -1;
       return llElements[0].value;
     }
@@ -204,7 +208,7 @@ _Scalar& AmbiVector<_Scalar,_Index>::coeffRef(_Index i)
       // this is going to be the new first element of the list
       ListEl& el = llElements[m_llSize];
       el.value = Scalar(0);
-      el.index = i;
+      el.index = convert_index(i);
       el.next = m_llStart;
       m_llStart = m_llSize;
       ++m_llSize;
@@ -213,7 +217,7 @@ _Scalar& AmbiVector<_Scalar,_Index>::coeffRef(_Index i)
     }
     else
     {
-      Index nextel = llElements[m_llCurrent].next;
+      StorageIndex nextel = llElements[m_llCurrent].next;
       eigen_assert(i>=llElements[m_llCurrent].index && "you must call restart() before inserting an element with lower or equal index");
       while (nextel >= 0 && llElements[nextel].index<=i)
       {
@@ -237,7 +241,7 @@ _Scalar& AmbiVector<_Scalar,_Index>::coeffRef(_Index i)
         // let's insert a new coefficient
         ListEl& el = llElements[m_llSize];
         el.value = Scalar(0);
-        el.index = i;
+        el.index = convert_index(i);
         el.next = llElements[m_llCurrent].next;
         llElements[m_llCurrent].next = m_llSize;
         ++m_llSize;
@@ -247,8 +251,8 @@ _Scalar& AmbiVector<_Scalar,_Index>::coeffRef(_Index i)
   }
 }
 
-template<typename _Scalar,typename _Index>
-_Scalar& AmbiVector<_Scalar,_Index>::coeff(_Index i)
+template<typename _Scalar,typename _StorageIndex>
+_Scalar& AmbiVector<_Scalar,_StorageIndex>::coeff(Index i)
 {
   if (m_mode==IsDense)
     return m_buffer[i];
@@ -275,8 +279,8 @@ _Scalar& AmbiVector<_Scalar,_Index>::coeff(_Index i)
 }
 
 /** Iterator over the nonzero coefficients */
-template<typename _Scalar,typename _Index>
-class AmbiVector<_Scalar,_Index>::Iterator
+template<typename _Scalar,typename _StorageIndex>
+class AmbiVector<_Scalar,_StorageIndex>::Iterator
 {
   public:
     typedef _Scalar Scalar;
@@ -288,7 +292,7 @@ class AmbiVector<_Scalar,_Index>::Iterator
       * In practice, all coefficients having a magnitude smaller than \a epsilon
       * are skipped.
       */
-    Iterator(const AmbiVector& vec, const RealScalar& epsilon = 0)
+    explicit Iterator(const AmbiVector& vec, const RealScalar& epsilon = 0)
       : m_vector(vec)
     {
       using std::abs;
@@ -320,7 +324,7 @@ class AmbiVector<_Scalar,_Index>::Iterator
       }
     }
 
-    Index index() const { return m_cachedIndex; }
+    StorageIndex index() const { return m_cachedIndex; }
     Scalar value() const { return m_cachedValue; }
 
     operator bool() const { return m_cachedIndex>=0; }
@@ -332,7 +336,7 @@ class AmbiVector<_Scalar,_Index>::Iterator
       {
         do {
           ++m_cachedIndex;
-        } while (m_cachedIndex<m_vector.m_end && abs(m_vector.m_buffer[m_cachedIndex])<m_epsilon);
+        } while (m_cachedIndex<m_vector.m_end && abs(m_vector.m_buffer[m_cachedIndex])<=m_epsilon);
         if (m_cachedIndex<m_vector.m_end)
           m_cachedValue = m_vector.m_buffer[m_cachedIndex];
         else
@@ -343,7 +347,7 @@ class AmbiVector<_Scalar,_Index>::Iterator
         ListEl* EIGEN_RESTRICT llElements = reinterpret_cast<ListEl*>(m_vector.m_buffer);
         do {
           m_currentEl = llElements[m_currentEl].next;
-        } while (m_currentEl>=0 && abs(llElements[m_currentEl].value)<m_epsilon);
+        } while (m_currentEl>=0 && abs(llElements[m_currentEl].value)<=m_epsilon);
         if (m_currentEl<0)
         {
           m_cachedIndex = -1;
@@ -359,9 +363,9 @@ class AmbiVector<_Scalar,_Index>::Iterator
 
   protected:
     const AmbiVector& m_vector; // the target vector
-    Index m_currentEl;            // the current element in sparse/linked-list mode
+    StorageIndex m_currentEl;   // the current element in sparse/linked-list mode
     RealScalar m_epsilon;       // epsilon used to prune zero coefficients
-    Index m_cachedIndex;          // current coordinate
+    StorageIndex m_cachedIndex; // current coordinate
     Scalar m_cachedValue;       // current value
     bool m_isDense;             // mode of the vector
 };
diff --git a/Eigen/src/SparseCore/CMakeLists.txt b/Eigen/src/SparseCore/CMakeLists.txt
deleted file mode 100644
index d860452a6..000000000
--- a/Eigen/src/SparseCore/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_SparseCore_SRCS "*.h")
-
-INSTALL(FILES 
-  ${Eigen_SparseCore_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/SparseCore COMPONENT Devel
-  )
diff --git a/Eigen/src/SparseCore/CompressedStorage.h b/Eigen/src/SparseCore/CompressedStorage.h
index a667cb56e..d89fa0dae 100644
--- a/Eigen/src/SparseCore/CompressedStorage.h
+++ b/Eigen/src/SparseCore/CompressedStorage.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -18,13 +18,13 @@ namespace internal {
   * Stores a sparse set of values as a list of values and a list of indices.
   *
   */
-template<typename _Scalar,typename _Index>
+template<typename _Scalar,typename _StorageIndex>
 class CompressedStorage
 {
   public:
 
     typedef _Scalar Scalar;
-    typedef _Index Index;
+    typedef _StorageIndex StorageIndex;
 
   protected:
 
@@ -36,7 +36,7 @@ class CompressedStorage
       : m_values(0), m_indices(0), m_size(0), m_allocatedSize(0)
     {}
 
-    CompressedStorage(size_t size)
+    explicit CompressedStorage(Index size)
       : m_values(0), m_indices(0), m_size(0), m_allocatedSize(0)
     {
       resize(size);
@@ -51,8 +51,11 @@ class CompressedStorage
     CompressedStorage& operator=(const CompressedStorage& other)
     {
       resize(other.size());
-      internal::smart_copy(other.m_values,  other.m_values  + m_size, m_values);
-      internal::smart_copy(other.m_indices, other.m_indices + m_size, m_indices);
+      if(other.size()>0)
+      {
+        internal::smart_copy(other.m_values,  other.m_values  + m_size, m_values);
+        internal::smart_copy(other.m_indices, other.m_indices + m_size, m_indices);
+      }
       return *this;
     }
 
@@ -70,9 +73,9 @@ class CompressedStorage
       delete[] m_indices;
     }
 
-    void reserve(size_t size)
+    void reserve(Index size)
     {
-      size_t newAllocatedSize = m_size + size;
+      Index newAllocatedSize = m_size + size;
       if (newAllocatedSize > m_allocatedSize)
         reallocate(newAllocatedSize);
     }
@@ -83,39 +86,40 @@ class CompressedStorage
         reallocate(m_size);
     }
 
-    void resize(size_t size, double reserveSizeFactor = 0)
+    void resize(Index size, double reserveSizeFactor = 0)
     {
       if (m_allocatedSize<size)
-        reallocate(size + size_t(reserveSizeFactor*double(size)));
+      {
+        Index realloc_size = (std::min<Index>)(NumTraits<StorageIndex>::highest(),  size + Index(reserveSizeFactor*double(size)));
+        if(realloc_size<size)
+          internal::throw_std_bad_alloc();
+        reallocate(realloc_size);
+      }
       m_size = size;
     }
 
     void append(const Scalar& v, Index i)
     {
-      Index id = static_cast<Index>(m_size);
+      Index id = m_size;
       resize(m_size+1, 1);
       m_values[id] = v;
-      m_indices[id] = i;
+      m_indices[id] = internal::convert_index<StorageIndex>(i);
     }
 
-    inline size_t size() const { return m_size; }
-    inline size_t allocatedSize() const { return m_allocatedSize; }
+    inline Index size() const { return m_size; }
+    inline Index allocatedSize() const { return m_allocatedSize; }
     inline void clear() { m_size = 0; }
 
-    inline Scalar& value(size_t i) { return m_values[i]; }
-    inline const Scalar& value(size_t i) const { return m_values[i]; }
+    const Scalar* valuePtr() const { return m_values; }
+    Scalar* valuePtr() { return m_values; }
+    const StorageIndex* indexPtr() const { return m_indices; }
+    StorageIndex* indexPtr() { return m_indices; }
 
-    inline Index& index(size_t i) { return m_indices[i]; }
-    inline const Index& index(size_t i) const { return m_indices[i]; }
+    inline Scalar& value(Index i) { eigen_internal_assert(m_values!=0); return m_values[i]; }
+    inline const Scalar& value(Index i) const { eigen_internal_assert(m_values!=0); return m_values[i]; }
 
-    static CompressedStorage Map(Index* indices, Scalar* values, size_t size)
-    {
-      CompressedStorage res;
-      res.m_indices = indices;
-      res.m_values = values;
-      res.m_allocatedSize = res.m_size = size;
-      return res;
-    }
+    inline StorageIndex& index(Index i) { eigen_internal_assert(m_indices!=0); return m_indices[i]; }
+    inline const StorageIndex& index(Index i) const { eigen_internal_assert(m_indices!=0); return m_indices[i]; }
 
     /** \returns the largest \c k such that for all \c j in [0,k) index[\c j]\<\a key */
     inline Index searchLowerIndex(Index key) const
@@ -124,17 +128,17 @@ class CompressedStorage
     }
 
     /** \returns the largest \c k in [start,end) such that for all \c j in [start,k) index[\c j]\<\a key */
-    inline Index searchLowerIndex(size_t start, size_t end, Index key) const
+    inline Index searchLowerIndex(Index start, Index end, Index key) const
     {
       while(end>start)
       {
-        size_t mid = (end+start)>>1;
+        Index mid = (end+start)>>1;
         if (m_indices[mid]<key)
           start = mid+1;
         else
           end = mid;
       }
-      return static_cast<Index>(start);
+      return start;
     }
 
     /** \returns the stored value at index \a key
@@ -147,20 +151,20 @@ class CompressedStorage
         return m_values[m_size-1];
       // ^^  optimization: let's first check if it is the last coefficient
       // (very common in high level algorithms)
-      const size_t id = searchLowerIndex(0,m_size-1,key);
+      const Index id = searchLowerIndex(0,m_size-1,key);
       return ((id<m_size) && (m_indices[id]==key)) ? m_values[id] : defaultValue;
     }
 
     /** Like at(), but the search is performed in the range [start,end) */
-    inline Scalar atInRange(size_t start, size_t end, Index key, const Scalar& defaultValue = Scalar(0)) const
+    inline Scalar atInRange(Index start, Index end, Index key, const Scalar &defaultValue = Scalar(0)) const
     {
       if (start>=end)
-        return Scalar(0);
+        return defaultValue;
       else if (end>start && key==m_indices[end-1])
         return m_values[end-1];
       // ^^  optimization: let's first check if it is the last coefficient
       // (very common in high level algorithms)
-      const size_t id = searchLowerIndex(start,end-1,key);
+      const Index id = searchLowerIndex(start,end-1,key);
       return ((id<end) && (m_indices[id]==key)) ? m_values[id] : defaultValue;
     }
 
@@ -169,16 +173,35 @@ class CompressedStorage
       * such that the keys are sorted. */
     inline Scalar& atWithInsertion(Index key, const Scalar& defaultValue = Scalar(0))
     {
-      size_t id = searchLowerIndex(0,m_size,key);
+      Index id = searchLowerIndex(0,m_size,key);
       if (id>=m_size || m_indices[id]!=key)
       {
-        resize(m_size+1,1);
-        for (size_t j=m_size-1; j>id; --j)
+        if (m_allocatedSize<m_size+1)
+        {
+          m_allocatedSize = 2*(m_size+1);
+          internal::scoped_array<Scalar> newValues(m_allocatedSize);
+          internal::scoped_array<StorageIndex> newIndices(m_allocatedSize);
+
+          // copy first chunk
+          internal::smart_copy(m_values,  m_values +id, newValues.ptr());
+          internal::smart_copy(m_indices, m_indices+id, newIndices.ptr());
+
+          // copy the rest
+          if(m_size>id)
+          {
+            internal::smart_copy(m_values +id,  m_values +m_size, newValues.ptr() +id+1);
+            internal::smart_copy(m_indices+id,  m_indices+m_size, newIndices.ptr()+id+1);
+          }
+          std::swap(m_values,newValues.ptr());
+          std::swap(m_indices,newIndices.ptr());
+        }
+        else if(m_size>id)
         {
-          m_indices[j] = m_indices[j-1];
-          m_values[j] = m_values[j-1];
+          internal::smart_memmove(m_values +id, m_values +m_size, m_values +id+1);
+          internal::smart_memmove(m_indices+id, m_indices+m_size, m_indices+id+1);
         }
-        m_indices[id] = key;
+        m_size++;
+        m_indices[id] = internal::convert_index<StorageIndex>(key);
         m_values[id] = defaultValue;
       }
       return m_values[id];
@@ -186,9 +209,9 @@ class CompressedStorage
 
     void prune(const Scalar& reference, const RealScalar& epsilon = NumTraits<RealScalar>::dummy_precision())
     {
-      size_t k = 0;
-      size_t n = size();
-      for (size_t i=0; i<n; ++i)
+      Index k = 0;
+      Index n = size();
+      for (Index i=0; i<n; ++i)
       {
         if (!internal::isMuchSmallerThan(value(i), reference, epsilon))
         {
@@ -202,27 +225,29 @@ class CompressedStorage
 
   protected:
 
-    inline void reallocate(size_t size)
+    inline void reallocate(Index size)
     {
-      Scalar* newValues  = new Scalar[size];
-      Index* newIndices = new Index[size];
-      size_t copySize = (std::min)(size, m_size);
-      // copy
-      internal::smart_copy(m_values, m_values+copySize, newValues);
-      internal::smart_copy(m_indices, m_indices+copySize, newIndices);
-      // delete old stuff
-      delete[] m_values;
-      delete[] m_indices;
-      m_values = newValues;
-      m_indices = newIndices;
+      #ifdef EIGEN_SPARSE_COMPRESSED_STORAGE_REALLOCATE_PLUGIN
+        EIGEN_SPARSE_COMPRESSED_STORAGE_REALLOCATE_PLUGIN
+      #endif
+      eigen_internal_assert(size!=m_allocatedSize);
+      internal::scoped_array<Scalar> newValues(size);
+      internal::scoped_array<StorageIndex> newIndices(size);
+      Index copySize = (std::min)(size, m_size);
+      if (copySize>0) {
+        internal::smart_copy(m_values, m_values+copySize, newValues.ptr());
+        internal::smart_copy(m_indices, m_indices+copySize, newIndices.ptr());
+      }
+      std::swap(m_values,newValues.ptr());
+      std::swap(m_indices,newIndices.ptr());
       m_allocatedSize = size;
     }
 
   protected:
     Scalar* m_values;
-    Index* m_indices;
-    size_t m_size;
-    size_t m_allocatedSize;
+    StorageIndex* m_indices;
+    Index m_size;
+    Index m_allocatedSize;
 
 };
 
diff --git a/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h b/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h
index 5c320e2d2..492eb0a29 100644
--- a/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h
+++ b/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -15,27 +15,31 @@ namespace Eigen {
 namespace internal {
 
 template<typename Lhs, typename Rhs, typename ResultType>
-static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& rhs, ResultType& res)
+static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& rhs, ResultType& res, bool sortedInsertion = false)
 {
   typedef typename remove_all<Lhs>::type::Scalar Scalar;
-  typedef typename remove_all<Lhs>::type::Index Index;
 
   // make sure to call innerSize/outerSize since we fake the storage order.
   Index rows = lhs.innerSize();
   Index cols = rhs.outerSize();
   eigen_assert(lhs.outerSize() == rhs.innerSize());
-
-  std::vector<bool> mask(rows,false);
-  Matrix<Scalar,Dynamic,1> values(rows);
-  Matrix<Index,Dynamic,1>  indices(rows);
-
+  
+  ei_declare_aligned_stack_constructed_variable(bool,   mask,     rows, 0);
+  ei_declare_aligned_stack_constructed_variable(Scalar, values,   rows, 0);
+  ei_declare_aligned_stack_constructed_variable(Index,  indices,  rows, 0);
+  
+  std::memset(mask,0,sizeof(bool)*rows);
+
+  evaluator<Lhs> lhsEval(lhs);
+  evaluator<Rhs> rhsEval(rhs);
+  
   // estimate the number of non zero entries
   // given a rhs column containing Y non zeros, we assume that the respective Y columns
   // of the lhs differs in average of one non zeros, thus the number of non zeros for
   // the product of a rhs column with the lhs is X+Y where X is the average number of non zero
   // per column of the lhs.
   // Therefore, we have nnz(lhs*rhs) = nnz(lhs) + nnz(rhs)
-  Index estimated_nnz_prod = lhs.nonZeros() + rhs.nonZeros();
+  Index estimated_nnz_prod = lhsEval.nonZerosEstimate() + rhsEval.nonZerosEstimate();
 
   res.setZero();
   res.reserve(Index(estimated_nnz_prod));
@@ -45,11 +49,11 @@ static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& r
 
     res.startVec(j);
     Index nnz = 0;
-    for (typename Rhs::InnerIterator rhsIt(rhs, j); rhsIt; ++rhsIt)
+    for (typename evaluator<Rhs>::InnerIterator rhsIt(rhsEval, j); rhsIt; ++rhsIt)
     {
       Scalar y = rhsIt.value();
       Index k = rhsIt.index();
-      for (typename Lhs::InnerIterator lhsIt(lhs, k); lhsIt; ++lhsIt)
+      for (typename evaluator<Lhs>::InnerIterator lhsIt(lhsEval, k); lhsIt; ++lhsIt)
       {
         Index i = lhsIt.index();
         Scalar x = lhsIt.value();
@@ -64,53 +68,51 @@ static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& r
           values[i] += x * y;
       }
     }
-
-    // unordered insertion
-    for(Index k=0; k<nnz; ++k)
-    {
-      Index i = indices[k];
-      res.insertBackByOuterInnerUnordered(j,i) = values[i];
-      mask[i] = false;
-    }
-
-#if 0
-    // alternative ordered insertion code:
-
-    Index t200 = rows/(log2(200)*1.39);
-    Index t = (rows*100)/139;
-
-    // FIXME reserve nnz non zeros
-    // FIXME implement fast sort algorithms for very small nnz
-    // if the result is sparse enough => use a quick sort
-    // otherwise => loop through the entire vector
-    // In order to avoid to perform an expensive log2 when the
-    // result is clearly very sparse we use a linear bound up to 200.
-    //if((nnz<200 && nnz<t200) || nnz * log2(nnz) < t)
-    //res.startVec(j);
-    if(true)
+    if(!sortedInsertion)
     {
-      if(nnz>1) std::sort(indices.data(),indices.data()+nnz);
+      // unordered insertion
       for(Index k=0; k<nnz; ++k)
       {
         Index i = indices[k];
-        res.insertBackByOuterInner(j,i) = values[i];
+        res.insertBackByOuterInnerUnordered(j,i) = values[i];
         mask[i] = false;
       }
     }
     else
     {
-      // dense path
-      for(Index i=0; i<rows; ++i)
+      // alternative ordered insertion code:
+      const Index t200 = rows/11; // 11 == (log2(200)*1.39)
+      const Index t = (rows*100)/139;
+
+      // FIXME reserve nnz non zeros
+      // FIXME implement faster sorting algorithms for very small nnz
+      // if the result is sparse enough => use a quick sort
+      // otherwise => loop through the entire vector
+      // In order to avoid to perform an expensive log2 when the
+      // result is clearly very sparse we use a linear bound up to 200.
+      if((nnz<200 && nnz<t200) || nnz * numext::log2(int(nnz)) < t)
       {
-        if(mask[i])
+        if(nnz>1) std::sort(indices,indices+nnz);
+        for(Index k=0; k<nnz; ++k)
         {
-          mask[i] = false;
+          Index i = indices[k];
           res.insertBackByOuterInner(j,i) = values[i];
+          mask[i] = false;
+        }
+      }
+      else
+      {
+        // dense path
+        for(Index i=0; i<rows; ++i)
+        {
+          if(mask[i])
+          {
+            mask[i] = false;
+            res.insertBackByOuterInner(j,i) = values[i];
+          }
         }
       }
     }
-#endif
-
   }
   res.finalize();
 }
@@ -134,13 +136,28 @@ struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,ColMajor,C
 
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
   {
-    typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::Index> RowMajorMatrix;
-    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::Index> ColMajorMatrix;
-    ColMajorMatrix resCol(lhs.rows(),rhs.cols());
-    internal::conservative_sparse_sparse_product_impl<Lhs,Rhs,ColMajorMatrix>(lhs, rhs, resCol);
-    // sort the non zeros:
-    RowMajorMatrix resRow(resCol);
-    res = resRow;
+    typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorMatrix;
+    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorMatrixAux;
+    typedef typename sparse_eval<ColMajorMatrixAux,ResultType::RowsAtCompileTime,ResultType::ColsAtCompileTime,ColMajorMatrixAux::Flags>::type ColMajorMatrix;
+    
+    // If the result is tall and thin (in the extreme case a column vector)
+    // then it is faster to sort the coefficients inplace instead of transposing twice.
+    // FIXME, the following heuristic is probably not very good.
+    if(lhs.rows()>rhs.cols())
+    {
+      ColMajorMatrix resCol(lhs.rows(),rhs.cols());
+      // perform sorted insertion
+      internal::conservative_sparse_sparse_product_impl<Lhs,Rhs,ColMajorMatrix>(lhs, rhs, resCol, true);
+      res = resCol.markAsRValue();
+    }
+    else
+    {
+      ColMajorMatrixAux resCol(lhs.rows(),rhs.cols());
+      // ressort to transpose to sort the entries
+      internal::conservative_sparse_sparse_product_impl<Lhs,Rhs,ColMajorMatrixAux>(lhs, rhs, resCol, false);
+      RowMajorMatrix resRow(resCol);
+      res = resRow.markAsRValue();
+    }
   }
 };
 
@@ -149,7 +166,7 @@ struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,RowMajor,C
 {
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
   {
-     typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::Index> RowMajorMatrix;
+     typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorMatrix;
      RowMajorMatrix rhsRow = rhs;
      RowMajorMatrix resRow(lhs.rows(), rhs.cols());
      internal::conservative_sparse_sparse_product_impl<RowMajorMatrix,Lhs,RowMajorMatrix>(rhsRow, lhs, resRow);
@@ -162,7 +179,7 @@ struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,ColMajor,R
 {
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
   {
-    typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::Index> RowMajorMatrix;
+    typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorMatrix;
     RowMajorMatrix lhsRow = lhs;
     RowMajorMatrix resRow(lhs.rows(), rhs.cols());
     internal::conservative_sparse_sparse_product_impl<Rhs,RowMajorMatrix,RowMajorMatrix>(rhs, lhsRow, resRow);
@@ -175,7 +192,7 @@ struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,RowMajor,R
 {
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
   {
-    typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::Index> RowMajorMatrix;
+    typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorMatrix;
     RowMajorMatrix resRow(lhs.rows(), rhs.cols());
     internal::conservative_sparse_sparse_product_impl<Rhs,Lhs,RowMajorMatrix>(rhs, lhs, resRow);
     res = resRow;
@@ -190,7 +207,7 @@ struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,ColMajor,C
 
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
   {
-    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::Index> ColMajorMatrix;
+    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorMatrix;
     ColMajorMatrix resCol(lhs.rows(), rhs.cols());
     internal::conservative_sparse_sparse_product_impl<Lhs,Rhs,ColMajorMatrix>(lhs, rhs, resCol);
     res = resCol;
@@ -202,7 +219,7 @@ struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,RowMajor,C
 {
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
   {
-    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::Index> ColMajorMatrix;
+    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorMatrix;
     ColMajorMatrix lhsCol = lhs;
     ColMajorMatrix resCol(lhs.rows(), rhs.cols());
     internal::conservative_sparse_sparse_product_impl<ColMajorMatrix,Rhs,ColMajorMatrix>(lhsCol, rhs, resCol);
@@ -215,7 +232,7 @@ struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,ColMajor,R
 {
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
   {
-    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::Index> ColMajorMatrix;
+    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorMatrix;
     ColMajorMatrix rhsCol = rhs;
     ColMajorMatrix resCol(lhs.rows(), rhs.cols());
     internal::conservative_sparse_sparse_product_impl<Lhs,ColMajorMatrix,ColMajorMatrix>(lhs, rhsCol, resCol);
@@ -228,8 +245,8 @@ struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,RowMajor,R
 {
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
   {
-    typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::Index> RowMajorMatrix;
-    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::Index> ColMajorMatrix;
+    typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorMatrix;
+    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorMatrix;
     RowMajorMatrix resRow(lhs.rows(),rhs.cols());
     internal::conservative_sparse_sparse_product_impl<Rhs,Lhs,RowMajorMatrix>(rhs, lhs, resRow);
     // sort the non zeros:
@@ -240,6 +257,89 @@ struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,RowMajor,R
 
 } // end namespace internal
 
+
+namespace internal {
+
+template<typename Lhs, typename Rhs, typename ResultType>
+static void sparse_sparse_to_dense_product_impl(const Lhs& lhs, const Rhs& rhs, ResultType& res)
+{
+  typedef typename remove_all<Lhs>::type::Scalar Scalar;
+  Index cols = rhs.outerSize();
+  eigen_assert(lhs.outerSize() == rhs.innerSize());
+
+  evaluator<Lhs> lhsEval(lhs);
+  evaluator<Rhs> rhsEval(rhs);
+
+  for (Index j=0; j<cols; ++j)
+  {
+    for (typename evaluator<Rhs>::InnerIterator rhsIt(rhsEval, j); rhsIt; ++rhsIt)
+    {
+      Scalar y = rhsIt.value();
+      Index k = rhsIt.index();
+      for (typename evaluator<Lhs>::InnerIterator lhsIt(lhsEval, k); lhsIt; ++lhsIt)
+      {
+        Index i = lhsIt.index();
+        Scalar x = lhsIt.value();
+        res.coeffRef(i,j) += x * y;
+      }
+    }
+  }
+}
+
+
+} // end namespace internal
+
+namespace internal {
+
+template<typename Lhs, typename Rhs, typename ResultType,
+  int LhsStorageOrder = (traits<Lhs>::Flags&RowMajorBit) ? RowMajor : ColMajor,
+  int RhsStorageOrder = (traits<Rhs>::Flags&RowMajorBit) ? RowMajor : ColMajor>
+struct sparse_sparse_to_dense_product_selector;
+
+template<typename Lhs, typename Rhs, typename ResultType>
+struct sparse_sparse_to_dense_product_selector<Lhs,Rhs,ResultType,ColMajor,ColMajor>
+{
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
+  {
+    internal::sparse_sparse_to_dense_product_impl<Lhs,Rhs,ResultType>(lhs, rhs, res);
+  }
+};
+
+template<typename Lhs, typename Rhs, typename ResultType>
+struct sparse_sparse_to_dense_product_selector<Lhs,Rhs,ResultType,RowMajor,ColMajor>
+{
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
+  {
+    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorMatrix;
+    ColMajorMatrix lhsCol(lhs);
+    internal::sparse_sparse_to_dense_product_impl<ColMajorMatrix,Rhs,ResultType>(lhsCol, rhs, res);
+  }
+};
+
+template<typename Lhs, typename Rhs, typename ResultType>
+struct sparse_sparse_to_dense_product_selector<Lhs,Rhs,ResultType,ColMajor,RowMajor>
+{
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
+  {
+    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorMatrix;
+    ColMajorMatrix rhsCol(rhs);
+    internal::sparse_sparse_to_dense_product_impl<Lhs,ColMajorMatrix,ResultType>(lhs, rhsCol, res);
+  }
+};
+
+template<typename Lhs, typename Rhs, typename ResultType>
+struct sparse_sparse_to_dense_product_selector<Lhs,Rhs,ResultType,RowMajor,RowMajor>
+{
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
+  {
+    Transpose<ResultType> trRes(res);
+    internal::sparse_sparse_to_dense_product_impl<Rhs,Lhs,Transpose<ResultType> >(rhs, lhs, trRes);
+  }
+};
+
+
+} // end namespace internal
+
 } // end namespace Eigen
 
 #endif // EIGEN_CONSERVATIVESPARSESPARSEPRODUCT_H
diff --git a/Eigen/src/SparseCore/MappedSparseMatrix.h b/Eigen/src/SparseCore/MappedSparseMatrix.h
index ab1a266a9..67718c85b 100644
--- a/Eigen/src/SparseCore/MappedSparseMatrix.h
+++ b/Eigen/src/SparseCore/MappedSparseMatrix.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,9 +10,10 @@
 #ifndef EIGEN_MAPPED_SPARSEMATRIX_H
 #define EIGEN_MAPPED_SPARSEMATRIX_H
 
-namespace Eigen { 
+namespace Eigen {
 
-/** \class MappedSparseMatrix
+/** \deprecated Use Map<SparseMatrix<> >
+  * \class MappedSparseMatrix
   *
   * \brief Sparse matrix
   *
@@ -22,160 +23,45 @@ namespace Eigen {
   *
   */
 namespace internal {
-template<typename _Scalar, int _Flags, typename _Index>
-struct traits<MappedSparseMatrix<_Scalar, _Flags, _Index> > : traits<SparseMatrix<_Scalar, _Flags, _Index> >
+template<typename _Scalar, int _Flags, typename _StorageIndex>
+struct traits<MappedSparseMatrix<_Scalar, _Flags, _StorageIndex> > : traits<SparseMatrix<_Scalar, _Flags, _StorageIndex> >
 {};
-}
+} // end namespace internal
 
-template<typename _Scalar, int _Flags, typename _Index>
+template<typename _Scalar, int _Flags, typename _StorageIndex>
 class MappedSparseMatrix
-  : public SparseMatrixBase<MappedSparseMatrix<_Scalar, _Flags, _Index> >
+  : public Map<SparseMatrix<_Scalar, _Flags, _StorageIndex> >
 {
-  public:
-    EIGEN_SPARSE_PUBLIC_INTERFACE(MappedSparseMatrix)
-    enum { IsRowMajor = Base::IsRowMajor };
-
-  protected:
-
-    Index   m_outerSize;
-    Index   m_innerSize;
-    Index   m_nnz;
-    Index*  m_outerIndex;
-    Index*  m_innerIndices;
-    Scalar* m_values;
+    typedef Map<SparseMatrix<_Scalar, _Flags, _StorageIndex> > Base;
 
   public:
-
-    inline Index rows() const { return IsRowMajor ? m_outerSize : m_innerSize; }
-    inline Index cols() const { return IsRowMajor ? m_innerSize : m_outerSize; }
-    inline Index innerSize() const { return m_innerSize; }
-    inline Index outerSize() const { return m_outerSize; }
     
-    bool isCompressed() const { return true; }
-
-    //----------------------------------------
-    // direct access interface
-    inline const Scalar* valuePtr() const { return m_values; }
-    inline Scalar* valuePtr() { return m_values; }
-
-    inline const Index* innerIndexPtr() const { return m_innerIndices; }
-    inline Index* innerIndexPtr() { return m_innerIndices; }
-
-    inline const Index* outerIndexPtr() const { return m_outerIndex; }
-    inline Index* outerIndexPtr() { return m_outerIndex; }
-    //----------------------------------------
-
-    inline Scalar coeff(Index row, Index col) const
-    {
-      const Index outer = IsRowMajor ? row : col;
-      const Index inner = IsRowMajor ? col : row;
+    typedef typename Base::StorageIndex StorageIndex;
+    typedef typename Base::Scalar Scalar;
 
-      Index start = m_outerIndex[outer];
-      Index end = m_outerIndex[outer+1];
-      if (start==end)
-        return Scalar(0);
-      else if (end>0 && inner==m_innerIndices[end-1])
-        return m_values[end-1];
-      // ^^  optimization: let's first check if it is the last coefficient
-      // (very common in high level algorithms)
-
-      const Index* r = std::lower_bound(&m_innerIndices[start],&m_innerIndices[end-1],inner);
-      const Index id = r-&m_innerIndices[0];
-      return ((*r==inner) && (id<end)) ? m_values[id] : Scalar(0);
-    }
-
-    inline Scalar& coeffRef(Index row, Index col)
-    {
-      const Index outer = IsRowMajor ? row : col;
-      const Index inner = IsRowMajor ? col : row;
-
-      Index start = m_outerIndex[outer];
-      Index end = m_outerIndex[outer+1];
-      eigen_assert(end>=start && "you probably called coeffRef on a non finalized matrix");
-      eigen_assert(end>start && "coeffRef cannot be called on a zero coefficient");
-      Index* r = std::lower_bound(&m_innerIndices[start],&m_innerIndices[end],inner);
-      const Index id = r-&m_innerIndices[0];
-      eigen_assert((*r==inner) && (id<end) && "coeffRef cannot be called on a zero coefficient");
-      return m_values[id];
-    }
-
-    class InnerIterator;
-    class ReverseInnerIterator;
-
-    /** \returns the number of non zero coefficients */
-    inline Index nonZeros() const  { return m_nnz; }
-
-    inline MappedSparseMatrix(Index rows, Index cols, Index nnz, Index* outerIndexPtr, Index* innerIndexPtr, Scalar* valuePtr)
-      : m_outerSize(IsRowMajor?rows:cols), m_innerSize(IsRowMajor?cols:rows), m_nnz(nnz), m_outerIndex(outerIndexPtr),
-        m_innerIndices(innerIndexPtr), m_values(valuePtr)
+    inline MappedSparseMatrix(Index rows, Index cols, Index nnz, StorageIndex* outerIndexPtr, StorageIndex* innerIndexPtr, Scalar* valuePtr, StorageIndex* innerNonZeroPtr = 0)
+      : Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZeroPtr)
     {}
 
     /** Empty destructor */
     inline ~MappedSparseMatrix() {}
 };
 
-template<typename Scalar, int _Flags, typename _Index>
-class MappedSparseMatrix<Scalar,_Flags,_Index>::InnerIterator
-{
-  public:
-    InnerIterator(const MappedSparseMatrix& mat, Index outer)
-      : m_matrix(mat),
-        m_outer(outer),
-        m_id(mat.outerIndexPtr()[outer]),
-        m_start(m_id),
-        m_end(mat.outerIndexPtr()[outer+1])
-    {}
-
-    inline InnerIterator& operator++() { m_id++; return *this; }
-
-    inline Scalar value() const { return m_matrix.valuePtr()[m_id]; }
-    inline Scalar& valueRef() { return const_cast<Scalar&>(m_matrix.valuePtr()[m_id]); }
-
-    inline Index index() const { return m_matrix.innerIndexPtr()[m_id]; }
-    inline Index row() const { return IsRowMajor ? m_outer : index(); }
-    inline Index col() const { return IsRowMajor ? index() : m_outer; }
-
-    inline operator bool() const { return (m_id < m_end) && (m_id>=m_start); }
-
-  protected:
-    const MappedSparseMatrix& m_matrix;
-    const Index m_outer;
-    Index m_id;
-    const Index m_start;
-    const Index m_end;
-};
+namespace internal {
 
-template<typename Scalar, int _Flags, typename _Index>
-class MappedSparseMatrix<Scalar,_Flags,_Index>::ReverseInnerIterator
+template<typename _Scalar, int _Options, typename _StorageIndex>
+struct evaluator<MappedSparseMatrix<_Scalar,_Options,_StorageIndex> >
+  : evaluator<SparseCompressedBase<MappedSparseMatrix<_Scalar,_Options,_StorageIndex> > >
 {
-  public:
-    ReverseInnerIterator(const MappedSparseMatrix& mat, Index outer)
-      : m_matrix(mat),
-        m_outer(outer),
-        m_id(mat.outerIndexPtr()[outer+1]),
-        m_start(mat.outerIndexPtr()[outer]),
-        m_end(m_id)
-    {}
-
-    inline ReverseInnerIterator& operator--() { m_id--; return *this; }
-
-    inline Scalar value() const { return m_matrix.valuePtr()[m_id-1]; }
-    inline Scalar& valueRef() { return const_cast<Scalar&>(m_matrix.valuePtr()[m_id-1]); }
-
-    inline Index index() const { return m_matrix.innerIndexPtr()[m_id-1]; }
-    inline Index row() const { return IsRowMajor ? m_outer : index(); }
-    inline Index col() const { return IsRowMajor ? index() : m_outer; }
-
-    inline operator bool() const { return (m_id <= m_end) && (m_id>m_start); }
-
-  protected:
-    const MappedSparseMatrix& m_matrix;
-    const Index m_outer;
-    Index m_id;
-    const Index m_start;
-    const Index m_end;
+  typedef MappedSparseMatrix<_Scalar,_Options,_StorageIndex> XprType;
+  typedef evaluator<SparseCompressedBase<XprType> > Base;
+  
+  evaluator() : Base() {}
+  explicit evaluator(const XprType &mat) : Base(mat) {}
 };
 
+}
+
 } // end namespace Eigen
 
 #endif // EIGEN_MAPPED_SPARSEMATRIX_H
diff --git a/Eigen/src/SparseCore/SparseAssign.h b/Eigen/src/SparseCore/SparseAssign.h
new file mode 100644
index 000000000..18352a847
--- /dev/null
+++ b/Eigen/src/SparseCore/SparseAssign.h
@@ -0,0 +1,216 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSEASSIGN_H
+#define EIGEN_SPARSEASSIGN_H
+
+namespace Eigen { 
+
+template<typename Derived>    
+template<typename OtherDerived>
+Derived& SparseMatrixBase<Derived>::operator=(const EigenBase<OtherDerived> &other)
+{
+  internal::call_assignment_no_alias(derived(), other.derived());
+  return derived();
+}
+
+template<typename Derived>
+template<typename OtherDerived>
+Derived& SparseMatrixBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other)
+{
+  // TODO use the evaluator mechanism
+  other.evalTo(derived());
+  return derived();
+}
+
+template<typename Derived>
+template<typename OtherDerived>
+inline Derived& SparseMatrixBase<Derived>::operator=(const SparseMatrixBase<OtherDerived>& other)
+{
+  // by default sparse evaluation do not alias, so we can safely bypass the generic call_assignment routine
+  internal::Assignment<Derived,OtherDerived,internal::assign_op<Scalar,typename OtherDerived::Scalar> >
+          ::run(derived(), other.derived(), internal::assign_op<Scalar,typename OtherDerived::Scalar>());
+  return derived();
+}
+
+template<typename Derived>
+inline Derived& SparseMatrixBase<Derived>::operator=(const Derived& other)
+{
+  internal::call_assignment_no_alias(derived(), other.derived());
+  return derived();
+}
+
+namespace internal {
+
+template<>
+struct storage_kind_to_evaluator_kind<Sparse> {
+  typedef IteratorBased Kind;
+};
+
+template<>
+struct storage_kind_to_shape<Sparse> {
+  typedef SparseShape Shape;
+};
+
+struct Sparse2Sparse {};
+struct Sparse2Dense  {};
+
+template<> struct AssignmentKind<SparseShape, SparseShape>           { typedef Sparse2Sparse Kind; };
+template<> struct AssignmentKind<SparseShape, SparseTriangularShape> { typedef Sparse2Sparse Kind; };
+template<> struct AssignmentKind<DenseShape,  SparseShape>           { typedef Sparse2Dense  Kind; };
+template<> struct AssignmentKind<DenseShape,  SparseTriangularShape> { typedef Sparse2Dense  Kind; };
+
+
+template<typename DstXprType, typename SrcXprType>
+void assign_sparse_to_sparse(DstXprType &dst, const SrcXprType &src)
+{
+  typedef typename DstXprType::Scalar Scalar;
+  typedef internal::evaluator<DstXprType> DstEvaluatorType;
+  typedef internal::evaluator<SrcXprType> SrcEvaluatorType;
+
+  SrcEvaluatorType srcEvaluator(src);
+
+  const bool transpose = (DstEvaluatorType::Flags & RowMajorBit) != (SrcEvaluatorType::Flags & RowMajorBit);
+  const Index outerEvaluationSize = (SrcEvaluatorType::Flags&RowMajorBit) ? src.rows() : src.cols();
+  if ((!transpose) && src.isRValue())
+  {
+    // eval without temporary
+    dst.resize(src.rows(), src.cols());
+    dst.setZero();
+    dst.reserve((std::max)(src.rows(),src.cols())*2);
+    for (Index j=0; j<outerEvaluationSize; ++j)
+    {
+      dst.startVec(j);
+      for (typename SrcEvaluatorType::InnerIterator it(srcEvaluator, j); it; ++it)
+      {
+        Scalar v = it.value();
+        dst.insertBackByOuterInner(j,it.index()) = v;
+      }
+    }
+    dst.finalize();
+  }
+  else
+  {
+    // eval through a temporary
+    eigen_assert(( ((internal::traits<DstXprType>::SupportedAccessPatterns & OuterRandomAccessPattern)==OuterRandomAccessPattern) ||
+              (!((DstEvaluatorType::Flags & RowMajorBit) != (SrcEvaluatorType::Flags & RowMajorBit)))) &&
+              "the transpose operation is supposed to be handled in SparseMatrix::operator=");
+
+    enum { Flip = (DstEvaluatorType::Flags & RowMajorBit) != (SrcEvaluatorType::Flags & RowMajorBit) };
+
+    
+    DstXprType temp(src.rows(), src.cols());
+
+    temp.reserve((std::max)(src.rows(),src.cols())*2);
+    for (Index j=0; j<outerEvaluationSize; ++j)
+    {
+      temp.startVec(j);
+      for (typename SrcEvaluatorType::InnerIterator it(srcEvaluator, j); it; ++it)
+      {
+        Scalar v = it.value();
+        temp.insertBackByOuterInner(Flip?it.index():j,Flip?j:it.index()) = v;
+      }
+    }
+    temp.finalize();
+
+    dst = temp.markAsRValue();
+  }
+}
+
+// Generic Sparse to Sparse assignment
+template< typename DstXprType, typename SrcXprType, typename Functor>
+struct Assignment<DstXprType, SrcXprType, Functor, Sparse2Sparse>
+{
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
+  {
+    assign_sparse_to_sparse(dst.derived(), src.derived());
+  }
+};
+
+// Generic Sparse to Dense assignment
+template< typename DstXprType, typename SrcXprType, typename Functor>
+struct Assignment<DstXprType, SrcXprType, Functor, Sparse2Dense>
+{
+  static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
+  {
+    if(internal::is_same<Functor,internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> >::value)
+      dst.setZero();
+    
+    internal::evaluator<SrcXprType> srcEval(src);
+    resize_if_allowed(dst, src, func);
+    internal::evaluator<DstXprType> dstEval(dst);
+    
+    const Index outerEvaluationSize = (internal::evaluator<SrcXprType>::Flags&RowMajorBit) ? src.rows() : src.cols();
+    for (Index j=0; j<outerEvaluationSize; ++j)
+      for (typename internal::evaluator<SrcXprType>::InnerIterator i(srcEval,j); i; ++i)
+        func.assignCoeff(dstEval.coeffRef(i.row(),i.col()), i.value());
+  }
+};
+
+// Specialization for "dst = dec.solve(rhs)"
+// NOTE we need to specialize it for Sparse2Sparse to avoid ambiguous specialization error
+template<typename DstXprType, typename DecType, typename RhsType, typename Scalar>
+struct Assignment<DstXprType, Solve<DecType,RhsType>, internal::assign_op<Scalar,Scalar>, Sparse2Sparse>
+{
+  typedef Solve<DecType,RhsType> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
+  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
+
+    src.dec()._solve_impl(src.rhs(), dst);
+  }
+};
+
+struct Diagonal2Sparse {};
+
+template<> struct AssignmentKind<SparseShape,DiagonalShape> { typedef Diagonal2Sparse Kind; };
+
+template< typename DstXprType, typename SrcXprType, typename Functor>
+struct Assignment<DstXprType, SrcXprType, Functor, Diagonal2Sparse>
+{
+  typedef typename DstXprType::StorageIndex StorageIndex;
+  typedef typename DstXprType::Scalar Scalar;
+  typedef Array<StorageIndex,Dynamic,1> ArrayXI;
+  typedef Array<Scalar,Dynamic,1> ArrayXS;
+  template<int Options>
+  static void run(SparseMatrix<Scalar,Options,StorageIndex> &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
+  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
+
+    Index size = src.diagonal().size();
+    dst.makeCompressed();
+    dst.resizeNonZeros(size);
+    Map<ArrayXI>(dst.innerIndexPtr(), size).setLinSpaced(0,StorageIndex(size)-1);
+    Map<ArrayXI>(dst.outerIndexPtr(), size+1).setLinSpaced(0,StorageIndex(size));
+    Map<ArrayXS>(dst.valuePtr(), size) = src.diagonal();
+  }
+  
+  template<typename DstDerived>
+  static void run(SparseMatrixBase<DstDerived> &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
+  {
+    dst.diagonal() = src.diagonal();
+  }
+  
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
+  { dst.diagonal() += src.diagonal(); }
+  
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
+  { dst.diagonal() -= src.diagonal(); }
+};
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPARSEASSIGN_H
diff --git a/Eigen/src/SparseCore/SparseBlock.h b/Eigen/src/SparseCore/SparseBlock.h
index 0c90bafbe..511e92b2f 100644
--- a/Eigen/src/SparseCore/SparseBlock.h
+++ b/Eigen/src/SparseCore/SparseBlock.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,8 +10,9 @@
 #ifndef EIGEN_SPARSE_BLOCK_H
 #define EIGEN_SPARSE_BLOCK_H
 
-namespace Eigen { 
+namespace Eigen {
 
+// Subset of columns or rows
 template<typename XprType, int BlockRows, int BlockCols>
 class BlockImpl<XprType,BlockRows,BlockCols,true,Sparse>
   : public SparseMatrixBase<Block<XprType,BlockRows,BlockCols,true> >
@@ -22,185 +23,189 @@ public:
     enum { IsRowMajor = internal::traits<BlockType>::IsRowMajor };
 protected:
     enum { OuterSize = IsRowMajor ? BlockRows : BlockCols };
+    typedef SparseMatrixBase<BlockType> Base;
+    using Base::convert_index;
 public:
     EIGEN_SPARSE_PUBLIC_INTERFACE(BlockType)
-    
-    class InnerIterator: public XprType::InnerIterator
-    {
-        typedef typename BlockImpl::Index Index;
-      public:
-        inline InnerIterator(const BlockType& xpr, Index outer)
-          : XprType::InnerIterator(xpr.m_matrix, xpr.m_outerStart + outer), m_outer(outer)
-        {}
-        inline Index row() const { return IsRowMajor ? m_outer : this->index(); }
-        inline Index col() const { return IsRowMajor ? this->index() : m_outer; }
-      protected:
-        Index m_outer;
-    };
-    class ReverseInnerIterator: public XprType::ReverseInnerIterator
-    {
-        typedef typename BlockImpl::Index Index;
-      public:
-        inline ReverseInnerIterator(const BlockType& xpr, Index outer)
-          : XprType::ReverseInnerIterator(xpr.m_matrix, xpr.m_outerStart + outer), m_outer(outer)
-        {}
-        inline Index row() const { return IsRowMajor ? m_outer : this->index(); }
-        inline Index col() const { return IsRowMajor ? this->index() : m_outer; }
-      protected:
-        Index m_outer;
-    };
 
-    inline BlockImpl(const XprType& xpr, int i)
-      : m_matrix(xpr), m_outerStart(i), m_outerSize(OuterSize)
+    inline BlockImpl(XprType& xpr, Index i)
+      : m_matrix(xpr), m_outerStart(convert_index(i)), m_outerSize(OuterSize)
     {}
 
-    inline BlockImpl(const XprType& xpr, int startRow, int startCol, int blockRows, int blockCols)
-      : m_matrix(xpr), m_outerStart(IsRowMajor ? startRow : startCol), m_outerSize(IsRowMajor ? blockRows : blockCols)
+    inline BlockImpl(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols)
+      : m_matrix(xpr), m_outerStart(convert_index(IsRowMajor ? startRow : startCol)), m_outerSize(convert_index(IsRowMajor ? blockRows : blockCols))
     {}
-    
-    inline const Scalar coeff(int row, int col) const
+
+    EIGEN_STRONG_INLINE Index rows() const { return IsRowMajor ? m_outerSize.value() : m_matrix.rows(); }
+    EIGEN_STRONG_INLINE Index cols() const { return IsRowMajor ? m_matrix.cols() : m_outerSize.value(); }
+
+    Index nonZeros() const
     {
-      return m_matrix.coeff(row + IsRowMajor ? m_outerStart : 0, col +IsRowMajor ? 0 :  m_outerStart);
+      typedef internal::evaluator<XprType> EvaluatorType;
+      EvaluatorType matEval(m_matrix);
+      Index nnz = 0;
+      Index end = m_outerStart + m_outerSize.value();
+      for(Index j=m_outerStart; j<end; ++j)
+        for(typename EvaluatorType::InnerIterator it(matEval, j); it; ++it)
+          ++nnz;
+      return nnz;
     }
-    
-    inline const Scalar coeff(int index) const
+
+    inline const Scalar coeff(Index row, Index col) const
+    {
+      return m_matrix.coeff(row + (IsRowMajor ? m_outerStart : 0), col + (IsRowMajor ? 0 :  m_outerStart));
+    }
+
+    inline const Scalar coeff(Index index) const
     {
       return m_matrix.coeff(IsRowMajor ? m_outerStart : index, IsRowMajor ? index :  m_outerStart);
     }
 
-    EIGEN_STRONG_INLINE Index rows() const { return IsRowMajor ? m_outerSize.value() : m_matrix.rows(); }
-    EIGEN_STRONG_INLINE Index cols() const { return IsRowMajor ? m_matrix.cols() : m_outerSize.value(); }
+    inline const XprType& nestedExpression() const { return m_matrix; }
+    inline XprType& nestedExpression() { return m_matrix; }
+    Index startRow() const { return IsRowMajor ? m_outerStart : 0; }
+    Index startCol() const { return IsRowMajor ? 0 : m_outerStart; }
+    Index blockRows() const { return IsRowMajor ? m_outerSize.value() : m_matrix.rows(); }
+    Index blockCols() const { return IsRowMajor ? m_matrix.cols() : m_outerSize.value(); }
 
   protected:
 
-    typename XprType::Nested m_matrix;
+    typename internal::ref_selector<XprType>::non_const_type m_matrix;
     Index m_outerStart;
     const internal::variable_if_dynamic<Index, OuterSize> m_outerSize;
 
-    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl)
-  private:
-    Index nonZeros() const;
+  protected:
+    // Disable assignment with clear error message.
+    // Note that simply removing operator= yields compilation errors with ICC+MSVC
+    template<typename T>
+    BlockImpl& operator=(const T&)
+    {
+      EIGEN_STATIC_ASSERT(sizeof(T)==0, THIS_SPARSE_BLOCK_SUBEXPRESSION_IS_READ_ONLY);
+      return *this;
+    }
 };
 
 
 /***************************************************************************
-* specialisation for SparseMatrix
+* specialization for SparseMatrix
 ***************************************************************************/
 
-template<typename _Scalar, int _Options, typename _Index, int BlockRows, int BlockCols>
-class BlockImpl<SparseMatrix<_Scalar, _Options, _Index>,BlockRows,BlockCols,true,Sparse>
-  : public SparseMatrixBase<Block<SparseMatrix<_Scalar, _Options, _Index>,BlockRows,BlockCols,true> >
+namespace internal {
+
+template<typename SparseMatrixType, int BlockRows, int BlockCols>
+class sparse_matrix_block_impl
+  : public SparseCompressedBase<Block<SparseMatrixType,BlockRows,BlockCols,true> >
 {
-    typedef SparseMatrix<_Scalar, _Options, _Index> SparseMatrixType;
     typedef typename internal::remove_all<typename SparseMatrixType::Nested>::type _MatrixTypeNested;
     typedef Block<SparseMatrixType, BlockRows, BlockCols, true> BlockType;
-    typedef Block<const SparseMatrixType, BlockRows, BlockCols, true> ConstBlockType;
+    typedef SparseCompressedBase<Block<SparseMatrixType,BlockRows,BlockCols,true> > Base;
+    using Base::convert_index;
 public:
     enum { IsRowMajor = internal::traits<BlockType>::IsRowMajor };
     EIGEN_SPARSE_PUBLIC_INTERFACE(BlockType)
 protected:
+    typedef typename Base::IndexVector IndexVector;
     enum { OuterSize = IsRowMajor ? BlockRows : BlockCols };
 public:
-    
-    class InnerIterator: public SparseMatrixType::InnerIterator
-    {
-      public:
-        inline InnerIterator(const BlockType& xpr, Index outer)
-          : SparseMatrixType::InnerIterator(xpr.m_matrix, xpr.m_outerStart + outer), m_outer(outer)
-        {}
-        inline Index row() const { return IsRowMajor ? m_outer : this->index(); }
-        inline Index col() const { return IsRowMajor ? this->index() : m_outer; }
-      protected:
-        Index m_outer;
-    };
-    class ReverseInnerIterator: public SparseMatrixType::ReverseInnerIterator
-    {
-      public:
-        inline ReverseInnerIterator(const BlockType& xpr, Index outer)
-          : SparseMatrixType::ReverseInnerIterator(xpr.m_matrix, xpr.m_outerStart + outer), m_outer(outer)
-        {}
-        inline Index row() const { return IsRowMajor ? m_outer : this->index(); }
-        inline Index col() const { return IsRowMajor ? this->index() : m_outer; }
-      protected:
-        Index m_outer;
-    };
 
-    inline BlockImpl(const SparseMatrixType& xpr, int i)
-      : m_matrix(xpr), m_outerStart(i), m_outerSize(OuterSize)
+    inline sparse_matrix_block_impl(SparseMatrixType& xpr, Index i)
+      : m_matrix(xpr), m_outerStart(convert_index(i)), m_outerSize(OuterSize)
     {}
 
-    inline BlockImpl(const SparseMatrixType& xpr, int startRow, int startCol, int blockRows, int blockCols)
-      : m_matrix(xpr), m_outerStart(IsRowMajor ? startRow : startCol), m_outerSize(IsRowMajor ? blockRows : blockCols)
+    inline sparse_matrix_block_impl(SparseMatrixType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols)
+      : m_matrix(xpr), m_outerStart(convert_index(IsRowMajor ? startRow : startCol)), m_outerSize(convert_index(IsRowMajor ? blockRows : blockCols))
     {}
 
     template<typename OtherDerived>
     inline BlockType& operator=(const SparseMatrixBase<OtherDerived>& other)
     {
       typedef typename internal::remove_all<typename SparseMatrixType::Nested>::type _NestedMatrixType;
-      _NestedMatrixType& matrix = const_cast<_NestedMatrixType&>(m_matrix);;
-      // This assignement is slow if this vector set is not empty
+      _NestedMatrixType& matrix = m_matrix;
+      // This assignment is slow if this vector set is not empty
       // and/or it is not at the end of the nonzeros of the underlying matrix.
 
       // 1 - eval to a temporary to avoid transposition and/or aliasing issues
-      SparseMatrix<Scalar, IsRowMajor ? RowMajor : ColMajor, Index> tmp(other);
+      Ref<const SparseMatrix<Scalar, IsRowMajor ? RowMajor : ColMajor, StorageIndex> > tmp(other.derived());
+      eigen_internal_assert(tmp.outerSize()==m_outerSize.value());
 
       // 2 - let's check whether there is enough allocated memory
       Index nnz           = tmp.nonZeros();
-      Index start         = m_outerStart==0 ? 0 : matrix.outerIndexPtr()[m_outerStart]; // starting position of the current block
-      Index end           = m_matrix.outerIndexPtr()[m_outerStart+m_outerSize.value()]; // ending posiiton of the current block
+      Index start         = m_outerStart==0 ? 0 : m_matrix.outerIndexPtr()[m_outerStart]; // starting position of the current block
+      Index end           = m_matrix.outerIndexPtr()[m_outerStart+m_outerSize.value()]; // ending position of the current block
       Index block_size    = end - start;                                                // available room in the current block
       Index tail_size     = m_matrix.outerIndexPtr()[m_matrix.outerSize()] - end;
-      
+
       Index free_size     = m_matrix.isCompressed()
                           ? Index(matrix.data().allocatedSize()) + block_size
                           : block_size;
 
-      if(nnz>free_size) 
+      Index tmp_start = tmp.outerIndexPtr()[0];
+
+      bool update_trailing_pointers = false;
+      if(nnz>free_size)
       {
         // realloc manually to reduce copies
         typename SparseMatrixType::Storage newdata(m_matrix.data().allocatedSize() - block_size + nnz);
 
-        std::memcpy(&newdata.value(0), &m_matrix.data().value(0), start*sizeof(Scalar));
-        std::memcpy(&newdata.index(0), &m_matrix.data().index(0), start*sizeof(Index));
+        internal::smart_copy(m_matrix.valuePtr(),       m_matrix.valuePtr() + start,      newdata.valuePtr());
+        internal::smart_copy(m_matrix.innerIndexPtr(),  m_matrix.innerIndexPtr() + start, newdata.indexPtr());
 
-        std::memcpy(&newdata.value(start), &tmp.data().value(0), nnz*sizeof(Scalar));
-        std::memcpy(&newdata.index(start), &tmp.data().index(0), nnz*sizeof(Index));
+        internal::smart_copy(tmp.valuePtr() + tmp_start,      tmp.valuePtr() + tmp_start + nnz,       newdata.valuePtr() + start);
+        internal::smart_copy(tmp.innerIndexPtr() + tmp_start, tmp.innerIndexPtr() + tmp_start + nnz,  newdata.indexPtr() + start);
+
+        internal::smart_copy(matrix.valuePtr()+end,       matrix.valuePtr()+end + tail_size,      newdata.valuePtr()+start+nnz);
+        internal::smart_copy(matrix.innerIndexPtr()+end,  matrix.innerIndexPtr()+end + tail_size, newdata.indexPtr()+start+nnz);
 
-        std::memcpy(&newdata.value(start+nnz), &matrix.data().value(end), tail_size*sizeof(Scalar));
-        std::memcpy(&newdata.index(start+nnz), &matrix.data().index(end), tail_size*sizeof(Index));
-        
         newdata.resize(m_matrix.outerIndexPtr()[m_matrix.outerSize()] - block_size + nnz);
 
         matrix.data().swap(newdata);
+
+        update_trailing_pointers = true;
       }
       else
       {
-        // no need to realloc, simply copy the tail at its respective position and insert tmp
-        matrix.data().resize(start + nnz + tail_size);
+        if(m_matrix.isCompressed())
+        {
+          // no need to realloc, simply copy the tail at its respective position and insert tmp
+          matrix.data().resize(start + nnz + tail_size);
 
-        std::memmove(&matrix.data().value(start+nnz), &matrix.data().value(end), tail_size*sizeof(Scalar));
-        std::memmove(&matrix.data().index(start+nnz), &matrix.data().index(end), tail_size*sizeof(Index));
+          internal::smart_memmove(matrix.valuePtr()+end,      matrix.valuePtr() + end+tail_size,      matrix.valuePtr() + start+nnz);
+          internal::smart_memmove(matrix.innerIndexPtr()+end, matrix.innerIndexPtr() + end+tail_size, matrix.innerIndexPtr() + start+nnz);
 
-        std::memcpy(&matrix.data().value(start), &tmp.data().value(0), nnz*sizeof(Scalar));
-        std::memcpy(&matrix.data().index(start), &tmp.data().index(0), nnz*sizeof(Index));
+          update_trailing_pointers = true;
+        }
+
+        internal::smart_copy(tmp.valuePtr() + tmp_start,      tmp.valuePtr() + tmp_start + nnz,       matrix.valuePtr() + start);
+        internal::smart_copy(tmp.innerIndexPtr() + tmp_start, tmp.innerIndexPtr() + tmp_start + nnz,  matrix.innerIndexPtr() + start);
       }
-      
-      // update innerNonZeros
-      if(!m_matrix.isCompressed())
-        for(Index j=0; j<m_outerSize.value(); ++j)
-          matrix.innerNonZeroPtr()[m_outerStart+j] = tmp.innerVector(j).nonZeros();
-
-      // update outer index pointers
-      Index p = start;
-      for(Index k=0; k<m_outerSize.value(); ++k)
+
+      // update outer index pointers and innerNonZeros
+      if(IsVectorAtCompileTime)
       {
-        matrix.outerIndexPtr()[m_outerStart+k] = p;
-        p += tmp.innerVector(k).nonZeros();
+        if(!m_matrix.isCompressed())
+          matrix.innerNonZeroPtr()[m_outerStart] = StorageIndex(nnz);
+        matrix.outerIndexPtr()[m_outerStart] = StorageIndex(start);
       }
-      std::ptrdiff_t offset = nnz - block_size;
-      for(Index k = m_outerStart + m_outerSize.value(); k<=matrix.outerSize(); ++k)
+      else
       {
-        matrix.outerIndexPtr()[k] += offset;
+        StorageIndex p = StorageIndex(start);
+        for(Index k=0; k<m_outerSize.value(); ++k)
+        {
+          StorageIndex nnz_k = internal::convert_index<StorageIndex>(tmp.innerVector(k).nonZeros());
+          if(!m_matrix.isCompressed())
+            matrix.innerNonZeroPtr()[m_outerStart+k] = nnz_k;
+          matrix.outerIndexPtr()[m_outerStart+k] = p;
+          p += nnz_k;
+        }
+      }
+
+      if(update_trailing_pointers)
+      {
+        StorageIndex offset = internal::convert_index<StorageIndex>(nnz - block_size);
+        for(Index k = m_outerStart + m_outerSize.value(); k<=matrix.outerSize(); ++k)
+        {
+          matrix.outerIndexPtr()[k] += offset;
+        }
       }
 
       return derived();
@@ -212,50 +217,46 @@ public:
     }
 
     inline const Scalar* valuePtr() const
-    { return m_matrix.valuePtr() + m_matrix.outerIndexPtr()[m_outerStart]; }
+    { return m_matrix.valuePtr(); }
     inline Scalar* valuePtr()
-    { return m_matrix.const_cast_derived().valuePtr() + m_matrix.outerIndexPtr()[m_outerStart]; }
+    { return m_matrix.valuePtr(); }
 
-    inline const Index* innerIndexPtr() const
-    { return m_matrix.innerIndexPtr() + m_matrix.outerIndexPtr()[m_outerStart]; }
-    inline Index* innerIndexPtr()
-    { return m_matrix.const_cast_derived().innerIndexPtr() + m_matrix.outerIndexPtr()[m_outerStart]; }
+    inline const StorageIndex* innerIndexPtr() const
+    { return m_matrix.innerIndexPtr(); }
+    inline StorageIndex* innerIndexPtr()
+    { return m_matrix.innerIndexPtr(); }
 
-    inline const Index* outerIndexPtr() const
+    inline const StorageIndex* outerIndexPtr() const
+    { return m_matrix.outerIndexPtr() + m_outerStart; }
+    inline StorageIndex* outerIndexPtr()
     { return m_matrix.outerIndexPtr() + m_outerStart; }
-    inline Index* outerIndexPtr()
-    { return m_matrix.const_cast_derived().outerIndexPtr() + m_outerStart; }
 
-    Index nonZeros() const
-    {
-      if(m_matrix.isCompressed())
-        return  std::size_t(m_matrix.outerIndexPtr()[m_outerStart+m_outerSize.value()])
-              - std::size_t(m_matrix.outerIndexPtr()[m_outerStart]);
-      else if(m_outerSize.value()==0)
-        return 0;
-      else
-        return Map<const Matrix<Index,OuterSize,1> >(m_matrix.innerNonZeroPtr()+m_outerStart, m_outerSize.value()).sum();
-    }
-    
-    inline Scalar& coeffRef(int row, int col)
+    inline const StorageIndex* innerNonZeroPtr() const
+    { return isCompressed() ? 0 : (m_matrix.innerNonZeroPtr()+m_outerStart); }
+    inline StorageIndex* innerNonZeroPtr()
+    { return isCompressed() ? 0 : (m_matrix.innerNonZeroPtr()+m_outerStart); }
+
+    bool isCompressed() const { return m_matrix.innerNonZeroPtr()==0; }
+
+    inline Scalar& coeffRef(Index row, Index col)
     {
-      return m_matrix.const_cast_derived().coeffRef(row + (IsRowMajor ? m_outerStart : 0), col + (IsRowMajor ? 0 :  m_outerStart));
+      return m_matrix.coeffRef(row + (IsRowMajor ? m_outerStart : 0), col + (IsRowMajor ? 0 :  m_outerStart));
     }
-    
-    inline const Scalar coeff(int row, int col) const
+
+    inline const Scalar coeff(Index row, Index col) const
     {
       return m_matrix.coeff(row + (IsRowMajor ? m_outerStart : 0), col + (IsRowMajor ? 0 :  m_outerStart));
     }
-    
-    inline const Scalar coeff(int index) const
+
+    inline const Scalar coeff(Index index) const
     {
       return m_matrix.coeff(IsRowMajor ? m_outerStart : index, IsRowMajor ? index :  m_outerStart);
     }
 
     const Scalar& lastCoeff() const
     {
-      EIGEN_STATIC_ASSERT_VECTOR_ONLY(BlockImpl);
-      eigen_assert(nonZeros()>0);
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(sparse_matrix_block_impl);
+      eigen_assert(Base::nonZeros()>0);
       if(m_matrix.isCompressed())
         return m_matrix.valuePtr()[m_matrix.outerIndexPtr()[m_outerStart+1]-1];
       else
@@ -265,109 +266,62 @@ public:
     EIGEN_STRONG_INLINE Index rows() const { return IsRowMajor ? m_outerSize.value() : m_matrix.rows(); }
     EIGEN_STRONG_INLINE Index cols() const { return IsRowMajor ? m_matrix.cols() : m_outerSize.value(); }
 
+    inline const SparseMatrixType& nestedExpression() const { return m_matrix; }
+    inline SparseMatrixType& nestedExpression() { return m_matrix; }
+    Index startRow() const { return IsRowMajor ? m_outerStart : 0; }
+    Index startCol() const { return IsRowMajor ? 0 : m_outerStart; }
+    Index blockRows() const { return IsRowMajor ? m_outerSize.value() : m_matrix.rows(); }
+    Index blockCols() const { return IsRowMajor ? m_matrix.cols() : m_outerSize.value(); }
+
   protected:
 
-    typename SparseMatrixType::Nested m_matrix;
+    typename internal::ref_selector<SparseMatrixType>::non_const_type m_matrix;
     Index m_outerStart;
     const internal::variable_if_dynamic<Index, OuterSize> m_outerSize;
 
 };
 
+} // namespace internal
 
-template<typename _Scalar, int _Options, typename _Index, int BlockRows, int BlockCols>
-class BlockImpl<const SparseMatrix<_Scalar, _Options, _Index>,BlockRows,BlockCols,true,Sparse>
-  : public SparseMatrixBase<Block<const SparseMatrix<_Scalar, _Options, _Index>,BlockRows,BlockCols,true> >
+template<typename _Scalar, int _Options, typename _StorageIndex, int BlockRows, int BlockCols>
+class BlockImpl<SparseMatrix<_Scalar, _Options, _StorageIndex>,BlockRows,BlockCols,true,Sparse>
+  : public internal::sparse_matrix_block_impl<SparseMatrix<_Scalar, _Options, _StorageIndex>,BlockRows,BlockCols>
 {
-    typedef SparseMatrix<_Scalar, _Options, _Index> SparseMatrixType;
-    typedef typename internal::remove_all<typename SparseMatrixType::Nested>::type _MatrixTypeNested;
-    typedef Block<const SparseMatrixType, BlockRows, BlockCols, true> BlockType;
 public:
-    enum { IsRowMajor = internal::traits<BlockType>::IsRowMajor };
-    EIGEN_SPARSE_PUBLIC_INTERFACE(BlockType)
-protected:
-    enum { OuterSize = IsRowMajor ? BlockRows : BlockCols };
-public:
-    
-    class InnerIterator: public SparseMatrixType::InnerIterator
-    {
-      public:
-        inline InnerIterator(const BlockType& xpr, Index outer)
-          : SparseMatrixType::InnerIterator(xpr.m_matrix, xpr.m_outerStart + outer), m_outer(outer)
-        {}
-        inline Index row() const { return IsRowMajor ? m_outer : this->index(); }
-        inline Index col() const { return IsRowMajor ? this->index() : m_outer; }
-      protected:
-        Index m_outer;
-    };
-    class ReverseInnerIterator: public SparseMatrixType::ReverseInnerIterator
-    {
-      public:
-        inline ReverseInnerIterator(const BlockType& xpr, Index outer)
-          : SparseMatrixType::ReverseInnerIterator(xpr.m_matrix, xpr.m_outerStart + outer), m_outer(outer)
-        {}
-        inline Index row() const { return IsRowMajor ? m_outer : this->index(); }
-        inline Index col() const { return IsRowMajor ? this->index() : m_outer; }
-      protected:
-        Index m_outer;
-    };
-
-    inline BlockImpl(const SparseMatrixType& xpr, int i)
-      : m_matrix(xpr), m_outerStart(i), m_outerSize(OuterSize)
-    {}
-
-    inline BlockImpl(const SparseMatrixType& xpr, int startRow, int startCol, int blockRows, int blockCols)
-      : m_matrix(xpr), m_outerStart(IsRowMajor ? startRow : startCol), m_outerSize(IsRowMajor ? blockRows : blockCols)
-    {}
-
-    inline const Scalar* valuePtr() const
-    { return m_matrix.valuePtr() + m_matrix.outerIndexPtr()[m_outerStart]; }
-
-    inline const Index* innerIndexPtr() const
-    { return m_matrix.innerIndexPtr() + m_matrix.outerIndexPtr()[m_outerStart]; }
-
-    inline const Index* outerIndexPtr() const
-    { return m_matrix.outerIndexPtr() + m_outerStart; }
-
-    Index nonZeros() const
-    {
-      if(m_matrix.isCompressed())
-        return  std::size_t(m_matrix.outerIndexPtr()[m_outerStart+m_outerSize.value()])
-              - std::size_t(m_matrix.outerIndexPtr()[m_outerStart]);
-      else if(m_outerSize.value()==0)
-        return 0;
-      else
-        return Map<const Matrix<Index,OuterSize,1> >(m_matrix.innerNonZeroPtr()+m_outerStart, m_outerSize.value()).sum();
-    }
-    
-    inline const Scalar coeff(int row, int col) const
-    {
-      return m_matrix.coeff(row + (IsRowMajor ? m_outerStart : 0), col + (IsRowMajor ? 0 :  m_outerStart));
-    }
-    
-    inline const Scalar coeff(int index) const
-    {
-      return m_matrix.coeff(IsRowMajor ? m_outerStart : index, IsRowMajor ? index :  m_outerStart);
-    }
-
-    const Scalar& lastCoeff() const
-    {
-      EIGEN_STATIC_ASSERT_VECTOR_ONLY(BlockImpl);
-      eigen_assert(nonZeros()>0);
-      if(m_matrix.isCompressed())
-        return m_matrix.valuePtr()[m_matrix.outerIndexPtr()[m_outerStart+1]-1];
-      else
-        return m_matrix.valuePtr()[m_matrix.outerIndexPtr()[m_outerStart]+m_matrix.innerNonZeroPtr()[m_outerStart]-1];
-    }
-
-    EIGEN_STRONG_INLINE Index rows() const { return IsRowMajor ? m_outerSize.value() : m_matrix.rows(); }
-    EIGEN_STRONG_INLINE Index cols() const { return IsRowMajor ? m_matrix.cols() : m_outerSize.value(); }
-
-  protected:
-
-    typename SparseMatrixType::Nested m_matrix;
-    Index m_outerStart;
-    const internal::variable_if_dynamic<Index, OuterSize> m_outerSize;
+  typedef _StorageIndex StorageIndex;
+  typedef SparseMatrix<_Scalar, _Options, _StorageIndex> SparseMatrixType;
+  typedef internal::sparse_matrix_block_impl<SparseMatrixType,BlockRows,BlockCols> Base;
+  inline BlockImpl(SparseMatrixType& xpr, Index i)
+    : Base(xpr, i)
+  {}
+
+  inline BlockImpl(SparseMatrixType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols)
+    : Base(xpr, startRow, startCol, blockRows, blockCols)
+  {}
+
+  using Base::operator=;
+};
 
+template<typename _Scalar, int _Options, typename _StorageIndex, int BlockRows, int BlockCols>
+class BlockImpl<const SparseMatrix<_Scalar, _Options, _StorageIndex>,BlockRows,BlockCols,true,Sparse>
+  : public internal::sparse_matrix_block_impl<const SparseMatrix<_Scalar, _Options, _StorageIndex>,BlockRows,BlockCols>
+{
+public:
+  typedef _StorageIndex StorageIndex;
+  typedef const SparseMatrix<_Scalar, _Options, _StorageIndex> SparseMatrixType;
+  typedef internal::sparse_matrix_block_impl<SparseMatrixType,BlockRows,BlockCols> Base;
+  inline BlockImpl(SparseMatrixType& xpr, Index i)
+    : Base(xpr, i)
+  {}
+
+  inline BlockImpl(SparseMatrixType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols)
+    : Base(xpr, startRow, startCol, blockRows, blockCols)
+  {}
+
+  using Base::operator=;
+private:
+  template<typename Derived> BlockImpl(const SparseMatrixBase<Derived>& xpr, Index i);
+  template<typename Derived> BlockImpl(const SparseMatrixBase<Derived>& xpr);
 };
 
 //----------
@@ -396,7 +350,7 @@ SparseMatrixBase<Derived>::innerVectors(Index outerStart, Index outerSize)
   return Block<Derived,Dynamic,Dynamic,true>(derived(),
                                              IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart,
                                              IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize);
-  
+
 }
 
 /** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
@@ -409,129 +363,241 @@ SparseMatrixBase<Derived>::innerVectors(Index outerStart, Index outerSize) const
   return Block<const Derived,Dynamic,Dynamic,true>(derived(),
                                                   IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart,
                                                   IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize);
-  
+
 }
 
 /** Generic implementation of sparse Block expression.
-  * Real-only. 
+  * Real-only.
   */
 template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel>
 class BlockImpl<XprType,BlockRows,BlockCols,InnerPanel,Sparse>
   : public SparseMatrixBase<Block<XprType,BlockRows,BlockCols,InnerPanel> >, internal::no_assignment_operator
 {
-  typedef typename internal::remove_all<typename XprType::Nested>::type _MatrixTypeNested;
-  typedef Block<XprType, BlockRows, BlockCols, InnerPanel> BlockType;
+    typedef Block<XprType, BlockRows, BlockCols, InnerPanel> BlockType;
+    typedef SparseMatrixBase<BlockType> Base;
+    using Base::convert_index;
 public:
     enum { IsRowMajor = internal::traits<BlockType>::IsRowMajor };
     EIGEN_SPARSE_PUBLIC_INTERFACE(BlockType)
 
+    typedef typename internal::remove_all<typename XprType::Nested>::type _MatrixTypeNested;
+
     /** Column or Row constructor
       */
-    inline BlockImpl(const XprType& xpr, int i)
+    inline BlockImpl(XprType& xpr, Index i)
       : m_matrix(xpr),
-        m_startRow( (BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) ? i : 0),
-        m_startCol( (BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) ? i : 0),
+        m_startRow( (BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) ? convert_index(i) : 0),
+        m_startCol( (BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) ? convert_index(i) : 0),
         m_blockRows(BlockRows==1 ? 1 : xpr.rows()),
         m_blockCols(BlockCols==1 ? 1 : xpr.cols())
     {}
 
     /** Dynamic-size constructor
       */
-    inline BlockImpl(const XprType& xpr, int startRow, int startCol, int blockRows, int blockCols)
-      : m_matrix(xpr), m_startRow(startRow), m_startCol(startCol), m_blockRows(blockRows), m_blockCols(blockCols)
+    inline BlockImpl(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols)
+      : m_matrix(xpr), m_startRow(convert_index(startRow)), m_startCol(convert_index(startCol)), m_blockRows(convert_index(blockRows)), m_blockCols(convert_index(blockCols))
     {}
 
-    inline int rows() const { return m_blockRows.value(); }
-    inline int cols() const { return m_blockCols.value(); }
+    inline Index rows() const { return m_blockRows.value(); }
+    inline Index cols() const { return m_blockCols.value(); }
 
-    inline Scalar& coeffRef(int row, int col)
+    inline Scalar& coeffRef(Index row, Index col)
     {
-      return m_matrix.const_cast_derived()
-               .coeffRef(row + m_startRow.value(), col + m_startCol.value());
+      return m_matrix.coeffRef(row + m_startRow.value(), col + m_startCol.value());
     }
 
-    inline const Scalar coeff(int row, int col) const
+    inline const Scalar coeff(Index row, Index col) const
     {
       return m_matrix.coeff(row + m_startRow.value(), col + m_startCol.value());
     }
 
-    inline Scalar& coeffRef(int index)
+    inline Scalar& coeffRef(Index index)
     {
-      return m_matrix.const_cast_derived()
-             .coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
-                       m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
+      return m_matrix.coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
+                               m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
     }
 
-    inline const Scalar coeff(int index) const
+    inline const Scalar coeff(Index index) const
     {
-      return m_matrix
-             .coeff(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
-                    m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
+      return m_matrix.coeff(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
+                            m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
     }
-    
-    inline const _MatrixTypeNested& nestedExpression() const { return m_matrix; }
-    
-    class InnerIterator : public _MatrixTypeNested::InnerIterator
-    {
-      typedef typename _MatrixTypeNested::InnerIterator Base;
-      const BlockType& m_block;
-      Index m_end;
-    public:
-
-      EIGEN_STRONG_INLINE InnerIterator(const BlockType& block, Index outer)
-        : Base(block.derived().nestedExpression(), outer + (IsRowMajor ? block.m_startRow.value() : block.m_startCol.value())),
-          m_block(block),
-          m_end(IsRowMajor ? block.m_startCol.value()+block.m_blockCols.value() : block.m_startRow.value()+block.m_blockRows.value())
-      {
-        while( (Base::operator bool()) && (Base::index() < (IsRowMajor ? m_block.m_startCol.value() : m_block.m_startRow.value())) )
-          Base::operator++();
-      }
 
-      inline Index index()  const { return Base::index() - (IsRowMajor ? m_block.m_startCol.value() : m_block.m_startRow.value()); }
-      inline Index outer()  const { return Base::outer() - (IsRowMajor ? m_block.m_startRow.value() : m_block.m_startCol.value()); }
-      inline Index row()    const { return Base::row()   - m_block.m_startRow.value(); }
-      inline Index col()    const { return Base::col()   - m_block.m_startCol.value(); }
-      
-      inline operator bool() const { return Base::operator bool() && Base::index() < m_end; }
-    };
-    class ReverseInnerIterator : public _MatrixTypeNested::ReverseInnerIterator
-    {
-      typedef typename _MatrixTypeNested::ReverseInnerIterator Base;
-      const BlockType& m_block;
-      Index m_begin;
-    public:
-
-      EIGEN_STRONG_INLINE ReverseInnerIterator(const BlockType& block, Index outer)
-        : Base(block.derived().nestedExpression(), outer + (IsRowMajor ? block.m_startRow.value() : block.m_startCol.value())),
-          m_block(block),
-          m_begin(IsRowMajor ? block.m_startCol.value() : block.m_startRow.value())
-      {
-        while( (Base::operator bool()) && (Base::index() >= (IsRowMajor ? m_block.m_startCol.value()+block.m_blockCols.value() : m_block.m_startRow.value()+block.m_blockRows.value())) )
-          Base::operator--();
-      }
+    inline const XprType& nestedExpression() const { return m_matrix; }
+    inline XprType& nestedExpression() { return m_matrix; }
+    Index startRow() const { return m_startRow.value(); }
+    Index startCol() const { return m_startCol.value(); }
+    Index blockRows() const { return m_blockRows.value(); }
+    Index blockCols() const { return m_blockCols.value(); }
 
-      inline Index index()  const { return Base::index() - (IsRowMajor ? m_block.m_startCol.value() : m_block.m_startRow.value()); }
-      inline Index outer()  const { return Base::outer() - (IsRowMajor ? m_block.m_startRow.value() : m_block.m_startCol.value()); }
-      inline Index row()    const { return Base::row()   - m_block.m_startRow.value(); }
-      inline Index col()    const { return Base::col()   - m_block.m_startCol.value(); }
-      
-      inline operator bool() const { return Base::operator bool() && Base::index() >= m_begin; }
-    };
   protected:
-    friend class InnerIterator;
-    friend class ReverseInnerIterator;
-    
-    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl)
+//     friend class internal::GenericSparseBlockInnerIteratorImpl<XprType,BlockRows,BlockCols,InnerPanel>;
+    friend struct internal::unary_evaluator<Block<XprType,BlockRows,BlockCols,InnerPanel>, internal::IteratorBased, Scalar >;
 
-    typename XprType::Nested m_matrix;
+    Index nonZeros() const { return Dynamic; }
+
+    typename internal::ref_selector<XprType>::non_const_type m_matrix;
     const internal::variable_if_dynamic<Index, XprType::RowsAtCompileTime == 1 ? 0 : Dynamic> m_startRow;
     const internal::variable_if_dynamic<Index, XprType::ColsAtCompileTime == 1 ? 0 : Dynamic> m_startCol;
     const internal::variable_if_dynamic<Index, RowsAtCompileTime> m_blockRows;
     const internal::variable_if_dynamic<Index, ColsAtCompileTime> m_blockCols;
 
+  protected:
+    // Disable assignment with clear error message.
+    // Note that simply removing operator= yields compilation errors with ICC+MSVC
+    template<typename T>
+    BlockImpl& operator=(const T&)
+    {
+      EIGEN_STATIC_ASSERT(sizeof(T)==0, THIS_SPARSE_BLOCK_SUBEXPRESSION_IS_READ_ONLY);
+      return *this;
+    }
+
+};
+
+namespace internal {
+
+template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
+struct unary_evaluator<Block<ArgType,BlockRows,BlockCols,InnerPanel>, IteratorBased >
+ : public evaluator_base<Block<ArgType,BlockRows,BlockCols,InnerPanel> >
+{
+    class InnerVectorInnerIterator;
+    class OuterVectorInnerIterator;
+  public:
+    typedef Block<ArgType,BlockRows,BlockCols,InnerPanel> XprType;
+    typedef typename XprType::StorageIndex StorageIndex;
+    typedef typename XprType::Scalar Scalar;
+
+    enum {
+      IsRowMajor = XprType::IsRowMajor,
+
+      OuterVector =  (BlockCols==1 && ArgType::IsRowMajor)
+                    | // FIXME | instead of || to please GCC 4.4.0 stupid warning "suggest parentheses around &&".
+                      // revert to || as soon as not needed anymore.
+                     (BlockRows==1 && !ArgType::IsRowMajor),
+
+      CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
+      Flags = XprType::Flags
+    };
+
+    typedef typename internal::conditional<OuterVector,OuterVectorInnerIterator,InnerVectorInnerIterator>::type InnerIterator;
+
+    explicit unary_evaluator(const XprType& op)
+      : m_argImpl(op.nestedExpression()), m_block(op)
+    {}
+
+    inline Index nonZerosEstimate() const {
+      Index nnz = m_block.nonZeros();
+      if(nnz<0)
+        return m_argImpl.nonZerosEstimate() * m_block.size() / m_block.nestedExpression().size();
+      return nnz;
+    }
+
+  protected:
+    typedef typename evaluator<ArgType>::InnerIterator EvalIterator;
+
+    evaluator<ArgType> m_argImpl;
+    const XprType &m_block;
+};
+
+template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
+class unary_evaluator<Block<ArgType,BlockRows,BlockCols,InnerPanel>, IteratorBased>::InnerVectorInnerIterator
+ : public EvalIterator
+{
+  enum { IsRowMajor = unary_evaluator::IsRowMajor };
+  const XprType& m_block;
+  Index m_end;
+public:
+
+  EIGEN_STRONG_INLINE InnerVectorInnerIterator(const unary_evaluator& aEval, Index outer)
+    : EvalIterator(aEval.m_argImpl, outer + (IsRowMajor ? aEval.m_block.startRow() : aEval.m_block.startCol())),
+      m_block(aEval.m_block),
+      m_end(IsRowMajor ? aEval.m_block.startCol()+aEval.m_block.blockCols() : aEval.m_block.startRow()+aEval.m_block.blockRows())
+  {
+    while( (EvalIterator::operator bool()) && (EvalIterator::index() < (IsRowMajor ? m_block.startCol() : m_block.startRow())) )
+      EvalIterator::operator++();
+  }
+
+  inline StorageIndex index() const { return EvalIterator::index() - convert_index<StorageIndex>(IsRowMajor ? m_block.startCol() : m_block.startRow()); }
+  inline Index outer()  const { return EvalIterator::outer() - (IsRowMajor ? m_block.startRow() : m_block.startCol()); }
+  inline Index row()    const { return EvalIterator::row()   - m_block.startRow(); }
+  inline Index col()    const { return EvalIterator::col()   - m_block.startCol(); }
+
+  inline operator bool() const { return EvalIterator::operator bool() && EvalIterator::index() < m_end; }
+};
+
+template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
+class unary_evaluator<Block<ArgType,BlockRows,BlockCols,InnerPanel>, IteratorBased>::OuterVectorInnerIterator
+{
+  enum { IsRowMajor = unary_evaluator::IsRowMajor };
+  const unary_evaluator& m_eval;
+  Index m_outerPos;
+  const Index m_innerIndex;
+  Index m_end;
+  EvalIterator m_it;
+public:
+
+  EIGEN_STRONG_INLINE OuterVectorInnerIterator(const unary_evaluator& aEval, Index outer)
+    : m_eval(aEval),
+      m_outerPos( (IsRowMajor ? aEval.m_block.startCol() : aEval.m_block.startRow()) ),
+      m_innerIndex(IsRowMajor ? aEval.m_block.startRow() : aEval.m_block.startCol()),
+      m_end(IsRowMajor ? aEval.m_block.startCol()+aEval.m_block.blockCols() : aEval.m_block.startRow()+aEval.m_block.blockRows()),
+      m_it(m_eval.m_argImpl, m_outerPos)
+  {
+    EIGEN_UNUSED_VARIABLE(outer);
+    eigen_assert(outer==0);
+
+    while(m_it && m_it.index() < m_innerIndex) ++m_it;
+    if((!m_it) || (m_it.index()!=m_innerIndex))
+      ++(*this);
+  }
+
+  inline StorageIndex index() const { return convert_index<StorageIndex>(m_outerPos - (IsRowMajor ? m_eval.m_block.startCol() : m_eval.m_block.startRow())); }
+  inline Index outer()  const { return 0; }
+  inline Index row()    const { return IsRowMajor ? 0 : index(); }
+  inline Index col()    const { return IsRowMajor ? index() : 0; }
+
+  inline Scalar value() const { return m_it.value(); }
+  inline Scalar& valueRef() { return m_it.valueRef(); }
+
+  inline OuterVectorInnerIterator& operator++()
+  {
+    // search next non-zero entry
+    while(++m_outerPos<m_end)
+    {
+      // Restart iterator at the next inner-vector:
+      m_it.~EvalIterator();
+      ::new (&m_it) EvalIterator(m_eval.m_argImpl, m_outerPos);
+      // search for the key m_innerIndex in the current outer-vector
+      while(m_it && m_it.index() < m_innerIndex) ++m_it;
+      if(m_it && m_it.index()==m_innerIndex) break;
+    }
+    return *this;
+  }
+
+  inline operator bool() const { return m_outerPos < m_end; }
 };
 
+template<typename _Scalar, int _Options, typename _StorageIndex, int BlockRows, int BlockCols>
+struct unary_evaluator<Block<SparseMatrix<_Scalar, _Options, _StorageIndex>,BlockRows,BlockCols,true>, IteratorBased>
+  : evaluator<SparseCompressedBase<Block<SparseMatrix<_Scalar, _Options, _StorageIndex>,BlockRows,BlockCols,true> > >
+{
+  typedef Block<SparseMatrix<_Scalar, _Options, _StorageIndex>,BlockRows,BlockCols,true> XprType;
+  typedef evaluator<SparseCompressedBase<XprType> > Base;
+  explicit unary_evaluator(const XprType &xpr) : Base(xpr) {}
+};
+
+template<typename _Scalar, int _Options, typename _StorageIndex, int BlockRows, int BlockCols>
+struct unary_evaluator<Block<const SparseMatrix<_Scalar, _Options, _StorageIndex>,BlockRows,BlockCols,true>, IteratorBased>
+  : evaluator<SparseCompressedBase<Block<const SparseMatrix<_Scalar, _Options, _StorageIndex>,BlockRows,BlockCols,true> > >
+{
+  typedef Block<const SparseMatrix<_Scalar, _Options, _StorageIndex>,BlockRows,BlockCols,true> XprType;
+  typedef evaluator<SparseCompressedBase<XprType> > Base;
+  explicit unary_evaluator(const XprType &xpr) : Base(xpr) {}
+};
+
+} // end namespace internal
+
+
 } // end namespace Eigen
 
 #endif // EIGEN_SPARSE_BLOCK_H
-
diff --git a/Eigen/src/SparseCore/SparseColEtree.h b/Eigen/src/SparseCore/SparseColEtree.h
index f8745f461..ebe02d1ab 100644
--- a/Eigen/src/SparseCore/SparseColEtree.h
+++ b/Eigen/src/SparseCore/SparseColEtree.h
@@ -58,30 +58,29 @@ Index etree_find (Index i, IndexVector& pp)
   * \param perm The permutation to apply to the column of \b mat
   */
 template <typename MatrixType, typename IndexVector>
-int coletree(const MatrixType& mat, IndexVector& parent, IndexVector& firstRowElt, typename MatrixType::Index *perm=0)
+int coletree(const MatrixType& mat, IndexVector& parent, IndexVector& firstRowElt, typename MatrixType::StorageIndex *perm=0)
 {
-  typedef typename MatrixType::Index Index;
-  Index nc = mat.cols(); // Number of columns 
-  Index m = mat.rows();
-  Index diagSize = (std::min)(nc,m);
+  typedef typename MatrixType::StorageIndex StorageIndex;
+  StorageIndex nc = convert_index<StorageIndex>(mat.cols()); // Number of columns
+  StorageIndex m = convert_index<StorageIndex>(mat.rows());
+  StorageIndex diagSize = (std::min)(nc,m);
   IndexVector root(nc); // root of subtree of etree 
   root.setZero();
   IndexVector pp(nc); // disjoint sets 
   pp.setZero(); // Initialize disjoint sets 
   parent.resize(mat.cols());
   //Compute first nonzero column in each row 
-  Index row,col; 
   firstRowElt.resize(m);
   firstRowElt.setConstant(nc);
   firstRowElt.segment(0, diagSize).setLinSpaced(diagSize, 0, diagSize-1);
   bool found_diag;
-  for (col = 0; col < nc; col++)
+  for (StorageIndex col = 0; col < nc; col++)
   {
-    Index pcol = col;
+    StorageIndex pcol = col;
     if(perm) pcol  = perm[col];
     for (typename MatrixType::InnerIterator it(mat, pcol); it; ++it)
     { 
-      row = it.row();
+      Index row = it.row();
       firstRowElt(row) = (std::min)(firstRowElt(row), col);
     }
   }
@@ -89,8 +88,8 @@ int coletree(const MatrixType& mat, IndexVector& parent, IndexVector& firstRowEl
           except use (firstRowElt[r],c) in place of an edge (r,c) of A.
     Thus each row clique in A'*A is replaced by a star
     centered at its first vertex, which has the same fill. */
-  Index rset, cset, rroot; 
-  for (col = 0; col < nc; col++) 
+  StorageIndex rset, cset, rroot;
+  for (StorageIndex col = 0; col < nc; col++) 
   {
     found_diag = col>=m;
     pp(col) = col; 
@@ -99,7 +98,7 @@ int coletree(const MatrixType& mat, IndexVector& parent, IndexVector& firstRowEl
     parent(col) = nc; 
     /* The diagonal element is treated here even if it does not exist in the matrix
      * hence the loop is executed once more */ 
-    Index pcol = col;
+    StorageIndex pcol = col;
     if(perm) pcol  = perm[col];
     for (typename MatrixType::InnerIterator it(mat, pcol); it||!found_diag; ++it)
     { //  A sequence of interleaved find and union is performed 
@@ -107,7 +106,7 @@ int coletree(const MatrixType& mat, IndexVector& parent, IndexVector& firstRowEl
       if(it) i = it.index();
       if (i == col) found_diag = true;
       
-      row = firstRowElt(i);
+      StorageIndex row = firstRowElt(i);
       if (row >= col) continue; 
       rset = internal::etree_find(row, pp); // Find the name of the set containing row
       rroot = root(rset);
@@ -127,10 +126,11 @@ int coletree(const MatrixType& mat, IndexVector& parent, IndexVector& firstRowEl
   * Depth-first search from vertex n.  No recursion.
   * This routine was contributed by Cédric Doucet, CEDRAT Group, Meylan, France.
 */
-template <typename Index, typename IndexVector>
-void nr_etdfs (Index n, IndexVector& parent, IndexVector& first_kid, IndexVector& next_kid, IndexVector& post, Index postnum)
+template <typename IndexVector>
+void nr_etdfs (typename IndexVector::Scalar n, IndexVector& parent, IndexVector& first_kid, IndexVector& next_kid, IndexVector& post, typename IndexVector::Scalar postnum)
 {
-  Index current = n, first, next;
+  typedef typename IndexVector::Scalar StorageIndex;
+  StorageIndex current = n, first, next;
   while (postnum != n) 
   {
     // No kid for the current node
@@ -174,22 +174,22 @@ void nr_etdfs (Index n, IndexVector& parent, IndexVector& first_kid, IndexVector
   * \param parent Input tree
   * \param post postordered tree
   */
-template <typename Index, typename IndexVector>
-void treePostorder(Index n, IndexVector& parent, IndexVector& post)
+template <typename IndexVector>
+void treePostorder(typename IndexVector::Scalar n, IndexVector& parent, IndexVector& post)
 {
+  typedef typename IndexVector::Scalar StorageIndex;
   IndexVector first_kid, next_kid; // Linked list of children 
-  Index postnum; 
+  StorageIndex postnum; 
   // Allocate storage for working arrays and results 
   first_kid.resize(n+1); 
   next_kid.setZero(n+1);
   post.setZero(n+1);
   
   // Set up structure describing children
-  Index v, dad; 
   first_kid.setConstant(-1); 
-  for (v = n-1; v >= 0; v--) 
+  for (StorageIndex v = n-1; v >= 0; v--) 
   {
-    dad = parent(v);
+    StorageIndex dad = parent(v);
     next_kid(v) = first_kid(dad); 
     first_kid(dad) = v; 
   }
diff --git a/Eigen/src/SparseCore/SparseCompressedBase.h b/Eigen/src/SparseCore/SparseCompressedBase.h
new file mode 100644
index 000000000..5ccb46656
--- /dev/null
+++ b/Eigen/src/SparseCore/SparseCompressedBase.h
@@ -0,0 +1,341 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSE_COMPRESSED_BASE_H
+#define EIGEN_SPARSE_COMPRESSED_BASE_H
+
+namespace Eigen { 
+
+template<typename Derived> class SparseCompressedBase;
+  
+namespace internal {
+
+template<typename Derived>
+struct traits<SparseCompressedBase<Derived> > : traits<Derived>
+{};
+
+} // end namespace internal
+
+/** \ingroup SparseCore_Module
+  * \class SparseCompressedBase
+  * \brief Common base class for sparse [compressed]-{row|column}-storage format.
+  *
+  * This class defines the common interface for all derived classes implementing the compressed sparse storage format, such as:
+  *  - SparseMatrix
+  *  - Ref<SparseMatrixType,Options>
+  *  - Map<SparseMatrixType>
+  *
+  */
+template<typename Derived>
+class SparseCompressedBase
+  : public SparseMatrixBase<Derived>
+{
+  public:
+    typedef SparseMatrixBase<Derived> Base;
+    EIGEN_SPARSE_PUBLIC_INTERFACE(SparseCompressedBase)
+    using Base::operator=;
+    using Base::IsRowMajor;
+    
+    class InnerIterator;
+    class ReverseInnerIterator;
+    
+  protected:
+    typedef typename Base::IndexVector IndexVector;
+    Eigen::Map<IndexVector> innerNonZeros() { return Eigen::Map<IndexVector>(innerNonZeroPtr(), isCompressed()?0:derived().outerSize()); }
+    const  Eigen::Map<const IndexVector> innerNonZeros() const { return Eigen::Map<const IndexVector>(innerNonZeroPtr(), isCompressed()?0:derived().outerSize()); }
+        
+  public:
+    
+    /** \returns the number of non zero coefficients */
+    inline Index nonZeros() const
+    {
+      if(Derived::IsVectorAtCompileTime && outerIndexPtr()==0)
+        return derived().nonZeros();
+      else if(isCompressed())
+        return outerIndexPtr()[derived().outerSize()]-outerIndexPtr()[0];
+      else if(derived().outerSize()==0)
+        return 0;
+      else
+        return innerNonZeros().sum();
+    }
+    
+    /** \returns a const pointer to the array of values.
+      * This function is aimed at interoperability with other libraries.
+      * \sa innerIndexPtr(), outerIndexPtr() */
+    inline const Scalar* valuePtr() const { return derived().valuePtr(); }
+    /** \returns a non-const pointer to the array of values.
+      * This function is aimed at interoperability with other libraries.
+      * \sa innerIndexPtr(), outerIndexPtr() */
+    inline Scalar* valuePtr() { return derived().valuePtr(); }
+
+    /** \returns a const pointer to the array of inner indices.
+      * This function is aimed at interoperability with other libraries.
+      * \sa valuePtr(), outerIndexPtr() */
+    inline const StorageIndex* innerIndexPtr() const { return derived().innerIndexPtr(); }
+    /** \returns a non-const pointer to the array of inner indices.
+      * This function is aimed at interoperability with other libraries.
+      * \sa valuePtr(), outerIndexPtr() */
+    inline StorageIndex* innerIndexPtr() { return derived().innerIndexPtr(); }
+
+    /** \returns a const pointer to the array of the starting positions of the inner vectors.
+      * This function is aimed at interoperability with other libraries.
+      * \warning it returns the null pointer 0 for SparseVector
+      * \sa valuePtr(), innerIndexPtr() */
+    inline const StorageIndex* outerIndexPtr() const { return derived().outerIndexPtr(); }
+    /** \returns a non-const pointer to the array of the starting positions of the inner vectors.
+      * This function is aimed at interoperability with other libraries.
+      * \warning it returns the null pointer 0 for SparseVector
+      * \sa valuePtr(), innerIndexPtr() */
+    inline StorageIndex* outerIndexPtr() { return derived().outerIndexPtr(); }
+
+    /** \returns a const pointer to the array of the number of non zeros of the inner vectors.
+      * This function is aimed at interoperability with other libraries.
+      * \warning it returns the null pointer 0 in compressed mode */
+    inline const StorageIndex* innerNonZeroPtr() const { return derived().innerNonZeroPtr(); }
+    /** \returns a non-const pointer to the array of the number of non zeros of the inner vectors.
+      * This function is aimed at interoperability with other libraries.
+      * \warning it returns the null pointer 0 in compressed mode */
+    inline StorageIndex* innerNonZeroPtr() { return derived().innerNonZeroPtr(); }
+    
+    /** \returns whether \c *this is in compressed form. */
+    inline bool isCompressed() const { return innerNonZeroPtr()==0; }
+
+    /** \returns a read-only view of the stored coefficients as a 1D array expression.
+      *
+      * \warning this method is for \b compressed \b storage \b only, and it will trigger an assertion otherwise.
+      *
+      * \sa valuePtr(), isCompressed() */
+    const Map<const Array<Scalar,Dynamic,1> > coeffs() const { eigen_assert(isCompressed()); return Array<Scalar,Dynamic,1>::Map(valuePtr(),nonZeros()); }
+
+    /** \returns a read-write view of the stored coefficients as a 1D array expression
+      *
+      * \warning this method is for \b compressed \b storage \b only, and it will trigger an assertion otherwise.
+      *
+      * Here is an example:
+      * \include SparseMatrix_coeffs.cpp
+      * and the output is:
+      * \include SparseMatrix_coeffs.out
+      *
+      * \sa valuePtr(), isCompressed() */
+    Map<Array<Scalar,Dynamic,1> > coeffs() { eigen_assert(isCompressed()); return Array<Scalar,Dynamic,1>::Map(valuePtr(),nonZeros()); }
+
+  protected:
+    /** Default constructor. Do nothing. */
+    SparseCompressedBase() {}
+  private:
+    template<typename OtherDerived> explicit SparseCompressedBase(const SparseCompressedBase<OtherDerived>&);
+};
+
+template<typename Derived>
+class SparseCompressedBase<Derived>::InnerIterator
+{
+  public:
+    InnerIterator()
+      : m_values(0), m_indices(0), m_outer(0), m_id(0), m_end(0)
+    {}
+
+    InnerIterator(const InnerIterator& other)
+      : m_values(other.m_values), m_indices(other.m_indices), m_outer(other.m_outer), m_id(other.m_id), m_end(other.m_end)
+    {}
+
+    InnerIterator& operator=(const InnerIterator& other)
+    {
+      m_values = other.m_values;
+      m_indices = other.m_indices;
+      const_cast<OuterType&>(m_outer).setValue(other.m_outer.value());
+      m_id = other.m_id;
+      m_end = other.m_end;
+      return *this;
+    }
+
+    InnerIterator(const SparseCompressedBase& mat, Index outer)
+      : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer)
+    {
+      if(Derived::IsVectorAtCompileTime && mat.outerIndexPtr()==0)
+      {
+        m_id = 0;
+        m_end = mat.nonZeros();
+      }
+      else
+      {
+        m_id = mat.outerIndexPtr()[outer];
+        if(mat.isCompressed())
+          m_end = mat.outerIndexPtr()[outer+1];
+        else
+          m_end = m_id + mat.innerNonZeroPtr()[outer];
+      }
+    }
+
+    explicit InnerIterator(const SparseCompressedBase& mat)
+      : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(0), m_id(0), m_end(mat.nonZeros())
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
+    }
+
+    explicit InnerIterator(const internal::CompressedStorage<Scalar,StorageIndex>& data)
+      : m_values(data.valuePtr()), m_indices(data.indexPtr()), m_outer(0), m_id(0), m_end(data.size())
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
+    }
+
+    inline InnerIterator& operator++() { m_id++; return *this; }
+
+    inline const Scalar& value() const { return m_values[m_id]; }
+    inline Scalar& valueRef() { return const_cast<Scalar&>(m_values[m_id]); }
+
+    inline StorageIndex index() const { return m_indices[m_id]; }
+    inline Index outer() const { return m_outer.value(); }
+    inline Index row() const { return IsRowMajor ? m_outer.value() : index(); }
+    inline Index col() const { return IsRowMajor ? index() : m_outer.value(); }
+
+    inline operator bool() const { return (m_id < m_end); }
+
+  protected:
+    const Scalar* m_values;
+    const StorageIndex* m_indices;
+    typedef internal::variable_if_dynamic<Index,Derived::IsVectorAtCompileTime?0:Dynamic> OuterType;
+    const OuterType m_outer;
+    Index m_id;
+    Index m_end;
+  private:
+    // If you get here, then you're not using the right InnerIterator type, e.g.:
+    //   SparseMatrix<double,RowMajor> A;
+    //   SparseMatrix<double>::InnerIterator it(A,0);
+    template<typename T> InnerIterator(const SparseMatrixBase<T>&, Index outer);
+};
+
+template<typename Derived>
+class SparseCompressedBase<Derived>::ReverseInnerIterator
+{
+  public:
+    ReverseInnerIterator(const SparseCompressedBase& mat, Index outer)
+      : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer)
+    {
+      if(Derived::IsVectorAtCompileTime && mat.outerIndexPtr()==0)
+      {
+        m_start = 0;
+        m_id = mat.nonZeros();
+      }
+      else
+      {
+        m_start = mat.outerIndexPtr()[outer];
+        if(mat.isCompressed())
+          m_id = mat.outerIndexPtr()[outer+1];
+        else
+          m_id = m_start + mat.innerNonZeroPtr()[outer];
+      }
+    }
+
+    explicit ReverseInnerIterator(const SparseCompressedBase& mat)
+      : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(0), m_start(0), m_id(mat.nonZeros())
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
+    }
+
+    explicit ReverseInnerIterator(const internal::CompressedStorage<Scalar,StorageIndex>& data)
+      : m_values(data.valuePtr()), m_indices(data.indexPtr()), m_outer(0), m_start(0), m_id(data.size())
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
+    }
+
+    inline ReverseInnerIterator& operator--() { --m_id; return *this; }
+
+    inline const Scalar& value() const { return m_values[m_id-1]; }
+    inline Scalar& valueRef() { return const_cast<Scalar&>(m_values[m_id-1]); }
+
+    inline StorageIndex index() const { return m_indices[m_id-1]; }
+    inline Index outer() const { return m_outer.value(); }
+    inline Index row() const { return IsRowMajor ? m_outer.value() : index(); }
+    inline Index col() const { return IsRowMajor ? index() : m_outer.value(); }
+
+    inline operator bool() const { return (m_id > m_start); }
+
+  protected:
+    const Scalar* m_values;
+    const StorageIndex* m_indices;
+    typedef internal::variable_if_dynamic<Index,Derived::IsVectorAtCompileTime?0:Dynamic> OuterType;
+    const OuterType m_outer;
+    Index m_start;
+    Index m_id;
+};
+
+namespace internal {
+
+template<typename Derived>
+struct evaluator<SparseCompressedBase<Derived> >
+  : evaluator_base<Derived>
+{
+  typedef typename Derived::Scalar Scalar;
+  typedef typename Derived::InnerIterator InnerIterator;
+  
+  enum {
+    CoeffReadCost = NumTraits<Scalar>::ReadCost,
+    Flags = Derived::Flags
+  };
+  
+  evaluator() : m_matrix(0), m_zero(0)
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+  explicit evaluator(const Derived &mat) : m_matrix(&mat), m_zero(0)
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+  
+  inline Index nonZerosEstimate() const {
+    return m_matrix->nonZeros();
+  }
+  
+  operator Derived&() { return m_matrix->const_cast_derived(); }
+  operator const Derived&() const { return *m_matrix; }
+  
+  typedef typename DenseCoeffsBase<Derived,ReadOnlyAccessors>::CoeffReturnType CoeffReturnType;
+  const Scalar& coeff(Index row, Index col) const
+  {
+    Index p = find(row,col);
+
+    if(p==Dynamic)
+      return m_zero;
+    else
+      return m_matrix->const_cast_derived().valuePtr()[p];
+  }
+
+  Scalar& coeffRef(Index row, Index col)
+  {
+    Index p = find(row,col);
+    eigen_assert(p!=Dynamic && "written coefficient does not exist");
+    return m_matrix->const_cast_derived().valuePtr()[p];
+  }
+
+protected:
+
+  Index find(Index row, Index col) const
+  {
+    eigen_internal_assert(row>=0 && row<m_matrix->rows() && col>=0 && col<m_matrix->cols());
+
+    const Index outer = Derived::IsRowMajor ? row : col;
+    const Index inner = Derived::IsRowMajor ? col : row;
+
+    Index start = m_matrix->outerIndexPtr()[outer];
+    Index end = m_matrix->isCompressed() ? m_matrix->outerIndexPtr()[outer+1] : m_matrix->outerIndexPtr()[outer] + m_matrix->innerNonZeroPtr()[outer];
+    eigen_assert(end>=start && "you are using a non finalized sparse matrix or written coefficient does not exist");
+    const Index p = std::lower_bound(m_matrix->innerIndexPtr()+start, m_matrix->innerIndexPtr()+end,inner) - m_matrix->innerIndexPtr();
+
+    return ((p<end) && (m_matrix->innerIndexPtr()[p]==inner)) ? p : Dynamic;
+  }
+
+  const Derived *m_matrix;
+  const Scalar m_zero;
+};
+
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPARSE_COMPRESSED_BASE_H
diff --git a/Eigen/src/SparseCore/SparseCwiseBinaryOp.h b/Eigen/src/SparseCore/SparseCwiseBinaryOp.h
index 60ca7690c..e315e3550 100644
--- a/Eigen/src/SparseCore/SparseCwiseBinaryOp.h
+++ b/Eigen/src/SparseCore/SparseCwiseBinaryOp.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -28,89 +28,57 @@ namespace Eigen {
 //                         generic      sparse
 //  4 - dense op dense     product      dense
 //                         generic      dense
-
-namespace internal {
-
-template<> struct promote_storage_type<Dense,Sparse>
-{ typedef Sparse ret; };
-
-template<> struct promote_storage_type<Sparse,Dense>
-{ typedef Sparse ret; };
-
-template<typename BinaryOp, typename Lhs, typename Rhs, typename Derived,
-  typename _LhsStorageMode = typename traits<Lhs>::StorageKind,
-  typename _RhsStorageMode = typename traits<Rhs>::StorageKind>
-class sparse_cwise_binary_op_inner_iterator_selector;
-
-} // end namespace internal
+//
+// TODO to ease compiler job, we could specialize product/quotient with a scalar
+//      and fallback to cwise-unary evaluator using bind1st_op and bind2nd_op.
 
 template<typename BinaryOp, typename Lhs, typename Rhs>
 class CwiseBinaryOpImpl<BinaryOp, Lhs, Rhs, Sparse>
   : public SparseMatrixBase<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
 {
   public:
-    class InnerIterator;
-    class ReverseInnerIterator;
     typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> Derived;
+    typedef SparseMatrixBase<Derived> Base;
     EIGEN_SPARSE_PUBLIC_INTERFACE(Derived)
     CwiseBinaryOpImpl()
     {
-      typedef typename internal::traits<Lhs>::StorageKind LhsStorageKind;
-      typedef typename internal::traits<Rhs>::StorageKind RhsStorageKind;
       EIGEN_STATIC_ASSERT((
-                (!internal::is_same<LhsStorageKind,RhsStorageKind>::value)
-            ||  ((Lhs::Flags&RowMajorBit) == (Rhs::Flags&RowMajorBit))),
+                (!internal::is_same<typename internal::traits<Lhs>::StorageKind,
+                                    typename internal::traits<Rhs>::StorageKind>::value)
+            ||  ((internal::evaluator<Lhs>::Flags&RowMajorBit) == (internal::evaluator<Rhs>::Flags&RowMajorBit))),
             THE_STORAGE_ORDER_OF_BOTH_SIDES_MUST_MATCH);
     }
 };
 
-template<typename BinaryOp, typename Lhs, typename Rhs>
-class CwiseBinaryOpImpl<BinaryOp,Lhs,Rhs,Sparse>::InnerIterator
-  : public internal::sparse_cwise_binary_op_inner_iterator_selector<BinaryOp,Lhs,Rhs,typename CwiseBinaryOpImpl<BinaryOp,Lhs,Rhs,Sparse>::InnerIterator>
-{
-  public:
-    typedef typename Lhs::Index Index;
-    typedef internal::sparse_cwise_binary_op_inner_iterator_selector<
-      BinaryOp,Lhs,Rhs, InnerIterator> Base;
-
-    // NOTE: we have to prefix Index by "typename Lhs::" to avoid an ICE with VC11
-    EIGEN_STRONG_INLINE InnerIterator(const CwiseBinaryOpImpl& binOp, typename Lhs::Index outer)
-      : Base(binOp.derived(),outer)
-    {}
-};
-
-/***************************************************************************
-* Implementation of inner-iterators
-***************************************************************************/
-
-// template<typename T> struct internal::func_is_conjunction { enum { ret = false }; };
-// template<typename T> struct internal::func_is_conjunction<internal::scalar_product_op<T> > { enum { ret = true }; };
-
-// TODO generalize the internal::scalar_product_op specialization to all conjunctions if any !
-
 namespace internal {
 
-// sparse - sparse  (generic)
-template<typename BinaryOp, typename Lhs, typename Rhs, typename Derived>
-class sparse_cwise_binary_op_inner_iterator_selector<BinaryOp, Lhs, Rhs, Derived, Sparse, Sparse>
-{
-    typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> CwiseBinaryXpr;
-    typedef typename traits<CwiseBinaryXpr>::Scalar Scalar;
-    typedef typename traits<CwiseBinaryXpr>::_LhsNested _LhsNested;
-    typedef typename traits<CwiseBinaryXpr>::_RhsNested _RhsNested;
-    typedef typename _LhsNested::InnerIterator LhsIterator;
-    typedef typename _RhsNested::InnerIterator RhsIterator;
-    typedef typename Lhs::Index Index;
+  
+// Generic "sparse OP sparse"
+template<typename XprType> struct binary_sparse_evaluator;
 
+template<typename BinaryOp, typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IteratorBased, IteratorBased>
+  : evaluator_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
+{
+protected:
+  typedef typename evaluator<Lhs>::InnerIterator  LhsIterator;
+  typedef typename evaluator<Rhs>::InnerIterator  RhsIterator;
+  typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
+  typedef typename traits<XprType>::Scalar Scalar;
+  typedef typename XprType::StorageIndex StorageIndex;
+public:
+
+  class InnerIterator
+  {
   public:
-
-    EIGEN_STRONG_INLINE sparse_cwise_binary_op_inner_iterator_selector(const CwiseBinaryXpr& xpr, Index outer)
-      : m_lhsIter(xpr.lhs(),outer), m_rhsIter(xpr.rhs(),outer), m_functor(xpr.functor())
+    
+    EIGEN_STRONG_INLINE InnerIterator(const binary_evaluator& aEval, Index outer)
+      : m_lhsIter(aEval.m_lhsImpl,outer), m_rhsIter(aEval.m_rhsImpl,outer), m_functor(aEval.m_functor)
     {
       this->operator++();
     }
 
-    EIGEN_STRONG_INLINE Derived& operator++()
+    EIGEN_STRONG_INLINE InnerIterator& operator++()
     {
       if (m_lhsIter && m_rhsIter && (m_lhsIter.index() == m_rhsIter.index()))
       {
@@ -136,12 +104,13 @@ class sparse_cwise_binary_op_inner_iterator_selector<BinaryOp, Lhs, Rhs, Derived
         m_value = 0; // this is to avoid a compilation warning
         m_id = -1;
       }
-      return *static_cast<Derived*>(this);
+      return *this;
     }
 
     EIGEN_STRONG_INLINE Scalar value() const { return m_value; }
 
-    EIGEN_STRONG_INLINE Index index() const { return m_id; }
+    EIGEN_STRONG_INLINE StorageIndex index() const { return m_id; }
+    EIGEN_STRONG_INLINE Index outer() const { return m_lhsIter.outer(); }
     EIGEN_STRONG_INLINE Index row() const { return Lhs::IsRowMajor ? m_lhsIter.row() : index(); }
     EIGEN_STRONG_INLINE Index col() const { return Lhs::IsRowMajor ? index() : m_lhsIter.col(); }
 
@@ -152,25 +121,303 @@ class sparse_cwise_binary_op_inner_iterator_selector<BinaryOp, Lhs, Rhs, Derived
     RhsIterator m_rhsIter;
     const BinaryOp& m_functor;
     Scalar m_value;
-    Index m_id;
+    StorageIndex m_id;
+  };
+  
+  
+  enum {
+    CoeffReadCost = evaluator<Lhs>::CoeffReadCost + evaluator<Rhs>::CoeffReadCost + functor_traits<BinaryOp>::Cost,
+    Flags = XprType::Flags
+  };
+  
+  explicit binary_evaluator(const XprType& xpr)
+    : m_functor(xpr.functor()),
+      m_lhsImpl(xpr.lhs()), 
+      m_rhsImpl(xpr.rhs())  
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<BinaryOp>::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+  
+  inline Index nonZerosEstimate() const {
+    return m_lhsImpl.nonZerosEstimate() + m_rhsImpl.nonZerosEstimate();
+  }
+
+protected:
+  const BinaryOp m_functor;
+  evaluator<Lhs> m_lhsImpl;
+  evaluator<Rhs> m_rhsImpl;
 };
 
-// sparse - sparse  (product)
-template<typename T, typename Lhs, typename Rhs, typename Derived>
-class sparse_cwise_binary_op_inner_iterator_selector<scalar_product_op<T>, Lhs, Rhs, Derived, Sparse, Sparse>
+// dense op sparse
+template<typename BinaryOp, typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IndexBased, IteratorBased>
+  : evaluator_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
 {
-    typedef scalar_product_op<T> BinaryFunc;
-    typedef CwiseBinaryOp<BinaryFunc, Lhs, Rhs> CwiseBinaryXpr;
-    typedef typename CwiseBinaryXpr::Scalar Scalar;
-    typedef typename traits<CwiseBinaryXpr>::_LhsNested _LhsNested;
-    typedef typename _LhsNested::InnerIterator LhsIterator;
-    typedef typename traits<CwiseBinaryXpr>::_RhsNested _RhsNested;
-    typedef typename _RhsNested::InnerIterator RhsIterator;
-    typedef typename Lhs::Index Index;
+protected:
+  typedef typename evaluator<Rhs>::InnerIterator  RhsIterator;
+  typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
+  typedef typename traits<XprType>::Scalar Scalar;
+  typedef typename XprType::StorageIndex StorageIndex;
+public:
+
+  class InnerIterator
+  {
+    enum { IsRowMajor = (int(Rhs::Flags)&RowMajorBit)==RowMajorBit };
   public:
 
-    EIGEN_STRONG_INLINE sparse_cwise_binary_op_inner_iterator_selector(const CwiseBinaryXpr& xpr, Index outer)
-      : m_lhsIter(xpr.lhs(),outer), m_rhsIter(xpr.rhs(),outer), m_functor(xpr.functor())
+    EIGEN_STRONG_INLINE InnerIterator(const binary_evaluator& aEval, Index outer)
+      : m_lhsEval(aEval.m_lhsImpl), m_rhsIter(aEval.m_rhsImpl,outer), m_functor(aEval.m_functor), m_value(0), m_id(-1), m_innerSize(aEval.m_expr.rhs().innerSize())
+    {
+      this->operator++();
+    }
+
+    EIGEN_STRONG_INLINE InnerIterator& operator++()
+    {
+      ++m_id;
+      if(m_id<m_innerSize)
+      {
+        Scalar lhsVal = m_lhsEval.coeff(IsRowMajor?m_rhsIter.outer():m_id,
+                                        IsRowMajor?m_id:m_rhsIter.outer());
+        if(m_rhsIter && m_rhsIter.index()==m_id)
+        {
+          m_value = m_functor(lhsVal, m_rhsIter.value());
+          ++m_rhsIter;
+        }
+        else
+          m_value = m_functor(lhsVal, Scalar(0));
+      }
+
+      return *this;
+    }
+
+    EIGEN_STRONG_INLINE Scalar value() const { eigen_internal_assert(m_id<m_innerSize); return m_value; }
+
+    EIGEN_STRONG_INLINE StorageIndex index() const { return m_id; }
+    EIGEN_STRONG_INLINE Index outer() const { return m_rhsIter.outer(); }
+    EIGEN_STRONG_INLINE Index row() const { return IsRowMajor ? m_rhsIter.outer() : m_id; }
+    EIGEN_STRONG_INLINE Index col() const { return IsRowMajor ? m_id : m_rhsIter.outer(); }
+
+    EIGEN_STRONG_INLINE operator bool() const { return m_id<m_innerSize; }
+
+  protected:
+    const evaluator<Lhs> &m_lhsEval;
+    RhsIterator m_rhsIter;
+    const BinaryOp& m_functor;
+    Scalar m_value;
+    StorageIndex m_id;
+    StorageIndex m_innerSize;
+  };
+
+
+  enum {
+    CoeffReadCost = evaluator<Lhs>::CoeffReadCost + evaluator<Rhs>::CoeffReadCost + functor_traits<BinaryOp>::Cost,
+    // Expose storage order of the sparse expression
+    Flags = (XprType::Flags & ~RowMajorBit) | (int(Rhs::Flags)&RowMajorBit)
+  };
+
+  explicit binary_evaluator(const XprType& xpr)
+    : m_functor(xpr.functor()),
+      m_lhsImpl(xpr.lhs()),
+      m_rhsImpl(xpr.rhs()),
+      m_expr(xpr)
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<BinaryOp>::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  inline Index nonZerosEstimate() const {
+    return m_expr.size();
+  }
+
+protected:
+  const BinaryOp m_functor;
+  evaluator<Lhs> m_lhsImpl;
+  evaluator<Rhs> m_rhsImpl;
+  const XprType &m_expr;
+};
+
+// sparse op dense
+template<typename BinaryOp, typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IteratorBased, IndexBased>
+  : evaluator_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
+{
+protected:
+  typedef typename evaluator<Lhs>::InnerIterator  LhsIterator;
+  typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
+  typedef typename traits<XprType>::Scalar Scalar;
+  typedef typename XprType::StorageIndex StorageIndex;
+public:
+
+  class InnerIterator
+  {
+    enum { IsRowMajor = (int(Lhs::Flags)&RowMajorBit)==RowMajorBit };
+  public:
+
+    EIGEN_STRONG_INLINE InnerIterator(const binary_evaluator& aEval, Index outer)
+      : m_lhsIter(aEval.m_lhsImpl,outer), m_rhsEval(aEval.m_rhsImpl), m_functor(aEval.m_functor), m_value(0), m_id(-1), m_innerSize(aEval.m_expr.lhs().innerSize())
+    {
+      this->operator++();
+    }
+
+    EIGEN_STRONG_INLINE InnerIterator& operator++()
+    {
+      ++m_id;
+      if(m_id<m_innerSize)
+      {
+        Scalar rhsVal = m_rhsEval.coeff(IsRowMajor?m_lhsIter.outer():m_id,
+                                        IsRowMajor?m_id:m_lhsIter.outer());
+        if(m_lhsIter && m_lhsIter.index()==m_id)
+        {
+          m_value = m_functor(m_lhsIter.value(), rhsVal);
+          ++m_lhsIter;
+        }
+        else
+          m_value = m_functor(Scalar(0),rhsVal);
+      }
+
+      return *this;
+    }
+
+    EIGEN_STRONG_INLINE Scalar value() const { eigen_internal_assert(m_id<m_innerSize); return m_value; }
+
+    EIGEN_STRONG_INLINE StorageIndex index() const { return m_id; }
+    EIGEN_STRONG_INLINE Index outer() const { return m_lhsIter.outer(); }
+    EIGEN_STRONG_INLINE Index row() const { return IsRowMajor ? m_lhsIter.outer() : m_id; }
+    EIGEN_STRONG_INLINE Index col() const { return IsRowMajor ? m_id : m_lhsIter.outer(); }
+
+    EIGEN_STRONG_INLINE operator bool() const { return m_id<m_innerSize; }
+
+  protected:
+    LhsIterator m_lhsIter;
+    const evaluator<Rhs> &m_rhsEval;
+    const BinaryOp& m_functor;
+    Scalar m_value;
+    StorageIndex m_id;
+    StorageIndex m_innerSize;
+  };
+
+
+  enum {
+    CoeffReadCost = evaluator<Lhs>::CoeffReadCost + evaluator<Rhs>::CoeffReadCost + functor_traits<BinaryOp>::Cost,
+    // Expose storage order of the sparse expression
+    Flags = (XprType::Flags & ~RowMajorBit) | (int(Lhs::Flags)&RowMajorBit)
+  };
+
+  explicit binary_evaluator(const XprType& xpr)
+    : m_functor(xpr.functor()),
+      m_lhsImpl(xpr.lhs()),
+      m_rhsImpl(xpr.rhs()),
+      m_expr(xpr)
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<BinaryOp>::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  inline Index nonZerosEstimate() const {
+    return m_expr.size();
+  }
+
+protected:
+  const BinaryOp m_functor;
+  evaluator<Lhs> m_lhsImpl;
+  evaluator<Rhs> m_rhsImpl;
+  const XprType &m_expr;
+};
+
+template<typename T,
+         typename LhsKind   = typename evaluator_traits<typename T::Lhs>::Kind,
+         typename RhsKind   = typename evaluator_traits<typename T::Rhs>::Kind,
+         typename LhsScalar = typename traits<typename T::Lhs>::Scalar,
+         typename RhsScalar = typename traits<typename T::Rhs>::Scalar> struct sparse_conjunction_evaluator;
+
+// "sparse .* sparse"
+template<typename T1, typename T2, typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<scalar_product_op<T1,T2>, Lhs, Rhs>, IteratorBased, IteratorBased>
+  : sparse_conjunction_evaluator<CwiseBinaryOp<scalar_product_op<T1,T2>, Lhs, Rhs> >
+{
+  typedef CwiseBinaryOp<scalar_product_op<T1,T2>, Lhs, Rhs> XprType;
+  typedef sparse_conjunction_evaluator<XprType> Base;
+  explicit binary_evaluator(const XprType& xpr) : Base(xpr) {}
+};
+// "dense .* sparse"
+template<typename T1, typename T2, typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<scalar_product_op<T1,T2>, Lhs, Rhs>, IndexBased, IteratorBased>
+  : sparse_conjunction_evaluator<CwiseBinaryOp<scalar_product_op<T1,T2>, Lhs, Rhs> >
+{
+  typedef CwiseBinaryOp<scalar_product_op<T1,T2>, Lhs, Rhs> XprType;
+  typedef sparse_conjunction_evaluator<XprType> Base;
+  explicit binary_evaluator(const XprType& xpr) : Base(xpr) {}
+};
+// "sparse .* dense"
+template<typename T1, typename T2, typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<scalar_product_op<T1,T2>, Lhs, Rhs>, IteratorBased, IndexBased>
+  : sparse_conjunction_evaluator<CwiseBinaryOp<scalar_product_op<T1,T2>, Lhs, Rhs> >
+{
+  typedef CwiseBinaryOp<scalar_product_op<T1,T2>, Lhs, Rhs> XprType;
+  typedef sparse_conjunction_evaluator<XprType> Base;
+  explicit binary_evaluator(const XprType& xpr) : Base(xpr) {}
+};
+
+// "sparse ./ dense"
+template<typename T1, typename T2, typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<scalar_quotient_op<T1,T2>, Lhs, Rhs>, IteratorBased, IndexBased>
+  : sparse_conjunction_evaluator<CwiseBinaryOp<scalar_quotient_op<T1,T2>, Lhs, Rhs> >
+{
+  typedef CwiseBinaryOp<scalar_quotient_op<T1,T2>, Lhs, Rhs> XprType;
+  typedef sparse_conjunction_evaluator<XprType> Base;
+  explicit binary_evaluator(const XprType& xpr) : Base(xpr) {}
+};
+
+// "sparse && sparse"
+template<typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<scalar_boolean_and_op, Lhs, Rhs>, IteratorBased, IteratorBased>
+  : sparse_conjunction_evaluator<CwiseBinaryOp<scalar_boolean_and_op, Lhs, Rhs> >
+{
+  typedef CwiseBinaryOp<scalar_boolean_and_op, Lhs, Rhs> XprType;
+  typedef sparse_conjunction_evaluator<XprType> Base;
+  explicit binary_evaluator(const XprType& xpr) : Base(xpr) {}
+};
+// "dense && sparse"
+template<typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<scalar_boolean_and_op, Lhs, Rhs>, IndexBased, IteratorBased>
+  : sparse_conjunction_evaluator<CwiseBinaryOp<scalar_boolean_and_op, Lhs, Rhs> >
+{
+  typedef CwiseBinaryOp<scalar_boolean_and_op, Lhs, Rhs> XprType;
+  typedef sparse_conjunction_evaluator<XprType> Base;
+  explicit binary_evaluator(const XprType& xpr) : Base(xpr) {}
+};
+// "sparse && dense"
+template<typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<scalar_boolean_and_op, Lhs, Rhs>, IteratorBased, IndexBased>
+  : sparse_conjunction_evaluator<CwiseBinaryOp<scalar_boolean_and_op, Lhs, Rhs> >
+{
+  typedef CwiseBinaryOp<scalar_boolean_and_op, Lhs, Rhs> XprType;
+  typedef sparse_conjunction_evaluator<XprType> Base;
+  explicit binary_evaluator(const XprType& xpr) : Base(xpr) {}
+};
+
+// "sparse ^ sparse"
+template<typename XprType>
+struct sparse_conjunction_evaluator<XprType, IteratorBased, IteratorBased>
+  : evaluator_base<XprType>
+{
+protected:
+  typedef typename XprType::Functor BinaryOp;
+  typedef typename XprType::Lhs LhsArg;
+  typedef typename XprType::Rhs RhsArg;
+  typedef typename evaluator<LhsArg>::InnerIterator  LhsIterator;
+  typedef typename evaluator<RhsArg>::InnerIterator  RhsIterator;
+  typedef typename XprType::StorageIndex StorageIndex;
+  typedef typename traits<XprType>::Scalar Scalar;
+public:
+
+  class InnerIterator
+  {
+  public:
+    
+    EIGEN_STRONG_INLINE InnerIterator(const sparse_conjunction_evaluator& aEval, Index outer)
+      : m_lhsIter(aEval.m_lhsImpl,outer), m_rhsIter(aEval.m_rhsImpl,outer), m_functor(aEval.m_functor)
     {
       while (m_lhsIter && m_rhsIter && (m_lhsIter.index() != m_rhsIter.index()))
       {
@@ -181,7 +428,7 @@ class sparse_cwise_binary_op_inner_iterator_selector<scalar_product_op<T>, Lhs,
       }
     }
 
-    EIGEN_STRONG_INLINE Derived& operator++()
+    EIGEN_STRONG_INLINE InnerIterator& operator++()
     {
       ++m_lhsIter;
       ++m_rhsIter;
@@ -192,12 +439,13 @@ class sparse_cwise_binary_op_inner_iterator_selector<scalar_product_op<T>, Lhs,
         else
           ++m_rhsIter;
       }
-      return *static_cast<Derived*>(this);
+      return *this;
     }
-
+    
     EIGEN_STRONG_INLINE Scalar value() const { return m_functor(m_lhsIter.value(), m_rhsIter.value()); }
 
-    EIGEN_STRONG_INLINE Index index() const { return m_lhsIter.index(); }
+    EIGEN_STRONG_INLINE StorageIndex index() const { return m_lhsIter.index(); }
+    EIGEN_STRONG_INLINE Index outer() const { return m_lhsIter.outer(); }
     EIGEN_STRONG_INLINE Index row() const { return m_lhsIter.row(); }
     EIGEN_STRONG_INLINE Index col() const { return m_lhsIter.col(); }
 
@@ -206,91 +454,184 @@ class sparse_cwise_binary_op_inner_iterator_selector<scalar_product_op<T>, Lhs,
   protected:
     LhsIterator m_lhsIter;
     RhsIterator m_rhsIter;
-    const BinaryFunc& m_functor;
+    const BinaryOp& m_functor;
+  };
+  
+  
+  enum {
+    CoeffReadCost = evaluator<LhsArg>::CoeffReadCost + evaluator<RhsArg>::CoeffReadCost + functor_traits<BinaryOp>::Cost,
+    Flags = XprType::Flags
+  };
+  
+  explicit sparse_conjunction_evaluator(const XprType& xpr)
+    : m_functor(xpr.functor()),
+      m_lhsImpl(xpr.lhs()), 
+      m_rhsImpl(xpr.rhs())  
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<BinaryOp>::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+  
+  inline Index nonZerosEstimate() const {
+    return (std::min)(m_lhsImpl.nonZerosEstimate(), m_rhsImpl.nonZerosEstimate());
+  }
+
+protected:
+  const BinaryOp m_functor;
+  evaluator<LhsArg> m_lhsImpl;
+  evaluator<RhsArg> m_rhsImpl;
 };
 
-// sparse - dense  (product)
-template<typename T, typename Lhs, typename Rhs, typename Derived>
-class sparse_cwise_binary_op_inner_iterator_selector<scalar_product_op<T>, Lhs, Rhs, Derived, Sparse, Dense>
+// "dense ^ sparse"
+template<typename XprType>
+struct sparse_conjunction_evaluator<XprType, IndexBased, IteratorBased>
+  : evaluator_base<XprType>
 {
-    typedef scalar_product_op<T> BinaryFunc;
-    typedef CwiseBinaryOp<BinaryFunc, Lhs, Rhs> CwiseBinaryXpr;
-    typedef typename CwiseBinaryXpr::Scalar Scalar;
-    typedef typename traits<CwiseBinaryXpr>::_LhsNested _LhsNested;
-    typedef typename traits<CwiseBinaryXpr>::RhsNested RhsNested;
-    typedef typename _LhsNested::InnerIterator LhsIterator;
-    typedef typename Lhs::Index Index;
-    enum { IsRowMajor = (int(Lhs::Flags)&RowMajorBit)==RowMajorBit };
-  public:
+protected:
+  typedef typename XprType::Functor BinaryOp;
+  typedef typename XprType::Lhs LhsArg;
+  typedef typename XprType::Rhs RhsArg;
+  typedef evaluator<LhsArg> LhsEvaluator;
+  typedef typename evaluator<RhsArg>::InnerIterator  RhsIterator;
+  typedef typename XprType::StorageIndex StorageIndex;
+  typedef typename traits<XprType>::Scalar Scalar;
+public:
+
+  class InnerIterator
+  {
+    enum { IsRowMajor = (int(RhsArg::Flags)&RowMajorBit)==RowMajorBit };
 
-    EIGEN_STRONG_INLINE sparse_cwise_binary_op_inner_iterator_selector(const CwiseBinaryXpr& xpr, Index outer)
-      : m_rhs(xpr.rhs()), m_lhsIter(xpr.lhs(),outer), m_functor(xpr.functor()), m_outer(outer)
+  public:
+    
+    EIGEN_STRONG_INLINE InnerIterator(const sparse_conjunction_evaluator& aEval, Index outer)
+      : m_lhsEval(aEval.m_lhsImpl), m_rhsIter(aEval.m_rhsImpl,outer), m_functor(aEval.m_functor), m_outer(outer)
     {}
 
-    EIGEN_STRONG_INLINE Derived& operator++()
+    EIGEN_STRONG_INLINE InnerIterator& operator++()
     {
-      ++m_lhsIter;
-      return *static_cast<Derived*>(this);
+      ++m_rhsIter;
+      return *this;
     }
 
     EIGEN_STRONG_INLINE Scalar value() const
-    { return m_functor(m_lhsIter.value(),
-                       m_rhs.coeff(IsRowMajor?m_outer:m_lhsIter.index(),IsRowMajor?m_lhsIter.index():m_outer)); }
+    { return m_functor(m_lhsEval.coeff(IsRowMajor?m_outer:m_rhsIter.index(),IsRowMajor?m_rhsIter.index():m_outer), m_rhsIter.value()); }
 
-    EIGEN_STRONG_INLINE Index index() const { return m_lhsIter.index(); }
-    EIGEN_STRONG_INLINE Index row() const { return m_lhsIter.row(); }
-    EIGEN_STRONG_INLINE Index col() const { return m_lhsIter.col(); }
-
-    EIGEN_STRONG_INLINE operator bool() const { return m_lhsIter; }
+    EIGEN_STRONG_INLINE StorageIndex index() const { return m_rhsIter.index(); }
+    EIGEN_STRONG_INLINE Index outer() const { return m_rhsIter.outer(); }
+    EIGEN_STRONG_INLINE Index row() const { return m_rhsIter.row(); }
+    EIGEN_STRONG_INLINE Index col() const { return m_rhsIter.col(); }
 
+    EIGEN_STRONG_INLINE operator bool() const { return m_rhsIter; }
+    
   protected:
-    RhsNested m_rhs;
-    LhsIterator m_lhsIter;
-    const BinaryFunc m_functor;
+    const LhsEvaluator &m_lhsEval;
+    RhsIterator m_rhsIter;
+    const BinaryOp& m_functor;
     const Index m_outer;
+  };
+  
+  
+  enum {
+    CoeffReadCost = evaluator<LhsArg>::CoeffReadCost + evaluator<RhsArg>::CoeffReadCost + functor_traits<BinaryOp>::Cost,
+    // Expose storage order of the sparse expression
+    Flags = (XprType::Flags & ~RowMajorBit) | (int(RhsArg::Flags)&RowMajorBit)
+  };
+  
+  explicit sparse_conjunction_evaluator(const XprType& xpr)
+    : m_functor(xpr.functor()),
+      m_lhsImpl(xpr.lhs()), 
+      m_rhsImpl(xpr.rhs())  
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<BinaryOp>::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+  
+  inline Index nonZerosEstimate() const {
+    return m_rhsImpl.nonZerosEstimate();
+  }
+
+protected:
+  const BinaryOp m_functor;
+  evaluator<LhsArg> m_lhsImpl;
+  evaluator<RhsArg> m_rhsImpl;
 };
 
-// sparse - dense  (product)
-template<typename T, typename Lhs, typename Rhs, typename Derived>
-class sparse_cwise_binary_op_inner_iterator_selector<scalar_product_op<T>, Lhs, Rhs, Derived, Dense, Sparse>
+// "sparse ^ dense"
+template<typename XprType>
+struct sparse_conjunction_evaluator<XprType, IteratorBased, IndexBased>
+  : evaluator_base<XprType>
 {
-    typedef scalar_product_op<T> BinaryFunc;
-    typedef CwiseBinaryOp<BinaryFunc, Lhs, Rhs> CwiseBinaryXpr;
-    typedef typename CwiseBinaryXpr::Scalar Scalar;
-    typedef typename traits<CwiseBinaryXpr>::_RhsNested _RhsNested;
-    typedef typename _RhsNested::InnerIterator RhsIterator;
-    typedef typename Lhs::Index Index;
+protected:
+  typedef typename XprType::Functor BinaryOp;
+  typedef typename XprType::Lhs LhsArg;
+  typedef typename XprType::Rhs RhsArg;
+  typedef typename evaluator<LhsArg>::InnerIterator LhsIterator;
+  typedef evaluator<RhsArg> RhsEvaluator;
+  typedef typename XprType::StorageIndex StorageIndex;
+  typedef typename traits<XprType>::Scalar Scalar;
+public:
+
+  class InnerIterator
+  {
+    enum { IsRowMajor = (int(LhsArg::Flags)&RowMajorBit)==RowMajorBit };
 
-    enum { IsRowMajor = (int(Rhs::Flags)&RowMajorBit)==RowMajorBit };
   public:
-
-    EIGEN_STRONG_INLINE sparse_cwise_binary_op_inner_iterator_selector(const CwiseBinaryXpr& xpr, Index outer)
-      : m_xpr(xpr), m_rhsIter(xpr.rhs(),outer), m_functor(xpr.functor()), m_outer(outer)
+    
+    EIGEN_STRONG_INLINE InnerIterator(const sparse_conjunction_evaluator& aEval, Index outer)
+      : m_lhsIter(aEval.m_lhsImpl,outer), m_rhsEval(aEval.m_rhsImpl), m_functor(aEval.m_functor), m_outer(outer)
     {}
 
-    EIGEN_STRONG_INLINE Derived& operator++()
+    EIGEN_STRONG_INLINE InnerIterator& operator++()
     {
-      ++m_rhsIter;
-      return *static_cast<Derived*>(this);
+      ++m_lhsIter;
+      return *this;
     }
 
     EIGEN_STRONG_INLINE Scalar value() const
-    { return m_functor(m_xpr.lhs().coeff(IsRowMajor?m_outer:m_rhsIter.index(),IsRowMajor?m_rhsIter.index():m_outer), m_rhsIter.value()); }
-
-    EIGEN_STRONG_INLINE Index index() const { return m_rhsIter.index(); }
-    EIGEN_STRONG_INLINE Index row() const { return m_rhsIter.row(); }
-    EIGEN_STRONG_INLINE Index col() const { return m_rhsIter.col(); }
+    { return m_functor(m_lhsIter.value(),
+                       m_rhsEval.coeff(IsRowMajor?m_outer:m_lhsIter.index(),IsRowMajor?m_lhsIter.index():m_outer)); }
 
-    EIGEN_STRONG_INLINE operator bool() const { return m_rhsIter; }
+    EIGEN_STRONG_INLINE StorageIndex index() const { return m_lhsIter.index(); }
+    EIGEN_STRONG_INLINE Index outer() const { return m_lhsIter.outer(); }
+    EIGEN_STRONG_INLINE Index row() const { return m_lhsIter.row(); }
+    EIGEN_STRONG_INLINE Index col() const { return m_lhsIter.col(); }
 
+    EIGEN_STRONG_INLINE operator bool() const { return m_lhsIter; }
+    
   protected:
-    const CwiseBinaryXpr& m_xpr;
-    RhsIterator m_rhsIter;
-    const BinaryFunc& m_functor;
+    LhsIterator m_lhsIter;
+    const evaluator<RhsArg> &m_rhsEval;
+    const BinaryOp& m_functor;
     const Index m_outer;
+  };
+  
+  
+  enum {
+    CoeffReadCost = evaluator<LhsArg>::CoeffReadCost + evaluator<RhsArg>::CoeffReadCost + functor_traits<BinaryOp>::Cost,
+    // Expose storage order of the sparse expression
+    Flags = (XprType::Flags & ~RowMajorBit) | (int(LhsArg::Flags)&RowMajorBit)
+  };
+  
+  explicit sparse_conjunction_evaluator(const XprType& xpr)
+    : m_functor(xpr.functor()),
+      m_lhsImpl(xpr.lhs()), 
+      m_rhsImpl(xpr.rhs())  
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<BinaryOp>::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+  
+  inline Index nonZerosEstimate() const {
+    return m_lhsImpl.nonZerosEstimate();
+  }
+
+protected:
+  const BinaryOp m_functor;
+  evaluator<LhsArg> m_lhsImpl;
+  evaluator<RhsArg> m_rhsImpl;
 };
 
-} // end namespace internal
+}
 
 /***************************************************************************
 * Implementation of SparseMatrixBase and SparseCwise functions/operators
@@ -298,6 +639,22 @@ class sparse_cwise_binary_op_inner_iterator_selector<scalar_product_op<T>, Lhs,
 
 template<typename Derived>
 template<typename OtherDerived>
+Derived& SparseMatrixBase<Derived>::operator+=(const EigenBase<OtherDerived> &other)
+{
+  call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
+  return derived();
+}
+
+template<typename Derived>
+template<typename OtherDerived>
+Derived& SparseMatrixBase<Derived>::operator-=(const EigenBase<OtherDerived> &other)
+{
+  call_assignment(derived(), other.derived(), internal::assign_op<Scalar,typename OtherDerived::Scalar>());
+  return derived();
+}
+
+template<typename Derived>
+template<typename OtherDerived>
 EIGEN_STRONG_INLINE Derived &
 SparseMatrixBase<Derived>::operator-=(const SparseMatrixBase<OtherDerived> &other)
 {
@@ -314,10 +671,54 @@ SparseMatrixBase<Derived>::operator+=(const SparseMatrixBase<OtherDerived>& othe
 
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE const EIGEN_SPARSE_CWISE_PRODUCT_RETURN_TYPE
+Derived& SparseMatrixBase<Derived>::operator+=(const DiagonalBase<OtherDerived>& other)
+{
+  call_assignment_no_alias(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
+  return derived();
+}
+
+template<typename Derived>
+template<typename OtherDerived>
+Derived& SparseMatrixBase<Derived>::operator-=(const DiagonalBase<OtherDerived>& other)
+{
+  call_assignment_no_alias(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
+  return derived();
+}
+    
+template<typename Derived>
+template<typename OtherDerived>
+EIGEN_STRONG_INLINE const typename SparseMatrixBase<Derived>::template CwiseProductDenseReturnType<OtherDerived>::Type
 SparseMatrixBase<Derived>::cwiseProduct(const MatrixBase<OtherDerived> &other) const
 {
-  return EIGEN_SPARSE_CWISE_PRODUCT_RETURN_TYPE(derived(), other.derived());
+  return typename CwiseProductDenseReturnType<OtherDerived>::Type(derived(), other.derived());
+}
+
+template<typename DenseDerived, typename SparseDerived>
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_sum_op<typename DenseDerived::Scalar,typename SparseDerived::Scalar>, const DenseDerived, const SparseDerived>
+operator+(const MatrixBase<DenseDerived> &a, const SparseMatrixBase<SparseDerived> &b)
+{
+  return CwiseBinaryOp<internal::scalar_sum_op<typename DenseDerived::Scalar,typename SparseDerived::Scalar>, const DenseDerived, const SparseDerived>(a.derived(), b.derived());
+}
+
+template<typename SparseDerived, typename DenseDerived>
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_sum_op<typename SparseDerived::Scalar,typename DenseDerived::Scalar>, const SparseDerived, const DenseDerived>
+operator+(const SparseMatrixBase<SparseDerived> &a, const MatrixBase<DenseDerived> &b)
+{
+  return CwiseBinaryOp<internal::scalar_sum_op<typename SparseDerived::Scalar,typename DenseDerived::Scalar>, const SparseDerived, const DenseDerived>(a.derived(), b.derived());
+}
+
+template<typename DenseDerived, typename SparseDerived>
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_difference_op<typename DenseDerived::Scalar,typename SparseDerived::Scalar>, const DenseDerived, const SparseDerived>
+operator-(const MatrixBase<DenseDerived> &a, const SparseMatrixBase<SparseDerived> &b)
+{
+  return CwiseBinaryOp<internal::scalar_difference_op<typename DenseDerived::Scalar,typename SparseDerived::Scalar>, const DenseDerived, const SparseDerived>(a.derived(), b.derived());
+}
+
+template<typename SparseDerived, typename DenseDerived>
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_difference_op<typename SparseDerived::Scalar,typename DenseDerived::Scalar>, const SparseDerived, const DenseDerived>
+operator-(const SparseMatrixBase<SparseDerived> &a, const MatrixBase<DenseDerived> &b)
+{
+  return CwiseBinaryOp<internal::scalar_difference_op<typename SparseDerived::Scalar,typename DenseDerived::Scalar>, const SparseDerived, const DenseDerived>(a.derived(), b.derived());
 }
 
 } // end namespace Eigen
diff --git a/Eigen/src/SparseCore/SparseCwiseUnaryOp.h b/Eigen/src/SparseCore/SparseCwiseUnaryOp.h
index 5a50c7803..ea7973790 100644
--- a/Eigen/src/SparseCore/SparseCwiseUnaryOp.h
+++ b/Eigen/src/SparseCore/SparseCwiseUnaryOp.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -12,138 +12,121 @@
 
 namespace Eigen { 
 
-template<typename UnaryOp, typename MatrixType>
-class CwiseUnaryOpImpl<UnaryOp,MatrixType,Sparse>
-  : public SparseMatrixBase<CwiseUnaryOp<UnaryOp, MatrixType> >
+namespace internal {
+  
+template<typename UnaryOp, typename ArgType>
+struct unary_evaluator<CwiseUnaryOp<UnaryOp,ArgType>, IteratorBased>
+  : public evaluator_base<CwiseUnaryOp<UnaryOp,ArgType> >
 {
   public:
+    typedef CwiseUnaryOp<UnaryOp, ArgType> XprType;
 
     class InnerIterator;
-    class ReverseInnerIterator;
-
-    typedef CwiseUnaryOp<UnaryOp, MatrixType> Derived;
-    EIGEN_SPARSE_PUBLIC_INTERFACE(Derived)
+    
+    enum {
+      CoeffReadCost = evaluator<ArgType>::CoeffReadCost + functor_traits<UnaryOp>::Cost,
+      Flags = XprType::Flags
+    };
+    
+    explicit unary_evaluator(const XprType& op) : m_functor(op.functor()), m_argImpl(op.nestedExpression())
+    {
+      EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<UnaryOp>::Cost);
+      EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+    }
+    
+    inline Index nonZerosEstimate() const {
+      return m_argImpl.nonZerosEstimate();
+    }
 
   protected:
-    typedef typename internal::traits<Derived>::_XprTypeNested _MatrixTypeNested;
-    typedef typename _MatrixTypeNested::InnerIterator MatrixTypeIterator;
-    typedef typename _MatrixTypeNested::ReverseInnerIterator MatrixTypeReverseIterator;
+    typedef typename evaluator<ArgType>::InnerIterator        EvalIterator;
+    
+    const UnaryOp m_functor;
+    evaluator<ArgType> m_argImpl;
 };
 
-template<typename UnaryOp, typename MatrixType>
-class CwiseUnaryOpImpl<UnaryOp,MatrixType,Sparse>::InnerIterator
-    : public CwiseUnaryOpImpl<UnaryOp,MatrixType,Sparse>::MatrixTypeIterator
+template<typename UnaryOp, typename ArgType>
+class unary_evaluator<CwiseUnaryOp<UnaryOp,ArgType>, IteratorBased>::InnerIterator
+    : public unary_evaluator<CwiseUnaryOp<UnaryOp,ArgType>, IteratorBased>::EvalIterator
 {
-    typedef typename CwiseUnaryOpImpl::Scalar Scalar;
-    typedef typename CwiseUnaryOpImpl<UnaryOp,MatrixType,Sparse>::MatrixTypeIterator Base;
+    typedef typename XprType::Scalar Scalar;
+    typedef typename unary_evaluator<CwiseUnaryOp<UnaryOp,ArgType>, IteratorBased>::EvalIterator Base;
   public:
 
-    EIGEN_STRONG_INLINE InnerIterator(const CwiseUnaryOpImpl& unaryOp, typename CwiseUnaryOpImpl::Index outer)
-      : Base(unaryOp.derived().nestedExpression(),outer), m_functor(unaryOp.derived().functor())
+    EIGEN_STRONG_INLINE InnerIterator(const unary_evaluator& unaryOp, Index outer)
+      : Base(unaryOp.m_argImpl,outer), m_functor(unaryOp.m_functor)
     {}
 
     EIGEN_STRONG_INLINE InnerIterator& operator++()
     { Base::operator++(); return *this; }
 
-    EIGEN_STRONG_INLINE typename CwiseUnaryOpImpl::Scalar value() const { return m_functor(Base::value()); }
-
-  protected:
-    const UnaryOp m_functor;
-  private:
-    typename CwiseUnaryOpImpl::Scalar& valueRef();
-};
-
-template<typename UnaryOp, typename MatrixType>
-class CwiseUnaryOpImpl<UnaryOp,MatrixType,Sparse>::ReverseInnerIterator
-    : public CwiseUnaryOpImpl<UnaryOp,MatrixType,Sparse>::MatrixTypeReverseIterator
-{
-    typedef typename CwiseUnaryOpImpl::Scalar Scalar;
-    typedef typename CwiseUnaryOpImpl<UnaryOp,MatrixType,Sparse>::MatrixTypeReverseIterator Base;
-  public:
-
-    EIGEN_STRONG_INLINE ReverseInnerIterator(const CwiseUnaryOpImpl& unaryOp, typename CwiseUnaryOpImpl::Index outer)
-      : Base(unaryOp.derived().nestedExpression(),outer), m_functor(unaryOp.derived().functor())
-    {}
-
-    EIGEN_STRONG_INLINE ReverseInnerIterator& operator--()
-    { Base::operator--(); return *this; }
-
-    EIGEN_STRONG_INLINE typename CwiseUnaryOpImpl::Scalar value() const { return m_functor(Base::value()); }
+    EIGEN_STRONG_INLINE Scalar value() const { return m_functor(Base::value()); }
 
   protected:
     const UnaryOp m_functor;
   private:
-    typename CwiseUnaryOpImpl::Scalar& valueRef();
+    Scalar& valueRef();
 };
 
-template<typename ViewOp, typename MatrixType>
-class CwiseUnaryViewImpl<ViewOp,MatrixType,Sparse>
-  : public SparseMatrixBase<CwiseUnaryView<ViewOp, MatrixType> >
+template<typename ViewOp, typename ArgType>
+struct unary_evaluator<CwiseUnaryView<ViewOp,ArgType>, IteratorBased>
+  : public evaluator_base<CwiseUnaryView<ViewOp,ArgType> >
 {
   public:
+    typedef CwiseUnaryView<ViewOp, ArgType> XprType;
 
     class InnerIterator;
-    class ReverseInnerIterator;
-
-    typedef CwiseUnaryView<ViewOp, MatrixType> Derived;
-    EIGEN_SPARSE_PUBLIC_INTERFACE(Derived)
+    
+    enum {
+      CoeffReadCost = evaluator<ArgType>::CoeffReadCost + functor_traits<ViewOp>::Cost,
+      Flags = XprType::Flags
+    };
+    
+    explicit unary_evaluator(const XprType& op) : m_functor(op.functor()), m_argImpl(op.nestedExpression())
+    {
+      EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<ViewOp>::Cost);
+      EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+    }
 
   protected:
-    typedef typename internal::traits<Derived>::_MatrixTypeNested _MatrixTypeNested;
-    typedef typename _MatrixTypeNested::InnerIterator MatrixTypeIterator;
-    typedef typename _MatrixTypeNested::ReverseInnerIterator MatrixTypeReverseIterator;
+    typedef typename evaluator<ArgType>::InnerIterator        EvalIterator;
+    
+    const ViewOp m_functor;
+    evaluator<ArgType> m_argImpl;
 };
 
-template<typename ViewOp, typename MatrixType>
-class CwiseUnaryViewImpl<ViewOp,MatrixType,Sparse>::InnerIterator
-    : public CwiseUnaryViewImpl<ViewOp,MatrixType,Sparse>::MatrixTypeIterator
+template<typename ViewOp, typename ArgType>
+class unary_evaluator<CwiseUnaryView<ViewOp,ArgType>, IteratorBased>::InnerIterator
+    : public unary_evaluator<CwiseUnaryView<ViewOp,ArgType>, IteratorBased>::EvalIterator
 {
-    typedef typename CwiseUnaryViewImpl::Scalar Scalar;
-    typedef typename CwiseUnaryViewImpl<ViewOp,MatrixType,Sparse>::MatrixTypeIterator Base;
+    typedef typename XprType::Scalar Scalar;
+    typedef typename unary_evaluator<CwiseUnaryView<ViewOp,ArgType>, IteratorBased>::EvalIterator Base;
   public:
 
-    EIGEN_STRONG_INLINE InnerIterator(const CwiseUnaryViewImpl& unaryOp, typename CwiseUnaryViewImpl::Index outer)
-      : Base(unaryOp.derived().nestedExpression(),outer), m_functor(unaryOp.derived().functor())
+    EIGEN_STRONG_INLINE InnerIterator(const unary_evaluator& unaryOp, Index outer)
+      : Base(unaryOp.m_argImpl,outer), m_functor(unaryOp.m_functor)
     {}
 
     EIGEN_STRONG_INLINE InnerIterator& operator++()
     { Base::operator++(); return *this; }
 
-    EIGEN_STRONG_INLINE typename CwiseUnaryViewImpl::Scalar value() const { return m_functor(Base::value()); }
-    EIGEN_STRONG_INLINE typename CwiseUnaryViewImpl::Scalar& valueRef() { return m_functor(Base::valueRef()); }
+    EIGEN_STRONG_INLINE Scalar value() const { return m_functor(Base::value()); }
+    EIGEN_STRONG_INLINE Scalar& valueRef() { return m_functor(Base::valueRef()); }
 
   protected:
     const ViewOp m_functor;
 };
 
-template<typename ViewOp, typename MatrixType>
-class CwiseUnaryViewImpl<ViewOp,MatrixType,Sparse>::ReverseInnerIterator
-    : public CwiseUnaryViewImpl<ViewOp,MatrixType,Sparse>::MatrixTypeReverseIterator
-{
-    typedef typename CwiseUnaryViewImpl::Scalar Scalar;
-    typedef typename CwiseUnaryViewImpl<ViewOp,MatrixType,Sparse>::MatrixTypeReverseIterator Base;
-  public:
-
-    EIGEN_STRONG_INLINE ReverseInnerIterator(const CwiseUnaryViewImpl& unaryOp, typename CwiseUnaryViewImpl::Index outer)
-      : Base(unaryOp.derived().nestedExpression(),outer), m_functor(unaryOp.derived().functor())
-    {}
-
-    EIGEN_STRONG_INLINE ReverseInnerIterator& operator--()
-    { Base::operator--(); return *this; }
-
-    EIGEN_STRONG_INLINE typename CwiseUnaryViewImpl::Scalar value() const { return m_functor(Base::value()); }
-    EIGEN_STRONG_INLINE typename CwiseUnaryViewImpl::Scalar& valueRef() { return m_functor(Base::valueRef()); }
-
-  protected:
-    const ViewOp m_functor;
-};
+} // end namespace internal
 
 template<typename Derived>
 EIGEN_STRONG_INLINE Derived&
 SparseMatrixBase<Derived>::operator*=(const Scalar& other)
 {
+  typedef typename internal::evaluator<Derived>::InnerIterator EvalIterator;
+  internal::evaluator<Derived> thisEval(derived());
   for (Index j=0; j<outerSize(); ++j)
-    for (typename Derived::InnerIterator i(derived(),j); i; ++i)
+    for (EvalIterator i(thisEval,j); i; ++i)
       i.valueRef() *= other;
   return derived();
 }
@@ -152,8 +135,10 @@ template<typename Derived>
 EIGEN_STRONG_INLINE Derived&
 SparseMatrixBase<Derived>::operator/=(const Scalar& other)
 {
+  typedef typename internal::evaluator<Derived>::InnerIterator EvalIterator;
+  internal::evaluator<Derived> thisEval(derived());
   for (Index j=0; j<outerSize(); ++j)
-    for (typename Derived::InnerIterator i(derived(),j); i; ++i)
+    for (EvalIterator i(thisEval,j); i; ++i)
       i.valueRef() /= other;
   return derived();
 }
diff --git a/Eigen/src/SparseCore/SparseDenseProduct.h b/Eigen/src/SparseCore/SparseDenseProduct.h
index ccb6ae7b7..0547db596 100644
--- a/Eigen/src/SparseCore/SparseDenseProduct.h
+++ b/Eigen/src/SparseCore/SparseDenseProduct.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -12,196 +12,93 @@
 
 namespace Eigen { 
 
-template<typename Lhs, typename Rhs, int InnerSize> struct SparseDenseProductReturnType
-{
-  typedef SparseTimeDenseProduct<Lhs,Rhs> Type;
-};
-
-template<typename Lhs, typename Rhs> struct SparseDenseProductReturnType<Lhs,Rhs,1>
-{
-  typedef typename internal::conditional<
-    Lhs::IsRowMajor,
-    SparseDenseOuterProduct<Rhs,Lhs,true>,
-    SparseDenseOuterProduct<Lhs,Rhs,false> >::type Type;
-};
-
-template<typename Lhs, typename Rhs, int InnerSize> struct DenseSparseProductReturnType
-{
-  typedef DenseTimeSparseProduct<Lhs,Rhs> Type;
-};
-
-template<typename Lhs, typename Rhs> struct DenseSparseProductReturnType<Lhs,Rhs,1>
-{
-  typedef typename internal::conditional<
-    Rhs::IsRowMajor,
-    SparseDenseOuterProduct<Rhs,Lhs,true>,
-    SparseDenseOuterProduct<Lhs,Rhs,false> >::type Type;
-};
-
 namespace internal {
 
-template<typename Lhs, typename Rhs, bool Tr>
-struct traits<SparseDenseOuterProduct<Lhs,Rhs,Tr> >
-{
-  typedef Sparse StorageKind;
-  typedef typename scalar_product_traits<typename traits<Lhs>::Scalar,
-                                         typename traits<Rhs>::Scalar>::ReturnType Scalar;
-  typedef typename Lhs::Index Index;
-  typedef typename Lhs::Nested LhsNested;
-  typedef typename Rhs::Nested RhsNested;
-  typedef typename remove_all<LhsNested>::type _LhsNested;
-  typedef typename remove_all<RhsNested>::type _RhsNested;
-
-  enum {
-    LhsCoeffReadCost = traits<_LhsNested>::CoeffReadCost,
-    RhsCoeffReadCost = traits<_RhsNested>::CoeffReadCost,
-
-    RowsAtCompileTime    = Tr ? int(traits<Rhs>::RowsAtCompileTime)     : int(traits<Lhs>::RowsAtCompileTime),
-    ColsAtCompileTime    = Tr ? int(traits<Lhs>::ColsAtCompileTime)     : int(traits<Rhs>::ColsAtCompileTime),
-    MaxRowsAtCompileTime = Tr ? int(traits<Rhs>::MaxRowsAtCompileTime)  : int(traits<Lhs>::MaxRowsAtCompileTime),
-    MaxColsAtCompileTime = Tr ? int(traits<Lhs>::MaxColsAtCompileTime)  : int(traits<Rhs>::MaxColsAtCompileTime),
-
-    Flags = Tr ? RowMajorBit : 0,
-
-    CoeffReadCost = LhsCoeffReadCost + RhsCoeffReadCost + NumTraits<Scalar>::MulCost
-  };
-};
-
-} // end namespace internal
-
-template<typename Lhs, typename Rhs, bool Tr>
-class SparseDenseOuterProduct
- : public SparseMatrixBase<SparseDenseOuterProduct<Lhs,Rhs,Tr> >
-{
-  public:
-
-    typedef SparseMatrixBase<SparseDenseOuterProduct> Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(SparseDenseOuterProduct)
-    typedef internal::traits<SparseDenseOuterProduct> Traits;
-
-  private:
-
-    typedef typename Traits::LhsNested LhsNested;
-    typedef typename Traits::RhsNested RhsNested;
-    typedef typename Traits::_LhsNested _LhsNested;
-    typedef typename Traits::_RhsNested _RhsNested;
-
-  public:
-
-    class InnerIterator;
-
-    EIGEN_STRONG_INLINE SparseDenseOuterProduct(const Lhs& lhs, const Rhs& rhs)
-      : m_lhs(lhs), m_rhs(rhs)
-    {
-      EIGEN_STATIC_ASSERT(!Tr,YOU_MADE_A_PROGRAMMING_MISTAKE);
-    }
-
-    EIGEN_STRONG_INLINE SparseDenseOuterProduct(const Rhs& rhs, const Lhs& lhs)
-      : m_lhs(lhs), m_rhs(rhs)
-    {
-      EIGEN_STATIC_ASSERT(Tr,YOU_MADE_A_PROGRAMMING_MISTAKE);
-    }
-
-    EIGEN_STRONG_INLINE Index rows() const { return Tr ? m_rhs.rows() : m_lhs.rows(); }
-    EIGEN_STRONG_INLINE Index cols() const { return Tr ? m_lhs.cols() : m_rhs.cols(); }
-
-    EIGEN_STRONG_INLINE const _LhsNested& lhs() const { return m_lhs; }
-    EIGEN_STRONG_INLINE const _RhsNested& rhs() const { return m_rhs; }
-
-  protected:
-    LhsNested m_lhs;
-    RhsNested m_rhs;
-};
-
-template<typename Lhs, typename Rhs, bool Transpose>
-class SparseDenseOuterProduct<Lhs,Rhs,Transpose>::InnerIterator : public _LhsNested::InnerIterator
-{
-    typedef typename _LhsNested::InnerIterator Base;
-    typedef typename SparseDenseOuterProduct::Index Index;
-  public:
-    EIGEN_STRONG_INLINE InnerIterator(const SparseDenseOuterProduct& prod, Index outer)
-      : Base(prod.lhs(), 0), m_outer(outer), m_factor(get(prod.rhs(), outer, typename internal::traits<Rhs>::StorageKind() ))
-    { }
-
-    inline Index outer() const { return m_outer; }
-    inline Index row() const { return Transpose ? m_outer : Base::index(); }
-    inline Index col() const { return Transpose ? Base::index() : m_outer; }
-
-    inline Scalar value() const { return Base::value() * m_factor; }
-
-  protected:
-    static Scalar get(const _RhsNested &rhs, Index outer, Dense = Dense())
-    {
-      return rhs.coeff(outer);
-    }
-    
-    static Scalar get(const _RhsNested &rhs, Index outer, Sparse = Sparse())
-    {
-      typename Traits::_RhsNested::InnerIterator it(rhs, outer);
-      if (it && it.index()==0)
-        return it.value();
-      
-      return Scalar(0);
-    }
-    
-    Index m_outer;
-    Scalar m_factor;
-};
-
-namespace internal {
-template<typename Lhs, typename Rhs>
-struct traits<SparseTimeDenseProduct<Lhs,Rhs> >
- : traits<ProductBase<SparseTimeDenseProduct<Lhs,Rhs>, Lhs, Rhs> >
-{
-  typedef Dense StorageKind;
-  typedef MatrixXpr XprKind;
-};
+template <> struct product_promote_storage_type<Sparse,Dense, OuterProduct> { typedef Sparse ret; };
+template <> struct product_promote_storage_type<Dense,Sparse, OuterProduct> { typedef Sparse ret; };
 
 template<typename SparseLhsType, typename DenseRhsType, typename DenseResType,
+         typename AlphaType,
          int LhsStorageOrder = ((SparseLhsType::Flags&RowMajorBit)==RowMajorBit) ? RowMajor : ColMajor,
          bool ColPerCol = ((DenseRhsType::Flags&RowMajorBit)==0) || DenseRhsType::ColsAtCompileTime==1>
 struct sparse_time_dense_product_impl;
 
 template<typename SparseLhsType, typename DenseRhsType, typename DenseResType>
-struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, RowMajor, true>
+struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, typename DenseResType::Scalar, RowMajor, true>
 {
   typedef typename internal::remove_all<SparseLhsType>::type Lhs;
   typedef typename internal::remove_all<DenseRhsType>::type Rhs;
   typedef typename internal::remove_all<DenseResType>::type Res;
-  typedef typename Lhs::Index Index;
-  typedef typename Lhs::InnerIterator LhsInnerIterator;
+  typedef typename evaluator<Lhs>::InnerIterator LhsInnerIterator;
+  typedef evaluator<Lhs> LhsEval;
   static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha)
   {
+    LhsEval lhsEval(lhs);
+    
+    Index n = lhs.outerSize();
+#ifdef EIGEN_HAS_OPENMP
+    Eigen::initParallel();
+    Index threads = Eigen::nbThreads();
+#endif
+    
     for(Index c=0; c<rhs.cols(); ++c)
     {
-      Index n = lhs.outerSize();
-      for(Index j=0; j<n; ++j)
+#ifdef EIGEN_HAS_OPENMP
+      // This 20000 threshold has been found experimentally on 2D and 3D Poisson problems.
+      // It basically represents the minimal amount of work to be done to be worth it.
+      if(threads>1 && lhsEval.nonZerosEstimate() > 20000)
+      {
+        #pragma omp parallel for schedule(dynamic,(n+threads*4-1)/(threads*4)) num_threads(threads)
+        for(Index i=0; i<n; ++i)
+          processRow(lhsEval,rhs,res,alpha,i,c);
+      }
+      else
+#endif
       {
-        typename Res::Scalar tmp(0);
-        for(LhsInnerIterator it(lhs,j); it ;++it)
-          tmp += it.value() * rhs.coeff(it.index(),c);
-        res.coeffRef(j,c) += alpha * tmp;
+        for(Index i=0; i<n; ++i)
+          processRow(lhsEval,rhs,res,alpha,i,c);
       }
     }
   }
+  
+  static void processRow(const LhsEval& lhsEval, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha, Index i, Index col)
+  {
+    typename Res::Scalar tmp(0);
+    for(LhsInnerIterator it(lhsEval,i); it ;++it)
+      tmp += it.value() * rhs.coeff(it.index(),col);
+    res.coeffRef(i,col) += alpha * tmp;
+  }
+  
 };
 
-template<typename SparseLhsType, typename DenseRhsType, typename DenseResType>
-struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, ColMajor, true>
+// FIXME: what is the purpose of the following specialization? Is it for the BlockedSparse format?
+// -> let's disable it for now as it is conflicting with generic scalar*matrix and matrix*scalar operators
+// template<typename T1, typename T2/*, int _Options, typename _StrideType*/>
+// struct ScalarBinaryOpTraits<T1, Ref<T2/*, _Options, _StrideType*/> >
+// {
+//   enum {
+//     Defined = 1
+//   };
+//   typedef typename CwiseUnaryOp<scalar_multiple2_op<T1, typename T2::Scalar>, T2>::PlainObject ReturnType;
+// };
+
+template<typename SparseLhsType, typename DenseRhsType, typename DenseResType, typename AlphaType>
+struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, AlphaType, ColMajor, true>
 {
   typedef typename internal::remove_all<SparseLhsType>::type Lhs;
   typedef typename internal::remove_all<DenseRhsType>::type Rhs;
   typedef typename internal::remove_all<DenseResType>::type Res;
-  typedef typename Lhs::InnerIterator LhsInnerIterator;
-  typedef typename Lhs::Index Index;
-  static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha)
+  typedef typename evaluator<Lhs>::InnerIterator LhsInnerIterator;
+  static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const AlphaType& alpha)
   {
+    evaluator<Lhs> lhsEval(lhs);
     for(Index c=0; c<rhs.cols(); ++c)
     {
       for(Index j=0; j<lhs.outerSize(); ++j)
       {
-        typename Res::Scalar rhs_j = alpha * rhs.coeff(j,c);
-        for(LhsInnerIterator it(lhs,j); it ;++it)
+//        typename Res::Scalar rhs_j = alpha * rhs.coeff(j,c);
+        typename ScalarBinaryOpTraits<AlphaType, typename Rhs::Scalar>::ReturnType rhs_j(alpha * rhs.coeff(j,c));
+        for(LhsInnerIterator it(lhsEval,j); it ;++it)
           res.coeffRef(it.index(),c) += it.value() * rhs_j;
       }
     }
@@ -209,38 +106,38 @@ struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, C
 };
 
 template<typename SparseLhsType, typename DenseRhsType, typename DenseResType>
-struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, RowMajor, false>
+struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, typename DenseResType::Scalar, RowMajor, false>
 {
   typedef typename internal::remove_all<SparseLhsType>::type Lhs;
   typedef typename internal::remove_all<DenseRhsType>::type Rhs;
   typedef typename internal::remove_all<DenseResType>::type Res;
-  typedef typename Lhs::InnerIterator LhsInnerIterator;
-  typedef typename Lhs::Index Index;
+  typedef typename evaluator<Lhs>::InnerIterator LhsInnerIterator;
   static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha)
   {
+    evaluator<Lhs> lhsEval(lhs);
     for(Index j=0; j<lhs.outerSize(); ++j)
     {
       typename Res::RowXpr res_j(res.row(j));
-      for(LhsInnerIterator it(lhs,j); it ;++it)
+      for(LhsInnerIterator it(lhsEval,j); it ;++it)
         res_j += (alpha*it.value()) * rhs.row(it.index());
     }
   }
 };
 
 template<typename SparseLhsType, typename DenseRhsType, typename DenseResType>
-struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, ColMajor, false>
+struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, typename DenseResType::Scalar, ColMajor, false>
 {
   typedef typename internal::remove_all<SparseLhsType>::type Lhs;
   typedef typename internal::remove_all<DenseRhsType>::type Rhs;
   typedef typename internal::remove_all<DenseResType>::type Res;
-  typedef typename Lhs::InnerIterator LhsInnerIterator;
-  typedef typename Lhs::Index Index;
+  typedef typename evaluator<Lhs>::InnerIterator LhsInnerIterator;
   static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha)
   {
+    evaluator<Lhs> lhsEval(lhs);
     for(Index j=0; j<lhs.outerSize(); ++j)
     {
       typename Rhs::ConstRowXpr rhs_j(rhs.row(j));
-      for(LhsInnerIterator it(lhs,j); it ;++it)
+      for(LhsInnerIterator it(lhsEval,j); it ;++it)
         res.row(it.index()) += (alpha*it.value()) * rhs_j;
     }
   }
@@ -249,63 +146,175 @@ struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, C
 template<typename SparseLhsType, typename DenseRhsType, typename DenseResType,typename AlphaType>
 inline void sparse_time_dense_product(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const AlphaType& alpha)
 {
-  sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType>::run(lhs, rhs, res, alpha);
+  sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, AlphaType>::run(lhs, rhs, res, alpha);
 }
 
 } // end namespace internal
 
-template<typename Lhs, typename Rhs>
-class SparseTimeDenseProduct
-  : public ProductBase<SparseTimeDenseProduct<Lhs,Rhs>, Lhs, Rhs>
-{
-  public:
-    EIGEN_PRODUCT_PUBLIC_INTERFACE(SparseTimeDenseProduct)
-
-    SparseTimeDenseProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs)
-    {}
-
-    template<typename Dest> void scaleAndAddTo(Dest& dest, const Scalar& alpha) const
-    {
-      internal::sparse_time_dense_product(m_lhs, m_rhs, dest, alpha);
-    }
+namespace internal {
 
-  private:
-    SparseTimeDenseProduct& operator=(const SparseTimeDenseProduct&);
+template<typename Lhs, typename Rhs, int ProductType>
+struct generic_product_impl<Lhs, Rhs, SparseShape, DenseShape, ProductType>
+ : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,SparseShape,DenseShape,ProductType> >
+{
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+  
+  template<typename Dest>
+  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  {
+    typedef typename nested_eval<Lhs,((Rhs::Flags&RowMajorBit)==0) ? 1 : Rhs::ColsAtCompileTime>::type LhsNested;
+    typedef typename nested_eval<Rhs,((Lhs::Flags&RowMajorBit)==0) ? 1 : Dynamic>::type RhsNested;
+    LhsNested lhsNested(lhs);
+    RhsNested rhsNested(rhs);
+    internal::sparse_time_dense_product(lhsNested, rhsNested, dst, alpha);
+  }
 };
 
+template<typename Lhs, typename Rhs, int ProductType>
+struct generic_product_impl<Lhs, Rhs, SparseTriangularShape, DenseShape, ProductType>
+  : generic_product_impl<Lhs, Rhs, SparseShape, DenseShape, ProductType>
+{};
 
-// dense = dense * sparse
-namespace internal {
-template<typename Lhs, typename Rhs>
-struct traits<DenseTimeSparseProduct<Lhs,Rhs> >
- : traits<ProductBase<DenseTimeSparseProduct<Lhs,Rhs>, Lhs, Rhs> >
+template<typename Lhs, typename Rhs, int ProductType>
+struct generic_product_impl<Lhs, Rhs, DenseShape, SparseShape, ProductType>
+  : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,SparseShape,ProductType> >
 {
-  typedef Dense StorageKind;
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+  
+  template<typename Dst>
+  static void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  {
+    typedef typename nested_eval<Lhs,((Rhs::Flags&RowMajorBit)==0) ? Dynamic : 1>::type LhsNested;
+    typedef typename nested_eval<Rhs,((Lhs::Flags&RowMajorBit)==RowMajorBit) ? 1 : Lhs::RowsAtCompileTime>::type RhsNested;
+    LhsNested lhsNested(lhs);
+    RhsNested rhsNested(rhs);
+    
+    // transpose everything
+    Transpose<Dst> dstT(dst);
+    internal::sparse_time_dense_product(rhsNested.transpose(), lhsNested.transpose(), dstT, alpha);
+  }
 };
-} // end namespace internal
 
-template<typename Lhs, typename Rhs>
-class DenseTimeSparseProduct
-  : public ProductBase<DenseTimeSparseProduct<Lhs,Rhs>, Lhs, Rhs>
+template<typename Lhs, typename Rhs, int ProductType>
+struct generic_product_impl<Lhs, Rhs, DenseShape, SparseTriangularShape, ProductType>
+  : generic_product_impl<Lhs, Rhs, DenseShape, SparseShape, ProductType>
+{};
+
+template<typename LhsT, typename RhsT, bool NeedToTranspose>
+struct sparse_dense_outer_product_evaluator
 {
+protected:
+  typedef typename conditional<NeedToTranspose,RhsT,LhsT>::type Lhs1;
+  typedef typename conditional<NeedToTranspose,LhsT,RhsT>::type ActualRhs;
+  typedef Product<LhsT,RhsT,DefaultProduct> ProdXprType;
+  
+  // if the actual left-hand side is a dense vector,
+  // then build a sparse-view so that we can seamlessly iterate over it.
+  typedef typename conditional<is_same<typename internal::traits<Lhs1>::StorageKind,Sparse>::value,
+            Lhs1, SparseView<Lhs1> >::type ActualLhs;
+  typedef typename conditional<is_same<typename internal::traits<Lhs1>::StorageKind,Sparse>::value,
+            Lhs1 const&, SparseView<Lhs1> >::type LhsArg;
+            
+  typedef evaluator<ActualLhs> LhsEval;
+  typedef evaluator<ActualRhs> RhsEval;
+  typedef typename evaluator<ActualLhs>::InnerIterator LhsIterator;
+  typedef typename ProdXprType::Scalar Scalar;
+  
+public:
+  enum {
+    Flags = NeedToTranspose ? RowMajorBit : 0,
+    CoeffReadCost = HugeCost
+  };
+  
+  class InnerIterator : public LhsIterator
+  {
   public:
-    EIGEN_PRODUCT_PUBLIC_INTERFACE(DenseTimeSparseProduct)
-
-    DenseTimeSparseProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs)
+    InnerIterator(const sparse_dense_outer_product_evaluator &xprEval, Index outer)
+      : LhsIterator(xprEval.m_lhsXprImpl, 0),
+        m_outer(outer),
+        m_empty(false),
+        m_factor(get(xprEval.m_rhsXprImpl, outer, typename internal::traits<ActualRhs>::StorageKind() ))
     {}
+    
+    EIGEN_STRONG_INLINE Index outer() const { return m_outer; }
+    EIGEN_STRONG_INLINE Index row()   const { return NeedToTranspose ? m_outer : LhsIterator::index(); }
+    EIGEN_STRONG_INLINE Index col()   const { return NeedToTranspose ? LhsIterator::index() : m_outer; }
 
-    template<typename Dest> void scaleAndAddTo(Dest& dest, const Scalar& alpha) const
+    EIGEN_STRONG_INLINE Scalar value() const { return LhsIterator::value() * m_factor; }
+    EIGEN_STRONG_INLINE operator bool() const { return LhsIterator::operator bool() && (!m_empty); }
+    
+  protected:
+    Scalar get(const RhsEval &rhs, Index outer, Dense = Dense()) const
+    {
+      return rhs.coeff(outer);
+    }
+    
+    Scalar get(const RhsEval &rhs, Index outer, Sparse = Sparse())
     {
-      Transpose<const _LhsNested> lhs_t(m_lhs);
-      Transpose<const _RhsNested> rhs_t(m_rhs);
-      Transpose<Dest> dest_t(dest);
-      internal::sparse_time_dense_product(rhs_t, lhs_t, dest_t, alpha);
+      typename RhsEval::InnerIterator it(rhs, outer);
+      if (it && it.index()==0 && it.value()!=Scalar(0))
+        return it.value();
+      m_empty = true;
+      return Scalar(0);
     }
+    
+    Index m_outer;
+    bool m_empty;
+    Scalar m_factor;
+  };
+  
+  sparse_dense_outer_product_evaluator(const Lhs1 &lhs, const ActualRhs &rhs)
+     : m_lhs(lhs), m_lhsXprImpl(m_lhs), m_rhsXprImpl(rhs)
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+  
+  // transpose case
+  sparse_dense_outer_product_evaluator(const ActualRhs &rhs, const Lhs1 &lhs)
+     : m_lhs(lhs), m_lhsXprImpl(m_lhs), m_rhsXprImpl(rhs)
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+    
+protected:
+  const LhsArg m_lhs;
+  evaluator<ActualLhs> m_lhsXprImpl;
+  evaluator<ActualRhs> m_rhsXprImpl;
+};
 
-  private:
-    DenseTimeSparseProduct& operator=(const DenseTimeSparseProduct&);
+// sparse * dense outer product
+template<typename Lhs, typename Rhs>
+struct product_evaluator<Product<Lhs, Rhs, DefaultProduct>, OuterProduct, SparseShape, DenseShape>
+  : sparse_dense_outer_product_evaluator<Lhs,Rhs, Lhs::IsRowMajor>
+{
+  typedef sparse_dense_outer_product_evaluator<Lhs,Rhs, Lhs::IsRowMajor> Base;
+  
+  typedef Product<Lhs, Rhs> XprType;
+  typedef typename XprType::PlainObject PlainObject;
+
+  explicit product_evaluator(const XprType& xpr)
+    : Base(xpr.lhs(), xpr.rhs())
+  {}
+  
 };
 
+template<typename Lhs, typename Rhs>
+struct product_evaluator<Product<Lhs, Rhs, DefaultProduct>, OuterProduct, DenseShape, SparseShape>
+  : sparse_dense_outer_product_evaluator<Lhs,Rhs, Rhs::IsRowMajor>
+{
+  typedef sparse_dense_outer_product_evaluator<Lhs,Rhs, Rhs::IsRowMajor> Base;
+  
+  typedef Product<Lhs, Rhs> XprType;
+  typedef typename XprType::PlainObject PlainObject;
+
+  explicit product_evaluator(const XprType& xpr)
+    : Base(xpr.lhs(), xpr.rhs())
+  {}
+  
+};
+
+} // end namespace internal
+
 } // end namespace Eigen
 
 #endif // EIGEN_SPARSEDENSEPRODUCT_H
diff --git a/Eigen/src/SparseCore/SparseDiagonalProduct.h b/Eigen/src/SparseCore/SparseDiagonalProduct.h
index 1bb590e64..941c03be3 100644
--- a/Eigen/src/SparseCore/SparseDiagonalProduct.h
+++ b/Eigen/src/SparseCore/SparseDiagonalProduct.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2009-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -26,171 +26,113 @@ namespace Eigen {
 
 namespace internal {
 
-template<typename Lhs, typename Rhs>
-struct traits<SparseDiagonalProduct<Lhs, Rhs> >
-{
-  typedef typename remove_all<Lhs>::type _Lhs;
-  typedef typename remove_all<Rhs>::type _Rhs;
-  typedef typename _Lhs::Scalar Scalar;
-  typedef typename promote_index_type<typename traits<Lhs>::Index,
-                                         typename traits<Rhs>::Index>::type Index;
-  typedef Sparse StorageKind;
-  typedef MatrixXpr XprKind;
-  enum {
-    RowsAtCompileTime = _Lhs::RowsAtCompileTime,
-    ColsAtCompileTime = _Rhs::ColsAtCompileTime,
-
-    MaxRowsAtCompileTime = _Lhs::MaxRowsAtCompileTime,
-    MaxColsAtCompileTime = _Rhs::MaxColsAtCompileTime,
-
-    SparseFlags = is_diagonal<_Lhs>::ret ? int(_Rhs::Flags) : int(_Lhs::Flags),
-    Flags = (SparseFlags&RowMajorBit),
-    CoeffReadCost = Dynamic
-  };
+enum {
+  SDP_AsScalarProduct,
+  SDP_AsCwiseProduct
 };
+  
+template<typename SparseXprType, typename DiagonalCoeffType, int SDP_Tag>
+struct sparse_diagonal_product_evaluator;
 
-enum {SDP_IsDiagonal, SDP_IsSparseRowMajor, SDP_IsSparseColMajor};
-template<typename Lhs, typename Rhs, typename SparseDiagonalProductType, int RhsMode, int LhsMode>
-class sparse_diagonal_product_inner_iterator_selector;
-
-} // end namespace internal
-
-template<typename Lhs, typename Rhs>
-class SparseDiagonalProduct
-  : public SparseMatrixBase<SparseDiagonalProduct<Lhs,Rhs> >,
-    internal::no_assignment_operator
+template<typename Lhs, typename Rhs, int ProductTag>
+struct product_evaluator<Product<Lhs, Rhs, DefaultProduct>, ProductTag, DiagonalShape, SparseShape>
+  : public sparse_diagonal_product_evaluator<Rhs, typename Lhs::DiagonalVectorType, Rhs::Flags&RowMajorBit?SDP_AsScalarProduct:SDP_AsCwiseProduct>
 {
-    typedef typename Lhs::Nested LhsNested;
-    typedef typename Rhs::Nested RhsNested;
-
-    typedef typename internal::remove_all<LhsNested>::type _LhsNested;
-    typedef typename internal::remove_all<RhsNested>::type _RhsNested;
-
-    enum {
-      LhsMode = internal::is_diagonal<_LhsNested>::ret ? internal::SDP_IsDiagonal
-              : (_LhsNested::Flags&RowMajorBit) ? internal::SDP_IsSparseRowMajor : internal::SDP_IsSparseColMajor,
-      RhsMode = internal::is_diagonal<_RhsNested>::ret ? internal::SDP_IsDiagonal
-              : (_RhsNested::Flags&RowMajorBit) ? internal::SDP_IsSparseRowMajor : internal::SDP_IsSparseColMajor
-    };
-
-  public:
-
-    EIGEN_SPARSE_PUBLIC_INTERFACE(SparseDiagonalProduct)
-
-    typedef internal::sparse_diagonal_product_inner_iterator_selector
-                      <_LhsNested,_RhsNested,SparseDiagonalProduct,LhsMode,RhsMode> InnerIterator;
-    
-    // We do not want ReverseInnerIterator for diagonal-sparse products,
-    // but this dummy declaration is needed to make diag * sparse * diag compile.
-    class ReverseInnerIterator;
-
-    EIGEN_STRONG_INLINE SparseDiagonalProduct(const Lhs& lhs, const Rhs& rhs)
-      : m_lhs(lhs), m_rhs(rhs)
-    {
-      eigen_assert(lhs.cols() == rhs.rows() && "invalid sparse matrix * diagonal matrix product");
-    }
-
-    EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); }
-    EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); }
-
-    EIGEN_STRONG_INLINE const _LhsNested& lhs() const { return m_lhs; }
-    EIGEN_STRONG_INLINE const _RhsNested& rhs() const { return m_rhs; }
-
-  protected:
-    LhsNested m_lhs;
-    RhsNested m_rhs;
+  typedef Product<Lhs, Rhs, DefaultProduct> XprType;
+  enum { CoeffReadCost = HugeCost, Flags = Rhs::Flags&RowMajorBit, Alignment = 0 }; // FIXME CoeffReadCost & Flags
+  
+  typedef sparse_diagonal_product_evaluator<Rhs, typename Lhs::DiagonalVectorType, Rhs::Flags&RowMajorBit?SDP_AsScalarProduct:SDP_AsCwiseProduct> Base;
+  explicit product_evaluator(const XprType& xpr) : Base(xpr.rhs(), xpr.lhs().diagonal()) {}
 };
 
-namespace internal {
-
-template<typename Lhs, typename Rhs, typename SparseDiagonalProductType>
-class sparse_diagonal_product_inner_iterator_selector
-<Lhs,Rhs,SparseDiagonalProductType,SDP_IsDiagonal,SDP_IsSparseRowMajor>
-  : public CwiseUnaryOp<scalar_multiple_op<typename Lhs::Scalar>,const Rhs>::InnerIterator
+template<typename Lhs, typename Rhs, int ProductTag>
+struct product_evaluator<Product<Lhs, Rhs, DefaultProduct>, ProductTag, SparseShape, DiagonalShape>
+  : public sparse_diagonal_product_evaluator<Lhs, Transpose<const typename Rhs::DiagonalVectorType>, Lhs::Flags&RowMajorBit?SDP_AsCwiseProduct:SDP_AsScalarProduct>
 {
-    typedef typename CwiseUnaryOp<scalar_multiple_op<typename Lhs::Scalar>,const Rhs>::InnerIterator Base;
-    typedef typename Lhs::Index Index;
-  public:
-    inline sparse_diagonal_product_inner_iterator_selector(
-              const SparseDiagonalProductType& expr, Index outer)
-      : Base(expr.rhs()*(expr.lhs().diagonal().coeff(outer)), outer)
-    {}
+  typedef Product<Lhs, Rhs, DefaultProduct> XprType;
+  enum { CoeffReadCost = HugeCost, Flags = Lhs::Flags&RowMajorBit, Alignment = 0 }; // FIXME CoeffReadCost & Flags
+  
+  typedef sparse_diagonal_product_evaluator<Lhs, Transpose<const typename Rhs::DiagonalVectorType>, Lhs::Flags&RowMajorBit?SDP_AsCwiseProduct:SDP_AsScalarProduct> Base;
+  explicit product_evaluator(const XprType& xpr) : Base(xpr.lhs(), xpr.rhs().diagonal().transpose()) {}
 };
 
-template<typename Lhs, typename Rhs, typename SparseDiagonalProductType>
-class sparse_diagonal_product_inner_iterator_selector
-<Lhs,Rhs,SparseDiagonalProductType,SDP_IsDiagonal,SDP_IsSparseColMajor>
-  : public CwiseBinaryOp<
-      scalar_product_op<typename Lhs::Scalar>,
-      const typename Rhs::ConstInnerVectorReturnType,
-      const typename Lhs::DiagonalVectorType>::InnerIterator
+template<typename SparseXprType, typename DiagonalCoeffType>
+struct sparse_diagonal_product_evaluator<SparseXprType, DiagonalCoeffType, SDP_AsScalarProduct>
 {
-    typedef typename CwiseBinaryOp<
-      scalar_product_op<typename Lhs::Scalar>,
-      const typename Rhs::ConstInnerVectorReturnType,
-      const typename Lhs::DiagonalVectorType>::InnerIterator Base;
-    typedef typename Lhs::Index Index;
-    Index m_outer;
+protected:
+  typedef typename evaluator<SparseXprType>::InnerIterator SparseXprInnerIterator;
+  typedef typename SparseXprType::Scalar Scalar;
+  
+public:
+  class InnerIterator : public SparseXprInnerIterator
+  {
   public:
-    inline sparse_diagonal_product_inner_iterator_selector(
-              const SparseDiagonalProductType& expr, Index outer)
-      : Base(expr.rhs().innerVector(outer) .cwiseProduct(expr.lhs().diagonal()), 0), m_outer(outer)
+    InnerIterator(const sparse_diagonal_product_evaluator &xprEval, Index outer)
+      : SparseXprInnerIterator(xprEval.m_sparseXprImpl, outer),
+        m_coeff(xprEval.m_diagCoeffImpl.coeff(outer))
     {}
     
-    inline Index outer() const { return m_outer; }
-    inline Index col() const { return m_outer; }
-};
+    EIGEN_STRONG_INLINE Scalar value() const { return m_coeff * SparseXprInnerIterator::value(); }
+  protected:
+    typename DiagonalCoeffType::Scalar m_coeff;
+  };
+  
+  sparse_diagonal_product_evaluator(const SparseXprType &sparseXpr, const DiagonalCoeffType &diagCoeff)
+    : m_sparseXprImpl(sparseXpr), m_diagCoeffImpl(diagCoeff)
+  {}
 
-template<typename Lhs, typename Rhs, typename SparseDiagonalProductType>
-class sparse_diagonal_product_inner_iterator_selector
-<Lhs,Rhs,SparseDiagonalProductType,SDP_IsSparseColMajor,SDP_IsDiagonal>
-  : public CwiseUnaryOp<scalar_multiple_op<typename Rhs::Scalar>,const Lhs>::InnerIterator
-{
-    typedef typename CwiseUnaryOp<scalar_multiple_op<typename Rhs::Scalar>,const Lhs>::InnerIterator Base;
-    typedef typename Lhs::Index Index;
-  public:
-    inline sparse_diagonal_product_inner_iterator_selector(
-              const SparseDiagonalProductType& expr, Index outer)
-      : Base(expr.lhs()*expr.rhs().diagonal().coeff(outer), outer)
-    {}
+  Index nonZerosEstimate() const { return m_sparseXprImpl.nonZerosEstimate(); }
+    
+protected:
+  evaluator<SparseXprType> m_sparseXprImpl;
+  evaluator<DiagonalCoeffType> m_diagCoeffImpl;
 };
 
-template<typename Lhs, typename Rhs, typename SparseDiagonalProductType>
-class sparse_diagonal_product_inner_iterator_selector
-<Lhs,Rhs,SparseDiagonalProductType,SDP_IsSparseRowMajor,SDP_IsDiagonal>
-  : public CwiseBinaryOp<
-      scalar_product_op<typename Rhs::Scalar>,
-      const typename Lhs::ConstInnerVectorReturnType,
-      const Transpose<const typename Rhs::DiagonalVectorType> >::InnerIterator
+
+template<typename SparseXprType, typename DiagCoeffType>
+struct sparse_diagonal_product_evaluator<SparseXprType, DiagCoeffType, SDP_AsCwiseProduct>
 {
-    typedef typename CwiseBinaryOp<
-      scalar_product_op<typename Rhs::Scalar>,
-      const typename Lhs::ConstInnerVectorReturnType,
-      const Transpose<const typename Rhs::DiagonalVectorType> >::InnerIterator Base;
-    typedef typename Lhs::Index Index;
-    Index m_outer;
+  typedef typename SparseXprType::Scalar Scalar;
+  typedef typename SparseXprType::StorageIndex StorageIndex;
+  
+  typedef typename nested_eval<DiagCoeffType,SparseXprType::IsRowMajor ? SparseXprType::RowsAtCompileTime
+                                                                       : SparseXprType::ColsAtCompileTime>::type DiagCoeffNested;
+  
+  class InnerIterator
+  {
+    typedef typename evaluator<SparseXprType>::InnerIterator SparseXprIter;
   public:
-    inline sparse_diagonal_product_inner_iterator_selector(
-              const SparseDiagonalProductType& expr, Index outer)
-      : Base(expr.lhs().innerVector(outer) .cwiseProduct(expr.rhs().diagonal().transpose()), 0), m_outer(outer)
+    InnerIterator(const sparse_diagonal_product_evaluator &xprEval, Index outer)
+      : m_sparseIter(xprEval.m_sparseXprEval, outer), m_diagCoeffNested(xprEval.m_diagCoeffNested)
     {}
     
-    inline Index outer() const { return m_outer; }
-    inline Index row() const { return m_outer; }
+    inline Scalar value() const { return m_sparseIter.value() * m_diagCoeffNested.coeff(index()); }
+    inline StorageIndex index() const  { return m_sparseIter.index(); }
+    inline Index outer() const  { return m_sparseIter.outer(); }
+    inline Index col() const    { return SparseXprType::IsRowMajor ? m_sparseIter.index() : m_sparseIter.outer(); }
+    inline Index row() const    { return SparseXprType::IsRowMajor ? m_sparseIter.outer() : m_sparseIter.index(); }
+    
+    EIGEN_STRONG_INLINE InnerIterator& operator++() { ++m_sparseIter; return *this; }
+    inline operator bool() const  { return m_sparseIter; }
+    
+  protected:
+    SparseXprIter m_sparseIter;
+    DiagCoeffNested m_diagCoeffNested;
+  };
+  
+  sparse_diagonal_product_evaluator(const SparseXprType &sparseXpr, const DiagCoeffType &diagCoeff)
+    : m_sparseXprEval(sparseXpr), m_diagCoeffNested(diagCoeff)
+  {}
+
+  Index nonZerosEstimate() const { return m_sparseXprEval.nonZerosEstimate(); }
+    
+protected:
+  evaluator<SparseXprType> m_sparseXprEval;
+  DiagCoeffNested m_diagCoeffNested;
 };
 
 } // end namespace internal
 
-// SparseMatrixBase functions
-
-template<typename Derived>
-template<typename OtherDerived>
-const SparseDiagonalProduct<Derived,OtherDerived>
-SparseMatrixBase<Derived>::operator*(const DiagonalBase<OtherDerived> &other) const
-{
-  return SparseDiagonalProduct<Derived,OtherDerived>(this->derived(), other.derived());
-}
-
 } // end namespace Eigen
 
 #endif // EIGEN_SPARSE_DIAGONAL_PRODUCT_H
diff --git a/Eigen/src/SparseCore/SparseDot.h b/Eigen/src/SparseCore/SparseDot.h
index db39c9aec..38bc4aa9e 100644
--- a/Eigen/src/SparseCore/SparseDot.h
+++ b/Eigen/src/SparseCore/SparseDot.h
@@ -26,7 +26,8 @@ SparseMatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const
   eigen_assert(size() == other.size());
   eigen_assert(other.size()>0 && "you are using a non initialized vector");
 
-  typename Derived::InnerIterator i(derived(),0);
+  internal::evaluator<Derived> thisEval(derived());
+  typename internal::evaluator<Derived>::InnerIterator i(thisEval, 0);
   Scalar res(0);
   while (i)
   {
@@ -49,16 +50,12 @@ SparseMatrixBase<Derived>::dot(const SparseMatrixBase<OtherDerived>& other) cons
 
   eigen_assert(size() == other.size());
 
-  typedef typename Derived::Nested  Nested;
-  typedef typename OtherDerived::Nested  OtherNested;
-  typedef typename internal::remove_all<Nested>::type  NestedCleaned;
-  typedef typename internal::remove_all<OtherNested>::type  OtherNestedCleaned;
+  internal::evaluator<Derived> thisEval(derived());
+  typename internal::evaluator<Derived>::InnerIterator i(thisEval, 0);
+  
+  internal::evaluator<OtherDerived>  otherEval(other.derived());
+  typename internal::evaluator<OtherDerived>::InnerIterator j(otherEval, 0);
 
-  Nested nthis(derived());
-  OtherNested nother(other.derived());
-
-  typename NestedCleaned::InnerIterator i(nthis,0);
-  typename OtherNestedCleaned::InnerIterator j(nother,0);
   Scalar res(0);
   while (i && j)
   {
diff --git a/Eigen/src/SparseCore/SparseFuzzy.h b/Eigen/src/SparseCore/SparseFuzzy.h
index 45f36e9eb..7d47eb94d 100644
--- a/Eigen/src/SparseCore/SparseFuzzy.h
+++ b/Eigen/src/SparseCore/SparseFuzzy.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,17 +10,20 @@
 #ifndef EIGEN_SPARSE_FUZZY_H
 #define EIGEN_SPARSE_FUZZY_H
 
-// template<typename Derived>
-// template<typename OtherDerived>
-// bool SparseMatrixBase<Derived>::isApprox(
-//   const OtherDerived& other,
-//   typename NumTraits<Scalar>::Real prec
-// ) const
-// {
-//   const typename internal::nested<Derived,2>::type nested(derived());
-//   const typename internal::nested<OtherDerived,2>::type otherNested(other.derived());
-//   return    (nested - otherNested).cwise().abs2().sum()
-//          <= prec * prec * (std::min)(nested.cwise().abs2().sum(), otherNested.cwise().abs2().sum());
-// }
+namespace Eigen {
+  
+template<typename Derived>
+template<typename OtherDerived>
+bool SparseMatrixBase<Derived>::isApprox(const SparseMatrixBase<OtherDerived>& other, const RealScalar &prec) const
+{
+  const typename internal::nested_eval<Derived,2,PlainObject>::type actualA(derived());
+  typename internal::conditional<bool(IsRowMajor)==bool(OtherDerived::IsRowMajor),
+    const typename internal::nested_eval<OtherDerived,2,PlainObject>::type,
+    const PlainObject>::type actualB(other.derived());
+
+  return (actualA - actualB).squaredNorm() <= prec * prec * numext::mini(actualA.squaredNorm(), actualB.squaredNorm());
+}
+
+} // end namespace Eigen
 
 #endif // EIGEN_SPARSE_FUZZY_H
diff --git a/Eigen/src/SparseCore/SparseMap.h b/Eigen/src/SparseCore/SparseMap.h
new file mode 100644
index 000000000..f99be3379
--- /dev/null
+++ b/Eigen/src/SparseCore/SparseMap.h
@@ -0,0 +1,305 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSE_MAP_H
+#define EIGEN_SPARSE_MAP_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+struct traits<Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
+  : public traits<SparseMatrix<MatScalar,MatOptions,MatIndex> >
+{
+  typedef SparseMatrix<MatScalar,MatOptions,MatIndex> PlainObjectType;
+  typedef traits<PlainObjectType> TraitsBase;
+  enum {
+    Flags = TraitsBase::Flags & (~NestByRefBit)
+  };
+};
+
+template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+struct traits<Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
+  : public traits<SparseMatrix<MatScalar,MatOptions,MatIndex> >
+{
+  typedef SparseMatrix<MatScalar,MatOptions,MatIndex> PlainObjectType;
+  typedef traits<PlainObjectType> TraitsBase;
+  enum {
+    Flags = TraitsBase::Flags & (~ (NestByRefBit | LvalueBit))
+  };
+};
+
+} // end namespace internal
+
+template<typename Derived,
+         int Level = internal::accessors_level<Derived>::has_write_access ? WriteAccessors : ReadOnlyAccessors
+> class SparseMapBase;
+
+/** \ingroup SparseCore_Module
+  * class SparseMapBase
+  * \brief Common base class for Map and Ref instance of sparse matrix and vector.
+  */
+template<typename Derived>
+class SparseMapBase<Derived,ReadOnlyAccessors>
+  : public SparseCompressedBase<Derived>
+{
+  public:
+    typedef SparseCompressedBase<Derived> Base;
+    typedef typename Base::Scalar Scalar;
+    typedef typename Base::StorageIndex StorageIndex;
+    enum { IsRowMajor = Base::IsRowMajor };
+    using Base::operator=;
+  protected:
+    
+    typedef typename internal::conditional<
+                         bool(internal::is_lvalue<Derived>::value),
+                         Scalar *, const Scalar *>::type ScalarPointer;
+    typedef typename internal::conditional<
+                         bool(internal::is_lvalue<Derived>::value),
+                         StorageIndex *, const StorageIndex *>::type IndexPointer;
+
+    Index   m_outerSize;
+    Index   m_innerSize;
+    Array<StorageIndex,2,1>  m_zero_nnz;
+    IndexPointer  m_outerIndex;
+    IndexPointer  m_innerIndices;
+    ScalarPointer m_values;
+    IndexPointer  m_innerNonZeros;
+
+  public:
+
+    /** \copydoc SparseMatrixBase::rows() */
+    inline Index rows() const { return IsRowMajor ? m_outerSize : m_innerSize; }
+    /** \copydoc SparseMatrixBase::cols() */
+    inline Index cols() const { return IsRowMajor ? m_innerSize : m_outerSize; }
+    /** \copydoc SparseMatrixBase::innerSize() */
+    inline Index innerSize() const { return m_innerSize; }
+    /** \copydoc SparseMatrixBase::outerSize() */
+    inline Index outerSize() const { return m_outerSize; }
+    /** \copydoc SparseCompressedBase::nonZeros */
+    inline Index nonZeros() const { return m_zero_nnz[1]; }
+    
+    /** \copydoc SparseCompressedBase::isCompressed */
+    bool isCompressed() const { return m_innerNonZeros==0; }
+
+    //----------------------------------------
+    // direct access interface
+    /** \copydoc SparseMatrix::valuePtr */
+    inline const Scalar* valuePtr() const { return m_values; }
+    /** \copydoc SparseMatrix::innerIndexPtr */
+    inline const StorageIndex* innerIndexPtr() const { return m_innerIndices; }
+    /** \copydoc SparseMatrix::outerIndexPtr */
+    inline const StorageIndex* outerIndexPtr() const { return m_outerIndex; }
+    /** \copydoc SparseMatrix::innerNonZeroPtr */
+    inline const StorageIndex* innerNonZeroPtr() const { return m_innerNonZeros; }
+    //----------------------------------------
+
+    /** \copydoc SparseMatrix::coeff */
+    inline Scalar coeff(Index row, Index col) const
+    {
+      const Index outer = IsRowMajor ? row : col;
+      const Index inner = IsRowMajor ? col : row;
+
+      Index start = m_outerIndex[outer];
+      Index end = isCompressed() ? m_outerIndex[outer+1] : start + m_innerNonZeros[outer];
+      if (start==end)
+        return Scalar(0);
+      else if (end>0 && inner==m_innerIndices[end-1])
+        return m_values[end-1];
+      // ^^  optimization: let's first check if it is the last coefficient
+      // (very common in high level algorithms)
+
+      const StorageIndex* r = std::lower_bound(&m_innerIndices[start],&m_innerIndices[end-1],inner);
+      const Index id = r-&m_innerIndices[0];
+      return ((*r==inner) && (id<end)) ? m_values[id] : Scalar(0);
+    }
+
+    inline SparseMapBase(Index rows, Index cols, Index nnz, IndexPointer outerIndexPtr, IndexPointer innerIndexPtr,
+                              ScalarPointer valuePtr, IndexPointer innerNonZerosPtr = 0)
+      : m_outerSize(IsRowMajor?rows:cols), m_innerSize(IsRowMajor?cols:rows), m_zero_nnz(0,internal::convert_index<StorageIndex>(nnz)), m_outerIndex(outerIndexPtr),
+        m_innerIndices(innerIndexPtr), m_values(valuePtr), m_innerNonZeros(innerNonZerosPtr)
+    {}
+
+    // for vectors
+    inline SparseMapBase(Index size, Index nnz, IndexPointer innerIndexPtr, ScalarPointer valuePtr)
+      : m_outerSize(1), m_innerSize(size), m_zero_nnz(0,internal::convert_index<StorageIndex>(nnz)), m_outerIndex(m_zero_nnz.data()),
+        m_innerIndices(innerIndexPtr), m_values(valuePtr), m_innerNonZeros(0)
+    {}
+
+    /** Empty destructor */
+    inline ~SparseMapBase() {}
+
+  protected:
+    inline SparseMapBase() {}
+};
+
+/** \ingroup SparseCore_Module
+  * class SparseMapBase
+  * \brief Common base class for writable Map and Ref instance of sparse matrix and vector.
+  */
+template<typename Derived>
+class SparseMapBase<Derived,WriteAccessors>
+  : public SparseMapBase<Derived,ReadOnlyAccessors>
+{
+    typedef MapBase<Derived, ReadOnlyAccessors> ReadOnlyMapBase;
+    
+  public:
+    typedef SparseMapBase<Derived, ReadOnlyAccessors> Base;
+    typedef typename Base::Scalar Scalar;
+    typedef typename Base::StorageIndex StorageIndex;
+    enum { IsRowMajor = Base::IsRowMajor };
+    
+    using Base::operator=;
+
+  public:
+    
+    //----------------------------------------
+    // direct access interface
+    using Base::valuePtr;
+    using Base::innerIndexPtr;
+    using Base::outerIndexPtr;
+    using Base::innerNonZeroPtr;
+    /** \copydoc SparseMatrix::valuePtr */
+    inline Scalar* valuePtr()              { return Base::m_values; }
+    /** \copydoc SparseMatrix::innerIndexPtr */
+    inline StorageIndex* innerIndexPtr()   { return Base::m_innerIndices; }
+    /** \copydoc SparseMatrix::outerIndexPtr */
+    inline StorageIndex* outerIndexPtr()   { return Base::m_outerIndex; }
+    /** \copydoc SparseMatrix::innerNonZeroPtr */
+    inline StorageIndex* innerNonZeroPtr() { return Base::m_innerNonZeros; }
+    //----------------------------------------
+
+    /** \copydoc SparseMatrix::coeffRef */
+    inline Scalar& coeffRef(Index row, Index col)
+    {
+      const Index outer = IsRowMajor ? row : col;
+      const Index inner = IsRowMajor ? col : row;
+
+      Index start = Base::m_outerIndex[outer];
+      Index end = Base::isCompressed() ? Base::m_outerIndex[outer+1] : start + Base::m_innerNonZeros[outer];
+      eigen_assert(end>=start && "you probably called coeffRef on a non finalized matrix");
+      eigen_assert(end>start && "coeffRef cannot be called on a zero coefficient");
+      StorageIndex* r = std::lower_bound(&Base::m_innerIndices[start],&Base::m_innerIndices[end],inner);
+      const Index id = r - &Base::m_innerIndices[0];
+      eigen_assert((*r==inner) && (id<end) && "coeffRef cannot be called on a zero coefficient");
+      return const_cast<Scalar*>(Base::m_values)[id];
+    }
+    
+    inline SparseMapBase(Index rows, Index cols, Index nnz, StorageIndex* outerIndexPtr, StorageIndex* innerIndexPtr,
+                         Scalar* valuePtr, StorageIndex* innerNonZerosPtr = 0)
+      : Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZerosPtr)
+    {}
+
+    // for vectors
+    inline SparseMapBase(Index size, Index nnz, StorageIndex* innerIndexPtr, Scalar* valuePtr)
+      : Base(size, nnz, innerIndexPtr, valuePtr)
+    {}
+
+    /** Empty destructor */
+    inline ~SparseMapBase() {}
+
+  protected:
+    inline SparseMapBase() {}
+};
+
+/** \ingroup SparseCore_Module
+  *
+  * \brief Specialization of class Map for SparseMatrix-like storage.
+  *
+  * \tparam SparseMatrixType the equivalent sparse matrix type of the referenced data, it must be a template instance of class SparseMatrix.
+  *
+  * \sa class Map, class SparseMatrix, class Ref<SparseMatrixType,Options>
+  */
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+class Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType>
+  : public SparseMapBase<Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
+#else
+template<typename SparseMatrixType>
+class Map<SparseMatrixType>
+  : public SparseMapBase<Derived,WriteAccessors>
+#endif
+{
+  public:
+    typedef SparseMapBase<Map> Base;
+    EIGEN_SPARSE_PUBLIC_INTERFACE(Map)
+    enum { IsRowMajor = Base::IsRowMajor };
+
+  public:
+
+    /** Constructs a read-write Map to a sparse matrix of size \a rows x \a cols, containing \a nnz non-zero coefficients,
+      * stored as a sparse format as defined by the pointers \a outerIndexPtr, \a innerIndexPtr, and \a valuePtr.
+      * If the optional parameter \a innerNonZerosPtr is the null pointer, then a standard compressed format is assumed.
+      *
+      * This constructor is available only if \c SparseMatrixType is non-const.
+      *
+      * More details on the expected storage schemes are given in the \ref TutorialSparse "manual pages".
+      */
+    inline Map(Index rows, Index cols, Index nnz, StorageIndex* outerIndexPtr,
+               StorageIndex* innerIndexPtr, Scalar* valuePtr, StorageIndex* innerNonZerosPtr = 0)
+      : Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZerosPtr)
+    {}
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** Empty destructor */
+    inline ~Map() {}
+};
+
+template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+class Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType>
+  : public SparseMapBase<Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
+{
+  public:
+    typedef SparseMapBase<Map> Base;
+    EIGEN_SPARSE_PUBLIC_INTERFACE(Map)
+    enum { IsRowMajor = Base::IsRowMajor };
+
+  public:
+#endif
+    /** This is the const version of the above constructor.
+      *
+      * This constructor is available only if \c SparseMatrixType is const, e.g.:
+      * \code Map<const SparseMatrix<double> >  \endcode
+      */
+    inline Map(Index rows, Index cols, Index nnz, const StorageIndex* outerIndexPtr,
+               const StorageIndex* innerIndexPtr, const Scalar* valuePtr, const StorageIndex* innerNonZerosPtr = 0)
+      : Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZerosPtr)
+    {}
+
+    /** Empty destructor */
+    inline ~Map() {}
+};
+
+namespace internal {
+
+template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+struct evaluator<Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
+  : evaluator<SparseCompressedBase<Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > >
+{
+  typedef evaluator<SparseCompressedBase<Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > Base;
+  typedef Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> XprType;  
+  evaluator() : Base() {}
+  explicit evaluator(const XprType &mat) : Base(mat) {}
+};
+
+template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+struct evaluator<Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
+  : evaluator<SparseCompressedBase<Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > >
+{
+  typedef evaluator<SparseCompressedBase<Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > Base;
+  typedef Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> XprType;  
+  evaluator() : Base() {}
+  explicit evaluator(const XprType &mat) : Base(mat) {}
+};
+
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPARSE_MAP_H
diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h
index ba5e3a9b6..323c2323b 100644
--- a/Eigen/src/SparseCore/SparseMatrix.h
+++ b/Eigen/src/SparseCore/SparseMatrix.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -32,18 +32,22 @@ namespace Eigen {
   * \tparam _Scalar the scalar type, i.e. the type of the coefficients
   * \tparam _Options Union of bit flags controlling the storage scheme. Currently the only possibility
   *                 is ColMajor or RowMajor. The default is 0 which means column-major.
-  * \tparam _Index the type of the indices. It has to be a \b signed type (e.g., short, int, std::ptrdiff_t). Default is \c int.
+  * \tparam _StorageIndex the type of the indices. It has to be a \b signed type (e.g., short, int, std::ptrdiff_t). Default is \c int.
+  *
+  * \warning In %Eigen 3.2, the undocumented type \c SparseMatrix::Index was improperly defined as the storage index type (e.g., int),
+  *          whereas it is now (starting from %Eigen 3.3) deprecated and always defined as Eigen::Index.
+  *          Codes making use of \c SparseMatrix::Index, might thus likely have to be changed to use \c SparseMatrix::StorageIndex instead.
   *
   * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_SPARSEMATRIX_PLUGIN.
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_SPARSEMATRIX_PLUGIN.
   */
 
 namespace internal {
-template<typename _Scalar, int _Options, typename _Index>
-struct traits<SparseMatrix<_Scalar, _Options, _Index> >
+template<typename _Scalar, int _Options, typename _StorageIndex>
+struct traits<SparseMatrix<_Scalar, _Options, _StorageIndex> >
 {
   typedef _Scalar Scalar;
-  typedef _Index Index;
+  typedef _StorageIndex StorageIndex;
   typedef Sparse StorageKind;
   typedef MatrixXpr XprKind;
   enum {
@@ -51,22 +55,21 @@ struct traits<SparseMatrix<_Scalar, _Options, _Index> >
     ColsAtCompileTime = Dynamic,
     MaxRowsAtCompileTime = Dynamic,
     MaxColsAtCompileTime = Dynamic,
-    Flags = _Options | NestByRefBit | LvalueBit,
-    CoeffReadCost = NumTraits<Scalar>::ReadCost,
+    Flags = _Options | NestByRefBit | LvalueBit | CompressedAccessBit,
     SupportedAccessPatterns = InnerRandomAccessPattern
   };
 };
 
-template<typename _Scalar, int _Options, typename _Index, int DiagIndex>
-struct traits<Diagonal<const SparseMatrix<_Scalar, _Options, _Index>, DiagIndex> >
+template<typename _Scalar, int _Options, typename _StorageIndex, int DiagIndex>
+struct traits<Diagonal<SparseMatrix<_Scalar, _Options, _StorageIndex>, DiagIndex> >
 {
-  typedef SparseMatrix<_Scalar, _Options, _Index> MatrixType;
-  typedef typename nested<MatrixType>::type MatrixTypeNested;
+  typedef SparseMatrix<_Scalar, _Options, _StorageIndex> MatrixType;
+  typedef typename ref_selector<MatrixType>::type MatrixTypeNested;
   typedef typename remove_reference<MatrixTypeNested>::type _MatrixTypeNested;
 
   typedef _Scalar Scalar;
   typedef Dense StorageKind;
-  typedef _Index Index;
+  typedef _StorageIndex StorageIndex;
   typedef MatrixXpr XprKind;
 
   enum {
@@ -74,47 +77,61 @@ struct traits<Diagonal<const SparseMatrix<_Scalar, _Options, _Index>, DiagIndex>
     ColsAtCompileTime = 1,
     MaxRowsAtCompileTime = Dynamic,
     MaxColsAtCompileTime = 1,
-    Flags = 0,
-    CoeffReadCost = _MatrixTypeNested::CoeffReadCost*10
+    Flags = LvalueBit
+  };
+};
+
+template<typename _Scalar, int _Options, typename _StorageIndex, int DiagIndex>
+struct traits<Diagonal<const SparseMatrix<_Scalar, _Options, _StorageIndex>, DiagIndex> >
+ : public traits<Diagonal<SparseMatrix<_Scalar, _Options, _StorageIndex>, DiagIndex> >
+{
+  enum {
+    Flags = 0
   };
 };
 
 } // end namespace internal
 
-template<typename _Scalar, int _Options, typename _Index>
+template<typename _Scalar, int _Options, typename _StorageIndex>
 class SparseMatrix
-  : public SparseMatrixBase<SparseMatrix<_Scalar, _Options, _Index> >
+  : public SparseCompressedBase<SparseMatrix<_Scalar, _Options, _StorageIndex> >
 {
+    typedef SparseCompressedBase<SparseMatrix> Base;
+    using Base::convert_index;
+    friend class SparseVector<_Scalar,0,_StorageIndex>;
   public:
+    using Base::isCompressed;
+    using Base::nonZeros;
     EIGEN_SPARSE_PUBLIC_INTERFACE(SparseMatrix)
-    EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseMatrix, +=)
-    EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseMatrix, -=)
+    using Base::operator+=;
+    using Base::operator-=;
 
     typedef MappedSparseMatrix<Scalar,Flags> Map;
+    typedef Diagonal<SparseMatrix> DiagonalReturnType;
+    typedef Diagonal<const SparseMatrix> ConstDiagonalReturnType;
+    typedef typename Base::InnerIterator InnerIterator;
+    typedef typename Base::ReverseInnerIterator ReverseInnerIterator;
+    
+
     using Base::IsRowMajor;
-    typedef internal::CompressedStorage<Scalar,Index> Storage;
+    typedef internal::CompressedStorage<Scalar,StorageIndex> Storage;
     enum {
       Options = _Options
     };
 
+    typedef typename Base::IndexVector IndexVector;
+    typedef typename Base::ScalarVector ScalarVector;
   protected:
-
     typedef SparseMatrix<Scalar,(Flags&~RowMajorBit)|(IsRowMajor?RowMajorBit:0)> TransposedSparseMatrix;
 
     Index m_outerSize;
     Index m_innerSize;
-    Index* m_outerIndex;
-    Index* m_innerNonZeros;     // optional, if null then the data is compressed
+    StorageIndex* m_outerIndex;
+    StorageIndex* m_innerNonZeros;     // optional, if null then the data is compressed
     Storage m_data;
-    
-    Eigen::Map<Matrix<Index,Dynamic,1> > innerNonZeros() { return Eigen::Map<Matrix<Index,Dynamic,1> >(m_innerNonZeros, m_innerNonZeros?m_outerSize:0); }
-    const  Eigen::Map<const Matrix<Index,Dynamic,1> > innerNonZeros() const { return Eigen::Map<const Matrix<Index,Dynamic,1> >(m_innerNonZeros, m_innerNonZeros?m_outerSize:0); }
 
   public:
     
-    /** \returns whether \c *this is in compressed form. */
-    inline bool isCompressed() const { return m_innerNonZeros==0; }
-
     /** \returns the number of rows of the matrix */
     inline Index rows() const { return IsRowMajor ? m_outerSize : m_innerSize; }
     /** \returns the number of columns of the matrix */
@@ -128,38 +145,38 @@ class SparseMatrix
     /** \returns a const pointer to the array of values.
       * This function is aimed at interoperability with other libraries.
       * \sa innerIndexPtr(), outerIndexPtr() */
-    inline const Scalar* valuePtr() const { return &m_data.value(0); }
+    inline const Scalar* valuePtr() const { return m_data.valuePtr(); }
     /** \returns a non-const pointer to the array of values.
       * This function is aimed at interoperability with other libraries.
       * \sa innerIndexPtr(), outerIndexPtr() */
-    inline Scalar* valuePtr() { return &m_data.value(0); }
+    inline Scalar* valuePtr() { return m_data.valuePtr(); }
 
     /** \returns a const pointer to the array of inner indices.
       * This function is aimed at interoperability with other libraries.
       * \sa valuePtr(), outerIndexPtr() */
-    inline const Index* innerIndexPtr() const { return &m_data.index(0); }
+    inline const StorageIndex* innerIndexPtr() const { return m_data.indexPtr(); }
     /** \returns a non-const pointer to the array of inner indices.
       * This function is aimed at interoperability with other libraries.
       * \sa valuePtr(), outerIndexPtr() */
-    inline Index* innerIndexPtr() { return &m_data.index(0); }
+    inline StorageIndex* innerIndexPtr() { return m_data.indexPtr(); }
 
     /** \returns a const pointer to the array of the starting positions of the inner vectors.
       * This function is aimed at interoperability with other libraries.
       * \sa valuePtr(), innerIndexPtr() */
-    inline const Index* outerIndexPtr() const { return m_outerIndex; }
+    inline const StorageIndex* outerIndexPtr() const { return m_outerIndex; }
     /** \returns a non-const pointer to the array of the starting positions of the inner vectors.
       * This function is aimed at interoperability with other libraries.
       * \sa valuePtr(), innerIndexPtr() */
-    inline Index* outerIndexPtr() { return m_outerIndex; }
+    inline StorageIndex* outerIndexPtr() { return m_outerIndex; }
 
     /** \returns a const pointer to the array of the number of non zeros of the inner vectors.
       * This function is aimed at interoperability with other libraries.
       * \warning it returns the null pointer 0 in compressed mode */
-    inline const Index* innerNonZeroPtr() const { return m_innerNonZeros; }
+    inline const StorageIndex* innerNonZeroPtr() const { return m_innerNonZeros; }
     /** \returns a non-const pointer to the array of the number of non zeros of the inner vectors.
       * This function is aimed at interoperability with other libraries.
       * \warning it returns the null pointer 0 in compressed mode */
-    inline Index* innerNonZeroPtr() { return m_innerNonZeros; }
+    inline StorageIndex* innerNonZeroPtr() { return m_innerNonZeros; }
 
     /** \internal */
     inline Storage& data() { return m_data; }
@@ -175,7 +192,7 @@ class SparseMatrix
       const Index outer = IsRowMajor ? row : col;
       const Index inner = IsRowMajor ? col : row;
       Index end = m_innerNonZeros ? m_outerIndex[outer] + m_innerNonZeros[outer] : m_outerIndex[outer+1];
-      return m_data.atInRange(m_outerIndex[outer], end, inner);
+      return m_data.atInRange(m_outerIndex[outer], end, StorageIndex(inner));
     }
 
     /** \returns a non-const reference to the value of the matrix at position \a i, \a j
@@ -198,7 +215,7 @@ class SparseMatrix
       eigen_assert(end>=start && "you probably called coeffRef on a non finalized matrix");
       if(end<=start)
         return insert(row,col);
-      const Index p = m_data.searchLowerIndex(start,end-1,inner);
+      const Index p = m_data.searchLowerIndex(start,end-1,StorageIndex(inner));
       if((p<end) && (m_data.index(p)==inner))
         return m_data.value(p);
       else
@@ -209,45 +226,34 @@ class SparseMatrix
       * The non zero coefficient must \b not already exist.
       *
       * If the matrix \c *this is in compressed mode, then \c *this is turned into uncompressed
-      * mode while reserving room for 2 non zeros per inner vector. It is strongly recommended to first
-      * call reserve(const SizesType &) to reserve a more appropriate number of elements per
-      * inner vector that better match your scenario.
+      * mode while reserving room for 2 x this->innerSize() non zeros if reserve(Index) has not been called earlier.
+      * In this case, the insertion procedure is optimized for a \e sequential insertion mode where elements are assumed to be
+      * inserted by increasing outer-indices.
+      * 
+      * If that's not the case, then it is strongly recommended to either use a triplet-list to assemble the matrix, or to first
+      * call reserve(const SizesType &) to reserve the appropriate number of non-zero elements per inner vector.
       *
-      * This function performs a sorted insertion in O(1) if the elements of each inner vector are
-      * inserted in increasing inner index order, and in O(nnz_j) for a random insertion.
+      * Assuming memory has been appropriately reserved, this function performs a sorted insertion in O(1)
+      * if the elements of each inner vector are inserted in increasing inner index order, and in O(nnz_j) for a random insertion.
       *
       */
-    Scalar& insert(Index row, Index col)
-    {
-      eigen_assert(row>=0 && row<rows() && col>=0 && col<cols());
-      
-      if(isCompressed())
-      {
-        reserve(Matrix<Index,Dynamic,1>::Constant(outerSize(), 2));
-      }
-      return insertUncompressed(row,col);
-    }
+    Scalar& insert(Index row, Index col);
 
   public:
 
-    class InnerIterator;
-    class ReverseInnerIterator;
-
-    /** Removes all non zeros but keep allocated memory */
+    /** Removes all non zeros but keep allocated memory
+      *
+      * This function does not free the currently allocated memory. To release as much as memory as possible,
+      * call \code mat.data().squeeze(); \endcode after resizing it.
+      * 
+      * \sa resize(Index,Index), data()
+      */
     inline void setZero()
     {
       m_data.clear();
-      memset(m_outerIndex, 0, (m_outerSize+1)*sizeof(Index));
-      if(m_innerNonZeros)
-        memset(m_innerNonZeros, 0, (m_outerSize)*sizeof(Index));
-    }
-
-    /** \returns the number of non zero coefficients */
-    inline Index nonZeros() const
-    {
+      memset(m_outerIndex, 0, (m_outerSize+1)*sizeof(StorageIndex));
       if(m_innerNonZeros)
-        return innerNonZeros().sum();
-      return static_cast<Index>(m_data.size());
+        memset(m_innerNonZeros, 0, (m_outerSize)*sizeof(StorageIndex));
     }
 
     /** Preallocates \a reserveSize non zeros.
@@ -262,22 +268,25 @@ class SparseMatrix
     #ifdef EIGEN_PARSED_BY_DOXYGEN
     /** Preallocates \a reserveSize[\c j] non zeros for each column (resp. row) \c j.
       *
-      * This function turns the matrix in non-compressed mode */
+      * This function turns the matrix in non-compressed mode.
+      * 
+      * The type \c SizesType must expose the following interface:
+        \code
+        typedef value_type;
+        const value_type& operator[](i) const;
+        \endcode
+      * for \c i in the [0,this->outerSize()[ range.
+      * Typical choices include std::vector<int>, Eigen::VectorXi, Eigen::VectorXi::Constant, etc.
+      */
     template<class SizesType>
     inline void reserve(const SizesType& reserveSizes);
     #else
     template<class SizesType>
-    inline void reserve(const SizesType& reserveSizes, const typename SizesType::value_type& enableif = typename SizesType::value_type())
-    {
-      EIGEN_UNUSED_VARIABLE(enableif);
-      reserveInnerVectors(reserveSizes);
-    }
-    template<class SizesType>
-    inline void reserve(const SizesType& reserveSizes, const typename SizesType::Scalar& enableif =
-    #if (!defined(_MSC_VER)) || (_MSC_VER>=1500) // MSVC 2005 fails to compile with this typename
+    inline void reserve(const SizesType& reserveSizes, const typename SizesType::value_type& enableif =
+    #if (!EIGEN_COMP_MSVC) || (EIGEN_COMP_MSVC>=1500) // MSVC 2005 fails to compile with this typename
         typename
     #endif
-        SizesType::Scalar())
+        SizesType::value_type())
     {
       EIGEN_UNUSED_VARIABLE(enableif);
       reserveInnerVectors(reserveSizes);
@@ -289,15 +298,15 @@ class SparseMatrix
     {
       if(isCompressed())
       {
-        std::size_t totalReserveSize = 0;
+        Index totalReserveSize = 0;
         // turn the matrix into non-compressed mode
-        m_innerNonZeros = static_cast<Index*>(std::malloc(m_outerSize * sizeof(Index)));
+        m_innerNonZeros = static_cast<StorageIndex*>(std::malloc(m_outerSize * sizeof(StorageIndex)));
         if (!m_innerNonZeros) internal::throw_std_bad_alloc();
         
         // temporarily use m_innerSizes to hold the new starting points.
-        Index* newOuterIndex = m_innerNonZeros;
+        StorageIndex* newOuterIndex = m_innerNonZeros;
         
-        Index count = 0;
+        StorageIndex count = 0;
         for(Index j=0; j<m_outerSize; ++j)
         {
           newOuterIndex[j] = count;
@@ -305,10 +314,10 @@ class SparseMatrix
           totalReserveSize += reserveSizes[j];
         }
         m_data.reserve(totalReserveSize);
-        Index previousOuterIndex = m_outerIndex[m_outerSize];
+        StorageIndex previousOuterIndex = m_outerIndex[m_outerSize];
         for(Index j=m_outerSize-1; j>=0; --j)
         {
-          Index innerNNZ = previousOuterIndex - m_outerIndex[j];
+          StorageIndex innerNNZ = previousOuterIndex - m_outerIndex[j];
           for(Index i=innerNNZ-1; i>=0; --i)
           {
             m_data.index(newOuterIndex[j]+i) = m_data.index(m_outerIndex[j]+i);
@@ -324,15 +333,15 @@ class SparseMatrix
       }
       else
       {
-        Index* newOuterIndex = static_cast<Index*>(std::malloc((m_outerSize+1)*sizeof(Index)));
+        StorageIndex* newOuterIndex = static_cast<StorageIndex*>(std::malloc((m_outerSize+1)*sizeof(StorageIndex)));
         if (!newOuterIndex) internal::throw_std_bad_alloc();
         
-        Index count = 0;
+        StorageIndex count = 0;
         for(Index j=0; j<m_outerSize; ++j)
         {
           newOuterIndex[j] = count;
-          Index alreadyReserved = (m_outerIndex[j+1]-m_outerIndex[j]) - m_innerNonZeros[j];
-          Index toReserve = std::max<Index>(reserveSizes[j], alreadyReserved);
+          StorageIndex alreadyReserved = (m_outerIndex[j+1]-m_outerIndex[j]) - m_innerNonZeros[j];
+          StorageIndex toReserve = std::max<StorageIndex>(reserveSizes[j], alreadyReserved);
           count += toReserve + m_innerNonZeros[j];
         }
         newOuterIndex[m_outerSize] = count;
@@ -343,7 +352,7 @@ class SparseMatrix
           Index offset = newOuterIndex[j] - m_outerIndex[j];
           if(offset>0)
           {
-            Index innerNNZ = m_innerNonZeros[j];
+            StorageIndex innerNNZ = m_innerNonZeros[j];
             for(Index i=innerNNZ-1; i>=0; --i)
             {
               m_data.index(newOuterIndex[j]+i) = m_data.index(m_outerIndex[j]+i);
@@ -380,11 +389,11 @@ class SparseMatrix
       * \sa insertBack, startVec */
     inline Scalar& insertBackByOuterInner(Index outer, Index inner)
     {
-      eigen_assert(size_t(m_outerIndex[outer+1]) == m_data.size() && "Invalid ordered insertion (invalid outer index)");
+      eigen_assert(Index(m_outerIndex[outer+1]) == m_data.size() && "Invalid ordered insertion (invalid outer index)");
       eigen_assert( (m_outerIndex[outer+1]-m_outerIndex[outer]==0 || m_data.index(m_data.size()-1)<inner) && "Invalid ordered insertion (invalid inner index)");
       Index p = m_outerIndex[outer+1];
       ++m_outerIndex[outer+1];
-      m_data.append(0, inner);
+      m_data.append(Scalar(0), inner);
       return m_data.value(p);
     }
 
@@ -394,7 +403,7 @@ class SparseMatrix
     {
       Index p = m_outerIndex[outer+1];
       ++m_outerIndex[outer+1];
-      m_data.append(0, inner);
+      m_data.append(Scalar(0), inner);
       return m_data.value(p);
     }
 
@@ -414,7 +423,7 @@ class SparseMatrix
     {
       if(isCompressed())
       {
-        Index size = static_cast<Index>(m_data.size());
+        StorageIndex size = internal::convert_index<StorageIndex>(m_data.size());
         Index i = m_outerSize;
         // find the last filled column
         while (i>=0 && m_outerIndex[i]==0)
@@ -433,7 +442,13 @@ class SparseMatrix
     template<typename InputIterators>
     void setFromTriplets(const InputIterators& begin, const InputIterators& end);
 
-    void sumupDuplicates();
+    template<typename InputIterators,typename DupFunctor>
+    void setFromTriplets(const InputIterators& begin, const InputIterators& end, DupFunctor dup_func);
+
+    void sumupDuplicates() { collapseDuplicates(internal::scalar_sum_op<Scalar,Scalar>()); }
+
+    template<typename DupFunctor>
+    void collapseDuplicates(DupFunctor dup_func = DupFunctor());
 
     //---
     
@@ -451,6 +466,8 @@ class SparseMatrix
       if(isCompressed())
         return;
       
+      eigen_internal_assert(m_outerIndex!=0 && m_outerSize>0);
+      
       Index oldStart = m_outerIndex[1];
       m_outerIndex[1] = m_innerNonZeros[0];
       for(Index j=1; j<m_outerSize; ++j)
@@ -479,7 +496,7 @@ class SparseMatrix
     {
       if(m_innerNonZeros != 0)
         return; 
-      m_innerNonZeros = static_cast<Index*>(std::malloc(m_outerSize * sizeof(Index)));
+      m_innerNonZeros = static_cast<StorageIndex*>(std::malloc(m_outerSize * sizeof(StorageIndex)));
       for (Index i = 0; i < m_outerSize; i++)
       {
         m_innerNonZeros[i] = m_outerIndex[i+1] - m_outerIndex[i]; 
@@ -503,10 +520,9 @@ class SparseMatrix
     void prune(const KeepFunc& keep = KeepFunc())
     {
       // TODO optimize the uncompressed mode to avoid moving and allocating the data twice
-      // TODO also implement a unit test
       makeCompressed();
 
-      Index k = 0;
+      StorageIndex k = 0;
       for(Index j=0; j<m_outerSize; ++j)
       {
         Index previousStart = m_outerIndex[j];
@@ -527,7 +543,12 @@ class SparseMatrix
     }
 
     /** Resizes the matrix to a \a rows x \a cols matrix leaving old values untouched.
-      * \sa resizeNonZeros(Index), reserve(), setZero()
+      *
+      * If the sizes of the matrix are decreased, then the matrix is turned to \b uncompressed-mode
+      * and the storage of the out of bounds coefficients is kept and reserved.
+      * Call makeCompressed() to pack the entries and squeeze extra memory.
+      *
+      * \sa reserve(), setZero(), makeCompressed()
       */
     void conservativeResize(Index rows, Index cols) 
     {
@@ -539,13 +560,13 @@ class SparseMatrix
 
       Index innerChange = IsRowMajor ? cols - this->cols() : rows - this->rows();
       Index outerChange = IsRowMajor ? rows - this->rows() : cols - this->cols();
-      Index newInnerSize = IsRowMajor ? cols : rows;
+      StorageIndex newInnerSize = convert_index(IsRowMajor ? cols : rows);
 
       // Deals with inner non zeros
       if (m_innerNonZeros)
       {
         // Resize m_innerNonZeros
-        Index *newInnerNonZeros = static_cast<Index*>(std::realloc(m_innerNonZeros, (m_outerSize + outerChange) * sizeof(Index)));
+        StorageIndex *newInnerNonZeros = static_cast<StorageIndex*>(std::realloc(m_innerNonZeros, (m_outerSize + outerChange) * sizeof(StorageIndex)));
         if (!newInnerNonZeros) internal::throw_std_bad_alloc();
         m_innerNonZeros = newInnerNonZeros;
         
@@ -555,7 +576,7 @@ class SparseMatrix
       else if (innerChange < 0) 
       {
         // Inner size decreased: allocate a new m_innerNonZeros
-        m_innerNonZeros = static_cast<Index*>(std::malloc((m_outerSize+outerChange+1) * sizeof(Index)));
+        m_innerNonZeros = static_cast<StorageIndex*>(std::malloc((m_outerSize+outerChange+1) * sizeof(StorageIndex)));
         if (!m_innerNonZeros) internal::throw_std_bad_alloc();
         for(Index i = 0; i < m_outerSize; i++)
           m_innerNonZeros[i] = m_outerIndex[i+1] - m_outerIndex[i];
@@ -566,8 +587,8 @@ class SparseMatrix
       {
         for(Index i = 0; i < m_outerSize + (std::min)(outerChange, Index(0)); i++)
         {
-          Index &n = m_innerNonZeros[i];
-          Index start = m_outerIndex[i];
+          StorageIndex &n = m_innerNonZeros[i];
+          StorageIndex start = m_outerIndex[i];
           while (n > 0 && m_data.index(start+n-1) >= newInnerSize) --n; 
         }
       }
@@ -578,12 +599,12 @@ class SparseMatrix
       if (outerChange == 0)
         return;
           
-      Index *newOuterIndex = static_cast<Index*>(std::realloc(m_outerIndex, (m_outerSize + outerChange + 1) * sizeof(Index)));
+      StorageIndex *newOuterIndex = static_cast<StorageIndex*>(std::realloc(m_outerIndex, (m_outerSize + outerChange + 1) * sizeof(StorageIndex)));
       if (!newOuterIndex) internal::throw_std_bad_alloc();
       m_outerIndex = newOuterIndex;
       if (outerChange > 0)
       {
-        Index last = m_outerSize == 0 ? 0 : m_outerIndex[m_outerSize];
+        StorageIndex last = m_outerSize == 0 ? 0 : m_outerIndex[m_outerSize];
         for(Index i=m_outerSize; i<m_outerSize+outerChange+1; i++)          
           m_outerIndex[i] = last; 
       }
@@ -591,7 +612,11 @@ class SparseMatrix
     }
     
     /** Resizes the matrix to a \a rows x \a cols matrix and initializes it to zero.
-      * \sa resizeNonZeros(Index), reserve(), setZero()
+      * 
+      * This function does not free the currently allocated memory. To release as much as memory as possible,
+      * call \code mat.data().squeeze(); \endcode after resizing it.
+      * 
+      * \sa reserve(), setZero()
       */
     void resize(Index rows, Index cols)
     {
@@ -601,7 +626,7 @@ class SparseMatrix
       if (m_outerSize != outerSize || m_outerSize==0)
       {
         std::free(m_outerIndex);
-        m_outerIndex = static_cast<Index*>(std::malloc((outerSize + 1) * sizeof(Index)));
+        m_outerIndex = static_cast<StorageIndex*>(std::malloc((outerSize + 1) * sizeof(StorageIndex)));
         if (!m_outerIndex) internal::throw_std_bad_alloc();
         
         m_outerSize = outerSize;
@@ -611,19 +636,24 @@ class SparseMatrix
         std::free(m_innerNonZeros);
         m_innerNonZeros = 0;
       }
-      memset(m_outerIndex, 0, (m_outerSize+1)*sizeof(Index));
+      memset(m_outerIndex, 0, (m_outerSize+1)*sizeof(StorageIndex));
     }
 
     /** \internal
       * Resize the nonzero vector to \a size */
     void resizeNonZeros(Index size)
     {
-      // TODO remove this function
       m_data.resize(size);
     }
 
-    /** \returns a const expression of the diagonal coefficients */
-    const Diagonal<const SparseMatrix> diagonal() const { return *this; }
+    /** \returns a const expression of the diagonal coefficients. */
+    const ConstDiagonalReturnType diagonal() const { return ConstDiagonalReturnType(*this); }
+    
+    /** \returns a read-write expression of the diagonal coefficients.
+      * \warning If the diagonal entries are written, then all diagonal
+      * entries \b must already exist, otherwise an assertion will be raised.
+      */
+    DiagonalReturnType diagonal() { return DiagonalReturnType(*this); }
 
     /** Default constructor yielding an empty \c 0 \c x \c 0 matrix */
     inline SparseMatrix()
@@ -649,7 +679,16 @@ class SparseMatrix
       EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename OtherDerived::Scalar>::value),
         YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
       check_template_parameters();
-      *this = other.derived();
+      const bool needToTranspose = (Flags & RowMajorBit) != (internal::evaluator<OtherDerived>::Flags & RowMajorBit);
+      if (needToTranspose)
+        *this = other.derived();
+      else
+      {
+        #ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+          EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+        #endif
+        internal::call_assignment_no_alias(*this, other.derived());
+      }
     }
     
     /** Constructs a sparse matrix from the sparse selfadjoint view \a other */
@@ -658,7 +697,7 @@ class SparseMatrix
       : m_outerSize(0), m_innerSize(0), m_outerIndex(0), m_innerNonZeros(0)
     {
       check_template_parameters();
-      *this = other;
+      Base::operator=(other);
     }
 
     /** Copy constructor (it performs a deep copy) */
@@ -678,6 +717,15 @@ class SparseMatrix
       initAssignment(other);
       other.evalTo(*this);
     }
+    
+    /** \brief Copy constructor with in-place evaluation */
+    template<typename OtherDerived>
+    explicit SparseMatrix(const DiagonalBase<OtherDerived>& other)
+      : Base(), m_outerSize(0), m_innerSize(0), m_outerIndex(0), m_innerNonZeros(0)
+    {
+      check_template_parameters();
+      *this = other.derived();
+    }
 
     /** Swaps the content of two sparse matrices of the same type.
       * This is a fast operation that simply swaps the underlying pointers and parameters. */
@@ -691,14 +739,17 @@ class SparseMatrix
       m_data.swap(other.m_data);
     }
 
-    /** Sets *this to the identity matrix */
+    /** Sets *this to the identity matrix.
+      * This function also turns the matrix into compressed mode, and drop any reserved memory. */
     inline void setIdentity()
     {
       eigen_assert(rows() == cols() && "ONLY FOR SQUARED MATRICES");
       this->m_data.resize(rows());
-      Eigen::Map<Matrix<Index, Dynamic, 1> >(&this->m_data.index(0), rows()).setLinSpaced(0, rows()-1);
-      Eigen::Map<Matrix<Scalar, Dynamic, 1> >(&this->m_data.value(0), rows()).setOnes();
-      Eigen::Map<Matrix<Index, Dynamic, 1> >(this->m_outerIndex, rows()+1).setLinSpaced(0, rows());
+      Eigen::Map<IndexVector>(this->m_data.indexPtr(), rows()).setLinSpaced(0, StorageIndex(rows()-1));
+      Eigen::Map<ScalarVector>(this->m_data.valuePtr(), rows()).setOnes();
+      Eigen::Map<IndexVector>(this->m_outerIndex, rows()+1).setLinSpaced(0, StorageIndex(rows()));
+      std::free(m_innerNonZeros);
+      m_innerNonZeros = 0;
     }
     inline SparseMatrix& operator=(const SparseMatrix& other)
     {
@@ -708,10 +759,13 @@ class SparseMatrix
       }
       else if(this!=&other)
       {
+        #ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+          EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+        #endif
         initAssignment(other);
         if(other.isCompressed())
         {
-          memcpy(m_outerIndex, other.m_outerIndex, (m_outerSize+1)*sizeof(Index));
+          internal::smart_copy(other.m_outerIndex, other.m_outerIndex + m_outerSize + 1, m_outerIndex);
           m_data = other.m_data;
         }
         else
@@ -722,22 +776,11 @@ class SparseMatrix
       return *this;
     }
 
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    template<typename Lhs, typename Rhs>
-    inline SparseMatrix& operator=(const SparseSparseProduct<Lhs,Rhs>& product)
-    { return Base::operator=(product); }
-    
-    template<typename OtherDerived>
-    inline SparseMatrix& operator=(const ReturnByValue<OtherDerived>& other)
-    {
-      initAssignment(other);
-      return Base::operator=(other.derived());
-    }
-    
+#ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename OtherDerived>
     inline SparseMatrix& operator=(const EigenBase<OtherDerived>& other)
     { return Base::operator=(other.derived()); }
-    #endif
+#endif // EIGEN_PARSED_BY_DOXYGEN
 
     template<typename OtherDerived>
     EIGEN_DONT_INLINE SparseMatrix& operator=(const SparseMatrixBase<OtherDerived>& other);
@@ -747,30 +790,38 @@ class SparseMatrix
       EIGEN_DBG_SPARSE(
         s << "Nonzero entries:\n";
         if(m.isCompressed())
+        {
           for (Index i=0; i<m.nonZeros(); ++i)
             s << "(" << m.m_data.value(i) << "," << m.m_data.index(i) << ") ";
+        }
         else
+        {
           for (Index i=0; i<m.outerSize(); ++i)
           {
             Index p = m.m_outerIndex[i];
             Index pe = m.m_outerIndex[i]+m.m_innerNonZeros[i];
             Index k=p;
-            for (; k<pe; ++k)
+            for (; k<pe; ++k) {
               s << "(" << m.m_data.value(k) << "," << m.m_data.index(k) << ") ";
-            for (; k<m.m_outerIndex[i+1]; ++k)
+            }
+            for (; k<m.m_outerIndex[i+1]; ++k) {
               s << "(_,_) ";
+            }
           }
+        }
         s << std::endl;
         s << std::endl;
         s << "Outer pointers:\n";
-        for (Index i=0; i<m.outerSize(); ++i)
+        for (Index i=0; i<m.outerSize(); ++i) {
           s << m.m_outerIndex[i] << " ";
+        }
         s << " $" << std::endl;
         if(!m.isCompressed())
         {
           s << "Inner non zeros:\n";
-          for (Index i=0; i<m.outerSize(); ++i)
+          for (Index i=0; i<m.outerSize(); ++i) {
             s << m.m_innerNonZeros[i] << " ";
+          }
           s << " $" << std::endl;
         }
         s << std::endl;
@@ -786,10 +837,8 @@ class SparseMatrix
       std::free(m_innerNonZeros);
     }
 
-#ifndef EIGEN_PARSED_BY_DOXYGEN
     /** Overloaded for performance */
     Scalar sum() const;
-#endif
     
 #   ifdef EIGEN_SPARSEMATRIX_PLUGIN
 #     include EIGEN_SPARSEMATRIX_PLUGIN
@@ -816,15 +865,15 @@ protected:
       * A vector object that is equal to 0 everywhere but v at the position i */
     class SingletonVector
     {
-        Index m_index;
-        Index m_value;
+        StorageIndex m_index;
+        StorageIndex m_value;
       public:
-        typedef Index value_type;
+        typedef StorageIndex value_type;
         SingletonVector(Index i, Index v)
-          : m_index(i), m_value(v)
+          : m_index(convert_index(i)), m_value(convert_index(v))
         {}
 
-        Index operator[](Index i) const { return i==m_index ? m_value : 0; }
+        StorageIndex operator[](Index i) const { return i==m_index ? m_value : 0; }
     };
 
     /** \internal
@@ -843,14 +892,14 @@ public:
       eigen_assert(m_innerNonZeros[outer]<=(m_outerIndex[outer+1] - m_outerIndex[outer]));
 
       Index p = m_outerIndex[outer] + m_innerNonZeros[outer]++;
-      m_data.index(p) = inner;
+      m_data.index(p) = convert_index(inner);
       return (m_data.value(p) = 0);
     }
 
 private:
   static void check_template_parameters()
   {
-    EIGEN_STATIC_ASSERT(NumTraits<Index>::IsSigned,THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE);
+    EIGEN_STATIC_ASSERT(NumTraits<StorageIndex>::IsSigned,THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE);
     EIGEN_STATIC_ASSERT((Options&(ColMajor|RowMajor))==Options,INVALID_MATRIX_TEMPLATE_PARAMETERS);
   }
 
@@ -865,87 +914,20 @@ private:
   };
 };
 
-template<typename Scalar, int _Options, typename _Index>
-class SparseMatrix<Scalar,_Options,_Index>::InnerIterator
-{
-  public:
-    InnerIterator(const SparseMatrix& mat, Index outer)
-      : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer), m_id(mat.m_outerIndex[outer])
-    {
-      if(mat.isCompressed())
-        m_end = mat.m_outerIndex[outer+1];
-      else
-        m_end = m_id + mat.m_innerNonZeros[outer];
-    }
-
-    inline InnerIterator& operator++() { m_id++; return *this; }
-
-    inline const Scalar& value() const { return m_values[m_id]; }
-    inline Scalar& valueRef() { return const_cast<Scalar&>(m_values[m_id]); }
-
-    inline Index index() const { return m_indices[m_id]; }
-    inline Index outer() const { return m_outer; }
-    inline Index row() const { return IsRowMajor ? m_outer : index(); }
-    inline Index col() const { return IsRowMajor ? index() : m_outer; }
-
-    inline operator bool() const { return (m_id < m_end); }
-
-  protected:
-    const Scalar* m_values;
-    const Index* m_indices;
-    const Index m_outer;
-    Index m_id;
-    Index m_end;
-};
-
-template<typename Scalar, int _Options, typename _Index>
-class SparseMatrix<Scalar,_Options,_Index>::ReverseInnerIterator
-{
-  public:
-    ReverseInnerIterator(const SparseMatrix& mat, Index outer)
-      : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer), m_start(mat.m_outerIndex[outer])
-    {
-      if(mat.isCompressed())
-        m_id = mat.m_outerIndex[outer+1];
-      else
-        m_id = m_start + mat.m_innerNonZeros[outer];
-    }
-
-    inline ReverseInnerIterator& operator--() { --m_id; return *this; }
-
-    inline const Scalar& value() const { return m_values[m_id-1]; }
-    inline Scalar& valueRef() { return const_cast<Scalar&>(m_values[m_id-1]); }
-
-    inline Index index() const { return m_indices[m_id-1]; }
-    inline Index outer() const { return m_outer; }
-    inline Index row() const { return IsRowMajor ? m_outer : index(); }
-    inline Index col() const { return IsRowMajor ? index() : m_outer; }
-
-    inline operator bool() const { return (m_id > m_start); }
-
-  protected:
-    const Scalar* m_values;
-    const Index* m_indices;
-    const Index m_outer;
-    Index m_id;
-    const Index m_start;
-};
-
 namespace internal {
 
-template<typename InputIterator, typename SparseMatrixType>
-void set_from_triplets(const InputIterator& begin, const InputIterator& end, SparseMatrixType& mat, int Options = 0)
+template<typename InputIterator, typename SparseMatrixType, typename DupFunctor>
+void set_from_triplets(const InputIterator& begin, const InputIterator& end, SparseMatrixType& mat, DupFunctor dup_func)
 {
-  EIGEN_UNUSED_VARIABLE(Options);
   enum { IsRowMajor = SparseMatrixType::IsRowMajor };
   typedef typename SparseMatrixType::Scalar Scalar;
-  typedef typename SparseMatrixType::Index Index;
-  SparseMatrix<Scalar,IsRowMajor?ColMajor:RowMajor,Index> trMat(mat.rows(),mat.cols());
+  typedef typename SparseMatrixType::StorageIndex StorageIndex;
+  SparseMatrix<Scalar,IsRowMajor?ColMajor:RowMajor,StorageIndex> trMat(mat.rows(),mat.cols());
 
   if(begin!=end)
   {
     // pass 1: count the nnz per inner-vector
-    Matrix<Index,Dynamic,1> wi(trMat.outerSize());
+    typename SparseMatrixType::IndexVector wi(trMat.outerSize());
     wi.setZero();
     for(InputIterator it(begin); it!=end; ++it)
     {
@@ -959,7 +941,7 @@ void set_from_triplets(const InputIterator& begin, const InputIterator& end, Spa
       trMat.insertBackUncompressed(it->row(),it->col()) = it->value();
 
     // pass 3:
-    trMat.sumupDuplicates();
+    trMat.collapseDuplicates(dup_func);
   }
 
   // pass 4: transposed copy -> implicit sorting
@@ -1006,26 +988,43 @@ void set_from_triplets(const InputIterator& begin, const InputIterator& end, Spa
   * an abstract iterator over a complex data-structure that would be expensive to evaluate. The triplets should rather
   * be explicitely stored into a std::vector for instance.
   */
-template<typename Scalar, int _Options, typename _Index>
+template<typename Scalar, int _Options, typename _StorageIndex>
 template<typename InputIterators>
-void SparseMatrix<Scalar,_Options,_Index>::setFromTriplets(const InputIterators& begin, const InputIterators& end)
+void SparseMatrix<Scalar,_Options,_StorageIndex>::setFromTriplets(const InputIterators& begin, const InputIterators& end)
+{
+  internal::set_from_triplets<InputIterators, SparseMatrix<Scalar,_Options,_StorageIndex> >(begin, end, *this, internal::scalar_sum_op<Scalar,Scalar>());
+}
+
+/** The same as setFromTriplets but when duplicates are met the functor \a dup_func is applied:
+  * \code
+  * value = dup_func(OldValue, NewValue)
+  * \endcode 
+  * Here is a C++11 example keeping the latest entry only:
+  * \code
+  * mat.setFromTriplets(triplets.begin(), triplets.end(), [] (const Scalar&,const Scalar &b) { return b; });
+  * \endcode
+  */
+template<typename Scalar, int _Options, typename _StorageIndex>
+template<typename InputIterators,typename DupFunctor>
+void SparseMatrix<Scalar,_Options,_StorageIndex>::setFromTriplets(const InputIterators& begin, const InputIterators& end, DupFunctor dup_func)
 {
-  internal::set_from_triplets(begin, end, *this);
+  internal::set_from_triplets<InputIterators, SparseMatrix<Scalar,_Options,_StorageIndex>, DupFunctor>(begin, end, *this, dup_func);
 }
 
 /** \internal */
-template<typename Scalar, int _Options, typename _Index>
-void SparseMatrix<Scalar,_Options,_Index>::sumupDuplicates()
+template<typename Scalar, int _Options, typename _StorageIndex>
+template<typename DupFunctor>
+void SparseMatrix<Scalar,_Options,_StorageIndex>::collapseDuplicates(DupFunctor dup_func)
 {
   eigen_assert(!isCompressed());
   // TODO, in practice we should be able to use m_innerNonZeros for that task
-  Matrix<Index,Dynamic,1> wi(innerSize());
+  IndexVector wi(innerSize());
   wi.fill(-1);
-  Index count = 0;
+  StorageIndex count = 0;
   // for each inner-vector, wi[inner_index] will hold the position of first element into the index/value buffers
   for(Index j=0; j<outerSize(); ++j)
   {
-    Index start   = count;
+    StorageIndex start   = count;
     Index oldEnd  = m_outerIndex[j]+m_innerNonZeros[j];
     for(Index k=m_outerIndex[j]; k<oldEnd; ++k)
     {
@@ -1033,7 +1032,7 @@ void SparseMatrix<Scalar,_Options,_Index>::sumupDuplicates()
       if(wi(i)>=start)
       {
         // we already meet this entry => accumulate it
-        m_data.value(wi(i)) += m_data.value(k);
+        m_data.value(wi(i)) = dup_func(m_data.value(wi(i)), m_data.value(k));
       }
       else
       {
@@ -1053,39 +1052,48 @@ void SparseMatrix<Scalar,_Options,_Index>::sumupDuplicates()
   m_data.resize(m_outerIndex[m_outerSize]);
 }
 
-template<typename Scalar, int _Options, typename _Index>
+template<typename Scalar, int _Options, typename _StorageIndex>
 template<typename OtherDerived>
-EIGEN_DONT_INLINE SparseMatrix<Scalar,_Options,_Index>& SparseMatrix<Scalar,_Options,_Index>::operator=(const SparseMatrixBase<OtherDerived>& other)
+EIGEN_DONT_INLINE SparseMatrix<Scalar,_Options,_StorageIndex>& SparseMatrix<Scalar,_Options,_StorageIndex>::operator=(const SparseMatrixBase<OtherDerived>& other)
 {
   EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename OtherDerived::Scalar>::value),
         YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
-  
-  const bool needToTranspose = (Flags & RowMajorBit) != (OtherDerived::Flags & RowMajorBit);
+
+  #ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+    EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+  #endif
+      
+  const bool needToTranspose = (Flags & RowMajorBit) != (internal::evaluator<OtherDerived>::Flags & RowMajorBit);
   if (needToTranspose)
   {
+    #ifdef EIGEN_SPARSE_TRANSPOSED_COPY_PLUGIN
+      EIGEN_SPARSE_TRANSPOSED_COPY_PLUGIN
+    #endif
     // two passes algorithm:
     //  1 - compute the number of coeffs per dest inner vector
     //  2 - do the actual copy/eval
     // Since each coeff of the rhs has to be evaluated twice, let's evaluate it if needed
-    typedef typename internal::nested<OtherDerived,2>::type OtherCopy;
+    typedef typename internal::nested_eval<OtherDerived,2,typename internal::plain_matrix_type<OtherDerived>::type >::type OtherCopy;
     typedef typename internal::remove_all<OtherCopy>::type _OtherCopy;
+    typedef internal::evaluator<_OtherCopy> OtherCopyEval;
     OtherCopy otherCopy(other.derived());
+    OtherCopyEval otherCopyEval(otherCopy);
 
     SparseMatrix dest(other.rows(),other.cols());
-    Eigen::Map<Matrix<Index, Dynamic, 1> > (dest.m_outerIndex,dest.outerSize()).setZero();
+    Eigen::Map<IndexVector> (dest.m_outerIndex,dest.outerSize()).setZero();
 
     // pass 1
     // FIXME the above copy could be merged with that pass
     for (Index j=0; j<otherCopy.outerSize(); ++j)
-      for (typename _OtherCopy::InnerIterator it(otherCopy, j); it; ++it)
+      for (typename OtherCopyEval::InnerIterator it(otherCopyEval, j); it; ++it)
         ++dest.m_outerIndex[it.index()];
 
     // prefix sum
-    Index count = 0;
-    Matrix<Index,Dynamic,1> positions(dest.outerSize());
+    StorageIndex count = 0;
+    IndexVector positions(dest.outerSize());
     for (Index j=0; j<dest.outerSize(); ++j)
     {
-      Index tmp = dest.m_outerIndex[j];
+      StorageIndex tmp = dest.m_outerIndex[j];
       dest.m_outerIndex[j] = count;
       positions[j] = count;
       count += tmp;
@@ -1094,9 +1102,9 @@ EIGEN_DONT_INLINE SparseMatrix<Scalar,_Options,_Index>& SparseMatrix<Scalar,_Opt
     // alloc
     dest.m_data.resize(count);
     // pass 2
-    for (Index j=0; j<otherCopy.outerSize(); ++j)
+    for (StorageIndex j=0; j<otherCopy.outerSize(); ++j)
     {
-      for (typename _OtherCopy::InnerIterator it(otherCopy, j); it; ++it)
+      for (typename OtherCopyEval::InnerIterator it(otherCopyEval, j); it; ++it)
       {
         Index pos = positions[it.index()]++;
         dest.m_data.index(pos) = j;
@@ -1109,26 +1117,148 @@ EIGEN_DONT_INLINE SparseMatrix<Scalar,_Options,_Index>& SparseMatrix<Scalar,_Opt
   else
   {
     if(other.isRValue())
+    {
       initAssignment(other.derived());
+    }
     // there is no special optimization
     return Base::operator=(other.derived());
   }
 }
 
-template<typename _Scalar, int _Options, typename _Index>
-EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_Index>::Scalar& SparseMatrix<_Scalar,_Options,_Index>::insertUncompressed(Index row, Index col)
+template<typename _Scalar, int _Options, typename _StorageIndex>
+typename SparseMatrix<_Scalar,_Options,_StorageIndex>::Scalar& SparseMatrix<_Scalar,_Options,_StorageIndex>::insert(Index row, Index col)
+{
+  eigen_assert(row>=0 && row<rows() && col>=0 && col<cols());
+  
+  const Index outer = IsRowMajor ? row : col;
+  const Index inner = IsRowMajor ? col : row;
+  
+  if(isCompressed())
+  {
+    if(nonZeros()==0)
+    {
+      // reserve space if not already done
+      if(m_data.allocatedSize()==0)
+        m_data.reserve(2*m_innerSize);
+      
+      // turn the matrix into non-compressed mode
+      m_innerNonZeros = static_cast<StorageIndex*>(std::malloc(m_outerSize * sizeof(StorageIndex)));
+      if(!m_innerNonZeros) internal::throw_std_bad_alloc();
+      
+      memset(m_innerNonZeros, 0, (m_outerSize)*sizeof(StorageIndex));
+      
+      // pack all inner-vectors to the end of the pre-allocated space
+      // and allocate the entire free-space to the first inner-vector
+      StorageIndex end = convert_index(m_data.allocatedSize());
+      for(Index j=1; j<=m_outerSize; ++j)
+        m_outerIndex[j] = end;
+    }
+    else
+    {
+      // turn the matrix into non-compressed mode
+      m_innerNonZeros = static_cast<StorageIndex*>(std::malloc(m_outerSize * sizeof(StorageIndex)));
+      if(!m_innerNonZeros) internal::throw_std_bad_alloc();
+      for(Index j=0; j<m_outerSize; ++j)
+        m_innerNonZeros[j] = m_outerIndex[j+1]-m_outerIndex[j];
+    }
+  }
+  
+  // check whether we can do a fast "push back" insertion
+  Index data_end = m_data.allocatedSize();
+  
+  // First case: we are filling a new inner vector which is packed at the end.
+  // We assume that all remaining inner-vectors are also empty and packed to the end.
+  if(m_outerIndex[outer]==data_end)
+  {
+    eigen_internal_assert(m_innerNonZeros[outer]==0);
+    
+    // pack previous empty inner-vectors to end of the used-space
+    // and allocate the entire free-space to the current inner-vector.
+    StorageIndex p = convert_index(m_data.size());
+    Index j = outer;
+    while(j>=0 && m_innerNonZeros[j]==0)
+      m_outerIndex[j--] = p;
+    
+    // push back the new element
+    ++m_innerNonZeros[outer];
+    m_data.append(Scalar(0), inner);
+    
+    // check for reallocation
+    if(data_end != m_data.allocatedSize())
+    {
+      // m_data has been reallocated
+      //  -> move remaining inner-vectors back to the end of the free-space
+      //     so that the entire free-space is allocated to the current inner-vector.
+      eigen_internal_assert(data_end < m_data.allocatedSize());
+      StorageIndex new_end = convert_index(m_data.allocatedSize());
+      for(Index k=outer+1; k<=m_outerSize; ++k)
+        if(m_outerIndex[k]==data_end)
+          m_outerIndex[k] = new_end;
+    }
+    return m_data.value(p);
+  }
+  
+  // Second case: the next inner-vector is packed to the end
+  // and the current inner-vector end match the used-space.
+  if(m_outerIndex[outer+1]==data_end && m_outerIndex[outer]+m_innerNonZeros[outer]==m_data.size())
+  {
+    eigen_internal_assert(outer+1==m_outerSize || m_innerNonZeros[outer+1]==0);
+    
+    // add space for the new element
+    ++m_innerNonZeros[outer];
+    m_data.resize(m_data.size()+1);
+    
+    // check for reallocation
+    if(data_end != m_data.allocatedSize())
+    {
+      // m_data has been reallocated
+      //  -> move remaining inner-vectors back to the end of the free-space
+      //     so that the entire free-space is allocated to the current inner-vector.
+      eigen_internal_assert(data_end < m_data.allocatedSize());
+      StorageIndex new_end = convert_index(m_data.allocatedSize());
+      for(Index k=outer+1; k<=m_outerSize; ++k)
+        if(m_outerIndex[k]==data_end)
+          m_outerIndex[k] = new_end;
+    }
+    
+    // and insert it at the right position (sorted insertion)
+    Index startId = m_outerIndex[outer];
+    Index p = m_outerIndex[outer]+m_innerNonZeros[outer]-1;
+    while ( (p > startId) && (m_data.index(p-1) > inner) )
+    {
+      m_data.index(p) = m_data.index(p-1);
+      m_data.value(p) = m_data.value(p-1);
+      --p;
+    }
+    
+    m_data.index(p) = convert_index(inner);
+    return (m_data.value(p) = 0);
+  }
+  
+  if(m_data.size() != m_data.allocatedSize())
+  {
+    // make sure the matrix is compatible to random un-compressed insertion:
+    m_data.resize(m_data.allocatedSize());
+    this->reserveInnerVectors(Array<StorageIndex,Dynamic,1>::Constant(m_outerSize, 2));
+  }
+  
+  return insertUncompressed(row,col);
+}
+    
+template<typename _Scalar, int _Options, typename _StorageIndex>
+EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_StorageIndex>::Scalar& SparseMatrix<_Scalar,_Options,_StorageIndex>::insertUncompressed(Index row, Index col)
 {
   eigen_assert(!isCompressed());
 
   const Index outer = IsRowMajor ? row : col;
-  const Index inner = IsRowMajor ? col : row;
+  const StorageIndex inner = convert_index(IsRowMajor ? col : row);
 
   Index room = m_outerIndex[outer+1] - m_outerIndex[outer];
-  Index innerNNZ = m_innerNonZeros[outer];
+  StorageIndex innerNNZ = m_innerNonZeros[outer];
   if(innerNNZ>=room)
   {
     // this inner vector is full, we need to reallocate the whole buffer :(
-    reserve(SingletonVector(outer,std::max<Index>(2,innerNNZ)));
+    reserve(SingletonVector(outer,std::max<StorageIndex>(2,innerNNZ)));
   }
 
   Index startId = m_outerIndex[outer];
@@ -1139,7 +1269,7 @@ EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_Index>::Scalar& Sparse
     m_data.value(p) = m_data.value(p-1);
     --p;
   }
-  eigen_assert((p<=startId || m_data.index(p-1)!=inner) && "you cannot insert an element that already exist, you must call coeffRef to this end");
+  eigen_assert((p<=startId || m_data.index(p-1)!=inner) && "you cannot insert an element that already exists, you must call coeffRef to this end");
 
   m_innerNonZeros[outer]++;
 
@@ -1147,8 +1277,8 @@ EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_Index>::Scalar& Sparse
   return (m_data.value(p) = 0);
 }
 
-template<typename _Scalar, int _Options, typename _Index>
-EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_Index>::Scalar& SparseMatrix<_Scalar,_Options,_Index>::insertCompressed(Index row, Index col)
+template<typename _Scalar, int _Options, typename _StorageIndex>
+EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_StorageIndex>::Scalar& SparseMatrix<_Scalar,_Options,_StorageIndex>::insertCompressed(Index row, Index col)
 {
   eigen_assert(isCompressed());
 
@@ -1161,7 +1291,7 @@ EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_Index>::Scalar& Sparse
     // we start a new inner vector
     while (previousOuter>=0 && m_outerIndex[previousOuter]==0)
     {
-      m_outerIndex[previousOuter] = static_cast<Index>(m_data.size());
+      m_outerIndex[previousOuter] = convert_index(m_data.size());
       --previousOuter;
     }
     m_outerIndex[outer+1] = m_outerIndex[outer];
@@ -1171,11 +1301,11 @@ EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_Index>::Scalar& Sparse
   // starts with: [ 0 0 0 0 0 1 ...] and we are inserted in, e.g.,
   // the 2nd inner vector...
   bool isLastVec = (!(previousOuter==-1 && m_data.size()!=0))
-                && (size_t(m_outerIndex[outer+1]) == m_data.size());
+                && (std::size_t(m_outerIndex[outer+1]) == m_data.size());
 
-  size_t startId = m_outerIndex[outer];
-  // FIXME let's make sure sizeof(long int) == sizeof(size_t)
-  size_t p = m_outerIndex[outer+1];
+  std::size_t startId = m_outerIndex[outer];
+  // FIXME let's make sure sizeof(long int) == sizeof(std::size_t)
+  std::size_t p = m_outerIndex[outer+1];
   ++m_outerIndex[outer+1];
 
   double reallocRatio = 1;
@@ -1254,6 +1384,20 @@ EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_Index>::Scalar& Sparse
   return (m_data.value(p) = 0);
 }
 
+namespace internal {
+
+template<typename _Scalar, int _Options, typename _StorageIndex>
+struct evaluator<SparseMatrix<_Scalar,_Options,_StorageIndex> >
+  : evaluator<SparseCompressedBase<SparseMatrix<_Scalar,_Options,_StorageIndex> > >
+{
+  typedef evaluator<SparseCompressedBase<SparseMatrix<_Scalar,_Options,_StorageIndex> > > Base;
+  typedef SparseMatrix<_Scalar,_Options,_StorageIndex> SparseMatrixType;
+  evaluator() : Base() {}
+  explicit evaluator(const SparseMatrixType &mat) : Base(mat) {}
+};
+
+}
+
 } // end namespace Eigen
 
 #endif // EIGEN_SPARSEMATRIX_H
diff --git a/Eigen/src/SparseCore/SparseMatrixBase.h b/Eigen/src/SparseCore/SparseMatrixBase.h
index 6b2169a7b..c6b548f11 100644
--- a/Eigen/src/SparseCore/SparseMatrixBase.h
+++ b/Eigen/src/SparseCore/SparseMatrixBase.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -18,32 +18,41 @@ namespace Eigen {
   *
   * \brief Base class of any sparse matrices or sparse expressions
   *
-  * \tparam Derived
+  * \tparam Derived is the derived type, e.g. a sparse matrix type, or an expression, etc.
   *
   * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_SPARSEMATRIXBASE_PLUGIN.
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_SPARSEMATRIXBASE_PLUGIN.
   */
-template<typename Derived> class SparseMatrixBase : public EigenBase<Derived>
+template<typename Derived> class SparseMatrixBase
+  : public EigenBase<Derived>
 {
   public:
 
     typedef typename internal::traits<Derived>::Scalar Scalar;
+    
+    /** The numeric type of the expression' coefficients, e.g. float, double, int or std::complex<float>, etc.
+      *
+      * It is an alias for the Scalar type */
+    typedef Scalar value_type;
+    
     typedef typename internal::packet_traits<Scalar>::type PacketScalar;
     typedef typename internal::traits<Derived>::StorageKind StorageKind;
-    typedef typename internal::traits<Derived>::Index Index;
+
+    /** The integer type used to \b store indices within a SparseMatrix.
+      * For a \c SparseMatrix<Scalar,Options,IndexType> it an alias of the third template parameter \c IndexType. */
+    typedef typename internal::traits<Derived>::StorageIndex StorageIndex;
+
     typedef typename internal::add_const_on_value_type_if_arithmetic<
                          typename internal::packet_traits<Scalar>::type
                      >::type PacketReturnType;
 
     typedef SparseMatrixBase StorageBaseType;
-    typedef EigenBase<Derived> Base;
+
+    typedef Matrix<StorageIndex,Dynamic,1> IndexVector;
+    typedef Matrix<Scalar,Dynamic,1> ScalarVector;
     
     template<typename OtherDerived>
-    Derived& operator=(const EigenBase<OtherDerived> &other)
-    {
-      other.derived().evalTo(derived());
-      return derived();
-    }
+    Derived& operator=(const EigenBase<OtherDerived> &other);
 
     enum {
 
@@ -83,11 +92,6 @@ template<typename Derived> class SparseMatrixBase : public EigenBase<Derived>
           * constructed from this one. See the \ref flags "list of flags".
           */
 
-      CoeffReadCost = internal::traits<Derived>::CoeffReadCost,
-        /**< This is a rough measure of how expensive it is to read one coefficient from
-          * this expression.
-          */
-
       IsRowMajor = Flags&RowMajorBit ? 1 : 0,
       
       InnerSizeAtCompileTime = int(IsVectorAtCompileTime) ? int(SizeAtCompileTime)
@@ -103,10 +107,11 @@ template<typename Derived> class SparseMatrixBase : public EigenBase<Derived>
                         CwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, Eigen::Transpose<const Derived> >,
                         Transpose<const Derived>
                      >::type AdjointReturnType;
+    typedef Transpose<Derived> TransposeReturnType;
+    typedef typename internal::add_const<Transpose<const Derived> >::type ConstTransposeReturnType;
 
-
-    typedef SparseMatrix<Scalar, Flags&RowMajorBit ? RowMajor : ColMajor, Index> PlainObject;
-
+    // FIXME storage order do not match evaluator storage order
+    typedef SparseMatrix<Scalar, Flags&RowMajorBit ? RowMajor : ColMajor, StorageIndex> PlainObject;
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     /** This is the "real scalar" type; if the \a Scalar type is already real numbers
@@ -124,6 +129,8 @@ template<typename Derived> class SparseMatrixBase : public EigenBase<Derived>
     /** \internal Represents a matrix with all coefficients equal to one another*/
     typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,Matrix<Scalar,Dynamic,Dynamic> > ConstantReturnType;
 
+    /** type of the equivalent dense matrix */
+    typedef Matrix<Scalar,RowsAtCompileTime,ColsAtCompileTime> DenseMatrixType;
     /** type of the equivalent square matrix */
     typedef Matrix<Scalar,EIGEN_SIZE_MAX(RowsAtCompileTime,ColsAtCompileTime),
                           EIGEN_SIZE_MAX(RowsAtCompileTime,ColsAtCompileTime)> SquareMatrixType;
@@ -132,9 +139,21 @@ template<typename Derived> class SparseMatrixBase : public EigenBase<Derived>
     inline Derived& derived() { return *static_cast<Derived*>(this); }
     inline Derived& const_cast_derived() const
     { return *static_cast<Derived*>(const_cast<SparseMatrixBase*>(this)); }
+
+    typedef EigenBase<Derived> Base;
+
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::SparseMatrixBase
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+#define EIGEN_DOC_UNARY_ADDONS(METHOD,OP)           /** <p>This method does not change the sparsity of \c *this: the OP is applied to explicitly stored coefficients only. \sa SparseCompressedBase::coeffs() </p> */
+#define EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL      /** <p> \warning This method returns a read-only expression for any sparse matrices. \sa \ref TutorialSparse_SubMatrices "Sparse block operations" </p> */
+#define EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(COND) /** <p> \warning This method returns a read-write expression for COND sparse matrices only. Otherwise, the returned expression is read-only. \sa \ref TutorialSparse_SubMatrices "Sparse block operations" </p> */
+#else
+#define EIGEN_DOC_UNARY_ADDONS(X,Y)
+#define EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+#define EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(COND)
+#endif
 #   include "../plugins/CommonCwiseUnaryOps.h"
 #   include "../plugins/CommonCwiseBinaryOps.h"
 #   include "../plugins/MatrixCwiseUnaryOps.h"
@@ -143,8 +162,10 @@ template<typename Derived> class SparseMatrixBase : public EigenBase<Derived>
 #   ifdef EIGEN_SPARSEMATRIXBASE_PLUGIN
 #     include EIGEN_SPARSEMATRIXBASE_PLUGIN
 #   endif
-#   undef EIGEN_CURRENT_STORAGE_BASE_CLASS
 #undef EIGEN_CURRENT_STORAGE_BASE_CLASS
+#undef EIGEN_DOC_UNARY_ADDONS
+#undef EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+#undef EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF
 
     /** \returns the number of rows. \sa cols() */
     inline Index rows() const { return derived().rows(); }
@@ -153,9 +174,6 @@ template<typename Derived> class SparseMatrixBase : public EigenBase<Derived>
     /** \returns the number of coefficients, which is \a rows()*cols().
       * \sa rows(), cols(). */
     inline Index size() const { return rows() * cols(); }
-    /** \returns the number of nonzero coefficients which is in practice the number
-      * of stored coefficients. */
-    inline Index nonZeros() const { return derived().nonZeros(); }
     /** \returns true if either the number of rows or the number of columns is equal to 1.
       * In other words, this function returns
       * \code rows()==1 || cols()==1 \endcode
@@ -175,93 +193,23 @@ template<typename Derived> class SparseMatrixBase : public EigenBase<Derived>
 
     
     template<typename OtherDerived>
-    Derived& operator=(const ReturnByValue<OtherDerived>& other)
-    {
-      other.evalTo(derived());
-      return derived();
-    }
-
+    Derived& operator=(const ReturnByValue<OtherDerived>& other);
 
     template<typename OtherDerived>
-    inline Derived& operator=(const SparseMatrixBase<OtherDerived>& other)
-    {
-      return assign(other.derived());
-    }
+    inline Derived& operator=(const SparseMatrixBase<OtherDerived>& other);
 
-    inline Derived& operator=(const Derived& other)
-    {
-//       if (other.isRValue())
-//         derived().swap(other.const_cast_derived());
-//       else
-      return assign(other.derived());
-    }
+    inline Derived& operator=(const Derived& other);
 
   protected:
 
     template<typename OtherDerived>
-    inline Derived& assign(const OtherDerived& other)
-    {
-      const bool transpose = (Flags & RowMajorBit) != (OtherDerived::Flags & RowMajorBit);
-      const Index outerSize = (int(OtherDerived::Flags) & RowMajorBit) ? other.rows() : other.cols();
-      if ((!transpose) && other.isRValue())
-      {
-        // eval without temporary
-        derived().resize(other.rows(), other.cols());
-        derived().setZero();
-        derived().reserve((std::max)(this->rows(),this->cols())*2);
-        for (Index j=0; j<outerSize; ++j)
-        {
-          derived().startVec(j);
-          for (typename OtherDerived::InnerIterator it(other, j); it; ++it)
-          {
-            Scalar v = it.value();
-            derived().insertBackByOuterInner(j,it.index()) = v;
-          }
-        }
-        derived().finalize();
-      }
-      else
-      {
-        assignGeneric(other);
-      }
-      return derived();
-    }
+    inline Derived& assign(const OtherDerived& other);
 
     template<typename OtherDerived>
-    inline void assignGeneric(const OtherDerived& other)
-    {
-      //const bool transpose = (Flags & RowMajorBit) != (OtherDerived::Flags & RowMajorBit);
-      eigen_assert(( ((internal::traits<Derived>::SupportedAccessPatterns&OuterRandomAccessPattern)==OuterRandomAccessPattern) ||
-                  (!((Flags & RowMajorBit) != (OtherDerived::Flags & RowMajorBit)))) &&
-                  "the transpose operation is supposed to be handled in SparseMatrix::operator=");
-
-      enum { Flip = (Flags & RowMajorBit) != (OtherDerived::Flags & RowMajorBit) };
-
-      const Index outerSize = other.outerSize();
-      //typedef typename internal::conditional<transpose, LinkedVectorMatrix<Scalar,Flags&RowMajorBit>, Derived>::type TempType;
-      // thanks to shallow copies, we always eval to a tempary
-      Derived temp(other.rows(), other.cols());
-
-      temp.reserve((std::max)(this->rows(),this->cols())*2);
-      for (Index j=0; j<outerSize; ++j)
-      {
-        temp.startVec(j);
-        for (typename OtherDerived::InnerIterator it(other.derived(), j); it; ++it)
-        {
-          Scalar v = it.value();
-          temp.insertBackByOuterInner(Flip?it.index():j,Flip?j:it.index()) = v;
-        }
-      }
-      temp.finalize();
-
-      derived() = temp.markAsRValue();
-    }
+    inline void assignGeneric(const OtherDerived& other);
 
   public:
 
-    template<typename Lhs, typename Rhs>
-    inline Derived& operator=(const SparseSparseProduct<Lhs,Rhs>& product);
-
     friend std::ostream & operator << (std::ostream & s, const SparseMatrixBase& m)
     {
       typedef typename Derived::Nested Nested;
@@ -269,11 +217,12 @@ template<typename Derived> class SparseMatrixBase : public EigenBase<Derived>
 
       if (Flags&RowMajorBit)
       {
-        const Nested nm(m.derived());
+        Nested nm(m.derived());
+        internal::evaluator<NestedCleaned> thisEval(nm);
         for (Index row=0; row<nm.outerSize(); ++row)
         {
           Index col = 0;
-          for (typename NestedCleaned::InnerIterator it(nm.derived(), row); it; ++it)
+          for (typename internal::evaluator<NestedCleaned>::InnerIterator it(thisEval, row); it; ++it)
           {
             for ( ; col<it.index(); ++col)
               s << "0 ";
@@ -287,10 +236,11 @@ template<typename Derived> class SparseMatrixBase : public EigenBase<Derived>
       }
       else
       {
-        const Nested nm(m.derived());
+        Nested nm(m.derived());
+        internal::evaluator<NestedCleaned> thisEval(nm);
         if (m.cols() == 1) {
           Index row = 0;
-          for (typename NestedCleaned::InnerIterator it(nm.derived(), 0); it; ++it)
+          for (typename internal::evaluator<NestedCleaned>::InnerIterator it(thisEval, 0); it; ++it)
           {
             for ( ; row<it.index(); ++row)
               s << "0" << std::endl;
@@ -302,8 +252,8 @@ template<typename Derived> class SparseMatrixBase : public EigenBase<Derived>
         }
         else
         {
-          SparseMatrix<Scalar, RowMajorBit, Index> trans = m;
-          s << static_cast<const SparseMatrixBase<SparseMatrix<Scalar, RowMajorBit, Index> >&>(trans);
+          SparseMatrix<Scalar, RowMajorBit, StorageIndex> trans = m;
+          s << static_cast<const SparseMatrixBase<SparseMatrix<Scalar, RowMajorBit, StorageIndex> >&>(trans);
         }
       }
       return s;
@@ -313,56 +263,65 @@ template<typename Derived> class SparseMatrixBase : public EigenBase<Derived>
     Derived& operator+=(const SparseMatrixBase<OtherDerived>& other);
     template<typename OtherDerived>
     Derived& operator-=(const SparseMatrixBase<OtherDerived>& other);
+    
+    template<typename OtherDerived>
+    Derived& operator+=(const DiagonalBase<OtherDerived>& other);
+    template<typename OtherDerived>
+    Derived& operator-=(const DiagonalBase<OtherDerived>& other);
+
+    template<typename OtherDerived>
+    Derived& operator+=(const EigenBase<OtherDerived> &other);
+    template<typename OtherDerived>
+    Derived& operator-=(const EigenBase<OtherDerived> &other);
 
     Derived& operator*=(const Scalar& other);
     Derived& operator/=(const Scalar& other);
 
-    #define EIGEN_SPARSE_CWISE_PRODUCT_RETURN_TYPE \
-      CwiseBinaryOp< \
-        internal::scalar_product_op< \
-          typename internal::scalar_product_traits< \
-            typename internal::traits<Derived>::Scalar, \
-            typename internal::traits<OtherDerived>::Scalar \
-          >::ReturnType \
-        >, \
-        const Derived, \
-        const OtherDerived \
-      >
+    template<typename OtherDerived> struct CwiseProductDenseReturnType {
+      typedef CwiseBinaryOp<internal::scalar_product_op<typename ScalarBinaryOpTraits<
+                                                          typename internal::traits<Derived>::Scalar,
+                                                          typename internal::traits<OtherDerived>::Scalar
+                                                        >::ReturnType>,
+                            const Derived,
+                            const OtherDerived
+                          > Type;
+    };
 
     template<typename OtherDerived>
-    EIGEN_STRONG_INLINE const EIGEN_SPARSE_CWISE_PRODUCT_RETURN_TYPE
+    EIGEN_STRONG_INLINE const typename CwiseProductDenseReturnType<OtherDerived>::Type
     cwiseProduct(const MatrixBase<OtherDerived> &other) const;
 
-    // sparse * sparse
-    template<typename OtherDerived>
-    const typename SparseSparseProductReturnType<Derived,OtherDerived>::Type
-    operator*(const SparseMatrixBase<OtherDerived> &other) const;
-
     // sparse * diagonal
     template<typename OtherDerived>
-    const SparseDiagonalProduct<Derived,OtherDerived>
-    operator*(const DiagonalBase<OtherDerived> &other) const;
+    const Product<Derived,OtherDerived>
+    operator*(const DiagonalBase<OtherDerived> &other) const
+    { return Product<Derived,OtherDerived>(derived(), other.derived()); }
 
     // diagonal * sparse
     template<typename OtherDerived> friend
-    const SparseDiagonalProduct<OtherDerived,Derived>
+    const Product<OtherDerived,Derived>
     operator*(const DiagonalBase<OtherDerived> &lhs, const SparseMatrixBase& rhs)
-    { return SparseDiagonalProduct<OtherDerived,Derived>(lhs.derived(), rhs.derived()); }
-
-    /** dense * sparse (return a dense object unless it is an outer product) */
-    template<typename OtherDerived> friend
-    const typename DenseSparseProductReturnType<OtherDerived,Derived>::Type
-    operator*(const MatrixBase<OtherDerived>& lhs, const Derived& rhs)
-    { return typename DenseSparseProductReturnType<OtherDerived,Derived>::Type(lhs.derived(),rhs); }
-
-    /** sparse * dense (returns a dense object unless it is an outer product) */
+    { return Product<OtherDerived,Derived>(lhs.derived(), rhs.derived()); }
+    
+    // sparse * sparse
+    template<typename OtherDerived>
+    const Product<Derived,OtherDerived,AliasFreeProduct>
+    operator*(const SparseMatrixBase<OtherDerived> &other) const;
+    
+    // sparse * dense
     template<typename OtherDerived>
-    const typename SparseDenseProductReturnType<Derived,OtherDerived>::Type
+    const Product<Derived,OtherDerived>
     operator*(const MatrixBase<OtherDerived> &other) const
-    { return typename SparseDenseProductReturnType<Derived,OtherDerived>::Type(derived(), other.derived()); }
+    { return Product<Derived,OtherDerived>(derived(), other.derived()); }
+    
+    // dense * sparse
+    template<typename OtherDerived> friend
+    const Product<OtherDerived,Derived>
+    operator*(const MatrixBase<OtherDerived> &lhs, const SparseMatrixBase& rhs)
+    { return Product<OtherDerived,Derived>(lhs.derived(), rhs.derived()); }
     
      /** \returns an expression of P H P^-1 where H is the matrix represented by \c *this */
-    SparseSymmetricPermutationProduct<Derived,Upper|Lower> twistedBy(const PermutationMatrix<Dynamic,Dynamic,Index>& perm) const
+    SparseSymmetricPermutationProduct<Derived,Upper|Lower> twistedBy(const PermutationMatrix<Dynamic,Dynamic,StorageIndex>& perm) const
     {
       return SparseSymmetricPermutationProduct<Derived,Upper|Lower>(derived(), perm);
     }
@@ -370,22 +329,16 @@ template<typename Derived> class SparseMatrixBase : public EigenBase<Derived>
     template<typename OtherDerived>
     Derived& operator*=(const SparseMatrixBase<OtherDerived>& other);
 
-    #ifdef EIGEN2_SUPPORT
-    // deprecated
-    template<typename OtherDerived>
-    typename internal::plain_matrix_type_column_major<OtherDerived>::type
-    solveTriangular(const MatrixBase<OtherDerived>& other) const;
-
-    // deprecated
-    template<typename OtherDerived>
-    void solveTriangularInPlace(MatrixBase<OtherDerived>& other) const;
-    #endif // EIGEN2_SUPPORT
-
     template<int Mode>
-    inline const SparseTriangularView<Derived, Mode> triangularView() const;
+    inline const TriangularView<const Derived, Mode> triangularView() const;
+    
+    template<unsigned int UpLo> struct SelfAdjointViewReturnType { typedef SparseSelfAdjointView<Derived, UpLo> Type; };
+    template<unsigned int UpLo> struct ConstSelfAdjointViewReturnType { typedef const SparseSelfAdjointView<const Derived, UpLo> Type; };
 
-    template<unsigned int UpLo> inline const SparseSelfAdjointView<Derived, UpLo> selfadjointView() const;
-    template<unsigned int UpLo> inline SparseSelfAdjointView<Derived, UpLo> selfadjointView();
+    template<unsigned int UpLo> inline 
+    typename ConstSelfAdjointViewReturnType<UpLo>::Type selfadjointView() const;
+    template<unsigned int UpLo> inline
+    typename SelfAdjointViewReturnType<UpLo>::Type selfadjointView();
 
     template<typename OtherDerived> Scalar dot(const MatrixBase<OtherDerived>& other) const;
     template<typename OtherDerived> Scalar dot(const SparseMatrixBase<OtherDerived>& other) const;
@@ -393,9 +346,9 @@ template<typename Derived> class SparseMatrixBase : public EigenBase<Derived>
     RealScalar norm()  const;
     RealScalar blueNorm() const;
 
-    Transpose<Derived> transpose() { return derived(); }
-    const Transpose<const Derived> transpose() const { return derived(); }
-    const AdjointReturnType adjoint() const { return transpose(); }
+    TransposeReturnType transpose() { return TransposeReturnType(derived()); }
+    const ConstTransposeReturnType transpose() const { return ConstTransposeReturnType(derived()); }
+    const AdjointReturnType adjoint() const { return AdjointReturnType(transpose()); }
 
     // inner-vector
     typedef Block<Derived,IsRowMajor?1:Dynamic,IsRowMajor?Dynamic:1,true>       InnerVectorReturnType;
@@ -409,25 +362,14 @@ template<typename Derived> class SparseMatrixBase : public EigenBase<Derived>
     InnerVectorsReturnType innerVectors(Index outerStart, Index outerSize);
     const ConstInnerVectorsReturnType innerVectors(Index outerStart, Index outerSize) const;
 
-    /** \internal use operator= */
-    template<typename DenseDerived>
-    void evalTo(MatrixBase<DenseDerived>& dst) const
-    {
-      dst.setZero();
-      for (Index j=0; j<outerSize(); ++j)
-        for (typename Derived::InnerIterator i(derived(),j); i; ++i)
-          dst.coeffRef(i.row(),i.col()) = i.value();
-    }
-
-    Matrix<Scalar,RowsAtCompileTime,ColsAtCompileTime> toDense() const
+    DenseMatrixType toDense() const
     {
-      return derived();
+      return DenseMatrixType(derived());
     }
 
     template<typename OtherDerived>
     bool isApprox(const SparseMatrixBase<OtherDerived>& other,
-                  const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const
-    { return toDense().isApprox(other.toDense(),prec); }
+                  const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
 
     template<typename OtherDerived>
     bool isApprox(const MatrixBase<OtherDerived>& other,
@@ -443,10 +385,19 @@ template<typename Derived> class SparseMatrixBase : public EigenBase<Derived>
     { return typename internal::eval<Derived>::type(derived()); }
 
     Scalar sum() const;
+    
+    inline const SparseView<Derived>
+    pruned(const Scalar& reference = Scalar(0), const RealScalar& epsilon = NumTraits<Scalar>::dummy_precision()) const;
 
   protected:
 
     bool m_isRValue;
+
+    static inline StorageIndex convert_index(const Index idx) {
+      return internal::convert_index<StorageIndex>(idx);
+    }
+  private:
+    template<typename Dest> void evalTo(Dest &) const;
 };
 
 } // end namespace Eigen
diff --git a/Eigen/src/SparseCore/SparsePermutation.h b/Eigen/src/SparseCore/SparsePermutation.h
index 75e210009..ef38357ae 100644
--- a/Eigen/src/SparseCore/SparsePermutation.h
+++ b/Eigen/src/SparseCore/SparsePermutation.h
@@ -16,131 +16,161 @@ namespace Eigen {
 
 namespace internal {
 
-template<typename PermutationType, typename MatrixType, int Side, bool Transposed>
-struct traits<permut_sparsematrix_product_retval<PermutationType, MatrixType, Side, Transposed> >
+template<typename ExpressionType, int Side, bool Transposed>
+struct permutation_matrix_product<ExpressionType, Side, Transposed, SparseShape>
 {
-  typedef typename remove_all<typename MatrixType::Nested>::type MatrixTypeNestedCleaned;
-  typedef typename MatrixTypeNestedCleaned::Scalar Scalar;
-  typedef typename MatrixTypeNestedCleaned::Index Index;
-  enum {
-    SrcStorageOrder = MatrixTypeNestedCleaned::Flags&RowMajorBit ? RowMajor : ColMajor,
-    MoveOuter = SrcStorageOrder==RowMajor ? Side==OnTheLeft : Side==OnTheRight
-  };
+    typedef typename nested_eval<ExpressionType, 1>::type MatrixType;
+    typedef typename remove_all<MatrixType>::type MatrixTypeCleaned;
 
-  typedef typename internal::conditional<MoveOuter,
-        SparseMatrix<Scalar,SrcStorageOrder,Index>,
-        SparseMatrix<Scalar,int(SrcStorageOrder)==RowMajor?ColMajor:RowMajor,Index> >::type ReturnType;
-};
-
-template<typename PermutationType, typename MatrixType, int Side, bool Transposed>
-struct permut_sparsematrix_product_retval
- : public ReturnByValue<permut_sparsematrix_product_retval<PermutationType, MatrixType, Side, Transposed> >
-{
-    typedef typename remove_all<typename MatrixType::Nested>::type MatrixTypeNestedCleaned;
-    typedef typename MatrixTypeNestedCleaned::Scalar Scalar;
-    typedef typename MatrixTypeNestedCleaned::Index Index;
+    typedef typename MatrixTypeCleaned::Scalar Scalar;
+    typedef typename MatrixTypeCleaned::StorageIndex StorageIndex;
 
     enum {
-      SrcStorageOrder = MatrixTypeNestedCleaned::Flags&RowMajorBit ? RowMajor : ColMajor,
+      SrcStorageOrder = MatrixTypeCleaned::Flags&RowMajorBit ? RowMajor : ColMajor,
       MoveOuter = SrcStorageOrder==RowMajor ? Side==OnTheLeft : Side==OnTheRight
     };
+    
+    typedef typename internal::conditional<MoveOuter,
+        SparseMatrix<Scalar,SrcStorageOrder,StorageIndex>,
+        SparseMatrix<Scalar,int(SrcStorageOrder)==RowMajor?ColMajor:RowMajor,StorageIndex> >::type ReturnType;
 
-    permut_sparsematrix_product_retval(const PermutationType& perm, const MatrixType& matrix)
-      : m_permutation(perm), m_matrix(matrix)
-    {}
-
-    inline int rows() const { return m_matrix.rows(); }
-    inline int cols() const { return m_matrix.cols(); }
-
-    template<typename Dest> inline void evalTo(Dest& dst) const
+    template<typename Dest,typename PermutationType>
+    static inline void run(Dest& dst, const PermutationType& perm, const ExpressionType& xpr)
     {
+      MatrixType mat(xpr);
       if(MoveOuter)
       {
-        SparseMatrix<Scalar,SrcStorageOrder,Index> tmp(m_matrix.rows(), m_matrix.cols());
-        Matrix<Index,Dynamic,1> sizes(m_matrix.outerSize());
-        for(Index j=0; j<m_matrix.outerSize(); ++j)
+        SparseMatrix<Scalar,SrcStorageOrder,StorageIndex> tmp(mat.rows(), mat.cols());
+        Matrix<StorageIndex,Dynamic,1> sizes(mat.outerSize());
+        for(Index j=0; j<mat.outerSize(); ++j)
         {
-          Index jp = m_permutation.indices().coeff(j);
-          sizes[((Side==OnTheLeft) ^ Transposed) ? jp : j] = m_matrix.innerVector(((Side==OnTheRight) ^ Transposed) ? jp : j).nonZeros();
+          Index jp = perm.indices().coeff(j);
+          sizes[((Side==OnTheLeft) ^ Transposed) ? jp : j] = StorageIndex(mat.innerVector(((Side==OnTheRight) ^ Transposed) ? jp : j).nonZeros());
         }
         tmp.reserve(sizes);
-        for(Index j=0; j<m_matrix.outerSize(); ++j)
+        for(Index j=0; j<mat.outerSize(); ++j)
         {
-          Index jp = m_permutation.indices().coeff(j);
+          Index jp = perm.indices().coeff(j);
           Index jsrc = ((Side==OnTheRight) ^ Transposed) ? jp : j;
           Index jdst = ((Side==OnTheLeft) ^ Transposed) ? jp : j;
-          for(typename MatrixTypeNestedCleaned::InnerIterator it(m_matrix,jsrc); it; ++it)
+          for(typename MatrixTypeCleaned::InnerIterator it(mat,jsrc); it; ++it)
             tmp.insertByOuterInner(jdst,it.index()) = it.value();
         }
         dst = tmp;
       }
       else
       {
-        SparseMatrix<Scalar,int(SrcStorageOrder)==RowMajor?ColMajor:RowMajor,Index> tmp(m_matrix.rows(), m_matrix.cols());
-        Matrix<Index,Dynamic,1> sizes(tmp.outerSize());
+        SparseMatrix<Scalar,int(SrcStorageOrder)==RowMajor?ColMajor:RowMajor,StorageIndex> tmp(mat.rows(), mat.cols());
+        Matrix<StorageIndex,Dynamic,1> sizes(tmp.outerSize());
         sizes.setZero();
-        PermutationMatrix<Dynamic,Dynamic,Index> perm;
+        PermutationMatrix<Dynamic,Dynamic,StorageIndex> perm_cpy;
         if((Side==OnTheLeft) ^ Transposed)
-          perm = m_permutation;
+          perm_cpy = perm;
         else
-          perm = m_permutation.transpose();
+          perm_cpy = perm.transpose();
 
-        for(Index j=0; j<m_matrix.outerSize(); ++j)
-          for(typename MatrixTypeNestedCleaned::InnerIterator it(m_matrix,j); it; ++it)
-            sizes[perm.indices().coeff(it.index())]++;
+        for(Index j=0; j<mat.outerSize(); ++j)
+          for(typename MatrixTypeCleaned::InnerIterator it(mat,j); it; ++it)
+            sizes[perm_cpy.indices().coeff(it.index())]++;
         tmp.reserve(sizes);
-        for(Index j=0; j<m_matrix.outerSize(); ++j)
-          for(typename MatrixTypeNestedCleaned::InnerIterator it(m_matrix,j); it; ++it)
-            tmp.insertByOuterInner(perm.indices().coeff(it.index()),j) = it.value();
+        for(Index j=0; j<mat.outerSize(); ++j)
+          for(typename MatrixTypeCleaned::InnerIterator it(mat,j); it; ++it)
+            tmp.insertByOuterInner(perm_cpy.indices().coeff(it.index()),j) = it.value();
         dst = tmp;
       }
     }
-
-  protected:
-    const PermutationType& m_permutation;
-    typename MatrixType::Nested m_matrix;
 };
 
 }
 
+namespace internal {
+
+template <int ProductTag> struct product_promote_storage_type<Sparse,             PermutationStorage, ProductTag> { typedef Sparse ret; };
+template <int ProductTag> struct product_promote_storage_type<PermutationStorage, Sparse,             ProductTag> { typedef Sparse ret; };
+
+// TODO, the following two overloads are only needed to define the right temporary type through 
+// typename traits<permutation_sparse_matrix_product<Rhs,Lhs,OnTheRight,false> >::ReturnType
+// whereas it should be correctly handled by traits<Product<> >::PlainObject
+
+template<typename Lhs, typename Rhs, int ProductTag>
+struct product_evaluator<Product<Lhs, Rhs, AliasFreeProduct>, ProductTag, PermutationShape, SparseShape>
+  : public evaluator<typename permutation_matrix_product<Rhs,OnTheLeft,false,SparseShape>::ReturnType>
+{
+  typedef Product<Lhs, Rhs, AliasFreeProduct> XprType;
+  typedef typename permutation_matrix_product<Rhs,OnTheLeft,false,SparseShape>::ReturnType PlainObject;
+  typedef evaluator<PlainObject> Base;
+
+  enum {
+    Flags = Base::Flags | EvalBeforeNestingBit
+  };
+
+  explicit product_evaluator(const XprType& xpr)
+    : m_result(xpr.rows(), xpr.cols())
+  {
+    ::new (static_cast<Base*>(this)) Base(m_result);
+    generic_product_impl<Lhs, Rhs, PermutationShape, SparseShape, ProductTag>::evalTo(m_result, xpr.lhs(), xpr.rhs());
+  }
 
+protected:
+  PlainObject m_result;
+};
+
+template<typename Lhs, typename Rhs, int ProductTag>
+struct product_evaluator<Product<Lhs, Rhs, AliasFreeProduct>, ProductTag, SparseShape, PermutationShape >
+  : public evaluator<typename permutation_matrix_product<Lhs,OnTheRight,false,SparseShape>::ReturnType>
+{
+  typedef Product<Lhs, Rhs, AliasFreeProduct> XprType;
+  typedef typename permutation_matrix_product<Lhs,OnTheRight,false,SparseShape>::ReturnType PlainObject;
+  typedef evaluator<PlainObject> Base;
+
+  enum {
+    Flags = Base::Flags | EvalBeforeNestingBit
+  };
+
+  explicit product_evaluator(const XprType& xpr)
+    : m_result(xpr.rows(), xpr.cols())
+  {
+    ::new (static_cast<Base*>(this)) Base(m_result);
+    generic_product_impl<Lhs, Rhs, SparseShape, PermutationShape, ProductTag>::evalTo(m_result, xpr.lhs(), xpr.rhs());
+  }
+
+protected:
+  PlainObject m_result;
+};
+
+} // end namespace internal
 
 /** \returns the matrix with the permutation applied to the columns
   */
 template<typename SparseDerived, typename PermDerived>
-inline const internal::permut_sparsematrix_product_retval<PermutationBase<PermDerived>, SparseDerived, OnTheRight, false>
+inline const Product<SparseDerived, PermDerived, AliasFreeProduct>
 operator*(const SparseMatrixBase<SparseDerived>& matrix, const PermutationBase<PermDerived>& perm)
-{
-  return internal::permut_sparsematrix_product_retval<PermutationBase<PermDerived>, SparseDerived, OnTheRight, false>(perm, matrix.derived());
-}
+{ return Product<SparseDerived, PermDerived, AliasFreeProduct>(matrix.derived(), perm.derived()); }
 
 /** \returns the matrix with the permutation applied to the rows
   */
 template<typename SparseDerived, typename PermDerived>
-inline const internal::permut_sparsematrix_product_retval<PermutationBase<PermDerived>, SparseDerived, OnTheLeft, false>
+inline const Product<PermDerived, SparseDerived, AliasFreeProduct>
 operator*( const PermutationBase<PermDerived>& perm, const SparseMatrixBase<SparseDerived>& matrix)
-{
-  return internal::permut_sparsematrix_product_retval<PermutationBase<PermDerived>, SparseDerived, OnTheLeft, false>(perm, matrix.derived());
-}
-
+{ return  Product<PermDerived, SparseDerived, AliasFreeProduct>(perm.derived(), matrix.derived()); }
 
 
 /** \returns the matrix with the inverse permutation applied to the columns.
   */
-template<typename SparseDerived, typename PermDerived>
-inline const internal::permut_sparsematrix_product_retval<PermutationBase<PermDerived>, SparseDerived, OnTheRight, true>
-operator*(const SparseMatrixBase<SparseDerived>& matrix, const Transpose<PermutationBase<PermDerived> >& tperm)
+template<typename SparseDerived, typename PermutationType>
+inline const Product<SparseDerived, Inverse<PermutationType>, AliasFreeProduct>
+operator*(const SparseMatrixBase<SparseDerived>& matrix, const InverseImpl<PermutationType, PermutationStorage>& tperm)
 {
-  return internal::permut_sparsematrix_product_retval<PermutationBase<PermDerived>, SparseDerived, OnTheRight, true>(tperm.nestedPermutation(), matrix.derived());
+  return Product<SparseDerived, Inverse<PermutationType>, AliasFreeProduct>(matrix.derived(), tperm.derived());
 }
 
 /** \returns the matrix with the inverse permutation applied to the rows.
   */
-template<typename SparseDerived, typename PermDerived>
-inline const internal::permut_sparsematrix_product_retval<PermutationBase<PermDerived>, SparseDerived, OnTheLeft, true>
-operator*(const Transpose<PermutationBase<PermDerived> >& tperm, const SparseMatrixBase<SparseDerived>& matrix)
+template<typename SparseDerived, typename PermutationType>
+inline const Product<Inverse<PermutationType>, SparseDerived, AliasFreeProduct>
+operator*(const InverseImpl<PermutationType,PermutationStorage>& tperm, const SparseMatrixBase<SparseDerived>& matrix)
 {
-  return internal::permut_sparsematrix_product_retval<PermutationBase<PermDerived>, SparseDerived, OnTheLeft, true>(tperm.nestedPermutation(), matrix.derived());
+  return Product<Inverse<PermutationType>, SparseDerived, AliasFreeProduct>(tperm.derived(), matrix.derived());
 }
 
 } // end namespace Eigen
diff --git a/Eigen/src/SparseCore/SparseProduct.h b/Eigen/src/SparseCore/SparseProduct.h
index cf7663070..4cbf68781 100644
--- a/Eigen/src/SparseCore/SparseProduct.h
+++ b/Eigen/src/SparseCore/SparseProduct.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -12,158 +12,6 @@
 
 namespace Eigen { 
 
-template<typename Lhs, typename Rhs>
-struct SparseSparseProductReturnType
-{
-  typedef typename internal::traits<Lhs>::Scalar Scalar;
-  typedef typename internal::traits<Lhs>::Index Index;
-  enum {
-    LhsRowMajor = internal::traits<Lhs>::Flags & RowMajorBit,
-    RhsRowMajor = internal::traits<Rhs>::Flags & RowMajorBit,
-    TransposeRhs = (!LhsRowMajor) && RhsRowMajor,
-    TransposeLhs = LhsRowMajor && (!RhsRowMajor)
-  };
-
-  typedef typename internal::conditional<TransposeLhs,
-    SparseMatrix<Scalar,0,Index>,
-    typename internal::nested<Lhs,Rhs::RowsAtCompileTime>::type>::type LhsNested;
-
-  typedef typename internal::conditional<TransposeRhs,
-    SparseMatrix<Scalar,0,Index>,
-    typename internal::nested<Rhs,Lhs::RowsAtCompileTime>::type>::type RhsNested;
-
-  typedef SparseSparseProduct<LhsNested, RhsNested> Type;
-};
-
-namespace internal {
-template<typename LhsNested, typename RhsNested>
-struct traits<SparseSparseProduct<LhsNested, RhsNested> >
-{
-  typedef MatrixXpr XprKind;
-  // clean the nested types:
-  typedef typename remove_all<LhsNested>::type _LhsNested;
-  typedef typename remove_all<RhsNested>::type _RhsNested;
-  typedef typename _LhsNested::Scalar Scalar;
-  typedef typename promote_index_type<typename traits<_LhsNested>::Index,
-                                         typename traits<_RhsNested>::Index>::type Index;
-
-  enum {
-    LhsCoeffReadCost = _LhsNested::CoeffReadCost,
-    RhsCoeffReadCost = _RhsNested::CoeffReadCost,
-    LhsFlags = _LhsNested::Flags,
-    RhsFlags = _RhsNested::Flags,
-
-    RowsAtCompileTime    = _LhsNested::RowsAtCompileTime,
-    ColsAtCompileTime    = _RhsNested::ColsAtCompileTime,
-    MaxRowsAtCompileTime = _LhsNested::MaxRowsAtCompileTime,
-    MaxColsAtCompileTime = _RhsNested::MaxColsAtCompileTime,
-
-    InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(_LhsNested::ColsAtCompileTime, _RhsNested::RowsAtCompileTime),
-
-    EvalToRowMajor = (RhsFlags & LhsFlags & RowMajorBit),
-
-    RemovedBits = ~(EvalToRowMajor ? 0 : RowMajorBit),
-
-    Flags = (int(LhsFlags | RhsFlags) & HereditaryBits & RemovedBits)
-          | EvalBeforeAssigningBit
-          | EvalBeforeNestingBit,
-
-    CoeffReadCost = Dynamic
-  };
-
-  typedef Sparse StorageKind;
-};
-
-} // end namespace internal
-
-template<typename LhsNested, typename RhsNested>
-class SparseSparseProduct : internal::no_assignment_operator,
-  public SparseMatrixBase<SparseSparseProduct<LhsNested, RhsNested> >
-{
-  public:
-
-    typedef SparseMatrixBase<SparseSparseProduct> Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(SparseSparseProduct)
-
-  private:
-
-    typedef typename internal::traits<SparseSparseProduct>::_LhsNested _LhsNested;
-    typedef typename internal::traits<SparseSparseProduct>::_RhsNested _RhsNested;
-
-  public:
-
-    template<typename Lhs, typename Rhs>
-    EIGEN_STRONG_INLINE SparseSparseProduct(const Lhs& lhs, const Rhs& rhs)
-      : m_lhs(lhs), m_rhs(rhs), m_tolerance(0), m_conservative(true)
-    {
-      init();
-    }
-
-    template<typename Lhs, typename Rhs>
-    EIGEN_STRONG_INLINE SparseSparseProduct(const Lhs& lhs, const Rhs& rhs, const RealScalar& tolerance)
-      : m_lhs(lhs), m_rhs(rhs), m_tolerance(tolerance), m_conservative(false)
-    {
-      init();
-    }
-
-    SparseSparseProduct pruned(const Scalar& reference = 0, const RealScalar& epsilon = NumTraits<RealScalar>::dummy_precision()) const
-    {
-      using std::abs;
-      return SparseSparseProduct(m_lhs,m_rhs,abs(reference)*epsilon);
-    }
-
-    template<typename Dest>
-    void evalTo(Dest& result) const
-    {
-      if(m_conservative)
-        internal::conservative_sparse_sparse_product_selector<_LhsNested, _RhsNested, Dest>::run(lhs(),rhs(),result);
-      else
-        internal::sparse_sparse_product_with_pruning_selector<_LhsNested, _RhsNested, Dest>::run(lhs(),rhs(),result,m_tolerance);
-    }
-
-    EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); }
-    EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); }
-
-    EIGEN_STRONG_INLINE const _LhsNested& lhs() const { return m_lhs; }
-    EIGEN_STRONG_INLINE const _RhsNested& rhs() const { return m_rhs; }
-
-  protected:
-    void init()
-    {
-      eigen_assert(m_lhs.cols() == m_rhs.rows());
-
-      enum {
-        ProductIsValid = _LhsNested::ColsAtCompileTime==Dynamic
-                      || _RhsNested::RowsAtCompileTime==Dynamic
-                      || int(_LhsNested::ColsAtCompileTime)==int(_RhsNested::RowsAtCompileTime),
-        AreVectors = _LhsNested::IsVectorAtCompileTime && _RhsNested::IsVectorAtCompileTime,
-        SameSizes = EIGEN_PREDICATE_SAME_MATRIX_SIZE(_LhsNested,_RhsNested)
-      };
-      // note to the lost user:
-      //    * for a dot product use: v1.dot(v2)
-      //    * for a coeff-wise product use: v1.cwise()*v2
-      EIGEN_STATIC_ASSERT(ProductIsValid || !(AreVectors && SameSizes),
-        INVALID_VECTOR_VECTOR_PRODUCT__IF_YOU_WANTED_A_DOT_OR_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTIONS)
-      EIGEN_STATIC_ASSERT(ProductIsValid || !(SameSizes && !AreVectors),
-        INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION)
-      EIGEN_STATIC_ASSERT(ProductIsValid || SameSizes, INVALID_MATRIX_PRODUCT)
-    }
-
-    LhsNested m_lhs;
-    RhsNested m_rhs;
-    RealScalar m_tolerance;
-    bool m_conservative;
-};
-
-// sparse = sparse * sparse
-template<typename Derived>
-template<typename Lhs, typename Rhs>
-inline Derived& SparseMatrixBase<Derived>::operator=(const SparseSparseProduct<Lhs,Rhs>& product)
-{
-  product.evalTo(derived());
-  return derived();
-}
-
 /** \returns an expression of the product of two sparse matrices.
   * By default a conservative product preserving the symbolic non zeros is performed.
   * The automatic pruning of the small values can be achieved by calling the pruned() function
@@ -177,12 +25,145 @@ inline Derived& SparseMatrixBase<Derived>::operator=(const SparseSparseProduct<L
   * */
 template<typename Derived>
 template<typename OtherDerived>
-inline const typename SparseSparseProductReturnType<Derived,OtherDerived>::Type
+inline const Product<Derived,OtherDerived,AliasFreeProduct>
 SparseMatrixBase<Derived>::operator*(const SparseMatrixBase<OtherDerived> &other) const
 {
-  return typename SparseSparseProductReturnType<Derived,OtherDerived>::Type(derived(), other.derived());
+  return Product<Derived,OtherDerived,AliasFreeProduct>(derived(), other.derived());
 }
 
+namespace internal {
+
+// sparse * sparse
+template<typename Lhs, typename Rhs, int ProductType>
+struct generic_product_impl<Lhs, Rhs, SparseShape, SparseShape, ProductType>
+{
+  template<typename Dest>
+  static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    evalTo(dst, lhs, rhs, typename evaluator_traits<Dest>::Shape());
+  }
+
+  // dense += sparse * sparse
+  template<typename Dest,typename ActualLhs>
+  static void addTo(Dest& dst, const ActualLhs& lhs, const Rhs& rhs, typename enable_if<is_same<typename evaluator_traits<Dest>::Shape,DenseShape>::value,int*>::type* = 0)
+  {
+    typedef typename nested_eval<ActualLhs,Dynamic>::type LhsNested;
+    typedef typename nested_eval<Rhs,Dynamic>::type RhsNested;
+    LhsNested lhsNested(lhs);
+    RhsNested rhsNested(rhs);
+    internal::sparse_sparse_to_dense_product_selector<typename remove_all<LhsNested>::type,
+                                                      typename remove_all<RhsNested>::type, Dest>::run(lhsNested,rhsNested,dst);
+  }
+
+  // dense -= sparse * sparse
+  template<typename Dest>
+  static void subTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, typename enable_if<is_same<typename evaluator_traits<Dest>::Shape,DenseShape>::value,int*>::type* = 0)
+  {
+    addTo(dst, -lhs, rhs);
+  }
+
+protected:
+
+  // sparse = sparse * sparse
+  template<typename Dest>
+  static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, SparseShape)
+  {
+    typedef typename nested_eval<Lhs,Dynamic>::type LhsNested;
+    typedef typename nested_eval<Rhs,Dynamic>::type RhsNested;
+    LhsNested lhsNested(lhs);
+    RhsNested rhsNested(rhs);
+    internal::conservative_sparse_sparse_product_selector<typename remove_all<LhsNested>::type,
+                                                          typename remove_all<RhsNested>::type, Dest>::run(lhsNested,rhsNested,dst);
+  }
+
+  // dense = sparse * sparse
+  template<typename Dest>
+  static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, DenseShape)
+  {
+    dst.setZero();
+    addTo(dst, lhs, rhs);
+  }
+};
+
+// sparse * sparse-triangular
+template<typename Lhs, typename Rhs, int ProductType>
+struct generic_product_impl<Lhs, Rhs, SparseShape, SparseTriangularShape, ProductType>
+ : public generic_product_impl<Lhs, Rhs, SparseShape, SparseShape, ProductType>
+{};
+
+// sparse-triangular * sparse
+template<typename Lhs, typename Rhs, int ProductType>
+struct generic_product_impl<Lhs, Rhs, SparseTriangularShape, SparseShape, ProductType>
+ : public generic_product_impl<Lhs, Rhs, SparseShape, SparseShape, ProductType>
+{};
+
+// dense = sparse-product (can be sparse*sparse, sparse*perm, etc.)
+template< typename DstXprType, typename Lhs, typename Rhs>
+struct Assignment<DstXprType, Product<Lhs,Rhs,AliasFreeProduct>, internal::assign_op<typename DstXprType::Scalar,typename Product<Lhs,Rhs,AliasFreeProduct>::Scalar>, Sparse2Dense>
+{
+  typedef Product<Lhs,Rhs,AliasFreeProduct> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &)
+  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
+    
+    generic_product_impl<Lhs, Rhs>::evalTo(dst,src.lhs(),src.rhs());
+  }
+};
+
+// dense += sparse-product (can be sparse*sparse, sparse*perm, etc.)
+template< typename DstXprType, typename Lhs, typename Rhs>
+struct Assignment<DstXprType, Product<Lhs,Rhs,AliasFreeProduct>, internal::add_assign_op<typename DstXprType::Scalar,typename Product<Lhs,Rhs,AliasFreeProduct>::Scalar>, Sparse2Dense>
+{
+  typedef Product<Lhs,Rhs,AliasFreeProduct> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &)
+  {
+    generic_product_impl<Lhs, Rhs>::addTo(dst,src.lhs(),src.rhs());
+  }
+};
+
+// dense -= sparse-product (can be sparse*sparse, sparse*perm, etc.)
+template< typename DstXprType, typename Lhs, typename Rhs>
+struct Assignment<DstXprType, Product<Lhs,Rhs,AliasFreeProduct>, internal::sub_assign_op<typename DstXprType::Scalar,typename Product<Lhs,Rhs,AliasFreeProduct>::Scalar>, Sparse2Dense>
+{
+  typedef Product<Lhs,Rhs,AliasFreeProduct> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &)
+  {
+    generic_product_impl<Lhs, Rhs>::subTo(dst,src.lhs(),src.rhs());
+  }
+};
+
+template<typename Lhs, typename Rhs, int Options>
+struct unary_evaluator<SparseView<Product<Lhs, Rhs, Options> >, IteratorBased>
+ : public evaluator<typename Product<Lhs, Rhs, DefaultProduct>::PlainObject>
+{
+  typedef SparseView<Product<Lhs, Rhs, Options> > XprType;
+  typedef typename XprType::PlainObject PlainObject;
+  typedef evaluator<PlainObject> Base;
+
+  explicit unary_evaluator(const XprType& xpr)
+    : m_result(xpr.rows(), xpr.cols())
+  {
+    using std::abs;
+    ::new (static_cast<Base*>(this)) Base(m_result);
+    typedef typename nested_eval<Lhs,Dynamic>::type LhsNested;
+    typedef typename nested_eval<Rhs,Dynamic>::type RhsNested;
+    LhsNested lhsNested(xpr.nestedExpression().lhs());
+    RhsNested rhsNested(xpr.nestedExpression().rhs());
+
+    internal::sparse_sparse_product_with_pruning_selector<typename remove_all<LhsNested>::type,
+                                                          typename remove_all<RhsNested>::type, PlainObject>::run(lhsNested,rhsNested,m_result,
+                                                                                                                  abs(xpr.reference())*xpr.epsilon());
+  }
+
+protected:
+  PlainObject m_result;
+};
+
+} // end namespace internal
+
 } // end namespace Eigen
 
 #endif // EIGEN_SPARSEPRODUCT_H
diff --git a/Eigen/src/SparseCore/SparseRedux.h b/Eigen/src/SparseCore/SparseRedux.h
index f3da93a71..458774962 100644
--- a/Eigen/src/SparseCore/SparseRedux.h
+++ b/Eigen/src/SparseCore/SparseRedux.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -18,8 +18,9 @@ SparseMatrixBase<Derived>::sum() const
 {
   eigen_assert(rows()>0 && cols()>0 && "you are using a non initialized matrix");
   Scalar res(0);
+  internal::evaluator<Derived> thisEval(derived());
   for (Index j=0; j<outerSize(); ++j)
-    for (typename Derived::InnerIterator iter(derived(),j); iter; ++iter)
+    for (typename internal::evaluator<Derived>::InnerIterator iter(thisEval,j); iter; ++iter)
       res += iter.value();
   return res;
 }
@@ -29,7 +30,10 @@ typename internal::traits<SparseMatrix<_Scalar,_Options,_Index> >::Scalar
 SparseMatrix<_Scalar,_Options,_Index>::sum() const
 {
   eigen_assert(rows()>0 && cols()>0 && "you are using a non initialized matrix");
-  return Matrix<Scalar,1,Dynamic>::Map(&m_data.value(0), m_data.size()).sum();
+  if(this->isCompressed())
+    return Matrix<Scalar,1,Dynamic>::Map(m_data.valuePtr(), m_data.size()).sum();
+  else
+    return Base::sum();
 }
 
 template<typename _Scalar, int _Options, typename _Index>
@@ -37,7 +41,7 @@ typename internal::traits<SparseVector<_Scalar,_Options, _Index> >::Scalar
 SparseVector<_Scalar,_Options,_Index>::sum() const
 {
   eigen_assert(rows()>0 && cols()>0 && "you are using a non initialized matrix");
-  return Matrix<Scalar,1,Dynamic>::Map(&m_data.value(0), m_data.size()).sum();
+  return Matrix<Scalar,1,Dynamic>::Map(m_data.valuePtr(), m_data.size()).sum();
 }
 
 } // end namespace Eigen
diff --git a/Eigen/src/SparseCore/SparseRef.h b/Eigen/src/SparseCore/SparseRef.h
new file mode 100644
index 000000000..d91f38f97
--- /dev/null
+++ b/Eigen/src/SparseCore/SparseRef.h
@@ -0,0 +1,397 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSE_REF_H
+#define EIGEN_SPARSE_REF_H
+
+namespace Eigen {
+
+enum {
+  StandardCompressedFormat = 2 /**< used by Ref<SparseMatrix> to specify whether the input storage must be in standard compressed form */
+};
+  
+namespace internal {
+
+template<typename Derived> class SparseRefBase;
+
+template<typename MatScalar, int MatOptions, typename MatIndex, int _Options, typename _StrideType>
+struct traits<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, _Options, _StrideType> >
+  : public traits<SparseMatrix<MatScalar,MatOptions,MatIndex> >
+{
+  typedef SparseMatrix<MatScalar,MatOptions,MatIndex> PlainObjectType;
+  enum {
+    Options = _Options,
+    Flags = traits<PlainObjectType>::Flags | CompressedAccessBit | NestByRefBit
+  };
+
+  template<typename Derived> struct match {
+    enum {
+      StorageOrderMatch = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)),
+      MatchAtCompileTime = (Derived::Flags&CompressedAccessBit) && StorageOrderMatch
+    };
+    typedef typename internal::conditional<MatchAtCompileTime,internal::true_type,internal::false_type>::type type;
+  };
+  
+};
+
+template<typename MatScalar, int MatOptions, typename MatIndex, int _Options, typename _StrideType>
+struct traits<Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, _Options, _StrideType> >
+  : public traits<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, _Options, _StrideType> >
+{
+  enum {
+    Flags = (traits<SparseMatrix<MatScalar,MatOptions,MatIndex> >::Flags | CompressedAccessBit | NestByRefBit) & ~LvalueBit
+  };
+};
+
+template<typename MatScalar, int MatOptions, typename MatIndex, int _Options, typename _StrideType>
+struct traits<Ref<SparseVector<MatScalar,MatOptions,MatIndex>, _Options, _StrideType> >
+  : public traits<SparseVector<MatScalar,MatOptions,MatIndex> >
+{
+  typedef SparseVector<MatScalar,MatOptions,MatIndex> PlainObjectType;
+  enum {
+    Options = _Options,
+    Flags = traits<PlainObjectType>::Flags | CompressedAccessBit | NestByRefBit
+  };
+
+  template<typename Derived> struct match {
+    enum {
+      MatchAtCompileTime = (Derived::Flags&CompressedAccessBit) && Derived::IsVectorAtCompileTime
+    };
+    typedef typename internal::conditional<MatchAtCompileTime,internal::true_type,internal::false_type>::type type;
+  };
+
+};
+
+template<typename MatScalar, int MatOptions, typename MatIndex, int _Options, typename _StrideType>
+struct traits<Ref<const SparseVector<MatScalar,MatOptions,MatIndex>, _Options, _StrideType> >
+  : public traits<Ref<SparseVector<MatScalar,MatOptions,MatIndex>, _Options, _StrideType> >
+{
+  enum {
+    Flags = (traits<SparseVector<MatScalar,MatOptions,MatIndex> >::Flags | CompressedAccessBit | NestByRefBit) & ~LvalueBit
+  };
+};
+
+template<typename Derived>
+struct traits<SparseRefBase<Derived> > : public traits<Derived> {};
+
+template<typename Derived> class SparseRefBase
+  : public SparseMapBase<Derived>
+{
+public:
+
+  typedef SparseMapBase<Derived> Base;
+  EIGEN_SPARSE_PUBLIC_INTERFACE(SparseRefBase)
+
+  SparseRefBase()
+    : Base(RowsAtCompileTime==Dynamic?0:RowsAtCompileTime,ColsAtCompileTime==Dynamic?0:ColsAtCompileTime, 0, 0, 0, 0, 0)
+  {}
+  
+protected:
+
+  template<typename Expression>
+  void construct(Expression& expr)
+  {
+    if(expr.outerIndexPtr()==0)
+      ::new (static_cast<Base*>(this)) Base(expr.size(), expr.nonZeros(), expr.innerIndexPtr(), expr.valuePtr());
+    else
+      ::new (static_cast<Base*>(this)) Base(expr.rows(), expr.cols(), expr.nonZeros(), expr.outerIndexPtr(), expr.innerIndexPtr(), expr.valuePtr(), expr.innerNonZeroPtr());
+  }
+};
+
+} // namespace internal
+
+
+/** 
+  * \ingroup SparseCore_Module
+  *
+  * \brief A sparse matrix expression referencing an existing sparse expression
+  *
+  * \tparam SparseMatrixType the equivalent sparse matrix type of the referenced data, it must be a template instance of class SparseMatrix.
+  * \tparam Options specifies whether the a standard compressed format is required \c Options is  \c #StandardCompressedFormat, or \c 0.
+  *                The default is \c 0.
+  *
+  * \sa class Ref
+  */
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+class Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType >
+  : public internal::SparseRefBase<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType > >
+#else
+template<typename SparseMatrixType, int Options>
+class Ref<SparseMatrixType, Options>
+  : public SparseMapBase<Derived,WriteAccessors> // yes, that's weird to use Derived here, but that works!
+#endif
+{
+    typedef SparseMatrix<MatScalar,MatOptions,MatIndex> PlainObjectType;
+    typedef internal::traits<Ref> Traits;
+    template<int OtherOptions>
+    inline Ref(const SparseMatrix<MatScalar,OtherOptions,MatIndex>& expr);
+    template<int OtherOptions>
+    inline Ref(const MappedSparseMatrix<MatScalar,OtherOptions,MatIndex>& expr);
+  public:
+
+    typedef internal::SparseRefBase<Ref> Base;
+    EIGEN_SPARSE_PUBLIC_INTERFACE(Ref)
+
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    template<int OtherOptions>
+    inline Ref(SparseMatrix<MatScalar,OtherOptions,MatIndex>& expr)
+    {
+      EIGEN_STATIC_ASSERT(bool(Traits::template match<SparseMatrix<MatScalar,OtherOptions,MatIndex> >::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
+      eigen_assert( ((Options & int(StandardCompressedFormat))==0) || (expr.isCompressed()) );
+      Base::construct(expr.derived());
+    }
+    
+    template<int OtherOptions>
+    inline Ref(MappedSparseMatrix<MatScalar,OtherOptions,MatIndex>& expr)
+    {
+      EIGEN_STATIC_ASSERT(bool(Traits::template match<SparseMatrix<MatScalar,OtherOptions,MatIndex> >::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
+      eigen_assert( ((Options & int(StandardCompressedFormat))==0) || (expr.isCompressed()) );
+      Base::construct(expr.derived());
+    }
+    
+    template<typename Derived>
+    inline Ref(const SparseCompressedBase<Derived>& expr)
+    #else
+    /** Implicit constructor from any sparse expression (2D matrix or 1D vector) */
+    template<typename Derived>
+    inline Ref(SparseCompressedBase<Derived>& expr)
+    #endif
+    {
+      EIGEN_STATIC_ASSERT(bool(internal::is_lvalue<Derived>::value), THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
+      EIGEN_STATIC_ASSERT(bool(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
+      eigen_assert( ((Options & int(StandardCompressedFormat))==0) || (expr.isCompressed()) );
+      Base::construct(expr.const_cast_derived());
+    }
+};
+
+// this is the const ref version
+template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+class Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType>
+  : public internal::SparseRefBase<Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
+{
+    typedef SparseMatrix<MatScalar,MatOptions,MatIndex> TPlainObjectType;
+    typedef internal::traits<Ref> Traits;
+  public:
+
+    typedef internal::SparseRefBase<Ref> Base;
+    EIGEN_SPARSE_PUBLIC_INTERFACE(Ref)
+
+    template<typename Derived>
+    inline Ref(const SparseMatrixBase<Derived>& expr) : m_hasCopy(false)
+    {
+      construct(expr.derived(), typename Traits::template match<Derived>::type());
+    }
+
+    inline Ref(const Ref& other) : Base(other), m_hasCopy(false) {
+      // copy constructor shall not copy the m_object, to avoid unnecessary malloc and copy
+    }
+
+    template<typename OtherRef>
+    inline Ref(const RefBase<OtherRef>& other) : m_hasCopy(false) {
+      construct(other.derived(), typename Traits::template match<OtherRef>::type());
+    }
+
+    ~Ref() {
+      if(m_hasCopy) {
+        TPlainObjectType* obj = reinterpret_cast<TPlainObjectType*>(m_object_bytes);
+        obj->~TPlainObjectType();
+      }
+    }
+
+  protected:
+
+    template<typename Expression>
+    void construct(const Expression& expr,internal::true_type)
+    {
+      if((Options & int(StandardCompressedFormat)) && (!expr.isCompressed()))
+      {
+        TPlainObjectType* obj = reinterpret_cast<TPlainObjectType*>(m_object_bytes);
+        ::new (obj) TPlainObjectType(expr);
+        m_hasCopy = true;
+        Base::construct(*obj);
+      }
+      else
+      {
+        Base::construct(expr);
+      }
+    }
+
+    template<typename Expression>
+    void construct(const Expression& expr, internal::false_type)
+    {
+      TPlainObjectType* obj = reinterpret_cast<TPlainObjectType*>(m_object_bytes);
+      ::new (obj) TPlainObjectType(expr);
+      m_hasCopy = true;
+      Base::construct(*obj);
+    }
+
+  protected:
+    char m_object_bytes[sizeof(TPlainObjectType)];
+    bool m_hasCopy;
+};
+
+
+
+/**
+  * \ingroup SparseCore_Module
+  *
+  * \brief A sparse vector expression referencing an existing sparse vector expression
+  *
+  * \tparam SparseVectorType the equivalent sparse vector type of the referenced data, it must be a template instance of class SparseVector.
+  *
+  * \sa class Ref
+  */
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+class Ref<SparseVector<MatScalar,MatOptions,MatIndex>, Options, StrideType >
+  : public internal::SparseRefBase<Ref<SparseVector<MatScalar,MatOptions,MatIndex>, Options, StrideType > >
+#else
+template<typename SparseVectorType>
+class Ref<SparseVectorType>
+  : public SparseMapBase<Derived,WriteAccessors>
+#endif
+{
+    typedef SparseVector<MatScalar,MatOptions,MatIndex> PlainObjectType;
+    typedef internal::traits<Ref> Traits;
+    template<int OtherOptions>
+    inline Ref(const SparseVector<MatScalar,OtherOptions,MatIndex>& expr);
+  public:
+
+    typedef internal::SparseRefBase<Ref> Base;
+    EIGEN_SPARSE_PUBLIC_INTERFACE(Ref)
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    template<int OtherOptions>
+    inline Ref(SparseVector<MatScalar,OtherOptions,MatIndex>& expr)
+    {
+      EIGEN_STATIC_ASSERT(bool(Traits::template match<SparseVector<MatScalar,OtherOptions,MatIndex> >::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
+      Base::construct(expr.derived());
+    }
+
+    template<typename Derived>
+    inline Ref(const SparseCompressedBase<Derived>& expr)
+    #else
+    /** Implicit constructor from any 1D sparse vector expression */
+    template<typename Derived>
+    inline Ref(SparseCompressedBase<Derived>& expr)
+    #endif
+    {
+      EIGEN_STATIC_ASSERT(bool(internal::is_lvalue<Derived>::value), THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
+      EIGEN_STATIC_ASSERT(bool(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
+      Base::construct(expr.const_cast_derived());
+    }
+};
+
+// this is the const ref version
+template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+class Ref<const SparseVector<MatScalar,MatOptions,MatIndex>, Options, StrideType>
+  : public internal::SparseRefBase<Ref<const SparseVector<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
+{
+    typedef SparseVector<MatScalar,MatOptions,MatIndex> TPlainObjectType;
+    typedef internal::traits<Ref> Traits;
+  public:
+
+    typedef internal::SparseRefBase<Ref> Base;
+    EIGEN_SPARSE_PUBLIC_INTERFACE(Ref)
+
+    template<typename Derived>
+    inline Ref(const SparseMatrixBase<Derived>& expr) : m_hasCopy(false)
+    {
+      construct(expr.derived(), typename Traits::template match<Derived>::type());
+    }
+
+    inline Ref(const Ref& other) : Base(other), m_hasCopy(false) {
+      // copy constructor shall not copy the m_object, to avoid unnecessary malloc and copy
+    }
+
+    template<typename OtherRef>
+    inline Ref(const RefBase<OtherRef>& other) : m_hasCopy(false) {
+      construct(other.derived(), typename Traits::template match<OtherRef>::type());
+    }
+
+    ~Ref() {
+      if(m_hasCopy) {
+        TPlainObjectType* obj = reinterpret_cast<TPlainObjectType*>(m_object_bytes);
+        obj->~TPlainObjectType();
+      }
+    }
+
+  protected:
+
+    template<typename Expression>
+    void construct(const Expression& expr,internal::true_type)
+    {
+      Base::construct(expr);
+    }
+
+    template<typename Expression>
+    void construct(const Expression& expr, internal::false_type)
+    {
+      TPlainObjectType* obj = reinterpret_cast<TPlainObjectType*>(m_object_bytes);
+      ::new (obj) TPlainObjectType(expr);
+      m_hasCopy = true;
+      Base::construct(*obj);
+    }
+
+  protected:
+    char m_object_bytes[sizeof(TPlainObjectType)];
+    bool m_hasCopy;
+};
+
+namespace internal {
+
+// FIXME shall we introduce a general evaluatior_ref that we can specialize for any sparse object once, and thus remove this copy-pasta thing...
+
+template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+struct evaluator<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
+  : evaluator<SparseCompressedBase<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > >
+{
+  typedef evaluator<SparseCompressedBase<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > Base;
+  typedef Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> XprType;  
+  evaluator() : Base() {}
+  explicit evaluator(const XprType &mat) : Base(mat) {}
+};
+
+template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+struct evaluator<Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
+  : evaluator<SparseCompressedBase<Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > >
+{
+  typedef evaluator<SparseCompressedBase<Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > Base;
+  typedef Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> XprType;  
+  evaluator() : Base() {}
+  explicit evaluator(const XprType &mat) : Base(mat) {}
+};
+
+template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+struct evaluator<Ref<SparseVector<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
+  : evaluator<SparseCompressedBase<Ref<SparseVector<MatScalar,MatOptions,MatIndex>, Options, StrideType> > >
+{
+  typedef evaluator<SparseCompressedBase<Ref<SparseVector<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > Base;
+  typedef Ref<SparseVector<MatScalar,MatOptions,MatIndex>, Options, StrideType> XprType;
+  evaluator() : Base() {}
+  explicit evaluator(const XprType &mat) : Base(mat) {}
+};
+
+template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+struct evaluator<Ref<const SparseVector<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
+  : evaluator<SparseCompressedBase<Ref<const SparseVector<MatScalar,MatOptions,MatIndex>, Options, StrideType> > >
+{
+  typedef evaluator<SparseCompressedBase<Ref<const SparseVector<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > Base;
+  typedef Ref<const SparseVector<MatScalar,MatOptions,MatIndex>, Options, StrideType> XprType;
+  evaluator() : Base() {}
+  explicit evaluator(const XprType &mat) : Base(mat) {}
+};
+
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPARSE_REF_H
diff --git a/Eigen/src/SparseCore/SparseSelfAdjointView.h b/Eigen/src/SparseCore/SparseSelfAdjointView.h
index 0eda96bc4..9e39be738 100644
--- a/Eigen/src/SparseCore/SparseSelfAdjointView.h
+++ b/Eigen/src/SparseCore/SparseSelfAdjointView.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2009-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -11,14 +11,14 @@
 #define EIGEN_SPARSE_SELFADJOINTVIEW_H
 
 namespace Eigen { 
-
+  
 /** \ingroup SparseCore_Module
   * \class SparseSelfAdjointView
   *
   * \brief Pseudo expression to manipulate a triangular sparse matrix as a selfadjoint matrix.
   *
   * \param MatrixType the type of the dense matrix storing the coefficients
-  * \param UpLo can be either \c #Lower or \c #Upper
+  * \param Mode can be either \c #Lower or \c #Upper
   *
   * This class is an expression of a sefladjoint matrix from a triangular part of a matrix
   * with given dense storage of the coefficients. It is the return type of MatrixBase::selfadjointView()
@@ -26,38 +26,39 @@ namespace Eigen {
   *
   * \sa SparseMatrixBase::selfadjointView()
   */
-template<typename Lhs, typename Rhs, int UpLo>
-class SparseSelfAdjointTimeDenseProduct;
-
-template<typename Lhs, typename Rhs, int UpLo>
-class DenseTimeSparseSelfAdjointProduct;
-
 namespace internal {
   
-template<typename MatrixType, unsigned int UpLo>
-struct traits<SparseSelfAdjointView<MatrixType,UpLo> > : traits<MatrixType> {
+template<typename MatrixType, unsigned int Mode>
+struct traits<SparseSelfAdjointView<MatrixType,Mode> > : traits<MatrixType> {
 };
 
-template<int SrcUpLo,int DstUpLo,typename MatrixType,int DestOrder>
-void permute_symm_to_symm(const MatrixType& mat, SparseMatrix<typename MatrixType::Scalar,DestOrder,typename MatrixType::Index>& _dest, const typename MatrixType::Index* perm = 0);
+template<int SrcMode,int DstMode,typename MatrixType,int DestOrder>
+void permute_symm_to_symm(const MatrixType& mat, SparseMatrix<typename MatrixType::Scalar,DestOrder,typename MatrixType::StorageIndex>& _dest, const typename MatrixType::StorageIndex* perm = 0);
 
-template<int UpLo,typename MatrixType,int DestOrder>
-void permute_symm_to_fullsymm(const MatrixType& mat, SparseMatrix<typename MatrixType::Scalar,DestOrder,typename MatrixType::Index>& _dest, const typename MatrixType::Index* perm = 0);
+template<int Mode,typename MatrixType,int DestOrder>
+void permute_symm_to_fullsymm(const MatrixType& mat, SparseMatrix<typename MatrixType::Scalar,DestOrder,typename MatrixType::StorageIndex>& _dest, const typename MatrixType::StorageIndex* perm = 0);
 
 }
 
-template<typename MatrixType, unsigned int UpLo> class SparseSelfAdjointView
-  : public EigenBase<SparseSelfAdjointView<MatrixType,UpLo> >
+template<typename MatrixType, unsigned int _Mode> class SparseSelfAdjointView
+  : public EigenBase<SparseSelfAdjointView<MatrixType,_Mode> >
 {
   public:
+    
+    enum {
+      Mode = _Mode,
+      RowsAtCompileTime = internal::traits<SparseSelfAdjointView>::RowsAtCompileTime,
+      ColsAtCompileTime = internal::traits<SparseSelfAdjointView>::ColsAtCompileTime
+    };
 
+    typedef EigenBase<SparseSelfAdjointView> Base;
     typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::Index Index;
-    typedef Matrix<Index,Dynamic,1> VectorI;
-    typedef typename MatrixType::Nested MatrixTypeNested;
+    typedef typename MatrixType::StorageIndex StorageIndex;
+    typedef Matrix<StorageIndex,Dynamic,1> VectorI;
+    typedef typename internal::ref_selector<MatrixType>::non_const_type MatrixTypeNested;
     typedef typename internal::remove_all<MatrixTypeNested>::type _MatrixTypeNested;
-
-    inline SparseSelfAdjointView(const MatrixType& matrix) : m_matrix(matrix)
+    
+    explicit inline SparseSelfAdjointView(MatrixType& matrix) : m_matrix(matrix)
     {
       eigen_assert(rows()==cols() && "SelfAdjointView is only for squared matrices");
     }
@@ -67,7 +68,7 @@ template<typename MatrixType, unsigned int UpLo> class SparseSelfAdjointView
 
     /** \internal \returns a reference to the nested matrix */
     const _MatrixTypeNested& matrix() const { return m_matrix; }
-    _MatrixTypeNested& matrix() { return m_matrix.const_cast_derived(); }
+    typename internal::remove_reference<MatrixTypeNested>::type& matrix() { return m_matrix; }
 
     /** \returns an expression of the matrix product between a sparse self-adjoint matrix \c *this and a sparse matrix \a rhs.
       *
@@ -75,10 +76,10 @@ template<typename MatrixType, unsigned int UpLo> class SparseSelfAdjointView
       * Indeed, the SparseSelfadjointView operand is first copied into a temporary SparseMatrix before computing the product.
       */
     template<typename OtherDerived>
-    SparseSparseProduct<typename OtherDerived::PlainObject, OtherDerived>
+    Product<SparseSelfAdjointView, OtherDerived>
     operator*(const SparseMatrixBase<OtherDerived>& rhs) const
     {
-      return SparseSparseProduct<typename OtherDerived::PlainObject, OtherDerived>(*this, rhs.derived());
+      return Product<SparseSelfAdjointView, OtherDerived>(*this, rhs.derived());
     }
 
     /** \returns an expression of the matrix product between a sparse matrix \a lhs and a sparse self-adjoint matrix \a rhs.
@@ -87,26 +88,26 @@ template<typename MatrixType, unsigned int UpLo> class SparseSelfAdjointView
       * Indeed, the SparseSelfadjointView operand is first copied into a temporary SparseMatrix before computing the product.
       */
     template<typename OtherDerived> friend
-    SparseSparseProduct<OtherDerived, typename OtherDerived::PlainObject >
+    Product<OtherDerived, SparseSelfAdjointView>
     operator*(const SparseMatrixBase<OtherDerived>& lhs, const SparseSelfAdjointView& rhs)
     {
-      return SparseSparseProduct<OtherDerived, typename OtherDerived::PlainObject>(lhs.derived(), rhs);
+      return Product<OtherDerived, SparseSelfAdjointView>(lhs.derived(), rhs);
     }
     
     /** Efficient sparse self-adjoint matrix times dense vector/matrix product */
     template<typename OtherDerived>
-    SparseSelfAdjointTimeDenseProduct<MatrixType,OtherDerived,UpLo>
+    Product<SparseSelfAdjointView,OtherDerived>
     operator*(const MatrixBase<OtherDerived>& rhs) const
     {
-      return SparseSelfAdjointTimeDenseProduct<MatrixType,OtherDerived,UpLo>(m_matrix, rhs.derived());
+      return Product<SparseSelfAdjointView,OtherDerived>(*this, rhs.derived());
     }
 
     /** Efficient dense vector/matrix times sparse self-adjoint matrix product */
     template<typename OtherDerived> friend
-    DenseTimeSparseSelfAdjointProduct<OtherDerived,MatrixType,UpLo>
+    Product<OtherDerived,SparseSelfAdjointView>
     operator*(const MatrixBase<OtherDerived>& lhs, const SparseSelfAdjointView& rhs)
     {
-      return DenseTimeSparseSelfAdjointProduct<OtherDerived,_MatrixTypeNested,UpLo>(lhs.derived(), rhs.m_matrix);
+      return Product<OtherDerived,SparseSelfAdjointView>(lhs.derived(), rhs);
     }
 
     /** Perform a symmetric rank K update of the selfadjoint matrix \c *this:
@@ -120,56 +121,48 @@ template<typename MatrixType, unsigned int UpLo> class SparseSelfAdjointView
     template<typename DerivedU>
     SparseSelfAdjointView& rankUpdate(const SparseMatrixBase<DerivedU>& u, const Scalar& alpha = Scalar(1));
     
-    /** \internal triggered by sparse_matrix = SparseSelfadjointView; */
-    template<typename DestScalar,int StorageOrder> void evalTo(SparseMatrix<DestScalar,StorageOrder,Index>& _dest) const
-    {
-      internal::permute_symm_to_fullsymm<UpLo>(m_matrix, _dest);
-    }
-    
-    template<typename DestScalar> void evalTo(DynamicSparseMatrix<DestScalar,ColMajor,Index>& _dest) const
-    {
-      // TODO directly evaluate into _dest;
-      SparseMatrix<DestScalar,ColMajor,Index> tmp(_dest.rows(),_dest.cols());
-      internal::permute_symm_to_fullsymm<UpLo>(m_matrix, tmp);
-      _dest = tmp;
-    }
-    
     /** \returns an expression of P H P^-1 */
-    SparseSymmetricPermutationProduct<_MatrixTypeNested,UpLo> twistedBy(const PermutationMatrix<Dynamic,Dynamic,Index>& perm) const
+    // TODO implement twists in a more evaluator friendly fashion
+    SparseSymmetricPermutationProduct<_MatrixTypeNested,Mode> twistedBy(const PermutationMatrix<Dynamic,Dynamic,StorageIndex>& perm) const
     {
-      return SparseSymmetricPermutationProduct<_MatrixTypeNested,UpLo>(m_matrix, perm);
+      return SparseSymmetricPermutationProduct<_MatrixTypeNested,Mode>(m_matrix, perm);
     }
-    
-    template<typename SrcMatrixType,int SrcUpLo>
-    SparseSelfAdjointView& operator=(const SparseSymmetricPermutationProduct<SrcMatrixType,SrcUpLo>& permutedMatrix)
+
+    template<typename SrcMatrixType,int SrcMode>
+    SparseSelfAdjointView& operator=(const SparseSymmetricPermutationProduct<SrcMatrixType,SrcMode>& permutedMatrix)
     {
-      permutedMatrix.evalTo(*this);
+      internal::call_assignment_no_alias_no_transpose(*this, permutedMatrix);
       return *this;
     }
 
-
     SparseSelfAdjointView& operator=(const SparseSelfAdjointView& src)
     {
-      PermutationMatrix<Dynamic> pnull;
+      PermutationMatrix<Dynamic,Dynamic,StorageIndex> pnull;
       return *this = src.twistedBy(pnull);
     }
 
-    template<typename SrcMatrixType,unsigned int SrcUpLo>
-    SparseSelfAdjointView& operator=(const SparseSelfAdjointView<SrcMatrixType,SrcUpLo>& src)
+    template<typename SrcMatrixType,unsigned int SrcMode>
+    SparseSelfAdjointView& operator=(const SparseSelfAdjointView<SrcMatrixType,SrcMode>& src)
     {
-      PermutationMatrix<Dynamic> pnull;
+      PermutationMatrix<Dynamic,Dynamic,StorageIndex> pnull;
       return *this = src.twistedBy(pnull);
     }
     
-
-    // const SparseLLT<PlainObject, UpLo> llt() const;
-    // const SparseLDLT<PlainObject, UpLo> ldlt() const;
-
+    void resize(Index rows, Index cols)
+    {
+      EIGEN_ONLY_USED_FOR_DEBUG(rows);
+      EIGEN_ONLY_USED_FOR_DEBUG(cols);
+      eigen_assert(rows == this->rows() && cols == this->cols()
+                && "SparseSelfadjointView::resize() does not actually allow to resize.");
+    }
+    
   protected:
 
-    typename MatrixType::Nested m_matrix;
-    mutable VectorI m_countPerRow;
-    mutable VectorI m_countPerCol;
+    MatrixTypeNested m_matrix;
+    //mutable VectorI m_countPerRow;
+    //mutable VectorI m_countPerCol;
+  private:
+    template<typename Dest> void evalTo(Dest &) const;
 };
 
 /***************************************************************************
@@ -178,146 +171,268 @@ template<typename MatrixType, unsigned int UpLo> class SparseSelfAdjointView
 
 template<typename Derived>
 template<unsigned int UpLo>
-const SparseSelfAdjointView<Derived, UpLo> SparseMatrixBase<Derived>::selfadjointView() const
+typename SparseMatrixBase<Derived>::template ConstSelfAdjointViewReturnType<UpLo>::Type SparseMatrixBase<Derived>::selfadjointView() const
 {
-  return derived();
+  return SparseSelfAdjointView<const Derived, UpLo>(derived());
 }
 
 template<typename Derived>
 template<unsigned int UpLo>
-SparseSelfAdjointView<Derived, UpLo> SparseMatrixBase<Derived>::selfadjointView()
+typename SparseMatrixBase<Derived>::template SelfAdjointViewReturnType<UpLo>::Type SparseMatrixBase<Derived>::selfadjointView()
 {
-  return derived();
+  return SparseSelfAdjointView<Derived, UpLo>(derived());
 }
 
 /***************************************************************************
 * Implementation of SparseSelfAdjointView methods
 ***************************************************************************/
 
-template<typename MatrixType, unsigned int UpLo>
+template<typename MatrixType, unsigned int Mode>
 template<typename DerivedU>
-SparseSelfAdjointView<MatrixType,UpLo>&
-SparseSelfAdjointView<MatrixType,UpLo>::rankUpdate(const SparseMatrixBase<DerivedU>& u, const Scalar& alpha)
+SparseSelfAdjointView<MatrixType,Mode>&
+SparseSelfAdjointView<MatrixType,Mode>::rankUpdate(const SparseMatrixBase<DerivedU>& u, const Scalar& alpha)
 {
-  SparseMatrix<Scalar,MatrixType::Flags&RowMajorBit?RowMajor:ColMajor> tmp = u * u.adjoint();
+  SparseMatrix<Scalar,(MatrixType::Flags&RowMajorBit)?RowMajor:ColMajor> tmp = u * u.adjoint();
   if(alpha==Scalar(0))
-    m_matrix.const_cast_derived() = tmp.template triangularView<UpLo>();
+    m_matrix = tmp.template triangularView<Mode>();
   else
-    m_matrix.const_cast_derived() += alpha * tmp.template triangularView<UpLo>();
+    m_matrix += alpha * tmp.template triangularView<Mode>();
 
   return *this;
 }
 
+namespace internal {
+  
+// TODO currently a selfadjoint expression has the form SelfAdjointView<.,.>
+//      in the future selfadjoint-ness should be defined by the expression traits
+//      such that Transpose<SelfAdjointView<.,.> > is valid. (currently TriangularBase::transpose() is overloaded to make it work)
+template<typename MatrixType, unsigned int Mode>
+struct evaluator_traits<SparseSelfAdjointView<MatrixType,Mode> >
+{
+  typedef typename storage_kind_to_evaluator_kind<typename MatrixType::StorageKind>::Kind Kind;
+  typedef SparseSelfAdjointShape Shape;
+};
+
+struct SparseSelfAdjoint2Sparse {};
+
+template<> struct AssignmentKind<SparseShape,SparseSelfAdjointShape> { typedef SparseSelfAdjoint2Sparse Kind; };
+template<> struct AssignmentKind<SparseSelfAdjointShape,SparseShape> { typedef Sparse2Sparse Kind; };
+
+template< typename DstXprType, typename SrcXprType, typename Functor>
+struct Assignment<DstXprType, SrcXprType, Functor, SparseSelfAdjoint2Sparse>
+{
+  typedef typename DstXprType::StorageIndex StorageIndex;
+  typedef internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> AssignOpType;
+
+  template<typename DestScalar,int StorageOrder>
+  static void run(SparseMatrix<DestScalar,StorageOrder,StorageIndex> &dst, const SrcXprType &src, const AssignOpType&/*func*/)
+  {
+    internal::permute_symm_to_fullsymm<SrcXprType::Mode>(src.matrix(), dst);
+  }
+
+  // FIXME: the handling of += and -= in sparse matrices should be cleanup so that next two overloads could be reduced to:
+  template<typename DestScalar,int StorageOrder,typename AssignFunc>
+  static void run(SparseMatrix<DestScalar,StorageOrder,StorageIndex> &dst, const SrcXprType &src, const AssignFunc& func)
+  {
+    SparseMatrix<DestScalar,StorageOrder,StorageIndex> tmp(src.rows(),src.cols());
+    run(tmp, src, AssignOpType());
+    call_assignment_no_alias_no_transpose(dst, tmp, func);
+  }
+
+  template<typename DestScalar,int StorageOrder>
+  static void run(SparseMatrix<DestScalar,StorageOrder,StorageIndex> &dst, const SrcXprType &src,
+                  const internal::add_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar>& /* func */)
+  {
+    SparseMatrix<DestScalar,StorageOrder,StorageIndex> tmp(src.rows(),src.cols());
+    run(tmp, src, AssignOpType());
+    dst += tmp;
+  }
+
+  template<typename DestScalar,int StorageOrder>
+  static void run(SparseMatrix<DestScalar,StorageOrder,StorageIndex> &dst, const SrcXprType &src,
+                  const internal::sub_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar>& /* func */)
+  {
+    SparseMatrix<DestScalar,StorageOrder,StorageIndex> tmp(src.rows(),src.cols());
+    run(tmp, src, AssignOpType());
+    dst -= tmp;
+  }
+  
+  template<typename DestScalar>
+  static void run(DynamicSparseMatrix<DestScalar,ColMajor,StorageIndex>& dst, const SrcXprType &src, const AssignOpType&/*func*/)
+  {
+    // TODO directly evaluate into dst;
+    SparseMatrix<DestScalar,ColMajor,StorageIndex> tmp(dst.rows(),dst.cols());
+    internal::permute_symm_to_fullsymm<SrcXprType::Mode>(src.matrix(), tmp);
+    dst = tmp;
+  }
+};
+
+} // end namespace internal
+
 /***************************************************************************
 * Implementation of sparse self-adjoint time dense matrix
 ***************************************************************************/
 
 namespace internal {
-template<typename Lhs, typename Rhs, int UpLo>
-struct traits<SparseSelfAdjointTimeDenseProduct<Lhs,Rhs,UpLo> >
- : traits<ProductBase<SparseSelfAdjointTimeDenseProduct<Lhs,Rhs,UpLo>, Lhs, Rhs> >
-{
-  typedef Dense StorageKind;
-};
-}
 
-template<typename Lhs, typename Rhs, int UpLo>
-class SparseSelfAdjointTimeDenseProduct
-  : public ProductBase<SparseSelfAdjointTimeDenseProduct<Lhs,Rhs,UpLo>, Lhs, Rhs>
+template<int Mode, typename SparseLhsType, typename DenseRhsType, typename DenseResType, typename AlphaType>
+inline void sparse_selfadjoint_time_dense_product(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const AlphaType& alpha)
 {
-  public:
-    EIGEN_PRODUCT_PUBLIC_INTERFACE(SparseSelfAdjointTimeDenseProduct)
-
-    SparseSelfAdjointTimeDenseProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs)
-    {}
+  EIGEN_ONLY_USED_FOR_DEBUG(alpha);
+  
+  typedef typename internal::nested_eval<SparseLhsType,DenseRhsType::MaxColsAtCompileTime>::type SparseLhsTypeNested;
+  typedef typename internal::remove_all<SparseLhsTypeNested>::type SparseLhsTypeNestedCleaned;
+  typedef evaluator<SparseLhsTypeNestedCleaned> LhsEval;
+  typedef typename LhsEval::InnerIterator LhsIterator;
+  typedef typename SparseLhsType::Scalar LhsScalar;
+  
+  enum {
+    LhsIsRowMajor = (LhsEval::Flags&RowMajorBit)==RowMajorBit,
+    ProcessFirstHalf =
+              ((Mode&(Upper|Lower))==(Upper|Lower))
+          || ( (Mode&Upper) && !LhsIsRowMajor)
+          || ( (Mode&Lower) && LhsIsRowMajor),
+    ProcessSecondHalf = !ProcessFirstHalf
+  };
+  
+  SparseLhsTypeNested lhs_nested(lhs);
+  LhsEval lhsEval(lhs_nested);
 
-    template<typename Dest> void scaleAndAddTo(Dest& dest, const Scalar& alpha) const
+  // work on one column at once
+  for (Index k=0; k<rhs.cols(); ++k)
+  {
+    for (Index j=0; j<lhs.outerSize(); ++j)
     {
-      EIGEN_ONLY_USED_FOR_DEBUG(alpha);
-      // TODO use alpha
-      eigen_assert(alpha==Scalar(1) && "alpha != 1 is not implemented yet, sorry");
-      typedef typename internal::remove_all<Lhs>::type _Lhs;
-      typedef typename _Lhs::InnerIterator LhsInnerIterator;
-      enum {
-        LhsIsRowMajor = (_Lhs::Flags&RowMajorBit)==RowMajorBit,
-        ProcessFirstHalf =
-                 ((UpLo&(Upper|Lower))==(Upper|Lower))
-              || ( (UpLo&Upper) && !LhsIsRowMajor)
-              || ( (UpLo&Lower) && LhsIsRowMajor),
-        ProcessSecondHalf = !ProcessFirstHalf
-      };
-      for (Index j=0; j<m_lhs.outerSize(); ++j)
+      LhsIterator i(lhsEval,j);
+      // handle diagonal coeff
+      if (ProcessSecondHalf)
       {
-        LhsInnerIterator i(m_lhs,j);
-        if (ProcessSecondHalf)
-        {
-          while (i && i.index()<j) ++i;
-          if(i && i.index()==j)
-          {
-            dest.row(j) += i.value() * m_rhs.row(j);
-            ++i;
-          }
-        }
-        for(; (ProcessFirstHalf ? i && i.index() < j : i) ; ++i)
+        while (i && i.index()<j) ++i;
+        if(i && i.index()==j)
         {
-          Index a = LhsIsRowMajor ? j : i.index();
-          Index b = LhsIsRowMajor ? i.index() : j;
-          typename Lhs::Scalar v = i.value();
-          dest.row(a) += (v) * m_rhs.row(b);
-          dest.row(b) += numext::conj(v) * m_rhs.row(a);
+          res(j,k) += alpha * i.value() * rhs(j,k);
+          ++i;
         }
-        if (ProcessFirstHalf && i && (i.index()==j))
-          dest.row(j) += i.value() * m_rhs.row(j);
       }
+
+      // premultiplied rhs for scatters
+      typename ScalarBinaryOpTraits<AlphaType, typename DenseRhsType::Scalar>::ReturnType rhs_j(alpha*rhs(j,k));
+      // accumulator for partial scalar product
+      typename DenseResType::Scalar res_j(0);
+      for(; (ProcessFirstHalf ? i && i.index() < j : i) ; ++i)
+      {
+        LhsScalar lhs_ij = i.value();
+        if(!LhsIsRowMajor) lhs_ij = numext::conj(lhs_ij);
+        res_j += lhs_ij * rhs(i.index(),k);
+        res(i.index(),k) += numext::conj(lhs_ij) * rhs_j;
+      }
+      res(j,k) += alpha * res_j;
+
+      // handle diagonal coeff
+      if (ProcessFirstHalf && i && (i.index()==j))
+        res(j,k) += alpha * i.value() * rhs(j,k);
     }
+  }
+}
 
-  private:
-    SparseSelfAdjointTimeDenseProduct& operator=(const SparseSelfAdjointTimeDenseProduct&);
+
+template<typename LhsView, typename Rhs, int ProductType>
+struct generic_product_impl<LhsView, Rhs, SparseSelfAdjointShape, DenseShape, ProductType>
+: generic_product_impl_base<LhsView, Rhs, generic_product_impl<LhsView, Rhs, SparseSelfAdjointShape, DenseShape, ProductType> >
+{
+  template<typename Dest>
+  static void scaleAndAddTo(Dest& dst, const LhsView& lhsView, const Rhs& rhs, const typename Dest::Scalar& alpha)
+  {
+    typedef typename LhsView::_MatrixTypeNested Lhs;
+    typedef typename nested_eval<Lhs,Dynamic>::type LhsNested;
+    typedef typename nested_eval<Rhs,Dynamic>::type RhsNested;
+    LhsNested lhsNested(lhsView.matrix());
+    RhsNested rhsNested(rhs);
+    
+    internal::sparse_selfadjoint_time_dense_product<LhsView::Mode>(lhsNested, rhsNested, dst, alpha);
+  }
 };
 
-namespace internal {
-template<typename Lhs, typename Rhs, int UpLo>
-struct traits<DenseTimeSparseSelfAdjointProduct<Lhs,Rhs,UpLo> >
- : traits<ProductBase<DenseTimeSparseSelfAdjointProduct<Lhs,Rhs,UpLo>, Lhs, Rhs> >
-{};
-}
+template<typename Lhs, typename RhsView, int ProductType>
+struct generic_product_impl<Lhs, RhsView, DenseShape, SparseSelfAdjointShape, ProductType>
+: generic_product_impl_base<Lhs, RhsView, generic_product_impl<Lhs, RhsView, DenseShape, SparseSelfAdjointShape, ProductType> >
+{
+  template<typename Dest>
+  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const RhsView& rhsView, const typename Dest::Scalar& alpha)
+  {
+    typedef typename RhsView::_MatrixTypeNested Rhs;
+    typedef typename nested_eval<Lhs,Dynamic>::type LhsNested;
+    typedef typename nested_eval<Rhs,Dynamic>::type RhsNested;
+    LhsNested lhsNested(lhs);
+    RhsNested rhsNested(rhsView.matrix());
+    
+    // transpose everything
+    Transpose<Dest> dstT(dst);
+    internal::sparse_selfadjoint_time_dense_product<RhsView::Mode>(rhsNested.transpose(), lhsNested.transpose(), dstT, alpha);
+  }
+};
 
-template<typename Lhs, typename Rhs, int UpLo>
-class DenseTimeSparseSelfAdjointProduct
-  : public ProductBase<DenseTimeSparseSelfAdjointProduct<Lhs,Rhs,UpLo>, Lhs, Rhs>
+// NOTE: these two overloads are needed to evaluate the sparse selfadjoint view into a full sparse matrix
+// TODO: maybe the copy could be handled by generic_product_impl so that these overloads would not be needed anymore
+
+template<typename LhsView, typename Rhs, int ProductTag>
+struct product_evaluator<Product<LhsView, Rhs, DefaultProduct>, ProductTag, SparseSelfAdjointShape, SparseShape>
+  : public evaluator<typename Product<typename Rhs::PlainObject, Rhs, DefaultProduct>::PlainObject>
 {
-  public:
-    EIGEN_PRODUCT_PUBLIC_INTERFACE(DenseTimeSparseSelfAdjointProduct)
+  typedef Product<LhsView, Rhs, DefaultProduct> XprType;
+  typedef typename XprType::PlainObject PlainObject;
+  typedef evaluator<PlainObject> Base;
 
-    DenseTimeSparseSelfAdjointProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs)
-    {}
+  product_evaluator(const XprType& xpr)
+    : m_lhs(xpr.lhs()), m_result(xpr.rows(), xpr.cols())
+  {
+    ::new (static_cast<Base*>(this)) Base(m_result);
+    generic_product_impl<typename Rhs::PlainObject, Rhs, SparseShape, SparseShape, ProductTag>::evalTo(m_result, m_lhs, xpr.rhs());
+  }
+  
+protected:
+  typename Rhs::PlainObject m_lhs;
+  PlainObject m_result;
+};
 
-    template<typename Dest> void scaleAndAddTo(Dest& /*dest*/, const Scalar& /*alpha*/) const
-    {
-      // TODO
-    }
+template<typename Lhs, typename RhsView, int ProductTag>
+struct product_evaluator<Product<Lhs, RhsView, DefaultProduct>, ProductTag, SparseShape, SparseSelfAdjointShape>
+  : public evaluator<typename Product<Lhs, typename Lhs::PlainObject, DefaultProduct>::PlainObject>
+{
+  typedef Product<Lhs, RhsView, DefaultProduct> XprType;
+  typedef typename XprType::PlainObject PlainObject;
+  typedef evaluator<PlainObject> Base;
 
-  private:
-    DenseTimeSparseSelfAdjointProduct& operator=(const DenseTimeSparseSelfAdjointProduct&);
+  product_evaluator(const XprType& xpr)
+    : m_rhs(xpr.rhs()), m_result(xpr.rows(), xpr.cols())
+  {
+    ::new (static_cast<Base*>(this)) Base(m_result);
+    generic_product_impl<Lhs, typename Lhs::PlainObject, SparseShape, SparseShape, ProductTag>::evalTo(m_result, xpr.lhs(), m_rhs);
+  }
+  
+protected:
+  typename Lhs::PlainObject m_rhs;
+  PlainObject m_result;
 };
 
+} // namespace internal
+
 /***************************************************************************
 * Implementation of symmetric copies and permutations
 ***************************************************************************/
 namespace internal {
-  
-template<typename MatrixType, int UpLo>
-struct traits<SparseSymmetricPermutationProduct<MatrixType,UpLo> > : traits<MatrixType> {
-};
 
-template<int UpLo,typename MatrixType,int DestOrder>
-void permute_symm_to_fullsymm(const MatrixType& mat, SparseMatrix<typename MatrixType::Scalar,DestOrder,typename MatrixType::Index>& _dest, const typename MatrixType::Index* perm)
+template<int Mode,typename MatrixType,int DestOrder>
+void permute_symm_to_fullsymm(const MatrixType& mat, SparseMatrix<typename MatrixType::Scalar,DestOrder,typename MatrixType::StorageIndex>& _dest, const typename MatrixType::StorageIndex* perm)
 {
-  typedef typename MatrixType::Index Index;
+  typedef typename MatrixType::StorageIndex StorageIndex;
   typedef typename MatrixType::Scalar Scalar;
-  typedef SparseMatrix<Scalar,DestOrder,Index> Dest;
-  typedef Matrix<Index,Dynamic,1> VectorI;
+  typedef SparseMatrix<Scalar,DestOrder,StorageIndex> Dest;
+  typedef Matrix<StorageIndex,Dynamic,1> VectorI;
+  typedef evaluator<MatrixType> MatEval;
+  typedef typename evaluator<MatrixType>::InnerIterator MatIterator;
   
+  MatEval matEval(mat);
   Dest& dest(_dest.derived());
   enum {
     StorageOrderMatch = int(Dest::IsRowMajor) == int(MatrixType::IsRowMajor)
@@ -331,17 +446,17 @@ void permute_symm_to_fullsymm(const MatrixType& mat, SparseMatrix<typename Matri
   for(Index j = 0; j<size; ++j)
   {
     Index jp = perm ? perm[j] : j;
-    for(typename MatrixType::InnerIterator it(mat,j); it; ++it)
+    for(MatIterator it(matEval,j); it; ++it)
     {
       Index i = it.index();
       Index r = it.row();
       Index c = it.col();
       Index ip = perm ? perm[i] : i;
-      if(UpLo==(Upper|Lower))
+      if(Mode==(Upper|Lower))
         count[StorageOrderMatch ? jp : ip]++;
       else if(r==c)
         count[ip]++;
-      else if(( UpLo==Lower && r>c) || ( UpLo==Upper && r<c))
+      else if(( Mode==Lower && r>c) || ( Mode==Upper && r<c))
       {
         count[ip]++;
         count[jp]++;
@@ -359,18 +474,18 @@ void permute_symm_to_fullsymm(const MatrixType& mat, SparseMatrix<typename Matri
     count[j] = dest.outerIndexPtr()[j];
   
   // copy data
-  for(Index j = 0; j<size; ++j)
+  for(StorageIndex j = 0; j<size; ++j)
   {
-    for(typename MatrixType::InnerIterator it(mat,j); it; ++it)
+    for(MatIterator it(matEval,j); it; ++it)
     {
-      Index i = it.index();
+      StorageIndex i = internal::convert_index<StorageIndex>(it.index());
       Index r = it.row();
       Index c = it.col();
       
-      Index jp = perm ? perm[j] : j;
-      Index ip = perm ? perm[i] : i;
+      StorageIndex jp = perm ? perm[j] : j;
+      StorageIndex ip = perm ? perm[i] : i;
       
-      if(UpLo==(Upper|Lower))
+      if(Mode==(Upper|Lower))
       {
         Index k = count[StorageOrderMatch ? jp : ip]++;
         dest.innerIndexPtr()[k] = StorageOrderMatch ? ip : jp;
@@ -382,7 +497,7 @@ void permute_symm_to_fullsymm(const MatrixType& mat, SparseMatrix<typename Matri
         dest.innerIndexPtr()[k] = ip;
         dest.valuePtr()[k] = it.value();
       }
-      else if(( (UpLo&Lower)==Lower && r>c) || ( (UpLo&Upper)==Upper && r<c))
+      else if(( (Mode&Lower)==Lower && r>c) || ( (Mode&Upper)==Upper && r<c))
       {
         if(!StorageOrderMatch)
           std::swap(ip,jp);
@@ -397,35 +512,40 @@ void permute_symm_to_fullsymm(const MatrixType& mat, SparseMatrix<typename Matri
   }
 }
 
-template<int _SrcUpLo,int _DstUpLo,typename MatrixType,int DstOrder>
-void permute_symm_to_symm(const MatrixType& mat, SparseMatrix<typename MatrixType::Scalar,DstOrder,typename MatrixType::Index>& _dest, const typename MatrixType::Index* perm)
+template<int _SrcMode,int _DstMode,typename MatrixType,int DstOrder>
+void permute_symm_to_symm(const MatrixType& mat, SparseMatrix<typename MatrixType::Scalar,DstOrder,typename MatrixType::StorageIndex>& _dest, const typename MatrixType::StorageIndex* perm)
 {
-  typedef typename MatrixType::Index Index;
+  typedef typename MatrixType::StorageIndex StorageIndex;
   typedef typename MatrixType::Scalar Scalar;
-  SparseMatrix<Scalar,DstOrder,Index>& dest(_dest.derived());
-  typedef Matrix<Index,Dynamic,1> VectorI;
+  SparseMatrix<Scalar,DstOrder,StorageIndex>& dest(_dest.derived());
+  typedef Matrix<StorageIndex,Dynamic,1> VectorI;
+  typedef evaluator<MatrixType> MatEval;
+  typedef typename evaluator<MatrixType>::InnerIterator MatIterator;
+
   enum {
     SrcOrder = MatrixType::IsRowMajor ? RowMajor : ColMajor,
     StorageOrderMatch = int(SrcOrder) == int(DstOrder),
-    DstUpLo = DstOrder==RowMajor ? (_DstUpLo==Upper ? Lower : Upper) : _DstUpLo,
-    SrcUpLo = SrcOrder==RowMajor ? (_SrcUpLo==Upper ? Lower : Upper) : _SrcUpLo
+    DstMode = DstOrder==RowMajor ? (_DstMode==Upper ? Lower : Upper) : _DstMode,
+    SrcMode = SrcOrder==RowMajor ? (_SrcMode==Upper ? Lower : Upper) : _SrcMode
   };
+
+  MatEval matEval(mat);
   
   Index size = mat.rows();
   VectorI count(size);
   count.setZero();
   dest.resize(size,size);
-  for(Index j = 0; j<size; ++j)
+  for(StorageIndex j = 0; j<size; ++j)
   {
-    Index jp = perm ? perm[j] : j;
-    for(typename MatrixType::InnerIterator it(mat,j); it; ++it)
+    StorageIndex jp = perm ? perm[j] : j;
+    for(MatIterator it(matEval,j); it; ++it)
     {
-      Index i = it.index();
-      if((int(SrcUpLo)==int(Lower) && i<j) || (int(SrcUpLo)==int(Upper) && i>j))
+      StorageIndex i = it.index();
+      if((int(SrcMode)==int(Lower) && i<j) || (int(SrcMode)==int(Upper) && i>j))
         continue;
                   
-      Index ip = perm ? perm[i] : i;
-      count[int(DstUpLo)==int(Lower) ? (std::min)(ip,jp) : (std::max)(ip,jp)]++;
+      StorageIndex ip = perm ? perm[i] : i;
+      count[int(DstMode)==int(Lower) ? (std::min)(ip,jp) : (std::max)(ip,jp)]++;
     }
   }
   dest.outerIndexPtr()[0] = 0;
@@ -435,23 +555,23 @@ void permute_symm_to_symm(const MatrixType& mat, SparseMatrix<typename MatrixTyp
   for(Index j=0; j<size; ++j)
     count[j] = dest.outerIndexPtr()[j];
   
-  for(Index j = 0; j<size; ++j)
+  for(StorageIndex j = 0; j<size; ++j)
   {
     
-    for(typename MatrixType::InnerIterator it(mat,j); it; ++it)
+    for(MatIterator it(matEval,j); it; ++it)
     {
-      Index i = it.index();
-      if((int(SrcUpLo)==int(Lower) && i<j) || (int(SrcUpLo)==int(Upper) && i>j))
+      StorageIndex i = it.index();
+      if((int(SrcMode)==int(Lower) && i<j) || (int(SrcMode)==int(Upper) && i>j))
         continue;
                   
-      Index jp = perm ? perm[j] : j;
-      Index ip = perm? perm[i] : i;
+      StorageIndex jp = perm ? perm[j] : j;
+      StorageIndex ip = perm? perm[i] : i;
       
-      Index k = count[int(DstUpLo)==int(Lower) ? (std::min)(ip,jp) : (std::max)(ip,jp)]++;
-      dest.innerIndexPtr()[k] = int(DstUpLo)==int(Lower) ? (std::max)(ip,jp) : (std::min)(ip,jp);
+      Index k = count[int(DstMode)==int(Lower) ? (std::min)(ip,jp) : (std::max)(ip,jp)]++;
+      dest.innerIndexPtr()[k] = int(DstMode)==int(Lower) ? (std::max)(ip,jp) : (std::min)(ip,jp);
       
       if(!StorageOrderMatch) std::swap(ip,jp);
-      if( ((int(DstUpLo)==int(Lower) && ip<jp) || (int(DstUpLo)==int(Upper) && ip>jp)))
+      if( ((int(DstMode)==int(Lower) && ip<jp) || (int(DstMode)==int(Upper) && ip>jp)))
         dest.valuePtr()[k] = numext::conj(it.value());
       else
         dest.valuePtr()[k] = it.value();
@@ -461,19 +581,33 @@ void permute_symm_to_symm(const MatrixType& mat, SparseMatrix<typename MatrixTyp
 
 }
 
-template<typename MatrixType,int UpLo>
+// TODO implement twists in a more evaluator friendly fashion
+
+namespace internal {
+
+template<typename MatrixType, int Mode>
+struct traits<SparseSymmetricPermutationProduct<MatrixType,Mode> > : traits<MatrixType> {
+};
+
+}
+
+template<typename MatrixType,int Mode>
 class SparseSymmetricPermutationProduct
-  : public EigenBase<SparseSymmetricPermutationProduct<MatrixType,UpLo> >
+  : public EigenBase<SparseSymmetricPermutationProduct<MatrixType,Mode> >
 {
   public:
     typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::Index Index;
+    typedef typename MatrixType::StorageIndex StorageIndex;
+    enum {
+      RowsAtCompileTime = internal::traits<SparseSymmetricPermutationProduct>::RowsAtCompileTime,
+      ColsAtCompileTime = internal::traits<SparseSymmetricPermutationProduct>::ColsAtCompileTime
+    };
   protected:
-    typedef PermutationMatrix<Dynamic,Dynamic,Index> Perm;
+    typedef PermutationMatrix<Dynamic,Dynamic,StorageIndex> Perm;
   public:
-    typedef Matrix<Index,Dynamic,1> VectorI;
+    typedef Matrix<StorageIndex,Dynamic,1> VectorI;
     typedef typename MatrixType::Nested MatrixTypeNested;
-    typedef typename internal::remove_all<MatrixTypeNested>::type _MatrixTypeNested;
+    typedef typename internal::remove_all<MatrixTypeNested>::type NestedExpression;
     
     SparseSymmetricPermutationProduct(const MatrixType& mat, const Perm& perm)
       : m_matrix(mat), m_perm(perm)
@@ -481,20 +615,9 @@ class SparseSymmetricPermutationProduct
     
     inline Index rows() const { return m_matrix.rows(); }
     inline Index cols() const { return m_matrix.cols(); }
-    
-    template<typename DestScalar, int Options, typename DstIndex>
-    void evalTo(SparseMatrix<DestScalar,Options,DstIndex>& _dest) const
-    {
-//       internal::permute_symm_to_fullsymm<UpLo>(m_matrix,_dest,m_perm.indices().data());
-      SparseMatrix<DestScalar,(Options&RowMajor)==RowMajor ? ColMajor : RowMajor, DstIndex> tmp;
-      internal::permute_symm_to_fullsymm<UpLo>(m_matrix,tmp,m_perm.indices().data());
-      _dest = tmp;
-    }
-    
-    template<typename DestType,unsigned int DestUpLo> void evalTo(SparseSelfAdjointView<DestType,DestUpLo>& dest) const
-    {
-      internal::permute_symm_to_symm<UpLo,DestUpLo>(m_matrix,dest.matrix(),m_perm.indices().data());
-    }
+        
+    const NestedExpression& matrix() const { return m_matrix; }
+    const Perm& perm() const { return m_perm; }
     
   protected:
     MatrixTypeNested m_matrix;
@@ -502,6 +625,31 @@ class SparseSymmetricPermutationProduct
 
 };
 
+namespace internal {
+  
+template<typename DstXprType, typename MatrixType, int Mode, typename Scalar>
+struct Assignment<DstXprType, SparseSymmetricPermutationProduct<MatrixType,Mode>, internal::assign_op<Scalar,typename MatrixType::Scalar>, Sparse2Sparse>
+{
+  typedef SparseSymmetricPermutationProduct<MatrixType,Mode> SrcXprType;
+  typedef typename DstXprType::StorageIndex DstIndex;
+  template<int Options>
+  static void run(SparseMatrix<Scalar,Options,DstIndex> &dst, const SrcXprType &src, const internal::assign_op<Scalar,typename MatrixType::Scalar> &)
+  {
+    // internal::permute_symm_to_fullsymm<Mode>(m_matrix,_dest,m_perm.indices().data());
+    SparseMatrix<Scalar,(Options&RowMajor)==RowMajor ? ColMajor : RowMajor, DstIndex> tmp;
+    internal::permute_symm_to_fullsymm<Mode>(src.matrix(),tmp,src.perm().indices().data());
+    dst = tmp;
+  }
+  
+  template<typename DestType,unsigned int DestMode>
+  static void run(SparseSelfAdjointView<DestType,DestMode>& dst, const SrcXprType &src, const internal::assign_op<Scalar,typename MatrixType::Scalar> &)
+  {
+    internal::permute_symm_to_symm<Mode,DestMode>(src.matrix(),dst.matrix(),src.perm().indices().data());
+  }
+};
+
+} // end namespace internal
+
 } // end namespace Eigen
 
 #endif // EIGEN_SPARSE_SELFADJOINTVIEW_H
diff --git a/Eigen/src/SparseCore/SparseSolverBase.h b/Eigen/src/SparseCore/SparseSolverBase.h
new file mode 100644
index 000000000..b4c9a422f
--- /dev/null
+++ b/Eigen/src/SparseCore/SparseSolverBase.h
@@ -0,0 +1,124 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSESOLVERBASE_H
+#define EIGEN_SPARSESOLVERBASE_H
+
+namespace Eigen { 
+
+namespace internal {
+
+  /** \internal
+  * Helper functions to solve with a sparse right-hand-side and result.
+  * The rhs is decomposed into small vertical panels which are solved through dense temporaries.
+  */
+template<typename Decomposition, typename Rhs, typename Dest>
+typename enable_if<Rhs::ColsAtCompileTime!=1 && Dest::ColsAtCompileTime!=1>::type
+solve_sparse_through_dense_panels(const Decomposition &dec, const Rhs& rhs, Dest &dest)
+{
+  EIGEN_STATIC_ASSERT((Dest::Flags&RowMajorBit)==0,THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES);
+  typedef typename Dest::Scalar DestScalar;
+  // we process the sparse rhs per block of NbColsAtOnce columns temporarily stored into a dense matrix.
+  static const Index NbColsAtOnce = 4;
+  Index rhsCols = rhs.cols();
+  Index size = rhs.rows();
+  // the temporary matrices do not need more columns than NbColsAtOnce:
+  Index tmpCols = (std::min)(rhsCols, NbColsAtOnce); 
+  Eigen::Matrix<DestScalar,Dynamic,Dynamic> tmp(size,tmpCols);
+  Eigen::Matrix<DestScalar,Dynamic,Dynamic> tmpX(size,tmpCols);
+  for(Index k=0; k<rhsCols; k+=NbColsAtOnce)
+  {
+    Index actualCols = std::min<Index>(rhsCols-k, NbColsAtOnce);
+    tmp.leftCols(actualCols) = rhs.middleCols(k,actualCols);
+    tmpX.leftCols(actualCols) = dec.solve(tmp.leftCols(actualCols));
+    dest.middleCols(k,actualCols) = tmpX.leftCols(actualCols).sparseView();
+  }
+}
+
+// Overload for vector as rhs
+template<typename Decomposition, typename Rhs, typename Dest>
+typename enable_if<Rhs::ColsAtCompileTime==1 || Dest::ColsAtCompileTime==1>::type
+solve_sparse_through_dense_panels(const Decomposition &dec, const Rhs& rhs, Dest &dest)
+{
+  typedef typename Dest::Scalar DestScalar;
+  Index size = rhs.rows();
+  Eigen::Matrix<DestScalar,Dynamic,1> rhs_dense(rhs);
+  Eigen::Matrix<DestScalar,Dynamic,1> dest_dense(size);
+  dest_dense = dec.solve(rhs_dense);
+  dest = dest_dense.sparseView();
+}
+
+} // end namespace internal
+
+/** \class SparseSolverBase
+  * \ingroup SparseCore_Module
+  * \brief A base class for sparse solvers
+  *
+  * \tparam Derived the actual type of the solver.
+  *
+  */
+template<typename Derived>
+class SparseSolverBase : internal::noncopyable
+{
+  public:
+
+    /** Default constructor */
+    SparseSolverBase()
+      : m_isInitialized(false)
+    {}
+
+    ~SparseSolverBase()
+    {}
+
+    Derived& derived() { return *static_cast<Derived*>(this); }
+    const Derived& derived() const { return *static_cast<const Derived*>(this); }
+    
+    /** \returns an expression of the solution x of \f$ A x = b \f$ using the current decomposition of A.
+      *
+      * \sa compute()
+      */
+    template<typename Rhs>
+    inline const Solve<Derived, Rhs>
+    solve(const MatrixBase<Rhs>& b) const
+    {
+      eigen_assert(m_isInitialized && "Solver is not initialized.");
+      eigen_assert(derived().rows()==b.rows() && "solve(): invalid number of rows of the right hand side matrix b");
+      return Solve<Derived, Rhs>(derived(), b.derived());
+    }
+    
+    /** \returns an expression of the solution x of \f$ A x = b \f$ using the current decomposition of A.
+      *
+      * \sa compute()
+      */
+    template<typename Rhs>
+    inline const Solve<Derived, Rhs>
+    solve(const SparseMatrixBase<Rhs>& b) const
+    {
+      eigen_assert(m_isInitialized && "Solver is not initialized.");
+      eigen_assert(derived().rows()==b.rows() && "solve(): invalid number of rows of the right hand side matrix b");
+      return Solve<Derived, Rhs>(derived(), b.derived());
+    }
+    
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** \internal default implementation of solving with a sparse rhs */
+    template<typename Rhs,typename Dest>
+    void _solve_impl(const SparseMatrixBase<Rhs> &b, SparseMatrixBase<Dest> &dest) const
+    {
+      internal::solve_sparse_through_dense_panels(derived(), b.derived(), dest.derived());
+    }
+    #endif // EIGEN_PARSED_BY_DOXYGEN
+
+  protected:
+    
+    mutable bool m_isInitialized;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPARSESOLVERBASE_H
diff --git a/Eigen/src/SparseCore/SparseSparseProductWithPruning.h b/Eigen/src/SparseCore/SparseSparseProductWithPruning.h
index fcc18f5c9..21c419002 100644
--- a/Eigen/src/SparseCore/SparseSparseProductWithPruning.h
+++ b/Eigen/src/SparseCore/SparseSparseProductWithPruning.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -22,7 +22,7 @@ static void sparse_sparse_product_with_pruning_impl(const Lhs& lhs, const Rhs& r
   // return sparse_sparse_product_with_pruning_impl2(lhs,rhs,res);
 
   typedef typename remove_all<Lhs>::type::Scalar Scalar;
-  typedef typename remove_all<Lhs>::type::Index Index;
+  typedef typename remove_all<Lhs>::type::StorageIndex StorageIndex;
 
   // make sure to call innerSize/outerSize since we fake the storage order.
   Index rows = lhs.innerSize();
@@ -31,24 +31,27 @@ static void sparse_sparse_product_with_pruning_impl(const Lhs& lhs, const Rhs& r
   eigen_assert(lhs.outerSize() == rhs.innerSize());
 
   // allocate a temporary buffer
-  AmbiVector<Scalar,Index> tempVector(rows);
+  AmbiVector<Scalar,StorageIndex> tempVector(rows);
 
+  // mimics a resizeByInnerOuter:
+  if(ResultType::IsRowMajor)
+    res.resize(cols, rows);
+  else
+    res.resize(rows, cols);
+  
+  evaluator<Lhs> lhsEval(lhs);
+  evaluator<Rhs> rhsEval(rhs);
+  
   // estimate the number of non zero entries
   // given a rhs column containing Y non zeros, we assume that the respective Y columns
   // of the lhs differs in average of one non zeros, thus the number of non zeros for
   // the product of a rhs column with the lhs is X+Y where X is the average number of non zero
   // per column of the lhs.
   // Therefore, we have nnz(lhs*rhs) = nnz(lhs) + nnz(rhs)
-  Index estimated_nnz_prod = lhs.nonZeros() + rhs.nonZeros();
-
-  // mimics a resizeByInnerOuter:
-  if(ResultType::IsRowMajor)
-    res.resize(cols, rows);
-  else
-    res.resize(rows, cols);
+  Index estimated_nnz_prod = lhsEval.nonZerosEstimate() + rhsEval.nonZerosEstimate();
 
   res.reserve(estimated_nnz_prod);
-  double ratioColRes = double(estimated_nnz_prod)/double(lhs.rows()*rhs.cols());
+  double ratioColRes = double(estimated_nnz_prod)/(double(lhs.rows())*double(rhs.cols()));
   for (Index j=0; j<cols; ++j)
   {
     // FIXME:
@@ -56,18 +59,18 @@ static void sparse_sparse_product_with_pruning_impl(const Lhs& lhs, const Rhs& r
     // let's do a more accurate determination of the nnz ratio for the current column j of res
     tempVector.init(ratioColRes);
     tempVector.setZero();
-    for (typename Rhs::InnerIterator rhsIt(rhs, j); rhsIt; ++rhsIt)
+    for (typename evaluator<Rhs>::InnerIterator rhsIt(rhsEval, j); rhsIt; ++rhsIt)
     {
       // FIXME should be written like this: tmp += rhsIt.value() * lhs.col(rhsIt.index())
       tempVector.restart();
       Scalar x = rhsIt.value();
-      for (typename Lhs::InnerIterator lhsIt(lhs, rhsIt.index()); lhsIt; ++lhsIt)
+      for (typename evaluator<Lhs>::InnerIterator lhsIt(lhsEval, rhsIt.index()); lhsIt; ++lhsIt)
       {
         tempVector.coeffRef(lhsIt.index()) += lhsIt.value() * x;
       }
     }
     res.startVec(j);
-    for (typename AmbiVector<Scalar,Index>::Iterator it(tempVector,tolerance); it; ++it)
+    for (typename AmbiVector<Scalar,StorageIndex>::Iterator it(tempVector,tolerance); it; ++it)
       res.insertBackByOuterInner(j,it.index()) = it.value();
   }
   res.finalize();
@@ -100,7 +103,7 @@ struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,ColMajor,C
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
   {
     // we need a col-major matrix to hold the result
-    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::Index> SparseTemporaryType;
+    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> SparseTemporaryType;
     SparseTemporaryType _res(res.rows(), res.cols());
     internal::sparse_sparse_product_with_pruning_impl<Lhs,Rhs,SparseTemporaryType>(lhs, rhs, _res, tolerance);
     res = _res;
@@ -126,8 +129,8 @@ struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,RowMajor,R
   typedef typename ResultType::RealScalar RealScalar;
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
   {
-    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename Lhs::Index> ColMajorMatrixLhs;
-    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename Lhs::Index> ColMajorMatrixRhs;
+    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename Lhs::StorageIndex> ColMajorMatrixLhs;
+    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename Lhs::StorageIndex> ColMajorMatrixRhs;
     ColMajorMatrixLhs colLhs(lhs);
     ColMajorMatrixRhs colRhs(rhs);
     internal::sparse_sparse_product_with_pruning_impl<ColMajorMatrixLhs,ColMajorMatrixRhs,ResultType>(colLhs, colRhs, res, tolerance);
@@ -140,8 +143,53 @@ struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,RowMajor,R
   }
 };
 
-// NOTE the 2 others cases (col row *) must never occur since they are caught
-// by ProductReturnType which transforms it to (col col *) by evaluating rhs.
+template<typename Lhs, typename Rhs, typename ResultType>
+struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,ColMajor,RowMajor,RowMajor>
+{
+  typedef typename ResultType::RealScalar RealScalar;
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
+  {
+    typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename Lhs::StorageIndex> RowMajorMatrixLhs;
+    RowMajorMatrixLhs rowLhs(lhs);
+    sparse_sparse_product_with_pruning_selector<RowMajorMatrixLhs,Rhs,ResultType,RowMajor,RowMajor>(rowLhs,rhs,res,tolerance);
+  }
+};
+
+template<typename Lhs, typename Rhs, typename ResultType>
+struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,RowMajor,ColMajor,RowMajor>
+{
+  typedef typename ResultType::RealScalar RealScalar;
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
+  {
+    typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename Lhs::StorageIndex> RowMajorMatrixRhs;
+    RowMajorMatrixRhs rowRhs(rhs);
+    sparse_sparse_product_with_pruning_selector<Lhs,RowMajorMatrixRhs,ResultType,RowMajor,RowMajor,RowMajor>(lhs,rowRhs,res,tolerance);
+  }
+};
+
+template<typename Lhs, typename Rhs, typename ResultType>
+struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,ColMajor,RowMajor,ColMajor>
+{
+  typedef typename ResultType::RealScalar RealScalar;
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
+  {
+    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename Lhs::StorageIndex> ColMajorMatrixRhs;
+    ColMajorMatrixRhs colRhs(rhs);
+    internal::sparse_sparse_product_with_pruning_impl<Lhs,ColMajorMatrixRhs,ResultType>(lhs, colRhs, res, tolerance);
+  }
+};
+
+template<typename Lhs, typename Rhs, typename ResultType>
+struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,RowMajor,ColMajor,ColMajor>
+{
+  typedef typename ResultType::RealScalar RealScalar;
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
+  {
+    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename Lhs::StorageIndex> ColMajorMatrixLhs;
+    ColMajorMatrixLhs colLhs(lhs);
+    internal::sparse_sparse_product_with_pruning_impl<ColMajorMatrixLhs,Rhs,ResultType>(colLhs, rhs, res, tolerance);
+  }
+};
 
 } // end namespace internal
 
diff --git a/Eigen/src/SparseCore/SparseTranspose.h b/Eigen/src/SparseCore/SparseTranspose.h
index 76d031d52..3757d4c6b 100644
--- a/Eigen/src/SparseCore/SparseTranspose.h
+++ b/Eigen/src/SparseCore/SparseTranspose.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -12,52 +12,81 @@
 
 namespace Eigen { 
 
-template<typename MatrixType> class TransposeImpl<MatrixType,Sparse>
-  : public SparseMatrixBase<Transpose<MatrixType> >
-{
-    typedef typename internal::remove_all<typename MatrixType::Nested>::type _MatrixTypeNested;
+namespace internal {
+  template<typename MatrixType,int CompressedAccess=int(MatrixType::Flags&CompressedAccessBit)>
+  class SparseTransposeImpl
+    : public SparseMatrixBase<Transpose<MatrixType> >
+  {};
+  
+  template<typename MatrixType>
+  class SparseTransposeImpl<MatrixType,CompressedAccessBit>
+    : public SparseCompressedBase<Transpose<MatrixType> >
+  {
+    typedef SparseCompressedBase<Transpose<MatrixType> > Base;
   public:
-
-    EIGEN_SPARSE_PUBLIC_INTERFACE(Transpose<MatrixType> )
-
-    class InnerIterator;
-    class ReverseInnerIterator;
+    using Base::derived;
+    typedef typename Base::Scalar Scalar;
+    typedef typename Base::StorageIndex StorageIndex;
 
     inline Index nonZeros() const { return derived().nestedExpression().nonZeros(); }
-};
+    
+    inline const Scalar* valuePtr() const { return derived().nestedExpression().valuePtr(); }
+    inline const StorageIndex* innerIndexPtr() const { return derived().nestedExpression().innerIndexPtr(); }
+    inline const StorageIndex* outerIndexPtr() const { return derived().nestedExpression().outerIndexPtr(); }
+    inline const StorageIndex* innerNonZeroPtr() const { return derived().nestedExpression().innerNonZeroPtr(); }
 
-// NOTE: VC10 and VC11 trigger an ICE if don't put typename TransposeImpl<MatrixType,Sparse>:: in front of Index,
-// a typedef typename TransposeImpl<MatrixType,Sparse>::Index Index;
-// does not fix the issue.
-// An alternative is to define the nested class in the parent class itself.
-template<typename MatrixType> class TransposeImpl<MatrixType,Sparse>::InnerIterator
-  : public _MatrixTypeNested::InnerIterator
+    inline Scalar* valuePtr() { return derived().nestedExpression().valuePtr(); }
+    inline StorageIndex* innerIndexPtr() { return derived().nestedExpression().innerIndexPtr(); }
+    inline StorageIndex* outerIndexPtr() { return derived().nestedExpression().outerIndexPtr(); }
+    inline StorageIndex* innerNonZeroPtr() { return derived().nestedExpression().innerNonZeroPtr(); }
+  };
+}
+  
+template<typename MatrixType> class TransposeImpl<MatrixType,Sparse>
+  : public internal::SparseTransposeImpl<MatrixType>
 {
-    typedef typename _MatrixTypeNested::InnerIterator Base;
-    typedef typename TransposeImpl::Index Index;
-  public:
-
-    EIGEN_STRONG_INLINE InnerIterator(const TransposeImpl& trans, typename TransposeImpl<MatrixType,Sparse>::Index outer)
-      : Base(trans.derived().nestedExpression(), outer)
-    {}
-    typename TransposeImpl<MatrixType,Sparse>::Index row() const { return Base::col(); }
-    typename TransposeImpl<MatrixType,Sparse>::Index col() const { return Base::row(); }
+  protected:
+    typedef internal::SparseTransposeImpl<MatrixType> Base;
 };
 
-template<typename MatrixType> class TransposeImpl<MatrixType,Sparse>::ReverseInnerIterator
-  : public _MatrixTypeNested::ReverseInnerIterator
+namespace internal {
+  
+template<typename ArgType>
+struct unary_evaluator<Transpose<ArgType>, IteratorBased>
+  : public evaluator_base<Transpose<ArgType> >
 {
-    typedef typename _MatrixTypeNested::ReverseInnerIterator Base;
-    typedef typename TransposeImpl::Index Index;
+    typedef typename evaluator<ArgType>::InnerIterator        EvalIterator;
   public:
+    typedef Transpose<ArgType> XprType;
+    
+    inline Index nonZerosEstimate() const {
+      return m_argImpl.nonZerosEstimate();
+    }
 
-    EIGEN_STRONG_INLINE ReverseInnerIterator(const TransposeImpl& xpr, typename TransposeImpl<MatrixType,Sparse>::Index outer)
-      : Base(xpr.derived().nestedExpression(), outer)
-    {}
-    typename TransposeImpl<MatrixType,Sparse>::Index row() const { return Base::col(); }
-    typename TransposeImpl<MatrixType,Sparse>::Index col() const { return Base::row(); }
+    class InnerIterator : public EvalIterator
+    {
+    public:
+      EIGEN_STRONG_INLINE InnerIterator(const unary_evaluator& unaryOp, Index outer)
+        : EvalIterator(unaryOp.m_argImpl,outer)
+      {}
+      
+      Index row() const { return EvalIterator::col(); }
+      Index col() const { return EvalIterator::row(); }
+    };
+    
+    enum {
+      CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
+      Flags = XprType::Flags
+    };
+    
+    explicit unary_evaluator(const XprType& op) :m_argImpl(op.nestedExpression()) {}
+
+  protected:
+    evaluator<ArgType> m_argImpl;
 };
 
+} // end namespace internal
+
 } // end namespace Eigen
 
 #endif // EIGEN_SPARSETRANSPOSE_H
diff --git a/Eigen/src/SparseCore/SparseTriangularView.h b/Eigen/src/SparseCore/SparseTriangularView.h
index 333127b78..9ac120266 100644
--- a/Eigen/src/SparseCore/SparseTriangularView.h
+++ b/Eigen/src/SparseCore/SparseTriangularView.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2009-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -11,19 +11,19 @@
 #ifndef EIGEN_SPARSE_TRIANGULARVIEW_H
 #define EIGEN_SPARSE_TRIANGULARVIEW_H
 
-namespace Eigen { 
-
-namespace internal {
-  
-template<typename MatrixType, int Mode>
-struct traits<SparseTriangularView<MatrixType,Mode> >
-: public traits<MatrixType>
-{};
-
-} // namespace internal
-
-template<typename MatrixType, int Mode> class SparseTriangularView
-  : public SparseMatrixBase<SparseTriangularView<MatrixType,Mode> >
+namespace Eigen {
+
+/** \ingroup SparseCore_Module
+  *
+  * \brief Base class for a triangular part in a \b sparse matrix
+  *
+  * This class is an abstract base class of class TriangularView, and objects of type TriangularViewImpl cannot be instantiated.
+  * It extends class TriangularView with additional methods which are available for sparse expressions only.
+  *
+  * \sa class TriangularView, SparseMatrixBase::triangularView()
+  */
+template<typename MatrixType, unsigned int Mode> class TriangularViewImpl<MatrixType,Mode,Sparse>
+  : public SparseMatrixBase<TriangularView<MatrixType,Mode> >
 {
     enum { SkipFirst = ((Mode&Lower) && !(MatrixType::Flags&RowMajorBit))
                     || ((Mode&Upper) &&  (MatrixType::Flags&RowMajorBit)),
@@ -31,147 +31,157 @@ template<typename MatrixType, int Mode> class SparseTriangularView
            SkipDiag = (Mode&ZeroDiag) ? 1 : 0,
            HasUnitDiag = (Mode&UnitDiag) ? 1 : 0
     };
+    
+    typedef TriangularView<MatrixType,Mode> TriangularViewType;
+    
+  protected:
+    // dummy solve function to make TriangularView happy.
+    void solve() const;
 
+    typedef SparseMatrixBase<TriangularViewType> Base;
   public:
     
-    EIGEN_SPARSE_PUBLIC_INTERFACE(SparseTriangularView)
-
-    class InnerIterator;
-    class ReverseInnerIterator;
-
-    inline Index rows() const { return m_matrix.rows(); }
-    inline Index cols() const { return m_matrix.cols(); }
-
+    EIGEN_SPARSE_PUBLIC_INTERFACE(TriangularViewType)
+    
     typedef typename MatrixType::Nested MatrixTypeNested;
     typedef typename internal::remove_reference<MatrixTypeNested>::type MatrixTypeNestedNonRef;
     typedef typename internal::remove_all<MatrixTypeNested>::type MatrixTypeNestedCleaned;
 
-    inline SparseTriangularView(const MatrixType& matrix) : m_matrix(matrix) {}
-
-    /** \internal */
-    inline const MatrixTypeNestedCleaned& nestedExpression() const { return m_matrix; }
-
-    template<typename OtherDerived>
-    typename internal::plain_matrix_type_column_major<OtherDerived>::type
-    solve(const MatrixBase<OtherDerived>& other) const;
+    template<typename RhsType, typename DstType>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _solve_impl(const RhsType &rhs, DstType &dst) const {
+      if(!(internal::is_same<RhsType,DstType>::value && internal::extract_data(dst) == internal::extract_data(rhs)))
+        dst = rhs;
+      this->solveInPlace(dst);
+    }
 
+    /** Applies the inverse of \c *this to the dense vector or matrix \a other, "in-place" */
     template<typename OtherDerived> void solveInPlace(MatrixBase<OtherDerived>& other) const;
-    template<typename OtherDerived> void solveInPlace(SparseMatrixBase<OtherDerived>& other) const;
 
-  protected:
-    MatrixTypeNested m_matrix;
+    /** Applies the inverse of \c *this to the sparse vector or matrix \a other, "in-place" */
+    template<typename OtherDerived> void solveInPlace(SparseMatrixBase<OtherDerived>& other) const;
+  
 };
 
-template<typename MatrixType, int Mode>
-class SparseTriangularView<MatrixType,Mode>::InnerIterator : public MatrixTypeNestedCleaned::InnerIterator
+namespace internal {
+
+template<typename ArgType, unsigned int Mode>
+struct unary_evaluator<TriangularView<ArgType,Mode>, IteratorBased>
+ : evaluator_base<TriangularView<ArgType,Mode> >
 {
-    typedef typename MatrixTypeNestedCleaned::InnerIterator Base;
-    typedef typename SparseTriangularView::Index Index;
-  public:
+  typedef TriangularView<ArgType,Mode> XprType;
+  
+protected:
+  
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::StorageIndex StorageIndex;
+  typedef typename evaluator<ArgType>::InnerIterator EvalIterator;
+  
+  enum { SkipFirst = ((Mode&Lower) && !(ArgType::Flags&RowMajorBit))
+                    || ((Mode&Upper) &&  (ArgType::Flags&RowMajorBit)),
+         SkipLast = !SkipFirst,
+         SkipDiag = (Mode&ZeroDiag) ? 1 : 0,
+         HasUnitDiag = (Mode&UnitDiag) ? 1 : 0
+  };
+  
+public:
+  
+  enum {
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
+    Flags = XprType::Flags
+  };
+    
+  explicit unary_evaluator(const XprType &xpr) : m_argImpl(xpr.nestedExpression()), m_arg(xpr.nestedExpression()) {}
+  
+  inline Index nonZerosEstimate() const {
+    return m_argImpl.nonZerosEstimate();
+  }
+  
+  class InnerIterator : public EvalIterator
+  {
+      typedef EvalIterator Base;
+    public:
 
-    EIGEN_STRONG_INLINE InnerIterator(const SparseTriangularView& view, Index outer)
-      : Base(view.nestedExpression(), outer), m_returnOne(false)
-    {
-      if(SkipFirst)
+      EIGEN_STRONG_INLINE InnerIterator(const unary_evaluator& xprEval, Index outer)
+        : Base(xprEval.m_argImpl,outer), m_returnOne(false), m_containsDiag(Base::outer()<xprEval.m_arg.innerSize())
       {
-        while((*this) && ((HasUnitDiag||SkipDiag)  ? this->index()<=outer : this->index()<outer))
-          Base::operator++();
-        if(HasUnitDiag)
-          m_returnOne = true;
+        if(SkipFirst)
+        {
+          while((*this) && ((HasUnitDiag||SkipDiag)  ? this->index()<=outer : this->index()<outer))
+            Base::operator++();
+          if(HasUnitDiag)
+            m_returnOne = m_containsDiag;
+        }
+        else if(HasUnitDiag && ((!Base::operator bool()) || Base::index()>=Base::outer()))
+        {
+          if((!SkipFirst) && Base::operator bool())
+            Base::operator++();
+          m_returnOne = m_containsDiag;
+        }
       }
-      else if(HasUnitDiag && ((!Base::operator bool()) || Base::index()>=Base::outer()))
+
+      EIGEN_STRONG_INLINE InnerIterator& operator++()
       {
-        if((!SkipFirst) && Base::operator bool())
+        if(HasUnitDiag && m_returnOne)
+          m_returnOne = false;
+        else
+        {
           Base::operator++();
-        m_returnOne = true;
+          if(HasUnitDiag && (!SkipFirst) && ((!Base::operator bool()) || Base::index()>=Base::outer()))
+          {
+            if((!SkipFirst) && Base::operator bool())
+              Base::operator++();
+            m_returnOne = m_containsDiag;
+          }
+        }
+        return *this;
       }
-    }
-
-    EIGEN_STRONG_INLINE InnerIterator& operator++()
-    {
-      if(HasUnitDiag && m_returnOne)
-        m_returnOne = false;
-      else
+      
+      EIGEN_STRONG_INLINE operator bool() const
       {
-        Base::operator++();
-        if(HasUnitDiag && (!SkipFirst) && ((!Base::operator bool()) || Base::index()>=Base::outer()))
+        if(HasUnitDiag && m_returnOne)
+          return true;
+        if(SkipFirst) return  Base::operator bool();
+        else
         {
-          if((!SkipFirst) && Base::operator bool())
-            Base::operator++();
-          m_returnOne = true;
+          if (SkipDiag) return (Base::operator bool() && this->index() < this->outer());
+          else return (Base::operator bool() && this->index() <= this->outer());
         }
       }
-      return *this;
-    }
-
-    inline Index row() const { return (MatrixType::Flags&RowMajorBit ? Base::outer() : this->index()); }
-    inline Index col() const { return (MatrixType::Flags&RowMajorBit ? this->index() : Base::outer()); }
-    inline Index index() const
-    {
-      if(HasUnitDiag && m_returnOne)  return Base::outer();
-      else                            return Base::index();
-    }
-    inline Scalar value() const
-    {
-      if(HasUnitDiag && m_returnOne)  return Scalar(1);
-      else                            return Base::value();
-    }
 
-    EIGEN_STRONG_INLINE operator bool() const
-    {
-      if(HasUnitDiag && m_returnOne)
-        return true;
-      if(SkipFirst) return  Base::operator bool();
-      else
+//       inline Index row() const { return (ArgType::Flags&RowMajorBit ? Base::outer() : this->index()); }
+//       inline Index col() const { return (ArgType::Flags&RowMajorBit ? this->index() : Base::outer()); }
+      inline StorageIndex index() const
       {
-        if (SkipDiag) return (Base::operator bool() && this->index() < this->outer());
-        else return (Base::operator bool() && this->index() <= this->outer());
+        if(HasUnitDiag && m_returnOne)  return internal::convert_index<StorageIndex>(Base::outer());
+        else                            return Base::index();
       }
-    }
-  protected:
-    bool m_returnOne;
-};
-
-template<typename MatrixType, int Mode>
-class SparseTriangularView<MatrixType,Mode>::ReverseInnerIterator : public MatrixTypeNestedCleaned::ReverseInnerIterator
-{
-    typedef typename MatrixTypeNestedCleaned::ReverseInnerIterator Base;
-    typedef typename SparseTriangularView::Index Index;
-  public:
-
-    EIGEN_STRONG_INLINE ReverseInnerIterator(const SparseTriangularView& view, Index outer)
-      : Base(view.nestedExpression(), outer)
-    {
-      eigen_assert((!HasUnitDiag) && "ReverseInnerIterator does not support yet triangular views with a unit diagonal");
-      if(SkipLast) {
-        while((*this) && (SkipDiag ? this->index()>=outer : this->index()>outer))
-          --(*this);
-      }
-    }
-
-    EIGEN_STRONG_INLINE ReverseInnerIterator& operator--()
-    { Base::operator--(); return *this; }
-
-    inline Index row() const { return Base::row(); }
-    inline Index col() const { return Base::col(); }
-
-    EIGEN_STRONG_INLINE operator bool() const
-    {
-      if (SkipLast) return Base::operator bool() ;
-      else
+      inline Scalar value() const
       {
-        if(SkipDiag) return (Base::operator bool() && this->index() > this->outer());
-        else return (Base::operator bool() && this->index() >= this->outer());
+        if(HasUnitDiag && m_returnOne)  return Scalar(1);
+        else                            return Base::value();
       }
-    }
+
+    protected:
+      bool m_returnOne;
+      bool m_containsDiag;
+    private:
+      Scalar& valueRef();
+  };
+  
+protected:
+  evaluator<ArgType> m_argImpl;
+  const ArgType& m_arg;
 };
 
+} // end namespace internal
+
 template<typename Derived>
 template<int Mode>
-inline const SparseTriangularView<Derived, Mode>
+inline const TriangularView<const Derived, Mode>
 SparseMatrixBase<Derived>::triangularView() const
 {
-  return derived();
+  return TriangularView<const Derived, Mode>(derived());
 }
 
 } // end namespace Eigen
diff --git a/Eigen/src/SparseCore/SparseUtil.h b/Eigen/src/SparseCore/SparseUtil.h
index 0ba471320..74df0d496 100644
--- a/Eigen/src/SparseCore/SparseUtil.h
+++ b/Eigen/src/SparseCore/SparseUtil.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -37,43 +37,23 @@ EIGEN_STRONG_INLINE Derived& operator Op(const Other& scalar) \
 }
 
 #define EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATORS(Derived) \
-EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(Derived, =) \
-EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(Derived, +=) \
-EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(Derived, -=) \
-EIGEN_SPARSE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, *=) \
-EIGEN_SPARSE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, /=)
-
-#define _EIGEN_SPARSE_PUBLIC_INTERFACE(Derived, BaseClass) \
-  typedef BaseClass Base; \
-  typedef typename Eigen::internal::traits<Derived >::Scalar Scalar; \
-  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; \
-  typedef typename Eigen::internal::nested<Derived >::type Nested; \
-  typedef typename Eigen::internal::traits<Derived >::StorageKind StorageKind; \
-  typedef typename Eigen::internal::traits<Derived >::Index Index; \
-  enum { RowsAtCompileTime = Eigen::internal::traits<Derived >::RowsAtCompileTime, \
-        ColsAtCompileTime = Eigen::internal::traits<Derived >::ColsAtCompileTime, \
-        Flags = Eigen::internal::traits<Derived >::Flags, \
-        CoeffReadCost = Eigen::internal::traits<Derived >::CoeffReadCost, \
-        SizeAtCompileTime = Base::SizeAtCompileTime, \
-        IsVectorAtCompileTime = Base::IsVectorAtCompileTime }; \
-  using Base::derived; \
-  using Base::const_cast_derived;
+EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(Derived, =)
+
 
 #define EIGEN_SPARSE_PUBLIC_INTERFACE(Derived) \
-  _EIGEN_SPARSE_PUBLIC_INTERFACE(Derived, Eigen::SparseMatrixBase<Derived >)
+  EIGEN_GENERIC_PUBLIC_INTERFACE(Derived)
 
+  
 const int CoherentAccessPattern     = 0x1;
 const int InnerRandomAccessPattern  = 0x2 | CoherentAccessPattern;
 const int OuterRandomAccessPattern  = 0x4 | CoherentAccessPattern;
 const int RandomAccessPattern       = 0x8 | OuterRandomAccessPattern | InnerRandomAccessPattern;
 
-template<typename Derived> class SparseMatrixBase;
-template<typename _Scalar, int _Flags = 0, typename _Index = int>  class SparseMatrix;
-template<typename _Scalar, int _Flags = 0, typename _Index = int>  class DynamicSparseMatrix;
-template<typename _Scalar, int _Flags = 0, typename _Index = int>  class SparseVector;
-template<typename _Scalar, int _Flags = 0, typename _Index = int>  class MappedSparseMatrix;
+template<typename _Scalar, int _Flags = 0, typename _StorageIndex = int>  class SparseMatrix;
+template<typename _Scalar, int _Flags = 0, typename _StorageIndex = int>  class DynamicSparseMatrix;
+template<typename _Scalar, int _Flags = 0, typename _StorageIndex = int>  class SparseVector;
+template<typename _Scalar, int _Flags = 0, typename _StorageIndex = int>  class MappedSparseMatrix;
 
-template<typename MatrixType, int Mode>           class SparseTriangularView;
 template<typename MatrixType, unsigned int UpLo>  class SparseSelfAdjointView;
 template<typename Lhs, typename Rhs>              class SparseDiagonalProduct;
 template<typename MatrixType> class SparseView;
@@ -85,42 +65,44 @@ template<typename Lhs, typename Rhs, bool Transpose> class SparseDenseOuterProdu
 
 template<typename Lhs, typename Rhs> struct SparseSparseProductReturnType;
 template<typename Lhs, typename Rhs,
-         int InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(internal::traits<Lhs>::ColsAtCompileTime,internal::traits<Rhs>::RowsAtCompileTime)> struct DenseSparseProductReturnType;         
+         int InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(internal::traits<Lhs>::ColsAtCompileTime,internal::traits<Rhs>::RowsAtCompileTime)> struct DenseSparseProductReturnType;
+         
 template<typename Lhs, typename Rhs,
          int InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(internal::traits<Lhs>::ColsAtCompileTime,internal::traits<Rhs>::RowsAtCompileTime)> struct SparseDenseProductReturnType;
 template<typename MatrixType,int UpLo> class SparseSymmetricPermutationProduct;
 
 namespace internal {
 
-template<typename T,int Rows,int Cols> struct sparse_eval;
+template<typename T,int Rows,int Cols,int Flags> struct sparse_eval;
 
 template<typename T> struct eval<T,Sparse>
-  : public sparse_eval<T, traits<T>::RowsAtCompileTime,traits<T>::ColsAtCompileTime>
+  : sparse_eval<T, traits<T>::RowsAtCompileTime,traits<T>::ColsAtCompileTime,traits<T>::Flags>
 {};
 
-template<typename T,int Cols> struct sparse_eval<T,1,Cols> {
+template<typename T,int Cols,int Flags> struct sparse_eval<T,1,Cols,Flags> {
     typedef typename traits<T>::Scalar _Scalar;
-    typedef typename traits<T>::Index _Index;
+    typedef typename traits<T>::StorageIndex _StorageIndex;
   public:
-    typedef SparseVector<_Scalar, RowMajor, _Index> type;
+    typedef SparseVector<_Scalar, RowMajor, _StorageIndex> type;
 };
 
-template<typename T,int Rows> struct sparse_eval<T,Rows,1> {
+template<typename T,int Rows,int Flags> struct sparse_eval<T,Rows,1,Flags> {
     typedef typename traits<T>::Scalar _Scalar;
-    typedef typename traits<T>::Index _Index;
+    typedef typename traits<T>::StorageIndex _StorageIndex;
   public:
-    typedef SparseVector<_Scalar, ColMajor, _Index> type;
+    typedef SparseVector<_Scalar, ColMajor, _StorageIndex> type;
 };
 
-template<typename T,int Rows,int Cols> struct sparse_eval {
+// TODO this seems almost identical to plain_matrix_type<T, Sparse>
+template<typename T,int Rows,int Cols,int Flags> struct sparse_eval {
     typedef typename traits<T>::Scalar _Scalar;
-    typedef typename traits<T>::Index _Index;
-    enum { _Options = ((traits<T>::Flags&RowMajorBit)==RowMajorBit) ? RowMajor : ColMajor };
+    typedef typename traits<T>::StorageIndex _StorageIndex;
+    enum { _Options = ((Flags&RowMajorBit)==RowMajorBit) ? RowMajor : ColMajor };
   public:
-    typedef SparseMatrix<_Scalar, _Options, _Index> type;
+    typedef SparseMatrix<_Scalar, _Options, _StorageIndex> type;
 };
 
-template<typename T> struct sparse_eval<T,1,1> {
+template<typename T,int Flags> struct sparse_eval<T,1,1,Flags> {
     typedef typename traits<T>::Scalar _Scalar;
   public:
     typedef Matrix<_Scalar, 1, 1> type;
@@ -129,12 +111,35 @@ template<typename T> struct sparse_eval<T,1,1> {
 template<typename T> struct plain_matrix_type<T,Sparse>
 {
   typedef typename traits<T>::Scalar _Scalar;
-  typedef typename traits<T>::Index _Index;
-  enum { _Options = ((traits<T>::Flags&RowMajorBit)==RowMajorBit) ? RowMajor : ColMajor };
+  typedef typename traits<T>::StorageIndex _StorageIndex;
+  enum { _Options = ((evaluator<T>::Flags&RowMajorBit)==RowMajorBit) ? RowMajor : ColMajor };
   public:
-    typedef SparseMatrix<_Scalar, _Options, _Index> type;
+    typedef SparseMatrix<_Scalar, _Options, _StorageIndex> type;
+};
+
+template<typename T>
+struct plain_object_eval<T,Sparse>
+  : sparse_eval<T, traits<T>::RowsAtCompileTime,traits<T>::ColsAtCompileTime, evaluator<T>::Flags>
+{};
+
+template<typename Decomposition, typename RhsType>
+struct solve_traits<Decomposition,RhsType,Sparse>
+{
+  typedef typename sparse_eval<RhsType, RhsType::RowsAtCompileTime, RhsType::ColsAtCompileTime,traits<RhsType>::Flags>::type PlainObject;
 };
 
+template<typename Derived>
+struct generic_xpr_base<Derived, MatrixXpr, Sparse>
+{
+  typedef SparseMatrixBase<Derived> type;
+};
+
+struct SparseTriangularShape  { static std::string debugName() { return "SparseTriangularShape"; } };
+struct SparseSelfAdjointShape { static std::string debugName() { return "SparseSelfAdjointShape"; } };
+
+template<> struct glue_shapes<SparseShape,SelfAdjointShape> { typedef SparseSelfAdjointShape type;  };
+template<> struct glue_shapes<SparseShape,TriangularShape > { typedef SparseTriangularShape  type;  };
+
 } // end namespace internal
 
 /** \ingroup SparseCore_Module
@@ -145,26 +150,26 @@ template<typename T> struct plain_matrix_type<T,Sparse>
   *
   * \sa SparseMatrix::setFromTriplets()
   */
-template<typename Scalar, typename Index=typename SparseMatrix<Scalar>::Index >
+template<typename Scalar, typename StorageIndex=typename SparseMatrix<Scalar>::StorageIndex >
 class Triplet
 {
 public:
   Triplet() : m_row(0), m_col(0), m_value(0) {}
 
-  Triplet(const Index& i, const Index& j, const Scalar& v = Scalar(0))
+  Triplet(const StorageIndex& i, const StorageIndex& j, const Scalar& v = Scalar(0))
     : m_row(i), m_col(j), m_value(v)
   {}
 
   /** \returns the row index of the element */
-  const Index& row() const { return m_row; }
+  const StorageIndex& row() const { return m_row; }
 
   /** \returns the column index of the element */
-  const Index& col() const { return m_col; }
+  const StorageIndex& col() const { return m_col; }
 
   /** \returns the value of the element */
   const Scalar& value() const { return m_value; }
 protected:
-  Index m_row, m_col;
+  StorageIndex m_row, m_col;
   Scalar m_value;
 };
 
diff --git a/Eigen/src/SparseCore/SparseVector.h b/Eigen/src/SparseCore/SparseVector.h
index 7e15c814b..19b0fbc9d 100644
--- a/Eigen/src/SparseCore/SparseVector.h
+++ b/Eigen/src/SparseCore/SparseVector.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -22,15 +22,15 @@ namespace Eigen {
   * See http://www.netlib.org/linalg/html_templates/node91.html for details on the storage scheme.
   *
   * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_SPARSEVECTOR_PLUGIN.
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_SPARSEVECTOR_PLUGIN.
   */
 
 namespace internal {
-template<typename _Scalar, int _Options, typename _Index>
-struct traits<SparseVector<_Scalar, _Options, _Index> >
+template<typename _Scalar, int _Options, typename _StorageIndex>
+struct traits<SparseVector<_Scalar, _Options, _StorageIndex> >
 {
   typedef _Scalar Scalar;
-  typedef _Index Index;
+  typedef _StorageIndex StorageIndex;
   typedef Sparse StorageKind;
   typedef MatrixXpr XprKind;
   enum {
@@ -40,8 +40,7 @@ struct traits<SparseVector<_Scalar, _Options, _Index> >
     ColsAtCompileTime = IsColVector ? 1 : Dynamic,
     MaxRowsAtCompileTime = RowsAtCompileTime,
     MaxColsAtCompileTime = ColsAtCompileTime,
-    Flags = _Options | NestByRefBit | LvalueBit | (IsColVector ? 0 : RowMajorBit),
-    CoeffReadCost = NumTraits<Scalar>::ReadCost,
+    Flags = _Options | NestByRefBit | LvalueBit | (IsColVector ? 0 : RowMajorBit) | CompressedAccessBit,
     SupportedAccessPatterns = InnerRandomAccessPattern
   };
 };
@@ -61,18 +60,18 @@ struct sparse_vector_assign_selector;
 
 }
 
-template<typename _Scalar, int _Options, typename _Index>
+template<typename _Scalar, int _Options, typename _StorageIndex>
 class SparseVector
-  : public SparseMatrixBase<SparseVector<_Scalar, _Options, _Index> >
+  : public SparseCompressedBase<SparseVector<_Scalar, _Options, _StorageIndex> >
 {
-    typedef SparseMatrixBase<SparseVector> SparseBase;
-    
+    typedef SparseCompressedBase<SparseVector> Base;
+    using Base::convert_index;
   public:
     EIGEN_SPARSE_PUBLIC_INTERFACE(SparseVector)
     EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseVector, +=)
     EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseVector, -=)
     
-    typedef internal::CompressedStorage<Scalar,Index> Storage;
+    typedef internal::CompressedStorage<Scalar,StorageIndex> Storage;
     enum { IsColVector = internal::traits<SparseVector>::IsColVector };
     
     enum {
@@ -84,11 +83,16 @@ class SparseVector
     EIGEN_STRONG_INLINE Index innerSize() const { return m_size; }
     EIGEN_STRONG_INLINE Index outerSize() const { return 1; }
 
-    EIGEN_STRONG_INLINE const Scalar* valuePtr() const { return &m_data.value(0); }
-    EIGEN_STRONG_INLINE Scalar* valuePtr() { return &m_data.value(0); }
+    EIGEN_STRONG_INLINE const Scalar* valuePtr() const { return m_data.valuePtr(); }
+    EIGEN_STRONG_INLINE Scalar* valuePtr() { return m_data.valuePtr(); }
 
-    EIGEN_STRONG_INLINE const Index* innerIndexPtr() const { return &m_data.index(0); }
-    EIGEN_STRONG_INLINE Index* innerIndexPtr() { return &m_data.index(0); }
+    EIGEN_STRONG_INLINE const StorageIndex* innerIndexPtr() const { return m_data.indexPtr(); }
+    EIGEN_STRONG_INLINE StorageIndex* innerIndexPtr() { return m_data.indexPtr(); }
+
+    inline const StorageIndex* outerIndexPtr() const { return 0; }
+    inline StorageIndex* outerIndexPtr() { return 0; }
+    inline const StorageIndex* innerNonZeroPtr() const { return 0; }
+    inline StorageIndex* innerNonZeroPtr() { return 0; }
     
     /** \internal */
     inline Storage& data() { return m_data; }
@@ -103,13 +107,13 @@ class SparseVector
     inline Scalar coeff(Index i) const
     {
       eigen_assert(i>=0 && i<m_size);
-      return m_data.at(i);
+      return m_data.at(StorageIndex(i));
     }
 
     inline Scalar& coeffRef(Index row, Index col)
     {
       eigen_assert(IsColVector ? (col==0 && row>=0 && row<m_size) : (row==0 && col>=0 && col<m_size));
-      return coeff(IsColVector ? row : col);
+      return coeffRef(IsColVector ? row : col);
     }
 
     /** \returns a reference to the coefficient value at given index \a i
@@ -121,18 +125,19 @@ class SparseVector
     inline Scalar& coeffRef(Index i)
     {
       eigen_assert(i>=0 && i<m_size);
-      return m_data.atWithInsertion(i);
+
+      return m_data.atWithInsertion(StorageIndex(i));
     }
 
   public:
 
-    class InnerIterator;
-    class ReverseInnerIterator;
+    typedef typename Base::InnerIterator InnerIterator;
+    typedef typename Base::ReverseInnerIterator ReverseInnerIterator;
 
     inline void setZero() { m_data.clear(); }
 
     /** \returns the number of non zero coefficients */
-    inline Index nonZeros() const  { return static_cast<Index>(m_data.size()); }
+    inline Index nonZeros() const  { return m_data.size(); }
 
     inline void startVec(Index outer)
     {
@@ -151,6 +156,18 @@ class SparseVector
       m_data.append(0, i);
       return m_data.value(m_data.size()-1);
     }
+    
+    Scalar& insertBackByOuterInnerUnordered(Index outer, Index inner)
+    {
+      EIGEN_UNUSED_VARIABLE(outer);
+      eigen_assert(outer==0);
+      return insertBackUnordered(inner);
+    }
+    inline Scalar& insertBackUnordered(Index i)
+    {
+      m_data.append(0, i);
+      return m_data.value(m_data.size()-1);
+    }
 
     inline Scalar& insert(Index row, Index col)
     {
@@ -158,6 +175,7 @@ class SparseVector
       
       Index inner = IsColVector ? row : col;
       Index outer = IsColVector ? col : row;
+      EIGEN_ONLY_USED_FOR_DEBUG(outer);
       eigen_assert(outer==0);
       return insert(inner);
     }
@@ -176,7 +194,7 @@ class SparseVector
         m_data.value(p+1) = m_data.value(p);
         --p;
       }
-      m_data.index(p+1) = i;
+      m_data.index(p+1) = convert_index(i);
       m_data.value(p+1) = 0;
       return m_data.value(p+1);
     }
@@ -188,28 +206,59 @@ class SparseVector
 
     inline void finalize() {}
 
+    /** \copydoc SparseMatrix::prune(const Scalar&,const RealScalar&) */
     void prune(const Scalar& reference, const RealScalar& epsilon = NumTraits<RealScalar>::dummy_precision())
     {
       m_data.prune(reference,epsilon);
     }
 
+    /** Resizes the sparse vector to \a rows x \a cols
+      *
+      * This method is provided for compatibility with matrices.
+      * For a column vector, \a cols must be equal to 1.
+      * For a row vector, \a rows must be equal to 1.
+      *
+      * \sa resize(Index)
+      */
     void resize(Index rows, Index cols)
     {
-      eigen_assert(rows==1 || cols==1);
+      eigen_assert((IsColVector ? cols : rows)==1 && "Outer dimension must equal 1");
       resize(IsColVector ? rows : cols);
     }
 
+    /** Resizes the sparse vector to \a newSize
+      * This method deletes all entries, thus leaving an empty sparse vector
+      *
+      * \sa  conservativeResize(), setZero() */
     void resize(Index newSize)
     {
       m_size = newSize;
       m_data.clear();
     }
 
+    /** Resizes the sparse vector to \a newSize, while leaving old values untouched.
+      *
+      * If the size of the vector is decreased, then the storage of the out-of bounds coefficients is kept and reserved.
+      * Call .data().squeeze() to free extra memory.
+      *
+      * \sa reserve(), setZero()
+      */
+    void conservativeResize(Index newSize)
+    {
+      if (newSize < m_size)
+      {
+        Index i = 0;
+        while (i<m_data.size() && m_data.index(i)<newSize) ++i;
+        m_data.resize(i);
+      }
+      m_size = newSize;
+    }
+
     void resizeNonZeros(Index size) { m_data.resize(size); }
 
     inline SparseVector() : m_size(0) { check_template_parameters(); resize(0); }
 
-    inline SparseVector(Index size) : m_size(0) { check_template_parameters(); resize(size); }
+    explicit inline SparseVector(Index size) : m_size(0) { check_template_parameters(); resize(size); }
 
     inline SparseVector(Index rows, Index cols) : m_size(0) { check_template_parameters(); resize(rows,cols); }
 
@@ -217,12 +266,15 @@ class SparseVector
     inline SparseVector(const SparseMatrixBase<OtherDerived>& other)
       : m_size(0)
     {
+      #ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+        EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+      #endif
       check_template_parameters();
       *this = other.derived();
     }
 
     inline SparseVector(const SparseVector& other)
-      : SparseBase(other), m_size(0)
+      : Base(other), m_size(0)
     {
       check_template_parameters();
       *this = other.derived();
@@ -238,6 +290,14 @@ class SparseVector
       m_data.swap(other.m_data);
     }
 
+    template<int OtherOptions>
+    inline void swap(SparseMatrix<Scalar,OtherOptions,StorageIndex>& other)
+    {
+      eigen_assert(other.outerSize()==1);
+      std::swap(m_size, other.m_innerSize);
+      m_data.swap(other.m_data);
+    }
+
     inline SparseVector& operator=(const SparseVector& other)
     {
       if (other.isRValue())
@@ -336,7 +396,7 @@ protected:
   
     static void check_template_parameters()
     {
-      EIGEN_STATIC_ASSERT(NumTraits<Index>::IsSigned,THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE);
+      EIGEN_STATIC_ASSERT(NumTraits<StorageIndex>::IsSigned,THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE);
       EIGEN_STATIC_ASSERT((_Options&(ColMajor|RowMajor))==Options,INVALID_MATRIX_TEMPLATE_PARAMETERS);
     }
     
@@ -344,77 +404,46 @@ protected:
     Index m_size;
 };
 
-template<typename Scalar, int _Options, typename _Index>
-class SparseVector<Scalar,_Options,_Index>::InnerIterator
-{
-  public:
-    InnerIterator(const SparseVector& vec, Index outer=0)
-      : m_data(vec.m_data), m_id(0), m_end(static_cast<Index>(m_data.size()))
-    {
-      EIGEN_UNUSED_VARIABLE(outer);
-      eigen_assert(outer==0);
-    }
-
-    InnerIterator(const internal::CompressedStorage<Scalar,Index>& data)
-      : m_data(data), m_id(0), m_end(static_cast<Index>(m_data.size()))
-    {}
-
-    inline InnerIterator& operator++() { m_id++; return *this; }
-
-    inline Scalar value() const { return m_data.value(m_id); }
-    inline Scalar& valueRef() { return const_cast<Scalar&>(m_data.value(m_id)); }
-
-    inline Index index() const { return m_data.index(m_id); }
-    inline Index row() const { return IsColVector ? index() : 0; }
-    inline Index col() const { return IsColVector ? 0 : index(); }
-
-    inline operator bool() const { return (m_id < m_end); }
-
-  protected:
-    const internal::CompressedStorage<Scalar,Index>& m_data;
-    Index m_id;
-    const Index m_end;
-};
+namespace internal {
 
-template<typename Scalar, int _Options, typename _Index>
-class SparseVector<Scalar,_Options,_Index>::ReverseInnerIterator
+template<typename _Scalar, int _Options, typename _Index>
+struct evaluator<SparseVector<_Scalar,_Options,_Index> >
+  : evaluator_base<SparseVector<_Scalar,_Options,_Index> >
 {
-  public:
-    ReverseInnerIterator(const SparseVector& vec, Index outer=0)
-      : m_data(vec.m_data), m_id(static_cast<Index>(m_data.size())), m_start(0)
-    {
-      EIGEN_UNUSED_VARIABLE(outer);
-      eigen_assert(outer==0);
-    }
-
-    ReverseInnerIterator(const internal::CompressedStorage<Scalar,Index>& data)
-      : m_data(data), m_id(static_cast<Index>(m_data.size())), m_start(0)
-    {}
-
-    inline ReverseInnerIterator& operator--() { m_id--; return *this; }
-
-    inline Scalar value() const { return m_data.value(m_id-1); }
-    inline Scalar& valueRef() { return const_cast<Scalar&>(m_data.value(m_id-1)); }
-
-    inline Index index() const { return m_data.index(m_id-1); }
-    inline Index row() const { return IsColVector ? index() : 0; }
-    inline Index col() const { return IsColVector ? 0 : index(); }
-
-    inline operator bool() const { return (m_id > m_start); }
+  typedef SparseVector<_Scalar,_Options,_Index> SparseVectorType;
+  typedef evaluator_base<SparseVectorType> Base;
+  typedef typename SparseVectorType::InnerIterator InnerIterator;
+  typedef typename SparseVectorType::ReverseInnerIterator ReverseInnerIterator;
+  
+  enum {
+    CoeffReadCost = NumTraits<_Scalar>::ReadCost,
+    Flags = SparseVectorType::Flags
+  };
 
-  protected:
-    const internal::CompressedStorage<Scalar,Index>& m_data;
-    Index m_id;
-    const Index m_start;
+  evaluator() : Base() {}
+  
+  explicit evaluator(const SparseVectorType &mat) : m_matrix(&mat)
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+  
+  inline Index nonZerosEstimate() const {
+    return m_matrix->nonZeros();
+  }
+  
+  operator SparseVectorType&() { return m_matrix->const_cast_derived(); }
+  operator const SparseVectorType&() const { return *m_matrix; }
+  
+  const SparseVectorType *m_matrix;
 };
 
-namespace internal {
-
 template< typename Dest, typename Src>
 struct sparse_vector_assign_selector<Dest,Src,SVA_Inner> {
   static void run(Dest& dst, const Src& src) {
     eigen_internal_assert(src.innerSize()==src.size());
-    for(typename Src::InnerIterator it(src, 0); it; ++it)
+    typedef internal::evaluator<Src> SrcEvaluatorType;
+    SrcEvaluatorType srcEval(src);
+    for(typename SrcEvaluatorType::InnerIterator it(srcEval, 0); it; ++it)
       dst.insert(it.index()) = it.value();
   }
 };
@@ -423,9 +452,11 @@ template< typename Dest, typename Src>
 struct sparse_vector_assign_selector<Dest,Src,SVA_Outer> {
   static void run(Dest& dst, const Src& src) {
     eigen_internal_assert(src.outerSize()==src.size());
-    for(typename Dest::Index i=0; i<src.size(); ++i)
+    typedef internal::evaluator<Src> SrcEvaluatorType;
+    SrcEvaluatorType srcEval(src);
+    for(Index i=0; i<src.size(); ++i)
     {
-      typename Src::InnerIterator it(src, i);
+      typename SrcEvaluatorType::InnerIterator it(srcEval, i);
       if(it)
         dst.insert(i) = it.value();
     }
diff --git a/Eigen/src/SparseCore/SparseView.h b/Eigen/src/SparseCore/SparseView.h
index fd8450463..7c4aea743 100644
--- a/Eigen/src/SparseCore/SparseView.h
+++ b/Eigen/src/SparseCore/SparseView.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2011-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2010 Daniel Lowengrub <lowdanie@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -18,7 +18,7 @@ namespace internal {
 template<typename MatrixType>
 struct traits<SparseView<MatrixType> > : traits<MatrixType>
 {
-  typedef typename MatrixType::Index Index;
+  typedef typename MatrixType::StorageIndex StorageIndex;
   typedef Sparse StorageKind;
   enum {
     Flags = int(traits<MatrixType>::Flags) & (RowMajorBit)
@@ -27,71 +27,225 @@ struct traits<SparseView<MatrixType> > : traits<MatrixType>
 
 } // end namespace internal
 
+/** \ingroup SparseCore_Module
+  * \class SparseView
+  *
+  * \brief Expression of a dense or sparse matrix with zero or too small values removed
+  *
+  * \tparam MatrixType the type of the object of which we are removing the small entries
+  *
+  * This class represents an expression of a given dense or sparse matrix with
+  * entries smaller than \c reference * \c epsilon are removed.
+  * It is the return type of MatrixBase::sparseView() and SparseMatrixBase::pruned()
+  * and most of the time this is the only way it is used.
+  *
+  * \sa MatrixBase::sparseView(), SparseMatrixBase::pruned()
+  */
 template<typename MatrixType>
 class SparseView : public SparseMatrixBase<SparseView<MatrixType> >
 {
   typedef typename MatrixType::Nested MatrixTypeNested;
   typedef typename internal::remove_all<MatrixTypeNested>::type _MatrixTypeNested;
+  typedef SparseMatrixBase<SparseView > Base;
 public:
   EIGEN_SPARSE_PUBLIC_INTERFACE(SparseView)
+  typedef typename internal::remove_all<MatrixType>::type NestedExpression;
 
-  SparseView(const MatrixType& mat, const Scalar& m_reference = Scalar(0),
-             typename NumTraits<Scalar>::Real m_epsilon = NumTraits<Scalar>::dummy_precision()) : 
-    m_matrix(mat), m_reference(m_reference), m_epsilon(m_epsilon) {}
-
-  class InnerIterator;
+  explicit SparseView(const MatrixType& mat, const Scalar& reference = Scalar(0),
+                      const RealScalar &epsilon = NumTraits<Scalar>::dummy_precision())
+    : m_matrix(mat), m_reference(reference), m_epsilon(epsilon) {}
 
   inline Index rows() const { return m_matrix.rows(); }
   inline Index cols() const { return m_matrix.cols(); }
 
   inline Index innerSize() const { return m_matrix.innerSize(); }
   inline Index outerSize() const { return m_matrix.outerSize(); }
-
+  
+  /** \returns the nested expression */
+  const typename internal::remove_all<MatrixTypeNested>::type&
+  nestedExpression() const { return m_matrix; }
+  
+  Scalar reference() const { return m_reference; }
+  RealScalar epsilon() const { return m_epsilon; }
+  
 protected:
   MatrixTypeNested m_matrix;
   Scalar m_reference;
-  typename NumTraits<Scalar>::Real m_epsilon;
+  RealScalar m_epsilon;
 };
 
-template<typename MatrixType>
-class SparseView<MatrixType>::InnerIterator : public _MatrixTypeNested::InnerIterator
-{
-  typedef typename SparseView::Index Index;
-public:
-  typedef typename _MatrixTypeNested::InnerIterator IterBase;
-  InnerIterator(const SparseView& view, Index outer) :
-  IterBase(view.m_matrix, outer), m_view(view)
-  {
-    incrementToNonZero();
-  }
-
-  EIGEN_STRONG_INLINE InnerIterator& operator++()
-  {
-    IterBase::operator++();
-    incrementToNonZero();
-    return *this;
-  }
-
-  using IterBase::value;
+namespace internal {
 
-protected:
-  const SparseView& m_view;
+// TODO find a way to unify the two following variants
+// This is tricky because implementing an inner iterator on top of an IndexBased evaluator is
+// not easy because the evaluators do not expose the sizes of the underlying expression.
+  
+template<typename ArgType>
+struct unary_evaluator<SparseView<ArgType>, IteratorBased>
+  : public evaluator_base<SparseView<ArgType> >
+{
+    typedef typename evaluator<ArgType>::InnerIterator EvalIterator;
+  public:
+    typedef SparseView<ArgType> XprType;
+    
+    class InnerIterator : public EvalIterator
+    {
+        typedef typename XprType::Scalar Scalar;
+      public:
+
+        EIGEN_STRONG_INLINE InnerIterator(const unary_evaluator& sve, Index outer)
+          : EvalIterator(sve.m_argImpl,outer), m_view(sve.m_view)
+        {
+          incrementToNonZero();
+        }
+
+        EIGEN_STRONG_INLINE InnerIterator& operator++()
+        {
+          EvalIterator::operator++();
+          incrementToNonZero();
+          return *this;
+        }
+
+        using EvalIterator::value;
+
+      protected:
+        const XprType &m_view;
+
+      private:
+        void incrementToNonZero()
+        {
+          while((bool(*this)) && internal::isMuchSmallerThan(value(), m_view.reference(), m_view.epsilon()))
+          {
+            EvalIterator::operator++();
+          }
+        }
+    };
+    
+    enum {
+      CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
+      Flags = XprType::Flags
+    };
+    
+    explicit unary_evaluator(const XprType& xpr) : m_argImpl(xpr.nestedExpression()), m_view(xpr) {}
+
+  protected:
+    evaluator<ArgType> m_argImpl;
+    const XprType &m_view;
+};
 
-private:
-  void incrementToNonZero()
-  {
-    while((bool(*this)) && internal::isMuchSmallerThan(value(), m_view.m_reference, m_view.m_epsilon))
+template<typename ArgType>
+struct unary_evaluator<SparseView<ArgType>, IndexBased>
+  : public evaluator_base<SparseView<ArgType> >
+{
+  public:
+    typedef SparseView<ArgType> XprType;
+  protected:
+    enum { IsRowMajor = (XprType::Flags&RowMajorBit)==RowMajorBit };
+    typedef typename XprType::Scalar Scalar;
+    typedef typename XprType::StorageIndex StorageIndex;
+  public:
+    
+    class InnerIterator
     {
-      IterBase::operator++();
-    }
-  }
+      public:
+
+        EIGEN_STRONG_INLINE InnerIterator(const unary_evaluator& sve, Index outer)
+          : m_sve(sve), m_inner(0), m_outer(outer), m_end(sve.m_view.innerSize())
+        {
+          incrementToNonZero();
+        }
+
+        EIGEN_STRONG_INLINE InnerIterator& operator++()
+        {
+          m_inner++;
+          incrementToNonZero();
+          return *this;
+        }
+
+        EIGEN_STRONG_INLINE Scalar value() const
+        {
+          return (IsRowMajor) ? m_sve.m_argImpl.coeff(m_outer, m_inner)
+                              : m_sve.m_argImpl.coeff(m_inner, m_outer);
+        }
+
+        EIGEN_STRONG_INLINE StorageIndex index() const { return m_inner; }
+        inline Index row() const { return IsRowMajor ? m_outer : index(); }
+        inline Index col() const { return IsRowMajor ? index() : m_outer; }
+
+        EIGEN_STRONG_INLINE operator bool() const { return m_inner < m_end && m_inner>=0; }
+
+      protected:
+        const unary_evaluator &m_sve;
+        Index m_inner;
+        const Index m_outer;
+        const Index m_end;
+
+      private:
+        void incrementToNonZero()
+        {
+          while((bool(*this)) && internal::isMuchSmallerThan(value(), m_sve.m_view.reference(), m_sve.m_view.epsilon()))
+          {
+            m_inner++;
+          }
+        }
+    };
+    
+    enum {
+      CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
+      Flags = XprType::Flags
+    };
+    
+    explicit unary_evaluator(const XprType& xpr) : m_argImpl(xpr.nestedExpression()), m_view(xpr) {}
+
+  protected:
+    evaluator<ArgType> m_argImpl;
+    const XprType &m_view;
 };
 
+} // end namespace internal
+
+/** \ingroup SparseCore_Module
+  *
+  * \returns a sparse expression of the dense expression \c *this with values smaller than
+  * \a reference * \a epsilon removed.
+  *
+  * This method is typically used when prototyping to convert a quickly assembled dense Matrix \c D to a SparseMatrix \c S:
+  * \code
+  * MatrixXd D(n,m);
+  * SparseMatrix<double> S;
+  * S = D.sparseView();             // suppress numerical zeros (exact)
+  * S = D.sparseView(reference);
+  * S = D.sparseView(reference,epsilon);
+  * \endcode
+  * where \a reference is a meaningful non zero reference value,
+  * and \a epsilon is a tolerance factor defaulting to NumTraits<Scalar>::dummy_precision().
+  *
+  * \sa SparseMatrixBase::pruned(), class SparseView */
+template<typename Derived>
+const SparseView<Derived> MatrixBase<Derived>::sparseView(const Scalar& reference,
+                                                          const typename NumTraits<Scalar>::Real& epsilon) const
+{
+  return SparseView<Derived>(derived(), reference, epsilon);
+}
+
+/** \returns an expression of \c *this with values smaller than
+  * \a reference * \a epsilon removed.
+  *
+  * This method is typically used in conjunction with the product of two sparse matrices
+  * to automatically prune the smallest values as follows:
+  * \code
+  * C = (A*B).pruned();             // suppress numerical zeros (exact)
+  * C = (A*B).pruned(ref);
+  * C = (A*B).pruned(ref,epsilon);
+  * \endcode
+  * where \c ref is a meaningful non zero reference value.
+  * */
 template<typename Derived>
-const SparseView<Derived> MatrixBase<Derived>::sparseView(const Scalar& m_reference,
-                                                          const typename NumTraits<Scalar>::Real& m_epsilon) const
+const SparseView<Derived>
+SparseMatrixBase<Derived>::pruned(const Scalar& reference,
+                                  const RealScalar& epsilon) const
 {
-  return SparseView<Derived>(derived(), m_reference, m_epsilon);
+  return SparseView<Derived>(derived(), reference, epsilon);
 }
 
 } // end namespace Eigen
diff --git a/Eigen/src/SparseCore/TriangularSolver.h b/Eigen/src/SparseCore/TriangularSolver.h
index ccc12af79..f9c56ba79 100644
--- a/Eigen/src/SparseCore/TriangularSolver.h
+++ b/Eigen/src/SparseCore/TriangularSolver.h
@@ -28,16 +28,19 @@ template<typename Lhs, typename Rhs, int Mode>
 struct sparse_solve_triangular_selector<Lhs,Rhs,Mode,Lower,RowMajor>
 {
   typedef typename Rhs::Scalar Scalar;
+  typedef evaluator<Lhs> LhsEval;
+  typedef typename evaluator<Lhs>::InnerIterator LhsIterator;
   static void run(const Lhs& lhs, Rhs& other)
   {
-    for(int col=0 ; col<other.cols() ; ++col)
+    LhsEval lhsEval(lhs);
+    for(Index col=0 ; col<other.cols() ; ++col)
     {
-      for(int i=0; i<lhs.rows(); ++i)
+      for(Index i=0; i<lhs.rows(); ++i)
       {
         Scalar tmp = other.coeff(i,col);
         Scalar lastVal(0);
-        int lastIndex = 0;
-        for(typename Lhs::InnerIterator it(lhs, i); it; ++it)
+        Index lastIndex = 0;
+        for(LhsIterator it(lhsEval, i); it; ++it)
         {
           lastVal = it.value();
           lastIndex = it.index();
@@ -62,15 +65,18 @@ template<typename Lhs, typename Rhs, int Mode>
 struct sparse_solve_triangular_selector<Lhs,Rhs,Mode,Upper,RowMajor>
 {
   typedef typename Rhs::Scalar Scalar;
+  typedef evaluator<Lhs> LhsEval;
+  typedef typename evaluator<Lhs>::InnerIterator LhsIterator;
   static void run(const Lhs& lhs, Rhs& other)
   {
-    for(int col=0 ; col<other.cols() ; ++col)
+    LhsEval lhsEval(lhs);
+    for(Index col=0 ; col<other.cols() ; ++col)
     {
-      for(int i=lhs.rows()-1 ; i>=0 ; --i)
+      for(Index i=lhs.rows()-1 ; i>=0 ; --i)
       {
         Scalar tmp = other.coeff(i,col);
         Scalar l_ii(0);
-        typename Lhs::InnerIterator it(lhs, i);
+        LhsIterator it(lhsEval, i);
         while(it && it.index()<i)
           ++it;
         if(!(Mode & UnitDiag))
@@ -86,10 +92,8 @@ struct sparse_solve_triangular_selector<Lhs,Rhs,Mode,Upper,RowMajor>
           tmp -= it.value() * other.coeff(it.index(),col);
         }
 
-        if (Mode & UnitDiag)
-          other.coeffRef(i,col) = tmp;
-        else
-          other.coeffRef(i,col) = tmp/l_ii;
+        if (Mode & UnitDiag)  other.coeffRef(i,col) = tmp;
+        else                  other.coeffRef(i,col) = tmp/l_ii;
       }
     }
   }
@@ -100,16 +104,19 @@ template<typename Lhs, typename Rhs, int Mode>
 struct sparse_solve_triangular_selector<Lhs,Rhs,Mode,Lower,ColMajor>
 {
   typedef typename Rhs::Scalar Scalar;
+  typedef evaluator<Lhs> LhsEval;
+  typedef typename evaluator<Lhs>::InnerIterator LhsIterator;
   static void run(const Lhs& lhs, Rhs& other)
   {
-    for(int col=0 ; col<other.cols() ; ++col)
+    LhsEval lhsEval(lhs);
+    for(Index col=0 ; col<other.cols() ; ++col)
     {
-      for(int i=0; i<lhs.cols(); ++i)
+      for(Index i=0; i<lhs.cols(); ++i)
       {
         Scalar& tmp = other.coeffRef(i,col);
         if (tmp!=Scalar(0)) // optimization when other is actually sparse
         {
-          typename Lhs::InnerIterator it(lhs, i);
+          LhsIterator it(lhsEval, i);
           while(it && it.index()<i)
             ++it;
           if(!(Mode & UnitDiag))
@@ -132,11 +139,14 @@ template<typename Lhs, typename Rhs, int Mode>
 struct sparse_solve_triangular_selector<Lhs,Rhs,Mode,Upper,ColMajor>
 {
   typedef typename Rhs::Scalar Scalar;
+  typedef evaluator<Lhs> LhsEval;
+  typedef typename evaluator<Lhs>::InnerIterator LhsIterator;
   static void run(const Lhs& lhs, Rhs& other)
   {
-    for(int col=0 ; col<other.cols() ; ++col)
+    LhsEval lhsEval(lhs);
+    for(Index col=0 ; col<other.cols() ; ++col)
     {
-      for(int i=lhs.cols()-1; i>=0; --i)
+      for(Index i=lhs.cols()-1; i>=0; --i)
       {
         Scalar& tmp = other.coeffRef(i,col);
         if (tmp!=Scalar(0)) // optimization when other is actually sparse
@@ -144,13 +154,13 @@ struct sparse_solve_triangular_selector<Lhs,Rhs,Mode,Upper,ColMajor>
           if(!(Mode & UnitDiag))
           {
             // TODO replace this by a binary search. make sure the binary search is safe for partially sorted elements
-            typename Lhs::ReverseInnerIterator it(lhs, i);
+            LhsIterator it(lhsEval, i);
             while(it && it.index()!=i)
-              --it;
+              ++it;
             eigen_assert(it && it.index()==i);
             other.coeffRef(i,col) /= it.value();
           }
-          typename Lhs::InnerIterator it(lhs, i);
+          LhsIterator it(lhsEval, i);
           for(; it && it.index()<i; ++it)
             other.coeffRef(it.index(), col) -= tmp * it.value();
         }
@@ -161,11 +171,13 @@ struct sparse_solve_triangular_selector<Lhs,Rhs,Mode,Upper,ColMajor>
 
 } // end namespace internal
 
-template<typename ExpressionType,int Mode>
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+
+template<typename ExpressionType,unsigned int Mode>
 template<typename OtherDerived>
-void SparseTriangularView<ExpressionType,Mode>::solveInPlace(MatrixBase<OtherDerived>& other) const
+void TriangularViewImpl<ExpressionType,Mode,Sparse>::solveInPlace(MatrixBase<OtherDerived>& other) const
 {
-  eigen_assert(m_matrix.cols() == m_matrix.rows() && m_matrix.cols() == other.rows());
+  eigen_assert(derived().cols() == derived().rows() && derived().cols() == other.rows());
   eigen_assert((!(Mode & ZeroDiag)) && bool(Mode & (Upper|Lower)));
 
   enum { copy = internal::traits<OtherDerived>::Flags & RowMajorBit };
@@ -174,21 +186,12 @@ void SparseTriangularView<ExpressionType,Mode>::solveInPlace(MatrixBase<OtherDer
     typename internal::plain_matrix_type_column_major<OtherDerived>::type, OtherDerived&>::type OtherCopy;
   OtherCopy otherCopy(other.derived());
 
-  internal::sparse_solve_triangular_selector<ExpressionType, typename internal::remove_reference<OtherCopy>::type, Mode>::run(m_matrix, otherCopy);
+  internal::sparse_solve_triangular_selector<ExpressionType, typename internal::remove_reference<OtherCopy>::type, Mode>::run(derived().nestedExpression(), otherCopy);
 
   if (copy)
     other = otherCopy;
 }
-
-template<typename ExpressionType,int Mode>
-template<typename OtherDerived>
-typename internal::plain_matrix_type_column_major<OtherDerived>::type
-SparseTriangularView<ExpressionType,Mode>::solve(const MatrixBase<OtherDerived>& other) const
-{
-  typename internal::plain_matrix_type_column_major<OtherDerived>::type res(other);
-  solveInPlace(res);
-  return res;
-}
+#endif
 
 // pure sparse path
 
@@ -208,18 +211,18 @@ template<typename Lhs, typename Rhs, int Mode, int UpLo>
 struct sparse_solve_triangular_sparse_selector<Lhs,Rhs,Mode,UpLo,ColMajor>
 {
   typedef typename Rhs::Scalar Scalar;
-  typedef typename promote_index_type<typename traits<Lhs>::Index,
-                                         typename traits<Rhs>::Index>::type Index;
+  typedef typename promote_index_type<typename traits<Lhs>::StorageIndex,
+                                      typename traits<Rhs>::StorageIndex>::type StorageIndex;
   static void run(const Lhs& lhs, Rhs& other)
   {
     const bool IsLower = (UpLo==Lower);
-    AmbiVector<Scalar,Index> tempVector(other.rows()*2);
+    AmbiVector<Scalar,StorageIndex> tempVector(other.rows()*2);
     tempVector.setBounds(0,other.rows());
 
     Rhs res(other.rows(), other.cols());
     res.reserve(other.nonZeros());
 
-    for(int col=0 ; col<other.cols() ; ++col)
+    for(Index col=0 ; col<other.cols() ; ++col)
     {
       // FIXME estimate number of non zeros
       tempVector.init(.99/*float(other.col(col).nonZeros())/float(other.rows())*/);
@@ -230,7 +233,7 @@ struct sparse_solve_triangular_sparse_selector<Lhs,Rhs,Mode,UpLo,ColMajor>
         tempVector.coeffRef(rhsIt.index()) = rhsIt.value();
       }
 
-      for(int i=IsLower?0:lhs.cols()-1;
+      for(Index i=IsLower?0:lhs.cols()-1;
           IsLower?i<lhs.cols():i>=0;
           i+=IsLower?1:-1)
       {
@@ -267,9 +270,9 @@ struct sparse_solve_triangular_sparse_selector<Lhs,Rhs,Mode,UpLo,ColMajor>
       }
 
 
-      int count = 0;
+      Index count = 0;
       // FIXME compute a reference value to filter zeros
-      for (typename AmbiVector<Scalar,Index>::Iterator it(tempVector/*,1e-12*/); it; ++it)
+      for (typename AmbiVector<Scalar,StorageIndex>::Iterator it(tempVector/*,1e-12*/); it; ++it)
       {
         ++ count;
 //         std::cerr << "fill " << it.index() << ", " << col << "\n";
@@ -286,11 +289,12 @@ struct sparse_solve_triangular_sparse_selector<Lhs,Rhs,Mode,UpLo,ColMajor>
 
 } // end namespace internal
 
-template<typename ExpressionType,int Mode>
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template<typename ExpressionType,unsigned int Mode>
 template<typename OtherDerived>
-void SparseTriangularView<ExpressionType,Mode>::solveInPlace(SparseMatrixBase<OtherDerived>& other) const
+void TriangularViewImpl<ExpressionType,Mode,Sparse>::solveInPlace(SparseMatrixBase<OtherDerived>& other) const
 {
-  eigen_assert(m_matrix.cols() == m_matrix.rows() && m_matrix.cols() == other.rows());
+  eigen_assert(derived().cols() == derived().rows() && derived().cols() == other.rows());
   eigen_assert( (!(Mode & ZeroDiag)) && bool(Mode & (Upper|Lower)));
 
 //   enum { copy = internal::traits<OtherDerived>::Flags & RowMajorBit };
@@ -299,35 +303,12 @@ void SparseTriangularView<ExpressionType,Mode>::solveInPlace(SparseMatrixBase<Ot
 //     typename internal::plain_matrix_type_column_major<OtherDerived>::type, OtherDerived&>::type OtherCopy;
 //   OtherCopy otherCopy(other.derived());
 
-  internal::sparse_solve_triangular_sparse_selector<ExpressionType, OtherDerived, Mode>::run(m_matrix, other.derived());
+  internal::sparse_solve_triangular_sparse_selector<ExpressionType, OtherDerived, Mode>::run(derived().nestedExpression(), other.derived());
 
 //   if (copy)
 //     other = otherCopy;
 }
-
-#ifdef EIGEN2_SUPPORT
-
-// deprecated stuff:
-
-/** \deprecated */
-template<typename Derived>
-template<typename OtherDerived>
-void SparseMatrixBase<Derived>::solveTriangularInPlace(MatrixBase<OtherDerived>& other) const
-{
-  this->template triangular<Flags&(Upper|Lower)>().solveInPlace(other);
-}
-
-/** \deprecated */
-template<typename Derived>
-template<typename OtherDerived>
-typename internal::plain_matrix_type_column_major<OtherDerived>::type
-SparseMatrixBase<Derived>::solveTriangular(const MatrixBase<OtherDerived>& other) const
-{
-  typename internal::plain_matrix_type_column_major<OtherDerived>::type res(other);
-  derived().solveTriangularInPlace(res);
-  return res;
-}
-#endif // EIGEN2_SUPPORT
+#endif
 
 } // end namespace Eigen
 
diff --git a/Eigen/src/SparseLU/CMakeLists.txt b/Eigen/src/SparseLU/CMakeLists.txt
deleted file mode 100644
index 69729ee89..000000000
--- a/Eigen/src/SparseLU/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_SparseLU_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_SparseLU_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/SparseLU COMPONENT Devel
-  )
diff --git a/Eigen/src/SparseLU/SparseLU.h b/Eigen/src/SparseLU/SparseLU.h
index 4514cfd9c..f883ab383 100644
--- a/Eigen/src/SparseLU/SparseLU.h
+++ b/Eigen/src/SparseLU/SparseLU.h
@@ -2,7 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam@inria.fr>
-// Copyright (C) 2012 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2012-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -14,7 +14,7 @@
 
 namespace Eigen {
 
-template <typename _MatrixType, typename _OrderingType = COLAMDOrdering<typename _MatrixType::Index> > class SparseLU;
+template <typename _MatrixType, typename _OrderingType = COLAMDOrdering<typename _MatrixType::StorageIndex> > class SparseLU;
 template <typename MappedSparseMatrixType> struct SparseLUMatrixLReturnType;
 template <typename MatrixLType, typename MatrixUType> struct SparseLUMatrixUReturnType;
 
@@ -64,33 +64,45 @@ template <typename MatrixLType, typename MatrixUType> struct SparseLUMatrixURetu
   * 
   * \tparam _MatrixType The type of the sparse matrix. It must be a column-major SparseMatrix<>
   * \tparam _OrderingType The ordering method to use, either AMD, COLAMD or METIS. Default is COLMAD
+  *
+  * \implsparsesolverconcept
   * 
-  * 
-  * \sa \ref TutorialSparseDirectSolvers
+  * \sa \ref TutorialSparseSolverConcept
   * \sa \ref OrderingMethods_Module
   */
 template <typename _MatrixType, typename _OrderingType>
-class SparseLU : public internal::SparseLUImpl<typename _MatrixType::Scalar, typename _MatrixType::Index>
+class SparseLU : public SparseSolverBase<SparseLU<_MatrixType,_OrderingType> >, public internal::SparseLUImpl<typename _MatrixType::Scalar, typename _MatrixType::StorageIndex>
 {
+  protected:
+    typedef SparseSolverBase<SparseLU<_MatrixType,_OrderingType> > APIBase;
+    using APIBase::m_isInitialized;
   public:
+    using APIBase::_solve_impl;
+    
     typedef _MatrixType MatrixType; 
     typedef _OrderingType OrderingType;
     typedef typename MatrixType::Scalar Scalar; 
     typedef typename MatrixType::RealScalar RealScalar; 
-    typedef typename MatrixType::Index Index; 
-    typedef SparseMatrix<Scalar,ColMajor,Index> NCMatrix;
-    typedef internal::MappedSuperNodalMatrix<Scalar, Index> SCMatrix; 
+    typedef typename MatrixType::StorageIndex StorageIndex;
+    typedef SparseMatrix<Scalar,ColMajor,StorageIndex> NCMatrix;
+    typedef internal::MappedSuperNodalMatrix<Scalar, StorageIndex> SCMatrix;
     typedef Matrix<Scalar,Dynamic,1> ScalarVector;
-    typedef Matrix<Index,Dynamic,1> IndexVector;
-    typedef PermutationMatrix<Dynamic, Dynamic, Index> PermutationType;
-    typedef internal::SparseLUImpl<Scalar, Index> Base;
+    typedef Matrix<StorageIndex,Dynamic,1> IndexVector;
+    typedef PermutationMatrix<Dynamic, Dynamic, StorageIndex> PermutationType;
+    typedef internal::SparseLUImpl<Scalar, StorageIndex> Base;
+
+    enum {
+      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+    };
     
   public:
-    SparseLU():m_isInitialized(true),m_lastError(""),m_Ustore(0,0,0,0,0,0),m_symmetricmode(false),m_diagpivotthresh(1.0),m_detPermR(1)
+    SparseLU():m_lastError(""),m_Ustore(0,0,0,0,0,0),m_symmetricmode(false),m_diagpivotthresh(1.0),m_detPermR(1)
     {
       initperfvalues(); 
     }
-    SparseLU(const MatrixType& matrix):m_isInitialized(true),m_lastError(""),m_Ustore(0,0,0,0,0,0),m_symmetricmode(false),m_diagpivotthresh(1.0),m_detPermR(1)
+    explicit SparseLU(const MatrixType& matrix)
+      : m_lastError(""),m_Ustore(0,0,0,0,0,0),m_symmetricmode(false),m_diagpivotthresh(1.0),m_detPermR(1)
     {
       initperfvalues(); 
       compute(matrix);
@@ -141,9 +153,9 @@ class SparseLU : public internal::SparseLUImpl<typename _MatrixType::Scalar, typ
       * y = b; matrixU().solveInPlace(y);
       * \endcode
       */
-    SparseLUMatrixUReturnType<SCMatrix,MappedSparseMatrix<Scalar,ColMajor,Index> > matrixU() const
+    SparseLUMatrixUReturnType<SCMatrix,MappedSparseMatrix<Scalar,ColMajor,StorageIndex> > matrixU() const
     {
-      return SparseLUMatrixUReturnType<SCMatrix, MappedSparseMatrix<Scalar,ColMajor,Index> >(m_Lstore, m_Ustore);
+      return SparseLUMatrixUReturnType<SCMatrix, MappedSparseMatrix<Scalar,ColMajor,StorageIndex> >(m_Lstore, m_Ustore);
     }
 
     /**
@@ -168,6 +180,7 @@ class SparseLU : public internal::SparseLUImpl<typename _MatrixType::Scalar, typ
       m_diagpivotthresh = thresh; 
     }
 
+#ifdef EIGEN_PARSED_BY_DOXYGEN
     /** \returns the solution X of \f$ A X = B \f$ using the current decomposition of A.
       *
       * \warning the destination matrix X in X = this->solve(B) must be colmun-major.
@@ -175,26 +188,8 @@ class SparseLU : public internal::SparseLUImpl<typename _MatrixType::Scalar, typ
       * \sa compute()
       */
     template<typename Rhs>
-    inline const internal::solve_retval<SparseLU, Rhs> solve(const MatrixBase<Rhs>& B) const 
-    {
-      eigen_assert(m_factorizationIsOk && "SparseLU is not initialized."); 
-      eigen_assert(rows()==B.rows()
-                    && "SparseLU::solve(): invalid number of rows of the right hand side matrix B");
-          return internal::solve_retval<SparseLU, Rhs>(*this, B.derived());
-    }
-
-    /** \returns the solution X of \f$ A X = B \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::sparse_solve_retval<SparseLU, Rhs> solve(const SparseMatrixBase<Rhs>& B) const 
-    {
-      eigen_assert(m_factorizationIsOk && "SparseLU is not initialized."); 
-      eigen_assert(rows()==B.rows()
-                    && "SparseLU::solve(): invalid number of rows of the right hand side matrix B");
-          return internal::sparse_solve_retval<SparseLU, Rhs>(*this, B.derived());
-    }
+    inline const Solve<SparseLU, Rhs> solve(const MatrixBase<Rhs>& B) const;
+#endif // EIGEN_PARSED_BY_DOXYGEN
     
     /** \brief Reports whether previous computation was successful.
       *
@@ -219,7 +214,7 @@ class SparseLU : public internal::SparseLUImpl<typename _MatrixType::Scalar, typ
     }
 
     template<typename Rhs, typename Dest>
-    bool _solve(const MatrixBase<Rhs> &B, MatrixBase<Dest> &X_base) const
+    bool _solve_impl(const MatrixBase<Rhs> &B, MatrixBase<Dest> &X_base) const
     {
       Dest& X(X_base.derived());
       eigen_assert(m_factorizationIsOk && "The matrix should be factorized first");
@@ -255,8 +250,9 @@ class SparseLU : public internal::SparseLUImpl<typename _MatrixType::Scalar, typ
       *
       * \sa logAbsDeterminant(), signDeterminant()
       */
-     Scalar absDeterminant()
+    Scalar absDeterminant()
     {
+      using std::abs;
       eigen_assert(m_factorizationIsOk && "The matrix should be factorized first.");
       // Initialize with the determinant of the row matrix
       Scalar det = Scalar(1.);
@@ -268,42 +264,43 @@ class SparseLU : public internal::SparseLUImpl<typename _MatrixType::Scalar, typ
         {
           if(it.index() == j)
           {
-            using std::abs;
             det *= abs(it.value());
             break;
           }
         }
-       }
-       return det;
-     }
+      }
+      return det;
+    }
 
-     /** \returns the natural log of the absolute value of the determinant of the matrix
-       * of which **this is the QR decomposition
-       *
-       * \note This method is useful to work around the risk of overflow/underflow that's
-       * inherent to the determinant computation.
-       *
-       * \sa absDeterminant(), signDeterminant()
-       */
-     Scalar logAbsDeterminant() const
-     {
-       eigen_assert(m_factorizationIsOk && "The matrix should be factorized first.");
-       Scalar det = Scalar(0.);
-       for (Index j = 0; j < this->cols(); ++j)
-       {
-         for (typename SCMatrix::InnerIterator it(m_Lstore, j); it; ++it)
-         {
-           if(it.row() < j) continue;
-           if(it.row() == j)
-           {
-             using std::log; using std::abs;
-             det += log(abs(it.value()));
-             break;
-           }
-         }
-       }
-       return det;
-     }
+    /** \returns the natural log of the absolute value of the determinant of the matrix
+      * of which **this is the QR decomposition
+      *
+      * \note This method is useful to work around the risk of overflow/underflow that's
+      * inherent to the determinant computation.
+      *
+      * \sa absDeterminant(), signDeterminant()
+      */
+    Scalar logAbsDeterminant() const
+    {
+      using std::log;
+      using std::abs;
+
+      eigen_assert(m_factorizationIsOk && "The matrix should be factorized first.");
+      Scalar det = Scalar(0.);
+      for (Index j = 0; j < this->cols(); ++j)
+      {
+        for (typename SCMatrix::InnerIterator it(m_Lstore, j); it; ++it)
+        {
+          if(it.row() < j) continue;
+          if(it.row() == j)
+          {
+            det += log(abs(it.value()));
+            break;
+          }
+        }
+      }
+      return det;
+    }
 
     /** \returns A number representing the sign of the determinant
       *
@@ -355,7 +352,7 @@ class SparseLU : public internal::SparseLUImpl<typename _MatrixType::Scalar, typ
           }
         }
       }
-      return det * Scalar(m_detPermR * m_detPermC);
+      return (m_detPermR * m_detPermC) > 0 ? det : -det;
     }
 
   protected:
@@ -372,13 +369,12 @@ class SparseLU : public internal::SparseLUImpl<typename _MatrixType::Scalar, typ
       
     // Variables 
     mutable ComputationInfo m_info;
-    bool m_isInitialized;
     bool m_factorizationIsOk;
     bool m_analysisIsOk;
     std::string m_lastError;
     NCMatrix m_mat; // The input (permuted ) matrix 
     SCMatrix m_Lstore; // The lower triangular matrix (supernodal)
-    MappedSparseMatrix<Scalar,ColMajor,Index> m_Ustore; // The upper triangular matrix
+    MappedSparseMatrix<Scalar,ColMajor,StorageIndex> m_Ustore; // The upper triangular matrix
     PermutationType m_perm_c; // Column permutation 
     PermutationType m_perm_r ; // Row permutation
     IndexVector m_etree; // Column elimination tree 
@@ -388,7 +384,7 @@ class SparseLU : public internal::SparseLUImpl<typename _MatrixType::Scalar, typ
     // SparseLU options 
     bool m_symmetricmode;
     // values for performance 
-    internal::perfvalues<Index> m_perfv; 
+    internal::perfvalues m_perfv;
     RealScalar m_diagpivotthresh; // Specifies the threshold used for a diagonal entry to be an acceptable pivot
     Index m_nnzL, m_nnzU; // Nonzeros in L and U factors
     Index m_detPermR, m_detPermC; // Determinants of the permutation matrices
@@ -417,30 +413,32 @@ void SparseLU<MatrixType, OrderingType>::analyzePattern(const MatrixType& mat)
   
   //TODO  It is possible as in SuperLU to compute row and columns scaling vectors to equilibrate the matrix mat.
   
+  // Firstly, copy the whole input matrix. 
+  m_mat = mat;
+  
+  // Compute fill-in ordering
   OrderingType ord; 
-  ord(mat,m_perm_c);
+  ord(m_mat,m_perm_c);
   
   // Apply the permutation to the column of the input  matrix
-  //First copy the whole input matrix. 
-  m_mat = mat;
-  if (m_perm_c.size()) {
+  if (m_perm_c.size())
+  {
     m_mat.uncompress(); //NOTE: The effect of this command is only to create the InnerNonzeros pointers. FIXME : This vector is filled but not subsequently used.  
-    //Then, permute only the column pointers
-    const Index * outerIndexPtr;
-    if (mat.isCompressed()) outerIndexPtr = mat.outerIndexPtr();
-    else
-    {
-      Index *outerIndexPtr_t = new Index[mat.cols()+1];
-      for(Index i = 0; i <= mat.cols(); i++) outerIndexPtr_t[i] = m_mat.outerIndexPtr()[i];
-      outerIndexPtr = outerIndexPtr_t;
-    }
+    // Then, permute only the column pointers
+    ei_declare_aligned_stack_constructed_variable(StorageIndex,outerIndexPtr,mat.cols()+1,mat.isCompressed()?const_cast<StorageIndex*>(mat.outerIndexPtr()):0);
+    
+    // If the input matrix 'mat' is uncompressed, then the outer-indices do not match the ones of m_mat, and a copy is thus needed.
+    if(!mat.isCompressed()) 
+      IndexVector::Map(outerIndexPtr, mat.cols()+1) = IndexVector::Map(m_mat.outerIndexPtr(),mat.cols()+1);
+    
+    // Apply the permutation and compute the nnz per column.
     for (Index i = 0; i < mat.cols(); i++)
     {
       m_mat.outerIndexPtr()[m_perm_c.indices()(i)] = outerIndexPtr[i];
       m_mat.innerNonZeroPtr()[m_perm_c.indices()(i)] = outerIndexPtr[i+1] - outerIndexPtr[i];
     }
-    if(!mat.isCompressed()) delete[] outerIndexPtr;
   }
+  
   // Compute the column elimination tree of the permuted matrix 
   IndexVector firstRowElt;
   internal::coletree(m_mat, m_etree,firstRowElt); 
@@ -449,7 +447,7 @@ void SparseLU<MatrixType, OrderingType>::analyzePattern(const MatrixType& mat)
   if (!m_symmetricmode) {
     IndexVector post, iwork; 
     // Post order etree
-    internal::treePostorder(m_mat.cols(), m_etree, post); 
+    internal::treePostorder(StorageIndex(m_mat.cols()), m_etree, post); 
       
    
     // Renumber etree in postorder 
@@ -501,7 +499,9 @@ void SparseLU<MatrixType, OrderingType>::factorize(const MatrixType& matrix)
   eigen_assert(m_analysisIsOk && "analyzePattern() should be called first"); 
   eigen_assert((matrix.rows() == matrix.cols()) && "Only for squared matrices");
   
-  typedef typename IndexVector::Scalar Index; 
+  typedef typename IndexVector::Scalar StorageIndex; 
+  
+  m_isInitialized = true;
   
   
   // Apply the column permutation computed in analyzepattern()
@@ -511,11 +511,11 @@ void SparseLU<MatrixType, OrderingType>::factorize(const MatrixType& matrix)
   {
     m_mat.uncompress(); //NOTE: The effect of this command is only to create the InnerNonzeros pointers.
     //Then, permute only the column pointers
-    const Index * outerIndexPtr;
+    const StorageIndex * outerIndexPtr;
     if (matrix.isCompressed()) outerIndexPtr = matrix.outerIndexPtr();
     else
     {
-      Index* outerIndexPtr_t = new Index[matrix.cols()+1];
+      StorageIndex* outerIndexPtr_t = new StorageIndex[matrix.cols()+1];
       for(Index i = 0; i <= matrix.cols(); i++) outerIndexPtr_t[i] = m_mat.outerIndexPtr()[i];
       outerIndexPtr = outerIndexPtr_t;
     }
@@ -529,7 +529,7 @@ void SparseLU<MatrixType, OrderingType>::factorize(const MatrixType& matrix)
   else 
   { //FIXME This should not be needed if the empty permutation is handled transparently
     m_perm_c.resize(matrix.cols());
-    for(Index i = 0; i < matrix.cols(); ++i) m_perm_c.indices()(i) = i;
+    for(StorageIndex i = 0; i < matrix.cols(); ++i) m_perm_c.indices()(i) = i;
   }
   
   Index m = m_mat.rows();
@@ -694,7 +694,7 @@ void SparseLU<MatrixType, OrderingType>::factorize(const MatrixType& matrix)
   // Create supernode matrix L 
   m_Lstore.setInfos(m, n, m_glu.lusup, m_glu.xlusup, m_glu.lsub, m_glu.xlsub, m_glu.supno, m_glu.xsup); 
   // Create the column major upper sparse matrix  U; 
-  new (&m_Ustore) MappedSparseMatrix<Scalar, ColMajor, Index> ( m, n, m_nnzU, m_glu.xusub.data(), m_glu.usub.data(), m_glu.ucol.data() ); 
+  new (&m_Ustore) MappedSparseMatrix<Scalar, ColMajor, StorageIndex> ( m, n, m_nnzU, m_glu.xusub.data(), m_glu.usub.data(), m_glu.ucol.data() );
   
   m_info = Success;
   m_factorizationIsOk = true;
@@ -703,9 +703,8 @@ void SparseLU<MatrixType, OrderingType>::factorize(const MatrixType& matrix)
 template<typename MappedSupernodalType>
 struct SparseLUMatrixLReturnType : internal::no_assignment_operator
 {
-  typedef typename MappedSupernodalType::Index Index;
   typedef typename MappedSupernodalType::Scalar Scalar;
-  SparseLUMatrixLReturnType(const MappedSupernodalType& mapL) : m_mapL(mapL)
+  explicit SparseLUMatrixLReturnType(const MappedSupernodalType& mapL) : m_mapL(mapL)
   { }
   Index rows() { return m_mapL.rows(); }
   Index cols() { return m_mapL.cols(); }
@@ -720,7 +719,6 @@ struct SparseLUMatrixLReturnType : internal::no_assignment_operator
 template<typename MatrixLType, typename MatrixUType>
 struct SparseLUMatrixUReturnType : internal::no_assignment_operator
 {
-  typedef typename MatrixLType::Index Index;
   typedef typename MatrixLType::Scalar Scalar;
   SparseLUMatrixUReturnType(const MatrixLType& mapL, const MatrixUType& mapU)
   : m_mapL(mapL),m_mapU(mapU)
@@ -731,7 +729,7 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator
   template<typename Dest>   void solveInPlace(MatrixBase<Dest> &X) const
   {
     Index nrhs = X.cols();
-    Index n = X.rows();
+    Index n    = X.rows();
     // Backward solve with U
     for (Index k = m_mapL.nsuper(); k >= 0; k--)
     {
@@ -749,8 +747,8 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator
       }
       else
       {
-        Map<const Matrix<Scalar,Dynamic,Dynamic>, 0, OuterStride<> > A( &(m_mapL.valuePtr()[luptr]), nsupc, nsupc, OuterStride<>(lda) );
-        Map< Matrix<Scalar,Dynamic,Dynamic>, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) );
+        Map<const Matrix<Scalar,Dynamic,Dynamic, ColMajor>, 0, OuterStride<> > A( &(m_mapL.valuePtr()[luptr]), nsupc, nsupc, OuterStride<>(lda) );
+        Map< Matrix<Scalar,Dynamic,Dest::ColsAtCompileTime, ColMajor>, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) );
         U = A.template triangularView<Upper>().solve(U);
       }
 
@@ -772,35 +770,6 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator
   const MatrixUType& m_mapU;
 };
 
-namespace internal {
-  
-template<typename _MatrixType, typename Derived, typename Rhs>
-struct solve_retval<SparseLU<_MatrixType,Derived>, Rhs>
-  : solve_retval_base<SparseLU<_MatrixType,Derived>, Rhs>
-{
-  typedef SparseLU<_MatrixType,Derived> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
-  }
-};
-
-template<typename _MatrixType, typename Derived, typename Rhs>
-struct sparse_solve_retval<SparseLU<_MatrixType,Derived>, Rhs>
-  : sparse_solve_retval_base<SparseLU<_MatrixType,Derived>, Rhs>
-{
-  typedef SparseLU<_MatrixType,Derived> Dec;
-  EIGEN_MAKE_SPARSE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    this->defaultEvalTo(dst);
-  }
-};
-} // end namespace internal
-
 } // End namespace Eigen 
 
 #endif
diff --git a/Eigen/src/SparseLU/SparseLUImpl.h b/Eigen/src/SparseLU/SparseLUImpl.h
index 14d70897d..fc0cfc4de 100644
--- a/Eigen/src/SparseLU/SparseLUImpl.h
+++ b/Eigen/src/SparseLU/SparseLUImpl.h
@@ -16,17 +16,19 @@ namespace internal {
   * \class SparseLUImpl
   * Base class for sparseLU
   */
-template <typename Scalar, typename Index>
+template <typename Scalar, typename StorageIndex>
 class SparseLUImpl
 {
   public:
     typedef Matrix<Scalar,Dynamic,1> ScalarVector;
-    typedef Matrix<Index,Dynamic,1> IndexVector; 
+    typedef Matrix<StorageIndex,Dynamic,1> IndexVector; 
+    typedef Matrix<Scalar,Dynamic,Dynamic,ColMajor> ScalarMatrix;
+    typedef Map<ScalarMatrix, 0,  OuterStride<> > MappedMatrixBlock;
     typedef typename ScalarVector::RealScalar RealScalar; 
     typedef Ref<Matrix<Scalar,Dynamic,1> > BlockScalarVector;
-    typedef Ref<Matrix<Index,Dynamic,1> > BlockIndexVector;
+    typedef Ref<Matrix<StorageIndex,Dynamic,1> > BlockIndexVector;
     typedef LU_GlobalLU_t<IndexVector, ScalarVector> GlobalLU_t; 
-    typedef SparseMatrix<Scalar,ColMajor,Index> MatrixType; 
+    typedef SparseMatrix<Scalar,ColMajor,StorageIndex> MatrixType; 
     
   protected:
      template <typename VectorType>
@@ -40,7 +42,7 @@ class SparseLUImpl
      Index snode_bmod (const Index jcol, const Index fsupc, ScalarVector& dense, GlobalLU_t& glu);
      Index pivotL(const Index jcol, const RealScalar& diagpivotthresh, IndexVector& perm_r, IndexVector& iperm_c, Index& pivrow, GlobalLU_t& glu);
      template <typename Traits>
-     void dfs_kernel(const Index jj, IndexVector& perm_r,
+     void dfs_kernel(const StorageIndex jj, IndexVector& perm_r,
                     Index& nseg, IndexVector& panel_lsub, IndexVector& segrep,
                     Ref<IndexVector> repfnz_col, IndexVector& xprune, Ref<IndexVector> marker, IndexVector& parent,
                     IndexVector& xplore, GlobalLU_t& glu, Index& nextl_col, Index krow, Traits& traits);
diff --git a/Eigen/src/SparseLU/SparseLU_Memory.h b/Eigen/src/SparseLU/SparseLU_Memory.h
index 1ffa7d54e..4dc42e87b 100644
--- a/Eigen/src/SparseLU/SparseLU_Memory.h
+++ b/Eigen/src/SparseLU/SparseLU_Memory.h
@@ -36,13 +36,12 @@ namespace internal {
   
 enum { LUNoMarker = 3 };
 enum {emptyIdxLU = -1};
-template<typename Index>
 inline Index LUnumTempV(Index& m, Index& w, Index& t, Index& b)
 {
   return (std::max)(m, (t+b)*w);
 }
 
-template< typename Scalar, typename Index>
+template< typename Scalar>
 inline Index LUTempSpace(Index&m, Index& w)
 {
   return (2*w + 4 + LUNoMarker) * m * sizeof(Index) + (w + 1) * m * sizeof(Scalar);
@@ -59,9 +58,9 @@ inline Index LUTempSpace(Index&m, Index& w)
   * \param keep_prev  1: use length  and do not expand the vector; 0: compute new_len and expand
   * \param[in,out] num_expansions Number of times the memory has been expanded
   */
-template <typename Scalar, typename Index>
+template <typename Scalar, typename StorageIndex>
 template <typename VectorType>
-Index  SparseLUImpl<Scalar,Index>::expand(VectorType& vec, Index& length, Index nbElts, Index keep_prev, Index& num_expansions) 
+Index  SparseLUImpl<Scalar,StorageIndex>::expand(VectorType& vec, Index& length, Index nbElts, Index keep_prev, Index& num_expansions) 
 {
   
   float alpha = 1.5; // Ratio of the memory increase 
@@ -148,13 +147,13 @@ Index  SparseLUImpl<Scalar,Index>::expand(VectorType& vec, Index& length, Index
  * \return an estimated size of the required memory if lwork = -1; otherwise, return the size of actually allocated memory when allocation failed, and 0 on success
  * \note Unlike SuperLU, this routine does not support successive factorization with the same pattern and the same row permutation
  */
-template <typename Scalar, typename Index>
-Index SparseLUImpl<Scalar,Index>::memInit(Index m, Index n, Index annz, Index lwork, Index fillratio, Index panel_size,  GlobalLU_t& glu)
+template <typename Scalar, typename StorageIndex>
+Index SparseLUImpl<Scalar,StorageIndex>::memInit(Index m, Index n, Index annz, Index lwork, Index fillratio, Index panel_size,  GlobalLU_t& glu)
 {
   Index& num_expansions = glu.num_expansions; //No memory expansions so far
   num_expansions = 0;
-  glu.nzumax = glu.nzlumax = (std::min)(fillratio * annz / n, m) * n; // estimated number of nonzeros in U 
-  glu.nzlmax = (std::max)(Index(4), fillratio) * annz / 4; // estimated  nnz in L factor
+  glu.nzumax = glu.nzlumax = (std::min)(fillratio * (annz+1) / n, m) * n; // estimated number of nonzeros in U 
+  glu.nzlmax = (std::max)(Index(4), fillratio) * (annz+1) / 4; // estimated  nnz in L factor
   // Return the estimated size to the user if necessary
   Index tempSpace;
   tempSpace = (2*panel_size + 4 + LUNoMarker) * m * sizeof(Index) + (panel_size + 1) * m * sizeof(Scalar);
@@ -205,9 +204,9 @@ Index SparseLUImpl<Scalar,Index>::memInit(Index m, Index n, Index annz, Index lw
  * \param num_expansions Number of expansions 
  * \return 0 on success, > 0 size of the memory allocated so far
  */
-template <typename Scalar, typename Index>
+template <typename Scalar, typename StorageIndex>
 template <typename VectorType>
-Index SparseLUImpl<Scalar,Index>::memXpand(VectorType& vec, Index& maxlen, Index nbElts, MemType memtype, Index& num_expansions)
+Index SparseLUImpl<Scalar,StorageIndex>::memXpand(VectorType& vec, Index& maxlen, Index nbElts, MemType memtype, Index& num_expansions)
 {
   Index failed_size; 
   if (memtype == USUB)
diff --git a/Eigen/src/SparseLU/SparseLU_Structs.h b/Eigen/src/SparseLU/SparseLU_Structs.h
index 24d6bf179..cf5ec449b 100644
--- a/Eigen/src/SparseLU/SparseLU_Structs.h
+++ b/Eigen/src/SparseLU/SparseLU_Structs.h
@@ -75,7 +75,7 @@ typedef enum {LUSUP, UCOL, LSUB, USUB, LLVL, ULVL} MemType;
 
 template <typename IndexVector, typename ScalarVector>
 struct LU_GlobalLU_t {
-  typedef typename IndexVector::Scalar Index; 
+  typedef typename IndexVector::Scalar StorageIndex; 
   IndexVector xsup; //First supernode column ... xsup(s) points to the beginning of the s-th supernode
   IndexVector supno; // Supernode number corresponding to this column (column to supernode mapping)
   ScalarVector  lusup; // nonzero values of L ordered by columns 
@@ -93,7 +93,6 @@ struct LU_GlobalLU_t {
 };
 
 // Values to set for performance
-template <typename Index>
 struct perfvalues {
   Index panel_size; // a panel consists of at most <panel_size> consecutive columns
   Index relax; // To control degree of relaxing supernodes. If the number of nodes (columns) 
diff --git a/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h b/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h
index b16afd6a4..721e1883b 100644
--- a/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h
+++ b/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h
@@ -29,20 +29,20 @@ namespace internal {
  * SuperInnerIterator to iterate through all supernodes 
  * Function for triangular solve
  */
-template <typename _Scalar, typename _Index>
+template <typename _Scalar, typename _StorageIndex>
 class MappedSuperNodalMatrix
 {
   public:
     typedef _Scalar Scalar; 
-    typedef _Index Index;
-    typedef Matrix<Index,Dynamic,1> IndexVector; 
+    typedef _StorageIndex StorageIndex;
+    typedef Matrix<StorageIndex,Dynamic,1> IndexVector;
     typedef Matrix<Scalar,Dynamic,1> ScalarVector;
   public:
     MappedSuperNodalMatrix()
     {
       
     }
-    MappedSuperNodalMatrix(Index m, Index n,  ScalarVector& nzval, IndexVector& nzval_colptr, IndexVector& rowind, 
+    MappedSuperNodalMatrix(Index m, Index n,  ScalarVector& nzval, IndexVector& nzval_colptr, IndexVector& rowind,
              IndexVector& rowind_colptr, IndexVector& col_to_sup, IndexVector& sup_to_col )
     {
       setInfos(m, n, nzval, nzval_colptr, rowind, rowind_colptr, col_to_sup, sup_to_col);
@@ -58,7 +58,7 @@ class MappedSuperNodalMatrix
      * FIXME This class will be modified such that it can be use in the course 
      * of the factorization.
      */
-    void setInfos(Index m, Index n, ScalarVector& nzval, IndexVector& nzval_colptr, IndexVector& rowind, 
+    void setInfos(Index m, Index n, ScalarVector& nzval, IndexVector& nzval_colptr, IndexVector& rowind,
              IndexVector& rowind_colptr, IndexVector& col_to_sup, IndexVector& sup_to_col )
     {
       m_row = m;
@@ -96,12 +96,12 @@ class MappedSuperNodalMatrix
     /**
      * Return the pointers to the beginning of each column in \ref valuePtr()
      */
-    Index* colIndexPtr()
+    StorageIndex* colIndexPtr()
     {
       return m_nzval_colptr; 
     }
     
-    const Index* colIndexPtr() const
+    const StorageIndex* colIndexPtr() const
     {
       return m_nzval_colptr; 
     }
@@ -109,9 +109,9 @@ class MappedSuperNodalMatrix
     /**
      * Return the array of compressed row indices of all supernodes
      */
-    Index* rowIndex()  { return m_rowind; }
+    StorageIndex* rowIndex()  { return m_rowind; }
     
-    const Index* rowIndex() const
+    const StorageIndex* rowIndex() const
     {
       return m_rowind; 
     }
@@ -119,9 +119,9 @@ class MappedSuperNodalMatrix
     /**
      * Return the location in \em rowvaluePtr() which starts each column
      */
-    Index* rowIndexPtr() { return m_rowind_colptr; }
+    StorageIndex* rowIndexPtr() { return m_rowind_colptr; }
     
-    const Index* rowIndexPtr() const 
+    const StorageIndex* rowIndexPtr() const
     {
       return m_rowind_colptr; 
     }
@@ -129,18 +129,18 @@ class MappedSuperNodalMatrix
     /** 
      * Return the array of column-to-supernode mapping 
      */
-    Index* colToSup()  { return m_col_to_sup; }
+    StorageIndex* colToSup()  { return m_col_to_sup; }
     
-    const Index* colToSup() const
+    const StorageIndex* colToSup() const
     {
       return m_col_to_sup;       
     }
     /**
      * Return the array of supernode-to-column mapping
      */
-    Index* supToCol() { return m_sup_to_col; }
+    StorageIndex* supToCol() { return m_sup_to_col; }
     
-    const Index* supToCol() const 
+    const StorageIndex* supToCol() const
     {
       return m_sup_to_col;
     }
@@ -148,7 +148,7 @@ class MappedSuperNodalMatrix
     /**
      * Return the number of supernodes
      */
-    Index nsuper() const 
+    Index nsuper() const
     {
       return m_nsuper; 
     }
@@ -162,14 +162,14 @@ class MappedSuperNodalMatrix
     
   protected:
     Index m_row; // Number of rows
-    Index m_col; // Number of columns 
-    Index m_nsuper; // Number of supernodes 
+    Index m_col; // Number of columns
+    Index m_nsuper; // Number of supernodes
     Scalar* m_nzval; //array of nonzero values packed by column
-    Index* m_nzval_colptr; //nzval_colptr[j] Stores the location in nzval[] which starts column j 
-    Index* m_rowind; // Array of compressed row indices of rectangular supernodes
-    Index* m_rowind_colptr; //rowind_colptr[j] stores the location in rowind[] which starts column j
-    Index* m_col_to_sup; // col_to_sup[j] is the supernode number to which column j belongs
-    Index* m_sup_to_col; //sup_to_col[s] points to the starting column of the s-th supernode
+    StorageIndex* m_nzval_colptr; //nzval_colptr[j] Stores the location in nzval[] which starts column j
+    StorageIndex* m_rowind; // Array of compressed row indices of rectangular supernodes
+    StorageIndex* m_rowind_colptr; //rowind_colptr[j] stores the location in rowind[] which starts column j
+    StorageIndex* m_col_to_sup; // col_to_sup[j] is the supernode number to which column j belongs
+    StorageIndex* m_sup_to_col; //sup_to_col[s] points to the starting column of the s-th supernode
     
   private :
 };
@@ -178,13 +178,13 @@ class MappedSuperNodalMatrix
   * \brief InnerIterator class to iterate over nonzero values of the current column in the supernodal matrix L
   * 
   */
-template<typename Scalar, typename Index>
-class MappedSuperNodalMatrix<Scalar,Index>::InnerIterator
+template<typename Scalar, typename StorageIndex>
+class MappedSuperNodalMatrix<Scalar,StorageIndex>::InnerIterator
 {
   public:
      InnerIterator(const MappedSuperNodalMatrix& mat, Index outer)
       : m_matrix(mat),
-        m_outer(outer), 
+        m_outer(outer),
         m_supno(mat.colToSup()[outer]),
         m_idval(mat.colIndexPtr()[outer]),
         m_startidval(m_idval),
@@ -229,14 +229,17 @@ class MappedSuperNodalMatrix<Scalar,Index>::InnerIterator
  * \brief Solve with the supernode triangular matrix
  * 
  */
-template<typename Scalar, typename Index>
+template<typename Scalar, typename Index_>
 template<typename Dest>
-void MappedSuperNodalMatrix<Scalar,Index>::solveInPlace( MatrixBase<Dest>&X) const
+void MappedSuperNodalMatrix<Scalar,Index_>::solveInPlace( MatrixBase<Dest>&X) const
 {
-    Index n = X.rows(); 
-    Index nrhs = X.cols(); 
+    /* Explicit type conversion as the Index type of MatrixBase<Dest> may be wider than Index */
+//    eigen_assert(X.rows() <= NumTraits<Index>::highest());
+//    eigen_assert(X.cols() <= NumTraits<Index>::highest());
+    Index n    = int(X.rows());
+    Index nrhs = Index(X.cols());
     const Scalar * Lval = valuePtr();                 // Nonzero values 
-    Matrix<Scalar,Dynamic,Dynamic> work(n, nrhs);     // working vector
+    Matrix<Scalar,Dynamic,Dest::ColsAtCompileTime, ColMajor> work(n, nrhs);     // working vector
     work.setZero();
     for (Index k = 0; k <= nsuper(); k ++)
     {
@@ -267,13 +270,13 @@ void MappedSuperNodalMatrix<Scalar,Index>::solveInPlace( MatrixBase<Dest>&X) con
         Index lda = colIndexPtr()[fsupc+1] - luptr;
         
         // Triangular solve 
-        Map<const Matrix<Scalar,Dynamic,Dynamic>, 0, OuterStride<> > A( &(Lval[luptr]), nsupc, nsupc, OuterStride<>(lda) ); 
-        Map< Matrix<Scalar,Dynamic,Dynamic>, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) ); 
+        Map<const Matrix<Scalar,Dynamic,Dynamic, ColMajor>, 0, OuterStride<> > A( &(Lval[luptr]), nsupc, nsupc, OuterStride<>(lda) );
+        Map< Matrix<Scalar,Dynamic,Dest::ColsAtCompileTime, ColMajor>, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) );
         U = A.template triangularView<UnitLower>().solve(U); 
         
         // Matrix-vector product 
-        new (&A) Map<const Matrix<Scalar,Dynamic,Dynamic>, 0, OuterStride<> > ( &(Lval[luptr+nsupc]), nrow, nsupc, OuterStride<>(lda) ); 
-        work.block(0, 0, nrow, nrhs) = A * U; 
+        new (&A) Map<const Matrix<Scalar,Dynamic,Dynamic, ColMajor>, 0, OuterStride<> > ( &(Lval[luptr+nsupc]), nrow, nsupc, OuterStride<>(lda) );
+        work.topRows(nrow).noalias() = A * U;
         
         //Begin Scatter 
         for (Index j = 0; j < nrhs; j++)
diff --git a/Eigen/src/SparseLU/SparseLU_Utils.h b/Eigen/src/SparseLU/SparseLU_Utils.h
index 15352ac33..9e3dab44d 100644
--- a/Eigen/src/SparseLU/SparseLU_Utils.h
+++ b/Eigen/src/SparseLU/SparseLU_Utils.h
@@ -17,8 +17,8 @@ namespace internal {
 /**
  * \brief Count Nonzero elements in the factors
  */
-template <typename Scalar, typename Index>
-void SparseLUImpl<Scalar,Index>::countnz(const Index n, Index& nnzL, Index& nnzU, GlobalLU_t& glu)
+template <typename Scalar, typename StorageIndex>
+void SparseLUImpl<Scalar,StorageIndex>::countnz(const Index n, Index& nnzL, Index& nnzU, GlobalLU_t& glu)
 {
  nnzL = 0; 
  nnzU = (glu.xusub)(n); 
@@ -48,12 +48,12 @@ void SparseLUImpl<Scalar,Index>::countnz(const Index n, Index& nnzL, Index& nnzU
  * and applies permutation to the remaining subscripts
  * 
  */
-template <typename Scalar, typename Index>
-void SparseLUImpl<Scalar,Index>::fixupL(const Index n, const IndexVector& perm_r, GlobalLU_t& glu)
+template <typename Scalar, typename StorageIndex>
+void SparseLUImpl<Scalar,StorageIndex>::fixupL(const Index n, const IndexVector& perm_r, GlobalLU_t& glu)
 {
   Index fsupc, i, j, k, jstart; 
   
-  Index nextl = 0; 
+  StorageIndex nextl = 0; 
   Index nsuper = (glu.supno)(n); 
   
   // For each supernode 
diff --git a/Eigen/src/SparseLU/SparseLU_column_bmod.h b/Eigen/src/SparseLU/SparseLU_column_bmod.h
index f24bd87d3..b57f06802 100644
--- a/Eigen/src/SparseLU/SparseLU_column_bmod.h
+++ b/Eigen/src/SparseLU/SparseLU_column_bmod.h
@@ -49,8 +49,9 @@ namespace internal {
  *         > 0 - number of bytes allocated when run out of space
  * 
  */
-template <typename Scalar, typename Index>
-Index SparseLUImpl<Scalar,Index>::column_bmod(const Index jcol, const Index nseg, BlockScalarVector dense, ScalarVector& tempv, BlockIndexVector segrep, BlockIndexVector repfnz, Index fpanelc, GlobalLU_t& glu)
+template <typename Scalar, typename StorageIndex>
+Index SparseLUImpl<Scalar,StorageIndex>::column_bmod(const Index jcol, const Index nseg, BlockScalarVector dense, ScalarVector& tempv,
+                                                     BlockIndexVector segrep, BlockIndexVector repfnz, Index fpanelc, GlobalLU_t& glu)
 {
   Index  jsupno, k, ksub, krep, ksupno; 
   Index lptr, nrow, isub, irow, nextlu, new_next, ufirst; 
@@ -137,7 +138,7 @@ Index SparseLUImpl<Scalar,Index>::column_bmod(const Index jcol, const Index nseg
     glu.lusup.segment(nextlu,offset).setZero();
     nextlu += offset;
   }
-  glu.xlusup(jcol + 1) = nextlu;  // close L\U(*,jcol); 
+  glu.xlusup(jcol + 1) = StorageIndex(nextlu);  // close L\U(*,jcol); 
   
   /* For more updates within the panel (also within the current supernode),
    * should start from the first column of the panel, or the first column
@@ -162,11 +163,11 @@ Index SparseLUImpl<Scalar,Index>::column_bmod(const Index jcol, const Index nseg
     // points to the beginning of jcol in snode L\U(jsupno) 
     ufirst = glu.xlusup(jcol) + d_fsupc; 
     Index lda = glu.xlusup(jcol+1) - glu.xlusup(jcol);
-    Map<Matrix<Scalar,Dynamic,Dynamic>, 0,  OuterStride<> > A( &(glu.lusup.data()[luptr]), nsupc, nsupc, OuterStride<>(lda) ); 
+    MappedMatrixBlock A( &(glu.lusup.data()[luptr]), nsupc, nsupc, OuterStride<>(lda) );
     VectorBlock<ScalarVector> u(glu.lusup, ufirst, nsupc); 
     u = A.template triangularView<UnitLower>().solve(u); 
     
-    new (&A) Map<Matrix<Scalar,Dynamic,Dynamic>, 0, OuterStride<> > ( &(glu.lusup.data()[luptr+nsupc]), nrow, nsupc, OuterStride<>(lda) ); 
+    new (&A) MappedMatrixBlock ( &(glu.lusup.data()[luptr+nsupc]), nrow, nsupc, OuterStride<>(lda) );
     VectorBlock<ScalarVector> l(glu.lusup, ufirst+nsupc, nrow); 
     l.noalias() -= A * u;
     
diff --git a/Eigen/src/SparseLU/SparseLU_column_dfs.h b/Eigen/src/SparseLU/SparseLU_column_dfs.h
index 4c04b0e44..c98b30e32 100644
--- a/Eigen/src/SparseLU/SparseLU_column_dfs.h
+++ b/Eigen/src/SparseLU/SparseLU_column_dfs.h
@@ -30,7 +30,7 @@
 #ifndef SPARSELU_COLUMN_DFS_H
 #define SPARSELU_COLUMN_DFS_H
 
-template <typename Scalar, typename Index> class SparseLUImpl;
+template <typename Scalar, typename StorageIndex> class SparseLUImpl;
 namespace Eigen {
 
 namespace internal {
@@ -39,8 +39,8 @@ template<typename IndexVector, typename ScalarVector>
 struct column_dfs_traits : no_assignment_operator
 {
   typedef typename ScalarVector::Scalar Scalar;
-  typedef typename IndexVector::Scalar Index;
-  column_dfs_traits(Index jcol, Index& jsuper, typename SparseLUImpl<Scalar, Index>::GlobalLU_t& glu, SparseLUImpl<Scalar, Index>& luImpl)
+  typedef typename IndexVector::Scalar StorageIndex;
+  column_dfs_traits(Index jcol, Index& jsuper, typename SparseLUImpl<Scalar, StorageIndex>::GlobalLU_t& glu, SparseLUImpl<Scalar, StorageIndex>& luImpl)
    : m_jcol(jcol), m_jsuper_ref(jsuper), m_glu(glu), m_luImpl(luImpl)
  {}
   bool update_segrep(Index /*krep*/, Index /*jj*/)
@@ -57,8 +57,8 @@ struct column_dfs_traits : no_assignment_operator
   
   Index m_jcol;
   Index& m_jsuper_ref;
-  typename SparseLUImpl<Scalar, Index>::GlobalLU_t& m_glu;
-  SparseLUImpl<Scalar, Index>& m_luImpl;
+  typename SparseLUImpl<Scalar, StorageIndex>::GlobalLU_t& m_glu;
+  SparseLUImpl<Scalar, StorageIndex>& m_luImpl;
 };
 
 
@@ -89,8 +89,10 @@ struct column_dfs_traits : no_assignment_operator
  *         > 0 number of bytes allocated when run out of space
  * 
  */
-template <typename Scalar, typename Index>
-Index SparseLUImpl<Scalar,Index>::column_dfs(const Index m, const Index jcol, IndexVector& perm_r, Index maxsuper, Index& nseg,  BlockIndexVector lsub_col, IndexVector& segrep, BlockIndexVector repfnz, IndexVector& xprune, IndexVector& marker, IndexVector& parent, IndexVector& xplore, GlobalLU_t& glu)
+template <typename Scalar, typename StorageIndex>
+Index SparseLUImpl<Scalar,StorageIndex>::column_dfs(const Index m, const Index jcol, IndexVector& perm_r, Index maxsuper, Index& nseg,
+                                                    BlockIndexVector lsub_col, IndexVector& segrep, BlockIndexVector repfnz, IndexVector& xprune,
+                                                    IndexVector& marker, IndexVector& parent, IndexVector& xplore, GlobalLU_t& glu)
 {
   
   Index jsuper = glu.supno(jcol); 
@@ -110,13 +112,13 @@ Index SparseLUImpl<Scalar,Index>::column_dfs(const Index m, const Index jcol, In
     // krow was visited before, go to the next nonz; 
     if (kmark == jcol) continue;
     
-    dfs_kernel(jcol, perm_r, nseg, glu.lsub, segrep, repfnz, xprune, marker2, parent,
+    dfs_kernel(StorageIndex(jcol), perm_r, nseg, glu.lsub, segrep, repfnz, xprune, marker2, parent,
                    xplore, glu, nextl, krow, traits);
   } // for each nonzero ... 
   
-  Index fsupc, jptr, jm1ptr, ito, ifrom, istop;
-  Index nsuper = glu.supno(jcol);
-  Index jcolp1 = jcol + 1;
+  Index fsupc;
+  StorageIndex nsuper = glu.supno(jcol);
+  StorageIndex jcolp1 = StorageIndex(jcol) + 1;
   Index jcolm1 = jcol - 1;
   
   // check to see if j belongs in the same supernode as j-1
@@ -127,8 +129,8 @@ Index SparseLUImpl<Scalar,Index>::column_dfs(const Index m, const Index jcol, In
   else 
   {
     fsupc = glu.xsup(nsuper); 
-    jptr = glu.xlsub(jcol); // Not yet compressed
-    jm1ptr = glu.xlsub(jcolm1); 
+    StorageIndex jptr = glu.xlsub(jcol); // Not yet compressed
+    StorageIndex jm1ptr = glu.xlsub(jcolm1); 
     
     // Use supernodes of type T2 : see SuperLU paper
     if ( (nextl-jptr != jptr-jm1ptr-1) ) jsuper = emptyIdxLU;
@@ -146,13 +148,13 @@ Index SparseLUImpl<Scalar,Index>::column_dfs(const Index m, const Index jcol, In
     { // starts a new supernode 
       if ( (fsupc < jcolm1-1) ) 
       { // >= 3 columns in nsuper
-        ito = glu.xlsub(fsupc+1);
+        StorageIndex ito = glu.xlsub(fsupc+1);
         glu.xlsub(jcolm1) = ito; 
-        istop = ito + jptr - jm1ptr; 
+        StorageIndex istop = ito + jptr - jm1ptr; 
         xprune(jcolm1) = istop; // intialize xprune(jcol-1)
         glu.xlsub(jcol) = istop; 
         
-        for (ifrom = jm1ptr; ifrom < nextl; ++ifrom, ++ito)
+        for (StorageIndex ifrom = jm1ptr; ifrom < nextl; ++ifrom, ++ito)
           glu.lsub(ito) = glu.lsub(ifrom); 
         nextl = ito;  // = istop + length(jcol)
       }
@@ -164,8 +166,8 @@ Index SparseLUImpl<Scalar,Index>::column_dfs(const Index m, const Index jcol, In
   // Tidy up the pointers before exit
   glu.xsup(nsuper+1) = jcolp1; 
   glu.supno(jcolp1) = nsuper; 
-  xprune(jcol) = nextl;  // Intialize upper bound for pruning
-  glu.xlsub(jcolp1) = nextl; 
+  xprune(jcol) = StorageIndex(nextl);  // Intialize upper bound for pruning
+  glu.xlsub(jcolp1) = StorageIndex(nextl); 
   
   return 0; 
 }
diff --git a/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h b/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h
index 170610d9f..c32d8d8b1 100644
--- a/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h
+++ b/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h
@@ -46,8 +46,9 @@ namespace internal {
  *         > 0 - number of bytes allocated when run out of space
  * 
  */
-template <typename Scalar, typename Index>
-Index SparseLUImpl<Scalar,Index>::copy_to_ucol(const Index jcol, const Index nseg, IndexVector& segrep, BlockIndexVector repfnz ,IndexVector& perm_r, BlockScalarVector dense, GlobalLU_t& glu)
+template <typename Scalar, typename StorageIndex>
+Index SparseLUImpl<Scalar,StorageIndex>::copy_to_ucol(const Index jcol, const Index nseg, IndexVector& segrep,
+                                                      BlockIndexVector repfnz ,IndexVector& perm_r, BlockScalarVector dense, GlobalLU_t& glu)
 {  
   Index ksub, krep, ksupno; 
     
@@ -55,7 +56,7 @@ Index SparseLUImpl<Scalar,Index>::copy_to_ucol(const Index jcol, const Index nse
   
   // For each nonzero supernode segment of U[*,j] in topological order 
   Index k = nseg - 1, i; 
-  Index nextu = glu.xusub(jcol); 
+  StorageIndex nextu = glu.xusub(jcol); 
   Index kfnz, isub, segsize; 
   Index new_next,irow; 
   Index fsupc, mem; 
diff --git a/Eigen/src/SparseLU/SparseLU_gemm_kernel.h b/Eigen/src/SparseLU/SparseLU_gemm_kernel.h
index 9e4e3e72b..95ba7413f 100644
--- a/Eigen/src/SparseLU/SparseLU_gemm_kernel.h
+++ b/Eigen/src/SparseLU/SparseLU_gemm_kernel.h
@@ -21,7 +21,7 @@ namespace internal {
   *  - lda and ldc must be multiples of the respective packet size
   *  - C must have the same alignment as A
   */
-template<typename Scalar,typename Index>
+template<typename Scalar>
 EIGEN_DONT_INLINE
 void sparselu_gemm(Index m, Index n, Index d, const Scalar* A, Index lda, const Scalar* B, Index ldb, Scalar* C, Index ldc)
 {
@@ -39,9 +39,9 @@ void sparselu_gemm(Index m, Index n, Index d, const Scalar* A, Index lda, const
   };
   Index d_end = (d/RK)*RK;    // number of columns of A (rows of B) suitable for full register blocking
   Index n_end = (n/RN)*RN;    // number of columns of B-C suitable for processing RN columns at once
-  Index i0 = internal::first_aligned(A,m);
+  Index i0 = internal::first_default_aligned(A,m);
   
-  eigen_internal_assert(((lda%PacketSize)==0) && ((ldc%PacketSize)==0) && (i0==internal::first_aligned(C,m)));
+  eigen_internal_assert(((lda%PacketSize)==0) && ((ldc%PacketSize)==0) && (i0==internal::first_default_aligned(C,m)));
   
   // handle the non aligned rows of A and C without any optimization:
   for(Index i=0; i<i0; ++i)
@@ -72,14 +72,14 @@ void sparselu_gemm(Index m, Index n, Index d, const Scalar* A, Index lda, const
         
         // load and expand a RN x RK block of B
         Packet b00, b10, b20, b30, b01, b11, b21, b31;
-                  b00 = pset1<Packet>(Bc0[0]);
-                  b10 = pset1<Packet>(Bc0[1]);
-        if(RK==4) b20 = pset1<Packet>(Bc0[2]);
-        if(RK==4) b30 = pset1<Packet>(Bc0[3]);
-                  b01 = pset1<Packet>(Bc1[0]);
-                  b11 = pset1<Packet>(Bc1[1]);
-        if(RK==4) b21 = pset1<Packet>(Bc1[2]);
-        if(RK==4) b31 = pset1<Packet>(Bc1[3]);
+                  { b00 = pset1<Packet>(Bc0[0]); }
+                  { b10 = pset1<Packet>(Bc0[1]); }
+        if(RK==4) { b20 = pset1<Packet>(Bc0[2]); }
+        if(RK==4) { b30 = pset1<Packet>(Bc0[3]); }
+                  { b01 = pset1<Packet>(Bc1[0]); }
+                  { b11 = pset1<Packet>(Bc1[1]); }
+        if(RK==4) { b21 = pset1<Packet>(Bc1[2]); }
+        if(RK==4) { b31 = pset1<Packet>(Bc1[3]); }
         
         Packet a0, a1, a2, a3, c0, c1, t0, t1;
         
@@ -106,22 +106,22 @@ void sparselu_gemm(Index m, Index n, Index d, const Scalar* A, Index lda, const
         
 #define KMADD(c, a, b, tmp) {tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);}
 #define WORK(I)  \
-                    c0 = pload<Packet>(C0+i+(I)*PacketSize);   \
-                    c1 = pload<Packet>(C1+i+(I)*PacketSize);   \
-                    KMADD(c0, a0, b00, t0)      \
-                    KMADD(c1, a0, b01, t1)      \
-                    a0 = pload<Packet>(A0+i+(I+1)*PacketSize); \
-                    KMADD(c0, a1, b10, t0)      \
-                    KMADD(c1, a1, b11, t1)       \
-                    a1 = pload<Packet>(A1+i+(I+1)*PacketSize); \
-          if(RK==4) KMADD(c0, a2, b20, t0)       \
-          if(RK==4) KMADD(c1, a2, b21, t1)       \
-          if(RK==4) a2 = pload<Packet>(A2+i+(I+1)*PacketSize); \
-          if(RK==4) KMADD(c0, a3, b30, t0)       \
-          if(RK==4) KMADD(c1, a3, b31, t1)       \
-          if(RK==4) a3 = pload<Packet>(A3+i+(I+1)*PacketSize); \
-                    pstore(C0+i+(I)*PacketSize, c0);           \
-                    pstore(C1+i+(I)*PacketSize, c1)
+                     c0 = pload<Packet>(C0+i+(I)*PacketSize);    \
+                     c1 = pload<Packet>(C1+i+(I)*PacketSize);    \
+                     KMADD(c0, a0, b00, t0)                      \
+                     KMADD(c1, a0, b01, t1)                      \
+                     a0 = pload<Packet>(A0+i+(I+1)*PacketSize);  \
+                     KMADD(c0, a1, b10, t0)                      \
+                     KMADD(c1, a1, b11, t1)                      \
+                     a1 = pload<Packet>(A1+i+(I+1)*PacketSize);  \
+          if(RK==4){ KMADD(c0, a2, b20, t0)                     }\
+          if(RK==4){ KMADD(c1, a2, b21, t1)                     }\
+          if(RK==4){ a2 = pload<Packet>(A2+i+(I+1)*PacketSize); }\
+          if(RK==4){ KMADD(c0, a3, b30, t0)                     }\
+          if(RK==4){ KMADD(c1, a3, b31, t1)                     }\
+          if(RK==4){ a3 = pload<Packet>(A3+i+(I+1)*PacketSize); }\
+                     pstore(C0+i+(I)*PacketSize, c0);            \
+                     pstore(C1+i+(I)*PacketSize, c1)
         
         // process rows of A' - C' with aggressive vectorization and peeling 
         for(Index i=0; i<actual_b_end1; i+=PacketSize*8)
@@ -131,14 +131,15 @@ void sparselu_gemm(Index m, Index n, Index d, const Scalar* A, Index lda, const
                     prefetch((A1+i+(5)*PacketSize));
           if(RK==4) prefetch((A2+i+(5)*PacketSize));
           if(RK==4) prefetch((A3+i+(5)*PacketSize));
-                    WORK(0);
-                    WORK(1);
-                    WORK(2);
-                    WORK(3);
-                    WORK(4);
-                    WORK(5);
-                    WORK(6);
-                    WORK(7);
+
+          WORK(0);
+          WORK(1);
+          WORK(2);
+          WORK(3);
+          WORK(4);
+          WORK(5);
+          WORK(6);
+          WORK(7);
         }
         // process the remaining rows with vectorization only
         for(Index i=actual_b_end1; i<actual_b_end2; i+=PacketSize)
@@ -165,7 +166,7 @@ void sparselu_gemm(Index m, Index n, Index d, const Scalar* A, Index lda, const
         Bc1 += RK;
       } // peeled loop on k
     } // peeled loop on the columns j
-    // process the last column (we now perform a matrux-vector product)
+    // process the last column (we now perform a matrix-vector product)
     if((n-n_end)>0)
     {
       const Scalar* Bc0 = B+(n-1)*ldb;
@@ -203,16 +204,16 @@ void sparselu_gemm(Index m, Index n, Index d, const Scalar* A, Index lda, const
         }
         
 #define WORK(I) \
-                  c0 = pload<Packet>(C0+i+(I)*PacketSize);   \
-                  KMADD(c0, a0, b00, t0)       \
-                  a0 = pload<Packet>(A0+i+(I+1)*PacketSize); \
-                  KMADD(c0, a1, b10, t0)       \
-                  a1 = pload<Packet>(A1+i+(I+1)*PacketSize); \
-        if(RK==4) KMADD(c0, a2, b20, t0)       \
-        if(RK==4) a2 = pload<Packet>(A2+i+(I+1)*PacketSize); \
-        if(RK==4) KMADD(c0, a3, b30, t0)       \
-        if(RK==4) a3 = pload<Packet>(A3+i+(I+1)*PacketSize); \
-                  pstore(C0+i+(I)*PacketSize, c0);
+                   c0 = pload<Packet>(C0+i+(I)*PacketSize);     \
+                   KMADD(c0, a0, b00, t0)                       \
+                   a0 = pload<Packet>(A0+i+(I+1)*PacketSize);   \
+                   KMADD(c0, a1, b10, t0)                       \
+                   a1 = pload<Packet>(A1+i+(I+1)*PacketSize);   \
+        if(RK==4){ KMADD(c0, a2, b20, t0)                      }\
+        if(RK==4){ a2 = pload<Packet>(A2+i+(I+1)*PacketSize);  }\
+        if(RK==4){ KMADD(c0, a3, b30, t0)                      }\
+        if(RK==4){ a3 = pload<Packet>(A3+i+(I+1)*PacketSize);  }\
+                   pstore(C0+i+(I)*PacketSize, c0);
         
         // agressive vectorization and peeling
         for(Index i=0; i<actual_b_end1; i+=PacketSize*8)
diff --git a/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h b/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h
index 7a4e4305a..6f75d500e 100644
--- a/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h
+++ b/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h
@@ -42,21 +42,20 @@ namespace internal {
  * \param descendants Number of descendants of each node in the etree
  * \param relax_end last column in a supernode
  */
-template <typename Scalar, typename Index>
-void SparseLUImpl<Scalar,Index>::heap_relax_snode (const Index n, IndexVector& et, const Index relax_columns, IndexVector& descendants, IndexVector& relax_end)
+template <typename Scalar, typename StorageIndex>
+void SparseLUImpl<Scalar,StorageIndex>::heap_relax_snode (const Index n, IndexVector& et, const Index relax_columns, IndexVector& descendants, IndexVector& relax_end)
 {
   
   // The etree may not be postordered, but its heap ordered  
   IndexVector post;
-  internal::treePostorder(n, et, post); // Post order etree
+  internal::treePostorder(StorageIndex(n), et, post); // Post order etree
   IndexVector inv_post(n+1); 
-  Index i;
-  for (i = 0; i < n+1; ++i) inv_post(post(i)) = i; // inv_post = post.inverse()???
+  for (StorageIndex i = 0; i < n+1; ++i) inv_post(post(i)) = i; // inv_post = post.inverse()???
   
   // Renumber etree in postorder 
   IndexVector iwork(n);
   IndexVector et_save(n+1);
-  for (i = 0; i < n; ++i)
+  for (Index i = 0; i < n; ++i)
   {
     iwork(post(i)) = post(et(i));
   }
@@ -75,10 +74,10 @@ void SparseLUImpl<Scalar,Index>::heap_relax_snode (const Index n, IndexVector& e
   }
   // Identify the relaxed supernodes by postorder traversal of the etree
   Index snode_start; // beginning of a snode 
-  Index k;
+  StorageIndex k;
   Index nsuper_et_post = 0; // Number of relaxed snodes in postordered etree 
   Index nsuper_et = 0; // Number of relaxed snodes in the original etree 
-  Index l; 
+  StorageIndex l; 
   for (j = 0; j < n; )
   {
     parent = et(j);
@@ -90,8 +89,8 @@ void SparseLUImpl<Scalar,Index>::heap_relax_snode (const Index n, IndexVector& e
     }
     // Found a supernode in postordered etree, j is the last column 
     ++nsuper_et_post;
-    k = n;
-    for (i = snode_start; i <= j; ++i)
+    k = StorageIndex(n);
+    for (Index i = snode_start; i <= j; ++i)
       k = (std::min)(k, inv_post(i));
     l = inv_post(j);
     if ( (l - k) == (j - snode_start) )  // Same number of columns in the snode
@@ -102,7 +101,7 @@ void SparseLUImpl<Scalar,Index>::heap_relax_snode (const Index n, IndexVector& e
     }
     else 
     {
-      for (i = snode_start; i <= j; ++i) 
+      for (Index i = snode_start; i <= j; ++i) 
       {
         l = inv_post(i);
         if (descendants(i) == 0) 
diff --git a/Eigen/src/SparseLU/SparseLU_kernel_bmod.h b/Eigen/src/SparseLU/SparseLU_kernel_bmod.h
index 0d0283b13..8c1b3e8bc 100644
--- a/Eigen/src/SparseLU/SparseLU_kernel_bmod.h
+++ b/Eigen/src/SparseLU/SparseLU_kernel_bmod.h
@@ -14,30 +14,29 @@
 namespace Eigen {
 namespace internal {
   
-/**
- * \brief Performs numeric block updates from a given supernode to a single column
- * 
- * \param segsize Size of the segment (and blocks ) to use for updates
- * \param[in,out] dense Packed values of the original matrix
- * \param tempv temporary vector to use for updates
- * \param lusup array containing the supernodes
- * \param lda Leading dimension in the supernode
- * \param nrow Number of rows in the rectangular part of the supernode
- * \param lsub compressed row subscripts of supernodes
- * \param lptr pointer to the first column of the current supernode in lsub
- * \param no_zeros Number of nonzeros elements before the diagonal part of the supernode
- * \return 0 on success
- */
 template <int SegSizeAtCompileTime> struct LU_kernel_bmod
 {
-  template <typename BlockScalarVector, typename ScalarVector, typename IndexVector, typename Index>
-  static EIGEN_DONT_INLINE void run(const int segsize, BlockScalarVector& dense, ScalarVector& tempv, ScalarVector& lusup, Index& luptr, const Index lda,
+  /** \internal
+    * \brief Performs numeric block updates from a given supernode to a single column
+    *
+    * \param segsize Size of the segment (and blocks ) to use for updates
+    * \param[in,out] dense Packed values of the original matrix
+    * \param tempv temporary vector to use for updates
+    * \param lusup array containing the supernodes
+    * \param lda Leading dimension in the supernode
+    * \param nrow Number of rows in the rectangular part of the supernode
+    * \param lsub compressed row subscripts of supernodes
+    * \param lptr pointer to the first column of the current supernode in lsub
+    * \param no_zeros Number of nonzeros elements before the diagonal part of the supernode
+    */
+  template <typename BlockScalarVector, typename ScalarVector, typename IndexVector>
+  static EIGEN_DONT_INLINE void run(const Index segsize, BlockScalarVector& dense, ScalarVector& tempv, ScalarVector& lusup, Index& luptr, const Index lda,
                                     const Index nrow, IndexVector& lsub, const Index lptr, const Index no_zeros);
 };
 
 template <int SegSizeAtCompileTime>
-template <typename BlockScalarVector, typename ScalarVector, typename IndexVector, typename Index>
-EIGEN_DONT_INLINE void LU_kernel_bmod<SegSizeAtCompileTime>::run(const int segsize, BlockScalarVector& dense, ScalarVector& tempv, ScalarVector& lusup, Index& luptr, const Index lda,
+template <typename BlockScalarVector, typename ScalarVector, typename IndexVector>
+EIGEN_DONT_INLINE void LU_kernel_bmod<SegSizeAtCompileTime>::run(const Index segsize, BlockScalarVector& dense, ScalarVector& tempv, ScalarVector& lusup, Index& luptr, const Index lda,
                                                                   const Index nrow, IndexVector& lsub, const Index lptr, const Index no_zeros)
 {
   typedef typename ScalarVector::Scalar Scalar;
@@ -45,7 +44,7 @@ EIGEN_DONT_INLINE void LU_kernel_bmod<SegSizeAtCompileTime>::run(const int segsi
   // The result of triangular solve is in tempv[*]; 
     // The result of matric-vector update is in dense[*]
   Index isub = lptr + no_zeros; 
-  int i;
+  Index i;
   Index irow;
   for (i = 0; i < ((SegSizeAtCompileTime==Dynamic)?segsize:SegSizeAtCompileTime); i++)
   {
@@ -56,7 +55,7 @@ EIGEN_DONT_INLINE void LU_kernel_bmod<SegSizeAtCompileTime>::run(const int segsi
   // Dense triangular solve -- start effective triangle
   luptr += lda * no_zeros + no_zeros; 
   // Form Eigen matrix and vector 
-  Map<Matrix<Scalar,SegSizeAtCompileTime,SegSizeAtCompileTime>, 0, OuterStride<> > A( &(lusup.data()[luptr]), segsize, segsize, OuterStride<>(lda) );
+  Map<Matrix<Scalar,SegSizeAtCompileTime,SegSizeAtCompileTime, ColMajor>, 0, OuterStride<> > A( &(lusup.data()[luptr]), segsize, segsize, OuterStride<>(lda) );
   Map<Matrix<Scalar,SegSizeAtCompileTime,1> > u(tempv.data(), segsize);
   
   u = A.template triangularView<UnitLower>().solve(u); 
@@ -65,9 +64,9 @@ EIGEN_DONT_INLINE void LU_kernel_bmod<SegSizeAtCompileTime>::run(const int segsi
   luptr += segsize;
   const Index PacketSize = internal::packet_traits<Scalar>::size;
   Index ldl = internal::first_multiple(nrow, PacketSize);
-  Map<Matrix<Scalar,Dynamic,SegSizeAtCompileTime>, 0, OuterStride<> > B( &(lusup.data()[luptr]), nrow, segsize, OuterStride<>(lda) );
-  Index aligned_offset = internal::first_aligned(tempv.data()+segsize, PacketSize);
-  Index aligned_with_B_offset = (PacketSize-internal::first_aligned(B.data(), PacketSize))%PacketSize;
+  Map<Matrix<Scalar,Dynamic,SegSizeAtCompileTime, ColMajor>, 0, OuterStride<> > B( &(lusup.data()[luptr]), nrow, segsize, OuterStride<>(lda) );
+  Index aligned_offset = internal::first_default_aligned(tempv.data()+segsize, PacketSize);
+  Index aligned_with_B_offset = (PacketSize-internal::first_default_aligned(B.data(), PacketSize))%PacketSize;
   Map<Matrix<Scalar,Dynamic,1>, 0, OuterStride<> > l(tempv.data()+segsize+aligned_offset+aligned_with_B_offset, nrow, OuterStride<>(ldl) );
   
   l.setZero();
@@ -91,21 +90,22 @@ EIGEN_DONT_INLINE void LU_kernel_bmod<SegSizeAtCompileTime>::run(const int segsi
 
 template <> struct LU_kernel_bmod<1>
 {
-  template <typename BlockScalarVector, typename ScalarVector, typename IndexVector, typename Index>
-  static EIGEN_DONT_INLINE void run(const int /*segsize*/, BlockScalarVector& dense, ScalarVector& /*tempv*/, ScalarVector& lusup, Index& luptr,
+  template <typename BlockScalarVector, typename ScalarVector, typename IndexVector>
+  static EIGEN_DONT_INLINE void run(const Index /*segsize*/, BlockScalarVector& dense, ScalarVector& /*tempv*/, ScalarVector& lusup, Index& luptr,
                                     const Index lda, const Index nrow, IndexVector& lsub, const Index lptr, const Index no_zeros);
 };
 
 
-template <typename BlockScalarVector, typename ScalarVector, typename IndexVector, typename Index>
-EIGEN_DONT_INLINE void LU_kernel_bmod<1>::run(const int /*segsize*/, BlockScalarVector& dense, ScalarVector& /*tempv*/, ScalarVector& lusup, Index& luptr,
+template <typename BlockScalarVector, typename ScalarVector, typename IndexVector>
+EIGEN_DONT_INLINE void LU_kernel_bmod<1>::run(const Index /*segsize*/, BlockScalarVector& dense, ScalarVector& /*tempv*/, ScalarVector& lusup, Index& luptr,
                                               const Index lda, const Index nrow, IndexVector& lsub, const Index lptr, const Index no_zeros)
 {
   typedef typename ScalarVector::Scalar Scalar;
+  typedef typename IndexVector::Scalar StorageIndex;
   Scalar f = dense(lsub(lptr + no_zeros));
   luptr += lda * no_zeros + no_zeros + 1;
   const Scalar* a(lusup.data() + luptr);
-  const /*typename IndexVector::Scalar*/Index*  irow(lsub.data()+lptr + no_zeros + 1);
+  const StorageIndex*  irow(lsub.data()+lptr + no_zeros + 1);
   Index i = 0;
   for (; i+1 < nrow; i+=2)
   {
diff --git a/Eigen/src/SparseLU/SparseLU_panel_bmod.h b/Eigen/src/SparseLU/SparseLU_panel_bmod.h
index da0e0fc3c..822cf32c3 100644
--- a/Eigen/src/SparseLU/SparseLU_panel_bmod.h
+++ b/Eigen/src/SparseLU/SparseLU_panel_bmod.h
@@ -52,8 +52,8 @@ namespace internal {
  * 
  * 
  */
-template <typename Scalar, typename Index>
-void SparseLUImpl<Scalar,Index>::panel_bmod(const Index m, const Index w, const Index jcol, 
+template <typename Scalar, typename StorageIndex>
+void SparseLUImpl<Scalar,StorageIndex>::panel_bmod(const Index m, const Index w, const Index jcol, 
                                             const Index nseg, ScalarVector& dense, ScalarVector& tempv,
                                             IndexVector& segrep, IndexVector& repfnz, GlobalLU_t& glu)
 {
@@ -102,7 +102,7 @@ void SparseLUImpl<Scalar,Index>::panel_bmod(const Index m, const Index w, const
     if(nsupc >= 2)
     { 
       Index ldu = internal::first_multiple<Index>(u_rows, PacketSize);
-      Map<Matrix<Scalar,Dynamic,Dynamic>, Aligned,  OuterStride<> > U(tempv.data(), u_rows, u_cols, OuterStride<>(ldu));
+      Map<ScalarMatrix, Aligned,  OuterStride<> > U(tempv.data(), u_rows, u_cols, OuterStride<>(ldu));
       
       // gather U
       Index u_col = 0;
@@ -136,17 +136,17 @@ void SparseLUImpl<Scalar,Index>::panel_bmod(const Index m, const Index w, const
       Index lda = glu.xlusup(fsupc+1) - glu.xlusup(fsupc);
       no_zeros = (krep - u_rows + 1) - fsupc;
       luptr += lda * no_zeros + no_zeros;
-      Map<Matrix<Scalar,Dynamic,Dynamic>, 0, OuterStride<> > A(glu.lusup.data()+luptr, u_rows, u_rows, OuterStride<>(lda) );
+      MappedMatrixBlock A(glu.lusup.data()+luptr, u_rows, u_rows, OuterStride<>(lda) );
       U = A.template triangularView<UnitLower>().solve(U);
       
       // update
       luptr += u_rows;
-      Map<Matrix<Scalar,Dynamic,Dynamic>, 0, OuterStride<> > B(glu.lusup.data()+luptr, nrow, u_rows, OuterStride<>(lda) );
+      MappedMatrixBlock B(glu.lusup.data()+luptr, nrow, u_rows, OuterStride<>(lda) );
       eigen_assert(tempv.size()>w*ldu + nrow*w + 1);
       
       Index ldl = internal::first_multiple<Index>(nrow, PacketSize);
-      Index offset = (PacketSize-internal::first_aligned(B.data(), PacketSize)) % PacketSize;
-      Map<Matrix<Scalar,Dynamic,Dynamic>, 0, OuterStride<> > L(tempv.data()+w*ldu+offset, nrow, u_cols, OuterStride<>(ldl));
+      Index offset = (PacketSize-internal::first_default_aligned(B.data(), PacketSize)) % PacketSize;
+      MappedMatrixBlock L(tempv.data()+w*ldu+offset, nrow, u_cols, OuterStride<>(ldl));
       
       L.setZero();
       internal::sparselu_gemm<Scalar>(L.rows(), L.cols(), B.cols(), B.data(), B.outerStride(), U.data(), U.outerStride(), L.data(), L.outerStride());
diff --git a/Eigen/src/SparseLU/SparseLU_panel_dfs.h b/Eigen/src/SparseLU/SparseLU_panel_dfs.h
index dc0054efd..155df7336 100644
--- a/Eigen/src/SparseLU/SparseLU_panel_dfs.h
+++ b/Eigen/src/SparseLU/SparseLU_panel_dfs.h
@@ -37,11 +37,11 @@ namespace internal {
 template<typename IndexVector>
 struct panel_dfs_traits
 {
-  typedef typename IndexVector::Scalar Index;
-  panel_dfs_traits(Index jcol, Index* marker)
+  typedef typename IndexVector::Scalar StorageIndex;
+  panel_dfs_traits(Index jcol, StorageIndex* marker)
     : m_jcol(jcol), m_marker(marker)
   {}
-  bool update_segrep(Index krep, Index jj)
+  bool update_segrep(Index krep, StorageIndex jj)
   {
     if(m_marker[krep]<m_jcol)
     {
@@ -53,13 +53,13 @@ struct panel_dfs_traits
   void mem_expand(IndexVector& /*glu.lsub*/, Index /*nextl*/, Index /*chmark*/) {}
   enum { ExpandMem = false };
   Index m_jcol;
-  Index* m_marker;
+  StorageIndex* m_marker;
 };
 
 
-template <typename Scalar, typename Index>
+template <typename Scalar, typename StorageIndex>
 template <typename Traits>
-void SparseLUImpl<Scalar,Index>::dfs_kernel(const Index jj, IndexVector& perm_r,
+void SparseLUImpl<Scalar,StorageIndex>::dfs_kernel(const StorageIndex jj, IndexVector& perm_r,
                    Index& nseg, IndexVector& panel_lsub, IndexVector& segrep,
                    Ref<IndexVector> repfnz_col, IndexVector& xprune, Ref<IndexVector> marker, IndexVector& parent,
                    IndexVector& xplore, GlobalLU_t& glu,
@@ -67,14 +67,14 @@ void SparseLUImpl<Scalar,Index>::dfs_kernel(const Index jj, IndexVector& perm_r,
                   )
 {
   
-  Index kmark = marker(krow);
+  StorageIndex kmark = marker(krow);
       
   // For each unmarked krow of jj
   marker(krow) = jj; 
-  Index kperm = perm_r(krow); 
+  StorageIndex kperm = perm_r(krow); 
   if (kperm == emptyIdxLU ) {
     // krow is in L : place it in structure of L(*, jj)
-    panel_lsub(nextl_col++) = krow;  // krow is indexed into A
+    panel_lsub(nextl_col++) = StorageIndex(krow);  // krow is indexed into A
     
     traits.mem_expand(panel_lsub, nextl_col, kmark);
   }
@@ -83,9 +83,9 @@ void SparseLUImpl<Scalar,Index>::dfs_kernel(const Index jj, IndexVector& perm_r,
     // krow is in U : if its supernode-representative krep
     // has been explored, update repfnz(*)
     // krep = supernode representative of the current row
-    Index krep = glu.xsup(glu.supno(kperm)+1) - 1; 
+    StorageIndex krep = glu.xsup(glu.supno(kperm)+1) - 1; 
     // First nonzero element in the current column:
-    Index myfnz = repfnz_col(krep); 
+    StorageIndex myfnz = repfnz_col(krep); 
     
     if (myfnz != emptyIdxLU )
     {
@@ -96,26 +96,26 @@ void SparseLUImpl<Scalar,Index>::dfs_kernel(const Index jj, IndexVector& perm_r,
     else 
     {
       // Otherwise, perform dfs starting at krep
-      Index oldrep = emptyIdxLU; 
+      StorageIndex oldrep = emptyIdxLU; 
       parent(krep) = oldrep; 
       repfnz_col(krep) = kperm; 
-      Index xdfs =  glu.xlsub(krep); 
+      StorageIndex xdfs =  glu.xlsub(krep); 
       Index maxdfs = xprune(krep); 
       
-      Index kpar;
+      StorageIndex kpar;
       do 
       {
         // For each unmarked kchild of krep
         while (xdfs < maxdfs) 
         {
-          Index kchild = glu.lsub(xdfs); 
+          StorageIndex kchild = glu.lsub(xdfs); 
           xdfs++; 
-          Index chmark = marker(kchild); 
+          StorageIndex chmark = marker(kchild); 
           
           if (chmark != jj ) 
           {
             marker(kchild) = jj; 
-            Index chperm = perm_r(kchild); 
+            StorageIndex chperm = perm_r(kchild); 
             
             if (chperm == emptyIdxLU) 
             {
@@ -128,7 +128,7 @@ void SparseLUImpl<Scalar,Index>::dfs_kernel(const Index jj, IndexVector& perm_r,
               // case kchild is in U :
               // chrep = its supernode-rep. If its rep has been explored, 
               // update its repfnz(*)
-              Index chrep = glu.xsup(glu.supno(chperm)+1) - 1; 
+              StorageIndex chrep = glu.xsup(glu.supno(chperm)+1) - 1; 
               myfnz = repfnz_col(chrep); 
               
               if (myfnz != emptyIdxLU) 
@@ -215,8 +215,8 @@ void SparseLUImpl<Scalar,Index>::dfs_kernel(const Index jj, IndexVector& perm_r,
  * 
  */
 
-template <typename Scalar, typename Index>
-void SparseLUImpl<Scalar,Index>::panel_dfs(const Index m, const Index w, const Index jcol, MatrixType& A, IndexVector& perm_r, Index& nseg, ScalarVector& dense, IndexVector& panel_lsub, IndexVector& segrep, IndexVector& repfnz, IndexVector& xprune, IndexVector& marker, IndexVector& parent, IndexVector& xplore, GlobalLU_t& glu)
+template <typename Scalar, typename StorageIndex>
+void SparseLUImpl<Scalar,StorageIndex>::panel_dfs(const Index m, const Index w, const Index jcol, MatrixType& A, IndexVector& perm_r, Index& nseg, ScalarVector& dense, IndexVector& panel_lsub, IndexVector& segrep, IndexVector& repfnz, IndexVector& xprune, IndexVector& marker, IndexVector& parent, IndexVector& xplore, GlobalLU_t& glu)
 {
   Index nextl_col; // Next available position in panel_lsub[*,jj] 
   
@@ -227,7 +227,7 @@ void SparseLUImpl<Scalar,Index>::panel_dfs(const Index m, const Index w, const I
   panel_dfs_traits<IndexVector> traits(jcol, marker1.data());
   
   // For each column in the panel 
-  for (Index jj = jcol; jj < jcol + w; jj++) 
+  for (StorageIndex jj = StorageIndex(jcol); jj < jcol + w; jj++) 
   {
     nextl_col = (jj - jcol) * m; 
     
@@ -241,7 +241,7 @@ void SparseLUImpl<Scalar,Index>::panel_dfs(const Index m, const Index w, const I
       Index krow = it.row(); 
       dense_col(krow) = it.value();
       
-      Index kmark = marker(krow); 
+      StorageIndex kmark = marker(krow); 
       if (kmark == jj) 
         continue; // krow visited before, go to the next nonzero
       
diff --git a/Eigen/src/SparseLU/SparseLU_pivotL.h b/Eigen/src/SparseLU/SparseLU_pivotL.h
index 457789c78..a86dac93f 100644
--- a/Eigen/src/SparseLU/SparseLU_pivotL.h
+++ b/Eigen/src/SparseLU/SparseLU_pivotL.h
@@ -56,8 +56,8 @@ namespace internal {
  * \return 0 if success, i > 0 if U(i,i) is exactly zero 
  * 
  */
-template <typename Scalar, typename Index>
-Index SparseLUImpl<Scalar,Index>::pivotL(const Index jcol, const RealScalar& diagpivotthresh, IndexVector& perm_r, IndexVector& iperm_c, Index& pivrow, GlobalLU_t& glu)
+template <typename Scalar, typename StorageIndex>
+Index SparseLUImpl<Scalar,StorageIndex>::pivotL(const Index jcol, const RealScalar& diagpivotthresh, IndexVector& perm_r, IndexVector& iperm_c, Index& pivrow, GlobalLU_t& glu)
 {
   
   Index fsupc = (glu.xsup)((glu.supno)(jcol)); // First column in the supernode containing the column jcol
@@ -67,11 +67,11 @@ Index SparseLUImpl<Scalar,Index>::pivotL(const Index jcol, const RealScalar& dia
   Index lda = glu.xlusup(fsupc+1) - glu.xlusup(fsupc); // leading dimension
   Scalar* lu_sup_ptr = &(glu.lusup.data()[glu.xlusup(fsupc)]); // Start of the current supernode
   Scalar* lu_col_ptr = &(glu.lusup.data()[glu.xlusup(jcol)]); // Start of jcol in the supernode
-  Index* lsub_ptr = &(glu.lsub.data()[lptr]); // Start of row indices of the supernode
+  StorageIndex* lsub_ptr = &(glu.lsub.data()[lptr]); // Start of row indices of the supernode
   
   // Determine the largest abs numerical value for partial pivoting 
   Index diagind = iperm_c(jcol); // diagonal index 
-  RealScalar pivmax = 0.0; 
+  RealScalar pivmax(-1.0);
   Index pivptr = nsupc; 
   Index diag = emptyIdxLU; 
   RealScalar rtemp;
@@ -87,9 +87,10 @@ Index SparseLUImpl<Scalar,Index>::pivotL(const Index jcol, const RealScalar& dia
   }
   
   // Test for singularity
-  if ( pivmax == 0.0 ) {
-    pivrow = lsub_ptr[pivptr];
-    perm_r(pivrow) = jcol;
+  if ( pivmax <= RealScalar(0.0) ) {
+    // if pivmax == -1, the column is structurally empty, otherwise it is only numerically zero
+    pivrow = pivmax < RealScalar(0.0) ? diagind : lsub_ptr[pivptr];
+    perm_r(pivrow) = StorageIndex(jcol);
     return (jcol+1);
   }
   
@@ -104,13 +105,13 @@ Index SparseLUImpl<Scalar,Index>::pivotL(const Index jcol, const RealScalar& dia
       // Diagonal element exists
       using std::abs;
       rtemp = abs(lu_col_ptr[diag]);
-      if (rtemp != 0.0 && rtemp >= thresh) pivptr = diag;
+      if (rtemp != RealScalar(0.0) && rtemp >= thresh) pivptr = diag;
     }
     pivrow = lsub_ptr[pivptr];
   }
   
   // Record pivot row
-  perm_r(pivrow) = jcol; 
+  perm_r(pivrow) = StorageIndex(jcol);
   // Interchange row subscripts
   if (pivptr != nsupc )
   {
diff --git a/Eigen/src/SparseLU/SparseLU_pruneL.h b/Eigen/src/SparseLU/SparseLU_pruneL.h
index 66460d168..ad32fed5e 100644
--- a/Eigen/src/SparseLU/SparseLU_pruneL.h
+++ b/Eigen/src/SparseLU/SparseLU_pruneL.h
@@ -49,8 +49,9 @@ namespace internal {
  * \param glu Global LU data
  * 
  */
-template <typename Scalar, typename Index>
-void SparseLUImpl<Scalar,Index>::pruneL(const Index jcol, const IndexVector& perm_r, const Index pivrow, const Index nseg, const IndexVector& segrep, BlockIndexVector repfnz, IndexVector& xprune, GlobalLU_t& glu)
+template <typename Scalar, typename StorageIndex>
+void SparseLUImpl<Scalar,StorageIndex>::pruneL(const Index jcol, const IndexVector& perm_r, const Index pivrow, const Index nseg,
+                                               const IndexVector& segrep, BlockIndexVector repfnz, IndexVector& xprune, GlobalLU_t& glu)
 {
   // For each supernode-rep irep in U(*,j]
   Index jsupno = glu.supno(jcol); 
@@ -123,7 +124,7 @@ void SparseLUImpl<Scalar,Index>::pruneL(const Index jcol, const IndexVector& per
           }
         } // end while 
         
-        xprune(irep) = kmin;  //Pruning 
+        xprune(irep) = StorageIndex(kmin);  //Pruning 
       } // end if do_prune 
     } // end pruning 
   } // End for each U-segment
diff --git a/Eigen/src/SparseLU/SparseLU_relax_snode.h b/Eigen/src/SparseLU/SparseLU_relax_snode.h
index 58ec32e27..c408d01b4 100644
--- a/Eigen/src/SparseLU/SparseLU_relax_snode.h
+++ b/Eigen/src/SparseLU/SparseLU_relax_snode.h
@@ -43,15 +43,15 @@ namespace internal {
  * \param descendants Number of descendants of each node in the etree
  * \param relax_end last column in a supernode
  */
-template <typename Scalar, typename Index>
-void SparseLUImpl<Scalar,Index>::relax_snode (const Index n, IndexVector& et, const Index relax_columns, IndexVector& descendants, IndexVector& relax_end)
+template <typename Scalar, typename StorageIndex>
+void SparseLUImpl<Scalar,StorageIndex>::relax_snode (const Index n, IndexVector& et, const Index relax_columns, IndexVector& descendants, IndexVector& relax_end)
 {
   
   // compute the number of descendants of each node in the etree
-  Index j, parent; 
+  Index parent; 
   relax_end.setConstant(emptyIdxLU);
   descendants.setZero();
-  for (j = 0; j < n; j++) 
+  for (Index j = 0; j < n; j++) 
   {
     parent = et(j);
     if (parent != n) // not the dummy root
@@ -59,7 +59,7 @@ void SparseLUImpl<Scalar,Index>::relax_snode (const Index n, IndexVector& et, co
   }
   // Identify the relaxed supernodes by postorder traversal of the etree
   Index snode_start; // beginning of a snode 
-  for (j = 0; j < n; )
+  for (Index j = 0; j < n; )
   {
     parent = et(j);
     snode_start = j; 
@@ -69,7 +69,7 @@ void SparseLUImpl<Scalar,Index>::relax_snode (const Index n, IndexVector& et, co
       parent = et(j);
     }
     // Found a supernode in postordered etree, j is the last column 
-    relax_end(snode_start) = j; // Record last column
+    relax_end(snode_start) = StorageIndex(j); // Record last column
     j++;
     // Search for a new leaf
     while (descendants(j) != 0 && j < n) j++;
diff --git a/Eigen/src/SparseQR/CMakeLists.txt b/Eigen/src/SparseQR/CMakeLists.txt
deleted file mode 100644
index f9ddf2bdb..000000000
--- a/Eigen/src/SparseQR/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_SparseQR_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_SparseQR_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/SparseQR/ COMPONENT Devel
-  )
diff --git a/Eigen/src/SparseQR/SparseQR.h b/Eigen/src/SparseQR/SparseQR.h
index a00bd5db1..2d4498b03 100644
--- a/Eigen/src/SparseQR/SparseQR.h
+++ b/Eigen/src/SparseQR/SparseQR.h
@@ -21,8 +21,12 @@ namespace internal {
   template <typename SparseQRType> struct traits<SparseQRMatrixQReturnType<SparseQRType> >
   {
     typedef typename SparseQRType::MatrixType ReturnType;
-    typedef typename ReturnType::Index Index;
+    typedef typename ReturnType::StorageIndex StorageIndex;
     typedef typename ReturnType::StorageKind StorageKind;
+    enum {
+      RowsAtCompileTime = Dynamic,
+      ColsAtCompileTime = Dynamic
+    };
   };
   template <typename SparseQRType> struct traits<SparseQRMatrixQTransposeReturnType<SparseQRType> >
   {
@@ -58,24 +62,36 @@ namespace internal {
   * \tparam _OrderingType The fill-reducing ordering method. See the \link OrderingMethods_Module 
   *  OrderingMethods \endlink module for the list of built-in and external ordering methods.
   * 
+  * \implsparsesolverconcept
+  *
   * \warning The input sparse matrix A must be in compressed mode (see SparseMatrix::makeCompressed()).
   * 
   */
 template<typename _MatrixType, typename _OrderingType>
-class SparseQR
+class SparseQR : public SparseSolverBase<SparseQR<_MatrixType,_OrderingType> >
 {
+  protected:
+    typedef SparseSolverBase<SparseQR<_MatrixType,_OrderingType> > Base;
+    using Base::m_isInitialized;
   public:
+    using Base::_solve_impl;
     typedef _MatrixType MatrixType;
     typedef _OrderingType OrderingType;
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
-    typedef SparseMatrix<Scalar,ColMajor,Index> QRMatrixType;
-    typedef Matrix<Index, Dynamic, 1> IndexVector;
+    typedef typename MatrixType::StorageIndex StorageIndex;
+    typedef SparseMatrix<Scalar,ColMajor,StorageIndex> QRMatrixType;
+    typedef Matrix<StorageIndex, Dynamic, 1> IndexVector;
     typedef Matrix<Scalar, Dynamic, 1> ScalarVector;
-    typedef PermutationMatrix<Dynamic, Dynamic, Index> PermutationType;
+    typedef PermutationMatrix<Dynamic, Dynamic, StorageIndex> PermutationType;
+
+    enum {
+      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+    };
+    
   public:
-    SparseQR () : m_isInitialized(false), m_analysisIsok(false), m_lastError(""), m_useDefaultThreshold(true),m_isQSorted(false),m_isEtreeOk(false)
+    SparseQR () :  m_analysisIsok(false), m_lastError(""), m_useDefaultThreshold(true),m_isQSorted(false),m_isEtreeOk(false)
     { }
     
     /** Construct a QR factorization of the matrix \a mat.
@@ -84,7 +100,7 @@ class SparseQR
       * 
       * \sa compute()
       */
-    SparseQR(const MatrixType& mat) : m_isInitialized(false), m_analysisIsok(false), m_lastError(""), m_useDefaultThreshold(true),m_isQSorted(false),m_isEtreeOk(false)
+    explicit SparseQR(const MatrixType& mat) : m_analysisIsok(false), m_lastError(""), m_useDefaultThreshold(true),m_isQSorted(false),m_isEtreeOk(false)
     {
       compute(mat);
     }
@@ -112,6 +128,17 @@ class SparseQR
     inline Index cols() const { return m_pmat.cols();}
     
     /** \returns a const reference to the \b sparse upper triangular matrix R of the QR factorization.
+      * \warning The entries of the returned matrix are not sorted. This means that using it in algorithms
+      *          expecting sorted entries will fail. This include random coefficient accesses (SpaseMatrix::coeff()),
+      *          and coefficient-wise operations. Matrix products and triangular solves are fine though.
+      *
+      * To sort the entries, you can assign it to a row-major matrix, and if a column-major matrix
+      * is required, you can copy it again:
+      * \code
+      * SparseMatrix<double>          R  = qr.matrixR();  // column-major, not sorted!
+      * SparseMatrix<double,RowMajor> Rr = qr.matrixR();  // row-major, sorted
+      * SparseMatrix<double>          Rc = Rr;            // column-major, sorted
+      * \endcode
       */
     const QRMatrixType& matrixR() const { return m_R; }
     
@@ -119,7 +146,7 @@ class SparseQR
       *
       * \sa setPivotThreshold()
       */
-    Index rank() const 
+    Index rank() const
     {
       eigen_assert(m_isInitialized && "The factorization should be called first, use compute()");
       return m_nonzeropivots; 
@@ -162,7 +189,7 @@ class SparseQR
     
     /** \internal */
     template<typename Rhs, typename Dest>
-    bool _solve(const MatrixBase<Rhs> &B, MatrixBase<Dest> &dest) const
+    bool _solve_impl(const MatrixBase<Rhs> &B, MatrixBase<Dest> &dest) const
     {
       eigen_assert(m_isInitialized && "The factorization should be called first, use compute()");
       eigen_assert(this->rows() == B.rows() && "SparseQR::solve() : invalid number of rows in the right hand side matrix");
@@ -175,10 +202,10 @@ class SparseQR
       b = y;
       
       // Solve with the triangular matrix R
-      y.resize((std::max)(cols(),Index(y.rows())),y.cols());
+      y.resize((std::max<Index>)(cols(),y.rows()),y.cols());
       y.topRows(rank) = this->matrixR().topLeftCorner(rank, rank).template triangularView<Upper>().solve(b.topRows(rank));
       y.bottomRows(y.rows()-rank).setZero();
-
+      
       // Apply the column permutation
       if (m_perm_c.size())  dest = colsPermutation() * y.topRows(cols());
       else                  dest = y.topRows(cols());
@@ -186,7 +213,6 @@ class SparseQR
       m_info = Success;
       return true;
     }
-    
 
     /** Sets the threshold that is used to determine linearly dependent columns during the factorization.
       *
@@ -204,18 +230,18 @@ class SparseQR
       * \sa compute()
       */
     template<typename Rhs>
-    inline const internal::solve_retval<SparseQR, Rhs> solve(const MatrixBase<Rhs>& B) const 
+    inline const Solve<SparseQR, Rhs> solve(const MatrixBase<Rhs>& B) const 
     {
       eigen_assert(m_isInitialized && "The factorization should be called first, use compute()");
       eigen_assert(this->rows() == B.rows() && "SparseQR::solve() : invalid number of rows in the right hand side matrix");
-      return internal::solve_retval<SparseQR, Rhs>(*this, B.derived());
+      return Solve<SparseQR, Rhs>(*this, B.derived());
     }
     template<typename Rhs>
-    inline const internal::sparse_solve_retval<SparseQR, Rhs> solve(const SparseMatrixBase<Rhs>& B) const
+    inline const Solve<SparseQR, Rhs> solve(const SparseMatrixBase<Rhs>& B) const
     {
           eigen_assert(m_isInitialized && "The factorization should be called first, use compute()");
           eigen_assert(this->rows() == B.rows() && "SparseQR::solve() : invalid number of rows in the right hand side matrix");
-          return internal::sparse_solve_retval<SparseQR, Rhs>(*this, B.derived());
+          return Solve<SparseQR, Rhs>(*this, B.derived());
     }
     
     /** \brief Reports whether previous computation was successful.
@@ -232,8 +258,9 @@ class SparseQR
       return m_info;
     }
 
-  protected:
-    inline void sort_matrix_Q()
+
+    /** \internal */
+    inline void _sort_matrix_Q()
     {
       if(this->m_isQSorted) return;
       // The matrix Q is sorted during the transposition
@@ -244,7 +271,6 @@ class SparseQR
 
     
   protected:
-    bool m_isInitialized;
     bool m_analysisIsok;
     bool m_factorizationIsok;
     mutable ComputationInfo m_info;
@@ -258,14 +284,13 @@ class SparseQR
     PermutationType m_outputPerm_c; // The final column permutation
     RealScalar m_threshold;         // Threshold to determine null Householder reflections
     bool m_useDefaultThreshold;     // Use default threshold
-    Index m_nonzeropivots;          // Number of non zero pivots found 
+    Index m_nonzeropivots;          // Number of non zero pivots found
     IndexVector m_etree;            // Column elimination tree
     IndexVector m_firstRowElt;      // First element in each row
     bool m_isQSorted;               // whether Q is sorted or not
     bool m_isEtreeOk;               // whether the elimination tree match the initial input matrix
     
     template <typename, typename > friend struct SparseQR_QProduct;
-    template <typename > friend struct SparseQRMatrixQReturnType;
     
 };
 
@@ -294,7 +319,7 @@ void SparseQR<MatrixType,OrderingType>::analyzePattern(const MatrixType& mat)
   if (!m_perm_c.size())
   {
     m_perm_c.resize(n);
-    m_perm_c.indices().setLinSpaced(n, 0,n-1);
+    m_perm_c.indices().setLinSpaced(n, 0,StorageIndex(n-1));
   }
   
   // Compute the column elimination tree of the permuted matrix
@@ -323,12 +348,11 @@ template <typename MatrixType, typename OrderingType>
 void SparseQR<MatrixType,OrderingType>::factorize(const MatrixType& mat)
 {
   using std::abs;
-  using std::max;
   
   eigen_assert(m_analysisIsok && "analyzePattern() should be called before this step");
-  Index m = mat.rows();
-  Index n = mat.cols();
-  Index diagSize = (std::min)(m,n);
+  StorageIndex m = StorageIndex(mat.rows());
+  StorageIndex n = StorageIndex(mat.cols());
+  StorageIndex diagSize = (std::min)(m,n);
   IndexVector mark((std::max)(m,n)); mark.setConstant(-1);  // Record the visited nodes
   IndexVector Ridx(n), Qidx(m);                             // Store temporarily the row indexes for the current column of R and Q
   Index nzcolR, nzcolQ;                                     // Number of nonzero for the current column of R and Q
@@ -353,7 +377,7 @@ void SparseQR<MatrixType,OrderingType>::factorize(const MatrixType& mat)
     // otherwise directly use the input matrix
     // 
     IndexVector originalOuterIndicesCpy;
-    const Index *originalOuterIndices = mat.outerIndexPtr();
+    const StorageIndex *originalOuterIndices = mat.outerIndexPtr();
     if(MatrixType::IsRowMajor)
     {
       originalOuterIndicesCpy = IndexVector::Map(m_pmat.outerIndexPtr(),n+1);
@@ -375,7 +399,7 @@ void SparseQR<MatrixType,OrderingType>::factorize(const MatrixType& mat)
   if(m_useDefaultThreshold) 
   {
     RealScalar max2Norm = 0.0;
-    for (int j = 0; j < n; j++) max2Norm = (max)(max2Norm, m_pmat.col(j).norm());
+    for (int j = 0; j < n; j++) max2Norm = numext::maxi(max2Norm, m_pmat.col(j).norm());
     if(max2Norm==RealScalar(0))
       max2Norm = RealScalar(1);
     pivotThreshold = 20 * (m + n) * max2Norm * NumTraits<RealScalar>::epsilon();
@@ -384,11 +408,11 @@ void SparseQR<MatrixType,OrderingType>::factorize(const MatrixType& mat)
   // Initialize the numerical permutation
   m_pivotperm.setIdentity(n);
   
-  Index nonzeroCol = 0; // Record the number of valid pivots
+  StorageIndex nonzeroCol = 0; // Record the number of valid pivots
   m_Q.startVec(0);
 
   // Left looking rank-revealing QR factorization: compute a column of R and Q at a time
-  for (Index col = 0; col < n; ++col)
+  for (StorageIndex col = 0; col < n; ++col)
   {
     mark.setConstant(-1);
     m_R.startVec(col);
@@ -404,12 +428,12 @@ void SparseQR<MatrixType,OrderingType>::factorize(const MatrixType& mat)
     // thus the trick with found_diag that permits to do one more iteration on the diagonal element if this one has not been found.
     for (typename QRMatrixType::InnerIterator itp(m_pmat, col); itp || !found_diag; ++itp)
     {
-      Index curIdx = nonzeroCol;
-      if(itp) curIdx = itp.row();
+      StorageIndex curIdx = nonzeroCol;
+      if(itp) curIdx = StorageIndex(itp.row());
       if(curIdx == nonzeroCol) found_diag = true;
       
       // Get the nonzeros indexes of the current column of R
-      Index st = m_firstRowElt(curIdx); // The traversal of the etree starts here 
+      StorageIndex st = m_firstRowElt(curIdx); // The traversal of the etree starts here
       if (st < 0 )
       {
         m_lastError = "Empty row found during numerical factorization";
@@ -466,7 +490,7 @@ void SparseQR<MatrixType,OrderingType>::factorize(const MatrixType& mat)
       {
         for (typename QRMatrixType::InnerIterator itq(m_Q, curIdx); itq; ++itq)
         {
-          Index iQ = itq.row();
+          StorageIndex iQ = StorageIndex(itq.row());
           if (mark(iQ) != col)
           {
             Qidx(nzcolQ++) = iQ;  // Add this row to the pattern of Q,
@@ -476,7 +500,7 @@ void SparseQR<MatrixType,OrderingType>::factorize(const MatrixType& mat)
       }
     } // End update current column
     
-    Scalar tau = 0;
+    Scalar tau = RealScalar(0);
     RealScalar beta = 0;
     
     if(nonzeroCol < diagSize)
@@ -572,40 +596,11 @@ void SparseQR<MatrixType,OrderingType>::factorize(const MatrixType& mat)
   m_info = Success;
 }
 
-namespace internal {
-  
-template<typename _MatrixType, typename OrderingType, typename Rhs>
-struct solve_retval<SparseQR<_MatrixType,OrderingType>, Rhs>
-  : solve_retval_base<SparseQR<_MatrixType,OrderingType>, Rhs>
-{
-  typedef SparseQR<_MatrixType,OrderingType> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
-  }
-};
-template<typename _MatrixType, typename OrderingType, typename Rhs>
-struct sparse_solve_retval<SparseQR<_MatrixType, OrderingType>, Rhs>
- : sparse_solve_retval_base<SparseQR<_MatrixType, OrderingType>, Rhs>
-{
-  typedef SparseQR<_MatrixType, OrderingType> Dec;
-  EIGEN_MAKE_SPARSE_SOLVE_HELPERS(Dec, Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    this->defaultEvalTo(dst);
-  }
-};
-} // end namespace internal
-
 template <typename SparseQRType, typename Derived>
 struct SparseQR_QProduct : ReturnByValue<SparseQR_QProduct<SparseQRType, Derived> >
 {
   typedef typename SparseQRType::QRMatrixType MatrixType;
   typedef typename SparseQRType::Scalar Scalar;
-  typedef typename SparseQRType::Index Index;
   // Get the references 
   SparseQR_QProduct(const SparseQRType& qr, const Derived& other, bool transpose) : 
   m_qr(qr),m_other(other),m_transpose(transpose) {}
@@ -661,10 +656,13 @@ struct SparseQR_QProduct : ReturnByValue<SparseQR_QProduct<SparseQRType, Derived
 template<typename SparseQRType>
 struct SparseQRMatrixQReturnType : public EigenBase<SparseQRMatrixQReturnType<SparseQRType> >
 {  
-  typedef typename SparseQRType::Index Index;
   typedef typename SparseQRType::Scalar Scalar;
   typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
-  SparseQRMatrixQReturnType(const SparseQRType& qr) : m_qr(qr) {}
+  enum {
+    RowsAtCompileTime = Dynamic,
+    ColsAtCompileTime = Dynamic
+  };
+  explicit SparseQRMatrixQReturnType(const SparseQRType& qr) : m_qr(qr) {}
   template<typename Derived>
   SparseQR_QProduct<SparseQRType, Derived> operator*(const MatrixBase<Derived>& other)
   {
@@ -681,26 +679,13 @@ struct SparseQRMatrixQReturnType : public EigenBase<SparseQRMatrixQReturnType<Sp
   {
     return SparseQRMatrixQTransposeReturnType<SparseQRType>(m_qr);
   }
-  template<typename Dest> void evalTo(MatrixBase<Dest>& dest) const
-  {
-    dest.derived() = m_qr.matrixQ() * Dest::Identity(m_qr.rows(), m_qr.rows());
-  }
-  template<typename Dest> void evalTo(SparseMatrixBase<Dest>& dest) const
-  {
-    Dest idMat(m_qr.rows(), m_qr.rows());
-    idMat.setIdentity();
-    // Sort the sparse householder reflectors if needed
-    const_cast<SparseQRType *>(&m_qr)->sort_matrix_Q();
-    dest.derived() = SparseQR_QProduct<SparseQRType, Dest>(m_qr, idMat, false);
-  }
-
   const SparseQRType& m_qr;
 };
 
 template<typename SparseQRType>
 struct SparseQRMatrixQTransposeReturnType
 {
-  SparseQRMatrixQTransposeReturnType(const SparseQRType& qr) : m_qr(qr) {}
+  explicit SparseQRMatrixQTransposeReturnType(const SparseQRType& qr) : m_qr(qr) {}
   template<typename Derived>
   SparseQR_QProduct<SparseQRType,Derived> operator*(const MatrixBase<Derived>& other)
   {
@@ -709,6 +694,46 @@ struct SparseQRMatrixQTransposeReturnType
   const SparseQRType& m_qr;
 };
 
+namespace internal {
+  
+template<typename SparseQRType>
+struct evaluator_traits<SparseQRMatrixQReturnType<SparseQRType> >
+{
+  typedef typename SparseQRType::MatrixType MatrixType;
+  typedef typename storage_kind_to_evaluator_kind<typename MatrixType::StorageKind>::Kind Kind;
+  typedef SparseShape Shape;
+};
+
+template< typename DstXprType, typename SparseQRType>
+struct Assignment<DstXprType, SparseQRMatrixQReturnType<SparseQRType>, internal::assign_op<typename DstXprType::Scalar,typename DstXprType::Scalar>, Sparse2Sparse>
+{
+  typedef SparseQRMatrixQReturnType<SparseQRType> SrcXprType;
+  typedef typename DstXprType::Scalar Scalar;
+  typedef typename DstXprType::StorageIndex StorageIndex;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &/*func*/)
+  {
+    typename DstXprType::PlainObject idMat(src.m_qr.rows(), src.m_qr.rows());
+    idMat.setIdentity();
+    // Sort the sparse householder reflectors if needed
+    const_cast<SparseQRType *>(&src.m_qr)->_sort_matrix_Q();
+    dst = SparseQR_QProduct<SparseQRType, DstXprType>(src.m_qr, idMat, false);
+  }
+};
+
+template< typename DstXprType, typename SparseQRType>
+struct Assignment<DstXprType, SparseQRMatrixQReturnType<SparseQRType>, internal::assign_op<typename DstXprType::Scalar,typename DstXprType::Scalar>, Sparse2Dense>
+{
+  typedef SparseQRMatrixQReturnType<SparseQRType> SrcXprType;
+  typedef typename DstXprType::Scalar Scalar;
+  typedef typename DstXprType::StorageIndex StorageIndex;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &/*func*/)
+  {
+    dst = src.m_qr.matrixQ() * DstXprType::Identity(src.m_qr.rows(), src.m_qr.rows());
+  }
+};
+
+} // end namespace internal
+
 } // end namespace Eigen
 
 #endif
diff --git a/Eigen/src/StlSupport/CMakeLists.txt b/Eigen/src/StlSupport/CMakeLists.txt
deleted file mode 100644
index 0f094f637..000000000
--- a/Eigen/src/StlSupport/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_StlSupport_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_StlSupport_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/StlSupport COMPONENT Devel
-  )
diff --git a/Eigen/src/StlSupport/StdDeque.h b/Eigen/src/StlSupport/StdDeque.h
index aaf66330b..cf1fedf92 100644
--- a/Eigen/src/StlSupport/StdDeque.h
+++ b/Eigen/src/StlSupport/StdDeque.h
@@ -13,32 +13,24 @@
 
 #include "details.h"
 
-// Define the explicit instantiation (e.g. necessary for the Intel compiler)
-#if defined(__INTEL_COMPILER) || defined(__GNUC__)
-  #define EIGEN_EXPLICIT_STL_DEQUE_INSTANTIATION(...) template class std::deque<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> >;
-#else
-  #define EIGEN_EXPLICIT_STL_DEQUE_INSTANTIATION(...)
-#endif
-
 /**
  * This section contains a convenience MACRO which allows an easy specialization of
  * std::deque such that for data types with alignment issues the correct allocator
  * is used automatically.
  */
 #define EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(...) \
-EIGEN_EXPLICIT_STL_DEQUE_INSTANTIATION(__VA_ARGS__) \
 namespace std \
 { \
-  template<typename _Ay> \
-  class deque<__VA_ARGS__, _Ay>  \
+  template<> \
+  class deque<__VA_ARGS__, std::allocator<__VA_ARGS__> >           \
     : public deque<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> > \
   { \
     typedef deque<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> > deque_base; \
   public: \
     typedef __VA_ARGS__ value_type; \
-    typedef typename deque_base::allocator_type allocator_type; \
-    typedef typename deque_base::size_type size_type;  \
-    typedef typename deque_base::iterator iterator;  \
+    typedef deque_base::allocator_type allocator_type; \
+    typedef deque_base::size_type size_type;  \
+    typedef deque_base::iterator iterator;  \
     explicit deque(const allocator_type& a = allocator_type()) : deque_base(a) {}  \
     template<typename InputIterator> \
     deque(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) : deque_base(first, last, a) {} \
@@ -53,7 +45,7 @@ namespace std \
 }
 
 // check whether we really need the std::deque specialization
-#if !(defined(_GLIBCXX_DEQUE) && (!EIGEN_GNUC_AT_LEAST(4,1))) /* Note that before gcc-4.1 we already have: std::deque::resize(size_type,const T&). */
+#if !EIGEN_HAS_CXX11_CONTAINERS && !(defined(_GLIBCXX_DEQUE) && (!EIGEN_GNUC_AT_LEAST(4,1))) /* Note that before gcc-4.1 we already have: std::deque::resize(size_type,const T&). */
 
 namespace std {
 
diff --git a/Eigen/src/StlSupport/StdList.h b/Eigen/src/StlSupport/StdList.h
index 3c742430c..e1eba4985 100644
--- a/Eigen/src/StlSupport/StdList.h
+++ b/Eigen/src/StlSupport/StdList.h
@@ -12,32 +12,24 @@
 
 #include "details.h"
 
-// Define the explicit instantiation (e.g. necessary for the Intel compiler)
-#if defined(__INTEL_COMPILER) || defined(__GNUC__)
-  #define EIGEN_EXPLICIT_STL_LIST_INSTANTIATION(...) template class std::list<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> >;
-#else
-  #define EIGEN_EXPLICIT_STL_LIST_INSTANTIATION(...)
-#endif
-
 /**
  * This section contains a convenience MACRO which allows an easy specialization of
  * std::list such that for data types with alignment issues the correct allocator
  * is used automatically.
  */
 #define EIGEN_DEFINE_STL_LIST_SPECIALIZATION(...) \
-EIGEN_EXPLICIT_STL_LIST_INSTANTIATION(__VA_ARGS__) \
 namespace std \
 { \
-  template<typename _Ay> \
-  class list<__VA_ARGS__, _Ay>  \
+  template<> \
+  class list<__VA_ARGS__, std::allocator<__VA_ARGS__> >           \
     : public list<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> > \
   { \
     typedef list<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> > list_base; \
   public: \
     typedef __VA_ARGS__ value_type; \
-    typedef typename list_base::allocator_type allocator_type; \
-    typedef typename list_base::size_type size_type;  \
-    typedef typename list_base::iterator iterator;  \
+    typedef list_base::allocator_type allocator_type; \
+    typedef list_base::size_type size_type;  \
+    typedef list_base::iterator iterator;  \
     explicit list(const allocator_type& a = allocator_type()) : list_base(a) {}  \
     template<typename InputIterator> \
     list(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) : list_base(first, last, a) {} \
@@ -51,8 +43,8 @@ namespace std \
   }; \
 }
 
-// check whether we really need the std::vector specialization
-#if !(defined(_GLIBCXX_VECTOR) && (!EIGEN_GNUC_AT_LEAST(4,1))) /* Note that before gcc-4.1 we already have: std::list::resize(size_type,const T&). */
+// check whether we really need the std::list specialization
+#if !EIGEN_HAS_CXX11_CONTAINERS && !(defined(_GLIBCXX_LIST) && (!EIGEN_GNUC_AT_LEAST(4,1))) /* Note that before gcc-4.1 we already have: std::list::resize(size_type,const T&). */
 
 namespace std
 {
diff --git a/Eigen/src/StlSupport/StdVector.h b/Eigen/src/StlSupport/StdVector.h
index 611664a2e..ec22821d2 100644
--- a/Eigen/src/StlSupport/StdVector.h
+++ b/Eigen/src/StlSupport/StdVector.h
@@ -44,6 +44,9 @@ namespace std \
   }; \
 }
 
+// Don't specialize if containers are implemented according to C++11
+#if !EIGEN_HAS_CXX11_CONTAINERS
+
 namespace std {
 
 #define EIGEN_STD_VECTOR_SPECIALIZATION_BODY \
@@ -122,5 +125,7 @@ namespace std {
 #endif
   };
 }
+#endif // !EIGEN_HAS_CXX11_CONTAINERS
+
 
 #endif // EIGEN_STDVECTOR_H
diff --git a/Eigen/src/StlSupport/details.h b/Eigen/src/StlSupport/details.h
index d8debc7c4..2cfd13e03 100644
--- a/Eigen/src/StlSupport/details.h
+++ b/Eigen/src/StlSupport/details.h
@@ -22,13 +22,13 @@ namespace Eigen {
   class aligned_allocator_indirection : public EIGEN_ALIGNED_ALLOCATOR<T>
   {
   public:
-    typedef size_t    size_type;
-    typedef ptrdiff_t difference_type;
-    typedef T*        pointer;
-    typedef const T*  const_pointer;
-    typedef T&        reference;
-    typedef const T&  const_reference;
-    typedef T         value_type;
+    typedef std::size_t     size_type;
+    typedef std::ptrdiff_t  difference_type;
+    typedef T*              pointer;
+    typedef const T*        const_pointer;
+    typedef T&              reference;
+    typedef const T&        const_reference;
+    typedef T               value_type;
 
     template<class U>
     struct rebind
@@ -46,7 +46,7 @@ namespace Eigen {
     ~aligned_allocator_indirection() {}
   };
 
-#ifdef _MSC_VER
+#if EIGEN_COMP_MSVC
 
   // sometimes, MSVC detects, at compile time, that the argument x
   // in std::vector::resize(size_t s,T x) won't be aligned and generate an error
diff --git a/Eigen/src/SuperLUSupport/CMakeLists.txt b/Eigen/src/SuperLUSupport/CMakeLists.txt
deleted file mode 100644
index b28ebe583..000000000
--- a/Eigen/src/SuperLUSupport/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_SuperLUSupport_SRCS "*.h")
-
-INSTALL(FILES 
-  ${Eigen_SuperLUSupport_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/SuperLUSupport COMPONENT Devel
-  )
diff --git a/Eigen/src/SuperLUSupport/SuperLUSupport.h b/Eigen/src/SuperLUSupport/SuperLUSupport.h
index bcb355760..50a69f306 100644
--- a/Eigen/src/SuperLUSupport/SuperLUSupport.h
+++ b/Eigen/src/SuperLUSupport/SuperLUSupport.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,16 +10,16 @@
 #ifndef EIGEN_SUPERLUSUPPORT_H
 #define EIGEN_SUPERLUSUPPORT_H
 
-namespace Eigen { 
+namespace Eigen {
 
+#if defined(SUPERLU_MAJOR_VERSION) && (SUPERLU_MAJOR_VERSION >= 5)
 #define DECL_GSSVX(PREFIX,FLOATTYPE,KEYTYPE)		\
     extern "C" {                                                                                          \
-      typedef struct { FLOATTYPE for_lu; FLOATTYPE total_needed; int expansions; } PREFIX##mem_usage_t;   \
       extern void PREFIX##gssvx(superlu_options_t *, SuperMatrix *, int *, int *, int *,                  \
                                 char *, FLOATTYPE *, FLOATTYPE *, SuperMatrix *, SuperMatrix *,           \
                                 void *, int, SuperMatrix *, SuperMatrix *,                                \
                                 FLOATTYPE *, FLOATTYPE *, FLOATTYPE *, FLOATTYPE *,                       \
-                                PREFIX##mem_usage_t *, SuperLUStat_t *, int *);                           \
+                                GlobalLU_t *, mem_usage_t *, SuperLUStat_t *, int *);                     \
     }                                                                                                     \
     inline float SuperLU_gssvx(superlu_options_t *options, SuperMatrix *A,                                \
          int *perm_c, int *perm_r, int *etree, char *equed,                                               \
@@ -29,12 +29,37 @@ namespace Eigen {
          FLOATTYPE *recip_pivot_growth,                                                                   \
          FLOATTYPE *rcond, FLOATTYPE *ferr, FLOATTYPE *berr,                                              \
          SuperLUStat_t *stats, int *info, KEYTYPE) {                                                      \
-    PREFIX##mem_usage_t mem_usage;                                                                        \
+    mem_usage_t mem_usage;                                                                                \
+    GlobalLU_t gLU;                                                                                       \
+    PREFIX##gssvx(options, A, perm_c, perm_r, etree, equed, R, C, L,                                      \
+         U, work, lwork, B, X, recip_pivot_growth, rcond,                                                 \
+         ferr, berr, &gLU, &mem_usage, stats, info);                                                      \
+    return mem_usage.for_lu; /* bytes used by the factor storage */                                       \
+  }
+#else // version < 5.0
+#define DECL_GSSVX(PREFIX,FLOATTYPE,KEYTYPE)		\
+    extern "C" {                                                                                          \
+      extern void PREFIX##gssvx(superlu_options_t *, SuperMatrix *, int *, int *, int *,                  \
+                                char *, FLOATTYPE *, FLOATTYPE *, SuperMatrix *, SuperMatrix *,           \
+                                void *, int, SuperMatrix *, SuperMatrix *,                                \
+                                FLOATTYPE *, FLOATTYPE *, FLOATTYPE *, FLOATTYPE *,                       \
+                                mem_usage_t *, SuperLUStat_t *, int *);                                   \
+    }                                                                                                     \
+    inline float SuperLU_gssvx(superlu_options_t *options, SuperMatrix *A,                                \
+         int *perm_c, int *perm_r, int *etree, char *equed,                                               \
+         FLOATTYPE *R, FLOATTYPE *C, SuperMatrix *L,                                                      \
+         SuperMatrix *U, void *work, int lwork,                                                           \
+         SuperMatrix *B, SuperMatrix *X,                                                                  \
+         FLOATTYPE *recip_pivot_growth,                                                                   \
+         FLOATTYPE *rcond, FLOATTYPE *ferr, FLOATTYPE *berr,                                              \
+         SuperLUStat_t *stats, int *info, KEYTYPE) {                                                      \
+    mem_usage_t mem_usage;                                                                                \
     PREFIX##gssvx(options, A, perm_c, perm_r, etree, equed, R, C, L,                                      \
          U, work, lwork, B, X, recip_pivot_growth, rcond,                                                 \
          ferr, berr, &mem_usage, stats, info);                                                            \
     return mem_usage.for_lu; /* bytes used by the factor storage */                                       \
   }
+#endif
 
 DECL_GSSVX(s,float,float)
 DECL_GSSVX(c,float,std::complex<float>)
@@ -53,7 +78,7 @@ DECL_GSSVX(z,double,std::complex<double>)
       extern void PREFIX##gsisx(superlu_options_t *, SuperMatrix *, int *, int *, int *,        \
                          char *, FLOATTYPE *, FLOATTYPE *, SuperMatrix *, SuperMatrix *,        \
                          void *, int, SuperMatrix *, SuperMatrix *, FLOATTYPE *, FLOATTYPE *,   \
-                         PREFIX##mem_usage_t *, SuperLUStat_t *, int *);                        \
+                         mem_usage_t *, SuperLUStat_t *, int *);                        \
     }                                                                                           \
     inline float SuperLU_gsisx(superlu_options_t *options, SuperMatrix *A,                      \
          int *perm_c, int *perm_r, int *etree, char *equed,                                     \
@@ -63,7 +88,7 @@ DECL_GSSVX(z,double,std::complex<double>)
          FLOATTYPE *recip_pivot_growth,                                                         \
          FLOATTYPE *rcond,                                                                      \
          SuperLUStat_t *stats, int *info, KEYTYPE) {                                            \
-    PREFIX##mem_usage_t mem_usage;                                                              \
+    mem_usage_t mem_usage;                                                              \
     PREFIX##gsisx(options, A, perm_c, perm_r, etree, equed, R, C, L,                            \
          U, work, lwork, B, X, recip_pivot_growth, rcond,                                       \
          &mem_usage, stats, info);                                                              \
@@ -156,37 +181,38 @@ struct SluMatrix : SuperMatrix
     res.setScalarType<typename MatrixType::Scalar>();
     res.Mtype     = SLU_GE;
 
-    res.nrow      = mat.rows();
-    res.ncol      = mat.cols();
+    res.nrow      = internal::convert_index<int>(mat.rows());
+    res.ncol      = internal::convert_index<int>(mat.cols());
 
-    res.storage.lda       = MatrixType::IsVectorAtCompileTime ? mat.size() : mat.outerStride();
+    res.storage.lda       = internal::convert_index<int>(MatrixType::IsVectorAtCompileTime ? mat.size() : mat.outerStride());
     res.storage.values    = (void*)(mat.data());
     return res;
   }
 
   template<typename MatrixType>
-  static SluMatrix Map(SparseMatrixBase<MatrixType>& mat)
+  static SluMatrix Map(SparseMatrixBase<MatrixType>& a_mat)
   {
+    MatrixType &mat(a_mat.derived());
     SluMatrix res;
     if ((MatrixType::Flags&RowMajorBit)==RowMajorBit)
     {
       res.setStorageType(SLU_NR);
-      res.nrow      = mat.cols();
-      res.ncol      = mat.rows();
+      res.nrow      = internal::convert_index<int>(mat.cols());
+      res.ncol      = internal::convert_index<int>(mat.rows());
     }
     else
     {
       res.setStorageType(SLU_NC);
-      res.nrow      = mat.rows();
-      res.ncol      = mat.cols();
+      res.nrow      = internal::convert_index<int>(mat.rows());
+      res.ncol      = internal::convert_index<int>(mat.cols());
     }
 
     res.Mtype       = SLU_GE;
 
-    res.storage.nnz       = mat.nonZeros();
-    res.storage.values    = mat.derived().valuePtr();
-    res.storage.innerInd  = mat.derived().innerIndexPtr();
-    res.storage.outerInd  = mat.derived().outerIndexPtr();
+    res.storage.nnz       = internal::convert_index<int>(mat.nonZeros());
+    res.storage.values    = mat.valuePtr();
+    res.storage.innerInd  = mat.innerIndexPtr();
+    res.storage.outerInd  = mat.outerIndexPtr();
 
     res.setScalarType<typename MatrixType::Scalar>();
 
@@ -288,17 +314,26 @@ MappedSparseMatrix<Scalar,Flags,Index> map_superlu(SluMatrix& sluMat)
   * \brief The base class for the direct and incomplete LU factorization of SuperLU
   */
 template<typename _MatrixType, typename Derived>
-class SuperLUBase : internal::noncopyable
+class SuperLUBase : public SparseSolverBase<Derived>
 {
+  protected:
+    typedef SparseSolverBase<Derived> Base;
+    using Base::derived;
+    using Base::m_isInitialized;
   public:
     typedef _MatrixType MatrixType;
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
+    typedef typename MatrixType::StorageIndex StorageIndex;
     typedef Matrix<Scalar,Dynamic,1> Vector;
     typedef Matrix<int, 1, MatrixType::ColsAtCompileTime> IntRowVectorType;
     typedef Matrix<int, MatrixType::RowsAtCompileTime, 1> IntColVectorType;    
+    typedef Map<PermutationMatrix<Dynamic,Dynamic,int> > PermutationMap;
     typedef SparseMatrix<Scalar> LUMatrixType;
+    enum {
+      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+    };
 
   public:
 
@@ -309,9 +344,6 @@ class SuperLUBase : internal::noncopyable
       clearFactors();
     }
     
-    Derived& derived() { return *static_cast<Derived*>(this); }
-    const Derived& derived() const { return *static_cast<const Derived*>(this); }
-    
     inline Index rows() const { return m_matrix.rows(); }
     inline Index cols() const { return m_matrix.cols(); }
     
@@ -335,33 +367,7 @@ class SuperLUBase : internal::noncopyable
       derived().analyzePattern(matrix);
       derived().factorize(matrix);
     }
-    
-    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::solve_retval<SuperLUBase, Rhs> solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "SuperLU is not initialized.");
-      eigen_assert(rows()==b.rows()
-                && "SuperLU::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::solve_retval<SuperLUBase, Rhs>(*this, b.derived());
-    }
-    
-    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::sparse_solve_retval<SuperLUBase, Rhs> solve(const SparseMatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "SuperLU is not initialized.");
-      eigen_assert(rows()==b.rows()
-                && "SuperLU::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::sparse_solve_retval<SuperLUBase, Rhs>(*this, b.derived());
-    }
-    
+
     /** Performs a symbolic decomposition on the sparcity of \a matrix.
       *
       * This function is particularly useful when solving for several problems having the same structure.
@@ -386,7 +392,7 @@ class SuperLUBase : internal::noncopyable
     {
       set_default_options(&this->m_sluOptions);
       
-      const int size = a.rows();
+      const Index size = a.rows();
       m_matrix = a;
 
       m_sluA = internal::asSluMatrix(m_matrix);
@@ -405,7 +411,7 @@ class SuperLUBase : internal::noncopyable
       m_sluB.storage.values = 0;
       m_sluB.nrow           = 0;
       m_sluB.ncol           = 0;
-      m_sluB.storage.lda    = size;
+      m_sluB.storage.lda    = internal::convert_index<int>(size);
       m_sluX                = m_sluB;
       
       m_extractedDataAreDirty = true;
@@ -453,7 +459,6 @@ class SuperLUBase : internal::noncopyable
     mutable char m_sluEqued;
 
     mutable ComputationInfo m_info;
-    bool m_isInitialized;
     int m_factorizationIsOk;
     int m_analysisIsOk;
     mutable bool m_extractedDataAreDirty;
@@ -473,7 +478,11 @@ class SuperLUBase : internal::noncopyable
   *
   * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
   *
-  * \sa \ref TutorialSparseDirectSolvers
+  * \warning This class is only for the 4.x versions of SuperLU. The 3.x and 5.x versions are not supported.
+  *
+  * \implsparsesolverconcept
+  *
+  * \sa \ref TutorialSparseSolverConcept, class SparseLU
   */
 template<typename _MatrixType>
 class SuperLU : public SuperLUBase<_MatrixType,SuperLU<_MatrixType> >
@@ -483,18 +492,20 @@ class SuperLU : public SuperLUBase<_MatrixType,SuperLU<_MatrixType> >
     typedef _MatrixType MatrixType;
     typedef typename Base::Scalar Scalar;
     typedef typename Base::RealScalar RealScalar;
-    typedef typename Base::Index Index;
+    typedef typename Base::StorageIndex StorageIndex;
     typedef typename Base::IntRowVectorType IntRowVectorType;
-    typedef typename Base::IntColVectorType IntColVectorType;    
+    typedef typename Base::IntColVectorType IntColVectorType;   
+    typedef typename Base::PermutationMap PermutationMap;
     typedef typename Base::LUMatrixType LUMatrixType;
     typedef TriangularView<LUMatrixType, Lower|UnitDiag>  LMatrixType;
-    typedef TriangularView<LUMatrixType,  Upper>           UMatrixType;
+    typedef TriangularView<LUMatrixType,  Upper>          UMatrixType;
 
   public:
+    using Base::_solve_impl;
 
     SuperLU() : Base() { init(); }
 
-    SuperLU(const MatrixType& matrix) : Base()
+    explicit SuperLU(const MatrixType& matrix) : Base()
     {
       init();
       Base::compute(matrix);
@@ -525,11 +536,9 @@ class SuperLU : public SuperLUBase<_MatrixType,SuperLU<_MatrixType> >
       */
     void factorize(const MatrixType& matrix);
     
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
     /** \internal */
     template<typename Rhs,typename Dest>
-    void _solve(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const;
-    #endif // EIGEN_PARSED_BY_DOXYGEN
+    void _solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const;
     
     inline const LMatrixType& matrixL() const
     {
@@ -637,12 +646,12 @@ void SuperLU<MatrixType>::factorize(const MatrixType& a)
 
 template<typename MatrixType>
 template<typename Rhs,typename Dest>
-void SuperLU<MatrixType>::_solve(const MatrixBase<Rhs> &b, MatrixBase<Dest>& x) const
+void SuperLU<MatrixType>::_solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest>& x) const
 {
   eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or analyzePattern()/factorize()");
 
-  const int size = m_matrix.rows();
-  const int rhsCols = b.cols();
+  const Index size = m_matrix.rows();
+  const Index rhsCols = b.cols();
   eigen_assert(size==b.rows());
 
   m_sluOptions.Trans = NOTRANS;
@@ -652,8 +661,12 @@ void SuperLU<MatrixType>::_solve(const MatrixBase<Rhs> &b, MatrixBase<Dest>& x)
 
   m_sluFerr.resize(rhsCols);
   m_sluBerr.resize(rhsCols);
-  m_sluB = SluMatrix::Map(b.const_cast_derived());
-  m_sluX = SluMatrix::Map(x.derived());
+  
+  Ref<const Matrix<typename Rhs::Scalar,Dynamic,Dynamic,ColMajor> > b_ref(b);
+  Ref<const Matrix<typename Dest::Scalar,Dynamic,Dynamic,ColMajor> > x_ref(x);
+  
+  m_sluB = SluMatrix::Map(b_ref.const_cast_derived());
+  m_sluX = SluMatrix::Map(x_ref.const_cast_derived());
   
   typename Rhs::PlainObject b_cpy;
   if(m_sluEqued!='N')
@@ -676,6 +689,10 @@ void SuperLU<MatrixType>::_solve(const MatrixBase<Rhs> &b, MatrixBase<Dest>& x)
                 &m_sluFerr[0], &m_sluBerr[0],
                 &m_sluStat, &info, Scalar());
   StatFree(&m_sluStat);
+  
+  if(x.derived().data() != x_ref.data())
+    x = x_ref;
+  
   m_info = info==0 ? Success : NumericalIssue;
 }
 
@@ -699,7 +716,7 @@ void SuperLUBase<MatrixType,Derived>::extractData() const
     NCformat    *Ustore = static_cast<NCformat*>(m_sluU.Store);
     Scalar      *SNptr;
 
-    const int size = m_matrix.rows();
+    const Index size = m_matrix.rows();
     m_l.resize(size,size);
     m_l.resizeNonZeros(Lstore->nnz);
     m_u.resize(size,size);
@@ -791,6 +808,8 @@ typename SuperLU<MatrixType>::Scalar SuperLU<MatrixType>::determinant() const
         det *= m_u.valuePtr()[lastId];
     }
   }
+  if(PermutationMap(m_p.data(),m_p.size()).determinant()*PermutationMap(m_q.data(),m_q.size()).determinant()<0)
+    det = -det;
   if(m_sluEqued!='N')
     return det/m_sluRscale.prod()/m_sluCscale.prod();
   else
@@ -810,11 +829,13 @@ typename SuperLU<MatrixType>::Scalar SuperLU<MatrixType>::determinant() const
   * This class allows to solve for an approximate solution of A.X = B sparse linear problems via an incomplete LU factorization
   * using the SuperLU library. This class is aimed to be used as a preconditioner of the iterative linear solvers.
   *
-  * \warning This class requires SuperLU 4 or later.
+  * \warning This class is only for the 4.x versions of SuperLU. The 3.x and 5.x versions are not supported.
   *
   * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
   *
-  * \sa \ref TutorialSparseDirectSolvers, class ConjugateGradient, class BiCGSTAB
+  * \implsparsesolverconcept
+  *
+  * \sa \ref TutorialSparseSolverConcept, class IncompleteLUT, class ConjugateGradient, class BiCGSTAB
   */
 
 template<typename _MatrixType>
@@ -825,9 +846,9 @@ class SuperILU : public SuperLUBase<_MatrixType,SuperILU<_MatrixType> >
     typedef _MatrixType MatrixType;
     typedef typename Base::Scalar Scalar;
     typedef typename Base::RealScalar RealScalar;
-    typedef typename Base::Index Index;
 
   public:
+    using Base::_solve_impl;
 
     SuperILU() : Base() { init(); }
 
@@ -863,7 +884,7 @@ class SuperILU : public SuperLUBase<_MatrixType,SuperILU<_MatrixType> >
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     /** \internal */
     template<typename Rhs,typename Dest>
-    void _solve(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const;
+    void _solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const;
     #endif // EIGEN_PARSED_BY_DOXYGEN
     
   protected:
@@ -946,9 +967,10 @@ void SuperILU<MatrixType>::factorize(const MatrixType& a)
   m_factorizationIsOk = true;
 }
 
+#ifndef EIGEN_PARSED_BY_DOXYGEN
 template<typename MatrixType>
 template<typename Rhs,typename Dest>
-void SuperILU<MatrixType>::_solve(const MatrixBase<Rhs> &b, MatrixBase<Dest>& x) const
+void SuperILU<MatrixType>::_solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest>& x) const
 {
   eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or analyzePattern()/factorize()");
 
@@ -962,8 +984,12 @@ void SuperILU<MatrixType>::_solve(const MatrixBase<Rhs> &b, MatrixBase<Dest>& x)
 
   m_sluFerr.resize(rhsCols);
   m_sluBerr.resize(rhsCols);
-  m_sluB = SluMatrix::Map(b.const_cast_derived());
-  m_sluX = SluMatrix::Map(x.derived());
+  
+  Ref<const Matrix<typename Rhs::Scalar,Dynamic,Dynamic,ColMajor> > b_ref(b);
+  Ref<const Matrix<typename Dest::Scalar,Dynamic,Dynamic,ColMajor> > x_ref(x);
+  
+  m_sluB = SluMatrix::Map(b_ref.const_cast_derived());
+  m_sluX = SluMatrix::Map(x_ref.const_cast_derived());
 
   typename Rhs::PlainObject b_cpy;
   if(m_sluEqued!='N')
@@ -986,40 +1012,15 @@ void SuperILU<MatrixType>::_solve(const MatrixBase<Rhs> &b, MatrixBase<Dest>& x)
                 &recip_pivot_growth, &rcond,
                 &m_sluStat, &info, Scalar());
   StatFree(&m_sluStat);
+  
+  if(x.derived().data() != x_ref.data())
+    x = x_ref;
 
   m_info = info==0 ? Success : NumericalIssue;
 }
 #endif
 
-namespace internal {
-  
-template<typename _MatrixType, typename Derived, typename Rhs>
-struct solve_retval<SuperLUBase<_MatrixType,Derived>, Rhs>
-  : solve_retval_base<SuperLUBase<_MatrixType,Derived>, Rhs>
-{
-  typedef SuperLUBase<_MatrixType,Derived> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec().derived()._solve(rhs(),dst);
-  }
-};
-
-template<typename _MatrixType, typename Derived, typename Rhs>
-struct sparse_solve_retval<SuperLUBase<_MatrixType,Derived>, Rhs>
-  : sparse_solve_retval_base<SuperLUBase<_MatrixType,Derived>, Rhs>
-{
-  typedef SuperLUBase<_MatrixType,Derived> Dec;
-  EIGEN_MAKE_SPARSE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    this->defaultEvalTo(dst);
-  }
-};
-
-} // end namespace internal
+#endif
 
 } // end namespace Eigen
 
diff --git a/Eigen/src/UmfPackSupport/CMakeLists.txt b/Eigen/src/UmfPackSupport/CMakeLists.txt
deleted file mode 100644
index a57de0020..000000000
--- a/Eigen/src/UmfPackSupport/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_UmfPackSupport_SRCS "*.h")
-
-INSTALL(FILES 
-  ${Eigen_UmfPackSupport_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/UmfPackSupport COMPONENT Devel
-  )
diff --git a/Eigen/src/UmfPackSupport/UmfPackSupport.h b/Eigen/src/UmfPackSupport/UmfPackSupport.h
index 29c60c378..dc74de935 100644
--- a/Eigen/src/UmfPackSupport/UmfPackSupport.h
+++ b/Eigen/src/UmfPackSupport/UmfPackSupport.h
@@ -16,6 +16,13 @@ namespace Eigen {
 
 // generic double/complex<double> wrapper functions:
 
+
+inline void umfpack_defaults(double control[UMFPACK_CONTROL], double) 
+{ umfpack_di_defaults(control); }
+
+inline void umfpack_defaults(double control[UMFPACK_CONTROL], std::complex<double>) 
+{ umfpack_zi_defaults(control); }
+
 inline void umfpack_free_numeric(void **Numeric, double)
 { umfpack_di_free_numeric(Numeric); *Numeric = 0; }
 
@@ -107,15 +114,6 @@ inline int umfpack_get_determinant(std::complex<double> *Mx, double *Ex, void *N
   return umfpack_zi_get_determinant(&mx_real,0,Ex,NumericHandle,User_Info);
 }
 
-namespace internal {
-  template<typename T> struct umfpack_helper_is_sparse_plain : false_type {};
-  template<typename Scalar, int Options, typename StorageIndex>
-  struct umfpack_helper_is_sparse_plain<SparseMatrix<Scalar,Options,StorageIndex> >
-    : true_type {};
-  template<typename Scalar, int Options, typename StorageIndex>
-  struct umfpack_helper_is_sparse_plain<MappedSparseMatrix<Scalar,Options,StorageIndex> >
-    : true_type {};
-}
 
 /** \ingroup UmfPackSupport_Module
   * \brief A sparse LU factorization and solver based on UmfPack
@@ -128,27 +126,46 @@ namespace internal {
   * Otherwise an expensive copy will be made. You can call the inexpensive makeCompressed() to get a compressed matrix.
   * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
   *
-  * \sa \ref TutorialSparseDirectSolvers
+  * \implsparsesolverconcept
+  *
+  * \sa \ref TutorialSparseSolverConcept, class SparseLU
   */
 template<typename _MatrixType>
-class UmfPackLU : internal::noncopyable
+class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
 {
+  protected:
+    typedef SparseSolverBase<UmfPackLU<_MatrixType> > Base;
+    using Base::m_isInitialized;
   public:
+    using Base::_solve_impl;
     typedef _MatrixType MatrixType;
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
+    typedef typename MatrixType::StorageIndex StorageIndex;
     typedef Matrix<Scalar,Dynamic,1> Vector;
     typedef Matrix<int, 1, MatrixType::ColsAtCompileTime> IntRowVectorType;
     typedef Matrix<int, MatrixType::RowsAtCompileTime, 1> IntColVectorType;
     typedef SparseMatrix<Scalar> LUMatrixType;
     typedef SparseMatrix<Scalar,ColMajor,int> UmfpackMatrixType;
+    typedef Ref<const UmfpackMatrixType, StandardCompressedFormat> UmfpackMatrixRef;
+    enum {
+      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+    };
 
   public:
 
-    UmfPackLU() { init(); }
+    typedef Array<double, UMFPACK_CONTROL, 1> UmfpackControl;
 
-    UmfPackLU(const MatrixType& matrix)
+    UmfPackLU()
+      : m_dummy(0,0), mp_matrix(m_dummy)
+    {
+      init();
+    }
+
+    template<typename InputMatrixType>
+    explicit UmfPackLU(const InputMatrixType& matrix)
+      : mp_matrix(matrix)
     {
       init();
       compute(matrix);
@@ -160,8 +177,8 @@ class UmfPackLU : internal::noncopyable
       if(m_numeric)  umfpack_free_numeric(&m_numeric,Scalar());
     }
 
-    inline Index rows() const { return m_copyMatrix.rows(); }
-    inline Index cols() const { return m_copyMatrix.cols(); }
+    inline Index rows() const { return mp_matrix.rows(); }
+    inline Index cols() const { return mp_matrix.cols(); }
 
     /** \brief Reports whether previous computation was successful.
       *
@@ -207,37 +224,11 @@ class UmfPackLU : internal::noncopyable
     {
       if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar());
       if(m_numeric)  umfpack_free_numeric(&m_numeric,Scalar());
-      grapInput(matrix.derived());
+      grab(matrix.derived());
       analyzePattern_impl();
       factorize_impl();
     }
 
-    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::solve_retval<UmfPackLU, Rhs> solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "UmfPackLU is not initialized.");
-      eigen_assert(rows()==b.rows()
-                && "UmfPackLU::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::solve_retval<UmfPackLU, Rhs>(*this, b.derived());
-    }
-
-    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::sparse_solve_retval<UmfPackLU, Rhs> solve(const SparseMatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "UmfPackLU is not initialized.");
-      eigen_assert(rows()==b.rows()
-                && "UmfPackLU::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::sparse_solve_retval<UmfPackLU, Rhs>(*this, b.derived());
-    }
-
     /** Performs a symbolic decomposition on the sparcity of \a matrix.
       *
       * This function is particularly useful when solving for several problems having the same structure.
@@ -250,11 +241,44 @@ class UmfPackLU : internal::noncopyable
       if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar());
       if(m_numeric)  umfpack_free_numeric(&m_numeric,Scalar());
       
-      grapInput(matrix.derived());
+      grab(matrix.derived());
 
       analyzePattern_impl();
     }
 
+    /** Provides the return status code returned by UmfPack during the numeric
+      * factorization.
+      *
+      * \sa factorize(), compute()
+      */
+    inline int umfpackFactorizeReturncode() const
+    {
+      eigen_assert(m_numeric && "UmfPackLU: you must first call factorize()");
+      return m_fact_errorCode;
+    }
+
+    /** Provides access to the control settings array used by UmfPack.
+      *
+      * If this array contains NaN's, the default values are used.
+      *
+      * See UMFPACK documentation for details.
+      */
+    inline const UmfpackControl& umfpackControl() const
+    {
+      return m_control;
+    }
+    
+    /** Provides access to the control settings array used by UmfPack.
+      *
+      * If this array contains NaN's, the default values are used.
+      *
+      * See UMFPACK documentation for details.
+      */
+    inline UmfpackControl& umfpackControl()
+    {
+      return m_control;
+    }
+    
     /** Performs a numeric decomposition of \a matrix
       *
       * The given matrix must has the same sparcity than the matrix on which the pattern anylysis has been performed.
@@ -268,16 +292,14 @@ class UmfPackLU : internal::noncopyable
       if(m_numeric)
         umfpack_free_numeric(&m_numeric,Scalar());
 
-      grapInput(matrix.derived());
+      grab(matrix.derived());
       
       factorize_impl();
     }
 
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
     /** \internal */
     template<typename BDerived,typename XDerived>
-    bool _solve(const MatrixBase<BDerived> &b, MatrixBase<XDerived> &x) const;
-    #endif
+    bool _solve_impl(const MatrixBase<BDerived> &b, MatrixBase<XDerived> &x) const;
 
     Scalar determinant() const;
 
@@ -291,52 +313,17 @@ class UmfPackLU : internal::noncopyable
       m_isInitialized         = false;
       m_numeric               = 0;
       m_symbolic              = 0;
-      m_outerIndexPtr         = 0;
-      m_innerIndexPtr         = 0;
-      m_valuePtr              = 0;
       m_extractedDataAreDirty = true;
     }
     
-    template<typename InputMatrixType>
-    void grapInput_impl(const InputMatrixType& mat, internal::true_type)
-    {
-      m_copyMatrix.resize(mat.rows(), mat.cols());
-      if( ((MatrixType::Flags&RowMajorBit)==RowMajorBit) || sizeof(typename MatrixType::Index)!=sizeof(int) || !mat.isCompressed() )
-      {
-        // non supported input -> copy
-        m_copyMatrix = mat;
-        m_outerIndexPtr = m_copyMatrix.outerIndexPtr();
-        m_innerIndexPtr = m_copyMatrix.innerIndexPtr();
-        m_valuePtr      = m_copyMatrix.valuePtr();
-      }
-      else
-      {
-        m_outerIndexPtr = mat.outerIndexPtr();
-        m_innerIndexPtr = mat.innerIndexPtr();
-        m_valuePtr      = mat.valuePtr();
-      }
-    }
-    
-    template<typename InputMatrixType>
-    void grapInput_impl(const InputMatrixType& mat, internal::false_type)
-    {
-      m_copyMatrix = mat;
-      m_outerIndexPtr = m_copyMatrix.outerIndexPtr();
-      m_innerIndexPtr = m_copyMatrix.innerIndexPtr();
-      m_valuePtr      = m_copyMatrix.valuePtr();
-    }
-    
-    template<typename InputMatrixType>
-    void grapInput(const InputMatrixType& mat)
-    {
-      grapInput_impl(mat, internal::umfpack_helper_is_sparse_plain<InputMatrixType>());
-    }
-    
     void analyzePattern_impl()
     {
+      umfpack_defaults(m_control.data(), Scalar());
       int errorCode = 0;
-      errorCode = umfpack_symbolic(m_copyMatrix.rows(), m_copyMatrix.cols(), m_outerIndexPtr, m_innerIndexPtr, m_valuePtr,
-                                   &m_symbolic, 0, 0);
+      errorCode = umfpack_symbolic(internal::convert_index<int>(mp_matrix.rows()),
+                                   internal::convert_index<int>(mp_matrix.cols()),
+                                   mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(),
+                                   &m_symbolic, m_control.data(), 0);
 
       m_isInitialized = true;
       m_info = errorCode ? InvalidInput : Success;
@@ -347,36 +334,52 @@ class UmfPackLU : internal::noncopyable
     
     void factorize_impl()
     {
-      int errorCode;
-      errorCode = umfpack_numeric(m_outerIndexPtr, m_innerIndexPtr, m_valuePtr,
-                                  m_symbolic, &m_numeric, 0, 0);
+      m_fact_errorCode = umfpack_numeric(mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(),
+                                         m_symbolic, &m_numeric, m_control.data(), 0);
 
-      m_info = errorCode ? NumericalIssue : Success;
+      m_info = m_fact_errorCode == UMFPACK_OK ? Success : NumericalIssue;
       m_factorizationIsOk = true;
       m_extractedDataAreDirty = true;
     }
-
+    
+    template<typename MatrixDerived>
+    void grab(const EigenBase<MatrixDerived> &A)
+    {
+      mp_matrix.~UmfpackMatrixRef();
+      ::new (&mp_matrix) UmfpackMatrixRef(A.derived());
+    }
+    
+    void grab(const UmfpackMatrixRef &A)
+    {
+      if(&(A.derived()) != &mp_matrix)
+      {
+        mp_matrix.~UmfpackMatrixRef();
+        ::new (&mp_matrix) UmfpackMatrixRef(A);
+      }
+    }
+  
     // cached data to reduce reallocation, etc.
     mutable LUMatrixType m_l;
+    int m_fact_errorCode;
+    UmfpackControl m_control;
+    
     mutable LUMatrixType m_u;
     mutable IntColVectorType m_p;
     mutable IntRowVectorType m_q;
 
-    UmfpackMatrixType m_copyMatrix;
-    const Scalar* m_valuePtr;
-    const int* m_outerIndexPtr;
-    const int* m_innerIndexPtr;
+    UmfpackMatrixType m_dummy;
+    UmfpackMatrixRef mp_matrix;
+  
     void* m_numeric;
     void* m_symbolic;
 
     mutable ComputationInfo m_info;
-    bool m_isInitialized;
     int m_factorizationIsOk;
     int m_analysisIsOk;
     mutable bool m_extractedDataAreDirty;
     
   private:
-    UmfPackLU(UmfPackLU& ) { }
+    UmfPackLU(const UmfPackLU& ) { }
 };
 
 
@@ -418,19 +421,30 @@ typename UmfPackLU<MatrixType>::Scalar UmfPackLU<MatrixType>::determinant() cons
 
 template<typename MatrixType>
 template<typename BDerived,typename XDerived>
-bool UmfPackLU<MatrixType>::_solve(const MatrixBase<BDerived> &b, MatrixBase<XDerived> &x) const
+bool UmfPackLU<MatrixType>::_solve_impl(const MatrixBase<BDerived> &b, MatrixBase<XDerived> &x) const
 {
-  const int rhsCols = b.cols();
+  Index rhsCols = b.cols();
   eigen_assert((BDerived::Flags&RowMajorBit)==0 && "UmfPackLU backend does not support non col-major rhs yet");
   eigen_assert((XDerived::Flags&RowMajorBit)==0 && "UmfPackLU backend does not support non col-major result yet");
   eigen_assert(b.derived().data() != x.derived().data() && " Umfpack does not support inplace solve");
   
   int errorCode;
+  Scalar* x_ptr = 0;
+  Matrix<Scalar,Dynamic,1> x_tmp;
+  if(x.innerStride()!=1)
+  {
+    x_tmp.resize(x.rows());
+    x_ptr = x_tmp.data();
+  }
   for (int j=0; j<rhsCols; ++j)
   {
+    if(x.innerStride()==1)
+      x_ptr = &x.col(j).coeffRef(0);
     errorCode = umfpack_solve(UMFPACK_A,
-        m_outerIndexPtr, m_innerIndexPtr, m_valuePtr,
-        &x.col(j).coeffRef(0), &b.const_cast_derived().col(j).coeffRef(0), m_numeric, 0, 0);
+        mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(),
+        x_ptr, &b.const_cast_derived().col(j).coeffRef(0), m_numeric, m_control.data(), 0);
+    if(x.innerStride()!=1)
+      x.col(j) = x_tmp;
     if (errorCode!=0)
       return false;
   }
@@ -438,37 +452,6 @@ bool UmfPackLU<MatrixType>::_solve(const MatrixBase<BDerived> &b, MatrixBase<XDe
   return true;
 }
 
-
-namespace internal {
-
-template<typename _MatrixType, typename Rhs>
-struct solve_retval<UmfPackLU<_MatrixType>, Rhs>
-  : solve_retval_base<UmfPackLU<_MatrixType>, Rhs>
-{
-  typedef UmfPackLU<_MatrixType> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
-  }
-};
-
-template<typename _MatrixType, typename Rhs>
-struct sparse_solve_retval<UmfPackLU<_MatrixType>, Rhs>
-  : sparse_solve_retval_base<UmfPackLU<_MatrixType>, Rhs>
-{
-  typedef UmfPackLU<_MatrixType> Dec;
-  EIGEN_MAKE_SPARSE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    this->defaultEvalTo(dst);
-  }
-};
-
-} // end namespace internal
-
 } // end namespace Eigen
 
 #endif // EIGEN_UMFPACKSUPPORT_H
diff --git a/Eigen/src/misc/CMakeLists.txt b/Eigen/src/misc/CMakeLists.txt
deleted file mode 100644
index a58ffb745..000000000
--- a/Eigen/src/misc/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_misc_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_misc_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/misc COMPONENT Devel
-  )
diff --git a/Eigen/src/misc/Image.h b/Eigen/src/misc/Image.h
index 75c5f433a..b8b8a0455 100644
--- a/Eigen/src/misc/Image.h
+++ b/Eigen/src/misc/Image.h
@@ -38,7 +38,6 @@ template<typename _DecompositionType> struct image_retval_base
   typedef _DecompositionType DecompositionType;
   typedef typename DecompositionType::MatrixType MatrixType;
   typedef ReturnByValue<image_retval_base> Base;
-  typedef typename Base::Index Index;
 
   image_retval_base(const DecompositionType& dec, const MatrixType& originalMatrix)
     : m_dec(dec), m_rank(dec.rank()),
@@ -69,7 +68,6 @@ template<typename _DecompositionType> struct image_retval_base
   typedef typename DecompositionType::MatrixType MatrixType; \
   typedef typename MatrixType::Scalar Scalar; \
   typedef typename MatrixType::RealScalar RealScalar; \
-  typedef typename MatrixType::Index Index; \
   typedef Eigen::internal::image_retval_base<DecompositionType> Base; \
   using Base::dec; \
   using Base::originalMatrix; \
diff --git a/Eigen/src/misc/Kernel.h b/Eigen/src/misc/Kernel.h
index b9e1518fd..bef5d6ff5 100644
--- a/Eigen/src/misc/Kernel.h
+++ b/Eigen/src/misc/Kernel.h
@@ -39,9 +39,8 @@ template<typename _DecompositionType> struct kernel_retval_base
 {
   typedef _DecompositionType DecompositionType;
   typedef ReturnByValue<kernel_retval_base> Base;
-  typedef typename Base::Index Index;
 
-  kernel_retval_base(const DecompositionType& dec)
+  explicit kernel_retval_base(const DecompositionType& dec)
     : m_dec(dec),
       m_rank(dec.rank()),
       m_cols(m_rank==dec.cols() ? 1 : dec.cols() - m_rank)
@@ -68,7 +67,6 @@ template<typename _DecompositionType> struct kernel_retval_base
   typedef typename DecompositionType::MatrixType MatrixType; \
   typedef typename MatrixType::Scalar Scalar; \
   typedef typename MatrixType::RealScalar RealScalar; \
-  typedef typename MatrixType::Index Index; \
   typedef Eigen::internal::kernel_retval_base<DecompositionType> Base; \
   using Base::dec; \
   using Base::rank; \
diff --git a/Eigen/src/misc/RealSvd2x2.h b/Eigen/src/misc/RealSvd2x2.h
new file mode 100644
index 000000000..abb4d3c2f
--- /dev/null
+++ b/Eigen/src/misc/RealSvd2x2.h
@@ -0,0 +1,55 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2013-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_REALSVD2X2_H
+#define EIGEN_REALSVD2X2_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<typename MatrixType, typename RealScalar, typename Index>
+void real_2x2_jacobi_svd(const MatrixType& matrix, Index p, Index q,
+                         JacobiRotation<RealScalar> *j_left,
+                         JacobiRotation<RealScalar> *j_right)
+{
+  using std::sqrt;
+  using std::abs;
+  Matrix<RealScalar,2,2> m;
+  m << numext::real(matrix.coeff(p,p)), numext::real(matrix.coeff(p,q)),
+       numext::real(matrix.coeff(q,p)), numext::real(matrix.coeff(q,q));
+  JacobiRotation<RealScalar> rot1;
+  RealScalar t = m.coeff(0,0) + m.coeff(1,1);
+  RealScalar d = m.coeff(1,0) - m.coeff(0,1);
+
+  if(abs(d) < (std::numeric_limits<RealScalar>::min)())
+  {
+    rot1.s() = RealScalar(0);
+    rot1.c() = RealScalar(1);
+  }
+  else
+  {
+    // If d!=0, then t/d cannot overflow because the magnitude of the
+    // entries forming d are not too small compared to the ones forming t.
+    RealScalar u = t / d;
+    RealScalar tmp = sqrt(RealScalar(1) + numext::abs2(u));
+    rot1.s() = RealScalar(1) / tmp;
+    rot1.c() = u / tmp;
+  }
+  m.applyOnTheLeft(0,1,rot1);
+  j_right->makeJacobi(m,0,1);
+  *j_left = rot1 * j_right->transpose();
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_REALSVD2X2_H
diff --git a/Eigen/src/misc/Solve.h b/Eigen/src/misc/Solve.h
deleted file mode 100644
index 7f70d60af..000000000
--- a/Eigen/src/misc/Solve.h
+++ /dev/null
@@ -1,76 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_MISC_SOLVE_H
-#define EIGEN_MISC_SOLVE_H
-
-namespace Eigen { 
-
-namespace internal {
-
-/** \class solve_retval_base
-  *
-  */
-template<typename DecompositionType, typename Rhs>
-struct traits<solve_retval_base<DecompositionType, Rhs> >
-{
-  typedef typename DecompositionType::MatrixType MatrixType;
-  typedef Matrix<typename Rhs::Scalar,
-                 MatrixType::ColsAtCompileTime,
-                 Rhs::ColsAtCompileTime,
-                 Rhs::PlainObject::Options,
-                 MatrixType::MaxColsAtCompileTime,
-                 Rhs::MaxColsAtCompileTime> ReturnType;
-};
-
-template<typename _DecompositionType, typename Rhs> struct solve_retval_base
- : public ReturnByValue<solve_retval_base<_DecompositionType, Rhs> >
-{
-  typedef typename remove_all<typename Rhs::Nested>::type RhsNestedCleaned;
-  typedef _DecompositionType DecompositionType;
-  typedef ReturnByValue<solve_retval_base> Base;
-  typedef typename Base::Index Index;
-
-  solve_retval_base(const DecompositionType& dec, const Rhs& rhs)
-    : m_dec(dec), m_rhs(rhs)
-  {}
-
-  inline Index rows() const { return m_dec.cols(); }
-  inline Index cols() const { return m_rhs.cols(); }
-  inline const DecompositionType& dec() const { return m_dec; }
-  inline const RhsNestedCleaned& rhs() const { return m_rhs; }
-
-  template<typename Dest> inline void evalTo(Dest& dst) const
-  {
-    static_cast<const solve_retval<DecompositionType,Rhs>*>(this)->evalTo(dst);
-  }
-
-  protected:
-    const DecompositionType& m_dec;
-    typename Rhs::Nested m_rhs;
-};
-
-} // end namespace internal
-
-#define EIGEN_MAKE_SOLVE_HELPERS(DecompositionType,Rhs) \
-  typedef typename DecompositionType::MatrixType MatrixType; \
-  typedef typename MatrixType::Scalar Scalar; \
-  typedef typename MatrixType::RealScalar RealScalar; \
-  typedef typename MatrixType::Index Index; \
-  typedef Eigen::internal::solve_retval_base<DecompositionType,Rhs> Base; \
-  using Base::dec; \
-  using Base::rhs; \
-  using Base::rows; \
-  using Base::cols; \
-  solve_retval(const DecompositionType& dec, const Rhs& rhs) \
-    : Base(dec, rhs) {}
-
-} // end namespace Eigen
-
-#endif // EIGEN_MISC_SOLVE_H
diff --git a/Eigen/src/misc/SparseSolve.h b/Eigen/src/misc/SparseSolve.h
deleted file mode 100644
index 244bb8ec7..000000000
--- a/Eigen/src/misc/SparseSolve.h
+++ /dev/null
@@ -1,128 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_SPARSE_SOLVE_H
-#define EIGEN_SPARSE_SOLVE_H
-
-namespace Eigen { 
-
-namespace internal {
-
-template<typename _DecompositionType, typename Rhs> struct sparse_solve_retval_base;
-template<typename _DecompositionType, typename Rhs> struct sparse_solve_retval;
-  
-template<typename DecompositionType, typename Rhs>
-struct traits<sparse_solve_retval_base<DecompositionType, Rhs> >
-{
-  typedef typename DecompositionType::MatrixType MatrixType;
-  typedef SparseMatrix<typename Rhs::Scalar, Rhs::Options, typename Rhs::Index> ReturnType;
-};
-
-template<typename _DecompositionType, typename Rhs> struct sparse_solve_retval_base
- : public ReturnByValue<sparse_solve_retval_base<_DecompositionType, Rhs> >
-{
-  typedef typename remove_all<typename Rhs::Nested>::type RhsNestedCleaned;
-  typedef _DecompositionType DecompositionType;
-  typedef ReturnByValue<sparse_solve_retval_base> Base;
-  typedef typename Base::Index Index;
-
-  sparse_solve_retval_base(const DecompositionType& dec, const Rhs& rhs)
-    : m_dec(dec), m_rhs(rhs)
-  {}
-
-  inline Index rows() const { return m_dec.cols(); }
-  inline Index cols() const { return m_rhs.cols(); }
-  inline const DecompositionType& dec() const { return m_dec; }
-  inline const RhsNestedCleaned& rhs() const { return m_rhs; }
-
-  template<typename Dest> inline void evalTo(Dest& dst) const
-  {
-    static_cast<const sparse_solve_retval<DecompositionType,Rhs>*>(this)->evalTo(dst);
-  }
-
-  protected:
-    template<typename DestScalar, int DestOptions, typename DestIndex>
-    inline void defaultEvalTo(SparseMatrix<DestScalar,DestOptions,DestIndex>& dst) const
-    {
-      // we process the sparse rhs per block of NbColsAtOnce columns temporarily stored into a dense matrix.
-      static const int NbColsAtOnce = 4;
-      int rhsCols = m_rhs.cols();
-      int size = m_rhs.rows();
-      Eigen::Matrix<DestScalar,Dynamic,Dynamic> tmp(size,rhsCols);
-      Eigen::Matrix<DestScalar,Dynamic,Dynamic> tmpX(size,rhsCols);
-      for(int k=0; k<rhsCols; k+=NbColsAtOnce)
-      {
-        int actualCols = std::min<int>(rhsCols-k, NbColsAtOnce);
-        tmp.leftCols(actualCols) = m_rhs.middleCols(k,actualCols);
-        tmpX.leftCols(actualCols) = m_dec.solve(tmp.leftCols(actualCols));
-        dst.middleCols(k,actualCols) = tmpX.leftCols(actualCols).sparseView();
-      }
-    }
-    const DecompositionType& m_dec;
-    typename Rhs::Nested m_rhs;
-};
-
-#define EIGEN_MAKE_SPARSE_SOLVE_HELPERS(DecompositionType,Rhs) \
-  typedef typename DecompositionType::MatrixType MatrixType; \
-  typedef typename MatrixType::Scalar Scalar; \
-  typedef typename MatrixType::RealScalar RealScalar; \
-  typedef typename MatrixType::Index Index; \
-  typedef Eigen::internal::sparse_solve_retval_base<DecompositionType,Rhs> Base; \
-  using Base::dec; \
-  using Base::rhs; \
-  using Base::rows; \
-  using Base::cols; \
-  sparse_solve_retval(const DecompositionType& dec, const Rhs& rhs) \
-    : Base(dec, rhs) {}
-
-
-
-template<typename DecompositionType, typename Rhs, typename Guess> struct solve_retval_with_guess;
-
-template<typename DecompositionType, typename Rhs, typename Guess>
-struct traits<solve_retval_with_guess<DecompositionType, Rhs, Guess> >
-{
-  typedef typename DecompositionType::MatrixType MatrixType;
-  typedef Matrix<typename Rhs::Scalar,
-                 MatrixType::ColsAtCompileTime,
-                 Rhs::ColsAtCompileTime,
-                 Rhs::PlainObject::Options,
-                 MatrixType::MaxColsAtCompileTime,
-                 Rhs::MaxColsAtCompileTime> ReturnType;
-};
-
-template<typename DecompositionType, typename Rhs, typename Guess> struct solve_retval_with_guess
- : public ReturnByValue<solve_retval_with_guess<DecompositionType, Rhs, Guess> >
-{
-  typedef typename DecompositionType::Index Index;
-
-  solve_retval_with_guess(const DecompositionType& dec, const Rhs& rhs, const Guess& guess)
-    : m_dec(dec), m_rhs(rhs), m_guess(guess)
-  {}
-
-  inline Index rows() const { return m_dec.cols(); }
-  inline Index cols() const { return m_rhs.cols(); }
-
-  template<typename Dest> inline void evalTo(Dest& dst) const
-  {
-    dst = m_guess;
-    m_dec._solveWithGuess(m_rhs,dst);
-  }
-
-  protected:
-    const DecompositionType& m_dec;
-    const typename Rhs::Nested m_rhs;
-    const typename Guess::Nested m_guess;
-};
-
-} // namepsace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_SPARSE_SOLVE_H
diff --git a/Eigen/src/misc/blas.h b/Eigen/src/misc/blas.h
index 6fce99ed5..25215b15e 100644
--- a/Eigen/src/misc/blas.h
+++ b/Eigen/src/misc/blas.h
@@ -30,15 +30,15 @@ int  BLASFUNC(cdotcw)  (int *, float  *, int *, float  *, int *, float*);
 int  BLASFUNC(zdotuw)  (int *, double  *, int *, double  *, int *, double*);
 int  BLASFUNC(zdotcw)  (int *, double  *, int *, double  *, int *, double*);
 
-int    BLASFUNC(saxpy) (int *, float  *, float  *, int *, float  *, int *);
-int    BLASFUNC(daxpy) (int *, double *, double *, int *, double *, int *);
-int    BLASFUNC(qaxpy) (int *, double *, double *, int *, double *, int *);
-int    BLASFUNC(caxpy) (int *, float  *, float  *, int *, float  *, int *);
-int    BLASFUNC(zaxpy) (int *, double *, double *, int *, double *, int *);
-int    BLASFUNC(xaxpy) (int *, double *, double *, int *, double *, int *);
-int    BLASFUNC(caxpyc)(int *, float  *, float  *, int *, float  *, int *);
-int    BLASFUNC(zaxpyc)(int *, double *, double *, int *, double *, int *);
-int    BLASFUNC(xaxpyc)(int *, double *, double *, int *, double *, int *);
+int    BLASFUNC(saxpy) (const int *, const float  *, const float  *, const int *, float  *, const int *);
+int    BLASFUNC(daxpy) (const int *, const double *, const double *, const int *, double *, const int *);
+int    BLASFUNC(qaxpy) (const int *, const double *, const double *, const int *, double *, const int *);
+int    BLASFUNC(caxpy) (const int *, const float  *, const float  *, const int *, float  *, const int *);
+int    BLASFUNC(zaxpy) (const int *, const double *, const double *, const int *, double *, const int *);
+int    BLASFUNC(xaxpy) (const int *, const double *, const double *, const int *, double *, const int *);
+int    BLASFUNC(caxpyc)(const int *, const float  *, const float  *, const int *, float  *, const int *);
+int    BLASFUNC(zaxpyc)(const int *, const double *, const double *, const int *, double *, const int *);
+int    BLASFUNC(xaxpyc)(const int *, const double *, const double *, const int *, double *, const int *);
 
 int    BLASFUNC(scopy) (int *, float  *, int *, float  *, int *);
 int    BLASFUNC(dcopy) (int *, double *, int *, double *, int *);
@@ -177,31 +177,19 @@ int BLASFUNC(xgeru)(int *,    int *, double *, double *, int *,
 int BLASFUNC(xgerc)(int *,    int *, double *, double *, int *,
 		    double *, int *, double *, int *);
 
-int BLASFUNC(sgemv)(char *, int *, int *, float  *, float  *, int *,
-		    float  *, int *, float  *, float  *, int *);
-int BLASFUNC(dgemv)(char *, int *, int *, double *, double *, int *,
-		    double *, int *, double *, double *, int *);
-int BLASFUNC(qgemv)(char *, int *, int *, double *, double *, int *,
-		    double *, int *, double *, double *, int *);
-int BLASFUNC(cgemv)(char *, int *, int *, float  *, float  *, int *,
-		    float  *, int *, float  *, float  *, int *);
-int BLASFUNC(zgemv)(char *, int *, int *, double *, double *, int *,
-		    double *, int *, double *, double *, int *);
-int BLASFUNC(xgemv)(char *, int *, int *, double *, double *, int *,
-		    double *, int *, double *, double *, int *);
+int BLASFUNC(sgemv)(const char *, const int *, const int *, const float  *, const float  *, const int *, const float  *, const int *, const float  *, float  *, const int *);
+int BLASFUNC(dgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(qgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(cgemv)(const char *, const int *, const int *, const float  *, const float  *, const int *, const float  *, const int *, const float  *, float  *, const int *);
+int BLASFUNC(zgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(xgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
 
-int BLASFUNC(strsv) (char *, char *, char *, int *, float  *, int *,
-		     float  *, int *);
-int BLASFUNC(dtrsv) (char *, char *, char *, int *, double *, int *,
-		     double *, int *);
-int BLASFUNC(qtrsv) (char *, char *, char *, int *, double *, int *,
-		     double *, int *);
-int BLASFUNC(ctrsv) (char *, char *, char *, int *, float  *, int *,
-		     float  *, int *);
-int BLASFUNC(ztrsv) (char *, char *, char *, int *, double *, int *,
-		     double *, int *);
-int BLASFUNC(xtrsv) (char *, char *, char *, int *, double *, int *,
-		     double *, int *);
+int BLASFUNC(strsv) (const char *, const char *, const char *, const int *, const float  *, const int *, float  *, const int *);
+int BLASFUNC(dtrsv) (const char *, const char *, const char *, const int *, const double *, const int *, double *, const int *);
+int BLASFUNC(qtrsv) (const char *, const char *, const char *, const int *, const double *, const int *, double *, const int *);
+int BLASFUNC(ctrsv) (const char *, const char *, const char *, const int *, const float  *, const int *, float  *, const int *);
+int BLASFUNC(ztrsv) (const char *, const char *, const char *, const int *, const double *, const int *, double *, const int *);
+int BLASFUNC(xtrsv) (const char *, const char *, const char *, const int *, const double *, const int *, double *, const int *);
 
 int BLASFUNC(stpsv) (char *, char *, char *, int *, float  *, float  *, int *);
 int BLASFUNC(dtpsv) (char *, char *, char *, int *, double *, double *, int *);
@@ -210,18 +198,12 @@ int BLASFUNC(ctpsv) (char *, char *, char *, int *, float  *, float  *, int *);
 int BLASFUNC(ztpsv) (char *, char *, char *, int *, double *, double *, int *);
 int BLASFUNC(xtpsv) (char *, char *, char *, int *, double *, double *, int *);
 
-int BLASFUNC(strmv) (char *, char *, char *, int *, float  *, int *,
-		     float  *, int *);
-int BLASFUNC(dtrmv) (char *, char *, char *, int *, double *, int *,
-		     double *, int *);
-int BLASFUNC(qtrmv) (char *, char *, char *, int *, double *, int *,
-		     double *, int *);
-int BLASFUNC(ctrmv) (char *, char *, char *, int *, float  *, int *,
-		     float  *, int *);
-int BLASFUNC(ztrmv) (char *, char *, char *, int *, double *, int *,
-		     double *, int *);
-int BLASFUNC(xtrmv) (char *, char *, char *, int *, double *, int *,
-		     double *, int *);
+int BLASFUNC(strmv) (const char *, const char *, const char *, const int *, const float  *, const int *, float  *, const int *);
+int BLASFUNC(dtrmv) (const char *, const char *, const char *, const int *, const double *, const int *, double *, const int *);
+int BLASFUNC(qtrmv) (const char *, const char *, const char *, const int *, const double *, const int *, double *, const int *);
+int BLASFUNC(ctrmv) (const char *, const char *, const char *, const int *, const float  *, const int *, float  *, const int *);
+int BLASFUNC(ztrmv) (const char *, const char *, const char *, const int *, const double *, const int *, double *, const int *);
+int BLASFUNC(xtrmv) (const char *, const char *, const char *, const int *, const double *, const int *, double *, const int *);
 
 int BLASFUNC(stpmv) (char *, char *, char *, int *, float  *, float  *, int *);
 int BLASFUNC(dtpmv) (char *, char *, char *, int *, double *, double *, int *);
@@ -244,18 +226,9 @@ int BLASFUNC(ctbsv) (char *, char *, char *, int *, int *, float  *, int *, floa
 int BLASFUNC(ztbsv) (char *, char *, char *, int *, int *, double *, int *, double *, int *);
 int BLASFUNC(xtbsv) (char *, char *, char *, int *, int *, double *, int *, double *, int *);
 
-int BLASFUNC(ssymv) (char *, int *, float  *, float *, int *,
-		     float  *, int *, float *, float *, int *);
-int BLASFUNC(dsymv) (char *, int *, double  *, double *, int *,
-		     double  *, int *, double *, double *, int *);
-int BLASFUNC(qsymv) (char *, int *, double  *, double *, int *,
-		     double  *, int *, double *, double *, int *);
-int BLASFUNC(csymv) (char *, int *, float  *, float *, int *,
-		     float  *, int *, float *, float *, int *);
-int BLASFUNC(zsymv) (char *, int *, double  *, double *, int *,
-		     double  *, int *, double *, double *, int *);
-int BLASFUNC(xsymv) (char *, int *, double  *, double *, int *,
-		     double  *, int *, double *, double *, int *);
+int BLASFUNC(ssymv) (const char *, const int *, const float  *, const float  *, const int *, const float  *, const int *, const float  *, float  *, const int *);
+int BLASFUNC(dsymv) (const char *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(qsymv) (const char *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
 
 int BLASFUNC(sspmv) (char *, int *, float  *, float *,
 		     float  *, int *, float *, float *, int *);
@@ -263,38 +236,17 @@ int BLASFUNC(dspmv) (char *, int *, double  *, double *,
 		     double  *, int *, double *, double *, int *);
 int BLASFUNC(qspmv) (char *, int *, double  *, double *,
 		     double  *, int *, double *, double *, int *);
-int BLASFUNC(cspmv) (char *, int *, float  *, float *,
-		     float  *, int *, float *, float *, int *);
-int BLASFUNC(zspmv) (char *, int *, double  *, double *,
-		     double  *, int *, double *, double *, int *);
-int BLASFUNC(xspmv) (char *, int *, double  *, double *,
-		     double  *, int *, double *, double *, int *);
 
-int BLASFUNC(ssyr) (char *, int *, float   *, float  *, int *,
-		    float  *, int *);
-int BLASFUNC(dsyr) (char *, int *, double  *, double *, int *,
-		    double *, int *);
-int BLASFUNC(qsyr) (char *, int *, double  *, double *, int *,
-		    double *, int *);
-int BLASFUNC(csyr) (char *, int *, float   *, float  *, int *,
-		    float  *, int *);
-int BLASFUNC(zsyr) (char *, int *, double  *, double *, int *,
-		    double *, int *);
-int BLASFUNC(xsyr) (char *, int *, double  *, double *, int *,
-		    double *, int *);
+int BLASFUNC(ssyr) (const char *, const int *, const float   *, const float  *, const int *, float  *, const int *);
+int BLASFUNC(dsyr) (const char *, const int *, const double  *, const double *, const int *, double *, const int *);
+int BLASFUNC(qsyr) (const char *, const int *, const double  *, const double *, const int *, double *, const int *);
 
-int BLASFUNC(ssyr2) (char *, int *, float   *,
-		     float  *, int *, float  *, int *, float  *, int *);
-int BLASFUNC(dsyr2) (char *, int *, double  *,
-		     double *, int *, double *, int *, double *, int *);
-int BLASFUNC(qsyr2) (char *, int *, double  *,
-		     double *, int *, double *, int *, double *, int *);
-int BLASFUNC(csyr2) (char *, int *, float   *,
-		     float  *, int *, float  *, int *, float  *, int *);
-int BLASFUNC(zsyr2) (char *, int *, double  *,
-		     double *, int *, double *, int *, double *, int *);
-int BLASFUNC(xsyr2) (char *, int *, double  *,
-		     double *, int *, double *, int *, double *, int *);
+int BLASFUNC(ssyr2) (const char *, const int *, const float   *, const float  *, const int *, const float  *, const int *, float  *, const int *);
+int BLASFUNC(dsyr2) (const char *, const int *, const double  *, const double *, const int *, const double *, const int *, double *, const int *);
+int BLASFUNC(qsyr2) (const char *, const int *, const double  *, const double *, const int *, const double *, const int *, double *, const int *);
+int BLASFUNC(csyr2) (const char *, const int *, const float   *, const float  *, const int *, const float  *, const int *, float  *, const int *);
+int BLASFUNC(zsyr2) (const char *, const int *, const double  *, const double *, const int *, const double *, const int *, double *, const int *);
+int BLASFUNC(xsyr2) (const char *, const int *, const double  *, const double *, const int *, const double *, const int *, double *, const int *);
 
 int BLASFUNC(sspr) (char *, int *, float   *, float  *, int *,
 		    float  *);
@@ -302,12 +254,6 @@ int BLASFUNC(dspr) (char *, int *, double  *, double *, int *,
 		    double *);
 int BLASFUNC(qspr) (char *, int *, double  *, double *, int *,
 		    double *);
-int BLASFUNC(cspr) (char *, int *, float   *, float  *, int *,
-		    float  *);
-int BLASFUNC(zspr) (char *, int *, double  *, double *, int *,
-		    double *);
-int BLASFUNC(xspr) (char *, int *, double  *, double *, int *,
-		    double *);
 
 int BLASFUNC(sspr2) (char *, int *, float   *,
 		     float  *, int *, float  *, int *, float  *);
@@ -347,12 +293,9 @@ int BLASFUNC(zhpr2) (char *, int *, double  *,
 int BLASFUNC(xhpr2) (char *, int *, double  *,
 		     double *, int *, double *, int *, double *);
 
-int BLASFUNC(chemv) (char *, int *, float  *, float *, int *,
-		     float  *, int *, float *, float *, int *);
-int BLASFUNC(zhemv) (char *, int *, double  *, double *, int *,
-		     double  *, int *, double *, double *, int *);
-int BLASFUNC(xhemv) (char *, int *, double  *, double *, int *,
-		     double  *, int *, double *, double *, int *);
+int BLASFUNC(chemv) (const char *, const int *, const float  *, const float  *, const int *, const float  *, const int *, const float  *, float  *, const int *);
+int BLASFUNC(zhemv) (const char *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(xhemv) (const char *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
 
 int BLASFUNC(chpmv) (char *, int *, float  *, float *,
 		     float  *, int *, float *, float *, int *);
@@ -401,18 +344,12 @@ int BLASFUNC(xhbmv)(char *, int *, int *, double *, double *, int *,
 
 /* Level 3 routines */
 
-int BLASFUNC(sgemm)(char *, char *, int *, int *, int *, float *,
-	   float  *, int *, float  *, int *, float  *, float  *, int *);
-int BLASFUNC(dgemm)(char *, char *, int *, int *, int *, double *,
-	   double *, int *, double *, int *, double *, double *, int *);
-int BLASFUNC(qgemm)(char *, char *, int *, int *, int *, double *,
-	   double *, int *, double *, int *, double *, double *, int *);
-int BLASFUNC(cgemm)(char *, char *, int *, int *, int *, float *,
-	   float  *, int *, float  *, int *, float  *, float  *, int *);
-int BLASFUNC(zgemm)(char *, char *, int *, int *, int *, double *,
-	   double *, int *, double *, int *, double *, double *, int *);
-int BLASFUNC(xgemm)(char *, char *, int *, int *, int *, double *,
-	   double *, int *, double *, int *, double *, double *, int *);
+int BLASFUNC(sgemm)(const char *, const char *, const int *, const int *, const int *, const float  *, const float  *, const int *, const float  *, const int *, const float  *, float  *, const int *);
+int BLASFUNC(dgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(qgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(cgemm)(const char *, const char *, const int *, const int *, const int *, const float  *, const float  *, const int *, const float  *, const int *, const float  *, float  *, const int *);
+int BLASFUNC(zgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(xgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
 
 int BLASFUNC(cgemm3m)(char *, char *, int *, int *, int *, float *,
 	   float  *, int *, float  *, int *, float  *, float  *, int *);
@@ -434,84 +371,48 @@ int BLASFUNC(zge2mm)(char *, char *, char *, int *, int *,
 		     double *, double  *, int *, double  *, int *,
 		     double *, double  *, int *);
 
-int BLASFUNC(strsm)(char *, char *, char *, char *, int *, int *,
-	   float *,  float *, int *, float *, int *);
-int BLASFUNC(dtrsm)(char *, char *, char *, char *, int *, int *,
-	   double *,  double *, int *, double *, int *);
-int BLASFUNC(qtrsm)(char *, char *, char *, char *, int *, int *,
-	   double *,  double *, int *, double *, int *);
-int BLASFUNC(ctrsm)(char *, char *, char *, char *, int *, int *,
-	   float *,  float *, int *, float *, int *);
-int BLASFUNC(ztrsm)(char *, char *, char *, char *, int *, int *,
-	   double *,  double *, int *, double *, int *);
-int BLASFUNC(xtrsm)(char *, char *, char *, char *, int *, int *,
-	   double *,  double *, int *, double *, int *);
-
-int BLASFUNC(strmm)(char *, char *, char *, char *, int *, int *,
-	   float *,  float *, int *, float *, int *);
-int BLASFUNC(dtrmm)(char *, char *, char *, char *, int *, int *,
-	   double *,  double *, int *, double *, int *);
-int BLASFUNC(qtrmm)(char *, char *, char *, char *, int *, int *,
-	   double *,  double *, int *, double *, int *);
-int BLASFUNC(ctrmm)(char *, char *, char *, char *, int *, int *,
-	   float *,  float *, int *, float *, int *);
-int BLASFUNC(ztrmm)(char *, char *, char *, char *, int *, int *,
-	   double *,  double *, int *, double *, int *);
-int BLASFUNC(xtrmm)(char *, char *, char *, char *, int *, int *,
-	   double *,  double *, int *, double *, int *);
-
-int BLASFUNC(ssymm)(char *, char *, int *, int *, float  *, float  *, int *,
-	   float  *, int *, float  *, float  *, int *);
-int BLASFUNC(dsymm)(char *, char *, int *, int *, double *, double *, int *,
-	   double *, int *, double *, double *, int *);
-int BLASFUNC(qsymm)(char *, char *, int *, int *, double *, double *, int *,
-	   double *, int *, double *, double *, int *);
-int BLASFUNC(csymm)(char *, char *, int *, int *, float  *, float  *, int *,
-	   float  *, int *, float  *, float  *, int *);
-int BLASFUNC(zsymm)(char *, char *, int *, int *, double *, double *, int *,
-	   double *, int *, double *, double *, int *);
-int BLASFUNC(xsymm)(char *, char *, int *, int *, double *, double *, int *,
-	   double *, int *, double *, double *, int *);
-
-int BLASFUNC(csymm3m)(char *, char *, int *, int *, float  *, float  *, int *,
-	   float  *, int *, float  *, float  *, int *);
-int BLASFUNC(zsymm3m)(char *, char *, int *, int *, double *, double *, int *,
-	   double *, int *, double *, double *, int *);
-int BLASFUNC(xsymm3m)(char *, char *, int *, int *, double *, double *, int *,
-	   double *, int *, double *, double *, int *);
-
-int BLASFUNC(ssyrk)(char *, char *, int *, int *, float  *, float  *, int *,
-	   float  *, float  *, int *);
-int BLASFUNC(dsyrk)(char *, char *, int *, int *, double *, double *, int *,
-	   double *, double *, int *);
-int BLASFUNC(qsyrk)(char *, char *, int *, int *, double *, double *, int *,
-	   double *, double *, int *);
-int BLASFUNC(csyrk)(char *, char *, int *, int *, float  *, float  *, int *,
-	   float  *, float  *, int *);
-int BLASFUNC(zsyrk)(char *, char *, int *, int *, double *, double *, int *,
-	   double *, double *, int *);
-int BLASFUNC(xsyrk)(char *, char *, int *, int *, double *, double *, int *,
-	   double *, double *, int *);
-
-int BLASFUNC(ssyr2k)(char *, char *, int *, int *, float  *, float  *, int *,
-	   float *, int *, float  *, float  *, int *);
-int BLASFUNC(dsyr2k)(char *, char *, int *, int *, double *, double *, int *,
-	   double*, int *, double *, double *, int *);
-int BLASFUNC(qsyr2k)(char *, char *, int *, int *, double *, double *, int *,
-	   double*, int *, double *, double *, int *);
-int BLASFUNC(csyr2k)(char *, char *, int *, int *, float  *, float  *, int *,
-	   float *, int *, float  *, float  *, int *);
-int BLASFUNC(zsyr2k)(char *, char *, int *, int *, double *, double *, int *,
-	   double*, int *, double *, double *, int *);
-int BLASFUNC(xsyr2k)(char *, char *, int *, int *, double *, double *, int *,
-	   double*, int *, double *, double *, int *);
-
-int BLASFUNC(chemm)(char *, char *, int *, int *, float  *, float  *, int *,
-	   float  *, int *, float  *, float  *, int *);
-int BLASFUNC(zhemm)(char *, char *, int *, int *, double *, double *, int *,
-	   double *, int *, double *, double *, int *);
-int BLASFUNC(xhemm)(char *, char *, int *, int *, double *, double *, int *,
-	   double *, int *, double *, double *, int *);
+int BLASFUNC(strsm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *,  const float *,  const int *, float *,  const int *);
+int BLASFUNC(dtrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *);
+int BLASFUNC(qtrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *);
+int BLASFUNC(ctrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *,  const float *,  const int *, float *,  const int *);
+int BLASFUNC(ztrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *);
+int BLASFUNC(xtrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *);
+
+int BLASFUNC(strmm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *,  const float *,  const int *, float *,  const int *);
+int BLASFUNC(dtrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *);
+int BLASFUNC(qtrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *);
+int BLASFUNC(ctrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *,  const float *,  const int *, float *,  const int *);
+int BLASFUNC(ztrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *);
+int BLASFUNC(xtrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *);
+
+int BLASFUNC(ssymm)(const char *, const char *, const int *, const int *, const float  *, const float  *, const int *, const float  *, const int *, const float  *, float  *, const int *);
+int BLASFUNC(dsymm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(qsymm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(csymm)(const char *, const char *, const int *, const int *, const float  *, const float  *, const int *, const float  *, const int *, const float  *, float  *, const int *);
+int BLASFUNC(zsymm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(xsymm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+
+int BLASFUNC(csymm3m)(char *, char *, int *, int *, float  *, float  *, int *, float  *, int *, float  *, float  *, int *);
+int BLASFUNC(zsymm3m)(char *, char *, int *, int *, double *, double *, int *, double *, int *, double *, double *, int *);
+int BLASFUNC(xsymm3m)(char *, char *, int *, int *, double *, double *, int *, double *, int *, double *, double *, int *);
+
+int BLASFUNC(ssyrk)(const char *, const char *, const int *, const int *, const float  *, const float  *, const int *, const float  *, float  *, const int *);
+int BLASFUNC(dsyrk)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(qsyrk)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(csyrk)(const char *, const char *, const int *, const int *, const float  *, const float  *, const int *, const float  *, float  *, const int *);
+int BLASFUNC(zsyrk)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(xsyrk)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, double *, const int *);
+
+int BLASFUNC(ssyr2k)(const char *, const char *, const int *, const int *, const float  *, const float  *, const int *, const float *, const int *, const float  *, float  *, const int *);
+int BLASFUNC(dsyr2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double*, const int *, const double *, double *, const int *);
+int BLASFUNC(qsyr2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double*, const int *, const double *, double *, const int *);
+int BLASFUNC(csyr2k)(const char *, const char *, const int *, const int *, const float  *, const float  *, const int *, const float *, const int *, const float  *, float  *, const int *);
+int BLASFUNC(zsyr2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double*, const int *, const double *, double *, const int *);
+int BLASFUNC(xsyr2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double*, const int *, const double *, double *, const int *);
+
+int BLASFUNC(chemm)(const char *, const char *, const int *, const int *, const float  *, const float  *, const int *, const float  *, const int *, const float  *, float  *, const int *);
+int BLASFUNC(zhemm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(xhemm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
 
 int BLASFUNC(chemm3m)(char *, char *, int *, int *, float  *, float  *, int *,
 	   float  *, int *, float  *, float  *, int *);
@@ -520,136 +421,17 @@ int BLASFUNC(zhemm3m)(char *, char *, int *, int *, double *, double *, int *,
 int BLASFUNC(xhemm3m)(char *, char *, int *, int *, double *, double *, int *,
 	   double *, int *, double *, double *, int *);
 
-int BLASFUNC(cherk)(char *, char *, int *, int *, float  *, float  *, int *,
-	   float  *, float  *, int *);
-int BLASFUNC(zherk)(char *, char *, int *, int *, double *, double *, int *,
-	   double *, double *, int *);
-int BLASFUNC(xherk)(char *, char *, int *, int *, double *, double *, int *,
-	   double *, double *, int *);
-
-int BLASFUNC(cher2k)(char *, char *, int *, int *, float  *, float  *, int *,
-	   float *, int *, float  *, float  *, int *);
-int BLASFUNC(zher2k)(char *, char *, int *, int *, double *, double *, int *,
-	   double*, int *, double *, double *, int *);
-int BLASFUNC(xher2k)(char *, char *, int *, int *, double *, double *, int *,
-	   double*, int *, double *, double *, int *);
-int BLASFUNC(cher2m)(char *, char *, char *, int *, int *, float  *, float  *, int *,
-	   float *, int *, float  *, float  *, int *);
-int BLASFUNC(zher2m)(char *, char *, char *, int *, int *, double *, double *, int *,
-	   double*, int *, double *, double *, int *);
-int BLASFUNC(xher2m)(char *, char *, char *, int *, int *, double *, double *, int *,
-	   double*, int *, double *, double *, int *);
-
-int BLASFUNC(sgemt)(char *, int *, int *, float  *, float  *, int *,
-		    float  *, int *);
-int BLASFUNC(dgemt)(char *, int *, int *, double *, double *, int *,
-		    double *, int *);
-int BLASFUNC(cgemt)(char *, int *, int *, float  *, float  *, int *,
-		    float  *, int *);
-int BLASFUNC(zgemt)(char *, int *, int *, double *, double *, int *,
-		    double *, int *);
+int BLASFUNC(cherk)(const char *, const char *, const int *, const int *, const float  *, const float  *, const int *, const float  *, float  *, const int *);
+int BLASFUNC(zherk)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(xherk)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, double *, const int *);
+
+int BLASFUNC(cher2k)(const char *, const char *, const int *, const int *, const float  *, const float  *, const int *, const float  *, const int *, const float  *, float  *, const int *);
+int BLASFUNC(zher2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(xher2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(cher2m)(const char *, const char *, const char *, const int *, const int *, const float  *, const float  *, const int *, const float *, const int *, const float  *, float  *, const int *);
+int BLASFUNC(zher2m)(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double*, const int *, const double *, double *, const int *);
+int BLASFUNC(xher2m)(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double*, const int *, const double *, double *, const int *);
 
-int BLASFUNC(sgema)(char *, char *, int *, int *, float  *,
-		    float  *, int *, float *, float  *, int *, float *, int *);
-int BLASFUNC(dgema)(char *, char *, int *, int *, double *,
-		    double *, int *, double*, double *, int *, double*, int *);
-int BLASFUNC(cgema)(char *, char *, int *, int *, float  *,
-		    float  *, int *, float *, float  *, int *, float *, int *);
-int BLASFUNC(zgema)(char *, char *, int *, int *, double *,
-		    double *, int *, double*, double *, int *, double*, int *);
-
-int BLASFUNC(sgems)(char *, char *, int *, int *, float  *,
-		    float  *, int *, float *, float  *, int *, float *, int *);
-int BLASFUNC(dgems)(char *, char *, int *, int *, double *,
-		    double *, int *, double*, double *, int *, double*, int *);
-int BLASFUNC(cgems)(char *, char *, int *, int *, float  *,
-		    float  *, int *, float *, float  *, int *, float *, int *);
-int BLASFUNC(zgems)(char *, char *, int *, int *, double *,
-		    double *, int *, double*, double *, int *, double*, int *);
-
-int BLASFUNC(sgetf2)(int *, int *, float  *, int *, int *, int *);
-int BLASFUNC(dgetf2)(int *, int *, double *, int *, int *, int *);
-int BLASFUNC(qgetf2)(int *, int *, double *, int *, int *, int *);
-int BLASFUNC(cgetf2)(int *, int *, float  *, int *, int *, int *);
-int BLASFUNC(zgetf2)(int *, int *, double *, int *, int *, int *);
-int BLASFUNC(xgetf2)(int *, int *, double *, int *, int *, int *);
-
-int BLASFUNC(sgetrf)(int *, int *, float  *, int *, int *, int *);
-int BLASFUNC(dgetrf)(int *, int *, double *, int *, int *, int *);
-int BLASFUNC(qgetrf)(int *, int *, double *, int *, int *, int *);
-int BLASFUNC(cgetrf)(int *, int *, float  *, int *, int *, int *);
-int BLASFUNC(zgetrf)(int *, int *, double *, int *, int *, int *);
-int BLASFUNC(xgetrf)(int *, int *, double *, int *, int *, int *);
-
-int BLASFUNC(slaswp)(int *, float  *, int *, int *, int *, int *, int *);
-int BLASFUNC(dlaswp)(int *, double *, int *, int *, int *, int *, int *);
-int BLASFUNC(qlaswp)(int *, double *, int *, int *, int *, int *, int *);
-int BLASFUNC(claswp)(int *, float  *, int *, int *, int *, int *, int *);
-int BLASFUNC(zlaswp)(int *, double *, int *, int *, int *, int *, int *);
-int BLASFUNC(xlaswp)(int *, double *, int *, int *, int *, int *, int *);
-
-int BLASFUNC(sgetrs)(char *, int *, int *, float  *, int *, int *, float  *, int *, int *);
-int BLASFUNC(dgetrs)(char *, int *, int *, double *, int *, int *, double *, int *, int *);
-int BLASFUNC(qgetrs)(char *, int *, int *, double *, int *, int *, double *, int *, int *);
-int BLASFUNC(cgetrs)(char *, int *, int *, float  *, int *, int *, float  *, int *, int *);
-int BLASFUNC(zgetrs)(char *, int *, int *, double *, int *, int *, double *, int *, int *);
-int BLASFUNC(xgetrs)(char *, int *, int *, double *, int *, int *, double *, int *, int *);
-
-int BLASFUNC(sgesv)(int *, int *, float  *, int *, int *, float *, int *, int *);
-int BLASFUNC(dgesv)(int *, int *, double *, int *, int *, double*, int *, int *);
-int BLASFUNC(qgesv)(int *, int *, double *, int *, int *, double*, int *, int *);
-int BLASFUNC(cgesv)(int *, int *, float  *, int *, int *, float *, int *, int *);
-int BLASFUNC(zgesv)(int *, int *, double *, int *, int *, double*, int *, int *);
-int BLASFUNC(xgesv)(int *, int *, double *, int *, int *, double*, int *, int *);
-
-int BLASFUNC(spotf2)(char *, int *, float  *, int *, int *);
-int BLASFUNC(dpotf2)(char *, int *, double *, int *, int *);
-int BLASFUNC(qpotf2)(char *, int *, double *, int *, int *);
-int BLASFUNC(cpotf2)(char *, int *, float  *, int *, int *);
-int BLASFUNC(zpotf2)(char *, int *, double *, int *, int *);
-int BLASFUNC(xpotf2)(char *, int *, double *, int *, int *);
-
-int BLASFUNC(spotrf)(char *, int *, float  *, int *, int *);
-int BLASFUNC(dpotrf)(char *, int *, double *, int *, int *);
-int BLASFUNC(qpotrf)(char *, int *, double *, int *, int *);
-int BLASFUNC(cpotrf)(char *, int *, float  *, int *, int *);
-int BLASFUNC(zpotrf)(char *, int *, double *, int *, int *);
-int BLASFUNC(xpotrf)(char *, int *, double *, int *, int *);
-
-int BLASFUNC(slauu2)(char *, int *, float  *, int *, int *);
-int BLASFUNC(dlauu2)(char *, int *, double *, int *, int *);
-int BLASFUNC(qlauu2)(char *, int *, double *, int *, int *);
-int BLASFUNC(clauu2)(char *, int *, float  *, int *, int *);
-int BLASFUNC(zlauu2)(char *, int *, double *, int *, int *);
-int BLASFUNC(xlauu2)(char *, int *, double *, int *, int *);
-
-int BLASFUNC(slauum)(char *, int *, float  *, int *, int *);
-int BLASFUNC(dlauum)(char *, int *, double *, int *, int *);
-int BLASFUNC(qlauum)(char *, int *, double *, int *, int *);
-int BLASFUNC(clauum)(char *, int *, float  *, int *, int *);
-int BLASFUNC(zlauum)(char *, int *, double *, int *, int *);
-int BLASFUNC(xlauum)(char *, int *, double *, int *, int *);
-
-int BLASFUNC(strti2)(char *, char *, int *, float  *, int *, int *);
-int BLASFUNC(dtrti2)(char *, char *, int *, double *, int *, int *);
-int BLASFUNC(qtrti2)(char *, char *, int *, double *, int *, int *);
-int BLASFUNC(ctrti2)(char *, char *, int *, float  *, int *, int *);
-int BLASFUNC(ztrti2)(char *, char *, int *, double *, int *, int *);
-int BLASFUNC(xtrti2)(char *, char *, int *, double *, int *, int *);
-
-int BLASFUNC(strtri)(char *, char *, int *, float  *, int *, int *);
-int BLASFUNC(dtrtri)(char *, char *, int *, double *, int *, int *);
-int BLASFUNC(qtrtri)(char *, char *, int *, double *, int *, int *);
-int BLASFUNC(ctrtri)(char *, char *, int *, float  *, int *, int *);
-int BLASFUNC(ztrtri)(char *, char *, int *, double *, int *, int *);
-int BLASFUNC(xtrtri)(char *, char *, int *, double *, int *, int *);
-
-int BLASFUNC(spotri)(char *, int *, float  *, int *, int *);
-int BLASFUNC(dpotri)(char *, int *, double *, int *, int *);
-int BLASFUNC(qpotri)(char *, int *, double *, int *, int *);
-int BLASFUNC(cpotri)(char *, int *, float  *, int *, int *);
-int BLASFUNC(zpotri)(char *, int *, double *, int *, int *);
-int BLASFUNC(xpotri)(char *, int *, double *, int *, int *);
 
 #ifdef __cplusplus
 }
diff --git a/Eigen/src/misc/lapack.h b/Eigen/src/misc/lapack.h
new file mode 100644
index 000000000..249f3575c
--- /dev/null
+++ b/Eigen/src/misc/lapack.h
@@ -0,0 +1,152 @@
+#ifndef LAPACK_H
+#define LAPACK_H
+
+#include "blas.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+int BLASFUNC(csymv) (const char *, const int *, const float  *, const float  *, const int *, const float  *, const int *, const float  *, float  *, const int *);
+int BLASFUNC(zsymv) (const char *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(xsymv) (const char *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+
+
+int BLASFUNC(cspmv) (char *, int *, float  *, float *,
+         float  *, int *, float *, float *, int *);
+int BLASFUNC(zspmv) (char *, int *, double  *, double *,
+         double  *, int *, double *, double *, int *);
+int BLASFUNC(xspmv) (char *, int *, double  *, double *,
+         double  *, int *, double *, double *, int *);
+
+int BLASFUNC(csyr) (char *, int *, float   *, float  *, int *,
+        float  *, int *);
+int BLASFUNC(zsyr) (char *, int *, double  *, double *, int *,
+        double *, int *);
+int BLASFUNC(xsyr) (char *, int *, double  *, double *, int *,
+        double *, int *);
+
+int BLASFUNC(cspr) (char *, int *, float   *, float  *, int *,
+        float  *);
+int BLASFUNC(zspr) (char *, int *, double  *, double *, int *,
+        double *);
+int BLASFUNC(xspr) (char *, int *, double  *, double *, int *,
+        double *);
+
+int BLASFUNC(sgemt)(char *, int *, int *, float  *, float  *, int *,
+        float  *, int *);
+int BLASFUNC(dgemt)(char *, int *, int *, double *, double *, int *,
+        double *, int *);
+int BLASFUNC(cgemt)(char *, int *, int *, float  *, float  *, int *,
+        float  *, int *);
+int BLASFUNC(zgemt)(char *, int *, int *, double *, double *, int *,
+        double *, int *);
+
+int BLASFUNC(sgema)(char *, char *, int *, int *, float  *,
+        float  *, int *, float *, float  *, int *, float *, int *);
+int BLASFUNC(dgema)(char *, char *, int *, int *, double *,
+        double *, int *, double*, double *, int *, double*, int *);
+int BLASFUNC(cgema)(char *, char *, int *, int *, float  *,
+        float  *, int *, float *, float  *, int *, float *, int *);
+int BLASFUNC(zgema)(char *, char *, int *, int *, double *,
+        double *, int *, double*, double *, int *, double*, int *);
+
+int BLASFUNC(sgems)(char *, char *, int *, int *, float  *,
+        float  *, int *, float *, float  *, int *, float *, int *);
+int BLASFUNC(dgems)(char *, char *, int *, int *, double *,
+        double *, int *, double*, double *, int *, double*, int *);
+int BLASFUNC(cgems)(char *, char *, int *, int *, float  *,
+        float  *, int *, float *, float  *, int *, float *, int *);
+int BLASFUNC(zgems)(char *, char *, int *, int *, double *,
+        double *, int *, double*, double *, int *, double*, int *);
+
+int BLASFUNC(sgetf2)(int *, int *, float  *, int *, int *, int *);
+int BLASFUNC(dgetf2)(int *, int *, double *, int *, int *, int *);
+int BLASFUNC(qgetf2)(int *, int *, double *, int *, int *, int *);
+int BLASFUNC(cgetf2)(int *, int *, float  *, int *, int *, int *);
+int BLASFUNC(zgetf2)(int *, int *, double *, int *, int *, int *);
+int BLASFUNC(xgetf2)(int *, int *, double *, int *, int *, int *);
+
+int BLASFUNC(sgetrf)(int *, int *, float  *, int *, int *, int *);
+int BLASFUNC(dgetrf)(int *, int *, double *, int *, int *, int *);
+int BLASFUNC(qgetrf)(int *, int *, double *, int *, int *, int *);
+int BLASFUNC(cgetrf)(int *, int *, float  *, int *, int *, int *);
+int BLASFUNC(zgetrf)(int *, int *, double *, int *, int *, int *);
+int BLASFUNC(xgetrf)(int *, int *, double *, int *, int *, int *);
+
+int BLASFUNC(slaswp)(int *, float  *, int *, int *, int *, int *, int *);
+int BLASFUNC(dlaswp)(int *, double *, int *, int *, int *, int *, int *);
+int BLASFUNC(qlaswp)(int *, double *, int *, int *, int *, int *, int *);
+int BLASFUNC(claswp)(int *, float  *, int *, int *, int *, int *, int *);
+int BLASFUNC(zlaswp)(int *, double *, int *, int *, int *, int *, int *);
+int BLASFUNC(xlaswp)(int *, double *, int *, int *, int *, int *, int *);
+
+int BLASFUNC(sgetrs)(char *, int *, int *, float  *, int *, int *, float  *, int *, int *);
+int BLASFUNC(dgetrs)(char *, int *, int *, double *, int *, int *, double *, int *, int *);
+int BLASFUNC(qgetrs)(char *, int *, int *, double *, int *, int *, double *, int *, int *);
+int BLASFUNC(cgetrs)(char *, int *, int *, float  *, int *, int *, float  *, int *, int *);
+int BLASFUNC(zgetrs)(char *, int *, int *, double *, int *, int *, double *, int *, int *);
+int BLASFUNC(xgetrs)(char *, int *, int *, double *, int *, int *, double *, int *, int *);
+
+int BLASFUNC(sgesv)(int *, int *, float  *, int *, int *, float *, int *, int *);
+int BLASFUNC(dgesv)(int *, int *, double *, int *, int *, double*, int *, int *);
+int BLASFUNC(qgesv)(int *, int *, double *, int *, int *, double*, int *, int *);
+int BLASFUNC(cgesv)(int *, int *, float  *, int *, int *, float *, int *, int *);
+int BLASFUNC(zgesv)(int *, int *, double *, int *, int *, double*, int *, int *);
+int BLASFUNC(xgesv)(int *, int *, double *, int *, int *, double*, int *, int *);
+
+int BLASFUNC(spotf2)(char *, int *, float  *, int *, int *);
+int BLASFUNC(dpotf2)(char *, int *, double *, int *, int *);
+int BLASFUNC(qpotf2)(char *, int *, double *, int *, int *);
+int BLASFUNC(cpotf2)(char *, int *, float  *, int *, int *);
+int BLASFUNC(zpotf2)(char *, int *, double *, int *, int *);
+int BLASFUNC(xpotf2)(char *, int *, double *, int *, int *);
+
+int BLASFUNC(spotrf)(char *, int *, float  *, int *, int *);
+int BLASFUNC(dpotrf)(char *, int *, double *, int *, int *);
+int BLASFUNC(qpotrf)(char *, int *, double *, int *, int *);
+int BLASFUNC(cpotrf)(char *, int *, float  *, int *, int *);
+int BLASFUNC(zpotrf)(char *, int *, double *, int *, int *);
+int BLASFUNC(xpotrf)(char *, int *, double *, int *, int *);
+
+int BLASFUNC(slauu2)(char *, int *, float  *, int *, int *);
+int BLASFUNC(dlauu2)(char *, int *, double *, int *, int *);
+int BLASFUNC(qlauu2)(char *, int *, double *, int *, int *);
+int BLASFUNC(clauu2)(char *, int *, float  *, int *, int *);
+int BLASFUNC(zlauu2)(char *, int *, double *, int *, int *);
+int BLASFUNC(xlauu2)(char *, int *, double *, int *, int *);
+
+int BLASFUNC(slauum)(char *, int *, float  *, int *, int *);
+int BLASFUNC(dlauum)(char *, int *, double *, int *, int *);
+int BLASFUNC(qlauum)(char *, int *, double *, int *, int *);
+int BLASFUNC(clauum)(char *, int *, float  *, int *, int *);
+int BLASFUNC(zlauum)(char *, int *, double *, int *, int *);
+int BLASFUNC(xlauum)(char *, int *, double *, int *, int *);
+
+int BLASFUNC(strti2)(char *, char *, int *, float  *, int *, int *);
+int BLASFUNC(dtrti2)(char *, char *, int *, double *, int *, int *);
+int BLASFUNC(qtrti2)(char *, char *, int *, double *, int *, int *);
+int BLASFUNC(ctrti2)(char *, char *, int *, float  *, int *, int *);
+int BLASFUNC(ztrti2)(char *, char *, int *, double *, int *, int *);
+int BLASFUNC(xtrti2)(char *, char *, int *, double *, int *, int *);
+
+int BLASFUNC(strtri)(char *, char *, int *, float  *, int *, int *);
+int BLASFUNC(dtrtri)(char *, char *, int *, double *, int *, int *);
+int BLASFUNC(qtrtri)(char *, char *, int *, double *, int *, int *);
+int BLASFUNC(ctrtri)(char *, char *, int *, float  *, int *, int *);
+int BLASFUNC(ztrtri)(char *, char *, int *, double *, int *, int *);
+int BLASFUNC(xtrtri)(char *, char *, int *, double *, int *, int *);
+
+int BLASFUNC(spotri)(char *, int *, float  *, int *, int *);
+int BLASFUNC(dpotri)(char *, int *, double *, int *, int *);
+int BLASFUNC(qpotri)(char *, int *, double *, int *, int *);
+int BLASFUNC(cpotri)(char *, int *, float  *, int *, int *);
+int BLASFUNC(zpotri)(char *, int *, double *, int *, int *);
+int BLASFUNC(xpotri)(char *, int *, double *, int *, int *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/Eigen/src/misc/lapacke.h b/Eigen/src/misc/lapacke.h
new file mode 100755
index 000000000..8c7e79b03
--- /dev/null
+++ b/Eigen/src/misc/lapacke.h
@@ -0,0 +1,16291 @@
+/*****************************************************************************
+  Copyright (c) 2010, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+******************************************************************************
+* Contents: Native C interface to LAPACK
+* Author: Intel Corporation
+* Generated November, 2011
+*****************************************************************************/
+
+#ifndef _MKL_LAPACKE_H_
+
+#ifndef _LAPACKE_H_
+#define _LAPACKE_H_
+
+/*
+*  Turn on HAVE_LAPACK_CONFIG_H to redefine C-LAPACK datatypes
+*/
+#ifdef HAVE_LAPACK_CONFIG_H
+#include "lapacke_config.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+#include <stdlib.h>
+
+#ifndef lapack_int
+#define lapack_int     int
+#endif
+
+#ifndef lapack_logical
+#define lapack_logical lapack_int
+#endif
+
+/* Complex types are structures equivalent to the
+* Fortran complex types COMPLEX(4) and COMPLEX(8).
+*
+* One can also redefine the types with his own types
+* for example by including in the code definitions like
+*
+* #define lapack_complex_float std::complex<float>
+* #define lapack_complex_double std::complex<double>
+*
+* or define these types in the command line:
+*
+* -Dlapack_complex_float="std::complex<float>"
+* -Dlapack_complex_double="std::complex<double>"
+*/
+
+#ifndef LAPACK_COMPLEX_CUSTOM
+
+/* Complex type (single precision) */
+#ifndef lapack_complex_float
+#include <complex.h>
+#define lapack_complex_float    float _Complex
+#endif
+
+#ifndef lapack_complex_float_real
+#define lapack_complex_float_real(z)       (creal(z))
+#endif
+
+#ifndef lapack_complex_float_imag
+#define lapack_complex_float_imag(z)       (cimag(z))
+#endif
+
+lapack_complex_float lapack_make_complex_float( float re, float im );
+
+/* Complex type (double precision) */
+#ifndef lapack_complex_double
+#include <complex.h>
+#define lapack_complex_double   double _Complex
+#endif
+
+#ifndef lapack_complex_double_real
+#define lapack_complex_double_real(z)      (creal(z))
+#endif
+
+#ifndef lapack_complex_double_imag
+#define lapack_complex_double_imag(z)       (cimag(z))
+#endif
+
+lapack_complex_double lapack_make_complex_double( double re, double im );
+
+#endif
+
+#ifndef LAPACKE_malloc
+#define LAPACKE_malloc( size ) malloc( size )
+#endif
+#ifndef LAPACKE_free
+#define LAPACKE_free( p )      free( p )
+#endif
+
+#define LAPACK_C2INT( x ) (lapack_int)(*((float*)&x ))
+#define LAPACK_Z2INT( x ) (lapack_int)(*((double*)&x ))
+
+#define LAPACK_ROW_MAJOR               101
+#define LAPACK_COL_MAJOR               102
+
+#define LAPACK_WORK_MEMORY_ERROR       -1010
+#define LAPACK_TRANSPOSE_MEMORY_ERROR  -1011
+
+/* Callback logical functions of one, two, or three arguments are used
+*  to select eigenvalues to sort to the top left of the Schur form.
+*  The value is selected if function returns TRUE (non-zero). */
+
+typedef lapack_logical (*LAPACK_S_SELECT2) ( const float*, const float* );
+typedef lapack_logical (*LAPACK_S_SELECT3)
+    ( const float*, const float*, const float* );
+typedef lapack_logical (*LAPACK_D_SELECT2) ( const double*, const double* );
+typedef lapack_logical (*LAPACK_D_SELECT3)
+    ( const double*, const double*, const double* );
+
+typedef lapack_logical (*LAPACK_C_SELECT1) ( const lapack_complex_float* );
+typedef lapack_logical (*LAPACK_C_SELECT2)
+    ( const lapack_complex_float*, const lapack_complex_float* );
+typedef lapack_logical (*LAPACK_Z_SELECT1) ( const lapack_complex_double* );
+typedef lapack_logical (*LAPACK_Z_SELECT2)
+    ( const lapack_complex_double*, const lapack_complex_double* );
+
+#include "lapacke_mangling.h"
+
+#define LAPACK_lsame LAPACK_GLOBAL(lsame,LSAME)
+lapack_logical LAPACK_lsame( char* ca,  char* cb,
+                              lapack_int lca, lapack_int lcb );
+
+/* C-LAPACK function prototypes */
+
+lapack_int LAPACKE_sbdsdc( int matrix_order, char uplo, char compq,
+                           lapack_int n, float* d, float* e, float* u,
+                           lapack_int ldu, float* vt, lapack_int ldvt, float* q,
+                           lapack_int* iq );
+lapack_int LAPACKE_dbdsdc( int matrix_order, char uplo, char compq,
+                           lapack_int n, double* d, double* e, double* u,
+                           lapack_int ldu, double* vt, lapack_int ldvt,
+                           double* q, lapack_int* iq );
+
+lapack_int LAPACKE_sbdsqr( int matrix_order, char uplo, lapack_int n,
+                           lapack_int ncvt, lapack_int nru, lapack_int ncc,
+                           float* d, float* e, float* vt, lapack_int ldvt,
+                           float* u, lapack_int ldu, float* c, lapack_int ldc );
+lapack_int LAPACKE_dbdsqr( int matrix_order, char uplo, lapack_int n,
+                           lapack_int ncvt, lapack_int nru, lapack_int ncc,
+                           double* d, double* e, double* vt, lapack_int ldvt,
+                           double* u, lapack_int ldu, double* c,
+                           lapack_int ldc );
+lapack_int LAPACKE_cbdsqr( int matrix_order, char uplo, lapack_int n,
+                           lapack_int ncvt, lapack_int nru, lapack_int ncc,
+                           float* d, float* e, lapack_complex_float* vt,
+                           lapack_int ldvt, lapack_complex_float* u,
+                           lapack_int ldu, lapack_complex_float* c,
+                           lapack_int ldc );
+lapack_int LAPACKE_zbdsqr( int matrix_order, char uplo, lapack_int n,
+                           lapack_int ncvt, lapack_int nru, lapack_int ncc,
+                           double* d, double* e, lapack_complex_double* vt,
+                           lapack_int ldvt, lapack_complex_double* u,
+                           lapack_int ldu, lapack_complex_double* c,
+                           lapack_int ldc );
+
+lapack_int LAPACKE_sdisna( char job, lapack_int m, lapack_int n, const float* d,
+                           float* sep );
+lapack_int LAPACKE_ddisna( char job, lapack_int m, lapack_int n,
+                           const double* d, double* sep );
+
+lapack_int LAPACKE_sgbbrd( int matrix_order, char vect, lapack_int m,
+                           lapack_int n, lapack_int ncc, lapack_int kl,
+                           lapack_int ku, float* ab, lapack_int ldab, float* d,
+                           float* e, float* q, lapack_int ldq, float* pt,
+                           lapack_int ldpt, float* c, lapack_int ldc );
+lapack_int LAPACKE_dgbbrd( int matrix_order, char vect, lapack_int m,
+                           lapack_int n, lapack_int ncc, lapack_int kl,
+                           lapack_int ku, double* ab, lapack_int ldab,
+                           double* d, double* e, double* q, lapack_int ldq,
+                           double* pt, lapack_int ldpt, double* c,
+                           lapack_int ldc );
+lapack_int LAPACKE_cgbbrd( int matrix_order, char vect, lapack_int m,
+                           lapack_int n, lapack_int ncc, lapack_int kl,
+                           lapack_int ku, lapack_complex_float* ab,
+                           lapack_int ldab, float* d, float* e,
+                           lapack_complex_float* q, lapack_int ldq,
+                           lapack_complex_float* pt, lapack_int ldpt,
+                           lapack_complex_float* c, lapack_int ldc );
+lapack_int LAPACKE_zgbbrd( int matrix_order, char vect, lapack_int m,
+                           lapack_int n, lapack_int ncc, lapack_int kl,
+                           lapack_int ku, lapack_complex_double* ab,
+                           lapack_int ldab, double* d, double* e,
+                           lapack_complex_double* q, lapack_int ldq,
+                           lapack_complex_double* pt, lapack_int ldpt,
+                           lapack_complex_double* c, lapack_int ldc );
+
+lapack_int LAPACKE_sgbcon( int matrix_order, char norm, lapack_int n,
+                           lapack_int kl, lapack_int ku, const float* ab,
+                           lapack_int ldab, const lapack_int* ipiv, float anorm,
+                           float* rcond );
+lapack_int LAPACKE_dgbcon( int matrix_order, char norm, lapack_int n,
+                           lapack_int kl, lapack_int ku, const double* ab,
+                           lapack_int ldab, const lapack_int* ipiv,
+                           double anorm, double* rcond );
+lapack_int LAPACKE_cgbcon( int matrix_order, char norm, lapack_int n,
+                           lapack_int kl, lapack_int ku,
+                           const lapack_complex_float* ab, lapack_int ldab,
+                           const lapack_int* ipiv, float anorm, float* rcond );
+lapack_int LAPACKE_zgbcon( int matrix_order, char norm, lapack_int n,
+                           lapack_int kl, lapack_int ku,
+                           const lapack_complex_double* ab, lapack_int ldab,
+                           const lapack_int* ipiv, double anorm,
+                           double* rcond );
+
+lapack_int LAPACKE_sgbequ( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int kl, lapack_int ku, const float* ab,
+                           lapack_int ldab, float* r, float* c, float* rowcnd,
+                           float* colcnd, float* amax );
+lapack_int LAPACKE_dgbequ( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int kl, lapack_int ku, const double* ab,
+                           lapack_int ldab, double* r, double* c,
+                           double* rowcnd, double* colcnd, double* amax );
+lapack_int LAPACKE_cgbequ( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int kl, lapack_int ku,
+                           const lapack_complex_float* ab, lapack_int ldab,
+                           float* r, float* c, float* rowcnd, float* colcnd,
+                           float* amax );
+lapack_int LAPACKE_zgbequ( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int kl, lapack_int ku,
+                           const lapack_complex_double* ab, lapack_int ldab,
+                           double* r, double* c, double* rowcnd, double* colcnd,
+                           double* amax );
+
+lapack_int LAPACKE_sgbequb( int matrix_order, lapack_int m, lapack_int n,
+                            lapack_int kl, lapack_int ku, const float* ab,
+                            lapack_int ldab, float* r, float* c, float* rowcnd,
+                            float* colcnd, float* amax );
+lapack_int LAPACKE_dgbequb( int matrix_order, lapack_int m, lapack_int n,
+                            lapack_int kl, lapack_int ku, const double* ab,
+                            lapack_int ldab, double* r, double* c,
+                            double* rowcnd, double* colcnd, double* amax );
+lapack_int LAPACKE_cgbequb( int matrix_order, lapack_int m, lapack_int n,
+                            lapack_int kl, lapack_int ku,
+                            const lapack_complex_float* ab, lapack_int ldab,
+                            float* r, float* c, float* rowcnd, float* colcnd,
+                            float* amax );
+lapack_int LAPACKE_zgbequb( int matrix_order, lapack_int m, lapack_int n,
+                            lapack_int kl, lapack_int ku,
+                            const lapack_complex_double* ab, lapack_int ldab,
+                            double* r, double* c, double* rowcnd,
+                            double* colcnd, double* amax );
+
+lapack_int LAPACKE_sgbrfs( int matrix_order, char trans, lapack_int n,
+                           lapack_int kl, lapack_int ku, lapack_int nrhs,
+                           const float* ab, lapack_int ldab, const float* afb,
+                           lapack_int ldafb, const lapack_int* ipiv,
+                           const float* b, lapack_int ldb, float* x,
+                           lapack_int ldx, float* ferr, float* berr );
+lapack_int LAPACKE_dgbrfs( int matrix_order, char trans, lapack_int n,
+                           lapack_int kl, lapack_int ku, lapack_int nrhs,
+                           const double* ab, lapack_int ldab, const double* afb,
+                           lapack_int ldafb, const lapack_int* ipiv,
+                           const double* b, lapack_int ldb, double* x,
+                           lapack_int ldx, double* ferr, double* berr );
+lapack_int LAPACKE_cgbrfs( int matrix_order, char trans, lapack_int n,
+                           lapack_int kl, lapack_int ku, lapack_int nrhs,
+                           const lapack_complex_float* ab, lapack_int ldab,
+                           const lapack_complex_float* afb, lapack_int ldafb,
+                           const lapack_int* ipiv,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx, float* ferr,
+                           float* berr );
+lapack_int LAPACKE_zgbrfs( int matrix_order, char trans, lapack_int n,
+                           lapack_int kl, lapack_int ku, lapack_int nrhs,
+                           const lapack_complex_double* ab, lapack_int ldab,
+                           const lapack_complex_double* afb, lapack_int ldafb,
+                           const lapack_int* ipiv,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+
+lapack_int LAPACKE_sgbrfsx( int matrix_order, char trans, char equed,
+                            lapack_int n, lapack_int kl, lapack_int ku,
+                            lapack_int nrhs, const float* ab, lapack_int ldab,
+                            const float* afb, lapack_int ldafb,
+                            const lapack_int* ipiv, const float* r,
+                            const float* c, const float* b, lapack_int ldb,
+                            float* x, lapack_int ldx, float* rcond, float* berr,
+                            lapack_int n_err_bnds, float* err_bnds_norm,
+                            float* err_bnds_comp, lapack_int nparams,
+                            float* params );
+lapack_int LAPACKE_dgbrfsx( int matrix_order, char trans, char equed,
+                            lapack_int n, lapack_int kl, lapack_int ku,
+                            lapack_int nrhs, const double* ab, lapack_int ldab,
+                            const double* afb, lapack_int ldafb,
+                            const lapack_int* ipiv, const double* r,
+                            const double* c, const double* b, lapack_int ldb,
+                            double* x, lapack_int ldx, double* rcond,
+                            double* berr, lapack_int n_err_bnds,
+                            double* err_bnds_norm, double* err_bnds_comp,
+                            lapack_int nparams, double* params );
+lapack_int LAPACKE_cgbrfsx( int matrix_order, char trans, char equed,
+                            lapack_int n, lapack_int kl, lapack_int ku,
+                            lapack_int nrhs, const lapack_complex_float* ab,
+                            lapack_int ldab, const lapack_complex_float* afb,
+                            lapack_int ldafb, const lapack_int* ipiv,
+                            const float* r, const float* c,
+                            const lapack_complex_float* b, lapack_int ldb,
+                            lapack_complex_float* x, lapack_int ldx,
+                            float* rcond, float* berr, lapack_int n_err_bnds,
+                            float* err_bnds_norm, float* err_bnds_comp,
+                            lapack_int nparams, float* params );
+lapack_int LAPACKE_zgbrfsx( int matrix_order, char trans, char equed,
+                            lapack_int n, lapack_int kl, lapack_int ku,
+                            lapack_int nrhs, const lapack_complex_double* ab,
+                            lapack_int ldab, const lapack_complex_double* afb,
+                            lapack_int ldafb, const lapack_int* ipiv,
+                            const double* r, const double* c,
+                            const lapack_complex_double* b, lapack_int ldb,
+                            lapack_complex_double* x, lapack_int ldx,
+                            double* rcond, double* berr, lapack_int n_err_bnds,
+                            double* err_bnds_norm, double* err_bnds_comp,
+                            lapack_int nparams, double* params );
+
+lapack_int LAPACKE_sgbsv( int matrix_order, lapack_int n, lapack_int kl,
+                          lapack_int ku, lapack_int nrhs, float* ab,
+                          lapack_int ldab, lapack_int* ipiv, float* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_dgbsv( int matrix_order, lapack_int n, lapack_int kl,
+                          lapack_int ku, lapack_int nrhs, double* ab,
+                          lapack_int ldab, lapack_int* ipiv, double* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_cgbsv( int matrix_order, lapack_int n, lapack_int kl,
+                          lapack_int ku, lapack_int nrhs,
+                          lapack_complex_float* ab, lapack_int ldab,
+                          lapack_int* ipiv, lapack_complex_float* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_zgbsv( int matrix_order, lapack_int n, lapack_int kl,
+                          lapack_int ku, lapack_int nrhs,
+                          lapack_complex_double* ab, lapack_int ldab,
+                          lapack_int* ipiv, lapack_complex_double* b,
+                          lapack_int ldb );
+
+lapack_int LAPACKE_sgbsvx( int matrix_order, char fact, char trans,
+                           lapack_int n, lapack_int kl, lapack_int ku,
+                           lapack_int nrhs, float* ab, lapack_int ldab,
+                           float* afb, lapack_int ldafb, lapack_int* ipiv,
+                           char* equed, float* r, float* c, float* b,
+                           lapack_int ldb, float* x, lapack_int ldx,
+                           float* rcond, float* ferr, float* berr,
+                           float* rpivot );
+lapack_int LAPACKE_dgbsvx( int matrix_order, char fact, char trans,
+                           lapack_int n, lapack_int kl, lapack_int ku,
+                           lapack_int nrhs, double* ab, lapack_int ldab,
+                           double* afb, lapack_int ldafb, lapack_int* ipiv,
+                           char* equed, double* r, double* c, double* b,
+                           lapack_int ldb, double* x, lapack_int ldx,
+                           double* rcond, double* ferr, double* berr,
+                           double* rpivot );
+lapack_int LAPACKE_cgbsvx( int matrix_order, char fact, char trans,
+                           lapack_int n, lapack_int kl, lapack_int ku,
+                           lapack_int nrhs, lapack_complex_float* ab,
+                           lapack_int ldab, lapack_complex_float* afb,
+                           lapack_int ldafb, lapack_int* ipiv, char* equed,
+                           float* r, float* c, lapack_complex_float* b,
+                           lapack_int ldb, lapack_complex_float* x,
+                           lapack_int ldx, float* rcond, float* ferr,
+                           float* berr, float* rpivot );
+lapack_int LAPACKE_zgbsvx( int matrix_order, char fact, char trans,
+                           lapack_int n, lapack_int kl, lapack_int ku,
+                           lapack_int nrhs, lapack_complex_double* ab,
+                           lapack_int ldab, lapack_complex_double* afb,
+                           lapack_int ldafb, lapack_int* ipiv, char* equed,
+                           double* r, double* c, lapack_complex_double* b,
+                           lapack_int ldb, lapack_complex_double* x,
+                           lapack_int ldx, double* rcond, double* ferr,
+                           double* berr, double* rpivot );
+
+lapack_int LAPACKE_sgbsvxx( int matrix_order, char fact, char trans,
+                            lapack_int n, lapack_int kl, lapack_int ku,
+                            lapack_int nrhs, float* ab, lapack_int ldab,
+                            float* afb, lapack_int ldafb, lapack_int* ipiv,
+                            char* equed, float* r, float* c, float* b,
+                            lapack_int ldb, float* x, lapack_int ldx,
+                            float* rcond, float* rpvgrw, float* berr,
+                            lapack_int n_err_bnds, float* err_bnds_norm,
+                            float* err_bnds_comp, lapack_int nparams,
+                            float* params );
+lapack_int LAPACKE_dgbsvxx( int matrix_order, char fact, char trans,
+                            lapack_int n, lapack_int kl, lapack_int ku,
+                            lapack_int nrhs, double* ab, lapack_int ldab,
+                            double* afb, lapack_int ldafb, lapack_int* ipiv,
+                            char* equed, double* r, double* c, double* b,
+                            lapack_int ldb, double* x, lapack_int ldx,
+                            double* rcond, double* rpvgrw, double* berr,
+                            lapack_int n_err_bnds, double* err_bnds_norm,
+                            double* err_bnds_comp, lapack_int nparams,
+                            double* params );
+lapack_int LAPACKE_cgbsvxx( int matrix_order, char fact, char trans,
+                            lapack_int n, lapack_int kl, lapack_int ku,
+                            lapack_int nrhs, lapack_complex_float* ab,
+                            lapack_int ldab, lapack_complex_float* afb,
+                            lapack_int ldafb, lapack_int* ipiv, char* equed,
+                            float* r, float* c, lapack_complex_float* b,
+                            lapack_int ldb, lapack_complex_float* x,
+                            lapack_int ldx, float* rcond, float* rpvgrw,
+                            float* berr, lapack_int n_err_bnds,
+                            float* err_bnds_norm, float* err_bnds_comp,
+                            lapack_int nparams, float* params );
+lapack_int LAPACKE_zgbsvxx( int matrix_order, char fact, char trans,
+                            lapack_int n, lapack_int kl, lapack_int ku,
+                            lapack_int nrhs, lapack_complex_double* ab,
+                            lapack_int ldab, lapack_complex_double* afb,
+                            lapack_int ldafb, lapack_int* ipiv, char* equed,
+                            double* r, double* c, lapack_complex_double* b,
+                            lapack_int ldb, lapack_complex_double* x,
+                            lapack_int ldx, double* rcond, double* rpvgrw,
+                            double* berr, lapack_int n_err_bnds,
+                            double* err_bnds_norm, double* err_bnds_comp,
+                            lapack_int nparams, double* params );
+
+lapack_int LAPACKE_sgbtrf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int kl, lapack_int ku, float* ab,
+                           lapack_int ldab, lapack_int* ipiv );
+lapack_int LAPACKE_dgbtrf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int kl, lapack_int ku, double* ab,
+                           lapack_int ldab, lapack_int* ipiv );
+lapack_int LAPACKE_cgbtrf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int kl, lapack_int ku,
+                           lapack_complex_float* ab, lapack_int ldab,
+                           lapack_int* ipiv );
+lapack_int LAPACKE_zgbtrf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int kl, lapack_int ku,
+                           lapack_complex_double* ab, lapack_int ldab,
+                           lapack_int* ipiv );
+
+lapack_int LAPACKE_sgbtrs( int matrix_order, char trans, lapack_int n,
+                           lapack_int kl, lapack_int ku, lapack_int nrhs,
+                           const float* ab, lapack_int ldab,
+                           const lapack_int* ipiv, float* b, lapack_int ldb );
+lapack_int LAPACKE_dgbtrs( int matrix_order, char trans, lapack_int n,
+                           lapack_int kl, lapack_int ku, lapack_int nrhs,
+                           const double* ab, lapack_int ldab,
+                           const lapack_int* ipiv, double* b, lapack_int ldb );
+lapack_int LAPACKE_cgbtrs( int matrix_order, char trans, lapack_int n,
+                           lapack_int kl, lapack_int ku, lapack_int nrhs,
+                           const lapack_complex_float* ab, lapack_int ldab,
+                           const lapack_int* ipiv, lapack_complex_float* b,
+                           lapack_int ldb );
+lapack_int LAPACKE_zgbtrs( int matrix_order, char trans, lapack_int n,
+                           lapack_int kl, lapack_int ku, lapack_int nrhs,
+                           const lapack_complex_double* ab, lapack_int ldab,
+                           const lapack_int* ipiv, lapack_complex_double* b,
+                           lapack_int ldb );
+
+lapack_int LAPACKE_sgebak( int matrix_order, char job, char side, lapack_int n,
+                           lapack_int ilo, lapack_int ihi, const float* scale,
+                           lapack_int m, float* v, lapack_int ldv );
+lapack_int LAPACKE_dgebak( int matrix_order, char job, char side, lapack_int n,
+                           lapack_int ilo, lapack_int ihi, const double* scale,
+                           lapack_int m, double* v, lapack_int ldv );
+lapack_int LAPACKE_cgebak( int matrix_order, char job, char side, lapack_int n,
+                           lapack_int ilo, lapack_int ihi, const float* scale,
+                           lapack_int m, lapack_complex_float* v,
+                           lapack_int ldv );
+lapack_int LAPACKE_zgebak( int matrix_order, char job, char side, lapack_int n,
+                           lapack_int ilo, lapack_int ihi, const double* scale,
+                           lapack_int m, lapack_complex_double* v,
+                           lapack_int ldv );
+
+lapack_int LAPACKE_sgebal( int matrix_order, char job, lapack_int n, float* a,
+                           lapack_int lda, lapack_int* ilo, lapack_int* ihi,
+                           float* scale );
+lapack_int LAPACKE_dgebal( int matrix_order, char job, lapack_int n, double* a,
+                           lapack_int lda, lapack_int* ilo, lapack_int* ihi,
+                           double* scale );
+lapack_int LAPACKE_cgebal( int matrix_order, char job, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_int* ilo, lapack_int* ihi, float* scale );
+lapack_int LAPACKE_zgebal( int matrix_order, char job, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_int* ilo, lapack_int* ihi, double* scale );
+
+lapack_int LAPACKE_sgebrd( int matrix_order, lapack_int m, lapack_int n,
+                           float* a, lapack_int lda, float* d, float* e,
+                           float* tauq, float* taup );
+lapack_int LAPACKE_dgebrd( int matrix_order, lapack_int m, lapack_int n,
+                           double* a, lapack_int lda, double* d, double* e,
+                           double* tauq, double* taup );
+lapack_int LAPACKE_cgebrd( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda, float* d,
+                           float* e, lapack_complex_float* tauq,
+                           lapack_complex_float* taup );
+lapack_int LAPACKE_zgebrd( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda, double* d,
+                           double* e, lapack_complex_double* tauq,
+                           lapack_complex_double* taup );
+
+lapack_int LAPACKE_sgecon( int matrix_order, char norm, lapack_int n,
+                           const float* a, lapack_int lda, float anorm,
+                           float* rcond );
+lapack_int LAPACKE_dgecon( int matrix_order, char norm, lapack_int n,
+                           const double* a, lapack_int lda, double anorm,
+                           double* rcond );
+lapack_int LAPACKE_cgecon( int matrix_order, char norm, lapack_int n,
+                           const lapack_complex_float* a, lapack_int lda,
+                           float anorm, float* rcond );
+lapack_int LAPACKE_zgecon( int matrix_order, char norm, lapack_int n,
+                           const lapack_complex_double* a, lapack_int lda,
+                           double anorm, double* rcond );
+
+lapack_int LAPACKE_sgeequ( int matrix_order, lapack_int m, lapack_int n,
+                           const float* a, lapack_int lda, float* r, float* c,
+                           float* rowcnd, float* colcnd, float* amax );
+lapack_int LAPACKE_dgeequ( int matrix_order, lapack_int m, lapack_int n,
+                           const double* a, lapack_int lda, double* r,
+                           double* c, double* rowcnd, double* colcnd,
+                           double* amax );
+lapack_int LAPACKE_cgeequ( int matrix_order, lapack_int m, lapack_int n,
+                           const lapack_complex_float* a, lapack_int lda,
+                           float* r, float* c, float* rowcnd, float* colcnd,
+                           float* amax );
+lapack_int LAPACKE_zgeequ( int matrix_order, lapack_int m, lapack_int n,
+                           const lapack_complex_double* a, lapack_int lda,
+                           double* r, double* c, double* rowcnd, double* colcnd,
+                           double* amax );
+
+lapack_int LAPACKE_sgeequb( int matrix_order, lapack_int m, lapack_int n,
+                            const float* a, lapack_int lda, float* r, float* c,
+                            float* rowcnd, float* colcnd, float* amax );
+lapack_int LAPACKE_dgeequb( int matrix_order, lapack_int m, lapack_int n,
+                            const double* a, lapack_int lda, double* r,
+                            double* c, double* rowcnd, double* colcnd,
+                            double* amax );
+lapack_int LAPACKE_cgeequb( int matrix_order, lapack_int m, lapack_int n,
+                            const lapack_complex_float* a, lapack_int lda,
+                            float* r, float* c, float* rowcnd, float* colcnd,
+                            float* amax );
+lapack_int LAPACKE_zgeequb( int matrix_order, lapack_int m, lapack_int n,
+                            const lapack_complex_double* a, lapack_int lda,
+                            double* r, double* c, double* rowcnd,
+                            double* colcnd, double* amax );
+
+lapack_int LAPACKE_sgees( int matrix_order, char jobvs, char sort,
+                          LAPACK_S_SELECT2 select, lapack_int n, float* a,
+                          lapack_int lda, lapack_int* sdim, float* wr,
+                          float* wi, float* vs, lapack_int ldvs );
+lapack_int LAPACKE_dgees( int matrix_order, char jobvs, char sort,
+                          LAPACK_D_SELECT2 select, lapack_int n, double* a,
+                          lapack_int lda, lapack_int* sdim, double* wr,
+                          double* wi, double* vs, lapack_int ldvs );
+lapack_int LAPACKE_cgees( int matrix_order, char jobvs, char sort,
+                          LAPACK_C_SELECT1 select, lapack_int n,
+                          lapack_complex_float* a, lapack_int lda,
+                          lapack_int* sdim, lapack_complex_float* w,
+                          lapack_complex_float* vs, lapack_int ldvs );
+lapack_int LAPACKE_zgees( int matrix_order, char jobvs, char sort,
+                          LAPACK_Z_SELECT1 select, lapack_int n,
+                          lapack_complex_double* a, lapack_int lda,
+                          lapack_int* sdim, lapack_complex_double* w,
+                          lapack_complex_double* vs, lapack_int ldvs );
+
+lapack_int LAPACKE_sgeesx( int matrix_order, char jobvs, char sort,
+                           LAPACK_S_SELECT2 select, char sense, lapack_int n,
+                           float* a, lapack_int lda, lapack_int* sdim,
+                           float* wr, float* wi, float* vs, lapack_int ldvs,
+                           float* rconde, float* rcondv );
+lapack_int LAPACKE_dgeesx( int matrix_order, char jobvs, char sort,
+                           LAPACK_D_SELECT2 select, char sense, lapack_int n,
+                           double* a, lapack_int lda, lapack_int* sdim,
+                           double* wr, double* wi, double* vs, lapack_int ldvs,
+                           double* rconde, double* rcondv );
+lapack_int LAPACKE_cgeesx( int matrix_order, char jobvs, char sort,
+                           LAPACK_C_SELECT1 select, char sense, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_int* sdim, lapack_complex_float* w,
+                           lapack_complex_float* vs, lapack_int ldvs,
+                           float* rconde, float* rcondv );
+lapack_int LAPACKE_zgeesx( int matrix_order, char jobvs, char sort,
+                           LAPACK_Z_SELECT1 select, char sense, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_int* sdim, lapack_complex_double* w,
+                           lapack_complex_double* vs, lapack_int ldvs,
+                           double* rconde, double* rcondv );
+
+lapack_int LAPACKE_sgeev( int matrix_order, char jobvl, char jobvr,
+                          lapack_int n, float* a, lapack_int lda, float* wr,
+                          float* wi, float* vl, lapack_int ldvl, float* vr,
+                          lapack_int ldvr );
+lapack_int LAPACKE_dgeev( int matrix_order, char jobvl, char jobvr,
+                          lapack_int n, double* a, lapack_int lda, double* wr,
+                          double* wi, double* vl, lapack_int ldvl, double* vr,
+                          lapack_int ldvr );
+lapack_int LAPACKE_cgeev( int matrix_order, char jobvl, char jobvr,
+                          lapack_int n, lapack_complex_float* a, lapack_int lda,
+                          lapack_complex_float* w, lapack_complex_float* vl,
+                          lapack_int ldvl, lapack_complex_float* vr,
+                          lapack_int ldvr );
+lapack_int LAPACKE_zgeev( int matrix_order, char jobvl, char jobvr,
+                          lapack_int n, lapack_complex_double* a,
+                          lapack_int lda, lapack_complex_double* w,
+                          lapack_complex_double* vl, lapack_int ldvl,
+                          lapack_complex_double* vr, lapack_int ldvr );
+
+lapack_int LAPACKE_sgeevx( int matrix_order, char balanc, char jobvl,
+                           char jobvr, char sense, lapack_int n, float* a,
+                           lapack_int lda, float* wr, float* wi, float* vl,
+                           lapack_int ldvl, float* vr, lapack_int ldvr,
+                           lapack_int* ilo, lapack_int* ihi, float* scale,
+                           float* abnrm, float* rconde, float* rcondv );
+lapack_int LAPACKE_dgeevx( int matrix_order, char balanc, char jobvl,
+                           char jobvr, char sense, lapack_int n, double* a,
+                           lapack_int lda, double* wr, double* wi, double* vl,
+                           lapack_int ldvl, double* vr, lapack_int ldvr,
+                           lapack_int* ilo, lapack_int* ihi, double* scale,
+                           double* abnrm, double* rconde, double* rcondv );
+lapack_int LAPACKE_cgeevx( int matrix_order, char balanc, char jobvl,
+                           char jobvr, char sense, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* w, lapack_complex_float* vl,
+                           lapack_int ldvl, lapack_complex_float* vr,
+                           lapack_int ldvr, lapack_int* ilo, lapack_int* ihi,
+                           float* scale, float* abnrm, float* rconde,
+                           float* rcondv );
+lapack_int LAPACKE_zgeevx( int matrix_order, char balanc, char jobvl,
+                           char jobvr, char sense, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* w, lapack_complex_double* vl,
+                           lapack_int ldvl, lapack_complex_double* vr,
+                           lapack_int ldvr, lapack_int* ilo, lapack_int* ihi,
+                           double* scale, double* abnrm, double* rconde,
+                           double* rcondv );
+
+lapack_int LAPACKE_sgehrd( int matrix_order, lapack_int n, lapack_int ilo,
+                           lapack_int ihi, float* a, lapack_int lda,
+                           float* tau );
+lapack_int LAPACKE_dgehrd( int matrix_order, lapack_int n, lapack_int ilo,
+                           lapack_int ihi, double* a, lapack_int lda,
+                           double* tau );
+lapack_int LAPACKE_cgehrd( int matrix_order, lapack_int n, lapack_int ilo,
+                           lapack_int ihi, lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* tau );
+lapack_int LAPACKE_zgehrd( int matrix_order, lapack_int n, lapack_int ilo,
+                           lapack_int ihi, lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* tau );
+
+lapack_int LAPACKE_sgejsv( int matrix_order, char joba, char jobu, char jobv,
+                           char jobr, char jobt, char jobp, lapack_int m,
+                           lapack_int n, float* a, lapack_int lda, float* sva,
+                           float* u, lapack_int ldu, float* v, lapack_int ldv,
+                           float* stat, lapack_int* istat );
+lapack_int LAPACKE_dgejsv( int matrix_order, char joba, char jobu, char jobv,
+                           char jobr, char jobt, char jobp, lapack_int m,
+                           lapack_int n, double* a, lapack_int lda, double* sva,
+                           double* u, lapack_int ldu, double* v, lapack_int ldv,
+                           double* stat, lapack_int* istat );
+
+lapack_int LAPACKE_sgelq2( int matrix_order, lapack_int m, lapack_int n,
+                           float* a, lapack_int lda, float* tau );
+lapack_int LAPACKE_dgelq2( int matrix_order, lapack_int m, lapack_int n,
+                           double* a, lapack_int lda, double* tau );
+lapack_int LAPACKE_cgelq2( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* tau );
+lapack_int LAPACKE_zgelq2( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* tau );
+
+lapack_int LAPACKE_sgelqf( int matrix_order, lapack_int m, lapack_int n,
+                           float* a, lapack_int lda, float* tau );
+lapack_int LAPACKE_dgelqf( int matrix_order, lapack_int m, lapack_int n,
+                           double* a, lapack_int lda, double* tau );
+lapack_int LAPACKE_cgelqf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* tau );
+lapack_int LAPACKE_zgelqf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* tau );
+
+lapack_int LAPACKE_sgels( int matrix_order, char trans, lapack_int m,
+                          lapack_int n, lapack_int nrhs, float* a,
+                          lapack_int lda, float* b, lapack_int ldb );
+lapack_int LAPACKE_dgels( int matrix_order, char trans, lapack_int m,
+                          lapack_int n, lapack_int nrhs, double* a,
+                          lapack_int lda, double* b, lapack_int ldb );
+lapack_int LAPACKE_cgels( int matrix_order, char trans, lapack_int m,
+                          lapack_int n, lapack_int nrhs,
+                          lapack_complex_float* a, lapack_int lda,
+                          lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zgels( int matrix_order, char trans, lapack_int m,
+                          lapack_int n, lapack_int nrhs,
+                          lapack_complex_double* a, lapack_int lda,
+                          lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_sgelsd( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int nrhs, float* a, lapack_int lda, float* b,
+                           lapack_int ldb, float* s, float rcond,
+                           lapack_int* rank );
+lapack_int LAPACKE_dgelsd( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int nrhs, double* a, lapack_int lda,
+                           double* b, lapack_int ldb, double* s, double rcond,
+                           lapack_int* rank );
+lapack_int LAPACKE_cgelsd( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int nrhs, lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* b,
+                           lapack_int ldb, float* s, float rcond,
+                           lapack_int* rank );
+lapack_int LAPACKE_zgelsd( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int nrhs, lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* b,
+                           lapack_int ldb, double* s, double rcond,
+                           lapack_int* rank );
+
+lapack_int LAPACKE_sgelss( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int nrhs, float* a, lapack_int lda, float* b,
+                           lapack_int ldb, float* s, float rcond,
+                           lapack_int* rank );
+lapack_int LAPACKE_dgelss( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int nrhs, double* a, lapack_int lda,
+                           double* b, lapack_int ldb, double* s, double rcond,
+                           lapack_int* rank );
+lapack_int LAPACKE_cgelss( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int nrhs, lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* b,
+                           lapack_int ldb, float* s, float rcond,
+                           lapack_int* rank );
+lapack_int LAPACKE_zgelss( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int nrhs, lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* b,
+                           lapack_int ldb, double* s, double rcond,
+                           lapack_int* rank );
+
+lapack_int LAPACKE_sgelsy( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int nrhs, float* a, lapack_int lda, float* b,
+                           lapack_int ldb, lapack_int* jpvt, float rcond,
+                           lapack_int* rank );
+lapack_int LAPACKE_dgelsy( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int nrhs, double* a, lapack_int lda,
+                           double* b, lapack_int ldb, lapack_int* jpvt,
+                           double rcond, lapack_int* rank );
+lapack_int LAPACKE_cgelsy( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int nrhs, lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* b,
+                           lapack_int ldb, lapack_int* jpvt, float rcond,
+                           lapack_int* rank );
+lapack_int LAPACKE_zgelsy( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int nrhs, lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* b,
+                           lapack_int ldb, lapack_int* jpvt, double rcond,
+                           lapack_int* rank );
+
+lapack_int LAPACKE_sgeqlf( int matrix_order, lapack_int m, lapack_int n,
+                           float* a, lapack_int lda, float* tau );
+lapack_int LAPACKE_dgeqlf( int matrix_order, lapack_int m, lapack_int n,
+                           double* a, lapack_int lda, double* tau );
+lapack_int LAPACKE_cgeqlf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* tau );
+lapack_int LAPACKE_zgeqlf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* tau );
+
+lapack_int LAPACKE_sgeqp3( int matrix_order, lapack_int m, lapack_int n,
+                           float* a, lapack_int lda, lapack_int* jpvt,
+                           float* tau );
+lapack_int LAPACKE_dgeqp3( int matrix_order, lapack_int m, lapack_int n,
+                           double* a, lapack_int lda, lapack_int* jpvt,
+                           double* tau );
+lapack_int LAPACKE_cgeqp3( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_int* jpvt, lapack_complex_float* tau );
+lapack_int LAPACKE_zgeqp3( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_int* jpvt, lapack_complex_double* tau );
+
+lapack_int LAPACKE_sgeqpf( int matrix_order, lapack_int m, lapack_int n,
+                           float* a, lapack_int lda, lapack_int* jpvt,
+                           float* tau );
+lapack_int LAPACKE_dgeqpf( int matrix_order, lapack_int m, lapack_int n,
+                           double* a, lapack_int lda, lapack_int* jpvt,
+                           double* tau );
+lapack_int LAPACKE_cgeqpf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_int* jpvt, lapack_complex_float* tau );
+lapack_int LAPACKE_zgeqpf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_int* jpvt, lapack_complex_double* tau );
+
+lapack_int LAPACKE_sgeqr2( int matrix_order, lapack_int m, lapack_int n,
+                           float* a, lapack_int lda, float* tau );
+lapack_int LAPACKE_dgeqr2( int matrix_order, lapack_int m, lapack_int n,
+                           double* a, lapack_int lda, double* tau );
+lapack_int LAPACKE_cgeqr2( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* tau );
+lapack_int LAPACKE_zgeqr2( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* tau );
+
+lapack_int LAPACKE_sgeqrf( int matrix_order, lapack_int m, lapack_int n,
+                           float* a, lapack_int lda, float* tau );
+lapack_int LAPACKE_dgeqrf( int matrix_order, lapack_int m, lapack_int n,
+                           double* a, lapack_int lda, double* tau );
+lapack_int LAPACKE_cgeqrf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* tau );
+lapack_int LAPACKE_zgeqrf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* tau );
+
+lapack_int LAPACKE_sgeqrfp( int matrix_order, lapack_int m, lapack_int n,
+                            float* a, lapack_int lda, float* tau );
+lapack_int LAPACKE_dgeqrfp( int matrix_order, lapack_int m, lapack_int n,
+                            double* a, lapack_int lda, double* tau );
+lapack_int LAPACKE_cgeqrfp( int matrix_order, lapack_int m, lapack_int n,
+                            lapack_complex_float* a, lapack_int lda,
+                            lapack_complex_float* tau );
+lapack_int LAPACKE_zgeqrfp( int matrix_order, lapack_int m, lapack_int n,
+                            lapack_complex_double* a, lapack_int lda,
+                            lapack_complex_double* tau );
+
+lapack_int LAPACKE_sgerfs( int matrix_order, char trans, lapack_int n,
+                           lapack_int nrhs, const float* a, lapack_int lda,
+                           const float* af, lapack_int ldaf,
+                           const lapack_int* ipiv, const float* b,
+                           lapack_int ldb, float* x, lapack_int ldx,
+                           float* ferr, float* berr );
+lapack_int LAPACKE_dgerfs( int matrix_order, char trans, lapack_int n,
+                           lapack_int nrhs, const double* a, lapack_int lda,
+                           const double* af, lapack_int ldaf,
+                           const lapack_int* ipiv, const double* b,
+                           lapack_int ldb, double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+lapack_int LAPACKE_cgerfs( int matrix_order, char trans, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* a,
+                           lapack_int lda, const lapack_complex_float* af,
+                           lapack_int ldaf, const lapack_int* ipiv,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx, float* ferr,
+                           float* berr );
+lapack_int LAPACKE_zgerfs( int matrix_order, char trans, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* a,
+                           lapack_int lda, const lapack_complex_double* af,
+                           lapack_int ldaf, const lapack_int* ipiv,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+
+lapack_int LAPACKE_sgerfsx( int matrix_order, char trans, char equed,
+                            lapack_int n, lapack_int nrhs, const float* a,
+                            lapack_int lda, const float* af, lapack_int ldaf,
+                            const lapack_int* ipiv, const float* r,
+                            const float* c, const float* b, lapack_int ldb,
+                            float* x, lapack_int ldx, float* rcond, float* berr,
+                            lapack_int n_err_bnds, float* err_bnds_norm,
+                            float* err_bnds_comp, lapack_int nparams,
+                            float* params );
+lapack_int LAPACKE_dgerfsx( int matrix_order, char trans, char equed,
+                            lapack_int n, lapack_int nrhs, const double* a,
+                            lapack_int lda, const double* af, lapack_int ldaf,
+                            const lapack_int* ipiv, const double* r,
+                            const double* c, const double* b, lapack_int ldb,
+                            double* x, lapack_int ldx, double* rcond,
+                            double* berr, lapack_int n_err_bnds,
+                            double* err_bnds_norm, double* err_bnds_comp,
+                            lapack_int nparams, double* params );
+lapack_int LAPACKE_cgerfsx( int matrix_order, char trans, char equed,
+                            lapack_int n, lapack_int nrhs,
+                            const lapack_complex_float* a, lapack_int lda,
+                            const lapack_complex_float* af, lapack_int ldaf,
+                            const lapack_int* ipiv, const float* r,
+                            const float* c, const lapack_complex_float* b,
+                            lapack_int ldb, lapack_complex_float* x,
+                            lapack_int ldx, float* rcond, float* berr,
+                            lapack_int n_err_bnds, float* err_bnds_norm,
+                            float* err_bnds_comp, lapack_int nparams,
+                            float* params );
+lapack_int LAPACKE_zgerfsx( int matrix_order, char trans, char equed,
+                            lapack_int n, lapack_int nrhs,
+                            const lapack_complex_double* a, lapack_int lda,
+                            const lapack_complex_double* af, lapack_int ldaf,
+                            const lapack_int* ipiv, const double* r,
+                            const double* c, const lapack_complex_double* b,
+                            lapack_int ldb, lapack_complex_double* x,
+                            lapack_int ldx, double* rcond, double* berr,
+                            lapack_int n_err_bnds, double* err_bnds_norm,
+                            double* err_bnds_comp, lapack_int nparams,
+                            double* params );
+
+lapack_int LAPACKE_sgerqf( int matrix_order, lapack_int m, lapack_int n,
+                           float* a, lapack_int lda, float* tau );
+lapack_int LAPACKE_dgerqf( int matrix_order, lapack_int m, lapack_int n,
+                           double* a, lapack_int lda, double* tau );
+lapack_int LAPACKE_cgerqf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* tau );
+lapack_int LAPACKE_zgerqf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* tau );
+
+lapack_int LAPACKE_sgesdd( int matrix_order, char jobz, lapack_int m,
+                           lapack_int n, float* a, lapack_int lda, float* s,
+                           float* u, lapack_int ldu, float* vt,
+                           lapack_int ldvt );
+lapack_int LAPACKE_dgesdd( int matrix_order, char jobz, lapack_int m,
+                           lapack_int n, double* a, lapack_int lda, double* s,
+                           double* u, lapack_int ldu, double* vt,
+                           lapack_int ldvt );
+lapack_int LAPACKE_cgesdd( int matrix_order, char jobz, lapack_int m,
+                           lapack_int n, lapack_complex_float* a,
+                           lapack_int lda, float* s, lapack_complex_float* u,
+                           lapack_int ldu, lapack_complex_float* vt,
+                           lapack_int ldvt );
+lapack_int LAPACKE_zgesdd( int matrix_order, char jobz, lapack_int m,
+                           lapack_int n, lapack_complex_double* a,
+                           lapack_int lda, double* s, lapack_complex_double* u,
+                           lapack_int ldu, lapack_complex_double* vt,
+                           lapack_int ldvt );
+
+lapack_int LAPACKE_sgesv( int matrix_order, lapack_int n, lapack_int nrhs,
+                          float* a, lapack_int lda, lapack_int* ipiv, float* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_dgesv( int matrix_order, lapack_int n, lapack_int nrhs,
+                          double* a, lapack_int lda, lapack_int* ipiv,
+                          double* b, lapack_int ldb );
+lapack_int LAPACKE_cgesv( int matrix_order, lapack_int n, lapack_int nrhs,
+                          lapack_complex_float* a, lapack_int lda,
+                          lapack_int* ipiv, lapack_complex_float* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_zgesv( int matrix_order, lapack_int n, lapack_int nrhs,
+                          lapack_complex_double* a, lapack_int lda,
+                          lapack_int* ipiv, lapack_complex_double* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_dsgesv( int matrix_order, lapack_int n, lapack_int nrhs,
+                           double* a, lapack_int lda, lapack_int* ipiv,
+                           double* b, lapack_int ldb, double* x, lapack_int ldx,
+                           lapack_int* iter );
+lapack_int LAPACKE_zcgesv( int matrix_order, lapack_int n, lapack_int nrhs,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_int* ipiv, lapack_complex_double* b,
+                           lapack_int ldb, lapack_complex_double* x,
+                           lapack_int ldx, lapack_int* iter );
+
+lapack_int LAPACKE_sgesvd( int matrix_order, char jobu, char jobvt,
+                           lapack_int m, lapack_int n, float* a, lapack_int lda,
+                           float* s, float* u, lapack_int ldu, float* vt,
+                           lapack_int ldvt, float* superb );
+lapack_int LAPACKE_dgesvd( int matrix_order, char jobu, char jobvt,
+                           lapack_int m, lapack_int n, double* a,
+                           lapack_int lda, double* s, double* u, lapack_int ldu,
+                           double* vt, lapack_int ldvt, double* superb );
+lapack_int LAPACKE_cgesvd( int matrix_order, char jobu, char jobvt,
+                           lapack_int m, lapack_int n, lapack_complex_float* a,
+                           lapack_int lda, float* s, lapack_complex_float* u,
+                           lapack_int ldu, lapack_complex_float* vt,
+                           lapack_int ldvt, float* superb );
+lapack_int LAPACKE_zgesvd( int matrix_order, char jobu, char jobvt,
+                           lapack_int m, lapack_int n, lapack_complex_double* a,
+                           lapack_int lda, double* s, lapack_complex_double* u,
+                           lapack_int ldu, lapack_complex_double* vt,
+                           lapack_int ldvt, double* superb );
+
+lapack_int LAPACKE_sgesvj( int matrix_order, char joba, char jobu, char jobv,
+                           lapack_int m, lapack_int n, float* a, lapack_int lda,
+                           float* sva, lapack_int mv, float* v, lapack_int ldv,
+                           float* stat );
+lapack_int LAPACKE_dgesvj( int matrix_order, char joba, char jobu, char jobv,
+                           lapack_int m, lapack_int n, double* a,
+                           lapack_int lda, double* sva, lapack_int mv,
+                           double* v, lapack_int ldv, double* stat );
+
+lapack_int LAPACKE_sgesvx( int matrix_order, char fact, char trans,
+                           lapack_int n, lapack_int nrhs, float* a,
+                           lapack_int lda, float* af, lapack_int ldaf,
+                           lapack_int* ipiv, char* equed, float* r, float* c,
+                           float* b, lapack_int ldb, float* x, lapack_int ldx,
+                           float* rcond, float* ferr, float* berr,
+                           float* rpivot );
+lapack_int LAPACKE_dgesvx( int matrix_order, char fact, char trans,
+                           lapack_int n, lapack_int nrhs, double* a,
+                           lapack_int lda, double* af, lapack_int ldaf,
+                           lapack_int* ipiv, char* equed, double* r, double* c,
+                           double* b, lapack_int ldb, double* x, lapack_int ldx,
+                           double* rcond, double* ferr, double* berr,
+                           double* rpivot );
+lapack_int LAPACKE_cgesvx( int matrix_order, char fact, char trans,
+                           lapack_int n, lapack_int nrhs,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* af, lapack_int ldaf,
+                           lapack_int* ipiv, char* equed, float* r, float* c,
+                           lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx,
+                           float* rcond, float* ferr, float* berr,
+                           float* rpivot );
+lapack_int LAPACKE_zgesvx( int matrix_order, char fact, char trans,
+                           lapack_int n, lapack_int nrhs,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* af, lapack_int ldaf,
+                           lapack_int* ipiv, char* equed, double* r, double* c,
+                           lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* rcond, double* ferr, double* berr,
+                           double* rpivot );
+
+lapack_int LAPACKE_sgesvxx( int matrix_order, char fact, char trans,
+                            lapack_int n, lapack_int nrhs, float* a,
+                            lapack_int lda, float* af, lapack_int ldaf,
+                            lapack_int* ipiv, char* equed, float* r, float* c,
+                            float* b, lapack_int ldb, float* x, lapack_int ldx,
+                            float* rcond, float* rpvgrw, float* berr,
+                            lapack_int n_err_bnds, float* err_bnds_norm,
+                            float* err_bnds_comp, lapack_int nparams,
+                            float* params );
+lapack_int LAPACKE_dgesvxx( int matrix_order, char fact, char trans,
+                            lapack_int n, lapack_int nrhs, double* a,
+                            lapack_int lda, double* af, lapack_int ldaf,
+                            lapack_int* ipiv, char* equed, double* r, double* c,
+                            double* b, lapack_int ldb, double* x,
+                            lapack_int ldx, double* rcond, double* rpvgrw,
+                            double* berr, lapack_int n_err_bnds,
+                            double* err_bnds_norm, double* err_bnds_comp,
+                            lapack_int nparams, double* params );
+lapack_int LAPACKE_cgesvxx( int matrix_order, char fact, char trans,
+                            lapack_int n, lapack_int nrhs,
+                            lapack_complex_float* a, lapack_int lda,
+                            lapack_complex_float* af, lapack_int ldaf,
+                            lapack_int* ipiv, char* equed, float* r, float* c,
+                            lapack_complex_float* b, lapack_int ldb,
+                            lapack_complex_float* x, lapack_int ldx,
+                            float* rcond, float* rpvgrw, float* berr,
+                            lapack_int n_err_bnds, float* err_bnds_norm,
+                            float* err_bnds_comp, lapack_int nparams,
+                            float* params );
+lapack_int LAPACKE_zgesvxx( int matrix_order, char fact, char trans,
+                            lapack_int n, lapack_int nrhs,
+                            lapack_complex_double* a, lapack_int lda,
+                            lapack_complex_double* af, lapack_int ldaf,
+                            lapack_int* ipiv, char* equed, double* r, double* c,
+                            lapack_complex_double* b, lapack_int ldb,
+                            lapack_complex_double* x, lapack_int ldx,
+                            double* rcond, double* rpvgrw, double* berr,
+                            lapack_int n_err_bnds, double* err_bnds_norm,
+                            double* err_bnds_comp, lapack_int nparams,
+                            double* params );
+
+lapack_int LAPACKE_sgetf2( int matrix_order, lapack_int m, lapack_int n,
+                           float* a, lapack_int lda, lapack_int* ipiv );
+lapack_int LAPACKE_dgetf2( int matrix_order, lapack_int m, lapack_int n,
+                           double* a, lapack_int lda, lapack_int* ipiv );
+lapack_int LAPACKE_cgetf2( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_int* ipiv );
+lapack_int LAPACKE_zgetf2( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_int* ipiv );
+
+lapack_int LAPACKE_sgetrf( int matrix_order, lapack_int m, lapack_int n,
+                           float* a, lapack_int lda, lapack_int* ipiv );
+lapack_int LAPACKE_dgetrf( int matrix_order, lapack_int m, lapack_int n,
+                           double* a, lapack_int lda, lapack_int* ipiv );
+lapack_int LAPACKE_cgetrf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_int* ipiv );
+lapack_int LAPACKE_zgetrf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_int* ipiv );
+
+lapack_int LAPACKE_sgetri( int matrix_order, lapack_int n, float* a,
+                           lapack_int lda, const lapack_int* ipiv );
+lapack_int LAPACKE_dgetri( int matrix_order, lapack_int n, double* a,
+                           lapack_int lda, const lapack_int* ipiv );
+lapack_int LAPACKE_cgetri( int matrix_order, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           const lapack_int* ipiv );
+lapack_int LAPACKE_zgetri( int matrix_order, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           const lapack_int* ipiv );
+
+lapack_int LAPACKE_sgetrs( int matrix_order, char trans, lapack_int n,
+                           lapack_int nrhs, const float* a, lapack_int lda,
+                           const lapack_int* ipiv, float* b, lapack_int ldb );
+lapack_int LAPACKE_dgetrs( int matrix_order, char trans, lapack_int n,
+                           lapack_int nrhs, const double* a, lapack_int lda,
+                           const lapack_int* ipiv, double* b, lapack_int ldb );
+lapack_int LAPACKE_cgetrs( int matrix_order, char trans, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* a,
+                           lapack_int lda, const lapack_int* ipiv,
+                           lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zgetrs( int matrix_order, char trans, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* a,
+                           lapack_int lda, const lapack_int* ipiv,
+                           lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_sggbak( int matrix_order, char job, char side, lapack_int n,
+                           lapack_int ilo, lapack_int ihi, const float* lscale,
+                           const float* rscale, lapack_int m, float* v,
+                           lapack_int ldv );
+lapack_int LAPACKE_dggbak( int matrix_order, char job, char side, lapack_int n,
+                           lapack_int ilo, lapack_int ihi, const double* lscale,
+                           const double* rscale, lapack_int m, double* v,
+                           lapack_int ldv );
+lapack_int LAPACKE_cggbak( int matrix_order, char job, char side, lapack_int n,
+                           lapack_int ilo, lapack_int ihi, const float* lscale,
+                           const float* rscale, lapack_int m,
+                           lapack_complex_float* v, lapack_int ldv );
+lapack_int LAPACKE_zggbak( int matrix_order, char job, char side, lapack_int n,
+                           lapack_int ilo, lapack_int ihi, const double* lscale,
+                           const double* rscale, lapack_int m,
+                           lapack_complex_double* v, lapack_int ldv );
+
+lapack_int LAPACKE_sggbal( int matrix_order, char job, lapack_int n, float* a,
+                           lapack_int lda, float* b, lapack_int ldb,
+                           lapack_int* ilo, lapack_int* ihi, float* lscale,
+                           float* rscale );
+lapack_int LAPACKE_dggbal( int matrix_order, char job, lapack_int n, double* a,
+                           lapack_int lda, double* b, lapack_int ldb,
+                           lapack_int* ilo, lapack_int* ihi, double* lscale,
+                           double* rscale );
+lapack_int LAPACKE_cggbal( int matrix_order, char job, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* b, lapack_int ldb,
+                           lapack_int* ilo, lapack_int* ihi, float* lscale,
+                           float* rscale );
+lapack_int LAPACKE_zggbal( int matrix_order, char job, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* b, lapack_int ldb,
+                           lapack_int* ilo, lapack_int* ihi, double* lscale,
+                           double* rscale );
+
+lapack_int LAPACKE_sgges( int matrix_order, char jobvsl, char jobvsr, char sort,
+                          LAPACK_S_SELECT3 selctg, lapack_int n, float* a,
+                          lapack_int lda, float* b, lapack_int ldb,
+                          lapack_int* sdim, float* alphar, float* alphai,
+                          float* beta, float* vsl, lapack_int ldvsl, float* vsr,
+                          lapack_int ldvsr );
+lapack_int LAPACKE_dgges( int matrix_order, char jobvsl, char jobvsr, char sort,
+                          LAPACK_D_SELECT3 selctg, lapack_int n, double* a,
+                          lapack_int lda, double* b, lapack_int ldb,
+                          lapack_int* sdim, double* alphar, double* alphai,
+                          double* beta, double* vsl, lapack_int ldvsl,
+                          double* vsr, lapack_int ldvsr );
+lapack_int LAPACKE_cgges( int matrix_order, char jobvsl, char jobvsr, char sort,
+                          LAPACK_C_SELECT2 selctg, lapack_int n,
+                          lapack_complex_float* a, lapack_int lda,
+                          lapack_complex_float* b, lapack_int ldb,
+                          lapack_int* sdim, lapack_complex_float* alpha,
+                          lapack_complex_float* beta, lapack_complex_float* vsl,
+                          lapack_int ldvsl, lapack_complex_float* vsr,
+                          lapack_int ldvsr );
+lapack_int LAPACKE_zgges( int matrix_order, char jobvsl, char jobvsr, char sort,
+                          LAPACK_Z_SELECT2 selctg, lapack_int n,
+                          lapack_complex_double* a, lapack_int lda,
+                          lapack_complex_double* b, lapack_int ldb,
+                          lapack_int* sdim, lapack_complex_double* alpha,
+                          lapack_complex_double* beta,
+                          lapack_complex_double* vsl, lapack_int ldvsl,
+                          lapack_complex_double* vsr, lapack_int ldvsr );
+
+lapack_int LAPACKE_sggesx( int matrix_order, char jobvsl, char jobvsr,
+                           char sort, LAPACK_S_SELECT3 selctg, char sense,
+                           lapack_int n, float* a, lapack_int lda, float* b,
+                           lapack_int ldb, lapack_int* sdim, float* alphar,
+                           float* alphai, float* beta, float* vsl,
+                           lapack_int ldvsl, float* vsr, lapack_int ldvsr,
+                           float* rconde, float* rcondv );
+lapack_int LAPACKE_dggesx( int matrix_order, char jobvsl, char jobvsr,
+                           char sort, LAPACK_D_SELECT3 selctg, char sense,
+                           lapack_int n, double* a, lapack_int lda, double* b,
+                           lapack_int ldb, lapack_int* sdim, double* alphar,
+                           double* alphai, double* beta, double* vsl,
+                           lapack_int ldvsl, double* vsr, lapack_int ldvsr,
+                           double* rconde, double* rcondv );
+lapack_int LAPACKE_cggesx( int matrix_order, char jobvsl, char jobvsr,
+                           char sort, LAPACK_C_SELECT2 selctg, char sense,
+                           lapack_int n, lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* b,
+                           lapack_int ldb, lapack_int* sdim,
+                           lapack_complex_float* alpha,
+                           lapack_complex_float* beta,
+                           lapack_complex_float* vsl, lapack_int ldvsl,
+                           lapack_complex_float* vsr, lapack_int ldvsr,
+                           float* rconde, float* rcondv );
+lapack_int LAPACKE_zggesx( int matrix_order, char jobvsl, char jobvsr,
+                           char sort, LAPACK_Z_SELECT2 selctg, char sense,
+                           lapack_int n, lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* b,
+                           lapack_int ldb, lapack_int* sdim,
+                           lapack_complex_double* alpha,
+                           lapack_complex_double* beta,
+                           lapack_complex_double* vsl, lapack_int ldvsl,
+                           lapack_complex_double* vsr, lapack_int ldvsr,
+                           double* rconde, double* rcondv );
+
+lapack_int LAPACKE_sggev( int matrix_order, char jobvl, char jobvr,
+                          lapack_int n, float* a, lapack_int lda, float* b,
+                          lapack_int ldb, float* alphar, float* alphai,
+                          float* beta, float* vl, lapack_int ldvl, float* vr,
+                          lapack_int ldvr );
+lapack_int LAPACKE_dggev( int matrix_order, char jobvl, char jobvr,
+                          lapack_int n, double* a, lapack_int lda, double* b,
+                          lapack_int ldb, double* alphar, double* alphai,
+                          double* beta, double* vl, lapack_int ldvl, double* vr,
+                          lapack_int ldvr );
+lapack_int LAPACKE_cggev( int matrix_order, char jobvl, char jobvr,
+                          lapack_int n, lapack_complex_float* a, lapack_int lda,
+                          lapack_complex_float* b, lapack_int ldb,
+                          lapack_complex_float* alpha,
+                          lapack_complex_float* beta, lapack_complex_float* vl,
+                          lapack_int ldvl, lapack_complex_float* vr,
+                          lapack_int ldvr );
+lapack_int LAPACKE_zggev( int matrix_order, char jobvl, char jobvr,
+                          lapack_int n, lapack_complex_double* a,
+                          lapack_int lda, lapack_complex_double* b,
+                          lapack_int ldb, lapack_complex_double* alpha,
+                          lapack_complex_double* beta,
+                          lapack_complex_double* vl, lapack_int ldvl,
+                          lapack_complex_double* vr, lapack_int ldvr );
+
+lapack_int LAPACKE_sggevx( int matrix_order, char balanc, char jobvl,
+                           char jobvr, char sense, lapack_int n, float* a,
+                           lapack_int lda, float* b, lapack_int ldb,
+                           float* alphar, float* alphai, float* beta, float* vl,
+                           lapack_int ldvl, float* vr, lapack_int ldvr,
+                           lapack_int* ilo, lapack_int* ihi, float* lscale,
+                           float* rscale, float* abnrm, float* bbnrm,
+                           float* rconde, float* rcondv );
+lapack_int LAPACKE_dggevx( int matrix_order, char balanc, char jobvl,
+                           char jobvr, char sense, lapack_int n, double* a,
+                           lapack_int lda, double* b, lapack_int ldb,
+                           double* alphar, double* alphai, double* beta,
+                           double* vl, lapack_int ldvl, double* vr,
+                           lapack_int ldvr, lapack_int* ilo, lapack_int* ihi,
+                           double* lscale, double* rscale, double* abnrm,
+                           double* bbnrm, double* rconde, double* rcondv );
+lapack_int LAPACKE_cggevx( int matrix_order, char balanc, char jobvl,
+                           char jobvr, char sense, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* alpha,
+                           lapack_complex_float* beta, lapack_complex_float* vl,
+                           lapack_int ldvl, lapack_complex_float* vr,
+                           lapack_int ldvr, lapack_int* ilo, lapack_int* ihi,
+                           float* lscale, float* rscale, float* abnrm,
+                           float* bbnrm, float* rconde, float* rcondv );
+lapack_int LAPACKE_zggevx( int matrix_order, char balanc, char jobvl,
+                           char jobvr, char sense, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* alpha,
+                           lapack_complex_double* beta,
+                           lapack_complex_double* vl, lapack_int ldvl,
+                           lapack_complex_double* vr, lapack_int ldvr,
+                           lapack_int* ilo, lapack_int* ihi, double* lscale,
+                           double* rscale, double* abnrm, double* bbnrm,
+                           double* rconde, double* rcondv );
+
+lapack_int LAPACKE_sggglm( int matrix_order, lapack_int n, lapack_int m,
+                           lapack_int p, float* a, lapack_int lda, float* b,
+                           lapack_int ldb, float* d, float* x, float* y );
+lapack_int LAPACKE_dggglm( int matrix_order, lapack_int n, lapack_int m,
+                           lapack_int p, double* a, lapack_int lda, double* b,
+                           lapack_int ldb, double* d, double* x, double* y );
+lapack_int LAPACKE_cggglm( int matrix_order, lapack_int n, lapack_int m,
+                           lapack_int p, lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* b,
+                           lapack_int ldb, lapack_complex_float* d,
+                           lapack_complex_float* x, lapack_complex_float* y );
+lapack_int LAPACKE_zggglm( int matrix_order, lapack_int n, lapack_int m,
+                           lapack_int p, lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* b,
+                           lapack_int ldb, lapack_complex_double* d,
+                           lapack_complex_double* x, lapack_complex_double* y );
+
+lapack_int LAPACKE_sgghrd( int matrix_order, char compq, char compz,
+                           lapack_int n, lapack_int ilo, lapack_int ihi,
+                           float* a, lapack_int lda, float* b, lapack_int ldb,
+                           float* q, lapack_int ldq, float* z, lapack_int ldz );
+lapack_int LAPACKE_dgghrd( int matrix_order, char compq, char compz,
+                           lapack_int n, lapack_int ilo, lapack_int ihi,
+                           double* a, lapack_int lda, double* b, lapack_int ldb,
+                           double* q, lapack_int ldq, double* z,
+                           lapack_int ldz );
+lapack_int LAPACKE_cgghrd( int matrix_order, char compq, char compz,
+                           lapack_int n, lapack_int ilo, lapack_int ihi,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* q, lapack_int ldq,
+                           lapack_complex_float* z, lapack_int ldz );
+lapack_int LAPACKE_zgghrd( int matrix_order, char compq, char compz,
+                           lapack_int n, lapack_int ilo, lapack_int ihi,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* q, lapack_int ldq,
+                           lapack_complex_double* z, lapack_int ldz );
+
+lapack_int LAPACKE_sgglse( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int p, float* a, lapack_int lda, float* b,
+                           lapack_int ldb, float* c, float* d, float* x );
+lapack_int LAPACKE_dgglse( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int p, double* a, lapack_int lda, double* b,
+                           lapack_int ldb, double* c, double* d, double* x );
+lapack_int LAPACKE_cgglse( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int p, lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* b,
+                           lapack_int ldb, lapack_complex_float* c,
+                           lapack_complex_float* d, lapack_complex_float* x );
+lapack_int LAPACKE_zgglse( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int p, lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* b,
+                           lapack_int ldb, lapack_complex_double* c,
+                           lapack_complex_double* d, lapack_complex_double* x );
+
+lapack_int LAPACKE_sggqrf( int matrix_order, lapack_int n, lapack_int m,
+                           lapack_int p, float* a, lapack_int lda, float* taua,
+                           float* b, lapack_int ldb, float* taub );
+lapack_int LAPACKE_dggqrf( int matrix_order, lapack_int n, lapack_int m,
+                           lapack_int p, double* a, lapack_int lda,
+                           double* taua, double* b, lapack_int ldb,
+                           double* taub );
+lapack_int LAPACKE_cggqrf( int matrix_order, lapack_int n, lapack_int m,
+                           lapack_int p, lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* taua,
+                           lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* taub );
+lapack_int LAPACKE_zggqrf( int matrix_order, lapack_int n, lapack_int m,
+                           lapack_int p, lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* taua,
+                           lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* taub );
+
+lapack_int LAPACKE_sggrqf( int matrix_order, lapack_int m, lapack_int p,
+                           lapack_int n, float* a, lapack_int lda, float* taua,
+                           float* b, lapack_int ldb, float* taub );
+lapack_int LAPACKE_dggrqf( int matrix_order, lapack_int m, lapack_int p,
+                           lapack_int n, double* a, lapack_int lda,
+                           double* taua, double* b, lapack_int ldb,
+                           double* taub );
+lapack_int LAPACKE_cggrqf( int matrix_order, lapack_int m, lapack_int p,
+                           lapack_int n, lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* taua,
+                           lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* taub );
+lapack_int LAPACKE_zggrqf( int matrix_order, lapack_int m, lapack_int p,
+                           lapack_int n, lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* taua,
+                           lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* taub );
+
+lapack_int LAPACKE_sggsvd( int matrix_order, char jobu, char jobv, char jobq,
+                           lapack_int m, lapack_int n, lapack_int p,
+                           lapack_int* k, lapack_int* l, float* a,
+                           lapack_int lda, float* b, lapack_int ldb,
+                           float* alpha, float* beta, float* u, lapack_int ldu,
+                           float* v, lapack_int ldv, float* q, lapack_int ldq,
+                           lapack_int* iwork );
+lapack_int LAPACKE_dggsvd( int matrix_order, char jobu, char jobv, char jobq,
+                           lapack_int m, lapack_int n, lapack_int p,
+                           lapack_int* k, lapack_int* l, double* a,
+                           lapack_int lda, double* b, lapack_int ldb,
+                           double* alpha, double* beta, double* u,
+                           lapack_int ldu, double* v, lapack_int ldv, double* q,
+                           lapack_int ldq, lapack_int* iwork );
+lapack_int LAPACKE_cggsvd( int matrix_order, char jobu, char jobv, char jobq,
+                           lapack_int m, lapack_int n, lapack_int p,
+                           lapack_int* k, lapack_int* l,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* b, lapack_int ldb,
+                           float* alpha, float* beta, lapack_complex_float* u,
+                           lapack_int ldu, lapack_complex_float* v,
+                           lapack_int ldv, lapack_complex_float* q,
+                           lapack_int ldq, lapack_int* iwork );
+lapack_int LAPACKE_zggsvd( int matrix_order, char jobu, char jobv, char jobq,
+                           lapack_int m, lapack_int n, lapack_int p,
+                           lapack_int* k, lapack_int* l,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* b, lapack_int ldb,
+                           double* alpha, double* beta,
+                           lapack_complex_double* u, lapack_int ldu,
+                           lapack_complex_double* v, lapack_int ldv,
+                           lapack_complex_double* q, lapack_int ldq,
+                           lapack_int* iwork );
+
+lapack_int LAPACKE_sggsvp( int matrix_order, char jobu, char jobv, char jobq,
+                           lapack_int m, lapack_int p, lapack_int n, float* a,
+                           lapack_int lda, float* b, lapack_int ldb, float tola,
+                           float tolb, lapack_int* k, lapack_int* l, float* u,
+                           lapack_int ldu, float* v, lapack_int ldv, float* q,
+                           lapack_int ldq );
+lapack_int LAPACKE_dggsvp( int matrix_order, char jobu, char jobv, char jobq,
+                           lapack_int m, lapack_int p, lapack_int n, double* a,
+                           lapack_int lda, double* b, lapack_int ldb,
+                           double tola, double tolb, lapack_int* k,
+                           lapack_int* l, double* u, lapack_int ldu, double* v,
+                           lapack_int ldv, double* q, lapack_int ldq );
+lapack_int LAPACKE_cggsvp( int matrix_order, char jobu, char jobv, char jobq,
+                           lapack_int m, lapack_int p, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* b, lapack_int ldb, float tola,
+                           float tolb, lapack_int* k, lapack_int* l,
+                           lapack_complex_float* u, lapack_int ldu,
+                           lapack_complex_float* v, lapack_int ldv,
+                           lapack_complex_float* q, lapack_int ldq );
+lapack_int LAPACKE_zggsvp( int matrix_order, char jobu, char jobv, char jobq,
+                           lapack_int m, lapack_int p, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* b, lapack_int ldb,
+                           double tola, double tolb, lapack_int* k,
+                           lapack_int* l, lapack_complex_double* u,
+                           lapack_int ldu, lapack_complex_double* v,
+                           lapack_int ldv, lapack_complex_double* q,
+                           lapack_int ldq );
+
+lapack_int LAPACKE_sgtcon( char norm, lapack_int n, const float* dl,
+                           const float* d, const float* du, const float* du2,
+                           const lapack_int* ipiv, float anorm, float* rcond );
+lapack_int LAPACKE_dgtcon( char norm, lapack_int n, const double* dl,
+                           const double* d, const double* du, const double* du2,
+                           const lapack_int* ipiv, double anorm,
+                           double* rcond );
+lapack_int LAPACKE_cgtcon( char norm, lapack_int n,
+                           const lapack_complex_float* dl,
+                           const lapack_complex_float* d,
+                           const lapack_complex_float* du,
+                           const lapack_complex_float* du2,
+                           const lapack_int* ipiv, float anorm, float* rcond );
+lapack_int LAPACKE_zgtcon( char norm, lapack_int n,
+                           const lapack_complex_double* dl,
+                           const lapack_complex_double* d,
+                           const lapack_complex_double* du,
+                           const lapack_complex_double* du2,
+                           const lapack_int* ipiv, double anorm,
+                           double* rcond );
+
+lapack_int LAPACKE_sgtrfs( int matrix_order, char trans, lapack_int n,
+                           lapack_int nrhs, const float* dl, const float* d,
+                           const float* du, const float* dlf, const float* df,
+                           const float* duf, const float* du2,
+                           const lapack_int* ipiv, const float* b,
+                           lapack_int ldb, float* x, lapack_int ldx,
+                           float* ferr, float* berr );
+lapack_int LAPACKE_dgtrfs( int matrix_order, char trans, lapack_int n,
+                           lapack_int nrhs, const double* dl, const double* d,
+                           const double* du, const double* dlf,
+                           const double* df, const double* duf,
+                           const double* du2, const lapack_int* ipiv,
+                           const double* b, lapack_int ldb, double* x,
+                           lapack_int ldx, double* ferr, double* berr );
+lapack_int LAPACKE_cgtrfs( int matrix_order, char trans, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* dl,
+                           const lapack_complex_float* d,
+                           const lapack_complex_float* du,
+                           const lapack_complex_float* dlf,
+                           const lapack_complex_float* df,
+                           const lapack_complex_float* duf,
+                           const lapack_complex_float* du2,
+                           const lapack_int* ipiv,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx, float* ferr,
+                           float* berr );
+lapack_int LAPACKE_zgtrfs( int matrix_order, char trans, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* dl,
+                           const lapack_complex_double* d,
+                           const lapack_complex_double* du,
+                           const lapack_complex_double* dlf,
+                           const lapack_complex_double* df,
+                           const lapack_complex_double* duf,
+                           const lapack_complex_double* du2,
+                           const lapack_int* ipiv,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+
+lapack_int LAPACKE_sgtsv( int matrix_order, lapack_int n, lapack_int nrhs,
+                          float* dl, float* d, float* du, float* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_dgtsv( int matrix_order, lapack_int n, lapack_int nrhs,
+                          double* dl, double* d, double* du, double* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_cgtsv( int matrix_order, lapack_int n, lapack_int nrhs,
+                          lapack_complex_float* dl, lapack_complex_float* d,
+                          lapack_complex_float* du, lapack_complex_float* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_zgtsv( int matrix_order, lapack_int n, lapack_int nrhs,
+                          lapack_complex_double* dl, lapack_complex_double* d,
+                          lapack_complex_double* du, lapack_complex_double* b,
+                          lapack_int ldb );
+
+lapack_int LAPACKE_sgtsvx( int matrix_order, char fact, char trans,
+                           lapack_int n, lapack_int nrhs, const float* dl,
+                           const float* d, const float* du, float* dlf,
+                           float* df, float* duf, float* du2, lapack_int* ipiv,
+                           const float* b, lapack_int ldb, float* x,
+                           lapack_int ldx, float* rcond, float* ferr,
+                           float* berr );
+lapack_int LAPACKE_dgtsvx( int matrix_order, char fact, char trans,
+                           lapack_int n, lapack_int nrhs, const double* dl,
+                           const double* d, const double* du, double* dlf,
+                           double* df, double* duf, double* du2,
+                           lapack_int* ipiv, const double* b, lapack_int ldb,
+                           double* x, lapack_int ldx, double* rcond,
+                           double* ferr, double* berr );
+lapack_int LAPACKE_cgtsvx( int matrix_order, char fact, char trans,
+                           lapack_int n, lapack_int nrhs,
+                           const lapack_complex_float* dl,
+                           const lapack_complex_float* d,
+                           const lapack_complex_float* du,
+                           lapack_complex_float* dlf, lapack_complex_float* df,
+                           lapack_complex_float* duf, lapack_complex_float* du2,
+                           lapack_int* ipiv, const lapack_complex_float* b,
+                           lapack_int ldb, lapack_complex_float* x,
+                           lapack_int ldx, float* rcond, float* ferr,
+                           float* berr );
+lapack_int LAPACKE_zgtsvx( int matrix_order, char fact, char trans,
+                           lapack_int n, lapack_int nrhs,
+                           const lapack_complex_double* dl,
+                           const lapack_complex_double* d,
+                           const lapack_complex_double* du,
+                           lapack_complex_double* dlf,
+                           lapack_complex_double* df,
+                           lapack_complex_double* duf,
+                           lapack_complex_double* du2, lapack_int* ipiv,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* rcond, double* ferr, double* berr );
+
+lapack_int LAPACKE_sgttrf( lapack_int n, float* dl, float* d, float* du,
+                           float* du2, lapack_int* ipiv );
+lapack_int LAPACKE_dgttrf( lapack_int n, double* dl, double* d, double* du,
+                           double* du2, lapack_int* ipiv );
+lapack_int LAPACKE_cgttrf( lapack_int n, lapack_complex_float* dl,
+                           lapack_complex_float* d, lapack_complex_float* du,
+                           lapack_complex_float* du2, lapack_int* ipiv );
+lapack_int LAPACKE_zgttrf( lapack_int n, lapack_complex_double* dl,
+                           lapack_complex_double* d, lapack_complex_double* du,
+                           lapack_complex_double* du2, lapack_int* ipiv );
+
+lapack_int LAPACKE_sgttrs( int matrix_order, char trans, lapack_int n,
+                           lapack_int nrhs, const float* dl, const float* d,
+                           const float* du, const float* du2,
+                           const lapack_int* ipiv, float* b, lapack_int ldb );
+lapack_int LAPACKE_dgttrs( int matrix_order, char trans, lapack_int n,
+                           lapack_int nrhs, const double* dl, const double* d,
+                           const double* du, const double* du2,
+                           const lapack_int* ipiv, double* b, lapack_int ldb );
+lapack_int LAPACKE_cgttrs( int matrix_order, char trans, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* dl,
+                           const lapack_complex_float* d,
+                           const lapack_complex_float* du,
+                           const lapack_complex_float* du2,
+                           const lapack_int* ipiv, lapack_complex_float* b,
+                           lapack_int ldb );
+lapack_int LAPACKE_zgttrs( int matrix_order, char trans, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* dl,
+                           const lapack_complex_double* d,
+                           const lapack_complex_double* du,
+                           const lapack_complex_double* du2,
+                           const lapack_int* ipiv, lapack_complex_double* b,
+                           lapack_int ldb );
+
+lapack_int LAPACKE_chbev( int matrix_order, char jobz, char uplo, lapack_int n,
+                          lapack_int kd, lapack_complex_float* ab,
+                          lapack_int ldab, float* w, lapack_complex_float* z,
+                          lapack_int ldz );
+lapack_int LAPACKE_zhbev( int matrix_order, char jobz, char uplo, lapack_int n,
+                          lapack_int kd, lapack_complex_double* ab,
+                          lapack_int ldab, double* w, lapack_complex_double* z,
+                          lapack_int ldz );
+
+lapack_int LAPACKE_chbevd( int matrix_order, char jobz, char uplo, lapack_int n,
+                           lapack_int kd, lapack_complex_float* ab,
+                           lapack_int ldab, float* w, lapack_complex_float* z,
+                           lapack_int ldz );
+lapack_int LAPACKE_zhbevd( int matrix_order, char jobz, char uplo, lapack_int n,
+                           lapack_int kd, lapack_complex_double* ab,
+                           lapack_int ldab, double* w, lapack_complex_double* z,
+                           lapack_int ldz );
+
+lapack_int LAPACKE_chbevx( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, lapack_int kd,
+                           lapack_complex_float* ab, lapack_int ldab,
+                           lapack_complex_float* q, lapack_int ldq, float vl,
+                           float vu, lapack_int il, lapack_int iu, float abstol,
+                           lapack_int* m, float* w, lapack_complex_float* z,
+                           lapack_int ldz, lapack_int* ifail );
+lapack_int LAPACKE_zhbevx( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, lapack_int kd,
+                           lapack_complex_double* ab, lapack_int ldab,
+                           lapack_complex_double* q, lapack_int ldq, double vl,
+                           double vu, lapack_int il, lapack_int iu,
+                           double abstol, lapack_int* m, double* w,
+                           lapack_complex_double* z, lapack_int ldz,
+                           lapack_int* ifail );
+
+lapack_int LAPACKE_chbgst( int matrix_order, char vect, char uplo, lapack_int n,
+                           lapack_int ka, lapack_int kb,
+                           lapack_complex_float* ab, lapack_int ldab,
+                           const lapack_complex_float* bb, lapack_int ldbb,
+                           lapack_complex_float* x, lapack_int ldx );
+lapack_int LAPACKE_zhbgst( int matrix_order, char vect, char uplo, lapack_int n,
+                           lapack_int ka, lapack_int kb,
+                           lapack_complex_double* ab, lapack_int ldab,
+                           const lapack_complex_double* bb, lapack_int ldbb,
+                           lapack_complex_double* x, lapack_int ldx );
+
+lapack_int LAPACKE_chbgv( int matrix_order, char jobz, char uplo, lapack_int n,
+                          lapack_int ka, lapack_int kb,
+                          lapack_complex_float* ab, lapack_int ldab,
+                          lapack_complex_float* bb, lapack_int ldbb, float* w,
+                          lapack_complex_float* z, lapack_int ldz );
+lapack_int LAPACKE_zhbgv( int matrix_order, char jobz, char uplo, lapack_int n,
+                          lapack_int ka, lapack_int kb,
+                          lapack_complex_double* ab, lapack_int ldab,
+                          lapack_complex_double* bb, lapack_int ldbb, double* w,
+                          lapack_complex_double* z, lapack_int ldz );
+
+lapack_int LAPACKE_chbgvd( int matrix_order, char jobz, char uplo, lapack_int n,
+                           lapack_int ka, lapack_int kb,
+                           lapack_complex_float* ab, lapack_int ldab,
+                           lapack_complex_float* bb, lapack_int ldbb, float* w,
+                           lapack_complex_float* z, lapack_int ldz );
+lapack_int LAPACKE_zhbgvd( int matrix_order, char jobz, char uplo, lapack_int n,
+                           lapack_int ka, lapack_int kb,
+                           lapack_complex_double* ab, lapack_int ldab,
+                           lapack_complex_double* bb, lapack_int ldbb,
+                           double* w, lapack_complex_double* z,
+                           lapack_int ldz );
+
+lapack_int LAPACKE_chbgvx( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, lapack_int ka, lapack_int kb,
+                           lapack_complex_float* ab, lapack_int ldab,
+                           lapack_complex_float* bb, lapack_int ldbb,
+                           lapack_complex_float* q, lapack_int ldq, float vl,
+                           float vu, lapack_int il, lapack_int iu, float abstol,
+                           lapack_int* m, float* w, lapack_complex_float* z,
+                           lapack_int ldz, lapack_int* ifail );
+lapack_int LAPACKE_zhbgvx( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, lapack_int ka, lapack_int kb,
+                           lapack_complex_double* ab, lapack_int ldab,
+                           lapack_complex_double* bb, lapack_int ldbb,
+                           lapack_complex_double* q, lapack_int ldq, double vl,
+                           double vu, lapack_int il, lapack_int iu,
+                           double abstol, lapack_int* m, double* w,
+                           lapack_complex_double* z, lapack_int ldz,
+                           lapack_int* ifail );
+
+lapack_int LAPACKE_chbtrd( int matrix_order, char vect, char uplo, lapack_int n,
+                           lapack_int kd, lapack_complex_float* ab,
+                           lapack_int ldab, float* d, float* e,
+                           lapack_complex_float* q, lapack_int ldq );
+lapack_int LAPACKE_zhbtrd( int matrix_order, char vect, char uplo, lapack_int n,
+                           lapack_int kd, lapack_complex_double* ab,
+                           lapack_int ldab, double* d, double* e,
+                           lapack_complex_double* q, lapack_int ldq );
+
+lapack_int LAPACKE_checon( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_float* a, lapack_int lda,
+                           const lapack_int* ipiv, float anorm, float* rcond );
+lapack_int LAPACKE_zhecon( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_double* a, lapack_int lda,
+                           const lapack_int* ipiv, double anorm,
+                           double* rcond );
+
+lapack_int LAPACKE_cheequb( int matrix_order, char uplo, lapack_int n,
+                            const lapack_complex_float* a, lapack_int lda,
+                            float* s, float* scond, float* amax );
+lapack_int LAPACKE_zheequb( int matrix_order, char uplo, lapack_int n,
+                            const lapack_complex_double* a, lapack_int lda,
+                            double* s, double* scond, double* amax );
+
+lapack_int LAPACKE_cheev( int matrix_order, char jobz, char uplo, lapack_int n,
+                          lapack_complex_float* a, lapack_int lda, float* w );
+lapack_int LAPACKE_zheev( int matrix_order, char jobz, char uplo, lapack_int n,
+                          lapack_complex_double* a, lapack_int lda, double* w );
+
+lapack_int LAPACKE_cheevd( int matrix_order, char jobz, char uplo, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda, float* w );
+lapack_int LAPACKE_zheevd( int matrix_order, char jobz, char uplo, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           double* w );
+
+lapack_int LAPACKE_cheevr( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, lapack_complex_float* a,
+                           lapack_int lda, float vl, float vu, lapack_int il,
+                           lapack_int iu, float abstol, lapack_int* m, float* w,
+                           lapack_complex_float* z, lapack_int ldz,
+                           lapack_int* isuppz );
+lapack_int LAPACKE_zheevr( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, lapack_complex_double* a,
+                           lapack_int lda, double vl, double vu, lapack_int il,
+                           lapack_int iu, double abstol, lapack_int* m,
+                           double* w, lapack_complex_double* z, lapack_int ldz,
+                           lapack_int* isuppz );
+
+lapack_int LAPACKE_cheevx( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, lapack_complex_float* a,
+                           lapack_int lda, float vl, float vu, lapack_int il,
+                           lapack_int iu, float abstol, lapack_int* m, float* w,
+                           lapack_complex_float* z, lapack_int ldz,
+                           lapack_int* ifail );
+lapack_int LAPACKE_zheevx( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, lapack_complex_double* a,
+                           lapack_int lda, double vl, double vu, lapack_int il,
+                           lapack_int iu, double abstol, lapack_int* m,
+                           double* w, lapack_complex_double* z, lapack_int ldz,
+                           lapack_int* ifail );
+
+lapack_int LAPACKE_chegst( int matrix_order, lapack_int itype, char uplo,
+                           lapack_int n, lapack_complex_float* a,
+                           lapack_int lda, const lapack_complex_float* b,
+                           lapack_int ldb );
+lapack_int LAPACKE_zhegst( int matrix_order, lapack_int itype, char uplo,
+                           lapack_int n, lapack_complex_double* a,
+                           lapack_int lda, const lapack_complex_double* b,
+                           lapack_int ldb );
+
+lapack_int LAPACKE_chegv( int matrix_order, lapack_int itype, char jobz,
+                          char uplo, lapack_int n, lapack_complex_float* a,
+                          lapack_int lda, lapack_complex_float* b,
+                          lapack_int ldb, float* w );
+lapack_int LAPACKE_zhegv( int matrix_order, lapack_int itype, char jobz,
+                          char uplo, lapack_int n, lapack_complex_double* a,
+                          lapack_int lda, lapack_complex_double* b,
+                          lapack_int ldb, double* w );
+
+lapack_int LAPACKE_chegvd( int matrix_order, lapack_int itype, char jobz,
+                           char uplo, lapack_int n, lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* b,
+                           lapack_int ldb, float* w );
+lapack_int LAPACKE_zhegvd( int matrix_order, lapack_int itype, char jobz,
+                           char uplo, lapack_int n, lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* b,
+                           lapack_int ldb, double* w );
+
+lapack_int LAPACKE_chegvx( int matrix_order, lapack_int itype, char jobz,
+                           char range, char uplo, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* b, lapack_int ldb, float vl,
+                           float vu, lapack_int il, lapack_int iu, float abstol,
+                           lapack_int* m, float* w, lapack_complex_float* z,
+                           lapack_int ldz, lapack_int* ifail );
+lapack_int LAPACKE_zhegvx( int matrix_order, lapack_int itype, char jobz,
+                           char range, char uplo, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* b, lapack_int ldb, double vl,
+                           double vu, lapack_int il, lapack_int iu,
+                           double abstol, lapack_int* m, double* w,
+                           lapack_complex_double* z, lapack_int ldz,
+                           lapack_int* ifail );
+
+lapack_int LAPACKE_cherfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* a,
+                           lapack_int lda, const lapack_complex_float* af,
+                           lapack_int ldaf, const lapack_int* ipiv,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx, float* ferr,
+                           float* berr );
+lapack_int LAPACKE_zherfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* a,
+                           lapack_int lda, const lapack_complex_double* af,
+                           lapack_int ldaf, const lapack_int* ipiv,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+
+lapack_int LAPACKE_cherfsx( int matrix_order, char uplo, char equed,
+                            lapack_int n, lapack_int nrhs,
+                            const lapack_complex_float* a, lapack_int lda,
+                            const lapack_complex_float* af, lapack_int ldaf,
+                            const lapack_int* ipiv, const float* s,
+                            const lapack_complex_float* b, lapack_int ldb,
+                            lapack_complex_float* x, lapack_int ldx,
+                            float* rcond, float* berr, lapack_int n_err_bnds,
+                            float* err_bnds_norm, float* err_bnds_comp,
+                            lapack_int nparams, float* params );
+lapack_int LAPACKE_zherfsx( int matrix_order, char uplo, char equed,
+                            lapack_int n, lapack_int nrhs,
+                            const lapack_complex_double* a, lapack_int lda,
+                            const lapack_complex_double* af, lapack_int ldaf,
+                            const lapack_int* ipiv, const double* s,
+                            const lapack_complex_double* b, lapack_int ldb,
+                            lapack_complex_double* x, lapack_int ldx,
+                            double* rcond, double* berr, lapack_int n_err_bnds,
+                            double* err_bnds_norm, double* err_bnds_comp,
+                            lapack_int nparams, double* params );
+
+lapack_int LAPACKE_chesv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, lapack_complex_float* a,
+                          lapack_int lda, lapack_int* ipiv,
+                          lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zhesv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, lapack_complex_double* a,
+                          lapack_int lda, lapack_int* ipiv,
+                          lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_chesvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* af,
+                           lapack_int ldaf, lapack_int* ipiv,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx,
+                           float* rcond, float* ferr, float* berr );
+lapack_int LAPACKE_zhesvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* af,
+                           lapack_int ldaf, lapack_int* ipiv,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* rcond, double* ferr, double* berr );
+
+lapack_int LAPACKE_chesvxx( int matrix_order, char fact, char uplo,
+                            lapack_int n, lapack_int nrhs,
+                            lapack_complex_float* a, lapack_int lda,
+                            lapack_complex_float* af, lapack_int ldaf,
+                            lapack_int* ipiv, char* equed, float* s,
+                            lapack_complex_float* b, lapack_int ldb,
+                            lapack_complex_float* x, lapack_int ldx,
+                            float* rcond, float* rpvgrw, float* berr,
+                            lapack_int n_err_bnds, float* err_bnds_norm,
+                            float* err_bnds_comp, lapack_int nparams,
+                            float* params );
+lapack_int LAPACKE_zhesvxx( int matrix_order, char fact, char uplo,
+                            lapack_int n, lapack_int nrhs,
+                            lapack_complex_double* a, lapack_int lda,
+                            lapack_complex_double* af, lapack_int ldaf,
+                            lapack_int* ipiv, char* equed, double* s,
+                            lapack_complex_double* b, lapack_int ldb,
+                            lapack_complex_double* x, lapack_int ldx,
+                            double* rcond, double* rpvgrw, double* berr,
+                            lapack_int n_err_bnds, double* err_bnds_norm,
+                            double* err_bnds_comp, lapack_int nparams,
+                            double* params );
+
+lapack_int LAPACKE_chetrd( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda, float* d,
+                           float* e, lapack_complex_float* tau );
+lapack_int LAPACKE_zhetrd( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda, double* d,
+                           double* e, lapack_complex_double* tau );
+
+lapack_int LAPACKE_chetrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_int* ipiv );
+lapack_int LAPACKE_zhetrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_int* ipiv );
+
+lapack_int LAPACKE_chetri( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           const lapack_int* ipiv );
+lapack_int LAPACKE_zhetri( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           const lapack_int* ipiv );
+
+lapack_int LAPACKE_chetrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* a,
+                           lapack_int lda, const lapack_int* ipiv,
+                           lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zhetrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* a,
+                           lapack_int lda, const lapack_int* ipiv,
+                           lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_chfrk( int matrix_order, char transr, char uplo, char trans,
+                          lapack_int n, lapack_int k, float alpha,
+                          const lapack_complex_float* a, lapack_int lda,
+                          float beta, lapack_complex_float* c );
+lapack_int LAPACKE_zhfrk( int matrix_order, char transr, char uplo, char trans,
+                          lapack_int n, lapack_int k, double alpha,
+                          const lapack_complex_double* a, lapack_int lda,
+                          double beta, lapack_complex_double* c );
+
+lapack_int LAPACKE_shgeqz( int matrix_order, char job, char compq, char compz,
+                           lapack_int n, lapack_int ilo, lapack_int ihi,
+                           float* h, lapack_int ldh, float* t, lapack_int ldt,
+                           float* alphar, float* alphai, float* beta, float* q,
+                           lapack_int ldq, float* z, lapack_int ldz );
+lapack_int LAPACKE_dhgeqz( int matrix_order, char job, char compq, char compz,
+                           lapack_int n, lapack_int ilo, lapack_int ihi,
+                           double* h, lapack_int ldh, double* t, lapack_int ldt,
+                           double* alphar, double* alphai, double* beta,
+                           double* q, lapack_int ldq, double* z,
+                           lapack_int ldz );
+lapack_int LAPACKE_chgeqz( int matrix_order, char job, char compq, char compz,
+                           lapack_int n, lapack_int ilo, lapack_int ihi,
+                           lapack_complex_float* h, lapack_int ldh,
+                           lapack_complex_float* t, lapack_int ldt,
+                           lapack_complex_float* alpha,
+                           lapack_complex_float* beta, lapack_complex_float* q,
+                           lapack_int ldq, lapack_complex_float* z,
+                           lapack_int ldz );
+lapack_int LAPACKE_zhgeqz( int matrix_order, char job, char compq, char compz,
+                           lapack_int n, lapack_int ilo, lapack_int ihi,
+                           lapack_complex_double* h, lapack_int ldh,
+                           lapack_complex_double* t, lapack_int ldt,
+                           lapack_complex_double* alpha,
+                           lapack_complex_double* beta,
+                           lapack_complex_double* q, lapack_int ldq,
+                           lapack_complex_double* z, lapack_int ldz );
+
+lapack_int LAPACKE_chpcon( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_float* ap,
+                           const lapack_int* ipiv, float anorm, float* rcond );
+lapack_int LAPACKE_zhpcon( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_double* ap,
+                           const lapack_int* ipiv, double anorm,
+                           double* rcond );
+
+lapack_int LAPACKE_chpev( int matrix_order, char jobz, char uplo, lapack_int n,
+                          lapack_complex_float* ap, float* w,
+                          lapack_complex_float* z, lapack_int ldz );
+lapack_int LAPACKE_zhpev( int matrix_order, char jobz, char uplo, lapack_int n,
+                          lapack_complex_double* ap, double* w,
+                          lapack_complex_double* z, lapack_int ldz );
+
+lapack_int LAPACKE_chpevd( int matrix_order, char jobz, char uplo, lapack_int n,
+                           lapack_complex_float* ap, float* w,
+                           lapack_complex_float* z, lapack_int ldz );
+lapack_int LAPACKE_zhpevd( int matrix_order, char jobz, char uplo, lapack_int n,
+                           lapack_complex_double* ap, double* w,
+                           lapack_complex_double* z, lapack_int ldz );
+
+lapack_int LAPACKE_chpevx( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, lapack_complex_float* ap, float vl,
+                           float vu, lapack_int il, lapack_int iu, float abstol,
+                           lapack_int* m, float* w, lapack_complex_float* z,
+                           lapack_int ldz, lapack_int* ifail );
+lapack_int LAPACKE_zhpevx( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, lapack_complex_double* ap, double vl,
+                           double vu, lapack_int il, lapack_int iu,
+                           double abstol, lapack_int* m, double* w,
+                           lapack_complex_double* z, lapack_int ldz,
+                           lapack_int* ifail );
+
+lapack_int LAPACKE_chpgst( int matrix_order, lapack_int itype, char uplo,
+                           lapack_int n, lapack_complex_float* ap,
+                           const lapack_complex_float* bp );
+lapack_int LAPACKE_zhpgst( int matrix_order, lapack_int itype, char uplo,
+                           lapack_int n, lapack_complex_double* ap,
+                           const lapack_complex_double* bp );
+
+lapack_int LAPACKE_chpgv( int matrix_order, lapack_int itype, char jobz,
+                          char uplo, lapack_int n, lapack_complex_float* ap,
+                          lapack_complex_float* bp, float* w,
+                          lapack_complex_float* z, lapack_int ldz );
+lapack_int LAPACKE_zhpgv( int matrix_order, lapack_int itype, char jobz,
+                          char uplo, lapack_int n, lapack_complex_double* ap,
+                          lapack_complex_double* bp, double* w,
+                          lapack_complex_double* z, lapack_int ldz );
+
+lapack_int LAPACKE_chpgvd( int matrix_order, lapack_int itype, char jobz,
+                           char uplo, lapack_int n, lapack_complex_float* ap,
+                           lapack_complex_float* bp, float* w,
+                           lapack_complex_float* z, lapack_int ldz );
+lapack_int LAPACKE_zhpgvd( int matrix_order, lapack_int itype, char jobz,
+                           char uplo, lapack_int n, lapack_complex_double* ap,
+                           lapack_complex_double* bp, double* w,
+                           lapack_complex_double* z, lapack_int ldz );
+
+lapack_int LAPACKE_chpgvx( int matrix_order, lapack_int itype, char jobz,
+                           char range, char uplo, lapack_int n,
+                           lapack_complex_float* ap, lapack_complex_float* bp,
+                           float vl, float vu, lapack_int il, lapack_int iu,
+                           float abstol, lapack_int* m, float* w,
+                           lapack_complex_float* z, lapack_int ldz,
+                           lapack_int* ifail );
+lapack_int LAPACKE_zhpgvx( int matrix_order, lapack_int itype, char jobz,
+                           char range, char uplo, lapack_int n,
+                           lapack_complex_double* ap, lapack_complex_double* bp,
+                           double vl, double vu, lapack_int il, lapack_int iu,
+                           double abstol, lapack_int* m, double* w,
+                           lapack_complex_double* z, lapack_int ldz,
+                           lapack_int* ifail );
+
+lapack_int LAPACKE_chprfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* ap,
+                           const lapack_complex_float* afp,
+                           const lapack_int* ipiv,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx, float* ferr,
+                           float* berr );
+lapack_int LAPACKE_zhprfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* ap,
+                           const lapack_complex_double* afp,
+                           const lapack_int* ipiv,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+
+lapack_int LAPACKE_chpsv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, lapack_complex_float* ap,
+                          lapack_int* ipiv, lapack_complex_float* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_zhpsv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, lapack_complex_double* ap,
+                          lapack_int* ipiv, lapack_complex_double* b,
+                          lapack_int ldb );
+
+lapack_int LAPACKE_chpsvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* ap,
+                           lapack_complex_float* afp, lapack_int* ipiv,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx,
+                           float* rcond, float* ferr, float* berr );
+lapack_int LAPACKE_zhpsvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* ap,
+                           lapack_complex_double* afp, lapack_int* ipiv,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* rcond, double* ferr, double* berr );
+
+lapack_int LAPACKE_chptrd( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* ap, float* d, float* e,
+                           lapack_complex_float* tau );
+lapack_int LAPACKE_zhptrd( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* ap, double* d, double* e,
+                           lapack_complex_double* tau );
+
+lapack_int LAPACKE_chptrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* ap, lapack_int* ipiv );
+lapack_int LAPACKE_zhptrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* ap, lapack_int* ipiv );
+
+lapack_int LAPACKE_chptri( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* ap, const lapack_int* ipiv );
+lapack_int LAPACKE_zhptri( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* ap, const lapack_int* ipiv );
+
+lapack_int LAPACKE_chptrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* ap,
+                           const lapack_int* ipiv, lapack_complex_float* b,
+                           lapack_int ldb );
+lapack_int LAPACKE_zhptrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* ap,
+                           const lapack_int* ipiv, lapack_complex_double* b,
+                           lapack_int ldb );
+
+lapack_int LAPACKE_shsein( int matrix_order, char job, char eigsrc, char initv,
+                           lapack_logical* select, lapack_int n, const float* h,
+                           lapack_int ldh, float* wr, const float* wi,
+                           float* vl, lapack_int ldvl, float* vr,
+                           lapack_int ldvr, lapack_int mm, lapack_int* m,
+                           lapack_int* ifaill, lapack_int* ifailr );
+lapack_int LAPACKE_dhsein( int matrix_order, char job, char eigsrc, char initv,
+                           lapack_logical* select, lapack_int n,
+                           const double* h, lapack_int ldh, double* wr,
+                           const double* wi, double* vl, lapack_int ldvl,
+                           double* vr, lapack_int ldvr, lapack_int mm,
+                           lapack_int* m, lapack_int* ifaill,
+                           lapack_int* ifailr );
+lapack_int LAPACKE_chsein( int matrix_order, char job, char eigsrc, char initv,
+                           const lapack_logical* select, lapack_int n,
+                           const lapack_complex_float* h, lapack_int ldh,
+                           lapack_complex_float* w, lapack_complex_float* vl,
+                           lapack_int ldvl, lapack_complex_float* vr,
+                           lapack_int ldvr, lapack_int mm, lapack_int* m,
+                           lapack_int* ifaill, lapack_int* ifailr );
+lapack_int LAPACKE_zhsein( int matrix_order, char job, char eigsrc, char initv,
+                           const lapack_logical* select, lapack_int n,
+                           const lapack_complex_double* h, lapack_int ldh,
+                           lapack_complex_double* w, lapack_complex_double* vl,
+                           lapack_int ldvl, lapack_complex_double* vr,
+                           lapack_int ldvr, lapack_int mm, lapack_int* m,
+                           lapack_int* ifaill, lapack_int* ifailr );
+
+lapack_int LAPACKE_shseqr( int matrix_order, char job, char compz, lapack_int n,
+                           lapack_int ilo, lapack_int ihi, float* h,
+                           lapack_int ldh, float* wr, float* wi, float* z,
+                           lapack_int ldz );
+lapack_int LAPACKE_dhseqr( int matrix_order, char job, char compz, lapack_int n,
+                           lapack_int ilo, lapack_int ihi, double* h,
+                           lapack_int ldh, double* wr, double* wi, double* z,
+                           lapack_int ldz );
+lapack_int LAPACKE_chseqr( int matrix_order, char job, char compz, lapack_int n,
+                           lapack_int ilo, lapack_int ihi,
+                           lapack_complex_float* h, lapack_int ldh,
+                           lapack_complex_float* w, lapack_complex_float* z,
+                           lapack_int ldz );
+lapack_int LAPACKE_zhseqr( int matrix_order, char job, char compz, lapack_int n,
+                           lapack_int ilo, lapack_int ihi,
+                           lapack_complex_double* h, lapack_int ldh,
+                           lapack_complex_double* w, lapack_complex_double* z,
+                           lapack_int ldz );
+
+lapack_int LAPACKE_clacgv( lapack_int n, lapack_complex_float* x,
+                           lapack_int incx );
+lapack_int LAPACKE_zlacgv( lapack_int n, lapack_complex_double* x,
+                           lapack_int incx );
+
+lapack_int LAPACKE_slacpy( int matrix_order, char uplo, lapack_int m,
+                           lapack_int n, const float* a, lapack_int lda, float* b,
+                           lapack_int ldb );
+lapack_int LAPACKE_dlacpy( int matrix_order, char uplo, lapack_int m,
+                           lapack_int n, const double* a, lapack_int lda, double* b,
+                           lapack_int ldb );
+lapack_int LAPACKE_clacpy( int matrix_order, char uplo, lapack_int m,
+                           lapack_int n, const lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* b,
+                           lapack_int ldb );
+lapack_int LAPACKE_zlacpy( int matrix_order, char uplo, lapack_int m,
+                           lapack_int n, const lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* b,
+                           lapack_int ldb );
+
+lapack_int LAPACKE_zlag2c( int matrix_order, lapack_int m, lapack_int n,
+                           const lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_float* sa, lapack_int ldsa );
+
+lapack_int LAPACKE_slag2d( int matrix_order, lapack_int m, lapack_int n,
+                           const float* sa, lapack_int ldsa, double* a,
+                           lapack_int lda );
+
+lapack_int LAPACKE_dlag2s( int matrix_order, lapack_int m, lapack_int n,
+                           const double* a, lapack_int lda, float* sa,
+                           lapack_int ldsa );
+
+lapack_int LAPACKE_clag2z( int matrix_order, lapack_int m, lapack_int n,
+                           const lapack_complex_float* sa, lapack_int ldsa,
+                           lapack_complex_double* a, lapack_int lda );
+
+lapack_int LAPACKE_slagge( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int kl, lapack_int ku, const float* d,
+                           float* a, lapack_int lda, lapack_int* iseed );
+lapack_int LAPACKE_dlagge( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int kl, lapack_int ku, const double* d,
+                           double* a, lapack_int lda, lapack_int* iseed );
+lapack_int LAPACKE_clagge( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int kl, lapack_int ku, const float* d,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_int* iseed );
+lapack_int LAPACKE_zlagge( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int kl, lapack_int ku, const double* d,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_int* iseed );
+
+float LAPACKE_slamch( char cmach );
+double LAPACKE_dlamch( char cmach );
+
+float LAPACKE_slange( int matrix_order, char norm, lapack_int m,
+                           lapack_int n, const float* a, lapack_int lda );
+double LAPACKE_dlange( int matrix_order, char norm, lapack_int m,
+                           lapack_int n, const double* a, lapack_int lda );
+float LAPACKE_clange( int matrix_order, char norm, lapack_int m,
+                           lapack_int n, const lapack_complex_float* a,
+                           lapack_int lda );
+double LAPACKE_zlange( int matrix_order, char norm, lapack_int m,
+                           lapack_int n, const lapack_complex_double* a,
+                           lapack_int lda );
+
+float LAPACKE_clanhe( int matrix_order, char norm, char uplo, lapack_int n,
+                           const lapack_complex_float* a, lapack_int lda );
+double LAPACKE_zlanhe( int matrix_order, char norm, char uplo, lapack_int n,
+                           const lapack_complex_double* a, lapack_int lda );
+
+float LAPACKE_slansy( int matrix_order, char norm, char uplo, lapack_int n,
+                           const float* a, lapack_int lda );
+double LAPACKE_dlansy( int matrix_order, char norm, char uplo, lapack_int n,
+                           const double* a, lapack_int lda );
+float LAPACKE_clansy( int matrix_order, char norm, char uplo, lapack_int n,
+                           const lapack_complex_float* a, lapack_int lda );
+double LAPACKE_zlansy( int matrix_order, char norm, char uplo, lapack_int n,
+                           const lapack_complex_double* a, lapack_int lda );
+
+float LAPACKE_slantr( int matrix_order, char norm, char uplo, char diag,
+                           lapack_int m, lapack_int n, const float* a,
+                           lapack_int lda );
+double LAPACKE_dlantr( int matrix_order, char norm, char uplo, char diag,
+                           lapack_int m, lapack_int n, const double* a,
+                           lapack_int lda );
+float LAPACKE_clantr( int matrix_order, char norm, char uplo, char diag,
+                           lapack_int m, lapack_int n, const lapack_complex_float* a,
+                           lapack_int lda );
+double LAPACKE_zlantr( int matrix_order, char norm, char uplo, char diag,
+                           lapack_int m, lapack_int n, const lapack_complex_double* a,
+                           lapack_int lda );
+
+
+lapack_int LAPACKE_slarfb( int matrix_order, char side, char trans, char direct,
+                           char storev, lapack_int m, lapack_int n,
+                           lapack_int k, const float* v, lapack_int ldv,
+                           const float* t, lapack_int ldt, float* c,
+                           lapack_int ldc );
+lapack_int LAPACKE_dlarfb( int matrix_order, char side, char trans, char direct,
+                           char storev, lapack_int m, lapack_int n,
+                           lapack_int k, const double* v, lapack_int ldv,
+                           const double* t, lapack_int ldt, double* c,
+                           lapack_int ldc );
+lapack_int LAPACKE_clarfb( int matrix_order, char side, char trans, char direct,
+                           char storev, lapack_int m, lapack_int n,
+                           lapack_int k, const lapack_complex_float* v,
+                           lapack_int ldv, const lapack_complex_float* t,
+                           lapack_int ldt, lapack_complex_float* c,
+                           lapack_int ldc );
+lapack_int LAPACKE_zlarfb( int matrix_order, char side, char trans, char direct,
+                           char storev, lapack_int m, lapack_int n,
+                           lapack_int k, const lapack_complex_double* v,
+                           lapack_int ldv, const lapack_complex_double* t,
+                           lapack_int ldt, lapack_complex_double* c,
+                           lapack_int ldc );
+
+lapack_int LAPACKE_slarfg( lapack_int n, float* alpha, float* x,
+                           lapack_int incx, float* tau );
+lapack_int LAPACKE_dlarfg( lapack_int n, double* alpha, double* x,
+                           lapack_int incx, double* tau );
+lapack_int LAPACKE_clarfg( lapack_int n, lapack_complex_float* alpha,
+                           lapack_complex_float* x, lapack_int incx,
+                           lapack_complex_float* tau );
+lapack_int LAPACKE_zlarfg( lapack_int n, lapack_complex_double* alpha,
+                           lapack_complex_double* x, lapack_int incx,
+                           lapack_complex_double* tau );
+
+lapack_int LAPACKE_slarft( int matrix_order, char direct, char storev,
+                           lapack_int n, lapack_int k, const float* v,
+                           lapack_int ldv, const float* tau, float* t,
+                           lapack_int ldt );
+lapack_int LAPACKE_dlarft( int matrix_order, char direct, char storev,
+                           lapack_int n, lapack_int k, const double* v,
+                           lapack_int ldv, const double* tau, double* t,
+                           lapack_int ldt );
+lapack_int LAPACKE_clarft( int matrix_order, char direct, char storev,
+                           lapack_int n, lapack_int k,
+                           const lapack_complex_float* v, lapack_int ldv,
+                           const lapack_complex_float* tau,
+                           lapack_complex_float* t, lapack_int ldt );
+lapack_int LAPACKE_zlarft( int matrix_order, char direct, char storev,
+                           lapack_int n, lapack_int k,
+                           const lapack_complex_double* v, lapack_int ldv,
+                           const lapack_complex_double* tau,
+                           lapack_complex_double* t, lapack_int ldt );
+
+lapack_int LAPACKE_slarfx( int matrix_order, char side, lapack_int m,
+                           lapack_int n, const float* v, float tau, float* c,
+                           lapack_int ldc, float* work );
+lapack_int LAPACKE_dlarfx( int matrix_order, char side, lapack_int m,
+                           lapack_int n, const double* v, double tau, double* c,
+                           lapack_int ldc, double* work );
+lapack_int LAPACKE_clarfx( int matrix_order, char side, lapack_int m,
+                           lapack_int n, const lapack_complex_float* v,
+                           lapack_complex_float tau, lapack_complex_float* c,
+                           lapack_int ldc, lapack_complex_float* work );
+lapack_int LAPACKE_zlarfx( int matrix_order, char side, lapack_int m,
+                           lapack_int n, const lapack_complex_double* v,
+                           lapack_complex_double tau, lapack_complex_double* c,
+                           lapack_int ldc, lapack_complex_double* work );
+
+lapack_int LAPACKE_slarnv( lapack_int idist, lapack_int* iseed, lapack_int n,
+                           float* x );
+lapack_int LAPACKE_dlarnv( lapack_int idist, lapack_int* iseed, lapack_int n,
+                           double* x );
+lapack_int LAPACKE_clarnv( lapack_int idist, lapack_int* iseed, lapack_int n,
+                           lapack_complex_float* x );
+lapack_int LAPACKE_zlarnv( lapack_int idist, lapack_int* iseed, lapack_int n,
+                           lapack_complex_double* x );
+
+lapack_int LAPACKE_slaset( int matrix_order, char uplo, lapack_int m,
+                           lapack_int n, float alpha, float beta, float* a,
+                           lapack_int lda );
+lapack_int LAPACKE_dlaset( int matrix_order, char uplo, lapack_int m,
+                           lapack_int n, double alpha, double beta, double* a,
+                           lapack_int lda );
+lapack_int LAPACKE_claset( int matrix_order, char uplo, lapack_int m,
+                           lapack_int n, lapack_complex_float alpha,
+                           lapack_complex_float beta, lapack_complex_float* a,
+                           lapack_int lda );
+lapack_int LAPACKE_zlaset( int matrix_order, char uplo, lapack_int m,
+                           lapack_int n, lapack_complex_double alpha,
+                           lapack_complex_double beta, lapack_complex_double* a,
+                           lapack_int lda );
+
+lapack_int LAPACKE_slasrt( char id, lapack_int n, float* d );
+lapack_int LAPACKE_dlasrt( char id, lapack_int n, double* d );
+
+lapack_int LAPACKE_slaswp( int matrix_order, lapack_int n, float* a,
+                           lapack_int lda, lapack_int k1, lapack_int k2,
+                           const lapack_int* ipiv, lapack_int incx );
+lapack_int LAPACKE_dlaswp( int matrix_order, lapack_int n, double* a,
+                           lapack_int lda, lapack_int k1, lapack_int k2,
+                           const lapack_int* ipiv, lapack_int incx );
+lapack_int LAPACKE_claswp( int matrix_order, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_int k1, lapack_int k2, const lapack_int* ipiv,
+                           lapack_int incx );
+lapack_int LAPACKE_zlaswp( int matrix_order, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_int k1, lapack_int k2, const lapack_int* ipiv,
+                           lapack_int incx );
+
+lapack_int LAPACKE_slatms( int matrix_order, lapack_int m, lapack_int n,
+                           char dist, lapack_int* iseed, char sym, float* d,
+                           lapack_int mode, float cond, float dmax,
+                           lapack_int kl, lapack_int ku, char pack, float* a,
+                           lapack_int lda );
+lapack_int LAPACKE_dlatms( int matrix_order, lapack_int m, lapack_int n,
+                           char dist, lapack_int* iseed, char sym, double* d,
+                           lapack_int mode, double cond, double dmax,
+                           lapack_int kl, lapack_int ku, char pack, double* a,
+                           lapack_int lda );
+lapack_int LAPACKE_clatms( int matrix_order, lapack_int m, lapack_int n,
+                           char dist, lapack_int* iseed, char sym, float* d,
+                           lapack_int mode, float cond, float dmax,
+                           lapack_int kl, lapack_int ku, char pack,
+                           lapack_complex_float* a, lapack_int lda );
+lapack_int LAPACKE_zlatms( int matrix_order, lapack_int m, lapack_int n,
+                           char dist, lapack_int* iseed, char sym, double* d,
+                           lapack_int mode, double cond, double dmax,
+                           lapack_int kl, lapack_int ku, char pack,
+                           lapack_complex_double* a, lapack_int lda );
+
+lapack_int LAPACKE_slauum( int matrix_order, char uplo, lapack_int n, float* a,
+                           lapack_int lda );
+lapack_int LAPACKE_dlauum( int matrix_order, char uplo, lapack_int n, double* a,
+                           lapack_int lda );
+lapack_int LAPACKE_clauum( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda );
+lapack_int LAPACKE_zlauum( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda );
+
+lapack_int LAPACKE_sopgtr( int matrix_order, char uplo, lapack_int n,
+                           const float* ap, const float* tau, float* q,
+                           lapack_int ldq );
+lapack_int LAPACKE_dopgtr( int matrix_order, char uplo, lapack_int n,
+                           const double* ap, const double* tau, double* q,
+                           lapack_int ldq );
+
+lapack_int LAPACKE_sopmtr( int matrix_order, char side, char uplo, char trans,
+                           lapack_int m, lapack_int n, const float* ap,
+                           const float* tau, float* c, lapack_int ldc );
+lapack_int LAPACKE_dopmtr( int matrix_order, char side, char uplo, char trans,
+                           lapack_int m, lapack_int n, const double* ap,
+                           const double* tau, double* c, lapack_int ldc );
+
+lapack_int LAPACKE_sorgbr( int matrix_order, char vect, lapack_int m,
+                           lapack_int n, lapack_int k, float* a, lapack_int lda,
+                           const float* tau );
+lapack_int LAPACKE_dorgbr( int matrix_order, char vect, lapack_int m,
+                           lapack_int n, lapack_int k, double* a,
+                           lapack_int lda, const double* tau );
+
+lapack_int LAPACKE_sorghr( int matrix_order, lapack_int n, lapack_int ilo,
+                           lapack_int ihi, float* a, lapack_int lda,
+                           const float* tau );
+lapack_int LAPACKE_dorghr( int matrix_order, lapack_int n, lapack_int ilo,
+                           lapack_int ihi, double* a, lapack_int lda,
+                           const double* tau );
+
+lapack_int LAPACKE_sorglq( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int k, float* a, lapack_int lda,
+                           const float* tau );
+lapack_int LAPACKE_dorglq( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int k, double* a, lapack_int lda,
+                           const double* tau );
+
+lapack_int LAPACKE_sorgql( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int k, float* a, lapack_int lda,
+                           const float* tau );
+lapack_int LAPACKE_dorgql( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int k, double* a, lapack_int lda,
+                           const double* tau );
+
+lapack_int LAPACKE_sorgqr( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int k, float* a, lapack_int lda,
+                           const float* tau );
+lapack_int LAPACKE_dorgqr( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int k, double* a, lapack_int lda,
+                           const double* tau );
+
+lapack_int LAPACKE_sorgrq( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int k, float* a, lapack_int lda,
+                           const float* tau );
+lapack_int LAPACKE_dorgrq( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int k, double* a, lapack_int lda,
+                           const double* tau );
+
+lapack_int LAPACKE_sorgtr( int matrix_order, char uplo, lapack_int n, float* a,
+                           lapack_int lda, const float* tau );
+lapack_int LAPACKE_dorgtr( int matrix_order, char uplo, lapack_int n, double* a,
+                           lapack_int lda, const double* tau );
+
+lapack_int LAPACKE_sormbr( int matrix_order, char vect, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const float* a, lapack_int lda, const float* tau,
+                           float* c, lapack_int ldc );
+lapack_int LAPACKE_dormbr( int matrix_order, char vect, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const double* a, lapack_int lda, const double* tau,
+                           double* c, lapack_int ldc );
+
+lapack_int LAPACKE_sormhr( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int ilo,
+                           lapack_int ihi, const float* a, lapack_int lda,
+                           const float* tau, float* c, lapack_int ldc );
+lapack_int LAPACKE_dormhr( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int ilo,
+                           lapack_int ihi, const double* a, lapack_int lda,
+                           const double* tau, double* c, lapack_int ldc );
+
+lapack_int LAPACKE_sormlq( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const float* a, lapack_int lda, const float* tau,
+                           float* c, lapack_int ldc );
+lapack_int LAPACKE_dormlq( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const double* a, lapack_int lda, const double* tau,
+                           double* c, lapack_int ldc );
+
+lapack_int LAPACKE_sormql( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const float* a, lapack_int lda, const float* tau,
+                           float* c, lapack_int ldc );
+lapack_int LAPACKE_dormql( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const double* a, lapack_int lda, const double* tau,
+                           double* c, lapack_int ldc );
+
+lapack_int LAPACKE_sormqr( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const float* a, lapack_int lda, const float* tau,
+                           float* c, lapack_int ldc );
+lapack_int LAPACKE_dormqr( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const double* a, lapack_int lda, const double* tau,
+                           double* c, lapack_int ldc );
+
+lapack_int LAPACKE_sormrq( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const float* a, lapack_int lda, const float* tau,
+                           float* c, lapack_int ldc );
+lapack_int LAPACKE_dormrq( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const double* a, lapack_int lda, const double* tau,
+                           double* c, lapack_int ldc );
+
+lapack_int LAPACKE_sormrz( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           lapack_int l, const float* a, lapack_int lda,
+                           const float* tau, float* c, lapack_int ldc );
+lapack_int LAPACKE_dormrz( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           lapack_int l, const double* a, lapack_int lda,
+                           const double* tau, double* c, lapack_int ldc );
+
+lapack_int LAPACKE_sormtr( int matrix_order, char side, char uplo, char trans,
+                           lapack_int m, lapack_int n, const float* a,
+                           lapack_int lda, const float* tau, float* c,
+                           lapack_int ldc );
+lapack_int LAPACKE_dormtr( int matrix_order, char side, char uplo, char trans,
+                           lapack_int m, lapack_int n, const double* a,
+                           lapack_int lda, const double* tau, double* c,
+                           lapack_int ldc );
+
+lapack_int LAPACKE_spbcon( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, const float* ab, lapack_int ldab,
+                           float anorm, float* rcond );
+lapack_int LAPACKE_dpbcon( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, const double* ab, lapack_int ldab,
+                           double anorm, double* rcond );
+lapack_int LAPACKE_cpbcon( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, const lapack_complex_float* ab,
+                           lapack_int ldab, float anorm, float* rcond );
+lapack_int LAPACKE_zpbcon( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, const lapack_complex_double* ab,
+                           lapack_int ldab, double anorm, double* rcond );
+
+lapack_int LAPACKE_spbequ( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, const float* ab, lapack_int ldab,
+                           float* s, float* scond, float* amax );
+lapack_int LAPACKE_dpbequ( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, const double* ab, lapack_int ldab,
+                           double* s, double* scond, double* amax );
+lapack_int LAPACKE_cpbequ( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, const lapack_complex_float* ab,
+                           lapack_int ldab, float* s, float* scond,
+                           float* amax );
+lapack_int LAPACKE_zpbequ( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, const lapack_complex_double* ab,
+                           lapack_int ldab, double* s, double* scond,
+                           double* amax );
+
+lapack_int LAPACKE_spbrfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, lapack_int nrhs, const float* ab,
+                           lapack_int ldab, const float* afb, lapack_int ldafb,
+                           const float* b, lapack_int ldb, float* x,
+                           lapack_int ldx, float* ferr, float* berr );
+lapack_int LAPACKE_dpbrfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, lapack_int nrhs, const double* ab,
+                           lapack_int ldab, const double* afb, lapack_int ldafb,
+                           const double* b, lapack_int ldb, double* x,
+                           lapack_int ldx, double* ferr, double* berr );
+lapack_int LAPACKE_cpbrfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, lapack_int nrhs,
+                           const lapack_complex_float* ab, lapack_int ldab,
+                           const lapack_complex_float* afb, lapack_int ldafb,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx, float* ferr,
+                           float* berr );
+lapack_int LAPACKE_zpbrfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, lapack_int nrhs,
+                           const lapack_complex_double* ab, lapack_int ldab,
+                           const lapack_complex_double* afb, lapack_int ldafb,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+
+lapack_int LAPACKE_spbstf( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kb, float* bb, lapack_int ldbb );
+lapack_int LAPACKE_dpbstf( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kb, double* bb, lapack_int ldbb );
+lapack_int LAPACKE_cpbstf( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kb, lapack_complex_float* bb,
+                           lapack_int ldbb );
+lapack_int LAPACKE_zpbstf( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kb, lapack_complex_double* bb,
+                           lapack_int ldbb );
+
+lapack_int LAPACKE_spbsv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int kd, lapack_int nrhs, float* ab,
+                          lapack_int ldab, float* b, lapack_int ldb );
+lapack_int LAPACKE_dpbsv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int kd, lapack_int nrhs, double* ab,
+                          lapack_int ldab, double* b, lapack_int ldb );
+lapack_int LAPACKE_cpbsv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int kd, lapack_int nrhs,
+                          lapack_complex_float* ab, lapack_int ldab,
+                          lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zpbsv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int kd, lapack_int nrhs,
+                          lapack_complex_double* ab, lapack_int ldab,
+                          lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_spbsvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int kd, lapack_int nrhs, float* ab,
+                           lapack_int ldab, float* afb, lapack_int ldafb,
+                           char* equed, float* s, float* b, lapack_int ldb,
+                           float* x, lapack_int ldx, float* rcond, float* ferr,
+                           float* berr );
+lapack_int LAPACKE_dpbsvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int kd, lapack_int nrhs, double* ab,
+                           lapack_int ldab, double* afb, lapack_int ldafb,
+                           char* equed, double* s, double* b, lapack_int ldb,
+                           double* x, lapack_int ldx, double* rcond,
+                           double* ferr, double* berr );
+lapack_int LAPACKE_cpbsvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int kd, lapack_int nrhs,
+                           lapack_complex_float* ab, lapack_int ldab,
+                           lapack_complex_float* afb, lapack_int ldafb,
+                           char* equed, float* s, lapack_complex_float* b,
+                           lapack_int ldb, lapack_complex_float* x,
+                           lapack_int ldx, float* rcond, float* ferr,
+                           float* berr );
+lapack_int LAPACKE_zpbsvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int kd, lapack_int nrhs,
+                           lapack_complex_double* ab, lapack_int ldab,
+                           lapack_complex_double* afb, lapack_int ldafb,
+                           char* equed, double* s, lapack_complex_double* b,
+                           lapack_int ldb, lapack_complex_double* x,
+                           lapack_int ldx, double* rcond, double* ferr,
+                           double* berr );
+
+lapack_int LAPACKE_spbtrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, float* ab, lapack_int ldab );
+lapack_int LAPACKE_dpbtrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, double* ab, lapack_int ldab );
+lapack_int LAPACKE_cpbtrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, lapack_complex_float* ab,
+                           lapack_int ldab );
+lapack_int LAPACKE_zpbtrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, lapack_complex_double* ab,
+                           lapack_int ldab );
+
+lapack_int LAPACKE_spbtrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, lapack_int nrhs, const float* ab,
+                           lapack_int ldab, float* b, lapack_int ldb );
+lapack_int LAPACKE_dpbtrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, lapack_int nrhs, const double* ab,
+                           lapack_int ldab, double* b, lapack_int ldb );
+lapack_int LAPACKE_cpbtrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, lapack_int nrhs,
+                           const lapack_complex_float* ab, lapack_int ldab,
+                           lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zpbtrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, lapack_int nrhs,
+                           const lapack_complex_double* ab, lapack_int ldab,
+                           lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_spftrf( int matrix_order, char transr, char uplo,
+                           lapack_int n, float* a );
+lapack_int LAPACKE_dpftrf( int matrix_order, char transr, char uplo,
+                           lapack_int n, double* a );
+lapack_int LAPACKE_cpftrf( int matrix_order, char transr, char uplo,
+                           lapack_int n, lapack_complex_float* a );
+lapack_int LAPACKE_zpftrf( int matrix_order, char transr, char uplo,
+                           lapack_int n, lapack_complex_double* a );
+
+lapack_int LAPACKE_spftri( int matrix_order, char transr, char uplo,
+                           lapack_int n, float* a );
+lapack_int LAPACKE_dpftri( int matrix_order, char transr, char uplo,
+                           lapack_int n, double* a );
+lapack_int LAPACKE_cpftri( int matrix_order, char transr, char uplo,
+                           lapack_int n, lapack_complex_float* a );
+lapack_int LAPACKE_zpftri( int matrix_order, char transr, char uplo,
+                           lapack_int n, lapack_complex_double* a );
+
+lapack_int LAPACKE_spftrs( int matrix_order, char transr, char uplo,
+                           lapack_int n, lapack_int nrhs, const float* a,
+                           float* b, lapack_int ldb );
+lapack_int LAPACKE_dpftrs( int matrix_order, char transr, char uplo,
+                           lapack_int n, lapack_int nrhs, const double* a,
+                           double* b, lapack_int ldb );
+lapack_int LAPACKE_cpftrs( int matrix_order, char transr, char uplo,
+                           lapack_int n, lapack_int nrhs,
+                           const lapack_complex_float* a,
+                           lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zpftrs( int matrix_order, char transr, char uplo,
+                           lapack_int n, lapack_int nrhs,
+                           const lapack_complex_double* a,
+                           lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_spocon( int matrix_order, char uplo, lapack_int n,
+                           const float* a, lapack_int lda, float anorm,
+                           float* rcond );
+lapack_int LAPACKE_dpocon( int matrix_order, char uplo, lapack_int n,
+                           const double* a, lapack_int lda, double anorm,
+                           double* rcond );
+lapack_int LAPACKE_cpocon( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_float* a, lapack_int lda,
+                           float anorm, float* rcond );
+lapack_int LAPACKE_zpocon( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_double* a, lapack_int lda,
+                           double anorm, double* rcond );
+
+lapack_int LAPACKE_spoequ( int matrix_order, lapack_int n, const float* a,
+                           lapack_int lda, float* s, float* scond,
+                           float* amax );
+lapack_int LAPACKE_dpoequ( int matrix_order, lapack_int n, const double* a,
+                           lapack_int lda, double* s, double* scond,
+                           double* amax );
+lapack_int LAPACKE_cpoequ( int matrix_order, lapack_int n,
+                           const lapack_complex_float* a, lapack_int lda,
+                           float* s, float* scond, float* amax );
+lapack_int LAPACKE_zpoequ( int matrix_order, lapack_int n,
+                           const lapack_complex_double* a, lapack_int lda,
+                           double* s, double* scond, double* amax );
+
+lapack_int LAPACKE_spoequb( int matrix_order, lapack_int n, const float* a,
+                            lapack_int lda, float* s, float* scond,
+                            float* amax );
+lapack_int LAPACKE_dpoequb( int matrix_order, lapack_int n, const double* a,
+                            lapack_int lda, double* s, double* scond,
+                            double* amax );
+lapack_int LAPACKE_cpoequb( int matrix_order, lapack_int n,
+                            const lapack_complex_float* a, lapack_int lda,
+                            float* s, float* scond, float* amax );
+lapack_int LAPACKE_zpoequb( int matrix_order, lapack_int n,
+                            const lapack_complex_double* a, lapack_int lda,
+                            double* s, double* scond, double* amax );
+
+lapack_int LAPACKE_sporfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const float* a, lapack_int lda,
+                           const float* af, lapack_int ldaf, const float* b,
+                           lapack_int ldb, float* x, lapack_int ldx,
+                           float* ferr, float* berr );
+lapack_int LAPACKE_dporfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const double* a, lapack_int lda,
+                           const double* af, lapack_int ldaf, const double* b,
+                           lapack_int ldb, double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+lapack_int LAPACKE_cporfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* a,
+                           lapack_int lda, const lapack_complex_float* af,
+                           lapack_int ldaf, const lapack_complex_float* b,
+                           lapack_int ldb, lapack_complex_float* x,
+                           lapack_int ldx, float* ferr, float* berr );
+lapack_int LAPACKE_zporfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* a,
+                           lapack_int lda, const lapack_complex_double* af,
+                           lapack_int ldaf, const lapack_complex_double* b,
+                           lapack_int ldb, lapack_complex_double* x,
+                           lapack_int ldx, double* ferr, double* berr );
+
+lapack_int LAPACKE_sporfsx( int matrix_order, char uplo, char equed,
+                            lapack_int n, lapack_int nrhs, const float* a,
+                            lapack_int lda, const float* af, lapack_int ldaf,
+                            const float* s, const float* b, lapack_int ldb,
+                            float* x, lapack_int ldx, float* rcond, float* berr,
+                            lapack_int n_err_bnds, float* err_bnds_norm,
+                            float* err_bnds_comp, lapack_int nparams,
+                            float* params );
+lapack_int LAPACKE_dporfsx( int matrix_order, char uplo, char equed,
+                            lapack_int n, lapack_int nrhs, const double* a,
+                            lapack_int lda, const double* af, lapack_int ldaf,
+                            const double* s, const double* b, lapack_int ldb,
+                            double* x, lapack_int ldx, double* rcond,
+                            double* berr, lapack_int n_err_bnds,
+                            double* err_bnds_norm, double* err_bnds_comp,
+                            lapack_int nparams, double* params );
+lapack_int LAPACKE_cporfsx( int matrix_order, char uplo, char equed,
+                            lapack_int n, lapack_int nrhs,
+                            const lapack_complex_float* a, lapack_int lda,
+                            const lapack_complex_float* af, lapack_int ldaf,
+                            const float* s, const lapack_complex_float* b,
+                            lapack_int ldb, lapack_complex_float* x,
+                            lapack_int ldx, float* rcond, float* berr,
+                            lapack_int n_err_bnds, float* err_bnds_norm,
+                            float* err_bnds_comp, lapack_int nparams,
+                            float* params );
+lapack_int LAPACKE_zporfsx( int matrix_order, char uplo, char equed,
+                            lapack_int n, lapack_int nrhs,
+                            const lapack_complex_double* a, lapack_int lda,
+                            const lapack_complex_double* af, lapack_int ldaf,
+                            const double* s, const lapack_complex_double* b,
+                            lapack_int ldb, lapack_complex_double* x,
+                            lapack_int ldx, double* rcond, double* berr,
+                            lapack_int n_err_bnds, double* err_bnds_norm,
+                            double* err_bnds_comp, lapack_int nparams,
+                            double* params );
+
+lapack_int LAPACKE_sposv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, float* a, lapack_int lda, float* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_dposv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, double* a, lapack_int lda, double* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_cposv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, lapack_complex_float* a,
+                          lapack_int lda, lapack_complex_float* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_zposv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, lapack_complex_double* a,
+                          lapack_int lda, lapack_complex_double* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_dsposv( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, double* a, lapack_int lda,
+                           double* b, lapack_int ldb, double* x, lapack_int ldx,
+                           lapack_int* iter );
+lapack_int LAPACKE_zcposv( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* b,
+                           lapack_int ldb, lapack_complex_double* x,
+                           lapack_int ldx, lapack_int* iter );
+
+lapack_int LAPACKE_sposvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, float* a, lapack_int lda, float* af,
+                           lapack_int ldaf, char* equed, float* s, float* b,
+                           lapack_int ldb, float* x, lapack_int ldx,
+                           float* rcond, float* ferr, float* berr );
+lapack_int LAPACKE_dposvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, double* a, lapack_int lda,
+                           double* af, lapack_int ldaf, char* equed, double* s,
+                           double* b, lapack_int ldb, double* x, lapack_int ldx,
+                           double* rcond, double* ferr, double* berr );
+lapack_int LAPACKE_cposvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* af,
+                           lapack_int ldaf, char* equed, float* s,
+                           lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx,
+                           float* rcond, float* ferr, float* berr );
+lapack_int LAPACKE_zposvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* af,
+                           lapack_int ldaf, char* equed, double* s,
+                           lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* rcond, double* ferr, double* berr );
+
+lapack_int LAPACKE_sposvxx( int matrix_order, char fact, char uplo,
+                            lapack_int n, lapack_int nrhs, float* a,
+                            lapack_int lda, float* af, lapack_int ldaf,
+                            char* equed, float* s, float* b, lapack_int ldb,
+                            float* x, lapack_int ldx, float* rcond,
+                            float* rpvgrw, float* berr, lapack_int n_err_bnds,
+                            float* err_bnds_norm, float* err_bnds_comp,
+                            lapack_int nparams, float* params );
+lapack_int LAPACKE_dposvxx( int matrix_order, char fact, char uplo,
+                            lapack_int n, lapack_int nrhs, double* a,
+                            lapack_int lda, double* af, lapack_int ldaf,
+                            char* equed, double* s, double* b, lapack_int ldb,
+                            double* x, lapack_int ldx, double* rcond,
+                            double* rpvgrw, double* berr, lapack_int n_err_bnds,
+                            double* err_bnds_norm, double* err_bnds_comp,
+                            lapack_int nparams, double* params );
+lapack_int LAPACKE_cposvxx( int matrix_order, char fact, char uplo,
+                            lapack_int n, lapack_int nrhs,
+                            lapack_complex_float* a, lapack_int lda,
+                            lapack_complex_float* af, lapack_int ldaf,
+                            char* equed, float* s, lapack_complex_float* b,
+                            lapack_int ldb, lapack_complex_float* x,
+                            lapack_int ldx, float* rcond, float* rpvgrw,
+                            float* berr, lapack_int n_err_bnds,
+                            float* err_bnds_norm, float* err_bnds_comp,
+                            lapack_int nparams, float* params );
+lapack_int LAPACKE_zposvxx( int matrix_order, char fact, char uplo,
+                            lapack_int n, lapack_int nrhs,
+                            lapack_complex_double* a, lapack_int lda,
+                            lapack_complex_double* af, lapack_int ldaf,
+                            char* equed, double* s, lapack_complex_double* b,
+                            lapack_int ldb, lapack_complex_double* x,
+                            lapack_int ldx, double* rcond, double* rpvgrw,
+                            double* berr, lapack_int n_err_bnds,
+                            double* err_bnds_norm, double* err_bnds_comp,
+                            lapack_int nparams, double* params );
+
+lapack_int LAPACKE_spotrf( int matrix_order, char uplo, lapack_int n, float* a,
+                           lapack_int lda );
+lapack_int LAPACKE_dpotrf( int matrix_order, char uplo, lapack_int n, double* a,
+                           lapack_int lda );
+lapack_int LAPACKE_cpotrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda );
+lapack_int LAPACKE_zpotrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda );
+
+lapack_int LAPACKE_spotri( int matrix_order, char uplo, lapack_int n, float* a,
+                           lapack_int lda );
+lapack_int LAPACKE_dpotri( int matrix_order, char uplo, lapack_int n, double* a,
+                           lapack_int lda );
+lapack_int LAPACKE_cpotri( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda );
+lapack_int LAPACKE_zpotri( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda );
+
+lapack_int LAPACKE_spotrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const float* a, lapack_int lda,
+                           float* b, lapack_int ldb );
+lapack_int LAPACKE_dpotrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const double* a, lapack_int lda,
+                           double* b, lapack_int ldb );
+lapack_int LAPACKE_cpotrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* b,
+                           lapack_int ldb );
+lapack_int LAPACKE_zpotrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* b,
+                           lapack_int ldb );
+
+lapack_int LAPACKE_sppcon( int matrix_order, char uplo, lapack_int n,
+                           const float* ap, float anorm, float* rcond );
+lapack_int LAPACKE_dppcon( int matrix_order, char uplo, lapack_int n,
+                           const double* ap, double anorm, double* rcond );
+lapack_int LAPACKE_cppcon( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_float* ap, float anorm,
+                           float* rcond );
+lapack_int LAPACKE_zppcon( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_double* ap, double anorm,
+                           double* rcond );
+
+lapack_int LAPACKE_sppequ( int matrix_order, char uplo, lapack_int n,
+                           const float* ap, float* s, float* scond,
+                           float* amax );
+lapack_int LAPACKE_dppequ( int matrix_order, char uplo, lapack_int n,
+                           const double* ap, double* s, double* scond,
+                           double* amax );
+lapack_int LAPACKE_cppequ( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_float* ap, float* s,
+                           float* scond, float* amax );
+lapack_int LAPACKE_zppequ( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_double* ap, double* s,
+                           double* scond, double* amax );
+
+lapack_int LAPACKE_spprfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const float* ap, const float* afp,
+                           const float* b, lapack_int ldb, float* x,
+                           lapack_int ldx, float* ferr, float* berr );
+lapack_int LAPACKE_dpprfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const double* ap, const double* afp,
+                           const double* b, lapack_int ldb, double* x,
+                           lapack_int ldx, double* ferr, double* berr );
+lapack_int LAPACKE_cpprfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* ap,
+                           const lapack_complex_float* afp,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx, float* ferr,
+                           float* berr );
+lapack_int LAPACKE_zpprfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* ap,
+                           const lapack_complex_double* afp,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+
+lapack_int LAPACKE_sppsv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, float* ap, float* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_dppsv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, double* ap, double* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_cppsv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, lapack_complex_float* ap,
+                          lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zppsv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, lapack_complex_double* ap,
+                          lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_sppsvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, float* ap, float* afp, char* equed,
+                           float* s, float* b, lapack_int ldb, float* x,
+                           lapack_int ldx, float* rcond, float* ferr,
+                           float* berr );
+lapack_int LAPACKE_dppsvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, double* ap, double* afp,
+                           char* equed, double* s, double* b, lapack_int ldb,
+                           double* x, lapack_int ldx, double* rcond,
+                           double* ferr, double* berr );
+lapack_int LAPACKE_cppsvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, lapack_complex_float* ap,
+                           lapack_complex_float* afp, char* equed, float* s,
+                           lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx,
+                           float* rcond, float* ferr, float* berr );
+lapack_int LAPACKE_zppsvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, lapack_complex_double* ap,
+                           lapack_complex_double* afp, char* equed, double* s,
+                           lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* rcond, double* ferr, double* berr );
+
+lapack_int LAPACKE_spptrf( int matrix_order, char uplo, lapack_int n,
+                           float* ap );
+lapack_int LAPACKE_dpptrf( int matrix_order, char uplo, lapack_int n,
+                           double* ap );
+lapack_int LAPACKE_cpptrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* ap );
+lapack_int LAPACKE_zpptrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* ap );
+
+lapack_int LAPACKE_spptri( int matrix_order, char uplo, lapack_int n,
+                           float* ap );
+lapack_int LAPACKE_dpptri( int matrix_order, char uplo, lapack_int n,
+                           double* ap );
+lapack_int LAPACKE_cpptri( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* ap );
+lapack_int LAPACKE_zpptri( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* ap );
+
+lapack_int LAPACKE_spptrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const float* ap, float* b,
+                           lapack_int ldb );
+lapack_int LAPACKE_dpptrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const double* ap, double* b,
+                           lapack_int ldb );
+lapack_int LAPACKE_cpptrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* ap,
+                           lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zpptrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* ap,
+                           lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_spstrf( int matrix_order, char uplo, lapack_int n, float* a,
+                           lapack_int lda, lapack_int* piv, lapack_int* rank,
+                           float tol );
+lapack_int LAPACKE_dpstrf( int matrix_order, char uplo, lapack_int n, double* a,
+                           lapack_int lda, lapack_int* piv, lapack_int* rank,
+                           double tol );
+lapack_int LAPACKE_cpstrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_int* piv, lapack_int* rank, float tol );
+lapack_int LAPACKE_zpstrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_int* piv, lapack_int* rank, double tol );
+
+lapack_int LAPACKE_sptcon( lapack_int n, const float* d, const float* e,
+                           float anorm, float* rcond );
+lapack_int LAPACKE_dptcon( lapack_int n, const double* d, const double* e,
+                           double anorm, double* rcond );
+lapack_int LAPACKE_cptcon( lapack_int n, const float* d,
+                           const lapack_complex_float* e, float anorm,
+                           float* rcond );
+lapack_int LAPACKE_zptcon( lapack_int n, const double* d,
+                           const lapack_complex_double* e, double anorm,
+                           double* rcond );
+
+lapack_int LAPACKE_spteqr( int matrix_order, char compz, lapack_int n, float* d,
+                           float* e, float* z, lapack_int ldz );
+lapack_int LAPACKE_dpteqr( int matrix_order, char compz, lapack_int n,
+                           double* d, double* e, double* z, lapack_int ldz );
+lapack_int LAPACKE_cpteqr( int matrix_order, char compz, lapack_int n, float* d,
+                           float* e, lapack_complex_float* z, lapack_int ldz );
+lapack_int LAPACKE_zpteqr( int matrix_order, char compz, lapack_int n,
+                           double* d, double* e, lapack_complex_double* z,
+                           lapack_int ldz );
+
+lapack_int LAPACKE_sptrfs( int matrix_order, lapack_int n, lapack_int nrhs,
+                           const float* d, const float* e, const float* df,
+                           const float* ef, const float* b, lapack_int ldb,
+                           float* x, lapack_int ldx, float* ferr, float* berr );
+lapack_int LAPACKE_dptrfs( int matrix_order, lapack_int n, lapack_int nrhs,
+                           const double* d, const double* e, const double* df,
+                           const double* ef, const double* b, lapack_int ldb,
+                           double* x, lapack_int ldx, double* ferr,
+                           double* berr );
+lapack_int LAPACKE_cptrfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const float* d,
+                           const lapack_complex_float* e, const float* df,
+                           const lapack_complex_float* ef,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx, float* ferr,
+                           float* berr );
+lapack_int LAPACKE_zptrfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const double* d,
+                           const lapack_complex_double* e, const double* df,
+                           const lapack_complex_double* ef,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+
+lapack_int LAPACKE_sptsv( int matrix_order, lapack_int n, lapack_int nrhs,
+                          float* d, float* e, float* b, lapack_int ldb );
+lapack_int LAPACKE_dptsv( int matrix_order, lapack_int n, lapack_int nrhs,
+                          double* d, double* e, double* b, lapack_int ldb );
+lapack_int LAPACKE_cptsv( int matrix_order, lapack_int n, lapack_int nrhs,
+                          float* d, lapack_complex_float* e,
+                          lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zptsv( int matrix_order, lapack_int n, lapack_int nrhs,
+                          double* d, lapack_complex_double* e,
+                          lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_sptsvx( int matrix_order, char fact, lapack_int n,
+                           lapack_int nrhs, const float* d, const float* e,
+                           float* df, float* ef, const float* b, lapack_int ldb,
+                           float* x, lapack_int ldx, float* rcond, float* ferr,
+                           float* berr );
+lapack_int LAPACKE_dptsvx( int matrix_order, char fact, lapack_int n,
+                           lapack_int nrhs, const double* d, const double* e,
+                           double* df, double* ef, const double* b,
+                           lapack_int ldb, double* x, lapack_int ldx,
+                           double* rcond, double* ferr, double* berr );
+lapack_int LAPACKE_cptsvx( int matrix_order, char fact, lapack_int n,
+                           lapack_int nrhs, const float* d,
+                           const lapack_complex_float* e, float* df,
+                           lapack_complex_float* ef,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx,
+                           float* rcond, float* ferr, float* berr );
+lapack_int LAPACKE_zptsvx( int matrix_order, char fact, lapack_int n,
+                           lapack_int nrhs, const double* d,
+                           const lapack_complex_double* e, double* df,
+                           lapack_complex_double* ef,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* rcond, double* ferr, double* berr );
+
+lapack_int LAPACKE_spttrf( lapack_int n, float* d, float* e );
+lapack_int LAPACKE_dpttrf( lapack_int n, double* d, double* e );
+lapack_int LAPACKE_cpttrf( lapack_int n, float* d, lapack_complex_float* e );
+lapack_int LAPACKE_zpttrf( lapack_int n, double* d, lapack_complex_double* e );
+
+lapack_int LAPACKE_spttrs( int matrix_order, lapack_int n, lapack_int nrhs,
+                           const float* d, const float* e, float* b,
+                           lapack_int ldb );
+lapack_int LAPACKE_dpttrs( int matrix_order, lapack_int n, lapack_int nrhs,
+                           const double* d, const double* e, double* b,
+                           lapack_int ldb );
+lapack_int LAPACKE_cpttrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const float* d,
+                           const lapack_complex_float* e,
+                           lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zpttrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const double* d,
+                           const lapack_complex_double* e,
+                           lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_ssbev( int matrix_order, char jobz, char uplo, lapack_int n,
+                          lapack_int kd, float* ab, lapack_int ldab, float* w,
+                          float* z, lapack_int ldz );
+lapack_int LAPACKE_dsbev( int matrix_order, char jobz, char uplo, lapack_int n,
+                          lapack_int kd, double* ab, lapack_int ldab, double* w,
+                          double* z, lapack_int ldz );
+
+lapack_int LAPACKE_ssbevd( int matrix_order, char jobz, char uplo, lapack_int n,
+                           lapack_int kd, float* ab, lapack_int ldab, float* w,
+                           float* z, lapack_int ldz );
+lapack_int LAPACKE_dsbevd( int matrix_order, char jobz, char uplo, lapack_int n,
+                           lapack_int kd, double* ab, lapack_int ldab,
+                           double* w, double* z, lapack_int ldz );
+
+lapack_int LAPACKE_ssbevx( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, lapack_int kd, float* ab,
+                           lapack_int ldab, float* q, lapack_int ldq, float vl,
+                           float vu, lapack_int il, lapack_int iu, float abstol,
+                           lapack_int* m, float* w, float* z, lapack_int ldz,
+                           lapack_int* ifail );
+lapack_int LAPACKE_dsbevx( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, lapack_int kd, double* ab,
+                           lapack_int ldab, double* q, lapack_int ldq,
+                           double vl, double vu, lapack_int il, lapack_int iu,
+                           double abstol, lapack_int* m, double* w, double* z,
+                           lapack_int ldz, lapack_int* ifail );
+
+lapack_int LAPACKE_ssbgst( int matrix_order, char vect, char uplo, lapack_int n,
+                           lapack_int ka, lapack_int kb, float* ab,
+                           lapack_int ldab, const float* bb, lapack_int ldbb,
+                           float* x, lapack_int ldx );
+lapack_int LAPACKE_dsbgst( int matrix_order, char vect, char uplo, lapack_int n,
+                           lapack_int ka, lapack_int kb, double* ab,
+                           lapack_int ldab, const double* bb, lapack_int ldbb,
+                           double* x, lapack_int ldx );
+
+lapack_int LAPACKE_ssbgv( int matrix_order, char jobz, char uplo, lapack_int n,
+                          lapack_int ka, lapack_int kb, float* ab,
+                          lapack_int ldab, float* bb, lapack_int ldbb, float* w,
+                          float* z, lapack_int ldz );
+lapack_int LAPACKE_dsbgv( int matrix_order, char jobz, char uplo, lapack_int n,
+                          lapack_int ka, lapack_int kb, double* ab,
+                          lapack_int ldab, double* bb, lapack_int ldbb,
+                          double* w, double* z, lapack_int ldz );
+
+lapack_int LAPACKE_ssbgvd( int matrix_order, char jobz, char uplo, lapack_int n,
+                           lapack_int ka, lapack_int kb, float* ab,
+                           lapack_int ldab, float* bb, lapack_int ldbb,
+                           float* w, float* z, lapack_int ldz );
+lapack_int LAPACKE_dsbgvd( int matrix_order, char jobz, char uplo, lapack_int n,
+                           lapack_int ka, lapack_int kb, double* ab,
+                           lapack_int ldab, double* bb, lapack_int ldbb,
+                           double* w, double* z, lapack_int ldz );
+
+lapack_int LAPACKE_ssbgvx( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, lapack_int ka, lapack_int kb,
+                           float* ab, lapack_int ldab, float* bb,
+                           lapack_int ldbb, float* q, lapack_int ldq, float vl,
+                           float vu, lapack_int il, lapack_int iu, float abstol,
+                           lapack_int* m, float* w, float* z, lapack_int ldz,
+                           lapack_int* ifail );
+lapack_int LAPACKE_dsbgvx( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, lapack_int ka, lapack_int kb,
+                           double* ab, lapack_int ldab, double* bb,
+                           lapack_int ldbb, double* q, lapack_int ldq,
+                           double vl, double vu, lapack_int il, lapack_int iu,
+                           double abstol, lapack_int* m, double* w, double* z,
+                           lapack_int ldz, lapack_int* ifail );
+
+lapack_int LAPACKE_ssbtrd( int matrix_order, char vect, char uplo, lapack_int n,
+                           lapack_int kd, float* ab, lapack_int ldab, float* d,
+                           float* e, float* q, lapack_int ldq );
+lapack_int LAPACKE_dsbtrd( int matrix_order, char vect, char uplo, lapack_int n,
+                           lapack_int kd, double* ab, lapack_int ldab,
+                           double* d, double* e, double* q, lapack_int ldq );
+
+lapack_int LAPACKE_ssfrk( int matrix_order, char transr, char uplo, char trans,
+                          lapack_int n, lapack_int k, float alpha,
+                          const float* a, lapack_int lda, float beta,
+                          float* c );
+lapack_int LAPACKE_dsfrk( int matrix_order, char transr, char uplo, char trans,
+                          lapack_int n, lapack_int k, double alpha,
+                          const double* a, lapack_int lda, double beta,
+                          double* c );
+
+lapack_int LAPACKE_sspcon( int matrix_order, char uplo, lapack_int n,
+                           const float* ap, const lapack_int* ipiv, float anorm,
+                           float* rcond );
+lapack_int LAPACKE_dspcon( int matrix_order, char uplo, lapack_int n,
+                           const double* ap, const lapack_int* ipiv,
+                           double anorm, double* rcond );
+lapack_int LAPACKE_cspcon( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_float* ap,
+                           const lapack_int* ipiv, float anorm, float* rcond );
+lapack_int LAPACKE_zspcon( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_double* ap,
+                           const lapack_int* ipiv, double anorm,
+                           double* rcond );
+
+lapack_int LAPACKE_sspev( int matrix_order, char jobz, char uplo, lapack_int n,
+                          float* ap, float* w, float* z, lapack_int ldz );
+lapack_int LAPACKE_dspev( int matrix_order, char jobz, char uplo, lapack_int n,
+                          double* ap, double* w, double* z, lapack_int ldz );
+
+lapack_int LAPACKE_sspevd( int matrix_order, char jobz, char uplo, lapack_int n,
+                           float* ap, float* w, float* z, lapack_int ldz );
+lapack_int LAPACKE_dspevd( int matrix_order, char jobz, char uplo, lapack_int n,
+                           double* ap, double* w, double* z, lapack_int ldz );
+
+lapack_int LAPACKE_sspevx( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, float* ap, float vl, float vu,
+                           lapack_int il, lapack_int iu, float abstol,
+                           lapack_int* m, float* w, float* z, lapack_int ldz,
+                           lapack_int* ifail );
+lapack_int LAPACKE_dspevx( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, double* ap, double vl, double vu,
+                           lapack_int il, lapack_int iu, double abstol,
+                           lapack_int* m, double* w, double* z, lapack_int ldz,
+                           lapack_int* ifail );
+
+lapack_int LAPACKE_sspgst( int matrix_order, lapack_int itype, char uplo,
+                           lapack_int n, float* ap, const float* bp );
+lapack_int LAPACKE_dspgst( int matrix_order, lapack_int itype, char uplo,
+                           lapack_int n, double* ap, const double* bp );
+
+lapack_int LAPACKE_sspgv( int matrix_order, lapack_int itype, char jobz,
+                          char uplo, lapack_int n, float* ap, float* bp,
+                          float* w, float* z, lapack_int ldz );
+lapack_int LAPACKE_dspgv( int matrix_order, lapack_int itype, char jobz,
+                          char uplo, lapack_int n, double* ap, double* bp,
+                          double* w, double* z, lapack_int ldz );
+
+lapack_int LAPACKE_sspgvd( int matrix_order, lapack_int itype, char jobz,
+                           char uplo, lapack_int n, float* ap, float* bp,
+                           float* w, float* z, lapack_int ldz );
+lapack_int LAPACKE_dspgvd( int matrix_order, lapack_int itype, char jobz,
+                           char uplo, lapack_int n, double* ap, double* bp,
+                           double* w, double* z, lapack_int ldz );
+
+lapack_int LAPACKE_sspgvx( int matrix_order, lapack_int itype, char jobz,
+                           char range, char uplo, lapack_int n, float* ap,
+                           float* bp, float vl, float vu, lapack_int il,
+                           lapack_int iu, float abstol, lapack_int* m, float* w,
+                           float* z, lapack_int ldz, lapack_int* ifail );
+lapack_int LAPACKE_dspgvx( int matrix_order, lapack_int itype, char jobz,
+                           char range, char uplo, lapack_int n, double* ap,
+                           double* bp, double vl, double vu, lapack_int il,
+                           lapack_int iu, double abstol, lapack_int* m,
+                           double* w, double* z, lapack_int ldz,
+                           lapack_int* ifail );
+
+lapack_int LAPACKE_ssprfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const float* ap, const float* afp,
+                           const lapack_int* ipiv, const float* b,
+                           lapack_int ldb, float* x, lapack_int ldx,
+                           float* ferr, float* berr );
+lapack_int LAPACKE_dsprfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const double* ap, const double* afp,
+                           const lapack_int* ipiv, const double* b,
+                           lapack_int ldb, double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+lapack_int LAPACKE_csprfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* ap,
+                           const lapack_complex_float* afp,
+                           const lapack_int* ipiv,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx, float* ferr,
+                           float* berr );
+lapack_int LAPACKE_zsprfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* ap,
+                           const lapack_complex_double* afp,
+                           const lapack_int* ipiv,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+
+lapack_int LAPACKE_sspsv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, float* ap, lapack_int* ipiv,
+                          float* b, lapack_int ldb );
+lapack_int LAPACKE_dspsv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, double* ap, lapack_int* ipiv,
+                          double* b, lapack_int ldb );
+lapack_int LAPACKE_cspsv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, lapack_complex_float* ap,
+                          lapack_int* ipiv, lapack_complex_float* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_zspsv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, lapack_complex_double* ap,
+                          lapack_int* ipiv, lapack_complex_double* b,
+                          lapack_int ldb );
+
+lapack_int LAPACKE_sspsvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, const float* ap, float* afp,
+                           lapack_int* ipiv, const float* b, lapack_int ldb,
+                           float* x, lapack_int ldx, float* rcond, float* ferr,
+                           float* berr );
+lapack_int LAPACKE_dspsvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, const double* ap, double* afp,
+                           lapack_int* ipiv, const double* b, lapack_int ldb,
+                           double* x, lapack_int ldx, double* rcond,
+                           double* ferr, double* berr );
+lapack_int LAPACKE_cspsvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* ap,
+                           lapack_complex_float* afp, lapack_int* ipiv,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx,
+                           float* rcond, float* ferr, float* berr );
+lapack_int LAPACKE_zspsvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* ap,
+                           lapack_complex_double* afp, lapack_int* ipiv,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* rcond, double* ferr, double* berr );
+
+lapack_int LAPACKE_ssptrd( int matrix_order, char uplo, lapack_int n, float* ap,
+                           float* d, float* e, float* tau );
+lapack_int LAPACKE_dsptrd( int matrix_order, char uplo, lapack_int n,
+                           double* ap, double* d, double* e, double* tau );
+
+lapack_int LAPACKE_ssptrf( int matrix_order, char uplo, lapack_int n, float* ap,
+                           lapack_int* ipiv );
+lapack_int LAPACKE_dsptrf( int matrix_order, char uplo, lapack_int n,
+                           double* ap, lapack_int* ipiv );
+lapack_int LAPACKE_csptrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* ap, lapack_int* ipiv );
+lapack_int LAPACKE_zsptrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* ap, lapack_int* ipiv );
+
+lapack_int LAPACKE_ssptri( int matrix_order, char uplo, lapack_int n, float* ap,
+                           const lapack_int* ipiv );
+lapack_int LAPACKE_dsptri( int matrix_order, char uplo, lapack_int n,
+                           double* ap, const lapack_int* ipiv );
+lapack_int LAPACKE_csptri( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* ap, const lapack_int* ipiv );
+lapack_int LAPACKE_zsptri( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* ap, const lapack_int* ipiv );
+
+lapack_int LAPACKE_ssptrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const float* ap,
+                           const lapack_int* ipiv, float* b, lapack_int ldb );
+lapack_int LAPACKE_dsptrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const double* ap,
+                           const lapack_int* ipiv, double* b, lapack_int ldb );
+lapack_int LAPACKE_csptrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* ap,
+                           const lapack_int* ipiv, lapack_complex_float* b,
+                           lapack_int ldb );
+lapack_int LAPACKE_zsptrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* ap,
+                           const lapack_int* ipiv, lapack_complex_double* b,
+                           lapack_int ldb );
+
+lapack_int LAPACKE_sstebz( char range, char order, lapack_int n, float vl,
+                           float vu, lapack_int il, lapack_int iu, float abstol,
+                           const float* d, const float* e, lapack_int* m,
+                           lapack_int* nsplit, float* w, lapack_int* iblock,
+                           lapack_int* isplit );
+lapack_int LAPACKE_dstebz( char range, char order, lapack_int n, double vl,
+                           double vu, lapack_int il, lapack_int iu,
+                           double abstol, const double* d, const double* e,
+                           lapack_int* m, lapack_int* nsplit, double* w,
+                           lapack_int* iblock, lapack_int* isplit );
+
+lapack_int LAPACKE_sstedc( int matrix_order, char compz, lapack_int n, float* d,
+                           float* e, float* z, lapack_int ldz );
+lapack_int LAPACKE_dstedc( int matrix_order, char compz, lapack_int n,
+                           double* d, double* e, double* z, lapack_int ldz );
+lapack_int LAPACKE_cstedc( int matrix_order, char compz, lapack_int n, float* d,
+                           float* e, lapack_complex_float* z, lapack_int ldz );
+lapack_int LAPACKE_zstedc( int matrix_order, char compz, lapack_int n,
+                           double* d, double* e, lapack_complex_double* z,
+                           lapack_int ldz );
+
+lapack_int LAPACKE_sstegr( int matrix_order, char jobz, char range,
+                           lapack_int n, float* d, float* e, float vl, float vu,
+                           lapack_int il, lapack_int iu, float abstol,
+                           lapack_int* m, float* w, float* z, lapack_int ldz,
+                           lapack_int* isuppz );
+lapack_int LAPACKE_dstegr( int matrix_order, char jobz, char range,
+                           lapack_int n, double* d, double* e, double vl,
+                           double vu, lapack_int il, lapack_int iu,
+                           double abstol, lapack_int* m, double* w, double* z,
+                           lapack_int ldz, lapack_int* isuppz );
+lapack_int LAPACKE_cstegr( int matrix_order, char jobz, char range,
+                           lapack_int n, float* d, float* e, float vl, float vu,
+                           lapack_int il, lapack_int iu, float abstol,
+                           lapack_int* m, float* w, lapack_complex_float* z,
+                           lapack_int ldz, lapack_int* isuppz );
+lapack_int LAPACKE_zstegr( int matrix_order, char jobz, char range,
+                           lapack_int n, double* d, double* e, double vl,
+                           double vu, lapack_int il, lapack_int iu,
+                           double abstol, lapack_int* m, double* w,
+                           lapack_complex_double* z, lapack_int ldz,
+                           lapack_int* isuppz );
+
+lapack_int LAPACKE_sstein( int matrix_order, lapack_int n, const float* d,
+                           const float* e, lapack_int m, const float* w,
+                           const lapack_int* iblock, const lapack_int* isplit,
+                           float* z, lapack_int ldz, lapack_int* ifailv );
+lapack_int LAPACKE_dstein( int matrix_order, lapack_int n, const double* d,
+                           const double* e, lapack_int m, const double* w,
+                           const lapack_int* iblock, const lapack_int* isplit,
+                           double* z, lapack_int ldz, lapack_int* ifailv );
+lapack_int LAPACKE_cstein( int matrix_order, lapack_int n, const float* d,
+                           const float* e, lapack_int m, const float* w,
+                           const lapack_int* iblock, const lapack_int* isplit,
+                           lapack_complex_float* z, lapack_int ldz,
+                           lapack_int* ifailv );
+lapack_int LAPACKE_zstein( int matrix_order, lapack_int n, const double* d,
+                           const double* e, lapack_int m, const double* w,
+                           const lapack_int* iblock, const lapack_int* isplit,
+                           lapack_complex_double* z, lapack_int ldz,
+                           lapack_int* ifailv );
+
+lapack_int LAPACKE_sstemr( int matrix_order, char jobz, char range,
+                           lapack_int n, float* d, float* e, float vl, float vu,
+                           lapack_int il, lapack_int iu, lapack_int* m,
+                           float* w, float* z, lapack_int ldz, lapack_int nzc,
+                           lapack_int* isuppz, lapack_logical* tryrac );
+lapack_int LAPACKE_dstemr( int matrix_order, char jobz, char range,
+                           lapack_int n, double* d, double* e, double vl,
+                           double vu, lapack_int il, lapack_int iu,
+                           lapack_int* m, double* w, double* z, lapack_int ldz,
+                           lapack_int nzc, lapack_int* isuppz,
+                           lapack_logical* tryrac );
+lapack_int LAPACKE_cstemr( int matrix_order, char jobz, char range,
+                           lapack_int n, float* d, float* e, float vl, float vu,
+                           lapack_int il, lapack_int iu, lapack_int* m,
+                           float* w, lapack_complex_float* z, lapack_int ldz,
+                           lapack_int nzc, lapack_int* isuppz,
+                           lapack_logical* tryrac );
+lapack_int LAPACKE_zstemr( int matrix_order, char jobz, char range,
+                           lapack_int n, double* d, double* e, double vl,
+                           double vu, lapack_int il, lapack_int iu,
+                           lapack_int* m, double* w, lapack_complex_double* z,
+                           lapack_int ldz, lapack_int nzc, lapack_int* isuppz,
+                           lapack_logical* tryrac );
+
+lapack_int LAPACKE_ssteqr( int matrix_order, char compz, lapack_int n, float* d,
+                           float* e, float* z, lapack_int ldz );
+lapack_int LAPACKE_dsteqr( int matrix_order, char compz, lapack_int n,
+                           double* d, double* e, double* z, lapack_int ldz );
+lapack_int LAPACKE_csteqr( int matrix_order, char compz, lapack_int n, float* d,
+                           float* e, lapack_complex_float* z, lapack_int ldz );
+lapack_int LAPACKE_zsteqr( int matrix_order, char compz, lapack_int n,
+                           double* d, double* e, lapack_complex_double* z,
+                           lapack_int ldz );
+
+lapack_int LAPACKE_ssterf( lapack_int n, float* d, float* e );
+lapack_int LAPACKE_dsterf( lapack_int n, double* d, double* e );
+
+lapack_int LAPACKE_sstev( int matrix_order, char jobz, lapack_int n, float* d,
+                          float* e, float* z, lapack_int ldz );
+lapack_int LAPACKE_dstev( int matrix_order, char jobz, lapack_int n, double* d,
+                          double* e, double* z, lapack_int ldz );
+
+lapack_int LAPACKE_sstevd( int matrix_order, char jobz, lapack_int n, float* d,
+                           float* e, float* z, lapack_int ldz );
+lapack_int LAPACKE_dstevd( int matrix_order, char jobz, lapack_int n, double* d,
+                           double* e, double* z, lapack_int ldz );
+
+lapack_int LAPACKE_sstevr( int matrix_order, char jobz, char range,
+                           lapack_int n, float* d, float* e, float vl, float vu,
+                           lapack_int il, lapack_int iu, float abstol,
+                           lapack_int* m, float* w, float* z, lapack_int ldz,
+                           lapack_int* isuppz );
+lapack_int LAPACKE_dstevr( int matrix_order, char jobz, char range,
+                           lapack_int n, double* d, double* e, double vl,
+                           double vu, lapack_int il, lapack_int iu,
+                           double abstol, lapack_int* m, double* w, double* z,
+                           lapack_int ldz, lapack_int* isuppz );
+
+lapack_int LAPACKE_sstevx( int matrix_order, char jobz, char range,
+                           lapack_int n, float* d, float* e, float vl, float vu,
+                           lapack_int il, lapack_int iu, float abstol,
+                           lapack_int* m, float* w, float* z, lapack_int ldz,
+                           lapack_int* ifail );
+lapack_int LAPACKE_dstevx( int matrix_order, char jobz, char range,
+                           lapack_int n, double* d, double* e, double vl,
+                           double vu, lapack_int il, lapack_int iu,
+                           double abstol, lapack_int* m, double* w, double* z,
+                           lapack_int ldz, lapack_int* ifail );
+
+lapack_int LAPACKE_ssycon( int matrix_order, char uplo, lapack_int n,
+                           const float* a, lapack_int lda,
+                           const lapack_int* ipiv, float anorm, float* rcond );
+lapack_int LAPACKE_dsycon( int matrix_order, char uplo, lapack_int n,
+                           const double* a, lapack_int lda,
+                           const lapack_int* ipiv, double anorm,
+                           double* rcond );
+lapack_int LAPACKE_csycon( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_float* a, lapack_int lda,
+                           const lapack_int* ipiv, float anorm, float* rcond );
+lapack_int LAPACKE_zsycon( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_double* a, lapack_int lda,
+                           const lapack_int* ipiv, double anorm,
+                           double* rcond );
+
+lapack_int LAPACKE_ssyequb( int matrix_order, char uplo, lapack_int n,
+                            const float* a, lapack_int lda, float* s,
+                            float* scond, float* amax );
+lapack_int LAPACKE_dsyequb( int matrix_order, char uplo, lapack_int n,
+                            const double* a, lapack_int lda, double* s,
+                            double* scond, double* amax );
+lapack_int LAPACKE_csyequb( int matrix_order, char uplo, lapack_int n,
+                            const lapack_complex_float* a, lapack_int lda,
+                            float* s, float* scond, float* amax );
+lapack_int LAPACKE_zsyequb( int matrix_order, char uplo, lapack_int n,
+                            const lapack_complex_double* a, lapack_int lda,
+                            double* s, double* scond, double* amax );
+
+lapack_int LAPACKE_ssyev( int matrix_order, char jobz, char uplo, lapack_int n,
+                          float* a, lapack_int lda, float* w );
+lapack_int LAPACKE_dsyev( int matrix_order, char jobz, char uplo, lapack_int n,
+                          double* a, lapack_int lda, double* w );
+
+lapack_int LAPACKE_ssyevd( int matrix_order, char jobz, char uplo, lapack_int n,
+                           float* a, lapack_int lda, float* w );
+lapack_int LAPACKE_dsyevd( int matrix_order, char jobz, char uplo, lapack_int n,
+                           double* a, lapack_int lda, double* w );
+
+lapack_int LAPACKE_ssyevr( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, float* a, lapack_int lda, float vl,
+                           float vu, lapack_int il, lapack_int iu, float abstol,
+                           lapack_int* m, float* w, float* z, lapack_int ldz,
+                           lapack_int* isuppz );
+lapack_int LAPACKE_dsyevr( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, double* a, lapack_int lda, double vl,
+                           double vu, lapack_int il, lapack_int iu,
+                           double abstol, lapack_int* m, double* w, double* z,
+                           lapack_int ldz, lapack_int* isuppz );
+
+lapack_int LAPACKE_ssyevx( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, float* a, lapack_int lda, float vl,
+                           float vu, lapack_int il, lapack_int iu, float abstol,
+                           lapack_int* m, float* w, float* z, lapack_int ldz,
+                           lapack_int* ifail );
+lapack_int LAPACKE_dsyevx( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, double* a, lapack_int lda, double vl,
+                           double vu, lapack_int il, lapack_int iu,
+                           double abstol, lapack_int* m, double* w, double* z,
+                           lapack_int ldz, lapack_int* ifail );
+
+lapack_int LAPACKE_ssygst( int matrix_order, lapack_int itype, char uplo,
+                           lapack_int n, float* a, lapack_int lda,
+                           const float* b, lapack_int ldb );
+lapack_int LAPACKE_dsygst( int matrix_order, lapack_int itype, char uplo,
+                           lapack_int n, double* a, lapack_int lda,
+                           const double* b, lapack_int ldb );
+
+lapack_int LAPACKE_ssygv( int matrix_order, lapack_int itype, char jobz,
+                          char uplo, lapack_int n, float* a, lapack_int lda,
+                          float* b, lapack_int ldb, float* w );
+lapack_int LAPACKE_dsygv( int matrix_order, lapack_int itype, char jobz,
+                          char uplo, lapack_int n, double* a, lapack_int lda,
+                          double* b, lapack_int ldb, double* w );
+
+lapack_int LAPACKE_ssygvd( int matrix_order, lapack_int itype, char jobz,
+                           char uplo, lapack_int n, float* a, lapack_int lda,
+                           float* b, lapack_int ldb, float* w );
+lapack_int LAPACKE_dsygvd( int matrix_order, lapack_int itype, char jobz,
+                           char uplo, lapack_int n, double* a, lapack_int lda,
+                           double* b, lapack_int ldb, double* w );
+
+lapack_int LAPACKE_ssygvx( int matrix_order, lapack_int itype, char jobz,
+                           char range, char uplo, lapack_int n, float* a,
+                           lapack_int lda, float* b, lapack_int ldb, float vl,
+                           float vu, lapack_int il, lapack_int iu, float abstol,
+                           lapack_int* m, float* w, float* z, lapack_int ldz,
+                           lapack_int* ifail );
+lapack_int LAPACKE_dsygvx( int matrix_order, lapack_int itype, char jobz,
+                           char range, char uplo, lapack_int n, double* a,
+                           lapack_int lda, double* b, lapack_int ldb, double vl,
+                           double vu, lapack_int il, lapack_int iu,
+                           double abstol, lapack_int* m, double* w, double* z,
+                           lapack_int ldz, lapack_int* ifail );
+
+lapack_int LAPACKE_ssyrfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const float* a, lapack_int lda,
+                           const float* af, lapack_int ldaf,
+                           const lapack_int* ipiv, const float* b,
+                           lapack_int ldb, float* x, lapack_int ldx,
+                           float* ferr, float* berr );
+lapack_int LAPACKE_dsyrfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const double* a, lapack_int lda,
+                           const double* af, lapack_int ldaf,
+                           const lapack_int* ipiv, const double* b,
+                           lapack_int ldb, double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+lapack_int LAPACKE_csyrfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* a,
+                           lapack_int lda, const lapack_complex_float* af,
+                           lapack_int ldaf, const lapack_int* ipiv,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx, float* ferr,
+                           float* berr );
+lapack_int LAPACKE_zsyrfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* a,
+                           lapack_int lda, const lapack_complex_double* af,
+                           lapack_int ldaf, const lapack_int* ipiv,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+
+lapack_int LAPACKE_ssyrfsx( int matrix_order, char uplo, char equed,
+                            lapack_int n, lapack_int nrhs, const float* a,
+                            lapack_int lda, const float* af, lapack_int ldaf,
+                            const lapack_int* ipiv, const float* s,
+                            const float* b, lapack_int ldb, float* x,
+                            lapack_int ldx, float* rcond, float* berr,
+                            lapack_int n_err_bnds, float* err_bnds_norm,
+                            float* err_bnds_comp, lapack_int nparams,
+                            float* params );
+lapack_int LAPACKE_dsyrfsx( int matrix_order, char uplo, char equed,
+                            lapack_int n, lapack_int nrhs, const double* a,
+                            lapack_int lda, const double* af, lapack_int ldaf,
+                            const lapack_int* ipiv, const double* s,
+                            const double* b, lapack_int ldb, double* x,
+                            lapack_int ldx, double* rcond, double* berr,
+                            lapack_int n_err_bnds, double* err_bnds_norm,
+                            double* err_bnds_comp, lapack_int nparams,
+                            double* params );
+lapack_int LAPACKE_csyrfsx( int matrix_order, char uplo, char equed,
+                            lapack_int n, lapack_int nrhs,
+                            const lapack_complex_float* a, lapack_int lda,
+                            const lapack_complex_float* af, lapack_int ldaf,
+                            const lapack_int* ipiv, const float* s,
+                            const lapack_complex_float* b, lapack_int ldb,
+                            lapack_complex_float* x, lapack_int ldx,
+                            float* rcond, float* berr, lapack_int n_err_bnds,
+                            float* err_bnds_norm, float* err_bnds_comp,
+                            lapack_int nparams, float* params );
+lapack_int LAPACKE_zsyrfsx( int matrix_order, char uplo, char equed,
+                            lapack_int n, lapack_int nrhs,
+                            const lapack_complex_double* a, lapack_int lda,
+                            const lapack_complex_double* af, lapack_int ldaf,
+                            const lapack_int* ipiv, const double* s,
+                            const lapack_complex_double* b, lapack_int ldb,
+                            lapack_complex_double* x, lapack_int ldx,
+                            double* rcond, double* berr, lapack_int n_err_bnds,
+                            double* err_bnds_norm, double* err_bnds_comp,
+                            lapack_int nparams, double* params );
+
+lapack_int LAPACKE_ssysv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, float* a, lapack_int lda,
+                          lapack_int* ipiv, float* b, lapack_int ldb );
+lapack_int LAPACKE_dsysv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, double* a, lapack_int lda,
+                          lapack_int* ipiv, double* b, lapack_int ldb );
+lapack_int LAPACKE_csysv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, lapack_complex_float* a,
+                          lapack_int lda, lapack_int* ipiv,
+                          lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zsysv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, lapack_complex_double* a,
+                          lapack_int lda, lapack_int* ipiv,
+                          lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_ssysvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, const float* a, lapack_int lda,
+                           float* af, lapack_int ldaf, lapack_int* ipiv,
+                           const float* b, lapack_int ldb, float* x,
+                           lapack_int ldx, float* rcond, float* ferr,
+                           float* berr );
+lapack_int LAPACKE_dsysvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, const double* a, lapack_int lda,
+                           double* af, lapack_int ldaf, lapack_int* ipiv,
+                           const double* b, lapack_int ldb, double* x,
+                           lapack_int ldx, double* rcond, double* ferr,
+                           double* berr );
+lapack_int LAPACKE_csysvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* af,
+                           lapack_int ldaf, lapack_int* ipiv,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx,
+                           float* rcond, float* ferr, float* berr );
+lapack_int LAPACKE_zsysvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* af,
+                           lapack_int ldaf, lapack_int* ipiv,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* rcond, double* ferr, double* berr );
+
+lapack_int LAPACKE_ssysvxx( int matrix_order, char fact, char uplo,
+                            lapack_int n, lapack_int nrhs, float* a,
+                            lapack_int lda, float* af, lapack_int ldaf,
+                            lapack_int* ipiv, char* equed, float* s, float* b,
+                            lapack_int ldb, float* x, lapack_int ldx,
+                            float* rcond, float* rpvgrw, float* berr,
+                            lapack_int n_err_bnds, float* err_bnds_norm,
+                            float* err_bnds_comp, lapack_int nparams,
+                            float* params );
+lapack_int LAPACKE_dsysvxx( int matrix_order, char fact, char uplo,
+                            lapack_int n, lapack_int nrhs, double* a,
+                            lapack_int lda, double* af, lapack_int ldaf,
+                            lapack_int* ipiv, char* equed, double* s, double* b,
+                            lapack_int ldb, double* x, lapack_int ldx,
+                            double* rcond, double* rpvgrw, double* berr,
+                            lapack_int n_err_bnds, double* err_bnds_norm,
+                            double* err_bnds_comp, lapack_int nparams,
+                            double* params );
+lapack_int LAPACKE_csysvxx( int matrix_order, char fact, char uplo,
+                            lapack_int n, lapack_int nrhs,
+                            lapack_complex_float* a, lapack_int lda,
+                            lapack_complex_float* af, lapack_int ldaf,
+                            lapack_int* ipiv, char* equed, float* s,
+                            lapack_complex_float* b, lapack_int ldb,
+                            lapack_complex_float* x, lapack_int ldx,
+                            float* rcond, float* rpvgrw, float* berr,
+                            lapack_int n_err_bnds, float* err_bnds_norm,
+                            float* err_bnds_comp, lapack_int nparams,
+                            float* params );
+lapack_int LAPACKE_zsysvxx( int matrix_order, char fact, char uplo,
+                            lapack_int n, lapack_int nrhs,
+                            lapack_complex_double* a, lapack_int lda,
+                            lapack_complex_double* af, lapack_int ldaf,
+                            lapack_int* ipiv, char* equed, double* s,
+                            lapack_complex_double* b, lapack_int ldb,
+                            lapack_complex_double* x, lapack_int ldx,
+                            double* rcond, double* rpvgrw, double* berr,
+                            lapack_int n_err_bnds, double* err_bnds_norm,
+                            double* err_bnds_comp, lapack_int nparams,
+                            double* params );
+
+lapack_int LAPACKE_ssytrd( int matrix_order, char uplo, lapack_int n, float* a,
+                           lapack_int lda, float* d, float* e, float* tau );
+lapack_int LAPACKE_dsytrd( int matrix_order, char uplo, lapack_int n, double* a,
+                           lapack_int lda, double* d, double* e, double* tau );
+
+lapack_int LAPACKE_ssytrf( int matrix_order, char uplo, lapack_int n, float* a,
+                           lapack_int lda, lapack_int* ipiv );
+lapack_int LAPACKE_dsytrf( int matrix_order, char uplo, lapack_int n, double* a,
+                           lapack_int lda, lapack_int* ipiv );
+lapack_int LAPACKE_csytrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_int* ipiv );
+lapack_int LAPACKE_zsytrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_int* ipiv );
+
+lapack_int LAPACKE_ssytri( int matrix_order, char uplo, lapack_int n, float* a,
+                           lapack_int lda, const lapack_int* ipiv );
+lapack_int LAPACKE_dsytri( int matrix_order, char uplo, lapack_int n, double* a,
+                           lapack_int lda, const lapack_int* ipiv );
+lapack_int LAPACKE_csytri( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           const lapack_int* ipiv );
+lapack_int LAPACKE_zsytri( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           const lapack_int* ipiv );
+
+lapack_int LAPACKE_ssytrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const float* a, lapack_int lda,
+                           const lapack_int* ipiv, float* b, lapack_int ldb );
+lapack_int LAPACKE_dsytrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const double* a, lapack_int lda,
+                           const lapack_int* ipiv, double* b, lapack_int ldb );
+lapack_int LAPACKE_csytrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* a,
+                           lapack_int lda, const lapack_int* ipiv,
+                           lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zsytrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* a,
+                           lapack_int lda, const lapack_int* ipiv,
+                           lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_stbcon( int matrix_order, char norm, char uplo, char diag,
+                           lapack_int n, lapack_int kd, const float* ab,
+                           lapack_int ldab, float* rcond );
+lapack_int LAPACKE_dtbcon( int matrix_order, char norm, char uplo, char diag,
+                           lapack_int n, lapack_int kd, const double* ab,
+                           lapack_int ldab, double* rcond );
+lapack_int LAPACKE_ctbcon( int matrix_order, char norm, char uplo, char diag,
+                           lapack_int n, lapack_int kd,
+                           const lapack_complex_float* ab, lapack_int ldab,
+                           float* rcond );
+lapack_int LAPACKE_ztbcon( int matrix_order, char norm, char uplo, char diag,
+                           lapack_int n, lapack_int kd,
+                           const lapack_complex_double* ab, lapack_int ldab,
+                           double* rcond );
+
+lapack_int LAPACKE_stbrfs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int kd, lapack_int nrhs,
+                           const float* ab, lapack_int ldab, const float* b,
+                           lapack_int ldb, const float* x, lapack_int ldx,
+                           float* ferr, float* berr );
+lapack_int LAPACKE_dtbrfs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int kd, lapack_int nrhs,
+                           const double* ab, lapack_int ldab, const double* b,
+                           lapack_int ldb, const double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+lapack_int LAPACKE_ctbrfs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int kd, lapack_int nrhs,
+                           const lapack_complex_float* ab, lapack_int ldab,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           const lapack_complex_float* x, lapack_int ldx,
+                           float* ferr, float* berr );
+lapack_int LAPACKE_ztbrfs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int kd, lapack_int nrhs,
+                           const lapack_complex_double* ab, lapack_int ldab,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           const lapack_complex_double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+
+lapack_int LAPACKE_stbtrs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int kd, lapack_int nrhs,
+                           const float* ab, lapack_int ldab, float* b,
+                           lapack_int ldb );
+lapack_int LAPACKE_dtbtrs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int kd, lapack_int nrhs,
+                           const double* ab, lapack_int ldab, double* b,
+                           lapack_int ldb );
+lapack_int LAPACKE_ctbtrs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int kd, lapack_int nrhs,
+                           const lapack_complex_float* ab, lapack_int ldab,
+                           lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_ztbtrs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int kd, lapack_int nrhs,
+                           const lapack_complex_double* ab, lapack_int ldab,
+                           lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_stfsm( int matrix_order, char transr, char side, char uplo,
+                          char trans, char diag, lapack_int m, lapack_int n,
+                          float alpha, const float* a, float* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_dtfsm( int matrix_order, char transr, char side, char uplo,
+                          char trans, char diag, lapack_int m, lapack_int n,
+                          double alpha, const double* a, double* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_ctfsm( int matrix_order, char transr, char side, char uplo,
+                          char trans, char diag, lapack_int m, lapack_int n,
+                          lapack_complex_float alpha,
+                          const lapack_complex_float* a,
+                          lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_ztfsm( int matrix_order, char transr, char side, char uplo,
+                          char trans, char diag, lapack_int m, lapack_int n,
+                          lapack_complex_double alpha,
+                          const lapack_complex_double* a,
+                          lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_stftri( int matrix_order, char transr, char uplo, char diag,
+                           lapack_int n, float* a );
+lapack_int LAPACKE_dtftri( int matrix_order, char transr, char uplo, char diag,
+                           lapack_int n, double* a );
+lapack_int LAPACKE_ctftri( int matrix_order, char transr, char uplo, char diag,
+                           lapack_int n, lapack_complex_float* a );
+lapack_int LAPACKE_ztftri( int matrix_order, char transr, char uplo, char diag,
+                           lapack_int n, lapack_complex_double* a );
+
+lapack_int LAPACKE_stfttp( int matrix_order, char transr, char uplo,
+                           lapack_int n, const float* arf, float* ap );
+lapack_int LAPACKE_dtfttp( int matrix_order, char transr, char uplo,
+                           lapack_int n, const double* arf, double* ap );
+lapack_int LAPACKE_ctfttp( int matrix_order, char transr, char uplo,
+                           lapack_int n, const lapack_complex_float* arf,
+                           lapack_complex_float* ap );
+lapack_int LAPACKE_ztfttp( int matrix_order, char transr, char uplo,
+                           lapack_int n, const lapack_complex_double* arf,
+                           lapack_complex_double* ap );
+
+lapack_int LAPACKE_stfttr( int matrix_order, char transr, char uplo,
+                           lapack_int n, const float* arf, float* a,
+                           lapack_int lda );
+lapack_int LAPACKE_dtfttr( int matrix_order, char transr, char uplo,
+                           lapack_int n, const double* arf, double* a,
+                           lapack_int lda );
+lapack_int LAPACKE_ctfttr( int matrix_order, char transr, char uplo,
+                           lapack_int n, const lapack_complex_float* arf,
+                           lapack_complex_float* a, lapack_int lda );
+lapack_int LAPACKE_ztfttr( int matrix_order, char transr, char uplo,
+                           lapack_int n, const lapack_complex_double* arf,
+                           lapack_complex_double* a, lapack_int lda );
+
+lapack_int LAPACKE_stgevc( int matrix_order, char side, char howmny,
+                           const lapack_logical* select, lapack_int n,
+                           const float* s, lapack_int lds, const float* p,
+                           lapack_int ldp, float* vl, lapack_int ldvl,
+                           float* vr, lapack_int ldvr, lapack_int mm,
+                           lapack_int* m );
+lapack_int LAPACKE_dtgevc( int matrix_order, char side, char howmny,
+                           const lapack_logical* select, lapack_int n,
+                           const double* s, lapack_int lds, const double* p,
+                           lapack_int ldp, double* vl, lapack_int ldvl,
+                           double* vr, lapack_int ldvr, lapack_int mm,
+                           lapack_int* m );
+lapack_int LAPACKE_ctgevc( int matrix_order, char side, char howmny,
+                           const lapack_logical* select, lapack_int n,
+                           const lapack_complex_float* s, lapack_int lds,
+                           const lapack_complex_float* p, lapack_int ldp,
+                           lapack_complex_float* vl, lapack_int ldvl,
+                           lapack_complex_float* vr, lapack_int ldvr,
+                           lapack_int mm, lapack_int* m );
+lapack_int LAPACKE_ztgevc( int matrix_order, char side, char howmny,
+                           const lapack_logical* select, lapack_int n,
+                           const lapack_complex_double* s, lapack_int lds,
+                           const lapack_complex_double* p, lapack_int ldp,
+                           lapack_complex_double* vl, lapack_int ldvl,
+                           lapack_complex_double* vr, lapack_int ldvr,
+                           lapack_int mm, lapack_int* m );
+
+lapack_int LAPACKE_stgexc( int matrix_order, lapack_logical wantq,
+                           lapack_logical wantz, lapack_int n, float* a,
+                           lapack_int lda, float* b, lapack_int ldb, float* q,
+                           lapack_int ldq, float* z, lapack_int ldz,
+                           lapack_int* ifst, lapack_int* ilst );
+lapack_int LAPACKE_dtgexc( int matrix_order, lapack_logical wantq,
+                           lapack_logical wantz, lapack_int n, double* a,
+                           lapack_int lda, double* b, lapack_int ldb, double* q,
+                           lapack_int ldq, double* z, lapack_int ldz,
+                           lapack_int* ifst, lapack_int* ilst );
+lapack_int LAPACKE_ctgexc( int matrix_order, lapack_logical wantq,
+                           lapack_logical wantz, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* q, lapack_int ldq,
+                           lapack_complex_float* z, lapack_int ldz,
+                           lapack_int ifst, lapack_int ilst );
+lapack_int LAPACKE_ztgexc( int matrix_order, lapack_logical wantq,
+                           lapack_logical wantz, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* q, lapack_int ldq,
+                           lapack_complex_double* z, lapack_int ldz,
+                           lapack_int ifst, lapack_int ilst );
+
+lapack_int LAPACKE_stgsen( int matrix_order, lapack_int ijob,
+                           lapack_logical wantq, lapack_logical wantz,
+                           const lapack_logical* select, lapack_int n, float* a,
+                           lapack_int lda, float* b, lapack_int ldb,
+                           float* alphar, float* alphai, float* beta, float* q,
+                           lapack_int ldq, float* z, lapack_int ldz,
+                           lapack_int* m, float* pl, float* pr, float* dif );
+lapack_int LAPACKE_dtgsen( int matrix_order, lapack_int ijob,
+                           lapack_logical wantq, lapack_logical wantz,
+                           const lapack_logical* select, lapack_int n,
+                           double* a, lapack_int lda, double* b, lapack_int ldb,
+                           double* alphar, double* alphai, double* beta,
+                           double* q, lapack_int ldq, double* z, lapack_int ldz,
+                           lapack_int* m, double* pl, double* pr, double* dif );
+lapack_int LAPACKE_ctgsen( int matrix_order, lapack_int ijob,
+                           lapack_logical wantq, lapack_logical wantz,
+                           const lapack_logical* select, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* alpha,
+                           lapack_complex_float* beta, lapack_complex_float* q,
+                           lapack_int ldq, lapack_complex_float* z,
+                           lapack_int ldz, lapack_int* m, float* pl, float* pr,
+                           float* dif );
+lapack_int LAPACKE_ztgsen( int matrix_order, lapack_int ijob,
+                           lapack_logical wantq, lapack_logical wantz,
+                           const lapack_logical* select, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* alpha,
+                           lapack_complex_double* beta,
+                           lapack_complex_double* q, lapack_int ldq,
+                           lapack_complex_double* z, lapack_int ldz,
+                           lapack_int* m, double* pl, double* pr, double* dif );
+
+lapack_int LAPACKE_stgsja( int matrix_order, char jobu, char jobv, char jobq,
+                           lapack_int m, lapack_int p, lapack_int n,
+                           lapack_int k, lapack_int l, float* a, lapack_int lda,
+                           float* b, lapack_int ldb, float tola, float tolb,
+                           float* alpha, float* beta, float* u, lapack_int ldu,
+                           float* v, lapack_int ldv, float* q, lapack_int ldq,
+                           lapack_int* ncycle );
+lapack_int LAPACKE_dtgsja( int matrix_order, char jobu, char jobv, char jobq,
+                           lapack_int m, lapack_int p, lapack_int n,
+                           lapack_int k, lapack_int l, double* a,
+                           lapack_int lda, double* b, lapack_int ldb,
+                           double tola, double tolb, double* alpha,
+                           double* beta, double* u, lapack_int ldu, double* v,
+                           lapack_int ldv, double* q, lapack_int ldq,
+                           lapack_int* ncycle );
+lapack_int LAPACKE_ctgsja( int matrix_order, char jobu, char jobv, char jobq,
+                           lapack_int m, lapack_int p, lapack_int n,
+                           lapack_int k, lapack_int l, lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* b,
+                           lapack_int ldb, float tola, float tolb, float* alpha,
+                           float* beta, lapack_complex_float* u, lapack_int ldu,
+                           lapack_complex_float* v, lapack_int ldv,
+                           lapack_complex_float* q, lapack_int ldq,
+                           lapack_int* ncycle );
+lapack_int LAPACKE_ztgsja( int matrix_order, char jobu, char jobv, char jobq,
+                           lapack_int m, lapack_int p, lapack_int n,
+                           lapack_int k, lapack_int l, lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* b,
+                           lapack_int ldb, double tola, double tolb,
+                           double* alpha, double* beta,
+                           lapack_complex_double* u, lapack_int ldu,
+                           lapack_complex_double* v, lapack_int ldv,
+                           lapack_complex_double* q, lapack_int ldq,
+                           lapack_int* ncycle );
+
+lapack_int LAPACKE_stgsna( int matrix_order, char job, char howmny,
+                           const lapack_logical* select, lapack_int n,
+                           const float* a, lapack_int lda, const float* b,
+                           lapack_int ldb, const float* vl, lapack_int ldvl,
+                           const float* vr, lapack_int ldvr, float* s,
+                           float* dif, lapack_int mm, lapack_int* m );
+lapack_int LAPACKE_dtgsna( int matrix_order, char job, char howmny,
+                           const lapack_logical* select, lapack_int n,
+                           const double* a, lapack_int lda, const double* b,
+                           lapack_int ldb, const double* vl, lapack_int ldvl,
+                           const double* vr, lapack_int ldvr, double* s,
+                           double* dif, lapack_int mm, lapack_int* m );
+lapack_int LAPACKE_ctgsna( int matrix_order, char job, char howmny,
+                           const lapack_logical* select, lapack_int n,
+                           const lapack_complex_float* a, lapack_int lda,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           const lapack_complex_float* vl, lapack_int ldvl,
+                           const lapack_complex_float* vr, lapack_int ldvr,
+                           float* s, float* dif, lapack_int mm, lapack_int* m );
+lapack_int LAPACKE_ztgsna( int matrix_order, char job, char howmny,
+                           const lapack_logical* select, lapack_int n,
+                           const lapack_complex_double* a, lapack_int lda,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           const lapack_complex_double* vl, lapack_int ldvl,
+                           const lapack_complex_double* vr, lapack_int ldvr,
+                           double* s, double* dif, lapack_int mm,
+                           lapack_int* m );
+
+lapack_int LAPACKE_stgsyl( int matrix_order, char trans, lapack_int ijob,
+                           lapack_int m, lapack_int n, const float* a,
+                           lapack_int lda, const float* b, lapack_int ldb,
+                           float* c, lapack_int ldc, const float* d,
+                           lapack_int ldd, const float* e, lapack_int lde,
+                           float* f, lapack_int ldf, float* scale, float* dif );
+lapack_int LAPACKE_dtgsyl( int matrix_order, char trans, lapack_int ijob,
+                           lapack_int m, lapack_int n, const double* a,
+                           lapack_int lda, const double* b, lapack_int ldb,
+                           double* c, lapack_int ldc, const double* d,
+                           lapack_int ldd, const double* e, lapack_int lde,
+                           double* f, lapack_int ldf, double* scale,
+                           double* dif );
+lapack_int LAPACKE_ctgsyl( int matrix_order, char trans, lapack_int ijob,
+                           lapack_int m, lapack_int n,
+                           const lapack_complex_float* a, lapack_int lda,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* c, lapack_int ldc,
+                           const lapack_complex_float* d, lapack_int ldd,
+                           const lapack_complex_float* e, lapack_int lde,
+                           lapack_complex_float* f, lapack_int ldf,
+                           float* scale, float* dif );
+lapack_int LAPACKE_ztgsyl( int matrix_order, char trans, lapack_int ijob,
+                           lapack_int m, lapack_int n,
+                           const lapack_complex_double* a, lapack_int lda,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* c, lapack_int ldc,
+                           const lapack_complex_double* d, lapack_int ldd,
+                           const lapack_complex_double* e, lapack_int lde,
+                           lapack_complex_double* f, lapack_int ldf,
+                           double* scale, double* dif );
+
+lapack_int LAPACKE_stpcon( int matrix_order, char norm, char uplo, char diag,
+                           lapack_int n, const float* ap, float* rcond );
+lapack_int LAPACKE_dtpcon( int matrix_order, char norm, char uplo, char diag,
+                           lapack_int n, const double* ap, double* rcond );
+lapack_int LAPACKE_ctpcon( int matrix_order, char norm, char uplo, char diag,
+                           lapack_int n, const lapack_complex_float* ap,
+                           float* rcond );
+lapack_int LAPACKE_ztpcon( int matrix_order, char norm, char uplo, char diag,
+                           lapack_int n, const lapack_complex_double* ap,
+                           double* rcond );
+
+lapack_int LAPACKE_stprfs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int nrhs, const float* ap,
+                           const float* b, lapack_int ldb, const float* x,
+                           lapack_int ldx, float* ferr, float* berr );
+lapack_int LAPACKE_dtprfs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int nrhs, const double* ap,
+                           const double* b, lapack_int ldb, const double* x,
+                           lapack_int ldx, double* ferr, double* berr );
+lapack_int LAPACKE_ctprfs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int nrhs,
+                           const lapack_complex_float* ap,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           const lapack_complex_float* x, lapack_int ldx,
+                           float* ferr, float* berr );
+lapack_int LAPACKE_ztprfs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int nrhs,
+                           const lapack_complex_double* ap,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           const lapack_complex_double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+
+lapack_int LAPACKE_stptri( int matrix_order, char uplo, char diag, lapack_int n,
+                           float* ap );
+lapack_int LAPACKE_dtptri( int matrix_order, char uplo, char diag, lapack_int n,
+                           double* ap );
+lapack_int LAPACKE_ctptri( int matrix_order, char uplo, char diag, lapack_int n,
+                           lapack_complex_float* ap );
+lapack_int LAPACKE_ztptri( int matrix_order, char uplo, char diag, lapack_int n,
+                           lapack_complex_double* ap );
+
+lapack_int LAPACKE_stptrs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int nrhs, const float* ap,
+                           float* b, lapack_int ldb );
+lapack_int LAPACKE_dtptrs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int nrhs, const double* ap,
+                           double* b, lapack_int ldb );
+lapack_int LAPACKE_ctptrs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int nrhs,
+                           const lapack_complex_float* ap,
+                           lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_ztptrs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int nrhs,
+                           const lapack_complex_double* ap,
+                           lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_stpttf( int matrix_order, char transr, char uplo,
+                           lapack_int n, const float* ap, float* arf );
+lapack_int LAPACKE_dtpttf( int matrix_order, char transr, char uplo,
+                           lapack_int n, const double* ap, double* arf );
+lapack_int LAPACKE_ctpttf( int matrix_order, char transr, char uplo,
+                           lapack_int n, const lapack_complex_float* ap,
+                           lapack_complex_float* arf );
+lapack_int LAPACKE_ztpttf( int matrix_order, char transr, char uplo,
+                           lapack_int n, const lapack_complex_double* ap,
+                           lapack_complex_double* arf );
+
+lapack_int LAPACKE_stpttr( int matrix_order, char uplo, lapack_int n,
+                           const float* ap, float* a, lapack_int lda );
+lapack_int LAPACKE_dtpttr( int matrix_order, char uplo, lapack_int n,
+                           const double* ap, double* a, lapack_int lda );
+lapack_int LAPACKE_ctpttr( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_float* ap,
+                           lapack_complex_float* a, lapack_int lda );
+lapack_int LAPACKE_ztpttr( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_double* ap,
+                           lapack_complex_double* a, lapack_int lda );
+
+lapack_int LAPACKE_strcon( int matrix_order, char norm, char uplo, char diag,
+                           lapack_int n, const float* a, lapack_int lda,
+                           float* rcond );
+lapack_int LAPACKE_dtrcon( int matrix_order, char norm, char uplo, char diag,
+                           lapack_int n, const double* a, lapack_int lda,
+                           double* rcond );
+lapack_int LAPACKE_ctrcon( int matrix_order, char norm, char uplo, char diag,
+                           lapack_int n, const lapack_complex_float* a,
+                           lapack_int lda, float* rcond );
+lapack_int LAPACKE_ztrcon( int matrix_order, char norm, char uplo, char diag,
+                           lapack_int n, const lapack_complex_double* a,
+                           lapack_int lda, double* rcond );
+
+lapack_int LAPACKE_strevc( int matrix_order, char side, char howmny,
+                           lapack_logical* select, lapack_int n, const float* t,
+                           lapack_int ldt, float* vl, lapack_int ldvl,
+                           float* vr, lapack_int ldvr, lapack_int mm,
+                           lapack_int* m );
+lapack_int LAPACKE_dtrevc( int matrix_order, char side, char howmny,
+                           lapack_logical* select, lapack_int n,
+                           const double* t, lapack_int ldt, double* vl,
+                           lapack_int ldvl, double* vr, lapack_int ldvr,
+                           lapack_int mm, lapack_int* m );
+lapack_int LAPACKE_ctrevc( int matrix_order, char side, char howmny,
+                           const lapack_logical* select, lapack_int n,
+                           lapack_complex_float* t, lapack_int ldt,
+                           lapack_complex_float* vl, lapack_int ldvl,
+                           lapack_complex_float* vr, lapack_int ldvr,
+                           lapack_int mm, lapack_int* m );
+lapack_int LAPACKE_ztrevc( int matrix_order, char side, char howmny,
+                           const lapack_logical* select, lapack_int n,
+                           lapack_complex_double* t, lapack_int ldt,
+                           lapack_complex_double* vl, lapack_int ldvl,
+                           lapack_complex_double* vr, lapack_int ldvr,
+                           lapack_int mm, lapack_int* m );
+
+lapack_int LAPACKE_strexc( int matrix_order, char compq, lapack_int n, float* t,
+                           lapack_int ldt, float* q, lapack_int ldq,
+                           lapack_int* ifst, lapack_int* ilst );
+lapack_int LAPACKE_dtrexc( int matrix_order, char compq, lapack_int n,
+                           double* t, lapack_int ldt, double* q, lapack_int ldq,
+                           lapack_int* ifst, lapack_int* ilst );
+lapack_int LAPACKE_ctrexc( int matrix_order, char compq, lapack_int n,
+                           lapack_complex_float* t, lapack_int ldt,
+                           lapack_complex_float* q, lapack_int ldq,
+                           lapack_int ifst, lapack_int ilst );
+lapack_int LAPACKE_ztrexc( int matrix_order, char compq, lapack_int n,
+                           lapack_complex_double* t, lapack_int ldt,
+                           lapack_complex_double* q, lapack_int ldq,
+                           lapack_int ifst, lapack_int ilst );
+
+lapack_int LAPACKE_strrfs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int nrhs, const float* a,
+                           lapack_int lda, const float* b, lapack_int ldb,
+                           const float* x, lapack_int ldx, float* ferr,
+                           float* berr );
+lapack_int LAPACKE_dtrrfs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int nrhs, const double* a,
+                           lapack_int lda, const double* b, lapack_int ldb,
+                           const double* x, lapack_int ldx, double* ferr,
+                           double* berr );
+lapack_int LAPACKE_ctrrfs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int nrhs,
+                           const lapack_complex_float* a, lapack_int lda,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           const lapack_complex_float* x, lapack_int ldx,
+                           float* ferr, float* berr );
+lapack_int LAPACKE_ztrrfs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int nrhs,
+                           const lapack_complex_double* a, lapack_int lda,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           const lapack_complex_double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+
+lapack_int LAPACKE_strsen( int matrix_order, char job, char compq,
+                           const lapack_logical* select, lapack_int n, float* t,
+                           lapack_int ldt, float* q, lapack_int ldq, float* wr,
+                           float* wi, lapack_int* m, float* s, float* sep );
+lapack_int LAPACKE_dtrsen( int matrix_order, char job, char compq,
+                           const lapack_logical* select, lapack_int n,
+                           double* t, lapack_int ldt, double* q, lapack_int ldq,
+                           double* wr, double* wi, lapack_int* m, double* s,
+                           double* sep );
+lapack_int LAPACKE_ctrsen( int matrix_order, char job, char compq,
+                           const lapack_logical* select, lapack_int n,
+                           lapack_complex_float* t, lapack_int ldt,
+                           lapack_complex_float* q, lapack_int ldq,
+                           lapack_complex_float* w, lapack_int* m, float* s,
+                           float* sep );
+lapack_int LAPACKE_ztrsen( int matrix_order, char job, char compq,
+                           const lapack_logical* select, lapack_int n,
+                           lapack_complex_double* t, lapack_int ldt,
+                           lapack_complex_double* q, lapack_int ldq,
+                           lapack_complex_double* w, lapack_int* m, double* s,
+                           double* sep );
+
+lapack_int LAPACKE_strsna( int matrix_order, char job, char howmny,
+                           const lapack_logical* select, lapack_int n,
+                           const float* t, lapack_int ldt, const float* vl,
+                           lapack_int ldvl, const float* vr, lapack_int ldvr,
+                           float* s, float* sep, lapack_int mm, lapack_int* m );
+lapack_int LAPACKE_dtrsna( int matrix_order, char job, char howmny,
+                           const lapack_logical* select, lapack_int n,
+                           const double* t, lapack_int ldt, const double* vl,
+                           lapack_int ldvl, const double* vr, lapack_int ldvr,
+                           double* s, double* sep, lapack_int mm,
+                           lapack_int* m );
+lapack_int LAPACKE_ctrsna( int matrix_order, char job, char howmny,
+                           const lapack_logical* select, lapack_int n,
+                           const lapack_complex_float* t, lapack_int ldt,
+                           const lapack_complex_float* vl, lapack_int ldvl,
+                           const lapack_complex_float* vr, lapack_int ldvr,
+                           float* s, float* sep, lapack_int mm, lapack_int* m );
+lapack_int LAPACKE_ztrsna( int matrix_order, char job, char howmny,
+                           const lapack_logical* select, lapack_int n,
+                           const lapack_complex_double* t, lapack_int ldt,
+                           const lapack_complex_double* vl, lapack_int ldvl,
+                           const lapack_complex_double* vr, lapack_int ldvr,
+                           double* s, double* sep, lapack_int mm,
+                           lapack_int* m );
+
+lapack_int LAPACKE_strsyl( int matrix_order, char trana, char tranb,
+                           lapack_int isgn, lapack_int m, lapack_int n,
+                           const float* a, lapack_int lda, const float* b,
+                           lapack_int ldb, float* c, lapack_int ldc,
+                           float* scale );
+lapack_int LAPACKE_dtrsyl( int matrix_order, char trana, char tranb,
+                           lapack_int isgn, lapack_int m, lapack_int n,
+                           const double* a, lapack_int lda, const double* b,
+                           lapack_int ldb, double* c, lapack_int ldc,
+                           double* scale );
+lapack_int LAPACKE_ctrsyl( int matrix_order, char trana, char tranb,
+                           lapack_int isgn, lapack_int m, lapack_int n,
+                           const lapack_complex_float* a, lapack_int lda,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* c, lapack_int ldc,
+                           float* scale );
+lapack_int LAPACKE_ztrsyl( int matrix_order, char trana, char tranb,
+                           lapack_int isgn, lapack_int m, lapack_int n,
+                           const lapack_complex_double* a, lapack_int lda,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* c, lapack_int ldc,
+                           double* scale );
+
+lapack_int LAPACKE_strtri( int matrix_order, char uplo, char diag, lapack_int n,
+                           float* a, lapack_int lda );
+lapack_int LAPACKE_dtrtri( int matrix_order, char uplo, char diag, lapack_int n,
+                           double* a, lapack_int lda );
+lapack_int LAPACKE_ctrtri( int matrix_order, char uplo, char diag, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda );
+lapack_int LAPACKE_ztrtri( int matrix_order, char uplo, char diag, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda );
+
+lapack_int LAPACKE_strtrs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int nrhs, const float* a,
+                           lapack_int lda, float* b, lapack_int ldb );
+lapack_int LAPACKE_dtrtrs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int nrhs, const double* a,
+                           lapack_int lda, double* b, lapack_int ldb );
+lapack_int LAPACKE_ctrtrs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int nrhs,
+                           const lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_ztrtrs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int nrhs,
+                           const lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_strttf( int matrix_order, char transr, char uplo,
+                           lapack_int n, const float* a, lapack_int lda,
+                           float* arf );
+lapack_int LAPACKE_dtrttf( int matrix_order, char transr, char uplo,
+                           lapack_int n, const double* a, lapack_int lda,
+                           double* arf );
+lapack_int LAPACKE_ctrttf( int matrix_order, char transr, char uplo,
+                           lapack_int n, const lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* arf );
+lapack_int LAPACKE_ztrttf( int matrix_order, char transr, char uplo,
+                           lapack_int n, const lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* arf );
+
+lapack_int LAPACKE_strttp( int matrix_order, char uplo, lapack_int n,
+                           const float* a, lapack_int lda, float* ap );
+lapack_int LAPACKE_dtrttp( int matrix_order, char uplo, lapack_int n,
+                           const double* a, lapack_int lda, double* ap );
+lapack_int LAPACKE_ctrttp( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* ap );
+lapack_int LAPACKE_ztrttp( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* ap );
+
+lapack_int LAPACKE_stzrzf( int matrix_order, lapack_int m, lapack_int n,
+                           float* a, lapack_int lda, float* tau );
+lapack_int LAPACKE_dtzrzf( int matrix_order, lapack_int m, lapack_int n,
+                           double* a, lapack_int lda, double* tau );
+lapack_int LAPACKE_ctzrzf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* tau );
+lapack_int LAPACKE_ztzrzf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* tau );
+
+lapack_int LAPACKE_cungbr( int matrix_order, char vect, lapack_int m,
+                           lapack_int n, lapack_int k, lapack_complex_float* a,
+                           lapack_int lda, const lapack_complex_float* tau );
+lapack_int LAPACKE_zungbr( int matrix_order, char vect, lapack_int m,
+                           lapack_int n, lapack_int k, lapack_complex_double* a,
+                           lapack_int lda, const lapack_complex_double* tau );
+
+lapack_int LAPACKE_cunghr( int matrix_order, lapack_int n, lapack_int ilo,
+                           lapack_int ihi, lapack_complex_float* a,
+                           lapack_int lda, const lapack_complex_float* tau );
+lapack_int LAPACKE_zunghr( int matrix_order, lapack_int n, lapack_int ilo,
+                           lapack_int ihi, lapack_complex_double* a,
+                           lapack_int lda, const lapack_complex_double* tau );
+
+lapack_int LAPACKE_cunglq( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int k, lapack_complex_float* a,
+                           lapack_int lda, const lapack_complex_float* tau );
+lapack_int LAPACKE_zunglq( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int k, lapack_complex_double* a,
+                           lapack_int lda, const lapack_complex_double* tau );
+
+lapack_int LAPACKE_cungql( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int k, lapack_complex_float* a,
+                           lapack_int lda, const lapack_complex_float* tau );
+lapack_int LAPACKE_zungql( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int k, lapack_complex_double* a,
+                           lapack_int lda, const lapack_complex_double* tau );
+
+lapack_int LAPACKE_cungqr( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int k, lapack_complex_float* a,
+                           lapack_int lda, const lapack_complex_float* tau );
+lapack_int LAPACKE_zungqr( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int k, lapack_complex_double* a,
+                           lapack_int lda, const lapack_complex_double* tau );
+
+lapack_int LAPACKE_cungrq( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int k, lapack_complex_float* a,
+                           lapack_int lda, const lapack_complex_float* tau );
+lapack_int LAPACKE_zungrq( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int k, lapack_complex_double* a,
+                           lapack_int lda, const lapack_complex_double* tau );
+
+lapack_int LAPACKE_cungtr( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           const lapack_complex_float* tau );
+lapack_int LAPACKE_zungtr( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           const lapack_complex_double* tau );
+
+lapack_int LAPACKE_cunmbr( int matrix_order, char vect, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const lapack_complex_float* a, lapack_int lda,
+                           const lapack_complex_float* tau,
+                           lapack_complex_float* c, lapack_int ldc );
+lapack_int LAPACKE_zunmbr( int matrix_order, char vect, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const lapack_complex_double* a, lapack_int lda,
+                           const lapack_complex_double* tau,
+                           lapack_complex_double* c, lapack_int ldc );
+
+lapack_int LAPACKE_cunmhr( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int ilo,
+                           lapack_int ihi, const lapack_complex_float* a,
+                           lapack_int lda, const lapack_complex_float* tau,
+                           lapack_complex_float* c, lapack_int ldc );
+lapack_int LAPACKE_zunmhr( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int ilo,
+                           lapack_int ihi, const lapack_complex_double* a,
+                           lapack_int lda, const lapack_complex_double* tau,
+                           lapack_complex_double* c, lapack_int ldc );
+
+lapack_int LAPACKE_cunmlq( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const lapack_complex_float* a, lapack_int lda,
+                           const lapack_complex_float* tau,
+                           lapack_complex_float* c, lapack_int ldc );
+lapack_int LAPACKE_zunmlq( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const lapack_complex_double* a, lapack_int lda,
+                           const lapack_complex_double* tau,
+                           lapack_complex_double* c, lapack_int ldc );
+
+lapack_int LAPACKE_cunmql( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const lapack_complex_float* a, lapack_int lda,
+                           const lapack_complex_float* tau,
+                           lapack_complex_float* c, lapack_int ldc );
+lapack_int LAPACKE_zunmql( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const lapack_complex_double* a, lapack_int lda,
+                           const lapack_complex_double* tau,
+                           lapack_complex_double* c, lapack_int ldc );
+
+lapack_int LAPACKE_cunmqr( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const lapack_complex_float* a, lapack_int lda,
+                           const lapack_complex_float* tau,
+                           lapack_complex_float* c, lapack_int ldc );
+lapack_int LAPACKE_zunmqr( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const lapack_complex_double* a, lapack_int lda,
+                           const lapack_complex_double* tau,
+                           lapack_complex_double* c, lapack_int ldc );
+
+lapack_int LAPACKE_cunmrq( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const lapack_complex_float* a, lapack_int lda,
+                           const lapack_complex_float* tau,
+                           lapack_complex_float* c, lapack_int ldc );
+lapack_int LAPACKE_zunmrq( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const lapack_complex_double* a, lapack_int lda,
+                           const lapack_complex_double* tau,
+                           lapack_complex_double* c, lapack_int ldc );
+
+lapack_int LAPACKE_cunmrz( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           lapack_int l, const lapack_complex_float* a,
+                           lapack_int lda, const lapack_complex_float* tau,
+                           lapack_complex_float* c, lapack_int ldc );
+lapack_int LAPACKE_zunmrz( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           lapack_int l, const lapack_complex_double* a,
+                           lapack_int lda, const lapack_complex_double* tau,
+                           lapack_complex_double* c, lapack_int ldc );
+
+lapack_int LAPACKE_cunmtr( int matrix_order, char side, char uplo, char trans,
+                           lapack_int m, lapack_int n,
+                           const lapack_complex_float* a, lapack_int lda,
+                           const lapack_complex_float* tau,
+                           lapack_complex_float* c, lapack_int ldc );
+lapack_int LAPACKE_zunmtr( int matrix_order, char side, char uplo, char trans,
+                           lapack_int m, lapack_int n,
+                           const lapack_complex_double* a, lapack_int lda,
+                           const lapack_complex_double* tau,
+                           lapack_complex_double* c, lapack_int ldc );
+
+lapack_int LAPACKE_cupgtr( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_float* ap,
+                           const lapack_complex_float* tau,
+                           lapack_complex_float* q, lapack_int ldq );
+lapack_int LAPACKE_zupgtr( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_double* ap,
+                           const lapack_complex_double* tau,
+                           lapack_complex_double* q, lapack_int ldq );
+
+lapack_int LAPACKE_cupmtr( int matrix_order, char side, char uplo, char trans,
+                           lapack_int m, lapack_int n,
+                           const lapack_complex_float* ap,
+                           const lapack_complex_float* tau,
+                           lapack_complex_float* c, lapack_int ldc );
+lapack_int LAPACKE_zupmtr( int matrix_order, char side, char uplo, char trans,
+                           lapack_int m, lapack_int n,
+                           const lapack_complex_double* ap,
+                           const lapack_complex_double* tau,
+                           lapack_complex_double* c, lapack_int ldc );
+
+lapack_int LAPACKE_sbdsdc_work( int matrix_order, char uplo, char compq,
+                                lapack_int n, float* d, float* e, float* u,
+                                lapack_int ldu, float* vt, lapack_int ldvt,
+                                float* q, lapack_int* iq, float* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dbdsdc_work( int matrix_order, char uplo, char compq,
+                                lapack_int n, double* d, double* e, double* u,
+                                lapack_int ldu, double* vt, lapack_int ldvt,
+                                double* q, lapack_int* iq, double* work,
+                                lapack_int* iwork );
+
+lapack_int LAPACKE_sbdsqr_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int ncvt, lapack_int nru, lapack_int ncc,
+                                float* d, float* e, float* vt, lapack_int ldvt,
+                                float* u, lapack_int ldu, float* c,
+                                lapack_int ldc, float* work );
+lapack_int LAPACKE_dbdsqr_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int ncvt, lapack_int nru, lapack_int ncc,
+                                double* d, double* e, double* vt,
+                                lapack_int ldvt, double* u, lapack_int ldu,
+                                double* c, lapack_int ldc, double* work );
+lapack_int LAPACKE_cbdsqr_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int ncvt, lapack_int nru, lapack_int ncc,
+                                float* d, float* e, lapack_complex_float* vt,
+                                lapack_int ldvt, lapack_complex_float* u,
+                                lapack_int ldu, lapack_complex_float* c,
+                                lapack_int ldc, float* work );
+lapack_int LAPACKE_zbdsqr_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int ncvt, lapack_int nru, lapack_int ncc,
+                                double* d, double* e, lapack_complex_double* vt,
+                                lapack_int ldvt, lapack_complex_double* u,
+                                lapack_int ldu, lapack_complex_double* c,
+                                lapack_int ldc, double* work );
+
+lapack_int LAPACKE_sdisna_work( char job, lapack_int m, lapack_int n,
+                                const float* d, float* sep );
+lapack_int LAPACKE_ddisna_work( char job, lapack_int m, lapack_int n,
+                                const double* d, double* sep );
+
+lapack_int LAPACKE_sgbbrd_work( int matrix_order, char vect, lapack_int m,
+                                lapack_int n, lapack_int ncc, lapack_int kl,
+                                lapack_int ku, float* ab, lapack_int ldab,
+                                float* d, float* e, float* q, lapack_int ldq,
+                                float* pt, lapack_int ldpt, float* c,
+                                lapack_int ldc, float* work );
+lapack_int LAPACKE_dgbbrd_work( int matrix_order, char vect, lapack_int m,
+                                lapack_int n, lapack_int ncc, lapack_int kl,
+                                lapack_int ku, double* ab, lapack_int ldab,
+                                double* d, double* e, double* q, lapack_int ldq,
+                                double* pt, lapack_int ldpt, double* c,
+                                lapack_int ldc, double* work );
+lapack_int LAPACKE_cgbbrd_work( int matrix_order, char vect, lapack_int m,
+                                lapack_int n, lapack_int ncc, lapack_int kl,
+                                lapack_int ku, lapack_complex_float* ab,
+                                lapack_int ldab, float* d, float* e,
+                                lapack_complex_float* q, lapack_int ldq,
+                                lapack_complex_float* pt, lapack_int ldpt,
+                                lapack_complex_float* c, lapack_int ldc,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zgbbrd_work( int matrix_order, char vect, lapack_int m,
+                                lapack_int n, lapack_int ncc, lapack_int kl,
+                                lapack_int ku, lapack_complex_double* ab,
+                                lapack_int ldab, double* d, double* e,
+                                lapack_complex_double* q, lapack_int ldq,
+                                lapack_complex_double* pt, lapack_int ldpt,
+                                lapack_complex_double* c, lapack_int ldc,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_sgbcon_work( int matrix_order, char norm, lapack_int n,
+                                lapack_int kl, lapack_int ku, const float* ab,
+                                lapack_int ldab, const lapack_int* ipiv,
+                                float anorm, float* rcond, float* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dgbcon_work( int matrix_order, char norm, lapack_int n,
+                                lapack_int kl, lapack_int ku, const double* ab,
+                                lapack_int ldab, const lapack_int* ipiv,
+                                double anorm, double* rcond, double* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_cgbcon_work( int matrix_order, char norm, lapack_int n,
+                                lapack_int kl, lapack_int ku,
+                                const lapack_complex_float* ab, lapack_int ldab,
+                                const lapack_int* ipiv, float anorm,
+                                float* rcond, lapack_complex_float* work,
+                                float* rwork );
+lapack_int LAPACKE_zgbcon_work( int matrix_order, char norm, lapack_int n,
+                                lapack_int kl, lapack_int ku,
+                                const lapack_complex_double* ab,
+                                lapack_int ldab, const lapack_int* ipiv,
+                                double anorm, double* rcond,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_sgbequ_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int kl, lapack_int ku, const float* ab,
+                                lapack_int ldab, float* r, float* c,
+                                float* rowcnd, float* colcnd, float* amax );
+lapack_int LAPACKE_dgbequ_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int kl, lapack_int ku, const double* ab,
+                                lapack_int ldab, double* r, double* c,
+                                double* rowcnd, double* colcnd, double* amax );
+lapack_int LAPACKE_cgbequ_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int kl, lapack_int ku,
+                                const lapack_complex_float* ab, lapack_int ldab,
+                                float* r, float* c, float* rowcnd,
+                                float* colcnd, float* amax );
+lapack_int LAPACKE_zgbequ_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int kl, lapack_int ku,
+                                const lapack_complex_double* ab,
+                                lapack_int ldab, double* r, double* c,
+                                double* rowcnd, double* colcnd, double* amax );
+
+lapack_int LAPACKE_sgbequb_work( int matrix_order, lapack_int m, lapack_int n,
+                                 lapack_int kl, lapack_int ku, const float* ab,
+                                 lapack_int ldab, float* r, float* c,
+                                 float* rowcnd, float* colcnd, float* amax );
+lapack_int LAPACKE_dgbequb_work( int matrix_order, lapack_int m, lapack_int n,
+                                 lapack_int kl, lapack_int ku, const double* ab,
+                                 lapack_int ldab, double* r, double* c,
+                                 double* rowcnd, double* colcnd, double* amax );
+lapack_int LAPACKE_cgbequb_work( int matrix_order, lapack_int m, lapack_int n,
+                                 lapack_int kl, lapack_int ku,
+                                 const lapack_complex_float* ab,
+                                 lapack_int ldab, float* r, float* c,
+                                 float* rowcnd, float* colcnd, float* amax );
+lapack_int LAPACKE_zgbequb_work( int matrix_order, lapack_int m, lapack_int n,
+                                 lapack_int kl, lapack_int ku,
+                                 const lapack_complex_double* ab,
+                                 lapack_int ldab, double* r, double* c,
+                                 double* rowcnd, double* colcnd, double* amax );
+
+lapack_int LAPACKE_sgbrfs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int kl, lapack_int ku, lapack_int nrhs,
+                                const float* ab, lapack_int ldab,
+                                const float* afb, lapack_int ldafb,
+                                const lapack_int* ipiv, const float* b,
+                                lapack_int ldb, float* x, lapack_int ldx,
+                                float* ferr, float* berr, float* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dgbrfs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int kl, lapack_int ku, lapack_int nrhs,
+                                const double* ab, lapack_int ldab,
+                                const double* afb, lapack_int ldafb,
+                                const lapack_int* ipiv, const double* b,
+                                lapack_int ldb, double* x, lapack_int ldx,
+                                double* ferr, double* berr, double* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_cgbrfs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int kl, lapack_int ku, lapack_int nrhs,
+                                const lapack_complex_float* ab, lapack_int ldab,
+                                const lapack_complex_float* afb,
+                                lapack_int ldafb, const lapack_int* ipiv,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* x, lapack_int ldx,
+                                float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zgbrfs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int kl, lapack_int ku, lapack_int nrhs,
+                                const lapack_complex_double* ab,
+                                lapack_int ldab,
+                                const lapack_complex_double* afb,
+                                lapack_int ldafb, const lapack_int* ipiv,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_sgbrfsx_work( int matrix_order, char trans, char equed,
+                                 lapack_int n, lapack_int kl, lapack_int ku,
+                                 lapack_int nrhs, const float* ab,
+                                 lapack_int ldab, const float* afb,
+                                 lapack_int ldafb, const lapack_int* ipiv,
+                                 const float* r, const float* c, const float* b,
+                                 lapack_int ldb, float* x, lapack_int ldx,
+                                 float* rcond, float* berr,
+                                 lapack_int n_err_bnds, float* err_bnds_norm,
+                                 float* err_bnds_comp, lapack_int nparams,
+                                 float* params, float* work,
+                                 lapack_int* iwork );
+lapack_int LAPACKE_dgbrfsx_work( int matrix_order, char trans, char equed,
+                                 lapack_int n, lapack_int kl, lapack_int ku,
+                                 lapack_int nrhs, const double* ab,
+                                 lapack_int ldab, const double* afb,
+                                 lapack_int ldafb, const lapack_int* ipiv,
+                                 const double* r, const double* c,
+                                 const double* b, lapack_int ldb, double* x,
+                                 lapack_int ldx, double* rcond, double* berr,
+                                 lapack_int n_err_bnds, double* err_bnds_norm,
+                                 double* err_bnds_comp, lapack_int nparams,
+                                 double* params, double* work,
+                                 lapack_int* iwork );
+lapack_int LAPACKE_cgbrfsx_work( int matrix_order, char trans, char equed,
+                                 lapack_int n, lapack_int kl, lapack_int ku,
+                                 lapack_int nrhs,
+                                 const lapack_complex_float* ab,
+                                 lapack_int ldab,
+                                 const lapack_complex_float* afb,
+                                 lapack_int ldafb, const lapack_int* ipiv,
+                                 const float* r, const float* c,
+                                 const lapack_complex_float* b, lapack_int ldb,
+                                 lapack_complex_float* x, lapack_int ldx,
+                                 float* rcond, float* berr,
+                                 lapack_int n_err_bnds, float* err_bnds_norm,
+                                 float* err_bnds_comp, lapack_int nparams,
+                                 float* params, lapack_complex_float* work,
+                                 float* rwork );
+lapack_int LAPACKE_zgbrfsx_work( int matrix_order, char trans, char equed,
+                                 lapack_int n, lapack_int kl, lapack_int ku,
+                                 lapack_int nrhs,
+                                 const lapack_complex_double* ab,
+                                 lapack_int ldab,
+                                 const lapack_complex_double* afb,
+                                 lapack_int ldafb, const lapack_int* ipiv,
+                                 const double* r, const double* c,
+                                 const lapack_complex_double* b, lapack_int ldb,
+                                 lapack_complex_double* x, lapack_int ldx,
+                                 double* rcond, double* berr,
+                                 lapack_int n_err_bnds, double* err_bnds_norm,
+                                 double* err_bnds_comp, lapack_int nparams,
+                                 double* params, lapack_complex_double* work,
+                                 double* rwork );
+
+lapack_int LAPACKE_sgbsv_work( int matrix_order, lapack_int n, lapack_int kl,
+                               lapack_int ku, lapack_int nrhs, float* ab,
+                               lapack_int ldab, lapack_int* ipiv, float* b,
+                               lapack_int ldb );
+lapack_int LAPACKE_dgbsv_work( int matrix_order, lapack_int n, lapack_int kl,
+                               lapack_int ku, lapack_int nrhs, double* ab,
+                               lapack_int ldab, lapack_int* ipiv, double* b,
+                               lapack_int ldb );
+lapack_int LAPACKE_cgbsv_work( int matrix_order, lapack_int n, lapack_int kl,
+                               lapack_int ku, lapack_int nrhs,
+                               lapack_complex_float* ab, lapack_int ldab,
+                               lapack_int* ipiv, lapack_complex_float* b,
+                               lapack_int ldb );
+lapack_int LAPACKE_zgbsv_work( int matrix_order, lapack_int n, lapack_int kl,
+                               lapack_int ku, lapack_int nrhs,
+                               lapack_complex_double* ab, lapack_int ldab,
+                               lapack_int* ipiv, lapack_complex_double* b,
+                               lapack_int ldb );
+
+lapack_int LAPACKE_sgbsvx_work( int matrix_order, char fact, char trans,
+                                lapack_int n, lapack_int kl, lapack_int ku,
+                                lapack_int nrhs, float* ab, lapack_int ldab,
+                                float* afb, lapack_int ldafb, lapack_int* ipiv,
+                                char* equed, float* r, float* c, float* b,
+                                lapack_int ldb, float* x, lapack_int ldx,
+                                float* rcond, float* ferr, float* berr,
+                                float* work, lapack_int* iwork );
+lapack_int LAPACKE_dgbsvx_work( int matrix_order, char fact, char trans,
+                                lapack_int n, lapack_int kl, lapack_int ku,
+                                lapack_int nrhs, double* ab, lapack_int ldab,
+                                double* afb, lapack_int ldafb, lapack_int* ipiv,
+                                char* equed, double* r, double* c, double* b,
+                                lapack_int ldb, double* x, lapack_int ldx,
+                                double* rcond, double* ferr, double* berr,
+                                double* work, lapack_int* iwork );
+lapack_int LAPACKE_cgbsvx_work( int matrix_order, char fact, char trans,
+                                lapack_int n, lapack_int kl, lapack_int ku,
+                                lapack_int nrhs, lapack_complex_float* ab,
+                                lapack_int ldab, lapack_complex_float* afb,
+                                lapack_int ldafb, lapack_int* ipiv, char* equed,
+                                float* r, float* c, lapack_complex_float* b,
+                                lapack_int ldb, lapack_complex_float* x,
+                                lapack_int ldx, float* rcond, float* ferr,
+                                float* berr, lapack_complex_float* work,
+                                float* rwork );
+lapack_int LAPACKE_zgbsvx_work( int matrix_order, char fact, char trans,
+                                lapack_int n, lapack_int kl, lapack_int ku,
+                                lapack_int nrhs, lapack_complex_double* ab,
+                                lapack_int ldab, lapack_complex_double* afb,
+                                lapack_int ldafb, lapack_int* ipiv, char* equed,
+                                double* r, double* c, lapack_complex_double* b,
+                                lapack_int ldb, lapack_complex_double* x,
+                                lapack_int ldx, double* rcond, double* ferr,
+                                double* berr, lapack_complex_double* work,
+                                double* rwork );
+
+lapack_int LAPACKE_sgbsvxx_work( int matrix_order, char fact, char trans,
+                                 lapack_int n, lapack_int kl, lapack_int ku,
+                                 lapack_int nrhs, float* ab, lapack_int ldab,
+                                 float* afb, lapack_int ldafb, lapack_int* ipiv,
+                                 char* equed, float* r, float* c, float* b,
+                                 lapack_int ldb, float* x, lapack_int ldx,
+                                 float* rcond, float* rpvgrw, float* berr,
+                                 lapack_int n_err_bnds, float* err_bnds_norm,
+                                 float* err_bnds_comp, lapack_int nparams,
+                                 float* params, float* work,
+                                 lapack_int* iwork );
+lapack_int LAPACKE_dgbsvxx_work( int matrix_order, char fact, char trans,
+                                 lapack_int n, lapack_int kl, lapack_int ku,
+                                 lapack_int nrhs, double* ab, lapack_int ldab,
+                                 double* afb, lapack_int ldafb,
+                                 lapack_int* ipiv, char* equed, double* r,
+                                 double* c, double* b, lapack_int ldb,
+                                 double* x, lapack_int ldx, double* rcond,
+                                 double* rpvgrw, double* berr,
+                                 lapack_int n_err_bnds, double* err_bnds_norm,
+                                 double* err_bnds_comp, lapack_int nparams,
+                                 double* params, double* work,
+                                 lapack_int* iwork );
+lapack_int LAPACKE_cgbsvxx_work( int matrix_order, char fact, char trans,
+                                 lapack_int n, lapack_int kl, lapack_int ku,
+                                 lapack_int nrhs, lapack_complex_float* ab,
+                                 lapack_int ldab, lapack_complex_float* afb,
+                                 lapack_int ldafb, lapack_int* ipiv,
+                                 char* equed, float* r, float* c,
+                                 lapack_complex_float* b, lapack_int ldb,
+                                 lapack_complex_float* x, lapack_int ldx,
+                                 float* rcond, float* rpvgrw, float* berr,
+                                 lapack_int n_err_bnds, float* err_bnds_norm,
+                                 float* err_bnds_comp, lapack_int nparams,
+                                 float* params, lapack_complex_float* work,
+                                 float* rwork );
+lapack_int LAPACKE_zgbsvxx_work( int matrix_order, char fact, char trans,
+                                 lapack_int n, lapack_int kl, lapack_int ku,
+                                 lapack_int nrhs, lapack_complex_double* ab,
+                                 lapack_int ldab, lapack_complex_double* afb,
+                                 lapack_int ldafb, lapack_int* ipiv,
+                                 char* equed, double* r, double* c,
+                                 lapack_complex_double* b, lapack_int ldb,
+                                 lapack_complex_double* x, lapack_int ldx,
+                                 double* rcond, double* rpvgrw, double* berr,
+                                 lapack_int n_err_bnds, double* err_bnds_norm,
+                                 double* err_bnds_comp, lapack_int nparams,
+                                 double* params, lapack_complex_double* work,
+                                 double* rwork );
+
+lapack_int LAPACKE_sgbtrf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int kl, lapack_int ku, float* ab,
+                                lapack_int ldab, lapack_int* ipiv );
+lapack_int LAPACKE_dgbtrf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int kl, lapack_int ku, double* ab,
+                                lapack_int ldab, lapack_int* ipiv );
+lapack_int LAPACKE_cgbtrf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int kl, lapack_int ku,
+                                lapack_complex_float* ab, lapack_int ldab,
+                                lapack_int* ipiv );
+lapack_int LAPACKE_zgbtrf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int kl, lapack_int ku,
+                                lapack_complex_double* ab, lapack_int ldab,
+                                lapack_int* ipiv );
+
+lapack_int LAPACKE_sgbtrs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int kl, lapack_int ku, lapack_int nrhs,
+                                const float* ab, lapack_int ldab,
+                                const lapack_int* ipiv, float* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_dgbtrs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int kl, lapack_int ku, lapack_int nrhs,
+                                const double* ab, lapack_int ldab,
+                                const lapack_int* ipiv, double* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_cgbtrs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int kl, lapack_int ku, lapack_int nrhs,
+                                const lapack_complex_float* ab, lapack_int ldab,
+                                const lapack_int* ipiv, lapack_complex_float* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_zgbtrs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int kl, lapack_int ku, lapack_int nrhs,
+                                const lapack_complex_double* ab,
+                                lapack_int ldab, const lapack_int* ipiv,
+                                lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_sgebak_work( int matrix_order, char job, char side,
+                                lapack_int n, lapack_int ilo, lapack_int ihi,
+                                const float* scale, lapack_int m, float* v,
+                                lapack_int ldv );
+lapack_int LAPACKE_dgebak_work( int matrix_order, char job, char side,
+                                lapack_int n, lapack_int ilo, lapack_int ihi,
+                                const double* scale, lapack_int m, double* v,
+                                lapack_int ldv );
+lapack_int LAPACKE_cgebak_work( int matrix_order, char job, char side,
+                                lapack_int n, lapack_int ilo, lapack_int ihi,
+                                const float* scale, lapack_int m,
+                                lapack_complex_float* v, lapack_int ldv );
+lapack_int LAPACKE_zgebak_work( int matrix_order, char job, char side,
+                                lapack_int n, lapack_int ilo, lapack_int ihi,
+                                const double* scale, lapack_int m,
+                                lapack_complex_double* v, lapack_int ldv );
+
+lapack_int LAPACKE_sgebal_work( int matrix_order, char job, lapack_int n,
+                                float* a, lapack_int lda, lapack_int* ilo,
+                                lapack_int* ihi, float* scale );
+lapack_int LAPACKE_dgebal_work( int matrix_order, char job, lapack_int n,
+                                double* a, lapack_int lda, lapack_int* ilo,
+                                lapack_int* ihi, double* scale );
+lapack_int LAPACKE_cgebal_work( int matrix_order, char job, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_int* ilo, lapack_int* ihi,
+                                float* scale );
+lapack_int LAPACKE_zgebal_work( int matrix_order, char job, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_int* ilo, lapack_int* ihi,
+                                double* scale );
+
+lapack_int LAPACKE_sgebrd_work( int matrix_order, lapack_int m, lapack_int n,
+                                float* a, lapack_int lda, float* d, float* e,
+                                float* tauq, float* taup, float* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_dgebrd_work( int matrix_order, lapack_int m, lapack_int n,
+                                double* a, lapack_int lda, double* d, double* e,
+                                double* tauq, double* taup, double* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_cgebrd_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                float* d, float* e, lapack_complex_float* tauq,
+                                lapack_complex_float* taup,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zgebrd_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                double* d, double* e,
+                                lapack_complex_double* tauq,
+                                lapack_complex_double* taup,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_sgecon_work( int matrix_order, char norm, lapack_int n,
+                                const float* a, lapack_int lda, float anorm,
+                                float* rcond, float* work, lapack_int* iwork );
+lapack_int LAPACKE_dgecon_work( int matrix_order, char norm, lapack_int n,
+                                const double* a, lapack_int lda, double anorm,
+                                double* rcond, double* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_cgecon_work( int matrix_order, char norm, lapack_int n,
+                                const lapack_complex_float* a, lapack_int lda,
+                                float anorm, float* rcond,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zgecon_work( int matrix_order, char norm, lapack_int n,
+                                const lapack_complex_double* a, lapack_int lda,
+                                double anorm, double* rcond,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_sgeequ_work( int matrix_order, lapack_int m, lapack_int n,
+                                const float* a, lapack_int lda, float* r,
+                                float* c, float* rowcnd, float* colcnd,
+                                float* amax );
+lapack_int LAPACKE_dgeequ_work( int matrix_order, lapack_int m, lapack_int n,
+                                const double* a, lapack_int lda, double* r,
+                                double* c, double* rowcnd, double* colcnd,
+                                double* amax );
+lapack_int LAPACKE_cgeequ_work( int matrix_order, lapack_int m, lapack_int n,
+                                const lapack_complex_float* a, lapack_int lda,
+                                float* r, float* c, float* rowcnd,
+                                float* colcnd, float* amax );
+lapack_int LAPACKE_zgeequ_work( int matrix_order, lapack_int m, lapack_int n,
+                                const lapack_complex_double* a, lapack_int lda,
+                                double* r, double* c, double* rowcnd,
+                                double* colcnd, double* amax );
+
+lapack_int LAPACKE_sgeequb_work( int matrix_order, lapack_int m, lapack_int n,
+                                 const float* a, lapack_int lda, float* r,
+                                 float* c, float* rowcnd, float* colcnd,
+                                 float* amax );
+lapack_int LAPACKE_dgeequb_work( int matrix_order, lapack_int m, lapack_int n,
+                                 const double* a, lapack_int lda, double* r,
+                                 double* c, double* rowcnd, double* colcnd,
+                                 double* amax );
+lapack_int LAPACKE_cgeequb_work( int matrix_order, lapack_int m, lapack_int n,
+                                 const lapack_complex_float* a, lapack_int lda,
+                                 float* r, float* c, float* rowcnd,
+                                 float* colcnd, float* amax );
+lapack_int LAPACKE_zgeequb_work( int matrix_order, lapack_int m, lapack_int n,
+                                 const lapack_complex_double* a, lapack_int lda,
+                                 double* r, double* c, double* rowcnd,
+                                 double* colcnd, double* amax );
+
+lapack_int LAPACKE_sgees_work( int matrix_order, char jobvs, char sort,
+                               LAPACK_S_SELECT2 select, lapack_int n, float* a,
+                               lapack_int lda, lapack_int* sdim, float* wr,
+                               float* wi, float* vs, lapack_int ldvs,
+                               float* work, lapack_int lwork,
+                               lapack_logical* bwork );
+lapack_int LAPACKE_dgees_work( int matrix_order, char jobvs, char sort,
+                               LAPACK_D_SELECT2 select, lapack_int n, double* a,
+                               lapack_int lda, lapack_int* sdim, double* wr,
+                               double* wi, double* vs, lapack_int ldvs,
+                               double* work, lapack_int lwork,
+                               lapack_logical* bwork );
+lapack_int LAPACKE_cgees_work( int matrix_order, char jobvs, char sort,
+                               LAPACK_C_SELECT1 select, lapack_int n,
+                               lapack_complex_float* a, lapack_int lda,
+                               lapack_int* sdim, lapack_complex_float* w,
+                               lapack_complex_float* vs, lapack_int ldvs,
+                               lapack_complex_float* work, lapack_int lwork,
+                               float* rwork, lapack_logical* bwork );
+lapack_int LAPACKE_zgees_work( int matrix_order, char jobvs, char sort,
+                               LAPACK_Z_SELECT1 select, lapack_int n,
+                               lapack_complex_double* a, lapack_int lda,
+                               lapack_int* sdim, lapack_complex_double* w,
+                               lapack_complex_double* vs, lapack_int ldvs,
+                               lapack_complex_double* work, lapack_int lwork,
+                               double* rwork, lapack_logical* bwork );
+
+lapack_int LAPACKE_sgeesx_work( int matrix_order, char jobvs, char sort,
+                                LAPACK_S_SELECT2 select, char sense,
+                                lapack_int n, float* a, lapack_int lda,
+                                lapack_int* sdim, float* wr, float* wi,
+                                float* vs, lapack_int ldvs, float* rconde,
+                                float* rcondv, float* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork,
+                                lapack_logical* bwork );
+lapack_int LAPACKE_dgeesx_work( int matrix_order, char jobvs, char sort,
+                                LAPACK_D_SELECT2 select, char sense,
+                                lapack_int n, double* a, lapack_int lda,
+                                lapack_int* sdim, double* wr, double* wi,
+                                double* vs, lapack_int ldvs, double* rconde,
+                                double* rcondv, double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork,
+                                lapack_logical* bwork );
+lapack_int LAPACKE_cgeesx_work( int matrix_order, char jobvs, char sort,
+                                LAPACK_C_SELECT1 select, char sense,
+                                lapack_int n, lapack_complex_float* a,
+                                lapack_int lda, lapack_int* sdim,
+                                lapack_complex_float* w,
+                                lapack_complex_float* vs, lapack_int ldvs,
+                                float* rconde, float* rcondv,
+                                lapack_complex_float* work, lapack_int lwork,
+                                float* rwork, lapack_logical* bwork );
+lapack_int LAPACKE_zgeesx_work( int matrix_order, char jobvs, char sort,
+                                LAPACK_Z_SELECT1 select, char sense,
+                                lapack_int n, lapack_complex_double* a,
+                                lapack_int lda, lapack_int* sdim,
+                                lapack_complex_double* w,
+                                lapack_complex_double* vs, lapack_int ldvs,
+                                double* rconde, double* rcondv,
+                                lapack_complex_double* work, lapack_int lwork,
+                                double* rwork, lapack_logical* bwork );
+
+lapack_int LAPACKE_sgeev_work( int matrix_order, char jobvl, char jobvr,
+                               lapack_int n, float* a, lapack_int lda,
+                               float* wr, float* wi, float* vl, lapack_int ldvl,
+                               float* vr, lapack_int ldvr, float* work,
+                               lapack_int lwork );
+lapack_int LAPACKE_dgeev_work( int matrix_order, char jobvl, char jobvr,
+                               lapack_int n, double* a, lapack_int lda,
+                               double* wr, double* wi, double* vl,
+                               lapack_int ldvl, double* vr, lapack_int ldvr,
+                               double* work, lapack_int lwork );
+lapack_int LAPACKE_cgeev_work( int matrix_order, char jobvl, char jobvr,
+                               lapack_int n, lapack_complex_float* a,
+                               lapack_int lda, lapack_complex_float* w,
+                               lapack_complex_float* vl, lapack_int ldvl,
+                               lapack_complex_float* vr, lapack_int ldvr,
+                               lapack_complex_float* work, lapack_int lwork,
+                               float* rwork );
+lapack_int LAPACKE_zgeev_work( int matrix_order, char jobvl, char jobvr,
+                               lapack_int n, lapack_complex_double* a,
+                               lapack_int lda, lapack_complex_double* w,
+                               lapack_complex_double* vl, lapack_int ldvl,
+                               lapack_complex_double* vr, lapack_int ldvr,
+                               lapack_complex_double* work, lapack_int lwork,
+                               double* rwork );
+
+lapack_int LAPACKE_sgeevx_work( int matrix_order, char balanc, char jobvl,
+                                char jobvr, char sense, lapack_int n, float* a,
+                                lapack_int lda, float* wr, float* wi, float* vl,
+                                lapack_int ldvl, float* vr, lapack_int ldvr,
+                                lapack_int* ilo, lapack_int* ihi, float* scale,
+                                float* abnrm, float* rconde, float* rcondv,
+                                float* work, lapack_int lwork,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dgeevx_work( int matrix_order, char balanc, char jobvl,
+                                char jobvr, char sense, lapack_int n, double* a,
+                                lapack_int lda, double* wr, double* wi,
+                                double* vl, lapack_int ldvl, double* vr,
+                                lapack_int ldvr, lapack_int* ilo,
+                                lapack_int* ihi, double* scale, double* abnrm,
+                                double* rconde, double* rcondv, double* work,
+                                lapack_int lwork, lapack_int* iwork );
+lapack_int LAPACKE_cgeevx_work( int matrix_order, char balanc, char jobvl,
+                                char jobvr, char sense, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* w,
+                                lapack_complex_float* vl, lapack_int ldvl,
+                                lapack_complex_float* vr, lapack_int ldvr,
+                                lapack_int* ilo, lapack_int* ihi, float* scale,
+                                float* abnrm, float* rconde, float* rcondv,
+                                lapack_complex_float* work, lapack_int lwork,
+                                float* rwork );
+lapack_int LAPACKE_zgeevx_work( int matrix_order, char balanc, char jobvl,
+                                char jobvr, char sense, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* w,
+                                lapack_complex_double* vl, lapack_int ldvl,
+                                lapack_complex_double* vr, lapack_int ldvr,
+                                lapack_int* ilo, lapack_int* ihi, double* scale,
+                                double* abnrm, double* rconde, double* rcondv,
+                                lapack_complex_double* work, lapack_int lwork,
+                                double* rwork );
+
+lapack_int LAPACKE_sgehrd_work( int matrix_order, lapack_int n, lapack_int ilo,
+                                lapack_int ihi, float* a, lapack_int lda,
+                                float* tau, float* work, lapack_int lwork );
+lapack_int LAPACKE_dgehrd_work( int matrix_order, lapack_int n, lapack_int ilo,
+                                lapack_int ihi, double* a, lapack_int lda,
+                                double* tau, double* work, lapack_int lwork );
+lapack_int LAPACKE_cgehrd_work( int matrix_order, lapack_int n, lapack_int ilo,
+                                lapack_int ihi, lapack_complex_float* a,
+                                lapack_int lda, lapack_complex_float* tau,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zgehrd_work( int matrix_order, lapack_int n, lapack_int ilo,
+                                lapack_int ihi, lapack_complex_double* a,
+                                lapack_int lda, lapack_complex_double* tau,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_sgejsv_work( int matrix_order, char joba, char jobu,
+                                char jobv, char jobr, char jobt, char jobp,
+                                lapack_int m, lapack_int n, float* a,
+                                lapack_int lda, float* sva, float* u,
+                                lapack_int ldu, float* v, lapack_int ldv,
+                                float* work, lapack_int lwork,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dgejsv_work( int matrix_order, char joba, char jobu,
+                                char jobv, char jobr, char jobt, char jobp,
+                                lapack_int m, lapack_int n, double* a,
+                                lapack_int lda, double* sva, double* u,
+                                lapack_int ldu, double* v, lapack_int ldv,
+                                double* work, lapack_int lwork,
+                                lapack_int* iwork );
+
+lapack_int LAPACKE_sgelq2_work( int matrix_order, lapack_int m, lapack_int n,
+                                float* a, lapack_int lda, float* tau,
+                                float* work );
+lapack_int LAPACKE_dgelq2_work( int matrix_order, lapack_int m, lapack_int n,
+                                double* a, lapack_int lda, double* tau,
+                                double* work );
+lapack_int LAPACKE_cgelq2_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* tau,
+                                lapack_complex_float* work );
+lapack_int LAPACKE_zgelq2_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* tau,
+                                lapack_complex_double* work );
+
+lapack_int LAPACKE_sgelqf_work( int matrix_order, lapack_int m, lapack_int n,
+                                float* a, lapack_int lda, float* tau,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dgelqf_work( int matrix_order, lapack_int m, lapack_int n,
+                                double* a, lapack_int lda, double* tau,
+                                double* work, lapack_int lwork );
+lapack_int LAPACKE_cgelqf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* tau,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zgelqf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* tau,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_sgels_work( int matrix_order, char trans, lapack_int m,
+                               lapack_int n, lapack_int nrhs, float* a,
+                               lapack_int lda, float* b, lapack_int ldb,
+                               float* work, lapack_int lwork );
+lapack_int LAPACKE_dgels_work( int matrix_order, char trans, lapack_int m,
+                               lapack_int n, lapack_int nrhs, double* a,
+                               lapack_int lda, double* b, lapack_int ldb,
+                               double* work, lapack_int lwork );
+lapack_int LAPACKE_cgels_work( int matrix_order, char trans, lapack_int m,
+                               lapack_int n, lapack_int nrhs,
+                               lapack_complex_float* a, lapack_int lda,
+                               lapack_complex_float* b, lapack_int ldb,
+                               lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zgels_work( int matrix_order, char trans, lapack_int m,
+                               lapack_int n, lapack_int nrhs,
+                               lapack_complex_double* a, lapack_int lda,
+                               lapack_complex_double* b, lapack_int ldb,
+                               lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_sgelsd_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int nrhs, float* a, lapack_int lda,
+                                float* b, lapack_int ldb, float* s, float rcond,
+                                lapack_int* rank, float* work, lapack_int lwork,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dgelsd_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int nrhs, double* a, lapack_int lda,
+                                double* b, lapack_int ldb, double* s,
+                                double rcond, lapack_int* rank, double* work,
+                                lapack_int lwork, lapack_int* iwork );
+lapack_int LAPACKE_cgelsd_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int nrhs, lapack_complex_float* a,
+                                lapack_int lda, lapack_complex_float* b,
+                                lapack_int ldb, float* s, float rcond,
+                                lapack_int* rank, lapack_complex_float* work,
+                                lapack_int lwork, float* rwork,
+                                lapack_int* iwork );
+lapack_int LAPACKE_zgelsd_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int nrhs, lapack_complex_double* a,
+                                lapack_int lda, lapack_complex_double* b,
+                                lapack_int ldb, double* s, double rcond,
+                                lapack_int* rank, lapack_complex_double* work,
+                                lapack_int lwork, double* rwork,
+                                lapack_int* iwork );
+
+lapack_int LAPACKE_sgelss_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int nrhs, float* a, lapack_int lda,
+                                float* b, lapack_int ldb, float* s, float rcond,
+                                lapack_int* rank, float* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_dgelss_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int nrhs, double* a, lapack_int lda,
+                                double* b, lapack_int ldb, double* s,
+                                double rcond, lapack_int* rank, double* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_cgelss_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int nrhs, lapack_complex_float* a,
+                                lapack_int lda, lapack_complex_float* b,
+                                lapack_int ldb, float* s, float rcond,
+                                lapack_int* rank, lapack_complex_float* work,
+                                lapack_int lwork, float* rwork );
+lapack_int LAPACKE_zgelss_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int nrhs, lapack_complex_double* a,
+                                lapack_int lda, lapack_complex_double* b,
+                                lapack_int ldb, double* s, double rcond,
+                                lapack_int* rank, lapack_complex_double* work,
+                                lapack_int lwork, double* rwork );
+
+lapack_int LAPACKE_sgelsy_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int nrhs, float* a, lapack_int lda,
+                                float* b, lapack_int ldb, lapack_int* jpvt,
+                                float rcond, lapack_int* rank, float* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_dgelsy_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int nrhs, double* a, lapack_int lda,
+                                double* b, lapack_int ldb, lapack_int* jpvt,
+                                double rcond, lapack_int* rank, double* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_cgelsy_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int nrhs, lapack_complex_float* a,
+                                lapack_int lda, lapack_complex_float* b,
+                                lapack_int ldb, lapack_int* jpvt, float rcond,
+                                lapack_int* rank, lapack_complex_float* work,
+                                lapack_int lwork, float* rwork );
+lapack_int LAPACKE_zgelsy_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int nrhs, lapack_complex_double* a,
+                                lapack_int lda, lapack_complex_double* b,
+                                lapack_int ldb, lapack_int* jpvt, double rcond,
+                                lapack_int* rank, lapack_complex_double* work,
+                                lapack_int lwork, double* rwork );
+
+lapack_int LAPACKE_sgeqlf_work( int matrix_order, lapack_int m, lapack_int n,
+                                float* a, lapack_int lda, float* tau,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dgeqlf_work( int matrix_order, lapack_int m, lapack_int n,
+                                double* a, lapack_int lda, double* tau,
+                                double* work, lapack_int lwork );
+lapack_int LAPACKE_cgeqlf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* tau,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zgeqlf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* tau,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_sgeqp3_work( int matrix_order, lapack_int m, lapack_int n,
+                                float* a, lapack_int lda, lapack_int* jpvt,
+                                float* tau, float* work, lapack_int lwork );
+lapack_int LAPACKE_dgeqp3_work( int matrix_order, lapack_int m, lapack_int n,
+                                double* a, lapack_int lda, lapack_int* jpvt,
+                                double* tau, double* work, lapack_int lwork );
+lapack_int LAPACKE_cgeqp3_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_int* jpvt, lapack_complex_float* tau,
+                                lapack_complex_float* work, lapack_int lwork,
+                                float* rwork );
+lapack_int LAPACKE_zgeqp3_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_int* jpvt, lapack_complex_double* tau,
+                                lapack_complex_double* work, lapack_int lwork,
+                                double* rwork );
+
+lapack_int LAPACKE_sgeqpf_work( int matrix_order, lapack_int m, lapack_int n,
+                                float* a, lapack_int lda, lapack_int* jpvt,
+                                float* tau, float* work );
+lapack_int LAPACKE_dgeqpf_work( int matrix_order, lapack_int m, lapack_int n,
+                                double* a, lapack_int lda, lapack_int* jpvt,
+                                double* tau, double* work );
+lapack_int LAPACKE_cgeqpf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_int* jpvt, lapack_complex_float* tau,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zgeqpf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_int* jpvt, lapack_complex_double* tau,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_sgeqr2_work( int matrix_order, lapack_int m, lapack_int n,
+                                float* a, lapack_int lda, float* tau,
+                                float* work );
+lapack_int LAPACKE_dgeqr2_work( int matrix_order, lapack_int m, lapack_int n,
+                                double* a, lapack_int lda, double* tau,
+                                double* work );
+lapack_int LAPACKE_cgeqr2_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* tau,
+                                lapack_complex_float* work );
+lapack_int LAPACKE_zgeqr2_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* tau,
+                                lapack_complex_double* work );
+
+lapack_int LAPACKE_sgeqrf_work( int matrix_order, lapack_int m, lapack_int n,
+                                float* a, lapack_int lda, float* tau,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dgeqrf_work( int matrix_order, lapack_int m, lapack_int n,
+                                double* a, lapack_int lda, double* tau,
+                                double* work, lapack_int lwork );
+lapack_int LAPACKE_cgeqrf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* tau,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zgeqrf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* tau,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_sgeqrfp_work( int matrix_order, lapack_int m, lapack_int n,
+                                 float* a, lapack_int lda, float* tau,
+                                 float* work, lapack_int lwork );
+lapack_int LAPACKE_dgeqrfp_work( int matrix_order, lapack_int m, lapack_int n,
+                                 double* a, lapack_int lda, double* tau,
+                                 double* work, lapack_int lwork );
+lapack_int LAPACKE_cgeqrfp_work( int matrix_order, lapack_int m, lapack_int n,
+                                 lapack_complex_float* a, lapack_int lda,
+                                 lapack_complex_float* tau,
+                                 lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zgeqrfp_work( int matrix_order, lapack_int m, lapack_int n,
+                                 lapack_complex_double* a, lapack_int lda,
+                                 lapack_complex_double* tau,
+                                 lapack_complex_double* work,
+                                 lapack_int lwork );
+
+lapack_int LAPACKE_sgerfs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int nrhs, const float* a, lapack_int lda,
+                                const float* af, lapack_int ldaf,
+                                const lapack_int* ipiv, const float* b,
+                                lapack_int ldb, float* x, lapack_int ldx,
+                                float* ferr, float* berr, float* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dgerfs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int nrhs, const double* a,
+                                lapack_int lda, const double* af,
+                                lapack_int ldaf, const lapack_int* ipiv,
+                                const double* b, lapack_int ldb, double* x,
+                                lapack_int ldx, double* ferr, double* berr,
+                                double* work, lapack_int* iwork );
+lapack_int LAPACKE_cgerfs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_float* a,
+                                lapack_int lda, const lapack_complex_float* af,
+                                lapack_int ldaf, const lapack_int* ipiv,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* x, lapack_int ldx,
+                                float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zgerfs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_double* a,
+                                lapack_int lda, const lapack_complex_double* af,
+                                lapack_int ldaf, const lapack_int* ipiv,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_sgerfsx_work( int matrix_order, char trans, char equed,
+                                 lapack_int n, lapack_int nrhs, const float* a,
+                                 lapack_int lda, const float* af,
+                                 lapack_int ldaf, const lapack_int* ipiv,
+                                 const float* r, const float* c, const float* b,
+                                 lapack_int ldb, float* x, lapack_int ldx,
+                                 float* rcond, float* berr,
+                                 lapack_int n_err_bnds, float* err_bnds_norm,
+                                 float* err_bnds_comp, lapack_int nparams,
+                                 float* params, float* work,
+                                 lapack_int* iwork );
+lapack_int LAPACKE_dgerfsx_work( int matrix_order, char trans, char equed,
+                                 lapack_int n, lapack_int nrhs, const double* a,
+                                 lapack_int lda, const double* af,
+                                 lapack_int ldaf, const lapack_int* ipiv,
+                                 const double* r, const double* c,
+                                 const double* b, lapack_int ldb, double* x,
+                                 lapack_int ldx, double* rcond, double* berr,
+                                 lapack_int n_err_bnds, double* err_bnds_norm,
+                                 double* err_bnds_comp, lapack_int nparams,
+                                 double* params, double* work,
+                                 lapack_int* iwork );
+lapack_int LAPACKE_cgerfsx_work( int matrix_order, char trans, char equed,
+                                 lapack_int n, lapack_int nrhs,
+                                 const lapack_complex_float* a, lapack_int lda,
+                                 const lapack_complex_float* af,
+                                 lapack_int ldaf, const lapack_int* ipiv,
+                                 const float* r, const float* c,
+                                 const lapack_complex_float* b, lapack_int ldb,
+                                 lapack_complex_float* x, lapack_int ldx,
+                                 float* rcond, float* berr,
+                                 lapack_int n_err_bnds, float* err_bnds_norm,
+                                 float* err_bnds_comp, lapack_int nparams,
+                                 float* params, lapack_complex_float* work,
+                                 float* rwork );
+lapack_int LAPACKE_zgerfsx_work( int matrix_order, char trans, char equed,
+                                 lapack_int n, lapack_int nrhs,
+                                 const lapack_complex_double* a, lapack_int lda,
+                                 const lapack_complex_double* af,
+                                 lapack_int ldaf, const lapack_int* ipiv,
+                                 const double* r, const double* c,
+                                 const lapack_complex_double* b, lapack_int ldb,
+                                 lapack_complex_double* x, lapack_int ldx,
+                                 double* rcond, double* berr,
+                                 lapack_int n_err_bnds, double* err_bnds_norm,
+                                 double* err_bnds_comp, lapack_int nparams,
+                                 double* params, lapack_complex_double* work,
+                                 double* rwork );
+
+lapack_int LAPACKE_sgerqf_work( int matrix_order, lapack_int m, lapack_int n,
+                                float* a, lapack_int lda, float* tau,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dgerqf_work( int matrix_order, lapack_int m, lapack_int n,
+                                double* a, lapack_int lda, double* tau,
+                                double* work, lapack_int lwork );
+lapack_int LAPACKE_cgerqf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* tau,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zgerqf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* tau,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_sgesdd_work( int matrix_order, char jobz, lapack_int m,
+                                lapack_int n, float* a, lapack_int lda,
+                                float* s, float* u, lapack_int ldu, float* vt,
+                                lapack_int ldvt, float* work, lapack_int lwork,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dgesdd_work( int matrix_order, char jobz, lapack_int m,
+                                lapack_int n, double* a, lapack_int lda,
+                                double* s, double* u, lapack_int ldu,
+                                double* vt, lapack_int ldvt, double* work,
+                                lapack_int lwork, lapack_int* iwork );
+lapack_int LAPACKE_cgesdd_work( int matrix_order, char jobz, lapack_int m,
+                                lapack_int n, lapack_complex_float* a,
+                                lapack_int lda, float* s,
+                                lapack_complex_float* u, lapack_int ldu,
+                                lapack_complex_float* vt, lapack_int ldvt,
+                                lapack_complex_float* work, lapack_int lwork,
+                                float* rwork, lapack_int* iwork );
+lapack_int LAPACKE_zgesdd_work( int matrix_order, char jobz, lapack_int m,
+                                lapack_int n, lapack_complex_double* a,
+                                lapack_int lda, double* s,
+                                lapack_complex_double* u, lapack_int ldu,
+                                lapack_complex_double* vt, lapack_int ldvt,
+                                lapack_complex_double* work, lapack_int lwork,
+                                double* rwork, lapack_int* iwork );
+
+lapack_int LAPACKE_sgesv_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                               float* a, lapack_int lda, lapack_int* ipiv,
+                               float* b, lapack_int ldb );
+lapack_int LAPACKE_dgesv_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                               double* a, lapack_int lda, lapack_int* ipiv,
+                               double* b, lapack_int ldb );
+lapack_int LAPACKE_cgesv_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                               lapack_complex_float* a, lapack_int lda,
+                               lapack_int* ipiv, lapack_complex_float* b,
+                               lapack_int ldb );
+lapack_int LAPACKE_zgesv_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                               lapack_complex_double* a, lapack_int lda,
+                               lapack_int* ipiv, lapack_complex_double* b,
+                               lapack_int ldb );
+lapack_int LAPACKE_dsgesv_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                                double* a, lapack_int lda, lapack_int* ipiv,
+                                double* b, lapack_int ldb, double* x,
+                                lapack_int ldx, double* work, float* swork,
+                                lapack_int* iter );
+lapack_int LAPACKE_zcgesv_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_int* ipiv, lapack_complex_double* b,
+                                lapack_int ldb, lapack_complex_double* x,
+                                lapack_int ldx, lapack_complex_double* work,
+                                lapack_complex_float* swork, double* rwork,
+                                lapack_int* iter );
+
+lapack_int LAPACKE_sgesvd_work( int matrix_order, char jobu, char jobvt,
+                                lapack_int m, lapack_int n, float* a,
+                                lapack_int lda, float* s, float* u,
+                                lapack_int ldu, float* vt, lapack_int ldvt,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dgesvd_work( int matrix_order, char jobu, char jobvt,
+                                lapack_int m, lapack_int n, double* a,
+                                lapack_int lda, double* s, double* u,
+                                lapack_int ldu, double* vt, lapack_int ldvt,
+                                double* work, lapack_int lwork );
+lapack_int LAPACKE_cgesvd_work( int matrix_order, char jobu, char jobvt,
+                                lapack_int m, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                float* s, lapack_complex_float* u,
+                                lapack_int ldu, lapack_complex_float* vt,
+                                lapack_int ldvt, lapack_complex_float* work,
+                                lapack_int lwork, float* rwork );
+lapack_int LAPACKE_zgesvd_work( int matrix_order, char jobu, char jobvt,
+                                lapack_int m, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                double* s, lapack_complex_double* u,
+                                lapack_int ldu, lapack_complex_double* vt,
+                                lapack_int ldvt, lapack_complex_double* work,
+                                lapack_int lwork, double* rwork );
+
+lapack_int LAPACKE_sgesvj_work( int matrix_order, char joba, char jobu,
+                                char jobv, lapack_int m, lapack_int n, float* a,
+                                lapack_int lda, float* sva, lapack_int mv,
+                                float* v, lapack_int ldv, float* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_dgesvj_work( int matrix_order, char joba, char jobu,
+                                char jobv, lapack_int m, lapack_int n,
+                                double* a, lapack_int lda, double* sva,
+                                lapack_int mv, double* v, lapack_int ldv,
+                                double* work, lapack_int lwork );
+
+lapack_int LAPACKE_sgesvx_work( int matrix_order, char fact, char trans,
+                                lapack_int n, lapack_int nrhs, float* a,
+                                lapack_int lda, float* af, lapack_int ldaf,
+                                lapack_int* ipiv, char* equed, float* r,
+                                float* c, float* b, lapack_int ldb, float* x,
+                                lapack_int ldx, float* rcond, float* ferr,
+                                float* berr, float* work, lapack_int* iwork );
+lapack_int LAPACKE_dgesvx_work( int matrix_order, char fact, char trans,
+                                lapack_int n, lapack_int nrhs, double* a,
+                                lapack_int lda, double* af, lapack_int ldaf,
+                                lapack_int* ipiv, char* equed, double* r,
+                                double* c, double* b, lapack_int ldb, double* x,
+                                lapack_int ldx, double* rcond, double* ferr,
+                                double* berr, double* work, lapack_int* iwork );
+lapack_int LAPACKE_cgesvx_work( int matrix_order, char fact, char trans,
+                                lapack_int n, lapack_int nrhs,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* af, lapack_int ldaf,
+                                lapack_int* ipiv, char* equed, float* r,
+                                float* c, lapack_complex_float* b,
+                                lapack_int ldb, lapack_complex_float* x,
+                                lapack_int ldx, float* rcond, float* ferr,
+                                float* berr, lapack_complex_float* work,
+                                float* rwork );
+lapack_int LAPACKE_zgesvx_work( int matrix_order, char fact, char trans,
+                                lapack_int n, lapack_int nrhs,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* af, lapack_int ldaf,
+                                lapack_int* ipiv, char* equed, double* r,
+                                double* c, lapack_complex_double* b,
+                                lapack_int ldb, lapack_complex_double* x,
+                                lapack_int ldx, double* rcond, double* ferr,
+                                double* berr, lapack_complex_double* work,
+                                double* rwork );
+
+lapack_int LAPACKE_sgesvxx_work( int matrix_order, char fact, char trans,
+                                 lapack_int n, lapack_int nrhs, float* a,
+                                 lapack_int lda, float* af, lapack_int ldaf,
+                                 lapack_int* ipiv, char* equed, float* r,
+                                 float* c, float* b, lapack_int ldb, float* x,
+                                 lapack_int ldx, float* rcond, float* rpvgrw,
+                                 float* berr, lapack_int n_err_bnds,
+                                 float* err_bnds_norm, float* err_bnds_comp,
+                                 lapack_int nparams, float* params, float* work,
+                                 lapack_int* iwork );
+lapack_int LAPACKE_dgesvxx_work( int matrix_order, char fact, char trans,
+                                 lapack_int n, lapack_int nrhs, double* a,
+                                 lapack_int lda, double* af, lapack_int ldaf,
+                                 lapack_int* ipiv, char* equed, double* r,
+                                 double* c, double* b, lapack_int ldb,
+                                 double* x, lapack_int ldx, double* rcond,
+                                 double* rpvgrw, double* berr,
+                                 lapack_int n_err_bnds, double* err_bnds_norm,
+                                 double* err_bnds_comp, lapack_int nparams,
+                                 double* params, double* work,
+                                 lapack_int* iwork );
+lapack_int LAPACKE_cgesvxx_work( int matrix_order, char fact, char trans,
+                                 lapack_int n, lapack_int nrhs,
+                                 lapack_complex_float* a, lapack_int lda,
+                                 lapack_complex_float* af, lapack_int ldaf,
+                                 lapack_int* ipiv, char* equed, float* r,
+                                 float* c, lapack_complex_float* b,
+                                 lapack_int ldb, lapack_complex_float* x,
+                                 lapack_int ldx, float* rcond, float* rpvgrw,
+                                 float* berr, lapack_int n_err_bnds,
+                                 float* err_bnds_norm, float* err_bnds_comp,
+                                 lapack_int nparams, float* params,
+                                 lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zgesvxx_work( int matrix_order, char fact, char trans,
+                                 lapack_int n, lapack_int nrhs,
+                                 lapack_complex_double* a, lapack_int lda,
+                                 lapack_complex_double* af, lapack_int ldaf,
+                                 lapack_int* ipiv, char* equed, double* r,
+                                 double* c, lapack_complex_double* b,
+                                 lapack_int ldb, lapack_complex_double* x,
+                                 lapack_int ldx, double* rcond, double* rpvgrw,
+                                 double* berr, lapack_int n_err_bnds,
+                                 double* err_bnds_norm, double* err_bnds_comp,
+                                 lapack_int nparams, double* params,
+                                 lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_sgetf2_work( int matrix_order, lapack_int m, lapack_int n,
+                                float* a, lapack_int lda, lapack_int* ipiv );
+lapack_int LAPACKE_dgetf2_work( int matrix_order, lapack_int m, lapack_int n,
+                                double* a, lapack_int lda, lapack_int* ipiv );
+lapack_int LAPACKE_cgetf2_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_int* ipiv );
+lapack_int LAPACKE_zgetf2_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_int* ipiv );
+
+lapack_int LAPACKE_sgetrf_work( int matrix_order, lapack_int m, lapack_int n,
+                                float* a, lapack_int lda, lapack_int* ipiv );
+lapack_int LAPACKE_dgetrf_work( int matrix_order, lapack_int m, lapack_int n,
+                                double* a, lapack_int lda, lapack_int* ipiv );
+lapack_int LAPACKE_cgetrf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_int* ipiv );
+lapack_int LAPACKE_zgetrf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_int* ipiv );
+
+lapack_int LAPACKE_sgetri_work( int matrix_order, lapack_int n, float* a,
+                                lapack_int lda, const lapack_int* ipiv,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dgetri_work( int matrix_order, lapack_int n, double* a,
+                                lapack_int lda, const lapack_int* ipiv,
+                                double* work, lapack_int lwork );
+lapack_int LAPACKE_cgetri_work( int matrix_order, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                const lapack_int* ipiv,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zgetri_work( int matrix_order, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                const lapack_int* ipiv,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_sgetrs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int nrhs, const float* a, lapack_int lda,
+                                const lapack_int* ipiv, float* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_dgetrs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int nrhs, const double* a,
+                                lapack_int lda, const lapack_int* ipiv,
+                                double* b, lapack_int ldb );
+lapack_int LAPACKE_cgetrs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_float* a,
+                                lapack_int lda, const lapack_int* ipiv,
+                                lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zgetrs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_double* a,
+                                lapack_int lda, const lapack_int* ipiv,
+                                lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_sggbak_work( int matrix_order, char job, char side,
+                                lapack_int n, lapack_int ilo, lapack_int ihi,
+                                const float* lscale, const float* rscale,
+                                lapack_int m, float* v, lapack_int ldv );
+lapack_int LAPACKE_dggbak_work( int matrix_order, char job, char side,
+                                lapack_int n, lapack_int ilo, lapack_int ihi,
+                                const double* lscale, const double* rscale,
+                                lapack_int m, double* v, lapack_int ldv );
+lapack_int LAPACKE_cggbak_work( int matrix_order, char job, char side,
+                                lapack_int n, lapack_int ilo, lapack_int ihi,
+                                const float* lscale, const float* rscale,
+                                lapack_int m, lapack_complex_float* v,
+                                lapack_int ldv );
+lapack_int LAPACKE_zggbak_work( int matrix_order, char job, char side,
+                                lapack_int n, lapack_int ilo, lapack_int ihi,
+                                const double* lscale, const double* rscale,
+                                lapack_int m, lapack_complex_double* v,
+                                lapack_int ldv );
+
+lapack_int LAPACKE_sggbal_work( int matrix_order, char job, lapack_int n,
+                                float* a, lapack_int lda, float* b,
+                                lapack_int ldb, lapack_int* ilo,
+                                lapack_int* ihi, float* lscale, float* rscale,
+                                float* work );
+lapack_int LAPACKE_dggbal_work( int matrix_order, char job, lapack_int n,
+                                double* a, lapack_int lda, double* b,
+                                lapack_int ldb, lapack_int* ilo,
+                                lapack_int* ihi, double* lscale, double* rscale,
+                                double* work );
+lapack_int LAPACKE_cggbal_work( int matrix_order, char job, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* b, lapack_int ldb,
+                                lapack_int* ilo, lapack_int* ihi, float* lscale,
+                                float* rscale, float* work );
+lapack_int LAPACKE_zggbal_work( int matrix_order, char job, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* b, lapack_int ldb,
+                                lapack_int* ilo, lapack_int* ihi,
+                                double* lscale, double* rscale, double* work );
+
+lapack_int LAPACKE_sgges_work( int matrix_order, char jobvsl, char jobvsr,
+                               char sort, LAPACK_S_SELECT3 selctg, lapack_int n,
+                               float* a, lapack_int lda, float* b,
+                               lapack_int ldb, lapack_int* sdim, float* alphar,
+                               float* alphai, float* beta, float* vsl,
+                               lapack_int ldvsl, float* vsr, lapack_int ldvsr,
+                               float* work, lapack_int lwork,
+                               lapack_logical* bwork );
+lapack_int LAPACKE_dgges_work( int matrix_order, char jobvsl, char jobvsr,
+                               char sort, LAPACK_D_SELECT3 selctg, lapack_int n,
+                               double* a, lapack_int lda, double* b,
+                               lapack_int ldb, lapack_int* sdim, double* alphar,
+                               double* alphai, double* beta, double* vsl,
+                               lapack_int ldvsl, double* vsr, lapack_int ldvsr,
+                               double* work, lapack_int lwork,
+                               lapack_logical* bwork );
+lapack_int LAPACKE_cgges_work( int matrix_order, char jobvsl, char jobvsr,
+                               char sort, LAPACK_C_SELECT2 selctg, lapack_int n,
+                               lapack_complex_float* a, lapack_int lda,
+                               lapack_complex_float* b, lapack_int ldb,
+                               lapack_int* sdim, lapack_complex_float* alpha,
+                               lapack_complex_float* beta,
+                               lapack_complex_float* vsl, lapack_int ldvsl,
+                               lapack_complex_float* vsr, lapack_int ldvsr,
+                               lapack_complex_float* work, lapack_int lwork,
+                               float* rwork, lapack_logical* bwork );
+lapack_int LAPACKE_zgges_work( int matrix_order, char jobvsl, char jobvsr,
+                               char sort, LAPACK_Z_SELECT2 selctg, lapack_int n,
+                               lapack_complex_double* a, lapack_int lda,
+                               lapack_complex_double* b, lapack_int ldb,
+                               lapack_int* sdim, lapack_complex_double* alpha,
+                               lapack_complex_double* beta,
+                               lapack_complex_double* vsl, lapack_int ldvsl,
+                               lapack_complex_double* vsr, lapack_int ldvsr,
+                               lapack_complex_double* work, lapack_int lwork,
+                               double* rwork, lapack_logical* bwork );
+
+lapack_int LAPACKE_sggesx_work( int matrix_order, char jobvsl, char jobvsr,
+                                char sort, LAPACK_S_SELECT3 selctg, char sense,
+                                lapack_int n, float* a, lapack_int lda,
+                                float* b, lapack_int ldb, lapack_int* sdim,
+                                float* alphar, float* alphai, float* beta,
+                                float* vsl, lapack_int ldvsl, float* vsr,
+                                lapack_int ldvsr, float* rconde, float* rcondv,
+                                float* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork,
+                                lapack_logical* bwork );
+lapack_int LAPACKE_dggesx_work( int matrix_order, char jobvsl, char jobvsr,
+                                char sort, LAPACK_D_SELECT3 selctg, char sense,
+                                lapack_int n, double* a, lapack_int lda,
+                                double* b, lapack_int ldb, lapack_int* sdim,
+                                double* alphar, double* alphai, double* beta,
+                                double* vsl, lapack_int ldvsl, double* vsr,
+                                lapack_int ldvsr, double* rconde,
+                                double* rcondv, double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork,
+                                lapack_logical* bwork );
+lapack_int LAPACKE_cggesx_work( int matrix_order, char jobvsl, char jobvsr,
+                                char sort, LAPACK_C_SELECT2 selctg, char sense,
+                                lapack_int n, lapack_complex_float* a,
+                                lapack_int lda, lapack_complex_float* b,
+                                lapack_int ldb, lapack_int* sdim,
+                                lapack_complex_float* alpha,
+                                lapack_complex_float* beta,
+                                lapack_complex_float* vsl, lapack_int ldvsl,
+                                lapack_complex_float* vsr, lapack_int ldvsr,
+                                float* rconde, float* rcondv,
+                                lapack_complex_float* work, lapack_int lwork,
+                                float* rwork, lapack_int* iwork,
+                                lapack_int liwork, lapack_logical* bwork );
+lapack_int LAPACKE_zggesx_work( int matrix_order, char jobvsl, char jobvsr,
+                                char sort, LAPACK_Z_SELECT2 selctg, char sense,
+                                lapack_int n, lapack_complex_double* a,
+                                lapack_int lda, lapack_complex_double* b,
+                                lapack_int ldb, lapack_int* sdim,
+                                lapack_complex_double* alpha,
+                                lapack_complex_double* beta,
+                                lapack_complex_double* vsl, lapack_int ldvsl,
+                                lapack_complex_double* vsr, lapack_int ldvsr,
+                                double* rconde, double* rcondv,
+                                lapack_complex_double* work, lapack_int lwork,
+                                double* rwork, lapack_int* iwork,
+                                lapack_int liwork, lapack_logical* bwork );
+
+lapack_int LAPACKE_sggev_work( int matrix_order, char jobvl, char jobvr,
+                               lapack_int n, float* a, lapack_int lda, float* b,
+                               lapack_int ldb, float* alphar, float* alphai,
+                               float* beta, float* vl, lapack_int ldvl,
+                               float* vr, lapack_int ldvr, float* work,
+                               lapack_int lwork );
+lapack_int LAPACKE_dggev_work( int matrix_order, char jobvl, char jobvr,
+                               lapack_int n, double* a, lapack_int lda,
+                               double* b, lapack_int ldb, double* alphar,
+                               double* alphai, double* beta, double* vl,
+                               lapack_int ldvl, double* vr, lapack_int ldvr,
+                               double* work, lapack_int lwork );
+lapack_int LAPACKE_cggev_work( int matrix_order, char jobvl, char jobvr,
+                               lapack_int n, lapack_complex_float* a,
+                               lapack_int lda, lapack_complex_float* b,
+                               lapack_int ldb, lapack_complex_float* alpha,
+                               lapack_complex_float* beta,
+                               lapack_complex_float* vl, lapack_int ldvl,
+                               lapack_complex_float* vr, lapack_int ldvr,
+                               lapack_complex_float* work, lapack_int lwork,
+                               float* rwork );
+lapack_int LAPACKE_zggev_work( int matrix_order, char jobvl, char jobvr,
+                               lapack_int n, lapack_complex_double* a,
+                               lapack_int lda, lapack_complex_double* b,
+                               lapack_int ldb, lapack_complex_double* alpha,
+                               lapack_complex_double* beta,
+                               lapack_complex_double* vl, lapack_int ldvl,
+                               lapack_complex_double* vr, lapack_int ldvr,
+                               lapack_complex_double* work, lapack_int lwork,
+                               double* rwork );
+
+lapack_int LAPACKE_sggevx_work( int matrix_order, char balanc, char jobvl,
+                                char jobvr, char sense, lapack_int n, float* a,
+                                lapack_int lda, float* b, lapack_int ldb,
+                                float* alphar, float* alphai, float* beta,
+                                float* vl, lapack_int ldvl, float* vr,
+                                lapack_int ldvr, lapack_int* ilo,
+                                lapack_int* ihi, float* lscale, float* rscale,
+                                float* abnrm, float* bbnrm, float* rconde,
+                                float* rcondv, float* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_logical* bwork );
+lapack_int LAPACKE_dggevx_work( int matrix_order, char balanc, char jobvl,
+                                char jobvr, char sense, lapack_int n, double* a,
+                                lapack_int lda, double* b, lapack_int ldb,
+                                double* alphar, double* alphai, double* beta,
+                                double* vl, lapack_int ldvl, double* vr,
+                                lapack_int ldvr, lapack_int* ilo,
+                                lapack_int* ihi, double* lscale, double* rscale,
+                                double* abnrm, double* bbnrm, double* rconde,
+                                double* rcondv, double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_logical* bwork );
+lapack_int LAPACKE_cggevx_work( int matrix_order, char balanc, char jobvl,
+                                char jobvr, char sense, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* alpha,
+                                lapack_complex_float* beta,
+                                lapack_complex_float* vl, lapack_int ldvl,
+                                lapack_complex_float* vr, lapack_int ldvr,
+                                lapack_int* ilo, lapack_int* ihi, float* lscale,
+                                float* rscale, float* abnrm, float* bbnrm,
+                                float* rconde, float* rcondv,
+                                lapack_complex_float* work, lapack_int lwork,
+                                float* rwork, lapack_int* iwork,
+                                lapack_logical* bwork );
+lapack_int LAPACKE_zggevx_work( int matrix_order, char balanc, char jobvl,
+                                char jobvr, char sense, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* alpha,
+                                lapack_complex_double* beta,
+                                lapack_complex_double* vl, lapack_int ldvl,
+                                lapack_complex_double* vr, lapack_int ldvr,
+                                lapack_int* ilo, lapack_int* ihi,
+                                double* lscale, double* rscale, double* abnrm,
+                                double* bbnrm, double* rconde, double* rcondv,
+                                lapack_complex_double* work, lapack_int lwork,
+                                double* rwork, lapack_int* iwork,
+                                lapack_logical* bwork );
+
+lapack_int LAPACKE_sggglm_work( int matrix_order, lapack_int n, lapack_int m,
+                                lapack_int p, float* a, lapack_int lda,
+                                float* b, lapack_int ldb, float* d, float* x,
+                                float* y, float* work, lapack_int lwork );
+lapack_int LAPACKE_dggglm_work( int matrix_order, lapack_int n, lapack_int m,
+                                lapack_int p, double* a, lapack_int lda,
+                                double* b, lapack_int ldb, double* d, double* x,
+                                double* y, double* work, lapack_int lwork );
+lapack_int LAPACKE_cggglm_work( int matrix_order, lapack_int n, lapack_int m,
+                                lapack_int p, lapack_complex_float* a,
+                                lapack_int lda, lapack_complex_float* b,
+                                lapack_int ldb, lapack_complex_float* d,
+                                lapack_complex_float* x,
+                                lapack_complex_float* y,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zggglm_work( int matrix_order, lapack_int n, lapack_int m,
+                                lapack_int p, lapack_complex_double* a,
+                                lapack_int lda, lapack_complex_double* b,
+                                lapack_int ldb, lapack_complex_double* d,
+                                lapack_complex_double* x,
+                                lapack_complex_double* y,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_sgghrd_work( int matrix_order, char compq, char compz,
+                                lapack_int n, lapack_int ilo, lapack_int ihi,
+                                float* a, lapack_int lda, float* b,
+                                lapack_int ldb, float* q, lapack_int ldq,
+                                float* z, lapack_int ldz );
+lapack_int LAPACKE_dgghrd_work( int matrix_order, char compq, char compz,
+                                lapack_int n, lapack_int ilo, lapack_int ihi,
+                                double* a, lapack_int lda, double* b,
+                                lapack_int ldb, double* q, lapack_int ldq,
+                                double* z, lapack_int ldz );
+lapack_int LAPACKE_cgghrd_work( int matrix_order, char compq, char compz,
+                                lapack_int n, lapack_int ilo, lapack_int ihi,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* q, lapack_int ldq,
+                                lapack_complex_float* z, lapack_int ldz );
+lapack_int LAPACKE_zgghrd_work( int matrix_order, char compq, char compz,
+                                lapack_int n, lapack_int ilo, lapack_int ihi,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* q, lapack_int ldq,
+                                lapack_complex_double* z, lapack_int ldz );
+
+lapack_int LAPACKE_sgglse_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int p, float* a, lapack_int lda,
+                                float* b, lapack_int ldb, float* c, float* d,
+                                float* x, float* work, lapack_int lwork );
+lapack_int LAPACKE_dgglse_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int p, double* a, lapack_int lda,
+                                double* b, lapack_int ldb, double* c, double* d,
+                                double* x, double* work, lapack_int lwork );
+lapack_int LAPACKE_cgglse_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int p, lapack_complex_float* a,
+                                lapack_int lda, lapack_complex_float* b,
+                                lapack_int ldb, lapack_complex_float* c,
+                                lapack_complex_float* d,
+                                lapack_complex_float* x,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zgglse_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int p, lapack_complex_double* a,
+                                lapack_int lda, lapack_complex_double* b,
+                                lapack_int ldb, lapack_complex_double* c,
+                                lapack_complex_double* d,
+                                lapack_complex_double* x,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_sggqrf_work( int matrix_order, lapack_int n, lapack_int m,
+                                lapack_int p, float* a, lapack_int lda,
+                                float* taua, float* b, lapack_int ldb,
+                                float* taub, float* work, lapack_int lwork );
+lapack_int LAPACKE_dggqrf_work( int matrix_order, lapack_int n, lapack_int m,
+                                lapack_int p, double* a, lapack_int lda,
+                                double* taua, double* b, lapack_int ldb,
+                                double* taub, double* work, lapack_int lwork );
+lapack_int LAPACKE_cggqrf_work( int matrix_order, lapack_int n, lapack_int m,
+                                lapack_int p, lapack_complex_float* a,
+                                lapack_int lda, lapack_complex_float* taua,
+                                lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* taub,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zggqrf_work( int matrix_order, lapack_int n, lapack_int m,
+                                lapack_int p, lapack_complex_double* a,
+                                lapack_int lda, lapack_complex_double* taua,
+                                lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* taub,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_sggrqf_work( int matrix_order, lapack_int m, lapack_int p,
+                                lapack_int n, float* a, lapack_int lda,
+                                float* taua, float* b, lapack_int ldb,
+                                float* taub, float* work, lapack_int lwork );
+lapack_int LAPACKE_dggrqf_work( int matrix_order, lapack_int m, lapack_int p,
+                                lapack_int n, double* a, lapack_int lda,
+                                double* taua, double* b, lapack_int ldb,
+                                double* taub, double* work, lapack_int lwork );
+lapack_int LAPACKE_cggrqf_work( int matrix_order, lapack_int m, lapack_int p,
+                                lapack_int n, lapack_complex_float* a,
+                                lapack_int lda, lapack_complex_float* taua,
+                                lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* taub,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zggrqf_work( int matrix_order, lapack_int m, lapack_int p,
+                                lapack_int n, lapack_complex_double* a,
+                                lapack_int lda, lapack_complex_double* taua,
+                                lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* taub,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_sggsvd_work( int matrix_order, char jobu, char jobv,
+                                char jobq, lapack_int m, lapack_int n,
+                                lapack_int p, lapack_int* k, lapack_int* l,
+                                float* a, lapack_int lda, float* b,
+                                lapack_int ldb, float* alpha, float* beta,
+                                float* u, lapack_int ldu, float* v,
+                                lapack_int ldv, float* q, lapack_int ldq,
+                                float* work, lapack_int* iwork );
+lapack_int LAPACKE_dggsvd_work( int matrix_order, char jobu, char jobv,
+                                char jobq, lapack_int m, lapack_int n,
+                                lapack_int p, lapack_int* k, lapack_int* l,
+                                double* a, lapack_int lda, double* b,
+                                lapack_int ldb, double* alpha, double* beta,
+                                double* u, lapack_int ldu, double* v,
+                                lapack_int ldv, double* q, lapack_int ldq,
+                                double* work, lapack_int* iwork );
+lapack_int LAPACKE_cggsvd_work( int matrix_order, char jobu, char jobv,
+                                char jobq, lapack_int m, lapack_int n,
+                                lapack_int p, lapack_int* k, lapack_int* l,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* b, lapack_int ldb,
+                                float* alpha, float* beta,
+                                lapack_complex_float* u, lapack_int ldu,
+                                lapack_complex_float* v, lapack_int ldv,
+                                lapack_complex_float* q, lapack_int ldq,
+                                lapack_complex_float* work, float* rwork,
+                                lapack_int* iwork );
+lapack_int LAPACKE_zggsvd_work( int matrix_order, char jobu, char jobv,
+                                char jobq, lapack_int m, lapack_int n,
+                                lapack_int p, lapack_int* k, lapack_int* l,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* b, lapack_int ldb,
+                                double* alpha, double* beta,
+                                lapack_complex_double* u, lapack_int ldu,
+                                lapack_complex_double* v, lapack_int ldv,
+                                lapack_complex_double* q, lapack_int ldq,
+                                lapack_complex_double* work, double* rwork,
+                                lapack_int* iwork );
+
+lapack_int LAPACKE_sggsvp_work( int matrix_order, char jobu, char jobv,
+                                char jobq, lapack_int m, lapack_int p,
+                                lapack_int n, float* a, lapack_int lda,
+                                float* b, lapack_int ldb, float tola,
+                                float tolb, lapack_int* k, lapack_int* l,
+                                float* u, lapack_int ldu, float* v,
+                                lapack_int ldv, float* q, lapack_int ldq,
+                                lapack_int* iwork, float* tau, float* work );
+lapack_int LAPACKE_dggsvp_work( int matrix_order, char jobu, char jobv,
+                                char jobq, lapack_int m, lapack_int p,
+                                lapack_int n, double* a, lapack_int lda,
+                                double* b, lapack_int ldb, double tola,
+                                double tolb, lapack_int* k, lapack_int* l,
+                                double* u, lapack_int ldu, double* v,
+                                lapack_int ldv, double* q, lapack_int ldq,
+                                lapack_int* iwork, double* tau, double* work );
+lapack_int LAPACKE_cggsvp_work( int matrix_order, char jobu, char jobv,
+                                char jobq, lapack_int m, lapack_int p,
+                                lapack_int n, lapack_complex_float* a,
+                                lapack_int lda, lapack_complex_float* b,
+                                lapack_int ldb, float tola, float tolb,
+                                lapack_int* k, lapack_int* l,
+                                lapack_complex_float* u, lapack_int ldu,
+                                lapack_complex_float* v, lapack_int ldv,
+                                lapack_complex_float* q, lapack_int ldq,
+                                lapack_int* iwork, float* rwork,
+                                lapack_complex_float* tau,
+                                lapack_complex_float* work );
+lapack_int LAPACKE_zggsvp_work( int matrix_order, char jobu, char jobv,
+                                char jobq, lapack_int m, lapack_int p,
+                                lapack_int n, lapack_complex_double* a,
+                                lapack_int lda, lapack_complex_double* b,
+                                lapack_int ldb, double tola, double tolb,
+                                lapack_int* k, lapack_int* l,
+                                lapack_complex_double* u, lapack_int ldu,
+                                lapack_complex_double* v, lapack_int ldv,
+                                lapack_complex_double* q, lapack_int ldq,
+                                lapack_int* iwork, double* rwork,
+                                lapack_complex_double* tau,
+                                lapack_complex_double* work );
+
+lapack_int LAPACKE_sgtcon_work( char norm, lapack_int n, const float* dl,
+                                const float* d, const float* du,
+                                const float* du2, const lapack_int* ipiv,
+                                float anorm, float* rcond, float* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dgtcon_work( char norm, lapack_int n, const double* dl,
+                                const double* d, const double* du,
+                                const double* du2, const lapack_int* ipiv,
+                                double anorm, double* rcond, double* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_cgtcon_work( char norm, lapack_int n,
+                                const lapack_complex_float* dl,
+                                const lapack_complex_float* d,
+                                const lapack_complex_float* du,
+                                const lapack_complex_float* du2,
+                                const lapack_int* ipiv, float anorm,
+                                float* rcond, lapack_complex_float* work );
+lapack_int LAPACKE_zgtcon_work( char norm, lapack_int n,
+                                const lapack_complex_double* dl,
+                                const lapack_complex_double* d,
+                                const lapack_complex_double* du,
+                                const lapack_complex_double* du2,
+                                const lapack_int* ipiv, double anorm,
+                                double* rcond, lapack_complex_double* work );
+
+lapack_int LAPACKE_sgtrfs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int nrhs, const float* dl,
+                                const float* d, const float* du,
+                                const float* dlf, const float* df,
+                                const float* duf, const float* du2,
+                                const lapack_int* ipiv, const float* b,
+                                lapack_int ldb, float* x, lapack_int ldx,
+                                float* ferr, float* berr, float* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dgtrfs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int nrhs, const double* dl,
+                                const double* d, const double* du,
+                                const double* dlf, const double* df,
+                                const double* duf, const double* du2,
+                                const lapack_int* ipiv, const double* b,
+                                lapack_int ldb, double* x, lapack_int ldx,
+                                double* ferr, double* berr, double* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_cgtrfs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_float* dl,
+                                const lapack_complex_float* d,
+                                const lapack_complex_float* du,
+                                const lapack_complex_float* dlf,
+                                const lapack_complex_float* df,
+                                const lapack_complex_float* duf,
+                                const lapack_complex_float* du2,
+                                const lapack_int* ipiv,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* x, lapack_int ldx,
+                                float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zgtrfs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int nrhs,
+                                const lapack_complex_double* dl,
+                                const lapack_complex_double* d,
+                                const lapack_complex_double* du,
+                                const lapack_complex_double* dlf,
+                                const lapack_complex_double* df,
+                                const lapack_complex_double* duf,
+                                const lapack_complex_double* du2,
+                                const lapack_int* ipiv,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_sgtsv_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                               float* dl, float* d, float* du, float* b,
+                               lapack_int ldb );
+lapack_int LAPACKE_dgtsv_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                               double* dl, double* d, double* du, double* b,
+                               lapack_int ldb );
+lapack_int LAPACKE_cgtsv_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                               lapack_complex_float* dl,
+                               lapack_complex_float* d,
+                               lapack_complex_float* du,
+                               lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zgtsv_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                               lapack_complex_double* dl,
+                               lapack_complex_double* d,
+                               lapack_complex_double* du,
+                               lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_sgtsvx_work( int matrix_order, char fact, char trans,
+                                lapack_int n, lapack_int nrhs, const float* dl,
+                                const float* d, const float* du, float* dlf,
+                                float* df, float* duf, float* du2,
+                                lapack_int* ipiv, const float* b,
+                                lapack_int ldb, float* x, lapack_int ldx,
+                                float* rcond, float* ferr, float* berr,
+                                float* work, lapack_int* iwork );
+lapack_int LAPACKE_dgtsvx_work( int matrix_order, char fact, char trans,
+                                lapack_int n, lapack_int nrhs, const double* dl,
+                                const double* d, const double* du, double* dlf,
+                                double* df, double* duf, double* du2,
+                                lapack_int* ipiv, const double* b,
+                                lapack_int ldb, double* x, lapack_int ldx,
+                                double* rcond, double* ferr, double* berr,
+                                double* work, lapack_int* iwork );
+lapack_int LAPACKE_cgtsvx_work( int matrix_order, char fact, char trans,
+                                lapack_int n, lapack_int nrhs,
+                                const lapack_complex_float* dl,
+                                const lapack_complex_float* d,
+                                const lapack_complex_float* du,
+                                lapack_complex_float* dlf,
+                                lapack_complex_float* df,
+                                lapack_complex_float* duf,
+                                lapack_complex_float* du2, lapack_int* ipiv,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* x, lapack_int ldx,
+                                float* rcond, float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zgtsvx_work( int matrix_order, char fact, char trans,
+                                lapack_int n, lapack_int nrhs,
+                                const lapack_complex_double* dl,
+                                const lapack_complex_double* d,
+                                const lapack_complex_double* du,
+                                lapack_complex_double* dlf,
+                                lapack_complex_double* df,
+                                lapack_complex_double* duf,
+                                lapack_complex_double* du2, lapack_int* ipiv,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* rcond, double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_sgttrf_work( lapack_int n, float* dl, float* d, float* du,
+                                float* du2, lapack_int* ipiv );
+lapack_int LAPACKE_dgttrf_work( lapack_int n, double* dl, double* d, double* du,
+                                double* du2, lapack_int* ipiv );
+lapack_int LAPACKE_cgttrf_work( lapack_int n, lapack_complex_float* dl,
+                                lapack_complex_float* d,
+                                lapack_complex_float* du,
+                                lapack_complex_float* du2, lapack_int* ipiv );
+lapack_int LAPACKE_zgttrf_work( lapack_int n, lapack_complex_double* dl,
+                                lapack_complex_double* d,
+                                lapack_complex_double* du,
+                                lapack_complex_double* du2, lapack_int* ipiv );
+
+lapack_int LAPACKE_sgttrs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int nrhs, const float* dl,
+                                const float* d, const float* du,
+                                const float* du2, const lapack_int* ipiv,
+                                float* b, lapack_int ldb );
+lapack_int LAPACKE_dgttrs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int nrhs, const double* dl,
+                                const double* d, const double* du,
+                                const double* du2, const lapack_int* ipiv,
+                                double* b, lapack_int ldb );
+lapack_int LAPACKE_cgttrs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_float* dl,
+                                const lapack_complex_float* d,
+                                const lapack_complex_float* du,
+                                const lapack_complex_float* du2,
+                                const lapack_int* ipiv, lapack_complex_float* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_zgttrs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int nrhs,
+                                const lapack_complex_double* dl,
+                                const lapack_complex_double* d,
+                                const lapack_complex_double* du,
+                                const lapack_complex_double* du2,
+                                const lapack_int* ipiv,
+                                lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_chbev_work( int matrix_order, char jobz, char uplo,
+                               lapack_int n, lapack_int kd,
+                               lapack_complex_float* ab, lapack_int ldab,
+                               float* w, lapack_complex_float* z,
+                               lapack_int ldz, lapack_complex_float* work,
+                               float* rwork );
+lapack_int LAPACKE_zhbev_work( int matrix_order, char jobz, char uplo,
+                               lapack_int n, lapack_int kd,
+                               lapack_complex_double* ab, lapack_int ldab,
+                               double* w, lapack_complex_double* z,
+                               lapack_int ldz, lapack_complex_double* work,
+                               double* rwork );
+
+lapack_int LAPACKE_chbevd_work( int matrix_order, char jobz, char uplo,
+                                lapack_int n, lapack_int kd,
+                                lapack_complex_float* ab, lapack_int ldab,
+                                float* w, lapack_complex_float* z,
+                                lapack_int ldz, lapack_complex_float* work,
+                                lapack_int lwork, float* rwork,
+                                lapack_int lrwork, lapack_int* iwork,
+                                lapack_int liwork );
+lapack_int LAPACKE_zhbevd_work( int matrix_order, char jobz, char uplo,
+                                lapack_int n, lapack_int kd,
+                                lapack_complex_double* ab, lapack_int ldab,
+                                double* w, lapack_complex_double* z,
+                                lapack_int ldz, lapack_complex_double* work,
+                                lapack_int lwork, double* rwork,
+                                lapack_int lrwork, lapack_int* iwork,
+                                lapack_int liwork );
+
+lapack_int LAPACKE_chbevx_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n, lapack_int kd,
+                                lapack_complex_float* ab, lapack_int ldab,
+                                lapack_complex_float* q, lapack_int ldq,
+                                float vl, float vu, lapack_int il,
+                                lapack_int iu, float abstol, lapack_int* m,
+                                float* w, lapack_complex_float* z,
+                                lapack_int ldz, lapack_complex_float* work,
+                                float* rwork, lapack_int* iwork,
+                                lapack_int* ifail );
+lapack_int LAPACKE_zhbevx_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n, lapack_int kd,
+                                lapack_complex_double* ab, lapack_int ldab,
+                                lapack_complex_double* q, lapack_int ldq,
+                                double vl, double vu, lapack_int il,
+                                lapack_int iu, double abstol, lapack_int* m,
+                                double* w, lapack_complex_double* z,
+                                lapack_int ldz, lapack_complex_double* work,
+                                double* rwork, lapack_int* iwork,
+                                lapack_int* ifail );
+
+lapack_int LAPACKE_chbgst_work( int matrix_order, char vect, char uplo,
+                                lapack_int n, lapack_int ka, lapack_int kb,
+                                lapack_complex_float* ab, lapack_int ldab,
+                                const lapack_complex_float* bb, lapack_int ldbb,
+                                lapack_complex_float* x, lapack_int ldx,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zhbgst_work( int matrix_order, char vect, char uplo,
+                                lapack_int n, lapack_int ka, lapack_int kb,
+                                lapack_complex_double* ab, lapack_int ldab,
+                                const lapack_complex_double* bb,
+                                lapack_int ldbb, lapack_complex_double* x,
+                                lapack_int ldx, lapack_complex_double* work,
+                                double* rwork );
+
+lapack_int LAPACKE_chbgv_work( int matrix_order, char jobz, char uplo,
+                               lapack_int n, lapack_int ka, lapack_int kb,
+                               lapack_complex_float* ab, lapack_int ldab,
+                               lapack_complex_float* bb, lapack_int ldbb,
+                               float* w, lapack_complex_float* z,
+                               lapack_int ldz, lapack_complex_float* work,
+                               float* rwork );
+lapack_int LAPACKE_zhbgv_work( int matrix_order, char jobz, char uplo,
+                               lapack_int n, lapack_int ka, lapack_int kb,
+                               lapack_complex_double* ab, lapack_int ldab,
+                               lapack_complex_double* bb, lapack_int ldbb,
+                               double* w, lapack_complex_double* z,
+                               lapack_int ldz, lapack_complex_double* work,
+                               double* rwork );
+
+lapack_int LAPACKE_chbgvd_work( int matrix_order, char jobz, char uplo,
+                                lapack_int n, lapack_int ka, lapack_int kb,
+                                lapack_complex_float* ab, lapack_int ldab,
+                                lapack_complex_float* bb, lapack_int ldbb,
+                                float* w, lapack_complex_float* z,
+                                lapack_int ldz, lapack_complex_float* work,
+                                lapack_int lwork, float* rwork,
+                                lapack_int lrwork, lapack_int* iwork,
+                                lapack_int liwork );
+lapack_int LAPACKE_zhbgvd_work( int matrix_order, char jobz, char uplo,
+                                lapack_int n, lapack_int ka, lapack_int kb,
+                                lapack_complex_double* ab, lapack_int ldab,
+                                lapack_complex_double* bb, lapack_int ldbb,
+                                double* w, lapack_complex_double* z,
+                                lapack_int ldz, lapack_complex_double* work,
+                                lapack_int lwork, double* rwork,
+                                lapack_int lrwork, lapack_int* iwork,
+                                lapack_int liwork );
+
+lapack_int LAPACKE_chbgvx_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n, lapack_int ka,
+                                lapack_int kb, lapack_complex_float* ab,
+                                lapack_int ldab, lapack_complex_float* bb,
+                                lapack_int ldbb, lapack_complex_float* q,
+                                lapack_int ldq, float vl, float vu,
+                                lapack_int il, lapack_int iu, float abstol,
+                                lapack_int* m, float* w,
+                                lapack_complex_float* z, lapack_int ldz,
+                                lapack_complex_float* work, float* rwork,
+                                lapack_int* iwork, lapack_int* ifail );
+lapack_int LAPACKE_zhbgvx_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n, lapack_int ka,
+                                lapack_int kb, lapack_complex_double* ab,
+                                lapack_int ldab, lapack_complex_double* bb,
+                                lapack_int ldbb, lapack_complex_double* q,
+                                lapack_int ldq, double vl, double vu,
+                                lapack_int il, lapack_int iu, double abstol,
+                                lapack_int* m, double* w,
+                                lapack_complex_double* z, lapack_int ldz,
+                                lapack_complex_double* work, double* rwork,
+                                lapack_int* iwork, lapack_int* ifail );
+
+lapack_int LAPACKE_chbtrd_work( int matrix_order, char vect, char uplo,
+                                lapack_int n, lapack_int kd,
+                                lapack_complex_float* ab, lapack_int ldab,
+                                float* d, float* e, lapack_complex_float* q,
+                                lapack_int ldq, lapack_complex_float* work );
+lapack_int LAPACKE_zhbtrd_work( int matrix_order, char vect, char uplo,
+                                lapack_int n, lapack_int kd,
+                                lapack_complex_double* ab, lapack_int ldab,
+                                double* d, double* e, lapack_complex_double* q,
+                                lapack_int ldq, lapack_complex_double* work );
+
+lapack_int LAPACKE_checon_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_float* a, lapack_int lda,
+                                const lapack_int* ipiv, float anorm,
+                                float* rcond, lapack_complex_float* work );
+lapack_int LAPACKE_zhecon_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_double* a, lapack_int lda,
+                                const lapack_int* ipiv, double anorm,
+                                double* rcond, lapack_complex_double* work );
+
+lapack_int LAPACKE_cheequb_work( int matrix_order, char uplo, lapack_int n,
+                                 const lapack_complex_float* a, lapack_int lda,
+                                 float* s, float* scond, float* amax,
+                                 lapack_complex_float* work );
+lapack_int LAPACKE_zheequb_work( int matrix_order, char uplo, lapack_int n,
+                                 const lapack_complex_double* a, lapack_int lda,
+                                 double* s, double* scond, double* amax,
+                                 lapack_complex_double* work );
+
+lapack_int LAPACKE_cheev_work( int matrix_order, char jobz, char uplo,
+                               lapack_int n, lapack_complex_float* a,
+                               lapack_int lda, float* w,
+                               lapack_complex_float* work, lapack_int lwork,
+                               float* rwork );
+lapack_int LAPACKE_zheev_work( int matrix_order, char jobz, char uplo,
+                               lapack_int n, lapack_complex_double* a,
+                               lapack_int lda, double* w,
+                               lapack_complex_double* work, lapack_int lwork,
+                               double* rwork );
+
+lapack_int LAPACKE_cheevd_work( int matrix_order, char jobz, char uplo,
+                                lapack_int n, lapack_complex_float* a,
+                                lapack_int lda, float* w,
+                                lapack_complex_float* work, lapack_int lwork,
+                                float* rwork, lapack_int lrwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_zheevd_work( int matrix_order, char jobz, char uplo,
+                                lapack_int n, lapack_complex_double* a,
+                                lapack_int lda, double* w,
+                                lapack_complex_double* work, lapack_int lwork,
+                                double* rwork, lapack_int lrwork,
+                                lapack_int* iwork, lapack_int liwork );
+
+lapack_int LAPACKE_cheevr_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                float vl, float vu, lapack_int il,
+                                lapack_int iu, float abstol, lapack_int* m,
+                                float* w, lapack_complex_float* z,
+                                lapack_int ldz, lapack_int* isuppz,
+                                lapack_complex_float* work, lapack_int lwork,
+                                float* rwork, lapack_int lrwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_zheevr_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                double vl, double vu, lapack_int il,
+                                lapack_int iu, double abstol, lapack_int* m,
+                                double* w, lapack_complex_double* z,
+                                lapack_int ldz, lapack_int* isuppz,
+                                lapack_complex_double* work, lapack_int lwork,
+                                double* rwork, lapack_int lrwork,
+                                lapack_int* iwork, lapack_int liwork );
+
+lapack_int LAPACKE_cheevx_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                float vl, float vu, lapack_int il,
+                                lapack_int iu, float abstol, lapack_int* m,
+                                float* w, lapack_complex_float* z,
+                                lapack_int ldz, lapack_complex_float* work,
+                                lapack_int lwork, float* rwork,
+                                lapack_int* iwork, lapack_int* ifail );
+lapack_int LAPACKE_zheevx_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                double vl, double vu, lapack_int il,
+                                lapack_int iu, double abstol, lapack_int* m,
+                                double* w, lapack_complex_double* z,
+                                lapack_int ldz, lapack_complex_double* work,
+                                lapack_int lwork, double* rwork,
+                                lapack_int* iwork, lapack_int* ifail );
+
+lapack_int LAPACKE_chegst_work( int matrix_order, lapack_int itype, char uplo,
+                                lapack_int n, lapack_complex_float* a,
+                                lapack_int lda, const lapack_complex_float* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_zhegst_work( int matrix_order, lapack_int itype, char uplo,
+                                lapack_int n, lapack_complex_double* a,
+                                lapack_int lda, const lapack_complex_double* b,
+                                lapack_int ldb );
+
+lapack_int LAPACKE_chegv_work( int matrix_order, lapack_int itype, char jobz,
+                               char uplo, lapack_int n, lapack_complex_float* a,
+                               lapack_int lda, lapack_complex_float* b,
+                               lapack_int ldb, float* w,
+                               lapack_complex_float* work, lapack_int lwork,
+                               float* rwork );
+lapack_int LAPACKE_zhegv_work( int matrix_order, lapack_int itype, char jobz,
+                               char uplo, lapack_int n,
+                               lapack_complex_double* a, lapack_int lda,
+                               lapack_complex_double* b, lapack_int ldb,
+                               double* w, lapack_complex_double* work,
+                               lapack_int lwork, double* rwork );
+
+lapack_int LAPACKE_chegvd_work( int matrix_order, lapack_int itype, char jobz,
+                                char uplo, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* b, lapack_int ldb,
+                                float* w, lapack_complex_float* work,
+                                lapack_int lwork, float* rwork,
+                                lapack_int lrwork, lapack_int* iwork,
+                                lapack_int liwork );
+lapack_int LAPACKE_zhegvd_work( int matrix_order, lapack_int itype, char jobz,
+                                char uplo, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* b, lapack_int ldb,
+                                double* w, lapack_complex_double* work,
+                                lapack_int lwork, double* rwork,
+                                lapack_int lrwork, lapack_int* iwork,
+                                lapack_int liwork );
+
+lapack_int LAPACKE_chegvx_work( int matrix_order, lapack_int itype, char jobz,
+                                char range, char uplo, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* b, lapack_int ldb,
+                                float vl, float vu, lapack_int il,
+                                lapack_int iu, float abstol, lapack_int* m,
+                                float* w, lapack_complex_float* z,
+                                lapack_int ldz, lapack_complex_float* work,
+                                lapack_int lwork, float* rwork,
+                                lapack_int* iwork, lapack_int* ifail );
+lapack_int LAPACKE_zhegvx_work( int matrix_order, lapack_int itype, char jobz,
+                                char range, char uplo, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* b, lapack_int ldb,
+                                double vl, double vu, lapack_int il,
+                                lapack_int iu, double abstol, lapack_int* m,
+                                double* w, lapack_complex_double* z,
+                                lapack_int ldz, lapack_complex_double* work,
+                                lapack_int lwork, double* rwork,
+                                lapack_int* iwork, lapack_int* ifail );
+
+lapack_int LAPACKE_cherfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_float* a,
+                                lapack_int lda, const lapack_complex_float* af,
+                                lapack_int ldaf, const lapack_int* ipiv,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* x, lapack_int ldx,
+                                float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zherfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_double* a,
+                                lapack_int lda, const lapack_complex_double* af,
+                                lapack_int ldaf, const lapack_int* ipiv,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_cherfsx_work( int matrix_order, char uplo, char equed,
+                                 lapack_int n, lapack_int nrhs,
+                                 const lapack_complex_float* a, lapack_int lda,
+                                 const lapack_complex_float* af,
+                                 lapack_int ldaf, const lapack_int* ipiv,
+                                 const float* s, const lapack_complex_float* b,
+                                 lapack_int ldb, lapack_complex_float* x,
+                                 lapack_int ldx, float* rcond, float* berr,
+                                 lapack_int n_err_bnds, float* err_bnds_norm,
+                                 float* err_bnds_comp, lapack_int nparams,
+                                 float* params, lapack_complex_float* work,
+                                 float* rwork );
+lapack_int LAPACKE_zherfsx_work( int matrix_order, char uplo, char equed,
+                                 lapack_int n, lapack_int nrhs,
+                                 const lapack_complex_double* a, lapack_int lda,
+                                 const lapack_complex_double* af,
+                                 lapack_int ldaf, const lapack_int* ipiv,
+                                 const double* s,
+                                 const lapack_complex_double* b, lapack_int ldb,
+                                 lapack_complex_double* x, lapack_int ldx,
+                                 double* rcond, double* berr,
+                                 lapack_int n_err_bnds, double* err_bnds_norm,
+                                 double* err_bnds_comp, lapack_int nparams,
+                                 double* params, lapack_complex_double* work,
+                                 double* rwork );
+
+lapack_int LAPACKE_chesv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, lapack_complex_float* a,
+                               lapack_int lda, lapack_int* ipiv,
+                               lapack_complex_float* b, lapack_int ldb,
+                               lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zhesv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, lapack_complex_double* a,
+                               lapack_int lda, lapack_int* ipiv,
+                               lapack_complex_double* b, lapack_int ldb,
+                               lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_chesvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs,
+                                const lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* af, lapack_int ldaf,
+                                lapack_int* ipiv, const lapack_complex_float* b,
+                                lapack_int ldb, lapack_complex_float* x,
+                                lapack_int ldx, float* rcond, float* ferr,
+                                float* berr, lapack_complex_float* work,
+                                lapack_int lwork, float* rwork );
+lapack_int LAPACKE_zhesvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs,
+                                const lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* af, lapack_int ldaf,
+                                lapack_int* ipiv,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* rcond, double* ferr, double* berr,
+                                lapack_complex_double* work, lapack_int lwork,
+                                double* rwork );
+
+lapack_int LAPACKE_chesvxx_work( int matrix_order, char fact, char uplo,
+                                 lapack_int n, lapack_int nrhs,
+                                 lapack_complex_float* a, lapack_int lda,
+                                 lapack_complex_float* af, lapack_int ldaf,
+                                 lapack_int* ipiv, char* equed, float* s,
+                                 lapack_complex_float* b, lapack_int ldb,
+                                 lapack_complex_float* x, lapack_int ldx,
+                                 float* rcond, float* rpvgrw, float* berr,
+                                 lapack_int n_err_bnds, float* err_bnds_norm,
+                                 float* err_bnds_comp, lapack_int nparams,
+                                 float* params, lapack_complex_float* work,
+                                 float* rwork );
+lapack_int LAPACKE_zhesvxx_work( int matrix_order, char fact, char uplo,
+                                 lapack_int n, lapack_int nrhs,
+                                 lapack_complex_double* a, lapack_int lda,
+                                 lapack_complex_double* af, lapack_int ldaf,
+                                 lapack_int* ipiv, char* equed, double* s,
+                                 lapack_complex_double* b, lapack_int ldb,
+                                 lapack_complex_double* x, lapack_int ldx,
+                                 double* rcond, double* rpvgrw, double* berr,
+                                 lapack_int n_err_bnds, double* err_bnds_norm,
+                                 double* err_bnds_comp, lapack_int nparams,
+                                 double* params, lapack_complex_double* work,
+                                 double* rwork );
+
+lapack_int LAPACKE_chetrd_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                float* d, float* e, lapack_complex_float* tau,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zhetrd_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                double* d, double* e,
+                                lapack_complex_double* tau,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_chetrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_int* ipiv, lapack_complex_float* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_zhetrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_int* ipiv, lapack_complex_double* work,
+                                lapack_int lwork );
+
+lapack_int LAPACKE_chetri_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                const lapack_int* ipiv,
+                                lapack_complex_float* work );
+lapack_int LAPACKE_zhetri_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                const lapack_int* ipiv,
+                                lapack_complex_double* work );
+
+lapack_int LAPACKE_chetrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_float* a,
+                                lapack_int lda, const lapack_int* ipiv,
+                                lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zhetrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_double* a,
+                                lapack_int lda, const lapack_int* ipiv,
+                                lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_chfrk_work( int matrix_order, char transr, char uplo,
+                               char trans, lapack_int n, lapack_int k,
+                               float alpha, const lapack_complex_float* a,
+                               lapack_int lda, float beta,
+                               lapack_complex_float* c );
+lapack_int LAPACKE_zhfrk_work( int matrix_order, char transr, char uplo,
+                               char trans, lapack_int n, lapack_int k,
+                               double alpha, const lapack_complex_double* a,
+                               lapack_int lda, double beta,
+                               lapack_complex_double* c );
+
+lapack_int LAPACKE_shgeqz_work( int matrix_order, char job, char compq,
+                                char compz, lapack_int n, lapack_int ilo,
+                                lapack_int ihi, float* h, lapack_int ldh,
+                                float* t, lapack_int ldt, float* alphar,
+                                float* alphai, float* beta, float* q,
+                                lapack_int ldq, float* z, lapack_int ldz,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dhgeqz_work( int matrix_order, char job, char compq,
+                                char compz, lapack_int n, lapack_int ilo,
+                                lapack_int ihi, double* h, lapack_int ldh,
+                                double* t, lapack_int ldt, double* alphar,
+                                double* alphai, double* beta, double* q,
+                                lapack_int ldq, double* z, lapack_int ldz,
+                                double* work, lapack_int lwork );
+lapack_int LAPACKE_chgeqz_work( int matrix_order, char job, char compq,
+                                char compz, lapack_int n, lapack_int ilo,
+                                lapack_int ihi, lapack_complex_float* h,
+                                lapack_int ldh, lapack_complex_float* t,
+                                lapack_int ldt, lapack_complex_float* alpha,
+                                lapack_complex_float* beta,
+                                lapack_complex_float* q, lapack_int ldq,
+                                lapack_complex_float* z, lapack_int ldz,
+                                lapack_complex_float* work, lapack_int lwork,
+                                float* rwork );
+lapack_int LAPACKE_zhgeqz_work( int matrix_order, char job, char compq,
+                                char compz, lapack_int n, lapack_int ilo,
+                                lapack_int ihi, lapack_complex_double* h,
+                                lapack_int ldh, lapack_complex_double* t,
+                                lapack_int ldt, lapack_complex_double* alpha,
+                                lapack_complex_double* beta,
+                                lapack_complex_double* q, lapack_int ldq,
+                                lapack_complex_double* z, lapack_int ldz,
+                                lapack_complex_double* work, lapack_int lwork,
+                                double* rwork );
+
+lapack_int LAPACKE_chpcon_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_float* ap,
+                                const lapack_int* ipiv, float anorm,
+                                float* rcond, lapack_complex_float* work );
+lapack_int LAPACKE_zhpcon_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_double* ap,
+                                const lapack_int* ipiv, double anorm,
+                                double* rcond, lapack_complex_double* work );
+
+lapack_int LAPACKE_chpev_work( int matrix_order, char jobz, char uplo,
+                               lapack_int n, lapack_complex_float* ap, float* w,
+                               lapack_complex_float* z, lapack_int ldz,
+                               lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zhpev_work( int matrix_order, char jobz, char uplo,
+                               lapack_int n, lapack_complex_double* ap,
+                               double* w, lapack_complex_double* z,
+                               lapack_int ldz, lapack_complex_double* work,
+                               double* rwork );
+
+lapack_int LAPACKE_chpevd_work( int matrix_order, char jobz, char uplo,
+                                lapack_int n, lapack_complex_float* ap,
+                                float* w, lapack_complex_float* z,
+                                lapack_int ldz, lapack_complex_float* work,
+                                lapack_int lwork, float* rwork,
+                                lapack_int lrwork, lapack_int* iwork,
+                                lapack_int liwork );
+lapack_int LAPACKE_zhpevd_work( int matrix_order, char jobz, char uplo,
+                                lapack_int n, lapack_complex_double* ap,
+                                double* w, lapack_complex_double* z,
+                                lapack_int ldz, lapack_complex_double* work,
+                                lapack_int lwork, double* rwork,
+                                lapack_int lrwork, lapack_int* iwork,
+                                lapack_int liwork );
+
+lapack_int LAPACKE_chpevx_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n,
+                                lapack_complex_float* ap, float vl, float vu,
+                                lapack_int il, lapack_int iu, float abstol,
+                                lapack_int* m, float* w,
+                                lapack_complex_float* z, lapack_int ldz,
+                                lapack_complex_float* work, float* rwork,
+                                lapack_int* iwork, lapack_int* ifail );
+lapack_int LAPACKE_zhpevx_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n,
+                                lapack_complex_double* ap, double vl, double vu,
+                                lapack_int il, lapack_int iu, double abstol,
+                                lapack_int* m, double* w,
+                                lapack_complex_double* z, lapack_int ldz,
+                                lapack_complex_double* work, double* rwork,
+                                lapack_int* iwork, lapack_int* ifail );
+
+lapack_int LAPACKE_chpgst_work( int matrix_order, lapack_int itype, char uplo,
+                                lapack_int n, lapack_complex_float* ap,
+                                const lapack_complex_float* bp );
+lapack_int LAPACKE_zhpgst_work( int matrix_order, lapack_int itype, char uplo,
+                                lapack_int n, lapack_complex_double* ap,
+                                const lapack_complex_double* bp );
+
+lapack_int LAPACKE_chpgv_work( int matrix_order, lapack_int itype, char jobz,
+                               char uplo, lapack_int n,
+                               lapack_complex_float* ap,
+                               lapack_complex_float* bp, float* w,
+                               lapack_complex_float* z, lapack_int ldz,
+                               lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zhpgv_work( int matrix_order, lapack_int itype, char jobz,
+                               char uplo, lapack_int n,
+                               lapack_complex_double* ap,
+                               lapack_complex_double* bp, double* w,
+                               lapack_complex_double* z, lapack_int ldz,
+                               lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_chpgvd_work( int matrix_order, lapack_int itype, char jobz,
+                                char uplo, lapack_int n,
+                                lapack_complex_float* ap,
+                                lapack_complex_float* bp, float* w,
+                                lapack_complex_float* z, lapack_int ldz,
+                                lapack_complex_float* work, lapack_int lwork,
+                                float* rwork, lapack_int lrwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_zhpgvd_work( int matrix_order, lapack_int itype, char jobz,
+                                char uplo, lapack_int n,
+                                lapack_complex_double* ap,
+                                lapack_complex_double* bp, double* w,
+                                lapack_complex_double* z, lapack_int ldz,
+                                lapack_complex_double* work, lapack_int lwork,
+                                double* rwork, lapack_int lrwork,
+                                lapack_int* iwork, lapack_int liwork );
+
+lapack_int LAPACKE_chpgvx_work( int matrix_order, lapack_int itype, char jobz,
+                                char range, char uplo, lapack_int n,
+                                lapack_complex_float* ap,
+                                lapack_complex_float* bp, float vl, float vu,
+                                lapack_int il, lapack_int iu, float abstol,
+                                lapack_int* m, float* w,
+                                lapack_complex_float* z, lapack_int ldz,
+                                lapack_complex_float* work, float* rwork,
+                                lapack_int* iwork, lapack_int* ifail );
+lapack_int LAPACKE_zhpgvx_work( int matrix_order, lapack_int itype, char jobz,
+                                char range, char uplo, lapack_int n,
+                                lapack_complex_double* ap,
+                                lapack_complex_double* bp, double vl, double vu,
+                                lapack_int il, lapack_int iu, double abstol,
+                                lapack_int* m, double* w,
+                                lapack_complex_double* z, lapack_int ldz,
+                                lapack_complex_double* work, double* rwork,
+                                lapack_int* iwork, lapack_int* ifail );
+
+lapack_int LAPACKE_chprfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_float* ap,
+                                const lapack_complex_float* afp,
+                                const lapack_int* ipiv,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* x, lapack_int ldx,
+                                float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zhprfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs,
+                                const lapack_complex_double* ap,
+                                const lapack_complex_double* afp,
+                                const lapack_int* ipiv,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_chpsv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, lapack_complex_float* ap,
+                               lapack_int* ipiv, lapack_complex_float* b,
+                               lapack_int ldb );
+lapack_int LAPACKE_zhpsv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, lapack_complex_double* ap,
+                               lapack_int* ipiv, lapack_complex_double* b,
+                               lapack_int ldb );
+
+lapack_int LAPACKE_chpsvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs,
+                                const lapack_complex_float* ap,
+                                lapack_complex_float* afp, lapack_int* ipiv,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* x, lapack_int ldx,
+                                float* rcond, float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zhpsvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs,
+                                const lapack_complex_double* ap,
+                                lapack_complex_double* afp, lapack_int* ipiv,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* rcond, double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_chptrd_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* ap, float* d, float* e,
+                                lapack_complex_float* tau );
+lapack_int LAPACKE_zhptrd_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* ap, double* d, double* e,
+                                lapack_complex_double* tau );
+
+lapack_int LAPACKE_chptrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* ap, lapack_int* ipiv );
+lapack_int LAPACKE_zhptrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* ap, lapack_int* ipiv );
+
+lapack_int LAPACKE_chptri_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* ap,
+                                const lapack_int* ipiv,
+                                lapack_complex_float* work );
+lapack_int LAPACKE_zhptri_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* ap,
+                                const lapack_int* ipiv,
+                                lapack_complex_double* work );
+
+lapack_int LAPACKE_chptrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_float* ap,
+                                const lapack_int* ipiv, lapack_complex_float* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_zhptrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs,
+                                const lapack_complex_double* ap,
+                                const lapack_int* ipiv,
+                                lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_shsein_work( int matrix_order, char job, char eigsrc,
+                                char initv, lapack_logical* select,
+                                lapack_int n, const float* h, lapack_int ldh,
+                                float* wr, const float* wi, float* vl,
+                                lapack_int ldvl, float* vr, lapack_int ldvr,
+                                lapack_int mm, lapack_int* m, float* work,
+                                lapack_int* ifaill, lapack_int* ifailr );
+lapack_int LAPACKE_dhsein_work( int matrix_order, char job, char eigsrc,
+                                char initv, lapack_logical* select,
+                                lapack_int n, const double* h, lapack_int ldh,
+                                double* wr, const double* wi, double* vl,
+                                lapack_int ldvl, double* vr, lapack_int ldvr,
+                                lapack_int mm, lapack_int* m, double* work,
+                                lapack_int* ifaill, lapack_int* ifailr );
+lapack_int LAPACKE_chsein_work( int matrix_order, char job, char eigsrc,
+                                char initv, const lapack_logical* select,
+                                lapack_int n, const lapack_complex_float* h,
+                                lapack_int ldh, lapack_complex_float* w,
+                                lapack_complex_float* vl, lapack_int ldvl,
+                                lapack_complex_float* vr, lapack_int ldvr,
+                                lapack_int mm, lapack_int* m,
+                                lapack_complex_float* work, float* rwork,
+                                lapack_int* ifaill, lapack_int* ifailr );
+lapack_int LAPACKE_zhsein_work( int matrix_order, char job, char eigsrc,
+                                char initv, const lapack_logical* select,
+                                lapack_int n, const lapack_complex_double* h,
+                                lapack_int ldh, lapack_complex_double* w,
+                                lapack_complex_double* vl, lapack_int ldvl,
+                                lapack_complex_double* vr, lapack_int ldvr,
+                                lapack_int mm, lapack_int* m,
+                                lapack_complex_double* work, double* rwork,
+                                lapack_int* ifaill, lapack_int* ifailr );
+
+lapack_int LAPACKE_shseqr_work( int matrix_order, char job, char compz,
+                                lapack_int n, lapack_int ilo, lapack_int ihi,
+                                float* h, lapack_int ldh, float* wr, float* wi,
+                                float* z, lapack_int ldz, float* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_dhseqr_work( int matrix_order, char job, char compz,
+                                lapack_int n, lapack_int ilo, lapack_int ihi,
+                                double* h, lapack_int ldh, double* wr,
+                                double* wi, double* z, lapack_int ldz,
+                                double* work, lapack_int lwork );
+lapack_int LAPACKE_chseqr_work( int matrix_order, char job, char compz,
+                                lapack_int n, lapack_int ilo, lapack_int ihi,
+                                lapack_complex_float* h, lapack_int ldh,
+                                lapack_complex_float* w,
+                                lapack_complex_float* z, lapack_int ldz,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zhseqr_work( int matrix_order, char job, char compz,
+                                lapack_int n, lapack_int ilo, lapack_int ihi,
+                                lapack_complex_double* h, lapack_int ldh,
+                                lapack_complex_double* w,
+                                lapack_complex_double* z, lapack_int ldz,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_clacgv_work( lapack_int n, lapack_complex_float* x,
+                                lapack_int incx );
+lapack_int LAPACKE_zlacgv_work( lapack_int n, lapack_complex_double* x,
+                                lapack_int incx );
+
+lapack_int LAPACKE_slacpy_work( int matrix_order, char uplo, lapack_int m,
+                                lapack_int n, const float* a, lapack_int lda,
+                                float* b, lapack_int ldb );
+lapack_int LAPACKE_dlacpy_work( int matrix_order, char uplo, lapack_int m,
+                                lapack_int n, const double* a, lapack_int lda,
+                                double* b, lapack_int ldb );
+lapack_int LAPACKE_clacpy_work( int matrix_order, char uplo, lapack_int m,
+                                lapack_int n, const lapack_complex_float* a,
+                                lapack_int lda, lapack_complex_float* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_zlacpy_work( int matrix_order, char uplo, lapack_int m,
+                                lapack_int n, const lapack_complex_double* a,
+                                lapack_int lda, lapack_complex_double* b,
+                                lapack_int ldb );
+
+lapack_int LAPACKE_zlag2c_work( int matrix_order, lapack_int m, lapack_int n,
+                                const lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_float* sa, lapack_int ldsa );
+
+lapack_int LAPACKE_slag2d_work( int matrix_order, lapack_int m, lapack_int n,
+                                const float* sa, lapack_int ldsa, double* a,
+                                lapack_int lda );
+
+lapack_int LAPACKE_dlag2s_work( int matrix_order, lapack_int m, lapack_int n,
+                                const double* a, lapack_int lda, float* sa,
+                                lapack_int ldsa );
+
+lapack_int LAPACKE_clag2z_work( int matrix_order, lapack_int m, lapack_int n,
+                                const lapack_complex_float* sa, lapack_int ldsa,
+                                lapack_complex_double* a, lapack_int lda );
+
+lapack_int LAPACKE_slagge_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int kl, lapack_int ku, const float* d,
+                                float* a, lapack_int lda, lapack_int* iseed,
+                                float* work );
+lapack_int LAPACKE_dlagge_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int kl, lapack_int ku, const double* d,
+                                double* a, lapack_int lda, lapack_int* iseed,
+                                double* work );
+lapack_int LAPACKE_clagge_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int kl, lapack_int ku, const float* d,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_int* iseed, lapack_complex_float* work );
+lapack_int LAPACKE_zlagge_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int kl, lapack_int ku, const double* d,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_int* iseed,
+                                lapack_complex_double* work );
+                                
+lapack_int LAPACKE_claghe_work( int matrix_order, lapack_int n, lapack_int k,
+                                const float* d, lapack_complex_float* a,
+                                lapack_int lda, lapack_int* iseed,
+                                lapack_complex_float* work );
+lapack_int LAPACKE_zlaghe_work( int matrix_order, lapack_int n, lapack_int k,
+                                const double* d, lapack_complex_double* a,
+                                lapack_int lda, lapack_int* iseed,
+                                lapack_complex_double* work );
+
+lapack_int LAPACKE_slagsy_work( int matrix_order, lapack_int n, lapack_int k,
+                                const float* d, float* a, lapack_int lda,
+                                lapack_int* iseed, float* work );
+lapack_int LAPACKE_dlagsy_work( int matrix_order, lapack_int n, lapack_int k,
+                                const double* d, double* a, lapack_int lda,
+                                lapack_int* iseed, double* work );
+lapack_int LAPACKE_clagsy_work( int matrix_order, lapack_int n, lapack_int k,
+                                const float* d, lapack_complex_float* a,
+                                lapack_int lda, lapack_int* iseed,
+                                lapack_complex_float* work );
+lapack_int LAPACKE_zlagsy_work( int matrix_order, lapack_int n, lapack_int k,
+                                const double* d, lapack_complex_double* a,
+                                lapack_int lda, lapack_int* iseed,
+                                lapack_complex_double* work );
+
+lapack_int LAPACKE_slapmr_work( int matrix_order, lapack_logical forwrd,
+                                lapack_int m, lapack_int n, float* x,
+                                lapack_int ldx, lapack_int* k );
+lapack_int LAPACKE_dlapmr_work( int matrix_order, lapack_logical forwrd,
+                                lapack_int m, lapack_int n, double* x,
+                                lapack_int ldx, lapack_int* k );
+lapack_int LAPACKE_clapmr_work( int matrix_order, lapack_logical forwrd,
+                                lapack_int m, lapack_int n,
+                                lapack_complex_float* x, lapack_int ldx,
+                                lapack_int* k );
+lapack_int LAPACKE_zlapmr_work( int matrix_order, lapack_logical forwrd,
+                                lapack_int m, lapack_int n,
+                                lapack_complex_double* x, lapack_int ldx,
+                                lapack_int* k );
+
+lapack_int LAPACKE_slartgp_work( float f, float g, float* cs, float* sn,
+                                 float* r );
+lapack_int LAPACKE_dlartgp_work( double f, double g, double* cs, double* sn,
+                                 double* r );
+
+lapack_int LAPACKE_slartgs_work( float x, float y, float sigma, float* cs,
+                                 float* sn );
+lapack_int LAPACKE_dlartgs_work( double x, double y, double sigma, double* cs,
+                                 double* sn );
+                                
+float LAPACKE_slapy2_work( float x, float y );
+double LAPACKE_dlapy2_work( double x, double y );
+
+float LAPACKE_slapy3_work( float x, float y, float z );
+double LAPACKE_dlapy3_work( double x, double y, double z );
+
+float LAPACKE_slamch_work( char cmach );
+double LAPACKE_dlamch_work( char cmach );
+
+float LAPACKE_slange_work( int matrix_order, char norm, lapack_int m,
+                                lapack_int n, const float* a, lapack_int lda,
+                                float* work );
+double LAPACKE_dlange_work( int matrix_order, char norm, lapack_int m,
+                                lapack_int n, const double* a, lapack_int lda,
+                                double* work );
+float LAPACKE_clange_work( int matrix_order, char norm, lapack_int m,
+                                lapack_int n, const lapack_complex_float* a,
+                                lapack_int lda, float* work );
+double LAPACKE_zlange_work( int matrix_order, char norm, lapack_int m,
+                                lapack_int n, const lapack_complex_double* a,
+                                lapack_int lda, double* work );
+
+float LAPACKE_clanhe_work( int matrix_order, char norm, char uplo,
+                                lapack_int n, const lapack_complex_float* a,
+                                lapack_int lda, float* work );
+double LAPACKE_zlanhe_work( int matrix_order, char norm, char uplo,
+                                lapack_int n, const lapack_complex_double* a,
+                                lapack_int lda, double* work );
+
+float LAPACKE_slansy_work( int matrix_order, char norm, char uplo,
+                                lapack_int n, const float* a, lapack_int lda,
+                                float* work );
+double LAPACKE_dlansy_work( int matrix_order, char norm, char uplo,
+                                lapack_int n, const double* a, lapack_int lda,
+                                double* work );
+float LAPACKE_clansy_work( int matrix_order, char norm, char uplo,
+                                lapack_int n, const lapack_complex_float* a,
+                                lapack_int lda, float* work );
+double LAPACKE_zlansy_work( int matrix_order, char norm, char uplo,
+                                lapack_int n, const lapack_complex_double* a,
+                                lapack_int lda, double* work );
+
+float LAPACKE_slantr_work( int matrix_order, char norm, char uplo,
+                                char diag, lapack_int m, lapack_int n, const float* a,
+                                lapack_int lda, float* work );
+double LAPACKE_dlantr_work( int matrix_order, char norm, char uplo,
+                                char diag, lapack_int m, lapack_int n,
+                                const double* a, lapack_int lda, double* work );
+float LAPACKE_clantr_work( int matrix_order, char norm, char uplo,
+                                char diag, lapack_int m, lapack_int n,
+                                const lapack_complex_float* a, lapack_int lda,
+                                float* work );
+double LAPACKE_zlantr_work( int matrix_order, char norm, char uplo,
+                                char diag, lapack_int m, lapack_int n,
+                                const lapack_complex_double* a, lapack_int lda,
+                                double* work );
+
+lapack_int LAPACKE_slarfb_work( int matrix_order, char side, char trans,
+                                char direct, char storev, lapack_int m,
+                                lapack_int n, lapack_int k, const float* v,
+                                lapack_int ldv, const float* t, lapack_int ldt,
+                                float* c, lapack_int ldc, float* work,
+                                lapack_int ldwork );
+lapack_int LAPACKE_dlarfb_work( int matrix_order, char side, char trans,
+                                char direct, char storev, lapack_int m,
+                                lapack_int n, lapack_int k, const double* v,
+                                lapack_int ldv, const double* t, lapack_int ldt,
+                                double* c, lapack_int ldc, double* work,
+                                lapack_int ldwork );
+lapack_int LAPACKE_clarfb_work( int matrix_order, char side, char trans,
+                                char direct, char storev, lapack_int m,
+                                lapack_int n, lapack_int k,
+                                const lapack_complex_float* v, lapack_int ldv,
+                                const lapack_complex_float* t, lapack_int ldt,
+                                lapack_complex_float* c, lapack_int ldc,
+                                lapack_complex_float* work, lapack_int ldwork );
+lapack_int LAPACKE_zlarfb_work( int matrix_order, char side, char trans,
+                                char direct, char storev, lapack_int m,
+                                lapack_int n, lapack_int k,
+                                const lapack_complex_double* v, lapack_int ldv,
+                                const lapack_complex_double* t, lapack_int ldt,
+                                lapack_complex_double* c, lapack_int ldc,
+                                lapack_complex_double* work,
+                                lapack_int ldwork );
+
+lapack_int LAPACKE_slarfg_work( lapack_int n, float* alpha, float* x,
+                                lapack_int incx, float* tau );
+lapack_int LAPACKE_dlarfg_work( lapack_int n, double* alpha, double* x,
+                                lapack_int incx, double* tau );
+lapack_int LAPACKE_clarfg_work( lapack_int n, lapack_complex_float* alpha,
+                                lapack_complex_float* x, lapack_int incx,
+                                lapack_complex_float* tau );
+lapack_int LAPACKE_zlarfg_work( lapack_int n, lapack_complex_double* alpha,
+                                lapack_complex_double* x, lapack_int incx,
+                                lapack_complex_double* tau );
+
+lapack_int LAPACKE_slarft_work( int matrix_order, char direct, char storev,
+                                lapack_int n, lapack_int k, const float* v,
+                                lapack_int ldv, const float* tau, float* t,
+                                lapack_int ldt );
+lapack_int LAPACKE_dlarft_work( int matrix_order, char direct, char storev,
+                                lapack_int n, lapack_int k, const double* v,
+                                lapack_int ldv, const double* tau, double* t,
+                                lapack_int ldt );
+lapack_int LAPACKE_clarft_work( int matrix_order, char direct, char storev,
+                                lapack_int n, lapack_int k,
+                                const lapack_complex_float* v, lapack_int ldv,
+                                const lapack_complex_float* tau,
+                                lapack_complex_float* t, lapack_int ldt );
+lapack_int LAPACKE_zlarft_work( int matrix_order, char direct, char storev,
+                                lapack_int n, lapack_int k,
+                                const lapack_complex_double* v, lapack_int ldv,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* t, lapack_int ldt );
+
+lapack_int LAPACKE_slarfx_work( int matrix_order, char side, lapack_int m,
+                                lapack_int n, const float* v, float tau,
+                                float* c, lapack_int ldc, float* work );
+lapack_int LAPACKE_dlarfx_work( int matrix_order, char side, lapack_int m,
+                                lapack_int n, const double* v, double tau,
+                                double* c, lapack_int ldc, double* work );
+lapack_int LAPACKE_clarfx_work( int matrix_order, char side, lapack_int m,
+                                lapack_int n, const lapack_complex_float* v,
+                                lapack_complex_float tau,
+                                lapack_complex_float* c, lapack_int ldc,
+                                lapack_complex_float* work );
+lapack_int LAPACKE_zlarfx_work( int matrix_order, char side, lapack_int m,
+                                lapack_int n, const lapack_complex_double* v,
+                                lapack_complex_double tau,
+                                lapack_complex_double* c, lapack_int ldc,
+                                lapack_complex_double* work );
+
+lapack_int LAPACKE_slarnv_work( lapack_int idist, lapack_int* iseed,
+                                lapack_int n, float* x );
+lapack_int LAPACKE_dlarnv_work( lapack_int idist, lapack_int* iseed,
+                                lapack_int n, double* x );
+lapack_int LAPACKE_clarnv_work( lapack_int idist, lapack_int* iseed,
+                                lapack_int n, lapack_complex_float* x );
+lapack_int LAPACKE_zlarnv_work( lapack_int idist, lapack_int* iseed,
+                                lapack_int n, lapack_complex_double* x );
+
+lapack_int LAPACKE_slaset_work( int matrix_order, char uplo, lapack_int m,
+                                lapack_int n, float alpha, float beta, float* a,
+                                lapack_int lda );
+lapack_int LAPACKE_dlaset_work( int matrix_order, char uplo, lapack_int m,
+                                lapack_int n, double alpha, double beta,
+                                double* a, lapack_int lda );
+lapack_int LAPACKE_claset_work( int matrix_order, char uplo, lapack_int m,
+                                lapack_int n, lapack_complex_float alpha,
+                                lapack_complex_float beta,
+                                lapack_complex_float* a, lapack_int lda );
+lapack_int LAPACKE_zlaset_work( int matrix_order, char uplo, lapack_int m,
+                                lapack_int n, lapack_complex_double alpha,
+                                lapack_complex_double beta,
+                                lapack_complex_double* a, lapack_int lda );
+
+lapack_int LAPACKE_slasrt_work( char id, lapack_int n, float* d );
+lapack_int LAPACKE_dlasrt_work( char id, lapack_int n, double* d );
+
+lapack_int LAPACKE_slaswp_work( int matrix_order, lapack_int n, float* a,
+                                lapack_int lda, lapack_int k1, lapack_int k2,
+                                const lapack_int* ipiv, lapack_int incx );
+lapack_int LAPACKE_dlaswp_work( int matrix_order, lapack_int n, double* a,
+                                lapack_int lda, lapack_int k1, lapack_int k2,
+                                const lapack_int* ipiv, lapack_int incx );
+lapack_int LAPACKE_claswp_work( int matrix_order, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_int k1, lapack_int k2,
+                                const lapack_int* ipiv, lapack_int incx );
+lapack_int LAPACKE_zlaswp_work( int matrix_order, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_int k1, lapack_int k2,
+                                const lapack_int* ipiv, lapack_int incx );
+
+lapack_int LAPACKE_slatms_work( int matrix_order, lapack_int m, lapack_int n,
+                                char dist, lapack_int* iseed, char sym,
+                                float* d, lapack_int mode, float cond,
+                                float dmax, lapack_int kl, lapack_int ku,
+                                char pack, float* a, lapack_int lda,
+                                float* work );
+lapack_int LAPACKE_dlatms_work( int matrix_order, lapack_int m, lapack_int n,
+                                char dist, lapack_int* iseed, char sym,
+                                double* d, lapack_int mode, double cond,
+                                double dmax, lapack_int kl, lapack_int ku,
+                                char pack, double* a, lapack_int lda,
+                                double* work );
+lapack_int LAPACKE_clatms_work( int matrix_order, lapack_int m, lapack_int n,
+                                char dist, lapack_int* iseed, char sym,
+                                float* d, lapack_int mode, float cond,
+                                float dmax, lapack_int kl, lapack_int ku,
+                                char pack, lapack_complex_float* a,
+                                lapack_int lda, lapack_complex_float* work );
+lapack_int LAPACKE_zlatms_work( int matrix_order, lapack_int m, lapack_int n,
+                                char dist, lapack_int* iseed, char sym,
+                                double* d, lapack_int mode, double cond,
+                                double dmax, lapack_int kl, lapack_int ku,
+                                char pack, lapack_complex_double* a,
+                                lapack_int lda, lapack_complex_double* work );
+
+lapack_int LAPACKE_slauum_work( int matrix_order, char uplo, lapack_int n,
+                                float* a, lapack_int lda );
+lapack_int LAPACKE_dlauum_work( int matrix_order, char uplo, lapack_int n,
+                                double* a, lapack_int lda );
+lapack_int LAPACKE_clauum_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda );
+lapack_int LAPACKE_zlauum_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda );
+
+lapack_int LAPACKE_sopgtr_work( int matrix_order, char uplo, lapack_int n,
+                                const float* ap, const float* tau, float* q,
+                                lapack_int ldq, float* work );
+lapack_int LAPACKE_dopgtr_work( int matrix_order, char uplo, lapack_int n,
+                                const double* ap, const double* tau, double* q,
+                                lapack_int ldq, double* work );
+
+lapack_int LAPACKE_sopmtr_work( int matrix_order, char side, char uplo,
+                                char trans, lapack_int m, lapack_int n,
+                                const float* ap, const float* tau, float* c,
+                                lapack_int ldc, float* work );
+lapack_int LAPACKE_dopmtr_work( int matrix_order, char side, char uplo,
+                                char trans, lapack_int m, lapack_int n,
+                                const double* ap, const double* tau, double* c,
+                                lapack_int ldc, double* work );
+
+lapack_int LAPACKE_sorgbr_work( int matrix_order, char vect, lapack_int m,
+                                lapack_int n, lapack_int k, float* a,
+                                lapack_int lda, const float* tau, float* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_dorgbr_work( int matrix_order, char vect, lapack_int m,
+                                lapack_int n, lapack_int k, double* a,
+                                lapack_int lda, const double* tau, double* work,
+                                lapack_int lwork );
+
+lapack_int LAPACKE_sorghr_work( int matrix_order, lapack_int n, lapack_int ilo,
+                                lapack_int ihi, float* a, lapack_int lda,
+                                const float* tau, float* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_dorghr_work( int matrix_order, lapack_int n, lapack_int ilo,
+                                lapack_int ihi, double* a, lapack_int lda,
+                                const double* tau, double* work,
+                                lapack_int lwork );
+
+lapack_int LAPACKE_sorglq_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int k, float* a, lapack_int lda,
+                                const float* tau, float* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_dorglq_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int k, double* a, lapack_int lda,
+                                const double* tau, double* work,
+                                lapack_int lwork );
+
+lapack_int LAPACKE_sorgql_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int k, float* a, lapack_int lda,
+                                const float* tau, float* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_dorgql_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int k, double* a, lapack_int lda,
+                                const double* tau, double* work,
+                                lapack_int lwork );
+
+lapack_int LAPACKE_sorgqr_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int k, float* a, lapack_int lda,
+                                const float* tau, float* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_dorgqr_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int k, double* a, lapack_int lda,
+                                const double* tau, double* work,
+                                lapack_int lwork );
+
+lapack_int LAPACKE_sorgrq_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int k, float* a, lapack_int lda,
+                                const float* tau, float* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_dorgrq_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int k, double* a, lapack_int lda,
+                                const double* tau, double* work,
+                                lapack_int lwork );
+
+lapack_int LAPACKE_sorgtr_work( int matrix_order, char uplo, lapack_int n,
+                                float* a, lapack_int lda, const float* tau,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dorgtr_work( int matrix_order, char uplo, lapack_int n,
+                                double* a, lapack_int lda, const double* tau,
+                                double* work, lapack_int lwork );
+
+lapack_int LAPACKE_sormbr_work( int matrix_order, char vect, char side,
+                                char trans, lapack_int m, lapack_int n,
+                                lapack_int k, const float* a, lapack_int lda,
+                                const float* tau, float* c, lapack_int ldc,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dormbr_work( int matrix_order, char vect, char side,
+                                char trans, lapack_int m, lapack_int n,
+                                lapack_int k, const double* a, lapack_int lda,
+                                const double* tau, double* c, lapack_int ldc,
+                                double* work, lapack_int lwork );
+
+lapack_int LAPACKE_sormhr_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int ilo,
+                                lapack_int ihi, const float* a, lapack_int lda,
+                                const float* tau, float* c, lapack_int ldc,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dormhr_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int ilo,
+                                lapack_int ihi, const double* a, lapack_int lda,
+                                const double* tau, double* c, lapack_int ldc,
+                                double* work, lapack_int lwork );
+
+lapack_int LAPACKE_sormlq_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                const float* a, lapack_int lda,
+                                const float* tau, float* c, lapack_int ldc,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dormlq_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                const double* a, lapack_int lda,
+                                const double* tau, double* c, lapack_int ldc,
+                                double* work, lapack_int lwork );
+
+lapack_int LAPACKE_sormql_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                const float* a, lapack_int lda,
+                                const float* tau, float* c, lapack_int ldc,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dormql_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                const double* a, lapack_int lda,
+                                const double* tau, double* c, lapack_int ldc,
+                                double* work, lapack_int lwork );
+
+lapack_int LAPACKE_sormqr_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                const float* a, lapack_int lda,
+                                const float* tau, float* c, lapack_int ldc,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dormqr_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                const double* a, lapack_int lda,
+                                const double* tau, double* c, lapack_int ldc,
+                                double* work, lapack_int lwork );
+
+lapack_int LAPACKE_sormrq_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                const float* a, lapack_int lda,
+                                const float* tau, float* c, lapack_int ldc,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dormrq_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                const double* a, lapack_int lda,
+                                const double* tau, double* c, lapack_int ldc,
+                                double* work, lapack_int lwork );
+
+lapack_int LAPACKE_sormrz_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                lapack_int l, const float* a, lapack_int lda,
+                                const float* tau, float* c, lapack_int ldc,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dormrz_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                lapack_int l, const double* a, lapack_int lda,
+                                const double* tau, double* c, lapack_int ldc,
+                                double* work, lapack_int lwork );
+
+lapack_int LAPACKE_sormtr_work( int matrix_order, char side, char uplo,
+                                char trans, lapack_int m, lapack_int n,
+                                const float* a, lapack_int lda,
+                                const float* tau, float* c, lapack_int ldc,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dormtr_work( int matrix_order, char side, char uplo,
+                                char trans, lapack_int m, lapack_int n,
+                                const double* a, lapack_int lda,
+                                const double* tau, double* c, lapack_int ldc,
+                                double* work, lapack_int lwork );
+
+lapack_int LAPACKE_spbcon_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, const float* ab, lapack_int ldab,
+                                float anorm, float* rcond, float* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dpbcon_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, const double* ab,
+                                lapack_int ldab, double anorm, double* rcond,
+                                double* work, lapack_int* iwork );
+lapack_int LAPACKE_cpbcon_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, const lapack_complex_float* ab,
+                                lapack_int ldab, float anorm, float* rcond,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zpbcon_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, const lapack_complex_double* ab,
+                                lapack_int ldab, double anorm, double* rcond,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_spbequ_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, const float* ab, lapack_int ldab,
+                                float* s, float* scond, float* amax );
+lapack_int LAPACKE_dpbequ_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, const double* ab,
+                                lapack_int ldab, double* s, double* scond,
+                                double* amax );
+lapack_int LAPACKE_cpbequ_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, const lapack_complex_float* ab,
+                                lapack_int ldab, float* s, float* scond,
+                                float* amax );
+lapack_int LAPACKE_zpbequ_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, const lapack_complex_double* ab,
+                                lapack_int ldab, double* s, double* scond,
+                                double* amax );
+
+lapack_int LAPACKE_spbrfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, lapack_int nrhs, const float* ab,
+                                lapack_int ldab, const float* afb,
+                                lapack_int ldafb, const float* b,
+                                lapack_int ldb, float* x, lapack_int ldx,
+                                float* ferr, float* berr, float* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dpbrfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, lapack_int nrhs,
+                                const double* ab, lapack_int ldab,
+                                const double* afb, lapack_int ldafb,
+                                const double* b, lapack_int ldb, double* x,
+                                lapack_int ldx, double* ferr, double* berr,
+                                double* work, lapack_int* iwork );
+lapack_int LAPACKE_cpbrfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, lapack_int nrhs,
+                                const lapack_complex_float* ab, lapack_int ldab,
+                                const lapack_complex_float* afb,
+                                lapack_int ldafb, const lapack_complex_float* b,
+                                lapack_int ldb, lapack_complex_float* x,
+                                lapack_int ldx, float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zpbrfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, lapack_int nrhs,
+                                const lapack_complex_double* ab,
+                                lapack_int ldab,
+                                const lapack_complex_double* afb,
+                                lapack_int ldafb,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_spbstf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kb, float* bb, lapack_int ldbb );
+lapack_int LAPACKE_dpbstf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kb, double* bb, lapack_int ldbb );
+lapack_int LAPACKE_cpbstf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kb, lapack_complex_float* bb,
+                                lapack_int ldbb );
+lapack_int LAPACKE_zpbstf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kb, lapack_complex_double* bb,
+                                lapack_int ldbb );
+
+lapack_int LAPACKE_spbsv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int kd, lapack_int nrhs, float* ab,
+                               lapack_int ldab, float* b, lapack_int ldb );
+lapack_int LAPACKE_dpbsv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int kd, lapack_int nrhs, double* ab,
+                               lapack_int ldab, double* b, lapack_int ldb );
+lapack_int LAPACKE_cpbsv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int kd, lapack_int nrhs,
+                               lapack_complex_float* ab, lapack_int ldab,
+                               lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zpbsv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int kd, lapack_int nrhs,
+                               lapack_complex_double* ab, lapack_int ldab,
+                               lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_spbsvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int kd, lapack_int nrhs,
+                                float* ab, lapack_int ldab, float* afb,
+                                lapack_int ldafb, char* equed, float* s,
+                                float* b, lapack_int ldb, float* x,
+                                lapack_int ldx, float* rcond, float* ferr,
+                                float* berr, float* work, lapack_int* iwork );
+lapack_int LAPACKE_dpbsvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int kd, lapack_int nrhs,
+                                double* ab, lapack_int ldab, double* afb,
+                                lapack_int ldafb, char* equed, double* s,
+                                double* b, lapack_int ldb, double* x,
+                                lapack_int ldx, double* rcond, double* ferr,
+                                double* berr, double* work, lapack_int* iwork );
+lapack_int LAPACKE_cpbsvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int kd, lapack_int nrhs,
+                                lapack_complex_float* ab, lapack_int ldab,
+                                lapack_complex_float* afb, lapack_int ldafb,
+                                char* equed, float* s, lapack_complex_float* b,
+                                lapack_int ldb, lapack_complex_float* x,
+                                lapack_int ldx, float* rcond, float* ferr,
+                                float* berr, lapack_complex_float* work,
+                                float* rwork );
+lapack_int LAPACKE_zpbsvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int kd, lapack_int nrhs,
+                                lapack_complex_double* ab, lapack_int ldab,
+                                lapack_complex_double* afb, lapack_int ldafb,
+                                char* equed, double* s,
+                                lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* rcond, double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_spbtrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, float* ab, lapack_int ldab );
+lapack_int LAPACKE_dpbtrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, double* ab, lapack_int ldab );
+lapack_int LAPACKE_cpbtrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, lapack_complex_float* ab,
+                                lapack_int ldab );
+lapack_int LAPACKE_zpbtrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, lapack_complex_double* ab,
+                                lapack_int ldab );
+
+lapack_int LAPACKE_spbtrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, lapack_int nrhs, const float* ab,
+                                lapack_int ldab, float* b, lapack_int ldb );
+lapack_int LAPACKE_dpbtrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, lapack_int nrhs,
+                                const double* ab, lapack_int ldab, double* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_cpbtrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, lapack_int nrhs,
+                                const lapack_complex_float* ab, lapack_int ldab,
+                                lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zpbtrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, lapack_int nrhs,
+                                const lapack_complex_double* ab,
+                                lapack_int ldab, lapack_complex_double* b,
+                                lapack_int ldb );
+
+lapack_int LAPACKE_spftrf_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, float* a );
+lapack_int LAPACKE_dpftrf_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, double* a );
+lapack_int LAPACKE_cpftrf_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, lapack_complex_float* a );
+lapack_int LAPACKE_zpftrf_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, lapack_complex_double* a );
+
+lapack_int LAPACKE_spftri_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, float* a );
+lapack_int LAPACKE_dpftri_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, double* a );
+lapack_int LAPACKE_cpftri_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, lapack_complex_float* a );
+lapack_int LAPACKE_zpftri_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, lapack_complex_double* a );
+
+lapack_int LAPACKE_spftrs_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, lapack_int nrhs, const float* a,
+                                float* b, lapack_int ldb );
+lapack_int LAPACKE_dpftrs_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, lapack_int nrhs, const double* a,
+                                double* b, lapack_int ldb );
+lapack_int LAPACKE_cpftrs_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, lapack_int nrhs,
+                                const lapack_complex_float* a,
+                                lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zpftrs_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, lapack_int nrhs,
+                                const lapack_complex_double* a,
+                                lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_spocon_work( int matrix_order, char uplo, lapack_int n,
+                                const float* a, lapack_int lda, float anorm,
+                                float* rcond, float* work, lapack_int* iwork );
+lapack_int LAPACKE_dpocon_work( int matrix_order, char uplo, lapack_int n,
+                                const double* a, lapack_int lda, double anorm,
+                                double* rcond, double* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_cpocon_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_float* a, lapack_int lda,
+                                float anorm, float* rcond,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zpocon_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_double* a, lapack_int lda,
+                                double anorm, double* rcond,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_spoequ_work( int matrix_order, lapack_int n, const float* a,
+                                lapack_int lda, float* s, float* scond,
+                                float* amax );
+lapack_int LAPACKE_dpoequ_work( int matrix_order, lapack_int n, const double* a,
+                                lapack_int lda, double* s, double* scond,
+                                double* amax );
+lapack_int LAPACKE_cpoequ_work( int matrix_order, lapack_int n,
+                                const lapack_complex_float* a, lapack_int lda,
+                                float* s, float* scond, float* amax );
+lapack_int LAPACKE_zpoequ_work( int matrix_order, lapack_int n,
+                                const lapack_complex_double* a, lapack_int lda,
+                                double* s, double* scond, double* amax );
+
+lapack_int LAPACKE_spoequb_work( int matrix_order, lapack_int n, const float* a,
+                                 lapack_int lda, float* s, float* scond,
+                                 float* amax );
+lapack_int LAPACKE_dpoequb_work( int matrix_order, lapack_int n,
+                                 const double* a, lapack_int lda, double* s,
+                                 double* scond, double* amax );
+lapack_int LAPACKE_cpoequb_work( int matrix_order, lapack_int n,
+                                 const lapack_complex_float* a, lapack_int lda,
+                                 float* s, float* scond, float* amax );
+lapack_int LAPACKE_zpoequb_work( int matrix_order, lapack_int n,
+                                 const lapack_complex_double* a, lapack_int lda,
+                                 double* s, double* scond, double* amax );
+
+lapack_int LAPACKE_sporfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const float* a, lapack_int lda,
+                                const float* af, lapack_int ldaf,
+                                const float* b, lapack_int ldb, float* x,
+                                lapack_int ldx, float* ferr, float* berr,
+                                float* work, lapack_int* iwork );
+lapack_int LAPACKE_dporfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const double* a,
+                                lapack_int lda, const double* af,
+                                lapack_int ldaf, const double* b,
+                                lapack_int ldb, double* x, lapack_int ldx,
+                                double* ferr, double* berr, double* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_cporfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_float* a,
+                                lapack_int lda, const lapack_complex_float* af,
+                                lapack_int ldaf, const lapack_complex_float* b,
+                                lapack_int ldb, lapack_complex_float* x,
+                                lapack_int ldx, float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zporfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_double* a,
+                                lapack_int lda, const lapack_complex_double* af,
+                                lapack_int ldaf, const lapack_complex_double* b,
+                                lapack_int ldb, lapack_complex_double* x,
+                                lapack_int ldx, double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_sporfsx_work( int matrix_order, char uplo, char equed,
+                                 lapack_int n, lapack_int nrhs, const float* a,
+                                 lapack_int lda, const float* af,
+                                 lapack_int ldaf, const float* s,
+                                 const float* b, lapack_int ldb, float* x,
+                                 lapack_int ldx, float* rcond, float* berr,
+                                 lapack_int n_err_bnds, float* err_bnds_norm,
+                                 float* err_bnds_comp, lapack_int nparams,
+                                 float* params, float* work,
+                                 lapack_int* iwork );
+lapack_int LAPACKE_dporfsx_work( int matrix_order, char uplo, char equed,
+                                 lapack_int n, lapack_int nrhs, const double* a,
+                                 lapack_int lda, const double* af,
+                                 lapack_int ldaf, const double* s,
+                                 const double* b, lapack_int ldb, double* x,
+                                 lapack_int ldx, double* rcond, double* berr,
+                                 lapack_int n_err_bnds, double* err_bnds_norm,
+                                 double* err_bnds_comp, lapack_int nparams,
+                                 double* params, double* work,
+                                 lapack_int* iwork );
+lapack_int LAPACKE_cporfsx_work( int matrix_order, char uplo, char equed,
+                                 lapack_int n, lapack_int nrhs,
+                                 const lapack_complex_float* a, lapack_int lda,
+                                 const lapack_complex_float* af,
+                                 lapack_int ldaf, const float* s,
+                                 const lapack_complex_float* b, lapack_int ldb,
+                                 lapack_complex_float* x, lapack_int ldx,
+                                 float* rcond, float* berr,
+                                 lapack_int n_err_bnds, float* err_bnds_norm,
+                                 float* err_bnds_comp, lapack_int nparams,
+                                 float* params, lapack_complex_float* work,
+                                 float* rwork );
+lapack_int LAPACKE_zporfsx_work( int matrix_order, char uplo, char equed,
+                                 lapack_int n, lapack_int nrhs,
+                                 const lapack_complex_double* a, lapack_int lda,
+                                 const lapack_complex_double* af,
+                                 lapack_int ldaf, const double* s,
+                                 const lapack_complex_double* b, lapack_int ldb,
+                                 lapack_complex_double* x, lapack_int ldx,
+                                 double* rcond, double* berr,
+                                 lapack_int n_err_bnds, double* err_bnds_norm,
+                                 double* err_bnds_comp, lapack_int nparams,
+                                 double* params, lapack_complex_double* work,
+                                 double* rwork );
+
+lapack_int LAPACKE_sposv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, float* a, lapack_int lda,
+                               float* b, lapack_int ldb );
+lapack_int LAPACKE_dposv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, double* a, lapack_int lda,
+                               double* b, lapack_int ldb );
+lapack_int LAPACKE_cposv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, lapack_complex_float* a,
+                               lapack_int lda, lapack_complex_float* b,
+                               lapack_int ldb );
+lapack_int LAPACKE_zposv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, lapack_complex_double* a,
+                               lapack_int lda, lapack_complex_double* b,
+                               lapack_int ldb );
+lapack_int LAPACKE_dsposv_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, double* a, lapack_int lda,
+                                double* b, lapack_int ldb, double* x,
+                                lapack_int ldx, double* work, float* swork,
+                                lapack_int* iter );
+lapack_int LAPACKE_zcposv_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, lapack_complex_double* a,
+                                lapack_int lda, lapack_complex_double* b,
+                                lapack_int ldb, lapack_complex_double* x,
+                                lapack_int ldx, lapack_complex_double* work,
+                                lapack_complex_float* swork, double* rwork,
+                                lapack_int* iter );
+
+lapack_int LAPACKE_sposvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs, float* a,
+                                lapack_int lda, float* af, lapack_int ldaf,
+                                char* equed, float* s, float* b, lapack_int ldb,
+                                float* x, lapack_int ldx, float* rcond,
+                                float* ferr, float* berr, float* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dposvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs, double* a,
+                                lapack_int lda, double* af, lapack_int ldaf,
+                                char* equed, double* s, double* b,
+                                lapack_int ldb, double* x, lapack_int ldx,
+                                double* rcond, double* ferr, double* berr,
+                                double* work, lapack_int* iwork );
+lapack_int LAPACKE_cposvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* af, lapack_int ldaf,
+                                char* equed, float* s, lapack_complex_float* b,
+                                lapack_int ldb, lapack_complex_float* x,
+                                lapack_int ldx, float* rcond, float* ferr,
+                                float* berr, lapack_complex_float* work,
+                                float* rwork );
+lapack_int LAPACKE_zposvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* af, lapack_int ldaf,
+                                char* equed, double* s,
+                                lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* rcond, double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_sposvxx_work( int matrix_order, char fact, char uplo,
+                                 lapack_int n, lapack_int nrhs, float* a,
+                                 lapack_int lda, float* af, lapack_int ldaf,
+                                 char* equed, float* s, float* b,
+                                 lapack_int ldb, float* x, lapack_int ldx,
+                                 float* rcond, float* rpvgrw, float* berr,
+                                 lapack_int n_err_bnds, float* err_bnds_norm,
+                                 float* err_bnds_comp, lapack_int nparams,
+                                 float* params, float* work,
+                                 lapack_int* iwork );
+lapack_int LAPACKE_dposvxx_work( int matrix_order, char fact, char uplo,
+                                 lapack_int n, lapack_int nrhs, double* a,
+                                 lapack_int lda, double* af, lapack_int ldaf,
+                                 char* equed, double* s, double* b,
+                                 lapack_int ldb, double* x, lapack_int ldx,
+                                 double* rcond, double* rpvgrw, double* berr,
+                                 lapack_int n_err_bnds, double* err_bnds_norm,
+                                 double* err_bnds_comp, lapack_int nparams,
+                                 double* params, double* work,
+                                 lapack_int* iwork );
+lapack_int LAPACKE_cposvxx_work( int matrix_order, char fact, char uplo,
+                                 lapack_int n, lapack_int nrhs,
+                                 lapack_complex_float* a, lapack_int lda,
+                                 lapack_complex_float* af, lapack_int ldaf,
+                                 char* equed, float* s, lapack_complex_float* b,
+                                 lapack_int ldb, lapack_complex_float* x,
+                                 lapack_int ldx, float* rcond, float* rpvgrw,
+                                 float* berr, lapack_int n_err_bnds,
+                                 float* err_bnds_norm, float* err_bnds_comp,
+                                 lapack_int nparams, float* params,
+                                 lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zposvxx_work( int matrix_order, char fact, char uplo,
+                                 lapack_int n, lapack_int nrhs,
+                                 lapack_complex_double* a, lapack_int lda,
+                                 lapack_complex_double* af, lapack_int ldaf,
+                                 char* equed, double* s,
+                                 lapack_complex_double* b, lapack_int ldb,
+                                 lapack_complex_double* x, lapack_int ldx,
+                                 double* rcond, double* rpvgrw, double* berr,
+                                 lapack_int n_err_bnds, double* err_bnds_norm,
+                                 double* err_bnds_comp, lapack_int nparams,
+                                 double* params, lapack_complex_double* work,
+                                 double* rwork );
+
+lapack_int LAPACKE_spotrf_work( int matrix_order, char uplo, lapack_int n,
+                                float* a, lapack_int lda );
+lapack_int LAPACKE_dpotrf_work( int matrix_order, char uplo, lapack_int n,
+                                double* a, lapack_int lda );
+lapack_int LAPACKE_cpotrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda );
+lapack_int LAPACKE_zpotrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda );
+
+lapack_int LAPACKE_spotri_work( int matrix_order, char uplo, lapack_int n,
+                                float* a, lapack_int lda );
+lapack_int LAPACKE_dpotri_work( int matrix_order, char uplo, lapack_int n,
+                                double* a, lapack_int lda );
+lapack_int LAPACKE_cpotri_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda );
+lapack_int LAPACKE_zpotri_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda );
+
+lapack_int LAPACKE_spotrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const float* a, lapack_int lda,
+                                float* b, lapack_int ldb );
+lapack_int LAPACKE_dpotrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const double* a,
+                                lapack_int lda, double* b, lapack_int ldb );
+lapack_int LAPACKE_cpotrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_float* a,
+                                lapack_int lda, lapack_complex_float* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_zpotrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_double* a,
+                                lapack_int lda, lapack_complex_double* b,
+                                lapack_int ldb );
+
+lapack_int LAPACKE_sppcon_work( int matrix_order, char uplo, lapack_int n,
+                                const float* ap, float anorm, float* rcond,
+                                float* work, lapack_int* iwork );
+lapack_int LAPACKE_dppcon_work( int matrix_order, char uplo, lapack_int n,
+                                const double* ap, double anorm, double* rcond,
+                                double* work, lapack_int* iwork );
+lapack_int LAPACKE_cppcon_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_float* ap, float anorm,
+                                float* rcond, lapack_complex_float* work,
+                                float* rwork );
+lapack_int LAPACKE_zppcon_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_double* ap, double anorm,
+                                double* rcond, lapack_complex_double* work,
+                                double* rwork );
+
+lapack_int LAPACKE_sppequ_work( int matrix_order, char uplo, lapack_int n,
+                                const float* ap, float* s, float* scond,
+                                float* amax );
+lapack_int LAPACKE_dppequ_work( int matrix_order, char uplo, lapack_int n,
+                                const double* ap, double* s, double* scond,
+                                double* amax );
+lapack_int LAPACKE_cppequ_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_float* ap, float* s,
+                                float* scond, float* amax );
+lapack_int LAPACKE_zppequ_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_double* ap, double* s,
+                                double* scond, double* amax );
+
+lapack_int LAPACKE_spprfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const float* ap,
+                                const float* afp, const float* b,
+                                lapack_int ldb, float* x, lapack_int ldx,
+                                float* ferr, float* berr, float* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dpprfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const double* ap,
+                                const double* afp, const double* b,
+                                lapack_int ldb, double* x, lapack_int ldx,
+                                double* ferr, double* berr, double* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_cpprfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_float* ap,
+                                const lapack_complex_float* afp,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* x, lapack_int ldx,
+                                float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zpprfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs,
+                                const lapack_complex_double* ap,
+                                const lapack_complex_double* afp,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_sppsv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, float* ap, float* b,
+                               lapack_int ldb );
+lapack_int LAPACKE_dppsv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, double* ap, double* b,
+                               lapack_int ldb );
+lapack_int LAPACKE_cppsv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, lapack_complex_float* ap,
+                               lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zppsv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, lapack_complex_double* ap,
+                               lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_sppsvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs, float* ap,
+                                float* afp, char* equed, float* s, float* b,
+                                lapack_int ldb, float* x, lapack_int ldx,
+                                float* rcond, float* ferr, float* berr,
+                                float* work, lapack_int* iwork );
+lapack_int LAPACKE_dppsvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs, double* ap,
+                                double* afp, char* equed, double* s, double* b,
+                                lapack_int ldb, double* x, lapack_int ldx,
+                                double* rcond, double* ferr, double* berr,
+                                double* work, lapack_int* iwork );
+lapack_int LAPACKE_cppsvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs,
+                                lapack_complex_float* ap,
+                                lapack_complex_float* afp, char* equed,
+                                float* s, lapack_complex_float* b,
+                                lapack_int ldb, lapack_complex_float* x,
+                                lapack_int ldx, float* rcond, float* ferr,
+                                float* berr, lapack_complex_float* work,
+                                float* rwork );
+lapack_int LAPACKE_zppsvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs,
+                                lapack_complex_double* ap,
+                                lapack_complex_double* afp, char* equed,
+                                double* s, lapack_complex_double* b,
+                                lapack_int ldb, lapack_complex_double* x,
+                                lapack_int ldx, double* rcond, double* ferr,
+                                double* berr, lapack_complex_double* work,
+                                double* rwork );
+
+lapack_int LAPACKE_spptrf_work( int matrix_order, char uplo, lapack_int n,
+                                float* ap );
+lapack_int LAPACKE_dpptrf_work( int matrix_order, char uplo, lapack_int n,
+                                double* ap );
+lapack_int LAPACKE_cpptrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* ap );
+lapack_int LAPACKE_zpptrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* ap );
+
+lapack_int LAPACKE_spptri_work( int matrix_order, char uplo, lapack_int n,
+                                float* ap );
+lapack_int LAPACKE_dpptri_work( int matrix_order, char uplo, lapack_int n,
+                                double* ap );
+lapack_int LAPACKE_cpptri_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* ap );
+lapack_int LAPACKE_zpptri_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* ap );
+
+lapack_int LAPACKE_spptrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const float* ap, float* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_dpptrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const double* ap, double* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_cpptrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_float* ap,
+                                lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zpptrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs,
+                                const lapack_complex_double* ap,
+                                lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_spstrf_work( int matrix_order, char uplo, lapack_int n,
+                                float* a, lapack_int lda, lapack_int* piv,
+                                lapack_int* rank, float tol, float* work );
+lapack_int LAPACKE_dpstrf_work( int matrix_order, char uplo, lapack_int n,
+                                double* a, lapack_int lda, lapack_int* piv,
+                                lapack_int* rank, double tol, double* work );
+lapack_int LAPACKE_cpstrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_int* piv, lapack_int* rank, float tol,
+                                float* work );
+lapack_int LAPACKE_zpstrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_int* piv, lapack_int* rank, double tol,
+                                double* work );
+
+lapack_int LAPACKE_sptcon_work( lapack_int n, const float* d, const float* e,
+                                float anorm, float* rcond, float* work );
+lapack_int LAPACKE_dptcon_work( lapack_int n, const double* d, const double* e,
+                                double anorm, double* rcond, double* work );
+lapack_int LAPACKE_cptcon_work( lapack_int n, const float* d,
+                                const lapack_complex_float* e, float anorm,
+                                float* rcond, float* work );
+lapack_int LAPACKE_zptcon_work( lapack_int n, const double* d,
+                                const lapack_complex_double* e, double anorm,
+                                double* rcond, double* work );
+
+lapack_int LAPACKE_spteqr_work( int matrix_order, char compz, lapack_int n,
+                                float* d, float* e, float* z, lapack_int ldz,
+                                float* work );
+lapack_int LAPACKE_dpteqr_work( int matrix_order, char compz, lapack_int n,
+                                double* d, double* e, double* z, lapack_int ldz,
+                                double* work );
+lapack_int LAPACKE_cpteqr_work( int matrix_order, char compz, lapack_int n,
+                                float* d, float* e, lapack_complex_float* z,
+                                lapack_int ldz, float* work );
+lapack_int LAPACKE_zpteqr_work( int matrix_order, char compz, lapack_int n,
+                                double* d, double* e, lapack_complex_double* z,
+                                lapack_int ldz, double* work );
+
+lapack_int LAPACKE_sptrfs_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                                const float* d, const float* e, const float* df,
+                                const float* ef, const float* b, lapack_int ldb,
+                                float* x, lapack_int ldx, float* ferr,
+                                float* berr, float* work );
+lapack_int LAPACKE_dptrfs_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                                const double* d, const double* e,
+                                const double* df, const double* ef,
+                                const double* b, lapack_int ldb, double* x,
+                                lapack_int ldx, double* ferr, double* berr,
+                                double* work );
+lapack_int LAPACKE_cptrfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const float* d,
+                                const lapack_complex_float* e, const float* df,
+                                const lapack_complex_float* ef,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* x, lapack_int ldx,
+                                float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zptrfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const double* d,
+                                const lapack_complex_double* e,
+                                const double* df,
+                                const lapack_complex_double* ef,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_sptsv_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                               float* d, float* e, float* b, lapack_int ldb );
+lapack_int LAPACKE_dptsv_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                               double* d, double* e, double* b,
+                               lapack_int ldb );
+lapack_int LAPACKE_cptsv_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                               float* d, lapack_complex_float* e,
+                               lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zptsv_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                               double* d, lapack_complex_double* e,
+                               lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_sptsvx_work( int matrix_order, char fact, lapack_int n,
+                                lapack_int nrhs, const float* d, const float* e,
+                                float* df, float* ef, const float* b,
+                                lapack_int ldb, float* x, lapack_int ldx,
+                                float* rcond, float* ferr, float* berr,
+                                float* work );
+lapack_int LAPACKE_dptsvx_work( int matrix_order, char fact, lapack_int n,
+                                lapack_int nrhs, const double* d,
+                                const double* e, double* df, double* ef,
+                                const double* b, lapack_int ldb, double* x,
+                                lapack_int ldx, double* rcond, double* ferr,
+                                double* berr, double* work );
+lapack_int LAPACKE_cptsvx_work( int matrix_order, char fact, lapack_int n,
+                                lapack_int nrhs, const float* d,
+                                const lapack_complex_float* e, float* df,
+                                lapack_complex_float* ef,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* x, lapack_int ldx,
+                                float* rcond, float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zptsvx_work( int matrix_order, char fact, lapack_int n,
+                                lapack_int nrhs, const double* d,
+                                const lapack_complex_double* e, double* df,
+                                lapack_complex_double* ef,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* rcond, double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_spttrf_work( lapack_int n, float* d, float* e );
+lapack_int LAPACKE_dpttrf_work( lapack_int n, double* d, double* e );
+lapack_int LAPACKE_cpttrf_work( lapack_int n, float* d,
+                                lapack_complex_float* e );
+lapack_int LAPACKE_zpttrf_work( lapack_int n, double* d,
+                                lapack_complex_double* e );
+
+lapack_int LAPACKE_spttrs_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                                const float* d, const float* e, float* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_dpttrs_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                                const double* d, const double* e, double* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_cpttrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const float* d,
+                                const lapack_complex_float* e,
+                                lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zpttrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const double* d,
+                                const lapack_complex_double* e,
+                                lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_ssbev_work( int matrix_order, char jobz, char uplo,
+                               lapack_int n, lapack_int kd, float* ab,
+                               lapack_int ldab, float* w, float* z,
+                               lapack_int ldz, float* work );
+lapack_int LAPACKE_dsbev_work( int matrix_order, char jobz, char uplo,
+                               lapack_int n, lapack_int kd, double* ab,
+                               lapack_int ldab, double* w, double* z,
+                               lapack_int ldz, double* work );
+
+lapack_int LAPACKE_ssbevd_work( int matrix_order, char jobz, char uplo,
+                                lapack_int n, lapack_int kd, float* ab,
+                                lapack_int ldab, float* w, float* z,
+                                lapack_int ldz, float* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_dsbevd_work( int matrix_order, char jobz, char uplo,
+                                lapack_int n, lapack_int kd, double* ab,
+                                lapack_int ldab, double* w, double* z,
+                                lapack_int ldz, double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+
+lapack_int LAPACKE_ssbevx_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n, lapack_int kd,
+                                float* ab, lapack_int ldab, float* q,
+                                lapack_int ldq, float vl, float vu,
+                                lapack_int il, lapack_int iu, float abstol,
+                                lapack_int* m, float* w, float* z,
+                                lapack_int ldz, float* work, lapack_int* iwork,
+                                lapack_int* ifail );
+lapack_int LAPACKE_dsbevx_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n, lapack_int kd,
+                                double* ab, lapack_int ldab, double* q,
+                                lapack_int ldq, double vl, double vu,
+                                lapack_int il, lapack_int iu, double abstol,
+                                lapack_int* m, double* w, double* z,
+                                lapack_int ldz, double* work, lapack_int* iwork,
+                                lapack_int* ifail );
+
+lapack_int LAPACKE_ssbgst_work( int matrix_order, char vect, char uplo,
+                                lapack_int n, lapack_int ka, lapack_int kb,
+                                float* ab, lapack_int ldab, const float* bb,
+                                lapack_int ldbb, float* x, lapack_int ldx,
+                                float* work );
+lapack_int LAPACKE_dsbgst_work( int matrix_order, char vect, char uplo,
+                                lapack_int n, lapack_int ka, lapack_int kb,
+                                double* ab, lapack_int ldab, const double* bb,
+                                lapack_int ldbb, double* x, lapack_int ldx,
+                                double* work );
+
+lapack_int LAPACKE_ssbgv_work( int matrix_order, char jobz, char uplo,
+                               lapack_int n, lapack_int ka, lapack_int kb,
+                               float* ab, lapack_int ldab, float* bb,
+                               lapack_int ldbb, float* w, float* z,
+                               lapack_int ldz, float* work );
+lapack_int LAPACKE_dsbgv_work( int matrix_order, char jobz, char uplo,
+                               lapack_int n, lapack_int ka, lapack_int kb,
+                               double* ab, lapack_int ldab, double* bb,
+                               lapack_int ldbb, double* w, double* z,
+                               lapack_int ldz, double* work );
+
+lapack_int LAPACKE_ssbgvd_work( int matrix_order, char jobz, char uplo,
+                                lapack_int n, lapack_int ka, lapack_int kb,
+                                float* ab, lapack_int ldab, float* bb,
+                                lapack_int ldbb, float* w, float* z,
+                                lapack_int ldz, float* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_dsbgvd_work( int matrix_order, char jobz, char uplo,
+                                lapack_int n, lapack_int ka, lapack_int kb,
+                                double* ab, lapack_int ldab, double* bb,
+                                lapack_int ldbb, double* w, double* z,
+                                lapack_int ldz, double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+
+lapack_int LAPACKE_ssbgvx_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n, lapack_int ka,
+                                lapack_int kb, float* ab, lapack_int ldab,
+                                float* bb, lapack_int ldbb, float* q,
+                                lapack_int ldq, float vl, float vu,
+                                lapack_int il, lapack_int iu, float abstol,
+                                lapack_int* m, float* w, float* z,
+                                lapack_int ldz, float* work, lapack_int* iwork,
+                                lapack_int* ifail );
+lapack_int LAPACKE_dsbgvx_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n, lapack_int ka,
+                                lapack_int kb, double* ab, lapack_int ldab,
+                                double* bb, lapack_int ldbb, double* q,
+                                lapack_int ldq, double vl, double vu,
+                                lapack_int il, lapack_int iu, double abstol,
+                                lapack_int* m, double* w, double* z,
+                                lapack_int ldz, double* work, lapack_int* iwork,
+                                lapack_int* ifail );
+
+lapack_int LAPACKE_ssbtrd_work( int matrix_order, char vect, char uplo,
+                                lapack_int n, lapack_int kd, float* ab,
+                                lapack_int ldab, float* d, float* e, float* q,
+                                lapack_int ldq, float* work );
+lapack_int LAPACKE_dsbtrd_work( int matrix_order, char vect, char uplo,
+                                lapack_int n, lapack_int kd, double* ab,
+                                lapack_int ldab, double* d, double* e,
+                                double* q, lapack_int ldq, double* work );
+
+lapack_int LAPACKE_ssfrk_work( int matrix_order, char transr, char uplo,
+                               char trans, lapack_int n, lapack_int k,
+                               float alpha, const float* a, lapack_int lda,
+                               float beta, float* c );
+lapack_int LAPACKE_dsfrk_work( int matrix_order, char transr, char uplo,
+                               char trans, lapack_int n, lapack_int k,
+                               double alpha, const double* a, lapack_int lda,
+                               double beta, double* c );
+
+lapack_int LAPACKE_sspcon_work( int matrix_order, char uplo, lapack_int n,
+                                const float* ap, const lapack_int* ipiv,
+                                float anorm, float* rcond, float* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dspcon_work( int matrix_order, char uplo, lapack_int n,
+                                const double* ap, const lapack_int* ipiv,
+                                double anorm, double* rcond, double* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_cspcon_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_float* ap,
+                                const lapack_int* ipiv, float anorm,
+                                float* rcond, lapack_complex_float* work );
+lapack_int LAPACKE_zspcon_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_double* ap,
+                                const lapack_int* ipiv, double anorm,
+                                double* rcond, lapack_complex_double* work );
+
+lapack_int LAPACKE_sspev_work( int matrix_order, char jobz, char uplo,
+                               lapack_int n, float* ap, float* w, float* z,
+                               lapack_int ldz, float* work );
+lapack_int LAPACKE_dspev_work( int matrix_order, char jobz, char uplo,
+                               lapack_int n, double* ap, double* w, double* z,
+                               lapack_int ldz, double* work );
+
+lapack_int LAPACKE_sspevd_work( int matrix_order, char jobz, char uplo,
+                                lapack_int n, float* ap, float* w, float* z,
+                                lapack_int ldz, float* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_dspevd_work( int matrix_order, char jobz, char uplo,
+                                lapack_int n, double* ap, double* w, double* z,
+                                lapack_int ldz, double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+
+lapack_int LAPACKE_sspevx_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n, float* ap, float vl,
+                                float vu, lapack_int il, lapack_int iu,
+                                float abstol, lapack_int* m, float* w, float* z,
+                                lapack_int ldz, float* work, lapack_int* iwork,
+                                lapack_int* ifail );
+lapack_int LAPACKE_dspevx_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n, double* ap, double vl,
+                                double vu, lapack_int il, lapack_int iu,
+                                double abstol, lapack_int* m, double* w,
+                                double* z, lapack_int ldz, double* work,
+                                lapack_int* iwork, lapack_int* ifail );
+
+lapack_int LAPACKE_sspgst_work( int matrix_order, lapack_int itype, char uplo,
+                                lapack_int n, float* ap, const float* bp );
+lapack_int LAPACKE_dspgst_work( int matrix_order, lapack_int itype, char uplo,
+                                lapack_int n, double* ap, const double* bp );
+
+lapack_int LAPACKE_sspgv_work( int matrix_order, lapack_int itype, char jobz,
+                               char uplo, lapack_int n, float* ap, float* bp,
+                               float* w, float* z, lapack_int ldz,
+                               float* work );
+lapack_int LAPACKE_dspgv_work( int matrix_order, lapack_int itype, char jobz,
+                               char uplo, lapack_int n, double* ap, double* bp,
+                               double* w, double* z, lapack_int ldz,
+                               double* work );
+
+lapack_int LAPACKE_sspgvd_work( int matrix_order, lapack_int itype, char jobz,
+                                char uplo, lapack_int n, float* ap, float* bp,
+                                float* w, float* z, lapack_int ldz, float* work,
+                                lapack_int lwork, lapack_int* iwork,
+                                lapack_int liwork );
+lapack_int LAPACKE_dspgvd_work( int matrix_order, lapack_int itype, char jobz,
+                                char uplo, lapack_int n, double* ap, double* bp,
+                                double* w, double* z, lapack_int ldz,
+                                double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+
+lapack_int LAPACKE_sspgvx_work( int matrix_order, lapack_int itype, char jobz,
+                                char range, char uplo, lapack_int n, float* ap,
+                                float* bp, float vl, float vu, lapack_int il,
+                                lapack_int iu, float abstol, lapack_int* m,
+                                float* w, float* z, lapack_int ldz, float* work,
+                                lapack_int* iwork, lapack_int* ifail );
+lapack_int LAPACKE_dspgvx_work( int matrix_order, lapack_int itype, char jobz,
+                                char range, char uplo, lapack_int n, double* ap,
+                                double* bp, double vl, double vu, lapack_int il,
+                                lapack_int iu, double abstol, lapack_int* m,
+                                double* w, double* z, lapack_int ldz,
+                                double* work, lapack_int* iwork,
+                                lapack_int* ifail );
+
+lapack_int LAPACKE_ssprfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const float* ap,
+                                const float* afp, const lapack_int* ipiv,
+                                const float* b, lapack_int ldb, float* x,
+                                lapack_int ldx, float* ferr, float* berr,
+                                float* work, lapack_int* iwork );
+lapack_int LAPACKE_dsprfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const double* ap,
+                                const double* afp, const lapack_int* ipiv,
+                                const double* b, lapack_int ldb, double* x,
+                                lapack_int ldx, double* ferr, double* berr,
+                                double* work, lapack_int* iwork );
+lapack_int LAPACKE_csprfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_float* ap,
+                                const lapack_complex_float* afp,
+                                const lapack_int* ipiv,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* x, lapack_int ldx,
+                                float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zsprfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs,
+                                const lapack_complex_double* ap,
+                                const lapack_complex_double* afp,
+                                const lapack_int* ipiv,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_sspsv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, float* ap, lapack_int* ipiv,
+                               float* b, lapack_int ldb );
+lapack_int LAPACKE_dspsv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, double* ap, lapack_int* ipiv,
+                               double* b, lapack_int ldb );
+lapack_int LAPACKE_cspsv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, lapack_complex_float* ap,
+                               lapack_int* ipiv, lapack_complex_float* b,
+                               lapack_int ldb );
+lapack_int LAPACKE_zspsv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, lapack_complex_double* ap,
+                               lapack_int* ipiv, lapack_complex_double* b,
+                               lapack_int ldb );
+
+lapack_int LAPACKE_sspsvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs, const float* ap,
+                                float* afp, lapack_int* ipiv, const float* b,
+                                lapack_int ldb, float* x, lapack_int ldx,
+                                float* rcond, float* ferr, float* berr,
+                                float* work, lapack_int* iwork );
+lapack_int LAPACKE_dspsvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs, const double* ap,
+                                double* afp, lapack_int* ipiv, const double* b,
+                                lapack_int ldb, double* x, lapack_int ldx,
+                                double* rcond, double* ferr, double* berr,
+                                double* work, lapack_int* iwork );
+lapack_int LAPACKE_cspsvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs,
+                                const lapack_complex_float* ap,
+                                lapack_complex_float* afp, lapack_int* ipiv,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* x, lapack_int ldx,
+                                float* rcond, float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zspsvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs,
+                                const lapack_complex_double* ap,
+                                lapack_complex_double* afp, lapack_int* ipiv,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* rcond, double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_ssptrd_work( int matrix_order, char uplo, lapack_int n,
+                                float* ap, float* d, float* e, float* tau );
+lapack_int LAPACKE_dsptrd_work( int matrix_order, char uplo, lapack_int n,
+                                double* ap, double* d, double* e, double* tau );
+
+lapack_int LAPACKE_ssptrf_work( int matrix_order, char uplo, lapack_int n,
+                                float* ap, lapack_int* ipiv );
+lapack_int LAPACKE_dsptrf_work( int matrix_order, char uplo, lapack_int n,
+                                double* ap, lapack_int* ipiv );
+lapack_int LAPACKE_csptrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* ap, lapack_int* ipiv );
+lapack_int LAPACKE_zsptrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* ap, lapack_int* ipiv );
+
+lapack_int LAPACKE_ssptri_work( int matrix_order, char uplo, lapack_int n,
+                                float* ap, const lapack_int* ipiv,
+                                float* work );
+lapack_int LAPACKE_dsptri_work( int matrix_order, char uplo, lapack_int n,
+                                double* ap, const lapack_int* ipiv,
+                                double* work );
+lapack_int LAPACKE_csptri_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* ap,
+                                const lapack_int* ipiv,
+                                lapack_complex_float* work );
+lapack_int LAPACKE_zsptri_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* ap,
+                                const lapack_int* ipiv,
+                                lapack_complex_double* work );
+
+lapack_int LAPACKE_ssptrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const float* ap,
+                                const lapack_int* ipiv, float* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_dsptrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const double* ap,
+                                const lapack_int* ipiv, double* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_csptrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_float* ap,
+                                const lapack_int* ipiv, lapack_complex_float* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_zsptrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs,
+                                const lapack_complex_double* ap,
+                                const lapack_int* ipiv,
+                                lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_sstebz_work( char range, char order, lapack_int n, float vl,
+                                float vu, lapack_int il, lapack_int iu,
+                                float abstol, const float* d, const float* e,
+                                lapack_int* m, lapack_int* nsplit, float* w,
+                                lapack_int* iblock, lapack_int* isplit,
+                                float* work, lapack_int* iwork );
+lapack_int LAPACKE_dstebz_work( char range, char order, lapack_int n, double vl,
+                                double vu, lapack_int il, lapack_int iu,
+                                double abstol, const double* d, const double* e,
+                                lapack_int* m, lapack_int* nsplit, double* w,
+                                lapack_int* iblock, lapack_int* isplit,
+                                double* work, lapack_int* iwork );
+
+lapack_int LAPACKE_sstedc_work( int matrix_order, char compz, lapack_int n,
+                                float* d, float* e, float* z, lapack_int ldz,
+                                float* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_dstedc_work( int matrix_order, char compz, lapack_int n,
+                                double* d, double* e, double* z, lapack_int ldz,
+                                double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_cstedc_work( int matrix_order, char compz, lapack_int n,
+                                float* d, float* e, lapack_complex_float* z,
+                                lapack_int ldz, lapack_complex_float* work,
+                                lapack_int lwork, float* rwork,
+                                lapack_int lrwork, lapack_int* iwork,
+                                lapack_int liwork );
+lapack_int LAPACKE_zstedc_work( int matrix_order, char compz, lapack_int n,
+                                double* d, double* e, lapack_complex_double* z,
+                                lapack_int ldz, lapack_complex_double* work,
+                                lapack_int lwork, double* rwork,
+                                lapack_int lrwork, lapack_int* iwork,
+                                lapack_int liwork );
+
+lapack_int LAPACKE_sstegr_work( int matrix_order, char jobz, char range,
+                                lapack_int n, float* d, float* e, float vl,
+                                float vu, lapack_int il, lapack_int iu,
+                                float abstol, lapack_int* m, float* w, float* z,
+                                lapack_int ldz, lapack_int* isuppz, float* work,
+                                lapack_int lwork, lapack_int* iwork,
+                                lapack_int liwork );
+lapack_int LAPACKE_dstegr_work( int matrix_order, char jobz, char range,
+                                lapack_int n, double* d, double* e, double vl,
+                                double vu, lapack_int il, lapack_int iu,
+                                double abstol, lapack_int* m, double* w,
+                                double* z, lapack_int ldz, lapack_int* isuppz,
+                                double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_cstegr_work( int matrix_order, char jobz, char range,
+                                lapack_int n, float* d, float* e, float vl,
+                                float vu, lapack_int il, lapack_int iu,
+                                float abstol, lapack_int* m, float* w,
+                                lapack_complex_float* z, lapack_int ldz,
+                                lapack_int* isuppz, float* work,
+                                lapack_int lwork, lapack_int* iwork,
+                                lapack_int liwork );
+lapack_int LAPACKE_zstegr_work( int matrix_order, char jobz, char range,
+                                lapack_int n, double* d, double* e, double vl,
+                                double vu, lapack_int il, lapack_int iu,
+                                double abstol, lapack_int* m, double* w,
+                                lapack_complex_double* z, lapack_int ldz,
+                                lapack_int* isuppz, double* work,
+                                lapack_int lwork, lapack_int* iwork,
+                                lapack_int liwork );
+
+lapack_int LAPACKE_sstein_work( int matrix_order, lapack_int n, const float* d,
+                                const float* e, lapack_int m, const float* w,
+                                const lapack_int* iblock,
+                                const lapack_int* isplit, float* z,
+                                lapack_int ldz, float* work, lapack_int* iwork,
+                                lapack_int* ifailv );
+lapack_int LAPACKE_dstein_work( int matrix_order, lapack_int n, const double* d,
+                                const double* e, lapack_int m, const double* w,
+                                const lapack_int* iblock,
+                                const lapack_int* isplit, double* z,
+                                lapack_int ldz, double* work, lapack_int* iwork,
+                                lapack_int* ifailv );
+lapack_int LAPACKE_cstein_work( int matrix_order, lapack_int n, const float* d,
+                                const float* e, lapack_int m, const float* w,
+                                const lapack_int* iblock,
+                                const lapack_int* isplit,
+                                lapack_complex_float* z, lapack_int ldz,
+                                float* work, lapack_int* iwork,
+                                lapack_int* ifailv );
+lapack_int LAPACKE_zstein_work( int matrix_order, lapack_int n, const double* d,
+                                const double* e, lapack_int m, const double* w,
+                                const lapack_int* iblock,
+                                const lapack_int* isplit,
+                                lapack_complex_double* z, lapack_int ldz,
+                                double* work, lapack_int* iwork,
+                                lapack_int* ifailv );
+
+lapack_int LAPACKE_sstemr_work( int matrix_order, char jobz, char range,
+                                lapack_int n, float* d, float* e, float vl,
+                                float vu, lapack_int il, lapack_int iu,
+                                lapack_int* m, float* w, float* z,
+                                lapack_int ldz, lapack_int nzc,
+                                lapack_int* isuppz, lapack_logical* tryrac,
+                                float* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_dstemr_work( int matrix_order, char jobz, char range,
+                                lapack_int n, double* d, double* e, double vl,
+                                double vu, lapack_int il, lapack_int iu,
+                                lapack_int* m, double* w, double* z,
+                                lapack_int ldz, lapack_int nzc,
+                                lapack_int* isuppz, lapack_logical* tryrac,
+                                double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_cstemr_work( int matrix_order, char jobz, char range,
+                                lapack_int n, float* d, float* e, float vl,
+                                float vu, lapack_int il, lapack_int iu,
+                                lapack_int* m, float* w,
+                                lapack_complex_float* z, lapack_int ldz,
+                                lapack_int nzc, lapack_int* isuppz,
+                                lapack_logical* tryrac, float* work,
+                                lapack_int lwork, lapack_int* iwork,
+                                lapack_int liwork );
+lapack_int LAPACKE_zstemr_work( int matrix_order, char jobz, char range,
+                                lapack_int n, double* d, double* e, double vl,
+                                double vu, lapack_int il, lapack_int iu,
+                                lapack_int* m, double* w,
+                                lapack_complex_double* z, lapack_int ldz,
+                                lapack_int nzc, lapack_int* isuppz,
+                                lapack_logical* tryrac, double* work,
+                                lapack_int lwork, lapack_int* iwork,
+                                lapack_int liwork );
+
+lapack_int LAPACKE_ssteqr_work( int matrix_order, char compz, lapack_int n,
+                                float* d, float* e, float* z, lapack_int ldz,
+                                float* work );
+lapack_int LAPACKE_dsteqr_work( int matrix_order, char compz, lapack_int n,
+                                double* d, double* e, double* z, lapack_int ldz,
+                                double* work );
+lapack_int LAPACKE_csteqr_work( int matrix_order, char compz, lapack_int n,
+                                float* d, float* e, lapack_complex_float* z,
+                                lapack_int ldz, float* work );
+lapack_int LAPACKE_zsteqr_work( int matrix_order, char compz, lapack_int n,
+                                double* d, double* e, lapack_complex_double* z,
+                                lapack_int ldz, double* work );
+
+lapack_int LAPACKE_ssterf_work( lapack_int n, float* d, float* e );
+lapack_int LAPACKE_dsterf_work( lapack_int n, double* d, double* e );
+
+lapack_int LAPACKE_sstev_work( int matrix_order, char jobz, lapack_int n,
+                               float* d, float* e, float* z, lapack_int ldz,
+                               float* work );
+lapack_int LAPACKE_dstev_work( int matrix_order, char jobz, lapack_int n,
+                               double* d, double* e, double* z, lapack_int ldz,
+                               double* work );
+
+lapack_int LAPACKE_sstevd_work( int matrix_order, char jobz, lapack_int n,
+                                float* d, float* e, float* z, lapack_int ldz,
+                                float* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_dstevd_work( int matrix_order, char jobz, lapack_int n,
+                                double* d, double* e, double* z, lapack_int ldz,
+                                double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+
+lapack_int LAPACKE_sstevr_work( int matrix_order, char jobz, char range,
+                                lapack_int n, float* d, float* e, float vl,
+                                float vu, lapack_int il, lapack_int iu,
+                                float abstol, lapack_int* m, float* w, float* z,
+                                lapack_int ldz, lapack_int* isuppz, float* work,
+                                lapack_int lwork, lapack_int* iwork,
+                                lapack_int liwork );
+lapack_int LAPACKE_dstevr_work( int matrix_order, char jobz, char range,
+                                lapack_int n, double* d, double* e, double vl,
+                                double vu, lapack_int il, lapack_int iu,
+                                double abstol, lapack_int* m, double* w,
+                                double* z, lapack_int ldz, lapack_int* isuppz,
+                                double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+
+lapack_int LAPACKE_sstevx_work( int matrix_order, char jobz, char range,
+                                lapack_int n, float* d, float* e, float vl,
+                                float vu, lapack_int il, lapack_int iu,
+                                float abstol, lapack_int* m, float* w, float* z,
+                                lapack_int ldz, float* work, lapack_int* iwork,
+                                lapack_int* ifail );
+lapack_int LAPACKE_dstevx_work( int matrix_order, char jobz, char range,
+                                lapack_int n, double* d, double* e, double vl,
+                                double vu, lapack_int il, lapack_int iu,
+                                double abstol, lapack_int* m, double* w,
+                                double* z, lapack_int ldz, double* work,
+                                lapack_int* iwork, lapack_int* ifail );
+
+lapack_int LAPACKE_ssycon_work( int matrix_order, char uplo, lapack_int n,
+                                const float* a, lapack_int lda,
+                                const lapack_int* ipiv, float anorm,
+                                float* rcond, float* work, lapack_int* iwork );
+lapack_int LAPACKE_dsycon_work( int matrix_order, char uplo, lapack_int n,
+                                const double* a, lapack_int lda,
+                                const lapack_int* ipiv, double anorm,
+                                double* rcond, double* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_csycon_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_float* a, lapack_int lda,
+                                const lapack_int* ipiv, float anorm,
+                                float* rcond, lapack_complex_float* work );
+lapack_int LAPACKE_zsycon_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_double* a, lapack_int lda,
+                                const lapack_int* ipiv, double anorm,
+                                double* rcond, lapack_complex_double* work );
+
+lapack_int LAPACKE_ssyequb_work( int matrix_order, char uplo, lapack_int n,
+                                 const float* a, lapack_int lda, float* s,
+                                 float* scond, float* amax, float* work );
+lapack_int LAPACKE_dsyequb_work( int matrix_order, char uplo, lapack_int n,
+                                 const double* a, lapack_int lda, double* s,
+                                 double* scond, double* amax, double* work );
+lapack_int LAPACKE_csyequb_work( int matrix_order, char uplo, lapack_int n,
+                                 const lapack_complex_float* a, lapack_int lda,
+                                 float* s, float* scond, float* amax,
+                                 lapack_complex_float* work );
+lapack_int LAPACKE_zsyequb_work( int matrix_order, char uplo, lapack_int n,
+                                 const lapack_complex_double* a, lapack_int lda,
+                                 double* s, double* scond, double* amax,
+                                 lapack_complex_double* work );
+
+lapack_int LAPACKE_ssyev_work( int matrix_order, char jobz, char uplo,
+                               lapack_int n, float* a, lapack_int lda, float* w,
+                               float* work, lapack_int lwork );
+lapack_int LAPACKE_dsyev_work( int matrix_order, char jobz, char uplo,
+                               lapack_int n, double* a, lapack_int lda,
+                               double* w, double* work, lapack_int lwork );
+
+lapack_int LAPACKE_ssyevd_work( int matrix_order, char jobz, char uplo,
+                                lapack_int n, float* a, lapack_int lda,
+                                float* w, float* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_dsyevd_work( int matrix_order, char jobz, char uplo,
+                                lapack_int n, double* a, lapack_int lda,
+                                double* w, double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+
+lapack_int LAPACKE_ssyevr_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n, float* a,
+                                lapack_int lda, float vl, float vu,
+                                lapack_int il, lapack_int iu, float abstol,
+                                lapack_int* m, float* w, float* z,
+                                lapack_int ldz, lapack_int* isuppz, float* work,
+                                lapack_int lwork, lapack_int* iwork,
+                                lapack_int liwork );
+lapack_int LAPACKE_dsyevr_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n, double* a,
+                                lapack_int lda, double vl, double vu,
+                                lapack_int il, lapack_int iu, double abstol,
+                                lapack_int* m, double* w, double* z,
+                                lapack_int ldz, lapack_int* isuppz,
+                                double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+
+lapack_int LAPACKE_ssyevx_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n, float* a,
+                                lapack_int lda, float vl, float vu,
+                                lapack_int il, lapack_int iu, float abstol,
+                                lapack_int* m, float* w, float* z,
+                                lapack_int ldz, float* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int* ifail );
+lapack_int LAPACKE_dsyevx_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n, double* a,
+                                lapack_int lda, double vl, double vu,
+                                lapack_int il, lapack_int iu, double abstol,
+                                lapack_int* m, double* w, double* z,
+                                lapack_int ldz, double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int* ifail );
+
+lapack_int LAPACKE_ssygst_work( int matrix_order, lapack_int itype, char uplo,
+                                lapack_int n, float* a, lapack_int lda,
+                                const float* b, lapack_int ldb );
+lapack_int LAPACKE_dsygst_work( int matrix_order, lapack_int itype, char uplo,
+                                lapack_int n, double* a, lapack_int lda,
+                                const double* b, lapack_int ldb );
+
+lapack_int LAPACKE_ssygv_work( int matrix_order, lapack_int itype, char jobz,
+                               char uplo, lapack_int n, float* a,
+                               lapack_int lda, float* b, lapack_int ldb,
+                               float* w, float* work, lapack_int lwork );
+lapack_int LAPACKE_dsygv_work( int matrix_order, lapack_int itype, char jobz,
+                               char uplo, lapack_int n, double* a,
+                               lapack_int lda, double* b, lapack_int ldb,
+                               double* w, double* work, lapack_int lwork );
+
+lapack_int LAPACKE_ssygvd_work( int matrix_order, lapack_int itype, char jobz,
+                                char uplo, lapack_int n, float* a,
+                                lapack_int lda, float* b, lapack_int ldb,
+                                float* w, float* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_dsygvd_work( int matrix_order, lapack_int itype, char jobz,
+                                char uplo, lapack_int n, double* a,
+                                lapack_int lda, double* b, lapack_int ldb,
+                                double* w, double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+
+lapack_int LAPACKE_ssygvx_work( int matrix_order, lapack_int itype, char jobz,
+                                char range, char uplo, lapack_int n, float* a,
+                                lapack_int lda, float* b, lapack_int ldb,
+                                float vl, float vu, lapack_int il,
+                                lapack_int iu, float abstol, lapack_int* m,
+                                float* w, float* z, lapack_int ldz, float* work,
+                                lapack_int lwork, lapack_int* iwork,
+                                lapack_int* ifail );
+lapack_int LAPACKE_dsygvx_work( int matrix_order, lapack_int itype, char jobz,
+                                char range, char uplo, lapack_int n, double* a,
+                                lapack_int lda, double* b, lapack_int ldb,
+                                double vl, double vu, lapack_int il,
+                                lapack_int iu, double abstol, lapack_int* m,
+                                double* w, double* z, lapack_int ldz,
+                                double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int* ifail );
+
+lapack_int LAPACKE_ssyrfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const float* a, lapack_int lda,
+                                const float* af, lapack_int ldaf,
+                                const lapack_int* ipiv, const float* b,
+                                lapack_int ldb, float* x, lapack_int ldx,
+                                float* ferr, float* berr, float* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dsyrfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const double* a,
+                                lapack_int lda, const double* af,
+                                lapack_int ldaf, const lapack_int* ipiv,
+                                const double* b, lapack_int ldb, double* x,
+                                lapack_int ldx, double* ferr, double* berr,
+                                double* work, lapack_int* iwork );
+lapack_int LAPACKE_csyrfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_float* a,
+                                lapack_int lda, const lapack_complex_float* af,
+                                lapack_int ldaf, const lapack_int* ipiv,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* x, lapack_int ldx,
+                                float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zsyrfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_double* a,
+                                lapack_int lda, const lapack_complex_double* af,
+                                lapack_int ldaf, const lapack_int* ipiv,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_ssyrfsx_work( int matrix_order, char uplo, char equed,
+                                 lapack_int n, lapack_int nrhs, const float* a,
+                                 lapack_int lda, const float* af,
+                                 lapack_int ldaf, const lapack_int* ipiv,
+                                 const float* s, const float* b, lapack_int ldb,
+                                 float* x, lapack_int ldx, float* rcond,
+                                 float* berr, lapack_int n_err_bnds,
+                                 float* err_bnds_norm, float* err_bnds_comp,
+                                 lapack_int nparams, float* params, float* work,
+                                 lapack_int* iwork );
+lapack_int LAPACKE_dsyrfsx_work( int matrix_order, char uplo, char equed,
+                                 lapack_int n, lapack_int nrhs, const double* a,
+                                 lapack_int lda, const double* af,
+                                 lapack_int ldaf, const lapack_int* ipiv,
+                                 const double* s, const double* b,
+                                 lapack_int ldb, double* x, lapack_int ldx,
+                                 double* rcond, double* berr,
+                                 lapack_int n_err_bnds, double* err_bnds_norm,
+                                 double* err_bnds_comp, lapack_int nparams,
+                                 double* params, double* work,
+                                 lapack_int* iwork );
+lapack_int LAPACKE_csyrfsx_work( int matrix_order, char uplo, char equed,
+                                 lapack_int n, lapack_int nrhs,
+                                 const lapack_complex_float* a, lapack_int lda,
+                                 const lapack_complex_float* af,
+                                 lapack_int ldaf, const lapack_int* ipiv,
+                                 const float* s, const lapack_complex_float* b,
+                                 lapack_int ldb, lapack_complex_float* x,
+                                 lapack_int ldx, float* rcond, float* berr,
+                                 lapack_int n_err_bnds, float* err_bnds_norm,
+                                 float* err_bnds_comp, lapack_int nparams,
+                                 float* params, lapack_complex_float* work,
+                                 float* rwork );
+lapack_int LAPACKE_zsyrfsx_work( int matrix_order, char uplo, char equed,
+                                 lapack_int n, lapack_int nrhs,
+                                 const lapack_complex_double* a, lapack_int lda,
+                                 const lapack_complex_double* af,
+                                 lapack_int ldaf, const lapack_int* ipiv,
+                                 const double* s,
+                                 const lapack_complex_double* b, lapack_int ldb,
+                                 lapack_complex_double* x, lapack_int ldx,
+                                 double* rcond, double* berr,
+                                 lapack_int n_err_bnds, double* err_bnds_norm,
+                                 double* err_bnds_comp, lapack_int nparams,
+                                 double* params, lapack_complex_double* work,
+                                 double* rwork );
+
+lapack_int LAPACKE_ssysv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, float* a, lapack_int lda,
+                               lapack_int* ipiv, float* b, lapack_int ldb,
+                               float* work, lapack_int lwork );
+lapack_int LAPACKE_dsysv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, double* a, lapack_int lda,
+                               lapack_int* ipiv, double* b, lapack_int ldb,
+                               double* work, lapack_int lwork );
+lapack_int LAPACKE_csysv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, lapack_complex_float* a,
+                               lapack_int lda, lapack_int* ipiv,
+                               lapack_complex_float* b, lapack_int ldb,
+                               lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zsysv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, lapack_complex_double* a,
+                               lapack_int lda, lapack_int* ipiv,
+                               lapack_complex_double* b, lapack_int ldb,
+                               lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_ssysvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs, const float* a,
+                                lapack_int lda, float* af, lapack_int ldaf,
+                                lapack_int* ipiv, const float* b,
+                                lapack_int ldb, float* x, lapack_int ldx,
+                                float* rcond, float* ferr, float* berr,
+                                float* work, lapack_int lwork,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dsysvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs, const double* a,
+                                lapack_int lda, double* af, lapack_int ldaf,
+                                lapack_int* ipiv, const double* b,
+                                lapack_int ldb, double* x, lapack_int ldx,
+                                double* rcond, double* ferr, double* berr,
+                                double* work, lapack_int lwork,
+                                lapack_int* iwork );
+lapack_int LAPACKE_csysvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs,
+                                const lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* af, lapack_int ldaf,
+                                lapack_int* ipiv, const lapack_complex_float* b,
+                                lapack_int ldb, lapack_complex_float* x,
+                                lapack_int ldx, float* rcond, float* ferr,
+                                float* berr, lapack_complex_float* work,
+                                lapack_int lwork, float* rwork );
+lapack_int LAPACKE_zsysvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs,
+                                const lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* af, lapack_int ldaf,
+                                lapack_int* ipiv,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* rcond, double* ferr, double* berr,
+                                lapack_complex_double* work, lapack_int lwork,
+                                double* rwork );
+
+lapack_int LAPACKE_ssysvxx_work( int matrix_order, char fact, char uplo,
+                                 lapack_int n, lapack_int nrhs, float* a,
+                                 lapack_int lda, float* af, lapack_int ldaf,
+                                 lapack_int* ipiv, char* equed, float* s,
+                                 float* b, lapack_int ldb, float* x,
+                                 lapack_int ldx, float* rcond, float* rpvgrw,
+                                 float* berr, lapack_int n_err_bnds,
+                                 float* err_bnds_norm, float* err_bnds_comp,
+                                 lapack_int nparams, float* params, float* work,
+                                 lapack_int* iwork );
+lapack_int LAPACKE_dsysvxx_work( int matrix_order, char fact, char uplo,
+                                 lapack_int n, lapack_int nrhs, double* a,
+                                 lapack_int lda, double* af, lapack_int ldaf,
+                                 lapack_int* ipiv, char* equed, double* s,
+                                 double* b, lapack_int ldb, double* x,
+                                 lapack_int ldx, double* rcond, double* rpvgrw,
+                                 double* berr, lapack_int n_err_bnds,
+                                 double* err_bnds_norm, double* err_bnds_comp,
+                                 lapack_int nparams, double* params,
+                                 double* work, lapack_int* iwork );
+lapack_int LAPACKE_csysvxx_work( int matrix_order, char fact, char uplo,
+                                 lapack_int n, lapack_int nrhs,
+                                 lapack_complex_float* a, lapack_int lda,
+                                 lapack_complex_float* af, lapack_int ldaf,
+                                 lapack_int* ipiv, char* equed, float* s,
+                                 lapack_complex_float* b, lapack_int ldb,
+                                 lapack_complex_float* x, lapack_int ldx,
+                                 float* rcond, float* rpvgrw, float* berr,
+                                 lapack_int n_err_bnds, float* err_bnds_norm,
+                                 float* err_bnds_comp, lapack_int nparams,
+                                 float* params, lapack_complex_float* work,
+                                 float* rwork );
+lapack_int LAPACKE_zsysvxx_work( int matrix_order, char fact, char uplo,
+                                 lapack_int n, lapack_int nrhs,
+                                 lapack_complex_double* a, lapack_int lda,
+                                 lapack_complex_double* af, lapack_int ldaf,
+                                 lapack_int* ipiv, char* equed, double* s,
+                                 lapack_complex_double* b, lapack_int ldb,
+                                 lapack_complex_double* x, lapack_int ldx,
+                                 double* rcond, double* rpvgrw, double* berr,
+                                 lapack_int n_err_bnds, double* err_bnds_norm,
+                                 double* err_bnds_comp, lapack_int nparams,
+                                 double* params, lapack_complex_double* work,
+                                 double* rwork );
+
+lapack_int LAPACKE_ssytrd_work( int matrix_order, char uplo, lapack_int n,
+                                float* a, lapack_int lda, float* d, float* e,
+                                float* tau, float* work, lapack_int lwork );
+lapack_int LAPACKE_dsytrd_work( int matrix_order, char uplo, lapack_int n,
+                                double* a, lapack_int lda, double* d, double* e,
+                                double* tau, double* work, lapack_int lwork );
+
+lapack_int LAPACKE_ssytrf_work( int matrix_order, char uplo, lapack_int n,
+                                float* a, lapack_int lda, lapack_int* ipiv,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dsytrf_work( int matrix_order, char uplo, lapack_int n,
+                                double* a, lapack_int lda, lapack_int* ipiv,
+                                double* work, lapack_int lwork );
+lapack_int LAPACKE_csytrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_int* ipiv, lapack_complex_float* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_zsytrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_int* ipiv, lapack_complex_double* work,
+                                lapack_int lwork );
+
+lapack_int LAPACKE_ssytri_work( int matrix_order, char uplo, lapack_int n,
+                                float* a, lapack_int lda,
+                                const lapack_int* ipiv, float* work );
+lapack_int LAPACKE_dsytri_work( int matrix_order, char uplo, lapack_int n,
+                                double* a, lapack_int lda,
+                                const lapack_int* ipiv, double* work );
+lapack_int LAPACKE_csytri_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                const lapack_int* ipiv,
+                                lapack_complex_float* work );
+lapack_int LAPACKE_zsytri_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                const lapack_int* ipiv,
+                                lapack_complex_double* work );
+
+lapack_int LAPACKE_ssytrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const float* a, lapack_int lda,
+                                const lapack_int* ipiv, float* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_dsytrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const double* a,
+                                lapack_int lda, const lapack_int* ipiv,
+                                double* b, lapack_int ldb );
+lapack_int LAPACKE_csytrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_float* a,
+                                lapack_int lda, const lapack_int* ipiv,
+                                lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zsytrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_double* a,
+                                lapack_int lda, const lapack_int* ipiv,
+                                lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_stbcon_work( int matrix_order, char norm, char uplo,
+                                char diag, lapack_int n, lapack_int kd,
+                                const float* ab, lapack_int ldab, float* rcond,
+                                float* work, lapack_int* iwork );
+lapack_int LAPACKE_dtbcon_work( int matrix_order, char norm, char uplo,
+                                char diag, lapack_int n, lapack_int kd,
+                                const double* ab, lapack_int ldab,
+                                double* rcond, double* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_ctbcon_work( int matrix_order, char norm, char uplo,
+                                char diag, lapack_int n, lapack_int kd,
+                                const lapack_complex_float* ab, lapack_int ldab,
+                                float* rcond, lapack_complex_float* work,
+                                float* rwork );
+lapack_int LAPACKE_ztbcon_work( int matrix_order, char norm, char uplo,
+                                char diag, lapack_int n, lapack_int kd,
+                                const lapack_complex_double* ab,
+                                lapack_int ldab, double* rcond,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_stbrfs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int kd,
+                                lapack_int nrhs, const float* ab,
+                                lapack_int ldab, const float* b, lapack_int ldb,
+                                const float* x, lapack_int ldx, float* ferr,
+                                float* berr, float* work, lapack_int* iwork );
+lapack_int LAPACKE_dtbrfs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int kd,
+                                lapack_int nrhs, const double* ab,
+                                lapack_int ldab, const double* b,
+                                lapack_int ldb, const double* x, lapack_int ldx,
+                                double* ferr, double* berr, double* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_ctbrfs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int kd,
+                                lapack_int nrhs, const lapack_complex_float* ab,
+                                lapack_int ldab, const lapack_complex_float* b,
+                                lapack_int ldb, const lapack_complex_float* x,
+                                lapack_int ldx, float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_ztbrfs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int kd,
+                                lapack_int nrhs,
+                                const lapack_complex_double* ab,
+                                lapack_int ldab, const lapack_complex_double* b,
+                                lapack_int ldb, const lapack_complex_double* x,
+                                lapack_int ldx, double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_stbtrs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int kd,
+                                lapack_int nrhs, const float* ab,
+                                lapack_int ldab, float* b, lapack_int ldb );
+lapack_int LAPACKE_dtbtrs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int kd,
+                                lapack_int nrhs, const double* ab,
+                                lapack_int ldab, double* b, lapack_int ldb );
+lapack_int LAPACKE_ctbtrs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int kd,
+                                lapack_int nrhs, const lapack_complex_float* ab,
+                                lapack_int ldab, lapack_complex_float* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_ztbtrs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int kd,
+                                lapack_int nrhs,
+                                const lapack_complex_double* ab,
+                                lapack_int ldab, lapack_complex_double* b,
+                                lapack_int ldb );
+
+lapack_int LAPACKE_stfsm_work( int matrix_order, char transr, char side,
+                               char uplo, char trans, char diag, lapack_int m,
+                               lapack_int n, float alpha, const float* a,
+                               float* b, lapack_int ldb );
+lapack_int LAPACKE_dtfsm_work( int matrix_order, char transr, char side,
+                               char uplo, char trans, char diag, lapack_int m,
+                               lapack_int n, double alpha, const double* a,
+                               double* b, lapack_int ldb );
+lapack_int LAPACKE_ctfsm_work( int matrix_order, char transr, char side,
+                               char uplo, char trans, char diag, lapack_int m,
+                               lapack_int n, lapack_complex_float alpha,
+                               const lapack_complex_float* a,
+                               lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_ztfsm_work( int matrix_order, char transr, char side,
+                               char uplo, char trans, char diag, lapack_int m,
+                               lapack_int n, lapack_complex_double alpha,
+                               const lapack_complex_double* a,
+                               lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_stftri_work( int matrix_order, char transr, char uplo,
+                                char diag, lapack_int n, float* a );
+lapack_int LAPACKE_dtftri_work( int matrix_order, char transr, char uplo,
+                                char diag, lapack_int n, double* a );
+lapack_int LAPACKE_ctftri_work( int matrix_order, char transr, char uplo,
+                                char diag, lapack_int n,
+                                lapack_complex_float* a );
+lapack_int LAPACKE_ztftri_work( int matrix_order, char transr, char uplo,
+                                char diag, lapack_int n,
+                                lapack_complex_double* a );
+
+lapack_int LAPACKE_stfttp_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, const float* arf, float* ap );
+lapack_int LAPACKE_dtfttp_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, const double* arf, double* ap );
+lapack_int LAPACKE_ctfttp_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, const lapack_complex_float* arf,
+                                lapack_complex_float* ap );
+lapack_int LAPACKE_ztfttp_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, const lapack_complex_double* arf,
+                                lapack_complex_double* ap );
+
+lapack_int LAPACKE_stfttr_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, const float* arf, float* a,
+                                lapack_int lda );
+lapack_int LAPACKE_dtfttr_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, const double* arf, double* a,
+                                lapack_int lda );
+lapack_int LAPACKE_ctfttr_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, const lapack_complex_float* arf,
+                                lapack_complex_float* a, lapack_int lda );
+lapack_int LAPACKE_ztfttr_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, const lapack_complex_double* arf,
+                                lapack_complex_double* a, lapack_int lda );
+
+lapack_int LAPACKE_stgevc_work( int matrix_order, char side, char howmny,
+                                const lapack_logical* select, lapack_int n,
+                                const float* s, lapack_int lds, const float* p,
+                                lapack_int ldp, float* vl, lapack_int ldvl,
+                                float* vr, lapack_int ldvr, lapack_int mm,
+                                lapack_int* m, float* work );
+lapack_int LAPACKE_dtgevc_work( int matrix_order, char side, char howmny,
+                                const lapack_logical* select, lapack_int n,
+                                const double* s, lapack_int lds,
+                                const double* p, lapack_int ldp, double* vl,
+                                lapack_int ldvl, double* vr, lapack_int ldvr,
+                                lapack_int mm, lapack_int* m, double* work );
+lapack_int LAPACKE_ctgevc_work( int matrix_order, char side, char howmny,
+                                const lapack_logical* select, lapack_int n,
+                                const lapack_complex_float* s, lapack_int lds,
+                                const lapack_complex_float* p, lapack_int ldp,
+                                lapack_complex_float* vl, lapack_int ldvl,
+                                lapack_complex_float* vr, lapack_int ldvr,
+                                lapack_int mm, lapack_int* m,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_ztgevc_work( int matrix_order, char side, char howmny,
+                                const lapack_logical* select, lapack_int n,
+                                const lapack_complex_double* s, lapack_int lds,
+                                const lapack_complex_double* p, lapack_int ldp,
+                                lapack_complex_double* vl, lapack_int ldvl,
+                                lapack_complex_double* vr, lapack_int ldvr,
+                                lapack_int mm, lapack_int* m,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_stgexc_work( int matrix_order, lapack_logical wantq,
+                                lapack_logical wantz, lapack_int n, float* a,
+                                lapack_int lda, float* b, lapack_int ldb,
+                                float* q, lapack_int ldq, float* z,
+                                lapack_int ldz, lapack_int* ifst,
+                                lapack_int* ilst, float* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_dtgexc_work( int matrix_order, lapack_logical wantq,
+                                lapack_logical wantz, lapack_int n, double* a,
+                                lapack_int lda, double* b, lapack_int ldb,
+                                double* q, lapack_int ldq, double* z,
+                                lapack_int ldz, lapack_int* ifst,
+                                lapack_int* ilst, double* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_ctgexc_work( int matrix_order, lapack_logical wantq,
+                                lapack_logical wantz, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* q, lapack_int ldq,
+                                lapack_complex_float* z, lapack_int ldz,
+                                lapack_int ifst, lapack_int ilst );
+lapack_int LAPACKE_ztgexc_work( int matrix_order, lapack_logical wantq,
+                                lapack_logical wantz, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* q, lapack_int ldq,
+                                lapack_complex_double* z, lapack_int ldz,
+                                lapack_int ifst, lapack_int ilst );
+
+lapack_int LAPACKE_stgsen_work( int matrix_order, lapack_int ijob,
+                                lapack_logical wantq, lapack_logical wantz,
+                                const lapack_logical* select, lapack_int n,
+                                float* a, lapack_int lda, float* b,
+                                lapack_int ldb, float* alphar, float* alphai,
+                                float* beta, float* q, lapack_int ldq, float* z,
+                                lapack_int ldz, lapack_int* m, float* pl,
+                                float* pr, float* dif, float* work,
+                                lapack_int lwork, lapack_int* iwork,
+                                lapack_int liwork );
+lapack_int LAPACKE_dtgsen_work( int matrix_order, lapack_int ijob,
+                                lapack_logical wantq, lapack_logical wantz,
+                                const lapack_logical* select, lapack_int n,
+                                double* a, lapack_int lda, double* b,
+                                lapack_int ldb, double* alphar, double* alphai,
+                                double* beta, double* q, lapack_int ldq,
+                                double* z, lapack_int ldz, lapack_int* m,
+                                double* pl, double* pr, double* dif,
+                                double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_ctgsen_work( int matrix_order, lapack_int ijob,
+                                lapack_logical wantq, lapack_logical wantz,
+                                const lapack_logical* select, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* alpha,
+                                lapack_complex_float* beta,
+                                lapack_complex_float* q, lapack_int ldq,
+                                lapack_complex_float* z, lapack_int ldz,
+                                lapack_int* m, float* pl, float* pr, float* dif,
+                                lapack_complex_float* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_ztgsen_work( int matrix_order, lapack_int ijob,
+                                lapack_logical wantq, lapack_logical wantz,
+                                const lapack_logical* select, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* alpha,
+                                lapack_complex_double* beta,
+                                lapack_complex_double* q, lapack_int ldq,
+                                lapack_complex_double* z, lapack_int ldz,
+                                lapack_int* m, double* pl, double* pr,
+                                double* dif, lapack_complex_double* work,
+                                lapack_int lwork, lapack_int* iwork,
+                                lapack_int liwork );
+
+lapack_int LAPACKE_stgsja_work( int matrix_order, char jobu, char jobv,
+                                char jobq, lapack_int m, lapack_int p,
+                                lapack_int n, lapack_int k, lapack_int l,
+                                float* a, lapack_int lda, float* b,
+                                lapack_int ldb, float tola, float tolb,
+                                float* alpha, float* beta, float* u,
+                                lapack_int ldu, float* v, lapack_int ldv,
+                                float* q, lapack_int ldq, float* work,
+                                lapack_int* ncycle );
+lapack_int LAPACKE_dtgsja_work( int matrix_order, char jobu, char jobv,
+                                char jobq, lapack_int m, lapack_int p,
+                                lapack_int n, lapack_int k, lapack_int l,
+                                double* a, lapack_int lda, double* b,
+                                lapack_int ldb, double tola, double tolb,
+                                double* alpha, double* beta, double* u,
+                                lapack_int ldu, double* v, lapack_int ldv,
+                                double* q, lapack_int ldq, double* work,
+                                lapack_int* ncycle );
+lapack_int LAPACKE_ctgsja_work( int matrix_order, char jobu, char jobv,
+                                char jobq, lapack_int m, lapack_int p,
+                                lapack_int n, lapack_int k, lapack_int l,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* b, lapack_int ldb,
+                                float tola, float tolb, float* alpha,
+                                float* beta, lapack_complex_float* u,
+                                lapack_int ldu, lapack_complex_float* v,
+                                lapack_int ldv, lapack_complex_float* q,
+                                lapack_int ldq, lapack_complex_float* work,
+                                lapack_int* ncycle );
+lapack_int LAPACKE_ztgsja_work( int matrix_order, char jobu, char jobv,
+                                char jobq, lapack_int m, lapack_int p,
+                                lapack_int n, lapack_int k, lapack_int l,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* b, lapack_int ldb,
+                                double tola, double tolb, double* alpha,
+                                double* beta, lapack_complex_double* u,
+                                lapack_int ldu, lapack_complex_double* v,
+                                lapack_int ldv, lapack_complex_double* q,
+                                lapack_int ldq, lapack_complex_double* work,
+                                lapack_int* ncycle );
+
+lapack_int LAPACKE_stgsna_work( int matrix_order, char job, char howmny,
+                                const lapack_logical* select, lapack_int n,
+                                const float* a, lapack_int lda, const float* b,
+                                lapack_int ldb, const float* vl,
+                                lapack_int ldvl, const float* vr,
+                                lapack_int ldvr, float* s, float* dif,
+                                lapack_int mm, lapack_int* m, float* work,
+                                lapack_int lwork, lapack_int* iwork );
+lapack_int LAPACKE_dtgsna_work( int matrix_order, char job, char howmny,
+                                const lapack_logical* select, lapack_int n,
+                                const double* a, lapack_int lda,
+                                const double* b, lapack_int ldb,
+                                const double* vl, lapack_int ldvl,
+                                const double* vr, lapack_int ldvr, double* s,
+                                double* dif, lapack_int mm, lapack_int* m,
+                                double* work, lapack_int lwork,
+                                lapack_int* iwork );
+lapack_int LAPACKE_ctgsna_work( int matrix_order, char job, char howmny,
+                                const lapack_logical* select, lapack_int n,
+                                const lapack_complex_float* a, lapack_int lda,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                const lapack_complex_float* vl, lapack_int ldvl,
+                                const lapack_complex_float* vr, lapack_int ldvr,
+                                float* s, float* dif, lapack_int mm,
+                                lapack_int* m, lapack_complex_float* work,
+                                lapack_int lwork, lapack_int* iwork );
+lapack_int LAPACKE_ztgsna_work( int matrix_order, char job, char howmny,
+                                const lapack_logical* select, lapack_int n,
+                                const lapack_complex_double* a, lapack_int lda,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                const lapack_complex_double* vl,
+                                lapack_int ldvl,
+                                const lapack_complex_double* vr,
+                                lapack_int ldvr, double* s, double* dif,
+                                lapack_int mm, lapack_int* m,
+                                lapack_complex_double* work, lapack_int lwork,
+                                lapack_int* iwork );
+
+lapack_int LAPACKE_stgsyl_work( int matrix_order, char trans, lapack_int ijob,
+                                lapack_int m, lapack_int n, const float* a,
+                                lapack_int lda, const float* b, lapack_int ldb,
+                                float* c, lapack_int ldc, const float* d,
+                                lapack_int ldd, const float* e, lapack_int lde,
+                                float* f, lapack_int ldf, float* scale,
+                                float* dif, float* work, lapack_int lwork,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dtgsyl_work( int matrix_order, char trans, lapack_int ijob,
+                                lapack_int m, lapack_int n, const double* a,
+                                lapack_int lda, const double* b, lapack_int ldb,
+                                double* c, lapack_int ldc, const double* d,
+                                lapack_int ldd, const double* e, lapack_int lde,
+                                double* f, lapack_int ldf, double* scale,
+                                double* dif, double* work, lapack_int lwork,
+                                lapack_int* iwork );
+lapack_int LAPACKE_ctgsyl_work( int matrix_order, char trans, lapack_int ijob,
+                                lapack_int m, lapack_int n,
+                                const lapack_complex_float* a, lapack_int lda,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* c, lapack_int ldc,
+                                const lapack_complex_float* d, lapack_int ldd,
+                                const lapack_complex_float* e, lapack_int lde,
+                                lapack_complex_float* f, lapack_int ldf,
+                                float* scale, float* dif,
+                                lapack_complex_float* work, lapack_int lwork,
+                                lapack_int* iwork );
+lapack_int LAPACKE_ztgsyl_work( int matrix_order, char trans, lapack_int ijob,
+                                lapack_int m, lapack_int n,
+                                const lapack_complex_double* a, lapack_int lda,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* c, lapack_int ldc,
+                                const lapack_complex_double* d, lapack_int ldd,
+                                const lapack_complex_double* e, lapack_int lde,
+                                lapack_complex_double* f, lapack_int ldf,
+                                double* scale, double* dif,
+                                lapack_complex_double* work, lapack_int lwork,
+                                lapack_int* iwork );
+
+lapack_int LAPACKE_stpcon_work( int matrix_order, char norm, char uplo,
+                                char diag, lapack_int n, const float* ap,
+                                float* rcond, float* work, lapack_int* iwork );
+lapack_int LAPACKE_dtpcon_work( int matrix_order, char norm, char uplo,
+                                char diag, lapack_int n, const double* ap,
+                                double* rcond, double* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_ctpcon_work( int matrix_order, char norm, char uplo,
+                                char diag, lapack_int n,
+                                const lapack_complex_float* ap, float* rcond,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_ztpcon_work( int matrix_order, char norm, char uplo,
+                                char diag, lapack_int n,
+                                const lapack_complex_double* ap, double* rcond,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_stprfs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int nrhs,
+                                const float* ap, const float* b, lapack_int ldb,
+                                const float* x, lapack_int ldx, float* ferr,
+                                float* berr, float* work, lapack_int* iwork );
+lapack_int LAPACKE_dtprfs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int nrhs,
+                                const double* ap, const double* b,
+                                lapack_int ldb, const double* x, lapack_int ldx,
+                                double* ferr, double* berr, double* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_ctprfs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int nrhs,
+                                const lapack_complex_float* ap,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                const lapack_complex_float* x, lapack_int ldx,
+                                float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_ztprfs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int nrhs,
+                                const lapack_complex_double* ap,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                const lapack_complex_double* x, lapack_int ldx,
+                                double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_stptri_work( int matrix_order, char uplo, char diag,
+                                lapack_int n, float* ap );
+lapack_int LAPACKE_dtptri_work( int matrix_order, char uplo, char diag,
+                                lapack_int n, double* ap );
+lapack_int LAPACKE_ctptri_work( int matrix_order, char uplo, char diag,
+                                lapack_int n, lapack_complex_float* ap );
+lapack_int LAPACKE_ztptri_work( int matrix_order, char uplo, char diag,
+                                lapack_int n, lapack_complex_double* ap );
+
+lapack_int LAPACKE_stptrs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int nrhs,
+                                const float* ap, float* b, lapack_int ldb );
+lapack_int LAPACKE_dtptrs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int nrhs,
+                                const double* ap, double* b, lapack_int ldb );
+lapack_int LAPACKE_ctptrs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int nrhs,
+                                const lapack_complex_float* ap,
+                                lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_ztptrs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int nrhs,
+                                const lapack_complex_double* ap,
+                                lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_stpttf_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, const float* ap, float* arf );
+lapack_int LAPACKE_dtpttf_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, const double* ap, double* arf );
+lapack_int LAPACKE_ctpttf_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, const lapack_complex_float* ap,
+                                lapack_complex_float* arf );
+lapack_int LAPACKE_ztpttf_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, const lapack_complex_double* ap,
+                                lapack_complex_double* arf );
+
+lapack_int LAPACKE_stpttr_work( int matrix_order, char uplo, lapack_int n,
+                                const float* ap, float* a, lapack_int lda );
+lapack_int LAPACKE_dtpttr_work( int matrix_order, char uplo, lapack_int n,
+                                const double* ap, double* a, lapack_int lda );
+lapack_int LAPACKE_ctpttr_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_float* ap,
+                                lapack_complex_float* a, lapack_int lda );
+lapack_int LAPACKE_ztpttr_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_double* ap,
+                                lapack_complex_double* a, lapack_int lda );
+
+lapack_int LAPACKE_strcon_work( int matrix_order, char norm, char uplo,
+                                char diag, lapack_int n, const float* a,
+                                lapack_int lda, float* rcond, float* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dtrcon_work( int matrix_order, char norm, char uplo,
+                                char diag, lapack_int n, const double* a,
+                                lapack_int lda, double* rcond, double* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_ctrcon_work( int matrix_order, char norm, char uplo,
+                                char diag, lapack_int n,
+                                const lapack_complex_float* a, lapack_int lda,
+                                float* rcond, lapack_complex_float* work,
+                                float* rwork );
+lapack_int LAPACKE_ztrcon_work( int matrix_order, char norm, char uplo,
+                                char diag, lapack_int n,
+                                const lapack_complex_double* a, lapack_int lda,
+                                double* rcond, lapack_complex_double* work,
+                                double* rwork );
+
+lapack_int LAPACKE_strevc_work( int matrix_order, char side, char howmny,
+                                lapack_logical* select, lapack_int n,
+                                const float* t, lapack_int ldt, float* vl,
+                                lapack_int ldvl, float* vr, lapack_int ldvr,
+                                lapack_int mm, lapack_int* m, float* work );
+lapack_int LAPACKE_dtrevc_work( int matrix_order, char side, char howmny,
+                                lapack_logical* select, lapack_int n,
+                                const double* t, lapack_int ldt, double* vl,
+                                lapack_int ldvl, double* vr, lapack_int ldvr,
+                                lapack_int mm, lapack_int* m, double* work );
+lapack_int LAPACKE_ctrevc_work( int matrix_order, char side, char howmny,
+                                const lapack_logical* select, lapack_int n,
+                                lapack_complex_float* t, lapack_int ldt,
+                                lapack_complex_float* vl, lapack_int ldvl,
+                                lapack_complex_float* vr, lapack_int ldvr,
+                                lapack_int mm, lapack_int* m,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_ztrevc_work( int matrix_order, char side, char howmny,
+                                const lapack_logical* select, lapack_int n,
+                                lapack_complex_double* t, lapack_int ldt,
+                                lapack_complex_double* vl, lapack_int ldvl,
+                                lapack_complex_double* vr, lapack_int ldvr,
+                                lapack_int mm, lapack_int* m,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_strexc_work( int matrix_order, char compq, lapack_int n,
+                                float* t, lapack_int ldt, float* q,
+                                lapack_int ldq, lapack_int* ifst,
+                                lapack_int* ilst, float* work );
+lapack_int LAPACKE_dtrexc_work( int matrix_order, char compq, lapack_int n,
+                                double* t, lapack_int ldt, double* q,
+                                lapack_int ldq, lapack_int* ifst,
+                                lapack_int* ilst, double* work );
+lapack_int LAPACKE_ctrexc_work( int matrix_order, char compq, lapack_int n,
+                                lapack_complex_float* t, lapack_int ldt,
+                                lapack_complex_float* q, lapack_int ldq,
+                                lapack_int ifst, lapack_int ilst );
+lapack_int LAPACKE_ztrexc_work( int matrix_order, char compq, lapack_int n,
+                                lapack_complex_double* t, lapack_int ldt,
+                                lapack_complex_double* q, lapack_int ldq,
+                                lapack_int ifst, lapack_int ilst );
+
+lapack_int LAPACKE_strrfs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int nrhs,
+                                const float* a, lapack_int lda, const float* b,
+                                lapack_int ldb, const float* x, lapack_int ldx,
+                                float* ferr, float* berr, float* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dtrrfs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int nrhs,
+                                const double* a, lapack_int lda,
+                                const double* b, lapack_int ldb,
+                                const double* x, lapack_int ldx, double* ferr,
+                                double* berr, double* work, lapack_int* iwork );
+lapack_int LAPACKE_ctrrfs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int nrhs,
+                                const lapack_complex_float* a, lapack_int lda,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                const lapack_complex_float* x, lapack_int ldx,
+                                float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_ztrrfs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int nrhs,
+                                const lapack_complex_double* a, lapack_int lda,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                const lapack_complex_double* x, lapack_int ldx,
+                                double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_strsen_work( int matrix_order, char job, char compq,
+                                const lapack_logical* select, lapack_int n,
+                                float* t, lapack_int ldt, float* q,
+                                lapack_int ldq, float* wr, float* wi,
+                                lapack_int* m, float* s, float* sep,
+                                float* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_dtrsen_work( int matrix_order, char job, char compq,
+                                const lapack_logical* select, lapack_int n,
+                                double* t, lapack_int ldt, double* q,
+                                lapack_int ldq, double* wr, double* wi,
+                                lapack_int* m, double* s, double* sep,
+                                double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_ctrsen_work( int matrix_order, char job, char compq,
+                                const lapack_logical* select, lapack_int n,
+                                lapack_complex_float* t, lapack_int ldt,
+                                lapack_complex_float* q, lapack_int ldq,
+                                lapack_complex_float* w, lapack_int* m,
+                                float* s, float* sep,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_ztrsen_work( int matrix_order, char job, char compq,
+                                const lapack_logical* select, lapack_int n,
+                                lapack_complex_double* t, lapack_int ldt,
+                                lapack_complex_double* q, lapack_int ldq,
+                                lapack_complex_double* w, lapack_int* m,
+                                double* s, double* sep,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_strsna_work( int matrix_order, char job, char howmny,
+                                const lapack_logical* select, lapack_int n,
+                                const float* t, lapack_int ldt, const float* vl,
+                                lapack_int ldvl, const float* vr,
+                                lapack_int ldvr, float* s, float* sep,
+                                lapack_int mm, lapack_int* m, float* work,
+                                lapack_int ldwork, lapack_int* iwork );
+lapack_int LAPACKE_dtrsna_work( int matrix_order, char job, char howmny,
+                                const lapack_logical* select, lapack_int n,
+                                const double* t, lapack_int ldt,
+                                const double* vl, lapack_int ldvl,
+                                const double* vr, lapack_int ldvr, double* s,
+                                double* sep, lapack_int mm, lapack_int* m,
+                                double* work, lapack_int ldwork,
+                                lapack_int* iwork );
+lapack_int LAPACKE_ctrsna_work( int matrix_order, char job, char howmny,
+                                const lapack_logical* select, lapack_int n,
+                                const lapack_complex_float* t, lapack_int ldt,
+                                const lapack_complex_float* vl, lapack_int ldvl,
+                                const lapack_complex_float* vr, lapack_int ldvr,
+                                float* s, float* sep, lapack_int mm,
+                                lapack_int* m, lapack_complex_float* work,
+                                lapack_int ldwork, float* rwork );
+lapack_int LAPACKE_ztrsna_work( int matrix_order, char job, char howmny,
+                                const lapack_logical* select, lapack_int n,
+                                const lapack_complex_double* t, lapack_int ldt,
+                                const lapack_complex_double* vl,
+                                lapack_int ldvl,
+                                const lapack_complex_double* vr,
+                                lapack_int ldvr, double* s, double* sep,
+                                lapack_int mm, lapack_int* m,
+                                lapack_complex_double* work, lapack_int ldwork,
+                                double* rwork );
+
+lapack_int LAPACKE_strsyl_work( int matrix_order, char trana, char tranb,
+                                lapack_int isgn, lapack_int m, lapack_int n,
+                                const float* a, lapack_int lda, const float* b,
+                                lapack_int ldb, float* c, lapack_int ldc,
+                                float* scale );
+lapack_int LAPACKE_dtrsyl_work( int matrix_order, char trana, char tranb,
+                                lapack_int isgn, lapack_int m, lapack_int n,
+                                const double* a, lapack_int lda,
+                                const double* b, lapack_int ldb, double* c,
+                                lapack_int ldc, double* scale );
+lapack_int LAPACKE_ctrsyl_work( int matrix_order, char trana, char tranb,
+                                lapack_int isgn, lapack_int m, lapack_int n,
+                                const lapack_complex_float* a, lapack_int lda,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* c, lapack_int ldc,
+                                float* scale );
+lapack_int LAPACKE_ztrsyl_work( int matrix_order, char trana, char tranb,
+                                lapack_int isgn, lapack_int m, lapack_int n,
+                                const lapack_complex_double* a, lapack_int lda,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* c, lapack_int ldc,
+                                double* scale );
+
+lapack_int LAPACKE_strtri_work( int matrix_order, char uplo, char diag,
+                                lapack_int n, float* a, lapack_int lda );
+lapack_int LAPACKE_dtrtri_work( int matrix_order, char uplo, char diag,
+                                lapack_int n, double* a, lapack_int lda );
+lapack_int LAPACKE_ctrtri_work( int matrix_order, char uplo, char diag,
+                                lapack_int n, lapack_complex_float* a,
+                                lapack_int lda );
+lapack_int LAPACKE_ztrtri_work( int matrix_order, char uplo, char diag,
+                                lapack_int n, lapack_complex_double* a,
+                                lapack_int lda );
+
+lapack_int LAPACKE_strtrs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int nrhs,
+                                const float* a, lapack_int lda, float* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_dtrtrs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int nrhs,
+                                const double* a, lapack_int lda, double* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_ctrtrs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int nrhs,
+                                const lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_ztrtrs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int nrhs,
+                                const lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_strttf_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, const float* a, lapack_int lda,
+                                float* arf );
+lapack_int LAPACKE_dtrttf_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, const double* a, lapack_int lda,
+                                double* arf );
+lapack_int LAPACKE_ctrttf_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, const lapack_complex_float* a,
+                                lapack_int lda, lapack_complex_float* arf );
+lapack_int LAPACKE_ztrttf_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, const lapack_complex_double* a,
+                                lapack_int lda, lapack_complex_double* arf );
+
+lapack_int LAPACKE_strttp_work( int matrix_order, char uplo, lapack_int n,
+                                const float* a, lapack_int lda, float* ap );
+lapack_int LAPACKE_dtrttp_work( int matrix_order, char uplo, lapack_int n,
+                                const double* a, lapack_int lda, double* ap );
+lapack_int LAPACKE_ctrttp_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* ap );
+lapack_int LAPACKE_ztrttp_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* ap );
+
+lapack_int LAPACKE_stzrzf_work( int matrix_order, lapack_int m, lapack_int n,
+                                float* a, lapack_int lda, float* tau,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dtzrzf_work( int matrix_order, lapack_int m, lapack_int n,
+                                double* a, lapack_int lda, double* tau,
+                                double* work, lapack_int lwork );
+lapack_int LAPACKE_ctzrzf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* tau,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_ztzrzf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* tau,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_cungbr_work( int matrix_order, char vect, lapack_int m,
+                                lapack_int n, lapack_int k,
+                                lapack_complex_float* a, lapack_int lda,
+                                const lapack_complex_float* tau,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zungbr_work( int matrix_order, char vect, lapack_int m,
+                                lapack_int n, lapack_int k,
+                                lapack_complex_double* a, lapack_int lda,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_cunghr_work( int matrix_order, lapack_int n, lapack_int ilo,
+                                lapack_int ihi, lapack_complex_float* a,
+                                lapack_int lda, const lapack_complex_float* tau,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zunghr_work( int matrix_order, lapack_int n, lapack_int ilo,
+                                lapack_int ihi, lapack_complex_double* a,
+                                lapack_int lda,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_cunglq_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int k, lapack_complex_float* a,
+                                lapack_int lda, const lapack_complex_float* tau,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zunglq_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int k, lapack_complex_double* a,
+                                lapack_int lda,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_cungql_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int k, lapack_complex_float* a,
+                                lapack_int lda, const lapack_complex_float* tau,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zungql_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int k, lapack_complex_double* a,
+                                lapack_int lda,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_cungqr_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int k, lapack_complex_float* a,
+                                lapack_int lda, const lapack_complex_float* tau,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zungqr_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int k, lapack_complex_double* a,
+                                lapack_int lda,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_cungrq_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int k, lapack_complex_float* a,
+                                lapack_int lda, const lapack_complex_float* tau,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zungrq_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int k, lapack_complex_double* a,
+                                lapack_int lda,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_cungtr_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                const lapack_complex_float* tau,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zungtr_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_cunmbr_work( int matrix_order, char vect, char side,
+                                char trans, lapack_int m, lapack_int n,
+                                lapack_int k, const lapack_complex_float* a,
+                                lapack_int lda, const lapack_complex_float* tau,
+                                lapack_complex_float* c, lapack_int ldc,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zunmbr_work( int matrix_order, char vect, char side,
+                                char trans, lapack_int m, lapack_int n,
+                                lapack_int k, const lapack_complex_double* a,
+                                lapack_int lda,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* c, lapack_int ldc,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_cunmhr_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int ilo,
+                                lapack_int ihi, const lapack_complex_float* a,
+                                lapack_int lda, const lapack_complex_float* tau,
+                                lapack_complex_float* c, lapack_int ldc,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zunmhr_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int ilo,
+                                lapack_int ihi, const lapack_complex_double* a,
+                                lapack_int lda,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* c, lapack_int ldc,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_cunmlq_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                const lapack_complex_float* a, lapack_int lda,
+                                const lapack_complex_float* tau,
+                                lapack_complex_float* c, lapack_int ldc,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zunmlq_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                const lapack_complex_double* a, lapack_int lda,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* c, lapack_int ldc,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_cunmql_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                const lapack_complex_float* a, lapack_int lda,
+                                const lapack_complex_float* tau,
+                                lapack_complex_float* c, lapack_int ldc,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zunmql_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                const lapack_complex_double* a, lapack_int lda,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* c, lapack_int ldc,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_cunmqr_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                const lapack_complex_float* a, lapack_int lda,
+                                const lapack_complex_float* tau,
+                                lapack_complex_float* c, lapack_int ldc,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zunmqr_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                const lapack_complex_double* a, lapack_int lda,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* c, lapack_int ldc,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_cunmrq_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                const lapack_complex_float* a, lapack_int lda,
+                                const lapack_complex_float* tau,
+                                lapack_complex_float* c, lapack_int ldc,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zunmrq_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                const lapack_complex_double* a, lapack_int lda,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* c, lapack_int ldc,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_cunmrz_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                lapack_int l, const lapack_complex_float* a,
+                                lapack_int lda, const lapack_complex_float* tau,
+                                lapack_complex_float* c, lapack_int ldc,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zunmrz_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                lapack_int l, const lapack_complex_double* a,
+                                lapack_int lda,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* c, lapack_int ldc,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_cunmtr_work( int matrix_order, char side, char uplo,
+                                char trans, lapack_int m, lapack_int n,
+                                const lapack_complex_float* a, lapack_int lda,
+                                const lapack_complex_float* tau,
+                                lapack_complex_float* c, lapack_int ldc,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zunmtr_work( int matrix_order, char side, char uplo,
+                                char trans, lapack_int m, lapack_int n,
+                                const lapack_complex_double* a, lapack_int lda,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* c, lapack_int ldc,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_cupgtr_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_float* ap,
+                                const lapack_complex_float* tau,
+                                lapack_complex_float* q, lapack_int ldq,
+                                lapack_complex_float* work );
+lapack_int LAPACKE_zupgtr_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_double* ap,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* q, lapack_int ldq,
+                                lapack_complex_double* work );
+
+lapack_int LAPACKE_cupmtr_work( int matrix_order, char side, char uplo,
+                                char trans, lapack_int m, lapack_int n,
+                                const lapack_complex_float* ap,
+                                const lapack_complex_float* tau,
+                                lapack_complex_float* c, lapack_int ldc,
+                                lapack_complex_float* work );
+lapack_int LAPACKE_zupmtr_work( int matrix_order, char side, char uplo,
+                                char trans, lapack_int m, lapack_int n,
+                                const lapack_complex_double* ap,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* c, lapack_int ldc,
+                                lapack_complex_double* work );
+
+lapack_int LAPACKE_claghe( int matrix_order, lapack_int n, lapack_int k,
+                           const float* d, lapack_complex_float* a,
+                           lapack_int lda, lapack_int* iseed );
+lapack_int LAPACKE_zlaghe( int matrix_order, lapack_int n, lapack_int k,
+                           const double* d, lapack_complex_double* a,
+                           lapack_int lda, lapack_int* iseed );
+
+lapack_int LAPACKE_slagsy( int matrix_order, lapack_int n, lapack_int k,
+                           const float* d, float* a, lapack_int lda,
+                           lapack_int* iseed );
+lapack_int LAPACKE_dlagsy( int matrix_order, lapack_int n, lapack_int k,
+                           const double* d, double* a, lapack_int lda,
+                           lapack_int* iseed );
+lapack_int LAPACKE_clagsy( int matrix_order, lapack_int n, lapack_int k,
+                           const float* d, lapack_complex_float* a,
+                           lapack_int lda, lapack_int* iseed );
+lapack_int LAPACKE_zlagsy( int matrix_order, lapack_int n, lapack_int k,
+                           const double* d, lapack_complex_double* a,
+                           lapack_int lda, lapack_int* iseed );
+
+lapack_int LAPACKE_slapmr( int matrix_order, lapack_logical forwrd,
+                           lapack_int m, lapack_int n, float* x, lapack_int ldx,
+                           lapack_int* k );
+lapack_int LAPACKE_dlapmr( int matrix_order, lapack_logical forwrd,
+                           lapack_int m, lapack_int n, double* x,
+                           lapack_int ldx, lapack_int* k );
+lapack_int LAPACKE_clapmr( int matrix_order, lapack_logical forwrd,
+                           lapack_int m, lapack_int n, lapack_complex_float* x,
+                           lapack_int ldx, lapack_int* k );
+lapack_int LAPACKE_zlapmr( int matrix_order, lapack_logical forwrd,
+                           lapack_int m, lapack_int n, lapack_complex_double* x,
+                           lapack_int ldx, lapack_int* k );
+
+
+float LAPACKE_slapy2( float x, float y );
+double LAPACKE_dlapy2( double x, double y );
+
+float LAPACKE_slapy3( float x, float y, float z );
+double LAPACKE_dlapy3( double x, double y, double z );
+
+lapack_int LAPACKE_slartgp( float f, float g, float* cs, float* sn, float* r );
+lapack_int LAPACKE_dlartgp( double f, double g, double* cs, double* sn,
+                            double* r );
+
+lapack_int LAPACKE_slartgs( float x, float y, float sigma, float* cs,
+                            float* sn );
+lapack_int LAPACKE_dlartgs( double x, double y, double sigma, double* cs,
+                            double* sn );
+
+
+//LAPACK 3.3.0
+lapack_int LAPACKE_cbbcsd( int matrix_order, char jobu1, char jobu2,
+                           char jobv1t, char jobv2t, char trans, lapack_int m,
+                           lapack_int p, lapack_int q, float* theta, float* phi,
+                           lapack_complex_float* u1, lapack_int ldu1,
+                           lapack_complex_float* u2, lapack_int ldu2,
+                           lapack_complex_float* v1t, lapack_int ldv1t,
+                           lapack_complex_float* v2t, lapack_int ldv2t,
+                           float* b11d, float* b11e, float* b12d, float* b12e,
+                           float* b21d, float* b21e, float* b22d, float* b22e );
+lapack_int LAPACKE_cbbcsd_work( int matrix_order, char jobu1, char jobu2,
+                                char jobv1t, char jobv2t, char trans,
+                                lapack_int m, lapack_int p, lapack_int q,
+                                float* theta, float* phi,
+                                lapack_complex_float* u1, lapack_int ldu1,
+                                lapack_complex_float* u2, lapack_int ldu2,
+                                lapack_complex_float* v1t, lapack_int ldv1t,
+                                lapack_complex_float* v2t, lapack_int ldv2t,
+                                float* b11d, float* b11e, float* b12d,
+                                float* b12e, float* b21d, float* b21e,
+                                float* b22d, float* b22e, float* rwork,
+                                lapack_int lrwork );
+lapack_int LAPACKE_cheswapr( int matrix_order, char uplo, lapack_int n,
+                             lapack_complex_float* a, lapack_int i1,
+                             lapack_int i2 );
+lapack_int LAPACKE_cheswapr_work( int matrix_order, char uplo, lapack_int n,
+                                  lapack_complex_float* a, lapack_int i1,
+                                  lapack_int i2 );
+lapack_int LAPACKE_chetri2( int matrix_order, char uplo, lapack_int n,
+                            lapack_complex_float* a, lapack_int lda,
+                            const lapack_int* ipiv );
+lapack_int LAPACKE_chetri2_work( int matrix_order, char uplo, lapack_int n,
+                                 lapack_complex_float* a, lapack_int lda,
+                                 const lapack_int* ipiv,
+                                 lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_chetri2x( int matrix_order, char uplo, lapack_int n,
+                             lapack_complex_float* a, lapack_int lda,
+                             const lapack_int* ipiv, lapack_int nb );
+lapack_int LAPACKE_chetri2x_work( int matrix_order, char uplo, lapack_int n,
+                                  lapack_complex_float* a, lapack_int lda,
+                                  const lapack_int* ipiv,
+                                  lapack_complex_float* work, lapack_int nb );
+lapack_int LAPACKE_chetrs2( int matrix_order, char uplo, lapack_int n,
+                            lapack_int nrhs, const lapack_complex_float* a,
+                            lapack_int lda, const lapack_int* ipiv,
+                            lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_chetrs2_work( int matrix_order, char uplo, lapack_int n,
+                                 lapack_int nrhs, const lapack_complex_float* a,
+                                 lapack_int lda, const lapack_int* ipiv,
+                                 lapack_complex_float* b, lapack_int ldb,
+                                 lapack_complex_float* work );
+lapack_int LAPACKE_csyconv( int matrix_order, char uplo, char way, lapack_int n,
+                            lapack_complex_float* a, lapack_int lda,
+                            const lapack_int* ipiv );
+lapack_int LAPACKE_csyconv_work( int matrix_order, char uplo, char way,
+                                 lapack_int n, lapack_complex_float* a,
+                                 lapack_int lda, const lapack_int* ipiv,
+                                 lapack_complex_float* work );
+lapack_int LAPACKE_csyswapr( int matrix_order, char uplo, lapack_int n,
+                             lapack_complex_float* a, lapack_int i1,
+                             lapack_int i2 );
+lapack_int LAPACKE_csyswapr_work( int matrix_order, char uplo, lapack_int n,
+                                  lapack_complex_float* a, lapack_int i1,
+                                  lapack_int i2 );
+lapack_int LAPACKE_csytri2( int matrix_order, char uplo, lapack_int n,
+                            lapack_complex_float* a, lapack_int lda,
+                            const lapack_int* ipiv );
+lapack_int LAPACKE_csytri2_work( int matrix_order, char uplo, lapack_int n,
+                                 lapack_complex_float* a, lapack_int lda,
+                                 const lapack_int* ipiv,
+                                 lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_csytri2x( int matrix_order, char uplo, lapack_int n,
+                             lapack_complex_float* a, lapack_int lda,
+                             const lapack_int* ipiv, lapack_int nb );
+lapack_int LAPACKE_csytri2x_work( int matrix_order, char uplo, lapack_int n,
+                                  lapack_complex_float* a, lapack_int lda,
+                                  const lapack_int* ipiv,
+                                  lapack_complex_float* work, lapack_int nb );
+lapack_int LAPACKE_csytrs2( int matrix_order, char uplo, lapack_int n,
+                            lapack_int nrhs, const lapack_complex_float* a,
+                            lapack_int lda, const lapack_int* ipiv,
+                            lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_csytrs2_work( int matrix_order, char uplo, lapack_int n,
+                                 lapack_int nrhs, const lapack_complex_float* a,
+                                 lapack_int lda, const lapack_int* ipiv,
+                                 lapack_complex_float* b, lapack_int ldb,
+                                 lapack_complex_float* work );
+lapack_int LAPACKE_cunbdb( int matrix_order, char trans, char signs,
+                           lapack_int m, lapack_int p, lapack_int q,
+                           lapack_complex_float* x11, lapack_int ldx11,
+                           lapack_complex_float* x12, lapack_int ldx12,
+                           lapack_complex_float* x21, lapack_int ldx21,
+                           lapack_complex_float* x22, lapack_int ldx22,
+                           float* theta, float* phi,
+                           lapack_complex_float* taup1,
+                           lapack_complex_float* taup2,
+                           lapack_complex_float* tauq1,
+                           lapack_complex_float* tauq2 );
+lapack_int LAPACKE_cunbdb_work( int matrix_order, char trans, char signs,
+                                lapack_int m, lapack_int p, lapack_int q,
+                                lapack_complex_float* x11, lapack_int ldx11,
+                                lapack_complex_float* x12, lapack_int ldx12,
+                                lapack_complex_float* x21, lapack_int ldx21,
+                                lapack_complex_float* x22, lapack_int ldx22,
+                                float* theta, float* phi,
+                                lapack_complex_float* taup1,
+                                lapack_complex_float* taup2,
+                                lapack_complex_float* tauq1,
+                                lapack_complex_float* tauq2,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_cuncsd( int matrix_order, char jobu1, char jobu2,
+                           char jobv1t, char jobv2t, char trans, char signs,
+                           lapack_int m, lapack_int p, lapack_int q,
+                           lapack_complex_float* x11, lapack_int ldx11,
+                           lapack_complex_float* x12, lapack_int ldx12,
+                           lapack_complex_float* x21, lapack_int ldx21,
+                           lapack_complex_float* x22, lapack_int ldx22,
+                           float* theta, lapack_complex_float* u1,
+                           lapack_int ldu1, lapack_complex_float* u2,
+                           lapack_int ldu2, lapack_complex_float* v1t,
+                           lapack_int ldv1t, lapack_complex_float* v2t,
+                           lapack_int ldv2t );
+lapack_int LAPACKE_cuncsd_work( int matrix_order, char jobu1, char jobu2,
+                                char jobv1t, char jobv2t, char trans,
+                                char signs, lapack_int m, lapack_int p,
+                                lapack_int q, lapack_complex_float* x11,
+                                lapack_int ldx11, lapack_complex_float* x12,
+                                lapack_int ldx12, lapack_complex_float* x21,
+                                lapack_int ldx21, lapack_complex_float* x22,
+                                lapack_int ldx22, float* theta,
+                                lapack_complex_float* u1, lapack_int ldu1,
+                                lapack_complex_float* u2, lapack_int ldu2,
+                                lapack_complex_float* v1t, lapack_int ldv1t,
+                                lapack_complex_float* v2t, lapack_int ldv2t,
+                                lapack_complex_float* work, lapack_int lwork,
+                                float* rwork, lapack_int lrwork,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dbbcsd( int matrix_order, char jobu1, char jobu2,
+                           char jobv1t, char jobv2t, char trans, lapack_int m,
+                           lapack_int p, lapack_int q, double* theta,
+                           double* phi, double* u1, lapack_int ldu1, double* u2,
+                           lapack_int ldu2, double* v1t, lapack_int ldv1t,
+                           double* v2t, lapack_int ldv2t, double* b11d,
+                           double* b11e, double* b12d, double* b12e,
+                           double* b21d, double* b21e, double* b22d,
+                           double* b22e );
+lapack_int LAPACKE_dbbcsd_work( int matrix_order, char jobu1, char jobu2,
+                                char jobv1t, char jobv2t, char trans,
+                                lapack_int m, lapack_int p, lapack_int q,
+                                double* theta, double* phi, double* u1,
+                                lapack_int ldu1, double* u2, lapack_int ldu2,
+                                double* v1t, lapack_int ldv1t, double* v2t,
+                                lapack_int ldv2t, double* b11d, double* b11e,
+                                double* b12d, double* b12e, double* b21d,
+                                double* b21e, double* b22d, double* b22e,
+                                double* work, lapack_int lwork );
+lapack_int LAPACKE_dorbdb( int matrix_order, char trans, char signs,
+                           lapack_int m, lapack_int p, lapack_int q,
+                           double* x11, lapack_int ldx11, double* x12,
+                           lapack_int ldx12, double* x21, lapack_int ldx21,
+                           double* x22, lapack_int ldx22, double* theta,
+                           double* phi, double* taup1, double* taup2,
+                           double* tauq1, double* tauq2 );
+lapack_int LAPACKE_dorbdb_work( int matrix_order, char trans, char signs,
+                                lapack_int m, lapack_int p, lapack_int q,
+                                double* x11, lapack_int ldx11, double* x12,
+                                lapack_int ldx12, double* x21, lapack_int ldx21,
+                                double* x22, lapack_int ldx22, double* theta,
+                                double* phi, double* taup1, double* taup2,
+                                double* tauq1, double* tauq2, double* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_dorcsd( int matrix_order, char jobu1, char jobu2,
+                           char jobv1t, char jobv2t, char trans, char signs,
+                           lapack_int m, lapack_int p, lapack_int q,
+                           double* x11, lapack_int ldx11, double* x12,
+                           lapack_int ldx12, double* x21, lapack_int ldx21,
+                           double* x22, lapack_int ldx22, double* theta,
+                           double* u1, lapack_int ldu1, double* u2,
+                           lapack_int ldu2, double* v1t, lapack_int ldv1t,
+                           double* v2t, lapack_int ldv2t );
+lapack_int LAPACKE_dorcsd_work( int matrix_order, char jobu1, char jobu2,
+                                char jobv1t, char jobv2t, char trans,
+                                char signs, lapack_int m, lapack_int p,
+                                lapack_int q, double* x11, lapack_int ldx11,
+                                double* x12, lapack_int ldx12, double* x21,
+                                lapack_int ldx21, double* x22, lapack_int ldx22,
+                                double* theta, double* u1, lapack_int ldu1,
+                                double* u2, lapack_int ldu2, double* v1t,
+                                lapack_int ldv1t, double* v2t, lapack_int ldv2t,
+                                double* work, lapack_int lwork,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dsyconv( int matrix_order, char uplo, char way, lapack_int n,
+                            double* a, lapack_int lda, const lapack_int* ipiv );
+lapack_int LAPACKE_dsyconv_work( int matrix_order, char uplo, char way,
+                                 lapack_int n, double* a, lapack_int lda,
+                                 const lapack_int* ipiv, double* work );
+lapack_int LAPACKE_dsyswapr( int matrix_order, char uplo, lapack_int n,
+                             double* a, lapack_int i1, lapack_int i2 );
+lapack_int LAPACKE_dsyswapr_work( int matrix_order, char uplo, lapack_int n,
+                                  double* a, lapack_int i1, lapack_int i2 );
+lapack_int LAPACKE_dsytri2( int matrix_order, char uplo, lapack_int n,
+                            double* a, lapack_int lda, const lapack_int* ipiv );
+lapack_int LAPACKE_dsytri2_work( int matrix_order, char uplo, lapack_int n,
+                                 double* a, lapack_int lda,
+                                 const lapack_int* ipiv,
+                                 lapack_complex_double* work, lapack_int lwork );
+lapack_int LAPACKE_dsytri2x( int matrix_order, char uplo, lapack_int n,
+                             double* a, lapack_int lda, const lapack_int* ipiv,
+                             lapack_int nb );
+lapack_int LAPACKE_dsytri2x_work( int matrix_order, char uplo, lapack_int n,
+                                  double* a, lapack_int lda,
+                                  const lapack_int* ipiv, double* work,
+                                  lapack_int nb );
+lapack_int LAPACKE_dsytrs2( int matrix_order, char uplo, lapack_int n,
+                            lapack_int nrhs, const double* a, lapack_int lda,
+                            const lapack_int* ipiv, double* b, lapack_int ldb );
+lapack_int LAPACKE_dsytrs2_work( int matrix_order, char uplo, lapack_int n,
+                                 lapack_int nrhs, const double* a,
+                                 lapack_int lda, const lapack_int* ipiv,
+                                 double* b, lapack_int ldb, double* work );
+lapack_int LAPACKE_sbbcsd( int matrix_order, char jobu1, char jobu2,
+                           char jobv1t, char jobv2t, char trans, lapack_int m,
+                           lapack_int p, lapack_int q, float* theta, float* phi,
+                           float* u1, lapack_int ldu1, float* u2,
+                           lapack_int ldu2, float* v1t, lapack_int ldv1t,
+                           float* v2t, lapack_int ldv2t, float* b11d,
+                           float* b11e, float* b12d, float* b12e, float* b21d,
+                           float* b21e, float* b22d, float* b22e );
+lapack_int LAPACKE_sbbcsd_work( int matrix_order, char jobu1, char jobu2,
+                                char jobv1t, char jobv2t, char trans,
+                                lapack_int m, lapack_int p, lapack_int q,
+                                float* theta, float* phi, float* u1,
+                                lapack_int ldu1, float* u2, lapack_int ldu2,
+                                float* v1t, lapack_int ldv1t, float* v2t,
+                                lapack_int ldv2t, float* b11d, float* b11e,
+                                float* b12d, float* b12e, float* b21d,
+                                float* b21e, float* b22d, float* b22e,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_sorbdb( int matrix_order, char trans, char signs,
+                           lapack_int m, lapack_int p, lapack_int q, float* x11,
+                           lapack_int ldx11, float* x12, lapack_int ldx12,
+                           float* x21, lapack_int ldx21, float* x22,
+                           lapack_int ldx22, float* theta, float* phi,
+                           float* taup1, float* taup2, float* tauq1,
+                           float* tauq2 );
+lapack_int LAPACKE_sorbdb_work( int matrix_order, char trans, char signs,
+                                lapack_int m, lapack_int p, lapack_int q,
+                                float* x11, lapack_int ldx11, float* x12,
+                                lapack_int ldx12, float* x21, lapack_int ldx21,
+                                float* x22, lapack_int ldx22, float* theta,
+                                float* phi, float* taup1, float* taup2,
+                                float* tauq1, float* tauq2, float* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_sorcsd( int matrix_order, char jobu1, char jobu2,
+                           char jobv1t, char jobv2t, char trans, char signs,
+                           lapack_int m, lapack_int p, lapack_int q, float* x11,
+                           lapack_int ldx11, float* x12, lapack_int ldx12,
+                           float* x21, lapack_int ldx21, float* x22,
+                           lapack_int ldx22, float* theta, float* u1,
+                           lapack_int ldu1, float* u2, lapack_int ldu2,
+                           float* v1t, lapack_int ldv1t, float* v2t,
+                           lapack_int ldv2t );
+lapack_int LAPACKE_sorcsd_work( int matrix_order, char jobu1, char jobu2,
+                                char jobv1t, char jobv2t, char trans,
+                                char signs, lapack_int m, lapack_int p,
+                                lapack_int q, float* x11, lapack_int ldx11,
+                                float* x12, lapack_int ldx12, float* x21,
+                                lapack_int ldx21, float* x22, lapack_int ldx22,
+                                float* theta, float* u1, lapack_int ldu1,
+                                float* u2, lapack_int ldu2, float* v1t,
+                                lapack_int ldv1t, float* v2t, lapack_int ldv2t,
+                                float* work, lapack_int lwork,
+                                lapack_int* iwork );
+lapack_int LAPACKE_ssyconv( int matrix_order, char uplo, char way, lapack_int n,
+                            float* a, lapack_int lda, const lapack_int* ipiv );
+lapack_int LAPACKE_ssyconv_work( int matrix_order, char uplo, char way,
+                                 lapack_int n, float* a, lapack_int lda,
+                                 const lapack_int* ipiv, float* work );
+lapack_int LAPACKE_ssyswapr( int matrix_order, char uplo, lapack_int n,
+                             float* a, lapack_int i1, lapack_int i2 );
+lapack_int LAPACKE_ssyswapr_work( int matrix_order, char uplo, lapack_int n,
+                                  float* a, lapack_int i1, lapack_int i2 );
+lapack_int LAPACKE_ssytri2( int matrix_order, char uplo, lapack_int n, float* a,
+                            lapack_int lda, const lapack_int* ipiv );
+lapack_int LAPACKE_ssytri2_work( int matrix_order, char uplo, lapack_int n,
+                                 float* a, lapack_int lda,
+                                 const lapack_int* ipiv,
+                                 lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_ssytri2x( int matrix_order, char uplo, lapack_int n,
+                             float* a, lapack_int lda, const lapack_int* ipiv,
+                             lapack_int nb );
+lapack_int LAPACKE_ssytri2x_work( int matrix_order, char uplo, lapack_int n,
+                                  float* a, lapack_int lda,
+                                  const lapack_int* ipiv, float* work,
+                                  lapack_int nb );
+lapack_int LAPACKE_ssytrs2( int matrix_order, char uplo, lapack_int n,
+                            lapack_int nrhs, const float* a, lapack_int lda,
+                            const lapack_int* ipiv, float* b, lapack_int ldb );
+lapack_int LAPACKE_ssytrs2_work( int matrix_order, char uplo, lapack_int n,
+                                 lapack_int nrhs, const float* a,
+                                 lapack_int lda, const lapack_int* ipiv,
+                                 float* b, lapack_int ldb, float* work );
+lapack_int LAPACKE_zbbcsd( int matrix_order, char jobu1, char jobu2,
+                           char jobv1t, char jobv2t, char trans, lapack_int m,
+                           lapack_int p, lapack_int q, double* theta,
+                           double* phi, lapack_complex_double* u1,
+                           lapack_int ldu1, lapack_complex_double* u2,
+                           lapack_int ldu2, lapack_complex_double* v1t,
+                           lapack_int ldv1t, lapack_complex_double* v2t,
+                           lapack_int ldv2t, double* b11d, double* b11e,
+                           double* b12d, double* b12e, double* b21d,
+                           double* b21e, double* b22d, double* b22e );
+lapack_int LAPACKE_zbbcsd_work( int matrix_order, char jobu1, char jobu2,
+                                char jobv1t, char jobv2t, char trans,
+                                lapack_int m, lapack_int p, lapack_int q,
+                                double* theta, double* phi,
+                                lapack_complex_double* u1, lapack_int ldu1,
+                                lapack_complex_double* u2, lapack_int ldu2,
+                                lapack_complex_double* v1t, lapack_int ldv1t,
+                                lapack_complex_double* v2t, lapack_int ldv2t,
+                                double* b11d, double* b11e, double* b12d,
+                                double* b12e, double* b21d, double* b21e,
+                                double* b22d, double* b22e, double* rwork,
+                                lapack_int lrwork );
+lapack_int LAPACKE_zheswapr( int matrix_order, char uplo, lapack_int n,
+                             lapack_complex_double* a, lapack_int i1,
+                             lapack_int i2 );
+lapack_int LAPACKE_zheswapr_work( int matrix_order, char uplo, lapack_int n,
+                                  lapack_complex_double* a, lapack_int i1,
+                                  lapack_int i2 );
+lapack_int LAPACKE_zhetri2( int matrix_order, char uplo, lapack_int n,
+                            lapack_complex_double* a, lapack_int lda,
+                            const lapack_int* ipiv );
+lapack_int LAPACKE_zhetri2_work( int matrix_order, char uplo, lapack_int n,
+                                 lapack_complex_double* a, lapack_int lda,
+                                 const lapack_int* ipiv,
+                                 lapack_complex_double* work, lapack_int lwork );
+lapack_int LAPACKE_zhetri2x( int matrix_order, char uplo, lapack_int n,
+                             lapack_complex_double* a, lapack_int lda,
+                             const lapack_int* ipiv, lapack_int nb );
+lapack_int LAPACKE_zhetri2x_work( int matrix_order, char uplo, lapack_int n,
+                                  lapack_complex_double* a, lapack_int lda,
+                                  const lapack_int* ipiv,
+                                  lapack_complex_double* work, lapack_int nb );
+lapack_int LAPACKE_zhetrs2( int matrix_order, char uplo, lapack_int n,
+                            lapack_int nrhs, const lapack_complex_double* a,
+                            lapack_int lda, const lapack_int* ipiv,
+                            lapack_complex_double* b, lapack_int ldb );
+lapack_int LAPACKE_zhetrs2_work( int matrix_order, char uplo, lapack_int n,
+                                 lapack_int nrhs, const lapack_complex_double* a,
+                                 lapack_int lda, const lapack_int* ipiv,
+                                 lapack_complex_double* b, lapack_int ldb,
+                                 lapack_complex_double* work );
+lapack_int LAPACKE_zsyconv( int matrix_order, char uplo, char way, lapack_int n,
+                            lapack_complex_double* a, lapack_int lda,
+                            const lapack_int* ipiv );
+lapack_int LAPACKE_zsyconv_work( int matrix_order, char uplo, char way,
+                                 lapack_int n, lapack_complex_double* a,
+                                 lapack_int lda, const lapack_int* ipiv,
+                                 lapack_complex_double* work );
+lapack_int LAPACKE_zsyswapr( int matrix_order, char uplo, lapack_int n,
+                             lapack_complex_double* a, lapack_int i1,
+                             lapack_int i2 );
+lapack_int LAPACKE_zsyswapr_work( int matrix_order, char uplo, lapack_int n,
+                                  lapack_complex_double* a, lapack_int i1,
+                                  lapack_int i2 );
+lapack_int LAPACKE_zsytri2( int matrix_order, char uplo, lapack_int n,
+                            lapack_complex_double* a, lapack_int lda,
+                            const lapack_int* ipiv );
+lapack_int LAPACKE_zsytri2_work( int matrix_order, char uplo, lapack_int n,
+                                 lapack_complex_double* a, lapack_int lda,
+                                 const lapack_int* ipiv,
+                                 lapack_complex_double* work, lapack_int lwork );
+lapack_int LAPACKE_zsytri2x( int matrix_order, char uplo, lapack_int n,
+                             lapack_complex_double* a, lapack_int lda,
+                             const lapack_int* ipiv, lapack_int nb );
+lapack_int LAPACKE_zsytri2x_work( int matrix_order, char uplo, lapack_int n,
+                                  lapack_complex_double* a, lapack_int lda,
+                                  const lapack_int* ipiv,
+                                  lapack_complex_double* work, lapack_int nb );
+lapack_int LAPACKE_zsytrs2( int matrix_order, char uplo, lapack_int n,
+                            lapack_int nrhs, const lapack_complex_double* a,
+                            lapack_int lda, const lapack_int* ipiv,
+                            lapack_complex_double* b, lapack_int ldb );
+lapack_int LAPACKE_zsytrs2_work( int matrix_order, char uplo, lapack_int n,
+                                 lapack_int nrhs, const lapack_complex_double* a,
+                                 lapack_int lda, const lapack_int* ipiv,
+                                 lapack_complex_double* b, lapack_int ldb,
+                                 lapack_complex_double* work );
+lapack_int LAPACKE_zunbdb( int matrix_order, char trans, char signs,
+                           lapack_int m, lapack_int p, lapack_int q,
+                           lapack_complex_double* x11, lapack_int ldx11,
+                           lapack_complex_double* x12, lapack_int ldx12,
+                           lapack_complex_double* x21, lapack_int ldx21,
+                           lapack_complex_double* x22, lapack_int ldx22,
+                           double* theta, double* phi,
+                           lapack_complex_double* taup1,
+                           lapack_complex_double* taup2,
+                           lapack_complex_double* tauq1,
+                           lapack_complex_double* tauq2 );
+lapack_int LAPACKE_zunbdb_work( int matrix_order, char trans, char signs,
+                                lapack_int m, lapack_int p, lapack_int q,
+                                lapack_complex_double* x11, lapack_int ldx11,
+                                lapack_complex_double* x12, lapack_int ldx12,
+                                lapack_complex_double* x21, lapack_int ldx21,
+                                lapack_complex_double* x22, lapack_int ldx22,
+                                double* theta, double* phi,
+                                lapack_complex_double* taup1,
+                                lapack_complex_double* taup2,
+                                lapack_complex_double* tauq1,
+                                lapack_complex_double* tauq2,
+                                lapack_complex_double* work, lapack_int lwork );
+lapack_int LAPACKE_zuncsd( int matrix_order, char jobu1, char jobu2,
+                           char jobv1t, char jobv2t, char trans, char signs,
+                           lapack_int m, lapack_int p, lapack_int q,
+                           lapack_complex_double* x11, lapack_int ldx11,
+                           lapack_complex_double* x12, lapack_int ldx12,
+                           lapack_complex_double* x21, lapack_int ldx21,
+                           lapack_complex_double* x22, lapack_int ldx22,
+                           double* theta, lapack_complex_double* u1,
+                           lapack_int ldu1, lapack_complex_double* u2,
+                           lapack_int ldu2, lapack_complex_double* v1t,
+                           lapack_int ldv1t, lapack_complex_double* v2t,
+                           lapack_int ldv2t );
+lapack_int LAPACKE_zuncsd_work( int matrix_order, char jobu1, char jobu2,
+                                char jobv1t, char jobv2t, char trans,
+                                char signs, lapack_int m, lapack_int p,
+                                lapack_int q, lapack_complex_double* x11,
+                                lapack_int ldx11, lapack_complex_double* x12,
+                                lapack_int ldx12, lapack_complex_double* x21,
+                                lapack_int ldx21, lapack_complex_double* x22,
+                                lapack_int ldx22, double* theta,
+                                lapack_complex_double* u1, lapack_int ldu1,
+                                lapack_complex_double* u2, lapack_int ldu2,
+                                lapack_complex_double* v1t, lapack_int ldv1t,
+                                lapack_complex_double* v2t, lapack_int ldv2t,
+                                lapack_complex_double* work, lapack_int lwork,
+                                double* rwork, lapack_int lrwork,
+                                lapack_int* iwork );
+//LAPACK 3.4.0
+lapack_int LAPACKE_sgemqrt( int matrix_order, char side, char trans,
+                            lapack_int m, lapack_int n, lapack_int k,
+                            lapack_int nb, const float* v, lapack_int ldv,
+                            const float* t, lapack_int ldt, float* c,
+                            lapack_int ldc );
+lapack_int LAPACKE_dgemqrt( int matrix_order, char side, char trans,
+                            lapack_int m, lapack_int n, lapack_int k,
+                            lapack_int nb, const double* v, lapack_int ldv,
+                            const double* t, lapack_int ldt, double* c,
+                            lapack_int ldc );
+lapack_int LAPACKE_cgemqrt( int matrix_order, char side, char trans,
+                            lapack_int m, lapack_int n, lapack_int k,
+                            lapack_int nb, const lapack_complex_float* v,
+                            lapack_int ldv, const lapack_complex_float* t,
+                            lapack_int ldt, lapack_complex_float* c,
+                            lapack_int ldc );
+lapack_int LAPACKE_zgemqrt( int matrix_order, char side, char trans,
+                            lapack_int m, lapack_int n, lapack_int k,
+                            lapack_int nb, const lapack_complex_double* v,
+                            lapack_int ldv, const lapack_complex_double* t,
+                            lapack_int ldt, lapack_complex_double* c,
+                            lapack_int ldc );
+
+lapack_int LAPACKE_sgeqrt( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int nb, float* a, lapack_int lda, float* t,
+                           lapack_int ldt );
+lapack_int LAPACKE_dgeqrt( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int nb, double* a, lapack_int lda, double* t,
+                           lapack_int ldt );
+lapack_int LAPACKE_cgeqrt( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int nb, lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* t,
+                           lapack_int ldt );
+lapack_int LAPACKE_zgeqrt( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int nb, lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* t,
+                           lapack_int ldt );
+
+lapack_int LAPACKE_sgeqrt2( int matrix_order, lapack_int m, lapack_int n,
+                            float* a, lapack_int lda, float* t,
+                            lapack_int ldt );
+lapack_int LAPACKE_dgeqrt2( int matrix_order, lapack_int m, lapack_int n,
+                            double* a, lapack_int lda, double* t,
+                            lapack_int ldt );
+lapack_int LAPACKE_cgeqrt2( int matrix_order, lapack_int m, lapack_int n,
+                            lapack_complex_float* a, lapack_int lda,
+                            lapack_complex_float* t, lapack_int ldt );
+lapack_int LAPACKE_zgeqrt2( int matrix_order, lapack_int m, lapack_int n,
+                            lapack_complex_double* a, lapack_int lda,
+                            lapack_complex_double* t, lapack_int ldt );
+
+lapack_int LAPACKE_sgeqrt3( int matrix_order, lapack_int m, lapack_int n,
+                            float* a, lapack_int lda, float* t,
+                            lapack_int ldt );
+lapack_int LAPACKE_dgeqrt3( int matrix_order, lapack_int m, lapack_int n,
+                            double* a, lapack_int lda, double* t,
+                            lapack_int ldt );
+lapack_int LAPACKE_cgeqrt3( int matrix_order, lapack_int m, lapack_int n,
+                            lapack_complex_float* a, lapack_int lda,
+                            lapack_complex_float* t, lapack_int ldt );
+lapack_int LAPACKE_zgeqrt3( int matrix_order, lapack_int m, lapack_int n,
+                            lapack_complex_double* a, lapack_int lda,
+                            lapack_complex_double* t, lapack_int ldt );
+
+lapack_int LAPACKE_stpmqrt( int matrix_order, char side, char trans,
+                            lapack_int m, lapack_int n, lapack_int k,
+                            lapack_int l, lapack_int nb, const float* v,
+                            lapack_int ldv, const float* t, lapack_int ldt,
+                            float* a, lapack_int lda, float* b,
+                            lapack_int ldb );
+lapack_int LAPACKE_dtpmqrt( int matrix_order, char side, char trans,
+                            lapack_int m, lapack_int n, lapack_int k,
+                            lapack_int l, lapack_int nb, const double* v,
+                            lapack_int ldv, const double* t, lapack_int ldt,
+                            double* a, lapack_int lda, double* b,
+                            lapack_int ldb );
+lapack_int LAPACKE_ctpmqrt( int matrix_order, char side, char trans,
+                            lapack_int m, lapack_int n, lapack_int k,
+                            lapack_int l, lapack_int nb,
+                            const lapack_complex_float* v, lapack_int ldv,
+                            const lapack_complex_float* t, lapack_int ldt,
+                            lapack_complex_float* a, lapack_int lda,
+                            lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_ztpmqrt( int matrix_order, char side, char trans,
+                            lapack_int m, lapack_int n, lapack_int k,
+                            lapack_int l, lapack_int nb,
+                            const lapack_complex_double* v, lapack_int ldv,
+                            const lapack_complex_double* t, lapack_int ldt,
+                            lapack_complex_double* a, lapack_int lda,
+                            lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_dtpqrt( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int l, lapack_int nb, double* a,
+                           lapack_int lda, double* b, lapack_int ldb, double* t,
+                           lapack_int ldt );
+lapack_int LAPACKE_ctpqrt( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int l, lapack_int nb, lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* t,
+                           lapack_complex_float* b, lapack_int ldb,
+                           lapack_int ldt );
+lapack_int LAPACKE_ztpqrt( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int l, lapack_int nb,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* t, lapack_int ldt );
+
+lapack_int LAPACKE_stpqrt2( int matrix_order, lapack_int m, lapack_int n,
+                            float* a, lapack_int lda, float* b, lapack_int ldb,
+                            float* t, lapack_int ldt );
+lapack_int LAPACKE_dtpqrt2( int matrix_order, lapack_int m, lapack_int n,
+                            double* a, lapack_int lda, double* b,
+                            lapack_int ldb, double* t, lapack_int ldt );
+lapack_int LAPACKE_ctpqrt2( int matrix_order, lapack_int m, lapack_int n,
+                            lapack_complex_float* a, lapack_int lda,
+                            lapack_complex_float* b, lapack_int ldb,
+                            lapack_complex_float* t, lapack_int ldt );
+lapack_int LAPACKE_ztpqrt2( int matrix_order, lapack_int m, lapack_int n,
+                            lapack_complex_double* a, lapack_int lda,
+                            lapack_complex_double* b, lapack_int ldb,
+                            lapack_complex_double* t, lapack_int ldt );
+
+lapack_int LAPACKE_stprfb( int matrix_order, char side, char trans, char direct,
+                           char storev, lapack_int m, lapack_int n,
+                           lapack_int k, lapack_int l, const float* v,
+                           lapack_int ldv, const float* t, lapack_int ldt,
+                           float* a, lapack_int lda, float* b, lapack_int ldb,
+                           lapack_int myldwork );
+lapack_int LAPACKE_dtprfb( int matrix_order, char side, char trans, char direct,
+                           char storev, lapack_int m, lapack_int n,
+                           lapack_int k, lapack_int l, const double* v,
+                           lapack_int ldv, const double* t, lapack_int ldt,
+                           double* a, lapack_int lda, double* b, lapack_int ldb,
+                           lapack_int myldwork );
+lapack_int LAPACKE_ctprfb( int matrix_order, char side, char trans, char direct,
+                           char storev, lapack_int m, lapack_int n,
+                           lapack_int k, lapack_int l,
+                           const lapack_complex_float* v, lapack_int ldv,
+                           const lapack_complex_float* t, lapack_int ldt,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* b, lapack_int ldb,
+                           lapack_int myldwork );
+lapack_int LAPACKE_ztprfb( int matrix_order, char side, char trans, char direct,
+                           char storev, lapack_int m, lapack_int n,
+                           lapack_int k, lapack_int l,
+                           const lapack_complex_double* v, lapack_int ldv,
+                           const lapack_complex_double* t, lapack_int ldt,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* b, lapack_int ldb,
+                           lapack_int myldwork );
+
+lapack_int LAPACKE_sgemqrt_work( int matrix_order, char side, char trans,
+                                 lapack_int m, lapack_int n, lapack_int k,
+                                 lapack_int nb, const float* v, lapack_int ldv,
+                                 const float* t, lapack_int ldt, float* c,
+                                 lapack_int ldc, float* work );
+lapack_int LAPACKE_dgemqrt_work( int matrix_order, char side, char trans,
+                                 lapack_int m, lapack_int n, lapack_int k,
+                                 lapack_int nb, const double* v, lapack_int ldv,
+                                 const double* t, lapack_int ldt, double* c,
+                                 lapack_int ldc, double* work );
+lapack_int LAPACKE_cgemqrt_work( int matrix_order, char side, char trans,
+                                 lapack_int m, lapack_int n, lapack_int k,
+                                 lapack_int nb, const lapack_complex_float* v,
+                                 lapack_int ldv, const lapack_complex_float* t,
+                                 lapack_int ldt, lapack_complex_float* c,
+                                 lapack_int ldc, lapack_complex_float* work );
+lapack_int LAPACKE_zgemqrt_work( int matrix_order, char side, char trans,
+                                 lapack_int m, lapack_int n, lapack_int k,
+                                 lapack_int nb, const lapack_complex_double* v,
+                                 lapack_int ldv, const lapack_complex_double* t,
+                                 lapack_int ldt, lapack_complex_double* c,
+                                 lapack_int ldc, lapack_complex_double* work );
+
+lapack_int LAPACKE_sgeqrt_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int nb, float* a, lapack_int lda,
+                                float* t, lapack_int ldt, float* work );
+lapack_int LAPACKE_dgeqrt_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int nb, double* a, lapack_int lda,
+                                double* t, lapack_int ldt, double* work );
+lapack_int LAPACKE_cgeqrt_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int nb, lapack_complex_float* a,
+                                lapack_int lda, lapack_complex_float* t,
+                                lapack_int ldt, lapack_complex_float* work );
+lapack_int LAPACKE_zgeqrt_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int nb, lapack_complex_double* a,
+                                lapack_int lda, lapack_complex_double* t,
+                                lapack_int ldt, lapack_complex_double* work );
+
+lapack_int LAPACKE_sgeqrt2_work( int matrix_order, lapack_int m, lapack_int n,
+                                 float* a, lapack_int lda, float* t,
+                                 lapack_int ldt );
+lapack_int LAPACKE_dgeqrt2_work( int matrix_order, lapack_int m, lapack_int n,
+                                 double* a, lapack_int lda, double* t,
+                                 lapack_int ldt );
+lapack_int LAPACKE_cgeqrt2_work( int matrix_order, lapack_int m, lapack_int n,
+                                 lapack_complex_float* a, lapack_int lda,
+                                 lapack_complex_float* t, lapack_int ldt );
+lapack_int LAPACKE_zgeqrt2_work( int matrix_order, lapack_int m, lapack_int n,
+                                 lapack_complex_double* a, lapack_int lda,
+                                 lapack_complex_double* t, lapack_int ldt );
+
+lapack_int LAPACKE_sgeqrt3_work( int matrix_order, lapack_int m, lapack_int n,
+                                 float* a, lapack_int lda, float* t,
+                                 lapack_int ldt );
+lapack_int LAPACKE_dgeqrt3_work( int matrix_order, lapack_int m, lapack_int n,
+                                 double* a, lapack_int lda, double* t,
+                                 lapack_int ldt );
+lapack_int LAPACKE_cgeqrt3_work( int matrix_order, lapack_int m, lapack_int n,
+                                 lapack_complex_float* a, lapack_int lda,
+                                 lapack_complex_float* t, lapack_int ldt );
+lapack_int LAPACKE_zgeqrt3_work( int matrix_order, lapack_int m, lapack_int n,
+                                 lapack_complex_double* a, lapack_int lda,
+                                 lapack_complex_double* t, lapack_int ldt );
+
+lapack_int LAPACKE_stpmqrt_work( int matrix_order, char side, char trans,
+                                 lapack_int m, lapack_int n, lapack_int k,
+                                 lapack_int l, lapack_int nb, const float* v,
+                                 lapack_int ldv, const float* t, lapack_int ldt,
+                                 float* a, lapack_int lda, float* b,
+                                 lapack_int ldb, float* work );
+lapack_int LAPACKE_dtpmqrt_work( int matrix_order, char side, char trans,
+                                 lapack_int m, lapack_int n, lapack_int k,
+                                 lapack_int l, lapack_int nb, const double* v,
+                                 lapack_int ldv, const double* t,
+                                 lapack_int ldt, double* a, lapack_int lda,
+                                 double* b, lapack_int ldb, double* work );
+lapack_int LAPACKE_ctpmqrt_work( int matrix_order, char side, char trans,
+                                 lapack_int m, lapack_int n, lapack_int k,
+                                 lapack_int l, lapack_int nb,
+                                 const lapack_complex_float* v, lapack_int ldv,
+                                 const lapack_complex_float* t, lapack_int ldt,
+                                 lapack_complex_float* a, lapack_int lda,
+                                 lapack_complex_float* b, lapack_int ldb,
+                                 lapack_complex_float* work );
+lapack_int LAPACKE_ztpmqrt_work( int matrix_order, char side, char trans,
+                                 lapack_int m, lapack_int n, lapack_int k,
+                                 lapack_int l, lapack_int nb,
+                                 const lapack_complex_double* v, lapack_int ldv,
+                                 const lapack_complex_double* t, lapack_int ldt,
+                                 lapack_complex_double* a, lapack_int lda,
+                                 lapack_complex_double* b, lapack_int ldb,
+                                 lapack_complex_double* work );
+
+lapack_int LAPACKE_dtpqrt_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int l, lapack_int nb, double* a,
+                                lapack_int lda, double* b, lapack_int ldb,
+                                double* t, lapack_int ldt, double* work );
+lapack_int LAPACKE_ctpqrt_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int l, lapack_int nb,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* t,
+                                lapack_complex_float* b, lapack_int ldb,
+                                lapack_int ldt, lapack_complex_float* work );
+lapack_int LAPACKE_ztpqrt_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int l, lapack_int nb,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* t, lapack_int ldt,
+                                lapack_complex_double* work );
+
+lapack_int LAPACKE_stpqrt2_work( int matrix_order, lapack_int m, lapack_int n,
+                                 float* a, lapack_int lda, float* b,
+                                 lapack_int ldb, float* t, lapack_int ldt );
+lapack_int LAPACKE_dtpqrt2_work( int matrix_order, lapack_int m, lapack_int n,
+                                 double* a, lapack_int lda, double* b,
+                                 lapack_int ldb, double* t, lapack_int ldt );
+lapack_int LAPACKE_ctpqrt2_work( int matrix_order, lapack_int m, lapack_int n,
+                                 lapack_complex_float* a, lapack_int lda,
+                                 lapack_complex_float* b, lapack_int ldb,
+                                 lapack_complex_float* t, lapack_int ldt );
+lapack_int LAPACKE_ztpqrt2_work( int matrix_order, lapack_int m, lapack_int n,
+                                 lapack_complex_double* a, lapack_int lda,
+                                 lapack_complex_double* b, lapack_int ldb,
+                                 lapack_complex_double* t, lapack_int ldt );
+
+lapack_int LAPACKE_stprfb_work( int matrix_order, char side, char trans,
+                                char direct, char storev, lapack_int m,
+                                lapack_int n, lapack_int k, lapack_int l,
+                                const float* v, lapack_int ldv, const float* t,
+                                lapack_int ldt, float* a, lapack_int lda,
+                                float* b, lapack_int ldb, const float* mywork,
+                                lapack_int myldwork );
+lapack_int LAPACKE_dtprfb_work( int matrix_order, char side, char trans,
+                                char direct, char storev, lapack_int m,
+                                lapack_int n, lapack_int k, lapack_int l,
+                                const double* v, lapack_int ldv,
+                                const double* t, lapack_int ldt, double* a,
+                                lapack_int lda, double* b, lapack_int ldb,
+                                const double* mywork, lapack_int myldwork );
+lapack_int LAPACKE_ctprfb_work( int matrix_order, char side, char trans,
+                                char direct, char storev, lapack_int m,
+                                lapack_int n, lapack_int k, lapack_int l,
+                                const lapack_complex_float* v, lapack_int ldv,
+                                const lapack_complex_float* t, lapack_int ldt,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* b, lapack_int ldb,
+                                const float* mywork, lapack_int myldwork );
+lapack_int LAPACKE_ztprfb_work( int matrix_order, char side, char trans,
+                                char direct, char storev, lapack_int m,
+                                lapack_int n, lapack_int k, lapack_int l,
+                                const lapack_complex_double* v, lapack_int ldv,
+                                const lapack_complex_double* t, lapack_int ldt,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* b, lapack_int ldb,
+                                const double* mywork, lapack_int myldwork );
+//LAPACK 3.X.X
+lapack_int LAPACKE_csyr( int matrix_order, char uplo, lapack_int n,
+                             lapack_complex_float alpha,
+                             const lapack_complex_float* x, lapack_int incx,
+                             lapack_complex_float* a, lapack_int lda );
+lapack_int LAPACKE_zsyr( int matrix_order, char uplo, lapack_int n,
+                             lapack_complex_double alpha,
+                             const lapack_complex_double* x, lapack_int incx,
+                             lapack_complex_double* a, lapack_int lda );
+
+lapack_int LAPACKE_csyr_work( int matrix_order, char uplo, lapack_int n,
+                                  lapack_complex_float alpha,
+                                  const lapack_complex_float* x,
+                                  lapack_int incx, lapack_complex_float* a,
+                                  lapack_int lda );
+lapack_int LAPACKE_zsyr_work( int matrix_order, char uplo, lapack_int n,
+                                  lapack_complex_double alpha,
+                                  const lapack_complex_double* x,
+                                  lapack_int incx, lapack_complex_double* a,
+                                  lapack_int lda );
+
+
+
+#define LAPACK_sgetrf LAPACK_GLOBAL(sgetrf,SGETRF)
+#define LAPACK_dgetrf LAPACK_GLOBAL(dgetrf,DGETRF)
+#define LAPACK_cgetrf LAPACK_GLOBAL(cgetrf,CGETRF)
+#define LAPACK_zgetrf LAPACK_GLOBAL(zgetrf,ZGETRF)
+#define LAPACK_sgbtrf LAPACK_GLOBAL(sgbtrf,SGBTRF)
+#define LAPACK_dgbtrf LAPACK_GLOBAL(dgbtrf,DGBTRF)
+#define LAPACK_cgbtrf LAPACK_GLOBAL(cgbtrf,CGBTRF)
+#define LAPACK_zgbtrf LAPACK_GLOBAL(zgbtrf,ZGBTRF)
+#define LAPACK_sgttrf LAPACK_GLOBAL(sgttrf,SGTTRF)
+#define LAPACK_dgttrf LAPACK_GLOBAL(dgttrf,DGTTRF)
+#define LAPACK_cgttrf LAPACK_GLOBAL(cgttrf,CGTTRF)
+#define LAPACK_zgttrf LAPACK_GLOBAL(zgttrf,ZGTTRF)
+#define LAPACK_spotrf LAPACK_GLOBAL(spotrf,SPOTRF)
+#define LAPACK_dpotrf LAPACK_GLOBAL(dpotrf,DPOTRF)
+#define LAPACK_cpotrf LAPACK_GLOBAL(cpotrf,CPOTRF)
+#define LAPACK_zpotrf LAPACK_GLOBAL(zpotrf,ZPOTRF)
+#define LAPACK_dpstrf LAPACK_GLOBAL(dpstrf,DPSTRF)
+#define LAPACK_spstrf LAPACK_GLOBAL(spstrf,SPSTRF)
+#define LAPACK_zpstrf LAPACK_GLOBAL(zpstrf,ZPSTRF)
+#define LAPACK_cpstrf LAPACK_GLOBAL(cpstrf,CPSTRF)
+#define LAPACK_dpftrf LAPACK_GLOBAL(dpftrf,DPFTRF)
+#define LAPACK_spftrf LAPACK_GLOBAL(spftrf,SPFTRF)
+#define LAPACK_zpftrf LAPACK_GLOBAL(zpftrf,ZPFTRF)
+#define LAPACK_cpftrf LAPACK_GLOBAL(cpftrf,CPFTRF)
+#define LAPACK_spptrf LAPACK_GLOBAL(spptrf,SPPTRF)
+#define LAPACK_dpptrf LAPACK_GLOBAL(dpptrf,DPPTRF)
+#define LAPACK_cpptrf LAPACK_GLOBAL(cpptrf,CPPTRF)
+#define LAPACK_zpptrf LAPACK_GLOBAL(zpptrf,ZPPTRF)
+#define LAPACK_spbtrf LAPACK_GLOBAL(spbtrf,SPBTRF)
+#define LAPACK_dpbtrf LAPACK_GLOBAL(dpbtrf,DPBTRF)
+#define LAPACK_cpbtrf LAPACK_GLOBAL(cpbtrf,CPBTRF)
+#define LAPACK_zpbtrf LAPACK_GLOBAL(zpbtrf,ZPBTRF)
+#define LAPACK_spttrf LAPACK_GLOBAL(spttrf,SPTTRF)
+#define LAPACK_dpttrf LAPACK_GLOBAL(dpttrf,DPTTRF)
+#define LAPACK_cpttrf LAPACK_GLOBAL(cpttrf,CPTTRF)
+#define LAPACK_zpttrf LAPACK_GLOBAL(zpttrf,ZPTTRF)
+#define LAPACK_ssytrf LAPACK_GLOBAL(ssytrf,SSYTRF)
+#define LAPACK_dsytrf LAPACK_GLOBAL(dsytrf,DSYTRF)
+#define LAPACK_csytrf LAPACK_GLOBAL(csytrf,CSYTRF)
+#define LAPACK_zsytrf LAPACK_GLOBAL(zsytrf,ZSYTRF)
+#define LAPACK_chetrf LAPACK_GLOBAL(chetrf,CHETRF)
+#define LAPACK_zhetrf LAPACK_GLOBAL(zhetrf,ZHETRF)
+#define LAPACK_ssptrf LAPACK_GLOBAL(ssptrf,SSPTRF)
+#define LAPACK_dsptrf LAPACK_GLOBAL(dsptrf,DSPTRF)
+#define LAPACK_csptrf LAPACK_GLOBAL(csptrf,CSPTRF)
+#define LAPACK_zsptrf LAPACK_GLOBAL(zsptrf,ZSPTRF)
+#define LAPACK_chptrf LAPACK_GLOBAL(chptrf,CHPTRF)
+#define LAPACK_zhptrf LAPACK_GLOBAL(zhptrf,ZHPTRF)
+#define LAPACK_sgetrs LAPACK_GLOBAL(sgetrs,SGETRS)
+#define LAPACK_dgetrs LAPACK_GLOBAL(dgetrs,DGETRS)
+#define LAPACK_cgetrs LAPACK_GLOBAL(cgetrs,CGETRS)
+#define LAPACK_zgetrs LAPACK_GLOBAL(zgetrs,ZGETRS)
+#define LAPACK_sgbtrs LAPACK_GLOBAL(sgbtrs,SGBTRS)
+#define LAPACK_dgbtrs LAPACK_GLOBAL(dgbtrs,DGBTRS)
+#define LAPACK_cgbtrs LAPACK_GLOBAL(cgbtrs,CGBTRS)
+#define LAPACK_zgbtrs LAPACK_GLOBAL(zgbtrs,ZGBTRS)
+#define LAPACK_sgttrs LAPACK_GLOBAL(sgttrs,SGTTRS)
+#define LAPACK_dgttrs LAPACK_GLOBAL(dgttrs,DGTTRS)
+#define LAPACK_cgttrs LAPACK_GLOBAL(cgttrs,CGTTRS)
+#define LAPACK_zgttrs LAPACK_GLOBAL(zgttrs,ZGTTRS)
+#define LAPACK_spotrs LAPACK_GLOBAL(spotrs,SPOTRS)
+#define LAPACK_dpotrs LAPACK_GLOBAL(dpotrs,DPOTRS)
+#define LAPACK_cpotrs LAPACK_GLOBAL(cpotrs,CPOTRS)
+#define LAPACK_zpotrs LAPACK_GLOBAL(zpotrs,ZPOTRS)
+#define LAPACK_dpftrs LAPACK_GLOBAL(dpftrs,DPFTRS)
+#define LAPACK_spftrs LAPACK_GLOBAL(spftrs,SPFTRS)
+#define LAPACK_zpftrs LAPACK_GLOBAL(zpftrs,ZPFTRS)
+#define LAPACK_cpftrs LAPACK_GLOBAL(cpftrs,CPFTRS)
+#define LAPACK_spptrs LAPACK_GLOBAL(spptrs,SPPTRS)
+#define LAPACK_dpptrs LAPACK_GLOBAL(dpptrs,DPPTRS)
+#define LAPACK_cpptrs LAPACK_GLOBAL(cpptrs,CPPTRS)
+#define LAPACK_zpptrs LAPACK_GLOBAL(zpptrs,ZPPTRS)
+#define LAPACK_spbtrs LAPACK_GLOBAL(spbtrs,SPBTRS)
+#define LAPACK_dpbtrs LAPACK_GLOBAL(dpbtrs,DPBTRS)
+#define LAPACK_cpbtrs LAPACK_GLOBAL(cpbtrs,CPBTRS)
+#define LAPACK_zpbtrs LAPACK_GLOBAL(zpbtrs,ZPBTRS)
+#define LAPACK_spttrs LAPACK_GLOBAL(spttrs,SPTTRS)
+#define LAPACK_dpttrs LAPACK_GLOBAL(dpttrs,DPTTRS)
+#define LAPACK_cpttrs LAPACK_GLOBAL(cpttrs,CPTTRS)
+#define LAPACK_zpttrs LAPACK_GLOBAL(zpttrs,ZPTTRS)
+#define LAPACK_ssytrs LAPACK_GLOBAL(ssytrs,SSYTRS)
+#define LAPACK_dsytrs LAPACK_GLOBAL(dsytrs,DSYTRS)
+#define LAPACK_csytrs LAPACK_GLOBAL(csytrs,CSYTRS)
+#define LAPACK_zsytrs LAPACK_GLOBAL(zsytrs,ZSYTRS)
+#define LAPACK_chetrs LAPACK_GLOBAL(chetrs,CHETRS)
+#define LAPACK_zhetrs LAPACK_GLOBAL(zhetrs,ZHETRS)
+#define LAPACK_ssptrs LAPACK_GLOBAL(ssptrs,SSPTRS)
+#define LAPACK_dsptrs LAPACK_GLOBAL(dsptrs,DSPTRS)
+#define LAPACK_csptrs LAPACK_GLOBAL(csptrs,CSPTRS)
+#define LAPACK_zsptrs LAPACK_GLOBAL(zsptrs,ZSPTRS)
+#define LAPACK_chptrs LAPACK_GLOBAL(chptrs,CHPTRS)
+#define LAPACK_zhptrs LAPACK_GLOBAL(zhptrs,ZHPTRS)
+#define LAPACK_strtrs LAPACK_GLOBAL(strtrs,STRTRS)
+#define LAPACK_dtrtrs LAPACK_GLOBAL(dtrtrs,DTRTRS)
+#define LAPACK_ctrtrs LAPACK_GLOBAL(ctrtrs,CTRTRS)
+#define LAPACK_ztrtrs LAPACK_GLOBAL(ztrtrs,ZTRTRS)
+#define LAPACK_stptrs LAPACK_GLOBAL(stptrs,STPTRS)
+#define LAPACK_dtptrs LAPACK_GLOBAL(dtptrs,DTPTRS)
+#define LAPACK_ctptrs LAPACK_GLOBAL(ctptrs,CTPTRS)
+#define LAPACK_ztptrs LAPACK_GLOBAL(ztptrs,ZTPTRS)
+#define LAPACK_stbtrs LAPACK_GLOBAL(stbtrs,STBTRS)
+#define LAPACK_dtbtrs LAPACK_GLOBAL(dtbtrs,DTBTRS)
+#define LAPACK_ctbtrs LAPACK_GLOBAL(ctbtrs,CTBTRS)
+#define LAPACK_ztbtrs LAPACK_GLOBAL(ztbtrs,ZTBTRS)
+#define LAPACK_sgecon LAPACK_GLOBAL(sgecon,SGECON)
+#define LAPACK_dgecon LAPACK_GLOBAL(dgecon,DGECON)
+#define LAPACK_cgecon LAPACK_GLOBAL(cgecon,CGECON)
+#define LAPACK_zgecon LAPACK_GLOBAL(zgecon,ZGECON)
+#define LAPACK_sgbcon LAPACK_GLOBAL(sgbcon,SGBCON)
+#define LAPACK_dgbcon LAPACK_GLOBAL(dgbcon,DGBCON)
+#define LAPACK_cgbcon LAPACK_GLOBAL(cgbcon,CGBCON)
+#define LAPACK_zgbcon LAPACK_GLOBAL(zgbcon,ZGBCON)
+#define LAPACK_sgtcon LAPACK_GLOBAL(sgtcon,SGTCON)
+#define LAPACK_dgtcon LAPACK_GLOBAL(dgtcon,DGTCON)
+#define LAPACK_cgtcon LAPACK_GLOBAL(cgtcon,CGTCON)
+#define LAPACK_zgtcon LAPACK_GLOBAL(zgtcon,ZGTCON)
+#define LAPACK_spocon LAPACK_GLOBAL(spocon,SPOCON)
+#define LAPACK_dpocon LAPACK_GLOBAL(dpocon,DPOCON)
+#define LAPACK_cpocon LAPACK_GLOBAL(cpocon,CPOCON)
+#define LAPACK_zpocon LAPACK_GLOBAL(zpocon,ZPOCON)
+#define LAPACK_sppcon LAPACK_GLOBAL(sppcon,SPPCON)
+#define LAPACK_dppcon LAPACK_GLOBAL(dppcon,DPPCON)
+#define LAPACK_cppcon LAPACK_GLOBAL(cppcon,CPPCON)
+#define LAPACK_zppcon LAPACK_GLOBAL(zppcon,ZPPCON)
+#define LAPACK_spbcon LAPACK_GLOBAL(spbcon,SPBCON)
+#define LAPACK_dpbcon LAPACK_GLOBAL(dpbcon,DPBCON)
+#define LAPACK_cpbcon LAPACK_GLOBAL(cpbcon,CPBCON)
+#define LAPACK_zpbcon LAPACK_GLOBAL(zpbcon,ZPBCON)
+#define LAPACK_sptcon LAPACK_GLOBAL(sptcon,SPTCON)
+#define LAPACK_dptcon LAPACK_GLOBAL(dptcon,DPTCON)
+#define LAPACK_cptcon LAPACK_GLOBAL(cptcon,CPTCON)
+#define LAPACK_zptcon LAPACK_GLOBAL(zptcon,ZPTCON)
+#define LAPACK_ssycon LAPACK_GLOBAL(ssycon,SSYCON)
+#define LAPACK_dsycon LAPACK_GLOBAL(dsycon,DSYCON)
+#define LAPACK_csycon LAPACK_GLOBAL(csycon,CSYCON)
+#define LAPACK_zsycon LAPACK_GLOBAL(zsycon,ZSYCON)
+#define LAPACK_checon LAPACK_GLOBAL(checon,CHECON)
+#define LAPACK_zhecon LAPACK_GLOBAL(zhecon,ZHECON)
+#define LAPACK_sspcon LAPACK_GLOBAL(sspcon,SSPCON)
+#define LAPACK_dspcon LAPACK_GLOBAL(dspcon,DSPCON)
+#define LAPACK_cspcon LAPACK_GLOBAL(cspcon,CSPCON)
+#define LAPACK_zspcon LAPACK_GLOBAL(zspcon,ZSPCON)
+#define LAPACK_chpcon LAPACK_GLOBAL(chpcon,CHPCON)
+#define LAPACK_zhpcon LAPACK_GLOBAL(zhpcon,ZHPCON)
+#define LAPACK_strcon LAPACK_GLOBAL(strcon,STRCON)
+#define LAPACK_dtrcon LAPACK_GLOBAL(dtrcon,DTRCON)
+#define LAPACK_ctrcon LAPACK_GLOBAL(ctrcon,CTRCON)
+#define LAPACK_ztrcon LAPACK_GLOBAL(ztrcon,ZTRCON)
+#define LAPACK_stpcon LAPACK_GLOBAL(stpcon,STPCON)
+#define LAPACK_dtpcon LAPACK_GLOBAL(dtpcon,DTPCON)
+#define LAPACK_ctpcon LAPACK_GLOBAL(ctpcon,CTPCON)
+#define LAPACK_ztpcon LAPACK_GLOBAL(ztpcon,ZTPCON)
+#define LAPACK_stbcon LAPACK_GLOBAL(stbcon,STBCON)
+#define LAPACK_dtbcon LAPACK_GLOBAL(dtbcon,DTBCON)
+#define LAPACK_ctbcon LAPACK_GLOBAL(ctbcon,CTBCON)
+#define LAPACK_ztbcon LAPACK_GLOBAL(ztbcon,ZTBCON)
+#define LAPACK_sgerfs LAPACK_GLOBAL(sgerfs,SGERFS)
+#define LAPACK_dgerfs LAPACK_GLOBAL(dgerfs,DGERFS)
+#define LAPACK_cgerfs LAPACK_GLOBAL(cgerfs,CGERFS)
+#define LAPACK_zgerfs LAPACK_GLOBAL(zgerfs,ZGERFS)
+#define LAPACK_dgerfsx LAPACK_GLOBAL(dgerfsx,DGERFSX)
+#define LAPACK_sgerfsx LAPACK_GLOBAL(sgerfsx,SGERFSX)
+#define LAPACK_zgerfsx LAPACK_GLOBAL(zgerfsx,ZGERFSX)
+#define LAPACK_cgerfsx LAPACK_GLOBAL(cgerfsx,CGERFSX)
+#define LAPACK_sgbrfs LAPACK_GLOBAL(sgbrfs,SGBRFS)
+#define LAPACK_dgbrfs LAPACK_GLOBAL(dgbrfs,DGBRFS)
+#define LAPACK_cgbrfs LAPACK_GLOBAL(cgbrfs,CGBRFS)
+#define LAPACK_zgbrfs LAPACK_GLOBAL(zgbrfs,ZGBRFS)
+#define LAPACK_dgbrfsx LAPACK_GLOBAL(dgbrfsx,DGBRFSX)
+#define LAPACK_sgbrfsx LAPACK_GLOBAL(sgbrfsx,SGBRFSX)
+#define LAPACK_zgbrfsx LAPACK_GLOBAL(zgbrfsx,ZGBRFSX)
+#define LAPACK_cgbrfsx LAPACK_GLOBAL(cgbrfsx,CGBRFSX)
+#define LAPACK_sgtrfs LAPACK_GLOBAL(sgtrfs,SGTRFS)
+#define LAPACK_dgtrfs LAPACK_GLOBAL(dgtrfs,DGTRFS)
+#define LAPACK_cgtrfs LAPACK_GLOBAL(cgtrfs,CGTRFS)
+#define LAPACK_zgtrfs LAPACK_GLOBAL(zgtrfs,ZGTRFS)
+#define LAPACK_sporfs LAPACK_GLOBAL(sporfs,SPORFS)
+#define LAPACK_dporfs LAPACK_GLOBAL(dporfs,DPORFS)
+#define LAPACK_cporfs LAPACK_GLOBAL(cporfs,CPORFS)
+#define LAPACK_zporfs LAPACK_GLOBAL(zporfs,ZPORFS)
+#define LAPACK_dporfsx LAPACK_GLOBAL(dporfsx,DPORFSX)
+#define LAPACK_sporfsx LAPACK_GLOBAL(sporfsx,SPORFSX)
+#define LAPACK_zporfsx LAPACK_GLOBAL(zporfsx,ZPORFSX)
+#define LAPACK_cporfsx LAPACK_GLOBAL(cporfsx,CPORFSX)
+#define LAPACK_spprfs LAPACK_GLOBAL(spprfs,SPPRFS)
+#define LAPACK_dpprfs LAPACK_GLOBAL(dpprfs,DPPRFS)
+#define LAPACK_cpprfs LAPACK_GLOBAL(cpprfs,CPPRFS)
+#define LAPACK_zpprfs LAPACK_GLOBAL(zpprfs,ZPPRFS)
+#define LAPACK_spbrfs LAPACK_GLOBAL(spbrfs,SPBRFS)
+#define LAPACK_dpbrfs LAPACK_GLOBAL(dpbrfs,DPBRFS)
+#define LAPACK_cpbrfs LAPACK_GLOBAL(cpbrfs,CPBRFS)
+#define LAPACK_zpbrfs LAPACK_GLOBAL(zpbrfs,ZPBRFS)
+#define LAPACK_sptrfs LAPACK_GLOBAL(sptrfs,SPTRFS)
+#define LAPACK_dptrfs LAPACK_GLOBAL(dptrfs,DPTRFS)
+#define LAPACK_cptrfs LAPACK_GLOBAL(cptrfs,CPTRFS)
+#define LAPACK_zptrfs LAPACK_GLOBAL(zptrfs,ZPTRFS)
+#define LAPACK_ssyrfs LAPACK_GLOBAL(ssyrfs,SSYRFS)
+#define LAPACK_dsyrfs LAPACK_GLOBAL(dsyrfs,DSYRFS)
+#define LAPACK_csyrfs LAPACK_GLOBAL(csyrfs,CSYRFS)
+#define LAPACK_zsyrfs LAPACK_GLOBAL(zsyrfs,ZSYRFS)
+#define LAPACK_dsyrfsx LAPACK_GLOBAL(dsyrfsx,DSYRFSX)
+#define LAPACK_ssyrfsx LAPACK_GLOBAL(ssyrfsx,SSYRFSX)
+#define LAPACK_zsyrfsx LAPACK_GLOBAL(zsyrfsx,ZSYRFSX)
+#define LAPACK_csyrfsx LAPACK_GLOBAL(csyrfsx,CSYRFSX)
+#define LAPACK_cherfs LAPACK_GLOBAL(cherfs,CHERFS)
+#define LAPACK_zherfs LAPACK_GLOBAL(zherfs,ZHERFS)
+#define LAPACK_zherfsx LAPACK_GLOBAL(zherfsx,ZHERFSX)
+#define LAPACK_cherfsx LAPACK_GLOBAL(cherfsx,CHERFSX)
+#define LAPACK_ssprfs LAPACK_GLOBAL(ssprfs,SSPRFS)
+#define LAPACK_dsprfs LAPACK_GLOBAL(dsprfs,DSPRFS)
+#define LAPACK_csprfs LAPACK_GLOBAL(csprfs,CSPRFS)
+#define LAPACK_zsprfs LAPACK_GLOBAL(zsprfs,ZSPRFS)
+#define LAPACK_chprfs LAPACK_GLOBAL(chprfs,CHPRFS)
+#define LAPACK_zhprfs LAPACK_GLOBAL(zhprfs,ZHPRFS)
+#define LAPACK_strrfs LAPACK_GLOBAL(strrfs,STRRFS)
+#define LAPACK_dtrrfs LAPACK_GLOBAL(dtrrfs,DTRRFS)
+#define LAPACK_ctrrfs LAPACK_GLOBAL(ctrrfs,CTRRFS)
+#define LAPACK_ztrrfs LAPACK_GLOBAL(ztrrfs,ZTRRFS)
+#define LAPACK_stprfs LAPACK_GLOBAL(stprfs,STPRFS)
+#define LAPACK_dtprfs LAPACK_GLOBAL(dtprfs,DTPRFS)
+#define LAPACK_ctprfs LAPACK_GLOBAL(ctprfs,CTPRFS)
+#define LAPACK_ztprfs LAPACK_GLOBAL(ztprfs,ZTPRFS)
+#define LAPACK_stbrfs LAPACK_GLOBAL(stbrfs,STBRFS)
+#define LAPACK_dtbrfs LAPACK_GLOBAL(dtbrfs,DTBRFS)
+#define LAPACK_ctbrfs LAPACK_GLOBAL(ctbrfs,CTBRFS)
+#define LAPACK_ztbrfs LAPACK_GLOBAL(ztbrfs,ZTBRFS)
+#define LAPACK_sgetri LAPACK_GLOBAL(sgetri,SGETRI)
+#define LAPACK_dgetri LAPACK_GLOBAL(dgetri,DGETRI)
+#define LAPACK_cgetri LAPACK_GLOBAL(cgetri,CGETRI)
+#define LAPACK_zgetri LAPACK_GLOBAL(zgetri,ZGETRI)
+#define LAPACK_spotri LAPACK_GLOBAL(spotri,SPOTRI)
+#define LAPACK_dpotri LAPACK_GLOBAL(dpotri,DPOTRI)
+#define LAPACK_cpotri LAPACK_GLOBAL(cpotri,CPOTRI)
+#define LAPACK_zpotri LAPACK_GLOBAL(zpotri,ZPOTRI)
+#define LAPACK_dpftri LAPACK_GLOBAL(dpftri,DPFTRI)
+#define LAPACK_spftri LAPACK_GLOBAL(spftri,SPFTRI)
+#define LAPACK_zpftri LAPACK_GLOBAL(zpftri,ZPFTRI)
+#define LAPACK_cpftri LAPACK_GLOBAL(cpftri,CPFTRI)
+#define LAPACK_spptri LAPACK_GLOBAL(spptri,SPPTRI)
+#define LAPACK_dpptri LAPACK_GLOBAL(dpptri,DPPTRI)
+#define LAPACK_cpptri LAPACK_GLOBAL(cpptri,CPPTRI)
+#define LAPACK_zpptri LAPACK_GLOBAL(zpptri,ZPPTRI)
+#define LAPACK_ssytri LAPACK_GLOBAL(ssytri,SSYTRI)
+#define LAPACK_dsytri LAPACK_GLOBAL(dsytri,DSYTRI)
+#define LAPACK_csytri LAPACK_GLOBAL(csytri,CSYTRI)
+#define LAPACK_zsytri LAPACK_GLOBAL(zsytri,ZSYTRI)
+#define LAPACK_chetri LAPACK_GLOBAL(chetri,CHETRI)
+#define LAPACK_zhetri LAPACK_GLOBAL(zhetri,ZHETRI)
+#define LAPACK_ssptri LAPACK_GLOBAL(ssptri,SSPTRI)
+#define LAPACK_dsptri LAPACK_GLOBAL(dsptri,DSPTRI)
+#define LAPACK_csptri LAPACK_GLOBAL(csptri,CSPTRI)
+#define LAPACK_zsptri LAPACK_GLOBAL(zsptri,ZSPTRI)
+#define LAPACK_chptri LAPACK_GLOBAL(chptri,CHPTRI)
+#define LAPACK_zhptri LAPACK_GLOBAL(zhptri,ZHPTRI)
+#define LAPACK_strtri LAPACK_GLOBAL(strtri,STRTRI)
+#define LAPACK_dtrtri LAPACK_GLOBAL(dtrtri,DTRTRI)
+#define LAPACK_ctrtri LAPACK_GLOBAL(ctrtri,CTRTRI)
+#define LAPACK_ztrtri LAPACK_GLOBAL(ztrtri,ZTRTRI)
+#define LAPACK_dtftri LAPACK_GLOBAL(dtftri,DTFTRI)
+#define LAPACK_stftri LAPACK_GLOBAL(stftri,STFTRI)
+#define LAPACK_ztftri LAPACK_GLOBAL(ztftri,ZTFTRI)
+#define LAPACK_ctftri LAPACK_GLOBAL(ctftri,CTFTRI)
+#define LAPACK_stptri LAPACK_GLOBAL(stptri,STPTRI)
+#define LAPACK_dtptri LAPACK_GLOBAL(dtptri,DTPTRI)
+#define LAPACK_ctptri LAPACK_GLOBAL(ctptri,CTPTRI)
+#define LAPACK_ztptri LAPACK_GLOBAL(ztptri,ZTPTRI)
+#define LAPACK_sgeequ LAPACK_GLOBAL(sgeequ,SGEEQU)
+#define LAPACK_dgeequ LAPACK_GLOBAL(dgeequ,DGEEQU)
+#define LAPACK_cgeequ LAPACK_GLOBAL(cgeequ,CGEEQU)
+#define LAPACK_zgeequ LAPACK_GLOBAL(zgeequ,ZGEEQU)
+#define LAPACK_dgeequb LAPACK_GLOBAL(dgeequb,DGEEQUB)
+#define LAPACK_sgeequb LAPACK_GLOBAL(sgeequb,SGEEQUB)
+#define LAPACK_zgeequb LAPACK_GLOBAL(zgeequb,ZGEEQUB)
+#define LAPACK_cgeequb LAPACK_GLOBAL(cgeequb,CGEEQUB)
+#define LAPACK_sgbequ LAPACK_GLOBAL(sgbequ,SGBEQU)
+#define LAPACK_dgbequ LAPACK_GLOBAL(dgbequ,DGBEQU)
+#define LAPACK_cgbequ LAPACK_GLOBAL(cgbequ,CGBEQU)
+#define LAPACK_zgbequ LAPACK_GLOBAL(zgbequ,ZGBEQU)
+#define LAPACK_dgbequb LAPACK_GLOBAL(dgbequb,DGBEQUB)
+#define LAPACK_sgbequb LAPACK_GLOBAL(sgbequb,SGBEQUB)
+#define LAPACK_zgbequb LAPACK_GLOBAL(zgbequb,ZGBEQUB)
+#define LAPACK_cgbequb LAPACK_GLOBAL(cgbequb,CGBEQUB)
+#define LAPACK_spoequ LAPACK_GLOBAL(spoequ,SPOEQU)
+#define LAPACK_dpoequ LAPACK_GLOBAL(dpoequ,DPOEQU)
+#define LAPACK_cpoequ LAPACK_GLOBAL(cpoequ,CPOEQU)
+#define LAPACK_zpoequ LAPACK_GLOBAL(zpoequ,ZPOEQU)
+#define LAPACK_dpoequb LAPACK_GLOBAL(dpoequb,DPOEQUB)
+#define LAPACK_spoequb LAPACK_GLOBAL(spoequb,SPOEQUB)
+#define LAPACK_zpoequb LAPACK_GLOBAL(zpoequb,ZPOEQUB)
+#define LAPACK_cpoequb LAPACK_GLOBAL(cpoequb,CPOEQUB)
+#define LAPACK_sppequ LAPACK_GLOBAL(sppequ,SPPEQU)
+#define LAPACK_dppequ LAPACK_GLOBAL(dppequ,DPPEQU)
+#define LAPACK_cppequ LAPACK_GLOBAL(cppequ,CPPEQU)
+#define LAPACK_zppequ LAPACK_GLOBAL(zppequ,ZPPEQU)
+#define LAPACK_spbequ LAPACK_GLOBAL(spbequ,SPBEQU)
+#define LAPACK_dpbequ LAPACK_GLOBAL(dpbequ,DPBEQU)
+#define LAPACK_cpbequ LAPACK_GLOBAL(cpbequ,CPBEQU)
+#define LAPACK_zpbequ LAPACK_GLOBAL(zpbequ,ZPBEQU)
+#define LAPACK_dsyequb LAPACK_GLOBAL(dsyequb,DSYEQUB)
+#define LAPACK_ssyequb LAPACK_GLOBAL(ssyequb,SSYEQUB)
+#define LAPACK_zsyequb LAPACK_GLOBAL(zsyequb,ZSYEQUB)
+#define LAPACK_csyequb LAPACK_GLOBAL(csyequb,CSYEQUB)
+#define LAPACK_zheequb LAPACK_GLOBAL(zheequb,ZHEEQUB)
+#define LAPACK_cheequb LAPACK_GLOBAL(cheequb,CHEEQUB)
+#define LAPACK_sgesv LAPACK_GLOBAL(sgesv,SGESV)
+#define LAPACK_dgesv LAPACK_GLOBAL(dgesv,DGESV)
+#define LAPACK_cgesv LAPACK_GLOBAL(cgesv,CGESV)
+#define LAPACK_zgesv LAPACK_GLOBAL(zgesv,ZGESV)
+#define LAPACK_dsgesv LAPACK_GLOBAL(dsgesv,DSGESV)
+#define LAPACK_zcgesv LAPACK_GLOBAL(zcgesv,ZCGESV)
+#define LAPACK_sgesvx LAPACK_GLOBAL(sgesvx,SGESVX)
+#define LAPACK_dgesvx LAPACK_GLOBAL(dgesvx,DGESVX)
+#define LAPACK_cgesvx LAPACK_GLOBAL(cgesvx,CGESVX)
+#define LAPACK_zgesvx LAPACK_GLOBAL(zgesvx,ZGESVX)
+#define LAPACK_dgesvxx LAPACK_GLOBAL(dgesvxx,DGESVXX)
+#define LAPACK_sgesvxx LAPACK_GLOBAL(sgesvxx,SGESVXX)
+#define LAPACK_zgesvxx LAPACK_GLOBAL(zgesvxx,ZGESVXX)
+#define LAPACK_cgesvxx LAPACK_GLOBAL(cgesvxx,CGESVXX)
+#define LAPACK_sgbsv LAPACK_GLOBAL(sgbsv,SGBSV)
+#define LAPACK_dgbsv LAPACK_GLOBAL(dgbsv,DGBSV)
+#define LAPACK_cgbsv LAPACK_GLOBAL(cgbsv,CGBSV)
+#define LAPACK_zgbsv LAPACK_GLOBAL(zgbsv,ZGBSV)
+#define LAPACK_sgbsvx LAPACK_GLOBAL(sgbsvx,SGBSVX)
+#define LAPACK_dgbsvx LAPACK_GLOBAL(dgbsvx,DGBSVX)
+#define LAPACK_cgbsvx LAPACK_GLOBAL(cgbsvx,CGBSVX)
+#define LAPACK_zgbsvx LAPACK_GLOBAL(zgbsvx,ZGBSVX)
+#define LAPACK_dgbsvxx LAPACK_GLOBAL(dgbsvxx,DGBSVXX)
+#define LAPACK_sgbsvxx LAPACK_GLOBAL(sgbsvxx,SGBSVXX)
+#define LAPACK_zgbsvxx LAPACK_GLOBAL(zgbsvxx,ZGBSVXX)
+#define LAPACK_cgbsvxx LAPACK_GLOBAL(cgbsvxx,CGBSVXX)
+#define LAPACK_sgtsv LAPACK_GLOBAL(sgtsv,SGTSV)
+#define LAPACK_dgtsv LAPACK_GLOBAL(dgtsv,DGTSV)
+#define LAPACK_cgtsv LAPACK_GLOBAL(cgtsv,CGTSV)
+#define LAPACK_zgtsv LAPACK_GLOBAL(zgtsv,ZGTSV)
+#define LAPACK_sgtsvx LAPACK_GLOBAL(sgtsvx,SGTSVX)
+#define LAPACK_dgtsvx LAPACK_GLOBAL(dgtsvx,DGTSVX)
+#define LAPACK_cgtsvx LAPACK_GLOBAL(cgtsvx,CGTSVX)
+#define LAPACK_zgtsvx LAPACK_GLOBAL(zgtsvx,ZGTSVX)
+#define LAPACK_sposv LAPACK_GLOBAL(sposv,SPOSV)
+#define LAPACK_dposv LAPACK_GLOBAL(dposv,DPOSV)
+#define LAPACK_cposv LAPACK_GLOBAL(cposv,CPOSV)
+#define LAPACK_zposv LAPACK_GLOBAL(zposv,ZPOSV)
+#define LAPACK_dsposv LAPACK_GLOBAL(dsposv,DSPOSV)
+#define LAPACK_zcposv LAPACK_GLOBAL(zcposv,ZCPOSV)
+#define LAPACK_sposvx LAPACK_GLOBAL(sposvx,SPOSVX)
+#define LAPACK_dposvx LAPACK_GLOBAL(dposvx,DPOSVX)
+#define LAPACK_cposvx LAPACK_GLOBAL(cposvx,CPOSVX)
+#define LAPACK_zposvx LAPACK_GLOBAL(zposvx,ZPOSVX)
+#define LAPACK_dposvxx LAPACK_GLOBAL(dposvxx,DPOSVXX)
+#define LAPACK_sposvxx LAPACK_GLOBAL(sposvxx,SPOSVXX)
+#define LAPACK_zposvxx LAPACK_GLOBAL(zposvxx,ZPOSVXX)
+#define LAPACK_cposvxx LAPACK_GLOBAL(cposvxx,CPOSVXX)
+#define LAPACK_sppsv LAPACK_GLOBAL(sppsv,SPPSV)
+#define LAPACK_dppsv LAPACK_GLOBAL(dppsv,DPPSV)
+#define LAPACK_cppsv LAPACK_GLOBAL(cppsv,CPPSV)
+#define LAPACK_zppsv LAPACK_GLOBAL(zppsv,ZPPSV)
+#define LAPACK_sppsvx LAPACK_GLOBAL(sppsvx,SPPSVX)
+#define LAPACK_dppsvx LAPACK_GLOBAL(dppsvx,DPPSVX)
+#define LAPACK_cppsvx LAPACK_GLOBAL(cppsvx,CPPSVX)
+#define LAPACK_zppsvx LAPACK_GLOBAL(zppsvx,ZPPSVX)
+#define LAPACK_spbsv LAPACK_GLOBAL(spbsv,SPBSV)
+#define LAPACK_dpbsv LAPACK_GLOBAL(dpbsv,DPBSV)
+#define LAPACK_cpbsv LAPACK_GLOBAL(cpbsv,CPBSV)
+#define LAPACK_zpbsv LAPACK_GLOBAL(zpbsv,ZPBSV)
+#define LAPACK_spbsvx LAPACK_GLOBAL(spbsvx,SPBSVX)
+#define LAPACK_dpbsvx LAPACK_GLOBAL(dpbsvx,DPBSVX)
+#define LAPACK_cpbsvx LAPACK_GLOBAL(cpbsvx,CPBSVX)
+#define LAPACK_zpbsvx LAPACK_GLOBAL(zpbsvx,ZPBSVX)
+#define LAPACK_sptsv LAPACK_GLOBAL(sptsv,SPTSV)
+#define LAPACK_dptsv LAPACK_GLOBAL(dptsv,DPTSV)
+#define LAPACK_cptsv LAPACK_GLOBAL(cptsv,CPTSV)
+#define LAPACK_zptsv LAPACK_GLOBAL(zptsv,ZPTSV)
+#define LAPACK_sptsvx LAPACK_GLOBAL(sptsvx,SPTSVX)
+#define LAPACK_dptsvx LAPACK_GLOBAL(dptsvx,DPTSVX)
+#define LAPACK_cptsvx LAPACK_GLOBAL(cptsvx,CPTSVX)
+#define LAPACK_zptsvx LAPACK_GLOBAL(zptsvx,ZPTSVX)
+#define LAPACK_ssysv LAPACK_GLOBAL(ssysv,SSYSV)
+#define LAPACK_dsysv LAPACK_GLOBAL(dsysv,DSYSV)
+#define LAPACK_csysv LAPACK_GLOBAL(csysv,CSYSV)
+#define LAPACK_zsysv LAPACK_GLOBAL(zsysv,ZSYSV)
+#define LAPACK_ssysvx LAPACK_GLOBAL(ssysvx,SSYSVX)
+#define LAPACK_dsysvx LAPACK_GLOBAL(dsysvx,DSYSVX)
+#define LAPACK_csysvx LAPACK_GLOBAL(csysvx,CSYSVX)
+#define LAPACK_zsysvx LAPACK_GLOBAL(zsysvx,ZSYSVX)
+#define LAPACK_dsysvxx LAPACK_GLOBAL(dsysvxx,DSYSVXX)
+#define LAPACK_ssysvxx LAPACK_GLOBAL(ssysvxx,SSYSVXX)
+#define LAPACK_zsysvxx LAPACK_GLOBAL(zsysvxx,ZSYSVXX)
+#define LAPACK_csysvxx LAPACK_GLOBAL(csysvxx,CSYSVXX)
+#define LAPACK_chesv LAPACK_GLOBAL(chesv,CHESV)
+#define LAPACK_zhesv LAPACK_GLOBAL(zhesv,ZHESV)
+#define LAPACK_chesvx LAPACK_GLOBAL(chesvx,CHESVX)
+#define LAPACK_zhesvx LAPACK_GLOBAL(zhesvx,ZHESVX)
+#define LAPACK_zhesvxx LAPACK_GLOBAL(zhesvxx,ZHESVXX)
+#define LAPACK_chesvxx LAPACK_GLOBAL(chesvxx,CHESVXX)
+#define LAPACK_sspsv LAPACK_GLOBAL(sspsv,SSPSV)
+#define LAPACK_dspsv LAPACK_GLOBAL(dspsv,DSPSV)
+#define LAPACK_cspsv LAPACK_GLOBAL(cspsv,CSPSV)
+#define LAPACK_zspsv LAPACK_GLOBAL(zspsv,ZSPSV)
+#define LAPACK_sspsvx LAPACK_GLOBAL(sspsvx,SSPSVX)
+#define LAPACK_dspsvx LAPACK_GLOBAL(dspsvx,DSPSVX)
+#define LAPACK_cspsvx LAPACK_GLOBAL(cspsvx,CSPSVX)
+#define LAPACK_zspsvx LAPACK_GLOBAL(zspsvx,ZSPSVX)
+#define LAPACK_chpsv LAPACK_GLOBAL(chpsv,CHPSV)
+#define LAPACK_zhpsv LAPACK_GLOBAL(zhpsv,ZHPSV)
+#define LAPACK_chpsvx LAPACK_GLOBAL(chpsvx,CHPSVX)
+#define LAPACK_zhpsvx LAPACK_GLOBAL(zhpsvx,ZHPSVX)
+#define LAPACK_sgeqrf LAPACK_GLOBAL(sgeqrf,SGEQRF)
+#define LAPACK_dgeqrf LAPACK_GLOBAL(dgeqrf,DGEQRF)
+#define LAPACK_cgeqrf LAPACK_GLOBAL(cgeqrf,CGEQRF)
+#define LAPACK_zgeqrf LAPACK_GLOBAL(zgeqrf,ZGEQRF)
+#define LAPACK_sgeqpf LAPACK_GLOBAL(sgeqpf,SGEQPF)
+#define LAPACK_dgeqpf LAPACK_GLOBAL(dgeqpf,DGEQPF)
+#define LAPACK_cgeqpf LAPACK_GLOBAL(cgeqpf,CGEQPF)
+#define LAPACK_zgeqpf LAPACK_GLOBAL(zgeqpf,ZGEQPF)
+#define LAPACK_sgeqp3 LAPACK_GLOBAL(sgeqp3,SGEQP3)
+#define LAPACK_dgeqp3 LAPACK_GLOBAL(dgeqp3,DGEQP3)
+#define LAPACK_cgeqp3 LAPACK_GLOBAL(cgeqp3,CGEQP3)
+#define LAPACK_zgeqp3 LAPACK_GLOBAL(zgeqp3,ZGEQP3)
+#define LAPACK_sorgqr LAPACK_GLOBAL(sorgqr,SORGQR)
+#define LAPACK_dorgqr LAPACK_GLOBAL(dorgqr,DORGQR)
+#define LAPACK_sormqr LAPACK_GLOBAL(sormqr,SORMQR)
+#define LAPACK_dormqr LAPACK_GLOBAL(dormqr,DORMQR)
+#define LAPACK_cungqr LAPACK_GLOBAL(cungqr,CUNGQR)
+#define LAPACK_zungqr LAPACK_GLOBAL(zungqr,ZUNGQR)
+#define LAPACK_cunmqr LAPACK_GLOBAL(cunmqr,CUNMQR)
+#define LAPACK_zunmqr LAPACK_GLOBAL(zunmqr,ZUNMQR)
+#define LAPACK_sgelqf LAPACK_GLOBAL(sgelqf,SGELQF)
+#define LAPACK_dgelqf LAPACK_GLOBAL(dgelqf,DGELQF)
+#define LAPACK_cgelqf LAPACK_GLOBAL(cgelqf,CGELQF)
+#define LAPACK_zgelqf LAPACK_GLOBAL(zgelqf,ZGELQF)
+#define LAPACK_sorglq LAPACK_GLOBAL(sorglq,SORGLQ)
+#define LAPACK_dorglq LAPACK_GLOBAL(dorglq,DORGLQ)
+#define LAPACK_sormlq LAPACK_GLOBAL(sormlq,SORMLQ)
+#define LAPACK_dormlq LAPACK_GLOBAL(dormlq,DORMLQ)
+#define LAPACK_cunglq LAPACK_GLOBAL(cunglq,CUNGLQ)
+#define LAPACK_zunglq LAPACK_GLOBAL(zunglq,ZUNGLQ)
+#define LAPACK_cunmlq LAPACK_GLOBAL(cunmlq,CUNMLQ)
+#define LAPACK_zunmlq LAPACK_GLOBAL(zunmlq,ZUNMLQ)
+#define LAPACK_sgeqlf LAPACK_GLOBAL(sgeqlf,SGEQLF)
+#define LAPACK_dgeqlf LAPACK_GLOBAL(dgeqlf,DGEQLF)
+#define LAPACK_cgeqlf LAPACK_GLOBAL(cgeqlf,CGEQLF)
+#define LAPACK_zgeqlf LAPACK_GLOBAL(zgeqlf,ZGEQLF)
+#define LAPACK_sorgql LAPACK_GLOBAL(sorgql,SORGQL)
+#define LAPACK_dorgql LAPACK_GLOBAL(dorgql,DORGQL)
+#define LAPACK_cungql LAPACK_GLOBAL(cungql,CUNGQL)
+#define LAPACK_zungql LAPACK_GLOBAL(zungql,ZUNGQL)
+#define LAPACK_sormql LAPACK_GLOBAL(sormql,SORMQL)
+#define LAPACK_dormql LAPACK_GLOBAL(dormql,DORMQL)
+#define LAPACK_cunmql LAPACK_GLOBAL(cunmql,CUNMQL)
+#define LAPACK_zunmql LAPACK_GLOBAL(zunmql,ZUNMQL)
+#define LAPACK_sgerqf LAPACK_GLOBAL(sgerqf,SGERQF)
+#define LAPACK_dgerqf LAPACK_GLOBAL(dgerqf,DGERQF)
+#define LAPACK_cgerqf LAPACK_GLOBAL(cgerqf,CGERQF)
+#define LAPACK_zgerqf LAPACK_GLOBAL(zgerqf,ZGERQF)
+#define LAPACK_sorgrq LAPACK_GLOBAL(sorgrq,SORGRQ)
+#define LAPACK_dorgrq LAPACK_GLOBAL(dorgrq,DORGRQ)
+#define LAPACK_cungrq LAPACK_GLOBAL(cungrq,CUNGRQ)
+#define LAPACK_zungrq LAPACK_GLOBAL(zungrq,ZUNGRQ)
+#define LAPACK_sormrq LAPACK_GLOBAL(sormrq,SORMRQ)
+#define LAPACK_dormrq LAPACK_GLOBAL(dormrq,DORMRQ)
+#define LAPACK_cunmrq LAPACK_GLOBAL(cunmrq,CUNMRQ)
+#define LAPACK_zunmrq LAPACK_GLOBAL(zunmrq,ZUNMRQ)
+#define LAPACK_stzrzf LAPACK_GLOBAL(stzrzf,STZRZF)
+#define LAPACK_dtzrzf LAPACK_GLOBAL(dtzrzf,DTZRZF)
+#define LAPACK_ctzrzf LAPACK_GLOBAL(ctzrzf,CTZRZF)
+#define LAPACK_ztzrzf LAPACK_GLOBAL(ztzrzf,ZTZRZF)
+#define LAPACK_sormrz LAPACK_GLOBAL(sormrz,SORMRZ)
+#define LAPACK_dormrz LAPACK_GLOBAL(dormrz,DORMRZ)
+#define LAPACK_cunmrz LAPACK_GLOBAL(cunmrz,CUNMRZ)
+#define LAPACK_zunmrz LAPACK_GLOBAL(zunmrz,ZUNMRZ)
+#define LAPACK_sggqrf LAPACK_GLOBAL(sggqrf,SGGQRF)
+#define LAPACK_dggqrf LAPACK_GLOBAL(dggqrf,DGGQRF)
+#define LAPACK_cggqrf LAPACK_GLOBAL(cggqrf,CGGQRF)
+#define LAPACK_zggqrf LAPACK_GLOBAL(zggqrf,ZGGQRF)
+#define LAPACK_sggrqf LAPACK_GLOBAL(sggrqf,SGGRQF)
+#define LAPACK_dggrqf LAPACK_GLOBAL(dggrqf,DGGRQF)
+#define LAPACK_cggrqf LAPACK_GLOBAL(cggrqf,CGGRQF)
+#define LAPACK_zggrqf LAPACK_GLOBAL(zggrqf,ZGGRQF)
+#define LAPACK_sgebrd LAPACK_GLOBAL(sgebrd,SGEBRD)
+#define LAPACK_dgebrd LAPACK_GLOBAL(dgebrd,DGEBRD)
+#define LAPACK_cgebrd LAPACK_GLOBAL(cgebrd,CGEBRD)
+#define LAPACK_zgebrd LAPACK_GLOBAL(zgebrd,ZGEBRD)
+#define LAPACK_sgbbrd LAPACK_GLOBAL(sgbbrd,SGBBRD)
+#define LAPACK_dgbbrd LAPACK_GLOBAL(dgbbrd,DGBBRD)
+#define LAPACK_cgbbrd LAPACK_GLOBAL(cgbbrd,CGBBRD)
+#define LAPACK_zgbbrd LAPACK_GLOBAL(zgbbrd,ZGBBRD)
+#define LAPACK_sorgbr LAPACK_GLOBAL(sorgbr,SORGBR)
+#define LAPACK_dorgbr LAPACK_GLOBAL(dorgbr,DORGBR)
+#define LAPACK_sormbr LAPACK_GLOBAL(sormbr,SORMBR)
+#define LAPACK_dormbr LAPACK_GLOBAL(dormbr,DORMBR)
+#define LAPACK_cungbr LAPACK_GLOBAL(cungbr,CUNGBR)
+#define LAPACK_zungbr LAPACK_GLOBAL(zungbr,ZUNGBR)
+#define LAPACK_cunmbr LAPACK_GLOBAL(cunmbr,CUNMBR)
+#define LAPACK_zunmbr LAPACK_GLOBAL(zunmbr,ZUNMBR)
+#define LAPACK_sbdsqr LAPACK_GLOBAL(sbdsqr,SBDSQR)
+#define LAPACK_dbdsqr LAPACK_GLOBAL(dbdsqr,DBDSQR)
+#define LAPACK_cbdsqr LAPACK_GLOBAL(cbdsqr,CBDSQR)
+#define LAPACK_zbdsqr LAPACK_GLOBAL(zbdsqr,ZBDSQR)
+#define LAPACK_sbdsdc LAPACK_GLOBAL(sbdsdc,SBDSDC)
+#define LAPACK_dbdsdc LAPACK_GLOBAL(dbdsdc,DBDSDC)
+#define LAPACK_ssytrd LAPACK_GLOBAL(ssytrd,SSYTRD)
+#define LAPACK_dsytrd LAPACK_GLOBAL(dsytrd,DSYTRD)
+#define LAPACK_sorgtr LAPACK_GLOBAL(sorgtr,SORGTR)
+#define LAPACK_dorgtr LAPACK_GLOBAL(dorgtr,DORGTR)
+#define LAPACK_sormtr LAPACK_GLOBAL(sormtr,SORMTR)
+#define LAPACK_dormtr LAPACK_GLOBAL(dormtr,DORMTR)
+#define LAPACK_chetrd LAPACK_GLOBAL(chetrd,CHETRD)
+#define LAPACK_zhetrd LAPACK_GLOBAL(zhetrd,ZHETRD)
+#define LAPACK_cungtr LAPACK_GLOBAL(cungtr,CUNGTR)
+#define LAPACK_zungtr LAPACK_GLOBAL(zungtr,ZUNGTR)
+#define LAPACK_cunmtr LAPACK_GLOBAL(cunmtr,CUNMTR)
+#define LAPACK_zunmtr LAPACK_GLOBAL(zunmtr,ZUNMTR)
+#define LAPACK_ssptrd LAPACK_GLOBAL(ssptrd,SSPTRD)
+#define LAPACK_dsptrd LAPACK_GLOBAL(dsptrd,DSPTRD)
+#define LAPACK_sopgtr LAPACK_GLOBAL(sopgtr,SOPGTR)
+#define LAPACK_dopgtr LAPACK_GLOBAL(dopgtr,DOPGTR)
+#define LAPACK_sopmtr LAPACK_GLOBAL(sopmtr,SOPMTR)
+#define LAPACK_dopmtr LAPACK_GLOBAL(dopmtr,DOPMTR)
+#define LAPACK_chptrd LAPACK_GLOBAL(chptrd,CHPTRD)
+#define LAPACK_zhptrd LAPACK_GLOBAL(zhptrd,ZHPTRD)
+#define LAPACK_cupgtr LAPACK_GLOBAL(cupgtr,CUPGTR)
+#define LAPACK_zupgtr LAPACK_GLOBAL(zupgtr,ZUPGTR)
+#define LAPACK_cupmtr LAPACK_GLOBAL(cupmtr,CUPMTR)
+#define LAPACK_zupmtr LAPACK_GLOBAL(zupmtr,ZUPMTR)
+#define LAPACK_ssbtrd LAPACK_GLOBAL(ssbtrd,SSBTRD)
+#define LAPACK_dsbtrd LAPACK_GLOBAL(dsbtrd,DSBTRD)
+#define LAPACK_chbtrd LAPACK_GLOBAL(chbtrd,CHBTRD)
+#define LAPACK_zhbtrd LAPACK_GLOBAL(zhbtrd,ZHBTRD)
+#define LAPACK_ssterf LAPACK_GLOBAL(ssterf,SSTERF)
+#define LAPACK_dsterf LAPACK_GLOBAL(dsterf,DSTERF)
+#define LAPACK_ssteqr LAPACK_GLOBAL(ssteqr,SSTEQR)
+#define LAPACK_dsteqr LAPACK_GLOBAL(dsteqr,DSTEQR)
+#define LAPACK_csteqr LAPACK_GLOBAL(csteqr,CSTEQR)
+#define LAPACK_zsteqr LAPACK_GLOBAL(zsteqr,ZSTEQR)
+#define LAPACK_sstemr LAPACK_GLOBAL(sstemr,SSTEMR)
+#define LAPACK_dstemr LAPACK_GLOBAL(dstemr,DSTEMR)
+#define LAPACK_cstemr LAPACK_GLOBAL(cstemr,CSTEMR)
+#define LAPACK_zstemr LAPACK_GLOBAL(zstemr,ZSTEMR)
+#define LAPACK_sstedc LAPACK_GLOBAL(sstedc,SSTEDC)
+#define LAPACK_dstedc LAPACK_GLOBAL(dstedc,DSTEDC)
+#define LAPACK_cstedc LAPACK_GLOBAL(cstedc,CSTEDC)
+#define LAPACK_zstedc LAPACK_GLOBAL(zstedc,ZSTEDC)
+#define LAPACK_sstegr LAPACK_GLOBAL(sstegr,SSTEGR)
+#define LAPACK_dstegr LAPACK_GLOBAL(dstegr,DSTEGR)
+#define LAPACK_cstegr LAPACK_GLOBAL(cstegr,CSTEGR)
+#define LAPACK_zstegr LAPACK_GLOBAL(zstegr,ZSTEGR)
+#define LAPACK_spteqr LAPACK_GLOBAL(spteqr,SPTEQR)
+#define LAPACK_dpteqr LAPACK_GLOBAL(dpteqr,DPTEQR)
+#define LAPACK_cpteqr LAPACK_GLOBAL(cpteqr,CPTEQR)
+#define LAPACK_zpteqr LAPACK_GLOBAL(zpteqr,ZPTEQR)
+#define LAPACK_sstebz LAPACK_GLOBAL(sstebz,SSTEBZ)
+#define LAPACK_dstebz LAPACK_GLOBAL(dstebz,DSTEBZ)
+#define LAPACK_sstein LAPACK_GLOBAL(sstein,SSTEIN)
+#define LAPACK_dstein LAPACK_GLOBAL(dstein,DSTEIN)
+#define LAPACK_cstein LAPACK_GLOBAL(cstein,CSTEIN)
+#define LAPACK_zstein LAPACK_GLOBAL(zstein,ZSTEIN)
+#define LAPACK_sdisna LAPACK_GLOBAL(sdisna,SDISNA)
+#define LAPACK_ddisna LAPACK_GLOBAL(ddisna,DDISNA)
+#define LAPACK_ssygst LAPACK_GLOBAL(ssygst,SSYGST)
+#define LAPACK_dsygst LAPACK_GLOBAL(dsygst,DSYGST)
+#define LAPACK_chegst LAPACK_GLOBAL(chegst,CHEGST)
+#define LAPACK_zhegst LAPACK_GLOBAL(zhegst,ZHEGST)
+#define LAPACK_sspgst LAPACK_GLOBAL(sspgst,SSPGST)
+#define LAPACK_dspgst LAPACK_GLOBAL(dspgst,DSPGST)
+#define LAPACK_chpgst LAPACK_GLOBAL(chpgst,CHPGST)
+#define LAPACK_zhpgst LAPACK_GLOBAL(zhpgst,ZHPGST)
+#define LAPACK_ssbgst LAPACK_GLOBAL(ssbgst,SSBGST)
+#define LAPACK_dsbgst LAPACK_GLOBAL(dsbgst,DSBGST)
+#define LAPACK_chbgst LAPACK_GLOBAL(chbgst,CHBGST)
+#define LAPACK_zhbgst LAPACK_GLOBAL(zhbgst,ZHBGST)
+#define LAPACK_spbstf LAPACK_GLOBAL(spbstf,SPBSTF)
+#define LAPACK_dpbstf LAPACK_GLOBAL(dpbstf,DPBSTF)
+#define LAPACK_cpbstf LAPACK_GLOBAL(cpbstf,CPBSTF)
+#define LAPACK_zpbstf LAPACK_GLOBAL(zpbstf,ZPBSTF)
+#define LAPACK_sgehrd LAPACK_GLOBAL(sgehrd,SGEHRD)
+#define LAPACK_dgehrd LAPACK_GLOBAL(dgehrd,DGEHRD)
+#define LAPACK_cgehrd LAPACK_GLOBAL(cgehrd,CGEHRD)
+#define LAPACK_zgehrd LAPACK_GLOBAL(zgehrd,ZGEHRD)
+#define LAPACK_sorghr LAPACK_GLOBAL(sorghr,SORGHR)
+#define LAPACK_dorghr LAPACK_GLOBAL(dorghr,DORGHR)
+#define LAPACK_sormhr LAPACK_GLOBAL(sormhr,SORMHR)
+#define LAPACK_dormhr LAPACK_GLOBAL(dormhr,DORMHR)
+#define LAPACK_cunghr LAPACK_GLOBAL(cunghr,CUNGHR)
+#define LAPACK_zunghr LAPACK_GLOBAL(zunghr,ZUNGHR)
+#define LAPACK_cunmhr LAPACK_GLOBAL(cunmhr,CUNMHR)
+#define LAPACK_zunmhr LAPACK_GLOBAL(zunmhr,ZUNMHR)
+#define LAPACK_sgebal LAPACK_GLOBAL(sgebal,SGEBAL)
+#define LAPACK_dgebal LAPACK_GLOBAL(dgebal,DGEBAL)
+#define LAPACK_cgebal LAPACK_GLOBAL(cgebal,CGEBAL)
+#define LAPACK_zgebal LAPACK_GLOBAL(zgebal,ZGEBAL)
+#define LAPACK_sgebak LAPACK_GLOBAL(sgebak,SGEBAK)
+#define LAPACK_dgebak LAPACK_GLOBAL(dgebak,DGEBAK)
+#define LAPACK_cgebak LAPACK_GLOBAL(cgebak,CGEBAK)
+#define LAPACK_zgebak LAPACK_GLOBAL(zgebak,ZGEBAK)
+#define LAPACK_shseqr LAPACK_GLOBAL(shseqr,SHSEQR)
+#define LAPACK_dhseqr LAPACK_GLOBAL(dhseqr,DHSEQR)
+#define LAPACK_chseqr LAPACK_GLOBAL(chseqr,CHSEQR)
+#define LAPACK_zhseqr LAPACK_GLOBAL(zhseqr,ZHSEQR)
+#define LAPACK_shsein LAPACK_GLOBAL(shsein,SHSEIN)
+#define LAPACK_dhsein LAPACK_GLOBAL(dhsein,DHSEIN)
+#define LAPACK_chsein LAPACK_GLOBAL(chsein,CHSEIN)
+#define LAPACK_zhsein LAPACK_GLOBAL(zhsein,ZHSEIN)
+#define LAPACK_strevc LAPACK_GLOBAL(strevc,STREVC)
+#define LAPACK_dtrevc LAPACK_GLOBAL(dtrevc,DTREVC)
+#define LAPACK_ctrevc LAPACK_GLOBAL(ctrevc,CTREVC)
+#define LAPACK_ztrevc LAPACK_GLOBAL(ztrevc,ZTREVC)
+#define LAPACK_strsna LAPACK_GLOBAL(strsna,STRSNA)
+#define LAPACK_dtrsna LAPACK_GLOBAL(dtrsna,DTRSNA)
+#define LAPACK_ctrsna LAPACK_GLOBAL(ctrsna,CTRSNA)
+#define LAPACK_ztrsna LAPACK_GLOBAL(ztrsna,ZTRSNA)
+#define LAPACK_strexc LAPACK_GLOBAL(strexc,STREXC)
+#define LAPACK_dtrexc LAPACK_GLOBAL(dtrexc,DTREXC)
+#define LAPACK_ctrexc LAPACK_GLOBAL(ctrexc,CTREXC)
+#define LAPACK_ztrexc LAPACK_GLOBAL(ztrexc,ZTREXC)
+#define LAPACK_strsen LAPACK_GLOBAL(strsen,STRSEN)
+#define LAPACK_dtrsen LAPACK_GLOBAL(dtrsen,DTRSEN)
+#define LAPACK_ctrsen LAPACK_GLOBAL(ctrsen,CTRSEN)
+#define LAPACK_ztrsen LAPACK_GLOBAL(ztrsen,ZTRSEN)
+#define LAPACK_strsyl LAPACK_GLOBAL(strsyl,STRSYL)
+#define LAPACK_dtrsyl LAPACK_GLOBAL(dtrsyl,DTRSYL)
+#define LAPACK_ctrsyl LAPACK_GLOBAL(ctrsyl,CTRSYL)
+#define LAPACK_ztrsyl LAPACK_GLOBAL(ztrsyl,ZTRSYL)
+#define LAPACK_sgghrd LAPACK_GLOBAL(sgghrd,SGGHRD)
+#define LAPACK_dgghrd LAPACK_GLOBAL(dgghrd,DGGHRD)
+#define LAPACK_cgghrd LAPACK_GLOBAL(cgghrd,CGGHRD)
+#define LAPACK_zgghrd LAPACK_GLOBAL(zgghrd,ZGGHRD)
+#define LAPACK_sggbal LAPACK_GLOBAL(sggbal,SGGBAL)
+#define LAPACK_dggbal LAPACK_GLOBAL(dggbal,DGGBAL)
+#define LAPACK_cggbal LAPACK_GLOBAL(cggbal,CGGBAL)
+#define LAPACK_zggbal LAPACK_GLOBAL(zggbal,ZGGBAL)
+#define LAPACK_sggbak LAPACK_GLOBAL(sggbak,SGGBAK)
+#define LAPACK_dggbak LAPACK_GLOBAL(dggbak,DGGBAK)
+#define LAPACK_cggbak LAPACK_GLOBAL(cggbak,CGGBAK)
+#define LAPACK_zggbak LAPACK_GLOBAL(zggbak,ZGGBAK)
+#define LAPACK_shgeqz LAPACK_GLOBAL(shgeqz,SHGEQZ)
+#define LAPACK_dhgeqz LAPACK_GLOBAL(dhgeqz,DHGEQZ)
+#define LAPACK_chgeqz LAPACK_GLOBAL(chgeqz,CHGEQZ)
+#define LAPACK_zhgeqz LAPACK_GLOBAL(zhgeqz,ZHGEQZ)
+#define LAPACK_stgevc LAPACK_GLOBAL(stgevc,STGEVC)
+#define LAPACK_dtgevc LAPACK_GLOBAL(dtgevc,DTGEVC)
+#define LAPACK_ctgevc LAPACK_GLOBAL(ctgevc,CTGEVC)
+#define LAPACK_ztgevc LAPACK_GLOBAL(ztgevc,ZTGEVC)
+#define LAPACK_stgexc LAPACK_GLOBAL(stgexc,STGEXC)
+#define LAPACK_dtgexc LAPACK_GLOBAL(dtgexc,DTGEXC)
+#define LAPACK_ctgexc LAPACK_GLOBAL(ctgexc,CTGEXC)
+#define LAPACK_ztgexc LAPACK_GLOBAL(ztgexc,ZTGEXC)
+#define LAPACK_stgsen LAPACK_GLOBAL(stgsen,STGSEN)
+#define LAPACK_dtgsen LAPACK_GLOBAL(dtgsen,DTGSEN)
+#define LAPACK_ctgsen LAPACK_GLOBAL(ctgsen,CTGSEN)
+#define LAPACK_ztgsen LAPACK_GLOBAL(ztgsen,ZTGSEN)
+#define LAPACK_stgsyl LAPACK_GLOBAL(stgsyl,STGSYL)
+#define LAPACK_dtgsyl LAPACK_GLOBAL(dtgsyl,DTGSYL)
+#define LAPACK_ctgsyl LAPACK_GLOBAL(ctgsyl,CTGSYL)
+#define LAPACK_ztgsyl LAPACK_GLOBAL(ztgsyl,ZTGSYL)
+#define LAPACK_stgsna LAPACK_GLOBAL(stgsna,STGSNA)
+#define LAPACK_dtgsna LAPACK_GLOBAL(dtgsna,DTGSNA)
+#define LAPACK_ctgsna LAPACK_GLOBAL(ctgsna,CTGSNA)
+#define LAPACK_ztgsna LAPACK_GLOBAL(ztgsna,ZTGSNA)
+#define LAPACK_sggsvp LAPACK_GLOBAL(sggsvp,SGGSVP)
+#define LAPACK_dggsvp LAPACK_GLOBAL(dggsvp,DGGSVP)
+#define LAPACK_cggsvp LAPACK_GLOBAL(cggsvp,CGGSVP)
+#define LAPACK_zggsvp LAPACK_GLOBAL(zggsvp,ZGGSVP)
+#define LAPACK_stgsja LAPACK_GLOBAL(stgsja,STGSJA)
+#define LAPACK_dtgsja LAPACK_GLOBAL(dtgsja,DTGSJA)
+#define LAPACK_ctgsja LAPACK_GLOBAL(ctgsja,CTGSJA)
+#define LAPACK_ztgsja LAPACK_GLOBAL(ztgsja,ZTGSJA)
+#define LAPACK_sgels LAPACK_GLOBAL(sgels,SGELS)
+#define LAPACK_dgels LAPACK_GLOBAL(dgels,DGELS)
+#define LAPACK_cgels LAPACK_GLOBAL(cgels,CGELS)
+#define LAPACK_zgels LAPACK_GLOBAL(zgels,ZGELS)
+#define LAPACK_sgelsy LAPACK_GLOBAL(sgelsy,SGELSY)
+#define LAPACK_dgelsy LAPACK_GLOBAL(dgelsy,DGELSY)
+#define LAPACK_cgelsy LAPACK_GLOBAL(cgelsy,CGELSY)
+#define LAPACK_zgelsy LAPACK_GLOBAL(zgelsy,ZGELSY)
+#define LAPACK_sgelss LAPACK_GLOBAL(sgelss,SGELSS)
+#define LAPACK_dgelss LAPACK_GLOBAL(dgelss,DGELSS)
+#define LAPACK_cgelss LAPACK_GLOBAL(cgelss,CGELSS)
+#define LAPACK_zgelss LAPACK_GLOBAL(zgelss,ZGELSS)
+#define LAPACK_sgelsd LAPACK_GLOBAL(sgelsd,SGELSD)
+#define LAPACK_dgelsd LAPACK_GLOBAL(dgelsd,DGELSD)
+#define LAPACK_cgelsd LAPACK_GLOBAL(cgelsd,CGELSD)
+#define LAPACK_zgelsd LAPACK_GLOBAL(zgelsd,ZGELSD)
+#define LAPACK_sgglse LAPACK_GLOBAL(sgglse,SGGLSE)
+#define LAPACK_dgglse LAPACK_GLOBAL(dgglse,DGGLSE)
+#define LAPACK_cgglse LAPACK_GLOBAL(cgglse,CGGLSE)
+#define LAPACK_zgglse LAPACK_GLOBAL(zgglse,ZGGLSE)
+#define LAPACK_sggglm LAPACK_GLOBAL(sggglm,SGGGLM)
+#define LAPACK_dggglm LAPACK_GLOBAL(dggglm,DGGGLM)
+#define LAPACK_cggglm LAPACK_GLOBAL(cggglm,CGGGLM)
+#define LAPACK_zggglm LAPACK_GLOBAL(zggglm,ZGGGLM)
+#define LAPACK_ssyev LAPACK_GLOBAL(ssyev,SSYEV)
+#define LAPACK_dsyev LAPACK_GLOBAL(dsyev,DSYEV)
+#define LAPACK_cheev LAPACK_GLOBAL(cheev,CHEEV)
+#define LAPACK_zheev LAPACK_GLOBAL(zheev,ZHEEV)
+#define LAPACK_ssyevd LAPACK_GLOBAL(ssyevd,SSYEVD)
+#define LAPACK_dsyevd LAPACK_GLOBAL(dsyevd,DSYEVD)
+#define LAPACK_cheevd LAPACK_GLOBAL(cheevd,CHEEVD)
+#define LAPACK_zheevd LAPACK_GLOBAL(zheevd,ZHEEVD)
+#define LAPACK_ssyevx LAPACK_GLOBAL(ssyevx,SSYEVX)
+#define LAPACK_dsyevx LAPACK_GLOBAL(dsyevx,DSYEVX)
+#define LAPACK_cheevx LAPACK_GLOBAL(cheevx,CHEEVX)
+#define LAPACK_zheevx LAPACK_GLOBAL(zheevx,ZHEEVX)
+#define LAPACK_ssyevr LAPACK_GLOBAL(ssyevr,SSYEVR)
+#define LAPACK_dsyevr LAPACK_GLOBAL(dsyevr,DSYEVR)
+#define LAPACK_cheevr LAPACK_GLOBAL(cheevr,CHEEVR)
+#define LAPACK_zheevr LAPACK_GLOBAL(zheevr,ZHEEVR)
+#define LAPACK_sspev LAPACK_GLOBAL(sspev,SSPEV)
+#define LAPACK_dspev LAPACK_GLOBAL(dspev,DSPEV)
+#define LAPACK_chpev LAPACK_GLOBAL(chpev,CHPEV)
+#define LAPACK_zhpev LAPACK_GLOBAL(zhpev,ZHPEV)
+#define LAPACK_sspevd LAPACK_GLOBAL(sspevd,SSPEVD)
+#define LAPACK_dspevd LAPACK_GLOBAL(dspevd,DSPEVD)
+#define LAPACK_chpevd LAPACK_GLOBAL(chpevd,CHPEVD)
+#define LAPACK_zhpevd LAPACK_GLOBAL(zhpevd,ZHPEVD)
+#define LAPACK_sspevx LAPACK_GLOBAL(sspevx,SSPEVX)
+#define LAPACK_dspevx LAPACK_GLOBAL(dspevx,DSPEVX)
+#define LAPACK_chpevx LAPACK_GLOBAL(chpevx,CHPEVX)
+#define LAPACK_zhpevx LAPACK_GLOBAL(zhpevx,ZHPEVX)
+#define LAPACK_ssbev LAPACK_GLOBAL(ssbev,SSBEV)
+#define LAPACK_dsbev LAPACK_GLOBAL(dsbev,DSBEV)
+#define LAPACK_chbev LAPACK_GLOBAL(chbev,CHBEV)
+#define LAPACK_zhbev LAPACK_GLOBAL(zhbev,ZHBEV)
+#define LAPACK_ssbevd LAPACK_GLOBAL(ssbevd,SSBEVD)
+#define LAPACK_dsbevd LAPACK_GLOBAL(dsbevd,DSBEVD)
+#define LAPACK_chbevd LAPACK_GLOBAL(chbevd,CHBEVD)
+#define LAPACK_zhbevd LAPACK_GLOBAL(zhbevd,ZHBEVD)
+#define LAPACK_ssbevx LAPACK_GLOBAL(ssbevx,SSBEVX)
+#define LAPACK_dsbevx LAPACK_GLOBAL(dsbevx,DSBEVX)
+#define LAPACK_chbevx LAPACK_GLOBAL(chbevx,CHBEVX)
+#define LAPACK_zhbevx LAPACK_GLOBAL(zhbevx,ZHBEVX)
+#define LAPACK_sstev LAPACK_GLOBAL(sstev,SSTEV)
+#define LAPACK_dstev LAPACK_GLOBAL(dstev,DSTEV)
+#define LAPACK_sstevd LAPACK_GLOBAL(sstevd,SSTEVD)
+#define LAPACK_dstevd LAPACK_GLOBAL(dstevd,DSTEVD)
+#define LAPACK_sstevx LAPACK_GLOBAL(sstevx,SSTEVX)
+#define LAPACK_dstevx LAPACK_GLOBAL(dstevx,DSTEVX)
+#define LAPACK_sstevr LAPACK_GLOBAL(sstevr,SSTEVR)
+#define LAPACK_dstevr LAPACK_GLOBAL(dstevr,DSTEVR)
+#define LAPACK_sgees LAPACK_GLOBAL(sgees,SGEES)
+#define LAPACK_dgees LAPACK_GLOBAL(dgees,DGEES)
+#define LAPACK_cgees LAPACK_GLOBAL(cgees,CGEES)
+#define LAPACK_zgees LAPACK_GLOBAL(zgees,ZGEES)
+#define LAPACK_sgeesx LAPACK_GLOBAL(sgeesx,SGEESX)
+#define LAPACK_dgeesx LAPACK_GLOBAL(dgeesx,DGEESX)
+#define LAPACK_cgeesx LAPACK_GLOBAL(cgeesx,CGEESX)
+#define LAPACK_zgeesx LAPACK_GLOBAL(zgeesx,ZGEESX)
+#define LAPACK_sgeev LAPACK_GLOBAL(sgeev,SGEEV)
+#define LAPACK_dgeev LAPACK_GLOBAL(dgeev,DGEEV)
+#define LAPACK_cgeev LAPACK_GLOBAL(cgeev,CGEEV)
+#define LAPACK_zgeev LAPACK_GLOBAL(zgeev,ZGEEV)
+#define LAPACK_sgeevx LAPACK_GLOBAL(sgeevx,SGEEVX)
+#define LAPACK_dgeevx LAPACK_GLOBAL(dgeevx,DGEEVX)
+#define LAPACK_cgeevx LAPACK_GLOBAL(cgeevx,CGEEVX)
+#define LAPACK_zgeevx LAPACK_GLOBAL(zgeevx,ZGEEVX)
+#define LAPACK_sgesvd LAPACK_GLOBAL(sgesvd,SGESVD)
+#define LAPACK_dgesvd LAPACK_GLOBAL(dgesvd,DGESVD)
+#define LAPACK_cgesvd LAPACK_GLOBAL(cgesvd,CGESVD)
+#define LAPACK_zgesvd LAPACK_GLOBAL(zgesvd,ZGESVD)
+#define LAPACK_sgesdd LAPACK_GLOBAL(sgesdd,SGESDD)
+#define LAPACK_dgesdd LAPACK_GLOBAL(dgesdd,DGESDD)
+#define LAPACK_cgesdd LAPACK_GLOBAL(cgesdd,CGESDD)
+#define LAPACK_zgesdd LAPACK_GLOBAL(zgesdd,ZGESDD)
+#define LAPACK_dgejsv LAPACK_GLOBAL(dgejsv,DGEJSV)
+#define LAPACK_sgejsv LAPACK_GLOBAL(sgejsv,SGEJSV)
+#define LAPACK_dgesvj LAPACK_GLOBAL(dgesvj,DGESVJ)
+#define LAPACK_sgesvj LAPACK_GLOBAL(sgesvj,SGESVJ)
+#define LAPACK_sggsvd LAPACK_GLOBAL(sggsvd,SGGSVD)
+#define LAPACK_dggsvd LAPACK_GLOBAL(dggsvd,DGGSVD)
+#define LAPACK_cggsvd LAPACK_GLOBAL(cggsvd,CGGSVD)
+#define LAPACK_zggsvd LAPACK_GLOBAL(zggsvd,ZGGSVD)
+#define LAPACK_ssygv LAPACK_GLOBAL(ssygv,SSYGV)
+#define LAPACK_dsygv LAPACK_GLOBAL(dsygv,DSYGV)
+#define LAPACK_chegv LAPACK_GLOBAL(chegv,CHEGV)
+#define LAPACK_zhegv LAPACK_GLOBAL(zhegv,ZHEGV)
+#define LAPACK_ssygvd LAPACK_GLOBAL(ssygvd,SSYGVD)
+#define LAPACK_dsygvd LAPACK_GLOBAL(dsygvd,DSYGVD)
+#define LAPACK_chegvd LAPACK_GLOBAL(chegvd,CHEGVD)
+#define LAPACK_zhegvd LAPACK_GLOBAL(zhegvd,ZHEGVD)
+#define LAPACK_ssygvx LAPACK_GLOBAL(ssygvx,SSYGVX)
+#define LAPACK_dsygvx LAPACK_GLOBAL(dsygvx,DSYGVX)
+#define LAPACK_chegvx LAPACK_GLOBAL(chegvx,CHEGVX)
+#define LAPACK_zhegvx LAPACK_GLOBAL(zhegvx,ZHEGVX)
+#define LAPACK_sspgv LAPACK_GLOBAL(sspgv,SSPGV)
+#define LAPACK_dspgv LAPACK_GLOBAL(dspgv,DSPGV)
+#define LAPACK_chpgv LAPACK_GLOBAL(chpgv,CHPGV)
+#define LAPACK_zhpgv LAPACK_GLOBAL(zhpgv,ZHPGV)
+#define LAPACK_sspgvd LAPACK_GLOBAL(sspgvd,SSPGVD)
+#define LAPACK_dspgvd LAPACK_GLOBAL(dspgvd,DSPGVD)
+#define LAPACK_chpgvd LAPACK_GLOBAL(chpgvd,CHPGVD)
+#define LAPACK_zhpgvd LAPACK_GLOBAL(zhpgvd,ZHPGVD)
+#define LAPACK_sspgvx LAPACK_GLOBAL(sspgvx,SSPGVX)
+#define LAPACK_dspgvx LAPACK_GLOBAL(dspgvx,DSPGVX)
+#define LAPACK_chpgvx LAPACK_GLOBAL(chpgvx,CHPGVX)
+#define LAPACK_zhpgvx LAPACK_GLOBAL(zhpgvx,ZHPGVX)
+#define LAPACK_ssbgv LAPACK_GLOBAL(ssbgv,SSBGV)
+#define LAPACK_dsbgv LAPACK_GLOBAL(dsbgv,DSBGV)
+#define LAPACK_chbgv LAPACK_GLOBAL(chbgv,CHBGV)
+#define LAPACK_zhbgv LAPACK_GLOBAL(zhbgv,ZHBGV)
+#define LAPACK_ssbgvd LAPACK_GLOBAL(ssbgvd,SSBGVD)
+#define LAPACK_dsbgvd LAPACK_GLOBAL(dsbgvd,DSBGVD)
+#define LAPACK_chbgvd LAPACK_GLOBAL(chbgvd,CHBGVD)
+#define LAPACK_zhbgvd LAPACK_GLOBAL(zhbgvd,ZHBGVD)
+#define LAPACK_ssbgvx LAPACK_GLOBAL(ssbgvx,SSBGVX)
+#define LAPACK_dsbgvx LAPACK_GLOBAL(dsbgvx,DSBGVX)
+#define LAPACK_chbgvx LAPACK_GLOBAL(chbgvx,CHBGVX)
+#define LAPACK_zhbgvx LAPACK_GLOBAL(zhbgvx,ZHBGVX)
+#define LAPACK_sgges LAPACK_GLOBAL(sgges,SGGES)
+#define LAPACK_dgges LAPACK_GLOBAL(dgges,DGGES)
+#define LAPACK_cgges LAPACK_GLOBAL(cgges,CGGES)
+#define LAPACK_zgges LAPACK_GLOBAL(zgges,ZGGES)
+#define LAPACK_sggesx LAPACK_GLOBAL(sggesx,SGGESX)
+#define LAPACK_dggesx LAPACK_GLOBAL(dggesx,DGGESX)
+#define LAPACK_cggesx LAPACK_GLOBAL(cggesx,CGGESX)
+#define LAPACK_zggesx LAPACK_GLOBAL(zggesx,ZGGESX)
+#define LAPACK_sggev LAPACK_GLOBAL(sggev,SGGEV)
+#define LAPACK_dggev LAPACK_GLOBAL(dggev,DGGEV)
+#define LAPACK_cggev LAPACK_GLOBAL(cggev,CGGEV)
+#define LAPACK_zggev LAPACK_GLOBAL(zggev,ZGGEV)
+#define LAPACK_sggevx LAPACK_GLOBAL(sggevx,SGGEVX)
+#define LAPACK_dggevx LAPACK_GLOBAL(dggevx,DGGEVX)
+#define LAPACK_cggevx LAPACK_GLOBAL(cggevx,CGGEVX)
+#define LAPACK_zggevx LAPACK_GLOBAL(zggevx,ZGGEVX)
+#define LAPACK_dsfrk LAPACK_GLOBAL(dsfrk,DSFRK)
+#define LAPACK_ssfrk LAPACK_GLOBAL(ssfrk,SSFRK)
+#define LAPACK_zhfrk LAPACK_GLOBAL(zhfrk,ZHFRK)
+#define LAPACK_chfrk LAPACK_GLOBAL(chfrk,CHFRK)
+#define LAPACK_dtfsm LAPACK_GLOBAL(dtfsm,DTFSM)
+#define LAPACK_stfsm LAPACK_GLOBAL(stfsm,STFSM)
+#define LAPACK_ztfsm LAPACK_GLOBAL(ztfsm,ZTFSM)
+#define LAPACK_ctfsm LAPACK_GLOBAL(ctfsm,CTFSM)
+#define LAPACK_dtfttp LAPACK_GLOBAL(dtfttp,DTFTTP)
+#define LAPACK_stfttp LAPACK_GLOBAL(stfttp,STFTTP)
+#define LAPACK_ztfttp LAPACK_GLOBAL(ztfttp,ZTFTTP)
+#define LAPACK_ctfttp LAPACK_GLOBAL(ctfttp,CTFTTP)
+#define LAPACK_dtfttr LAPACK_GLOBAL(dtfttr,DTFTTR)
+#define LAPACK_stfttr LAPACK_GLOBAL(stfttr,STFTTR)
+#define LAPACK_ztfttr LAPACK_GLOBAL(ztfttr,ZTFTTR)
+#define LAPACK_ctfttr LAPACK_GLOBAL(ctfttr,CTFTTR)
+#define LAPACK_dtpttf LAPACK_GLOBAL(dtpttf,DTPTTF)
+#define LAPACK_stpttf LAPACK_GLOBAL(stpttf,STPTTF)
+#define LAPACK_ztpttf LAPACK_GLOBAL(ztpttf,ZTPTTF)
+#define LAPACK_ctpttf LAPACK_GLOBAL(ctpttf,CTPTTF)
+#define LAPACK_dtpttr LAPACK_GLOBAL(dtpttr,DTPTTR)
+#define LAPACK_stpttr LAPACK_GLOBAL(stpttr,STPTTR)
+#define LAPACK_ztpttr LAPACK_GLOBAL(ztpttr,ZTPTTR)
+#define LAPACK_ctpttr LAPACK_GLOBAL(ctpttr,CTPTTR)
+#define LAPACK_dtrttf LAPACK_GLOBAL(dtrttf,DTRTTF)
+#define LAPACK_strttf LAPACK_GLOBAL(strttf,STRTTF)
+#define LAPACK_ztrttf LAPACK_GLOBAL(ztrttf,ZTRTTF)
+#define LAPACK_ctrttf LAPACK_GLOBAL(ctrttf,CTRTTF)
+#define LAPACK_dtrttp LAPACK_GLOBAL(dtrttp,DTRTTP)
+#define LAPACK_strttp LAPACK_GLOBAL(strttp,STRTTP)
+#define LAPACK_ztrttp LAPACK_GLOBAL(ztrttp,ZTRTTP)
+#define LAPACK_ctrttp LAPACK_GLOBAL(ctrttp,CTRTTP)
+#define LAPACK_sgeqrfp LAPACK_GLOBAL(sgeqrfp,SGEQRFP)
+#define LAPACK_dgeqrfp LAPACK_GLOBAL(dgeqrfp,DGEQRFP)
+#define LAPACK_cgeqrfp LAPACK_GLOBAL(cgeqrfp,CGEQRFP)
+#define LAPACK_zgeqrfp LAPACK_GLOBAL(zgeqrfp,ZGEQRFP)
+#define LAPACK_clacgv LAPACK_GLOBAL(clacgv,CLACGV)
+#define LAPACK_zlacgv LAPACK_GLOBAL(zlacgv,ZLACGV)
+#define LAPACK_slarnv LAPACK_GLOBAL(slarnv,SLARNV)
+#define LAPACK_dlarnv LAPACK_GLOBAL(dlarnv,DLARNV)
+#define LAPACK_clarnv LAPACK_GLOBAL(clarnv,CLARNV)
+#define LAPACK_zlarnv LAPACK_GLOBAL(zlarnv,ZLARNV)
+#define LAPACK_sgeqr2 LAPACK_GLOBAL(sgeqr2,SGEQR2)
+#define LAPACK_dgeqr2 LAPACK_GLOBAL(dgeqr2,DGEQR2)
+#define LAPACK_cgeqr2 LAPACK_GLOBAL(cgeqr2,CGEQR2)
+#define LAPACK_zgeqr2 LAPACK_GLOBAL(zgeqr2,ZGEQR2)
+#define LAPACK_slacpy LAPACK_GLOBAL(slacpy,SLACPY)
+#define LAPACK_dlacpy LAPACK_GLOBAL(dlacpy,DLACPY)
+#define LAPACK_clacpy LAPACK_GLOBAL(clacpy,CLACPY)
+#define LAPACK_zlacpy LAPACK_GLOBAL(zlacpy,ZLACPY)
+#define LAPACK_sgetf2 LAPACK_GLOBAL(sgetf2,SGETF2)
+#define LAPACK_dgetf2 LAPACK_GLOBAL(dgetf2,DGETF2)
+#define LAPACK_cgetf2 LAPACK_GLOBAL(cgetf2,CGETF2)
+#define LAPACK_zgetf2 LAPACK_GLOBAL(zgetf2,ZGETF2)
+#define LAPACK_slaswp LAPACK_GLOBAL(slaswp,SLASWP)
+#define LAPACK_dlaswp LAPACK_GLOBAL(dlaswp,DLASWP)
+#define LAPACK_claswp LAPACK_GLOBAL(claswp,CLASWP)
+#define LAPACK_zlaswp LAPACK_GLOBAL(zlaswp,ZLASWP)
+#define LAPACK_slange LAPACK_GLOBAL(slange,SLANGE)
+#define LAPACK_dlange LAPACK_GLOBAL(dlange,DLANGE)
+#define LAPACK_clange LAPACK_GLOBAL(clange,CLANGE)
+#define LAPACK_zlange LAPACK_GLOBAL(zlange,ZLANGE)
+#define LAPACK_clanhe LAPACK_GLOBAL(clanhe,CLANHE)
+#define LAPACK_zlanhe LAPACK_GLOBAL(zlanhe,ZLANHE)
+#define LAPACK_slansy LAPACK_GLOBAL(slansy,SLANSY)
+#define LAPACK_dlansy LAPACK_GLOBAL(dlansy,DLANSY)
+#define LAPACK_clansy LAPACK_GLOBAL(clansy,CLANSY)
+#define LAPACK_zlansy LAPACK_GLOBAL(zlansy,ZLANSY)
+#define LAPACK_slantr LAPACK_GLOBAL(slantr,SLANTR)
+#define LAPACK_dlantr LAPACK_GLOBAL(dlantr,DLANTR)
+#define LAPACK_clantr LAPACK_GLOBAL(clantr,CLANTR)
+#define LAPACK_zlantr LAPACK_GLOBAL(zlantr,ZLANTR)
+#define LAPACK_slamch LAPACK_GLOBAL(slamch,SLAMCH)
+#define LAPACK_dlamch LAPACK_GLOBAL(dlamch,DLAMCH)
+#define LAPACK_sgelq2 LAPACK_GLOBAL(sgelq2,SGELQ2)
+#define LAPACK_dgelq2 LAPACK_GLOBAL(dgelq2,DGELQ2)
+#define LAPACK_cgelq2 LAPACK_GLOBAL(cgelq2,CGELQ2)
+#define LAPACK_zgelq2 LAPACK_GLOBAL(zgelq2,ZGELQ2)
+#define LAPACK_slarfb LAPACK_GLOBAL(slarfb,SLARFB)
+#define LAPACK_dlarfb LAPACK_GLOBAL(dlarfb,DLARFB)
+#define LAPACK_clarfb LAPACK_GLOBAL(clarfb,CLARFB)
+#define LAPACK_zlarfb LAPACK_GLOBAL(zlarfb,ZLARFB)
+#define LAPACK_slarfg LAPACK_GLOBAL(slarfg,SLARFG)
+#define LAPACK_dlarfg LAPACK_GLOBAL(dlarfg,DLARFG)
+#define LAPACK_clarfg LAPACK_GLOBAL(clarfg,CLARFG)
+#define LAPACK_zlarfg LAPACK_GLOBAL(zlarfg,ZLARFG)
+#define LAPACK_slarft LAPACK_GLOBAL(slarft,SLARFT)
+#define LAPACK_dlarft LAPACK_GLOBAL(dlarft,DLARFT)
+#define LAPACK_clarft LAPACK_GLOBAL(clarft,CLARFT)
+#define LAPACK_zlarft LAPACK_GLOBAL(zlarft,ZLARFT)
+#define LAPACK_slarfx LAPACK_GLOBAL(slarfx,SLARFX)
+#define LAPACK_dlarfx LAPACK_GLOBAL(dlarfx,DLARFX)
+#define LAPACK_clarfx LAPACK_GLOBAL(clarfx,CLARFX)
+#define LAPACK_zlarfx LAPACK_GLOBAL(zlarfx,ZLARFX)
+#define LAPACK_slatms LAPACK_GLOBAL(slatms,SLATMS)
+#define LAPACK_dlatms LAPACK_GLOBAL(dlatms,DLATMS)
+#define LAPACK_clatms LAPACK_GLOBAL(clatms,CLATMS)
+#define LAPACK_zlatms LAPACK_GLOBAL(zlatms,ZLATMS)
+#define LAPACK_slag2d LAPACK_GLOBAL(slag2d,SLAG2D)
+#define LAPACK_dlag2s LAPACK_GLOBAL(dlag2s,DLAG2S)
+#define LAPACK_clag2z LAPACK_GLOBAL(clag2z,CLAG2Z)
+#define LAPACK_zlag2c LAPACK_GLOBAL(zlag2c,ZLAG2C)
+#define LAPACK_slauum LAPACK_GLOBAL(slauum,SLAUUM)
+#define LAPACK_dlauum LAPACK_GLOBAL(dlauum,DLAUUM)
+#define LAPACK_clauum LAPACK_GLOBAL(clauum,CLAUUM)
+#define LAPACK_zlauum LAPACK_GLOBAL(zlauum,ZLAUUM)
+#define LAPACK_slagge LAPACK_GLOBAL(slagge,SLAGGE)
+#define LAPACK_dlagge LAPACK_GLOBAL(dlagge,DLAGGE)
+#define LAPACK_clagge LAPACK_GLOBAL(clagge,CLAGGE)
+#define LAPACK_zlagge LAPACK_GLOBAL(zlagge,ZLAGGE)
+#define LAPACK_slaset LAPACK_GLOBAL(slaset,SLASET)
+#define LAPACK_dlaset LAPACK_GLOBAL(dlaset,DLASET)
+#define LAPACK_claset LAPACK_GLOBAL(claset,CLASET)
+#define LAPACK_zlaset LAPACK_GLOBAL(zlaset,ZLASET)
+#define LAPACK_slasrt LAPACK_GLOBAL(slasrt,SLASRT)
+#define LAPACK_dlasrt LAPACK_GLOBAL(dlasrt,DLASRT)
+#define LAPACK_slagsy LAPACK_GLOBAL(slagsy,SLAGSY)
+#define LAPACK_dlagsy LAPACK_GLOBAL(dlagsy,DLAGSY)
+#define LAPACK_clagsy LAPACK_GLOBAL(clagsy,CLAGSY)
+#define LAPACK_zlagsy LAPACK_GLOBAL(zlagsy,ZLAGSY)
+#define LAPACK_claghe LAPACK_GLOBAL(claghe,CLAGHE)
+#define LAPACK_zlaghe LAPACK_GLOBAL(zlaghe,ZLAGHE)
+#define LAPACK_slapmr LAPACK_GLOBAL(slapmr,SLAPMR)
+#define LAPACK_dlapmr LAPACK_GLOBAL(dlapmr,DLAPMR)
+#define LAPACK_clapmr LAPACK_GLOBAL(clapmr,CLAPMR)
+#define LAPACK_zlapmr LAPACK_GLOBAL(zlapmr,ZLAPMR)
+#define LAPACK_slapy2 LAPACK_GLOBAL(slapy2,SLAPY2)
+#define LAPACK_dlapy2 LAPACK_GLOBAL(dlapy2,DLAPY2)
+#define LAPACK_slapy3 LAPACK_GLOBAL(slapy3,SLAPY3)
+#define LAPACK_dlapy3 LAPACK_GLOBAL(dlapy3,DLAPY3)
+#define LAPACK_slartgp LAPACK_GLOBAL(slartgp,SLARTGP)
+#define LAPACK_dlartgp LAPACK_GLOBAL(dlartgp,DLARTGP)
+#define LAPACK_slartgs LAPACK_GLOBAL(slartgs,SLARTGS)
+#define LAPACK_dlartgs LAPACK_GLOBAL(dlartgs,DLARTGS)
+// LAPACK 3.3.0
+#define LAPACK_cbbcsd LAPACK_GLOBAL(cbbcsd,CBBCSD)
+#define LAPACK_cheswapr LAPACK_GLOBAL(cheswapr,CHESWAPR)
+#define LAPACK_chetri2 LAPACK_GLOBAL(chetri2,CHETRI2)
+#define LAPACK_chetri2x LAPACK_GLOBAL(chetri2x,CHETRI2X)
+#define LAPACK_chetrs2 LAPACK_GLOBAL(chetrs2,CHETRS2)
+#define LAPACK_csyconv LAPACK_GLOBAL(csyconv,CSYCONV)
+#define LAPACK_csyswapr LAPACK_GLOBAL(csyswapr,CSYSWAPR)
+#define LAPACK_csytri2 LAPACK_GLOBAL(csytri2,CSYTRI2)
+#define LAPACK_csytri2x LAPACK_GLOBAL(csytri2x,CSYTRI2X)
+#define LAPACK_csytrs2 LAPACK_GLOBAL(csytrs2,CSYTRS2)
+#define LAPACK_cunbdb LAPACK_GLOBAL(cunbdb,CUNBDB)
+#define LAPACK_cuncsd LAPACK_GLOBAL(cuncsd,CUNCSD)
+#define LAPACK_dbbcsd LAPACK_GLOBAL(dbbcsd,DBBCSD)
+#define LAPACK_dorbdb LAPACK_GLOBAL(dorbdb,DORBDB)
+#define LAPACK_dorcsd LAPACK_GLOBAL(dorcsd,DORCSD)
+#define LAPACK_dsyconv LAPACK_GLOBAL(dsyconv,DSYCONV)
+#define LAPACK_dsyswapr LAPACK_GLOBAL(dsyswapr,DSYSWAPR)
+#define LAPACK_dsytri2 LAPACK_GLOBAL(dsytri2,DSYTRI2)
+#define LAPACK_dsytri2x LAPACK_GLOBAL(dsytri2x,DSYTRI2X)
+#define LAPACK_dsytrs2 LAPACK_GLOBAL(dsytrs2,DSYTRS2)
+#define LAPACK_sbbcsd LAPACK_GLOBAL(sbbcsd,SBBCSD)
+#define LAPACK_sorbdb LAPACK_GLOBAL(sorbdb,SORBDB)
+#define LAPACK_sorcsd LAPACK_GLOBAL(sorcsd,SORCSD)
+#define LAPACK_ssyconv LAPACK_GLOBAL(ssyconv,SSYCONV)
+#define LAPACK_ssyswapr LAPACK_GLOBAL(ssyswapr,SSYSWAPR)
+#define LAPACK_ssytri2 LAPACK_GLOBAL(ssytri2,SSYTRI2)
+#define LAPACK_ssytri2x LAPACK_GLOBAL(ssytri2x,SSYTRI2X)
+#define LAPACK_ssytrs2 LAPACK_GLOBAL(ssytrs2,SSYTRS2)
+#define LAPACK_zbbcsd LAPACK_GLOBAL(zbbcsd,ZBBCSD)
+#define LAPACK_zheswapr LAPACK_GLOBAL(zheswapr,ZHESWAPR)
+#define LAPACK_zhetri2 LAPACK_GLOBAL(zhetri2,ZHETRI2)
+#define LAPACK_zhetri2x LAPACK_GLOBAL(zhetri2x,ZHETRI2X)
+#define LAPACK_zhetrs2 LAPACK_GLOBAL(zhetrs2,ZHETRS2)
+#define LAPACK_zsyconv LAPACK_GLOBAL(zsyconv,ZSYCONV)
+#define LAPACK_zsyswapr LAPACK_GLOBAL(zsyswapr,ZSYSWAPR)
+#define LAPACK_zsytri2 LAPACK_GLOBAL(zsytri2,ZSYTRI2)
+#define LAPACK_zsytri2x LAPACK_GLOBAL(zsytri2x,ZSYTRI2X)
+#define LAPACK_zsytrs2 LAPACK_GLOBAL(zsytrs2,ZSYTRS2)
+#define LAPACK_zunbdb LAPACK_GLOBAL(zunbdb,ZUNBDB)
+#define LAPACK_zuncsd LAPACK_GLOBAL(zuncsd,ZUNCSD)
+// LAPACK 3.4.0
+#define LAPACK_sgemqrt LAPACK_GLOBAL(sgemqrt,SGEMQRT)
+#define LAPACK_dgemqrt LAPACK_GLOBAL(dgemqrt,DGEMQRT)
+#define LAPACK_cgemqrt LAPACK_GLOBAL(cgemqrt,CGEMQRT)
+#define LAPACK_zgemqrt LAPACK_GLOBAL(zgemqrt,ZGEMQRT)
+#define LAPACK_sgeqrt LAPACK_GLOBAL(sgeqrt,SGEQRT)
+#define LAPACK_dgeqrt LAPACK_GLOBAL(dgeqrt,DGEQRT)
+#define LAPACK_cgeqrt LAPACK_GLOBAL(cgeqrt,CGEQRT)
+#define LAPACK_zgeqrt LAPACK_GLOBAL(zgeqrt,ZGEQRT)
+#define LAPACK_sgeqrt2 LAPACK_GLOBAL(sgeqrt2,SGEQRT2)
+#define LAPACK_dgeqrt2 LAPACK_GLOBAL(dgeqrt2,DGEQRT2)
+#define LAPACK_cgeqrt2 LAPACK_GLOBAL(cgeqrt2,CGEQRT2)
+#define LAPACK_zgeqrt2 LAPACK_GLOBAL(zgeqrt2,ZGEQRT2)
+#define LAPACK_sgeqrt3 LAPACK_GLOBAL(sgeqrt3,SGEQRT3)
+#define LAPACK_dgeqrt3 LAPACK_GLOBAL(dgeqrt3,DGEQRT3)
+#define LAPACK_cgeqrt3 LAPACK_GLOBAL(cgeqrt3,CGEQRT3)
+#define LAPACK_zgeqrt3 LAPACK_GLOBAL(zgeqrt3,ZGEQRT3)
+#define LAPACK_stpmqrt LAPACK_GLOBAL(stpmqrt,STPMQRT)
+#define LAPACK_dtpmqrt LAPACK_GLOBAL(dtpmqrt,DTPMQRT)
+#define LAPACK_ctpmqrt LAPACK_GLOBAL(ctpmqrt,CTPMQRT)
+#define LAPACK_ztpmqrt LAPACK_GLOBAL(ztpmqrt,ZTPMQRT)
+#define LAPACK_dtpqrt LAPACK_GLOBAL(dtpqrt,DTPQRT)
+#define LAPACK_ctpqrt LAPACK_GLOBAL(ctpqrt,CTPQRT)
+#define LAPACK_ztpqrt LAPACK_GLOBAL(ztpqrt,ZTPQRT)
+#define LAPACK_stpqrt2 LAPACK_GLOBAL(stpqrt2,STPQRT2)
+#define LAPACK_dtpqrt2 LAPACK_GLOBAL(dtpqrt2,DTPQRT2)
+#define LAPACK_ctpqrt2 LAPACK_GLOBAL(ctpqrt2,CTPQRT2)
+#define LAPACK_ztpqrt2 LAPACK_GLOBAL(ztpqrt2,ZTPQRT2)
+#define LAPACK_stprfb LAPACK_GLOBAL(stprfb,STPRFB)
+#define LAPACK_dtprfb LAPACK_GLOBAL(dtprfb,DTPRFB)
+#define LAPACK_ctprfb LAPACK_GLOBAL(ctprfb,CTPRFB)
+#define LAPACK_ztprfb LAPACK_GLOBAL(ztprfb,ZTPRFB)
+// LAPACK 3.X.X
+#define LAPACK_csyr LAPACK_GLOBAL(csyr,CSYR)
+#define LAPACK_zsyr LAPACK_GLOBAL(zsyr,ZSYR)
+
+
+void LAPACK_sgetrf( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
+                    lapack_int* ipiv, lapack_int *info );
+void LAPACK_dgetrf( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
+                    lapack_int* ipiv, lapack_int *info );
+void LAPACK_cgetrf( lapack_int* m, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_int* ipiv, lapack_int *info );
+void LAPACK_zgetrf( lapack_int* m, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_int* ipiv, lapack_int *info );
+void LAPACK_sgbtrf( lapack_int* m, lapack_int* n, lapack_int* kl,
+                    lapack_int* ku, float* ab, lapack_int* ldab,
+                    lapack_int* ipiv, lapack_int *info );
+void LAPACK_dgbtrf( lapack_int* m, lapack_int* n, lapack_int* kl,
+                    lapack_int* ku, double* ab, lapack_int* ldab,
+                    lapack_int* ipiv, lapack_int *info );
+void LAPACK_cgbtrf( lapack_int* m, lapack_int* n, lapack_int* kl,
+                    lapack_int* ku, lapack_complex_float* ab, lapack_int* ldab,
+                    lapack_int* ipiv, lapack_int *info );
+void LAPACK_zgbtrf( lapack_int* m, lapack_int* n, lapack_int* kl,
+                    lapack_int* ku, lapack_complex_double* ab, lapack_int* ldab,
+                    lapack_int* ipiv, lapack_int *info );
+void LAPACK_sgttrf( lapack_int* n, float* dl, float* d, float* du, float* du2,
+                    lapack_int* ipiv, lapack_int *info );
+void LAPACK_dgttrf( lapack_int* n, double* dl, double* d, double* du,
+                    double* du2, lapack_int* ipiv, lapack_int *info );
+void LAPACK_cgttrf( lapack_int* n, lapack_complex_float* dl,
+                    lapack_complex_float* d, lapack_complex_float* du,
+                    lapack_complex_float* du2, lapack_int* ipiv,
+                    lapack_int *info );
+void LAPACK_zgttrf( lapack_int* n, lapack_complex_double* dl,
+                    lapack_complex_double* d, lapack_complex_double* du,
+                    lapack_complex_double* du2, lapack_int* ipiv,
+                    lapack_int *info );
+void LAPACK_spotrf( char* uplo, lapack_int* n, float* a, lapack_int* lda,
+                    lapack_int *info );
+void LAPACK_dpotrf( char* uplo, lapack_int* n, double* a, lapack_int* lda,
+                    lapack_int *info );
+void LAPACK_cpotrf( char* uplo, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_int *info );
+void LAPACK_zpotrf( char* uplo, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_int *info );
+void LAPACK_dpstrf( char* uplo, lapack_int* n, double* a, lapack_int* lda,
+                    lapack_int* piv, lapack_int* rank, double* tol,
+                    double* work, lapack_int *info );
+void LAPACK_spstrf( char* uplo, lapack_int* n, float* a, lapack_int* lda,
+                    lapack_int* piv, lapack_int* rank, float* tol, float* work,
+                    lapack_int *info );
+void LAPACK_zpstrf( char* uplo, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_int* piv, lapack_int* rank,
+                    double* tol, double* work, lapack_int *info );
+void LAPACK_cpstrf( char* uplo, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_int* piv, lapack_int* rank,
+                    float* tol, float* work, lapack_int *info );
+void LAPACK_dpftrf( char* transr, char* uplo, lapack_int* n, double* a,
+                    lapack_int *info );
+void LAPACK_spftrf( char* transr, char* uplo, lapack_int* n, float* a,
+                    lapack_int *info );
+void LAPACK_zpftrf( char* transr, char* uplo, lapack_int* n,
+                    lapack_complex_double* a, lapack_int *info );
+void LAPACK_cpftrf( char* transr, char* uplo, lapack_int* n,
+                    lapack_complex_float* a, lapack_int *info );
+void LAPACK_spptrf( char* uplo, lapack_int* n, float* ap, lapack_int *info );
+void LAPACK_dpptrf( char* uplo, lapack_int* n, double* ap, lapack_int *info );
+void LAPACK_cpptrf( char* uplo, lapack_int* n, lapack_complex_float* ap,
+                    lapack_int *info );
+void LAPACK_zpptrf( char* uplo, lapack_int* n, lapack_complex_double* ap,
+                    lapack_int *info );
+void LAPACK_spbtrf( char* uplo, lapack_int* n, lapack_int* kd, float* ab,
+                    lapack_int* ldab, lapack_int *info );
+void LAPACK_dpbtrf( char* uplo, lapack_int* n, lapack_int* kd, double* ab,
+                    lapack_int* ldab, lapack_int *info );
+void LAPACK_cpbtrf( char* uplo, lapack_int* n, lapack_int* kd,
+                    lapack_complex_float* ab, lapack_int* ldab,
+                    lapack_int *info );
+void LAPACK_zpbtrf( char* uplo, lapack_int* n, lapack_int* kd,
+                    lapack_complex_double* ab, lapack_int* ldab,
+                    lapack_int *info );
+void LAPACK_spttrf( lapack_int* n, float* d, float* e, lapack_int *info );
+void LAPACK_dpttrf( lapack_int* n, double* d, double* e, lapack_int *info );
+void LAPACK_cpttrf( lapack_int* n, float* d, lapack_complex_float* e,
+                    lapack_int *info );
+void LAPACK_zpttrf( lapack_int* n, double* d, lapack_complex_double* e,
+                    lapack_int *info );
+void LAPACK_ssytrf( char* uplo, lapack_int* n, float* a, lapack_int* lda,
+                    lapack_int* ipiv, float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_dsytrf( char* uplo, lapack_int* n, double* a, lapack_int* lda,
+                    lapack_int* ipiv, double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_csytrf( char* uplo, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_int* ipiv,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_zsytrf( char* uplo, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_int* ipiv,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_chetrf( char* uplo, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_int* ipiv,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_zhetrf( char* uplo, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_int* ipiv,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_ssptrf( char* uplo, lapack_int* n, float* ap, lapack_int* ipiv,
+                    lapack_int *info );
+void LAPACK_dsptrf( char* uplo, lapack_int* n, double* ap, lapack_int* ipiv,
+                    lapack_int *info );
+void LAPACK_csptrf( char* uplo, lapack_int* n, lapack_complex_float* ap,
+                    lapack_int* ipiv, lapack_int *info );
+void LAPACK_zsptrf( char* uplo, lapack_int* n, lapack_complex_double* ap,
+                    lapack_int* ipiv, lapack_int *info );
+void LAPACK_chptrf( char* uplo, lapack_int* n, lapack_complex_float* ap,
+                    lapack_int* ipiv, lapack_int *info );
+void LAPACK_zhptrf( char* uplo, lapack_int* n, lapack_complex_double* ap,
+                    lapack_int* ipiv, lapack_int *info );
+void LAPACK_sgetrs( char* trans, lapack_int* n, lapack_int* nrhs,
+                    const float* a, lapack_int* lda, const lapack_int* ipiv,
+                    float* b, lapack_int* ldb, lapack_int *info );
+void LAPACK_dgetrs( char* trans, lapack_int* n, lapack_int* nrhs,
+                    const double* a, lapack_int* lda, const lapack_int* ipiv,
+                    double* b, lapack_int* ldb, lapack_int *info );
+void LAPACK_cgetrs( char* trans, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* a, lapack_int* lda,
+                    const lapack_int* ipiv, lapack_complex_float* b,
+                    lapack_int* ldb, lapack_int *info );
+void LAPACK_zgetrs( char* trans, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    const lapack_int* ipiv, lapack_complex_double* b,
+                    lapack_int* ldb, lapack_int *info );
+void LAPACK_sgbtrs( char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku,
+                    lapack_int* nrhs, const float* ab, lapack_int* ldab,
+                    const lapack_int* ipiv, float* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_dgbtrs( char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku,
+                    lapack_int* nrhs, const double* ab, lapack_int* ldab,
+                    const lapack_int* ipiv, double* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_cgbtrs( char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku,
+                    lapack_int* nrhs, const lapack_complex_float* ab,
+                    lapack_int* ldab, const lapack_int* ipiv,
+                    lapack_complex_float* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_zgbtrs( char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku,
+                    lapack_int* nrhs, const lapack_complex_double* ab,
+                    lapack_int* ldab, const lapack_int* ipiv,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_sgttrs( char* trans, lapack_int* n, lapack_int* nrhs,
+                    const float* dl, const float* d, const float* du,
+                    const float* du2, const lapack_int* ipiv, float* b,
+                    lapack_int* ldb, lapack_int *info );
+void LAPACK_dgttrs( char* trans, lapack_int* n, lapack_int* nrhs,
+                    const double* dl, const double* d, const double* du,
+                    const double* du2, const lapack_int* ipiv, double* b,
+                    lapack_int* ldb, lapack_int *info );
+void LAPACK_cgttrs( char* trans, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* dl,
+                    const lapack_complex_float* d,
+                    const lapack_complex_float* du,
+                    const lapack_complex_float* du2, const lapack_int* ipiv,
+                    lapack_complex_float* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_zgttrs( char* trans, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* dl,
+                    const lapack_complex_double* d,
+                    const lapack_complex_double* du,
+                    const lapack_complex_double* du2, const lapack_int* ipiv,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_spotrs( char* uplo, lapack_int* n, lapack_int* nrhs, const float* a,
+                    lapack_int* lda, float* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_dpotrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const double* a, lapack_int* lda, double* b,
+                    lapack_int* ldb, lapack_int *info );
+void LAPACK_cpotrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_zpotrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_dpftrs( char* transr, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const double* a, double* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_spftrs( char* transr, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const float* a, float* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_zpftrs( char* transr, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* a, lapack_complex_double* b,
+                    lapack_int* ldb, lapack_int *info );
+void LAPACK_cpftrs( char* transr, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* a, lapack_complex_float* b,
+                    lapack_int* ldb, lapack_int *info );
+void LAPACK_spptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const float* ap, float* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_dpptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const double* ap, double* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_cpptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* ap, lapack_complex_float* b,
+                    lapack_int* ldb, lapack_int *info );
+void LAPACK_zpptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* ap, lapack_complex_double* b,
+                    lapack_int* ldb, lapack_int *info );
+void LAPACK_spbtrs( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
+                    const float* ab, lapack_int* ldab, float* b,
+                    lapack_int* ldb, lapack_int *info );
+void LAPACK_dpbtrs( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
+                    const double* ab, lapack_int* ldab, double* b,
+                    lapack_int* ldb, lapack_int *info );
+void LAPACK_cpbtrs( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
+                    const lapack_complex_float* ab, lapack_int* ldab,
+                    lapack_complex_float* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_zpbtrs( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
+                    const lapack_complex_double* ab, lapack_int* ldab,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_spttrs( lapack_int* n, lapack_int* nrhs, const float* d,
+                    const float* e, float* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_dpttrs( lapack_int* n, lapack_int* nrhs, const double* d,
+                    const double* e, double* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_cpttrs( char* uplo, lapack_int* n, lapack_int* nrhs, const float* d,
+                    const lapack_complex_float* e, lapack_complex_float* b,
+                    lapack_int* ldb, lapack_int *info );
+void LAPACK_zpttrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const double* d, const lapack_complex_double* e,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_ssytrs( char* uplo, lapack_int* n, lapack_int* nrhs, const float* a,
+                    lapack_int* lda, const lapack_int* ipiv, float* b,
+                    lapack_int* ldb, lapack_int *info );
+void LAPACK_dsytrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const double* a, lapack_int* lda, const lapack_int* ipiv,
+                    double* b, lapack_int* ldb, lapack_int *info );
+void LAPACK_csytrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* a, lapack_int* lda,
+                    const lapack_int* ipiv, lapack_complex_float* b,
+                    lapack_int* ldb, lapack_int *info );
+void LAPACK_zsytrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    const lapack_int* ipiv, lapack_complex_double* b,
+                    lapack_int* ldb, lapack_int *info );
+void LAPACK_chetrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* a, lapack_int* lda,
+                    const lapack_int* ipiv, lapack_complex_float* b,
+                    lapack_int* ldb, lapack_int *info );
+void LAPACK_zhetrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    const lapack_int* ipiv, lapack_complex_double* b,
+                    lapack_int* ldb, lapack_int *info );
+void LAPACK_ssptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const float* ap, const lapack_int* ipiv, float* b,
+                    lapack_int* ldb, lapack_int *info );
+void LAPACK_dsptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const double* ap, const lapack_int* ipiv, double* b,
+                    lapack_int* ldb, lapack_int *info );
+void LAPACK_csptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* ap, const lapack_int* ipiv,
+                    lapack_complex_float* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_zsptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* ap, const lapack_int* ipiv,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_chptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* ap, const lapack_int* ipiv,
+                    lapack_complex_float* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_zhptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* ap, const lapack_int* ipiv,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_strtrs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* nrhs, const float* a, lapack_int* lda, float* b,
+                    lapack_int* ldb, lapack_int *info );
+void LAPACK_dtrtrs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* nrhs, const double* a, lapack_int* lda,
+                    double* b, lapack_int* ldb, lapack_int *info );
+void LAPACK_ctrtrs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* nrhs, const lapack_complex_float* a,
+                    lapack_int* lda, lapack_complex_float* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_ztrtrs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* nrhs, const lapack_complex_double* a,
+                    lapack_int* lda, lapack_complex_double* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_stptrs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* nrhs, const float* ap, float* b,
+                    lapack_int* ldb, lapack_int *info );
+void LAPACK_dtptrs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* nrhs, const double* ap, double* b,
+                    lapack_int* ldb, lapack_int *info );
+void LAPACK_ctptrs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* nrhs, const lapack_complex_float* ap,
+                    lapack_complex_float* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_ztptrs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* nrhs, const lapack_complex_double* ap,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_stbtrs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* kd, lapack_int* nrhs, const float* ab,
+                    lapack_int* ldab, float* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_dtbtrs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* kd, lapack_int* nrhs, const double* ab,
+                    lapack_int* ldab, double* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_ctbtrs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* kd, lapack_int* nrhs,
+                    const lapack_complex_float* ab, lapack_int* ldab,
+                    lapack_complex_float* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_ztbtrs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* kd, lapack_int* nrhs,
+                    const lapack_complex_double* ab, lapack_int* ldab,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_sgecon( char* norm, lapack_int* n, const float* a, lapack_int* lda,
+                    float* anorm, float* rcond, float* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_dgecon( char* norm, lapack_int* n, const double* a, lapack_int* lda,
+                    double* anorm, double* rcond, double* work,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_cgecon( char* norm, lapack_int* n, const lapack_complex_float* a,
+                    lapack_int* lda, float* anorm, float* rcond,
+                    lapack_complex_float* work, float* rwork,
+                    lapack_int *info );
+void LAPACK_zgecon( char* norm, lapack_int* n, const lapack_complex_double* a,
+                    lapack_int* lda, double* anorm, double* rcond,
+                    lapack_complex_double* work, double* rwork,
+                    lapack_int *info );
+void LAPACK_sgbcon( char* norm, lapack_int* n, lapack_int* kl, lapack_int* ku,
+                    const float* ab, lapack_int* ldab, const lapack_int* ipiv,
+                    float* anorm, float* rcond, float* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_dgbcon( char* norm, lapack_int* n, lapack_int* kl, lapack_int* ku,
+                    const double* ab, lapack_int* ldab, const lapack_int* ipiv,
+                    double* anorm, double* rcond, double* work,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_cgbcon( char* norm, lapack_int* n, lapack_int* kl, lapack_int* ku,
+                    const lapack_complex_float* ab, lapack_int* ldab,
+                    const lapack_int* ipiv, float* anorm, float* rcond,
+                    lapack_complex_float* work, float* rwork,
+                    lapack_int *info );
+void LAPACK_zgbcon( char* norm, lapack_int* n, lapack_int* kl, lapack_int* ku,
+                    const lapack_complex_double* ab, lapack_int* ldab,
+                    const lapack_int* ipiv, double* anorm, double* rcond,
+                    lapack_complex_double* work, double* rwork,
+                    lapack_int *info );
+void LAPACK_sgtcon( char* norm, lapack_int* n, const float* dl, const float* d,
+                    const float* du, const float* du2, const lapack_int* ipiv,
+                    float* anorm, float* rcond, float* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_dgtcon( char* norm, lapack_int* n, const double* dl,
+                    const double* d, const double* du, const double* du2,
+                    const lapack_int* ipiv, double* anorm, double* rcond,
+                    double* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_cgtcon( char* norm, lapack_int* n, const lapack_complex_float* dl,
+                    const lapack_complex_float* d,
+                    const lapack_complex_float* du,
+                    const lapack_complex_float* du2, const lapack_int* ipiv,
+                    float* anorm, float* rcond, lapack_complex_float* work,
+                    lapack_int *info );
+void LAPACK_zgtcon( char* norm, lapack_int* n, const lapack_complex_double* dl,
+                    const lapack_complex_double* d,
+                    const lapack_complex_double* du,
+                    const lapack_complex_double* du2, const lapack_int* ipiv,
+                    double* anorm, double* rcond, lapack_complex_double* work,
+                    lapack_int *info );
+void LAPACK_spocon( char* uplo, lapack_int* n, const float* a, lapack_int* lda,
+                    float* anorm, float* rcond, float* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_dpocon( char* uplo, lapack_int* n, const double* a, lapack_int* lda,
+                    double* anorm, double* rcond, double* work,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_cpocon( char* uplo, lapack_int* n, const lapack_complex_float* a,
+                    lapack_int* lda, float* anorm, float* rcond,
+                    lapack_complex_float* work, float* rwork,
+                    lapack_int *info );
+void LAPACK_zpocon( char* uplo, lapack_int* n, const lapack_complex_double* a,
+                    lapack_int* lda, double* anorm, double* rcond,
+                    lapack_complex_double* work, double* rwork,
+                    lapack_int *info );
+void LAPACK_sppcon( char* uplo, lapack_int* n, const float* ap, float* anorm,
+                    float* rcond, float* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_dppcon( char* uplo, lapack_int* n, const double* ap, double* anorm,
+                    double* rcond, double* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_cppcon( char* uplo, lapack_int* n, const lapack_complex_float* ap,
+                    float* anorm, float* rcond, lapack_complex_float* work,
+                    float* rwork, lapack_int *info );
+void LAPACK_zppcon( char* uplo, lapack_int* n, const lapack_complex_double* ap,
+                    double* anorm, double* rcond, lapack_complex_double* work,
+                    double* rwork, lapack_int *info );
+void LAPACK_spbcon( char* uplo, lapack_int* n, lapack_int* kd, const float* ab,
+                    lapack_int* ldab, float* anorm, float* rcond, float* work,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_dpbcon( char* uplo, lapack_int* n, lapack_int* kd, const double* ab,
+                    lapack_int* ldab, double* anorm, double* rcond,
+                    double* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_cpbcon( char* uplo, lapack_int* n, lapack_int* kd,
+                    const lapack_complex_float* ab, lapack_int* ldab,
+                    float* anorm, float* rcond, lapack_complex_float* work,
+                    float* rwork, lapack_int *info );
+void LAPACK_zpbcon( char* uplo, lapack_int* n, lapack_int* kd,
+                    const lapack_complex_double* ab, lapack_int* ldab,
+                    double* anorm, double* rcond, lapack_complex_double* work,
+                    double* rwork, lapack_int *info );
+void LAPACK_sptcon( lapack_int* n, const float* d, const float* e, float* anorm,
+                    float* rcond, float* work, lapack_int *info );
+void LAPACK_dptcon( lapack_int* n, const double* d, const double* e,
+                    double* anorm, double* rcond, double* work,
+                    lapack_int *info );
+void LAPACK_cptcon( lapack_int* n, const float* d,
+                    const lapack_complex_float* e, float* anorm, float* rcond,
+                    float* work, lapack_int *info );
+void LAPACK_zptcon( lapack_int* n, const double* d,
+                    const lapack_complex_double* e, double* anorm,
+                    double* rcond, double* work, lapack_int *info );
+void LAPACK_ssycon( char* uplo, lapack_int* n, const float* a, lapack_int* lda,
+                    const lapack_int* ipiv, float* anorm, float* rcond,
+                    float* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_dsycon( char* uplo, lapack_int* n, const double* a, lapack_int* lda,
+                    const lapack_int* ipiv, double* anorm, double* rcond,
+                    double* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_csycon( char* uplo, lapack_int* n, const lapack_complex_float* a,
+                    lapack_int* lda, const lapack_int* ipiv, float* anorm,
+                    float* rcond, lapack_complex_float* work,
+                    lapack_int *info );
+void LAPACK_zsycon( char* uplo, lapack_int* n, const lapack_complex_double* a,
+                    lapack_int* lda, const lapack_int* ipiv, double* anorm,
+                    double* rcond, lapack_complex_double* work,
+                    lapack_int *info );
+void LAPACK_checon( char* uplo, lapack_int* n, const lapack_complex_float* a,
+                    lapack_int* lda, const lapack_int* ipiv, float* anorm,
+                    float* rcond, lapack_complex_float* work,
+                    lapack_int *info );
+void LAPACK_zhecon( char* uplo, lapack_int* n, const lapack_complex_double* a,
+                    lapack_int* lda, const lapack_int* ipiv, double* anorm,
+                    double* rcond, lapack_complex_double* work,
+                    lapack_int *info );
+void LAPACK_sspcon( char* uplo, lapack_int* n, const float* ap,
+                    const lapack_int* ipiv, float* anorm, float* rcond,
+                    float* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_dspcon( char* uplo, lapack_int* n, const double* ap,
+                    const lapack_int* ipiv, double* anorm, double* rcond,
+                    double* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_cspcon( char* uplo, lapack_int* n, const lapack_complex_float* ap,
+                    const lapack_int* ipiv, float* anorm, float* rcond,
+                    lapack_complex_float* work, lapack_int *info );
+void LAPACK_zspcon( char* uplo, lapack_int* n, const lapack_complex_double* ap,
+                    const lapack_int* ipiv, double* anorm, double* rcond,
+                    lapack_complex_double* work, lapack_int *info );
+void LAPACK_chpcon( char* uplo, lapack_int* n, const lapack_complex_float* ap,
+                    const lapack_int* ipiv, float* anorm, float* rcond,
+                    lapack_complex_float* work, lapack_int *info );
+void LAPACK_zhpcon( char* uplo, lapack_int* n, const lapack_complex_double* ap,
+                    const lapack_int* ipiv, double* anorm, double* rcond,
+                    lapack_complex_double* work, lapack_int *info );
+void LAPACK_strcon( char* norm, char* uplo, char* diag, lapack_int* n,
+                    const float* a, lapack_int* lda, float* rcond, float* work,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_dtrcon( char* norm, char* uplo, char* diag, lapack_int* n,
+                    const double* a, lapack_int* lda, double* rcond,
+                    double* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_ctrcon( char* norm, char* uplo, char* diag, lapack_int* n,
+                    const lapack_complex_float* a, lapack_int* lda,
+                    float* rcond, lapack_complex_float* work, float* rwork,
+                    lapack_int *info );
+void LAPACK_ztrcon( char* norm, char* uplo, char* diag, lapack_int* n,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    double* rcond, lapack_complex_double* work, double* rwork,
+                    lapack_int *info );
+void LAPACK_stpcon( char* norm, char* uplo, char* diag, lapack_int* n,
+                    const float* ap, float* rcond, float* work,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_dtpcon( char* norm, char* uplo, char* diag, lapack_int* n,
+                    const double* ap, double* rcond, double* work,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_ctpcon( char* norm, char* uplo, char* diag, lapack_int* n,
+                    const lapack_complex_float* ap, float* rcond,
+                    lapack_complex_float* work, float* rwork,
+                    lapack_int *info );
+void LAPACK_ztpcon( char* norm, char* uplo, char* diag, lapack_int* n,
+                    const lapack_complex_double* ap, double* rcond,
+                    lapack_complex_double* work, double* rwork,
+                    lapack_int *info );
+void LAPACK_stbcon( char* norm, char* uplo, char* diag, lapack_int* n,
+                    lapack_int* kd, const float* ab, lapack_int* ldab,
+                    float* rcond, float* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_dtbcon( char* norm, char* uplo, char* diag, lapack_int* n,
+                    lapack_int* kd, const double* ab, lapack_int* ldab,
+                    double* rcond, double* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_ctbcon( char* norm, char* uplo, char* diag, lapack_int* n,
+                    lapack_int* kd, const lapack_complex_float* ab,
+                    lapack_int* ldab, float* rcond, lapack_complex_float* work,
+                    float* rwork, lapack_int *info );
+void LAPACK_ztbcon( char* norm, char* uplo, char* diag, lapack_int* n,
+                    lapack_int* kd, const lapack_complex_double* ab,
+                    lapack_int* ldab, double* rcond,
+                    lapack_complex_double* work, double* rwork,
+                    lapack_int *info );
+void LAPACK_sgerfs( char* trans, lapack_int* n, lapack_int* nrhs,
+                    const float* a, lapack_int* lda, const float* af,
+                    lapack_int* ldaf, const lapack_int* ipiv, const float* b,
+                    lapack_int* ldb, float* x, lapack_int* ldx, float* ferr,
+                    float* berr, float* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_dgerfs( char* trans, lapack_int* n, lapack_int* nrhs,
+                    const double* a, lapack_int* lda, const double* af,
+                    lapack_int* ldaf, const lapack_int* ipiv, const double* b,
+                    lapack_int* ldb, double* x, lapack_int* ldx, double* ferr,
+                    double* berr, double* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_cgerfs( char* trans, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* a, lapack_int* lda,
+                    const lapack_complex_float* af, lapack_int* ldaf,
+                    const lapack_int* ipiv, const lapack_complex_float* b,
+                    lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
+                    float* ferr, float* berr, lapack_complex_float* work,
+                    float* rwork, lapack_int *info );
+void LAPACK_zgerfs( char* trans, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    const lapack_complex_double* af, lapack_int* ldaf,
+                    const lapack_int* ipiv, const lapack_complex_double* b,
+                    lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
+                    double* ferr, double* berr, lapack_complex_double* work,
+                    double* rwork, lapack_int *info );
+void LAPACK_dgerfsx( char* trans, char* equed, lapack_int* n, lapack_int* nrhs,
+                     const double* a, lapack_int* lda, const double* af,
+                     lapack_int* ldaf, const lapack_int* ipiv, const double* r,
+                     const double* c, const double* b, lapack_int* ldb,
+                     double* x, lapack_int* ldx, double* rcond, double* berr,
+                     lapack_int* n_err_bnds, double* err_bnds_norm,
+                     double* err_bnds_comp, lapack_int* nparams, double* params,
+                     double* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_sgerfsx( char* trans, char* equed, lapack_int* n, lapack_int* nrhs,
+                     const float* a, lapack_int* lda, const float* af,
+                     lapack_int* ldaf, const lapack_int* ipiv, const float* r,
+                     const float* c, const float* b, lapack_int* ldb, float* x,
+                     lapack_int* ldx, float* rcond, float* berr,
+                     lapack_int* n_err_bnds, float* err_bnds_norm,
+                     float* err_bnds_comp, lapack_int* nparams, float* params,
+                     float* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_zgerfsx( char* trans, char* equed, lapack_int* n, lapack_int* nrhs,
+                     const lapack_complex_double* a, lapack_int* lda,
+                     const lapack_complex_double* af, lapack_int* ldaf,
+                     const lapack_int* ipiv, const double* r, const double* c,
+                     const lapack_complex_double* b, lapack_int* ldb,
+                     lapack_complex_double* x, lapack_int* ldx, double* rcond,
+                     double* berr, lapack_int* n_err_bnds,
+                     double* err_bnds_norm, double* err_bnds_comp,
+                     lapack_int* nparams, double* params,
+                     lapack_complex_double* work, double* rwork,
+                     lapack_int *info );
+void LAPACK_cgerfsx( char* trans, char* equed, lapack_int* n, lapack_int* nrhs,
+                     const lapack_complex_float* a, lapack_int* lda,
+                     const lapack_complex_float* af, lapack_int* ldaf,
+                     const lapack_int* ipiv, const float* r, const float* c,
+                     const lapack_complex_float* b, lapack_int* ldb,
+                     lapack_complex_float* x, lapack_int* ldx, float* rcond,
+                     float* berr, lapack_int* n_err_bnds, float* err_bnds_norm,
+                     float* err_bnds_comp, lapack_int* nparams, float* params,
+                     lapack_complex_float* work, float* rwork,
+                     lapack_int *info );
+void LAPACK_sgbrfs( char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku,
+                    lapack_int* nrhs, const float* ab, lapack_int* ldab,
+                    const float* afb, lapack_int* ldafb, const lapack_int* ipiv,
+                    const float* b, lapack_int* ldb, float* x, lapack_int* ldx,
+                    float* ferr, float* berr, float* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_dgbrfs( char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku,
+                    lapack_int* nrhs, const double* ab, lapack_int* ldab,
+                    const double* afb, lapack_int* ldafb,
+                    const lapack_int* ipiv, const double* b, lapack_int* ldb,
+                    double* x, lapack_int* ldx, double* ferr, double* berr,
+                    double* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_cgbrfs( char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku,
+                    lapack_int* nrhs, const lapack_complex_float* ab,
+                    lapack_int* ldab, const lapack_complex_float* afb,
+                    lapack_int* ldafb, const lapack_int* ipiv,
+                    const lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* x, lapack_int* ldx, float* ferr,
+                    float* berr, lapack_complex_float* work, float* rwork,
+                    lapack_int *info );
+void LAPACK_zgbrfs( char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku,
+                    lapack_int* nrhs, const lapack_complex_double* ab,
+                    lapack_int* ldab, const lapack_complex_double* afb,
+                    lapack_int* ldafb, const lapack_int* ipiv,
+                    const lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* x, lapack_int* ldx, double* ferr,
+                    double* berr, lapack_complex_double* work, double* rwork,
+                    lapack_int *info );
+void LAPACK_dgbrfsx( char* trans, char* equed, lapack_int* n, lapack_int* kl,
+                     lapack_int* ku, lapack_int* nrhs, const double* ab,
+                     lapack_int* ldab, const double* afb, lapack_int* ldafb,
+                     const lapack_int* ipiv, const double* r, const double* c,
+                     const double* b, lapack_int* ldb, double* x,
+                     lapack_int* ldx, double* rcond, double* berr,
+                     lapack_int* n_err_bnds, double* err_bnds_norm,
+                     double* err_bnds_comp, lapack_int* nparams, double* params,
+                     double* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_sgbrfsx( char* trans, char* equed, lapack_int* n, lapack_int* kl,
+                     lapack_int* ku, lapack_int* nrhs, const float* ab,
+                     lapack_int* ldab, const float* afb, lapack_int* ldafb,
+                     const lapack_int* ipiv, const float* r, const float* c,
+                     const float* b, lapack_int* ldb, float* x, lapack_int* ldx,
+                     float* rcond, float* berr, lapack_int* n_err_bnds,
+                     float* err_bnds_norm, float* err_bnds_comp,
+                     lapack_int* nparams, float* params, float* work,
+                     lapack_int* iwork, lapack_int *info );
+void LAPACK_zgbrfsx( char* trans, char* equed, lapack_int* n, lapack_int* kl,
+                     lapack_int* ku, lapack_int* nrhs,
+                     const lapack_complex_double* ab, lapack_int* ldab,
+                     const lapack_complex_double* afb, lapack_int* ldafb,
+                     const lapack_int* ipiv, const double* r, const double* c,
+                     const lapack_complex_double* b, lapack_int* ldb,
+                     lapack_complex_double* x, lapack_int* ldx, double* rcond,
+                     double* berr, lapack_int* n_err_bnds,
+                     double* err_bnds_norm, double* err_bnds_comp,
+                     lapack_int* nparams, double* params,
+                     lapack_complex_double* work, double* rwork,
+                     lapack_int *info );
+void LAPACK_cgbrfsx( char* trans, char* equed, lapack_int* n, lapack_int* kl,
+                     lapack_int* ku, lapack_int* nrhs,
+                     const lapack_complex_float* ab, lapack_int* ldab,
+                     const lapack_complex_float* afb, lapack_int* ldafb,
+                     const lapack_int* ipiv, const float* r, const float* c,
+                     const lapack_complex_float* b, lapack_int* ldb,
+                     lapack_complex_float* x, lapack_int* ldx, float* rcond,
+                     float* berr, lapack_int* n_err_bnds, float* err_bnds_norm,
+                     float* err_bnds_comp, lapack_int* nparams, float* params,
+                     lapack_complex_float* work, float* rwork,
+                     lapack_int *info );
+void LAPACK_sgtrfs( char* trans, lapack_int* n, lapack_int* nrhs,
+                    const float* dl, const float* d, const float* du,
+                    const float* dlf, const float* df, const float* duf,
+                    const float* du2, const lapack_int* ipiv, const float* b,
+                    lapack_int* ldb, float* x, lapack_int* ldx, float* ferr,
+                    float* berr, float* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_dgtrfs( char* trans, lapack_int* n, lapack_int* nrhs,
+                    const double* dl, const double* d, const double* du,
+                    const double* dlf, const double* df, const double* duf,
+                    const double* du2, const lapack_int* ipiv, const double* b,
+                    lapack_int* ldb, double* x, lapack_int* ldx, double* ferr,
+                    double* berr, double* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_cgtrfs( char* trans, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* dl,
+                    const lapack_complex_float* d,
+                    const lapack_complex_float* du,
+                    const lapack_complex_float* dlf,
+                    const lapack_complex_float* df,
+                    const lapack_complex_float* duf,
+                    const lapack_complex_float* du2, const lapack_int* ipiv,
+                    const lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* x, lapack_int* ldx, float* ferr,
+                    float* berr, lapack_complex_float* work, float* rwork,
+                    lapack_int *info );
+void LAPACK_zgtrfs( char* trans, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* dl,
+                    const lapack_complex_double* d,
+                    const lapack_complex_double* du,
+                    const lapack_complex_double* dlf,
+                    const lapack_complex_double* df,
+                    const lapack_complex_double* duf,
+                    const lapack_complex_double* du2, const lapack_int* ipiv,
+                    const lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* x, lapack_int* ldx, double* ferr,
+                    double* berr, lapack_complex_double* work, double* rwork,
+                    lapack_int *info );
+void LAPACK_sporfs( char* uplo, lapack_int* n, lapack_int* nrhs, const float* a,
+                    lapack_int* lda, const float* af, lapack_int* ldaf,
+                    const float* b, lapack_int* ldb, float* x, lapack_int* ldx,
+                    float* ferr, float* berr, float* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_dporfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const double* a, lapack_int* lda, const double* af,
+                    lapack_int* ldaf, const double* b, lapack_int* ldb,
+                    double* x, lapack_int* ldx, double* ferr, double* berr,
+                    double* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_cporfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* a, lapack_int* lda,
+                    const lapack_complex_float* af, lapack_int* ldaf,
+                    const lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* x, lapack_int* ldx, float* ferr,
+                    float* berr, lapack_complex_float* work, float* rwork,
+                    lapack_int *info );
+void LAPACK_zporfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    const lapack_complex_double* af, lapack_int* ldaf,
+                    const lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* x, lapack_int* ldx, double* ferr,
+                    double* berr, lapack_complex_double* work, double* rwork,
+                    lapack_int *info );
+void LAPACK_dporfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
+                     const double* a, lapack_int* lda, const double* af,
+                     lapack_int* ldaf, const double* s, const double* b,
+                     lapack_int* ldb, double* x, lapack_int* ldx, double* rcond,
+                     double* berr, lapack_int* n_err_bnds,
+                     double* err_bnds_norm, double* err_bnds_comp,
+                     lapack_int* nparams, double* params, double* work,
+                     lapack_int* iwork, lapack_int *info );
+void LAPACK_sporfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
+                     const float* a, lapack_int* lda, const float* af,
+                     lapack_int* ldaf, const float* s, const float* b,
+                     lapack_int* ldb, float* x, lapack_int* ldx, float* rcond,
+                     float* berr, lapack_int* n_err_bnds, float* err_bnds_norm,
+                     float* err_bnds_comp, lapack_int* nparams, float* params,
+                     float* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_zporfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
+                     const lapack_complex_double* a, lapack_int* lda,
+                     const lapack_complex_double* af, lapack_int* ldaf,
+                     const double* s, const lapack_complex_double* b,
+                     lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
+                     double* rcond, double* berr, lapack_int* n_err_bnds,
+                     double* err_bnds_norm, double* err_bnds_comp,
+                     lapack_int* nparams, double* params,
+                     lapack_complex_double* work, double* rwork,
+                     lapack_int *info );
+void LAPACK_cporfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
+                     const lapack_complex_float* a, lapack_int* lda,
+                     const lapack_complex_float* af, lapack_int* ldaf,
+                     const float* s, const lapack_complex_float* b,
+                     lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
+                     float* rcond, float* berr, lapack_int* n_err_bnds,
+                     float* err_bnds_norm, float* err_bnds_comp,
+                     lapack_int* nparams, float* params,
+                     lapack_complex_float* work, float* rwork,
+                     lapack_int *info );
+void LAPACK_spprfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const float* ap, const float* afp, const float* b,
+                    lapack_int* ldb, float* x, lapack_int* ldx, float* ferr,
+                    float* berr, float* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_dpprfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const double* ap, const double* afp, const double* b,
+                    lapack_int* ldb, double* x, lapack_int* ldx, double* ferr,
+                    double* berr, double* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_cpprfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* ap,
+                    const lapack_complex_float* afp,
+                    const lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* x, lapack_int* ldx, float* ferr,
+                    float* berr, lapack_complex_float* work, float* rwork,
+                    lapack_int *info );
+void LAPACK_zpprfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* ap,
+                    const lapack_complex_double* afp,
+                    const lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* x, lapack_int* ldx, double* ferr,
+                    double* berr, lapack_complex_double* work, double* rwork,
+                    lapack_int *info );
+void LAPACK_spbrfs( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
+                    const float* ab, lapack_int* ldab, const float* afb,
+                    lapack_int* ldafb, const float* b, lapack_int* ldb,
+                    float* x, lapack_int* ldx, float* ferr, float* berr,
+                    float* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_dpbrfs( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
+                    const double* ab, lapack_int* ldab, const double* afb,
+                    lapack_int* ldafb, const double* b, lapack_int* ldb,
+                    double* x, lapack_int* ldx, double* ferr, double* berr,
+                    double* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_cpbrfs( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
+                    const lapack_complex_float* ab, lapack_int* ldab,
+                    const lapack_complex_float* afb, lapack_int* ldafb,
+                    const lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* x, lapack_int* ldx, float* ferr,
+                    float* berr, lapack_complex_float* work, float* rwork,
+                    lapack_int *info );
+void LAPACK_zpbrfs( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
+                    const lapack_complex_double* ab, lapack_int* ldab,
+                    const lapack_complex_double* afb, lapack_int* ldafb,
+                    const lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* x, lapack_int* ldx, double* ferr,
+                    double* berr, lapack_complex_double* work, double* rwork,
+                    lapack_int *info );
+void LAPACK_sptrfs( lapack_int* n, lapack_int* nrhs, const float* d,
+                    const float* e, const float* df, const float* ef,
+                    const float* b, lapack_int* ldb, float* x, lapack_int* ldx,
+                    float* ferr, float* berr, float* work, lapack_int *info );
+void LAPACK_dptrfs( lapack_int* n, lapack_int* nrhs, const double* d,
+                    const double* e, const double* df, const double* ef,
+                    const double* b, lapack_int* ldb, double* x,
+                    lapack_int* ldx, double* ferr, double* berr, double* work,
+                    lapack_int *info );
+void LAPACK_cptrfs( char* uplo, lapack_int* n, lapack_int* nrhs, const float* d,
+                    const lapack_complex_float* e, const float* df,
+                    const lapack_complex_float* ef,
+                    const lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* x, lapack_int* ldx, float* ferr,
+                    float* berr, lapack_complex_float* work, float* rwork,
+                    lapack_int *info );
+void LAPACK_zptrfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const double* d, const lapack_complex_double* e,
+                    const double* df, const lapack_complex_double* ef,
+                    const lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* x, lapack_int* ldx, double* ferr,
+                    double* berr, lapack_complex_double* work, double* rwork,
+                    lapack_int *info );
+void LAPACK_ssyrfs( char* uplo, lapack_int* n, lapack_int* nrhs, const float* a,
+                    lapack_int* lda, const float* af, lapack_int* ldaf,
+                    const lapack_int* ipiv, const float* b, lapack_int* ldb,
+                    float* x, lapack_int* ldx, float* ferr, float* berr,
+                    float* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_dsyrfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const double* a, lapack_int* lda, const double* af,
+                    lapack_int* ldaf, const lapack_int* ipiv, const double* b,
+                    lapack_int* ldb, double* x, lapack_int* ldx, double* ferr,
+                    double* berr, double* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_csyrfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* a, lapack_int* lda,
+                    const lapack_complex_float* af, lapack_int* ldaf,
+                    const lapack_int* ipiv, const lapack_complex_float* b,
+                    lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
+                    float* ferr, float* berr, lapack_complex_float* work,
+                    float* rwork, lapack_int *info );
+void LAPACK_zsyrfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    const lapack_complex_double* af, lapack_int* ldaf,
+                    const lapack_int* ipiv, const lapack_complex_double* b,
+                    lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
+                    double* ferr, double* berr, lapack_complex_double* work,
+                    double* rwork, lapack_int *info );
+void LAPACK_dsyrfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
+                     const double* a, lapack_int* lda, const double* af,
+                     lapack_int* ldaf, const lapack_int* ipiv, const double* s,
+                     const double* b, lapack_int* ldb, double* x,
+                     lapack_int* ldx, double* rcond, double* berr,
+                     lapack_int* n_err_bnds, double* err_bnds_norm,
+                     double* err_bnds_comp, lapack_int* nparams, double* params,
+                     double* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_ssyrfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
+                     const float* a, lapack_int* lda, const float* af,
+                     lapack_int* ldaf, const lapack_int* ipiv, const float* s,
+                     const float* b, lapack_int* ldb, float* x, lapack_int* ldx,
+                     float* rcond, float* berr, lapack_int* n_err_bnds,
+                     float* err_bnds_norm, float* err_bnds_comp,
+                     lapack_int* nparams, float* params, float* work,
+                     lapack_int* iwork, lapack_int *info );
+void LAPACK_zsyrfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
+                     const lapack_complex_double* a, lapack_int* lda,
+                     const lapack_complex_double* af, lapack_int* ldaf,
+                     const lapack_int* ipiv, const double* s,
+                     const lapack_complex_double* b, lapack_int* ldb,
+                     lapack_complex_double* x, lapack_int* ldx, double* rcond,
+                     double* berr, lapack_int* n_err_bnds,
+                     double* err_bnds_norm, double* err_bnds_comp,
+                     lapack_int* nparams, double* params,
+                     lapack_complex_double* work, double* rwork,
+                     lapack_int *info );
+void LAPACK_csyrfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
+                     const lapack_complex_float* a, lapack_int* lda,
+                     const lapack_complex_float* af, lapack_int* ldaf,
+                     const lapack_int* ipiv, const float* s,
+                     const lapack_complex_float* b, lapack_int* ldb,
+                     lapack_complex_float* x, lapack_int* ldx, float* rcond,
+                     float* berr, lapack_int* n_err_bnds, float* err_bnds_norm,
+                     float* err_bnds_comp, lapack_int* nparams, float* params,
+                     lapack_complex_float* work, float* rwork,
+                     lapack_int *info );
+void LAPACK_cherfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* a, lapack_int* lda,
+                    const lapack_complex_float* af, lapack_int* ldaf,
+                    const lapack_int* ipiv, const lapack_complex_float* b,
+                    lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
+                    float* ferr, float* berr, lapack_complex_float* work,
+                    float* rwork, lapack_int *info );
+void LAPACK_zherfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    const lapack_complex_double* af, lapack_int* ldaf,
+                    const lapack_int* ipiv, const lapack_complex_double* b,
+                    lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
+                    double* ferr, double* berr, lapack_complex_double* work,
+                    double* rwork, lapack_int *info );
+void LAPACK_zherfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
+                     const lapack_complex_double* a, lapack_int* lda,
+                     const lapack_complex_double* af, lapack_int* ldaf,
+                     const lapack_int* ipiv, const double* s,
+                     const lapack_complex_double* b, lapack_int* ldb,
+                     lapack_complex_double* x, lapack_int* ldx, double* rcond,
+                     double* berr, lapack_int* n_err_bnds,
+                     double* err_bnds_norm, double* err_bnds_comp,
+                     lapack_int* nparams, double* params,
+                     lapack_complex_double* work, double* rwork,
+                     lapack_int *info );
+void LAPACK_cherfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
+                     const lapack_complex_float* a, lapack_int* lda,
+                     const lapack_complex_float* af, lapack_int* ldaf,
+                     const lapack_int* ipiv, const float* s,
+                     const lapack_complex_float* b, lapack_int* ldb,
+                     lapack_complex_float* x, lapack_int* ldx, float* rcond,
+                     float* berr, lapack_int* n_err_bnds, float* err_bnds_norm,
+                     float* err_bnds_comp, lapack_int* nparams, float* params,
+                     lapack_complex_float* work, float* rwork,
+                     lapack_int *info );
+void LAPACK_ssprfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const float* ap, const float* afp, const lapack_int* ipiv,
+                    const float* b, lapack_int* ldb, float* x, lapack_int* ldx,
+                    float* ferr, float* berr, float* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_dsprfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const double* ap, const double* afp, const lapack_int* ipiv,
+                    const double* b, lapack_int* ldb, double* x,
+                    lapack_int* ldx, double* ferr, double* berr, double* work,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_csprfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* ap,
+                    const lapack_complex_float* afp, const lapack_int* ipiv,
+                    const lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* x, lapack_int* ldx, float* ferr,
+                    float* berr, lapack_complex_float* work, float* rwork,
+                    lapack_int *info );
+void LAPACK_zsprfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* ap,
+                    const lapack_complex_double* afp, const lapack_int* ipiv,
+                    const lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* x, lapack_int* ldx, double* ferr,
+                    double* berr, lapack_complex_double* work, double* rwork,
+                    lapack_int *info );
+void LAPACK_chprfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* ap,
+                    const lapack_complex_float* afp, const lapack_int* ipiv,
+                    const lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* x, lapack_int* ldx, float* ferr,
+                    float* berr, lapack_complex_float* work, float* rwork,
+                    lapack_int *info );
+void LAPACK_zhprfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* ap,
+                    const lapack_complex_double* afp, const lapack_int* ipiv,
+                    const lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* x, lapack_int* ldx, double* ferr,
+                    double* berr, lapack_complex_double* work, double* rwork,
+                    lapack_int *info );
+void LAPACK_strrfs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* nrhs, const float* a, lapack_int* lda,
+                    const float* b, lapack_int* ldb, const float* x,
+                    lapack_int* ldx, float* ferr, float* berr, float* work,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_dtrrfs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* nrhs, const double* a, lapack_int* lda,
+                    const double* b, lapack_int* ldb, const double* x,
+                    lapack_int* ldx, double* ferr, double* berr, double* work,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_ctrrfs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* nrhs, const lapack_complex_float* a,
+                    lapack_int* lda, const lapack_complex_float* b,
+                    lapack_int* ldb, const lapack_complex_float* x,
+                    lapack_int* ldx, float* ferr, float* berr,
+                    lapack_complex_float* work, float* rwork,
+                    lapack_int *info );
+void LAPACK_ztrrfs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* nrhs, const lapack_complex_double* a,
+                    lapack_int* lda, const lapack_complex_double* b,
+                    lapack_int* ldb, const lapack_complex_double* x,
+                    lapack_int* ldx, double* ferr, double* berr,
+                    lapack_complex_double* work, double* rwork,
+                    lapack_int *info );
+void LAPACK_stprfs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* nrhs, const float* ap, const float* b,
+                    lapack_int* ldb, const float* x, lapack_int* ldx,
+                    float* ferr, float* berr, float* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_dtprfs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* nrhs, const double* ap, const double* b,
+                    lapack_int* ldb, const double* x, lapack_int* ldx,
+                    double* ferr, double* berr, double* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_ctprfs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* nrhs, const lapack_complex_float* ap,
+                    const lapack_complex_float* b, lapack_int* ldb,
+                    const lapack_complex_float* x, lapack_int* ldx, float* ferr,
+                    float* berr, lapack_complex_float* work, float* rwork,
+                    lapack_int *info );
+void LAPACK_ztprfs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* nrhs, const lapack_complex_double* ap,
+                    const lapack_complex_double* b, lapack_int* ldb,
+                    const lapack_complex_double* x, lapack_int* ldx,
+                    double* ferr, double* berr, lapack_complex_double* work,
+                    double* rwork, lapack_int *info );
+void LAPACK_stbrfs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* kd, lapack_int* nrhs, const float* ab,
+                    lapack_int* ldab, const float* b, lapack_int* ldb,
+                    const float* x, lapack_int* ldx, float* ferr, float* berr,
+                    float* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_dtbrfs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* kd, lapack_int* nrhs, const double* ab,
+                    lapack_int* ldab, const double* b, lapack_int* ldb,
+                    const double* x, lapack_int* ldx, double* ferr,
+                    double* berr, double* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_ctbrfs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* kd, lapack_int* nrhs,
+                    const lapack_complex_float* ab, lapack_int* ldab,
+                    const lapack_complex_float* b, lapack_int* ldb,
+                    const lapack_complex_float* x, lapack_int* ldx, float* ferr,
+                    float* berr, lapack_complex_float* work, float* rwork,
+                    lapack_int *info );
+void LAPACK_ztbrfs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* kd, lapack_int* nrhs,
+                    const lapack_complex_double* ab, lapack_int* ldab,
+                    const lapack_complex_double* b, lapack_int* ldb,
+                    const lapack_complex_double* x, lapack_int* ldx,
+                    double* ferr, double* berr, lapack_complex_double* work,
+                    double* rwork, lapack_int *info );
+void LAPACK_sgetri( lapack_int* n, float* a, lapack_int* lda,
+                    const lapack_int* ipiv, float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_dgetri( lapack_int* n, double* a, lapack_int* lda,
+                    const lapack_int* ipiv, double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_cgetri( lapack_int* n, lapack_complex_float* a, lapack_int* lda,
+                    const lapack_int* ipiv, lapack_complex_float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_zgetri( lapack_int* n, lapack_complex_double* a, lapack_int* lda,
+                    const lapack_int* ipiv, lapack_complex_double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_spotri( char* uplo, lapack_int* n, float* a, lapack_int* lda,
+                    lapack_int *info );
+void LAPACK_dpotri( char* uplo, lapack_int* n, double* a, lapack_int* lda,
+                    lapack_int *info );
+void LAPACK_cpotri( char* uplo, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_int *info );
+void LAPACK_zpotri( char* uplo, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_int *info );
+void LAPACK_dpftri( char* transr, char* uplo, lapack_int* n, double* a,
+                    lapack_int *info );
+void LAPACK_spftri( char* transr, char* uplo, lapack_int* n, float* a,
+                    lapack_int *info );
+void LAPACK_zpftri( char* transr, char* uplo, lapack_int* n,
+                    lapack_complex_double* a, lapack_int *info );
+void LAPACK_cpftri( char* transr, char* uplo, lapack_int* n,
+                    lapack_complex_float* a, lapack_int *info );
+void LAPACK_spptri( char* uplo, lapack_int* n, float* ap, lapack_int *info );
+void LAPACK_dpptri( char* uplo, lapack_int* n, double* ap, lapack_int *info );
+void LAPACK_cpptri( char* uplo, lapack_int* n, lapack_complex_float* ap,
+                    lapack_int *info );
+void LAPACK_zpptri( char* uplo, lapack_int* n, lapack_complex_double* ap,
+                    lapack_int *info );
+void LAPACK_ssytri( char* uplo, lapack_int* n, float* a, lapack_int* lda,
+                    const lapack_int* ipiv, float* work, lapack_int *info );
+void LAPACK_dsytri( char* uplo, lapack_int* n, double* a, lapack_int* lda,
+                    const lapack_int* ipiv, double* work, lapack_int *info );
+void LAPACK_csytri( char* uplo, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, const lapack_int* ipiv,
+                    lapack_complex_float* work, lapack_int *info );
+void LAPACK_zsytri( char* uplo, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, const lapack_int* ipiv,
+                    lapack_complex_double* work, lapack_int *info );
+void LAPACK_chetri( char* uplo, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, const lapack_int* ipiv,
+                    lapack_complex_float* work, lapack_int *info );
+void LAPACK_zhetri( char* uplo, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, const lapack_int* ipiv,
+                    lapack_complex_double* work, lapack_int *info );
+void LAPACK_ssptri( char* uplo, lapack_int* n, float* ap,
+                    const lapack_int* ipiv, float* work, lapack_int *info );
+void LAPACK_dsptri( char* uplo, lapack_int* n, double* ap,
+                    const lapack_int* ipiv, double* work, lapack_int *info );
+void LAPACK_csptri( char* uplo, lapack_int* n, lapack_complex_float* ap,
+                    const lapack_int* ipiv, lapack_complex_float* work,
+                    lapack_int *info );
+void LAPACK_zsptri( char* uplo, lapack_int* n, lapack_complex_double* ap,
+                    const lapack_int* ipiv, lapack_complex_double* work,
+                    lapack_int *info );
+void LAPACK_chptri( char* uplo, lapack_int* n, lapack_complex_float* ap,
+                    const lapack_int* ipiv, lapack_complex_float* work,
+                    lapack_int *info );
+void LAPACK_zhptri( char* uplo, lapack_int* n, lapack_complex_double* ap,
+                    const lapack_int* ipiv, lapack_complex_double* work,
+                    lapack_int *info );
+void LAPACK_strtri( char* uplo, char* diag, lapack_int* n, float* a,
+                    lapack_int* lda, lapack_int *info );
+void LAPACK_dtrtri( char* uplo, char* diag, lapack_int* n, double* a,
+                    lapack_int* lda, lapack_int *info );
+void LAPACK_ctrtri( char* uplo, char* diag, lapack_int* n,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_int *info );
+void LAPACK_ztrtri( char* uplo, char* diag, lapack_int* n,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_int *info );
+void LAPACK_dtftri( char* transr, char* uplo, char* diag, lapack_int* n,
+                    double* a, lapack_int *info );
+void LAPACK_stftri( char* transr, char* uplo, char* diag, lapack_int* n,
+                    float* a, lapack_int *info );
+void LAPACK_ztftri( char* transr, char* uplo, char* diag, lapack_int* n,
+                    lapack_complex_double* a, lapack_int *info );
+void LAPACK_ctftri( char* transr, char* uplo, char* diag, lapack_int* n,
+                    lapack_complex_float* a, lapack_int *info );
+void LAPACK_stptri( char* uplo, char* diag, lapack_int* n, float* ap,
+                    lapack_int *info );
+void LAPACK_dtptri( char* uplo, char* diag, lapack_int* n, double* ap,
+                    lapack_int *info );
+void LAPACK_ctptri( char* uplo, char* diag, lapack_int* n,
+                    lapack_complex_float* ap, lapack_int *info );
+void LAPACK_ztptri( char* uplo, char* diag, lapack_int* n,
+                    lapack_complex_double* ap, lapack_int *info );
+void LAPACK_sgeequ( lapack_int* m, lapack_int* n, const float* a,
+                    lapack_int* lda, float* r, float* c, float* rowcnd,
+                    float* colcnd, float* amax, lapack_int *info );
+void LAPACK_dgeequ( lapack_int* m, lapack_int* n, const double* a,
+                    lapack_int* lda, double* r, double* c, double* rowcnd,
+                    double* colcnd, double* amax, lapack_int *info );
+void LAPACK_cgeequ( lapack_int* m, lapack_int* n, const lapack_complex_float* a,
+                    lapack_int* lda, float* r, float* c, float* rowcnd,
+                    float* colcnd, float* amax, lapack_int *info );
+void LAPACK_zgeequ( lapack_int* m, lapack_int* n,
+                    const lapack_complex_double* a, lapack_int* lda, double* r,
+                    double* c, double* rowcnd, double* colcnd, double* amax,
+                    lapack_int *info );
+void LAPACK_dgeequb( lapack_int* m, lapack_int* n, const double* a,
+                     lapack_int* lda, double* r, double* c, double* rowcnd,
+                     double* colcnd, double* amax, lapack_int *info );
+void LAPACK_sgeequb( lapack_int* m, lapack_int* n, const float* a,
+                     lapack_int* lda, float* r, float* c, float* rowcnd,
+                     float* colcnd, float* amax, lapack_int *info );
+void LAPACK_zgeequb( lapack_int* m, lapack_int* n,
+                     const lapack_complex_double* a, lapack_int* lda, double* r,
+                     double* c, double* rowcnd, double* colcnd, double* amax,
+                     lapack_int *info );
+void LAPACK_cgeequb( lapack_int* m, lapack_int* n,
+                     const lapack_complex_float* a, lapack_int* lda, float* r,
+                     float* c, float* rowcnd, float* colcnd, float* amax,
+                     lapack_int *info );
+void LAPACK_sgbequ( lapack_int* m, lapack_int* n, lapack_int* kl,
+                    lapack_int* ku, const float* ab, lapack_int* ldab, float* r,
+                    float* c, float* rowcnd, float* colcnd, float* amax,
+                    lapack_int *info );
+void LAPACK_dgbequ( lapack_int* m, lapack_int* n, lapack_int* kl,
+                    lapack_int* ku, const double* ab, lapack_int* ldab,
+                    double* r, double* c, double* rowcnd, double* colcnd,
+                    double* amax, lapack_int *info );
+void LAPACK_cgbequ( lapack_int* m, lapack_int* n, lapack_int* kl,
+                    lapack_int* ku, const lapack_complex_float* ab,
+                    lapack_int* ldab, float* r, float* c, float* rowcnd,
+                    float* colcnd, float* amax, lapack_int *info );
+void LAPACK_zgbequ( lapack_int* m, lapack_int* n, lapack_int* kl,
+                    lapack_int* ku, const lapack_complex_double* ab,
+                    lapack_int* ldab, double* r, double* c, double* rowcnd,
+                    double* colcnd, double* amax, lapack_int *info );
+void LAPACK_dgbequb( lapack_int* m, lapack_int* n, lapack_int* kl,
+                     lapack_int* ku, const double* ab, lapack_int* ldab,
+                     double* r, double* c, double* rowcnd, double* colcnd,
+                     double* amax, lapack_int *info );
+void LAPACK_sgbequb( lapack_int* m, lapack_int* n, lapack_int* kl,
+                     lapack_int* ku, const float* ab, lapack_int* ldab,
+                     float* r, float* c, float* rowcnd, float* colcnd,
+                     float* amax, lapack_int *info );
+void LAPACK_zgbequb( lapack_int* m, lapack_int* n, lapack_int* kl,
+                     lapack_int* ku, const lapack_complex_double* ab,
+                     lapack_int* ldab, double* r, double* c, double* rowcnd,
+                     double* colcnd, double* amax, lapack_int *info );
+void LAPACK_cgbequb( lapack_int* m, lapack_int* n, lapack_int* kl,
+                     lapack_int* ku, const lapack_complex_float* ab,
+                     lapack_int* ldab, float* r, float* c, float* rowcnd,
+                     float* colcnd, float* amax, lapack_int *info );
+void LAPACK_spoequ( lapack_int* n, const float* a, lapack_int* lda, float* s,
+                    float* scond, float* amax, lapack_int *info );
+void LAPACK_dpoequ( lapack_int* n, const double* a, lapack_int* lda, double* s,
+                    double* scond, double* amax, lapack_int *info );
+void LAPACK_cpoequ( lapack_int* n, const lapack_complex_float* a,
+                    lapack_int* lda, float* s, float* scond, float* amax,
+                    lapack_int *info );
+void LAPACK_zpoequ( lapack_int* n, const lapack_complex_double* a,
+                    lapack_int* lda, double* s, double* scond, double* amax,
+                    lapack_int *info );
+void LAPACK_dpoequb( lapack_int* n, const double* a, lapack_int* lda, double* s,
+                     double* scond, double* amax, lapack_int *info );
+void LAPACK_spoequb( lapack_int* n, const float* a, lapack_int* lda, float* s,
+                     float* scond, float* amax, lapack_int *info );
+void LAPACK_zpoequb( lapack_int* n, const lapack_complex_double* a,
+                     lapack_int* lda, double* s, double* scond, double* amax,
+                     lapack_int *info );
+void LAPACK_cpoequb( lapack_int* n, const lapack_complex_float* a,
+                     lapack_int* lda, float* s, float* scond, float* amax,
+                     lapack_int *info );
+void LAPACK_sppequ( char* uplo, lapack_int* n, const float* ap, float* s,
+                    float* scond, float* amax, lapack_int *info );
+void LAPACK_dppequ( char* uplo, lapack_int* n, const double* ap, double* s,
+                    double* scond, double* amax, lapack_int *info );
+void LAPACK_cppequ( char* uplo, lapack_int* n, const lapack_complex_float* ap,
+                    float* s, float* scond, float* amax, lapack_int *info );
+void LAPACK_zppequ( char* uplo, lapack_int* n, const lapack_complex_double* ap,
+                    double* s, double* scond, double* amax, lapack_int *info );
+void LAPACK_spbequ( char* uplo, lapack_int* n, lapack_int* kd, const float* ab,
+                    lapack_int* ldab, float* s, float* scond, float* amax,
+                    lapack_int *info );
+void LAPACK_dpbequ( char* uplo, lapack_int* n, lapack_int* kd, const double* ab,
+                    lapack_int* ldab, double* s, double* scond, double* amax,
+                    lapack_int *info );
+void LAPACK_cpbequ( char* uplo, lapack_int* n, lapack_int* kd,
+                    const lapack_complex_float* ab, lapack_int* ldab, float* s,
+                    float* scond, float* amax, lapack_int *info );
+void LAPACK_zpbequ( char* uplo, lapack_int* n, lapack_int* kd,
+                    const lapack_complex_double* ab, lapack_int* ldab,
+                    double* s, double* scond, double* amax, lapack_int *info );
+void LAPACK_dsyequb( char* uplo, lapack_int* n, const double* a,
+                     lapack_int* lda, double* s, double* scond, double* amax,
+                     double* work, lapack_int *info );
+void LAPACK_ssyequb( char* uplo, lapack_int* n, const float* a, lapack_int* lda,
+                     float* s, float* scond, float* amax, float* work,
+                     lapack_int *info );
+void LAPACK_zsyequb( char* uplo, lapack_int* n, const lapack_complex_double* a,
+                     lapack_int* lda, double* s, double* scond, double* amax,
+                     lapack_complex_double* work, lapack_int *info );
+void LAPACK_csyequb( char* uplo, lapack_int* n, const lapack_complex_float* a,
+                     lapack_int* lda, float* s, float* scond, float* amax,
+                     lapack_complex_float* work, lapack_int *info );
+void LAPACK_zheequb( char* uplo, lapack_int* n, const lapack_complex_double* a,
+                     lapack_int* lda, double* s, double* scond, double* amax,
+                     lapack_complex_double* work, lapack_int *info );
+void LAPACK_cheequb( char* uplo, lapack_int* n, const lapack_complex_float* a,
+                     lapack_int* lda, float* s, float* scond, float* amax,
+                     lapack_complex_float* work, lapack_int *info );
+void LAPACK_sgesv( lapack_int* n, lapack_int* nrhs, float* a, lapack_int* lda,
+                   lapack_int* ipiv, float* b, lapack_int* ldb,
+                   lapack_int *info );
+void LAPACK_dgesv( lapack_int* n, lapack_int* nrhs, double* a, lapack_int* lda,
+                   lapack_int* ipiv, double* b, lapack_int* ldb,
+                   lapack_int *info );
+void LAPACK_cgesv( lapack_int* n, lapack_int* nrhs, lapack_complex_float* a,
+                   lapack_int* lda, lapack_int* ipiv, lapack_complex_float* b,
+                   lapack_int* ldb, lapack_int *info );
+void LAPACK_zgesv( lapack_int* n, lapack_int* nrhs, lapack_complex_double* a,
+                   lapack_int* lda, lapack_int* ipiv, lapack_complex_double* b,
+                   lapack_int* ldb, lapack_int *info );
+void LAPACK_dsgesv( lapack_int* n, lapack_int* nrhs, double* a, lapack_int* lda,
+                    lapack_int* ipiv, double* b, lapack_int* ldb, double* x,
+                    lapack_int* ldx, double* work, float* swork,
+                    lapack_int* iter, lapack_int *info );
+void LAPACK_zcgesv( lapack_int* n, lapack_int* nrhs, lapack_complex_double* a,
+                    lapack_int* lda, lapack_int* ipiv, lapack_complex_double* b,
+                    lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
+                    lapack_complex_double* work, lapack_complex_float* swork,
+                    double* rwork, lapack_int* iter, lapack_int *info );
+void LAPACK_sgesvx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
+                    float* a, lapack_int* lda, float* af, lapack_int* ldaf,
+                    lapack_int* ipiv, char* equed, float* r, float* c, float* b,
+                    lapack_int* ldb, float* x, lapack_int* ldx, float* rcond,
+                    float* ferr, float* berr, float* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_dgesvx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
+                    double* a, lapack_int* lda, double* af, lapack_int* ldaf,
+                    lapack_int* ipiv, char* equed, double* r, double* c,
+                    double* b, lapack_int* ldb, double* x, lapack_int* ldx,
+                    double* rcond, double* ferr, double* berr, double* work,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_cgesvx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* af, lapack_int* ldaf,
+                    lapack_int* ipiv, char* equed, float* r, float* c,
+                    lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* x, lapack_int* ldx, float* rcond,
+                    float* ferr, float* berr, lapack_complex_float* work,
+                    float* rwork, lapack_int *info );
+void LAPACK_zgesvx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* af, lapack_int* ldaf,
+                    lapack_int* ipiv, char* equed, double* r, double* c,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* x, lapack_int* ldx, double* rcond,
+                    double* ferr, double* berr, lapack_complex_double* work,
+                    double* rwork, lapack_int *info );
+void LAPACK_dgesvxx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
+                     double* a, lapack_int* lda, double* af, lapack_int* ldaf,
+                     lapack_int* ipiv, char* equed, double* r, double* c,
+                     double* b, lapack_int* ldb, double* x, lapack_int* ldx,
+                     double* rcond, double* rpvgrw, double* berr,
+                     lapack_int* n_err_bnds, double* err_bnds_norm,
+                     double* err_bnds_comp, lapack_int* nparams, double* params,
+                     double* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_sgesvxx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
+                     float* a, lapack_int* lda, float* af, lapack_int* ldaf,
+                     lapack_int* ipiv, char* equed, float* r, float* c,
+                     float* b, lapack_int* ldb, float* x, lapack_int* ldx,
+                     float* rcond, float* rpvgrw, float* berr,
+                     lapack_int* n_err_bnds, float* err_bnds_norm,
+                     float* err_bnds_comp, lapack_int* nparams, float* params,
+                     float* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_zgesvxx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
+                     lapack_complex_double* a, lapack_int* lda,
+                     lapack_complex_double* af, lapack_int* ldaf,
+                     lapack_int* ipiv, char* equed, double* r, double* c,
+                     lapack_complex_double* b, lapack_int* ldb,
+                     lapack_complex_double* x, lapack_int* ldx, double* rcond,
+                     double* rpvgrw, double* berr, lapack_int* n_err_bnds,
+                     double* err_bnds_norm, double* err_bnds_comp,
+                     lapack_int* nparams, double* params,
+                     lapack_complex_double* work, double* rwork,
+                     lapack_int *info );
+void LAPACK_cgesvxx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
+                     lapack_complex_float* a, lapack_int* lda,
+                     lapack_complex_float* af, lapack_int* ldaf,
+                     lapack_int* ipiv, char* equed, float* r, float* c,
+                     lapack_complex_float* b, lapack_int* ldb,
+                     lapack_complex_float* x, lapack_int* ldx, float* rcond,
+                     float* rpvgrw, float* berr, lapack_int* n_err_bnds,
+                     float* err_bnds_norm, float* err_bnds_comp,
+                     lapack_int* nparams, float* params,
+                     lapack_complex_float* work, float* rwork,
+                     lapack_int *info );
+void LAPACK_sgbsv( lapack_int* n, lapack_int* kl, lapack_int* ku,
+                   lapack_int* nrhs, float* ab, lapack_int* ldab,
+                   lapack_int* ipiv, float* b, lapack_int* ldb,
+                   lapack_int *info );
+void LAPACK_dgbsv( lapack_int* n, lapack_int* kl, lapack_int* ku,
+                   lapack_int* nrhs, double* ab, lapack_int* ldab,
+                   lapack_int* ipiv, double* b, lapack_int* ldb,
+                   lapack_int *info );
+void LAPACK_cgbsv( lapack_int* n, lapack_int* kl, lapack_int* ku,
+                   lapack_int* nrhs, lapack_complex_float* ab, lapack_int* ldab,
+                   lapack_int* ipiv, lapack_complex_float* b, lapack_int* ldb,
+                   lapack_int *info );
+void LAPACK_zgbsv( lapack_int* n, lapack_int* kl, lapack_int* ku,
+                   lapack_int* nrhs, lapack_complex_double* ab,
+                   lapack_int* ldab, lapack_int* ipiv, lapack_complex_double* b,
+                   lapack_int* ldb, lapack_int *info );
+void LAPACK_sgbsvx( char* fact, char* trans, lapack_int* n, lapack_int* kl,
+                    lapack_int* ku, lapack_int* nrhs, float* ab,
+                    lapack_int* ldab, float* afb, lapack_int* ldafb,
+                    lapack_int* ipiv, char* equed, float* r, float* c, float* b,
+                    lapack_int* ldb, float* x, lapack_int* ldx, float* rcond,
+                    float* ferr, float* berr, float* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_dgbsvx( char* fact, char* trans, lapack_int* n, lapack_int* kl,
+                    lapack_int* ku, lapack_int* nrhs, double* ab,
+                    lapack_int* ldab, double* afb, lapack_int* ldafb,
+                    lapack_int* ipiv, char* equed, double* r, double* c,
+                    double* b, lapack_int* ldb, double* x, lapack_int* ldx,
+                    double* rcond, double* ferr, double* berr, double* work,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_cgbsvx( char* fact, char* trans, lapack_int* n, lapack_int* kl,
+                    lapack_int* ku, lapack_int* nrhs, lapack_complex_float* ab,
+                    lapack_int* ldab, lapack_complex_float* afb,
+                    lapack_int* ldafb, lapack_int* ipiv, char* equed, float* r,
+                    float* c, lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* x, lapack_int* ldx, float* rcond,
+                    float* ferr, float* berr, lapack_complex_float* work,
+                    float* rwork, lapack_int *info );
+void LAPACK_zgbsvx( char* fact, char* trans, lapack_int* n, lapack_int* kl,
+                    lapack_int* ku, lapack_int* nrhs, lapack_complex_double* ab,
+                    lapack_int* ldab, lapack_complex_double* afb,
+                    lapack_int* ldafb, lapack_int* ipiv, char* equed, double* r,
+                    double* c, lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* x, lapack_int* ldx, double* rcond,
+                    double* ferr, double* berr, lapack_complex_double* work,
+                    double* rwork, lapack_int *info );
+void LAPACK_dgbsvxx( char* fact, char* trans, lapack_int* n, lapack_int* kl,
+                     lapack_int* ku, lapack_int* nrhs, double* ab,
+                     lapack_int* ldab, double* afb, lapack_int* ldafb,
+                     lapack_int* ipiv, char* equed, double* r, double* c,
+                     double* b, lapack_int* ldb, double* x, lapack_int* ldx,
+                     double* rcond, double* rpvgrw, double* berr,
+                     lapack_int* n_err_bnds, double* err_bnds_norm,
+                     double* err_bnds_comp, lapack_int* nparams, double* params,
+                     double* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_sgbsvxx( char* fact, char* trans, lapack_int* n, lapack_int* kl,
+                     lapack_int* ku, lapack_int* nrhs, float* ab,
+                     lapack_int* ldab, float* afb, lapack_int* ldafb,
+                     lapack_int* ipiv, char* equed, float* r, float* c,
+                     float* b, lapack_int* ldb, float* x, lapack_int* ldx,
+                     float* rcond, float* rpvgrw, float* berr,
+                     lapack_int* n_err_bnds, float* err_bnds_norm,
+                     float* err_bnds_comp, lapack_int* nparams, float* params,
+                     float* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_zgbsvxx( char* fact, char* trans, lapack_int* n, lapack_int* kl,
+                     lapack_int* ku, lapack_int* nrhs,
+                     lapack_complex_double* ab, lapack_int* ldab,
+                     lapack_complex_double* afb, lapack_int* ldafb,
+                     lapack_int* ipiv, char* equed, double* r, double* c,
+                     lapack_complex_double* b, lapack_int* ldb,
+                     lapack_complex_double* x, lapack_int* ldx, double* rcond,
+                     double* rpvgrw, double* berr, lapack_int* n_err_bnds,
+                     double* err_bnds_norm, double* err_bnds_comp,
+                     lapack_int* nparams, double* params,
+                     lapack_complex_double* work, double* rwork,
+                     lapack_int *info );
+void LAPACK_cgbsvxx( char* fact, char* trans, lapack_int* n, lapack_int* kl,
+                     lapack_int* ku, lapack_int* nrhs, lapack_complex_float* ab,
+                     lapack_int* ldab, lapack_complex_float* afb,
+                     lapack_int* ldafb, lapack_int* ipiv, char* equed, float* r,
+                     float* c, lapack_complex_float* b, lapack_int* ldb,
+                     lapack_complex_float* x, lapack_int* ldx, float* rcond,
+                     float* rpvgrw, float* berr, lapack_int* n_err_bnds,
+                     float* err_bnds_norm, float* err_bnds_comp,
+                     lapack_int* nparams, float* params,
+                     lapack_complex_float* work, float* rwork,
+                     lapack_int *info );
+void LAPACK_sgtsv( lapack_int* n, lapack_int* nrhs, float* dl, float* d,
+                   float* du, float* b, lapack_int* ldb, lapack_int *info );
+void LAPACK_dgtsv( lapack_int* n, lapack_int* nrhs, double* dl, double* d,
+                   double* du, double* b, lapack_int* ldb, lapack_int *info );
+void LAPACK_cgtsv( lapack_int* n, lapack_int* nrhs, lapack_complex_float* dl,
+                   lapack_complex_float* d, lapack_complex_float* du,
+                   lapack_complex_float* b, lapack_int* ldb, lapack_int *info );
+void LAPACK_zgtsv( lapack_int* n, lapack_int* nrhs, lapack_complex_double* dl,
+                   lapack_complex_double* d, lapack_complex_double* du,
+                   lapack_complex_double* b, lapack_int* ldb,
+                   lapack_int *info );
+void LAPACK_sgtsvx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
+                    const float* dl, const float* d, const float* du,
+                    float* dlf, float* df, float* duf, float* du2,
+                    lapack_int* ipiv, const float* b, lapack_int* ldb, float* x,
+                    lapack_int* ldx, float* rcond, float* ferr, float* berr,
+                    float* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_dgtsvx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
+                    const double* dl, const double* d, const double* du,
+                    double* dlf, double* df, double* duf, double* du2,
+                    lapack_int* ipiv, const double* b, lapack_int* ldb,
+                    double* x, lapack_int* ldx, double* rcond, double* ferr,
+                    double* berr, double* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_cgtsvx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* dl,
+                    const lapack_complex_float* d,
+                    const lapack_complex_float* du, lapack_complex_float* dlf,
+                    lapack_complex_float* df, lapack_complex_float* duf,
+                    lapack_complex_float* du2, lapack_int* ipiv,
+                    const lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* x, lapack_int* ldx, float* rcond,
+                    float* ferr, float* berr, lapack_complex_float* work,
+                    float* rwork, lapack_int *info );
+void LAPACK_zgtsvx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* dl,
+                    const lapack_complex_double* d,
+                    const lapack_complex_double* du, lapack_complex_double* dlf,
+                    lapack_complex_double* df, lapack_complex_double* duf,
+                    lapack_complex_double* du2, lapack_int* ipiv,
+                    const lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* x, lapack_int* ldx, double* rcond,
+                    double* ferr, double* berr, lapack_complex_double* work,
+                    double* rwork, lapack_int *info );
+void LAPACK_sposv( char* uplo, lapack_int* n, lapack_int* nrhs, float* a,
+                   lapack_int* lda, float* b, lapack_int* ldb,
+                   lapack_int *info );
+void LAPACK_dposv( char* uplo, lapack_int* n, lapack_int* nrhs, double* a,
+                   lapack_int* lda, double* b, lapack_int* ldb,
+                   lapack_int *info );
+void LAPACK_cposv( char* uplo, lapack_int* n, lapack_int* nrhs,
+                   lapack_complex_float* a, lapack_int* lda,
+                   lapack_complex_float* b, lapack_int* ldb, lapack_int *info );
+void LAPACK_zposv( char* uplo, lapack_int* n, lapack_int* nrhs,
+                   lapack_complex_double* a, lapack_int* lda,
+                   lapack_complex_double* b, lapack_int* ldb,
+                   lapack_int *info );
+void LAPACK_dsposv( char* uplo, lapack_int* n, lapack_int* nrhs, double* a,
+                    lapack_int* lda, double* b, lapack_int* ldb, double* x,
+                    lapack_int* ldx, double* work, float* swork,
+                    lapack_int* iter, lapack_int *info );
+void LAPACK_zcposv( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* x, lapack_int* ldx,
+                    lapack_complex_double* work, lapack_complex_float* swork,
+                    double* rwork, lapack_int* iter, lapack_int *info );
+void LAPACK_sposvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    float* a, lapack_int* lda, float* af, lapack_int* ldaf,
+                    char* equed, float* s, float* b, lapack_int* ldb, float* x,
+                    lapack_int* ldx, float* rcond, float* ferr, float* berr,
+                    float* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_dposvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    double* a, lapack_int* lda, double* af, lapack_int* ldaf,
+                    char* equed, double* s, double* b, lapack_int* ldb,
+                    double* x, lapack_int* ldx, double* rcond, double* ferr,
+                    double* berr, double* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_cposvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* af, lapack_int* ldaf, char* equed,
+                    float* s, lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* x, lapack_int* ldx, float* rcond,
+                    float* ferr, float* berr, lapack_complex_float* work,
+                    float* rwork, lapack_int *info );
+void LAPACK_zposvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* af, lapack_int* ldaf, char* equed,
+                    double* s, lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* x, lapack_int* ldx, double* rcond,
+                    double* ferr, double* berr, lapack_complex_double* work,
+                    double* rwork, lapack_int *info );
+void LAPACK_dposvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                     double* a, lapack_int* lda, double* af, lapack_int* ldaf,
+                     char* equed, double* s, double* b, lapack_int* ldb,
+                     double* x, lapack_int* ldx, double* rcond, double* rpvgrw,
+                     double* berr, lapack_int* n_err_bnds,
+                     double* err_bnds_norm, double* err_bnds_comp,
+                     lapack_int* nparams, double* params, double* work,
+                     lapack_int* iwork, lapack_int *info );
+void LAPACK_sposvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                     float* a, lapack_int* lda, float* af, lapack_int* ldaf,
+                     char* equed, float* s, float* b, lapack_int* ldb, float* x,
+                     lapack_int* ldx, float* rcond, float* rpvgrw, float* berr,
+                     lapack_int* n_err_bnds, float* err_bnds_norm,
+                     float* err_bnds_comp, lapack_int* nparams, float* params,
+                     float* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_zposvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                     lapack_complex_double* a, lapack_int* lda,
+                     lapack_complex_double* af, lapack_int* ldaf, char* equed,
+                     double* s, lapack_complex_double* b, lapack_int* ldb,
+                     lapack_complex_double* x, lapack_int* ldx, double* rcond,
+                     double* rpvgrw, double* berr, lapack_int* n_err_bnds,
+                     double* err_bnds_norm, double* err_bnds_comp,
+                     lapack_int* nparams, double* params,
+                     lapack_complex_double* work, double* rwork,
+                     lapack_int *info );
+void LAPACK_cposvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                     lapack_complex_float* a, lapack_int* lda,
+                     lapack_complex_float* af, lapack_int* ldaf, char* equed,
+                     float* s, lapack_complex_float* b, lapack_int* ldb,
+                     lapack_complex_float* x, lapack_int* ldx, float* rcond,
+                     float* rpvgrw, float* berr, lapack_int* n_err_bnds,
+                     float* err_bnds_norm, float* err_bnds_comp,
+                     lapack_int* nparams, float* params,
+                     lapack_complex_float* work, float* rwork,
+                     lapack_int *info );
+void LAPACK_sppsv( char* uplo, lapack_int* n, lapack_int* nrhs, float* ap,
+                   float* b, lapack_int* ldb, lapack_int *info );
+void LAPACK_dppsv( char* uplo, lapack_int* n, lapack_int* nrhs, double* ap,
+                   double* b, lapack_int* ldb, lapack_int *info );
+void LAPACK_cppsv( char* uplo, lapack_int* n, lapack_int* nrhs,
+                   lapack_complex_float* ap, lapack_complex_float* b,
+                   lapack_int* ldb, lapack_int *info );
+void LAPACK_zppsv( char* uplo, lapack_int* n, lapack_int* nrhs,
+                   lapack_complex_double* ap, lapack_complex_double* b,
+                   lapack_int* ldb, lapack_int *info );
+void LAPACK_sppsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    float* ap, float* afp, char* equed, float* s, float* b,
+                    lapack_int* ldb, float* x, lapack_int* ldx, float* rcond,
+                    float* ferr, float* berr, float* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_dppsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    double* ap, double* afp, char* equed, double* s, double* b,
+                    lapack_int* ldb, double* x, lapack_int* ldx, double* rcond,
+                    double* ferr, double* berr, double* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_cppsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    lapack_complex_float* ap, lapack_complex_float* afp,
+                    char* equed, float* s, lapack_complex_float* b,
+                    lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
+                    float* rcond, float* ferr, float* berr,
+                    lapack_complex_float* work, float* rwork,
+                    lapack_int *info );
+void LAPACK_zppsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    lapack_complex_double* ap, lapack_complex_double* afp,
+                    char* equed, double* s, lapack_complex_double* b,
+                    lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
+                    double* rcond, double* ferr, double* berr,
+                    lapack_complex_double* work, double* rwork,
+                    lapack_int *info );
+void LAPACK_spbsv( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
+                   float* ab, lapack_int* ldab, float* b, lapack_int* ldb,
+                   lapack_int *info );
+void LAPACK_dpbsv( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
+                   double* ab, lapack_int* ldab, double* b, lapack_int* ldb,
+                   lapack_int *info );
+void LAPACK_cpbsv( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
+                   lapack_complex_float* ab, lapack_int* ldab,
+                   lapack_complex_float* b, lapack_int* ldb, lapack_int *info );
+void LAPACK_zpbsv( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
+                   lapack_complex_double* ab, lapack_int* ldab,
+                   lapack_complex_double* b, lapack_int* ldb,
+                   lapack_int *info );
+void LAPACK_spbsvx( char* fact, char* uplo, lapack_int* n, lapack_int* kd,
+                    lapack_int* nrhs, float* ab, lapack_int* ldab, float* afb,
+                    lapack_int* ldafb, char* equed, float* s, float* b,
+                    lapack_int* ldb, float* x, lapack_int* ldx, float* rcond,
+                    float* ferr, float* berr, float* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_dpbsvx( char* fact, char* uplo, lapack_int* n, lapack_int* kd,
+                    lapack_int* nrhs, double* ab, lapack_int* ldab, double* afb,
+                    lapack_int* ldafb, char* equed, double* s, double* b,
+                    lapack_int* ldb, double* x, lapack_int* ldx, double* rcond,
+                    double* ferr, double* berr, double* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_cpbsvx( char* fact, char* uplo, lapack_int* n, lapack_int* kd,
+                    lapack_int* nrhs, lapack_complex_float* ab,
+                    lapack_int* ldab, lapack_complex_float* afb,
+                    lapack_int* ldafb, char* equed, float* s,
+                    lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* x, lapack_int* ldx, float* rcond,
+                    float* ferr, float* berr, lapack_complex_float* work,
+                    float* rwork, lapack_int *info );
+void LAPACK_zpbsvx( char* fact, char* uplo, lapack_int* n, lapack_int* kd,
+                    lapack_int* nrhs, lapack_complex_double* ab,
+                    lapack_int* ldab, lapack_complex_double* afb,
+                    lapack_int* ldafb, char* equed, double* s,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* x, lapack_int* ldx, double* rcond,
+                    double* ferr, double* berr, lapack_complex_double* work,
+                    double* rwork, lapack_int *info );
+void LAPACK_sptsv( lapack_int* n, lapack_int* nrhs, float* d, float* e,
+                   float* b, lapack_int* ldb, lapack_int *info );
+void LAPACK_dptsv( lapack_int* n, lapack_int* nrhs, double* d, double* e,
+                   double* b, lapack_int* ldb, lapack_int *info );
+void LAPACK_cptsv( lapack_int* n, lapack_int* nrhs, float* d,
+                   lapack_complex_float* e, lapack_complex_float* b,
+                   lapack_int* ldb, lapack_int *info );
+void LAPACK_zptsv( lapack_int* n, lapack_int* nrhs, double* d,
+                   lapack_complex_double* e, lapack_complex_double* b,
+                   lapack_int* ldb, lapack_int *info );
+void LAPACK_sptsvx( char* fact, lapack_int* n, lapack_int* nrhs, const float* d,
+                    const float* e, float* df, float* ef, const float* b,
+                    lapack_int* ldb, float* x, lapack_int* ldx, float* rcond,
+                    float* ferr, float* berr, float* work, lapack_int *info );
+void LAPACK_dptsvx( char* fact, lapack_int* n, lapack_int* nrhs,
+                    const double* d, const double* e, double* df, double* ef,
+                    const double* b, lapack_int* ldb, double* x,
+                    lapack_int* ldx, double* rcond, double* ferr, double* berr,
+                    double* work, lapack_int *info );
+void LAPACK_cptsvx( char* fact, lapack_int* n, lapack_int* nrhs, const float* d,
+                    const lapack_complex_float* e, float* df,
+                    lapack_complex_float* ef, const lapack_complex_float* b,
+                    lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
+                    float* rcond, float* ferr, float* berr,
+                    lapack_complex_float* work, float* rwork,
+                    lapack_int *info );
+void LAPACK_zptsvx( char* fact, lapack_int* n, lapack_int* nrhs,
+                    const double* d, const lapack_complex_double* e, double* df,
+                    lapack_complex_double* ef, const lapack_complex_double* b,
+                    lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
+                    double* rcond, double* ferr, double* berr,
+                    lapack_complex_double* work, double* rwork,
+                    lapack_int *info );
+void LAPACK_ssysv( char* uplo, lapack_int* n, lapack_int* nrhs, float* a,
+                   lapack_int* lda, lapack_int* ipiv, float* b, lapack_int* ldb,
+                   float* work, lapack_int* lwork, lapack_int *info );
+void LAPACK_dsysv( char* uplo, lapack_int* n, lapack_int* nrhs, double* a,
+                   lapack_int* lda, lapack_int* ipiv, double* b,
+                   lapack_int* ldb, double* work, lapack_int* lwork,
+                   lapack_int *info );
+void LAPACK_csysv( char* uplo, lapack_int* n, lapack_int* nrhs,
+                   lapack_complex_float* a, lapack_int* lda, lapack_int* ipiv,
+                   lapack_complex_float* b, lapack_int* ldb,
+                   lapack_complex_float* work, lapack_int* lwork,
+                   lapack_int *info );
+void LAPACK_zsysv( char* uplo, lapack_int* n, lapack_int* nrhs,
+                   lapack_complex_double* a, lapack_int* lda, lapack_int* ipiv,
+                   lapack_complex_double* b, lapack_int* ldb,
+                   lapack_complex_double* work, lapack_int* lwork,
+                   lapack_int *info );
+void LAPACK_ssysvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const float* a, lapack_int* lda, float* af,
+                    lapack_int* ldaf, lapack_int* ipiv, const float* b,
+                    lapack_int* ldb, float* x, lapack_int* ldx, float* rcond,
+                    float* ferr, float* berr, float* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_dsysvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const double* a, lapack_int* lda, double* af,
+                    lapack_int* ldaf, lapack_int* ipiv, const double* b,
+                    lapack_int* ldb, double* x, lapack_int* ldx, double* rcond,
+                    double* ferr, double* berr, double* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_csysvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* af, lapack_int* ldaf,
+                    lapack_int* ipiv, const lapack_complex_float* b,
+                    lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
+                    float* rcond, float* ferr, float* berr,
+                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                    lapack_int *info );
+void LAPACK_zsysvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* af, lapack_int* ldaf,
+                    lapack_int* ipiv, const lapack_complex_double* b,
+                    lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
+                    double* rcond, double* ferr, double* berr,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    double* rwork, lapack_int *info );
+void LAPACK_dsysvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                     double* a, lapack_int* lda, double* af, lapack_int* ldaf,
+                     lapack_int* ipiv, char* equed, double* s, double* b,
+                     lapack_int* ldb, double* x, lapack_int* ldx, double* rcond,
+                     double* rpvgrw, double* berr, lapack_int* n_err_bnds,
+                     double* err_bnds_norm, double* err_bnds_comp,
+                     lapack_int* nparams, double* params, double* work,
+                     lapack_int* iwork, lapack_int *info );
+void LAPACK_ssysvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                     float* a, lapack_int* lda, float* af, lapack_int* ldaf,
+                     lapack_int* ipiv, char* equed, float* s, float* b,
+                     lapack_int* ldb, float* x, lapack_int* ldx, float* rcond,
+                     float* rpvgrw, float* berr, lapack_int* n_err_bnds,
+                     float* err_bnds_norm, float* err_bnds_comp,
+                     lapack_int* nparams, float* params, float* work,
+                     lapack_int* iwork, lapack_int *info );
+void LAPACK_zsysvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                     lapack_complex_double* a, lapack_int* lda,
+                     lapack_complex_double* af, lapack_int* ldaf,
+                     lapack_int* ipiv, char* equed, double* s,
+                     lapack_complex_double* b, lapack_int* ldb,
+                     lapack_complex_double* x, lapack_int* ldx, double* rcond,
+                     double* rpvgrw, double* berr, lapack_int* n_err_bnds,
+                     double* err_bnds_norm, double* err_bnds_comp,
+                     lapack_int* nparams, double* params,
+                     lapack_complex_double* work, double* rwork,
+                     lapack_int *info );
+void LAPACK_csysvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                     lapack_complex_float* a, lapack_int* lda,
+                     lapack_complex_float* af, lapack_int* ldaf,
+                     lapack_int* ipiv, char* equed, float* s,
+                     lapack_complex_float* b, lapack_int* ldb,
+                     lapack_complex_float* x, lapack_int* ldx, float* rcond,
+                     float* rpvgrw, float* berr, lapack_int* n_err_bnds,
+                     float* err_bnds_norm, float* err_bnds_comp,
+                     lapack_int* nparams, float* params,
+                     lapack_complex_float* work, float* rwork,
+                     lapack_int *info );
+void LAPACK_chesv( char* uplo, lapack_int* n, lapack_int* nrhs,
+                   lapack_complex_float* a, lapack_int* lda, lapack_int* ipiv,
+                   lapack_complex_float* b, lapack_int* ldb,
+                   lapack_complex_float* work, lapack_int* lwork,
+                   lapack_int *info );
+void LAPACK_zhesv( char* uplo, lapack_int* n, lapack_int* nrhs,
+                   lapack_complex_double* a, lapack_int* lda, lapack_int* ipiv,
+                   lapack_complex_double* b, lapack_int* ldb,
+                   lapack_complex_double* work, lapack_int* lwork,
+                   lapack_int *info );
+void LAPACK_chesvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* af, lapack_int* ldaf,
+                    lapack_int* ipiv, const lapack_complex_float* b,
+                    lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
+                    float* rcond, float* ferr, float* berr,
+                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                    lapack_int *info );
+void LAPACK_zhesvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* af, lapack_int* ldaf,
+                    lapack_int* ipiv, const lapack_complex_double* b,
+                    lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
+                    double* rcond, double* ferr, double* berr,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    double* rwork, lapack_int *info );
+void LAPACK_zhesvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                     lapack_complex_double* a, lapack_int* lda,
+                     lapack_complex_double* af, lapack_int* ldaf,
+                     lapack_int* ipiv, char* equed, double* s,
+                     lapack_complex_double* b, lapack_int* ldb,
+                     lapack_complex_double* x, lapack_int* ldx, double* rcond,
+                     double* rpvgrw, double* berr, lapack_int* n_err_bnds,
+                     double* err_bnds_norm, double* err_bnds_comp,
+                     lapack_int* nparams, double* params,
+                     lapack_complex_double* work, double* rwork,
+                     lapack_int *info );
+void LAPACK_chesvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                     lapack_complex_float* a, lapack_int* lda,
+                     lapack_complex_float* af, lapack_int* ldaf,
+                     lapack_int* ipiv, char* equed, float* s,
+                     lapack_complex_float* b, lapack_int* ldb,
+                     lapack_complex_float* x, lapack_int* ldx, float* rcond,
+                     float* rpvgrw, float* berr, lapack_int* n_err_bnds,
+                     float* err_bnds_norm, float* err_bnds_comp,
+                     lapack_int* nparams, float* params,
+                     lapack_complex_float* work, float* rwork,
+                     lapack_int *info );
+void LAPACK_sspsv( char* uplo, lapack_int* n, lapack_int* nrhs, float* ap,
+                   lapack_int* ipiv, float* b, lapack_int* ldb,
+                   lapack_int *info );
+void LAPACK_dspsv( char* uplo, lapack_int* n, lapack_int* nrhs, double* ap,
+                   lapack_int* ipiv, double* b, lapack_int* ldb,
+                   lapack_int *info );
+void LAPACK_cspsv( char* uplo, lapack_int* n, lapack_int* nrhs,
+                   lapack_complex_float* ap, lapack_int* ipiv,
+                   lapack_complex_float* b, lapack_int* ldb, lapack_int *info );
+void LAPACK_zspsv( char* uplo, lapack_int* n, lapack_int* nrhs,
+                   lapack_complex_double* ap, lapack_int* ipiv,
+                   lapack_complex_double* b, lapack_int* ldb,
+                   lapack_int *info );
+void LAPACK_sspsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const float* ap, float* afp, lapack_int* ipiv,
+                    const float* b, lapack_int* ldb, float* x, lapack_int* ldx,
+                    float* rcond, float* ferr, float* berr, float* work,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_dspsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const double* ap, double* afp, lapack_int* ipiv,
+                    const double* b, lapack_int* ldb, double* x,
+                    lapack_int* ldx, double* rcond, double* ferr, double* berr,
+                    double* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_cspsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* ap, lapack_complex_float* afp,
+                    lapack_int* ipiv, const lapack_complex_float* b,
+                    lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
+                    float* rcond, float* ferr, float* berr,
+                    lapack_complex_float* work, float* rwork,
+                    lapack_int *info );
+void LAPACK_zspsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* ap, lapack_complex_double* afp,
+                    lapack_int* ipiv, const lapack_complex_double* b,
+                    lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
+                    double* rcond, double* ferr, double* berr,
+                    lapack_complex_double* work, double* rwork,
+                    lapack_int *info );
+void LAPACK_chpsv( char* uplo, lapack_int* n, lapack_int* nrhs,
+                   lapack_complex_float* ap, lapack_int* ipiv,
+                   lapack_complex_float* b, lapack_int* ldb, lapack_int *info );
+void LAPACK_zhpsv( char* uplo, lapack_int* n, lapack_int* nrhs,
+                   lapack_complex_double* ap, lapack_int* ipiv,
+                   lapack_complex_double* b, lapack_int* ldb,
+                   lapack_int *info );
+void LAPACK_chpsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* ap, lapack_complex_float* afp,
+                    lapack_int* ipiv, const lapack_complex_float* b,
+                    lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
+                    float* rcond, float* ferr, float* berr,
+                    lapack_complex_float* work, float* rwork,
+                    lapack_int *info );
+void LAPACK_zhpsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* ap, lapack_complex_double* afp,
+                    lapack_int* ipiv, const lapack_complex_double* b,
+                    lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
+                    double* rcond, double* ferr, double* berr,
+                    lapack_complex_double* work, double* rwork,
+                    lapack_int *info );
+void LAPACK_sgeqrf( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
+                    float* tau, float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_dgeqrf( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
+                    double* tau, double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_cgeqrf( lapack_int* m, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_complex_float* tau,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_zgeqrf( lapack_int* m, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_complex_double* tau,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_sgeqpf( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
+                    lapack_int* jpvt, float* tau, float* work,
+                    lapack_int *info );
+void LAPACK_dgeqpf( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
+                    lapack_int* jpvt, double* tau, double* work,
+                    lapack_int *info );
+void LAPACK_cgeqpf( lapack_int* m, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_int* jpvt,
+                    lapack_complex_float* tau, lapack_complex_float* work,
+                    float* rwork, lapack_int *info );
+void LAPACK_zgeqpf( lapack_int* m, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_int* jpvt,
+                    lapack_complex_double* tau, lapack_complex_double* work,
+                    double* rwork, lapack_int *info );
+void LAPACK_sgeqp3( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
+                    lapack_int* jpvt, float* tau, float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_dgeqp3( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
+                    lapack_int* jpvt, double* tau, double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_cgeqp3( lapack_int* m, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_int* jpvt,
+                    lapack_complex_float* tau, lapack_complex_float* work,
+                    lapack_int* lwork, float* rwork, lapack_int *info );
+void LAPACK_zgeqp3( lapack_int* m, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_int* jpvt,
+                    lapack_complex_double* tau, lapack_complex_double* work,
+                    lapack_int* lwork, double* rwork, lapack_int *info );
+void LAPACK_sorgqr( lapack_int* m, lapack_int* n, lapack_int* k, float* a,
+                    lapack_int* lda, const float* tau, float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_dorgqr( lapack_int* m, lapack_int* n, lapack_int* k, double* a,
+                    lapack_int* lda, const double* tau, double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_sormqr( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, const float* a, lapack_int* lda,
+                    const float* tau, float* c, lapack_int* ldc, float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_dormqr( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, const double* a, lapack_int* lda,
+                    const double* tau, double* c, lapack_int* ldc, double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_cungqr( lapack_int* m, lapack_int* n, lapack_int* k,
+                    lapack_complex_float* a, lapack_int* lda,
+                    const lapack_complex_float* tau, lapack_complex_float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_zungqr( lapack_int* m, lapack_int* n, lapack_int* k,
+                    lapack_complex_double* a, lapack_int* lda,
+                    const lapack_complex_double* tau,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_cunmqr( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, const lapack_complex_float* a,
+                    lapack_int* lda, const lapack_complex_float* tau,
+                    lapack_complex_float* c, lapack_int* ldc,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_zunmqr( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, const lapack_complex_double* a,
+                    lapack_int* lda, const lapack_complex_double* tau,
+                    lapack_complex_double* c, lapack_int* ldc,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_sgelqf( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
+                    float* tau, float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_dgelqf( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
+                    double* tau, double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_cgelqf( lapack_int* m, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_complex_float* tau,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_zgelqf( lapack_int* m, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_complex_double* tau,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_sorglq( lapack_int* m, lapack_int* n, lapack_int* k, float* a,
+                    lapack_int* lda, const float* tau, float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_dorglq( lapack_int* m, lapack_int* n, lapack_int* k, double* a,
+                    lapack_int* lda, const double* tau, double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_sormlq( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, const float* a, lapack_int* lda,
+                    const float* tau, float* c, lapack_int* ldc, float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_dormlq( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, const double* a, lapack_int* lda,
+                    const double* tau, double* c, lapack_int* ldc, double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_cunglq( lapack_int* m, lapack_int* n, lapack_int* k,
+                    lapack_complex_float* a, lapack_int* lda,
+                    const lapack_complex_float* tau, lapack_complex_float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_zunglq( lapack_int* m, lapack_int* n, lapack_int* k,
+                    lapack_complex_double* a, lapack_int* lda,
+                    const lapack_complex_double* tau,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_cunmlq( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, const lapack_complex_float* a,
+                    lapack_int* lda, const lapack_complex_float* tau,
+                    lapack_complex_float* c, lapack_int* ldc,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_zunmlq( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, const lapack_complex_double* a,
+                    lapack_int* lda, const lapack_complex_double* tau,
+                    lapack_complex_double* c, lapack_int* ldc,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_sgeqlf( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
+                    float* tau, float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_dgeqlf( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
+                    double* tau, double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_cgeqlf( lapack_int* m, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_complex_float* tau,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_zgeqlf( lapack_int* m, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_complex_double* tau,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_sorgql( lapack_int* m, lapack_int* n, lapack_int* k, float* a,
+                    lapack_int* lda, const float* tau, float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_dorgql( lapack_int* m, lapack_int* n, lapack_int* k, double* a,
+                    lapack_int* lda, const double* tau, double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_cungql( lapack_int* m, lapack_int* n, lapack_int* k,
+                    lapack_complex_float* a, lapack_int* lda,
+                    const lapack_complex_float* tau, lapack_complex_float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_zungql( lapack_int* m, lapack_int* n, lapack_int* k,
+                    lapack_complex_double* a, lapack_int* lda,
+                    const lapack_complex_double* tau,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_sormql( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, const float* a, lapack_int* lda,
+                    const float* tau, float* c, lapack_int* ldc, float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_dormql( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, const double* a, lapack_int* lda,
+                    const double* tau, double* c, lapack_int* ldc, double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_cunmql( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, const lapack_complex_float* a,
+                    lapack_int* lda, const lapack_complex_float* tau,
+                    lapack_complex_float* c, lapack_int* ldc,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_zunmql( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, const lapack_complex_double* a,
+                    lapack_int* lda, const lapack_complex_double* tau,
+                    lapack_complex_double* c, lapack_int* ldc,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_sgerqf( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
+                    float* tau, float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_dgerqf( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
+                    double* tau, double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_cgerqf( lapack_int* m, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_complex_float* tau,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_zgerqf( lapack_int* m, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_complex_double* tau,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_sorgrq( lapack_int* m, lapack_int* n, lapack_int* k, float* a,
+                    lapack_int* lda, const float* tau, float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_dorgrq( lapack_int* m, lapack_int* n, lapack_int* k, double* a,
+                    lapack_int* lda, const double* tau, double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_cungrq( lapack_int* m, lapack_int* n, lapack_int* k,
+                    lapack_complex_float* a, lapack_int* lda,
+                    const lapack_complex_float* tau, lapack_complex_float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_zungrq( lapack_int* m, lapack_int* n, lapack_int* k,
+                    lapack_complex_double* a, lapack_int* lda,
+                    const lapack_complex_double* tau,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_sormrq( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, const float* a, lapack_int* lda,
+                    const float* tau, float* c, lapack_int* ldc, float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_dormrq( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, const double* a, lapack_int* lda,
+                    const double* tau, double* c, lapack_int* ldc, double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_cunmrq( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, const lapack_complex_float* a,
+                    lapack_int* lda, const lapack_complex_float* tau,
+                    lapack_complex_float* c, lapack_int* ldc,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_zunmrq( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, const lapack_complex_double* a,
+                    lapack_int* lda, const lapack_complex_double* tau,
+                    lapack_complex_double* c, lapack_int* ldc,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_stzrzf( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
+                    float* tau, float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_dtzrzf( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
+                    double* tau, double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_ctzrzf( lapack_int* m, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_complex_float* tau,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_ztzrzf( lapack_int* m, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_complex_double* tau,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_sormrz( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, lapack_int* l, const float* a,
+                    lapack_int* lda, const float* tau, float* c,
+                    lapack_int* ldc, float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_dormrz( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, lapack_int* l, const double* a,
+                    lapack_int* lda, const double* tau, double* c,
+                    lapack_int* ldc, double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_cunmrz( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, lapack_int* l, const lapack_complex_float* a,
+                    lapack_int* lda, const lapack_complex_float* tau,
+                    lapack_complex_float* c, lapack_int* ldc,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_zunmrz( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, lapack_int* l,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    const lapack_complex_double* tau, lapack_complex_double* c,
+                    lapack_int* ldc, lapack_complex_double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_sggqrf( lapack_int* n, lapack_int* m, lapack_int* p, float* a,
+                    lapack_int* lda, float* taua, float* b, lapack_int* ldb,
+                    float* taub, float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_dggqrf( lapack_int* n, lapack_int* m, lapack_int* p, double* a,
+                    lapack_int* lda, double* taua, double* b, lapack_int* ldb,
+                    double* taub, double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_cggqrf( lapack_int* n, lapack_int* m, lapack_int* p,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* taua, lapack_complex_float* b,
+                    lapack_int* ldb, lapack_complex_float* taub,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_zggqrf( lapack_int* n, lapack_int* m, lapack_int* p,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* taua, lapack_complex_double* b,
+                    lapack_int* ldb, lapack_complex_double* taub,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_sggrqf( lapack_int* m, lapack_int* p, lapack_int* n, float* a,
+                    lapack_int* lda, float* taua, float* b, lapack_int* ldb,
+                    float* taub, float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_dggrqf( lapack_int* m, lapack_int* p, lapack_int* n, double* a,
+                    lapack_int* lda, double* taua, double* b, lapack_int* ldb,
+                    double* taub, double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_cggrqf( lapack_int* m, lapack_int* p, lapack_int* n,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* taua, lapack_complex_float* b,
+                    lapack_int* ldb, lapack_complex_float* taub,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_zggrqf( lapack_int* m, lapack_int* p, lapack_int* n,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* taua, lapack_complex_double* b,
+                    lapack_int* ldb, lapack_complex_double* taub,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_sgebrd( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
+                    float* d, float* e, float* tauq, float* taup, float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_dgebrd( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
+                    double* d, double* e, double* tauq, double* taup,
+                    double* work, lapack_int* lwork, lapack_int *info );
+void LAPACK_cgebrd( lapack_int* m, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, float* d, float* e,
+                    lapack_complex_float* tauq, lapack_complex_float* taup,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_zgebrd( lapack_int* m, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, double* d, double* e,
+                    lapack_complex_double* tauq, lapack_complex_double* taup,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_sgbbrd( char* vect, lapack_int* m, lapack_int* n, lapack_int* ncc,
+                    lapack_int* kl, lapack_int* ku, float* ab, lapack_int* ldab,
+                    float* d, float* e, float* q, lapack_int* ldq, float* pt,
+                    lapack_int* ldpt, float* c, lapack_int* ldc, float* work,
+                    lapack_int *info );
+void LAPACK_dgbbrd( char* vect, lapack_int* m, lapack_int* n, lapack_int* ncc,
+                    lapack_int* kl, lapack_int* ku, double* ab,
+                    lapack_int* ldab, double* d, double* e, double* q,
+                    lapack_int* ldq, double* pt, lapack_int* ldpt, double* c,
+                    lapack_int* ldc, double* work, lapack_int *info );
+void LAPACK_cgbbrd( char* vect, lapack_int* m, lapack_int* n, lapack_int* ncc,
+                    lapack_int* kl, lapack_int* ku, lapack_complex_float* ab,
+                    lapack_int* ldab, float* d, float* e,
+                    lapack_complex_float* q, lapack_int* ldq,
+                    lapack_complex_float* pt, lapack_int* ldpt,
+                    lapack_complex_float* c, lapack_int* ldc,
+                    lapack_complex_float* work, float* rwork,
+                    lapack_int *info );
+void LAPACK_zgbbrd( char* vect, lapack_int* m, lapack_int* n, lapack_int* ncc,
+                    lapack_int* kl, lapack_int* ku, lapack_complex_double* ab,
+                    lapack_int* ldab, double* d, double* e,
+                    lapack_complex_double* q, lapack_int* ldq,
+                    lapack_complex_double* pt, lapack_int* ldpt,
+                    lapack_complex_double* c, lapack_int* ldc,
+                    lapack_complex_double* work, double* rwork,
+                    lapack_int *info );
+void LAPACK_sorgbr( char* vect, lapack_int* m, lapack_int* n, lapack_int* k,
+                    float* a, lapack_int* lda, const float* tau, float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_dorgbr( char* vect, lapack_int* m, lapack_int* n, lapack_int* k,
+                    double* a, lapack_int* lda, const double* tau, double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_sormbr( char* vect, char* side, char* trans, lapack_int* m,
+                    lapack_int* n, lapack_int* k, const float* a,
+                    lapack_int* lda, const float* tau, float* c,
+                    lapack_int* ldc, float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_dormbr( char* vect, char* side, char* trans, lapack_int* m,
+                    lapack_int* n, lapack_int* k, const double* a,
+                    lapack_int* lda, const double* tau, double* c,
+                    lapack_int* ldc, double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_cungbr( char* vect, lapack_int* m, lapack_int* n, lapack_int* k,
+                    lapack_complex_float* a, lapack_int* lda,
+                    const lapack_complex_float* tau, lapack_complex_float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_zungbr( char* vect, lapack_int* m, lapack_int* n, lapack_int* k,
+                    lapack_complex_double* a, lapack_int* lda,
+                    const lapack_complex_double* tau,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_cunmbr( char* vect, char* side, char* trans, lapack_int* m,
+                    lapack_int* n, lapack_int* k, const lapack_complex_float* a,
+                    lapack_int* lda, const lapack_complex_float* tau,
+                    lapack_complex_float* c, lapack_int* ldc,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_zunmbr( char* vect, char* side, char* trans, lapack_int* m,
+                    lapack_int* n, lapack_int* k,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    const lapack_complex_double* tau, lapack_complex_double* c,
+                    lapack_int* ldc, lapack_complex_double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_sbdsqr( char* uplo, lapack_int* n, lapack_int* ncvt,
+                    lapack_int* nru, lapack_int* ncc, float* d, float* e,
+                    float* vt, lapack_int* ldvt, float* u, lapack_int* ldu,
+                    float* c, lapack_int* ldc, float* work, lapack_int *info );
+void LAPACK_dbdsqr( char* uplo, lapack_int* n, lapack_int* ncvt,
+                    lapack_int* nru, lapack_int* ncc, double* d, double* e,
+                    double* vt, lapack_int* ldvt, double* u, lapack_int* ldu,
+                    double* c, lapack_int* ldc, double* work,
+                    lapack_int *info );
+void LAPACK_cbdsqr( char* uplo, lapack_int* n, lapack_int* ncvt,
+                    lapack_int* nru, lapack_int* ncc, float* d, float* e,
+                    lapack_complex_float* vt, lapack_int* ldvt,
+                    lapack_complex_float* u, lapack_int* ldu,
+                    lapack_complex_float* c, lapack_int* ldc, float* work,
+                    lapack_int *info );
+void LAPACK_zbdsqr( char* uplo, lapack_int* n, lapack_int* ncvt,
+                    lapack_int* nru, lapack_int* ncc, double* d, double* e,
+                    lapack_complex_double* vt, lapack_int* ldvt,
+                    lapack_complex_double* u, lapack_int* ldu,
+                    lapack_complex_double* c, lapack_int* ldc, double* work,
+                    lapack_int *info );
+void LAPACK_sbdsdc( char* uplo, char* compq, lapack_int* n, float* d, float* e,
+                    float* u, lapack_int* ldu, float* vt, lapack_int* ldvt,
+                    float* q, lapack_int* iq, float* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_dbdsdc( char* uplo, char* compq, lapack_int* n, double* d,
+                    double* e, double* u, lapack_int* ldu, double* vt,
+                    lapack_int* ldvt, double* q, lapack_int* iq, double* work,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_ssytrd( char* uplo, lapack_int* n, float* a, lapack_int* lda,
+                    float* d, float* e, float* tau, float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_dsytrd( char* uplo, lapack_int* n, double* a, lapack_int* lda,
+                    double* d, double* e, double* tau, double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_sorgtr( char* uplo, lapack_int* n, float* a, lapack_int* lda,
+                    const float* tau, float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_dorgtr( char* uplo, lapack_int* n, double* a, lapack_int* lda,
+                    const double* tau, double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_sormtr( char* side, char* uplo, char* trans, lapack_int* m,
+                    lapack_int* n, const float* a, lapack_int* lda,
+                    const float* tau, float* c, lapack_int* ldc, float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_dormtr( char* side, char* uplo, char* trans, lapack_int* m,
+                    lapack_int* n, const double* a, lapack_int* lda,
+                    const double* tau, double* c, lapack_int* ldc, double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_chetrd( char* uplo, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, float* d, float* e,
+                    lapack_complex_float* tau, lapack_complex_float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_zhetrd( char* uplo, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, double* d, double* e,
+                    lapack_complex_double* tau, lapack_complex_double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_cungtr( char* uplo, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, const lapack_complex_float* tau,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_zungtr( char* uplo, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, const lapack_complex_double* tau,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_cunmtr( char* side, char* uplo, char* trans, lapack_int* m,
+                    lapack_int* n, const lapack_complex_float* a,
+                    lapack_int* lda, const lapack_complex_float* tau,
+                    lapack_complex_float* c, lapack_int* ldc,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_zunmtr( char* side, char* uplo, char* trans, lapack_int* m,
+                    lapack_int* n, const lapack_complex_double* a,
+                    lapack_int* lda, const lapack_complex_double* tau,
+                    lapack_complex_double* c, lapack_int* ldc,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_ssptrd( char* uplo, lapack_int* n, float* ap, float* d, float* e,
+                    float* tau, lapack_int *info );
+void LAPACK_dsptrd( char* uplo, lapack_int* n, double* ap, double* d, double* e,
+                    double* tau, lapack_int *info );
+void LAPACK_sopgtr( char* uplo, lapack_int* n, const float* ap,
+                    const float* tau, float* q, lapack_int* ldq, float* work,
+                    lapack_int *info );
+void LAPACK_dopgtr( char* uplo, lapack_int* n, const double* ap,
+                    const double* tau, double* q, lapack_int* ldq, double* work,
+                    lapack_int *info );
+void LAPACK_sopmtr( char* side, char* uplo, char* trans, lapack_int* m,
+                    lapack_int* n, const float* ap, const float* tau, float* c,
+                    lapack_int* ldc, float* work, lapack_int *info );
+void LAPACK_dopmtr( char* side, char* uplo, char* trans, lapack_int* m,
+                    lapack_int* n, const double* ap, const double* tau,
+                    double* c, lapack_int* ldc, double* work,
+                    lapack_int *info );
+void LAPACK_chptrd( char* uplo, lapack_int* n, lapack_complex_float* ap,
+                    float* d, float* e, lapack_complex_float* tau,
+                    lapack_int *info );
+void LAPACK_zhptrd( char* uplo, lapack_int* n, lapack_complex_double* ap,
+                    double* d, double* e, lapack_complex_double* tau,
+                    lapack_int *info );
+void LAPACK_cupgtr( char* uplo, lapack_int* n, const lapack_complex_float* ap,
+                    const lapack_complex_float* tau, lapack_complex_float* q,
+                    lapack_int* ldq, lapack_complex_float* work,
+                    lapack_int *info );
+void LAPACK_zupgtr( char* uplo, lapack_int* n, const lapack_complex_double* ap,
+                    const lapack_complex_double* tau, lapack_complex_double* q,
+                    lapack_int* ldq, lapack_complex_double* work,
+                    lapack_int *info );
+void LAPACK_cupmtr( char* side, char* uplo, char* trans, lapack_int* m,
+                    lapack_int* n, const lapack_complex_float* ap,
+                    const lapack_complex_float* tau, lapack_complex_float* c,
+                    lapack_int* ldc, lapack_complex_float* work,
+                    lapack_int *info );
+void LAPACK_zupmtr( char* side, char* uplo, char* trans, lapack_int* m,
+                    lapack_int* n, const lapack_complex_double* ap,
+                    const lapack_complex_double* tau, lapack_complex_double* c,
+                    lapack_int* ldc, lapack_complex_double* work,
+                    lapack_int *info );
+void LAPACK_ssbtrd( char* vect, char* uplo, lapack_int* n, lapack_int* kd,
+                    float* ab, lapack_int* ldab, float* d, float* e, float* q,
+                    lapack_int* ldq, float* work, lapack_int *info );
+void LAPACK_dsbtrd( char* vect, char* uplo, lapack_int* n, lapack_int* kd,
+                    double* ab, lapack_int* ldab, double* d, double* e,
+                    double* q, lapack_int* ldq, double* work,
+                    lapack_int *info );
+void LAPACK_chbtrd( char* vect, char* uplo, lapack_int* n, lapack_int* kd,
+                    lapack_complex_float* ab, lapack_int* ldab, float* d,
+                    float* e, lapack_complex_float* q, lapack_int* ldq,
+                    lapack_complex_float* work, lapack_int *info );
+void LAPACK_zhbtrd( char* vect, char* uplo, lapack_int* n, lapack_int* kd,
+                    lapack_complex_double* ab, lapack_int* ldab, double* d,
+                    double* e, lapack_complex_double* q, lapack_int* ldq,
+                    lapack_complex_double* work, lapack_int *info );
+void LAPACK_ssterf( lapack_int* n, float* d, float* e, lapack_int *info );
+void LAPACK_dsterf( lapack_int* n, double* d, double* e, lapack_int *info );
+void LAPACK_ssteqr( char* compz, lapack_int* n, float* d, float* e, float* z,
+                    lapack_int* ldz, float* work, lapack_int *info );
+void LAPACK_dsteqr( char* compz, lapack_int* n, double* d, double* e, double* z,
+                    lapack_int* ldz, double* work, lapack_int *info );
+void LAPACK_csteqr( char* compz, lapack_int* n, float* d, float* e,
+                    lapack_complex_float* z, lapack_int* ldz, float* work,
+                    lapack_int *info );
+void LAPACK_zsteqr( char* compz, lapack_int* n, double* d, double* e,
+                    lapack_complex_double* z, lapack_int* ldz, double* work,
+                    lapack_int *info );
+void LAPACK_sstemr( char* jobz, char* range, lapack_int* n, float* d, float* e,
+                    float* vl, float* vu, lapack_int* il, lapack_int* iu,
+                    lapack_int* m, float* w, float* z, lapack_int* ldz,
+                    lapack_int* nzc, lapack_int* isuppz, lapack_logical* tryrac,
+                    float* work, lapack_int* lwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_int *info );
+void LAPACK_dstemr( char* jobz, char* range, lapack_int* n, double* d,
+                    double* e, double* vl, double* vu, lapack_int* il,
+                    lapack_int* iu, lapack_int* m, double* w, double* z,
+                    lapack_int* ldz, lapack_int* nzc, lapack_int* isuppz,
+                    lapack_logical* tryrac, double* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_cstemr( char* jobz, char* range, lapack_int* n, float* d, float* e,
+                    float* vl, float* vu, lapack_int* il, lapack_int* iu,
+                    lapack_int* m, float* w, lapack_complex_float* z,
+                    lapack_int* ldz, lapack_int* nzc, lapack_int* isuppz,
+                    lapack_logical* tryrac, float* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_zstemr( char* jobz, char* range, lapack_int* n, double* d,
+                    double* e, double* vl, double* vu, lapack_int* il,
+                    lapack_int* iu, lapack_int* m, double* w,
+                    lapack_complex_double* z, lapack_int* ldz, lapack_int* nzc,
+                    lapack_int* isuppz, lapack_logical* tryrac, double* work,
+                    lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
+                    lapack_int *info );
+void LAPACK_sstedc( char* compz, lapack_int* n, float* d, float* e, float* z,
+                    lapack_int* ldz, float* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_dstedc( char* compz, lapack_int* n, double* d, double* e, double* z,
+                    lapack_int* ldz, double* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_cstedc( char* compz, lapack_int* n, float* d, float* e,
+                    lapack_complex_float* z, lapack_int* ldz,
+                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                    lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork,
+                    lapack_int *info );
+void LAPACK_zstedc( char* compz, lapack_int* n, double* d, double* e,
+                    lapack_complex_double* z, lapack_int* ldz,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    double* rwork, lapack_int* lrwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_int *info );
+void LAPACK_sstegr( char* jobz, char* range, lapack_int* n, float* d, float* e,
+                    float* vl, float* vu, lapack_int* il, lapack_int* iu,
+                    float* abstol, lapack_int* m, float* w, float* z,
+                    lapack_int* ldz, lapack_int* isuppz, float* work,
+                    lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
+                    lapack_int *info );
+void LAPACK_dstegr( char* jobz, char* range, lapack_int* n, double* d,
+                    double* e, double* vl, double* vu, lapack_int* il,
+                    lapack_int* iu, double* abstol, lapack_int* m, double* w,
+                    double* z, lapack_int* ldz, lapack_int* isuppz,
+                    double* work, lapack_int* lwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_int *info );
+void LAPACK_cstegr( char* jobz, char* range, lapack_int* n, float* d, float* e,
+                    float* vl, float* vu, lapack_int* il, lapack_int* iu,
+                    float* abstol, lapack_int* m, float* w,
+                    lapack_complex_float* z, lapack_int* ldz,
+                    lapack_int* isuppz, float* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_zstegr( char* jobz, char* range, lapack_int* n, double* d,
+                    double* e, double* vl, double* vu, lapack_int* il,
+                    lapack_int* iu, double* abstol, lapack_int* m, double* w,
+                    lapack_complex_double* z, lapack_int* ldz,
+                    lapack_int* isuppz, double* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_spteqr( char* compz, lapack_int* n, float* d, float* e, float* z,
+                    lapack_int* ldz, float* work, lapack_int *info );
+void LAPACK_dpteqr( char* compz, lapack_int* n, double* d, double* e, double* z,
+                    lapack_int* ldz, double* work, lapack_int *info );
+void LAPACK_cpteqr( char* compz, lapack_int* n, float* d, float* e,
+                    lapack_complex_float* z, lapack_int* ldz, float* work,
+                    lapack_int *info );
+void LAPACK_zpteqr( char* compz, lapack_int* n, double* d, double* e,
+                    lapack_complex_double* z, lapack_int* ldz, double* work,
+                    lapack_int *info );
+void LAPACK_sstebz( char* range, char* order, lapack_int* n, float* vl,
+                    float* vu, lapack_int* il, lapack_int* iu, float* abstol,
+                    const float* d, const float* e, lapack_int* m,
+                    lapack_int* nsplit, float* w, lapack_int* iblock,
+                    lapack_int* isplit, float* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_dstebz( char* range, char* order, lapack_int* n, double* vl,
+                    double* vu, lapack_int* il, lapack_int* iu, double* abstol,
+                    const double* d, const double* e, lapack_int* m,
+                    lapack_int* nsplit, double* w, lapack_int* iblock,
+                    lapack_int* isplit, double* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_sstein( lapack_int* n, const float* d, const float* e,
+                    lapack_int* m, const float* w, const lapack_int* iblock,
+                    const lapack_int* isplit, float* z, lapack_int* ldz,
+                    float* work, lapack_int* iwork, lapack_int* ifailv,
+                    lapack_int *info );
+void LAPACK_dstein( lapack_int* n, const double* d, const double* e,
+                    lapack_int* m, const double* w, const lapack_int* iblock,
+                    const lapack_int* isplit, double* z, lapack_int* ldz,
+                    double* work, lapack_int* iwork, lapack_int* ifailv,
+                    lapack_int *info );
+void LAPACK_cstein( lapack_int* n, const float* d, const float* e,
+                    lapack_int* m, const float* w, const lapack_int* iblock,
+                    const lapack_int* isplit, lapack_complex_float* z,
+                    lapack_int* ldz, float* work, lapack_int* iwork,
+                    lapack_int* ifailv, lapack_int *info );
+void LAPACK_zstein( lapack_int* n, const double* d, const double* e,
+                    lapack_int* m, const double* w, const lapack_int* iblock,
+                    const lapack_int* isplit, lapack_complex_double* z,
+                    lapack_int* ldz, double* work, lapack_int* iwork,
+                    lapack_int* ifailv, lapack_int *info );
+void LAPACK_sdisna( char* job, lapack_int* m, lapack_int* n, const float* d,
+                    float* sep, lapack_int *info );
+void LAPACK_ddisna( char* job, lapack_int* m, lapack_int* n, const double* d,
+                    double* sep, lapack_int *info );
+void LAPACK_ssygst( lapack_int* itype, char* uplo, lapack_int* n, float* a,
+                    lapack_int* lda, const float* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_dsygst( lapack_int* itype, char* uplo, lapack_int* n, double* a,
+                    lapack_int* lda, const double* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_chegst( lapack_int* itype, char* uplo, lapack_int* n,
+                    lapack_complex_float* a, lapack_int* lda,
+                    const lapack_complex_float* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_zhegst( lapack_int* itype, char* uplo, lapack_int* n,
+                    lapack_complex_double* a, lapack_int* lda,
+                    const lapack_complex_double* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_sspgst( lapack_int* itype, char* uplo, lapack_int* n, float* ap,
+                    const float* bp, lapack_int *info );
+void LAPACK_dspgst( lapack_int* itype, char* uplo, lapack_int* n, double* ap,
+                    const double* bp, lapack_int *info );
+void LAPACK_chpgst( lapack_int* itype, char* uplo, lapack_int* n,
+                    lapack_complex_float* ap, const lapack_complex_float* bp,
+                    lapack_int *info );
+void LAPACK_zhpgst( lapack_int* itype, char* uplo, lapack_int* n,
+                    lapack_complex_double* ap, const lapack_complex_double* bp,
+                    lapack_int *info );
+void LAPACK_ssbgst( char* vect, char* uplo, lapack_int* n, lapack_int* ka,
+                    lapack_int* kb, float* ab, lapack_int* ldab,
+                    const float* bb, lapack_int* ldbb, float* x,
+                    lapack_int* ldx, float* work, lapack_int *info );
+void LAPACK_dsbgst( char* vect, char* uplo, lapack_int* n, lapack_int* ka,
+                    lapack_int* kb, double* ab, lapack_int* ldab,
+                    const double* bb, lapack_int* ldbb, double* x,
+                    lapack_int* ldx, double* work, lapack_int *info );
+void LAPACK_chbgst( char* vect, char* uplo, lapack_int* n, lapack_int* ka,
+                    lapack_int* kb, lapack_complex_float* ab, lapack_int* ldab,
+                    const lapack_complex_float* bb, lapack_int* ldbb,
+                    lapack_complex_float* x, lapack_int* ldx,
+                    lapack_complex_float* work, float* rwork,
+                    lapack_int *info );
+void LAPACK_zhbgst( char* vect, char* uplo, lapack_int* n, lapack_int* ka,
+                    lapack_int* kb, lapack_complex_double* ab, lapack_int* ldab,
+                    const lapack_complex_double* bb, lapack_int* ldbb,
+                    lapack_complex_double* x, lapack_int* ldx,
+                    lapack_complex_double* work, double* rwork,
+                    lapack_int *info );
+void LAPACK_spbstf( char* uplo, lapack_int* n, lapack_int* kb, float* bb,
+                    lapack_int* ldbb, lapack_int *info );
+void LAPACK_dpbstf( char* uplo, lapack_int* n, lapack_int* kb, double* bb,
+                    lapack_int* ldbb, lapack_int *info );
+void LAPACK_cpbstf( char* uplo, lapack_int* n, lapack_int* kb,
+                    lapack_complex_float* bb, lapack_int* ldbb,
+                    lapack_int *info );
+void LAPACK_zpbstf( char* uplo, lapack_int* n, lapack_int* kb,
+                    lapack_complex_double* bb, lapack_int* ldbb,
+                    lapack_int *info );
+void LAPACK_sgehrd( lapack_int* n, lapack_int* ilo, lapack_int* ihi, float* a,
+                    lapack_int* lda, float* tau, float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_dgehrd( lapack_int* n, lapack_int* ilo, lapack_int* ihi, double* a,
+                    lapack_int* lda, double* tau, double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_cgehrd( lapack_int* n, lapack_int* ilo, lapack_int* ihi,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* tau, lapack_complex_float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_zgehrd( lapack_int* n, lapack_int* ilo, lapack_int* ihi,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* tau, lapack_complex_double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_sorghr( lapack_int* n, lapack_int* ilo, lapack_int* ihi, float* a,
+                    lapack_int* lda, const float* tau, float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_dorghr( lapack_int* n, lapack_int* ilo, lapack_int* ihi, double* a,
+                    lapack_int* lda, const double* tau, double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_sormhr( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* ilo, lapack_int* ihi, const float* a,
+                    lapack_int* lda, const float* tau, float* c,
+                    lapack_int* ldc, float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_dormhr( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* ilo, lapack_int* ihi, const double* a,
+                    lapack_int* lda, const double* tau, double* c,
+                    lapack_int* ldc, double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_cunghr( lapack_int* n, lapack_int* ilo, lapack_int* ihi,
+                    lapack_complex_float* a, lapack_int* lda,
+                    const lapack_complex_float* tau, lapack_complex_float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_zunghr( lapack_int* n, lapack_int* ilo, lapack_int* ihi,
+                    lapack_complex_double* a, lapack_int* lda,
+                    const lapack_complex_double* tau,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_cunmhr( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* ilo, lapack_int* ihi,
+                    const lapack_complex_float* a, lapack_int* lda,
+                    const lapack_complex_float* tau, lapack_complex_float* c,
+                    lapack_int* ldc, lapack_complex_float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_zunmhr( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* ilo, lapack_int* ihi,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    const lapack_complex_double* tau, lapack_complex_double* c,
+                    lapack_int* ldc, lapack_complex_double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_sgebal( char* job, lapack_int* n, float* a, lapack_int* lda,
+                    lapack_int* ilo, lapack_int* ihi, float* scale,
+                    lapack_int *info );
+void LAPACK_dgebal( char* job, lapack_int* n, double* a, lapack_int* lda,
+                    lapack_int* ilo, lapack_int* ihi, double* scale,
+                    lapack_int *info );
+void LAPACK_cgebal( char* job, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_int* ilo, lapack_int* ihi,
+                    float* scale, lapack_int *info );
+void LAPACK_zgebal( char* job, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_int* ilo, lapack_int* ihi,
+                    double* scale, lapack_int *info );
+void LAPACK_sgebak( char* job, char* side, lapack_int* n, lapack_int* ilo,
+                    lapack_int* ihi, const float* scale, lapack_int* m,
+                    float* v, lapack_int* ldv, lapack_int *info );
+void LAPACK_dgebak( char* job, char* side, lapack_int* n, lapack_int* ilo,
+                    lapack_int* ihi, const double* scale, lapack_int* m,
+                    double* v, lapack_int* ldv, lapack_int *info );
+void LAPACK_cgebak( char* job, char* side, lapack_int* n, lapack_int* ilo,
+                    lapack_int* ihi, const float* scale, lapack_int* m,
+                    lapack_complex_float* v, lapack_int* ldv,
+                    lapack_int *info );
+void LAPACK_zgebak( char* job, char* side, lapack_int* n, lapack_int* ilo,
+                    lapack_int* ihi, const double* scale, lapack_int* m,
+                    lapack_complex_double* v, lapack_int* ldv,
+                    lapack_int *info );
+void LAPACK_shseqr( char* job, char* compz, lapack_int* n, lapack_int* ilo,
+                    lapack_int* ihi, float* h, lapack_int* ldh, float* wr,
+                    float* wi, float* z, lapack_int* ldz, float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_dhseqr( char* job, char* compz, lapack_int* n, lapack_int* ilo,
+                    lapack_int* ihi, double* h, lapack_int* ldh, double* wr,
+                    double* wi, double* z, lapack_int* ldz, double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_chseqr( char* job, char* compz, lapack_int* n, lapack_int* ilo,
+                    lapack_int* ihi, lapack_complex_float* h, lapack_int* ldh,
+                    lapack_complex_float* w, lapack_complex_float* z,
+                    lapack_int* ldz, lapack_complex_float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_zhseqr( char* job, char* compz, lapack_int* n, lapack_int* ilo,
+                    lapack_int* ihi, lapack_complex_double* h, lapack_int* ldh,
+                    lapack_complex_double* w, lapack_complex_double* z,
+                    lapack_int* ldz, lapack_complex_double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_shsein( char* job, char* eigsrc, char* initv,
+                    lapack_logical* select, lapack_int* n, const float* h,
+                    lapack_int* ldh, float* wr, const float* wi, float* vl,
+                    lapack_int* ldvl, float* vr, lapack_int* ldvr,
+                    lapack_int* mm, lapack_int* m, float* work,
+                    lapack_int* ifaill, lapack_int* ifailr, lapack_int *info );
+void LAPACK_dhsein( char* job, char* eigsrc, char* initv,
+                    lapack_logical* select, lapack_int* n, const double* h,
+                    lapack_int* ldh, double* wr, const double* wi, double* vl,
+                    lapack_int* ldvl, double* vr, lapack_int* ldvr,
+                    lapack_int* mm, lapack_int* m, double* work,
+                    lapack_int* ifaill, lapack_int* ifailr, lapack_int *info );
+void LAPACK_chsein( char* job, char* eigsrc, char* initv,
+                    const lapack_logical* select, lapack_int* n,
+                    const lapack_complex_float* h, lapack_int* ldh,
+                    lapack_complex_float* w, lapack_complex_float* vl,
+                    lapack_int* ldvl, lapack_complex_float* vr,
+                    lapack_int* ldvr, lapack_int* mm, lapack_int* m,
+                    lapack_complex_float* work, float* rwork,
+                    lapack_int* ifaill, lapack_int* ifailr, lapack_int *info );
+void LAPACK_zhsein( char* job, char* eigsrc, char* initv,
+                    const lapack_logical* select, lapack_int* n,
+                    const lapack_complex_double* h, lapack_int* ldh,
+                    lapack_complex_double* w, lapack_complex_double* vl,
+                    lapack_int* ldvl, lapack_complex_double* vr,
+                    lapack_int* ldvr, lapack_int* mm, lapack_int* m,
+                    lapack_complex_double* work, double* rwork,
+                    lapack_int* ifaill, lapack_int* ifailr, lapack_int *info );
+void LAPACK_strevc( char* side, char* howmny, lapack_logical* select,
+                    lapack_int* n, const float* t, lapack_int* ldt, float* vl,
+                    lapack_int* ldvl, float* vr, lapack_int* ldvr,
+                    lapack_int* mm, lapack_int* m, float* work,
+                    lapack_int *info );
+void LAPACK_dtrevc( char* side, char* howmny, lapack_logical* select,
+                    lapack_int* n, const double* t, lapack_int* ldt, double* vl,
+                    lapack_int* ldvl, double* vr, lapack_int* ldvr,
+                    lapack_int* mm, lapack_int* m, double* work,
+                    lapack_int *info );
+void LAPACK_ctrevc( char* side, char* howmny, const lapack_logical* select,
+                    lapack_int* n, lapack_complex_float* t, lapack_int* ldt,
+                    lapack_complex_float* vl, lapack_int* ldvl,
+                    lapack_complex_float* vr, lapack_int* ldvr, lapack_int* mm,
+                    lapack_int* m, lapack_complex_float* work, float* rwork,
+                    lapack_int *info );
+void LAPACK_ztrevc( char* side, char* howmny, const lapack_logical* select,
+                    lapack_int* n, lapack_complex_double* t, lapack_int* ldt,
+                    lapack_complex_double* vl, lapack_int* ldvl,
+                    lapack_complex_double* vr, lapack_int* ldvr, lapack_int* mm,
+                    lapack_int* m, lapack_complex_double* work, double* rwork,
+                    lapack_int *info );
+void LAPACK_strsna( char* job, char* howmny, const lapack_logical* select,
+                    lapack_int* n, const float* t, lapack_int* ldt,
+                    const float* vl, lapack_int* ldvl, const float* vr,
+                    lapack_int* ldvr, float* s, float* sep, lapack_int* mm,
+                    lapack_int* m, float* work, lapack_int* ldwork,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_dtrsna( char* job, char* howmny, const lapack_logical* select,
+                    lapack_int* n, const double* t, lapack_int* ldt,
+                    const double* vl, lapack_int* ldvl, const double* vr,
+                    lapack_int* ldvr, double* s, double* sep, lapack_int* mm,
+                    lapack_int* m, double* work, lapack_int* ldwork,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_ctrsna( char* job, char* howmny, const lapack_logical* select,
+                    lapack_int* n, const lapack_complex_float* t,
+                    lapack_int* ldt, const lapack_complex_float* vl,
+                    lapack_int* ldvl, const lapack_complex_float* vr,
+                    lapack_int* ldvr, float* s, float* sep, lapack_int* mm,
+                    lapack_int* m, lapack_complex_float* work,
+                    lapack_int* ldwork, float* rwork, lapack_int *info );
+void LAPACK_ztrsna( char* job, char* howmny, const lapack_logical* select,
+                    lapack_int* n, const lapack_complex_double* t,
+                    lapack_int* ldt, const lapack_complex_double* vl,
+                    lapack_int* ldvl, const lapack_complex_double* vr,
+                    lapack_int* ldvr, double* s, double* sep, lapack_int* mm,
+                    lapack_int* m, lapack_complex_double* work,
+                    lapack_int* ldwork, double* rwork, lapack_int *info );
+void LAPACK_strexc( char* compq, lapack_int* n, float* t, lapack_int* ldt,
+                    float* q, lapack_int* ldq, lapack_int* ifst,
+                    lapack_int* ilst, float* work, lapack_int *info );
+void LAPACK_dtrexc( char* compq, lapack_int* n, double* t, lapack_int* ldt,
+                    double* q, lapack_int* ldq, lapack_int* ifst,
+                    lapack_int* ilst, double* work, lapack_int *info );
+void LAPACK_ctrexc( char* compq, lapack_int* n, lapack_complex_float* t,
+                    lapack_int* ldt, lapack_complex_float* q, lapack_int* ldq,
+                    lapack_int* ifst, lapack_int* ilst, lapack_int *info );
+void LAPACK_ztrexc( char* compq, lapack_int* n, lapack_complex_double* t,
+                    lapack_int* ldt, lapack_complex_double* q, lapack_int* ldq,
+                    lapack_int* ifst, lapack_int* ilst, lapack_int *info );
+void LAPACK_strsen( char* job, char* compq, const lapack_logical* select,
+                    lapack_int* n, float* t, lapack_int* ldt, float* q,
+                    lapack_int* ldq, float* wr, float* wi, lapack_int* m,
+                    float* s, float* sep, float* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_dtrsen( char* job, char* compq, const lapack_logical* select,
+                    lapack_int* n, double* t, lapack_int* ldt, double* q,
+                    lapack_int* ldq, double* wr, double* wi, lapack_int* m,
+                    double* s, double* sep, double* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_ctrsen( char* job, char* compq, const lapack_logical* select,
+                    lapack_int* n, lapack_complex_float* t, lapack_int* ldt,
+                    lapack_complex_float* q, lapack_int* ldq,
+                    lapack_complex_float* w, lapack_int* m, float* s,
+                    float* sep, lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_ztrsen( char* job, char* compq, const lapack_logical* select,
+                    lapack_int* n, lapack_complex_double* t, lapack_int* ldt,
+                    lapack_complex_double* q, lapack_int* ldq,
+                    lapack_complex_double* w, lapack_int* m, double* s,
+                    double* sep, lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_strsyl( char* trana, char* tranb, lapack_int* isgn, lapack_int* m,
+                    lapack_int* n, const float* a, lapack_int* lda,
+                    const float* b, lapack_int* ldb, float* c, lapack_int* ldc,
+                    float* scale, lapack_int *info );
+void LAPACK_dtrsyl( char* trana, char* tranb, lapack_int* isgn, lapack_int* m,
+                    lapack_int* n, const double* a, lapack_int* lda,
+                    const double* b, lapack_int* ldb, double* c,
+                    lapack_int* ldc, double* scale, lapack_int *info );
+void LAPACK_ctrsyl( char* trana, char* tranb, lapack_int* isgn, lapack_int* m,
+                    lapack_int* n, const lapack_complex_float* a,
+                    lapack_int* lda, const lapack_complex_float* b,
+                    lapack_int* ldb, lapack_complex_float* c, lapack_int* ldc,
+                    float* scale, lapack_int *info );
+void LAPACK_ztrsyl( char* trana, char* tranb, lapack_int* isgn, lapack_int* m,
+                    lapack_int* n, const lapack_complex_double* a,
+                    lapack_int* lda, const lapack_complex_double* b,
+                    lapack_int* ldb, lapack_complex_double* c, lapack_int* ldc,
+                    double* scale, lapack_int *info );
+void LAPACK_sgghrd( char* compq, char* compz, lapack_int* n, lapack_int* ilo,
+                    lapack_int* ihi, float* a, lapack_int* lda, float* b,
+                    lapack_int* ldb, float* q, lapack_int* ldq, float* z,
+                    lapack_int* ldz, lapack_int *info );
+void LAPACK_dgghrd( char* compq, char* compz, lapack_int* n, lapack_int* ilo,
+                    lapack_int* ihi, double* a, lapack_int* lda, double* b,
+                    lapack_int* ldb, double* q, lapack_int* ldq, double* z,
+                    lapack_int* ldz, lapack_int *info );
+void LAPACK_cgghrd( char* compq, char* compz, lapack_int* n, lapack_int* ilo,
+                    lapack_int* ihi, lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* q, lapack_int* ldq,
+                    lapack_complex_float* z, lapack_int* ldz,
+                    lapack_int *info );
+void LAPACK_zgghrd( char* compq, char* compz, lapack_int* n, lapack_int* ilo,
+                    lapack_int* ihi, lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* q, lapack_int* ldq,
+                    lapack_complex_double* z, lapack_int* ldz,
+                    lapack_int *info );
+void LAPACK_sggbal( char* job, lapack_int* n, float* a, lapack_int* lda,
+                    float* b, lapack_int* ldb, lapack_int* ilo, lapack_int* ihi,
+                    float* lscale, float* rscale, float* work,
+                    lapack_int *info );
+void LAPACK_dggbal( char* job, lapack_int* n, double* a, lapack_int* lda,
+                    double* b, lapack_int* ldb, lapack_int* ilo,
+                    lapack_int* ihi, double* lscale, double* rscale,
+                    double* work, lapack_int *info );
+void LAPACK_cggbal( char* job, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_complex_float* b, lapack_int* ldb,
+                    lapack_int* ilo, lapack_int* ihi, float* lscale,
+                    float* rscale, float* work, lapack_int *info );
+void LAPACK_zggbal( char* job, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_complex_double* b, lapack_int* ldb,
+                    lapack_int* ilo, lapack_int* ihi, double* lscale,
+                    double* rscale, double* work, lapack_int *info );
+void LAPACK_sggbak( char* job, char* side, lapack_int* n, lapack_int* ilo,
+                    lapack_int* ihi, const float* lscale, const float* rscale,
+                    lapack_int* m, float* v, lapack_int* ldv,
+                    lapack_int *info );
+void LAPACK_dggbak( char* job, char* side, lapack_int* n, lapack_int* ilo,
+                    lapack_int* ihi, const double* lscale, const double* rscale,
+                    lapack_int* m, double* v, lapack_int* ldv,
+                    lapack_int *info );
+void LAPACK_cggbak( char* job, char* side, lapack_int* n, lapack_int* ilo,
+                    lapack_int* ihi, const float* lscale, const float* rscale,
+                    lapack_int* m, lapack_complex_float* v, lapack_int* ldv,
+                    lapack_int *info );
+void LAPACK_zggbak( char* job, char* side, lapack_int* n, lapack_int* ilo,
+                    lapack_int* ihi, const double* lscale, const double* rscale,
+                    lapack_int* m, lapack_complex_double* v, lapack_int* ldv,
+                    lapack_int *info );
+void LAPACK_shgeqz( char* job, char* compq, char* compz, lapack_int* n,
+                    lapack_int* ilo, lapack_int* ihi, float* h, lapack_int* ldh,
+                    float* t, lapack_int* ldt, float* alphar, float* alphai,
+                    float* beta, float* q, lapack_int* ldq, float* z,
+                    lapack_int* ldz, float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_dhgeqz( char* job, char* compq, char* compz, lapack_int* n,
+                    lapack_int* ilo, lapack_int* ihi, double* h,
+                    lapack_int* ldh, double* t, lapack_int* ldt, double* alphar,
+                    double* alphai, double* beta, double* q, lapack_int* ldq,
+                    double* z, lapack_int* ldz, double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_chgeqz( char* job, char* compq, char* compz, lapack_int* n,
+                    lapack_int* ilo, lapack_int* ihi, lapack_complex_float* h,
+                    lapack_int* ldh, lapack_complex_float* t, lapack_int* ldt,
+                    lapack_complex_float* alpha, lapack_complex_float* beta,
+                    lapack_complex_float* q, lapack_int* ldq,
+                    lapack_complex_float* z, lapack_int* ldz,
+                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                    lapack_int *info );
+void LAPACK_zhgeqz( char* job, char* compq, char* compz, lapack_int* n,
+                    lapack_int* ilo, lapack_int* ihi, lapack_complex_double* h,
+                    lapack_int* ldh, lapack_complex_double* t, lapack_int* ldt,
+                    lapack_complex_double* alpha, lapack_complex_double* beta,
+                    lapack_complex_double* q, lapack_int* ldq,
+                    lapack_complex_double* z, lapack_int* ldz,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    double* rwork, lapack_int *info );
+void LAPACK_stgevc( char* side, char* howmny, const lapack_logical* select,
+                    lapack_int* n, const float* s, lapack_int* lds,
+                    const float* p, lapack_int* ldp, float* vl,
+                    lapack_int* ldvl, float* vr, lapack_int* ldvr,
+                    lapack_int* mm, lapack_int* m, float* work,
+                    lapack_int *info );
+void LAPACK_dtgevc( char* side, char* howmny, const lapack_logical* select,
+                    lapack_int* n, const double* s, lapack_int* lds,
+                    const double* p, lapack_int* ldp, double* vl,
+                    lapack_int* ldvl, double* vr, lapack_int* ldvr,
+                    lapack_int* mm, lapack_int* m, double* work,
+                    lapack_int *info );
+void LAPACK_ctgevc( char* side, char* howmny, const lapack_logical* select,
+                    lapack_int* n, const lapack_complex_float* s,
+                    lapack_int* lds, const lapack_complex_float* p,
+                    lapack_int* ldp, lapack_complex_float* vl, lapack_int* ldvl,
+                    lapack_complex_float* vr, lapack_int* ldvr, lapack_int* mm,
+                    lapack_int* m, lapack_complex_float* work, float* rwork,
+                    lapack_int *info );
+void LAPACK_ztgevc( char* side, char* howmny, const lapack_logical* select,
+                    lapack_int* n, const lapack_complex_double* s,
+                    lapack_int* lds, const lapack_complex_double* p,
+                    lapack_int* ldp, lapack_complex_double* vl,
+                    lapack_int* ldvl, lapack_complex_double* vr,
+                    lapack_int* ldvr, lapack_int* mm, lapack_int* m,
+                    lapack_complex_double* work, double* rwork,
+                    lapack_int *info );
+void LAPACK_stgexc( lapack_logical* wantq, lapack_logical* wantz, lapack_int* n,
+                    float* a, lapack_int* lda, float* b, lapack_int* ldb,
+                    float* q, lapack_int* ldq, float* z, lapack_int* ldz,
+                    lapack_int* ifst, lapack_int* ilst, float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_dtgexc( lapack_logical* wantq, lapack_logical* wantz, lapack_int* n,
+                    double* a, lapack_int* lda, double* b, lapack_int* ldb,
+                    double* q, lapack_int* ldq, double* z, lapack_int* ldz,
+                    lapack_int* ifst, lapack_int* ilst, double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_ctgexc( lapack_logical* wantq, lapack_logical* wantz, lapack_int* n,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* q, lapack_int* ldq,
+                    lapack_complex_float* z, lapack_int* ldz, lapack_int* ifst,
+                    lapack_int* ilst, lapack_int *info );
+void LAPACK_ztgexc( lapack_logical* wantq, lapack_logical* wantz, lapack_int* n,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* q, lapack_int* ldq,
+                    lapack_complex_double* z, lapack_int* ldz, lapack_int* ifst,
+                    lapack_int* ilst, lapack_int *info );
+void LAPACK_stgsen( lapack_int* ijob, lapack_logical* wantq,
+                    lapack_logical* wantz, const lapack_logical* select,
+                    lapack_int* n, float* a, lapack_int* lda, float* b,
+                    lapack_int* ldb, float* alphar, float* alphai, float* beta,
+                    float* q, lapack_int* ldq, float* z, lapack_int* ldz,
+                    lapack_int* m, float* pl, float* pr, float* dif,
+                    float* work, lapack_int* lwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_int *info );
+void LAPACK_dtgsen( lapack_int* ijob, lapack_logical* wantq,
+                    lapack_logical* wantz, const lapack_logical* select,
+                    lapack_int* n, double* a, lapack_int* lda, double* b,
+                    lapack_int* ldb, double* alphar, double* alphai,
+                    double* beta, double* q, lapack_int* ldq, double* z,
+                    lapack_int* ldz, lapack_int* m, double* pl, double* pr,
+                    double* dif, double* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_ctgsen( lapack_int* ijob, lapack_logical* wantq,
+                    lapack_logical* wantz, const lapack_logical* select,
+                    lapack_int* n, lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* alpha, lapack_complex_float* beta,
+                    lapack_complex_float* q, lapack_int* ldq,
+                    lapack_complex_float* z, lapack_int* ldz, lapack_int* m,
+                    float* pl, float* pr, float* dif,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_ztgsen( lapack_int* ijob, lapack_logical* wantq,
+                    lapack_logical* wantz, const lapack_logical* select,
+                    lapack_int* n, lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* alpha, lapack_complex_double* beta,
+                    lapack_complex_double* q, lapack_int* ldq,
+                    lapack_complex_double* z, lapack_int* ldz, lapack_int* m,
+                    double* pl, double* pr, double* dif,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_stgsyl( char* trans, lapack_int* ijob, lapack_int* m, lapack_int* n,
+                    const float* a, lapack_int* lda, const float* b,
+                    lapack_int* ldb, float* c, lapack_int* ldc, const float* d,
+                    lapack_int* ldd, const float* e, lapack_int* lde, float* f,
+                    lapack_int* ldf, float* scale, float* dif, float* work,
+                    lapack_int* lwork, lapack_int* iwork, lapack_int *info );
+void LAPACK_dtgsyl( char* trans, lapack_int* ijob, lapack_int* m, lapack_int* n,
+                    const double* a, lapack_int* lda, const double* b,
+                    lapack_int* ldb, double* c, lapack_int* ldc,
+                    const double* d, lapack_int* ldd, const double* e,
+                    lapack_int* lde, double* f, lapack_int* ldf, double* scale,
+                    double* dif, double* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_ctgsyl( char* trans, lapack_int* ijob, lapack_int* m, lapack_int* n,
+                    const lapack_complex_float* a, lapack_int* lda,
+                    const lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* c, lapack_int* ldc,
+                    const lapack_complex_float* d, lapack_int* ldd,
+                    const lapack_complex_float* e, lapack_int* lde,
+                    lapack_complex_float* f, lapack_int* ldf, float* scale,
+                    float* dif, lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_ztgsyl( char* trans, lapack_int* ijob, lapack_int* m, lapack_int* n,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    const lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* c, lapack_int* ldc,
+                    const lapack_complex_double* d, lapack_int* ldd,
+                    const lapack_complex_double* e, lapack_int* lde,
+                    lapack_complex_double* f, lapack_int* ldf, double* scale,
+                    double* dif, lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_stgsna( char* job, char* howmny, const lapack_logical* select,
+                    lapack_int* n, const float* a, lapack_int* lda,
+                    const float* b, lapack_int* ldb, const float* vl,
+                    lapack_int* ldvl, const float* vr, lapack_int* ldvr,
+                    float* s, float* dif, lapack_int* mm, lapack_int* m,
+                    float* work, lapack_int* lwork, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_dtgsna( char* job, char* howmny, const lapack_logical* select,
+                    lapack_int* n, const double* a, lapack_int* lda,
+                    const double* b, lapack_int* ldb, const double* vl,
+                    lapack_int* ldvl, const double* vr, lapack_int* ldvr,
+                    double* s, double* dif, lapack_int* mm, lapack_int* m,
+                    double* work, lapack_int* lwork, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_ctgsna( char* job, char* howmny, const lapack_logical* select,
+                    lapack_int* n, const lapack_complex_float* a,
+                    lapack_int* lda, const lapack_complex_float* b,
+                    lapack_int* ldb, const lapack_complex_float* vl,
+                    lapack_int* ldvl, const lapack_complex_float* vr,
+                    lapack_int* ldvr, float* s, float* dif, lapack_int* mm,
+                    lapack_int* m, lapack_complex_float* work,
+                    lapack_int* lwork, lapack_int* iwork, lapack_int *info );
+void LAPACK_ztgsna( char* job, char* howmny, const lapack_logical* select,
+                    lapack_int* n, const lapack_complex_double* a,
+                    lapack_int* lda, const lapack_complex_double* b,
+                    lapack_int* ldb, const lapack_complex_double* vl,
+                    lapack_int* ldvl, const lapack_complex_double* vr,
+                    lapack_int* ldvr, double* s, double* dif, lapack_int* mm,
+                    lapack_int* m, lapack_complex_double* work,
+                    lapack_int* lwork, lapack_int* iwork, lapack_int *info );
+void LAPACK_sggsvp( char* jobu, char* jobv, char* jobq, lapack_int* m,
+                    lapack_int* p, lapack_int* n, float* a, lapack_int* lda,
+                    float* b, lapack_int* ldb, float* tola, float* tolb,
+                    lapack_int* k, lapack_int* l, float* u, lapack_int* ldu,
+                    float* v, lapack_int* ldv, float* q, lapack_int* ldq,
+                    lapack_int* iwork, float* tau, float* work,
+                    lapack_int *info );
+void LAPACK_dggsvp( char* jobu, char* jobv, char* jobq, lapack_int* m,
+                    lapack_int* p, lapack_int* n, double* a, lapack_int* lda,
+                    double* b, lapack_int* ldb, double* tola, double* tolb,
+                    lapack_int* k, lapack_int* l, double* u, lapack_int* ldu,
+                    double* v, lapack_int* ldv, double* q, lapack_int* ldq,
+                    lapack_int* iwork, double* tau, double* work,
+                    lapack_int *info );
+void LAPACK_cggsvp( char* jobu, char* jobv, char* jobq, lapack_int* m,
+                    lapack_int* p, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_complex_float* b, lapack_int* ldb,
+                    float* tola, float* tolb, lapack_int* k, lapack_int* l,
+                    lapack_complex_float* u, lapack_int* ldu,
+                    lapack_complex_float* v, lapack_int* ldv,
+                    lapack_complex_float* q, lapack_int* ldq, lapack_int* iwork,
+                    float* rwork, lapack_complex_float* tau,
+                    lapack_complex_float* work, lapack_int *info );
+void LAPACK_zggsvp( char* jobu, char* jobv, char* jobq, lapack_int* m,
+                    lapack_int* p, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_complex_double* b, lapack_int* ldb,
+                    double* tola, double* tolb, lapack_int* k, lapack_int* l,
+                    lapack_complex_double* u, lapack_int* ldu,
+                    lapack_complex_double* v, lapack_int* ldv,
+                    lapack_complex_double* q, lapack_int* ldq,
+                    lapack_int* iwork, double* rwork,
+                    lapack_complex_double* tau, lapack_complex_double* work,
+                    lapack_int *info );
+void LAPACK_stgsja( char* jobu, char* jobv, char* jobq, lapack_int* m,
+                    lapack_int* p, lapack_int* n, lapack_int* k, lapack_int* l,
+                    float* a, lapack_int* lda, float* b, lapack_int* ldb,
+                    float* tola, float* tolb, float* alpha, float* beta,
+                    float* u, lapack_int* ldu, float* v, lapack_int* ldv,
+                    float* q, lapack_int* ldq, float* work, lapack_int* ncycle,
+                    lapack_int *info );
+void LAPACK_dtgsja( char* jobu, char* jobv, char* jobq, lapack_int* m,
+                    lapack_int* p, lapack_int* n, lapack_int* k, lapack_int* l,
+                    double* a, lapack_int* lda, double* b, lapack_int* ldb,
+                    double* tola, double* tolb, double* alpha, double* beta,
+                    double* u, lapack_int* ldu, double* v, lapack_int* ldv,
+                    double* q, lapack_int* ldq, double* work,
+                    lapack_int* ncycle, lapack_int *info );
+void LAPACK_ctgsja( char* jobu, char* jobv, char* jobq, lapack_int* m,
+                    lapack_int* p, lapack_int* n, lapack_int* k, lapack_int* l,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* b, lapack_int* ldb, float* tola,
+                    float* tolb, float* alpha, float* beta,
+                    lapack_complex_float* u, lapack_int* ldu,
+                    lapack_complex_float* v, lapack_int* ldv,
+                    lapack_complex_float* q, lapack_int* ldq,
+                    lapack_complex_float* work, lapack_int* ncycle,
+                    lapack_int *info );
+void LAPACK_ztgsja( char* jobu, char* jobv, char* jobq, lapack_int* m,
+                    lapack_int* p, lapack_int* n, lapack_int* k, lapack_int* l,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb, double* tola,
+                    double* tolb, double* alpha, double* beta,
+                    lapack_complex_double* u, lapack_int* ldu,
+                    lapack_complex_double* v, lapack_int* ldv,
+                    lapack_complex_double* q, lapack_int* ldq,
+                    lapack_complex_double* work, lapack_int* ncycle,
+                    lapack_int *info );
+void LAPACK_sgels( char* trans, lapack_int* m, lapack_int* n, lapack_int* nrhs,
+                   float* a, lapack_int* lda, float* b, lapack_int* ldb,
+                   float* work, lapack_int* lwork, lapack_int *info );
+void LAPACK_dgels( char* trans, lapack_int* m, lapack_int* n, lapack_int* nrhs,
+                   double* a, lapack_int* lda, double* b, lapack_int* ldb,
+                   double* work, lapack_int* lwork, lapack_int *info );
+void LAPACK_cgels( char* trans, lapack_int* m, lapack_int* n, lapack_int* nrhs,
+                   lapack_complex_float* a, lapack_int* lda,
+                   lapack_complex_float* b, lapack_int* ldb,
+                   lapack_complex_float* work, lapack_int* lwork,
+                   lapack_int *info );
+void LAPACK_zgels( char* trans, lapack_int* m, lapack_int* n, lapack_int* nrhs,
+                   lapack_complex_double* a, lapack_int* lda,
+                   lapack_complex_double* b, lapack_int* ldb,
+                   lapack_complex_double* work, lapack_int* lwork,
+                   lapack_int *info );
+void LAPACK_sgelsy( lapack_int* m, lapack_int* n, lapack_int* nrhs, float* a,
+                    lapack_int* lda, float* b, lapack_int* ldb,
+                    lapack_int* jpvt, float* rcond, lapack_int* rank,
+                    float* work, lapack_int* lwork, lapack_int *info );
+void LAPACK_dgelsy( lapack_int* m, lapack_int* n, lapack_int* nrhs, double* a,
+                    lapack_int* lda, double* b, lapack_int* ldb,
+                    lapack_int* jpvt, double* rcond, lapack_int* rank,
+                    double* work, lapack_int* lwork, lapack_int *info );
+void LAPACK_cgelsy( lapack_int* m, lapack_int* n, lapack_int* nrhs,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* b, lapack_int* ldb, lapack_int* jpvt,
+                    float* rcond, lapack_int* rank, lapack_complex_float* work,
+                    lapack_int* lwork, float* rwork, lapack_int *info );
+void LAPACK_zgelsy( lapack_int* m, lapack_int* n, lapack_int* nrhs,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb, lapack_int* jpvt,
+                    double* rcond, lapack_int* rank,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    double* rwork, lapack_int *info );
+void LAPACK_sgelss( lapack_int* m, lapack_int* n, lapack_int* nrhs, float* a,
+                    lapack_int* lda, float* b, lapack_int* ldb, float* s,
+                    float* rcond, lapack_int* rank, float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_dgelss( lapack_int* m, lapack_int* n, lapack_int* nrhs, double* a,
+                    lapack_int* lda, double* b, lapack_int* ldb, double* s,
+                    double* rcond, lapack_int* rank, double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_cgelss( lapack_int* m, lapack_int* n, lapack_int* nrhs,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* b, lapack_int* ldb, float* s,
+                    float* rcond, lapack_int* rank, lapack_complex_float* work,
+                    lapack_int* lwork, float* rwork, lapack_int *info );
+void LAPACK_zgelss( lapack_int* m, lapack_int* n, lapack_int* nrhs,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb, double* s,
+                    double* rcond, lapack_int* rank,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    double* rwork, lapack_int *info );
+void LAPACK_sgelsd( lapack_int* m, lapack_int* n, lapack_int* nrhs, float* a,
+                    lapack_int* lda, float* b, lapack_int* ldb, float* s,
+                    float* rcond, lapack_int* rank, float* work,
+                    lapack_int* lwork, lapack_int* iwork, lapack_int *info );
+void LAPACK_dgelsd( lapack_int* m, lapack_int* n, lapack_int* nrhs, double* a,
+                    lapack_int* lda, double* b, lapack_int* ldb, double* s,
+                    double* rcond, lapack_int* rank, double* work,
+                    lapack_int* lwork, lapack_int* iwork, lapack_int *info );
+void LAPACK_cgelsd( lapack_int* m, lapack_int* n, lapack_int* nrhs,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* b, lapack_int* ldb, float* s,
+                    float* rcond, lapack_int* rank, lapack_complex_float* work,
+                    lapack_int* lwork, float* rwork, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_zgelsd( lapack_int* m, lapack_int* n, lapack_int* nrhs,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb, double* s,
+                    double* rcond, lapack_int* rank,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    double* rwork, lapack_int* iwork, lapack_int *info );
+void LAPACK_sgglse( lapack_int* m, lapack_int* n, lapack_int* p, float* a,
+                    lapack_int* lda, float* b, lapack_int* ldb, float* c,
+                    float* d, float* x, float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_dgglse( lapack_int* m, lapack_int* n, lapack_int* p, double* a,
+                    lapack_int* lda, double* b, lapack_int* ldb, double* c,
+                    double* d, double* x, double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_cgglse( lapack_int* m, lapack_int* n, lapack_int* p,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* c, lapack_complex_float* d,
+                    lapack_complex_float* x, lapack_complex_float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_zgglse( lapack_int* m, lapack_int* n, lapack_int* p,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* c, lapack_complex_double* d,
+                    lapack_complex_double* x, lapack_complex_double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_sggglm( lapack_int* n, lapack_int* m, lapack_int* p, float* a,
+                    lapack_int* lda, float* b, lapack_int* ldb, float* d,
+                    float* x, float* y, float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_dggglm( lapack_int* n, lapack_int* m, lapack_int* p, double* a,
+                    lapack_int* lda, double* b, lapack_int* ldb, double* d,
+                    double* x, double* y, double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_cggglm( lapack_int* n, lapack_int* m, lapack_int* p,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* d, lapack_complex_float* x,
+                    lapack_complex_float* y, lapack_complex_float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_zggglm( lapack_int* n, lapack_int* m, lapack_int* p,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* d, lapack_complex_double* x,
+                    lapack_complex_double* y, lapack_complex_double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_ssyev( char* jobz, char* uplo, lapack_int* n, float* a,
+                   lapack_int* lda, float* w, float* work, lapack_int* lwork,
+                   lapack_int *info );
+void LAPACK_dsyev( char* jobz, char* uplo, lapack_int* n, double* a,
+                   lapack_int* lda, double* w, double* work, lapack_int* lwork,
+                   lapack_int *info );
+void LAPACK_cheev( char* jobz, char* uplo, lapack_int* n,
+                   lapack_complex_float* a, lapack_int* lda, float* w,
+                   lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                   lapack_int *info );
+void LAPACK_zheev( char* jobz, char* uplo, lapack_int* n,
+                   lapack_complex_double* a, lapack_int* lda, double* w,
+                   lapack_complex_double* work, lapack_int* lwork,
+                   double* rwork, lapack_int *info );
+void LAPACK_ssyevd( char* jobz, char* uplo, lapack_int* n, float* a,
+                    lapack_int* lda, float* w, float* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_dsyevd( char* jobz, char* uplo, lapack_int* n, double* a,
+                    lapack_int* lda, double* w, double* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_cheevd( char* jobz, char* uplo, lapack_int* n,
+                    lapack_complex_float* a, lapack_int* lda, float* w,
+                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                    lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork,
+                    lapack_int *info );
+void LAPACK_zheevd( char* jobz, char* uplo, lapack_int* n,
+                    lapack_complex_double* a, lapack_int* lda, double* w,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    double* rwork, lapack_int* lrwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_int *info );
+void LAPACK_ssyevx( char* jobz, char* range, char* uplo, lapack_int* n,
+                    float* a, lapack_int* lda, float* vl, float* vu,
+                    lapack_int* il, lapack_int* iu, float* abstol,
+                    lapack_int* m, float* w, float* z, lapack_int* ldz,
+                    float* work, lapack_int* lwork, lapack_int* iwork,
+                    lapack_int* ifail, lapack_int *info );
+void LAPACK_dsyevx( char* jobz, char* range, char* uplo, lapack_int* n,
+                    double* a, lapack_int* lda, double* vl, double* vu,
+                    lapack_int* il, lapack_int* iu, double* abstol,
+                    lapack_int* m, double* w, double* z, lapack_int* ldz,
+                    double* work, lapack_int* lwork, lapack_int* iwork,
+                    lapack_int* ifail, lapack_int *info );
+void LAPACK_cheevx( char* jobz, char* range, char* uplo, lapack_int* n,
+                    lapack_complex_float* a, lapack_int* lda, float* vl,
+                    float* vu, lapack_int* il, lapack_int* iu, float* abstol,
+                    lapack_int* m, float* w, lapack_complex_float* z,
+                    lapack_int* ldz, lapack_complex_float* work,
+                    lapack_int* lwork, float* rwork, lapack_int* iwork,
+                    lapack_int* ifail, lapack_int *info );
+void LAPACK_zheevx( char* jobz, char* range, char* uplo, lapack_int* n,
+                    lapack_complex_double* a, lapack_int* lda, double* vl,
+                    double* vu, lapack_int* il, lapack_int* iu, double* abstol,
+                    lapack_int* m, double* w, lapack_complex_double* z,
+                    lapack_int* ldz, lapack_complex_double* work,
+                    lapack_int* lwork, double* rwork, lapack_int* iwork,
+                    lapack_int* ifail, lapack_int *info );
+void LAPACK_ssyevr( char* jobz, char* range, char* uplo, lapack_int* n,
+                    float* a, lapack_int* lda, float* vl, float* vu,
+                    lapack_int* il, lapack_int* iu, float* abstol,
+                    lapack_int* m, float* w, float* z, lapack_int* ldz,
+                    lapack_int* isuppz, float* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_dsyevr( char* jobz, char* range, char* uplo, lapack_int* n,
+                    double* a, lapack_int* lda, double* vl, double* vu,
+                    lapack_int* il, lapack_int* iu, double* abstol,
+                    lapack_int* m, double* w, double* z, lapack_int* ldz,
+                    lapack_int* isuppz, double* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_cheevr( char* jobz, char* range, char* uplo, lapack_int* n,
+                    lapack_complex_float* a, lapack_int* lda, float* vl,
+                    float* vu, lapack_int* il, lapack_int* iu, float* abstol,
+                    lapack_int* m, float* w, lapack_complex_float* z,
+                    lapack_int* ldz, lapack_int* isuppz,
+                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                    lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork,
+                    lapack_int *info );
+void LAPACK_zheevr( char* jobz, char* range, char* uplo, lapack_int* n,
+                    lapack_complex_double* a, lapack_int* lda, double* vl,
+                    double* vu, lapack_int* il, lapack_int* iu, double* abstol,
+                    lapack_int* m, double* w, lapack_complex_double* z,
+                    lapack_int* ldz, lapack_int* isuppz,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    double* rwork, lapack_int* lrwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_int *info );
+void LAPACK_sspev( char* jobz, char* uplo, lapack_int* n, float* ap, float* w,
+                   float* z, lapack_int* ldz, float* work, lapack_int *info );
+void LAPACK_dspev( char* jobz, char* uplo, lapack_int* n, double* ap, double* w,
+                   double* z, lapack_int* ldz, double* work, lapack_int *info );
+void LAPACK_chpev( char* jobz, char* uplo, lapack_int* n,
+                   lapack_complex_float* ap, float* w, lapack_complex_float* z,
+                   lapack_int* ldz, lapack_complex_float* work, float* rwork,
+                   lapack_int *info );
+void LAPACK_zhpev( char* jobz, char* uplo, lapack_int* n,
+                   lapack_complex_double* ap, double* w,
+                   lapack_complex_double* z, lapack_int* ldz,
+                   lapack_complex_double* work, double* rwork,
+                   lapack_int *info );
+void LAPACK_sspevd( char* jobz, char* uplo, lapack_int* n, float* ap, float* w,
+                    float* z, lapack_int* ldz, float* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_dspevd( char* jobz, char* uplo, lapack_int* n, double* ap,
+                    double* w, double* z, lapack_int* ldz, double* work,
+                    lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
+                    lapack_int *info );
+void LAPACK_chpevd( char* jobz, char* uplo, lapack_int* n,
+                    lapack_complex_float* ap, float* w, lapack_complex_float* z,
+                    lapack_int* ldz, lapack_complex_float* work,
+                    lapack_int* lwork, float* rwork, lapack_int* lrwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_zhpevd( char* jobz, char* uplo, lapack_int* n,
+                    lapack_complex_double* ap, double* w,
+                    lapack_complex_double* z, lapack_int* ldz,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    double* rwork, lapack_int* lrwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_int *info );
+void LAPACK_sspevx( char* jobz, char* range, char* uplo, lapack_int* n,
+                    float* ap, float* vl, float* vu, lapack_int* il,
+                    lapack_int* iu, float* abstol, lapack_int* m, float* w,
+                    float* z, lapack_int* ldz, float* work, lapack_int* iwork,
+                    lapack_int* ifail, lapack_int *info );
+void LAPACK_dspevx( char* jobz, char* range, char* uplo, lapack_int* n,
+                    double* ap, double* vl, double* vu, lapack_int* il,
+                    lapack_int* iu, double* abstol, lapack_int* m, double* w,
+                    double* z, lapack_int* ldz, double* work, lapack_int* iwork,
+                    lapack_int* ifail, lapack_int *info );
+void LAPACK_chpevx( char* jobz, char* range, char* uplo, lapack_int* n,
+                    lapack_complex_float* ap, float* vl, float* vu,
+                    lapack_int* il, lapack_int* iu, float* abstol,
+                    lapack_int* m, float* w, lapack_complex_float* z,
+                    lapack_int* ldz, lapack_complex_float* work, float* rwork,
+                    lapack_int* iwork, lapack_int* ifail, lapack_int *info );
+void LAPACK_zhpevx( char* jobz, char* range, char* uplo, lapack_int* n,
+                    lapack_complex_double* ap, double* vl, double* vu,
+                    lapack_int* il, lapack_int* iu, double* abstol,
+                    lapack_int* m, double* w, lapack_complex_double* z,
+                    lapack_int* ldz, lapack_complex_double* work, double* rwork,
+                    lapack_int* iwork, lapack_int* ifail, lapack_int *info );
+void LAPACK_ssbev( char* jobz, char* uplo, lapack_int* n, lapack_int* kd,
+                   float* ab, lapack_int* ldab, float* w, float* z,
+                   lapack_int* ldz, float* work, lapack_int *info );
+void LAPACK_dsbev( char* jobz, char* uplo, lapack_int* n, lapack_int* kd,
+                   double* ab, lapack_int* ldab, double* w, double* z,
+                   lapack_int* ldz, double* work, lapack_int *info );
+void LAPACK_chbev( char* jobz, char* uplo, lapack_int* n, lapack_int* kd,
+                   lapack_complex_float* ab, lapack_int* ldab, float* w,
+                   lapack_complex_float* z, lapack_int* ldz,
+                   lapack_complex_float* work, float* rwork, lapack_int *info );
+void LAPACK_zhbev( char* jobz, char* uplo, lapack_int* n, lapack_int* kd,
+                   lapack_complex_double* ab, lapack_int* ldab, double* w,
+                   lapack_complex_double* z, lapack_int* ldz,
+                   lapack_complex_double* work, double* rwork,
+                   lapack_int *info );
+void LAPACK_ssbevd( char* jobz, char* uplo, lapack_int* n, lapack_int* kd,
+                    float* ab, lapack_int* ldab, float* w, float* z,
+                    lapack_int* ldz, float* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_dsbevd( char* jobz, char* uplo, lapack_int* n, lapack_int* kd,
+                    double* ab, lapack_int* ldab, double* w, double* z,
+                    lapack_int* ldz, double* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_chbevd( char* jobz, char* uplo, lapack_int* n, lapack_int* kd,
+                    lapack_complex_float* ab, lapack_int* ldab, float* w,
+                    lapack_complex_float* z, lapack_int* ldz,
+                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                    lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork,
+                    lapack_int *info );
+void LAPACK_zhbevd( char* jobz, char* uplo, lapack_int* n, lapack_int* kd,
+                    lapack_complex_double* ab, lapack_int* ldab, double* w,
+                    lapack_complex_double* z, lapack_int* ldz,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    double* rwork, lapack_int* lrwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_int *info );
+void LAPACK_ssbevx( char* jobz, char* range, char* uplo, lapack_int* n,
+                    lapack_int* kd, float* ab, lapack_int* ldab, float* q,
+                    lapack_int* ldq, float* vl, float* vu, lapack_int* il,
+                    lapack_int* iu, float* abstol, lapack_int* m, float* w,
+                    float* z, lapack_int* ldz, float* work, lapack_int* iwork,
+                    lapack_int* ifail, lapack_int *info );
+void LAPACK_dsbevx( char* jobz, char* range, char* uplo, lapack_int* n,
+                    lapack_int* kd, double* ab, lapack_int* ldab, double* q,
+                    lapack_int* ldq, double* vl, double* vu, lapack_int* il,
+                    lapack_int* iu, double* abstol, lapack_int* m, double* w,
+                    double* z, lapack_int* ldz, double* work, lapack_int* iwork,
+                    lapack_int* ifail, lapack_int *info );
+void LAPACK_chbevx( char* jobz, char* range, char* uplo, lapack_int* n,
+                    lapack_int* kd, lapack_complex_float* ab, lapack_int* ldab,
+                    lapack_complex_float* q, lapack_int* ldq, float* vl,
+                    float* vu, lapack_int* il, lapack_int* iu, float* abstol,
+                    lapack_int* m, float* w, lapack_complex_float* z,
+                    lapack_int* ldz, lapack_complex_float* work, float* rwork,
+                    lapack_int* iwork, lapack_int* ifail, lapack_int *info );
+void LAPACK_zhbevx( char* jobz, char* range, char* uplo, lapack_int* n,
+                    lapack_int* kd, lapack_complex_double* ab, lapack_int* ldab,
+                    lapack_complex_double* q, lapack_int* ldq, double* vl,
+                    double* vu, lapack_int* il, lapack_int* iu, double* abstol,
+                    lapack_int* m, double* w, lapack_complex_double* z,
+                    lapack_int* ldz, lapack_complex_double* work, double* rwork,
+                    lapack_int* iwork, lapack_int* ifail, lapack_int *info );
+void LAPACK_sstev( char* jobz, lapack_int* n, float* d, float* e, float* z,
+                   lapack_int* ldz, float* work, lapack_int *info );
+void LAPACK_dstev( char* jobz, lapack_int* n, double* d, double* e, double* z,
+                   lapack_int* ldz, double* work, lapack_int *info );
+void LAPACK_sstevd( char* jobz, lapack_int* n, float* d, float* e, float* z,
+                    lapack_int* ldz, float* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_dstevd( char* jobz, lapack_int* n, double* d, double* e, double* z,
+                    lapack_int* ldz, double* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_sstevx( char* jobz, char* range, lapack_int* n, float* d, float* e,
+                    float* vl, float* vu, lapack_int* il, lapack_int* iu,
+                    float* abstol, lapack_int* m, float* w, float* z,
+                    lapack_int* ldz, float* work, lapack_int* iwork,
+                    lapack_int* ifail, lapack_int *info );
+void LAPACK_dstevx( char* jobz, char* range, lapack_int* n, double* d,
+                    double* e, double* vl, double* vu, lapack_int* il,
+                    lapack_int* iu, double* abstol, lapack_int* m, double* w,
+                    double* z, lapack_int* ldz, double* work, lapack_int* iwork,
+                    lapack_int* ifail, lapack_int *info );
+void LAPACK_sstevr( char* jobz, char* range, lapack_int* n, float* d, float* e,
+                    float* vl, float* vu, lapack_int* il, lapack_int* iu,
+                    float* abstol, lapack_int* m, float* w, float* z,
+                    lapack_int* ldz, lapack_int* isuppz, float* work,
+                    lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
+                    lapack_int *info );
+void LAPACK_dstevr( char* jobz, char* range, lapack_int* n, double* d,
+                    double* e, double* vl, double* vu, lapack_int* il,
+                    lapack_int* iu, double* abstol, lapack_int* m, double* w,
+                    double* z, lapack_int* ldz, lapack_int* isuppz,
+                    double* work, lapack_int* lwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_int *info );
+void LAPACK_sgees( char* jobvs, char* sort, LAPACK_S_SELECT2 select,
+                   lapack_int* n, float* a, lapack_int* lda, lapack_int* sdim,
+                   float* wr, float* wi, float* vs, lapack_int* ldvs,
+                   float* work, lapack_int* lwork, lapack_logical* bwork,
+                   lapack_int *info );
+void LAPACK_dgees( char* jobvs, char* sort, LAPACK_D_SELECT2 select,
+                   lapack_int* n, double* a, lapack_int* lda, lapack_int* sdim,
+                   double* wr, double* wi, double* vs, lapack_int* ldvs,
+                   double* work, lapack_int* lwork, lapack_logical* bwork,
+                   lapack_int *info );
+void LAPACK_cgees( char* jobvs, char* sort, LAPACK_C_SELECT1 select,
+                   lapack_int* n, lapack_complex_float* a, lapack_int* lda,
+                   lapack_int* sdim, lapack_complex_float* w,
+                   lapack_complex_float* vs, lapack_int* ldvs,
+                   lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                   lapack_logical* bwork, lapack_int *info );
+void LAPACK_zgees( char* jobvs, char* sort, LAPACK_Z_SELECT1 select,
+                   lapack_int* n, lapack_complex_double* a, lapack_int* lda,
+                   lapack_int* sdim, lapack_complex_double* w,
+                   lapack_complex_double* vs, lapack_int* ldvs,
+                   lapack_complex_double* work, lapack_int* lwork,
+                   double* rwork, lapack_logical* bwork, lapack_int *info );
+void LAPACK_sgeesx( char* jobvs, char* sort, LAPACK_S_SELECT2 select,
+                    char* sense, lapack_int* n, float* a, lapack_int* lda,
+                    lapack_int* sdim, float* wr, float* wi, float* vs,
+                    lapack_int* ldvs, float* rconde, float* rcondv, float* work,
+                    lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
+                    lapack_logical* bwork, lapack_int *info );
+void LAPACK_dgeesx( char* jobvs, char* sort, LAPACK_D_SELECT2 select,
+                    char* sense, lapack_int* n, double* a, lapack_int* lda,
+                    lapack_int* sdim, double* wr, double* wi, double* vs,
+                    lapack_int* ldvs, double* rconde, double* rcondv,
+                    double* work, lapack_int* lwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_logical* bwork,
+                    lapack_int *info );
+void LAPACK_cgeesx( char* jobvs, char* sort, LAPACK_C_SELECT1 select,
+                    char* sense, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_int* sdim, lapack_complex_float* w,
+                    lapack_complex_float* vs, lapack_int* ldvs, float* rconde,
+                    float* rcondv, lapack_complex_float* work,
+                    lapack_int* lwork, float* rwork, lapack_logical* bwork,
+                    lapack_int *info );
+void LAPACK_zgeesx( char* jobvs, char* sort, LAPACK_Z_SELECT1 select,
+                    char* sense, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_int* sdim, lapack_complex_double* w,
+                    lapack_complex_double* vs, lapack_int* ldvs, double* rconde,
+                    double* rcondv, lapack_complex_double* work,
+                    lapack_int* lwork, double* rwork, lapack_logical* bwork,
+                    lapack_int *info );
+void LAPACK_sgeev( char* jobvl, char* jobvr, lapack_int* n, float* a,
+                   lapack_int* lda, float* wr, float* wi, float* vl,
+                   lapack_int* ldvl, float* vr, lapack_int* ldvr, float* work,
+                   lapack_int* lwork, lapack_int *info );
+void LAPACK_dgeev( char* jobvl, char* jobvr, lapack_int* n, double* a,
+                   lapack_int* lda, double* wr, double* wi, double* vl,
+                   lapack_int* ldvl, double* vr, lapack_int* ldvr, double* work,
+                   lapack_int* lwork, lapack_int *info );
+void LAPACK_cgeev( char* jobvl, char* jobvr, lapack_int* n,
+                   lapack_complex_float* a, lapack_int* lda,
+                   lapack_complex_float* w, lapack_complex_float* vl,
+                   lapack_int* ldvl, lapack_complex_float* vr, lapack_int* ldvr,
+                   lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                   lapack_int *info );
+void LAPACK_zgeev( char* jobvl, char* jobvr, lapack_int* n,
+                   lapack_complex_double* a, lapack_int* lda,
+                   lapack_complex_double* w, lapack_complex_double* vl,
+                   lapack_int* ldvl, lapack_complex_double* vr,
+                   lapack_int* ldvr, lapack_complex_double* work,
+                   lapack_int* lwork, double* rwork, lapack_int *info );
+void LAPACK_sgeevx( char* balanc, char* jobvl, char* jobvr, char* sense,
+                    lapack_int* n, float* a, lapack_int* lda, float* wr,
+                    float* wi, float* vl, lapack_int* ldvl, float* vr,
+                    lapack_int* ldvr, lapack_int* ilo, lapack_int* ihi,
+                    float* scale, float* abnrm, float* rconde, float* rcondv,
+                    float* work, lapack_int* lwork, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_dgeevx( char* balanc, char* jobvl, char* jobvr, char* sense,
+                    lapack_int* n, double* a, lapack_int* lda, double* wr,
+                    double* wi, double* vl, lapack_int* ldvl, double* vr,
+                    lapack_int* ldvr, lapack_int* ilo, lapack_int* ihi,
+                    double* scale, double* abnrm, double* rconde,
+                    double* rcondv, double* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_cgeevx( char* balanc, char* jobvl, char* jobvr, char* sense,
+                    lapack_int* n, lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* w, lapack_complex_float* vl,
+                    lapack_int* ldvl, lapack_complex_float* vr,
+                    lapack_int* ldvr, lapack_int* ilo, lapack_int* ihi,
+                    float* scale, float* abnrm, float* rconde, float* rcondv,
+                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                    lapack_int *info );
+void LAPACK_zgeevx( char* balanc, char* jobvl, char* jobvr, char* sense,
+                    lapack_int* n, lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* w, lapack_complex_double* vl,
+                    lapack_int* ldvl, lapack_complex_double* vr,
+                    lapack_int* ldvr, lapack_int* ilo, lapack_int* ihi,
+                    double* scale, double* abnrm, double* rconde,
+                    double* rcondv, lapack_complex_double* work,
+                    lapack_int* lwork, double* rwork, lapack_int *info );
+void LAPACK_sgesvd( char* jobu, char* jobvt, lapack_int* m, lapack_int* n,
+                    float* a, lapack_int* lda, float* s, float* u,
+                    lapack_int* ldu, float* vt, lapack_int* ldvt, float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_dgesvd( char* jobu, char* jobvt, lapack_int* m, lapack_int* n,
+                    double* a, lapack_int* lda, double* s, double* u,
+                    lapack_int* ldu, double* vt, lapack_int* ldvt, double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_cgesvd( char* jobu, char* jobvt, lapack_int* m, lapack_int* n,
+                    lapack_complex_float* a, lapack_int* lda, float* s,
+                    lapack_complex_float* u, lapack_int* ldu,
+                    lapack_complex_float* vt, lapack_int* ldvt,
+                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                    lapack_int *info );
+void LAPACK_zgesvd( char* jobu, char* jobvt, lapack_int* m, lapack_int* n,
+                    lapack_complex_double* a, lapack_int* lda, double* s,
+                    lapack_complex_double* u, lapack_int* ldu,
+                    lapack_complex_double* vt, lapack_int* ldvt,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    double* rwork, lapack_int *info );
+void LAPACK_sgesdd( char* jobz, lapack_int* m, lapack_int* n, float* a,
+                    lapack_int* lda, float* s, float* u, lapack_int* ldu,
+                    float* vt, lapack_int* ldvt, float* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_dgesdd( char* jobz, lapack_int* m, lapack_int* n, double* a,
+                    lapack_int* lda, double* s, double* u, lapack_int* ldu,
+                    double* vt, lapack_int* ldvt, double* work,
+                    lapack_int* lwork, lapack_int* iwork, lapack_int *info );
+void LAPACK_cgesdd( char* jobz, lapack_int* m, lapack_int* n,
+                    lapack_complex_float* a, lapack_int* lda, float* s,
+                    lapack_complex_float* u, lapack_int* ldu,
+                    lapack_complex_float* vt, lapack_int* ldvt,
+                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_zgesdd( char* jobz, lapack_int* m, lapack_int* n,
+                    lapack_complex_double* a, lapack_int* lda, double* s,
+                    lapack_complex_double* u, lapack_int* ldu,
+                    lapack_complex_double* vt, lapack_int* ldvt,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    double* rwork, lapack_int* iwork, lapack_int *info );
+void LAPACK_dgejsv( char* joba, char* jobu, char* jobv, char* jobr, char* jobt,
+                    char* jobp, lapack_int* m, lapack_int* n, double* a,
+                    lapack_int* lda, double* sva, double* u, lapack_int* ldu,
+                    double* v, lapack_int* ldv, double* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_sgejsv( char* joba, char* jobu, char* jobv, char* jobr, char* jobt,
+                    char* jobp, lapack_int* m, lapack_int* n, float* a,
+                    lapack_int* lda, float* sva, float* u, lapack_int* ldu,
+                    float* v, lapack_int* ldv, float* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_dgesvj( char* joba, char* jobu, char* jobv, lapack_int* m,
+                    lapack_int* n, double* a, lapack_int* lda, double* sva,
+                    lapack_int* mv, double* v, lapack_int* ldv, double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_sgesvj( char* joba, char* jobu, char* jobv, lapack_int* m,
+                    lapack_int* n, float* a, lapack_int* lda, float* sva,
+                    lapack_int* mv, float* v, lapack_int* ldv, float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_sggsvd( char* jobu, char* jobv, char* jobq, lapack_int* m,
+                    lapack_int* n, lapack_int* p, lapack_int* k, lapack_int* l,
+                    float* a, lapack_int* lda, float* b, lapack_int* ldb,
+                    float* alpha, float* beta, float* u, lapack_int* ldu,
+                    float* v, lapack_int* ldv, float* q, lapack_int* ldq,
+                    float* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_dggsvd( char* jobu, char* jobv, char* jobq, lapack_int* m,
+                    lapack_int* n, lapack_int* p, lapack_int* k, lapack_int* l,
+                    double* a, lapack_int* lda, double* b, lapack_int* ldb,
+                    double* alpha, double* beta, double* u, lapack_int* ldu,
+                    double* v, lapack_int* ldv, double* q, lapack_int* ldq,
+                    double* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_cggsvd( char* jobu, char* jobv, char* jobq, lapack_int* m,
+                    lapack_int* n, lapack_int* p, lapack_int* k, lapack_int* l,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* b, lapack_int* ldb, float* alpha,
+                    float* beta, lapack_complex_float* u, lapack_int* ldu,
+                    lapack_complex_float* v, lapack_int* ldv,
+                    lapack_complex_float* q, lapack_int* ldq,
+                    lapack_complex_float* work, float* rwork, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_zggsvd( char* jobu, char* jobv, char* jobq, lapack_int* m,
+                    lapack_int* n, lapack_int* p, lapack_int* k, lapack_int* l,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb, double* alpha,
+                    double* beta, lapack_complex_double* u, lapack_int* ldu,
+                    lapack_complex_double* v, lapack_int* ldv,
+                    lapack_complex_double* q, lapack_int* ldq,
+                    lapack_complex_double* work, double* rwork,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_ssygv( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
+                   float* a, lapack_int* lda, float* b, lapack_int* ldb,
+                   float* w, float* work, lapack_int* lwork, lapack_int *info );
+void LAPACK_dsygv( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
+                   double* a, lapack_int* lda, double* b, lapack_int* ldb,
+                   double* w, double* work, lapack_int* lwork,
+                   lapack_int *info );
+void LAPACK_chegv( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
+                   lapack_complex_float* a, lapack_int* lda,
+                   lapack_complex_float* b, lapack_int* ldb, float* w,
+                   lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                   lapack_int *info );
+void LAPACK_zhegv( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
+                   lapack_complex_double* a, lapack_int* lda,
+                   lapack_complex_double* b, lapack_int* ldb, double* w,
+                   lapack_complex_double* work, lapack_int* lwork,
+                   double* rwork, lapack_int *info );
+void LAPACK_ssygvd( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
+                    float* a, lapack_int* lda, float* b, lapack_int* ldb,
+                    float* w, float* work, lapack_int* lwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_int *info );
+void LAPACK_dsygvd( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
+                    double* a, lapack_int* lda, double* b, lapack_int* ldb,
+                    double* w, double* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_chegvd( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* b, lapack_int* ldb, float* w,
+                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                    lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork,
+                    lapack_int *info );
+void LAPACK_zhegvd( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb, double* w,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    double* rwork, lapack_int* lrwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_int *info );
+void LAPACK_ssygvx( lapack_int* itype, char* jobz, char* range, char* uplo,
+                    lapack_int* n, float* a, lapack_int* lda, float* b,
+                    lapack_int* ldb, float* vl, float* vu, lapack_int* il,
+                    lapack_int* iu, float* abstol, lapack_int* m, float* w,
+                    float* z, lapack_int* ldz, float* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* ifail, lapack_int *info );
+void LAPACK_dsygvx( lapack_int* itype, char* jobz, char* range, char* uplo,
+                    lapack_int* n, double* a, lapack_int* lda, double* b,
+                    lapack_int* ldb, double* vl, double* vu, lapack_int* il,
+                    lapack_int* iu, double* abstol, lapack_int* m, double* w,
+                    double* z, lapack_int* ldz, double* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* ifail, lapack_int *info );
+void LAPACK_chegvx( lapack_int* itype, char* jobz, char* range, char* uplo,
+                    lapack_int* n, lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* b, lapack_int* ldb, float* vl,
+                    float* vu, lapack_int* il, lapack_int* iu, float* abstol,
+                    lapack_int* m, float* w, lapack_complex_float* z,
+                    lapack_int* ldz, lapack_complex_float* work,
+                    lapack_int* lwork, float* rwork, lapack_int* iwork,
+                    lapack_int* ifail, lapack_int *info );
+void LAPACK_zhegvx( lapack_int* itype, char* jobz, char* range, char* uplo,
+                    lapack_int* n, lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb, double* vl,
+                    double* vu, lapack_int* il, lapack_int* iu, double* abstol,
+                    lapack_int* m, double* w, lapack_complex_double* z,
+                    lapack_int* ldz, lapack_complex_double* work,
+                    lapack_int* lwork, double* rwork, lapack_int* iwork,
+                    lapack_int* ifail, lapack_int *info );
+void LAPACK_sspgv( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
+                   float* ap, float* bp, float* w, float* z, lapack_int* ldz,
+                   float* work, lapack_int *info );
+void LAPACK_dspgv( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
+                   double* ap, double* bp, double* w, double* z,
+                   lapack_int* ldz, double* work, lapack_int *info );
+void LAPACK_chpgv( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
+                   lapack_complex_float* ap, lapack_complex_float* bp, float* w,
+                   lapack_complex_float* z, lapack_int* ldz,
+                   lapack_complex_float* work, float* rwork, lapack_int *info );
+void LAPACK_zhpgv( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
+                   lapack_complex_double* ap, lapack_complex_double* bp,
+                   double* w, lapack_complex_double* z, lapack_int* ldz,
+                   lapack_complex_double* work, double* rwork,
+                   lapack_int *info );
+void LAPACK_sspgvd( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
+                    float* ap, float* bp, float* w, float* z, lapack_int* ldz,
+                    float* work, lapack_int* lwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_int *info );
+void LAPACK_dspgvd( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
+                    double* ap, double* bp, double* w, double* z,
+                    lapack_int* ldz, double* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_chpgvd( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
+                    lapack_complex_float* ap, lapack_complex_float* bp,
+                    float* w, lapack_complex_float* z, lapack_int* ldz,
+                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                    lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork,
+                    lapack_int *info );
+void LAPACK_zhpgvd( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
+                    lapack_complex_double* ap, lapack_complex_double* bp,
+                    double* w, lapack_complex_double* z, lapack_int* ldz,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    double* rwork, lapack_int* lrwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_int *info );
+void LAPACK_sspgvx( lapack_int* itype, char* jobz, char* range, char* uplo,
+                    lapack_int* n, float* ap, float* bp, float* vl, float* vu,
+                    lapack_int* il, lapack_int* iu, float* abstol,
+                    lapack_int* m, float* w, float* z, lapack_int* ldz,
+                    float* work, lapack_int* iwork, lapack_int* ifail,
+                    lapack_int *info );
+void LAPACK_dspgvx( lapack_int* itype, char* jobz, char* range, char* uplo,
+                    lapack_int* n, double* ap, double* bp, double* vl,
+                    double* vu, lapack_int* il, lapack_int* iu, double* abstol,
+                    lapack_int* m, double* w, double* z, lapack_int* ldz,
+                    double* work, lapack_int* iwork, lapack_int* ifail,
+                    lapack_int *info );
+void LAPACK_chpgvx( lapack_int* itype, char* jobz, char* range, char* uplo,
+                    lapack_int* n, lapack_complex_float* ap,
+                    lapack_complex_float* bp, float* vl, float* vu,
+                    lapack_int* il, lapack_int* iu, float* abstol,
+                    lapack_int* m, float* w, lapack_complex_float* z,
+                    lapack_int* ldz, lapack_complex_float* work, float* rwork,
+                    lapack_int* iwork, lapack_int* ifail, lapack_int *info );
+void LAPACK_zhpgvx( lapack_int* itype, char* jobz, char* range, char* uplo,
+                    lapack_int* n, lapack_complex_double* ap,
+                    lapack_complex_double* bp, double* vl, double* vu,
+                    lapack_int* il, lapack_int* iu, double* abstol,
+                    lapack_int* m, double* w, lapack_complex_double* z,
+                    lapack_int* ldz, lapack_complex_double* work, double* rwork,
+                    lapack_int* iwork, lapack_int* ifail, lapack_int *info );
+void LAPACK_ssbgv( char* jobz, char* uplo, lapack_int* n, lapack_int* ka,
+                   lapack_int* kb, float* ab, lapack_int* ldab, float* bb,
+                   lapack_int* ldbb, float* w, float* z, lapack_int* ldz,
+                   float* work, lapack_int *info );
+void LAPACK_dsbgv( char* jobz, char* uplo, lapack_int* n, lapack_int* ka,
+                   lapack_int* kb, double* ab, lapack_int* ldab, double* bb,
+                   lapack_int* ldbb, double* w, double* z, lapack_int* ldz,
+                   double* work, lapack_int *info );
+void LAPACK_chbgv( char* jobz, char* uplo, lapack_int* n, lapack_int* ka,
+                   lapack_int* kb, lapack_complex_float* ab, lapack_int* ldab,
+                   lapack_complex_float* bb, lapack_int* ldbb, float* w,
+                   lapack_complex_float* z, lapack_int* ldz,
+                   lapack_complex_float* work, float* rwork, lapack_int *info );
+void LAPACK_zhbgv( char* jobz, char* uplo, lapack_int* n, lapack_int* ka,
+                   lapack_int* kb, lapack_complex_double* ab, lapack_int* ldab,
+                   lapack_complex_double* bb, lapack_int* ldbb, double* w,
+                   lapack_complex_double* z, lapack_int* ldz,
+                   lapack_complex_double* work, double* rwork,
+                   lapack_int *info );
+void LAPACK_ssbgvd( char* jobz, char* uplo, lapack_int* n, lapack_int* ka,
+                    lapack_int* kb, float* ab, lapack_int* ldab, float* bb,
+                    lapack_int* ldbb, float* w, float* z, lapack_int* ldz,
+                    float* work, lapack_int* lwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_int *info );
+void LAPACK_dsbgvd( char* jobz, char* uplo, lapack_int* n, lapack_int* ka,
+                    lapack_int* kb, double* ab, lapack_int* ldab, double* bb,
+                    lapack_int* ldbb, double* w, double* z, lapack_int* ldz,
+                    double* work, lapack_int* lwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_int *info );
+void LAPACK_chbgvd( char* jobz, char* uplo, lapack_int* n, lapack_int* ka,
+                    lapack_int* kb, lapack_complex_float* ab, lapack_int* ldab,
+                    lapack_complex_float* bb, lapack_int* ldbb, float* w,
+                    lapack_complex_float* z, lapack_int* ldz,
+                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                    lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork,
+                    lapack_int *info );
+void LAPACK_zhbgvd( char* jobz, char* uplo, lapack_int* n, lapack_int* ka,
+                    lapack_int* kb, lapack_complex_double* ab, lapack_int* ldab,
+                    lapack_complex_double* bb, lapack_int* ldbb, double* w,
+                    lapack_complex_double* z, lapack_int* ldz,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    double* rwork, lapack_int* lrwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_int *info );
+void LAPACK_ssbgvx( char* jobz, char* range, char* uplo, lapack_int* n,
+                    lapack_int* ka, lapack_int* kb, float* ab, lapack_int* ldab,
+                    float* bb, lapack_int* ldbb, float* q, lapack_int* ldq,
+                    float* vl, float* vu, lapack_int* il, lapack_int* iu,
+                    float* abstol, lapack_int* m, float* w, float* z,
+                    lapack_int* ldz, float* work, lapack_int* iwork,
+                    lapack_int* ifail, lapack_int *info );
+void LAPACK_dsbgvx( char* jobz, char* range, char* uplo, lapack_int* n,
+                    lapack_int* ka, lapack_int* kb, double* ab,
+                    lapack_int* ldab, double* bb, lapack_int* ldbb, double* q,
+                    lapack_int* ldq, double* vl, double* vu, lapack_int* il,
+                    lapack_int* iu, double* abstol, lapack_int* m, double* w,
+                    double* z, lapack_int* ldz, double* work, lapack_int* iwork,
+                    lapack_int* ifail, lapack_int *info );
+void LAPACK_chbgvx( char* jobz, char* range, char* uplo, lapack_int* n,
+                    lapack_int* ka, lapack_int* kb, lapack_complex_float* ab,
+                    lapack_int* ldab, lapack_complex_float* bb,
+                    lapack_int* ldbb, lapack_complex_float* q, lapack_int* ldq,
+                    float* vl, float* vu, lapack_int* il, lapack_int* iu,
+                    float* abstol, lapack_int* m, float* w,
+                    lapack_complex_float* z, lapack_int* ldz,
+                    lapack_complex_float* work, float* rwork, lapack_int* iwork,
+                    lapack_int* ifail, lapack_int *info );
+void LAPACK_zhbgvx( char* jobz, char* range, char* uplo, lapack_int* n,
+                    lapack_int* ka, lapack_int* kb, lapack_complex_double* ab,
+                    lapack_int* ldab, lapack_complex_double* bb,
+                    lapack_int* ldbb, lapack_complex_double* q, lapack_int* ldq,
+                    double* vl, double* vu, lapack_int* il, lapack_int* iu,
+                    double* abstol, lapack_int* m, double* w,
+                    lapack_complex_double* z, lapack_int* ldz,
+                    lapack_complex_double* work, double* rwork,
+                    lapack_int* iwork, lapack_int* ifail, lapack_int *info );
+void LAPACK_sgges( char* jobvsl, char* jobvsr, char* sort,
+                   LAPACK_S_SELECT3 selctg, lapack_int* n, float* a,
+                   lapack_int* lda, float* b, lapack_int* ldb, lapack_int* sdim,
+                   float* alphar, float* alphai, float* beta, float* vsl,
+                   lapack_int* ldvsl, float* vsr, lapack_int* ldvsr,
+                   float* work, lapack_int* lwork, lapack_logical* bwork,
+                   lapack_int *info );
+void LAPACK_dgges( char* jobvsl, char* jobvsr, char* sort,
+                   LAPACK_D_SELECT3 selctg, lapack_int* n, double* a,
+                   lapack_int* lda, double* b, lapack_int* ldb,
+                   lapack_int* sdim, double* alphar, double* alphai,
+                   double* beta, double* vsl, lapack_int* ldvsl, double* vsr,
+                   lapack_int* ldvsr, double* work, lapack_int* lwork,
+                   lapack_logical* bwork, lapack_int *info );
+void LAPACK_cgges( char* jobvsl, char* jobvsr, char* sort,
+                   LAPACK_C_SELECT2 selctg, lapack_int* n,
+                   lapack_complex_float* a, lapack_int* lda,
+                   lapack_complex_float* b, lapack_int* ldb, lapack_int* sdim,
+                   lapack_complex_float* alpha, lapack_complex_float* beta,
+                   lapack_complex_float* vsl, lapack_int* ldvsl,
+                   lapack_complex_float* vsr, lapack_int* ldvsr,
+                   lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                   lapack_logical* bwork, lapack_int *info );
+void LAPACK_zgges( char* jobvsl, char* jobvsr, char* sort,
+                   LAPACK_Z_SELECT2 selctg, lapack_int* n,
+                   lapack_complex_double* a, lapack_int* lda,
+                   lapack_complex_double* b, lapack_int* ldb, lapack_int* sdim,
+                   lapack_complex_double* alpha, lapack_complex_double* beta,
+                   lapack_complex_double* vsl, lapack_int* ldvsl,
+                   lapack_complex_double* vsr, lapack_int* ldvsr,
+                   lapack_complex_double* work, lapack_int* lwork,
+                   double* rwork, lapack_logical* bwork, lapack_int *info );
+void LAPACK_sggesx( char* jobvsl, char* jobvsr, char* sort,
+                    LAPACK_S_SELECT3 selctg, char* sense, lapack_int* n,
+                    float* a, lapack_int* lda, float* b, lapack_int* ldb,
+                    lapack_int* sdim, float* alphar, float* alphai, float* beta,
+                    float* vsl, lapack_int* ldvsl, float* vsr,
+                    lapack_int* ldvsr, float* rconde, float* rcondv,
+                    float* work, lapack_int* lwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_logical* bwork,
+                    lapack_int *info );
+void LAPACK_dggesx( char* jobvsl, char* jobvsr, char* sort,
+                    LAPACK_D_SELECT3 selctg, char* sense, lapack_int* n,
+                    double* a, lapack_int* lda, double* b, lapack_int* ldb,
+                    lapack_int* sdim, double* alphar, double* alphai,
+                    double* beta, double* vsl, lapack_int* ldvsl, double* vsr,
+                    lapack_int* ldvsr, double* rconde, double* rcondv,
+                    double* work, lapack_int* lwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_logical* bwork,
+                    lapack_int *info );
+void LAPACK_cggesx( char* jobvsl, char* jobvsr, char* sort,
+                    LAPACK_C_SELECT2 selctg, char* sense, lapack_int* n,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* b, lapack_int* ldb, lapack_int* sdim,
+                    lapack_complex_float* alpha, lapack_complex_float* beta,
+                    lapack_complex_float* vsl, lapack_int* ldvsl,
+                    lapack_complex_float* vsr, lapack_int* ldvsr, float* rconde,
+                    float* rcondv, lapack_complex_float* work,
+                    lapack_int* lwork, float* rwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_logical* bwork,
+                    lapack_int *info );
+void LAPACK_zggesx( char* jobvsl, char* jobvsr, char* sort,
+                    LAPACK_Z_SELECT2 selctg, char* sense, lapack_int* n,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb, lapack_int* sdim,
+                    lapack_complex_double* alpha, lapack_complex_double* beta,
+                    lapack_complex_double* vsl, lapack_int* ldvsl,
+                    lapack_complex_double* vsr, lapack_int* ldvsr,
+                    double* rconde, double* rcondv, lapack_complex_double* work,
+                    lapack_int* lwork, double* rwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_logical* bwork,
+                    lapack_int *info );
+void LAPACK_sggev( char* jobvl, char* jobvr, lapack_int* n, float* a,
+                   lapack_int* lda, float* b, lapack_int* ldb, float* alphar,
+                   float* alphai, float* beta, float* vl, lapack_int* ldvl,
+                   float* vr, lapack_int* ldvr, float* work, lapack_int* lwork,
+                   lapack_int *info );
+void LAPACK_dggev( char* jobvl, char* jobvr, lapack_int* n, double* a,
+                   lapack_int* lda, double* b, lapack_int* ldb, double* alphar,
+                   double* alphai, double* beta, double* vl, lapack_int* ldvl,
+                   double* vr, lapack_int* ldvr, double* work,
+                   lapack_int* lwork, lapack_int *info );
+void LAPACK_cggev( char* jobvl, char* jobvr, lapack_int* n,
+                   lapack_complex_float* a, lapack_int* lda,
+                   lapack_complex_float* b, lapack_int* ldb,
+                   lapack_complex_float* alpha, lapack_complex_float* beta,
+                   lapack_complex_float* vl, lapack_int* ldvl,
+                   lapack_complex_float* vr, lapack_int* ldvr,
+                   lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                   lapack_int *info );
+void LAPACK_zggev( char* jobvl, char* jobvr, lapack_int* n,
+                   lapack_complex_double* a, lapack_int* lda,
+                   lapack_complex_double* b, lapack_int* ldb,
+                   lapack_complex_double* alpha, lapack_complex_double* beta,
+                   lapack_complex_double* vl, lapack_int* ldvl,
+                   lapack_complex_double* vr, lapack_int* ldvr,
+                   lapack_complex_double* work, lapack_int* lwork,
+                   double* rwork, lapack_int *info );
+void LAPACK_sggevx( char* balanc, char* jobvl, char* jobvr, char* sense,
+                    lapack_int* n, float* a, lapack_int* lda, float* b,
+                    lapack_int* ldb, float* alphar, float* alphai, float* beta,
+                    float* vl, lapack_int* ldvl, float* vr, lapack_int* ldvr,
+                    lapack_int* ilo, lapack_int* ihi, float* lscale,
+                    float* rscale, float* abnrm, float* bbnrm, float* rconde,
+                    float* rcondv, float* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_logical* bwork,
+                    lapack_int *info );
+void LAPACK_dggevx( char* balanc, char* jobvl, char* jobvr, char* sense,
+                    lapack_int* n, double* a, lapack_int* lda, double* b,
+                    lapack_int* ldb, double* alphar, double* alphai,
+                    double* beta, double* vl, lapack_int* ldvl, double* vr,
+                    lapack_int* ldvr, lapack_int* ilo, lapack_int* ihi,
+                    double* lscale, double* rscale, double* abnrm,
+                    double* bbnrm, double* rconde, double* rcondv, double* work,
+                    lapack_int* lwork, lapack_int* iwork, lapack_logical* bwork,
+                    lapack_int *info );
+void LAPACK_cggevx( char* balanc, char* jobvl, char* jobvr, char* sense,
+                    lapack_int* n, lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* alpha, lapack_complex_float* beta,
+                    lapack_complex_float* vl, lapack_int* ldvl,
+                    lapack_complex_float* vr, lapack_int* ldvr, lapack_int* ilo,
+                    lapack_int* ihi, float* lscale, float* rscale, float* abnrm,
+                    float* bbnrm, float* rconde, float* rcondv,
+                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                    lapack_int* iwork, lapack_logical* bwork,
+                    lapack_int *info );
+void LAPACK_zggevx( char* balanc, char* jobvl, char* jobvr, char* sense,
+                    lapack_int* n, lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* alpha, lapack_complex_double* beta,
+                    lapack_complex_double* vl, lapack_int* ldvl,
+                    lapack_complex_double* vr, lapack_int* ldvr,
+                    lapack_int* ilo, lapack_int* ihi, double* lscale,
+                    double* rscale, double* abnrm, double* bbnrm,
+                    double* rconde, double* rcondv, lapack_complex_double* work,
+                    lapack_int* lwork, double* rwork, lapack_int* iwork,
+                    lapack_logical* bwork, lapack_int *info );
+void LAPACK_dsfrk( char* transr, char* uplo, char* trans, lapack_int* n,
+                   lapack_int* k, double* alpha, const double* a,
+                   lapack_int* lda, double* beta, double* c );
+void LAPACK_ssfrk( char* transr, char* uplo, char* trans, lapack_int* n,
+                   lapack_int* k, float* alpha, const float* a, lapack_int* lda,
+                   float* beta, float* c );
+void LAPACK_zhfrk( char* transr, char* uplo, char* trans, lapack_int* n,
+                   lapack_int* k, double* alpha, const lapack_complex_double* a,
+                   lapack_int* lda, double* beta, lapack_complex_double* c );
+void LAPACK_chfrk( char* transr, char* uplo, char* trans, lapack_int* n,
+                   lapack_int* k, float* alpha, const lapack_complex_float* a,
+                   lapack_int* lda, float* beta, lapack_complex_float* c );
+void LAPACK_dtfsm( char* transr, char* side, char* uplo, char* trans,
+                   char* diag, lapack_int* m, lapack_int* n, double* alpha,
+                   const double* a, double* b, lapack_int* ldb );
+void LAPACK_stfsm( char* transr, char* side, char* uplo, char* trans,
+                   char* diag, lapack_int* m, lapack_int* n, float* alpha,
+                   const float* a, float* b, lapack_int* ldb );
+void LAPACK_ztfsm( char* transr, char* side, char* uplo, char* trans,
+                   char* diag, lapack_int* m, lapack_int* n,
+                   lapack_complex_double* alpha, const lapack_complex_double* a,
+                   lapack_complex_double* b, lapack_int* ldb );
+void LAPACK_ctfsm( char* transr, char* side, char* uplo, char* trans,
+                   char* diag, lapack_int* m, lapack_int* n,
+                   lapack_complex_float* alpha, const lapack_complex_float* a,
+                   lapack_complex_float* b, lapack_int* ldb );
+void LAPACK_dtfttp( char* transr, char* uplo, lapack_int* n, const double* arf,
+                    double* ap, lapack_int *info );
+void LAPACK_stfttp( char* transr, char* uplo, lapack_int* n, const float* arf,
+                    float* ap, lapack_int *info );
+void LAPACK_ztfttp( char* transr, char* uplo, lapack_int* n,
+                    const lapack_complex_double* arf, lapack_complex_double* ap,
+                    lapack_int *info );
+void LAPACK_ctfttp( char* transr, char* uplo, lapack_int* n,
+                    const lapack_complex_float* arf, lapack_complex_float* ap,
+                    lapack_int *info );
+void LAPACK_dtfttr( char* transr, char* uplo, lapack_int* n, const double* arf,
+                    double* a, lapack_int* lda, lapack_int *info );
+void LAPACK_stfttr( char* transr, char* uplo, lapack_int* n, const float* arf,
+                    float* a, lapack_int* lda, lapack_int *info );
+void LAPACK_ztfttr( char* transr, char* uplo, lapack_int* n,
+                    const lapack_complex_double* arf, lapack_complex_double* a,
+                    lapack_int* lda, lapack_int *info );
+void LAPACK_ctfttr( char* transr, char* uplo, lapack_int* n,
+                    const lapack_complex_float* arf, lapack_complex_float* a,
+                    lapack_int* lda, lapack_int *info );
+void LAPACK_dtpttf( char* transr, char* uplo, lapack_int* n, const double* ap,
+                    double* arf, lapack_int *info );
+void LAPACK_stpttf( char* transr, char* uplo, lapack_int* n, const float* ap,
+                    float* arf, lapack_int *info );
+void LAPACK_ztpttf( char* transr, char* uplo, lapack_int* n,
+                    const lapack_complex_double* ap, lapack_complex_double* arf,
+                    lapack_int *info );
+void LAPACK_ctpttf( char* transr, char* uplo, lapack_int* n,
+                    const lapack_complex_float* ap, lapack_complex_float* arf,
+                    lapack_int *info );
+void LAPACK_dtpttr( char* uplo, lapack_int* n, const double* ap, double* a,
+                    lapack_int* lda, lapack_int *info );
+void LAPACK_stpttr( char* uplo, lapack_int* n, const float* ap, float* a,
+                    lapack_int* lda, lapack_int *info );
+void LAPACK_ztpttr( char* uplo, lapack_int* n, const lapack_complex_double* ap,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_int *info );
+void LAPACK_ctpttr( char* uplo, lapack_int* n, const lapack_complex_float* ap,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_int *info );
+void LAPACK_dtrttf( char* transr, char* uplo, lapack_int* n, const double* a,
+                    lapack_int* lda, double* arf, lapack_int *info );
+void LAPACK_strttf( char* transr, char* uplo, lapack_int* n, const float* a,
+                    lapack_int* lda, float* arf, lapack_int *info );
+void LAPACK_ztrttf( char* transr, char* uplo, lapack_int* n,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* arf, lapack_int *info );
+void LAPACK_ctrttf( char* transr, char* uplo, lapack_int* n,
+                    const lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* arf, lapack_int *info );
+void LAPACK_dtrttp( char* uplo, lapack_int* n, const double* a, lapack_int* lda,
+                    double* ap, lapack_int *info );
+void LAPACK_strttp( char* uplo, lapack_int* n, const float* a, lapack_int* lda,
+                    float* ap, lapack_int *info );
+void LAPACK_ztrttp( char* uplo, lapack_int* n, const lapack_complex_double* a,
+                    lapack_int* lda, lapack_complex_double* ap,
+                    lapack_int *info );
+void LAPACK_ctrttp( char* uplo, lapack_int* n, const lapack_complex_float* a,
+                    lapack_int* lda, lapack_complex_float* ap,
+                    lapack_int *info );
+void LAPACK_sgeqrfp( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
+                     float* tau, float* work, lapack_int* lwork,
+                     lapack_int *info );
+void LAPACK_dgeqrfp( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
+                     double* tau, double* work, lapack_int* lwork,
+                     lapack_int *info );
+void LAPACK_cgeqrfp( lapack_int* m, lapack_int* n, lapack_complex_float* a,
+                     lapack_int* lda, lapack_complex_float* tau,
+                     lapack_complex_float* work, lapack_int* lwork,
+                     lapack_int *info );
+void LAPACK_zgeqrfp( lapack_int* m, lapack_int* n, lapack_complex_double* a,
+                     lapack_int* lda, lapack_complex_double* tau,
+                     lapack_complex_double* work, lapack_int* lwork,
+                     lapack_int *info );
+void LAPACK_clacgv( lapack_int* n, lapack_complex_float* x, lapack_int* incx );
+void LAPACK_zlacgv( lapack_int* n, lapack_complex_double* x, lapack_int* incx );
+void LAPACK_slarnv( lapack_int* idist, lapack_int* iseed, lapack_int* n,
+                    float* x );
+void LAPACK_dlarnv( lapack_int* idist, lapack_int* iseed, lapack_int* n,
+                    double* x );
+void LAPACK_clarnv( lapack_int* idist, lapack_int* iseed, lapack_int* n,
+                    lapack_complex_float* x );
+void LAPACK_zlarnv( lapack_int* idist, lapack_int* iseed, lapack_int* n,
+                    lapack_complex_double* x );
+void LAPACK_sgeqr2( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
+                    float* tau, float* work, lapack_int *info );
+void LAPACK_dgeqr2( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
+                    double* tau, double* work, lapack_int *info );
+void LAPACK_cgeqr2( lapack_int* m, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_complex_float* tau,
+                    lapack_complex_float* work, lapack_int *info );
+void LAPACK_zgeqr2( lapack_int* m, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_complex_double* tau,
+                    lapack_complex_double* work, lapack_int *info );
+void LAPACK_slacpy( char* uplo, lapack_int* m, lapack_int* n, const float* a,
+                    lapack_int* lda, float* b, lapack_int* ldb );
+void LAPACK_dlacpy( char* uplo, lapack_int* m, lapack_int* n, const double* a,
+                    lapack_int* lda, double* b, lapack_int* ldb );
+void LAPACK_clacpy( char* uplo, lapack_int* m, lapack_int* n,
+                    const lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* b, lapack_int* ldb );
+void LAPACK_zlacpy( char* uplo, lapack_int* m, lapack_int* n,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb );
+void LAPACK_sgetf2( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
+                    lapack_int* ipiv, lapack_int *info );
+void LAPACK_dgetf2( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
+                    lapack_int* ipiv, lapack_int *info );
+void LAPACK_cgetf2( lapack_int* m, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_int* ipiv, lapack_int *info );
+void LAPACK_zgetf2( lapack_int* m, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_int* ipiv, lapack_int *info );
+void LAPACK_slaswp( lapack_int* n, float* a, lapack_int* lda, lapack_int* k1,
+                    lapack_int* k2, const lapack_int* ipiv, lapack_int* incx );
+void LAPACK_dlaswp( lapack_int* n, double* a, lapack_int* lda, lapack_int* k1,
+                    lapack_int* k2, const lapack_int* ipiv, lapack_int* incx );
+void LAPACK_claswp( lapack_int* n, lapack_complex_float* a, lapack_int* lda,
+                    lapack_int* k1, lapack_int* k2, const lapack_int* ipiv,
+                    lapack_int* incx );
+void LAPACK_zlaswp( lapack_int* n, lapack_complex_double* a, lapack_int* lda,
+                    lapack_int* k1, lapack_int* k2, const lapack_int* ipiv,
+                    lapack_int* incx );
+float LAPACK_slange( char* norm, lapack_int* m, lapack_int* n, const float* a,
+                    lapack_int* lda, float* work );
+double LAPACK_dlange( char* norm, lapack_int* m, lapack_int* n, const double* a,
+                    lapack_int* lda, double* work );
+float LAPACK_clange( char* norm, lapack_int* m, lapack_int* n,
+                    const lapack_complex_float* a, lapack_int* lda, float* work );
+double LAPACK_zlange( char* norm, lapack_int* m, lapack_int* n,
+                    const lapack_complex_double* a, lapack_int* lda, double* work );
+float LAPACK_clanhe( char* norm, char* uplo, lapack_int* n,
+                    const lapack_complex_float* a, lapack_int* lda, float* work );
+double LAPACK_zlanhe( char* norm, char* uplo, lapack_int* n,
+                    const lapack_complex_double* a, lapack_int* lda, double* work );
+float LAPACK_slansy( char* norm, char* uplo, lapack_int* n, const float* a,
+                    lapack_int* lda, float* work );
+double LAPACK_dlansy( char* norm, char* uplo, lapack_int* n, const double* a,
+                    lapack_int* lda, double* work );
+float LAPACK_clansy( char* norm, char* uplo, lapack_int* n,
+                    const lapack_complex_float* a, lapack_int* lda, float* work );
+double LAPACK_zlansy( char* norm, char* uplo, lapack_int* n,
+                    const lapack_complex_double* a, lapack_int* lda, double* work );
+float LAPACK_slantr( char* norm, char* uplo, char* diag, lapack_int* m,
+                    lapack_int* n, const float* a, lapack_int* lda, float* work );
+double LAPACK_dlantr( char* norm, char* uplo, char* diag, lapack_int* m,
+                    lapack_int* n, const double* a, lapack_int* lda, double* work );
+float LAPACK_clantr( char* norm, char* uplo, char* diag, lapack_int* m,
+                    lapack_int* n, const lapack_complex_float* a, lapack_int* lda,
+                    float* work );
+double LAPACK_zlantr( char* norm, char* uplo, char* diag, lapack_int* m,
+                    lapack_int* n, const lapack_complex_double* a, lapack_int* lda,
+                    double* work );
+float LAPACK_slamch( char* cmach );
+double LAPACK_dlamch( char* cmach );
+void LAPACK_sgelq2( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
+                    float* tau, float* work, lapack_int *info );
+void LAPACK_dgelq2( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
+                    double* tau, double* work, lapack_int *info );
+void LAPACK_cgelq2( lapack_int* m, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_complex_float* tau,
+                    lapack_complex_float* work, lapack_int *info );
+void LAPACK_zgelq2( lapack_int* m, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_complex_double* tau,
+                    lapack_complex_double* work, lapack_int *info );
+void LAPACK_slarfb( char* side, char* trans, char* direct, char* storev,
+                    lapack_int* m, lapack_int* n, lapack_int* k, const float* v,
+                    lapack_int* ldv, const float* t, lapack_int* ldt, float* c,
+                    lapack_int* ldc, float* work, lapack_int* ldwork );
+void LAPACK_dlarfb( char* side, char* trans, char* direct, char* storev,
+                    lapack_int* m, lapack_int* n, lapack_int* k,
+                    const double* v, lapack_int* ldv, const double* t,
+                    lapack_int* ldt, double* c, lapack_int* ldc, double* work,
+                    lapack_int* ldwork );
+void LAPACK_clarfb( char* side, char* trans, char* direct, char* storev,
+                    lapack_int* m, lapack_int* n, lapack_int* k,
+                    const lapack_complex_float* v, lapack_int* ldv,
+                    const lapack_complex_float* t, lapack_int* ldt,
+                    lapack_complex_float* c, lapack_int* ldc,
+                    lapack_complex_float* work, lapack_int* ldwork );
+void LAPACK_zlarfb( char* side, char* trans, char* direct, char* storev,
+                    lapack_int* m, lapack_int* n, lapack_int* k,
+                    const lapack_complex_double* v, lapack_int* ldv,
+                    const lapack_complex_double* t, lapack_int* ldt,
+                    lapack_complex_double* c, lapack_int* ldc,
+                    lapack_complex_double* work, lapack_int* ldwork );
+void LAPACK_slarfg( lapack_int* n, float* alpha, float* x, lapack_int* incx,
+                    float* tau );
+void LAPACK_dlarfg( lapack_int* n, double* alpha, double* x, lapack_int* incx,
+                    double* tau );
+void LAPACK_clarfg( lapack_int* n, lapack_complex_float* alpha,
+                    lapack_complex_float* x, lapack_int* incx,
+                    lapack_complex_float* tau );
+void LAPACK_zlarfg( lapack_int* n, lapack_complex_double* alpha,
+                    lapack_complex_double* x, lapack_int* incx,
+                    lapack_complex_double* tau );
+void LAPACK_slarft( char* direct, char* storev, lapack_int* n, lapack_int* k,
+                    const float* v, lapack_int* ldv, const float* tau, float* t,
+                    lapack_int* ldt );
+void LAPACK_dlarft( char* direct, char* storev, lapack_int* n, lapack_int* k,
+                    const double* v, lapack_int* ldv, const double* tau,
+                    double* t, lapack_int* ldt );
+void LAPACK_clarft( char* direct, char* storev, lapack_int* n, lapack_int* k,
+                    const lapack_complex_float* v, lapack_int* ldv,
+                    const lapack_complex_float* tau, lapack_complex_float* t,
+                    lapack_int* ldt );
+void LAPACK_zlarft( char* direct, char* storev, lapack_int* n, lapack_int* k,
+                    const lapack_complex_double* v, lapack_int* ldv,
+                    const lapack_complex_double* tau, lapack_complex_double* t,
+                    lapack_int* ldt );
+void LAPACK_slarfx( char* side, lapack_int* m, lapack_int* n, const float* v,
+                    float* tau, float* c, lapack_int* ldc, float* work );
+void LAPACK_dlarfx( char* side, lapack_int* m, lapack_int* n, const double* v,
+                    double* tau, double* c, lapack_int* ldc, double* work );
+void LAPACK_clarfx( char* side, lapack_int* m, lapack_int* n,
+                    const lapack_complex_float* v, lapack_complex_float* tau,
+                    lapack_complex_float* c, lapack_int* ldc,
+                    lapack_complex_float* work );
+void LAPACK_zlarfx( char* side, lapack_int* m, lapack_int* n,
+                    const lapack_complex_double* v, lapack_complex_double* tau,
+                    lapack_complex_double* c, lapack_int* ldc,
+                    lapack_complex_double* work );
+void LAPACK_slatms( lapack_int* m, lapack_int* n, char* dist, lapack_int* iseed,
+                    char* sym, float* d, lapack_int* mode, float* cond,
+                    float* dmax, lapack_int* kl, lapack_int* ku, char* pack,
+                    float* a, lapack_int* lda, float* work, lapack_int *info );
+void LAPACK_dlatms( lapack_int* m, lapack_int* n, char* dist, lapack_int* iseed,
+                    char* sym, double* d, lapack_int* mode, double* cond,
+                    double* dmax, lapack_int* kl, lapack_int* ku, char* pack,
+                    double* a, lapack_int* lda, double* work,
+                    lapack_int *info );
+void LAPACK_clatms( lapack_int* m, lapack_int* n, char* dist, lapack_int* iseed,
+                    char* sym, float* d, lapack_int* mode, float* cond,
+                    float* dmax, lapack_int* kl, lapack_int* ku, char* pack,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* work, lapack_int *info );
+void LAPACK_zlatms( lapack_int* m, lapack_int* n, char* dist, lapack_int* iseed,
+                    char* sym, double* d, lapack_int* mode, double* cond,
+                    double* dmax, lapack_int* kl, lapack_int* ku, char* pack,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* work, lapack_int *info );
+void LAPACK_slag2d( lapack_int* m, lapack_int* n, const float* sa,
+                    lapack_int* ldsa, double* a, lapack_int* lda,
+                    lapack_int *info );
+void LAPACK_dlag2s( lapack_int* m, lapack_int* n, const double* a,
+                    lapack_int* lda, float* sa, lapack_int* ldsa,
+                    lapack_int *info );
+void LAPACK_clag2z( lapack_int* m, lapack_int* n,
+                    const lapack_complex_float* sa, lapack_int* ldsa,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_int *info );
+void LAPACK_zlag2c( lapack_int* m, lapack_int* n,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_float* sa, lapack_int* ldsa,
+                    lapack_int *info );
+void LAPACK_slauum( char* uplo, lapack_int* n, float* a, lapack_int* lda,
+                    lapack_int *info );
+void LAPACK_dlauum( char* uplo, lapack_int* n, double* a, lapack_int* lda,
+                    lapack_int *info );
+void LAPACK_clauum( char* uplo, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_int *info );
+void LAPACK_zlauum( char* uplo, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_int *info );
+void LAPACK_slagge( lapack_int* m, lapack_int* n, lapack_int* kl,
+                    lapack_int* ku, const float* d, float* a, lapack_int* lda,
+                    lapack_int* iseed, float* work, lapack_int *info );
+void LAPACK_dlagge( lapack_int* m, lapack_int* n, lapack_int* kl,
+                    lapack_int* ku, const double* d, double* a, lapack_int* lda,
+                    lapack_int* iseed, double* work, lapack_int *info );
+void LAPACK_clagge( lapack_int* m, lapack_int* n, lapack_int* kl,
+                    lapack_int* ku, const float* d, lapack_complex_float* a,
+                    lapack_int* lda, lapack_int* iseed,
+                    lapack_complex_float* work, lapack_int *info );
+void LAPACK_zlagge( lapack_int* m, lapack_int* n, lapack_int* kl,
+                    lapack_int* ku, const double* d, lapack_complex_double* a,
+                    lapack_int* lda, lapack_int* iseed,
+                    lapack_complex_double* work, lapack_int *info );
+void LAPACK_slaset( char* uplo, lapack_int* m, lapack_int* n, float* alpha,
+                    float* beta, float* a, lapack_int* lda );
+void LAPACK_dlaset( char* uplo, lapack_int* m, lapack_int* n, double* alpha,
+                    double* beta, double* a, lapack_int* lda );
+void LAPACK_claset( char* uplo, lapack_int* m, lapack_int* n,
+                    lapack_complex_float* alpha, lapack_complex_float* beta,
+                    lapack_complex_float* a, lapack_int* lda );
+void LAPACK_zlaset( char* uplo, lapack_int* m, lapack_int* n,
+                    lapack_complex_double* alpha, lapack_complex_double* beta,
+                    lapack_complex_double* a, lapack_int* lda );
+void LAPACK_slasrt( char* id, lapack_int* n, float* d, lapack_int *info );
+void LAPACK_dlasrt( char* id, lapack_int* n, double* d, lapack_int *info );
+void LAPACK_claghe( lapack_int* n, lapack_int* k, const float* d,
+                    lapack_complex_float* a, lapack_int* lda, lapack_int* iseed,
+                    lapack_complex_float* work, lapack_int *info );
+void LAPACK_zlaghe( lapack_int* n, lapack_int* k, const double* d,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_int* iseed, lapack_complex_double* work,
+                    lapack_int *info );
+void LAPACK_slagsy( lapack_int* n, lapack_int* k, const float* d, float* a,
+                    lapack_int* lda, lapack_int* iseed, float* work,
+                    lapack_int *info );
+void LAPACK_dlagsy( lapack_int* n, lapack_int* k, const double* d, double* a,
+                    lapack_int* lda, lapack_int* iseed, double* work,
+                    lapack_int *info );
+void LAPACK_clagsy( lapack_int* n, lapack_int* k, const float* d,
+                    lapack_complex_float* a, lapack_int* lda, lapack_int* iseed,
+                    lapack_complex_float* work, lapack_int *info );
+void LAPACK_zlagsy( lapack_int* n, lapack_int* k, const double* d,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_int* iseed, lapack_complex_double* work,
+                    lapack_int *info );
+void LAPACK_slapmr( lapack_logical* forwrd, lapack_int* m, lapack_int* n,
+                    float* x, lapack_int* ldx, lapack_int* k );
+void LAPACK_dlapmr( lapack_logical* forwrd, lapack_int* m, lapack_int* n,
+                    double* x, lapack_int* ldx, lapack_int* k );
+void LAPACK_clapmr( lapack_logical* forwrd, lapack_int* m, lapack_int* n,
+                    lapack_complex_float* x, lapack_int* ldx, lapack_int* k );
+void LAPACK_zlapmr( lapack_logical* forwrd, lapack_int* m, lapack_int* n,
+                    lapack_complex_double* x, lapack_int* ldx, lapack_int* k );
+float LAPACK_slapy2( float* x, float* y );
+double LAPACK_dlapy2( double* x, double* y );
+float LAPACK_slapy3( float* x, float* y, float* z );
+double LAPACK_dlapy3( double* x, double* y, double* z );
+void LAPACK_slartgp( float* f, float* g, float* cs, float* sn, float* r );
+void LAPACK_dlartgp( double* f, double* g, double* cs, double* sn, double* r );
+void LAPACK_slartgs( float* x, float* y, float* sigma, float* cs, float* sn );
+void LAPACK_dlartgs( double* x, double* y, double* sigma, double* cs,
+                     double* sn );
+// LAPACK 3.3.0
+void LAPACK_cbbcsd( char* jobu1, char* jobu2,
+                    char* jobv1t, char* jobv2t, char* trans,
+                    lapack_int* m, lapack_int* p, lapack_int* q,
+                    float* theta, float* phi,
+                    lapack_complex_float* u1, lapack_int* ldu1,
+                    lapack_complex_float* u2, lapack_int* ldu2,
+                    lapack_complex_float* v1t, lapack_int* ldv1t,
+                    lapack_complex_float* v2t, lapack_int* ldv2t,
+                    float* b11d, float* b11e, float* b12d,
+                    float* b12e, float* b21d, float* b21e,
+                    float* b22d, float* b22e, float* rwork,
+                    lapack_int* lrwork , lapack_int *info );
+void LAPACK_cheswapr( char* uplo, lapack_int* n,
+                      lapack_complex_float* a, lapack_int* i1,
+                      lapack_int* i2 );
+void LAPACK_chetri2( char* uplo, lapack_int* n,
+                     lapack_complex_float* a, lapack_int* lda,
+                     const lapack_int* ipiv,
+                     lapack_complex_float* work, lapack_int* lwork , lapack_int *info );
+void LAPACK_chetri2x( char* uplo, lapack_int* n,
+                      lapack_complex_float* a, lapack_int* lda,
+                      const lapack_int* ipiv,
+                      lapack_complex_float* work, lapack_int* nb , lapack_int *info );
+void LAPACK_chetrs2( char* uplo, lapack_int* n,
+                     lapack_int* nrhs, const lapack_complex_float* a,
+                     lapack_int* lda, const lapack_int* ipiv,
+                     lapack_complex_float* b, lapack_int* ldb,
+                     lapack_complex_float* work , lapack_int *info );
+void LAPACK_csyconv( char* uplo, char* way,
+                     lapack_int* n, lapack_complex_float* a,
+                     lapack_int* lda, const lapack_int* ipiv,
+                     lapack_complex_float* work , lapack_int *info );
+void LAPACK_csyswapr( char* uplo, lapack_int* n,
+                      lapack_complex_float* a, lapack_int* i1,
+                      lapack_int* i2 );
+void LAPACK_csytri2( char* uplo, lapack_int* n,
+                     lapack_complex_float* a, lapack_int* lda,
+                     const lapack_int* ipiv,
+                     lapack_complex_float* work, lapack_int* lwork , lapack_int *info );
+void LAPACK_csytri2x( char* uplo, lapack_int* n,
+                      lapack_complex_float* a, lapack_int* lda,
+                      const lapack_int* ipiv,
+                      lapack_complex_float* work, lapack_int* nb , lapack_int *info );
+void LAPACK_csytrs2( char* uplo, lapack_int* n,
+                     lapack_int* nrhs, const lapack_complex_float* a,
+                     lapack_int* lda, const lapack_int* ipiv,
+                     lapack_complex_float* b, lapack_int* ldb,
+                     lapack_complex_float* work , lapack_int *info );
+void LAPACK_cunbdb( char* trans, char* signs,
+                    lapack_int* m, lapack_int* p, lapack_int* q,
+                    lapack_complex_float* x11, lapack_int* ldx11,
+                    lapack_complex_float* x12, lapack_int* ldx12,
+                    lapack_complex_float* x21, lapack_int* ldx21,
+                    lapack_complex_float* x22, lapack_int* ldx22,
+                    float* theta, float* phi,
+                    lapack_complex_float* taup1,
+                    lapack_complex_float* taup2,
+                    lapack_complex_float* tauq1,
+                    lapack_complex_float* tauq2,
+                    lapack_complex_float* work, lapack_int* lwork , lapack_int *info );
+void LAPACK_cuncsd( char* jobu1, char* jobu2,
+                    char* jobv1t, char* jobv2t, char* trans,
+                    char* signs, lapack_int* m, lapack_int* p,
+                    lapack_int* q, lapack_complex_float* x11,
+                    lapack_int* ldx11, lapack_complex_float* x12,
+                    lapack_int* ldx12, lapack_complex_float* x21,
+                    lapack_int* ldx21, lapack_complex_float* x22,
+                    lapack_int* ldx22, float* theta,
+                    lapack_complex_float* u1, lapack_int* ldu1,
+                    lapack_complex_float* u2, lapack_int* ldu2,
+                    lapack_complex_float* v1t, lapack_int* ldv1t,
+                    lapack_complex_float* v2t, lapack_int* ldv2t,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    float* rwork, lapack_int* lrwork,
+                    lapack_int* iwork , lapack_int *info );
+void LAPACK_dbbcsd( char* jobu1, char* jobu2,
+                    char* jobv1t, char* jobv2t, char* trans,
+                    lapack_int* m, lapack_int* p, lapack_int* q,
+                    double* theta, double* phi, double* u1,
+                    lapack_int* ldu1, double* u2, lapack_int* ldu2,
+                    double* v1t, lapack_int* ldv1t, double* v2t,
+                    lapack_int* ldv2t, double* b11d, double* b11e,
+                    double* b12d, double* b12e, double* b21d,
+                    double* b21e, double* b22d, double* b22e,
+                    double* work, lapack_int* lwork , lapack_int *info );
+void LAPACK_dorbdb( char* trans, char* signs,
+                    lapack_int* m, lapack_int* p, lapack_int* q,
+                    double* x11, lapack_int* ldx11, double* x12,
+                    lapack_int* ldx12, double* x21, lapack_int* ldx21,
+                    double* x22, lapack_int* ldx22, double* theta,
+                    double* phi, double* taup1, double* taup2,
+                    double* tauq1, double* tauq2, double* work,
+                    lapack_int* lwork , lapack_int *info );
+void LAPACK_dorcsd( char* jobu1, char* jobu2,
+                    char* jobv1t, char* jobv2t, char* trans,
+                    char* signs, lapack_int* m, lapack_int* p,
+                    lapack_int* q, double* x11, lapack_int* ldx11,
+                    double* x12, lapack_int* ldx12, double* x21,
+                    lapack_int* ldx21, double* x22, lapack_int* ldx22,
+                    double* theta, double* u1, lapack_int* ldu1,
+                    double* u2, lapack_int* ldu2, double* v1t,
+                    lapack_int* ldv1t, double* v2t, lapack_int* ldv2t,
+                    double* work, lapack_int* lwork,
+                    lapack_int* iwork , lapack_int *info );
+void LAPACK_dsyconv( char* uplo, char* way,
+                     lapack_int* n, double* a, lapack_int* lda,
+                     const lapack_int* ipiv, double* work , lapack_int *info );
+void LAPACK_dsyswapr( char* uplo, lapack_int* n,
+                      double* a, lapack_int* i1, lapack_int* i2 );
+void LAPACK_dsytri2( char* uplo, lapack_int* n,
+                     double* a, lapack_int* lda,
+                     const lapack_int* ipiv,
+                     lapack_complex_double* work, lapack_int* lwork , lapack_int *info );
+void LAPACK_dsytri2x( char* uplo, lapack_int* n,
+                      double* a, lapack_int* lda,
+                      const lapack_int* ipiv, double* work,
+                      lapack_int* nb , lapack_int *info );
+void LAPACK_dsytrs2( char* uplo, lapack_int* n,
+                     lapack_int* nrhs, const double* a,
+                     lapack_int* lda, const lapack_int* ipiv,
+                     double* b, lapack_int* ldb, double* work , lapack_int *info );
+void LAPACK_sbbcsd( char* jobu1, char* jobu2,
+                    char* jobv1t, char* jobv2t, char* trans,
+                    lapack_int* m, lapack_int* p, lapack_int* q,
+                    float* theta, float* phi, float* u1,
+                    lapack_int* ldu1, float* u2, lapack_int* ldu2,
+                    float* v1t, lapack_int* ldv1t, float* v2t,
+                    lapack_int* ldv2t, float* b11d, float* b11e,
+                    float* b12d, float* b12e, float* b21d,
+                    float* b21e, float* b22d, float* b22e,
+                    float* work, lapack_int* lwork , lapack_int *info );
+void LAPACK_sorbdb( char* trans, char* signs,
+                    lapack_int* m, lapack_int* p, lapack_int* q,
+                    float* x11, lapack_int* ldx11, float* x12,
+                    lapack_int* ldx12, float* x21, lapack_int* ldx21,
+                    float* x22, lapack_int* ldx22, float* theta,
+                    float* phi, float* taup1, float* taup2,
+                    float* tauq1, float* tauq2, float* work,
+                    lapack_int* lwork , lapack_int *info );
+void LAPACK_sorcsd( char* jobu1, char* jobu2,
+                    char* jobv1t, char* jobv2t, char* trans,
+                    char* signs, lapack_int* m, lapack_int* p,
+                    lapack_int* q, float* x11, lapack_int* ldx11,
+                    float* x12, lapack_int* ldx12, float* x21,
+                    lapack_int* ldx21, float* x22, lapack_int* ldx22,
+                    float* theta, float* u1, lapack_int* ldu1,
+                    float* u2, lapack_int* ldu2, float* v1t,
+                    lapack_int* ldv1t, float* v2t, lapack_int* ldv2t,
+                    float* work, lapack_int* lwork,
+                    lapack_int* iwork , lapack_int *info );
+void LAPACK_ssyconv( char* uplo, char* way,
+                     lapack_int* n, float* a, lapack_int* lda,
+                     const lapack_int* ipiv, float* work , lapack_int *info );
+void LAPACK_ssyswapr( char* uplo, lapack_int* n,
+                      float* a, lapack_int* i1, lapack_int* i2 );
+void LAPACK_ssytri2( char* uplo, lapack_int* n,
+                     float* a, lapack_int* lda,
+                     const lapack_int* ipiv,
+                     lapack_complex_float* work, lapack_int* lwork , lapack_int *info );
+void LAPACK_ssytri2x( char* uplo, lapack_int* n,
+                      float* a, lapack_int* lda,
+                      const lapack_int* ipiv, float* work,
+                      lapack_int* nb , lapack_int *info );
+void LAPACK_ssytrs2( char* uplo, lapack_int* n,
+                     lapack_int* nrhs, const float* a,
+                     lapack_int* lda, const lapack_int* ipiv,
+                     float* b, lapack_int* ldb, float* work , lapack_int *info );
+void LAPACK_zbbcsd( char* jobu1, char* jobu2,
+                    char* jobv1t, char* jobv2t, char* trans,
+                    lapack_int* m, lapack_int* p, lapack_int* q,
+                    double* theta, double* phi,
+                    lapack_complex_double* u1, lapack_int* ldu1,
+                    lapack_complex_double* u2, lapack_int* ldu2,
+                    lapack_complex_double* v1t, lapack_int* ldv1t,
+                    lapack_complex_double* v2t, lapack_int* ldv2t,
+                    double* b11d, double* b11e, double* b12d,
+                    double* b12e, double* b21d, double* b21e,
+                    double* b22d, double* b22e, double* rwork,
+                    lapack_int* lrwork , lapack_int *info );
+void LAPACK_zheswapr( char* uplo, lapack_int* n,
+                      lapack_complex_double* a, lapack_int* i1,
+                      lapack_int* i2 );
+void LAPACK_zhetri2( char* uplo, lapack_int* n,
+                     lapack_complex_double* a, lapack_int* lda,
+                     const lapack_int* ipiv,
+                     lapack_complex_double* work, lapack_int* lwork , lapack_int *info );
+void LAPACK_zhetri2x( char* uplo, lapack_int* n,
+                      lapack_complex_double* a, lapack_int* lda,
+                      const lapack_int* ipiv,
+                      lapack_complex_double* work, lapack_int* nb , lapack_int *info );
+void LAPACK_zhetrs2( char* uplo, lapack_int* n,
+                     lapack_int* nrhs,
+                     const lapack_complex_double* a, lapack_int* lda,
+                     const lapack_int* ipiv,
+                     lapack_complex_double* b, lapack_int* ldb,
+                     lapack_complex_double* work , lapack_int *info );
+void LAPACK_zsyconv( char* uplo, char* way,
+                     lapack_int* n, lapack_complex_double* a,
+                     lapack_int* lda, const lapack_int* ipiv,
+                     lapack_complex_double* work , lapack_int *info );
+void LAPACK_zsyswapr( char* uplo, lapack_int* n,
+                      lapack_complex_double* a, lapack_int* i1,
+                      lapack_int* i2 );
+void LAPACK_zsytri2( char* uplo, lapack_int* n,
+                     lapack_complex_double* a, lapack_int* lda,
+                     const lapack_int* ipiv,
+                     lapack_complex_double* work, lapack_int* lwork , lapack_int *info );
+void LAPACK_zsytri2x( char* uplo, lapack_int* n,
+                      lapack_complex_double* a, lapack_int* lda,
+                      const lapack_int* ipiv,
+                      lapack_complex_double* work, lapack_int* nb , lapack_int *info );
+void LAPACK_zsytrs2( char* uplo, lapack_int* n,
+                     lapack_int* nrhs,
+                     const lapack_complex_double* a, lapack_int* lda,
+                     const lapack_int* ipiv,
+                     lapack_complex_double* b, lapack_int* ldb,
+                     lapack_complex_double* work , lapack_int *info );
+void LAPACK_zunbdb( char* trans, char* signs,
+                    lapack_int* m, lapack_int* p, lapack_int* q,
+                    lapack_complex_double* x11, lapack_int* ldx11,
+                    lapack_complex_double* x12, lapack_int* ldx12,
+                    lapack_complex_double* x21, lapack_int* ldx21,
+                    lapack_complex_double* x22, lapack_int* ldx22,
+                    double* theta, double* phi,
+                    lapack_complex_double* taup1,
+                    lapack_complex_double* taup2,
+                    lapack_complex_double* tauq1,
+                    lapack_complex_double* tauq2,
+                    lapack_complex_double* work, lapack_int* lwork , lapack_int *info );
+void LAPACK_zuncsd( char* jobu1, char* jobu2,
+                    char* jobv1t, char* jobv2t, char* trans,
+                    char* signs, lapack_int* m, lapack_int* p,
+                    lapack_int* q, lapack_complex_double* x11,
+                    lapack_int* ldx11, lapack_complex_double* x12,
+                    lapack_int* ldx12, lapack_complex_double* x21,
+                    lapack_int* ldx21, lapack_complex_double* x22,
+                    lapack_int* ldx22, double* theta,
+                    lapack_complex_double* u1, lapack_int* ldu1,
+                    lapack_complex_double* u2, lapack_int* ldu2,
+                    lapack_complex_double* v1t, lapack_int* ldv1t,
+                    lapack_complex_double* v2t, lapack_int* ldv2t,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    double* rwork, lapack_int* lrwork,
+                    lapack_int* iwork , lapack_int *info );
+// LAPACK 3.4.0
+void LAPACK_sgemqrt( char* side, char* trans, lapack_int* m, lapack_int* n,
+                     lapack_int* k, lapack_int* nb, const float* v,
+                     lapack_int* ldv, const float* t, lapack_int* ldt, float* c,
+                     lapack_int* ldc, float* work, lapack_int *info );
+void LAPACK_dgemqrt( char* side, char* trans, lapack_int* m, lapack_int* n,
+                     lapack_int* k, lapack_int* nb, const double* v,
+                     lapack_int* ldv, const double* t, lapack_int* ldt,
+                     double* c, lapack_int* ldc, double* work,
+                     lapack_int *info );
+void LAPACK_cgemqrt( char* side, char* trans, lapack_int* m, lapack_int* n,
+                     lapack_int* k, lapack_int* nb,
+                     const lapack_complex_float* v, lapack_int* ldv,
+                     const lapack_complex_float* t, lapack_int* ldt,
+                     lapack_complex_float* c, lapack_int* ldc,
+                     lapack_complex_float* work, lapack_int *info );
+void LAPACK_zgemqrt( char* side, char* trans, lapack_int* m, lapack_int* n,
+                     lapack_int* k, lapack_int* nb,
+                     const lapack_complex_double* v, lapack_int* ldv,
+                     const lapack_complex_double* t, lapack_int* ldt,
+                     lapack_complex_double* c, lapack_int* ldc,
+                     lapack_complex_double* work, lapack_int *info );
+void LAPACK_sgeqrt( lapack_int* m, lapack_int* n, lapack_int* nb, float* a,
+                    lapack_int* lda, float* t, lapack_int* ldt, float* work,
+                    lapack_int *info );
+void LAPACK_dgeqrt( lapack_int* m, lapack_int* n, lapack_int* nb, double* a,
+                    lapack_int* lda, double* t, lapack_int* ldt, double* work,
+                    lapack_int *info );
+void LAPACK_cgeqrt( lapack_int* m, lapack_int* n, lapack_int* nb,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* t, lapack_int* ldt,
+                    lapack_complex_float* work, lapack_int *info );
+void LAPACK_zgeqrt( lapack_int* m, lapack_int* n, lapack_int* nb,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* t, lapack_int* ldt,
+                    lapack_complex_double* work, lapack_int *info );
+void LAPACK_sgeqrt2( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
+                     float* t, lapack_int* ldt, lapack_int *info );
+void LAPACK_dgeqrt2( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
+                     double* t, lapack_int* ldt, lapack_int *info );
+void LAPACK_cgeqrt2( lapack_int* m, lapack_int* n, lapack_complex_float* a,
+                     lapack_int* lda, lapack_complex_float* t, lapack_int* ldt,
+                     lapack_int *info );
+void LAPACK_zgeqrt2( lapack_int* m, lapack_int* n, lapack_complex_double* a,
+                     lapack_int* lda, lapack_complex_double* t, lapack_int* ldt,
+                     lapack_int *info );
+void LAPACK_sgeqrt3( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
+                     float* t, lapack_int* ldt, lapack_int *info );
+void LAPACK_dgeqrt3( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
+                     double* t, lapack_int* ldt, lapack_int *info );
+void LAPACK_cgeqrt3( lapack_int* m, lapack_int* n, lapack_complex_float* a,
+                     lapack_int* lda, lapack_complex_float* t, lapack_int* ldt,
+                     lapack_int *info );
+void LAPACK_zgeqrt3( lapack_int* m, lapack_int* n, lapack_complex_double* a,
+                     lapack_int* lda, lapack_complex_double* t, lapack_int* ldt,
+                     lapack_int *info );
+void LAPACK_stpmqrt( char* side, char* trans, lapack_int* m, lapack_int* n,
+                     lapack_int* k, lapack_int* l, lapack_int* nb,
+                     const float* v, lapack_int* ldv, const float* t,
+                     lapack_int* ldt, float* a, lapack_int* lda, float* b,
+                     lapack_int* ldb, float* work, lapack_int *info );
+void LAPACK_dtpmqrt( char* side, char* trans, lapack_int* m, lapack_int* n,
+                     lapack_int* k, lapack_int* l, lapack_int* nb,
+                     const double* v, lapack_int* ldv, const double* t,
+                     lapack_int* ldt, double* a, lapack_int* lda, double* b,
+                     lapack_int* ldb, double* work, lapack_int *info );
+void LAPACK_ctpmqrt( char* side, char* trans, lapack_int* m, lapack_int* n,
+                     lapack_int* k, lapack_int* l, lapack_int* nb,
+                     const lapack_complex_float* v, lapack_int* ldv,
+                     const lapack_complex_float* t, lapack_int* ldt,
+                     lapack_complex_float* a, lapack_int* lda,
+                     lapack_complex_float* b, lapack_int* ldb,
+                     lapack_complex_float* work, lapack_int *info );
+void LAPACK_ztpmqrt( char* side, char* trans, lapack_int* m, lapack_int* n,
+                     lapack_int* k, lapack_int* l, lapack_int* nb,
+                     const lapack_complex_double* v, lapack_int* ldv,
+                     const lapack_complex_double* t, lapack_int* ldt,
+                     lapack_complex_double* a, lapack_int* lda,
+                     lapack_complex_double* b, lapack_int* ldb,
+                     lapack_complex_double* work, lapack_int *info );
+void LAPACK_dtpqrt( lapack_int* m, lapack_int* n, lapack_int* l, lapack_int* nb,
+                    double* a, lapack_int* lda, double* b, lapack_int* ldb,
+                    double* t, lapack_int* ldt, double* work,
+                    lapack_int *info );
+void LAPACK_ctpqrt( lapack_int* m, lapack_int* n, lapack_int* l, lapack_int* nb,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* t, lapack_complex_float* b,
+                    lapack_int* ldb, lapack_int* ldt,
+                    lapack_complex_float* work, lapack_int *info );
+void LAPACK_ztpqrt( lapack_int* m, lapack_int* n, lapack_int* l, lapack_int* nb,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* t, lapack_int* ldt,
+                    lapack_complex_double* work, lapack_int *info );
+void LAPACK_stpqrt2( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
+                     float* b, lapack_int* ldb, float* t, lapack_int* ldt,
+                     lapack_int *info );
+void LAPACK_dtpqrt2( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
+                     double* b, lapack_int* ldb, double* t, lapack_int* ldt,
+                     lapack_int *info );
+void LAPACK_ctpqrt2( lapack_int* m, lapack_int* n, lapack_complex_float* a,
+                     lapack_int* lda, lapack_complex_float* b, lapack_int* ldb,
+                     lapack_complex_float* t, lapack_int* ldt,
+                     lapack_int *info );
+void LAPACK_ztpqrt2( lapack_int* m, lapack_int* n, lapack_complex_double* a,
+                     lapack_int* lda, lapack_complex_double* b, lapack_int* ldb,
+                     lapack_complex_double* t, lapack_int* ldt,
+                     lapack_int *info );
+void LAPACK_stprfb( char* side, char* trans, char* direct, char* storev,
+                    lapack_int* m, lapack_int* n, lapack_int* k, lapack_int* l,
+                    const float* v, lapack_int* ldv, const float* t,
+                    lapack_int* ldt, float* a, lapack_int* lda, float* b,
+                    lapack_int* ldb, const float* mywork,
+                    lapack_int* myldwork );
+void LAPACK_dtprfb( char* side, char* trans, char* direct, char* storev,
+                    lapack_int* m, lapack_int* n, lapack_int* k, lapack_int* l,
+                    const double* v, lapack_int* ldv, const double* t,
+                    lapack_int* ldt, double* a, lapack_int* lda, double* b,
+                    lapack_int* ldb, const double* mywork,
+                    lapack_int* myldwork );
+void LAPACK_ctprfb( char* side, char* trans, char* direct, char* storev,
+                    lapack_int* m, lapack_int* n, lapack_int* k, lapack_int* l,
+                    const lapack_complex_float* v, lapack_int* ldv,
+                    const lapack_complex_float* t, lapack_int* ldt,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* b, lapack_int* ldb,
+                    const float* mywork, lapack_int* myldwork );
+void LAPACK_ztprfb( char* side, char* trans, char* direct, char* storev,
+                    lapack_int* m, lapack_int* n, lapack_int* k, lapack_int* l,
+                    const lapack_complex_double* v, lapack_int* ldv,
+                    const lapack_complex_double* t, lapack_int* ldt,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    const double* mywork, lapack_int* myldwork );
+// LAPACK 3.X.X
+void LAPACK_csyr( char* uplo, lapack_int* n, lapack_complex_float* alpha,
+                      const lapack_complex_float* x, lapack_int* incx,
+                      lapack_complex_float* a, lapack_int* lda );
+void LAPACK_zsyr( char* uplo, lapack_int* n, lapack_complex_double* alpha,
+                      const lapack_complex_double* x, lapack_int* incx,
+                      lapack_complex_double* a, lapack_int* lda );
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif /* _LAPACKE_H_ */
+
+#endif /* _MKL_LAPACKE_H_ */
diff --git a/Eigen/src/misc/lapacke_mangling.h b/Eigen/src/misc/lapacke_mangling.h
new file mode 100644
index 000000000..6211fd144
--- /dev/null
+++ b/Eigen/src/misc/lapacke_mangling.h
@@ -0,0 +1,17 @@
+#ifndef LAPACK_HEADER_INCLUDED
+#define LAPACK_HEADER_INCLUDED
+
+#ifndef LAPACK_GLOBAL
+#if defined(LAPACK_GLOBAL_PATTERN_LC) || defined(ADD_)
+#define LAPACK_GLOBAL(lcname,UCNAME)  lcname##_
+#elif defined(LAPACK_GLOBAL_PATTERN_UC) || defined(UPPER)
+#define LAPACK_GLOBAL(lcname,UCNAME)  UCNAME
+#elif defined(LAPACK_GLOBAL_PATTERN_MC) || defined(NOCHANGE)
+#define LAPACK_GLOBAL(lcname,UCNAME)  lcname
+#else
+#define LAPACK_GLOBAL(lcname,UCNAME)  lcname##_
+#endif
+#endif
+
+#endif
+
diff --git a/Eigen/src/plugins/ArrayCwiseBinaryOps.h b/Eigen/src/plugins/ArrayCwiseBinaryOps.h
index 1951286f3..1f8a531af 100644
--- a/Eigen/src/plugins/ArrayCwiseBinaryOps.h
+++ b/Eigen/src/plugins/ArrayCwiseBinaryOps.h
@@ -1,12 +1,14 @@
+
 /** \returns an expression of the coefficient wise product of \c *this and \a other
   *
   * \sa MatrixBase::cwiseProduct
   */
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE const EIGEN_CWISE_PRODUCT_RETURN_TYPE(Derived,OtherDerived)
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE const EIGEN_CWISE_BINARY_RETURN_TYPE(Derived,OtherDerived,product)
 operator*(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
 {
-  return EIGEN_CWISE_PRODUCT_RETURN_TYPE(Derived,OtherDerived)(derived(), other.derived());
+  return EIGEN_CWISE_BINARY_RETURN_TYPE(Derived,OtherDerived,product)(derived(), other.derived());
 }
 
 /** \returns an expression of the coefficient wise quotient of \c *this and \a other
@@ -14,10 +16,11 @@ operator*(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
   * \sa MatrixBase::cwiseQuotient
   */
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_quotient_op<Scalar>, const Derived, const OtherDerived>
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_quotient_op<Scalar,typename OtherDerived::Scalar>, const Derived, const OtherDerived>
 operator/(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
 {
-  return CwiseBinaryOp<internal::scalar_quotient_op<Scalar>, const Derived, const OtherDerived>(derived(), other.derived());
+  return CwiseBinaryOp<internal::scalar_quotient_op<Scalar,typename OtherDerived::Scalar>, const Derived, const OtherDerived>(derived(), other.derived());
 }
 
 /** \returns an expression of the coefficient-wise min of \c *this and \a other
@@ -27,13 +30,14 @@ operator/(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
   *
   * \sa max()
   */
-EIGEN_MAKE_CWISE_BINARY_OP(min,internal::scalar_min_op)
+EIGEN_MAKE_CWISE_BINARY_OP(min,min)
 
 /** \returns an expression of the coefficient-wise min of \c *this and scalar \a other
   *
   * \sa max()
   */
-EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_min_op<Scalar>, const Derived,
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar>, const Derived,
                                         const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject> >
 #ifdef EIGEN_PARSED_BY_DOXYGEN
 min
@@ -52,13 +56,14 @@ min
   *
   * \sa min()
   */
-EIGEN_MAKE_CWISE_BINARY_OP(max,internal::scalar_max_op)
+EIGEN_MAKE_CWISE_BINARY_OP(max,max)
 
 /** \returns an expression of the coefficient-wise max of \c *this and scalar \a other
   *
   * \sa min()
   */
-EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_max_op<Scalar>, const Derived,
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar>, const Derived,
                                         const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject> >
 #ifdef EIGEN_PARSED_BY_DOXYGEN
 max
@@ -70,33 +75,62 @@ max
   return (max)(Derived::PlainObject::Constant(rows(), cols(), other));
 }
 
+/** \returns an expression of the coefficient-wise power of \c *this to the given array of \a exponents.
+  *
+  * This function computes the coefficient-wise power.
+  *
+  * Example: \include Cwise_array_power_array.cpp
+  * Output: \verbinclude Cwise_array_power_array.out
+  */
+EIGEN_MAKE_CWISE_BINARY_OP(pow,pow)
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT(pow,pow)
+#else
+/** \returns an expression of the coefficients of \c *this rasied to the constant power \a exponent
+  *
+  * \tparam T is the scalar type of \a exponent. It must be compatible with the scalar type of the given expression.
+  *
+  * This function computes the coefficient-wise power. The function MatrixBase::pow() in the
+  * unsupported module MatrixFunctions computes the matrix power.
+  *
+  * Example: \include Cwise_pow.cpp
+  * Output: \verbinclude Cwise_pow.out
+  *
+  * \sa ArrayBase::pow(ArrayBase), square(), cube(), exp(), log()
+  */
+template<typename T>
+const CwiseBinaryOp<internal::scalar_pow_op<Scalar,T>,Derived,Constant<T> > pow(const T& exponent) const;
+#endif
+
 
+// TODO code generating macros could be moved to Macros.h and could include generation of documentation
 #define EIGEN_MAKE_CWISE_COMP_OP(OP, COMPARATOR) \
 template<typename OtherDerived> \
-EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_ ## COMPARATOR>, const Derived, const OtherDerived> \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_cmp_op<Scalar, typename OtherDerived::Scalar, internal::cmp_ ## COMPARATOR>, const Derived, const OtherDerived> \
 OP(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const \
 { \
-  return CwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_ ## COMPARATOR>, const Derived, const OtherDerived>(derived(), other.derived()); \
+  return CwiseBinaryOp<internal::scalar_cmp_op<Scalar, typename OtherDerived::Scalar, internal::cmp_ ## COMPARATOR>, const Derived, const OtherDerived>(derived(), other.derived()); \
 }\
-typedef CwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_ ## COMPARATOR>, const Derived, const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject> > Cmp ## COMPARATOR ## ReturnType; \
-typedef CwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_ ## COMPARATOR>, const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject>, const Derived > RCmp ## COMPARATOR ## ReturnType; \
-EIGEN_STRONG_INLINE const Cmp ## COMPARATOR ## ReturnType \
+typedef CwiseBinaryOp<internal::scalar_cmp_op<Scalar,Scalar, internal::cmp_ ## COMPARATOR>, const Derived, const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject> > Cmp ## COMPARATOR ## ReturnType; \
+typedef CwiseBinaryOp<internal::scalar_cmp_op<Scalar,Scalar, internal::cmp_ ## COMPARATOR>, const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject>, const Derived > RCmp ## COMPARATOR ## ReturnType; \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Cmp ## COMPARATOR ## ReturnType \
 OP(const Scalar& s) const { \
   return this->OP(Derived::PlainObject::Constant(rows(), cols(), s)); \
 } \
-friend EIGEN_STRONG_INLINE const RCmp ## COMPARATOR ## ReturnType \
+EIGEN_DEVICE_FUNC friend EIGEN_STRONG_INLINE const RCmp ## COMPARATOR ## ReturnType \
 OP(const Scalar& s, const Derived& d) { \
   return Derived::PlainObject::Constant(d.rows(), d.cols(), s).OP(d); \
 }
 
 #define EIGEN_MAKE_CWISE_COMP_R_OP(OP, R_OP, RCOMPARATOR) \
 template<typename OtherDerived> \
-EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_##RCOMPARATOR>, const OtherDerived, const Derived> \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_cmp_op<typename OtherDerived::Scalar, Scalar, internal::cmp_##RCOMPARATOR>, const OtherDerived, const Derived> \
 OP(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const \
 { \
-  return CwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_##RCOMPARATOR>, const OtherDerived, const Derived>(other.derived(), derived()); \
+  return CwiseBinaryOp<internal::scalar_cmp_op<typename OtherDerived::Scalar, Scalar, internal::cmp_##RCOMPARATOR>, const OtherDerived, const Derived>(other.derived(), derived()); \
 } \
-\
+EIGEN_DEVICE_FUNC \
 inline const RCmp ## RCOMPARATOR ## ReturnType \
 OP(const Scalar& s) const { \
   return Derived::PlainObject::Constant(rows(), cols(), s).R_OP(*this); \
@@ -107,6 +141,7 @@ OP(const Scalar& s, const Derived& d) { \
 }
 
 
+
 /** \returns an expression of the coefficient-wise \< operator of *this and \a other
   *
   * Example: \include Cwise_less.cpp
@@ -171,83 +206,127 @@ EIGEN_MAKE_CWISE_COMP_OP(operator==, EQ)
   */
 EIGEN_MAKE_CWISE_COMP_OP(operator!=, NEQ)
 
+
 #undef EIGEN_MAKE_CWISE_COMP_OP
 #undef EIGEN_MAKE_CWISE_COMP_R_OP
 
 // scalar addition
-
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+EIGEN_MAKE_SCALAR_BINARY_OP(operator+,sum)
+#else
 /** \returns an expression of \c *this with each coeff incremented by the constant \a scalar
   *
+  * \tparam T is the scalar type of \a scalar. It must be compatible with the scalar type of the given expression.
+  *
   * Example: \include Cwise_plus.cpp
   * Output: \verbinclude Cwise_plus.out
   *
   * \sa operator+=(), operator-()
   */
-inline const CwiseUnaryOp<internal::scalar_add_op<Scalar>, const Derived>
-operator+(const Scalar& scalar) const
-{
-  return CwiseUnaryOp<internal::scalar_add_op<Scalar>, const Derived>(derived(), internal::scalar_add_op<Scalar>(scalar));
-}
-
-friend inline const CwiseUnaryOp<internal::scalar_add_op<Scalar>, const Derived>
-operator+(const Scalar& scalar,const EIGEN_CURRENT_STORAGE_BASE_CLASS<Derived>& other)
-{
-  return other + scalar;
-}
+template<typename T>
+const CwiseBinaryOp<internal::scalar_sum_op<Scalar,T>,Derived,Constant<T> > operator+(const T& scalar) const;
+/** \returns an expression of \a expr with each coeff incremented by the constant \a scalar
+  *
+  * \tparam T is the scalar type of \a scalar. It must be compatible with the scalar type of the given expression.
+  */
+template<typename T> friend
+const CwiseBinaryOp<internal::scalar_sum_op<T,Scalar>,Constant<T>,Derived> operator+(const T& scalar, const StorageBaseType& expr);
+#endif
 
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+EIGEN_MAKE_SCALAR_BINARY_OP(operator-,difference)
+#else
 /** \returns an expression of \c *this with each coeff decremented by the constant \a scalar
   *
+  * \tparam T is the scalar type of \a scalar. It must be compatible with the scalar type of the given expression.
+  *
   * Example: \include Cwise_minus.cpp
   * Output: \verbinclude Cwise_minus.out
   *
-  * \sa operator+(), operator-=()
+  * \sa operator+=(), operator-()
   */
-inline const CwiseUnaryOp<internal::scalar_add_op<Scalar>, const Derived>
-operator-(const Scalar& scalar) const
-{
-  return *this + (-scalar);
-}
+template<typename T>
+const CwiseBinaryOp<internal::scalar_difference_op<Scalar,T>,Derived,Constant<T> > operator-(const T& scalar) const;
+/** \returns an expression of the constant matrix of value \a scalar decremented by the coefficients of \a expr
+  *
+  * \tparam T is the scalar type of \a scalar. It must be compatible with the scalar type of the given expression.
+  */
+template<typename T> friend
+const CwiseBinaryOp<internal::scalar_difference_op<T,Scalar>,Constant<T>,Derived> operator-(const T& scalar, const StorageBaseType& expr);
+#endif
 
-friend inline const CwiseUnaryOp<internal::scalar_add_op<Scalar>, const CwiseUnaryOp<internal::scalar_opposite_op<Scalar>, const Derived> >
-operator-(const Scalar& scalar,const EIGEN_CURRENT_STORAGE_BASE_CLASS<Derived>& other)
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  EIGEN_MAKE_SCALAR_BINARY_OP_ONTHELEFT(operator/,quotient)
+#else
+  /**
+    * \brief Component-wise division of the scalar \a s by array elements of \a a.
+    *
+    * \tparam Scalar is the scalar type of \a x. It must be compatible with the scalar type of the given array expression (\c Derived::Scalar).
+    */
+  template<typename T> friend
+  inline const CwiseBinaryOp<internal::scalar_quotient_op<T,Scalar>,Constant<T>,Derived>
+  operator/(const T& s,const StorageBaseType& a);
+#endif
+
+/** \returns an expression of the coefficient-wise ^ operator of *this and \a other
+ *
+ * \warning this operator is for expression of bool only.
+ *
+ * Example: \include Cwise_boolean_xor.cpp
+ * Output: \verbinclude Cwise_boolean_xor.out
+ *
+ * \sa operator&&(), select()
+ */
+template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
+inline const CwiseBinaryOp<internal::scalar_boolean_xor_op, const Derived, const OtherDerived>
+operator^(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
 {
-  return (-other) + scalar;
+  EIGEN_STATIC_ASSERT((internal::is_same<bool,Scalar>::value && internal::is_same<bool,typename OtherDerived::Scalar>::value),
+                      THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL);
+  return CwiseBinaryOp<internal::scalar_boolean_xor_op, const Derived, const OtherDerived>(derived(),other.derived());
 }
 
-/** \returns an expression of the coefficient-wise && operator of *this and \a other
+// NOTE disabled until we agree on argument order
+#if 0
+/** \cpp11 \returns an expression of the coefficient-wise polygamma function.
   *
-  * \warning this operator is for expression of bool only.
+  * \specialfunctions_module
   *
-  * Example: \include Cwise_boolean_and.cpp
-  * Output: \verbinclude Cwise_boolean_and.out
+  * It returns the \a n -th derivative of the digamma(psi) evaluated at \c *this.
   *
-  * \sa operator||(), select()
+  * \warning Be careful with the order of the parameters: x.polygamma(n) is equivalent to polygamma(n,x)
+  *
+  * \sa Eigen::polygamma()
   */
-template<typename OtherDerived>
-inline const CwiseBinaryOp<internal::scalar_boolean_and_op, const Derived, const OtherDerived>
-operator&&(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
+template<typename DerivedN>
+inline const CwiseBinaryOp<internal::scalar_polygamma_op<Scalar>, const DerivedN, const Derived>
+polygamma(const EIGEN_CURRENT_STORAGE_BASE_CLASS<DerivedN> &n) const
 {
-  EIGEN_STATIC_ASSERT((internal::is_same<bool,Scalar>::value && internal::is_same<bool,typename OtherDerived::Scalar>::value),
-                      THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL);
-  return CwiseBinaryOp<internal::scalar_boolean_and_op, const Derived, const OtherDerived>(derived(),other.derived());
+  return CwiseBinaryOp<internal::scalar_polygamma_op<Scalar>, const DerivedN, const Derived>(n.derived(), this->derived());
 }
+#endif
 
-/** \returns an expression of the coefficient-wise || operator of *this and \a other
+/** \returns an expression of the coefficient-wise zeta function.
+  *
+  * \specialfunctions_module
+  *
+  * It returns the Riemann zeta function of two arguments \c *this and \a q:
   *
-  * \warning this operator is for expression of bool only.
+  * \param *this is the exposent, it must be > 1
+  * \param q is the shift, it must be > 0
   *
-  * Example: \include Cwise_boolean_or.cpp
-  * Output: \verbinclude Cwise_boolean_or.out
+  * \note This function supports only float and double scalar types. To support other scalar types, the user has
+  * to provide implementations of zeta(T,T) for any scalar type T to be supported.
   *
-  * \sa operator&&(), select()
+  * This method is an alias for zeta(*this,q);
+  *
+  * \sa Eigen::zeta()
   */
-template<typename OtherDerived>
-inline const CwiseBinaryOp<internal::scalar_boolean_or_op, const Derived, const OtherDerived>
-operator||(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
+template<typename DerivedQ>
+inline const CwiseBinaryOp<internal::scalar_zeta_op<Scalar>, const Derived, const DerivedQ>
+zeta(const EIGEN_CURRENT_STORAGE_BASE_CLASS<DerivedQ> &q) const
 {
-  EIGEN_STATIC_ASSERT((internal::is_same<bool,Scalar>::value && internal::is_same<bool,typename OtherDerived::Scalar>::value),
-                      THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL);
-  return CwiseBinaryOp<internal::scalar_boolean_or_op, const Derived, const OtherDerived>(derived(),other.derived());
+  return CwiseBinaryOp<internal::scalar_zeta_op<Scalar>, const Derived, const DerivedQ>(this->derived(), q.derived());
 }
-
-
diff --git a/Eigen/src/plugins/ArrayCwiseUnaryOps.h b/Eigen/src/plugins/ArrayCwiseUnaryOps.h
index 1c3ed3fcd..ebaa3f192 100644
--- a/Eigen/src/plugins/ArrayCwiseUnaryOps.h
+++ b/Eigen/src/plugins/ArrayCwiseUnaryOps.h
@@ -1,16 +1,62 @@
 
 
+typedef CwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived> AbsReturnType;
+typedef CwiseUnaryOp<internal::scalar_arg_op<Scalar>, const Derived> ArgReturnType;
+typedef CwiseUnaryOp<internal::scalar_abs2_op<Scalar>, const Derived> Abs2ReturnType;
+typedef CwiseUnaryOp<internal::scalar_sqrt_op<Scalar>, const Derived> SqrtReturnType;
+typedef CwiseUnaryOp<internal::scalar_rsqrt_op<Scalar>, const Derived> RsqrtReturnType;
+typedef CwiseUnaryOp<internal::scalar_sign_op<Scalar>, const Derived> SignReturnType;
+typedef CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const Derived> InverseReturnType;
+typedef CwiseUnaryOp<internal::scalar_boolean_not_op<Scalar>, const Derived> BooleanNotReturnType;
+
+typedef CwiseUnaryOp<internal::scalar_exp_op<Scalar>, const Derived> ExpReturnType;
+typedef CwiseUnaryOp<internal::scalar_log_op<Scalar>, const Derived> LogReturnType;
+typedef CwiseUnaryOp<internal::scalar_log1p_op<Scalar>, const Derived> Log1pReturnType;
+typedef CwiseUnaryOp<internal::scalar_log10_op<Scalar>, const Derived> Log10ReturnType;
+typedef CwiseUnaryOp<internal::scalar_cos_op<Scalar>, const Derived> CosReturnType;
+typedef CwiseUnaryOp<internal::scalar_sin_op<Scalar>, const Derived> SinReturnType;
+typedef CwiseUnaryOp<internal::scalar_tan_op<Scalar>, const Derived> TanReturnType;
+typedef CwiseUnaryOp<internal::scalar_acos_op<Scalar>, const Derived> AcosReturnType;
+typedef CwiseUnaryOp<internal::scalar_asin_op<Scalar>, const Derived> AsinReturnType;
+typedef CwiseUnaryOp<internal::scalar_atan_op<Scalar>, const Derived> AtanReturnType;
+typedef CwiseUnaryOp<internal::scalar_tanh_op<Scalar>, const Derived> TanhReturnType;
+typedef CwiseUnaryOp<internal::scalar_sinh_op<Scalar>, const Derived> SinhReturnType;
+typedef CwiseUnaryOp<internal::scalar_cosh_op<Scalar>, const Derived> CoshReturnType;
+typedef CwiseUnaryOp<internal::scalar_square_op<Scalar>, const Derived> SquareReturnType;
+typedef CwiseUnaryOp<internal::scalar_cube_op<Scalar>, const Derived> CubeReturnType;
+typedef CwiseUnaryOp<internal::scalar_round_op<Scalar>, const Derived> RoundReturnType;
+typedef CwiseUnaryOp<internal::scalar_floor_op<Scalar>, const Derived> FloorReturnType;
+typedef CwiseUnaryOp<internal::scalar_ceil_op<Scalar>, const Derived> CeilReturnType;
+typedef CwiseUnaryOp<internal::scalar_isnan_op<Scalar>, const Derived> IsNaNReturnType;
+typedef CwiseUnaryOp<internal::scalar_isinf_op<Scalar>, const Derived> IsInfReturnType;
+typedef CwiseUnaryOp<internal::scalar_isfinite_op<Scalar>, const Derived> IsFiniteReturnType;
+
 /** \returns an expression of the coefficient-wise absolute value of \c *this
   *
   * Example: \include Cwise_abs.cpp
   * Output: \verbinclude Cwise_abs.out
   *
-  * \sa abs2()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_abs">Math functions</a>, abs2()
   */
-EIGEN_STRONG_INLINE const CwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived>
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE const AbsReturnType
 abs() const
 {
-  return derived();
+  return AbsReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise phase angle of \c *this
+  *
+  * Example: \include Cwise_arg.cpp
+  * Output: \verbinclude Cwise_arg.out
+  *
+  * \sa abs()
+  */
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE const ArgReturnType
+arg() const
+{
+  return ArgReturnType(derived());
 }
 
 /** \returns an expression of the coefficient-wise squared absolute value of \c *this
@@ -18,78 +64,190 @@ abs() const
   * Example: \include Cwise_abs2.cpp
   * Output: \verbinclude Cwise_abs2.out
   *
-  * \sa abs(), square()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_abs2">Math functions</a>, abs(), square()
   */
-EIGEN_STRONG_INLINE const CwiseUnaryOp<internal::scalar_abs2_op<Scalar>, const Derived>
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE const Abs2ReturnType
 abs2() const
 {
-  return derived();
+  return Abs2ReturnType(derived());
 }
 
 /** \returns an expression of the coefficient-wise exponential of *this.
   *
+  * This function computes the coefficient-wise exponential. The function MatrixBase::exp() in the
+  * unsupported module MatrixFunctions computes the matrix exponential.
+  *
   * Example: \include Cwise_exp.cpp
   * Output: \verbinclude Cwise_exp.out
   *
-  * \sa pow(), log(), sin(), cos()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_exp">Math functions</a>, pow(), log(), sin(), cos()
   */
-inline const CwiseUnaryOp<internal::scalar_exp_op<Scalar>, const Derived>
+EIGEN_DEVICE_FUNC
+inline const ExpReturnType
 exp() const
 {
-  return derived();
+  return ExpReturnType(derived());
 }
 
 /** \returns an expression of the coefficient-wise logarithm of *this.
   *
+  * This function computes the coefficient-wise logarithm. The function MatrixBase::log() in the
+  * unsupported module MatrixFunctions computes the matrix logarithm.
+  *
   * Example: \include Cwise_log.cpp
   * Output: \verbinclude Cwise_log.out
   *
-  * \sa exp()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_log">Math functions</a>, exp()
   */
-inline const CwiseUnaryOp<internal::scalar_log_op<Scalar>, const Derived>
+EIGEN_DEVICE_FUNC
+inline const LogReturnType
 log() const
 {
-  return derived();
+  return LogReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise logarithm of 1 plus \c *this.
+  *
+  * In exact arithmetic, \c x.log() is equivalent to \c (x+1).log(),
+  * however, with finite precision, this function is much more accurate when \c x is close to zero.
+  *
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_log1p">Math functions</a>, log()
+  */
+EIGEN_DEVICE_FUNC
+inline const Log1pReturnType
+log1p() const
+{
+  return Log1pReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise base-10 logarithm of *this.
+  *
+  * This function computes the coefficient-wise base-10 logarithm.
+  *
+  * Example: \include Cwise_log10.cpp
+  * Output: \verbinclude Cwise_log10.out
+  *
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_log10">Math functions</a>, log()
+  */
+EIGEN_DEVICE_FUNC
+inline const Log10ReturnType
+log10() const
+{
+  return Log10ReturnType(derived());
 }
 
 /** \returns an expression of the coefficient-wise square root of *this.
   *
+  * This function computes the coefficient-wise square root. The function MatrixBase::sqrt() in the
+  * unsupported module MatrixFunctions computes the matrix square root.
+  *
   * Example: \include Cwise_sqrt.cpp
   * Output: \verbinclude Cwise_sqrt.out
   *
-  * \sa pow(), square()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_sqrt">Math functions</a>, pow(), square()
   */
-inline const CwiseUnaryOp<internal::scalar_sqrt_op<Scalar>, const Derived>
+EIGEN_DEVICE_FUNC
+inline const SqrtReturnType
 sqrt() const
 {
-  return derived();
+  return SqrtReturnType(derived());
 }
 
+/** \returns an expression of the coefficient-wise inverse square root of *this.
+  *
+  * This function computes the coefficient-wise inverse square root.
+  *
+  * Example: \include Cwise_sqrt.cpp
+  * Output: \verbinclude Cwise_sqrt.out
+  *
+  * \sa pow(), square()
+  */
+EIGEN_DEVICE_FUNC
+inline const RsqrtReturnType
+rsqrt() const
+{
+  return RsqrtReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise signum of *this.
+  *
+  * This function computes the coefficient-wise signum.
+  *
+  * Example: \include Cwise_sign.cpp
+  * Output: \verbinclude Cwise_sign.out
+  *
+  * \sa pow(), square()
+  */
+EIGEN_DEVICE_FUNC
+inline const SignReturnType
+sign() const
+{
+  return SignReturnType(derived());
+}
+
+
 /** \returns an expression of the coefficient-wise cosine of *this.
   *
+  * This function computes the coefficient-wise cosine. The function MatrixBase::cos() in the
+  * unsupported module MatrixFunctions computes the matrix cosine.
+  *
   * Example: \include Cwise_cos.cpp
   * Output: \verbinclude Cwise_cos.out
   *
-  * \sa sin(), acos()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_cos">Math functions</a>, sin(), acos()
   */
-inline const CwiseUnaryOp<internal::scalar_cos_op<Scalar>, const Derived>
+EIGEN_DEVICE_FUNC
+inline const CosReturnType
 cos() const
 {
-  return derived();
+  return CosReturnType(derived());
 }
 
 
 /** \returns an expression of the coefficient-wise sine of *this.
   *
+  * This function computes the coefficient-wise sine. The function MatrixBase::sin() in the
+  * unsupported module MatrixFunctions computes the matrix sine.
+  *
   * Example: \include Cwise_sin.cpp
   * Output: \verbinclude Cwise_sin.out
   *
-  * \sa cos(), asin()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_sin">Math functions</a>, cos(), asin()
   */
-inline const CwiseUnaryOp<internal::scalar_sin_op<Scalar>, const Derived>
+EIGEN_DEVICE_FUNC
+inline const SinReturnType
 sin() const
 {
-  return derived();
+  return SinReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise tan of *this.
+  *
+  * Example: \include Cwise_tan.cpp
+  * Output: \verbinclude Cwise_tan.out
+  *
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_tan">Math functions</a>, cos(), sin()
+  */
+EIGEN_DEVICE_FUNC
+inline const TanReturnType
+tan() const
+{
+  return TanReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise arc tan of *this.
+  *
+  * Example: \include Cwise_atan.cpp
+  * Output: \verbinclude Cwise_atan.out
+  *
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_atan">Math functions</a>, tan(), asin(), acos()
+  */
+EIGEN_DEVICE_FUNC
+inline const AtanReturnType
+atan() const
+{
+  return AtanReturnType(derived());
 }
 
 /** \returns an expression of the coefficient-wise arc cosine of *this.
@@ -97,12 +255,13 @@ sin() const
   * Example: \include Cwise_acos.cpp
   * Output: \verbinclude Cwise_acos.out
   *
-  * \sa cos(), asin()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_acos">Math functions</a>, cos(), asin()
   */
-inline const CwiseUnaryOp<internal::scalar_acos_op<Scalar>, const Derived>
+EIGEN_DEVICE_FUNC
+inline const AcosReturnType
 acos() const
 {
-  return derived();
+  return AcosReturnType(derived());
 }
 
 /** \returns an expression of the coefficient-wise arc sine of *this.
@@ -110,42 +269,56 @@ acos() const
   * Example: \include Cwise_asin.cpp
   * Output: \verbinclude Cwise_asin.out
   *
-  * \sa sin(), acos()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_asin">Math functions</a>, sin(), acos()
   */
-inline const CwiseUnaryOp<internal::scalar_asin_op<Scalar>, const Derived>
+EIGEN_DEVICE_FUNC
+inline const AsinReturnType
 asin() const
 {
-  return derived();
+  return AsinReturnType(derived());
 }
 
-/** \returns an expression of the coefficient-wise tan of *this.
+/** \returns an expression of the coefficient-wise hyperbolic tan of *this.
   *
-  * Example: \include Cwise_tan.cpp
-  * Output: \verbinclude Cwise_tan.out
+  * Example: \include Cwise_tanh.cpp
+  * Output: \verbinclude Cwise_tanh.out
   *
-  * \sa cos(), sin()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_tanh">Math functions</a>, tan(), sinh(), cosh()
   */
-inline const CwiseUnaryOp<internal::scalar_tan_op<Scalar>, Derived>
-tan() const
+EIGEN_DEVICE_FUNC
+inline const TanhReturnType
+tanh() const
 {
-  return derived();
+  return TanhReturnType(derived());
 }
 
-
-/** \returns an expression of the coefficient-wise power of *this to the given exponent.
+/** \returns an expression of the coefficient-wise hyperbolic sin of *this.
   *
-  * Example: \include Cwise_pow.cpp
-  * Output: \verbinclude Cwise_pow.out
+  * Example: \include Cwise_sinh.cpp
+  * Output: \verbinclude Cwise_sinh.out
   *
-  * \sa exp(), log()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_sinh">Math functions</a>, sin(), tanh(), cosh()
   */
-inline const CwiseUnaryOp<internal::scalar_pow_op<Scalar>, const Derived>
-pow(const Scalar& exponent) const
+EIGEN_DEVICE_FUNC
+inline const SinhReturnType
+sinh() const
 {
-  return CwiseUnaryOp<internal::scalar_pow_op<Scalar>, const Derived>
-          (derived(), internal::scalar_pow_op<Scalar>(exponent));
+  return SinhReturnType(derived());
 }
 
+/** \returns an expression of the coefficient-wise hyperbolic cos of *this.
+  *
+  * Example: \include Cwise_cosh.cpp
+  * Output: \verbinclude Cwise_cosh.out
+  *
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_cosh">Math functions</a>, tan(), sinh(), cosh()
+  */
+EIGEN_DEVICE_FUNC
+inline const CoshReturnType
+cosh() const
+{
+  return CoshReturnType(derived());
+}
 
 /** \returns an expression of the coefficient-wise inverse of *this.
   *
@@ -154,10 +327,11 @@ pow(const Scalar& exponent) const
   *
   * \sa operator/(), operator*()
   */
-inline const CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const Derived>
+EIGEN_DEVICE_FUNC
+inline const InverseReturnType
 inverse() const
 {
-  return derived();
+  return InverseReturnType(derived());
 }
 
 /** \returns an expression of the coefficient-wise square of *this.
@@ -165,12 +339,13 @@ inverse() const
   * Example: \include Cwise_square.cpp
   * Output: \verbinclude Cwise_square.out
   *
-  * \sa operator/(), operator*(), abs2()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_squareE">Math functions</a>, abs2(), cube(), pow()
   */
-inline const CwiseUnaryOp<internal::scalar_square_op<Scalar>, const Derived>
+EIGEN_DEVICE_FUNC
+inline const SquareReturnType
 square() const
 {
-  return derived();
+  return SquareReturnType(derived());
 }
 
 /** \returns an expression of the coefficient-wise cube of *this.
@@ -178,10 +353,200 @@ square() const
   * Example: \include Cwise_cube.cpp
   * Output: \verbinclude Cwise_cube.out
   *
-  * \sa square(), pow()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_cube">Math functions</a>, square(), pow()
   */
-inline const CwiseUnaryOp<internal::scalar_cube_op<Scalar>, const Derived>
+EIGEN_DEVICE_FUNC
+inline const CubeReturnType
 cube() const
 {
-  return derived();
+  return CubeReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise round of *this.
+  *
+  * Example: \include Cwise_round.cpp
+  * Output: \verbinclude Cwise_round.out
+  *
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_round">Math functions</a>, ceil(), floor()
+  */
+EIGEN_DEVICE_FUNC
+inline const RoundReturnType
+round() const
+{
+  return RoundReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise floor of *this.
+  *
+  * Example: \include Cwise_floor.cpp
+  * Output: \verbinclude Cwise_floor.out
+  *
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_floor">Math functions</a>, ceil(), round()
+  */
+EIGEN_DEVICE_FUNC
+inline const FloorReturnType
+floor() const
+{
+  return FloorReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise ceil of *this.
+  *
+  * Example: \include Cwise_ceil.cpp
+  * Output: \verbinclude Cwise_ceil.out
+  *
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_ceil">Math functions</a>, floor(), round()
+  */
+EIGEN_DEVICE_FUNC
+inline const CeilReturnType
+ceil() const
+{
+  return CeilReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise isnan of *this.
+  *
+  * Example: \include Cwise_isNaN.cpp
+  * Output: \verbinclude Cwise_isNaN.out
+  *
+  * \sa isfinite(), isinf()
+  */
+EIGEN_DEVICE_FUNC
+inline const IsNaNReturnType
+isNaN() const
+{
+  return IsNaNReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise isinf of *this.
+  *
+  * Example: \include Cwise_isInf.cpp
+  * Output: \verbinclude Cwise_isInf.out
+  *
+  * \sa isnan(), isfinite()
+  */
+EIGEN_DEVICE_FUNC
+inline const IsInfReturnType
+isInf() const
+{
+  return IsInfReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise isfinite of *this.
+  *
+  * Example: \include Cwise_isFinite.cpp
+  * Output: \verbinclude Cwise_isFinite.out
+  *
+  * \sa isnan(), isinf()
+  */
+EIGEN_DEVICE_FUNC
+inline const IsFiniteReturnType
+isFinite() const
+{
+  return IsFiniteReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise ! operator of *this
+  *
+  * \warning this operator is for expression of bool only.
+  *
+  * Example: \include Cwise_boolean_not.cpp
+  * Output: \verbinclude Cwise_boolean_not.out
+  *
+  * \sa operator!=()
+  */
+EIGEN_DEVICE_FUNC
+inline const BooleanNotReturnType
+operator!() const
+{
+  EIGEN_STATIC_ASSERT((internal::is_same<bool,Scalar>::value),
+                      THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL);
+  return BooleanNotReturnType(derived());
+}
+
+
+// --- SpecialFunctions module ---
+
+typedef CwiseUnaryOp<internal::scalar_lgamma_op<Scalar>, const Derived> LgammaReturnType;
+typedef CwiseUnaryOp<internal::scalar_digamma_op<Scalar>, const Derived> DigammaReturnType;
+typedef CwiseUnaryOp<internal::scalar_erf_op<Scalar>, const Derived> ErfReturnType;
+typedef CwiseUnaryOp<internal::scalar_erfc_op<Scalar>, const Derived> ErfcReturnType;
+
+/** \cpp11 \returns an expression of the coefficient-wise ln(|gamma(*this)|).
+  *
+  * \specialfunctions_module
+  *
+  * Example: \include Cwise_lgamma.cpp
+  * Output: \verbinclude Cwise_lgamma.out
+  *
+  * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
+  * or float/double in non c++11 mode, the user has to provide implementations of lgamma(T) for any scalar
+  * type T to be supported.
+  *
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_lgamma">Math functions</a>, digamma()
+  */
+EIGEN_DEVICE_FUNC
+inline const LgammaReturnType
+lgamma() const
+{
+  return LgammaReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise digamma (psi, derivative of lgamma).
+  *
+  * \specialfunctions_module
+  *
+  * \note This function supports only float and double scalar types. To support other scalar types,
+  * the user has to provide implementations of digamma(T) for any scalar
+  * type T to be supported.
+  *
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_digamma">Math functions</a>, Eigen::digamma(), Eigen::polygamma(), lgamma()
+  */
+EIGEN_DEVICE_FUNC
+inline const DigammaReturnType
+digamma() const
+{
+  return DigammaReturnType(derived());
+}
+
+/** \cpp11 \returns an expression of the coefficient-wise Gauss error
+  * function of *this.
+  *
+  * \specialfunctions_module
+  *
+  * Example: \include Cwise_erf.cpp
+  * Output: \verbinclude Cwise_erf.out
+  *
+  * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
+  * or float/double in non c++11 mode, the user has to provide implementations of erf(T) for any scalar
+  * type T to be supported.
+  *
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_erf">Math functions</a>, erfc()
+  */
+EIGEN_DEVICE_FUNC
+inline const ErfReturnType
+erf() const
+{
+  return ErfReturnType(derived());
+}
+
+/** \cpp11 \returns an expression of the coefficient-wise Complementary error
+  * function of *this.
+  *
+  * \specialfunctions_module
+  *
+  * Example: \include Cwise_erfc.cpp
+  * Output: \verbinclude Cwise_erfc.out
+  *
+  * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
+  * or float/double in non c++11 mode, the user has to provide implementations of erfc(T) for any scalar
+  * type T to be supported.
+  *
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_erfc">Math functions</a>, erf()
+  */
+EIGEN_DEVICE_FUNC
+inline const ErfcReturnType
+erfc() const
+{
+  return ErfcReturnType(derived());
 }
diff --git a/Eigen/src/plugins/BlockMethods.h b/Eigen/src/plugins/BlockMethods.h
index 2788251e0..ac35a0086 100644
--- a/Eigen/src/plugins/BlockMethods.h
+++ b/Eigen/src/plugins/BlockMethods.h
@@ -8,27 +8,32 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-
 #ifndef EIGEN_PARSED_BY_DOXYGEN
 
-/** \internal expression type of a column */
+/// \internal expression type of a column */
 typedef Block<Derived, internal::traits<Derived>::RowsAtCompileTime, 1, !IsRowMajor> ColXpr;
 typedef const Block<const Derived, internal::traits<Derived>::RowsAtCompileTime, 1, !IsRowMajor> ConstColXpr;
-/** \internal expression type of a row */
+/// \internal expression type of a row */
 typedef Block<Derived, 1, internal::traits<Derived>::ColsAtCompileTime, IsRowMajor> RowXpr;
 typedef const Block<const Derived, 1, internal::traits<Derived>::ColsAtCompileTime, IsRowMajor> ConstRowXpr;
-/** \internal expression type of a block of whole columns */
+/// \internal expression type of a block of whole columns */
 typedef Block<Derived, internal::traits<Derived>::RowsAtCompileTime, Dynamic, !IsRowMajor> ColsBlockXpr;
 typedef const Block<const Derived, internal::traits<Derived>::RowsAtCompileTime, Dynamic, !IsRowMajor> ConstColsBlockXpr;
-/** \internal expression type of a block of whole rows */
+/// \internal expression type of a block of whole rows */
 typedef Block<Derived, Dynamic, internal::traits<Derived>::ColsAtCompileTime, IsRowMajor> RowsBlockXpr;
 typedef const Block<const Derived, Dynamic, internal::traits<Derived>::ColsAtCompileTime, IsRowMajor> ConstRowsBlockXpr;
-/** \internal expression type of a block of whole columns */
+/// \internal expression type of a block of whole columns */
 template<int N> struct NColsBlockXpr { typedef Block<Derived, internal::traits<Derived>::RowsAtCompileTime, N, !IsRowMajor> Type; };
 template<int N> struct ConstNColsBlockXpr { typedef const Block<const Derived, internal::traits<Derived>::RowsAtCompileTime, N, !IsRowMajor> Type; };
-/** \internal expression type of a block of whole rows */
+/// \internal expression type of a block of whole rows */
 template<int N> struct NRowsBlockXpr { typedef Block<Derived, N, internal::traits<Derived>::ColsAtCompileTime, IsRowMajor> Type; };
 template<int N> struct ConstNRowsBlockXpr { typedef const Block<const Derived, N, internal::traits<Derived>::ColsAtCompileTime, IsRowMajor> Type; };
+/// \internal expression of a block */
+typedef Block<Derived> BlockXpr;
+typedef const Block<const Derived> ConstBlockXpr;
+/// \internal expression of a block of fixed sizes */
+template<int Rows, int Cols> struct FixedBlockXpr { typedef Block<Derived,Rows,Cols> Type; };
+template<int Rows, int Cols> struct ConstFixedBlockXpr { typedef Block<const Derived,Rows,Cols> Type; };
 
 typedef VectorBlock<Derived> SegmentReturnType;
 typedef const VectorBlock<const Derived> ConstSegmentReturnType;
@@ -37,378 +42,430 @@ template<int Size> struct ConstFixedSegmentReturnType { typedef const VectorBloc
 
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 
-/** \returns a dynamic-size expression of a block in *this.
-  *
-  * \param startRow the first row in the block
-  * \param startCol the first column in the block
-  * \param blockRows the number of rows in the block
-  * \param blockCols the number of columns in the block
-  *
-  * Example: \include MatrixBase_block_int_int_int_int.cpp
-  * Output: \verbinclude MatrixBase_block_int_int_int_int.out
-  *
-  * \note Even though the returned expression has dynamic size, in the case
-  * when it is applied to a fixed-size matrix, it inherits a fixed maximal size,
-  * which means that evaluating it does not cause a dynamic memory allocation.
-  *
-  * \sa class Block, block(Index,Index)
-  */
-inline Block<Derived> block(Index startRow, Index startCol, Index blockRows, Index blockCols)
-{
-  return Block<Derived>(derived(), startRow, startCol, blockRows, blockCols);
-}
-
-/** This is the const version of block(Index,Index,Index,Index). */
-inline const Block<const Derived> block(Index startRow, Index startCol, Index blockRows, Index blockCols) const
-{
-  return Block<const Derived>(derived(), startRow, startCol, blockRows, blockCols);
-}
-
-
-
-
-/** \returns a dynamic-size expression of a top-right corner of *this.
-  *
-  * \param cRows the number of rows in the corner
-  * \param cCols the number of columns in the corner
-  *
-  * Example: \include MatrixBase_topRightCorner_int_int.cpp
-  * Output: \verbinclude MatrixBase_topRightCorner_int_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
-inline Block<Derived> topRightCorner(Index cRows, Index cCols)
-{
-  return Block<Derived>(derived(), 0, cols() - cCols, cRows, cCols);
-}
-
-/** This is the const version of topRightCorner(Index, Index).*/
-inline const Block<const Derived> topRightCorner(Index cRows, Index cCols) const
-{
-  return Block<const Derived>(derived(), 0, cols() - cCols, cRows, cCols);
-}
-
-/** \returns an expression of a fixed-size top-right corner of *this.
-  *
-  * \tparam CRows the number of rows in the corner
-  * \tparam CCols the number of columns in the corner
-  *
-  * Example: \include MatrixBase_template_int_int_topRightCorner.cpp
-  * Output: \verbinclude MatrixBase_template_int_int_topRightCorner.out
-  *
-  * \sa class Block, block<int,int>(Index,Index)
-  */
+/// \returns a dynamic-size expression of a block in *this.
+///
+/// \param startRow the first row in the block
+/// \param startCol the first column in the block
+/// \param blockRows the number of rows in the block
+/// \param blockCols the number of columns in the block
+///
+/// Example: \include MatrixBase_block_int_int_int_int.cpp
+/// Output: \verbinclude MatrixBase_block_int_int_int_int.out
+///
+/// \note Even though the returned expression has dynamic size, in the case
+/// when it is applied to a fixed-size matrix, it inherits a fixed maximal size,
+/// which means that evaluating it does not cause a dynamic memory allocation.
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block, block(Index,Index)
+///
+EIGEN_DEVICE_FUNC
+inline BlockXpr block(Index startRow, Index startCol, Index blockRows, Index blockCols)
+{
+  return BlockXpr(derived(), startRow, startCol, blockRows, blockCols);
+}
+
+/// This is the const version of block(Index,Index,Index,Index). */
+EIGEN_DEVICE_FUNC
+inline const ConstBlockXpr block(Index startRow, Index startCol, Index blockRows, Index blockCols) const
+{
+  return ConstBlockXpr(derived(), startRow, startCol, blockRows, blockCols);
+}
+
+
+
+
+/// \returns a dynamic-size expression of a top-right corner of *this.
+///
+/// \param cRows the number of rows in the corner
+/// \param cCols the number of columns in the corner
+///
+/// Example: \include MatrixBase_topRightCorner_int_int.cpp
+/// Output: \verbinclude MatrixBase_topRightCorner_int_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
+EIGEN_DEVICE_FUNC
+inline BlockXpr topRightCorner(Index cRows, Index cCols)
+{
+  return BlockXpr(derived(), 0, cols() - cCols, cRows, cCols);
+}
+
+/// This is the const version of topRightCorner(Index, Index).
+EIGEN_DEVICE_FUNC
+inline const ConstBlockXpr topRightCorner(Index cRows, Index cCols) const
+{
+  return ConstBlockXpr(derived(), 0, cols() - cCols, cRows, cCols);
+}
+
+/// \returns an expression of a fixed-size top-right corner of *this.
+///
+/// \tparam CRows the number of rows in the corner
+/// \tparam CCols the number of columns in the corner
+///
+/// Example: \include MatrixBase_template_int_int_topRightCorner.cpp
+/// Output: \verbinclude MatrixBase_template_int_int_topRightCorner.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block, block<int,int>(Index,Index)
+///
 template<int CRows, int CCols>
-inline Block<Derived, CRows, CCols> topRightCorner()
+EIGEN_DEVICE_FUNC
+inline typename FixedBlockXpr<CRows,CCols>::Type topRightCorner()
 {
-  return Block<Derived, CRows, CCols>(derived(), 0, cols() - CCols);
+  return typename FixedBlockXpr<CRows,CCols>::Type(derived(), 0, cols() - CCols);
 }
 
-/** This is the const version of topRightCorner<int, int>().*/
+/// This is the const version of topRightCorner<int, int>().
 template<int CRows, int CCols>
-inline const Block<const Derived, CRows, CCols> topRightCorner() const
-{
-  return Block<const Derived, CRows, CCols>(derived(), 0, cols() - CCols);
-}
-
-/** \returns an expression of a top-right corner of *this.
-  *
-  * \tparam CRows number of rows in corner as specified at compile-time
-  * \tparam CCols number of columns in corner as specified at compile-time
-  * \param  cRows number of rows in corner as specified at run-time
-  * \param  cCols number of columns in corner as specified at run-time
-  *
-  * This function is mainly useful for corners where the number of rows is specified at compile-time
-  * and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
-  * information should not contradict. In other words, \a cRows should equal \a CRows unless
-  * \a CRows is \a Dynamic, and the same for the number of columns.
-  *
-  * Example: \include MatrixBase_template_int_int_topRightCorner_int_int.cpp
-  * Output: \verbinclude MatrixBase_template_int_int_topRightCorner_int_int.out
-  *
-  * \sa class Block
-  */
+EIGEN_DEVICE_FUNC
+inline const typename ConstFixedBlockXpr<CRows,CCols>::Type topRightCorner() const
+{
+  return typename ConstFixedBlockXpr<CRows,CCols>::Type(derived(), 0, cols() - CCols);
+}
+
+/// \returns an expression of a top-right corner of *this.
+///
+/// \tparam CRows number of rows in corner as specified at compile-time
+/// \tparam CCols number of columns in corner as specified at compile-time
+/// \param  cRows number of rows in corner as specified at run-time
+/// \param  cCols number of columns in corner as specified at run-time
+///
+/// This function is mainly useful for corners where the number of rows is specified at compile-time
+/// and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
+/// information should not contradict. In other words, \a cRows should equal \a CRows unless
+/// \a CRows is \a Dynamic, and the same for the number of columns.
+///
+/// Example: \include MatrixBase_template_int_int_topRightCorner_int_int.cpp
+/// Output: \verbinclude MatrixBase_template_int_int_topRightCorner_int_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block
+///
 template<int CRows, int CCols>
-inline Block<Derived, CRows, CCols> topRightCorner(Index cRows, Index cCols)
+inline typename FixedBlockXpr<CRows,CCols>::Type topRightCorner(Index cRows, Index cCols)
 {
-  return Block<Derived, CRows, CCols>(derived(), 0, cols() - cCols, cRows, cCols);
+  return typename FixedBlockXpr<CRows,CCols>::Type(derived(), 0, cols() - cCols, cRows, cCols);
 }
 
-/** This is the const version of topRightCorner<int, int>(Index, Index).*/
+/// This is the const version of topRightCorner<int, int>(Index, Index).
 template<int CRows, int CCols>
-inline const Block<const Derived, CRows, CCols> topRightCorner(Index cRows, Index cCols) const
+inline const typename ConstFixedBlockXpr<CRows,CCols>::Type topRightCorner(Index cRows, Index cCols) const
 {
-  return Block<const Derived, CRows, CCols>(derived(), 0, cols() - cCols, cRows, cCols);
+  return typename ConstFixedBlockXpr<CRows,CCols>::Type(derived(), 0, cols() - cCols, cRows, cCols);
 }
 
 
 
-/** \returns a dynamic-size expression of a top-left corner of *this.
-  *
-  * \param cRows the number of rows in the corner
-  * \param cCols the number of columns in the corner
-  *
-  * Example: \include MatrixBase_topLeftCorner_int_int.cpp
-  * Output: \verbinclude MatrixBase_topLeftCorner_int_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
-inline Block<Derived> topLeftCorner(Index cRows, Index cCols)
+/// \returns a dynamic-size expression of a top-left corner of *this.
+///
+/// \param cRows the number of rows in the corner
+/// \param cCols the number of columns in the corner
+///
+/// Example: \include MatrixBase_topLeftCorner_int_int.cpp
+/// Output: \verbinclude MatrixBase_topLeftCorner_int_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
+EIGEN_DEVICE_FUNC
+inline BlockXpr topLeftCorner(Index cRows, Index cCols)
 {
-  return Block<Derived>(derived(), 0, 0, cRows, cCols);
+  return BlockXpr(derived(), 0, 0, cRows, cCols);
 }
 
-/** This is the const version of topLeftCorner(Index, Index).*/
-inline const Block<const Derived> topLeftCorner(Index cRows, Index cCols) const
+/// This is the const version of topLeftCorner(Index, Index).
+EIGEN_DEVICE_FUNC
+inline const ConstBlockXpr topLeftCorner(Index cRows, Index cCols) const
 {
-  return Block<const Derived>(derived(), 0, 0, cRows, cCols);
+  return ConstBlockXpr(derived(), 0, 0, cRows, cCols);
 }
 
-/** \returns an expression of a fixed-size top-left corner of *this.
-  *
-  * The template parameters CRows and CCols are the number of rows and columns in the corner.
-  *
-  * Example: \include MatrixBase_template_int_int_topLeftCorner.cpp
-  * Output: \verbinclude MatrixBase_template_int_int_topLeftCorner.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns an expression of a fixed-size top-left corner of *this.
+///
+/// The template parameters CRows and CCols are the number of rows and columns in the corner.
+///
+/// Example: \include MatrixBase_template_int_int_topLeftCorner.cpp
+/// Output: \verbinclude MatrixBase_template_int_int_topLeftCorner.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 template<int CRows, int CCols>
-inline Block<Derived, CRows, CCols> topLeftCorner()
+EIGEN_DEVICE_FUNC
+inline typename FixedBlockXpr<CRows,CCols>::Type topLeftCorner()
 {
-  return Block<Derived, CRows, CCols>(derived(), 0, 0);
+  return typename FixedBlockXpr<CRows,CCols>::Type(derived(), 0, 0);
 }
 
-/** This is the const version of topLeftCorner<int, int>().*/
+/// This is the const version of topLeftCorner<int, int>().
 template<int CRows, int CCols>
-inline const Block<const Derived, CRows, CCols> topLeftCorner() const
-{
-  return Block<const Derived, CRows, CCols>(derived(), 0, 0);
-}
-
-/** \returns an expression of a top-left corner of *this.
-  *
-  * \tparam CRows number of rows in corner as specified at compile-time
-  * \tparam CCols number of columns in corner as specified at compile-time
-  * \param  cRows number of rows in corner as specified at run-time
-  * \param  cCols number of columns in corner as specified at run-time
-  *
-  * This function is mainly useful for corners where the number of rows is specified at compile-time
-  * and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
-  * information should not contradict. In other words, \a cRows should equal \a CRows unless
-  * \a CRows is \a Dynamic, and the same for the number of columns.
-  *
-  * Example: \include MatrixBase_template_int_int_topLeftCorner_int_int.cpp
-  * Output: \verbinclude MatrixBase_template_int_int_topLeftCorner_int_int.out
-  *
-  * \sa class Block
-  */
+EIGEN_DEVICE_FUNC
+inline const typename ConstFixedBlockXpr<CRows,CCols>::Type topLeftCorner() const
+{
+  return typename ConstFixedBlockXpr<CRows,CCols>::Type(derived(), 0, 0);
+}
+
+/// \returns an expression of a top-left corner of *this.
+///
+/// \tparam CRows number of rows in corner as specified at compile-time
+/// \tparam CCols number of columns in corner as specified at compile-time
+/// \param  cRows number of rows in corner as specified at run-time
+/// \param  cCols number of columns in corner as specified at run-time
+///
+/// This function is mainly useful for corners where the number of rows is specified at compile-time
+/// and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
+/// information should not contradict. In other words, \a cRows should equal \a CRows unless
+/// \a CRows is \a Dynamic, and the same for the number of columns.
+///
+/// Example: \include MatrixBase_template_int_int_topLeftCorner_int_int.cpp
+/// Output: \verbinclude MatrixBase_template_int_int_topLeftCorner_int_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block
+///
 template<int CRows, int CCols>
-inline Block<Derived, CRows, CCols> topLeftCorner(Index cRows, Index cCols)
+inline typename FixedBlockXpr<CRows,CCols>::Type topLeftCorner(Index cRows, Index cCols)
 {
-  return Block<Derived, CRows, CCols>(derived(), 0, 0, cRows, cCols);
+  return typename FixedBlockXpr<CRows,CCols>::Type(derived(), 0, 0, cRows, cCols);
 }
 
-/** This is the const version of topLeftCorner<int, int>(Index, Index).*/
+/// This is the const version of topLeftCorner<int, int>(Index, Index).
 template<int CRows, int CCols>
-inline const Block<const Derived, CRows, CCols> topLeftCorner(Index cRows, Index cCols) const
+inline const typename ConstFixedBlockXpr<CRows,CCols>::Type topLeftCorner(Index cRows, Index cCols) const
 {
-  return Block<const Derived, CRows, CCols>(derived(), 0, 0, cRows, cCols);
+  return typename ConstFixedBlockXpr<CRows,CCols>::Type(derived(), 0, 0, cRows, cCols);
 }
 
 
 
-/** \returns a dynamic-size expression of a bottom-right corner of *this.
-  *
-  * \param cRows the number of rows in the corner
-  * \param cCols the number of columns in the corner
-  *
-  * Example: \include MatrixBase_bottomRightCorner_int_int.cpp
-  * Output: \verbinclude MatrixBase_bottomRightCorner_int_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
-inline Block<Derived> bottomRightCorner(Index cRows, Index cCols)
+/// \returns a dynamic-size expression of a bottom-right corner of *this.
+///
+/// \param cRows the number of rows in the corner
+/// \param cCols the number of columns in the corner
+///
+/// Example: \include MatrixBase_bottomRightCorner_int_int.cpp
+/// Output: \verbinclude MatrixBase_bottomRightCorner_int_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
+EIGEN_DEVICE_FUNC
+inline BlockXpr bottomRightCorner(Index cRows, Index cCols)
 {
-  return Block<Derived>(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
+  return BlockXpr(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
 }
 
-/** This is the const version of bottomRightCorner(Index, Index).*/
-inline const Block<const Derived> bottomRightCorner(Index cRows, Index cCols) const
+/// This is the const version of bottomRightCorner(Index, Index).
+EIGEN_DEVICE_FUNC
+inline const ConstBlockXpr bottomRightCorner(Index cRows, Index cCols) const
 {
-  return Block<const Derived>(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
+  return ConstBlockXpr(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
 }
 
-/** \returns an expression of a fixed-size bottom-right corner of *this.
-  *
-  * The template parameters CRows and CCols are the number of rows and columns in the corner.
-  *
-  * Example: \include MatrixBase_template_int_int_bottomRightCorner.cpp
-  * Output: \verbinclude MatrixBase_template_int_int_bottomRightCorner.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns an expression of a fixed-size bottom-right corner of *this.
+///
+/// The template parameters CRows and CCols are the number of rows and columns in the corner.
+///
+/// Example: \include MatrixBase_template_int_int_bottomRightCorner.cpp
+/// Output: \verbinclude MatrixBase_template_int_int_bottomRightCorner.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 template<int CRows, int CCols>
-inline Block<Derived, CRows, CCols> bottomRightCorner()
+EIGEN_DEVICE_FUNC
+inline typename FixedBlockXpr<CRows,CCols>::Type bottomRightCorner()
 {
-  return Block<Derived, CRows, CCols>(derived(), rows() - CRows, cols() - CCols);
+  return typename FixedBlockXpr<CRows,CCols>::Type(derived(), rows() - CRows, cols() - CCols);
 }
 
-/** This is the const version of bottomRightCorner<int, int>().*/
+/// This is the const version of bottomRightCorner<int, int>().
 template<int CRows, int CCols>
-inline const Block<const Derived, CRows, CCols> bottomRightCorner() const
-{
-  return Block<const Derived, CRows, CCols>(derived(), rows() - CRows, cols() - CCols);
-}
-
-/** \returns an expression of a bottom-right corner of *this.
-  *
-  * \tparam CRows number of rows in corner as specified at compile-time
-  * \tparam CCols number of columns in corner as specified at compile-time
-  * \param  cRows number of rows in corner as specified at run-time
-  * \param  cCols number of columns in corner as specified at run-time
-  *
-  * This function is mainly useful for corners where the number of rows is specified at compile-time
-  * and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
-  * information should not contradict. In other words, \a cRows should equal \a CRows unless
-  * \a CRows is \a Dynamic, and the same for the number of columns.
-  *
-  * Example: \include MatrixBase_template_int_int_bottomRightCorner_int_int.cpp
-  * Output: \verbinclude MatrixBase_template_int_int_bottomRightCorner_int_int.out
-  *
-  * \sa class Block
-  */
+EIGEN_DEVICE_FUNC
+inline const typename ConstFixedBlockXpr<CRows,CCols>::Type bottomRightCorner() const
+{
+  return typename ConstFixedBlockXpr<CRows,CCols>::Type(derived(), rows() - CRows, cols() - CCols);
+}
+
+/// \returns an expression of a bottom-right corner of *this.
+///
+/// \tparam CRows number of rows in corner as specified at compile-time
+/// \tparam CCols number of columns in corner as specified at compile-time
+/// \param  cRows number of rows in corner as specified at run-time
+/// \param  cCols number of columns in corner as specified at run-time
+///
+/// This function is mainly useful for corners where the number of rows is specified at compile-time
+/// and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
+/// information should not contradict. In other words, \a cRows should equal \a CRows unless
+/// \a CRows is \a Dynamic, and the same for the number of columns.
+///
+/// Example: \include MatrixBase_template_int_int_bottomRightCorner_int_int.cpp
+/// Output: \verbinclude MatrixBase_template_int_int_bottomRightCorner_int_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block
+///
 template<int CRows, int CCols>
-inline Block<Derived, CRows, CCols> bottomRightCorner(Index cRows, Index cCols)
+inline typename FixedBlockXpr<CRows,CCols>::Type bottomRightCorner(Index cRows, Index cCols)
 {
-  return Block<Derived, CRows, CCols>(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
+  return typename FixedBlockXpr<CRows,CCols>::Type(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
 }
 
-/** This is the const version of bottomRightCorner<int, int>(Index, Index).*/
+/// This is the const version of bottomRightCorner<int, int>(Index, Index).
 template<int CRows, int CCols>
-inline const Block<const Derived, CRows, CCols> bottomRightCorner(Index cRows, Index cCols) const
+inline const typename ConstFixedBlockXpr<CRows,CCols>::Type bottomRightCorner(Index cRows, Index cCols) const
 {
-  return Block<const Derived, CRows, CCols>(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
+  return typename ConstFixedBlockXpr<CRows,CCols>::Type(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
 }
 
 
 
-/** \returns a dynamic-size expression of a bottom-left corner of *this.
-  *
-  * \param cRows the number of rows in the corner
-  * \param cCols the number of columns in the corner
-  *
-  * Example: \include MatrixBase_bottomLeftCorner_int_int.cpp
-  * Output: \verbinclude MatrixBase_bottomLeftCorner_int_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
-inline Block<Derived> bottomLeftCorner(Index cRows, Index cCols)
+/// \returns a dynamic-size expression of a bottom-left corner of *this.
+///
+/// \param cRows the number of rows in the corner
+/// \param cCols the number of columns in the corner
+///
+/// Example: \include MatrixBase_bottomLeftCorner_int_int.cpp
+/// Output: \verbinclude MatrixBase_bottomLeftCorner_int_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
+EIGEN_DEVICE_FUNC
+inline BlockXpr bottomLeftCorner(Index cRows, Index cCols)
 {
-  return Block<Derived>(derived(), rows() - cRows, 0, cRows, cCols);
+  return BlockXpr(derived(), rows() - cRows, 0, cRows, cCols);
 }
 
-/** This is the const version of bottomLeftCorner(Index, Index).*/
-inline const Block<const Derived> bottomLeftCorner(Index cRows, Index cCols) const
+/// This is the const version of bottomLeftCorner(Index, Index).
+EIGEN_DEVICE_FUNC
+inline const ConstBlockXpr bottomLeftCorner(Index cRows, Index cCols) const
 {
-  return Block<const Derived>(derived(), rows() - cRows, 0, cRows, cCols);
+  return ConstBlockXpr(derived(), rows() - cRows, 0, cRows, cCols);
 }
 
-/** \returns an expression of a fixed-size bottom-left corner of *this.
-  *
-  * The template parameters CRows and CCols are the number of rows and columns in the corner.
-  *
-  * Example: \include MatrixBase_template_int_int_bottomLeftCorner.cpp
-  * Output: \verbinclude MatrixBase_template_int_int_bottomLeftCorner.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns an expression of a fixed-size bottom-left corner of *this.
+///
+/// The template parameters CRows and CCols are the number of rows and columns in the corner.
+///
+/// Example: \include MatrixBase_template_int_int_bottomLeftCorner.cpp
+/// Output: \verbinclude MatrixBase_template_int_int_bottomLeftCorner.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 template<int CRows, int CCols>
-inline Block<Derived, CRows, CCols> bottomLeftCorner()
+EIGEN_DEVICE_FUNC
+inline typename FixedBlockXpr<CRows,CCols>::Type bottomLeftCorner()
 {
-  return Block<Derived, CRows, CCols>(derived(), rows() - CRows, 0);
+  return typename FixedBlockXpr<CRows,CCols>::Type(derived(), rows() - CRows, 0);
 }
 
-/** This is the const version of bottomLeftCorner<int, int>().*/
+/// This is the const version of bottomLeftCorner<int, int>().
 template<int CRows, int CCols>
-inline const Block<const Derived, CRows, CCols> bottomLeftCorner() const
-{
-  return Block<const Derived, CRows, CCols>(derived(), rows() - CRows, 0);
-}
-
-/** \returns an expression of a bottom-left corner of *this.
-  *
-  * \tparam CRows number of rows in corner as specified at compile-time
-  * \tparam CCols number of columns in corner as specified at compile-time
-  * \param  cRows number of rows in corner as specified at run-time
-  * \param  cCols number of columns in corner as specified at run-time
-  *
-  * This function is mainly useful for corners where the number of rows is specified at compile-time
-  * and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
-  * information should not contradict. In other words, \a cRows should equal \a CRows unless
-  * \a CRows is \a Dynamic, and the same for the number of columns.
-  *
-  * Example: \include MatrixBase_template_int_int_bottomLeftCorner_int_int.cpp
-  * Output: \verbinclude MatrixBase_template_int_int_bottomLeftCorner_int_int.out
-  *
-  * \sa class Block
-  */
+EIGEN_DEVICE_FUNC
+inline const typename ConstFixedBlockXpr<CRows,CCols>::Type bottomLeftCorner() const
+{
+  return typename ConstFixedBlockXpr<CRows,CCols>::Type(derived(), rows() - CRows, 0);
+}
+
+/// \returns an expression of a bottom-left corner of *this.
+///
+/// \tparam CRows number of rows in corner as specified at compile-time
+/// \tparam CCols number of columns in corner as specified at compile-time
+/// \param  cRows number of rows in corner as specified at run-time
+/// \param  cCols number of columns in corner as specified at run-time
+///
+/// This function is mainly useful for corners where the number of rows is specified at compile-time
+/// and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
+/// information should not contradict. In other words, \a cRows should equal \a CRows unless
+/// \a CRows is \a Dynamic, and the same for the number of columns.
+///
+/// Example: \include MatrixBase_template_int_int_bottomLeftCorner_int_int.cpp
+/// Output: \verbinclude MatrixBase_template_int_int_bottomLeftCorner_int_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block
+///
 template<int CRows, int CCols>
-inline Block<Derived, CRows, CCols> bottomLeftCorner(Index cRows, Index cCols)
+inline typename FixedBlockXpr<CRows,CCols>::Type bottomLeftCorner(Index cRows, Index cCols)
 {
-  return Block<Derived, CRows, CCols>(derived(), rows() - cRows, 0, cRows, cCols);
+  return typename FixedBlockXpr<CRows,CCols>::Type(derived(), rows() - cRows, 0, cRows, cCols);
 }
 
-/** This is the const version of bottomLeftCorner<int, int>(Index, Index).*/
+/// This is the const version of bottomLeftCorner<int, int>(Index, Index).
 template<int CRows, int CCols>
-inline const Block<const Derived, CRows, CCols> bottomLeftCorner(Index cRows, Index cCols) const
+inline const typename ConstFixedBlockXpr<CRows,CCols>::Type bottomLeftCorner(Index cRows, Index cCols) const
 {
-  return Block<const Derived, CRows, CCols>(derived(), rows() - cRows, 0, cRows, cCols);
+  return typename ConstFixedBlockXpr<CRows,CCols>::Type(derived(), rows() - cRows, 0, cRows, cCols);
 }
 
 
 
-/** \returns a block consisting of the top rows of *this.
-  *
-  * \param n the number of rows in the block
-  *
-  * Example: \include MatrixBase_topRows_int.cpp
-  * Output: \verbinclude MatrixBase_topRows_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a block consisting of the top rows of *this.
+///
+/// \param n the number of rows in the block
+///
+/// Example: \include MatrixBase_topRows_int.cpp
+/// Output: \verbinclude MatrixBase_topRows_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major)
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
+EIGEN_DEVICE_FUNC
 inline RowsBlockXpr topRows(Index n)
 {
   return RowsBlockXpr(derived(), 0, 0, n, cols());
 }
 
-/** This is the const version of topRows(Index).*/
+/// This is the const version of topRows(Index).
+EIGEN_DEVICE_FUNC
 inline ConstRowsBlockXpr topRows(Index n) const
 {
   return ConstRowsBlockXpr(derived(), 0, 0, n, cols());
 }
 
-/** \returns a block consisting of the top rows of *this.
-  *
-  * \tparam N the number of rows in the block as specified at compile-time
-  * \param n the number of rows in the block as specified at run-time
-  *
-  * The compile-time and run-time information should not contradict. In other words,
-  * \a n should equal \a N unless \a N is \a Dynamic.
-  *
-  * Example: \include MatrixBase_template_int_topRows.cpp
-  * Output: \verbinclude MatrixBase_template_int_topRows.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a block consisting of the top rows of *this.
+///
+/// \tparam N the number of rows in the block as specified at compile-time
+/// \param n the number of rows in the block as specified at run-time
+///
+/// The compile-time and run-time information should not contradict. In other words,
+/// \a n should equal \a N unless \a N is \a Dynamic.
+///
+/// Example: \include MatrixBase_template_int_topRows.cpp
+/// Output: \verbinclude MatrixBase_template_int_topRows.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major)
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 template<int N>
+EIGEN_DEVICE_FUNC
 inline typename NRowsBlockXpr<N>::Type topRows(Index n = N)
 {
   return typename NRowsBlockXpr<N>::Type(derived(), 0, 0, n, cols());
 }
 
-/** This is the const version of topRows<int>().*/
+/// This is the const version of topRows<int>().
 template<int N>
+EIGEN_DEVICE_FUNC
 inline typename ConstNRowsBlockXpr<N>::Type topRows(Index n = N) const
 {
   return typename ConstNRowsBlockXpr<N>::Type(derived(), 0, 0, n, cols());
@@ -416,47 +473,55 @@ inline typename ConstNRowsBlockXpr<N>::Type topRows(Index n = N) const
 
 
 
-/** \returns a block consisting of the bottom rows of *this.
-  *
-  * \param n the number of rows in the block
-  *
-  * Example: \include MatrixBase_bottomRows_int.cpp
-  * Output: \verbinclude MatrixBase_bottomRows_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a block consisting of the bottom rows of *this.
+///
+/// \param n the number of rows in the block
+///
+/// Example: \include MatrixBase_bottomRows_int.cpp
+/// Output: \verbinclude MatrixBase_bottomRows_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major)
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
+EIGEN_DEVICE_FUNC
 inline RowsBlockXpr bottomRows(Index n)
 {
   return RowsBlockXpr(derived(), rows() - n, 0, n, cols());
 }
 
-/** This is the const version of bottomRows(Index).*/
+/// This is the const version of bottomRows(Index).
+EIGEN_DEVICE_FUNC
 inline ConstRowsBlockXpr bottomRows(Index n) const
 {
   return ConstRowsBlockXpr(derived(), rows() - n, 0, n, cols());
 }
 
-/** \returns a block consisting of the bottom rows of *this.
-  *
-  * \tparam N the number of rows in the block as specified at compile-time
-  * \param n the number of rows in the block as specified at run-time
-  *
-  * The compile-time and run-time information should not contradict. In other words,
-  * \a n should equal \a N unless \a N is \a Dynamic.
-  *
-  * Example: \include MatrixBase_template_int_bottomRows.cpp
-  * Output: \verbinclude MatrixBase_template_int_bottomRows.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a block consisting of the bottom rows of *this.
+///
+/// \tparam N the number of rows in the block as specified at compile-time
+/// \param n the number of rows in the block as specified at run-time
+///
+/// The compile-time and run-time information should not contradict. In other words,
+/// \a n should equal \a N unless \a N is \a Dynamic.
+///
+/// Example: \include MatrixBase_template_int_bottomRows.cpp
+/// Output: \verbinclude MatrixBase_template_int_bottomRows.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major)
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 template<int N>
+EIGEN_DEVICE_FUNC
 inline typename NRowsBlockXpr<N>::Type bottomRows(Index n = N)
 {
   return typename NRowsBlockXpr<N>::Type(derived(), rows() - n, 0, n, cols());
 }
 
-/** This is the const version of bottomRows<int>().*/
+/// This is the const version of bottomRows<int>().
 template<int N>
+EIGEN_DEVICE_FUNC
 inline typename ConstNRowsBlockXpr<N>::Type bottomRows(Index n = N) const
 {
   return typename ConstNRowsBlockXpr<N>::Type(derived(), rows() - n, 0, n, cols());
@@ -464,49 +529,57 @@ inline typename ConstNRowsBlockXpr<N>::Type bottomRows(Index n = N) const
 
 
 
-/** \returns a block consisting of a range of rows of *this.
-  *
-  * \param startRow the index of the first row in the block
-  * \param n the number of rows in the block
-  *
-  * Example: \include DenseBase_middleRows_int.cpp
-  * Output: \verbinclude DenseBase_middleRows_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a block consisting of a range of rows of *this.
+///
+/// \param startRow the index of the first row in the block
+/// \param n the number of rows in the block
+///
+/// Example: \include DenseBase_middleRows_int.cpp
+/// Output: \verbinclude DenseBase_middleRows_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major)
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
+EIGEN_DEVICE_FUNC
 inline RowsBlockXpr middleRows(Index startRow, Index n)
 {
   return RowsBlockXpr(derived(), startRow, 0, n, cols());
 }
 
-/** This is the const version of middleRows(Index,Index).*/
+/// This is the const version of middleRows(Index,Index).
+EIGEN_DEVICE_FUNC
 inline ConstRowsBlockXpr middleRows(Index startRow, Index n) const
 {
   return ConstRowsBlockXpr(derived(), startRow, 0, n, cols());
 }
 
-/** \returns a block consisting of a range of rows of *this.
-  *
-  * \tparam N the number of rows in the block as specified at compile-time
-  * \param startRow the index of the first row in the block
-  * \param n the number of rows in the block as specified at run-time
-  *
-  * The compile-time and run-time information should not contradict. In other words,
-  * \a n should equal \a N unless \a N is \a Dynamic.
-  *
-  * Example: \include DenseBase_template_int_middleRows.cpp
-  * Output: \verbinclude DenseBase_template_int_middleRows.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a block consisting of a range of rows of *this.
+///
+/// \tparam N the number of rows in the block as specified at compile-time
+/// \param startRow the index of the first row in the block
+/// \param n the number of rows in the block as specified at run-time
+///
+/// The compile-time and run-time information should not contradict. In other words,
+/// \a n should equal \a N unless \a N is \a Dynamic.
+///
+/// Example: \include DenseBase_template_int_middleRows.cpp
+/// Output: \verbinclude DenseBase_template_int_middleRows.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major)
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 template<int N>
+EIGEN_DEVICE_FUNC
 inline typename NRowsBlockXpr<N>::Type middleRows(Index startRow, Index n = N)
 {
   return typename NRowsBlockXpr<N>::Type(derived(), startRow, 0, n, cols());
 }
 
-/** This is the const version of middleRows<int>().*/
+/// This is the const version of middleRows<int>().
 template<int N>
+EIGEN_DEVICE_FUNC
 inline typename ConstNRowsBlockXpr<N>::Type middleRows(Index startRow, Index n = N) const
 {
   return typename ConstNRowsBlockXpr<N>::Type(derived(), startRow, 0, n, cols());
@@ -514,47 +587,55 @@ inline typename ConstNRowsBlockXpr<N>::Type middleRows(Index startRow, Index n =
 
 
 
-/** \returns a block consisting of the left columns of *this.
-  *
-  * \param n the number of columns in the block
-  *
-  * Example: \include MatrixBase_leftCols_int.cpp
-  * Output: \verbinclude MatrixBase_leftCols_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a block consisting of the left columns of *this.
+///
+/// \param n the number of columns in the block
+///
+/// Example: \include MatrixBase_leftCols_int.cpp
+/// Output: \verbinclude MatrixBase_leftCols_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major)
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
+EIGEN_DEVICE_FUNC
 inline ColsBlockXpr leftCols(Index n)
 {
   return ColsBlockXpr(derived(), 0, 0, rows(), n);
 }
 
-/** This is the const version of leftCols(Index).*/
+/// This is the const version of leftCols(Index).
+EIGEN_DEVICE_FUNC
 inline ConstColsBlockXpr leftCols(Index n) const
 {
   return ConstColsBlockXpr(derived(), 0, 0, rows(), n);
 }
 
-/** \returns a block consisting of the left columns of *this.
-  *
-  * \tparam N the number of columns in the block as specified at compile-time
-  * \param n the number of columns in the block as specified at run-time
-  *
-  * The compile-time and run-time information should not contradict. In other words,
-  * \a n should equal \a N unless \a N is \a Dynamic.
-  *
-  * Example: \include MatrixBase_template_int_leftCols.cpp
-  * Output: \verbinclude MatrixBase_template_int_leftCols.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a block consisting of the left columns of *this.
+///
+/// \tparam N the number of columns in the block as specified at compile-time
+/// \param n the number of columns in the block as specified at run-time
+///
+/// The compile-time and run-time information should not contradict. In other words,
+/// \a n should equal \a N unless \a N is \a Dynamic.
+///
+/// Example: \include MatrixBase_template_int_leftCols.cpp
+/// Output: \verbinclude MatrixBase_template_int_leftCols.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major)
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 template<int N>
+EIGEN_DEVICE_FUNC
 inline typename NColsBlockXpr<N>::Type leftCols(Index n = N)
 {
   return typename NColsBlockXpr<N>::Type(derived(), 0, 0, rows(), n);
 }
 
-/** This is the const version of leftCols<int>().*/
+/// This is the const version of leftCols<int>().
 template<int N>
+EIGEN_DEVICE_FUNC
 inline typename ConstNColsBlockXpr<N>::Type leftCols(Index n = N) const
 {
   return typename ConstNColsBlockXpr<N>::Type(derived(), 0, 0, rows(), n);
@@ -562,47 +643,55 @@ inline typename ConstNColsBlockXpr<N>::Type leftCols(Index n = N) const
 
 
 
-/** \returns a block consisting of the right columns of *this.
-  *
-  * \param n the number of columns in the block
-  *
-  * Example: \include MatrixBase_rightCols_int.cpp
-  * Output: \verbinclude MatrixBase_rightCols_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a block consisting of the right columns of *this.
+///
+/// \param n the number of columns in the block
+///
+/// Example: \include MatrixBase_rightCols_int.cpp
+/// Output: \verbinclude MatrixBase_rightCols_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major)
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
+EIGEN_DEVICE_FUNC
 inline ColsBlockXpr rightCols(Index n)
 {
   return ColsBlockXpr(derived(), 0, cols() - n, rows(), n);
 }
 
-/** This is the const version of rightCols(Index).*/
+/// This is the const version of rightCols(Index).
+EIGEN_DEVICE_FUNC
 inline ConstColsBlockXpr rightCols(Index n) const
 {
   return ConstColsBlockXpr(derived(), 0, cols() - n, rows(), n);
 }
 
-/** \returns a block consisting of the right columns of *this.
-  *
-  * \tparam N the number of columns in the block as specified at compile-time
-  * \param n the number of columns in the block as specified at run-time
-  *
-  * The compile-time and run-time information should not contradict. In other words,
-  * \a n should equal \a N unless \a N is \a Dynamic.
-  *
-  * Example: \include MatrixBase_template_int_rightCols.cpp
-  * Output: \verbinclude MatrixBase_template_int_rightCols.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a block consisting of the right columns of *this.
+///
+/// \tparam N the number of columns in the block as specified at compile-time
+/// \param n the number of columns in the block as specified at run-time
+///
+/// The compile-time and run-time information should not contradict. In other words,
+/// \a n should equal \a N unless \a N is \a Dynamic.
+///
+/// Example: \include MatrixBase_template_int_rightCols.cpp
+/// Output: \verbinclude MatrixBase_template_int_rightCols.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major)
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 template<int N>
+EIGEN_DEVICE_FUNC
 inline typename NColsBlockXpr<N>::Type rightCols(Index n = N)
 {
   return typename NColsBlockXpr<N>::Type(derived(), 0, cols() - n, rows(), n);
 }
 
-/** This is the const version of rightCols<int>().*/
+/// This is the const version of rightCols<int>().
 template<int N>
+EIGEN_DEVICE_FUNC
 inline typename ConstNColsBlockXpr<N>::Type rightCols(Index n = N) const
 {
   return typename ConstNColsBlockXpr<N>::Type(derived(), 0, cols() - n, rows(), n);
@@ -610,49 +699,57 @@ inline typename ConstNColsBlockXpr<N>::Type rightCols(Index n = N) const
 
 
 
-/** \returns a block consisting of a range of columns of *this.
-  *
-  * \param startCol the index of the first column in the block
-  * \param numCols the number of columns in the block
-  *
-  * Example: \include DenseBase_middleCols_int.cpp
-  * Output: \verbinclude DenseBase_middleCols_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a block consisting of a range of columns of *this.
+///
+/// \param startCol the index of the first column in the block
+/// \param numCols the number of columns in the block
+///
+/// Example: \include DenseBase_middleCols_int.cpp
+/// Output: \verbinclude DenseBase_middleCols_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major)
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
+EIGEN_DEVICE_FUNC
 inline ColsBlockXpr middleCols(Index startCol, Index numCols)
 {
   return ColsBlockXpr(derived(), 0, startCol, rows(), numCols);
 }
 
-/** This is the const version of middleCols(Index,Index).*/
+/// This is the const version of middleCols(Index,Index).
+EIGEN_DEVICE_FUNC
 inline ConstColsBlockXpr middleCols(Index startCol, Index numCols) const
 {
   return ConstColsBlockXpr(derived(), 0, startCol, rows(), numCols);
 }
 
-/** \returns a block consisting of a range of columns of *this.
-  *
-  * \tparam N the number of columns in the block as specified at compile-time
-  * \param startCol the index of the first column in the block
-  * \param n the number of columns in the block as specified at run-time
-  *
-  * The compile-time and run-time information should not contradict. In other words,
-  * \a n should equal \a N unless \a N is \a Dynamic.
-  *
-  * Example: \include DenseBase_template_int_middleCols.cpp
-  * Output: \verbinclude DenseBase_template_int_middleCols.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a block consisting of a range of columns of *this.
+///
+/// \tparam N the number of columns in the block as specified at compile-time
+/// \param startCol the index of the first column in the block
+/// \param n the number of columns in the block as specified at run-time
+///
+/// The compile-time and run-time information should not contradict. In other words,
+/// \a n should equal \a N unless \a N is \a Dynamic.
+///
+/// Example: \include DenseBase_template_int_middleCols.cpp
+/// Output: \verbinclude DenseBase_template_int_middleCols.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major)
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 template<int N>
+EIGEN_DEVICE_FUNC
 inline typename NColsBlockXpr<N>::Type middleCols(Index startCol, Index n = N)
 {
   return typename NColsBlockXpr<N>::Type(derived(), 0, startCol, rows(), n);
 }
 
-/** This is the const version of middleCols<int>().*/
+/// This is the const version of middleCols<int>().
 template<int N>
+EIGEN_DEVICE_FUNC
 inline typename ConstNColsBlockXpr<N>::Type middleCols(Index startCol, Index n = N) const
 {
   return typename ConstNColsBlockXpr<N>::Type(derived(), 0, startCol, rows(), n);
@@ -660,119 +757,134 @@ inline typename ConstNColsBlockXpr<N>::Type middleCols(Index startCol, Index n =
 
 
 
-/** \returns a fixed-size expression of a block in *this.
-  *
-  * The template parameters \a BlockRows and \a BlockCols are the number of
-  * rows and columns in the block.
-  *
-  * \param startRow the first row in the block
-  * \param startCol the first column in the block
-  *
-  * Example: \include MatrixBase_block_int_int.cpp
-  * Output: \verbinclude MatrixBase_block_int_int.out
-  *
-  * \note since block is a templated member, the keyword template has to be used
-  * if the matrix type is also a template parameter: \code m.template block<3,3>(1,1); \endcode
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
-template<int BlockRows, int BlockCols>
-inline Block<Derived, BlockRows, BlockCols> block(Index startRow, Index startCol)
-{
-  return Block<Derived, BlockRows, BlockCols>(derived(), startRow, startCol);
-}
-
-/** This is the const version of block<>(Index, Index). */
-template<int BlockRows, int BlockCols>
-inline const Block<const Derived, BlockRows, BlockCols> block(Index startRow, Index startCol) const
-{
-  return Block<const Derived, BlockRows, BlockCols>(derived(), startRow, startCol);
-}
-
-/** \returns an expression of a block in *this.
-  *
-  * \tparam BlockRows number of rows in block as specified at compile-time
-  * \tparam BlockCols number of columns in block as specified at compile-time
-  * \param  startRow  the first row in the block
-  * \param  startCol  the first column in the block
-  * \param  blockRows number of rows in block as specified at run-time
-  * \param  blockCols number of columns in block as specified at run-time
-  *
-  * This function is mainly useful for blocks where the number of rows is specified at compile-time
-  * and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
-  * information should not contradict. In other words, \a blockRows should equal \a BlockRows unless
-  * \a BlockRows is \a Dynamic, and the same for the number of columns.
-  *
-  * Example: \include MatrixBase_template_int_int_block_int_int_int_int.cpp
-  * Output: \verbinclude MatrixBase_template_int_int_block_int_int_int_int.cpp
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
-template<int BlockRows, int BlockCols>
-inline Block<Derived, BlockRows, BlockCols> block(Index startRow, Index startCol, 
+/// \returns a fixed-size expression of a block in *this.
+///
+/// The template parameters \a NRows and \a NCols are the number of
+/// rows and columns in the block.
+///
+/// \param startRow the first row in the block
+/// \param startCol the first column in the block
+///
+/// Example: \include MatrixBase_block_int_int.cpp
+/// Output: \verbinclude MatrixBase_block_int_int.out
+///
+/// \note since block is a templated member, the keyword template has to be used
+/// if the matrix type is also a template parameter: \code m.template block<3,3>(1,1); \endcode
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
+template<int NRows, int NCols>
+EIGEN_DEVICE_FUNC
+inline typename FixedBlockXpr<NRows,NCols>::Type block(Index startRow, Index startCol)
+{
+  return typename FixedBlockXpr<NRows,NCols>::Type(derived(), startRow, startCol);
+}
+
+/// This is the const version of block<>(Index, Index). */
+template<int NRows, int NCols>
+EIGEN_DEVICE_FUNC
+inline const typename ConstFixedBlockXpr<NRows,NCols>::Type block(Index startRow, Index startCol) const
+{
+  return typename ConstFixedBlockXpr<NRows,NCols>::Type(derived(), startRow, startCol);
+}
+
+/// \returns an expression of a block in *this.
+///
+/// \tparam NRows number of rows in block as specified at compile-time
+/// \tparam NCols number of columns in block as specified at compile-time
+/// \param  startRow  the first row in the block
+/// \param  startCol  the first column in the block
+/// \param  blockRows number of rows in block as specified at run-time
+/// \param  blockCols number of columns in block as specified at run-time
+///
+/// This function is mainly useful for blocks where the number of rows is specified at compile-time
+/// and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
+/// information should not contradict. In other words, \a blockRows should equal \a NRows unless
+/// \a NRows is \a Dynamic, and the same for the number of columns.
+///
+/// Example: \include MatrixBase_template_int_int_block_int_int_int_int.cpp
+/// Output: \verbinclude MatrixBase_template_int_int_block_int_int_int_int.cpp
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
+template<int NRows, int NCols>
+inline typename FixedBlockXpr<NRows,NCols>::Type block(Index startRow, Index startCol,
                                                   Index blockRows, Index blockCols)
 {
-  return Block<Derived, BlockRows, BlockCols>(derived(), startRow, startCol, blockRows, blockCols);
+  return typename FixedBlockXpr<NRows,NCols>::Type(derived(), startRow, startCol, blockRows, blockCols);
 }
 
-/** This is the const version of block<>(Index, Index, Index, Index). */
-template<int BlockRows, int BlockCols>
-inline const Block<const Derived, BlockRows, BlockCols> block(Index startRow, Index startCol,
+/// This is the const version of block<>(Index, Index, Index, Index).
+template<int NRows, int NCols>
+inline const typename ConstFixedBlockXpr<NRows,NCols>::Type block(Index startRow, Index startCol,
                                                               Index blockRows, Index blockCols) const
 {
-  return Block<const Derived, BlockRows, BlockCols>(derived(), startRow, startCol, blockRows, blockCols);
+  return typename ConstFixedBlockXpr<NRows,NCols>::Type(derived(), startRow, startCol, blockRows, blockCols);
 }
 
-/** \returns an expression of the \a i-th column of *this. Note that the numbering starts at 0.
-  *
-  * Example: \include MatrixBase_col.cpp
-  * Output: \verbinclude MatrixBase_col.out
-  *
+/// \returns an expression of the \a i-th column of *this. Note that the numbering starts at 0.
+///
+/// Example: \include MatrixBase_col.cpp
+/// Output: \verbinclude MatrixBase_col.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major)
+/**
   * \sa row(), class Block */
+EIGEN_DEVICE_FUNC
 inline ColXpr col(Index i)
 {
   return ColXpr(derived(), i);
 }
 
-/** This is the const version of col(). */
+/// This is the const version of col().
+EIGEN_DEVICE_FUNC
 inline ConstColXpr col(Index i) const
 {
   return ConstColXpr(derived(), i);
 }
 
-/** \returns an expression of the \a i-th row of *this. Note that the numbering starts at 0.
-  *
-  * Example: \include MatrixBase_row.cpp
-  * Output: \verbinclude MatrixBase_row.out
-  *
+/// \returns an expression of the \a i-th row of *this. Note that the numbering starts at 0.
+///
+/// Example: \include MatrixBase_row.cpp
+/// Output: \verbinclude MatrixBase_row.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major)
+/**
   * \sa col(), class Block */
+EIGEN_DEVICE_FUNC
 inline RowXpr row(Index i)
 {
   return RowXpr(derived(), i);
 }
 
-/** This is the const version of row(). */
+/// This is the const version of row(). */
+EIGEN_DEVICE_FUNC
 inline ConstRowXpr row(Index i) const
 {
   return ConstRowXpr(derived(), i);
 }
 
-/** \returns a dynamic-size expression of a segment (i.e. a vector block) in *this.
-  *
-  * \only_for_vectors
-  *
-  * \param start the first coefficient in the segment
-  * \param n the number of coefficients in the segment
-  *
-  * Example: \include MatrixBase_segment_int_int.cpp
-  * Output: \verbinclude MatrixBase_segment_int_int.out
-  *
-  * \note Even though the returned expression has dynamic size, in the case
-  * when it is applied to a fixed-size vector, it inherits a fixed maximal size,
-  * which means that evaluating it does not cause a dynamic memory allocation.
-  *
-  * \sa class Block, segment(Index)
-  */
+/// \returns a dynamic-size expression of a segment (i.e. a vector block) in *this.
+///
+/// \only_for_vectors
+///
+/// \param start the first coefficient in the segment
+/// \param n the number of coefficients in the segment
+///
+/// Example: \include MatrixBase_segment_int_int.cpp
+/// Output: \verbinclude MatrixBase_segment_int_int.out
+///
+/// \note Even though the returned expression has dynamic size, in the case
+/// when it is applied to a fixed-size vector, it inherits a fixed maximal size,
+/// which means that evaluating it does not cause a dynamic memory allocation.
+///
+/// \sa class Block, segment(Index)
+///
+EIGEN_DEVICE_FUNC
 inline SegmentReturnType segment(Index start, Index n)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
@@ -780,154 +892,165 @@ inline SegmentReturnType segment(Index start, Index n)
 }
 
 
-/** This is the const version of segment(Index,Index).*/
+/// This is the const version of segment(Index,Index).
+EIGEN_DEVICE_FUNC
 inline ConstSegmentReturnType segment(Index start, Index n) const
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return ConstSegmentReturnType(derived(), start, n);
 }
 
-/** \returns a dynamic-size expression of the first coefficients of *this.
-  *
-  * \only_for_vectors
-  *
-  * \param n the number of coefficients in the segment
-  *
-  * Example: \include MatrixBase_start_int.cpp
-  * Output: \verbinclude MatrixBase_start_int.out
-  *
-  * \note Even though the returned expression has dynamic size, in the case
-  * when it is applied to a fixed-size vector, it inherits a fixed maximal size,
-  * which means that evaluating it does not cause a dynamic memory allocation.
-  *
-  * \sa class Block, block(Index,Index)
-  */
+/// \returns a dynamic-size expression of the first coefficients of *this.
+///
+/// \only_for_vectors
+///
+/// \param n the number of coefficients in the segment
+///
+/// Example: \include MatrixBase_start_int.cpp
+/// Output: \verbinclude MatrixBase_start_int.out
+///
+/// \note Even though the returned expression has dynamic size, in the case
+/// when it is applied to a fixed-size vector, it inherits a fixed maximal size,
+/// which means that evaluating it does not cause a dynamic memory allocation.
+///
+/// \sa class Block, block(Index,Index)
+///
+EIGEN_DEVICE_FUNC
 inline SegmentReturnType head(Index n)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return SegmentReturnType(derived(), 0, n);
 }
 
-/** This is the const version of head(Index).*/
+/// This is the const version of head(Index).
+EIGEN_DEVICE_FUNC
 inline ConstSegmentReturnType head(Index n) const
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return ConstSegmentReturnType(derived(), 0, n);
 }
 
-/** \returns a dynamic-size expression of the last coefficients of *this.
-  *
-  * \only_for_vectors
-  *
-  * \param n the number of coefficients in the segment
-  *
-  * Example: \include MatrixBase_end_int.cpp
-  * Output: \verbinclude MatrixBase_end_int.out
-  *
-  * \note Even though the returned expression has dynamic size, in the case
-  * when it is applied to a fixed-size vector, it inherits a fixed maximal size,
-  * which means that evaluating it does not cause a dynamic memory allocation.
-  *
-  * \sa class Block, block(Index,Index)
-  */
+/// \returns a dynamic-size expression of the last coefficients of *this.
+///
+/// \only_for_vectors
+///
+/// \param n the number of coefficients in the segment
+///
+/// Example: \include MatrixBase_end_int.cpp
+/// Output: \verbinclude MatrixBase_end_int.out
+///
+/// \note Even though the returned expression has dynamic size, in the case
+/// when it is applied to a fixed-size vector, it inherits a fixed maximal size,
+/// which means that evaluating it does not cause a dynamic memory allocation.
+///
+/// \sa class Block, block(Index,Index)
+///
+EIGEN_DEVICE_FUNC
 inline SegmentReturnType tail(Index n)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return SegmentReturnType(derived(), this->size() - n, n);
 }
 
-/** This is the const version of tail(Index).*/
+/// This is the const version of tail(Index).
+EIGEN_DEVICE_FUNC
 inline ConstSegmentReturnType tail(Index n) const
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return ConstSegmentReturnType(derived(), this->size() - n, n);
 }
 
-/** \returns a fixed-size expression of a segment (i.e. a vector block) in \c *this
-  *
-  * \only_for_vectors
-  *
-  * \tparam N the number of coefficients in the segment as specified at compile-time
-  * \param start the index of the first element in the segment
-  * \param n the number of coefficients in the segment as specified at compile-time
-  *
-  * The compile-time and run-time information should not contradict. In other words,
-  * \a n should equal \a N unless \a N is \a Dynamic.
-  *
-  * Example: \include MatrixBase_template_int_segment.cpp
-  * Output: \verbinclude MatrixBase_template_int_segment.out
-  *
-  * \sa class Block
-  */
+/// \returns a fixed-size expression of a segment (i.e. a vector block) in \c *this
+///
+/// \only_for_vectors
+///
+/// \tparam N the number of coefficients in the segment as specified at compile-time
+/// \param start the index of the first element in the segment
+/// \param n the number of coefficients in the segment as specified at compile-time
+///
+/// The compile-time and run-time information should not contradict. In other words,
+/// \a n should equal \a N unless \a N is \a Dynamic.
+///
+/// Example: \include MatrixBase_template_int_segment.cpp
+/// Output: \verbinclude MatrixBase_template_int_segment.out
+///
+/// \sa class Block
+///
 template<int N>
+EIGEN_DEVICE_FUNC
 inline typename FixedSegmentReturnType<N>::Type segment(Index start, Index n = N)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return typename FixedSegmentReturnType<N>::Type(derived(), start, n);
 }
 
-/** This is the const version of segment<int>(Index).*/
+/// This is the const version of segment<int>(Index).
 template<int N>
+EIGEN_DEVICE_FUNC
 inline typename ConstFixedSegmentReturnType<N>::Type segment(Index start, Index n = N) const
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return typename ConstFixedSegmentReturnType<N>::Type(derived(), start, n);
 }
 
-/** \returns a fixed-size expression of the first coefficients of *this.
-  *
-  * \only_for_vectors
-  *
-  * \tparam N the number of coefficients in the segment as specified at compile-time
-  * \param  n the number of coefficients in the segment as specified at run-time
-  *
-  * The compile-time and run-time information should not contradict. In other words,
-  * \a n should equal \a N unless \a N is \a Dynamic.
-  *
-  * Example: \include MatrixBase_template_int_start.cpp
-  * Output: \verbinclude MatrixBase_template_int_start.out
-  *
-  * \sa class Block
-  */
+/// \returns a fixed-size expression of the first coefficients of *this.
+///
+/// \only_for_vectors
+///
+/// \tparam N the number of coefficients in the segment as specified at compile-time
+/// \param  n the number of coefficients in the segment as specified at run-time
+///
+/// The compile-time and run-time information should not contradict. In other words,
+/// \a n should equal \a N unless \a N is \a Dynamic.
+///
+/// Example: \include MatrixBase_template_int_start.cpp
+/// Output: \verbinclude MatrixBase_template_int_start.out
+///
+/// \sa class Block
+///
 template<int N>
+EIGEN_DEVICE_FUNC
 inline typename FixedSegmentReturnType<N>::Type head(Index n = N)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return typename FixedSegmentReturnType<N>::Type(derived(), 0, n);
 }
 
-/** This is the const version of head<int>().*/
+/// This is the const version of head<int>().
 template<int N>
+EIGEN_DEVICE_FUNC
 inline typename ConstFixedSegmentReturnType<N>::Type head(Index n = N) const
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return typename ConstFixedSegmentReturnType<N>::Type(derived(), 0, n);
 }
 
-/** \returns a fixed-size expression of the last coefficients of *this.
-  *
-  * \only_for_vectors
-  *
-  * \tparam N the number of coefficients in the segment as specified at compile-time
-  * \param  n the number of coefficients in the segment as specified at run-time
-  *
-  * The compile-time and run-time information should not contradict. In other words,
-  * \a n should equal \a N unless \a N is \a Dynamic.
-  *
-  * Example: \include MatrixBase_template_int_end.cpp
-  * Output: \verbinclude MatrixBase_template_int_end.out
-  *
-  * \sa class Block
-  */
+/// \returns a fixed-size expression of the last coefficients of *this.
+///
+/// \only_for_vectors
+///
+/// \tparam N the number of coefficients in the segment as specified at compile-time
+/// \param  n the number of coefficients in the segment as specified at run-time
+///
+/// The compile-time and run-time information should not contradict. In other words,
+/// \a n should equal \a N unless \a N is \a Dynamic.
+///
+/// Example: \include MatrixBase_template_int_end.cpp
+/// Output: \verbinclude MatrixBase_template_int_end.out
+///
+/// \sa class Block
+///
 template<int N>
+EIGEN_DEVICE_FUNC
 inline typename FixedSegmentReturnType<N>::Type tail(Index n = N)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return typename FixedSegmentReturnType<N>::Type(derived(), size() - n);
 }
 
-/** This is the const version of tail<int>.*/
+/// This is the const version of tail<int>.
 template<int N>
+EIGEN_DEVICE_FUNC
 inline typename ConstFixedSegmentReturnType<N>::Type tail(Index n = N) const
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
diff --git a/Eigen/src/plugins/CMakeLists.txt b/Eigen/src/plugins/CMakeLists.txt
deleted file mode 100644
index 1a1d3ffbd..000000000
--- a/Eigen/src/plugins/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_plugins_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_plugins_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/plugins COMPONENT Devel
-  )
diff --git a/Eigen/src/plugins/CommonCwiseBinaryOps.h b/Eigen/src/plugins/CommonCwiseBinaryOps.h
index 688d22440..8b6730ede 100644
--- a/Eigen/src/plugins/CommonCwiseBinaryOps.h
+++ b/Eigen/src/plugins/CommonCwiseBinaryOps.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -16,7 +16,7 @@
   *
   * \sa class CwiseBinaryOp, operator-=()
   */
-EIGEN_MAKE_CWISE_BINARY_OP(operator-,internal::scalar_difference_op)
+EIGEN_MAKE_CWISE_BINARY_OP(operator-,difference)
 
 /** \returns an expression of the sum of \c *this and \a other
   *
@@ -24,7 +24,7 @@ EIGEN_MAKE_CWISE_BINARY_OP(operator-,internal::scalar_difference_op)
   *
   * \sa class CwiseBinaryOp, operator+=()
   */
-EIGEN_MAKE_CWISE_BINARY_OP(operator+,internal::scalar_sum_op)
+EIGEN_MAKE_CWISE_BINARY_OP(operator+,sum)
 
 /** \returns an expression of a custom coefficient-wise operator \a func of *this and \a other
   *
@@ -38,9 +38,78 @@ EIGEN_MAKE_CWISE_BINARY_OP(operator+,internal::scalar_sum_op)
   * \sa class CwiseBinaryOp, operator+(), operator-(), cwiseProduct()
   */
 template<typename CustomBinaryOp, typename OtherDerived>
+EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE const CwiseBinaryOp<CustomBinaryOp, const Derived, const OtherDerived>
 binaryExpr(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other, const CustomBinaryOp& func = CustomBinaryOp()) const
 {
   return CwiseBinaryOp<CustomBinaryOp, const Derived, const OtherDerived>(derived(), other.derived(), func);
 }
 
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+EIGEN_MAKE_SCALAR_BINARY_OP(operator*,product)
+#else
+/** \returns an expression of \c *this scaled by the scalar factor \a scalar
+  *
+  * \tparam T is the scalar type of \a scalar. It must be compatible with the scalar type of the given expression.
+  */
+template<typename T>
+const CwiseBinaryOp<internal::scalar_product_op<Scalar,T>,Derived,Constant<T> > operator*(const T& scalar) const;
+/** \returns an expression of \a expr scaled by the scalar factor \a scalar
+  *
+  * \tparam T is the scalar type of \a scalar. It must be compatible with the scalar type of the given expression.
+  */
+template<typename T> friend
+const CwiseBinaryOp<internal::scalar_product_op<T,Scalar>,Constant<T>,Derived> operator*(const T& scalar, const StorageBaseType& expr);
+#endif
+
+
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT(operator/,quotient)
+#else
+/** \returns an expression of \c *this divided by the scalar value \a scalar
+  *
+  * \tparam T is the scalar type of \a scalar. It must be compatible with the scalar type of the given expression.
+  */
+template<typename T>
+const CwiseBinaryOp<internal::scalar_quotient_op<Scalar,T>,Derived,Constant<T> > operator/(const T& scalar) const;
+#endif
+
+/** \returns an expression of the coefficient-wise boolean \b and operator of \c *this and \a other
+  *
+  * \warning this operator is for expression of bool only.
+  *
+  * Example: \include Cwise_boolean_and.cpp
+  * Output: \verbinclude Cwise_boolean_and.out
+  *
+  * \sa operator||(), select()
+  */
+template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
+inline const CwiseBinaryOp<internal::scalar_boolean_and_op, const Derived, const OtherDerived>
+operator&&(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
+{
+  EIGEN_STATIC_ASSERT((internal::is_same<bool,Scalar>::value && internal::is_same<bool,typename OtherDerived::Scalar>::value),
+                      THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL);
+  return CwiseBinaryOp<internal::scalar_boolean_and_op, const Derived, const OtherDerived>(derived(),other.derived());
+}
+
+/** \returns an expression of the coefficient-wise boolean \b or operator of \c *this and \a other
+  *
+  * \warning this operator is for expression of bool only.
+  *
+  * Example: \include Cwise_boolean_or.cpp
+  * Output: \verbinclude Cwise_boolean_or.out
+  *
+  * \sa operator&&(), select()
+  */
+template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
+inline const CwiseBinaryOp<internal::scalar_boolean_or_op, const Derived, const OtherDerived>
+operator||(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
+{
+  EIGEN_STATIC_ASSERT((internal::is_same<bool,Scalar>::value && internal::is_same<bool,typename OtherDerived::Scalar>::value),
+                      THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL);
+  return CwiseBinaryOp<internal::scalar_boolean_or_op, const Derived, const OtherDerived>(derived(),other.derived());
+}
diff --git a/Eigen/src/plugins/CommonCwiseUnaryOps.h b/Eigen/src/plugins/CommonCwiseUnaryOps.h
index 08e931aad..89f4faaac 100644
--- a/Eigen/src/plugins/CommonCwiseUnaryOps.h
+++ b/Eigen/src/plugins/CommonCwiseUnaryOps.h
@@ -12,10 +12,6 @@
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
 
-/** \internal Represents a scalar multiple of an expression */
-typedef CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const Derived> ScalarMultipleReturnType;
-/** \internal Represents a quotient of an expression by a scalar*/
-typedef CwiseUnaryOp<internal::scalar_quotient1_op<Scalar>, const Derived> ScalarQuotient1ReturnType;
 /** \internal the return type of conjugate() */
 typedef typename internal::conditional<NumTraits<Scalar>::IsComplex,
                     const CwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, const Derived>,
@@ -36,137 +32,132 @@ typedef CwiseUnaryOp<internal::scalar_imag_op<Scalar>, const Derived> ImagReturn
 /** \internal the return type of imag() */
 typedef CwiseUnaryView<internal::scalar_imag_ref_op<Scalar>, Derived> NonConstImagReturnType;
 
-#endif // not EIGEN_PARSED_BY_DOXYGEN
-
-/** \returns an expression of the opposite of \c *this
-  */
-inline const CwiseUnaryOp<internal::scalar_opposite_op<typename internal::traits<Derived>::Scalar>, const Derived>
-operator-() const { return derived(); }
-
-
-/** \returns an expression of \c *this scaled by the scalar factor \a scalar */
-inline const ScalarMultipleReturnType
-operator*(const Scalar& scalar) const
-{
-  return CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const Derived>
-    (derived(), internal::scalar_multiple_op<Scalar>(scalar));
-}
+typedef CwiseUnaryOp<internal::scalar_opposite_op<Scalar>, const Derived> NegativeReturnType;
 
-#ifdef EIGEN_PARSED_BY_DOXYGEN
-const ScalarMultipleReturnType operator*(const RealScalar& scalar) const;
-#endif
-
-/** \returns an expression of \c *this divided by the scalar value \a scalar */
-inline const CwiseUnaryOp<internal::scalar_quotient1_op<typename internal::traits<Derived>::Scalar>, const Derived>
-operator/(const Scalar& scalar) const
-{
-  return CwiseUnaryOp<internal::scalar_quotient1_op<Scalar>, const Derived>
-    (derived(), internal::scalar_quotient1_op<Scalar>(scalar));
-}
-
-/** Overloaded for efficient real matrix times complex scalar value */
-inline const CwiseUnaryOp<internal::scalar_multiple2_op<Scalar,std::complex<Scalar> >, const Derived>
-operator*(const std::complex<Scalar>& scalar) const
-{
-  return CwiseUnaryOp<internal::scalar_multiple2_op<Scalar,std::complex<Scalar> >, const Derived>
-    (*static_cast<const Derived*>(this), internal::scalar_multiple2_op<Scalar,std::complex<Scalar> >(scalar));
-}
+#endif // not EIGEN_PARSED_BY_DOXYGEN
 
-inline friend const ScalarMultipleReturnType
-operator*(const Scalar& scalar, const StorageBaseType& matrix)
-{ return matrix*scalar; }
-
-inline friend const CwiseUnaryOp<internal::scalar_multiple2_op<Scalar,std::complex<Scalar> >, const Derived>
-operator*(const std::complex<Scalar>& scalar, const StorageBaseType& matrix)
-{ return matrix*scalar; }
-
-/** \returns an expression of *this with the \a Scalar type casted to
-  * \a NewScalar.
-  *
-  * The template parameter \a NewScalar is the type we are casting the scalars to.
-  *
-  * \sa class CwiseUnaryOp
-  */
+/// \returns an expression of the opposite of \c *this
+///
+EIGEN_DOC_UNARY_ADDONS(operator-,opposite)
+///
+EIGEN_DEVICE_FUNC
+inline const NegativeReturnType
+operator-() const { return NegativeReturnType(derived()); }
+
+
+template<class NewType> struct CastXpr { typedef typename internal::cast_return_type<Derived,const CwiseUnaryOp<internal::scalar_cast_op<Scalar, NewType>, const Derived> >::type Type; };
+
+/// \returns an expression of \c *this with the \a Scalar type casted to
+/// \a NewScalar.
+///
+/// The template parameter \a NewScalar is the type we are casting the scalars to.
+///
+EIGEN_DOC_UNARY_ADDONS(cast,conversion function)
+///
+/// \sa class CwiseUnaryOp
+///
 template<typename NewType>
-typename internal::cast_return_type<Derived,const CwiseUnaryOp<internal::scalar_cast_op<typename internal::traits<Derived>::Scalar, NewType>, const Derived> >::type
+EIGEN_DEVICE_FUNC
+typename CastXpr<NewType>::Type
 cast() const
 {
-  return derived();
+  return typename CastXpr<NewType>::Type(derived());
 }
 
-/** \returns an expression of the complex conjugate of \c *this.
-  *
-  * \sa adjoint() */
+/// \returns an expression of the complex conjugate of \c *this.
+///
+EIGEN_DOC_UNARY_ADDONS(conjugate,complex conjugate)
+///
+/// \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_conj">Math functions</a>, MatrixBase::adjoint()
+EIGEN_DEVICE_FUNC
 inline ConjugateReturnType
 conjugate() const
 {
   return ConjugateReturnType(derived());
 }
 
-/** \returns a read-only expression of the real part of \c *this.
-  *
-  * \sa imag() */
+/// \returns a read-only expression of the real part of \c *this.
+///
+EIGEN_DOC_UNARY_ADDONS(real,real part function)
+///
+/// \sa imag()
+EIGEN_DEVICE_FUNC
 inline RealReturnType
-real() const { return derived(); }
-
-/** \returns an read-only expression of the imaginary part of \c *this.
-  *
-  * \sa real() */
+real() const { return RealReturnType(derived()); }
+
+/// \returns an read-only expression of the imaginary part of \c *this.
+///
+EIGEN_DOC_UNARY_ADDONS(imag,imaginary part function)
+///
+/// \sa real()
+EIGEN_DEVICE_FUNC
 inline const ImagReturnType
-imag() const { return derived(); }
-
-/** \brief Apply a unary operator coefficient-wise
-  * \param[in]  func  Functor implementing the unary operator
-  * \tparam  CustomUnaryOp Type of \a func  
-  * \returns An expression of a custom coefficient-wise unary operator \a func of *this
-  *
-  * The function \c ptr_fun() from the C++ standard library can be used to make functors out of normal functions.
-  *
-  * Example:
-  * \include class_CwiseUnaryOp_ptrfun.cpp
-  * Output: \verbinclude class_CwiseUnaryOp_ptrfun.out
-  *
-  * Genuine functors allow for more possibilities, for instance it may contain a state.
-  *
-  * Example:
-  * \include class_CwiseUnaryOp.cpp
-  * Output: \verbinclude class_CwiseUnaryOp.out
-  *
-  * \sa class CwiseUnaryOp, class CwiseBinaryOp
-  */
+imag() const { return ImagReturnType(derived()); }
+
+/// \brief Apply a unary operator coefficient-wise
+/// \param[in]  func  Functor implementing the unary operator
+/// \tparam  CustomUnaryOp Type of \a func
+/// \returns An expression of a custom coefficient-wise unary operator \a func of *this
+///
+/// The function \c ptr_fun() from the C++ standard library can be used to make functors out of normal functions.
+///
+/// Example:
+/// \include class_CwiseUnaryOp_ptrfun.cpp
+/// Output: \verbinclude class_CwiseUnaryOp_ptrfun.out
+///
+/// Genuine functors allow for more possibilities, for instance it may contain a state.
+///
+/// Example:
+/// \include class_CwiseUnaryOp.cpp
+/// Output: \verbinclude class_CwiseUnaryOp.out
+///
+EIGEN_DOC_UNARY_ADDONS(unaryExpr,unary function)
+///
+/// \sa unaryViewExpr, binaryExpr, class CwiseUnaryOp
+///
 template<typename CustomUnaryOp>
+EIGEN_DEVICE_FUNC
 inline const CwiseUnaryOp<CustomUnaryOp, const Derived>
 unaryExpr(const CustomUnaryOp& func = CustomUnaryOp()) const
 {
   return CwiseUnaryOp<CustomUnaryOp, const Derived>(derived(), func);
 }
 
-/** \returns an expression of a custom coefficient-wise unary operator \a func of *this
-  *
-  * The template parameter \a CustomUnaryOp is the type of the functor
-  * of the custom unary operator.
-  *
-  * Example:
-  * \include class_CwiseUnaryOp.cpp
-  * Output: \verbinclude class_CwiseUnaryOp.out
-  *
-  * \sa class CwiseUnaryOp, class CwiseBinaryOp
-  */
+/// \returns an expression of a custom coefficient-wise unary operator \a func of *this
+///
+/// The template parameter \a CustomUnaryOp is the type of the functor
+/// of the custom unary operator.
+///
+/// Example:
+/// \include class_CwiseUnaryOp.cpp
+/// Output: \verbinclude class_CwiseUnaryOp.out
+///
+EIGEN_DOC_UNARY_ADDONS(unaryViewExpr,unary function)
+///
+/// \sa unaryExpr, binaryExpr class CwiseUnaryOp
+///
 template<typename CustomViewOp>
+EIGEN_DEVICE_FUNC
 inline const CwiseUnaryView<CustomViewOp, const Derived>
 unaryViewExpr(const CustomViewOp& func = CustomViewOp()) const
 {
   return CwiseUnaryView<CustomViewOp, const Derived>(derived(), func);
 }
 
-/** \returns a non const expression of the real part of \c *this.
-  *
-  * \sa imag() */
+/// \returns a non const expression of the real part of \c *this.
+///
+EIGEN_DOC_UNARY_ADDONS(real,real part function)
+///
+/// \sa imag()
+EIGEN_DEVICE_FUNC
 inline NonConstRealReturnType
-real() { return derived(); }
-
-/** \returns a non const expression of the imaginary part of \c *this.
-  *
-  * \sa real() */
+real() { return NonConstRealReturnType(derived()); }
+
+/// \returns a non const expression of the imaginary part of \c *this.
+///
+EIGEN_DOC_UNARY_ADDONS(imag,imaginary part function)
+///
+/// \sa real()
+EIGEN_DEVICE_FUNC
 inline NonConstImagReturnType
-imag() { return derived(); }
+imag() { return NonConstImagReturnType(derived()); }
diff --git a/Eigen/src/plugins/MatrixCwiseBinaryOps.h b/Eigen/src/plugins/MatrixCwiseBinaryOps.h
index c4a042b70..f1084abef 100644
--- a/Eigen/src/plugins/MatrixCwiseBinaryOps.h
+++ b/Eigen/src/plugins/MatrixCwiseBinaryOps.h
@@ -18,10 +18,11 @@
   * \sa class CwiseBinaryOp, cwiseAbs2
   */
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE const EIGEN_CWISE_PRODUCT_RETURN_TYPE(Derived,OtherDerived)
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE const EIGEN_CWISE_BINARY_RETURN_TYPE(Derived,OtherDerived,product)
 cwiseProduct(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
 {
-  return EIGEN_CWISE_PRODUCT_RETURN_TYPE(Derived,OtherDerived)(derived(), other.derived());
+  return EIGEN_CWISE_BINARY_RETURN_TYPE(Derived,OtherDerived,product)(derived(), other.derived());
 }
 
 /** \returns an expression of the coefficient-wise == operator of *this and \a other
@@ -37,6 +38,7 @@ cwiseProduct(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
   * \sa cwiseNotEqual(), isApprox(), isMuchSmallerThan()
   */
 template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
 inline const CwiseBinaryOp<std::equal_to<Scalar>, const Derived, const OtherDerived>
 cwiseEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
 {
@@ -56,6 +58,7 @@ cwiseEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
   * \sa cwiseEqual(), isApprox(), isMuchSmallerThan()
   */
 template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
 inline const CwiseBinaryOp<std::not_equal_to<Scalar>, const Derived, const OtherDerived>
 cwiseNotEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
 {
@@ -70,17 +73,19 @@ cwiseNotEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
   * \sa class CwiseBinaryOp, max()
   */
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_min_op<Scalar>, const Derived, const OtherDerived>
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar>, const Derived, const OtherDerived>
 cwiseMin(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
 {
-  return CwiseBinaryOp<internal::scalar_min_op<Scalar>, const Derived, const OtherDerived>(derived(), other.derived());
+  return CwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar>, const Derived, const OtherDerived>(derived(), other.derived());
 }
 
 /** \returns an expression of the coefficient-wise min of *this and scalar \a other
   *
   * \sa class CwiseBinaryOp, min()
   */
-EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_min_op<Scalar>, const Derived, const ConstantReturnType>
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar>, const Derived, const ConstantReturnType>
 cwiseMin(const Scalar &other) const
 {
   return cwiseMin(Derived::Constant(rows(), cols(), other));
@@ -94,17 +99,19 @@ cwiseMin(const Scalar &other) const
   * \sa class CwiseBinaryOp, min()
   */
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_max_op<Scalar>, const Derived, const OtherDerived>
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar>, const Derived, const OtherDerived>
 cwiseMax(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
 {
-  return CwiseBinaryOp<internal::scalar_max_op<Scalar>, const Derived, const OtherDerived>(derived(), other.derived());
+  return CwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar>, const Derived, const OtherDerived>(derived(), other.derived());
 }
 
 /** \returns an expression of the coefficient-wise max of *this and scalar \a other
   *
   * \sa class CwiseBinaryOp, min()
   */
-EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_max_op<Scalar>, const Derived, const ConstantReturnType>
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar>, const Derived, const ConstantReturnType>
 cwiseMax(const Scalar &other) const
 {
   return cwiseMax(Derived::Constant(rows(), cols(), other));
@@ -119,13 +126,14 @@ cwiseMax(const Scalar &other) const
   * \sa class CwiseBinaryOp, cwiseProduct(), cwiseInverse()
   */
 template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_quotient_op<Scalar>, const Derived, const OtherDerived>
 cwiseQuotient(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
 {
   return CwiseBinaryOp<internal::scalar_quotient_op<Scalar>, const Derived, const OtherDerived>(derived(), other.derived());
 }
 
-typedef CwiseBinaryOp<internal::scalar_cmp_op<Scalar,internal::cmp_EQ>, const Derived, const ConstantReturnType> CwiseScalarEqualReturnType;
+typedef CwiseBinaryOp<internal::scalar_cmp_op<Scalar,Scalar,internal::cmp_EQ>, const Derived, const ConstantReturnType> CwiseScalarEqualReturnType;
 
 /** \returns an expression of the coefficient-wise == operator of \c *this and a scalar \a s
   *
@@ -136,8 +144,9 @@ typedef CwiseBinaryOp<internal::scalar_cmp_op<Scalar,internal::cmp_EQ>, const De
   *
   * \sa cwiseEqual(const MatrixBase<OtherDerived> &) const
   */
+EIGEN_DEVICE_FUNC
 inline const CwiseScalarEqualReturnType
 cwiseEqual(const Scalar& s) const
 {
-  return CwiseScalarEqualReturnType(derived(), Derived::Constant(rows(), cols(), s), internal::scalar_cmp_op<Scalar,internal::cmp_EQ>());
+  return CwiseScalarEqualReturnType(derived(), Derived::Constant(rows(), cols(), s), internal::scalar_cmp_op<Scalar,Scalar,internal::cmp_EQ>());
 }
diff --git a/Eigen/src/plugins/MatrixCwiseUnaryOps.h b/Eigen/src/plugins/MatrixCwiseUnaryOps.h
index 8de10935d..b1be3d566 100644
--- a/Eigen/src/plugins/MatrixCwiseUnaryOps.h
+++ b/Eigen/src/plugins/MatrixCwiseUnaryOps.h
@@ -8,45 +8,78 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-// This file is a base class plugin containing matrix specifics coefficient wise functions.
-
-/** \returns an expression of the coefficient-wise absolute value of \c *this
-  *
-  * Example: \include MatrixBase_cwiseAbs.cpp
-  * Output: \verbinclude MatrixBase_cwiseAbs.out
-  *
-  * \sa cwiseAbs2()
-  */
-EIGEN_STRONG_INLINE const CwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived>
-cwiseAbs() const { return derived(); }
-
-/** \returns an expression of the coefficient-wise squared absolute value of \c *this
-  *
-  * Example: \include MatrixBase_cwiseAbs2.cpp
-  * Output: \verbinclude MatrixBase_cwiseAbs2.out
-  *
-  * \sa cwiseAbs()
-  */
-EIGEN_STRONG_INLINE const CwiseUnaryOp<internal::scalar_abs2_op<Scalar>, const Derived>
-cwiseAbs2() const { return derived(); }
-
-/** \returns an expression of the coefficient-wise square root of *this.
-  *
-  * Example: \include MatrixBase_cwiseSqrt.cpp
-  * Output: \verbinclude MatrixBase_cwiseSqrt.out
-  *
-  * \sa cwisePow(), cwiseSquare()
-  */
-inline const CwiseUnaryOp<internal::scalar_sqrt_op<Scalar>, const Derived>
-cwiseSqrt() const { return derived(); }
-
-/** \returns an expression of the coefficient-wise inverse of *this.
-  *
-  * Example: \include MatrixBase_cwiseInverse.cpp
-  * Output: \verbinclude MatrixBase_cwiseInverse.out
-  *
-  * \sa cwiseProduct()
-  */
-inline const CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const Derived>
-cwiseInverse() const { return derived(); }
+// This file is included into the body of the base classes supporting matrix specific coefficient-wise functions.
+// This include MatrixBase and SparseMatrixBase.
+
+
+typedef CwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived> CwiseAbsReturnType;
+typedef CwiseUnaryOp<internal::scalar_abs2_op<Scalar>, const Derived> CwiseAbs2ReturnType;
+typedef CwiseUnaryOp<internal::scalar_sqrt_op<Scalar>, const Derived> CwiseSqrtReturnType;
+typedef CwiseUnaryOp<internal::scalar_sign_op<Scalar>, const Derived> CwiseSignReturnType;
+typedef CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const Derived> CwiseInverseReturnType;
+
+/// \returns an expression of the coefficient-wise absolute value of \c *this
+///
+/// Example: \include MatrixBase_cwiseAbs.cpp
+/// Output: \verbinclude MatrixBase_cwiseAbs.out
+///
+EIGEN_DOC_UNARY_ADDONS(cwiseAbs,absolute value)
+///
+/// \sa cwiseAbs2()
+///
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE const CwiseAbsReturnType
+cwiseAbs() const { return CwiseAbsReturnType(derived()); }
+
+/// \returns an expression of the coefficient-wise squared absolute value of \c *this
+///
+/// Example: \include MatrixBase_cwiseAbs2.cpp
+/// Output: \verbinclude MatrixBase_cwiseAbs2.out
+///
+EIGEN_DOC_UNARY_ADDONS(cwiseAbs2,squared absolute value)
+///
+/// \sa cwiseAbs()
+///
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE const CwiseAbs2ReturnType
+cwiseAbs2() const { return CwiseAbs2ReturnType(derived()); }
+
+/// \returns an expression of the coefficient-wise square root of *this.
+///
+/// Example: \include MatrixBase_cwiseSqrt.cpp
+/// Output: \verbinclude MatrixBase_cwiseSqrt.out
+///
+EIGEN_DOC_UNARY_ADDONS(cwiseSqrt,square-root)
+///
+/// \sa cwisePow(), cwiseSquare()
+///
+EIGEN_DEVICE_FUNC
+inline const CwiseSqrtReturnType
+cwiseSqrt() const { return CwiseSqrtReturnType(derived()); }
+
+/// \returns an expression of the coefficient-wise signum of *this.
+///
+/// Example: \include MatrixBase_cwiseSign.cpp
+/// Output: \verbinclude MatrixBase_cwiseSign.out
+///
+EIGEN_DOC_UNARY_ADDONS(cwiseSign,sign function)
+///
+EIGEN_DEVICE_FUNC
+inline const CwiseSignReturnType
+cwiseSign() const { return CwiseSignReturnType(derived()); }
+
+
+/// \returns an expression of the coefficient-wise inverse of *this.
+///
+/// Example: \include MatrixBase_cwiseInverse.cpp
+/// Output: \verbinclude MatrixBase_cwiseInverse.out
+///
+EIGEN_DOC_UNARY_ADDONS(cwiseInverse,inverse)
+///
+/// \sa cwiseProduct()
+///
+EIGEN_DEVICE_FUNC
+inline const CwiseInverseReturnType
+cwiseInverse() const { return CwiseInverseReturnType(derived()); }
+
 
diff --git a/README.version b/README.version
index 3a1f025f2..01c4b7384 100644
--- a/README.version
+++ b/README.version
@@ -1,3 +1,3 @@
-URL: http://bitbucket.org/eigen/eigen/get/3.2.4.tar.bz2
-Version: 3.2.4
+URL: http://bitbucket.org/eigen/eigen/get/3.3.3.tar.bz2
+Version: 3.3.3
 BugComponent: 99023
diff --git a/bench/BenchTimer.h b/bench/BenchTimer.h
index 28e2bcaea..ea28496b7 100644
--- a/bench/BenchTimer.h
+++ b/bench/BenchTimer.h
@@ -22,12 +22,19 @@
 # endif
 # include <windows.h>
 #elif defined(__APPLE__)
-#include <CoreServices/CoreServices.h>
 #include <mach/mach_time.h>
 #else
 # include <unistd.h>
 #endif
 
+static void escape(void *p) {
+  asm volatile("" : : "g"(p) : "memory");
+}
+
+static void clobber() {
+  asm volatile("" : : : "memory");
+}
+
 #include <Eigen/Core>
 
 namespace Eigen
@@ -168,6 +175,7 @@ public:
         CODE; \
       } \
       TIMER.stop(); \
+      clobber(); \
     } \
   }
 
diff --git a/bench/analyze-blocking-sizes.cpp b/bench/analyze-blocking-sizes.cpp
new file mode 100644
index 000000000..d563a1d2d
--- /dev/null
+++ b/bench/analyze-blocking-sizes.cpp
@@ -0,0 +1,876 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Jacob <benoitjacob@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include <iostream>
+#include <cstdint>
+#include <cstdlib>
+#include <vector>
+#include <algorithm>
+#include <fstream>
+#include <string>
+#include <cmath>
+#include <cassert>
+#include <cstring>
+#include <memory>
+
+#include <Eigen/Core>
+
+using namespace std;
+
+const int default_precision = 4;
+
+// see --only-cubic-sizes
+bool only_cubic_sizes = false;
+
+// see --dump-tables
+bool dump_tables = false;
+
+uint8_t log2_pot(size_t x) {
+  size_t l = 0;
+  while (x >>= 1) l++;
+  return l;
+}
+
+uint16_t compact_size_triple(size_t k, size_t m, size_t n)
+{
+  return (log2_pot(k) << 8) | (log2_pot(m) << 4) | log2_pot(n);
+}
+
+// just a helper to store a triple of K,M,N sizes for matrix product
+struct size_triple_t
+{
+  uint16_t k, m, n;
+  size_triple_t() : k(0), m(0), n(0) {}
+  size_triple_t(size_t _k, size_t _m, size_t _n) : k(_k), m(_m), n(_n) {}
+  size_triple_t(const size_triple_t& o) : k(o.k), m(o.m), n(o.n) {}
+  size_triple_t(uint16_t compact)
+  {
+    k = 1 << ((compact & 0xf00) >> 8);
+    m = 1 << ((compact & 0x0f0) >> 4);
+    n = 1 << ((compact & 0x00f) >> 0);
+  }
+  bool is_cubic() const { return k == m && m == n; }
+};
+
+ostream& operator<<(ostream& s, const size_triple_t& t)
+{
+  return s << "(" << t.k << ", " << t.m << ", " << t.n << ")";
+}
+
+struct inputfile_entry_t
+{
+  uint16_t product_size;
+  uint16_t pot_block_size;
+  size_triple_t nonpot_block_size;
+  float gflops;
+};
+
+struct inputfile_t
+{
+  enum class type_t {
+    unknown,
+    all_pot_sizes,
+    default_sizes
+  };
+
+  string filename;
+  vector<inputfile_entry_t> entries;
+  type_t type;
+
+  inputfile_t(const string& fname)
+    : filename(fname)
+    , type(type_t::unknown)
+  {
+    ifstream stream(filename);
+    if (!stream.is_open()) {
+      cerr << "couldn't open input file: " << filename << endl;
+      exit(1);
+    }
+    string line;
+    while (getline(stream, line)) {
+      if (line.empty()) continue;
+      if (line.find("BEGIN MEASUREMENTS ALL POT SIZES") == 0) {
+        if (type != type_t::unknown) {
+          cerr << "Input file " << filename << " contains redundant BEGIN MEASUREMENTS lines";
+          exit(1);
+        }
+        type = type_t::all_pot_sizes;
+        continue;
+      }
+      if (line.find("BEGIN MEASUREMENTS DEFAULT SIZES") == 0) {
+        if (type != type_t::unknown) {
+          cerr << "Input file " << filename << " contains redundant BEGIN MEASUREMENTS lines";
+          exit(1);
+        }
+        type = type_t::default_sizes;
+        continue;
+      }
+      
+
+      if (type == type_t::unknown) {
+        continue;
+      }
+      switch(type) {
+        case type_t::all_pot_sizes: {
+          unsigned int product_size, block_size;
+          float gflops;
+          int sscanf_result =
+            sscanf(line.c_str(), "%x %x %f",
+                   &product_size,
+                   &block_size,
+                   &gflops);
+          if (3 != sscanf_result ||
+              !product_size ||
+              product_size > 0xfff ||
+              !block_size ||
+              block_size > 0xfff ||
+              !isfinite(gflops))
+          {
+            cerr << "ill-formed input file: " << filename << endl;
+            cerr << "offending line:" << endl << line << endl;
+            exit(1);
+          }
+          if (only_cubic_sizes && !size_triple_t(product_size).is_cubic()) {
+            continue;
+          }
+          inputfile_entry_t entry;
+          entry.product_size = uint16_t(product_size);
+          entry.pot_block_size = uint16_t(block_size);
+          entry.gflops = gflops;
+          entries.push_back(entry);
+          break;
+        }
+        case type_t::default_sizes: {
+          unsigned int product_size;
+          float gflops;
+          int bk, bm, bn;
+          int sscanf_result =
+            sscanf(line.c_str(), "%x default(%d, %d, %d) %f",
+                   &product_size,
+                   &bk, &bm, &bn,
+                   &gflops);
+          if (5 != sscanf_result ||
+              !product_size ||
+              product_size > 0xfff ||
+              !isfinite(gflops))
+          {
+            cerr << "ill-formed input file: " << filename << endl;
+            cerr << "offending line:" << endl << line << endl;
+            exit(1);
+          }
+          if (only_cubic_sizes && !size_triple_t(product_size).is_cubic()) {
+            continue;
+          }
+          inputfile_entry_t entry;
+          entry.product_size = uint16_t(product_size);
+          entry.pot_block_size = 0;
+          entry.nonpot_block_size = size_triple_t(bk, bm, bn);
+          entry.gflops = gflops;
+          entries.push_back(entry);
+          break;
+        }
+        
+        default:
+          break;
+      }
+    }
+    stream.close();
+    if (type == type_t::unknown) {
+      cerr << "Unrecognized input file " << filename << endl;
+      exit(1);
+    }
+    if (entries.empty()) {
+      cerr << "didn't find any measurements in input file: " << filename << endl;
+      exit(1);
+    }
+  }
+};
+
+struct preprocessed_inputfile_entry_t
+{
+  uint16_t product_size;
+  uint16_t block_size;
+
+  float efficiency;
+};
+
+bool lower_efficiency(const preprocessed_inputfile_entry_t& e1, const preprocessed_inputfile_entry_t& e2)
+{
+  return e1.efficiency < e2.efficiency;
+}
+
+struct preprocessed_inputfile_t
+{
+  string filename;
+  vector<preprocessed_inputfile_entry_t> entries;
+
+  preprocessed_inputfile_t(const inputfile_t& inputfile)
+    : filename(inputfile.filename)
+  {
+    if (inputfile.type != inputfile_t::type_t::all_pot_sizes) {
+      abort();
+    }
+    auto it = inputfile.entries.begin();
+    auto it_first_with_given_product_size = it;
+    while (it != inputfile.entries.end()) {
+      ++it;
+      if (it == inputfile.entries.end() ||
+        it->product_size != it_first_with_given_product_size->product_size)
+      {
+        import_input_file_range_one_product_size(it_first_with_given_product_size, it);
+        it_first_with_given_product_size = it;
+      }
+    }
+  }
+
+private:
+  void import_input_file_range_one_product_size(
+    const vector<inputfile_entry_t>::const_iterator& begin,
+    const vector<inputfile_entry_t>::const_iterator& end)
+  {
+    uint16_t product_size = begin->product_size;
+    float max_gflops = 0.0f;
+    for (auto it = begin; it != end; ++it) {
+      if (it->product_size != product_size) {
+        cerr << "Unexpected ordering of entries in " << filename << endl;
+        cerr << "(Expected all entries for product size " << hex << product_size << dec << " to be grouped)" << endl;
+        exit(1);
+      }
+      max_gflops = max(max_gflops, it->gflops);
+    }
+    for (auto it = begin; it != end; ++it) {
+      preprocessed_inputfile_entry_t entry;
+      entry.product_size = it->product_size;
+      entry.block_size = it->pot_block_size;
+      entry.efficiency = it->gflops / max_gflops;
+      entries.push_back(entry);
+    }
+  }
+};
+
+void check_all_files_in_same_exact_order(
+       const vector<preprocessed_inputfile_t>& preprocessed_inputfiles)
+{
+  if (preprocessed_inputfiles.empty()) {
+    return;
+  }
+
+  const preprocessed_inputfile_t& first_file = preprocessed_inputfiles[0];
+  const size_t num_entries = first_file.entries.size();
+
+  for (size_t i = 0; i < preprocessed_inputfiles.size(); i++) {
+    if (preprocessed_inputfiles[i].entries.size() != num_entries) {
+      cerr << "these files have different number of entries: "
+           << preprocessed_inputfiles[i].filename
+           << " and "
+           << first_file.filename
+           << endl;
+      exit(1);
+    }
+  }
+
+  for (size_t entry_index = 0; entry_index < num_entries; entry_index++) {
+    const uint16_t entry_product_size = first_file.entries[entry_index].product_size;
+    const uint16_t entry_block_size = first_file.entries[entry_index].block_size;
+    for (size_t file_index = 0; file_index < preprocessed_inputfiles.size(); file_index++) {
+      const preprocessed_inputfile_t& cur_file = preprocessed_inputfiles[file_index];
+      if (cur_file.entries[entry_index].product_size != entry_product_size ||
+          cur_file.entries[entry_index].block_size != entry_block_size)
+      {
+        cerr << "entries not in same order between these files: "
+             << first_file.filename
+             << " and "
+             << cur_file.filename
+             << endl;
+        exit(1);
+      }
+    }
+  }
+}
+
+float efficiency_of_subset(
+        const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
+        const vector<size_t>& subset)
+{
+  if (subset.size() <= 1) {
+    return 1.0f;
+  }
+  const preprocessed_inputfile_t& first_file = preprocessed_inputfiles[subset[0]];
+  const size_t num_entries = first_file.entries.size();
+  float efficiency = 1.0f;
+  size_t entry_index = 0;
+  size_t first_entry_index_with_this_product_size = 0;
+  uint16_t product_size = first_file.entries[0].product_size;
+  while (entry_index < num_entries) {
+    ++entry_index;
+    if (entry_index == num_entries ||
+        first_file.entries[entry_index].product_size != product_size)
+    {
+      float efficiency_this_product_size = 0.0f;
+      for (size_t e = first_entry_index_with_this_product_size; e < entry_index; e++) {
+        float efficiency_this_entry = 1.0f;
+        for (auto i = subset.begin(); i != subset.end(); ++i) {
+          efficiency_this_entry = min(efficiency_this_entry, preprocessed_inputfiles[*i].entries[e].efficiency);
+        }
+        efficiency_this_product_size = max(efficiency_this_product_size, efficiency_this_entry);
+      }
+      efficiency = min(efficiency, efficiency_this_product_size);
+      if (entry_index < num_entries) {
+        first_entry_index_with_this_product_size = entry_index;
+        product_size = first_file.entries[entry_index].product_size;
+      }
+    }
+  }
+
+  return efficiency;
+}
+
+void dump_table_for_subset(
+        const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
+        const vector<size_t>& subset)
+{
+  const preprocessed_inputfile_t& first_file = preprocessed_inputfiles[subset[0]];
+  const size_t num_entries = first_file.entries.size();
+  size_t entry_index = 0;
+  size_t first_entry_index_with_this_product_size = 0;
+  uint16_t product_size = first_file.entries[0].product_size;
+  size_t i = 0;
+  size_triple_t min_product_size(first_file.entries.front().product_size);
+  size_triple_t max_product_size(first_file.entries.back().product_size);
+  if (!min_product_size.is_cubic() || !max_product_size.is_cubic()) {
+    abort();
+  }
+  if (only_cubic_sizes) {
+    cerr << "Can't generate tables with --only-cubic-sizes." << endl;
+    abort();
+  }
+  cout << "struct LookupTable {" << endl;
+  cout << "  static const size_t BaseSize = " << min_product_size.k << ";" << endl;
+  const size_t NumSizes = log2_pot(max_product_size.k / min_product_size.k) + 1;
+  const size_t TableSize = NumSizes * NumSizes * NumSizes;
+  cout << "  static const size_t NumSizes = " << NumSizes << ";" << endl;
+  cout << "  static const unsigned short* Data() {" << endl;
+  cout << "    static const unsigned short data[" << TableSize << "] = {";
+  while (entry_index < num_entries) {
+    ++entry_index;
+    if (entry_index == num_entries ||
+        first_file.entries[entry_index].product_size != product_size)
+    {
+      float best_efficiency_this_product_size = 0.0f;
+      uint16_t best_block_size_this_product_size = 0;
+      for (size_t e = first_entry_index_with_this_product_size; e < entry_index; e++) {
+        float efficiency_this_entry = 1.0f;
+        for (auto i = subset.begin(); i != subset.end(); ++i) {
+          efficiency_this_entry = min(efficiency_this_entry, preprocessed_inputfiles[*i].entries[e].efficiency);
+        }
+        if (efficiency_this_entry > best_efficiency_this_product_size) {
+          best_efficiency_this_product_size = efficiency_this_entry;
+          best_block_size_this_product_size = first_file.entries[e].block_size;
+        }
+      }
+      if ((i++) % NumSizes) {
+        cout << " ";
+      } else {
+        cout << endl << "      ";
+      }
+      cout << "0x" << hex << best_block_size_this_product_size << dec;
+      if (entry_index < num_entries) {
+        cout << ",";
+        first_entry_index_with_this_product_size = entry_index;
+        product_size = first_file.entries[entry_index].product_size;
+      }
+    }
+  }
+  if (i != TableSize) {
+    cerr << endl << "Wrote " << i << " table entries, expected " << TableSize << endl;
+    abort();
+  }
+  cout << endl << "    };" << endl;
+  cout << "    return data;" << endl;
+  cout << "  }" << endl;
+  cout << "};" << endl;
+}
+
+float efficiency_of_partition(
+        const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
+        const vector<vector<size_t>>& partition)
+{
+  float efficiency = 1.0f;
+  for (auto s = partition.begin(); s != partition.end(); ++s) {
+    efficiency = min(efficiency, efficiency_of_subset(preprocessed_inputfiles, *s));
+  }
+  return efficiency;
+}
+
+void make_first_subset(size_t subset_size, vector<size_t>& out_subset, size_t set_size)
+{
+  assert(subset_size >= 1 && subset_size <= set_size);
+  out_subset.resize(subset_size);
+  for (size_t i = 0; i < subset_size; i++) {
+    out_subset[i] = i;
+  }
+}
+
+bool is_last_subset(const vector<size_t>& subset, size_t set_size)
+{
+  return subset[0] == set_size - subset.size();
+}
+
+void next_subset(vector<size_t>& inout_subset, size_t set_size)
+{
+  if (is_last_subset(inout_subset, set_size)) {
+    cerr << "iterating past the last subset" << endl;
+    abort();
+  }
+  size_t i = 1;
+  while (inout_subset[inout_subset.size() - i] == set_size - i) {
+    i++;
+    assert(i <= inout_subset.size());
+  }
+  size_t first_index_to_change = inout_subset.size() - i;
+  inout_subset[first_index_to_change]++;
+  size_t p = inout_subset[first_index_to_change];
+  for (size_t j = first_index_to_change + 1; j < inout_subset.size(); j++) {
+    inout_subset[j] = ++p;
+  }
+}
+
+const size_t number_of_subsets_limit = 100;
+const size_t always_search_subsets_of_size_at_least = 2;
+
+bool is_number_of_subsets_feasible(size_t n, size_t p)
+{ 
+  assert(n>0 && p>0 && p<=n);
+  uint64_t numerator = 1, denominator = 1;
+  for (size_t i = 0; i < p; i++) {
+    numerator *= n - i;
+    denominator *= i + 1;
+    if (numerator > denominator * number_of_subsets_limit) {
+      return false;
+    }
+  }
+  return true;
+}
+
+size_t max_feasible_subset_size(size_t n)
+{
+  assert(n > 0);
+  const size_t minresult = min<size_t>(n-1, always_search_subsets_of_size_at_least);
+  for (size_t p = 1; p <= n - 1; p++) {
+    if (!is_number_of_subsets_feasible(n, p+1)) {
+      return max(p, minresult);
+    }
+  }
+  return n - 1;
+}
+
+void find_subset_with_efficiency_higher_than(
+       const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
+       float required_efficiency_to_beat,
+       vector<size_t>& inout_remainder,
+       vector<size_t>& out_subset)
+{
+  out_subset.resize(0);
+
+  if (required_efficiency_to_beat >= 1.0f) {
+    cerr << "can't beat efficiency 1." << endl;
+    abort();
+  }
+
+  while (!inout_remainder.empty()) {
+
+    vector<size_t> candidate_indices(inout_remainder.size());
+    for (size_t i = 0; i < candidate_indices.size(); i++) {
+      candidate_indices[i] = i;
+    }
+
+    size_t candidate_indices_subset_size = max_feasible_subset_size(candidate_indices.size());
+    while (candidate_indices_subset_size >= 1) {
+      vector<size_t> candidate_indices_subset;
+      make_first_subset(candidate_indices_subset_size,
+                        candidate_indices_subset,
+                        candidate_indices.size());
+
+      vector<size_t> best_candidate_indices_subset;
+      float best_efficiency = 0.0f;
+      vector<size_t> trial_subset = out_subset;
+      trial_subset.resize(out_subset.size() + candidate_indices_subset_size);
+      while (true)
+      {
+        for (size_t i = 0; i < candidate_indices_subset_size; i++) {
+          trial_subset[out_subset.size() + i] = inout_remainder[candidate_indices_subset[i]];
+        }
+        
+        float trial_efficiency = efficiency_of_subset(preprocessed_inputfiles, trial_subset);
+        if (trial_efficiency > best_efficiency) {
+          best_efficiency = trial_efficiency;
+          best_candidate_indices_subset = candidate_indices_subset;
+        }
+        if (is_last_subset(candidate_indices_subset, candidate_indices.size())) {
+          break;
+        }
+        next_subset(candidate_indices_subset, candidate_indices.size());
+      }
+       
+      if (best_efficiency > required_efficiency_to_beat) {
+        for (size_t i = 0; i < best_candidate_indices_subset.size(); i++) {
+          candidate_indices[i] = candidate_indices[best_candidate_indices_subset[i]];
+        }
+        candidate_indices.resize(best_candidate_indices_subset.size());
+      }
+      candidate_indices_subset_size--;
+    }
+      
+    size_t candidate_index = candidate_indices[0];
+    auto candidate_iterator = inout_remainder.begin() + candidate_index;
+    vector<size_t> trial_subset = out_subset;
+
+    trial_subset.push_back(*candidate_iterator);
+    float trial_efficiency = efficiency_of_subset(preprocessed_inputfiles, trial_subset);
+    if (trial_efficiency > required_efficiency_to_beat) {
+      out_subset.push_back(*candidate_iterator);
+      inout_remainder.erase(candidate_iterator);
+    } else {
+      break;
+    }
+  }
+}
+
+void find_partition_with_efficiency_higher_than(
+       const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
+       float required_efficiency_to_beat,
+       vector<vector<size_t>>& out_partition)
+{
+  out_partition.resize(0);
+
+  vector<size_t> remainder;
+  for (size_t i = 0; i < preprocessed_inputfiles.size(); i++) {
+    remainder.push_back(i);
+  }
+
+  while (!remainder.empty()) {
+    vector<size_t> new_subset;
+    find_subset_with_efficiency_higher_than(
+      preprocessed_inputfiles,
+      required_efficiency_to_beat,
+      remainder,
+      new_subset);
+    out_partition.push_back(new_subset);
+  }
+}
+
+void print_partition(
+       const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
+       const vector<vector<size_t>>& partition)
+{
+  float efficiency = efficiency_of_partition(preprocessed_inputfiles, partition);
+  cout << "Partition into " << partition.size() << " subsets for " << efficiency * 100.0f << "% efficiency"  << endl;
+  for (auto subset = partition.begin(); subset != partition.end(); ++subset) {
+    cout << "  Subset " << (subset - partition.begin())
+         << ", efficiency " << efficiency_of_subset(preprocessed_inputfiles, *subset) * 100.0f << "%:"
+         << endl;
+    for (auto file = subset->begin(); file != subset->end(); ++file) {
+      cout << "    " << preprocessed_inputfiles[*file].filename << endl;
+    }
+    if (dump_tables) {
+      cout << "  Table:" << endl;
+      dump_table_for_subset(preprocessed_inputfiles, *subset);
+    }
+  }
+  cout << endl;
+}
+
+struct action_t
+{
+  virtual const char* invokation_name() const { abort(); return nullptr; }
+  virtual void run(const vector<string>&) const { abort(); }
+  virtual ~action_t() {}
+};
+
+struct partition_action_t : action_t
+{
+  virtual const char* invokation_name() const override { return "partition"; }
+  virtual void run(const vector<string>& input_filenames) const override
+  {
+    vector<preprocessed_inputfile_t> preprocessed_inputfiles;
+
+    if (input_filenames.empty()) {
+      cerr << "The " << invokation_name() << " action needs a list of input files." << endl;
+      exit(1);
+    }
+
+    for (auto it = input_filenames.begin(); it != input_filenames.end(); ++it) {
+      inputfile_t inputfile(*it);
+      switch (inputfile.type) {
+        case inputfile_t::type_t::all_pot_sizes:
+          preprocessed_inputfiles.emplace_back(inputfile);
+          break;
+        case inputfile_t::type_t::default_sizes:
+          cerr << "The " << invokation_name() << " action only uses measurements for all pot sizes, and "
+               << "has no use for " << *it << " which contains measurements for default sizes." << endl;
+          exit(1);
+          break;
+        default:
+          cerr << "Unrecognized input file: " << *it << endl;
+          exit(1);
+      }
+    }
+
+    check_all_files_in_same_exact_order(preprocessed_inputfiles);
+
+    float required_efficiency_to_beat = 0.0f;
+    vector<vector<vector<size_t>>> partitions;
+    cerr << "searching for partitions...\r" << flush;
+    while (true)
+    {
+      vector<vector<size_t>> partition;
+      find_partition_with_efficiency_higher_than(
+        preprocessed_inputfiles,
+        required_efficiency_to_beat,
+        partition);
+      float actual_efficiency = efficiency_of_partition(preprocessed_inputfiles, partition);
+      cerr << "partition " << preprocessed_inputfiles.size() << " files into " << partition.size()
+           << " subsets for " << 100.0f * actual_efficiency
+           << " % efficiency"
+           << "                  \r" << flush;
+      partitions.push_back(partition);
+      if (partition.size() == preprocessed_inputfiles.size() || actual_efficiency == 1.0f) {
+        break;
+      }
+      required_efficiency_to_beat = actual_efficiency;
+    }
+    cerr << "                                                                  " << endl;
+    while (true) {
+      bool repeat = false;
+      for (size_t i = 0; i < partitions.size() - 1; i++) {
+        if (partitions[i].size() >= partitions[i+1].size()) {
+          partitions.erase(partitions.begin() + i);
+          repeat = true;
+          break;
+        }
+      }
+      if (!repeat) {
+        break;
+      }
+    }
+    for (auto it = partitions.begin(); it != partitions.end(); ++it) {
+      print_partition(preprocessed_inputfiles, *it);
+    }
+  }
+};
+
+struct evaluate_defaults_action_t : action_t
+{
+  struct results_entry_t {
+    uint16_t product_size;
+    size_triple_t default_block_size;
+    uint16_t best_pot_block_size;
+    float default_gflops;
+    float best_pot_gflops;
+    float default_efficiency;
+  };
+  friend ostream& operator<<(ostream& s, const results_entry_t& entry)
+  {
+    return s
+      << "Product size " << size_triple_t(entry.product_size)
+      << ": default block size " << entry.default_block_size
+      << " -> " << entry.default_gflops
+      << " GFlop/s = " << entry.default_efficiency * 100.0f << " %"
+      << " of best POT block size " << size_triple_t(entry.best_pot_block_size)
+      << " -> " << entry.best_pot_gflops
+      << " GFlop/s" << dec;
+  }
+  static bool lower_efficiency(const results_entry_t& e1, const results_entry_t& e2) {
+    return e1.default_efficiency < e2.default_efficiency;
+  }
+  virtual const char* invokation_name() const override { return "evaluate-defaults"; }
+  void show_usage_and_exit() const
+  {
+    cerr << "usage: " << invokation_name() << " default-sizes-data all-pot-sizes-data" << endl;
+    cerr << "checks how well the performance with default sizes compares to the best "
+         << "performance measured over all POT sizes." << endl;
+    exit(1);
+  }
+  virtual void run(const vector<string>& input_filenames) const override
+  {
+    if (input_filenames.size() != 2) {
+      show_usage_and_exit();
+    }
+    inputfile_t inputfile_default_sizes(input_filenames[0]);
+    inputfile_t inputfile_all_pot_sizes(input_filenames[1]);
+    if (inputfile_default_sizes.type != inputfile_t::type_t::default_sizes) {
+      cerr << inputfile_default_sizes.filename << " is not an input file with default sizes." << endl;
+      show_usage_and_exit();
+    }
+    if (inputfile_all_pot_sizes.type != inputfile_t::type_t::all_pot_sizes) {
+      cerr << inputfile_all_pot_sizes.filename << " is not an input file with all POT sizes." << endl;
+      show_usage_and_exit();
+    }
+    vector<results_entry_t> results;
+    vector<results_entry_t> cubic_results;
+    
+    uint16_t product_size = 0;
+    auto it_all_pot_sizes = inputfile_all_pot_sizes.entries.begin();
+    for (auto it_default_sizes = inputfile_default_sizes.entries.begin();
+         it_default_sizes != inputfile_default_sizes.entries.end();
+         ++it_default_sizes)
+    {
+      if (it_default_sizes->product_size == product_size) {
+        continue;
+      }
+      product_size = it_default_sizes->product_size;
+      while (it_all_pot_sizes != inputfile_all_pot_sizes.entries.end() &&
+             it_all_pot_sizes->product_size != product_size)
+      {
+        ++it_all_pot_sizes;
+      }
+      if (it_all_pot_sizes == inputfile_all_pot_sizes.entries.end()) {
+        break;
+      }
+      uint16_t best_pot_block_size = 0;
+      float best_pot_gflops = 0;
+      for (auto it = it_all_pot_sizes;
+           it != inputfile_all_pot_sizes.entries.end() && it->product_size == product_size;
+           ++it)
+      {
+        if (it->gflops > best_pot_gflops) {
+          best_pot_gflops = it->gflops;
+          best_pot_block_size = it->pot_block_size;
+        }
+      }
+      results_entry_t entry;
+      entry.product_size = product_size;
+      entry.default_block_size = it_default_sizes->nonpot_block_size;
+      entry.best_pot_block_size = best_pot_block_size;
+      entry.default_gflops = it_default_sizes->gflops;
+      entry.best_pot_gflops = best_pot_gflops;
+      entry.default_efficiency = entry.default_gflops / entry.best_pot_gflops;
+      results.push_back(entry);
+
+      size_triple_t t(product_size);
+      if (t.k == t.m && t.m == t.n) {
+        cubic_results.push_back(entry);
+      }
+    }
+
+    cout << "All results:" << endl;
+    for (auto it = results.begin(); it != results.end(); ++it) {
+      cout << *it << endl;
+    }
+    cout << endl;
+
+    sort(results.begin(), results.end(), lower_efficiency);
+    
+    const size_t n = min<size_t>(20, results.size());
+    cout << n << " worst results:" << endl;
+    for (size_t i = 0; i < n; i++) {
+      cout << results[i] << endl;
+    }
+    cout << endl;
+
+    cout << "cubic results:" << endl;
+    for (auto it = cubic_results.begin(); it != cubic_results.end(); ++it) {
+      cout << *it << endl;
+    }
+    cout << endl;
+
+    sort(cubic_results.begin(), cubic_results.end(), lower_efficiency);
+    
+    cout.precision(2);
+    vector<float> a = {0.5f, 0.20f, 0.10f, 0.05f, 0.02f, 0.01f};
+    for (auto it = a.begin(); it != a.end(); ++it) {
+      size_t n = min(results.size() - 1, size_t(*it * results.size()));
+      cout << (100.0f * n / (results.size() - 1))
+           << " % of product sizes have default efficiency <= "
+           << 100.0f * results[n].default_efficiency << " %" << endl;
+    }
+    cout.precision(default_precision);
+  }
+};
+
+
+void show_usage_and_exit(int argc, char* argv[],
+                         const vector<unique_ptr<action_t>>& available_actions)
+{
+  cerr << "usage: " << argv[0] << " <action> [options...] <input files...>" << endl;
+  cerr << "available actions:" << endl;
+  for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
+    cerr << "  " << (*it)->invokation_name() << endl;
+  } 
+  cerr << "the input files should each contain an output of benchmark-blocking-sizes" << endl;
+  exit(1);
+}
+
+int main(int argc, char* argv[])
+{
+  cout.precision(default_precision);
+  cerr.precision(default_precision);
+
+  vector<unique_ptr<action_t>> available_actions;
+  available_actions.emplace_back(new partition_action_t);
+  available_actions.emplace_back(new evaluate_defaults_action_t);
+
+  vector<string> input_filenames;
+
+  action_t* action = nullptr;
+
+  if (argc < 2) {
+    show_usage_and_exit(argc, argv, available_actions);
+  }
+  for (int i = 1; i < argc; i++) {
+    bool arg_handled = false;
+    // Step 1. Try to match action invokation names.
+    for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
+      if (!strcmp(argv[i], (*it)->invokation_name())) {
+        if (!action) {
+          action = it->get();
+          arg_handled = true;
+          break;
+        } else {
+          cerr << "can't specify more than one action!" << endl;
+          show_usage_and_exit(argc, argv, available_actions);
+        }
+      }
+    }
+    if (arg_handled) {
+      continue;
+    }
+    // Step 2. Try to match option names.
+    if (argv[i][0] == '-') {
+      if (!strcmp(argv[i], "--only-cubic-sizes")) {
+        only_cubic_sizes = true;
+        arg_handled = true;
+      }
+      if (!strcmp(argv[i], "--dump-tables")) {
+        dump_tables = true;
+        arg_handled = true;
+      }
+      if (!arg_handled) {
+        cerr << "Unrecognized option: " << argv[i] << endl;
+        show_usage_and_exit(argc, argv, available_actions);
+      }
+    }
+    if (arg_handled) {
+      continue;
+    }
+    // Step 3. Default to interpreting args as input filenames.
+    input_filenames.emplace_back(argv[i]);
+  }
+
+  if (dump_tables && only_cubic_sizes) {
+    cerr << "Incompatible options: --only-cubic-sizes and --dump-tables." << endl;
+    show_usage_and_exit(argc, argv, available_actions);
+  }
+
+  if (!action) {
+    show_usage_and_exit(argc, argv, available_actions);
+  }
+
+  action->run(input_filenames);
+}
diff --git a/bench/benchCholesky.cpp b/bench/benchCholesky.cpp
index 42b3e1285..9a8e7cf63 100644
--- a/bench/benchCholesky.cpp
+++ b/bench/benchCholesky.cpp
@@ -31,7 +31,7 @@ __attribute__ ((noinline)) void benchLLT(const MatrixType& m)
   int rows = m.rows();
   int cols = m.cols();
 
-  int cost = 0;
+  double cost = 0;
   for (int j=0; j<rows; ++j)
   {
     int r = std::max(rows - j -1,0);
@@ -78,10 +78,10 @@ __attribute__ ((noinline)) void benchLLT(const MatrixType& m)
   else
     std::cout << "fixed ";
   std::cout << covMat.rows() << " \t"
-            << (timerNoSqrt.value() * REPEAT) / repeats << "s "
-            << "(" << 1e-6 * cost*repeats/timerNoSqrt.value() << " MFLOPS)\t"
-            << (timerSqrt.value() * REPEAT) / repeats << "s "
-            << "(" << 1e-6 * cost*repeats/timerSqrt.value() << " MFLOPS)\n";
+            << (timerNoSqrt.best()) / repeats << "s "
+            << "(" << 1e-9 * cost*repeats/timerNoSqrt.best() << " GFLOPS)\t"
+            << (timerSqrt.best()) / repeats << "s "
+            << "(" << 1e-9 * cost*repeats/timerSqrt.best() << " GFLOPS)\n";
 
 
   #ifdef BENCH_GSL
@@ -119,13 +119,13 @@ __attribute__ ((noinline)) void benchLLT(const MatrixType& m)
 
 int main(int argc, char* argv[])
 {
-  const int dynsizes[] = {4,6,8,16,24,32,49,64,128,256,512,900,0};
-  std::cout << "size            no sqrt                           standard";
+  const int dynsizes[] = {4,6,8,16,24,32,49,64,128,256,512,900,1500,0};
+  std::cout << "size            LDLT                            LLT";
 //   #ifdef BENCH_GSL
 //   std::cout << "       GSL (standard + double + ATLAS)  ";
 //   #endif
   std::cout << "\n";
-  for (uint i=0; dynsizes[i]>0; ++i)
+  for (int i=0; dynsizes[i]>0; ++i)
     benchLLT(Matrix<Scalar,Dynamic,Dynamic>(dynsizes[i],dynsizes[i]));
 
   benchLLT(Matrix<Scalar,2,2>());
diff --git a/bench/bench_gemm.cpp b/bench/bench_gemm.cpp
index 41ca8b3b6..8528c5587 100644
--- a/bench/bench_gemm.cpp
+++ b/bench/bench_gemm.cpp
@@ -2,6 +2,14 @@
 // g++-4.4 bench_gemm.cpp -I .. -O2 -DNDEBUG -lrt -fopenmp && OMP_NUM_THREADS=2  ./a.out
 // icpc bench_gemm.cpp -I .. -O3 -DNDEBUG -lrt -openmp  && OMP_NUM_THREADS=2  ./a.out
 
+// Compilation options:
+// 
+// -DSCALAR=std::complex<double>
+// -DSCALARA=double or -DSCALARB=double
+// -DHAVE_BLAS
+// -DDECOUPLED
+//
+
 #include <iostream>
 #include <Eigen/Core>
 #include <bench/BenchTimer.h>
@@ -14,10 +22,18 @@ using namespace Eigen;
 #define SCALAR float
 #endif
 
+#ifndef SCALARA
+#define SCALARA SCALAR
+#endif
+
+#ifndef SCALARB
+#define SCALARB SCALAR
+#endif
+
 typedef SCALAR Scalar;
 typedef NumTraits<Scalar>::Real RealScalar;
-typedef Matrix<RealScalar,Dynamic,Dynamic> A;
-typedef Matrix</*Real*/Scalar,Dynamic,Dynamic> B;
+typedef Matrix<SCALARA,Dynamic,Dynamic> A;
+typedef Matrix<SCALARB,Dynamic,Dynamic> B;
 typedef Matrix<Scalar,Dynamic,Dynamic> C;
 typedef Matrix<RealScalar,Dynamic,Dynamic> M;
 
@@ -129,35 +145,69 @@ int main(int argc, char ** argv)
   int tries = 2;  // number of tries, we keep the best
 
   int s = 2048;
-  int cache_size = -1;
+  int m = s;
+  int n = s;
+  int p = s;
+  int cache_size1=-1, cache_size2=l2, cache_size3 = 0;
 
   bool need_help = false;
-  for (int i=1; i<argc; ++i)
+  for (int i=1; i<argc;)
   {
-    if(argv[i][0]=='s')
-      s = atoi(argv[i]+1);
-    else if(argv[i][0]=='c')
-      cache_size = atoi(argv[i]+1);
-    else if(argv[i][0]=='t')
-      tries = atoi(argv[i]+1);
-    else if(argv[i][0]=='p')
-      rep = atoi(argv[i]+1);
+    if(argv[i][0]=='-')
+    {
+      if(argv[i][1]=='s')
+      {
+        ++i;
+        s = atoi(argv[i++]);
+        m = n = p = s;
+        if(argv[i][0]!='-')
+        {
+          n = atoi(argv[i++]);
+          p = atoi(argv[i++]);
+        }
+      }
+      else if(argv[i][1]=='c')
+      {
+        ++i;
+        cache_size1 = atoi(argv[i++]);
+        if(argv[i][0]!='-')
+        {
+          cache_size2 = atoi(argv[i++]);
+          if(argv[i][0]!='-')
+            cache_size3 = atoi(argv[i++]);
+        }
+      }
+      else if(argv[i][1]=='t')
+      {
+        ++i;
+        tries = atoi(argv[i++]);
+      }
+      else if(argv[i][1]=='p')
+      {
+        ++i;
+        rep = atoi(argv[i++]);
+      }
+    }
     else
+    {
       need_help = true;
+      break;
+    }
   }
 
   if(need_help)
   {
-    std::cout << argv[0] << " s<matrix size> c<cache size> t<nb tries> p<nb repeats>\n";
+    std::cout << argv[0] << " -s <matrix sizes> -c <cache sizes> -t <nb tries> -p <nb repeats>\n";
+    std::cout << "   <matrix sizes> : size\n";
+    std::cout << "   <matrix sizes> : rows columns depth\n";
     return 1;
   }
 
-  if(cache_size>0)
-    setCpuCacheSizes(cache_size,96*cache_size);
-
-  int m = s;
-  int n = s;
-  int p = s;
+#if EIGEN_VERSION_AT_LEAST(3,2,90)
+  if(cache_size1>0)
+    setCpuCacheSizes(cache_size1,cache_size2,cache_size3);
+#endif
+  
   A a(m,p); a.setRandom();
   B b(p,n); b.setRandom();
   C c(m,n); c.setOnes();
@@ -172,6 +222,7 @@ int main(int argc, char ** argv)
 
   // check the parallel product is correct
   #if defined EIGEN_HAS_OPENMP
+  Eigen::initParallel();
   int procs = omp_get_max_threads();
   if(procs>1)
   {
@@ -188,11 +239,20 @@ int main(int argc, char ** argv)
   #elif defined HAVE_BLAS
     blas_gemm(a,b,r);
     c.noalias() += a * b;
-    if(!r.isApprox(c)) std::cerr << "Warning, your product is crap!\n\n";
+    if(!r.isApprox(c)) {
+      std::cout << r  - c << "\n";
+      std::cerr << "Warning, your product is crap!\n\n";
+    }
   #else
-    gemm(a,b,c);
-    r.noalias() += a.cast<Scalar>() * b.cast<Scalar>();
-    if(!r.isApprox(c)) std::cerr << "Warning, your product is crap!\n\n";
+    if(1.*m*n*p<2000.*2000*2000)
+    {
+      gemm(a,b,c);
+      r.noalias() += a.cast<Scalar>() .lazyProduct( b.cast<Scalar>() );
+      if(!r.isApprox(c)) {
+        std::cout << r - c << "\n";
+        std::cerr << "Warning, your product is crap!\n\n";
+      }
+    }
   #endif
 
   #ifdef HAVE_BLAS
@@ -214,7 +274,7 @@ int main(int argc, char ** argv)
   {
     BenchTimer tmono;
     omp_set_num_threads(1);
-    Eigen::internal::setNbThreads(1);
+    Eigen::setNbThreads(1);
     c = rc;
     BENCH(tmono, tries, rep, gemm(a,b,c));
     std::cout << "eigen mono cpu    " << tmono.best(CPU_TIMER)/rep  << "s  \t" << (double(m)*n*p*rep*2/tmono.best(CPU_TIMER))*1e-9  <<  " GFLOPS \t(" << tmono.total(CPU_TIMER)  << "s)\n";
@@ -223,6 +283,15 @@ int main(int argc, char ** argv)
   }
   #endif
   
+  if(1.*m*n*p<30*30*30)
+  {
+      BenchTimer tmt;
+      c = rc;
+      BENCH(tmt, tries, rep, c.noalias()+=a.lazyProduct(b));
+      std::cout << "lazy cpu         " << tmt.best(CPU_TIMER)/rep  << "s  \t" << (double(m)*n*p*rep*2/tmt.best(CPU_TIMER))*1e-9  <<  " GFLOPS \t(" << tmt.total(CPU_TIMER)  << "s)\n";
+      std::cout << "lazy real        " << tmt.best(REAL_TIMER)/rep << "s  \t" << (double(m)*n*p*rep*2/tmt.best(REAL_TIMER))*1e-9 <<  " GFLOPS \t(" << tmt.total(REAL_TIMER) << "s)\n";
+  }
+  
   #ifdef DECOUPLED
   if((NumTraits<A::Scalar>::IsComplex) && (NumTraits<B::Scalar>::IsComplex))
   {
diff --git a/bench/bench_norm.cpp b/bench/bench_norm.cpp
index 806db292c..129afcfb2 100644
--- a/bench/bench_norm.cpp
+++ b/bench/bench_norm.cpp
@@ -6,19 +6,25 @@ using namespace Eigen;
 using namespace std;
 
 template<typename T>
-EIGEN_DONT_INLINE typename T::Scalar sqsumNorm(const T& v)
+EIGEN_DONT_INLINE typename T::Scalar sqsumNorm(T& v)
 {
   return v.norm();
 }
 
 template<typename T>
-EIGEN_DONT_INLINE typename T::Scalar hypotNorm(const T& v)
+EIGEN_DONT_INLINE typename T::Scalar stableNorm(T& v)
+{
+  return v.stableNorm();
+}
+
+template<typename T>
+EIGEN_DONT_INLINE typename T::Scalar hypotNorm(T& v)
 {
   return v.hypotNorm();
 }
 
 template<typename T>
-EIGEN_DONT_INLINE typename T::Scalar blueNorm(const T& v)
+EIGEN_DONT_INLINE typename T::Scalar blueNorm(T& v)
 {
   return v.blueNorm();
 }
@@ -32,25 +38,25 @@ EIGEN_DONT_INLINE typename T::Scalar lapackNorm(T& v)
   Scalar ssq = 1;
   for (int i=0;i<n;++i)
   {
-    Scalar ax = internal::abs(v.coeff(i));
+    Scalar ax = std::abs(v.coeff(i));
     if (scale >= ax)
     {
-      ssq += internal::abs2(ax/scale);
+      ssq += numext::abs2(ax/scale);
     }
     else
     {
-      ssq = Scalar(1) + ssq * internal::abs2(scale/ax);
+      ssq = Scalar(1) + ssq * numext::abs2(scale/ax);
       scale = ax;
     }
   }
-  return scale * internal::sqrt(ssq);
+  return scale * std::sqrt(ssq);
 }
 
 template<typename T>
 EIGEN_DONT_INLINE typename T::Scalar twopassNorm(T& v)
 {
   typedef typename T::Scalar Scalar;
-  Scalar s = v.cwise().abs().maxCoeff();
+  Scalar s = v.array().abs().maxCoeff();
   return s*(v/s).norm();
 }
 
@@ -73,16 +79,20 @@ EIGEN_DONT_INLINE typename T::Scalar divacNorm(T& v)
       v(i) = v(2*i) + v(2*i+1);
     n = n/2;
   }
-  return internal::sqrt(v(0));
+  return std::sqrt(v(0));
 }
 
+namespace Eigen {
+namespace internal {
 #ifdef EIGEN_VECTORIZE
-Packet4f internal::plt(const Packet4f& a, Packet4f& b) { return _mm_cmplt_ps(a,b); }
-Packet2d internal::plt(const Packet2d& a, Packet2d& b) { return _mm_cmplt_pd(a,b); }
+Packet4f plt(const Packet4f& a, Packet4f& b) { return _mm_cmplt_ps(a,b); }
+Packet2d plt(const Packet2d& a, Packet2d& b) { return _mm_cmplt_pd(a,b); }
 
-Packet4f internal::pandnot(const Packet4f& a, Packet4f& b) { return _mm_andnot_ps(a,b); }
-Packet2d internal::pandnot(const Packet2d& a, Packet2d& b) { return _mm_andnot_pd(a,b); }
+Packet4f pandnot(const Packet4f& a, Packet4f& b) { return _mm_andnot_ps(a,b); }
+Packet2d pandnot(const Packet2d& a, Packet2d& b) { return _mm_andnot_pd(a,b); }
 #endif
+}
+}
 
 template<typename T>
 EIGEN_DONT_INLINE typename T::Scalar pblueNorm(const T& v)
@@ -126,7 +136,7 @@ EIGEN_DONT_INLINE typename T::Scalar pblueNorm(const T& v)
 
     overfl  = rbig*s2m;          // overfow boundary for abig
     eps     = std::pow(ibeta, 1-it);
-    relerr  = internal::sqrt(eps);      // tolerance for neglecting asml
+    relerr  = std::sqrt(eps);      // tolerance for neglecting asml
     abig    = 1.0/eps - 1.0;
     if (Scalar(nbig)>abig)  nmax = abig;  // largest safe n
     else                    nmax = nbig;
@@ -134,13 +144,13 @@ EIGEN_DONT_INLINE typename T::Scalar pblueNorm(const T& v)
 
   typedef typename internal::packet_traits<Scalar>::type Packet;
   const int ps = internal::packet_traits<Scalar>::size;
-  Packet pasml = internal::pset1(Scalar(0));
-  Packet pamed = internal::pset1(Scalar(0));
-  Packet pabig = internal::pset1(Scalar(0));
-  Packet ps2m = internal::pset1(s2m);
-  Packet ps1m = internal::pset1(s1m);
-  Packet pb2  = internal::pset1(b2);
-  Packet pb1  = internal::pset1(b1);
+  Packet pasml = internal::pset1<Packet>(Scalar(0));
+  Packet pamed = internal::pset1<Packet>(Scalar(0));
+  Packet pabig = internal::pset1<Packet>(Scalar(0));
+  Packet ps2m = internal::pset1<Packet>(s2m);
+  Packet ps1m = internal::pset1<Packet>(s1m);
+  Packet pb2  = internal::pset1<Packet>(b2);
+  Packet pb1  = internal::pset1<Packet>(b1);
   for(int j=0; j<v.size(); j+=ps)
   {
     Packet ax = internal::pabs(v.template packet<Aligned>(j));
@@ -170,7 +180,7 @@ EIGEN_DONT_INLINE typename T::Scalar pblueNorm(const T& v)
   Scalar amed = internal::predux(pamed);
   if(abig > Scalar(0))
   {
-    abig = internal::sqrt(abig);
+    abig = std::sqrt(abig);
     if(abig > overfl)
     {
       eigen_assert(false && "overflow");
@@ -179,7 +189,7 @@ EIGEN_DONT_INLINE typename T::Scalar pblueNorm(const T& v)
     if(amed > Scalar(0))
     {
       abig = abig/s2m;
-      amed = internal::sqrt(amed);
+      amed = std::sqrt(amed);
     }
     else
     {
@@ -191,55 +201,56 @@ EIGEN_DONT_INLINE typename T::Scalar pblueNorm(const T& v)
   {
     if (amed > Scalar(0))
     {
-      abig = internal::sqrt(amed);
-      amed = internal::sqrt(asml) / s1m;
+      abig = std::sqrt(amed);
+      amed = std::sqrt(asml) / s1m;
     }
     else
     {
-      return internal::sqrt(asml)/s1m;
+      return std::sqrt(asml)/s1m;
     }
   }
   else
   {
-    return internal::sqrt(amed);
+    return std::sqrt(amed);
   }
   asml = std::min(abig, amed);
   abig = std::max(abig, amed);
   if(asml <= abig*relerr)
     return abig;
   else
-    return abig * internal::sqrt(Scalar(1) + internal::abs2(asml/abig));
+    return abig * std::sqrt(Scalar(1) + numext::abs2(asml/abig));
   #endif
 }
 
 #define BENCH_PERF(NRM) { \
+  float af = 0; double ad = 0; std::complex<float> ac = 0; \
   Eigen::BenchTimer tf, td, tcf; tf.reset(); td.reset(); tcf.reset();\
   for (int k=0; k<tries; ++k) { \
     tf.start(); \
-    for (int i=0; i<iters; ++i) NRM(vf); \
+    for (int i=0; i<iters; ++i) { af += NRM(vf); } \
     tf.stop(); \
   } \
   for (int k=0; k<tries; ++k) { \
     td.start(); \
-    for (int i=0; i<iters; ++i) NRM(vd); \
+    for (int i=0; i<iters; ++i) { ad += NRM(vd); } \
     td.stop(); \
   } \
-  for (int k=0; k<std::max(1,tries/3); ++k) { \
+  /*for (int k=0; k<std::max(1,tries/3); ++k) { \
     tcf.start(); \
-    for (int i=0; i<iters; ++i) NRM(vcf); \
+    for (int i=0; i<iters; ++i) { ac += NRM(vcf); } \
     tcf.stop(); \
-  } \
+  } */\
   std::cout << #NRM << "\t" << tf.value() << "   " << td.value() <<  "    " << tcf.value() << "\n"; \
 }
 
 void check_accuracy(double basef, double based, int s)
 {
-  double yf = basef * internal::abs(internal::random<double>());
-  double yd = based * internal::abs(internal::random<double>());
+  double yf = basef * std::abs(internal::random<double>());
+  double yd = based * std::abs(internal::random<double>());
   VectorXf vf = VectorXf::Ones(s) * yf;
   VectorXd vd = VectorXd::Ones(s) * yd;
 
-  std::cout << "reference\t" << internal::sqrt(double(s))*yf << "\t" << internal::sqrt(double(s))*yd << "\n";
+  std::cout << "reference\t" << std::sqrt(double(s))*yf << "\t" << std::sqrt(double(s))*yd << "\n";
   std::cout << "sqsumNorm\t" << sqsumNorm(vf) << "\t" << sqsumNorm(vd) << "\n";
   std::cout << "hypotNorm\t" << hypotNorm(vf) << "\t" << hypotNorm(vd) << "\n";
   std::cout << "blueNorm\t" << blueNorm(vf) << "\t" << blueNorm(vd) << "\n";
@@ -255,8 +266,8 @@ void check_accuracy_var(int ef0, int ef1, int ed0, int ed1, int s)
   VectorXd vd(s);
   for (int i=0; i<s; ++i)
   {
-    vf[i] = internal::abs(internal::random<double>()) * std::pow(double(10), internal::random<int>(ef0,ef1));
-    vd[i] = internal::abs(internal::random<double>()) * std::pow(double(10), internal::random<int>(ed0,ed1));
+    vf[i] = std::abs(internal::random<double>()) * std::pow(double(10), internal::random<int>(ef0,ef1));
+    vd[i] = std::abs(internal::random<double>()) * std::pow(double(10), internal::random<int>(ed0,ed1));
   }
 
   //std::cout << "reference\t" << internal::sqrt(double(s))*yf << "\t" << internal::sqrt(double(s))*yd << "\n";
@@ -312,34 +323,38 @@ int main(int argc, char** argv)
     std::cout << "\n";
   }
 
+  y = 1;
   std::cout.precision(4);
-  std::cerr << "Performance (out of cache):\n";
+  int s1 = 1024*1024*32;
+  std::cerr << "Performance (out of cache, " << s1 << "):\n";
   {
     int iters = 1;
-    VectorXf vf = VectorXf::Random(1024*1024*32) * y;
-    VectorXd vd = VectorXd::Random(1024*1024*32) * y;
-    VectorXcf vcf = VectorXcf::Random(1024*1024*32) * y;
+    VectorXf vf = VectorXf::Random(s1) * y;
+    VectorXd vd = VectorXd::Random(s1) * y;
+    VectorXcf vcf = VectorXcf::Random(s1) * y;
     BENCH_PERF(sqsumNorm);
+    BENCH_PERF(stableNorm);
     BENCH_PERF(blueNorm);
-//     BENCH_PERF(pblueNorm);
-//     BENCH_PERF(lapackNorm);
-//     BENCH_PERF(hypotNorm);
-//     BENCH_PERF(twopassNorm);
+    BENCH_PERF(pblueNorm);
+    BENCH_PERF(lapackNorm);
+    BENCH_PERF(hypotNorm);
+    BENCH_PERF(twopassNorm);
     BENCH_PERF(bl2passNorm);
   }
 
-  std::cerr << "\nPerformance (in cache):\n";
+  std::cerr << "\nPerformance (in cache, " << 512 << "):\n";
   {
     int iters = 100000;
     VectorXf vf = VectorXf::Random(512) * y;
     VectorXd vd = VectorXd::Random(512) * y;
     VectorXcf vcf = VectorXcf::Random(512) * y;
     BENCH_PERF(sqsumNorm);
+    BENCH_PERF(stableNorm);
     BENCH_PERF(blueNorm);
-//     BENCH_PERF(pblueNorm);
-//     BENCH_PERF(lapackNorm);
-//     BENCH_PERF(hypotNorm);
-//     BENCH_PERF(twopassNorm);
+    BENCH_PERF(pblueNorm);
+    BENCH_PERF(lapackNorm);
+    BENCH_PERF(hypotNorm);
+    BENCH_PERF(twopassNorm);
     BENCH_PERF(bl2passNorm);
   }
 }
diff --git a/bench/benchmark-blocking-sizes.cpp b/bench/benchmark-blocking-sizes.cpp
new file mode 100644
index 000000000..827be2880
--- /dev/null
+++ b/bench/benchmark-blocking-sizes.cpp
@@ -0,0 +1,677 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Jacob <benoitjacob@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include <iostream>
+#include <cstdint>
+#include <cstdlib>
+#include <vector>
+#include <fstream>
+#include <memory>
+#include <cstdio>
+
+bool eigen_use_specific_block_size;
+int eigen_block_size_k, eigen_block_size_m, eigen_block_size_n;
+#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZES eigen_use_specific_block_size
+#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K eigen_block_size_k
+#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M eigen_block_size_m
+#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N eigen_block_size_n
+#include <Eigen/Core>
+
+#include <bench/BenchTimer.h>
+
+using namespace Eigen;
+using namespace std;
+
+static BenchTimer timer;
+
+// how many times we repeat each measurement.
+// measurements are randomly shuffled - we're not doing
+// all N identical measurements in a row.
+const int measurement_repetitions = 3;
+
+// Timings below this value are too short to be accurate,
+// we'll repeat measurements with more iterations until
+// we get a timing above that threshold.
+const float min_accurate_time = 1e-2f;
+
+// See --min-working-set-size command line parameter.
+size_t min_working_set_size = 0;
+
+float max_clock_speed = 0.0f;
+
+// range of sizes that we will benchmark (in all 3 K,M,N dimensions)
+const size_t maxsize = 2048;
+const size_t minsize = 16;
+
+typedef MatrixXf MatrixType;
+typedef MatrixType::Scalar Scalar;
+typedef internal::packet_traits<Scalar>::type Packet;
+
+static_assert((maxsize & (maxsize - 1)) == 0, "maxsize must be a power of two");
+static_assert((minsize & (minsize - 1)) == 0, "minsize must be a power of two");
+static_assert(maxsize > minsize, "maxsize must be larger than minsize");
+static_assert(maxsize < (minsize << 16), "maxsize must be less than (minsize<<16)");
+
+// just a helper to store a triple of K,M,N sizes for matrix product
+struct size_triple_t
+{
+  size_t k, m, n;
+  size_triple_t() : k(0), m(0), n(0) {}
+  size_triple_t(size_t _k, size_t _m, size_t _n) : k(_k), m(_m), n(_n) {}
+  size_triple_t(const size_triple_t& o) : k(o.k), m(o.m), n(o.n) {}
+  size_triple_t(uint16_t compact)
+  {
+    k = 1 << ((compact & 0xf00) >> 8);
+    m = 1 << ((compact & 0x0f0) >> 4);
+    n = 1 << ((compact & 0x00f) >> 0);
+  }
+};
+
+uint8_t log2_pot(size_t x) {
+  size_t l = 0;
+  while (x >>= 1) l++;
+  return l;
+}
+
+// Convert between size tripes and a compact form fitting in 12 bits
+// where each size, which must be a POT, is encoded as its log2, on 4 bits
+// so the largest representable size is 2^15 == 32k  ... big enough.
+uint16_t compact_size_triple(size_t k, size_t m, size_t n)
+{
+  return (log2_pot(k) << 8) | (log2_pot(m) << 4) | log2_pot(n);
+}
+
+uint16_t compact_size_triple(const size_triple_t& t)
+{
+  return compact_size_triple(t.k, t.m, t.n);
+}
+
+// A single benchmark. Initially only contains benchmark params.
+// Then call run(), which stores the result in the gflops field.
+struct benchmark_t
+{
+  uint16_t compact_product_size;
+  uint16_t compact_block_size;
+  bool use_default_block_size;
+  float gflops;
+  benchmark_t()
+    : compact_product_size(0)
+    , compact_block_size(0)
+    , use_default_block_size(false)
+    , gflops(0)
+  {
+  }
+  benchmark_t(size_t pk, size_t pm, size_t pn,
+              size_t bk, size_t bm, size_t bn)
+    : compact_product_size(compact_size_triple(pk, pm, pn))
+    , compact_block_size(compact_size_triple(bk, bm, bn))
+    , use_default_block_size(false)
+    , gflops(0)
+  {}
+  benchmark_t(size_t pk, size_t pm, size_t pn)
+    : compact_product_size(compact_size_triple(pk, pm, pn))
+    , compact_block_size(0)
+    , use_default_block_size(true)
+    , gflops(0)
+  {}
+
+  void run();
+};
+
+ostream& operator<<(ostream& s, const benchmark_t& b)
+{
+  s << hex << b.compact_product_size << dec;
+  if (b.use_default_block_size) {
+    size_triple_t t(b.compact_product_size);
+    Index k = t.k, m = t.m, n = t.n;
+    internal::computeProductBlockingSizes<Scalar, Scalar>(k, m, n);
+    s << " default(" << k << ", " << m << ", " << n << ")";
+  } else {
+    s << " " << hex << b.compact_block_size << dec;
+  }
+  s << " " << b.gflops;
+  return s;
+}
+
+// We sort first by increasing benchmark parameters,
+// then by decreasing performance.
+bool operator<(const benchmark_t& b1, const benchmark_t& b2)
+{ 
+  return b1.compact_product_size < b2.compact_product_size ||
+           (b1.compact_product_size == b2.compact_product_size && (
+             (b1.compact_block_size < b2.compact_block_size || (
+               b1.compact_block_size == b2.compact_block_size &&
+                 b1.gflops > b2.gflops))));
+}
+
+void benchmark_t::run()
+{
+  size_triple_t productsizes(compact_product_size);
+
+  if (use_default_block_size) {
+    eigen_use_specific_block_size = false;
+  } else {
+    // feed eigen with our custom blocking params
+    eigen_use_specific_block_size = true;
+    size_triple_t blocksizes(compact_block_size);
+    eigen_block_size_k = blocksizes.k;
+    eigen_block_size_m = blocksizes.m;
+    eigen_block_size_n = blocksizes.n;
+  }
+
+  // set up the matrix pool
+
+  const size_t combined_three_matrices_sizes =
+    sizeof(Scalar) *
+      (productsizes.k * productsizes.m +
+       productsizes.k * productsizes.n +
+       productsizes.m * productsizes.n);
+
+  // 64 M is large enough that nobody has a cache bigger than that,
+  // while still being small enough that everybody has this much RAM,
+  // so conveniently we don't need to special-case platforms here.
+  const size_t unlikely_large_cache_size = 64 << 20;
+
+  const size_t working_set_size =
+    min_working_set_size ? min_working_set_size : unlikely_large_cache_size;
+
+  const size_t matrix_pool_size =
+    1 + working_set_size / combined_three_matrices_sizes;
+
+  MatrixType *lhs = new MatrixType[matrix_pool_size];
+  MatrixType *rhs = new MatrixType[matrix_pool_size];
+  MatrixType *dst = new MatrixType[matrix_pool_size];
+  
+  for (size_t i = 0; i < matrix_pool_size; i++) {
+    lhs[i] = MatrixType::Zero(productsizes.m, productsizes.k);
+    rhs[i] = MatrixType::Zero(productsizes.k, productsizes.n);
+    dst[i] = MatrixType::Zero(productsizes.m, productsizes.n);
+  }
+
+  // main benchmark loop
+
+  int iters_at_a_time = 1;
+  float time_per_iter = 0.0f;
+  size_t matrix_index = 0;
+  while (true) {
+
+    double starttime = timer.getCpuTime();
+    for (int i = 0; i < iters_at_a_time; i++) {
+      dst[matrix_index].noalias() = lhs[matrix_index] * rhs[matrix_index];
+      matrix_index++;
+      if (matrix_index == matrix_pool_size) {
+        matrix_index = 0;
+      }
+    }
+    double endtime = timer.getCpuTime();
+
+    const float timing = float(endtime - starttime);
+
+    if (timing >= min_accurate_time) {
+      time_per_iter = timing / iters_at_a_time;
+      break;
+    }
+
+    iters_at_a_time *= 2;
+  }
+
+  delete[] lhs;
+  delete[] rhs;
+  delete[] dst;
+
+  gflops = 2e-9 * productsizes.k * productsizes.m * productsizes.n / time_per_iter;
+}
+
+void print_cpuinfo()
+{
+#ifdef __linux__
+  cout << "contents of /proc/cpuinfo:" << endl;
+  string line;
+  ifstream cpuinfo("/proc/cpuinfo");
+  if (cpuinfo.is_open()) {
+    while (getline(cpuinfo, line)) {
+      cout << line << endl;
+    }
+    cpuinfo.close();
+  }
+  cout << endl;
+#elif defined __APPLE__
+  cout << "output of sysctl hw:" << endl;
+  system("sysctl hw");
+  cout << endl;
+#endif
+}
+
+template <typename T>
+string type_name()
+{
+  return "unknown";
+}
+
+template<>
+string type_name<float>()
+{
+  return "float";
+}
+
+template<>
+string type_name<double>()
+{
+  return "double";
+}
+
+struct action_t
+{
+  virtual const char* invokation_name() const { abort(); return nullptr; }
+  virtual void run() const { abort(); }
+  virtual ~action_t() {}
+};
+
+void show_usage_and_exit(int /*argc*/, char* argv[],
+                         const vector<unique_ptr<action_t>>& available_actions)
+{
+  cerr << "usage: " << argv[0] << " <action> [options...]" << endl << endl;
+  cerr << "available actions:" << endl << endl;
+  for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
+    cerr << "  " << (*it)->invokation_name() << endl;
+  }
+  cerr << endl;
+  cerr << "options:" << endl << endl;
+  cerr << "  --min-working-set-size=N:" << endl;
+  cerr << "       Set the minimum working set size to N bytes." << endl;
+  cerr << "       This is rounded up as needed to a multiple of matrix size." << endl;
+  cerr << "       A larger working set lowers the chance of a warm cache." << endl;
+  cerr << "       The default value 0 means use a large enough working" << endl;
+  cerr << "       set to likely outsize caches." << endl;
+  cerr << "       A value of 1 (that is, 1 byte) would mean don't do anything to" << endl;
+  cerr << "       avoid warm caches." << endl;
+  exit(1);
+}
+     
+float measure_clock_speed()
+{
+  cerr << "Measuring clock speed...                              \r" << flush;
+          
+  vector<float> all_gflops;
+  for (int i = 0; i < 8; i++) {
+    benchmark_t b(1024, 1024, 1024);
+    b.run();
+    all_gflops.push_back(b.gflops);
+  }
+
+  sort(all_gflops.begin(), all_gflops.end());
+  float stable_estimate = all_gflops[2] + all_gflops[3] + all_gflops[4] + all_gflops[5];
+
+  // multiply by an arbitrary constant to discourage trying doing anything with the
+  // returned values besides just comparing them with each other.
+  float result = stable_estimate * 123.456f;
+
+  return result;
+}
+
+struct human_duration_t
+{
+  int seconds;
+  human_duration_t(int s) : seconds(s) {}
+};
+
+ostream& operator<<(ostream& s, const human_duration_t& d)
+{
+  int remainder = d.seconds;
+  if (remainder > 3600) {
+    int hours = remainder / 3600;
+    s << hours << " h ";
+    remainder -= hours * 3600;
+  }
+  if (remainder > 60) {
+    int minutes = remainder / 60;
+    s << minutes << " min ";
+    remainder -= minutes * 60;
+  }
+  if (d.seconds < 600) {
+    s << remainder << " s";
+  }
+  return s;
+}
+
+const char session_filename[] = "/data/local/tmp/benchmark-blocking-sizes-session.data";
+
+void serialize_benchmarks(const char* filename, const vector<benchmark_t>& benchmarks, size_t first_benchmark_to_run)
+{
+  FILE* file = fopen(filename, "w");
+  if (!file) {
+    cerr << "Could not open file " << filename << " for writing." << endl;
+    cerr << "Do you have write permissions on the current working directory?" << endl;
+    exit(1);
+  }
+  size_t benchmarks_vector_size = benchmarks.size();
+  fwrite(&max_clock_speed, sizeof(max_clock_speed), 1, file);
+  fwrite(&benchmarks_vector_size, sizeof(benchmarks_vector_size), 1, file);
+  fwrite(&first_benchmark_to_run, sizeof(first_benchmark_to_run), 1, file);
+  fwrite(benchmarks.data(), sizeof(benchmark_t), benchmarks.size(), file);
+  fclose(file);
+}
+
+bool deserialize_benchmarks(const char* filename, vector<benchmark_t>& benchmarks, size_t& first_benchmark_to_run)
+{
+  FILE* file = fopen(filename, "r");
+  if (!file) {
+    return false;
+  }
+  if (1 != fread(&max_clock_speed, sizeof(max_clock_speed), 1, file)) {
+    return false;
+  }
+  size_t benchmarks_vector_size = 0;
+  if (1 != fread(&benchmarks_vector_size, sizeof(benchmarks_vector_size), 1, file)) {
+    return false;
+  }
+  if (1 != fread(&first_benchmark_to_run, sizeof(first_benchmark_to_run), 1, file)) {
+    return false;
+  }
+  benchmarks.resize(benchmarks_vector_size);
+  if (benchmarks.size() != fread(benchmarks.data(), sizeof(benchmark_t), benchmarks.size(), file)) {
+    return false;
+  }
+  unlink(filename);
+  return true;
+}
+
+void try_run_some_benchmarks(
+  vector<benchmark_t>& benchmarks,
+  double time_start,
+  size_t& first_benchmark_to_run)
+{
+  if (first_benchmark_to_run == benchmarks.size()) {
+    return;
+  }
+
+  double time_last_progress_update = 0;
+  double time_last_clock_speed_measurement = 0;
+  double time_now = 0;
+
+  size_t benchmark_index = first_benchmark_to_run;
+
+  while (true) {
+    float ratio_done = float(benchmark_index) / benchmarks.size();
+    time_now = timer.getRealTime();
+
+    // We check clock speed every minute and at the end.
+    if (benchmark_index == benchmarks.size() ||
+        time_now > time_last_clock_speed_measurement + 60.0f)
+    {
+      time_last_clock_speed_measurement = time_now;
+
+      // Ensure that clock speed is as expected
+      float current_clock_speed = measure_clock_speed();
+
+      // The tolerance needs to be smaller than the relative difference between
+      // clock speeds that a device could operate under.
+      // It seems unlikely that a device would be throttling clock speeds by
+      // amounts smaller than 2%.
+      // With a value of 1%, I was getting within noise on a Sandy Bridge.
+      const float clock_speed_tolerance = 0.02f;
+
+      if (current_clock_speed > (1 + clock_speed_tolerance) * max_clock_speed) {
+        // Clock speed is now higher than we previously measured.
+        // Either our initial measurement was inaccurate, which won't happen
+        // too many times as we are keeping the best clock speed value and
+        // and allowing some tolerance; or something really weird happened,
+        // which invalidates all benchmark results collected so far.
+        // Either way, we better restart all over again now.
+        if (benchmark_index) {
+          cerr << "Restarting at " << 100.0f * ratio_done
+               << " % because clock speed increased.          " << endl;
+        }
+        max_clock_speed = current_clock_speed;
+        first_benchmark_to_run = 0;
+        return;
+      }
+
+      bool rerun_last_tests = false;
+
+      if (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) {
+        cerr << "Measurements completed so far: "
+             << 100.0f * ratio_done
+             << " %                             " << endl;
+        cerr << "Clock speed seems to be only "
+             << current_clock_speed/max_clock_speed
+             << " times what it used to be." << endl;
+
+        unsigned int seconds_to_sleep_if_lower_clock_speed = 1;
+
+        while (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) {
+          if (seconds_to_sleep_if_lower_clock_speed > 32) {
+            cerr << "Sleeping longer probably won't make a difference." << endl;
+            cerr << "Serializing benchmarks to " << session_filename << endl;
+            serialize_benchmarks(session_filename, benchmarks, first_benchmark_to_run);
+            cerr << "Now restart this benchmark, and it should pick up where we left." << endl;
+            exit(2);
+          }
+          rerun_last_tests = true;
+          cerr << "Sleeping "
+               << seconds_to_sleep_if_lower_clock_speed
+               << " s...                                   \r" << endl;
+          sleep(seconds_to_sleep_if_lower_clock_speed);
+          current_clock_speed = measure_clock_speed();
+          seconds_to_sleep_if_lower_clock_speed *= 2;
+        }
+      }
+
+      if (rerun_last_tests) {
+        cerr << "Redoing the last "
+             << 100.0f * float(benchmark_index - first_benchmark_to_run) / benchmarks.size()
+             << " % because clock speed had been low.   " << endl;
+        return;
+      }
+
+      // nothing wrong with the clock speed so far, so there won't be a need to rerun
+      // benchmarks run so far in case we later encounter a lower clock speed.
+      first_benchmark_to_run = benchmark_index;
+    }
+
+    if (benchmark_index == benchmarks.size()) {
+      // We're done!
+      first_benchmark_to_run = benchmarks.size();
+      // Erase progress info
+      cerr << "                                                            " << endl;
+      return;
+    }
+
+    // Display progress info on stderr
+    if (time_now > time_last_progress_update + 1.0f) {
+      time_last_progress_update = time_now;
+      cerr << "Measurements... " << 100.0f * ratio_done
+           << " %, ETA "
+           << human_duration_t(float(time_now - time_start) * (1.0f - ratio_done) / ratio_done)
+           << "                          \r" << flush;
+    }
+
+    // This is where we actually run a benchmark!
+    benchmarks[benchmark_index].run();
+    benchmark_index++;
+  }
+}
+
+void run_benchmarks(vector<benchmark_t>& benchmarks)
+{
+  size_t first_benchmark_to_run;
+  vector<benchmark_t> deserialized_benchmarks;
+  bool use_deserialized_benchmarks = false;
+  if (deserialize_benchmarks(session_filename, deserialized_benchmarks, first_benchmark_to_run)) {
+    cerr << "Found serialized session with "
+         << 100.0f * first_benchmark_to_run / deserialized_benchmarks.size()
+         << " % already done" << endl;
+    if (deserialized_benchmarks.size() == benchmarks.size() &&
+        first_benchmark_to_run > 0 &&
+        first_benchmark_to_run < benchmarks.size())
+    {
+      use_deserialized_benchmarks = true;
+    }
+  }
+
+  if (use_deserialized_benchmarks) {
+    benchmarks = deserialized_benchmarks;
+  } else {
+    // not using deserialized benchmarks, starting from scratch
+    first_benchmark_to_run = 0;
+
+    // Randomly shuffling benchmarks allows us to get accurate enough progress info,
+    // as now the cheap/expensive benchmarks are randomly mixed so they average out.
+    // It also means that if data is corrupted for some time span, the odds are that
+    // not all repetitions of a given benchmark will be corrupted.
+    random_shuffle(benchmarks.begin(), benchmarks.end());
+  }
+
+  for (int i = 0; i < 4; i++) {
+    max_clock_speed = max(max_clock_speed, measure_clock_speed());
+  }
+  
+  double time_start = 0.0;
+  while (first_benchmark_to_run < benchmarks.size()) {
+    if (first_benchmark_to_run == 0) {
+      time_start = timer.getRealTime();
+    }
+    try_run_some_benchmarks(benchmarks,
+                            time_start,
+                            first_benchmark_to_run);
+  }
+
+  // Sort timings by increasing benchmark parameters, and decreasing gflops.
+  // The latter is very important. It means that we can ignore all but the first
+  // benchmark with given parameters.
+  sort(benchmarks.begin(), benchmarks.end());
+
+  // Collect best (i.e. now first) results for each parameter values.
+  vector<benchmark_t> best_benchmarks;
+  for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
+    if (best_benchmarks.empty() ||
+        best_benchmarks.back().compact_product_size != it->compact_product_size ||
+        best_benchmarks.back().compact_block_size != it->compact_block_size)
+    {
+      best_benchmarks.push_back(*it);
+    }
+  }
+
+  // keep and return only the best benchmarks
+  benchmarks = best_benchmarks;
+}
+
+struct measure_all_pot_sizes_action_t : action_t
+{
+  virtual const char* invokation_name() const { return "all-pot-sizes"; }
+  virtual void run() const
+  {
+    vector<benchmark_t> benchmarks;
+    for (int repetition = 0; repetition < measurement_repetitions; repetition++) {
+      for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) {
+        for (size_t msize = minsize; msize <= maxsize; msize *= 2) {
+          for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) {
+            for (size_t kblock = minsize; kblock <= ksize; kblock *= 2) {
+              for (size_t mblock = minsize; mblock <= msize; mblock *= 2) {
+                for (size_t nblock = minsize; nblock <= nsize; nblock *= 2) {
+                  benchmarks.emplace_back(ksize, msize, nsize, kblock, mblock, nblock);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+
+    run_benchmarks(benchmarks);
+
+    cout << "BEGIN MEASUREMENTS ALL POT SIZES" << endl;
+    for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
+      cout << *it << endl;
+    }
+  }
+};
+
+struct measure_default_sizes_action_t : action_t
+{
+  virtual const char* invokation_name() const { return "default-sizes"; }
+  virtual void run() const
+  {
+    vector<benchmark_t> benchmarks;
+    for (int repetition = 0; repetition < measurement_repetitions; repetition++) {
+      for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) {
+        for (size_t msize = minsize; msize <= maxsize; msize *= 2) {
+          for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) {
+            benchmarks.emplace_back(ksize, msize, nsize);
+          }
+        }
+      }
+    }
+
+    run_benchmarks(benchmarks);
+
+    cout << "BEGIN MEASUREMENTS DEFAULT SIZES" << endl;
+    for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
+      cout << *it << endl;
+    }
+  }
+};
+
+int main(int argc, char* argv[])
+{
+  double time_start = timer.getRealTime();
+  cout.precision(4);
+  cerr.precision(4);
+
+  vector<unique_ptr<action_t>> available_actions;
+  available_actions.emplace_back(new measure_all_pot_sizes_action_t);
+  available_actions.emplace_back(new measure_default_sizes_action_t);
+
+  auto action = available_actions.end();
+
+  if (argc <= 1) {
+    show_usage_and_exit(argc, argv, available_actions);
+  }
+  for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
+    if (!strcmp(argv[1], (*it)->invokation_name())) {
+      action = it;
+      break;
+    }
+  }
+
+  if (action == available_actions.end()) {
+    show_usage_and_exit(argc, argv, available_actions);
+  }
+
+  for (int i = 2; i < argc; i++) {
+    if (argv[i] == strstr(argv[i], "--min-working-set-size=")) {
+      const char* equals_sign = strchr(argv[i], '=');
+      min_working_set_size = strtoul(equals_sign+1, nullptr, 10);
+    } else {
+      cerr << "unrecognized option: " << argv[i] << endl << endl;
+      show_usage_and_exit(argc, argv, available_actions);
+    }
+  }
+
+  print_cpuinfo();
+
+  cout << "benchmark parameters:" << endl;
+  cout << "pointer size: " << 8*sizeof(void*) << " bits" << endl;
+  cout << "scalar type: " << type_name<Scalar>() << endl;
+  cout << "packet size: " << internal::packet_traits<MatrixType::Scalar>::size << endl;
+  cout << "minsize = " << minsize << endl;
+  cout << "maxsize = " << maxsize << endl;
+  cout << "measurement_repetitions = " << measurement_repetitions << endl;
+  cout << "min_accurate_time = " << min_accurate_time << endl;
+  cout << "min_working_set_size = " << min_working_set_size;
+  if (min_working_set_size == 0) {
+    cout << " (try to outsize caches)";
+  }
+  cout << endl << endl;
+
+  (*action)->run();
+
+  double time_end = timer.getRealTime();
+  cerr << "Finished in " << human_duration_t(time_end - time_start) << endl;
+}
diff --git a/bench/btl/CMakeLists.txt b/bench/btl/CMakeLists.txt
index 119b470d9..38ff9f483 100644
--- a/bench/btl/CMakeLists.txt
+++ b/bench/btl/CMakeLists.txt
@@ -11,29 +11,24 @@ SET(CMAKE_INCLUDE_CURRENT_DIR ON)
 
 string(REGEX MATCH icpc IS_ICPC ${CMAKE_CXX_COMPILER})
 IF(CMAKE_COMPILER_IS_GNUCXX OR IS_ICPC)
-  SET(CMAKE_CXX_FLAGS "-g0 -O3 -DNDEBUG")
-  SET(CMAKE_Fortran_FLAGS "-g0 -O3 -DNDEBUG")
-  IF(NOT BTL_NOVEC)
-    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse2")
-    SET(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -msse2")
-  ELSE(NOT BTL_NOVEC)
+  SET(CMAKE_CXX_FLAGS "-g0 -O3 -DNDEBUG ${CMAKE_CXX_FLAGS}")
+  SET(CMAKE_Fortran_FLAGS "-g0 -O3 -DNDEBUG ${CMAKE_Fortran_FLAGS}")
+  IF(BTL_NOVEC)
     SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DEIGEN_DONT_VECTORIZE")
-  ENDIF(NOT BTL_NOVEC)
+  ENDIF(BTL_NOVEC)
 ENDIF(CMAKE_COMPILER_IS_GNUCXX OR IS_ICPC)
 
 IF(MSVC)
   SET(CMAKE_CXX_FLAGS " /O2 /Ot /GL /fp:fast -DNDEBUG")
 #   SET(CMAKE_Fortran_FLAGS "-g0 -O3 -DNDEBUG")
-  IF(NOT BTL_NOVEC)
-    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:SSE2")
-  ELSE(NOT BTL_NOVEC)
+  IF(BTL_NOVEC)
     SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DEIGEN_DONT_VECTORIZE")
-  ENDIF(NOT BTL_NOVEC)
+  ENDIF(BTL_NOVEC)
 ENDIF(MSVC)
 
 if(IS_ICPC)
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fast")
-  set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fast")
+  set(CMAKE_CXX_FLAGS "-fast ${CMAKE_CXX_FLAGS}")
+  set(CMAKE_Fortran_FLAGS "-fast ${CMAKE_Fortran_FLAGS}")
 endif(IS_ICPC)
 
 include_directories(
@@ -48,6 +43,12 @@ include_directories(
 #   set(DEFAULT_LIBRARIES ${MKL_LIBRARIES})
 # endif (MKL_FOUND)
 
+find_library(EIGEN_BTL_RT_LIBRARY rt)
+# if we cannot find it easily, then we don't need it!
+if(NOT EIGEN_BTL_RT_LIBRARY)
+  set(EIGEN_BTL_RT_LIBRARY "")
+endif()
+
 MACRO(BTL_ADD_BENCH targetname)
 
   foreach(_current_var ${ARGN})
@@ -70,7 +71,7 @@ MACRO(BTL_ADD_BENCH targetname)
   IF(BUILD_${targetname})
     ADD_EXECUTABLE(${targetname} ${_sources})
     ADD_TEST(${targetname} "${targetname}")
-    target_link_libraries(${targetname} ${DEFAULT_LIBRARIES} rt)
+    target_link_libraries(${targetname} ${DEFAULT_LIBRARIES} ${EIGEN_BTL_RT_LIBRARY})
   ENDIF(BUILD_${targetname})
 
 ENDMACRO(BTL_ADD_BENCH)
@@ -91,6 +92,7 @@ ENABLE_TESTING()
 
 add_subdirectory(libs/eigen3)
 add_subdirectory(libs/eigen2)
+add_subdirectory(libs/tensors)
 add_subdirectory(libs/BLAS)
 add_subdirectory(libs/ublas)
 add_subdirectory(libs/gmm)
@@ -98,6 +100,7 @@ add_subdirectory(libs/mtl4)
 add_subdirectory(libs/blitz)
 add_subdirectory(libs/tvmet)
 add_subdirectory(libs/STL)
+add_subdirectory(libs/blaze)
 
 add_subdirectory(data)
 
diff --git a/bench/btl/actions/action_axpby.hh b/bench/btl/actions/action_axpby.hh
index 98511ab6a..dadd0ccf3 100644
--- a/bench/btl/actions/action_axpby.hh
+++ b/bench/btl/actions/action_axpby.hh
@@ -33,7 +33,7 @@ class Action_axpby {
 public :
 
   // Ctor
-  Action_axpby( int size ):_size(size),_alpha(0.5),_beta(0.95)
+  Action_axpby( int size ):_alpha(0.5),_beta(0.95),_size(size)
   {
     MESSAGE("Action_axpby Ctor");
 
diff --git a/bench/btl/actions/action_axpy.hh b/bench/btl/actions/action_axpy.hh
index e4cb3a5bd..261be4cb8 100644
--- a/bench/btl/actions/action_axpy.hh
+++ b/bench/btl/actions/action_axpy.hh
@@ -35,7 +35,7 @@ public :
 
   // Ctor
 
-  Action_axpy( int size ):_size(size),_coef(1.0)
+  Action_axpy( int size ):_coef(1.0),_size(size)
   {
     MESSAGE("Action_axpy Ctor");
 
diff --git a/bench/btl/cmake/FindACML.cmake b/bench/btl/cmake/FindACML.cmake
index f45ae1b0d..4989fa2f4 100644
--- a/bench/btl/cmake/FindACML.cmake
+++ b/bench/btl/cmake/FindACML.cmake
@@ -17,6 +17,7 @@ find_file(ACML_LIBRARIES
   libacml_mp.so
   PATHS
   /usr/lib
+  /usr/lib64
   $ENV{ACMLDIR}/lib
   ${LIB_INSTALL_DIR}
 )
@@ -35,6 +36,7 @@ if(NOT ACML_LIBRARIES)
         libacml.so libacml_mv.so
         PATHS
         /usr/lib
+        /usr/lib64
         $ENV{ACMLDIR}/lib
         ${LIB_INSTALL_DIR}
         )
diff --git a/bench/btl/cmake/FindATLAS.cmake b/bench/btl/cmake/FindATLAS.cmake
index 6b9065206..4136a989d 100644
--- a/bench/btl/cmake/FindATLAS.cmake
+++ b/bench/btl/cmake/FindATLAS.cmake
@@ -3,33 +3,25 @@ if (ATLAS_LIBRARIES)
   set(ATLAS_FIND_QUIETLY TRUE)
 endif (ATLAS_LIBRARIES)
 
-find_file(ATLAS_LIB libatlas.so.3 PATHS /usr/lib $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
-find_library(ATLAS_LIB atlas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
+find_file(ATLAS_LIB libatlas.so.3 PATHS /usr/lib /usr/lib/atlas /usr/lib64 /usr/lib64/atlas $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
+find_library(ATLAS_LIB satlas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
 
-find_file(ATLAS_CBLAS libcblas.so.3 PATHS /usr/lib $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
-find_library(ATLAS_CBLAS cblas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
+find_file(ATLAS_LAPACK NAMES liblapack_atlas.so.3 liblapack.so.3 PATHS /usr/lib /usr/lib/atlas /usr/lib64 /usr/lib64/atlas $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
+find_library(ATLAS_LAPACK NAMES lapack_atlas lapack PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
 
-find_file(ATLAS_LAPACK liblapack_atlas.so.3 PATHS /usr/lib $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
-find_library(ATLAS_LAPACK lapack_atlas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
-
-if(NOT ATLAS_LAPACK)
-  find_file(ATLAS_LAPACK liblapack.so.3 PATHS /usr/lib/atlas $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
-  find_library(ATLAS_LAPACK lapack PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
-endif(NOT ATLAS_LAPACK)
-
-find_file(ATLAS_F77BLAS libf77blas.so.3 PATHS /usr/lib $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
+find_file(ATLAS_F77BLAS libf77blas.so.3 PATHS /usr/lib /usr/lib/atlas /usr/lib64 /usr/lib64/atlas $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
 find_library(ATLAS_F77BLAS f77blas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
 
 if(ATLAS_LIB AND ATLAS_CBLAS AND ATLAS_LAPACK AND ATLAS_F77BLAS)
 
-  set(ATLAS_LIBRARIES ${ATLAS_LAPACK} ${ATLAS_CBLAS}  ${ATLAS_F77BLAS} ${ATLAS_LIB})
+  set(ATLAS_LIBRARIES ${ATLAS_LAPACK}  ${ATLAS_LIB})
   
   # search the default lapack lib link to it
   find_file(ATLAS_REFERENCE_LAPACK liblapack.so.3 PATHS /usr/lib /usr/lib64)
   find_library(ATLAS_REFERENCE_LAPACK NAMES lapack)
-  if(ATLAS_REFERENCE_LAPACK)
-    set(ATLAS_LIBRARIES ${ATLAS_LIBRARIES} ${ATLAS_REFERENCE_LAPACK})
-  endif()
+#   if(ATLAS_REFERENCE_LAPACK)
+#     set(ATLAS_LIBRARIES ${ATLAS_LIBRARIES} ${ATLAS_REFERENCE_LAPACK})
+#   endif()
   
 endif(ATLAS_LIB AND ATLAS_CBLAS AND ATLAS_LAPACK AND ATLAS_F77BLAS)
 
diff --git a/bench/btl/cmake/FindBLAZE.cmake b/bench/btl/cmake/FindBLAZE.cmake
new file mode 100644
index 000000000..dba4c89f2
--- /dev/null
+++ b/bench/btl/cmake/FindBLAZE.cmake
@@ -0,0 +1,31 @@
+# - Try to find eigen2 headers
+# Once done this will define
+#
+#  BLAZE_FOUND - system has blaze lib
+#  BLAZE_INCLUDE_DIR - the blaze include directory
+#
+# Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+# Adapted from FindEigen.cmake:
+# Copyright (c) 2006, 2007 Montel Laurent, <montel@kde.org>
+# Redistribution and use is allowed according to the terms of the BSD license.
+# For details see the accompanying COPYING-CMAKE-SCRIPTS file.
+
+if (BLAZE_INCLUDE_DIR)
+
+  # in cache already
+  set(BLAZE_FOUND TRUE)
+
+else (BLAZE_INCLUDE_DIR)
+
+find_path(BLAZE_INCLUDE_DIR NAMES blaze/Blaze.h
+     PATHS
+     ${INCLUDE_INSTALL_DIR}
+   )
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(BLAZE DEFAULT_MSG BLAZE_INCLUDE_DIR)
+
+mark_as_advanced(BLAZE_INCLUDE_DIR)
+
+endif(BLAZE_INCLUDE_DIR)
+
diff --git a/bench/btl/cmake/FindCBLAS.cmake b/bench/btl/cmake/FindCBLAS.cmake
index 554f0291b..ce0f2f2b2 100644
--- a/bench/btl/cmake/FindCBLAS.cmake
+++ b/bench/btl/cmake/FindCBLAS.cmake
@@ -23,6 +23,7 @@ find_file(CBLAS_LIBRARIES
   libcblas.so.3
   PATHS
   /usr/lib
+  /usr/lib64
   $ENV{CBLASDIR}/lib
   ${LIB_INSTALL_DIR}
 )
diff --git a/bench/btl/cmake/FindGOTO.cmake b/bench/btl/cmake/FindGOTO.cmake
deleted file mode 100644
index 67ea0934a..000000000
--- a/bench/btl/cmake/FindGOTO.cmake
+++ /dev/null
@@ -1,15 +0,0 @@
-
-if (GOTO_LIBRARIES)
-  set(GOTO_FIND_QUIETLY TRUE)
-endif (GOTO_LIBRARIES)
-
-find_library(GOTO_LIBRARIES goto PATHS $ENV{GOTODIR} ${LIB_INSTALL_DIR})
-
-if(GOTO_LIBRARIES AND CMAKE_COMPILER_IS_GNUCXX)
-  set(GOTO_LIBRARIES ${GOTO_LIBRARIES} "-lpthread -lgfortran")
-endif()
-
-include(FindPackageHandleStandardArgs)
-find_package_handle_standard_args(GOTO DEFAULT_MSG GOTO_LIBRARIES)
-
-mark_as_advanced(GOTO_LIBRARIES)
diff --git a/bench/btl/cmake/FindGOTO2.cmake b/bench/btl/cmake/FindGOTO2.cmake
deleted file mode 100644
index baa68d213..000000000
--- a/bench/btl/cmake/FindGOTO2.cmake
+++ /dev/null
@@ -1,25 +0,0 @@
-
-if (GOTO2_LIBRARIES)
-  set(GOTO2_FIND_QUIETLY TRUE)
-endif (GOTO2_LIBRARIES)
-# 
-# find_path(GOTO_INCLUDES
-#   NAMES
-#   cblas.h
-#   PATHS
-#   $ENV{GOTODIR}/include
-#   ${INCLUDE_INSTALL_DIR}
-# )
-
-find_file(GOTO2_LIBRARIES libgoto2.so PATHS /usr/lib $ENV{GOTO2DIR} ${LIB_INSTALL_DIR})
-find_library(GOTO2_LIBRARIES goto2 PATHS $ENV{GOTO2DIR} ${LIB_INSTALL_DIR})
-
-if(GOTO2_LIBRARIES AND CMAKE_COMPILER_IS_GNUCXX)
-  set(GOTO2_LIBRARIES ${GOTO2_LIBRARIES} "-lpthread -lgfortran")
-endif()
-
-include(FindPackageHandleStandardArgs)
-find_package_handle_standard_args(GOTO2 DEFAULT_MSG
-                                  GOTO2_LIBRARIES)
-
-mark_as_advanced(GOTO2_LIBRARIES)
diff --git a/bench/btl/cmake/FindOPENBLAS.cmake b/bench/btl/cmake/FindOPENBLAS.cmake
new file mode 100644
index 000000000..2a0919436
--- /dev/null
+++ b/bench/btl/cmake/FindOPENBLAS.cmake
@@ -0,0 +1,17 @@
+
+if (OPENBLAS_LIBRARIES)
+  set(OPENBLAS_FIND_QUIETLY TRUE)
+endif (OPENBLAS_LIBRARIES)
+
+find_file(OPENBLAS_LIBRARIES NAMES libopenblas.so libopenblas.so.0 PATHS /usr/lib /usr/lib64 $ENV{OPENBLASDIR} ${LIB_INSTALL_DIR})
+find_library(OPENBLAS_LIBRARIES openblas PATHS $ENV{OPENBLASDIR} ${LIB_INSTALL_DIR})
+
+if(OPENBLAS_LIBRARIES AND CMAKE_COMPILER_IS_GNUCXX)
+  set(OPENBLAS_LIBRARIES ${OPENBLAS_LIBRARIES} "-lpthread -lgfortran")
+endif()
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(OPENBLAS DEFAULT_MSG
+                                  OPENBLAS_LIBRARIES)
+
+mark_as_advanced(OPENBLAS_LIBRARIES)
diff --git a/bench/btl/data/action_settings.txt b/bench/btl/data/action_settings.txt
index e32213e22..39d2b5dc4 100644
--- a/bench/btl/data/action_settings.txt
+++ b/bench/btl/data/action_settings.txt
@@ -1,19 +1,19 @@
-aat ; "{/*1.5 A x A^T}" ; "matrix size" ; 4:3000
-ata ; "{/*1.5 A^T x A}" ; "matrix size" ; 4:3000
-atv ; "{/*1.5 matrix^T x vector}" ; "matrix size" ; 4:3000
+aat ; "{/*1.5 A x A^T}" ; "matrix size" ; 4:5000
+ata ; "{/*1.5 A^T x A}" ; "matrix size" ; 4:5000
+atv ; "{/*1.5 matrix^T x vector}" ; "matrix size" ; 4:5000
 axpby ; "{/*1.5 Y = alpha X + beta Y}" ; "vector size" ; 5:1000000
 axpy ; "{/*1.5 Y += alpha X}" ; "vector size" ; 5:1000000
-matrix_matrix ; "{/*1.5 matrix matrix product}" ; "matrix size" ; 4:3000
-matrix_vector ; "{/*1.5 matrix vector product}" ; "matrix size" ; 4:3000
-trmm ; "{/*1.5 triangular matrix matrix product}" ; "matrix size" ; 4:3000
-trisolve_vector ; "{/*1.5 triangular solver - vector (X = inv(L) X)}" ; "size" ; 4:3000
-trisolve_matrix ; "{/*1.5 triangular solver - matrix (M = inv(L) M)}" ; "size" ; 4:3000
-cholesky ; "{/*1.5 Cholesky decomposition}" ; "matrix size" ; 4:3000
-complete_lu_decomp ; "{/*1.5 Complete LU decomposition}" ; "matrix size" ; 4:3000
-partial_lu_decomp ; "{/*1.5 Partial LU decomposition}" ; "matrix size" ; 4:3000
-tridiagonalization ; "{/*1.5 Tridiagonalization}" ; "matrix size" ; 4:3000
-hessenberg ; "{/*1.5 Hessenberg decomposition}" ; "matrix size" ; 4:3000
-symv ; "{/*1.5 symmetric matrix vector product}" ; "matrix size" ; 4:3000
-syr2 ; "{/*1.5 symmretric rank-2 update (A += u^T v + u v^T)}" ; "matrix size" ; 4:3000
-ger ; "{/*1.5 general rank-1 update (A += u v^T)}" ; "matrix size" ; 4:3000
-rot ; "{/*1.5 apply rotation in the plane}" ; "vector size" ; 4:1000000
-\ No newline at end of file
+matrix_matrix ; "{/*1.5 matrix matrix product}" ; "matrix size" ; 4:5000
+matrix_vector ; "{/*1.5 matrix vector product}" ; "matrix size" ; 4:5000
+trmm ; "{/*1.5 triangular matrix matrix product}" ; "matrix size" ; 4:5000
+trisolve_vector ; "{/*1.5 triangular solver - vector (X = inv(L) X)}" ; "size" ; 4:5000
+trisolve_matrix ; "{/*1.5 triangular solver - matrix (M = inv(L) M)}" ; "size" ; 4:5000
+cholesky ; "{/*1.5 Cholesky decomposition}" ; "matrix size" ; 4:5000
+complete_lu_decomp ; "{/*1.5 Complete LU decomposition}" ; "matrix size" ; 4:5000
+partial_lu_decomp ; "{/*1.5 Partial LU decomposition}" ; "matrix size" ; 4:5000
+tridiagonalization ; "{/*1.5 Tridiagonalization}" ; "matrix size" ; 4:5000
+hessenberg ; "{/*1.5 Hessenberg decomposition}" ; "matrix size" ; 4:5000
+symv ; "{/*1.5 symmetric matrix vector product}" ; "matrix size" ; 4:5000
+syr2 ; "{/*1.5 symmretric rank-2 update (A += u^T v + u v^T)}" ; "matrix size" ; 4:5000
+ger ; "{/*1.5 general rank-1 update (A += u v^T)}" ; "matrix size" ; 4:5000
+rot ; "{/*1.5 apply rotation in the plane}" ; "vector size" ; 4:1000000
diff --git a/bench/btl/data/perlib_plot_settings.txt b/bench/btl/data/perlib_plot_settings.txt
index 6844bab28..f023cfe02 100644
--- a/bench/btl/data/perlib_plot_settings.txt
+++ b/bench/btl/data/perlib_plot_settings.txt
@@ -10,7 +10,7 @@ ublas ;           with lines lw 3 lt 1 lc rgbcolor "#00b7ff"
 mtl4 ;            with lines lw 3 lt 1 lc rgbcolor "#d18847"
 blitz ;           with lines lw 3 lt 1 lc rgbcolor "#ff00ff"
 F77 ;             with lines lw 3 lt 3 lc rgbcolor "#e6e64c"
-GOTO ;            with lines lw 3 lt 3 lc rgbcolor "#C05600"
-GOTO2 ;           with lines lw 3 lt 1 lc rgbcolor "#C05600"
+OPENBLAS ;        with lines lw 3 lt 1 lc rgbcolor "#C05600"
 C ;               with lines lw 3 lt 3 lc rgbcolor "#e6bd96"
 ACML ;            with lines lw 2 lt 3 lc rgbcolor "#e6e64c"
+blaze ;           with lines lw 3 lt 1 lc rgbcolor "#ff00ff"
diff --git a/bench/btl/generic_bench/bench.hh b/bench/btl/generic_bench/bench.hh
index 005c36395..7b7b951b5 100644
--- a/bench/btl/generic_bench/bench.hh
+++ b/bench/btl/generic_bench/bench.hh
@@ -102,8 +102,8 @@ BTL_DONT_INLINE void bench( int size_min, int size_max, int nb_point )
       // merge the two data
       std::vector<int> newSizes;
       std::vector<double> newFlops;
-      int i=0;
-      int j=0;
+      unsigned int i=0;
+      unsigned int j=0;
       while (i<tab_sizes.size() && j<oldSizes.size())
       {
         if (tab_sizes[i] == oldSizes[j])
diff --git a/bench/btl/generic_bench/bench_parameter.hh b/bench/btl/generic_bench/bench_parameter.hh
index 4c355cd6e..2b01149f9 100644
--- a/bench/btl/generic_bench/bench_parameter.hh
+++ b/bench/btl/generic_bench/bench_parameter.hh
@@ -29,11 +29,11 @@
 // min vector size for axpy bench
 #define MIN_AXPY 5
 // max vector size for axpy bench
-#define MAX_AXPY 1000000
+#define MAX_AXPY 3000000
 // min matrix size for matrix vector product bench
 #define MIN_MV 5
 // max matrix size for matrix vector product bench
-#define MAX_MV 3000
+#define MAX_MV 5000
 // min matrix size for matrix matrix product bench
 #define MIN_MM 5
 // max matrix size for matrix matrix product bench
diff --git a/bench/btl/generic_bench/btl.hh b/bench/btl/generic_bench/btl.hh
index f1a88ff74..706b00fb0 100644
--- a/bench/btl/generic_bench/btl.hh
+++ b/bench/btl/generic_bench/btl.hh
@@ -44,15 +44,10 @@
 #define BTL_ASM_COMMENT(X)
 #endif
 
-#if (defined __GNUC__) && (!defined __INTEL_COMPILER) && !defined(__arm__) && !defined(__powerpc__)
-#define BTL_DISABLE_SSE_EXCEPTIONS()  { \
-  int aux; \
-  asm( \
-  "stmxcsr   %[aux]           \n\t" \
-  "orl       $32832, %[aux]   \n\t" \
-  "ldmxcsr   %[aux]           \n\t" \
-  : : [aux] "m" (aux)); \
-}
+#ifdef __SSE__
+#include "xmmintrin.h"
+// This enables flush to zero (FTZ) and denormals are zero (DAZ) modes:
+#define BTL_DISABLE_SSE_EXCEPTIONS()  { _mm_setcsr(_mm_getcsr() | 0x8040); }
 #else
 #define BTL_DISABLE_SSE_EXCEPTIONS()
 #endif
@@ -176,7 +171,7 @@ public:
     if (_config!=NULL)
     {
       std::vector<BtlString> config = BtlString(_config).split(" \t\n");
-      for (int i = 0; i<config.size(); i++)
+      for (unsigned int i = 0; i<config.size(); i++)
       {
         if (config[i].beginsWith("-a"))
         {
@@ -224,7 +219,7 @@ public:
       return false;
 
     BtlString name(_name);
-    for (int i=0; i<Instance.m_selectedActionNames.size(); ++i)
+    for (unsigned int i=0; i<Instance.m_selectedActionNames.size(); ++i)
       if (name.contains(Instance.m_selectedActionNames[i]))
         return false;
 
diff --git a/bench/btl/generic_bench/init/init_function.hh b/bench/btl/generic_bench/init/init_function.hh
index 7b3bdbafc..e467cb648 100644
--- a/bench/btl/generic_bench/init/init_function.hh
+++ b/bench/btl/generic_bench/init/init_function.hh
@@ -30,23 +30,23 @@ double simple_function(int index_i, int index_j)
   return index_i+index_j;
 }
 
-double pseudo_random(int index)
+double pseudo_random(int /*index*/)
 {
   return std::rand()/double(RAND_MAX);
 }
 
-double pseudo_random(int index_i, int index_j)
+double pseudo_random(int /*index_i*/, int /*index_j*/)
 {
   return std::rand()/double(RAND_MAX);
 }
 
 
-double null_function(int index)
+double null_function(int /*index*/)
 {
   return 0.0;
 }
 
-double null_function(int index_i, int index_j)
+double null_function(int /*index_i*/, int /*index_j*/)
 {
   return 0.0;
 }
diff --git a/bench/btl/generic_bench/init/init_matrix.hh b/bench/btl/generic_bench/init/init_matrix.hh
index 67cbd2073..6382d30c8 100644
--- a/bench/btl/generic_bench/init/init_matrix.hh
+++ b/bench/btl/generic_bench/init/init_matrix.hh
@@ -29,7 +29,7 @@ BTL_DONT_INLINE void init_row(Vector & X, int size, int row){
 
   X.resize(size);
 
-  for (int j=0;j<X.size();j++){
+  for (unsigned int j=0;j<X.size();j++){
     X[j]=typename Vector::value_type(init_function(row,j));
   }
 }
@@ -42,7 +42,7 @@ BTL_DONT_INLINE void init_row(Vector & X, int size, int row){
 template<double init_function(int,int),class Vector>
 BTL_DONT_INLINE void init_matrix(Vector &  A, int size){
   A.resize(size);
-  for (int row=0; row<A.size() ; row++){
+  for (unsigned int row=0; row<A.size() ; row++){
     init_row<init_function>(A[row],size,row);
   }
 }
@@ -50,11 +50,11 @@ BTL_DONT_INLINE void init_matrix(Vector &  A, int size){
 template<double init_function(int,int),class Matrix>
 BTL_DONT_INLINE void init_matrix_symm(Matrix&  A, int size){
   A.resize(size);
-  for (int row=0; row<A.size() ; row++)
+  for (unsigned int row=0; row<A.size() ; row++)
     A[row].resize(size);
-  for (int row=0; row<A.size() ; row++){
+  for (unsigned int row=0; row<A.size() ; row++){
     A[row][row] = init_function(row,row);
-    for (int col=0; col<row ; col++){
+    for (unsigned int col=0; col<row ; col++){
       double x = init_function(row,col);
       A[row][col] = A[col][row] = x;
     }
diff --git a/bench/btl/generic_bench/init/init_vector.hh b/bench/btl/generic_bench/init/init_vector.hh
index efaf0c92e..518e87dbe 100644
--- a/bench/btl/generic_bench/init/init_vector.hh
+++ b/bench/btl/generic_bench/init/init_vector.hh
@@ -29,7 +29,7 @@ void init_vector(Vector & X, int size){
 
   X.resize(size);
 
-  for (int i=0;i<X.size();i++){
+  for (unsigned int i=0;i<X.size();i++){
     X[i]=typename Vector::value_type(init_function(i));
   }
 }
diff --git a/bench/btl/generic_bench/timers/portable_perf_analyzer.hh b/bench/btl/generic_bench/timers/portable_perf_analyzer.hh
index fc0f3168d..5e579fb49 100644
--- a/bench/btl/generic_bench/timers/portable_perf_analyzer.hh
+++ b/bench/btl/generic_bench/timers/portable_perf_analyzer.hh
@@ -78,7 +78,7 @@ public:
     // time measurement
     action.calculate();
     _chronos.start();
-    for (int ii=0;ii<_nb_calc;ii++)
+    for (unsigned int ii=0;ii<_nb_calc;ii++)
     {
       action.calculate();
     }
diff --git a/bench/btl/generic_bench/timers/portable_timer.hh b/bench/btl/generic_bench/timers/portable_timer.hh
index e6ad309fe..c199811b6 100755
--- a/bench/btl/generic_bench/timers/portable_timer.hh
+++ b/bench/btl/generic_bench/timers/portable_timer.hh
@@ -34,7 +34,7 @@
 //  timer  -------------------------------------------------------------------//
 
 //  A timer object measures CPU time.
-#ifdef _MSC_VER
+#if defined(_MSC_VER)
 
 #define NOMINMAX
 #include <windows.h>
@@ -87,6 +87,48 @@
 
  }; // Portable_Timer
 
+#elif defined(__APPLE__)
+#include <CoreServices/CoreServices.h>
+#include <mach/mach_time.h>
+
+
+class Portable_Timer
+{
+ public:
+
+  Portable_Timer()
+  {
+  }
+
+  void start()
+  {
+    m_start_time = double(mach_absolute_time())*1e-9;;
+
+  }
+
+  void stop()
+  {
+    m_stop_time = double(mach_absolute_time())*1e-9;;
+
+  }
+
+  double elapsed()
+  {
+    return  user_time();
+  }
+
+  double user_time()
+  {
+    return m_stop_time - m_start_time;
+  }
+
+
+private:
+
+  double m_stop_time, m_start_time;
+
+}; // Portable_Timer (Apple)
+
 #else
 
 #include <sys/time.h>
@@ -138,7 +180,7 @@ private:
   int m_clkid;
   double m_stop_time, m_start_time;
 
-}; // Portable_Timer
+}; // Portable_Timer (Linux)
 
 #endif
 
diff --git a/bench/btl/generic_bench/utils/size_lin_log.hh b/bench/btl/generic_bench/utils/size_lin_log.hh
index bca3932ae..bbc9f543d 100644
--- a/bench/btl/generic_bench/utils/size_lin_log.hh
+++ b/bench/btl/generic_bench/utils/size_lin_log.hh
@@ -23,7 +23,7 @@
 #include "size_log.hh"
 
 template<class Vector>
-void size_lin_log(const int nb_point, const int size_min, const int size_max, Vector & X)
+void size_lin_log(const int nb_point, const int /*size_min*/, const int size_max, Vector & X)
 {
   int ten=10;
   int nine=9;
diff --git a/bench/btl/libs/BLAS/CMakeLists.txt b/bench/btl/libs/BLAS/CMakeLists.txt
index de42fe047..0272ccad0 100644
--- a/bench/btl/libs/BLAS/CMakeLists.txt
+++ b/bench/btl/libs/BLAS/CMakeLists.txt
@@ -18,27 +18,14 @@ if (MKL_FOUND)
 endif (MKL_FOUND)
 
 
-find_package(GOTO2)
-if (GOTO2_FOUND)
-  btl_add_bench(btl_goto2 main.cpp)
-  if(BUILD_btl_goto2)
-    target_link_libraries(btl_goto2 ${GOTO_LIBRARIES} )
-    set_target_properties(btl_goto2 PROPERTIES COMPILE_FLAGS "-DCBLASNAME=GOTO2")
-  endif(BUILD_btl_goto2)
-endif (GOTO2_FOUND)
-
-find_package(GOTO)
-if (GOTO_FOUND)
-  if(GOTO2_FOUND)
-    btl_add_bench(btl_goto main.cpp OFF)
-  else()
-    btl_add_bench(btl_goto main.cpp)
-  endif()
-  if(BUILD_btl_goto)
-    target_link_libraries(btl_goto ${GOTO_LIBRARIES} )
-    set_target_properties(btl_goto PROPERTIES COMPILE_FLAGS "-DCBLASNAME=GOTO")
-  endif(BUILD_btl_goto)
-endif (GOTO_FOUND)
+find_package(OPENBLAS)
+if (OPENBLAS_FOUND)
+  btl_add_bench(btl_openblas main.cpp)
+  if(BUILD_btl_openblas)
+    target_link_libraries(btl_openblas ${OPENBLAS_LIBRARIES} )
+    set_target_properties(btl_openblas PROPERTIES COMPILE_FLAGS "-DCBLASNAME=OPENBLAS")
+  endif(BUILD_btl_openblas)
+endif (OPENBLAS_FOUND)
 
 find_package(ACML)
 if (ACML_FOUND)
diff --git a/bench/btl/libs/BLAS/blas_interface_impl.hh b/bench/btl/libs/BLAS/blas_interface_impl.hh
index 0e84df038..fc4ba2a1f 100644
--- a/bench/btl/libs/BLAS/blas_interface_impl.hh
+++ b/bench/btl/libs/BLAS/blas_interface_impl.hh
@@ -75,7 +75,6 @@ public :
   static inline void partial_lu_decomp(const gene_matrix & X, gene_matrix & C, int N){
     int N2 = N*N;
     BLAS_FUNC(copy)(&N2, X, &intone, C, &intone);
-    char uplo = 'L';
     int info = 0;
     int * ipiv = (int*)alloca(sizeof(int)*N);
     BLAS_FUNC(getrf)(&N, &N, C, &N, ipiv, &info);
@@ -92,7 +91,7 @@ public :
     BLAS_FUNC(trsm)(&right, &lower, &notrans, &nonunit, &N, &N, &fone, L, &N, X, &N);
   }
 
-  static inline void trmm(gene_matrix & A, gene_matrix & B, gene_matrix & X, int N){
+  static inline void trmm(gene_matrix & A, gene_matrix & B, gene_matrix & /*X*/, int N){
     BLAS_FUNC(trmm)(&left, &lower, &notrans,&nonunit, &N,&N,&fone,A,&N,B,&N);
   }
 
@@ -101,7 +100,6 @@ public :
   static inline void lu_decomp(const gene_matrix & X, gene_matrix & C, int N){
     int N2 = N*N;
     BLAS_FUNC(copy)(&N2, X, &intone, C, &intone);
-    char uplo = 'L';
     int info = 0;
     int * ipiv = (int*)alloca(sizeof(int)*N);
     int * jpiv = (int*)alloca(sizeof(int)*N);
@@ -134,8 +132,6 @@ public :
     }
     char uplo = 'U';
     int info = 0;
-    int ilo = 1;
-    int ihi = N;
     int bsize = 64;
     int worksize = N*bsize;
     SCALAR* d = new SCALAR[3*N+worksize];
diff --git a/bench/btl/libs/BLAS/c_interface_base.h b/bench/btl/libs/BLAS/c_interface_base.h
index 515d8dcfc..de613803b 100644
--- a/bench/btl/libs/BLAS/c_interface_base.h
+++ b/bench/btl/libs/BLAS/c_interface_base.h
@@ -17,12 +17,12 @@ public:
   typedef real* gene_matrix;
   typedef real* gene_vector;
 
-  static void free_matrix(gene_matrix & A, int N){
-    delete A;
+  static void free_matrix(gene_matrix & A, int /*N*/){
+    delete[] A;
   }
 
   static void free_vector(gene_vector & B){
-    delete B;
+    delete[] B;
   }
 
   static inline void matrix_from_stl(gene_matrix & A, stl_matrix & A_stl){
diff --git a/bench/btl/libs/BLAS/main.cpp b/bench/btl/libs/BLAS/main.cpp
index 8347c9f0b..564d55ef2 100644
--- a/bench/btl/libs/BLAS/main.cpp
+++ b/bench/btl/libs/BLAS/main.cpp
@@ -56,13 +56,13 @@ int main()
 
   bench<Action_trmm<blas_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
 
-  bench<Action_cholesky<blas_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
-  bench<Action_partial_lu<blas_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
+  bench<Action_cholesky<blas_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);
+  bench<Action_partial_lu<blas_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);
 
   #ifdef HAS_LAPACK
-  bench<Action_lu_decomp<blas_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
-  bench<Action_hessenberg<blas_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
-  bench<Action_tridiagonalization<blas_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
+//   bench<Action_lu_decomp<blas_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);
+  bench<Action_hessenberg<blas_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);
+  bench<Action_tridiagonalization<blas_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);
   #endif
 
   //bench<Action_lu_solve<blas_LU_solve_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);
diff --git a/bench/btl/libs/STL/STL_interface.hh b/bench/btl/libs/STL/STL_interface.hh
index 93e76bd55..ef4cc9233 100644
--- a/bench/btl/libs/STL/STL_interface.hh
+++ b/bench/btl/libs/STL/STL_interface.hh
@@ -44,9 +44,9 @@ public :
     return "STL";
   }
 
-  static void free_matrix(gene_matrix & A, int N){}
+  static void free_matrix(gene_matrix & /*A*/, int /*N*/){}
 
-  static void free_vector(gene_vector & B){}
+  static void free_vector(gene_vector & /*B*/){}
 
   static inline void matrix_from_stl(gene_matrix & A, stl_matrix & A_stl){
     A = A_stl;
diff --git a/bench/btl/libs/blaze/CMakeLists.txt b/bench/btl/libs/blaze/CMakeLists.txt
new file mode 100644
index 000000000..e99a0855c
--- /dev/null
+++ b/bench/btl/libs/blaze/CMakeLists.txt
@@ -0,0 +1,13 @@
+
+find_package(BLAZE)
+find_package(Boost COMPONENTS system)
+if (BLAZE_FOUND AND Boost_FOUND)
+  include_directories(${BLAZE_INCLUDE_DIR} ${Boost_INCLUDE_DIRS})
+  btl_add_bench(btl_blaze main.cpp)
+  # Note: The newest blaze version requires C++14.
+  # Ideally, we should set this depending on the version of Blaze we found
+  set_property(TARGET btl_blaze PROPERTY CXX_STANDARD 14)
+  if(BUILD_btl_blaze)
+    target_link_libraries(btl_blaze ${Boost_LIBRARIES})
+  endif()
+endif ()
diff --git a/bench/btl/libs/blaze/blaze_interface.hh b/bench/btl/libs/blaze/blaze_interface.hh
new file mode 100644
index 000000000..ee1523944
--- /dev/null
+++ b/bench/btl/libs/blaze/blaze_interface.hh
@@ -0,0 +1,140 @@
+//=====================================================
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+//=====================================================
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License
+// as published by the Free Software Foundation; either version 2
+// of the License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+//
+#ifndef BLAZE_INTERFACE_HH
+#define BLAZE_INTERFACE_HH
+
+#include <blaze/Math.h>
+#include <blaze/Blaze.h>
+// using namespace blaze;
+
+#include <vector>
+
+template<class real>
+class blaze_interface {
+
+public :
+
+  typedef real real_type ;
+
+  typedef std::vector<real>        stl_vector;
+  typedef std::vector<stl_vector > stl_matrix;
+
+  typedef blaze::DynamicMatrix<real,blaze::columnMajor>  gene_matrix;
+  typedef blaze::DynamicVector<real>  gene_vector;
+
+  static inline std::string name() { return "blaze"; }
+
+  static void free_matrix(gene_matrix & A, int N){
+    return ;
+  }
+
+  static void free_vector(gene_vector & B){
+    return ;
+  }
+
+  static inline void matrix_from_stl(gene_matrix & A, stl_matrix & A_stl){
+    A.resize(A_stl[0].size(), A_stl.size());
+
+    for (int j=0; j<A_stl.size() ; j++){
+      for (int i=0; i<A_stl[j].size() ; i++){
+        A(i,j) = A_stl[j][i];
+      }
+    }
+  }
+
+  static inline void vector_from_stl(gene_vector & B, stl_vector & B_stl){
+    B.resize(B_stl.size());
+    for (int i=0; i<B_stl.size() ; i++){
+      B[i] = B_stl[i];
+    }
+  }
+
+  static inline void vector_to_stl(gene_vector & B, stl_vector & B_stl){
+    for (int i=0; i<B_stl.size() ; i++){
+      B_stl[i] = B[i];
+    }
+  }
+
+  static inline void matrix_to_stl(gene_matrix & A, stl_matrix & A_stl){
+    int N=A_stl.size();
+    for (int j=0;j<N;j++){
+      A_stl[j].resize(N);
+      for (int i=0;i<N;i++){
+        A_stl[j][i] = A(i,j);
+      }
+    }
+  }
+
+  static inline void matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int N){
+    X = (A*B);
+  }
+
+  static inline void transposed_matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int N){
+    X = (trans(A)*trans(B));
+  }
+
+  static inline void ata_product(const gene_matrix & A, gene_matrix & X, int N){
+    X = (trans(A)*A);
+  }
+
+  static inline void aat_product(const gene_matrix & A, gene_matrix & X, int N){
+    X = (A*trans(A));
+  }
+
+  static inline void matrix_vector_product(gene_matrix & A, gene_vector & B, gene_vector & X, int N){
+    X = (A*B);
+  }
+
+  static inline void atv_product(gene_matrix & A, gene_vector & B, gene_vector & X, int N){
+    X = (trans(A)*B);
+  }
+
+  static inline void axpy(const real coef, const gene_vector & X, gene_vector & Y, int N){
+    Y += coef * X;
+  }
+
+  static inline void axpby(real a, const gene_vector & X, real b, gene_vector & Y, int N){
+    Y = a*X + b*Y;
+  }
+
+//   static inline void cholesky(const gene_matrix & X, gene_matrix & C, int N){
+//     C = X;
+//     recursive_cholesky(C);
+//   }
+
+//   static inline void lu_decomp(const gene_matrix & X, gene_matrix & R, int N){
+//     R = X;
+//     std::vector<int> ipvt(N);
+//     lu_factor(R, ipvt);
+//   }
+
+//   static inline void trisolve_lower(const gene_matrix & L, const gene_vector& B, gene_vector & X, int N){
+//     X = lower_trisolve(L, B);
+//   }
+
+  static inline void copy_matrix(const gene_matrix & source, gene_matrix & cible, int N){
+    cible = source;
+  }
+
+  static inline void copy_vector(const gene_vector & source, gene_vector & cible, int N){
+    cible = source;
+  }
+
+};
+
+#endif
diff --git a/bench/btl/libs/blaze/main.cpp b/bench/btl/libs/blaze/main.cpp
new file mode 100644
index 000000000..80e8f4eaa
--- /dev/null
+++ b/bench/btl/libs/blaze/main.cpp
@@ -0,0 +1,40 @@
+//=====================================================
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+//=====================================================
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License
+// as published by the Free Software Foundation; either version 2
+// of the License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+//
+#include "utilities.h"
+#include "blaze_interface.hh"
+#include "bench.hh"
+#include "basic_actions.hh"
+
+BTL_MAIN;
+
+int main()
+{
+
+  bench<Action_axpy<blaze_interface<REAL_TYPE> > >(MIN_AXPY,MAX_AXPY,NB_POINT);
+  bench<Action_axpby<blaze_interface<REAL_TYPE> > >(MIN_AXPY,MAX_AXPY,NB_POINT);
+
+  bench<Action_matrix_vector_product<blaze_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT);
+  bench<Action_atv_product<blaze_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT);
+//   bench<Action_matrix_matrix_product<blaze_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
+//   bench<Action_ata_product<blaze_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
+//   bench<Action_aat_product<blaze_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
+
+  return 0;
+}
+
+
diff --git a/bench/btl/libs/eigen2/eigen2_interface.hh b/bench/btl/libs/eigen2/eigen2_interface.hh
index 47fe58135..1deabdae2 100644
--- a/bench/btl/libs/eigen2/eigen2_interface.hh
+++ b/bench/btl/libs/eigen2/eigen2_interface.hh
@@ -47,7 +47,7 @@ public :
   {
     #if defined(EIGEN_VECTORIZE_SSE)
     if (SIZE==Dynamic) return "eigen2"; else return "tiny_eigen2";
-    #elif defined(EIGEN_VECTORIZE_ALTIVEC)
+    #elif defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX)
     if (SIZE==Dynamic) return "eigen2"; else return "tiny_eigen2";
     #else
     if (SIZE==Dynamic) return "eigen2_novec"; else return "tiny_eigen2_novec";
diff --git a/bench/btl/libs/eigen3/eigen3_interface.hh b/bench/btl/libs/eigen3/eigen3_interface.hh
index 31bcc1f93..b821fd721 100644
--- a/bench/btl/libs/eigen3/eigen3_interface.hh
+++ b/bench/btl/libs/eigen3/eigen3_interface.hh
@@ -45,15 +45,15 @@ public :
     return EIGEN_MAKESTRING(BTL_PREFIX);
   }
 
-  static void free_matrix(gene_matrix & A, int N) {}
+  static void free_matrix(gene_matrix & /*A*/, int /*N*/) {}
 
-  static void free_vector(gene_vector & B) {}
+  static void free_vector(gene_vector & /*B*/) {}
 
   static BTL_DONT_INLINE void matrix_from_stl(gene_matrix & A, stl_matrix & A_stl){
     A.resize(A_stl[0].size(), A_stl.size());
 
-    for (int j=0; j<A_stl.size() ; j++){
-      for (int i=0; i<A_stl[j].size() ; i++){
+    for (unsigned int j=0; j<A_stl.size() ; j++){
+      for (unsigned int i=0; i<A_stl[j].size() ; i++){
         A.coeffRef(i,j) = A_stl[j][i];
       }
     }
@@ -62,19 +62,19 @@ public :
   static BTL_DONT_INLINE  void vector_from_stl(gene_vector & B, stl_vector & B_stl){
     B.resize(B_stl.size(),1);
 
-    for (int i=0; i<B_stl.size() ; i++){
+    for (unsigned int i=0; i<B_stl.size() ; i++){
       B.coeffRef(i) = B_stl[i];
     }
   }
 
   static BTL_DONT_INLINE  void vector_to_stl(gene_vector & B, stl_vector & B_stl){
-    for (int i=0; i<B_stl.size() ; i++){
+    for (unsigned int i=0; i<B_stl.size() ; i++){
       B_stl[i] = B.coeff(i);
     }
   }
 
   static BTL_DONT_INLINE  void matrix_to_stl(gene_matrix & A, stl_matrix & A_stl){
-    int N=A_stl.size();
+    int  N=A_stl.size();
 
     for (int j=0;j<N;j++){
       A_stl[j].resize(N);
@@ -84,28 +84,28 @@ public :
     }
   }
 
-  static inline void matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int N){
+  static inline void matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int  /*N*/){
     X.noalias() = A*B;
   }
 
-  static inline void transposed_matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int N){
+  static inline void transposed_matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int  /*N*/){
     X.noalias() = A.transpose()*B.transpose();
   }
 
-//   static inline void ata_product(const gene_matrix & A, gene_matrix & X, int N){
+//   static inline void ata_product(const gene_matrix & A, gene_matrix & X, int  /*N*/){
 //     X.noalias() = A.transpose()*A;
 //   }
 
-  static inline void aat_product(const gene_matrix & A, gene_matrix & X, int N){
+  static inline void aat_product(const gene_matrix & A, gene_matrix & X, int  /*N*/){
     X.template triangularView<Lower>().setZero();
     X.template selfadjointView<Lower>().rankUpdate(A);
   }
 
-  static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N){
+  static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int  /*N*/){
     X.noalias() = A*B;
   }
 
-  static inline void symv(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N){
+  static inline void symv(const gene_matrix & A, const gene_vector & B, gene_vector & X, int  /*N*/){
     X.noalias() = (A.template selfadjointView<Lower>() * B);
 //     internal::product_selfadjoint_vector<real,0,LowerTriangularBit,false,false>(N,A.data(),N, B.data(), 1, X.data(), 1);
   }
@@ -155,54 +155,54 @@ public :
     }
   }
 
-  static EIGEN_DONT_INLINE void syr2(gene_matrix & A,  gene_vector & X, gene_vector & Y, int N){
+  static EIGEN_DONT_INLINE void syr2(gene_matrix & A,  gene_vector & X, gene_vector & Y, int  N){
     // internal::product_selfadjoint_rank2_update<real,0,LowerTriangularBit>(N,A.data(),N, X.data(), 1, Y.data(), 1, -1);
     for(int j=0; j<N; ++j)
       A.col(j).tail(N-j) += X[j] * Y.tail(N-j) + Y[j] * X.tail(N-j);
   }
 
-  static EIGEN_DONT_INLINE void ger(gene_matrix & A,  gene_vector & X, gene_vector & Y, int N){
+  static EIGEN_DONT_INLINE void ger(gene_matrix & A,  gene_vector & X, gene_vector & Y, int  N){
     for(int j=0; j<N; ++j)
       A.col(j) += X * Y[j];
   }
 
-  static EIGEN_DONT_INLINE void rot(gene_vector & A,  gene_vector & B, real c, real s, int N){
+  static EIGEN_DONT_INLINE void rot(gene_vector & A,  gene_vector & B, real c, real s, int  /*N*/){
     internal::apply_rotation_in_the_plane(A, B, JacobiRotation<real>(c,s));
   }
 
-  static inline void atv_product(gene_matrix & A, gene_vector & B, gene_vector & X, int N){
+  static inline void atv_product(gene_matrix & A, gene_vector & B, gene_vector & X, int  /*N*/){
     X.noalias() = (A.transpose()*B);
   }
 
-  static inline void axpy(real coef, const gene_vector & X, gene_vector & Y, int N){
+  static inline void axpy(real coef, const gene_vector & X, gene_vector & Y, int  /*N*/){
     Y += coef * X;
   }
 
-  static inline void axpby(real a, const gene_vector & X, real b, gene_vector & Y, int N){
+  static inline void axpby(real a, const gene_vector & X, real b, gene_vector & Y, int  /*N*/){
     Y = a*X + b*Y;
   }
 
-  static EIGEN_DONT_INLINE void copy_matrix(const gene_matrix & source, gene_matrix & cible, int N){
+  static EIGEN_DONT_INLINE void copy_matrix(const gene_matrix & source, gene_matrix & cible, int  /*N*/){
     cible = source;
   }
 
-  static EIGEN_DONT_INLINE void copy_vector(const gene_vector & source, gene_vector & cible, int N){
+  static EIGEN_DONT_INLINE void copy_vector(const gene_vector & source, gene_vector & cible, int  /*N*/){
     cible = source;
   }
 
-  static inline void trisolve_lower(const gene_matrix & L, const gene_vector& B, gene_vector& X, int N){
+  static inline void trisolve_lower(const gene_matrix & L, const gene_vector& B, gene_vector& X, int  /*N*/){
     X = L.template triangularView<Lower>().solve(B);
   }
 
-  static inline void trisolve_lower_matrix(const gene_matrix & L, const gene_matrix& B, gene_matrix& X, int N){
+  static inline void trisolve_lower_matrix(const gene_matrix & L, const gene_matrix& B, gene_matrix& X, int  /*N*/){
     X = L.template triangularView<Upper>().solve(B);
   }
 
-  static inline void trmm(const gene_matrix & L, const gene_matrix& B, gene_matrix& X, int N){
+  static inline void trmm(const gene_matrix & L, const gene_matrix& B, gene_matrix& X, int  /*N*/){
     X.noalias() = L.template triangularView<Lower>() * B;
   }
 
-  static inline void cholesky(const gene_matrix & X, gene_matrix & C, int N){
+  static inline void cholesky(const gene_matrix & X, gene_matrix & C, int  /*N*/){
     C = X;
     internal::llt_inplace<real,Lower>::blocked(C);
     //C = X.llt().matrixL();
@@ -211,11 +211,11 @@ public :
 //     Cholesky<gene_matrix>::computeInPlaceBlock(C);
   }
 
-  static inline void lu_decomp(const gene_matrix & X, gene_matrix & C, int N){
+  static inline void lu_decomp(const gene_matrix & X, gene_matrix & C, int  /*N*/){
     C = X.fullPivLu().matrixLU();
   }
 
-  static inline void partial_lu_decomp(const gene_matrix & X, gene_matrix & C, int N){
+  static inline void partial_lu_decomp(const gene_matrix & X, gene_matrix & C, int  N){
     Matrix<DenseIndex,1,Dynamic> piv(N);
     DenseIndex nb;
     C = X;
@@ -223,13 +223,13 @@ public :
 //     C = X.partialPivLu().matrixLU();
   }
 
-  static inline void tridiagonalization(const gene_matrix & X, gene_matrix & C, int N){
+  static inline void tridiagonalization(const gene_matrix & X, gene_matrix & C, int  N){
     typename Tridiagonalization<gene_matrix>::CoeffVectorType aux(N-1);
     C = X;
     internal::tridiagonalization_inplace(C, aux);
   }
 
-  static inline void hessenberg(const gene_matrix & X, gene_matrix & C, int N){
+  static inline void hessenberg(const gene_matrix & X, gene_matrix & C, int  /*N*/){
     C = HessenbergDecomposition<gene_matrix>(X).packedMatrix();
   }
 
diff --git a/bench/btl/libs/eigen3/main_adv.cpp b/bench/btl/libs/eigen3/main_adv.cpp
index efe5857e4..95865357e 100644
--- a/bench/btl/libs/eigen3/main_adv.cpp
+++ b/bench/btl/libs/eigen3/main_adv.cpp
@@ -29,14 +29,14 @@ BTL_MAIN;
 
 int main()
 {
-  bench<Action_trisolve<eigen3_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
-  bench<Action_trisolve_matrix<eigen3_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
-  bench<Action_cholesky<eigen3_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
-  bench<Action_lu_decomp<eigen3_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
-  bench<Action_partial_lu<eigen3_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
+  bench<Action_trisolve<eigen3_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);
+  bench<Action_trisolve_matrix<eigen3_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);
+  bench<Action_cholesky<eigen3_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);
+//   bench<Action_lu_decomp<eigen3_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);
+  bench<Action_partial_lu<eigen3_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);
 
-  bench<Action_hessenberg<eigen3_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
-  bench<Action_tridiagonalization<eigen3_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
+//   bench<Action_hessenberg<eigen3_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);
+  bench<Action_tridiagonalization<eigen3_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);
 
   return 0;
 }
diff --git a/bench/btl/libs/tensors/CMakeLists.txt b/bench/btl/libs/tensors/CMakeLists.txt
new file mode 100644
index 000000000..09d6d8e43
--- /dev/null
+++ b/bench/btl/libs/tensors/CMakeLists.txt
@@ -0,0 +1,44 @@
+
+
+if((NOT TENSOR_INCLUDE_DIR) AND Eigen_SOURCE_DIR)
+  # unless TENSOR_INCLUDE_DIR is defined, let's use current Eigen version
+  set(TENSOR_INCLUDE_DIR ${Eigen_SOURCE_DIR})
+  set(TENSOR_FOUND TRUE)
+else()
+  find_package(Tensor)
+endif()
+
+if (TENSOR_FOUND)
+
+  include_directories(${TENSOR_INCLUDE_DIR})
+  btl_add_bench(btl_tensor_linear main_linear.cpp)
+  btl_add_bench(btl_tensor_vecmat main_vecmat.cpp)
+  btl_add_bench(btl_tensor_matmat main_matmat.cpp)
+
+  btl_add_target_property(btl_tensor_linear COMPILE_FLAGS "-fno-exceptions -DBTL_PREFIX=tensor")
+  btl_add_target_property(btl_tensor_vecmat COMPILE_FLAGS "-fno-exceptions -DBTL_PREFIX=tensor")
+  btl_add_target_property(btl_tensor_matmat COMPILE_FLAGS "-fno-exceptions -DBTL_PREFIX=tensor")
+
+  option(BTL_BENCH_NOGCCVEC "also bench Eigen explicit vec without GCC's auto vec" OFF)
+  if(CMAKE_COMPILER_IS_GNUCXX AND BTL_BENCH_NOGCCVEC)
+    btl_add_bench(btl_tensor_nogccvec_linear main_linear.cpp)
+    btl_add_bench(btl_tensor_nogccvec_vecmat main_vecmat.cpp)
+    btl_add_bench(btl_tensor_nogccvec_matmat main_matmat.cpp)
+
+    btl_add_target_property(btl_tensor_nogccvec_linear COMPILE_FLAGS "-fno-exceptions -fno-tree-vectorize -DBTL_PREFIX=tensor_nogccvec")
+    btl_add_target_property(btl_tensor_nogccvec_vecmat COMPILE_FLAGS "-fno-exceptions -fno-tree-vectorize -DBTL_PREFIX=tensor_nogccvec")
+    btl_add_target_property(btl_tensor_nogccvec_matmat COMPILE_FLAGS "-fno-exceptions -fno-tree-vectorize -DBTL_PREFIX=tensor_nogccvec")
+  endif()
+
+
+  if(NOT BTL_NOVEC)
+    btl_add_bench(btl_tensor_novec_linear main_linear.cpp OFF)
+    btl_add_bench(btl_tensor_novec_vecmat main_vecmat.cpp OFF)
+    btl_add_bench(btl_tensor_novec_matmat main_matmat.cpp OFF)
+    btl_add_target_property(btl_tensor_novec_linear COMPILE_FLAGS "-fno-exceptions -DEIGEN_DONT_VECTORIZE -DBTL_PREFIX=tensor_novec")
+    btl_add_target_property(btl_tensor_novec_vecmat COMPILE_FLAGS "-fno-exceptions -DEIGEN_DONT_VECTORIZE -DBTL_PREFIX=tensor_novec")
+    btl_add_target_property(btl_tensor_novec_matmat COMPILE_FLAGS "-fno-exceptions -DEIGEN_DONT_VECTORIZE -DBTL_PREFIX=tensor_novec")
+
+  endif(NOT BTL_NOVEC)
+
+endif (TENSOR_FOUND)
diff --git a/bench/btl/libs/tensors/main_linear.cpp b/bench/btl/libs/tensors/main_linear.cpp
new file mode 100644
index 000000000..e257f1e72
--- /dev/null
+++ b/bench/btl/libs/tensors/main_linear.cpp
@@ -0,0 +1,23 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "utilities.h"
+#include "tensor_interface.hh"
+#include "bench.hh"
+#include "basic_actions.hh"
+
+BTL_MAIN;
+
+int main()
+{
+  bench<Action_axpy<tensor_interface<REAL_TYPE> > >(MIN_AXPY,MAX_AXPY,NB_POINT);
+  bench<Action_axpby<tensor_interface<REAL_TYPE> > >(MIN_AXPY,MAX_AXPY,NB_POINT);
+
+  return 0;
+}
diff --git a/bench/btl/libs/tensors/main_matmat.cpp b/bench/btl/libs/tensors/main_matmat.cpp
new file mode 100644
index 000000000..675fcfc6d
--- /dev/null
+++ b/bench/btl/libs/tensors/main_matmat.cpp
@@ -0,0 +1,21 @@
+//=====================================================
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//=====================================================
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+//
+#include "utilities.h"
+#include "tensor_interface.hh"
+#include "bench.hh"
+#include "basic_actions.hh"
+
+BTL_MAIN;
+
+int main()
+{
+  bench<Action_matrix_matrix_product<tensor_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
+
+  return 0;
+}
diff --git a/bench/btl/libs/tensors/main_vecmat.cpp b/bench/btl/libs/tensors/main_vecmat.cpp
new file mode 100644
index 000000000..1af00c81b
--- /dev/null
+++ b/bench/btl/libs/tensors/main_vecmat.cpp
@@ -0,0 +1,21 @@
+//=====================================================
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//=====================================================
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+//
+#include "utilities.h"
+#include "tensor_interface.hh"
+#include "bench.hh"
+#include "basic_actions.hh"
+
+BTL_MAIN;
+
+int main()
+{
+  bench<Action_matrix_vector_product<tensor_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT);
+
+  return 0;
+}
diff --git a/bench/btl/libs/tensors/tensor_interface.hh b/bench/btl/libs/tensors/tensor_interface.hh
new file mode 100644
index 000000000..97b8e0f0b
--- /dev/null
+++ b/bench/btl/libs/tensors/tensor_interface.hh
@@ -0,0 +1,105 @@
+//=====================================================
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//=====================================================
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+//
+#ifndef TENSOR_INTERFACE_HH
+#define TENSOR_INTERFACE_HH
+
+#include <unsupported/Eigen/CXX11/Tensor>
+#include <vector>
+#include "btl.hh"
+
+using namespace Eigen;
+
+template<class real>
+class tensor_interface
+{
+public :
+  typedef real real_type;
+  typedef typename Eigen::Tensor<real,2>::Index Index;
+
+  typedef std::vector<real> stl_vector;
+  typedef std::vector<stl_vector> stl_matrix;
+
+  typedef Eigen::Tensor<real,2> gene_matrix;
+  typedef Eigen::Tensor<real,1> gene_vector;
+
+
+  static inline std::string name( void )
+  {
+    return EIGEN_MAKESTRING(BTL_PREFIX);
+  }
+
+  static void free_matrix(gene_matrix & /*A*/, int /*N*/) {}
+
+  static void free_vector(gene_vector & /*B*/) {}
+
+  static BTL_DONT_INLINE void matrix_from_stl(gene_matrix & A, stl_matrix & A_stl){
+    A.resize(Eigen::array<Index,2>(A_stl[0].size(), A_stl.size()));
+
+    for (unsigned int j=0; j<A_stl.size() ; j++){
+      for (unsigned int i=0; i<A_stl[j].size() ; i++){
+        A.coeffRef(Eigen::array<Index,2>(i,j)) = A_stl[j][i];
+      }
+    }
+  }
+
+  static BTL_DONT_INLINE  void vector_from_stl(gene_vector & B, stl_vector & B_stl){
+    B.resize(B_stl.size());
+
+    for (unsigned int i=0; i<B_stl.size() ; i++){
+      B.coeffRef(i) = B_stl[i];
+    }
+  }
+
+  static BTL_DONT_INLINE  void vector_to_stl(gene_vector & B, stl_vector & B_stl){
+    for (unsigned int i=0; i<B_stl.size() ; i++){
+      B_stl[i] = B.coeff(i);
+    }
+  }
+
+  static BTL_DONT_INLINE  void matrix_to_stl(gene_matrix & A, stl_matrix & A_stl){
+    int  N=A_stl.size();
+
+    for (int j=0;j<N;j++){
+      A_stl[j].resize(N);
+      for (int i=0;i<N;i++){
+        A_stl[j][i] = A.coeff(Eigen::array<Index,2>(i,j));
+      }
+    }
+  }
+
+  static inline void matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int  /*N*/){
+    typedef typename Eigen::Tensor<real_type, 1>::DimensionPair DimPair;
+    const Eigen::array<DimPair, 1> dims(DimPair(1, 0));
+    X/*.noalias()*/ = A.contract(B, dims);
+  }
+
+  static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int  /*N*/){
+    typedef typename Eigen::Tensor<real_type, 1>::DimensionPair DimPair;
+    const Eigen::array<DimPair, 1> dims(DimPair(1, 0));
+    X/*.noalias()*/ = A.contract(B, dims);
+  }
+
+  static inline void axpy(real coef, const gene_vector & X, gene_vector & Y, int  /*N*/){
+    Y += X.constant(coef) * X;
+  }
+
+  static inline void axpby(real a, const gene_vector & X, real b, gene_vector & Y, int  /*N*/){
+    Y = X.constant(a)*X + Y.constant(b)*Y;
+  }
+
+  static EIGEN_DONT_INLINE void copy_matrix(const gene_matrix & source, gene_matrix & cible, int  /*N*/){
+    cible = source;
+  }
+
+  static EIGEN_DONT_INLINE void copy_vector(const gene_vector & source, gene_vector & cible, int  /*N*/){
+    cible = source;
+  }
+};
+
+#endif
diff --git a/bench/dense_solvers.cpp b/bench/dense_solvers.cpp
new file mode 100644
index 000000000..24343dcd8
--- /dev/null
+++ b/bench/dense_solvers.cpp
@@ -0,0 +1,186 @@
+#include <iostream>
+#include "BenchTimer.h"
+#include <Eigen/Dense>
+#include <map>
+#include <vector>
+#include <string>
+#include <sstream>
+using namespace Eigen;
+
+std::map<std::string,Array<float,1,8,DontAlign|RowMajor> > results;
+std::vector<std::string> labels;
+std::vector<Array2i> sizes;
+
+template<typename Solver,typename MatrixType>
+EIGEN_DONT_INLINE
+void compute_norm_equation(Solver &solver, const MatrixType &A) {
+  if(A.rows()!=A.cols())
+    solver.compute(A.transpose()*A);
+  else
+    solver.compute(A);
+}
+
+template<typename Solver,typename MatrixType>
+EIGEN_DONT_INLINE
+void compute(Solver &solver, const MatrixType &A) {
+  solver.compute(A);
+}
+
+template<typename Scalar,int Size>
+void bench(int id, int rows, int size = Size)
+{
+  typedef Matrix<Scalar,Dynamic,Size> Mat;
+  typedef Matrix<Scalar,Dynamic,Dynamic> MatDyn;
+  typedef Matrix<Scalar,Size,Size> MatSquare;
+  Mat A(rows,size);
+  A.setRandom();
+  if(rows==size)
+    A = A*A.adjoint();
+  BenchTimer t_llt, t_ldlt, t_lu, t_fplu, t_qr, t_cpqr, t_cod, t_fpqr, t_jsvd, t_bdcsvd;
+
+  int svd_opt = ComputeThinU|ComputeThinV;
+  
+  int tries = 5;
+  int rep = 1000/size;
+  if(rep==0) rep = 1;
+//   rep = rep*rep;
+  
+  LLT<MatSquare> llt(size);
+  LDLT<MatSquare> ldlt(size);
+  PartialPivLU<MatSquare> lu(size);
+  FullPivLU<MatSquare> fplu(size,size);
+  HouseholderQR<Mat> qr(A.rows(),A.cols());
+  ColPivHouseholderQR<Mat> cpqr(A.rows(),A.cols());
+  CompleteOrthogonalDecomposition<Mat> cod(A.rows(),A.cols());
+  FullPivHouseholderQR<Mat> fpqr(A.rows(),A.cols());
+  JacobiSVD<MatDyn> jsvd(A.rows(),A.cols());
+  BDCSVD<MatDyn> bdcsvd(A.rows(),A.cols());
+  
+  BENCH(t_llt, tries, rep, compute_norm_equation(llt,A));
+  BENCH(t_ldlt, tries, rep, compute_norm_equation(ldlt,A));
+  BENCH(t_lu, tries, rep, compute_norm_equation(lu,A));
+  if(size<=1000)
+    BENCH(t_fplu, tries, rep, compute_norm_equation(fplu,A));
+  BENCH(t_qr, tries, rep, compute(qr,A));
+  BENCH(t_cpqr, tries, rep, compute(cpqr,A));
+  BENCH(t_cod, tries, rep, compute(cod,A));
+  if(size*rows<=10000000)
+    BENCH(t_fpqr, tries, rep, compute(fpqr,A));
+  if(size<500) // JacobiSVD is really too slow for too large matrices
+    BENCH(t_jsvd, tries, rep, jsvd.compute(A,svd_opt));
+//   if(size*rows<=20000000)
+    BENCH(t_bdcsvd, tries, rep, bdcsvd.compute(A,svd_opt));
+  
+  results["LLT"][id] = t_llt.best();
+  results["LDLT"][id] = t_ldlt.best();
+  results["PartialPivLU"][id] = t_lu.best();
+  results["FullPivLU"][id] = t_fplu.best();
+  results["HouseholderQR"][id] = t_qr.best();
+  results["ColPivHouseholderQR"][id] = t_cpqr.best();
+  results["CompleteOrthogonalDecomposition"][id] = t_cod.best();
+  results["FullPivHouseholderQR"][id] = t_fpqr.best();
+  results["JacobiSVD"][id] = t_jsvd.best();
+  results["BDCSVD"][id] = t_bdcsvd.best();
+}
+
+
+int main()
+{
+  labels.push_back("LLT");
+  labels.push_back("LDLT");
+  labels.push_back("PartialPivLU");
+  labels.push_back("FullPivLU");
+  labels.push_back("HouseholderQR");
+  labels.push_back("ColPivHouseholderQR");
+  labels.push_back("CompleteOrthogonalDecomposition");
+  labels.push_back("FullPivHouseholderQR");
+  labels.push_back("JacobiSVD");
+  labels.push_back("BDCSVD");
+
+  for(int i=0; i<labels.size(); ++i)
+    results[labels[i]].fill(-1);
+
+  const int small = 8;
+  sizes.push_back(Array2i(small,small));
+  sizes.push_back(Array2i(100,100));
+  sizes.push_back(Array2i(1000,1000));
+  sizes.push_back(Array2i(4000,4000));
+  sizes.push_back(Array2i(10000,small));
+  sizes.push_back(Array2i(10000,100));
+  sizes.push_back(Array2i(10000,1000));
+  sizes.push_back(Array2i(10000,4000));
+
+  using namespace std;
+
+  for(int k=0; k<sizes.size(); ++k)
+  {
+    cout << sizes[k](0) << "x" << sizes[k](1) << "...\n";
+    bench<float,Dynamic>(k,sizes[k](0),sizes[k](1));
+  }
+
+  cout.width(32);
+  cout << "solver/size";
+  cout << "  ";
+  for(int k=0; k<sizes.size(); ++k)
+  {
+    std::stringstream ss;
+    ss << sizes[k](0) << "x" << sizes[k](1);
+    cout.width(10); cout << ss.str(); cout << " ";
+  }
+  cout << endl;
+
+
+  for(int i=0; i<labels.size(); ++i)
+  {
+    cout.width(32); cout << labels[i]; cout << "  ";
+    ArrayXf r = (results[labels[i]]*100000.f).floor()/100.f;
+    for(int k=0; k<sizes.size(); ++k)
+    {
+      cout.width(10);
+      if(r(k)>=1e6)  cout << "-";
+      else           cout << r(k);
+      cout << " ";
+    }
+    cout << endl;
+  }
+
+  // HTML output
+  cout << "<table class=\"manual\">" << endl;
+  cout << "<tr><th>solver/size</th>" << endl;
+  for(int k=0; k<sizes.size(); ++k)
+    cout << "  <th>" << sizes[k](0) << "x" << sizes[k](1) << "</th>";
+  cout << "</tr>" << endl;
+  for(int i=0; i<labels.size(); ++i)
+  {
+    cout << "<tr";
+    if(i%2==1) cout << " class=\"alt\"";
+    cout << "><td>" << labels[i] << "</td>";
+    ArrayXf r = (results[labels[i]]*100000.f).floor()/100.f;
+    for(int k=0; k<sizes.size(); ++k)
+    {
+      if(r(k)>=1e6) cout << "<td>-</td>";
+      else
+      {
+        cout << "<td>" << r(k);
+        if(i>0)
+          cout << " (x" << numext::round(10.f*results[labels[i]](k)/results["LLT"](k))/10.f << ")";
+        if(i<4 && sizes[k](0)!=sizes[k](1))
+          cout << " <sup><a href=\"#note_ls\">*</a></sup>";
+        cout << "</td>";
+      }
+    }
+    cout << "</tr>" << endl;
+  }
+  cout << "</table>" << endl;
+
+//   cout << "LLT                             (ms)  " << (results["LLT"]*1000.).format(fmt) << "\n";
+//   cout << "LDLT                             (%)  " << (results["LDLT"]/results["LLT"]).format(fmt) << "\n";
+//   cout << "PartialPivLU                     (%)  " << (results["PartialPivLU"]/results["LLT"]).format(fmt) << "\n";
+//   cout << "FullPivLU                        (%)  " << (results["FullPivLU"]/results["LLT"]).format(fmt) << "\n";
+//   cout << "HouseholderQR                    (%)  " << (results["HouseholderQR"]/results["LLT"]).format(fmt) << "\n";
+//   cout << "ColPivHouseholderQR              (%)  " << (results["ColPivHouseholderQR"]/results["LLT"]).format(fmt) << "\n";
+//   cout << "CompleteOrthogonalDecomposition  (%)  " << (results["CompleteOrthogonalDecomposition"]/results["LLT"]).format(fmt) << "\n";
+//   cout << "FullPivHouseholderQR             (%)  " << (results["FullPivHouseholderQR"]/results["LLT"]).format(fmt) << "\n";
+//   cout << "JacobiSVD                        (%)  " << (results["JacobiSVD"]/results["LLT"]).format(fmt) << "\n";
+//   cout << "BDCSVD                           (%)  " << (results["BDCSVD"]/results["LLT"]).format(fmt) << "\n";
+}
diff --git a/bench/eig33.cpp b/bench/eig33.cpp
index 1608b999d..47947a9be 100644
--- a/bench/eig33.cpp
+++ b/bench/eig33.cpp
@@ -50,7 +50,7 @@ inline void computeRoots(const Matrix& m, Roots& roots)
 {
   typedef typename Matrix::Scalar Scalar;
   const Scalar s_inv3 = 1.0/3.0;
-  const Scalar s_sqrt3 = internal::sqrt(Scalar(3.0));
+  const Scalar s_sqrt3 = std::sqrt(Scalar(3.0));
 
   // The characteristic equation is x^3 - c2*x^2 + c1*x - c0 = 0.  The
   // eigenvalues are the roots to this equation, all guaranteed to be
@@ -73,23 +73,13 @@ inline void computeRoots(const Matrix& m, Roots& roots)
     q = Scalar(0);
 
   // Compute the eigenvalues by solving for the roots of the polynomial.
-  Scalar rho = internal::sqrt(-a_over_3);
-  Scalar theta = std::atan2(internal::sqrt(-q),half_b)*s_inv3;
-  Scalar cos_theta = internal::cos(theta);
-  Scalar sin_theta = internal::sin(theta);
-  roots(0) = c2_over_3 + Scalar(2)*rho*cos_theta;
-  roots(1) = c2_over_3 - rho*(cos_theta + s_sqrt3*sin_theta);
-  roots(2) = c2_over_3 - rho*(cos_theta - s_sqrt3*sin_theta);
-
-  // Sort in increasing order.
-  if (roots(0) >= roots(1))
-    std::swap(roots(0),roots(1));
-  if (roots(1) >= roots(2))
-  {
-    std::swap(roots(1),roots(2));
-    if (roots(0) >= roots(1))
-      std::swap(roots(0),roots(1));
-  }
+  Scalar rho = std::sqrt(-a_over_3);
+  Scalar theta = std::atan2(std::sqrt(-q),half_b)*s_inv3;
+  Scalar cos_theta = std::cos(theta);
+  Scalar sin_theta = std::sin(theta);
+  roots(2) = c2_over_3 + Scalar(2)*rho*cos_theta;
+  roots(0) = c2_over_3 - rho*(cos_theta + s_sqrt3*sin_theta);
+  roots(1) = c2_over_3 - rho*(cos_theta - s_sqrt3*sin_theta);
 }
 
 template<typename Matrix, typename Vector>
@@ -99,9 +89,12 @@ void eigen33(const Matrix& mat, Matrix& evecs, Vector& evals)
   // Scale the matrix so its entries are in [-1,1].  The scaling is applied
   // only when at least one matrix entry has magnitude larger than 1.
 
-  Scalar scale = mat.cwiseAbs()/*.template triangularView<Lower>()*/.maxCoeff();
+  Scalar shift = mat.trace()/3;
+  Matrix scaledMat = mat;
+  scaledMat.diagonal().array() -= shift;
+  Scalar scale = scaledMat.cwiseAbs()/*.template triangularView<Lower>()*/.maxCoeff();
   scale = std::max(scale,Scalar(1));
-  Matrix scaledMat = mat / scale;
+  scaledMat/=scale;
 
   // Compute the eigenvalues
 //   scaledMat.setZero();
@@ -166,6 +159,7 @@ void eigen33(const Matrix& mat, Matrix& evecs, Vector& evals)
   
   // Rescale back to the original size.
   evals *= scale;
+  evals.array()+=shift;
 }
 
 int main()
@@ -173,24 +167,29 @@ int main()
   BenchTimer t;
   int tries = 10;
   int rep = 400000;
-  typedef Matrix3f Mat;
-  typedef Vector3f Vec;
+  typedef Matrix3d Mat;
+  typedef Vector3d Vec;
   Mat A = Mat::Random(3,3);
   A = A.adjoint() * A;
+//   Mat Q = A.householderQr().householderQ();
+//   A = Q * Vec(2.2424567,2.2424566,7.454353).asDiagonal() * Q.transpose();
 
   SelfAdjointEigenSolver<Mat> eig(A);
   BENCH(t, tries, rep, eig.compute(A));
-  std::cout << "Eigen:  " << t.best() << "s\n";
+  std::cout << "Eigen iterative:  " << t.best() << "s\n";
+  
+  BENCH(t, tries, rep, eig.computeDirect(A));
+  std::cout << "Eigen direct   :  " << t.best() << "s\n";
 
   Mat evecs;
   Vec evals;
   BENCH(t, tries, rep, eigen33(A,evecs,evals));
   std::cout << "Direct: " << t.best() << "s\n\n";
 
-  std::cerr << "Eigenvalue/eigenvector diffs:\n";
-  std::cerr << (evals - eig.eigenvalues()).transpose() << "\n";
-  for(int k=0;k<3;++k)
-    if(evecs.col(k).dot(eig.eigenvectors().col(k))<0)
-      evecs.col(k) = -evecs.col(k);
-  std::cerr << evecs - eig.eigenvectors() << "\n\n";
+//   std::cerr << "Eigenvalue/eigenvector diffs:\n";
+//   std::cerr << (evals - eig.eigenvalues()).transpose() << "\n";
+//   for(int k=0;k<3;++k)
+//     if(evecs.col(k).dot(eig.eigenvectors().col(k))<0)
+//       evecs.col(k) = -evecs.col(k);
+//   std::cerr << evecs - eig.eigenvectors() << "\n\n";
 }
diff --git a/bench/perf_monitoring/gemm/changesets.txt b/bench/perf_monitoring/gemm/changesets.txt
new file mode 100644
index 000000000..af8eb9b8f
--- /dev/null
+++ b/bench/perf_monitoring/gemm/changesets.txt
@@ -0,0 +1,61 @@
+#3.0.1
+#3.1.1
+#3.2.0
+3.2.4
+#5745:37f59e65eb6c
+5891:d8652709345d  # introduce AVX
+#5893:24b4dc92c6d3  # merge
+5895:997c2ef9fc8b  # introduce FMA
+#5904:e1eafd14eaa1  # complex and AVX
+5908:f8ee3c721251  # improve packing with ptranspose
+#5921:ca808bb456b0  # merge
+#5927:8b1001f9e3ac
+5937:5a4ca1ad8c53  # New gebp kernel handling up to 3 packets x 4 register-level blocks
+#5949:f3488f4e45b2  # merge
+#5969:e09031dccfd9  # Disable 3pX4 kernel on Altivec
+#5992:4a429f5e0483  # merge
+before-evaluators
+#6334:f6a45e5b8b7c  # Implement evaluator for sparse outer products
+#6639:c9121c60b5c7
+#6655:06f163b5221f  # Properly detect FMA support on ARM
+#6677:700e023044e7   # FMA has been wrongly disabled
+#6681:11d31dafb0e3
+#6699:5e6e8e10aad1   # merge default to tensors
+#6726:ff2d2388e7b9   # merge default to tensors
+#6742:0cbd6195e829   # merge default to tensors
+#6747:853d2bafeb8f   # Generalized the gebp apis
+6765:71584fd55762   # Made the blocking computation aware of the l3 cache; Also optimized the blocking parameters to take into account the number of threads used for a computation
+#6781:9cc5a931b2c6   # generalized gemv
+#6792:f6e1daab600a   # ensured that contractions that can be reduced to a matrix vector product
+#6844:039efd86b75c   # merge tensor
+6845:7333ed40c6ef   # change prefetching in gebp
+#6856:b5be5e10eb7f   # merge index conversion
+#6893:c3a64aba7c70   # clean blocking size computation
+#6898:6fb31ebe6492   # rotating kernel for ARM
+6899:877facace746   # rotating kernel for ARM only
+#6904:c250623ae9fa   # result_of
+6921:915f1b1fc158   # fix prefetching change for ARM
+6923:9ff25f6dacc6   # prefetching
+6933:52572e60b5d3   # blocking size strategy
+6937:c8c042f286b2   # avoid redundant pack_rhs
+6981:7e5d6f78da59   # dynamic loop swapping
+6984:45f26866c091   # rm dynamic loop swapping, adjust lhs's micro panel height to fully exploit L1 cache
+6986:a675d05b6f8f   # blocking heuristic: block on the rhs in L1 if the lhs fit in L1.
+7013:f875e75f07e5   # organize a little our default cache sizes, and use a saner default L1 outside of x86 (10% faster on Nexus 5)
+7015:8aad8f35c955   # Refactor computeProductBlockingSizes to make room for the possibility of using lookup tables
+7016:a58d253e8c91   # Polish lookup tables generation
+7018:9b27294a8186   # actual_panel_rows computation should always be resilient to parameters not consistent with the known L1 cache size, see comment
+7019:c758b1e2c073   # Provide a empirical lookup table for blocking sizes measured on a Nexus 5. Only for float, only for Android on ARM 32bit for now.
+7085:627e039fba68   # Bug 986: add support for coefficient-based product with 0 depth.
+7098:b6f1db9cf9ec   # Bug 992: don't select a 3p GEMM path with non-vectorizable scalar types, this hits unsupported paths in symm/triangular products code
+7591:09a8e2186610   # 3.3-alpha1
+7650:b0f3c8f43025   # help clang inlining
+#8744:74b789ada92a   # Improved the matrix multiplication blocking in the case where mr is not a power of 2 (e.g on Haswell CPUs)
+8789:efcb912e4356   # Made the index type a template parameter to evaluateProductBlockingSizes. Use numext::mini and numext::maxi instead of std::min/std::max to compute blocking sizes
+8972:81d53c711775   # Don't optimize the processing of the last rows of a matrix matrix product in cases that violate the assumptions made by the optimized code path
+8985:d935df21a082   # Remove the rotating kernel.
+8988:6c2dc56e73b3   # Bug 256: enable vectorization with unaligned loads/stores.
+9148:b8b8c421e36c   # Relax mixing-type constraints for binary coefficient-wise operators
+9174:d228bc282ac9   # merge
+9212:c90098affa7b   # Fix performance regression introduced in changeset 8aad8f35c955
+9213:9f1c14e4694b   # Fix performance regression in dgemm introduced by changeset 81d53c711775
diff --git a/bench/perf_monitoring/gemm/gemm.cpp b/bench/perf_monitoring/gemm/gemm.cpp
new file mode 100644
index 000000000..614bd4737
--- /dev/null
+++ b/bench/perf_monitoring/gemm/gemm.cpp
@@ -0,0 +1,67 @@
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <Eigen/Core>
+#include "../../BenchTimer.h"
+using namespace Eigen;
+
+#ifndef SCALAR
+#error SCALAR must be defined
+#endif
+
+typedef SCALAR Scalar;
+
+typedef Matrix<Scalar,Dynamic,Dynamic> Mat;
+
+EIGEN_DONT_INLINE
+void gemm(const Mat &A, const Mat &B, Mat &C)
+{
+  C.noalias() += A * B;
+}
+
+EIGEN_DONT_INLINE
+double bench(long m, long n, long k)
+{
+  Mat A(m,k);
+  Mat B(k,n);
+  Mat C(m,n);
+  A.setRandom();
+  B.setRandom();
+  C.setZero();
+  
+  BenchTimer t;
+  
+  double up = 1e8*4/sizeof(Scalar);
+  double tm0 = 4, tm1 = 10;
+  if(NumTraits<Scalar>::IsComplex)
+  {
+    up /= 4;
+    tm0 = 2;
+    tm1 = 4;
+  }
+  
+  double flops = 2. * m * n * k;
+  long rep = std::max(1., std::min(100., up/flops) );
+  long tries = std::max(tm0, std::min(tm1, up/flops) );
+  
+  BENCH(t, tries, rep, gemm(A,B,C));
+  
+  return 1e-9 * rep * flops / t.best();
+}
+
+int main(int argc, char **argv)
+{
+  std::vector<double> results;
+  
+  std::ifstream settings("gemm_settings.txt");
+  long m, n, k;
+  while(settings >> m >> n >> k)
+  {
+    //std::cerr << "  Testing " << m << " " << n << " " << k << std::endl;
+    results.push_back( bench(m, n, k) );
+  }
+  
+  std::cout << RowVectorXd::Map(results.data(), results.size());
+  
+  return 0;
+}
diff --git a/bench/perf_monitoring/gemm/gemm_settings.txt b/bench/perf_monitoring/gemm/gemm_settings.txt
new file mode 100644
index 000000000..5c43e1c7d
--- /dev/null
+++ b/bench/perf_monitoring/gemm/gemm_settings.txt
@@ -0,0 +1,15 @@
+8 8 8
+9 9 9
+24 24 24
+239 239 239
+240 240 240
+2400 24 24
+24 2400 24
+24 24 2400
+24 2400 2400
+2400 24 2400
+2400 2400 24
+2400 2400 64
+4800 23 160
+23 4800 160
+2400 2400 2400
diff --git a/bench/perf_monitoring/gemm/lazy_gemm.cpp b/bench/perf_monitoring/gemm/lazy_gemm.cpp
new file mode 100644
index 000000000..6dc370155
--- /dev/null
+++ b/bench/perf_monitoring/gemm/lazy_gemm.cpp
@@ -0,0 +1,98 @@
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <Eigen/Core>
+#include "../../BenchTimer.h"
+using namespace Eigen;
+
+#ifndef SCALAR
+#error SCALAR must be defined
+#endif
+
+typedef SCALAR Scalar;
+
+template<typename MatA, typename MatB, typename MatC>
+EIGEN_DONT_INLINE
+void lazy_gemm(const MatA &A, const MatB &B, MatC &C)
+{
+//   escape((void*)A.data());
+//   escape((void*)B.data());
+  C.noalias() += A.lazyProduct(B);
+//   escape((void*)C.data());
+}
+
+template<int m, int n, int k, int TA>
+EIGEN_DONT_INLINE
+double bench()
+{
+  typedef Matrix<Scalar,m,k,TA> MatA;
+  typedef Matrix<Scalar,k,n> MatB;
+  typedef Matrix<Scalar,m,n> MatC;
+
+  MatA A(m,k);
+  MatB B(k,n);
+  MatC C(m,n);
+  A.setRandom();
+  B.setRandom();
+  C.setZero();
+
+  BenchTimer t;
+
+  double up = 1e7*4/sizeof(Scalar);
+  double tm0 = 10, tm1 = 20;
+
+  double flops = 2. * m * n * k;
+  long rep = std::max(10., std::min(10000., up/flops) );
+  long tries = std::max(tm0, std::min(tm1, up/flops) );
+
+  BENCH(t, tries, rep, lazy_gemm(A,B,C));
+
+  return 1e-9 * rep * flops / t.best();
+}
+
+template<int m, int n, int k>
+double bench_t(int t)
+{
+  if(t)
+    return bench<m,n,k,RowMajor>();
+  else
+    return bench<m,n,k,0>();
+}
+
+EIGEN_DONT_INLINE
+double bench_mnk(int m, int n, int k, int t)
+{
+  int id = m*10000 + n*100 + k;
+  switch(id) {
+    case  10101 : return bench_t< 1, 1, 1>(t); break;
+    case  20202 : return bench_t< 2, 2, 2>(t); break;
+    case  30303 : return bench_t< 3, 3, 3>(t); break;
+    case  40404 : return bench_t< 4, 4, 4>(t); break;
+    case  50505 : return bench_t< 5, 5, 5>(t); break;
+    case  60606 : return bench_t< 6, 6, 6>(t); break;
+    case  70707 : return bench_t< 7, 7, 7>(t); break;
+    case  80808 : return bench_t< 8, 8, 8>(t); break;
+    case  90909 : return bench_t< 9, 9, 9>(t); break;
+    case 101010 : return bench_t<10,10,10>(t); break;
+    case 111111 : return bench_t<11,11,11>(t); break;
+    case 121212 : return bench_t<12,12,12>(t); break;
+  }
+  return 0;
+}
+
+int main(int argc, char **argv)
+{
+  std::vector<double> results;
+  
+  std::ifstream settings("lazy_gemm_settings.txt");
+  long m, n, k, t;
+  while(settings >> m >> n >> k >> t)
+  {
+    //std::cerr << "  Testing " << m << " " << n << " " << k << std::endl;
+    results.push_back( bench_mnk(m, n, k, t) );
+  }
+  
+  std::cout << RowVectorXd::Map(results.data(), results.size());
+  
+  return 0;
+}
diff --git a/bench/perf_monitoring/gemm/lazy_gemm_settings.txt b/bench/perf_monitoring/gemm/lazy_gemm_settings.txt
new file mode 100644
index 000000000..407d5d4fa
--- /dev/null
+++ b/bench/perf_monitoring/gemm/lazy_gemm_settings.txt
@@ -0,0 +1,15 @@
+1 1 1 0
+2 2 2 0
+3 3 3 0
+4 4 4 0
+4 4 4 1
+5 5 5 0
+6 6 6 0
+7 7 7 0
+7 7 7 1
+8 8 8 0
+9 9 9 0
+10 10 10 0
+11 11 11 0
+12 12 12 0
+12 12 12 1
diff --git a/bench/perf_monitoring/gemm/make_plot.sh b/bench/perf_monitoring/gemm/make_plot.sh
new file mode 100755
index 000000000..cd3214ac9
--- /dev/null
+++ b/bench/perf_monitoring/gemm/make_plot.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+# base name of the bench
+# it reads $1.out
+# and generates $1.pdf
+WHAT=$1
+bench=$2
+
+header="rev "
+while read line
+do
+  if [ ! -z '$line' ]; then
+    header="$header  \"$line\""
+  fi
+done < $bench"_settings.txt"
+
+echo $header > $WHAT.out.header
+cat $WHAT.out >> $WHAT.out.header
+
+
+echo "set title '$WHAT'" > $WHAT.gnuplot
+echo "set key autotitle columnhead outside " >> $WHAT.gnuplot
+echo "set xtics rotate 1" >> $WHAT.gnuplot
+
+echo "set term pdf color rounded enhanced fontscale 0.35 size 7in,5in" >> $WHAT.gnuplot
+echo set output "'"$WHAT.pdf"'" >> $WHAT.gnuplot
+
+col=`cat $bench"_settings.txt" | wc -l`
+echo "plot for [col=2:$col+1] '$WHAT.out.header' using 0:col:xticlabels(1) with lines" >> $WHAT.gnuplot
+echo " " >>  $WHAT.gnuplot
+
+gnuplot -persist < $WHAT.gnuplot
+
+# generate a png file
+# convert -background white -density 120 -rotate 90 -resize 800 +dither -colors 256 -quality 0 $WHAT.ps -background white -flatten  .$WHAT.png
+
+# clean
+rm $WHAT.out.header $WHAT.gnuplot
+\ No newline at end of file
diff --git a/bench/perf_monitoring/gemm/run.sh b/bench/perf_monitoring/gemm/run.sh
new file mode 100755
index 000000000..9d6ee40bc
--- /dev/null
+++ b/bench/perf_monitoring/gemm/run.sh
@@ -0,0 +1,156 @@
+#!/bin/bash
+
+# ./run.sh gemm
+# ./run.sh lazy_gemm
+
+# Examples of environment variables to be set:
+#   PREFIX="haswell-fma-"
+#   CXX_FLAGS="-mfma"
+
+# Options:
+#   -up : enforce the recomputation of existing data, and keep best results as a merging strategy
+#   -s  : recompute selected changesets only and keep bests
+
+bench=$1
+
+if echo "$*" | grep '\-up' > /dev/null; then
+  update=true
+else
+  update=false
+fi
+
+if echo "$*" | grep '\-s' > /dev/null; then
+  selected=true
+else
+  selected=false
+fi
+
+global_args="$*"
+
+if [ $selected == true ]; then
+ echo "Recompute selected changesets only and keep bests"
+elif [ $update == true ]; then
+ echo "(Re-)Compute all changesets and keep bests"
+else
+ echo "Skip previously computed changesets"
+fi
+
+
+
+if [ ! -d "eigen_src" ]; then
+  hg clone https://bitbucket.org/eigen/eigen eigen_src
+else
+  cd eigen_src
+  hg pull -u
+  cd ..
+fi
+
+if [ ! -z '$CXX' ]; then
+  CXX=g++
+fi
+
+function make_backup
+{
+  if [ -f "$1.out" ]; then
+    mv "$1.out" "$1.backup"
+  fi
+}
+
+function merge
+{
+  count1=`echo $1 |  wc -w`
+  count2=`echo $2 |  wc -w`
+  
+  if [ $count1 == $count2 ]; then
+    a=( $1 ); b=( $2 )
+    res=""
+    for (( i=0 ; i<$count1 ; i++ )); do
+      ai=${a[$i]}; bi=${b[$i]}
+      tmp=`echo "if ($ai > $bi) $ai else $bi " | bc -l`
+      res="$res $tmp"
+    done
+    echo $res
+
+  else
+    echo $1
+  fi
+}
+
+function test_current 
+{
+  rev=$1
+  scalar=$2
+  name=$3
+  
+  prev=""
+  if [ -e "$name.backup" ]; then
+    prev=`grep $rev "$name.backup" | cut -c 14-`
+  fi
+  res=$prev
+  count_rev=`echo $prev |  wc -w`
+  count_ref=`cat $bench"_settings.txt" |  wc -l`
+  if echo "$global_args" | grep "$rev" > /dev/null; then
+    rev_found=true
+  else
+    rev_found=false
+  fi
+#  echo $update et $selected et $rev_found because $rev et "$global_args"
+#  echo $count_rev et $count_ref
+  if [ $update == true ] || [ $count_rev != $count_ref ] || ([ $selected == true ] &&  [ $rev_found == true ]); then
+    if $CXX -O2 -DNDEBUG -march=native $CXX_FLAGS -I eigen_src $bench.cpp -DSCALAR=$scalar -o $name; then
+      curr=`./$name`
+      if [ $count_rev == $count_ref ]; then
+        echo "merge previous $prev"
+        echo "with new       $curr"
+      else
+        echo "got            $curr"
+      fi
+      res=`merge "$curr" "$prev"`
+#       echo $res
+      echo "$rev $res" >> $name.out
+    else
+      echo "Compilation failed, skip rev $rev"
+    fi
+  else
+    echo "Skip existing results for $rev / $name"
+    echo "$rev $res" >> $name.out
+  fi
+}
+
+make_backup $PREFIX"s"$bench
+make_backup $PREFIX"d"$bench
+make_backup $PREFIX"c"$bench
+
+cut -f1 -d"#" < changesets.txt | grep -E '[[:alnum:]]' | while read rev
+do
+  if [ ! -z '$rev' ]; then
+    echo "Testing rev $rev"
+    cd eigen_src
+    hg up -C $rev > /dev/null
+    actual_rev=`hg identify | cut -f1 -d' '`
+    cd ..
+    
+    test_current $actual_rev float                  $PREFIX"s"$bench
+    test_current $actual_rev double                 $PREFIX"d"$bench
+    test_current $actual_rev "std::complex<double>" $PREFIX"c"$bench
+  fi
+  
+done
+
+echo "Float:"
+cat $PREFIX"s""$bench.out"
+echo " "
+
+echo "Double:"
+cat $PREFIX"d""$bench.out"
+echo ""
+
+echo "Complex:"
+cat $PREFIX"c""$bench.out"
+echo ""
+
+./make_plot.sh $PREFIX"s"$bench $bench
+./make_plot.sh $PREFIX"d"$bench $bench
+./make_plot.sh $PREFIX"c"$bench $bench
+
+
diff --git a/bench/spbench/CMakeLists.txt b/bench/spbench/CMakeLists.txt
index 6e0e1b103..8d53f4ae2 100644
--- a/bench/spbench/CMakeLists.txt
+++ b/bench/spbench/CMakeLists.txt
@@ -29,7 +29,7 @@ if(UMFPACK_FOUND AND BLAS_FOUND)
   set(UMFPACK_ALL_LIBS ${UMFPACK_LIBRARIES} ${BLAS_LIBRARIES})
 endif()
 
-find_package(SuperLU)
+find_package(SuperLU 4.0)
 if(SUPERLU_FOUND AND BLAS_FOUND)
   add_definitions("-DEIGEN_SUPERLU_SUPPORT")
   include_directories(${SUPERLU_INCLUDES})
diff --git a/bench/spbench/spbenchstyle.h b/bench/spbench/spbenchstyle.h
index 17a05ce71..f6a981778 100644
--- a/bench/spbench/spbenchstyle.h
+++ b/bench/spbench/spbenchstyle.h
@@ -91,4 +91,5 @@ void printBenchStyle(std::ofstream& out)
   </xsl:stylesheet>\n\n";
   
 }
-#endif
-\ No newline at end of file
+
+#endif
diff --git a/bench/tensors/README b/bench/tensors/README
new file mode 100644
index 000000000..3a5fdbe17
--- /dev/null
+++ b/bench/tensors/README
@@ -0,0 +1,21 @@
+The tensor benchmark suite is made of several parts.
+
+The first part is a generic suite, in which each benchmark comes in 2 flavors: one that runs on CPU, and one that runs on GPU.
+
+To compile the floating point CPU benchmarks, simply call:
+g++ tensor_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu
+
+To compile the floating point GPU benchmarks, simply call:
+nvcc tensor_benchmarks_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBUG -use_fast_math -ftz=true -arch compute_35 -o benchmarks_gpu
+
+We also provide a version of the generic GPU tensor benchmarks that uses half floats (aka fp16) instead of regular floats. To compile these benchmarks, simply call the command line below. You'll need a recent GPU that supports compute capability 5.3 or higher to run them and nvcc 7.5 or higher to compile the code.
+nvcc tensor_benchmarks_fp16_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBUG -use_fast_math -ftz=true -arch compute_53 -o benchmarks_fp16_gpu
+
+last but not least, we also provide a suite of benchmarks to measure the scalability of the contraction code on CPU. To compile these benchmarks, call
+g++ contraction_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu
+
+To compile the benchmark for SYCL, using ComputeCpp you currently need 2 passes (only for translation units containing device code):
+1. The device compilation pass that generates the device code (SYCL kernels and referenced device functions) and glue code needed by the host compiler to reference the device code from host code.
+{ComputeCpp_ROOT}/bin/compute++ -I ../../ -I {ComputeCpp_ROOT}/include/ -std=c++11 -mllvm -inline-threshold=1000 -Wno-ignored-attributes -sycl -intelspirmetadata -emit-llvm -no-serial-memop -sycl-compress-name -DBUILD_PLATFORM_SPIR -DNDBUG -O3 -c tensor_benchmarks_sycl.cc
+2. The host compilation pass that generates the final host binary.
+clang++-3.7 -include tensor_benchmarks_sycl.sycl benchmark_main.cc tensor_benchmarks_sycl.cc -pthread -I ../../ -I {ComputeCpp_ROOT}/include/ -L {ComputeCpp_ROOT}/lib/ -lComputeCpp -lOpenCL -D_GLIBCXX_USE_CXX11_ABI=0 -std=c++11 -o tensor_benchmark_sycl
diff --git a/bench/tensors/benchmark.h b/bench/tensors/benchmark.h
new file mode 100644
index 000000000..f115b54ad
--- /dev/null
+++ b/bench/tensors/benchmark.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <stddef.h>
+#include <stdint.h>
+#include <vector>
+
+namespace testing {
+class Benchmark {
+ public:
+  Benchmark(const char* name, void (*fn)(int)) {
+    Register(name, fn, NULL);
+  }
+  Benchmark(const char* name, void (*fn_range)(int, int)) {
+    Register(name, NULL, fn_range);
+  }
+  Benchmark* Arg(int x);
+  Benchmark* Range(int lo, int hi);
+  const char* Name();
+  bool ShouldRun(int argc, char* argv[]);
+  void Run();
+ private:
+  const char* name_;
+  void (*fn_)(int);
+  void (*fn_range_)(int, int);
+  std::vector<int> args_;
+  void Register(const char* name, void (*fn)(int), void (*fn_range)(int, int));
+  void RunRepeatedlyWithArg(int iterations, int arg);
+  void RunWithArg(int arg);
+};
+}  // namespace testing
+void SetBenchmarkFlopsProcessed(int64_t);
+void StopBenchmarkTiming();
+void StartBenchmarkTiming();
+#define BENCHMARK(f) \
+    static ::testing::Benchmark* _benchmark_##f __attribute__((unused)) = \
+        (new ::testing::Benchmark(#f, f))
diff --git a/bench/tensors/benchmark_main.cc b/bench/tensors/benchmark_main.cc
new file mode 100644
index 000000000..1efa0dbad
--- /dev/null
+++ b/bench/tensors/benchmark_main.cc
@@ -0,0 +1,237 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "benchmark.h"
+#include <regex.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <string>
+#include <inttypes.h>
+#include <time.h>
+#include <map>
+
+static int64_t g_flops_processed;
+static int64_t g_benchmark_total_time_ns;
+static int64_t g_benchmark_start_time_ns;
+typedef std::map<std::string, ::testing::Benchmark*> BenchmarkMap;
+typedef BenchmarkMap::iterator BenchmarkMapIt;
+
+BenchmarkMap& gBenchmarks() {
+  static BenchmarkMap g_benchmarks;
+  return g_benchmarks;
+}
+
+static int g_name_column_width = 20;
+
+static int Round(int n) {
+  int base = 1;
+  while (base*10 < n) {
+    base *= 10;
+  }
+  if (n < 2*base) {
+    return 2*base;
+  }
+  if (n < 5*base) {
+    return 5*base;
+  }
+  return 10*base;
+}
+
+#ifdef __APPLE__
+  #include <mach/mach_time.h>
+  static mach_timebase_info_data_t g_time_info;
+  static void __attribute__((constructor)) init_info() {
+    mach_timebase_info(&g_time_info);
+  }
+#endif
+
+static int64_t NanoTime() {
+#if defined(__APPLE__)
+  uint64_t t = mach_absolute_time();
+  return t * g_time_info.numer / g_time_info.denom;
+#else
+  struct timespec t;
+  t.tv_sec = t.tv_nsec = 0;
+  clock_gettime(CLOCK_MONOTONIC, &t);
+  return static_cast<int64_t>(t.tv_sec) * 1000000000LL + t.tv_nsec;
+#endif
+}
+
+namespace testing {
+Benchmark* Benchmark::Arg(int arg) {
+  args_.push_back(arg);
+  return this;
+}
+
+Benchmark* Benchmark::Range(int lo, int hi) {
+  const int kRangeMultiplier = 8;
+  if (hi < lo) {
+    int temp = hi;
+    hi = lo;
+    lo = temp;
+  }
+  while (lo < hi) {
+    args_.push_back(lo);
+    lo *= kRangeMultiplier;
+  }
+  // We always run the hi number.
+  args_.push_back(hi);
+  return this;
+}
+
+const char* Benchmark::Name() {
+  return name_;
+}
+bool Benchmark::ShouldRun(int argc, char* argv[]) {
+  if (argc == 1) {
+    return true;  // With no arguments, we run all benchmarks.
+  }
+  // Otherwise, we interpret each argument as a regular expression and
+  // see if any of our benchmarks match.
+  for (int i = 1; i < argc; i++) {
+    regex_t re;
+    if (regcomp(&re, argv[i], 0) != 0) {
+      fprintf(stderr, "couldn't compile \"%s\" as a regular expression!\n", argv[i]);
+      exit(EXIT_FAILURE);
+    }
+    int match = regexec(&re, name_, 0, NULL, 0);
+    regfree(&re);
+    if (match != REG_NOMATCH) {
+      return true;
+    }
+  }
+  return false;
+}
+void Benchmark::Register(const char* name, void (*fn)(int), void (*fn_range)(int, int)) {
+  name_ = name;
+  fn_ = fn;
+  fn_range_ = fn_range;
+  if (fn_ == NULL && fn_range_ == NULL) {
+    fprintf(stderr, "%s: missing function\n", name_);
+    exit(EXIT_FAILURE);
+  }
+  gBenchmarks().insert(std::make_pair(name, this));
+}
+void Benchmark::Run() {
+  if (fn_ != NULL) {
+    RunWithArg(0);
+  } else {
+    if (args_.empty()) {
+      fprintf(stderr, "%s: no args!\n", name_);
+      exit(EXIT_FAILURE);
+    }
+    for (size_t i = 0; i < args_.size(); ++i) {
+      RunWithArg(args_[i]);
+    }
+  }
+}
+void Benchmark::RunRepeatedlyWithArg(int iterations, int arg) {
+  g_flops_processed = 0;
+  g_benchmark_total_time_ns = 0;
+  g_benchmark_start_time_ns = NanoTime();
+  if (fn_ != NULL) {
+    fn_(iterations);
+  } else {
+    fn_range_(iterations, arg);
+  }
+  if (g_benchmark_start_time_ns != 0) {
+    g_benchmark_total_time_ns += NanoTime() - g_benchmark_start_time_ns;
+  }
+}
+void Benchmark::RunWithArg(int arg) {
+  // run once in case it's expensive
+  int iterations = 1;
+  RunRepeatedlyWithArg(iterations, arg);
+  while (g_benchmark_total_time_ns < 1e9 && iterations < 1e9) {
+    int last = iterations;
+    if (g_benchmark_total_time_ns/iterations == 0) {
+      iterations = 1e9;
+    } else {
+      iterations = 1e9 / (g_benchmark_total_time_ns/iterations);
+    }
+    iterations = std::max(last + 1, std::min(iterations + iterations/2, 100*last));
+    iterations = Round(iterations);
+    RunRepeatedlyWithArg(iterations, arg);
+  }
+  char throughput[100];
+  throughput[0] = '\0';
+  if (g_benchmark_total_time_ns > 0 && g_flops_processed > 0) {
+    double mflops_processed = static_cast<double>(g_flops_processed)/1e6;
+    double seconds = static_cast<double>(g_benchmark_total_time_ns)/1e9;
+    snprintf(throughput, sizeof(throughput), " %8.2f MFlops/s", mflops_processed/seconds);
+  }
+  char full_name[100];
+  if (fn_range_ != NULL) {
+    if (arg >= (1<<20)) {
+      snprintf(full_name, sizeof(full_name), "%s/%dM", name_, arg/(1<<20));
+    } else if (arg >= (1<<10)) {
+      snprintf(full_name, sizeof(full_name), "%s/%dK", name_, arg/(1<<10));
+    } else {
+      snprintf(full_name, sizeof(full_name), "%s/%d", name_, arg);
+    }
+  } else {
+    snprintf(full_name, sizeof(full_name), "%s", name_);
+  }
+  printf("%-*s %10d %10" PRId64 "%s\n", g_name_column_width, full_name,
+         iterations, g_benchmark_total_time_ns/iterations, throughput);
+  fflush(stdout);
+}
+}  // namespace testing
+void SetBenchmarkFlopsProcessed(int64_t x) {
+  g_flops_processed = x;
+}
+void StopBenchmarkTiming() {
+  if (g_benchmark_start_time_ns != 0) {
+    g_benchmark_total_time_ns += NanoTime() - g_benchmark_start_time_ns;
+  }
+  g_benchmark_start_time_ns = 0;
+}
+void StartBenchmarkTiming() {
+  if (g_benchmark_start_time_ns == 0) {
+    g_benchmark_start_time_ns = NanoTime();
+  }
+}
+int main(int argc, char* argv[]) {
+  if (gBenchmarks().empty()) {
+    fprintf(stderr, "No benchmarks registered!\n");
+    exit(EXIT_FAILURE);
+  }
+  for (BenchmarkMapIt it = gBenchmarks().begin(); it != gBenchmarks().end(); ++it) {
+    int name_width = static_cast<int>(strlen(it->second->Name()));
+    g_name_column_width = std::max(g_name_column_width, name_width);
+  }
+  bool need_header = true;
+  for (BenchmarkMapIt it = gBenchmarks().begin(); it != gBenchmarks().end(); ++it) {
+    ::testing::Benchmark* b = it->second;
+    if (b->ShouldRun(argc, argv)) {
+      if (need_header) {
+        printf("%-*s %10s %10s\n", g_name_column_width, "", "iterations", "ns/op");
+        fflush(stdout);
+        need_header = false;
+      }
+      b->Run();
+    }
+  }
+  if (need_header) {
+    fprintf(stderr, "No matching benchmarks!\n");
+    fprintf(stderr, "Available benchmarks:\n");
+    for (BenchmarkMapIt it = gBenchmarks().begin(); it != gBenchmarks().end(); ++it) {
+      fprintf(stderr, "  %s\n", it->second->Name());
+    }
+    exit(EXIT_FAILURE);
+  }
+  return 0;
+}
diff --git a/bench/tensors/contraction_benchmarks_cpu.cc b/bench/tensors/contraction_benchmarks_cpu.cc
new file mode 100644
index 000000000..f9e57ad47
--- /dev/null
+++ b/bench/tensors/contraction_benchmarks_cpu.cc
@@ -0,0 +1,39 @@
+#define EIGEN_USE_THREADS
+
+#include <string>
+
+#include "tensor_benchmarks.h"
+
+#define CREATE_THREAD_POOL(threads)             \
+Eigen::ThreadPool pool(threads);                \
+Eigen::ThreadPoolDevice device(&pool, threads);
+
+
+// Contractions for number of threads ranging from 1 to 32
+// Dimensions are Rows, Cols, Depth
+#define BM_ContractionCPU(D1, D2, D3)                                         \
+  static void BM_##Contraction##_##D1##x##D2##x##D3(int iters, int Threads) { \
+    StopBenchmarkTiming();                                                    \
+    CREATE_THREAD_POOL(Threads);                                              \
+    BenchmarkSuite<Eigen::ThreadPoolDevice, float> suite(device, D1, D2, D3); \
+    suite.contraction(iters);                                                 \
+  }                                                                           \
+  BENCHMARK_RANGE(BM_##Contraction##_##D1##x##D2##x##D3, 1, 32);
+
+
+// Vector Matrix and Matrix Vector products
+BM_ContractionCPU(1, 2000, 500);
+BM_ContractionCPU(2000, 1, 500);
+
+// Various skinny matrices
+BM_ContractionCPU(250, 3, 512);
+BM_ContractionCPU(1500, 3, 512);
+
+BM_ContractionCPU(512, 800, 4);
+BM_ContractionCPU(512, 80, 800);
+BM_ContractionCPU(512, 80, 13522);
+BM_ContractionCPU(1, 80, 13522);
+
+BM_ContractionCPU(3200, 512, 4);
+BM_ContractionCPU(3200, 512, 80);
+BM_ContractionCPU(3200, 80, 512);
diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h
new file mode 100644
index 000000000..c2fb3dede
--- /dev/null
+++ b/bench/tensors/tensor_benchmarks.h
@@ -0,0 +1,478 @@
+#ifndef THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
+#define THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
+
+typedef int TensorIndex;
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+
+#include "unsupported/Eigen/CXX11/Tensor"
+#include "benchmark.h"
+
+#define BENCHMARK_RANGE(bench, lo, hi) \
+  BENCHMARK(bench)->Range(lo, hi)
+
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+// TODO(bsteiner): also templatize on the input type since we have users
+// for int8 as well as floats.
+template <typename Device, typename T> class BenchmarkSuite {
+ public:
+  BenchmarkSuite(const Device& device, size_t m, size_t k, size_t n)
+      : m_(m), k_(k), n_(n), device_(device) {
+    initialize();
+  }
+
+  BenchmarkSuite(const Device& device, size_t m)
+      : m_(m), k_(m), n_(m), device_(device) {
+    initialize();
+  }
+
+  ~BenchmarkSuite() {
+    device_.deallocate(a_);
+    device_.deallocate(b_);
+    device_.deallocate(c_);
+  }
+
+  void memcpy(int num_iters) {
+    eigen_assert(m_ == k_ && k_ == n_);
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      device_.memcpy(c_, a_, m_ * m_ * sizeof(T));
+    }
+    // Record the number of values copied per second
+    finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
+  }
+
+  void typeCasting(int num_iters) {
+    eigen_assert(m_ == n_);
+    Eigen::array<TensorIndex, 2> sizes;
+    if (sizeof(T) >= sizeof(int)) {
+      sizes[0] = m_;
+      sizes[1] = k_;
+    } else {
+      sizes[0] = m_ * sizeof(T) / sizeof(int);
+      sizes[1] = k_ * sizeof(T) / sizeof(int);
+    }
+    const TensorMap<Tensor<int, 2, 0, TensorIndex>, Eigen::Aligned> A((int*)a_, sizes);
+    TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, sizes);
+
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      B.device(device_) = A.template cast<T>();
+    }
+    // Record the number of values copied per second
+    finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
+  }
+
+  void random(int num_iters) {
+    eigen_assert(m_ == k_ && k_ == n_);
+    Eigen::array<TensorIndex, 2> sizes;
+    sizes[0] = m_;
+    sizes[1] = m_;
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
+
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      C.device(device_) = C.random();
+    }
+    // Record the number of random numbers generated per second
+    finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
+  }
+
+  void slicing(int num_iters) {
+    eigen_assert(m_ == k_ && k_ == n_);
+    Eigen::array<TensorIndex, 2> sizes;
+    sizes[0] = m_;
+    sizes[1] = m_;
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
+
+    const Eigen::DSizes<TensorIndex, 2> quarter_sizes(m_/2, m_/2);
+    const Eigen::DSizes<TensorIndex, 2> first_quadrant(0, 0);
+    const Eigen::DSizes<TensorIndex, 2> second_quadrant(0, m_/2);
+    const Eigen::DSizes<TensorIndex, 2> third_quadrant(m_/2, 0);
+    const Eigen::DSizes<TensorIndex, 2> fourth_quadrant(m_/2, m_/2);
+
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      C.slice(first_quadrant, quarter_sizes).device(device_) =
+          A.slice(first_quadrant, quarter_sizes);
+      C.slice(second_quadrant, quarter_sizes).device(device_) =
+          B.slice(second_quadrant, quarter_sizes);
+      C.slice(third_quadrant, quarter_sizes).device(device_) =
+          A.slice(third_quadrant, quarter_sizes);
+      C.slice(fourth_quadrant, quarter_sizes).device(device_) =
+          B.slice(fourth_quadrant, quarter_sizes);
+    }
+    // Record the number of values copied from the rhs slice to the lhs slice
+    // each second
+    finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
+  }
+
+  void rowChip(int num_iters) {
+    Eigen::array<TensorIndex, 2> input_size;
+    input_size[0] = k_;
+    input_size[1] = n_;
+    const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
+    Eigen::array<TensorIndex, 1> output_size;
+    output_size[0] = n_;
+    TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
+
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      C.device(device_) = B.chip(iter % k_, 0);
+    }
+    // Record the number of values copied from the rhs chip to the lhs.
+    finalizeBenchmark(static_cast<int64_t>(n_) * num_iters);
+  }
+
+  void colChip(int num_iters) {
+    Eigen::array<TensorIndex, 2> input_size;
+    input_size[0] = k_;
+    input_size[1] = n_;
+    const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
+    Eigen::array<TensorIndex, 1> output_size;
+    output_size[0] = n_;
+    TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
+
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      C.device(device_) = B.chip(iter % n_, 1);
+    }
+    // Record the number of values copied from the rhs chip to the lhs.
+    finalizeBenchmark(static_cast<int64_t>(n_) * num_iters);
+  }
+
+  void shuffling(int num_iters) {
+    eigen_assert(m_ == n_);
+    Eigen::array<TensorIndex, 2> size_a;
+    size_a[0] = m_;
+    size_a[1] = k_;
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
+    Eigen::array<TensorIndex, 2> size_b;
+    size_b[0] = k_;
+    size_b[1] = m_;
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);
+
+    Eigen::array<int, 2> shuffle;
+    shuffle[0] = 1;
+    shuffle[1] = 0;
+
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      B.device(device_) = A.shuffle(shuffle);
+    }
+    // Record the number of values shuffled from A and copied to B each second
+    finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
+  }
+
+ void padding(int num_iters) {
+    eigen_assert(m_ == k_);
+    Eigen::array<TensorIndex, 2> size_a;
+    size_a[0] = m_;
+    size_a[1] = k_-3;
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
+    Eigen::array<TensorIndex, 2> size_b;
+    size_b[0] = k_;
+    size_b[1] = m_;
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);
+
+#if defined(EIGEN_HAS_INDEX_LIST)
+    Eigen::IndexPairList<Eigen::type2indexpair<0, 0>,
+                         Eigen::type2indexpair<2, 1> > paddings;
+#else
+    Eigen::array<Eigen::IndexPair<TensorIndex>, 2> paddings;
+    paddings[0] = Eigen::IndexPair<TensorIndex>(0, 0);
+    paddings[1] = Eigen::IndexPair<TensorIndex>(2, 1);
+#endif
+
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      B.device(device_) = A.pad(paddings);
+    }
+    // Record the number of values copied from the padded tensor A each second
+    finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
+  }
+
+ void striding(int num_iters) {
+    eigen_assert(m_ == k_);
+    Eigen::array<TensorIndex, 2> size_a;
+    size_a[0] = m_;
+    size_a[1] = k_;
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
+    Eigen::array<TensorIndex, 2> size_b;
+    size_b[0] = m_;
+    size_b[1] = k_/2;
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);
+
+#ifndef EIGEN_HAS_INDEX_LIST
+    Eigen::array<TensorIndex, 2> strides;
+    strides[0] = 1;
+    strides[1] = 2;
+#else
+    // Take advantage of cxx11 to give the compiler information it can use to
+    // optimize the code.
+    Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2> > strides;
+#endif
+
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      B.device(device_) = A.stride(strides);
+    }
+    // Record the number of values copied from the padded tensor A each second
+    finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
+  }
+
+  void broadcasting(int num_iters) {
+    Eigen::array<TensorIndex, 2> size_a;
+    size_a[0] = m_;
+    size_a[1] = 1;
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
+    Eigen::array<TensorIndex, 2> size_c;
+    size_c[0] = m_;
+    size_c[1] = n_;
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, size_c);
+
+#ifndef EIGEN_HAS_INDEX_LIST
+    Eigen::array<int, 2> broadcast;
+    broadcast[0] = 1;
+    broadcast[1] = n_;
+#else
+    // Take advantage of cxx11 to give the compiler information it can use to
+    // optimize the code.
+    Eigen::IndexList<Eigen::type2index<1>, int> broadcast;
+    broadcast.set(1, n_);
+#endif
+
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      C.device(device_) = A.broadcast(broadcast);
+    }
+    // Record the number of values broadcasted from A and copied to C each second
+    finalizeBenchmark(static_cast<int64_t>(m_) * n_ * num_iters);
+  }
+
+  void coeffWiseOp(int num_iters) {
+    eigen_assert(m_ == k_ && k_ == n_);
+    Eigen::array<TensorIndex, 2> sizes;
+    sizes[0] = m_;
+    sizes[1] = m_;
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
+
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      C.device(device_) = A * A.constant(static_cast<T>(3.14)) + B * B.constant(static_cast<T>(2.7));
+    }
+    // Record the number of FLOP executed per second (2 multiplications and
+    // 1 addition per value)
+    finalizeBenchmark(static_cast<int64_t>(3) * m_ * m_ * num_iters);
+  }
+
+  void algebraicFunc(int num_iters) {
+    eigen_assert(m_ == k_ && k_ == n_);
+    Eigen::array<TensorIndex, 2> sizes;
+    sizes[0] = m_;
+    sizes[1] = m_;
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
+
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      C.device(device_) = A.rsqrt() + B.sqrt() * B.square();
+    }
+    // Record the number of FLOP executed per second (assuming one operation
+    // per value)
+    finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
+  }
+
+  void transcendentalFunc(int num_iters) {
+    eigen_assert(m_ == k_ && k_ == n_);
+    Eigen::array<TensorIndex, 2> sizes;
+    sizes[0] = m_;
+    sizes[1] = m_;
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
+
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      C.device(device_) = A.exp() + B.log();
+    }
+    // Record the number of FLOP executed per second (assuming one operation
+    // per value)
+    finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
+  }
+
+ // Row reduction
+  void rowReduction(int num_iters) {
+    Eigen::array<TensorIndex, 2> input_size;
+    input_size[0] = k_;
+    input_size[1] = n_;
+    const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
+    Eigen::array<TensorIndex, 1> output_size;
+    output_size[0] = n_;
+    TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
+
+#ifndef EIGEN_HAS_INDEX_LIST
+    Eigen::array<TensorIndex, 1> sum_along_dim;
+    sum_along_dim[0] = 0;
+#else
+    // Take advantage of cxx11 to give the compiler information it can use to
+    // optimize the code.
+    Eigen::IndexList<Eigen::type2index<0>> sum_along_dim;
+#endif
+
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      C.device(device_) = B.sum(sum_along_dim);
+    }
+    // Record the number of FLOP executed per second (assuming one operation
+    // per value)
+    finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
+  }
+
+  // Column reduction
+  void colReduction(int num_iters) {
+    Eigen::array<TensorIndex, 2> input_size;
+    input_size[0] = k_;
+    input_size[1] = n_;
+    const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(
+        b_, input_size);
+    Eigen::array<TensorIndex, 1> output_size;
+    output_size[0] = k_;
+    TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(
+        c_, output_size);
+
+#ifndef EIGEN_HAS_INDEX_LIST
+    Eigen::array<TensorIndex, 1> sum_along_dim;
+    sum_along_dim[0] = 1;
+#else
+    // Take advantage of cxx11 to give the compiler information it can use to
+    // optimize the code.
+    Eigen::IndexList<Eigen::type2index<1>> sum_along_dim;
+#endif
+
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      C.device(device_) = B.sum(sum_along_dim);
+    }
+    // Record the number of FLOP executed per second (assuming one operation
+    // per value)
+    finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
+  }
+
+  // Full reduction
+  void fullReduction(int num_iters) {
+    Eigen::array<TensorIndex, 2> input_size;
+    input_size[0] = k_;
+    input_size[1] = n_;
+    const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(
+        b_, input_size);
+    Eigen::array<TensorIndex, 0> output_size;
+    TensorMap<Tensor<T, 0, 0, TensorIndex>, Eigen::Aligned> C(
+        c_, output_size);
+
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      C.device(device_) = B.sum();
+    }
+    // Record the number of FLOP executed per second (assuming one operation
+    // per value)
+    finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
+  }
+
+  // do a contraction which is equivalent to a matrix multiplication
+  void contraction(int num_iters) {
+    Eigen::array<TensorIndex, 2> sizeA;
+    sizeA[0] = m_;
+    sizeA[1] = k_;
+    Eigen::array<TensorIndex, 2> sizeB;
+    sizeB[0] = k_;
+    sizeB[1] = n_;
+    Eigen::array<TensorIndex, 2> sizeC;
+    sizeC[0] = m_;
+    sizeC[1] = n_;
+
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizeA);
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizeB);
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizeC);
+
+    typedef typename Tensor<T, 2>::DimensionPair DimPair;
+    Eigen::array<DimPair, 1> dims;
+    dims[0] = DimPair(1, 0);
+
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      C.device(device_) = A.contract(B, dims);
+    }
+    // Record the number of FLOP executed per second (size_ multiplications and
+    // additions for each value in the resulting tensor)
+    finalizeBenchmark(static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters);
+  }
+
+  void convolution(int num_iters, int kernel_x, int kernel_y) {
+    Eigen::array<TensorIndex, 2> input_sizes;
+    input_sizes[0] = m_;
+    input_sizes[1] = n_;
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, input_sizes);
+    Eigen::array<TensorIndex, 2> kernel_sizes;
+    kernel_sizes[0] = kernel_x;
+    kernel_sizes[1] = kernel_y;
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, kernel_sizes);
+    Eigen::array<TensorIndex, 2> result_sizes;
+    result_sizes[0] = m_ - kernel_x + 1;
+    result_sizes[1] = n_ - kernel_y + 1;
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, result_sizes);
+    Eigen::array<TensorIndex, 2> dims;
+    dims[0] = 0;
+    dims[1] = 1;
+
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      C.device(device_) = A.convolve(B, dims);
+    }
+    // Record the number of FLOP executed per second (kernel_size
+    // multiplications and additions for each value in the resulting tensor)
+    finalizeBenchmark(static_cast<int64_t>(2) *
+        (m_ - kernel_x + 1) * (n_ - kernel_y + 1) * kernel_x * kernel_y * num_iters);
+  }
+
+ private:
+  void initialize() {
+    a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
+    b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
+    c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
+
+    // Initialize the content of the memory pools to prevent asan from
+    // complaining.
+    device_.memset(a_, 12, m_ * k_ * sizeof(T));
+    device_.memset(b_, 23, k_ * n_ * sizeof(T));
+    device_.memset(c_, 31, m_ * n_ * sizeof(T));
+
+    //BenchmarkUseRealTime();
+  }
+
+  inline void finalizeBenchmark(int64_t num_items) {
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+    if (Eigen::internal::is_same<Device, Eigen::GpuDevice>::value) {
+      device_.synchronize();
+    }
+#endif
+    StopBenchmarkTiming();
+    SetBenchmarkFlopsProcessed(num_items);
+  }
+
+
+  TensorIndex m_;
+  TensorIndex k_;
+  TensorIndex n_;
+  T* a_;
+  T* b_;
+  T* c_;
+  Device device_;
+};
+#endif  // THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
diff --git a/bench/tensors/tensor_benchmarks_cpu.cc b/bench/tensors/tensor_benchmarks_cpu.cc
new file mode 100644
index 000000000..8947f4b7f
--- /dev/null
+++ b/bench/tensors/tensor_benchmarks_cpu.cc
@@ -0,0 +1,168 @@
+#define EIGEN_USE_THREADS
+
+#include <string>
+
+#include "tensor_benchmarks.h"
+
+#define CREATE_THREAD_POOL(threads)             \
+Eigen::ThreadPool pool(threads);                \
+Eigen::ThreadPoolDevice device(&pool, threads);
+
+// Simple functions
+#define BM_FuncCPU(FUNC, THREADS)                                    \
+  static void BM_##FUNC##_##THREADS##T(int iters, int N) {           \
+    StopBenchmarkTiming();                                           \
+    CREATE_THREAD_POOL(THREADS);                                     \
+    BenchmarkSuite<Eigen::ThreadPoolDevice, float> suite(device, N); \
+    suite.FUNC(iters);                                               \
+  }                                                                  \
+  BENCHMARK_RANGE(BM_##FUNC##_##THREADS##T, 10, 5000);
+
+BM_FuncCPU(memcpy, 4);
+BM_FuncCPU(memcpy, 8);
+BM_FuncCPU(memcpy, 12);
+
+BM_FuncCPU(typeCasting, 4);
+BM_FuncCPU(typeCasting, 8);
+BM_FuncCPU(typeCasting, 12);
+
+BM_FuncCPU(random, 4);
+BM_FuncCPU(random, 8);
+BM_FuncCPU(random, 12);
+
+BM_FuncCPU(slicing, 4);
+BM_FuncCPU(slicing, 8);
+BM_FuncCPU(slicing, 12);
+
+BM_FuncCPU(rowChip, 4);
+BM_FuncCPU(rowChip, 8);
+BM_FuncCPU(rowChip, 12);
+
+BM_FuncCPU(colChip, 4);
+BM_FuncCPU(colChip, 8);
+BM_FuncCPU(colChip, 12);
+
+BM_FuncCPU(shuffling, 4);
+BM_FuncCPU(shuffling, 8);
+BM_FuncCPU(shuffling, 12);
+
+BM_FuncCPU(padding, 4);
+BM_FuncCPU(padding, 8);
+BM_FuncCPU(padding, 12);
+
+BM_FuncCPU(striding, 4);
+BM_FuncCPU(striding, 8);
+BM_FuncCPU(striding, 12);
+
+BM_FuncCPU(broadcasting, 4);
+BM_FuncCPU(broadcasting, 8);
+BM_FuncCPU(broadcasting, 12);
+
+BM_FuncCPU(coeffWiseOp, 4);
+BM_FuncCPU(coeffWiseOp, 8);
+BM_FuncCPU(coeffWiseOp, 12);
+
+BM_FuncCPU(algebraicFunc, 4);
+BM_FuncCPU(algebraicFunc, 8);
+BM_FuncCPU(algebraicFunc, 12);
+
+BM_FuncCPU(transcendentalFunc, 4);
+BM_FuncCPU(transcendentalFunc, 8);
+BM_FuncCPU(transcendentalFunc, 12);
+
+BM_FuncCPU(rowReduction, 4);
+BM_FuncCPU(rowReduction, 8);
+BM_FuncCPU(rowReduction, 12);
+
+BM_FuncCPU(colReduction, 4);
+BM_FuncCPU(colReduction, 8);
+BM_FuncCPU(colReduction, 12);
+
+
+// Contractions
+#define BM_FuncWithInputDimsCPU(FUNC, D1, D2, D3, THREADS)                      \
+  static void BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T(int iters, int N) { \
+    StopBenchmarkTiming();                                                      \
+    if (THREADS == 1) {                                                         \
+      Eigen::DefaultDevice device;                                              \
+      BenchmarkSuite<Eigen::DefaultDevice, float> suite(device, D1, D2, D3);    \
+      suite.FUNC(iters);                                                        \
+    } else {                                                                    \
+      CREATE_THREAD_POOL(THREADS);                                              \
+      BenchmarkSuite<Eigen::ThreadPoolDevice, float> suite(device, D1, D2, D3); \
+      suite.FUNC(iters);                                                        \
+    }                                                                           \
+  }                                                                             \
+  BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T, 10, 5000);
+
+
+BM_FuncWithInputDimsCPU(contraction, N, N, N, 1);
+BM_FuncWithInputDimsCPU(contraction, N, N, N, 4);
+BM_FuncWithInputDimsCPU(contraction, N, N, N, 8);
+BM_FuncWithInputDimsCPU(contraction, N, N, N, 12);
+BM_FuncWithInputDimsCPU(contraction, N, N, N, 16);
+
+BM_FuncWithInputDimsCPU(contraction, 64, N, N, 1);
+BM_FuncWithInputDimsCPU(contraction, 64, N, N, 4);
+BM_FuncWithInputDimsCPU(contraction, 64, N, N, 8);
+BM_FuncWithInputDimsCPU(contraction, 64, N, N, 12);
+BM_FuncWithInputDimsCPU(contraction, 64, N, N, 16);
+
+BM_FuncWithInputDimsCPU(contraction, N, 64, N, 1);
+BM_FuncWithInputDimsCPU(contraction, N, 64, N, 4);
+BM_FuncWithInputDimsCPU(contraction, N, 64, N, 8);
+BM_FuncWithInputDimsCPU(contraction, N, 64, N, 12);
+BM_FuncWithInputDimsCPU(contraction, N, 64, N, 16);
+
+BM_FuncWithInputDimsCPU(contraction, N, N, 64, 1);
+BM_FuncWithInputDimsCPU(contraction, N, N, 64, 4);
+BM_FuncWithInputDimsCPU(contraction, N, N, 64, 8);
+BM_FuncWithInputDimsCPU(contraction, N, N, 64, 12);
+BM_FuncWithInputDimsCPU(contraction, N, N, 64, 16);
+
+BM_FuncWithInputDimsCPU(contraction, 1, N, N, 1);
+BM_FuncWithInputDimsCPU(contraction, 1, N, N, 4);
+BM_FuncWithInputDimsCPU(contraction, 1, N, N, 8);
+BM_FuncWithInputDimsCPU(contraction, 1, N, N, 12);
+BM_FuncWithInputDimsCPU(contraction, 1, N, N, 16);
+
+BM_FuncWithInputDimsCPU(contraction, N, N, 1, 1);
+BM_FuncWithInputDimsCPU(contraction, N, N, 1, 4);
+BM_FuncWithInputDimsCPU(contraction, N, N, 1, 8);
+BM_FuncWithInputDimsCPU(contraction, N, N, 1, 12);
+BM_FuncWithInputDimsCPU(contraction, N, N, 1, 16);
+
+
+// Convolutions
+#define BM_FuncWithKernelDimsCPU(FUNC, DIM1, DIM2, THREADS)                    \
+  static void BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T(int iters, int N) {   \
+    StopBenchmarkTiming();                                                     \
+    CREATE_THREAD_POOL(THREADS);                                               \
+    BenchmarkSuite<Eigen::ThreadPoolDevice, float> suite(device, N);	       \
+    suite.FUNC(iters, DIM1, DIM2);                                             \
+  }                                                                            \
+  BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T, 128, 5000);
+
+BM_FuncWithKernelDimsCPU(convolution, 7, 1, 4);
+BM_FuncWithKernelDimsCPU(convolution, 7, 1, 8);
+BM_FuncWithKernelDimsCPU(convolution, 7, 1, 12);
+
+BM_FuncWithKernelDimsCPU(convolution, 1, 7, 4);
+BM_FuncWithKernelDimsCPU(convolution, 1, 7, 8);
+BM_FuncWithKernelDimsCPU(convolution, 1, 7, 12);
+
+BM_FuncWithKernelDimsCPU(convolution, 7, 4, 4);
+BM_FuncWithKernelDimsCPU(convolution, 7, 4, 8);
+BM_FuncWithKernelDimsCPU(convolution, 7, 4, 12);
+
+BM_FuncWithKernelDimsCPU(convolution, 4, 7, 4);
+BM_FuncWithKernelDimsCPU(convolution, 4, 7, 8);
+BM_FuncWithKernelDimsCPU(convolution, 4, 7, 12);
+
+BM_FuncWithKernelDimsCPU(convolution, 7, 64, 4);
+BM_FuncWithKernelDimsCPU(convolution, 7, 64, 8);
+BM_FuncWithKernelDimsCPU(convolution, 7, 64, 12);
+
+BM_FuncWithKernelDimsCPU(convolution, 64, 7, 4);
+BM_FuncWithKernelDimsCPU(convolution, 64, 7, 8);
+BM_FuncWithKernelDimsCPU(convolution, 64, 7, 12);
diff --git a/bench/tensors/tensor_benchmarks_fp16_gpu.cu b/bench/tensors/tensor_benchmarks_fp16_gpu.cu
new file mode 100644
index 000000000..65784d0d6
--- /dev/null
+++ b/bench/tensors/tensor_benchmarks_fp16_gpu.cu
@@ -0,0 +1,77 @@
+#define EIGEN_USE_GPU
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <iostream>
+
+#include "tensor_benchmarks.h"
+
+// Simple functions
+#define BM_FuncGPU(FUNC)                                                       \
+  static void BM_##FUNC(int iters, int N) {                                    \
+    StopBenchmarkTiming();                                                     \
+    Eigen::CudaStreamDevice stream;                                            \
+    Eigen::GpuDevice device(&stream);                                          \
+    BenchmarkSuite<Eigen::GpuDevice, Eigen::half> suite(device, N);            \
+    cudaDeviceSynchronize();                                                   \
+    suite.FUNC(iters);                                                         \
+  }                                                                            \
+  BENCHMARK_RANGE(BM_##FUNC, 10, 5000);
+
+BM_FuncGPU(memcpy);
+BM_FuncGPU(typeCasting);
+//BM_FuncGPU(random);
+BM_FuncGPU(slicing);
+BM_FuncGPU(rowChip);
+BM_FuncGPU(colChip);
+BM_FuncGPU(shuffling);
+BM_FuncGPU(padding);
+BM_FuncGPU(striding);
+BM_FuncGPU(broadcasting);
+BM_FuncGPU(coeffWiseOp);
+BM_FuncGPU(algebraicFunc);
+BM_FuncGPU(transcendentalFunc);
+BM_FuncGPU(rowReduction);
+BM_FuncGPU(colReduction);
+BM_FuncGPU(fullReduction);
+
+
+// Contractions
+#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3)                              \
+  static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) {               \
+    StopBenchmarkTiming();                                                     \
+    Eigen::CudaStreamDevice stream;                                            \
+    Eigen::GpuDevice device(&stream);                                          \
+    BenchmarkSuite<Eigen::GpuDevice, Eigen::half> suite(device, D1, D2, D3);   \
+    cudaDeviceSynchronize();                                                   \
+    suite.FUNC(iters);                                                         \
+  }                                                                            \
+  BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000);
+
+
+BM_FuncWithInputDimsGPU(contraction, N, N, N);
+BM_FuncWithInputDimsGPU(contraction, 64, N, N);
+BM_FuncWithInputDimsGPU(contraction, N, 64, N);
+BM_FuncWithInputDimsGPU(contraction, N, N, 64);
+
+
+// Convolutions
+#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2)                             \
+  static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) {                  \
+    StopBenchmarkTiming();                                                     \
+    Eigen::CudaStreamDevice stream;                                            \
+    Eigen::GpuDevice device(&stream);                                          \
+    BenchmarkSuite<Eigen::GpuDevice, Eigen::half> suite(device, N);            \
+    cudaDeviceSynchronize();                                                   \
+    suite.FUNC(iters, DIM1, DIM2);                                             \
+  }                                                                            \
+  BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000);
+
+/*
+BM_FuncWithKernelDimsGPU(convolution, 7, 1);
+BM_FuncWithKernelDimsGPU(convolution, 1, 7);
+BM_FuncWithKernelDimsGPU(convolution, 7, 4);
+BM_FuncWithKernelDimsGPU(convolution, 4, 7);
+BM_FuncWithKernelDimsGPU(convolution, 7, 64);
+BM_FuncWithKernelDimsGPU(convolution, 64, 7);
+*/
diff --git a/bench/tensors/tensor_benchmarks_gpu.cu b/bench/tensors/tensor_benchmarks_gpu.cu
new file mode 100644
index 000000000..76d68c5c1
--- /dev/null
+++ b/bench/tensors/tensor_benchmarks_gpu.cu
@@ -0,0 +1,75 @@
+#define EIGEN_USE_GPU
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <iostream>
+
+#include "tensor_benchmarks.h"
+
+// Simple functions
+#define BM_FuncGPU(FUNC)                                                       \
+  static void BM_##FUNC(int iters, int N) {                                    \
+    StopBenchmarkTiming();                                                     \
+    Eigen::CudaStreamDevice stream;                                            \
+    Eigen::GpuDevice device(&stream);                                          \
+    BenchmarkSuite<Eigen::GpuDevice, float> suite(device, N);                  \
+    cudaDeviceSynchronize();                                                   \
+    suite.FUNC(iters);                                                         \
+  }                                                                            \
+  BENCHMARK_RANGE(BM_##FUNC, 10, 5000);
+
+BM_FuncGPU(memcpy);
+BM_FuncGPU(typeCasting);
+BM_FuncGPU(random);
+BM_FuncGPU(slicing);
+BM_FuncGPU(rowChip);
+BM_FuncGPU(colChip);
+BM_FuncGPU(shuffling);
+BM_FuncGPU(padding);
+BM_FuncGPU(striding);
+BM_FuncGPU(broadcasting);
+BM_FuncGPU(coeffWiseOp);
+BM_FuncGPU(algebraicFunc);
+BM_FuncGPU(transcendentalFunc);
+BM_FuncGPU(rowReduction);
+BM_FuncGPU(colReduction);
+BM_FuncGPU(fullReduction);
+
+
+// Contractions
+#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3)                              \
+  static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) {               \
+    StopBenchmarkTiming();                                                     \
+    Eigen::CudaStreamDevice stream;                                            \
+    Eigen::GpuDevice device(&stream);                                          \
+    BenchmarkSuite<Eigen::GpuDevice, float> suite(device, D1, D2, D3);         \
+    cudaDeviceSynchronize();                                                   \
+    suite.FUNC(iters);                                                         \
+  }                                                                            \
+  BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000);
+
+
+BM_FuncWithInputDimsGPU(contraction, N, N, N);
+BM_FuncWithInputDimsGPU(contraction, 64, N, N);
+BM_FuncWithInputDimsGPU(contraction, N, 64, N);
+BM_FuncWithInputDimsGPU(contraction, N, N, 64);
+
+
+// Convolutions
+#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2)                             \
+  static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) {                  \
+    StopBenchmarkTiming();                                                     \
+    Eigen::CudaStreamDevice stream;                                            \
+    Eigen::GpuDevice device(&stream);                                          \
+    BenchmarkSuite<Eigen::GpuDevice, float> suite(device, N);                  \
+    cudaDeviceSynchronize();                                                   \
+    suite.FUNC(iters, DIM1, DIM2);                                             \
+  }                                                                            \
+  BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000);
+
+BM_FuncWithKernelDimsGPU(convolution, 7, 1);
+BM_FuncWithKernelDimsGPU(convolution, 1, 7);
+BM_FuncWithKernelDimsGPU(convolution, 7, 4);
+BM_FuncWithKernelDimsGPU(convolution, 4, 7);
+BM_FuncWithKernelDimsGPU(convolution, 7, 64);
+BM_FuncWithKernelDimsGPU(convolution, 64, 7);
diff --git a/bench/tensors/tensor_benchmarks_sycl.cc b/bench/tensors/tensor_benchmarks_sycl.cc
new file mode 100644
index 000000000..7eca4d966
--- /dev/null
+++ b/bench/tensors/tensor_benchmarks_sycl.cc
@@ -0,0 +1,37 @@
+#define EIGEN_USE_SYCL
+
+#include <SYCL/sycl.hpp>
+#include <iostream>
+
+#include "tensor_benchmarks.h"
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+// Simple functions
+template <typename device_selector>
+cl::sycl::queue sycl_queue() {
+  return cl::sycl::queue(device_selector(), [=](cl::sycl::exception_list l) {
+    for (const auto& e : l) {
+      try {
+        std::rethrow_exception(e);
+      } catch (cl::sycl::exception e) {
+        std::cout << e.what() << std::endl;
+      }
+    }
+  });
+}
+
+#define BM_FuncGPU(FUNC)                                       \
+  static void BM_##FUNC(int iters, int N) {                    \
+    StopBenchmarkTiming();                                     \
+    cl::sycl::queue q = sycl_queue<cl::sycl::gpu_selector>();  \
+    Eigen::SyclDevice device(q);                               \
+    BenchmarkSuite<Eigen::SyclDevice, float> suite(device, N); \
+    suite.FUNC(iters);                                         \
+  }                                                            \
+  BENCHMARK_RANGE(BM_##FUNC, 10, 5000);
+
+BM_FuncGPU(broadcasting);
+BM_FuncGPU(coeffWiseOp);
diff --git a/blas/PackedTriangularMatrixVector.h b/blas/PackedTriangularMatrixVector.h
index e9886d56f..0039536a8 100644
--- a/blas/PackedTriangularMatrixVector.h
+++ b/blas/PackedTriangularMatrixVector.h
@@ -18,7 +18,7 @@ struct packed_triangular_matrix_vector_product;
 template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs>
 struct packed_triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,ColMajor>
 {
-  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
   enum {
     IsLower     = (Mode & Lower)   ==Lower,
     HasUnitDiag = (Mode & UnitDiag)==UnitDiag,
@@ -47,7 +47,7 @@ struct packed_triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsS
 template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs>
 struct packed_triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,RowMajor>
 {
-  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
   enum {
     IsLower     = (Mode & Lower)   ==Lower,
     HasUnitDiag = (Mode & UnitDiag)==UnitDiag,
diff --git a/blas/common.h b/blas/common.h
index 2bf642c6b..61d8344d9 100644
--- a/blas/common.h
+++ b/blas/common.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2009-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,18 +10,16 @@
 #ifndef EIGEN_BLAS_COMMON_H
 #define EIGEN_BLAS_COMMON_H
 
-#include <Eigen/Core>
-#include <Eigen/Jacobi>
+#include "../Eigen/Core"
+#include "../Eigen/Jacobi"
 
-#include <iostream>
 #include <complex>
 
 #ifndef SCALAR
 #error the token SCALAR must be defined to compile this file
 #endif
 
-#include <Eigen/src/misc/blas.h>
-
+#include "../Eigen/src/misc/blas.h"
 
 #define NOTR    0
 #define TR      1
@@ -95,6 +93,7 @@ enum
 
 typedef Matrix<Scalar,Dynamic,Dynamic,ColMajor> PlainMatrixType;
 typedef Map<Matrix<Scalar,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> > MatrixType;
+typedef Map<const Matrix<Scalar,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> > ConstMatrixType;
 typedef Map<Matrix<Scalar,Dynamic,1>, 0, InnerStride<Dynamic> > StridedVectorType;
 typedef Map<Matrix<Scalar,Dynamic,1> > CompactVectorType;
 
@@ -106,26 +105,45 @@ matrix(T* data, int rows, int cols, int stride)
 }
 
 template<typename T>
-Map<Matrix<T,Dynamic,1>, 0, InnerStride<Dynamic> > vector(T* data, int size, int incr)
+Map<const Matrix<T,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> >
+matrix(const T* data, int rows, int cols, int stride)
+{
+  return Map<const Matrix<T,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> >(data, rows, cols, OuterStride<>(stride));
+}
+
+template<typename T>
+Map<Matrix<T,Dynamic,1>, 0, InnerStride<Dynamic> > make_vector(T* data, int size, int incr)
 {
   return Map<Matrix<T,Dynamic,1>, 0, InnerStride<Dynamic> >(data, size, InnerStride<Dynamic>(incr));
 }
 
 template<typename T>
-Map<Matrix<T,Dynamic,1> > vector(T* data, int size)
+Map<const Matrix<T,Dynamic,1>, 0, InnerStride<Dynamic> > make_vector(const T* data, int size, int incr)
+{
+  return Map<const Matrix<T,Dynamic,1>, 0, InnerStride<Dynamic> >(data, size, InnerStride<Dynamic>(incr));
+}
+
+template<typename T>
+Map<Matrix<T,Dynamic,1> > make_vector(T* data, int size)
 {
   return Map<Matrix<T,Dynamic,1> >(data, size);
 }
 
 template<typename T>
+Map<const Matrix<T,Dynamic,1> > make_vector(const T* data, int size)
+{
+  return Map<const Matrix<T,Dynamic,1> >(data, size);
+}
+
+template<typename T>
 T* get_compact_vector(T* x, int n, int incx)
 {
   if(incx==1)
     return x;
 
-  T* ret = new Scalar[n];
-  if(incx<0) vector(ret,n) = vector(x,n,-incx).reverse();
-  else       vector(ret,n) = vector(x,n, incx);
+  typename Eigen::internal::remove_const<T>::type* ret = new Scalar[n];
+  if(incx<0) make_vector(ret,n) = make_vector(x,n,-incx).reverse();
+  else       make_vector(ret,n) = make_vector(x,n, incx);
   return ret;
 }
 
@@ -135,8 +153,8 @@ T* copy_back(T* x_cpy, T* x, int n, int incx)
   if(x_cpy==x)
     return 0;
 
-  if(incx<0) vector(x,n,-incx).reverse() = vector(x_cpy,n);
-  else       vector(x,n, incx)           = vector(x_cpy,n);
+  if(incx<0) make_vector(x,n,-incx).reverse() = make_vector(x_cpy,n);
+  else       make_vector(x,n, incx)           = make_vector(x_cpy,n);
   return x_cpy;
 }
 
diff --git a/blas/double.cpp b/blas/double.cpp
index 8fd0709ba..295b1d1f2 100644
--- a/blas/double.cpp
+++ b/blas/double.cpp
@@ -23,11 +23,10 @@ double BLASFUNC(dsdot)(int* n, float* x, int* incx, float* y, int* incy)
 {
   if(*n<=0) return 0;
 
-  if(*incx==1 && *incy==1)    return (vector(x,*n).cast<double>().cwiseProduct(vector(y,*n).cast<double>())).sum();
-  else if(*incx>0 && *incy>0) return (vector(x,*n,*incx).cast<double>().cwiseProduct(vector(y,*n,*incy).cast<double>())).sum();
-  else if(*incx<0 && *incy>0) return (vector(x,*n,-*incx).reverse().cast<double>().cwiseProduct(vector(y,*n,*incy).cast<double>())).sum();
-  else if(*incx>0 && *incy<0) return (vector(x,*n,*incx).cast<double>().cwiseProduct(vector(y,*n,-*incy).reverse().cast<double>())).sum();
-  else if(*incx<0 && *incy<0) return (vector(x,*n,-*incx).reverse().cast<double>().cwiseProduct(vector(y,*n,-*incy).reverse().cast<double>())).sum();
+  if(*incx==1 && *incy==1)    return (make_vector(x,*n).cast<double>().cwiseProduct(make_vector(y,*n).cast<double>())).sum();
+  else if(*incx>0 && *incy>0) return (make_vector(x,*n,*incx).cast<double>().cwiseProduct(make_vector(y,*n,*incy).cast<double>())).sum();
+  else if(*incx<0 && *incy>0) return (make_vector(x,*n,-*incx).reverse().cast<double>().cwiseProduct(make_vector(y,*n,*incy).cast<double>())).sum();
+  else if(*incx>0 && *incy<0) return (make_vector(x,*n,*incx).cast<double>().cwiseProduct(make_vector(y,*n,-*incy).reverse().cast<double>())).sum();
+  else if(*incx<0 && *incy<0) return (make_vector(x,*n,-*incx).reverse().cast<double>().cwiseProduct(make_vector(y,*n,-*incy).reverse().cast<double>())).sum();
   else return 0;
 }
-
diff --git a/blas/fortran/chbmv.f b/blas/fortran/chbmv.f
deleted file mode 100644
index 1b1c330ea..000000000
--- a/blas/fortran/chbmv.f
+++ /dev/null
@@ -1,310 +0,0 @@
-      SUBROUTINE CHBMV(UPLO,N,K,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
-*     .. Scalar Arguments ..
-      COMPLEX ALPHA,BETA
-      INTEGER INCX,INCY,K,LDA,N
-      CHARACTER UPLO
-*     ..
-*     .. Array Arguments ..
-      COMPLEX A(LDA,*),X(*),Y(*)
-*     ..
-*
-*  Purpose
-*  =======
-*
-*  CHBMV  performs the matrix-vector  operation
-*
-*     y := alpha*A*x + beta*y,
-*
-*  where alpha and beta are scalars, x and y are n element vectors and
-*  A is an n by n hermitian band matrix, with k super-diagonals.
-*
-*  Arguments
-*  ==========
-*
-*  UPLO   - CHARACTER*1.
-*           On entry, UPLO specifies whether the upper or lower
-*           triangular part of the band matrix A is being supplied as
-*           follows:
-*
-*              UPLO = 'U' or 'u'   The upper triangular part of A is
-*                                  being supplied.
-*
-*              UPLO = 'L' or 'l'   The lower triangular part of A is
-*                                  being supplied.
-*
-*           Unchanged on exit.
-*
-*  N      - INTEGER.
-*           On entry, N specifies the order of the matrix A.
-*           N must be at least zero.
-*           Unchanged on exit.
-*
-*  K      - INTEGER.
-*           On entry, K specifies the number of super-diagonals of the
-*           matrix A. K must satisfy  0 .le. K.
-*           Unchanged on exit.
-*
-*  ALPHA  - COMPLEX         .
-*           On entry, ALPHA specifies the scalar alpha.
-*           Unchanged on exit.
-*
-*  A      - COMPLEX          array of DIMENSION ( LDA, n ).
-*           Before entry with UPLO = 'U' or 'u', the leading ( k + 1 )
-*           by n part of the array A must contain the upper triangular
-*           band part of the hermitian matrix, supplied column by
-*           column, with the leading diagonal of the matrix in row
-*           ( k + 1 ) of the array, the first super-diagonal starting at
-*           position 2 in row k, and so on. The top left k by k triangle
-*           of the array A is not referenced.
-*           The following program segment will transfer the upper
-*           triangular part of a hermitian band matrix from conventional
-*           full matrix storage to band storage:
-*
-*                 DO 20, J = 1, N
-*                    M = K + 1 - J
-*                    DO 10, I = MAX( 1, J - K ), J
-*                       A( M + I, J ) = matrix( I, J )
-*              10    CONTINUE
-*              20 CONTINUE
-*
-*           Before entry with UPLO = 'L' or 'l', the leading ( k + 1 )
-*           by n part of the array A must contain the lower triangular
-*           band part of the hermitian matrix, supplied column by
-*           column, with the leading diagonal of the matrix in row 1 of
-*           the array, the first sub-diagonal starting at position 1 in
-*           row 2, and so on. The bottom right k by k triangle of the
-*           array A is not referenced.
-*           The following program segment will transfer the lower
-*           triangular part of a hermitian band matrix from conventional
-*           full matrix storage to band storage:
-*
-*                 DO 20, J = 1, N
-*                    M = 1 - J
-*                    DO 10, I = J, MIN( N, J + K )
-*                       A( M + I, J ) = matrix( I, J )
-*              10    CONTINUE
-*              20 CONTINUE
-*
-*           Note that the imaginary parts of the diagonal elements need
-*           not be set and are assumed to be zero.
-*           Unchanged on exit.
-*
-*  LDA    - INTEGER.
-*           On entry, LDA specifies the first dimension of A as declared
-*           in the calling (sub) program. LDA must be at least
-*           ( k + 1 ).
-*           Unchanged on exit.
-*
-*  X      - COMPLEX          array of DIMENSION at least
-*           ( 1 + ( n - 1 )*abs( INCX ) ).
-*           Before entry, the incremented array X must contain the
-*           vector x.
-*           Unchanged on exit.
-*
-*  INCX   - INTEGER.
-*           On entry, INCX specifies the increment for the elements of
-*           X. INCX must not be zero.
-*           Unchanged on exit.
-*
-*  BETA   - COMPLEX         .
-*           On entry, BETA specifies the scalar beta.
-*           Unchanged on exit.
-*
-*  Y      - COMPLEX          array of DIMENSION at least
-*           ( 1 + ( n - 1 )*abs( INCY ) ).
-*           Before entry, the incremented array Y must contain the
-*           vector y. On exit, Y is overwritten by the updated vector y.
-*
-*  INCY   - INTEGER.
-*           On entry, INCY specifies the increment for the elements of
-*           Y. INCY must not be zero.
-*           Unchanged on exit.
-*
-*  Further Details
-*  ===============
-*
-*  Level 2 Blas routine.
-*
-*  -- Written on 22-October-1986.
-*     Jack Dongarra, Argonne National Lab.
-*     Jeremy Du Croz, Nag Central Office.
-*     Sven Hammarling, Nag Central Office.
-*     Richard Hanson, Sandia National Labs.
-*
-*  =====================================================================
-*
-*     .. Parameters ..
-      COMPLEX ONE
-      PARAMETER (ONE= (1.0E+0,0.0E+0))
-      COMPLEX ZERO
-      PARAMETER (ZERO= (0.0E+0,0.0E+0))
-*     ..
-*     .. Local Scalars ..
-      COMPLEX TEMP1,TEMP2
-      INTEGER I,INFO,IX,IY,J,JX,JY,KPLUS1,KX,KY,L
-*     ..
-*     .. External Functions ..
-      LOGICAL LSAME
-      EXTERNAL LSAME
-*     ..
-*     .. External Subroutines ..
-      EXTERNAL XERBLA
-*     ..
-*     .. Intrinsic Functions ..
-      INTRINSIC CONJG,MAX,MIN,REAL
-*     ..
-*
-*     Test the input parameters.
-*
-      INFO = 0
-      IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN
-          INFO = 1
-      ELSE IF (N.LT.0) THEN
-          INFO = 2
-      ELSE IF (K.LT.0) THEN
-          INFO = 3
-      ELSE IF (LDA.LT. (K+1)) THEN
-          INFO = 6
-      ELSE IF (INCX.EQ.0) THEN
-          INFO = 8
-      ELSE IF (INCY.EQ.0) THEN
-          INFO = 11
-      END IF
-      IF (INFO.NE.0) THEN
-          CALL XERBLA('CHBMV ',INFO)
-          RETURN
-      END IF
-*
-*     Quick return if possible.
-*
-      IF ((N.EQ.0) .OR. ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN
-*
-*     Set up the start points in  X  and  Y.
-*
-      IF (INCX.GT.0) THEN
-          KX = 1
-      ELSE
-          KX = 1 - (N-1)*INCX
-      END IF
-      IF (INCY.GT.0) THEN
-          KY = 1
-      ELSE
-          KY = 1 - (N-1)*INCY
-      END IF
-*
-*     Start the operations. In this version the elements of the array A
-*     are accessed sequentially with one pass through A.
-*
-*     First form  y := beta*y.
-*
-      IF (BETA.NE.ONE) THEN
-          IF (INCY.EQ.1) THEN
-              IF (BETA.EQ.ZERO) THEN
-                  DO 10 I = 1,N
-                      Y(I) = ZERO
-   10             CONTINUE
-              ELSE
-                  DO 20 I = 1,N
-                      Y(I) = BETA*Y(I)
-   20             CONTINUE
-              END IF
-          ELSE
-              IY = KY
-              IF (BETA.EQ.ZERO) THEN
-                  DO 30 I = 1,N
-                      Y(IY) = ZERO
-                      IY = IY + INCY
-   30             CONTINUE
-              ELSE
-                  DO 40 I = 1,N
-                      Y(IY) = BETA*Y(IY)
-                      IY = IY + INCY
-   40             CONTINUE
-              END IF
-          END IF
-      END IF
-      IF (ALPHA.EQ.ZERO) RETURN
-      IF (LSAME(UPLO,'U')) THEN
-*
-*        Form  y  when upper triangle of A is stored.
-*
-          KPLUS1 = K + 1
-          IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN
-              DO 60 J = 1,N
-                  TEMP1 = ALPHA*X(J)
-                  TEMP2 = ZERO
-                  L = KPLUS1 - J
-                  DO 50 I = MAX(1,J-K),J - 1
-                      Y(I) = Y(I) + TEMP1*A(L+I,J)
-                      TEMP2 = TEMP2 + CONJG(A(L+I,J))*X(I)
-   50             CONTINUE
-                  Y(J) = Y(J) + TEMP1*REAL(A(KPLUS1,J)) + ALPHA*TEMP2
-   60         CONTINUE
-          ELSE
-              JX = KX
-              JY = KY
-              DO 80 J = 1,N
-                  TEMP1 = ALPHA*X(JX)
-                  TEMP2 = ZERO
-                  IX = KX
-                  IY = KY
-                  L = KPLUS1 - J
-                  DO 70 I = MAX(1,J-K),J - 1
-                      Y(IY) = Y(IY) + TEMP1*A(L+I,J)
-                      TEMP2 = TEMP2 + CONJG(A(L+I,J))*X(IX)
-                      IX = IX + INCX
-                      IY = IY + INCY
-   70             CONTINUE
-                  Y(JY) = Y(JY) + TEMP1*REAL(A(KPLUS1,J)) + ALPHA*TEMP2
-                  JX = JX + INCX
-                  JY = JY + INCY
-                  IF (J.GT.K) THEN
-                      KX = KX + INCX
-                      KY = KY + INCY
-                  END IF
-   80         CONTINUE
-          END IF
-      ELSE
-*
-*        Form  y  when lower triangle of A is stored.
-*
-          IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN
-              DO 100 J = 1,N
-                  TEMP1 = ALPHA*X(J)
-                  TEMP2 = ZERO
-                  Y(J) = Y(J) + TEMP1*REAL(A(1,J))
-                  L = 1 - J
-                  DO 90 I = J + 1,MIN(N,J+K)
-                      Y(I) = Y(I) + TEMP1*A(L+I,J)
-                      TEMP2 = TEMP2 + CONJG(A(L+I,J))*X(I)
-   90             CONTINUE
-                  Y(J) = Y(J) + ALPHA*TEMP2
-  100         CONTINUE
-          ELSE
-              JX = KX
-              JY = KY
-              DO 120 J = 1,N
-                  TEMP1 = ALPHA*X(JX)
-                  TEMP2 = ZERO
-                  Y(JY) = Y(JY) + TEMP1*REAL(A(1,J))
-                  L = 1 - J
-                  IX = JX
-                  IY = JY
-                  DO 110 I = J + 1,MIN(N,J+K)
-                      IX = IX + INCX
-                      IY = IY + INCY
-                      Y(IY) = Y(IY) + TEMP1*A(L+I,J)
-                      TEMP2 = TEMP2 + CONJG(A(L+I,J))*X(IX)
-  110             CONTINUE
-                  Y(JY) = Y(JY) + ALPHA*TEMP2
-                  JX = JX + INCX
-                  JY = JY + INCY
-  120         CONTINUE
-          END IF
-      END IF
-*
-      RETURN
-*
-*     End of CHBMV .
-*
-      END
diff --git a/blas/fortran/chpmv.f b/blas/fortran/chpmv.f
deleted file mode 100644
index 158be5a7b..000000000
--- a/blas/fortran/chpmv.f
+++ /dev/null
@@ -1,272 +0,0 @@
-      SUBROUTINE CHPMV(UPLO,N,ALPHA,AP,X,INCX,BETA,Y,INCY)
-*     .. Scalar Arguments ..
-      COMPLEX ALPHA,BETA
-      INTEGER INCX,INCY,N
-      CHARACTER UPLO
-*     ..
-*     .. Array Arguments ..
-      COMPLEX AP(*),X(*),Y(*)
-*     ..
-*
-*  Purpose
-*  =======
-*
-*  CHPMV  performs the matrix-vector operation
-*
-*     y := alpha*A*x + beta*y,
-*
-*  where alpha and beta are scalars, x and y are n element vectors and
-*  A is an n by n hermitian matrix, supplied in packed form.
-*
-*  Arguments
-*  ==========
-*
-*  UPLO   - CHARACTER*1.
-*           On entry, UPLO specifies whether the upper or lower
-*           triangular part of the matrix A is supplied in the packed
-*           array AP as follows:
-*
-*              UPLO = 'U' or 'u'   The upper triangular part of A is
-*                                  supplied in AP.
-*
-*              UPLO = 'L' or 'l'   The lower triangular part of A is
-*                                  supplied in AP.
-*
-*           Unchanged on exit.
-*
-*  N      - INTEGER.
-*           On entry, N specifies the order of the matrix A.
-*           N must be at least zero.
-*           Unchanged on exit.
-*
-*  ALPHA  - COMPLEX         .
-*           On entry, ALPHA specifies the scalar alpha.
-*           Unchanged on exit.
-*
-*  AP     - COMPLEX          array of DIMENSION at least
-*           ( ( n*( n + 1 ) )/2 ).
-*           Before entry with UPLO = 'U' or 'u', the array AP must
-*           contain the upper triangular part of the hermitian matrix
-*           packed sequentially, column by column, so that AP( 1 )
-*           contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 )
-*           and a( 2, 2 ) respectively, and so on.
-*           Before entry with UPLO = 'L' or 'l', the array AP must
-*           contain the lower triangular part of the hermitian matrix
-*           packed sequentially, column by column, so that AP( 1 )
-*           contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 )
-*           and a( 3, 1 ) respectively, and so on.
-*           Note that the imaginary parts of the diagonal elements need
-*           not be set and are assumed to be zero.
-*           Unchanged on exit.
-*
-*  X      - COMPLEX          array of dimension at least
-*           ( 1 + ( n - 1 )*abs( INCX ) ).
-*           Before entry, the incremented array X must contain the n
-*           element vector x.
-*           Unchanged on exit.
-*
-*  INCX   - INTEGER.
-*           On entry, INCX specifies the increment for the elements of
-*           X. INCX must not be zero.
-*           Unchanged on exit.
-*
-*  BETA   - COMPLEX         .
-*           On entry, BETA specifies the scalar beta. When BETA is
-*           supplied as zero then Y need not be set on input.
-*           Unchanged on exit.
-*
-*  Y      - COMPLEX          array of dimension at least
-*           ( 1 + ( n - 1 )*abs( INCY ) ).
-*           Before entry, the incremented array Y must contain the n
-*           element vector y. On exit, Y is overwritten by the updated
-*           vector y.
-*
-*  INCY   - INTEGER.
-*           On entry, INCY specifies the increment for the elements of
-*           Y. INCY must not be zero.
-*           Unchanged on exit.
-*
-*  Further Details
-*  ===============
-*
-*  Level 2 Blas routine.
-*
-*  -- Written on 22-October-1986.
-*     Jack Dongarra, Argonne National Lab.
-*     Jeremy Du Croz, Nag Central Office.
-*     Sven Hammarling, Nag Central Office.
-*     Richard Hanson, Sandia National Labs.
-*
-*  =====================================================================
-*
-*     .. Parameters ..
-      COMPLEX ONE
-      PARAMETER (ONE= (1.0E+0,0.0E+0))
-      COMPLEX ZERO
-      PARAMETER (ZERO= (0.0E+0,0.0E+0))
-*     ..
-*     .. Local Scalars ..
-      COMPLEX TEMP1,TEMP2
-      INTEGER I,INFO,IX,IY,J,JX,JY,K,KK,KX,KY
-*     ..
-*     .. External Functions ..
-      LOGICAL LSAME
-      EXTERNAL LSAME
-*     ..
-*     .. External Subroutines ..
-      EXTERNAL XERBLA
-*     ..
-*     .. Intrinsic Functions ..
-      INTRINSIC CONJG,REAL
-*     ..
-*
-*     Test the input parameters.
-*
-      INFO = 0
-      IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN
-          INFO = 1
-      ELSE IF (N.LT.0) THEN
-          INFO = 2
-      ELSE IF (INCX.EQ.0) THEN
-          INFO = 6
-      ELSE IF (INCY.EQ.0) THEN
-          INFO = 9
-      END IF
-      IF (INFO.NE.0) THEN
-          CALL XERBLA('CHPMV ',INFO)
-          RETURN
-      END IF
-*
-*     Quick return if possible.
-*
-      IF ((N.EQ.0) .OR. ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN
-*
-*     Set up the start points in  X  and  Y.
-*
-      IF (INCX.GT.0) THEN
-          KX = 1
-      ELSE
-          KX = 1 - (N-1)*INCX
-      END IF
-      IF (INCY.GT.0) THEN
-          KY = 1
-      ELSE
-          KY = 1 - (N-1)*INCY
-      END IF
-*
-*     Start the operations. In this version the elements of the array AP
-*     are accessed sequentially with one pass through AP.
-*
-*     First form  y := beta*y.
-*
-      IF (BETA.NE.ONE) THEN
-          IF (INCY.EQ.1) THEN
-              IF (BETA.EQ.ZERO) THEN
-                  DO 10 I = 1,N
-                      Y(I) = ZERO
-   10             CONTINUE
-              ELSE
-                  DO 20 I = 1,N
-                      Y(I) = BETA*Y(I)
-   20             CONTINUE
-              END IF
-          ELSE
-              IY = KY
-              IF (BETA.EQ.ZERO) THEN
-                  DO 30 I = 1,N
-                      Y(IY) = ZERO
-                      IY = IY + INCY
-   30             CONTINUE
-              ELSE
-                  DO 40 I = 1,N
-                      Y(IY) = BETA*Y(IY)
-                      IY = IY + INCY
-   40             CONTINUE
-              END IF
-          END IF
-      END IF
-      IF (ALPHA.EQ.ZERO) RETURN
-      KK = 1
-      IF (LSAME(UPLO,'U')) THEN
-*
-*        Form  y  when AP contains the upper triangle.
-*
-          IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN
-              DO 60 J = 1,N
-                  TEMP1 = ALPHA*X(J)
-                  TEMP2 = ZERO
-                  K = KK
-                  DO 50 I = 1,J - 1
-                      Y(I) = Y(I) + TEMP1*AP(K)
-                      TEMP2 = TEMP2 + CONJG(AP(K))*X(I)
-                      K = K + 1
-   50             CONTINUE
-                  Y(J) = Y(J) + TEMP1*REAL(AP(KK+J-1)) + ALPHA*TEMP2
-                  KK = KK + J
-   60         CONTINUE
-          ELSE
-              JX = KX
-              JY = KY
-              DO 80 J = 1,N
-                  TEMP1 = ALPHA*X(JX)
-                  TEMP2 = ZERO
-                  IX = KX
-                  IY = KY
-                  DO 70 K = KK,KK + J - 2
-                      Y(IY) = Y(IY) + TEMP1*AP(K)
-                      TEMP2 = TEMP2 + CONJG(AP(K))*X(IX)
-                      IX = IX + INCX
-                      IY = IY + INCY
-   70             CONTINUE
-                  Y(JY) = Y(JY) + TEMP1*REAL(AP(KK+J-1)) + ALPHA*TEMP2
-                  JX = JX + INCX
-                  JY = JY + INCY
-                  KK = KK + J
-   80         CONTINUE
-          END IF
-      ELSE
-*
-*        Form  y  when AP contains the lower triangle.
-*
-          IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN
-              DO 100 J = 1,N
-                  TEMP1 = ALPHA*X(J)
-                  TEMP2 = ZERO
-                  Y(J) = Y(J) + TEMP1*REAL(AP(KK))
-                  K = KK + 1
-                  DO 90 I = J + 1,N
-                      Y(I) = Y(I) + TEMP1*AP(K)
-                      TEMP2 = TEMP2 + CONJG(AP(K))*X(I)
-                      K = K + 1
-   90             CONTINUE
-                  Y(J) = Y(J) + ALPHA*TEMP2
-                  KK = KK + (N-J+1)
-  100         CONTINUE
-          ELSE
-              JX = KX
-              JY = KY
-              DO 120 J = 1,N
-                  TEMP1 = ALPHA*X(JX)
-                  TEMP2 = ZERO
-                  Y(JY) = Y(JY) + TEMP1*REAL(AP(KK))
-                  IX = JX
-                  IY = JY
-                  DO 110 K = KK + 1,KK + N - J
-                      IX = IX + INCX
-                      IY = IY + INCY
-                      Y(IY) = Y(IY) + TEMP1*AP(K)
-                      TEMP2 = TEMP2 + CONJG(AP(K))*X(IX)
-  110             CONTINUE
-                  Y(JY) = Y(JY) + ALPHA*TEMP2
-                  JX = JX + INCX
-                  JY = JY + INCY
-                  KK = KK + (N-J+1)
-  120         CONTINUE
-          END IF
-      END IF
-*
-      RETURN
-*
-*     End of CHPMV .
-*
-      END
diff --git a/blas/fortran/ctbmv.f b/blas/fortran/ctbmv.f
deleted file mode 100644
index 5a879fa01..000000000
--- a/blas/fortran/ctbmv.f
+++ /dev/null
@@ -1,366 +0,0 @@
-      SUBROUTINE CTBMV(UPLO,TRANS,DIAG,N,K,A,LDA,X,INCX)
-*     .. Scalar Arguments ..
-      INTEGER INCX,K,LDA,N
-      CHARACTER DIAG,TRANS,UPLO
-*     ..
-*     .. Array Arguments ..
-      COMPLEX A(LDA,*),X(*)
-*     ..
-*
-*  Purpose
-*  =======
-*
-*  CTBMV  performs one of the matrix-vector operations
-*
-*     x := A*x,   or   x := A'*x,   or   x := conjg( A' )*x,
-*
-*  where x is an n element vector and  A is an n by n unit, or non-unit,
-*  upper or lower triangular band matrix, with ( k + 1 ) diagonals.
-*
-*  Arguments
-*  ==========
-*
-*  UPLO   - CHARACTER*1.
-*           On entry, UPLO specifies whether the matrix is an upper or
-*           lower triangular matrix as follows:
-*
-*              UPLO = 'U' or 'u'   A is an upper triangular matrix.
-*
-*              UPLO = 'L' or 'l'   A is a lower triangular matrix.
-*
-*           Unchanged on exit.
-*
-*  TRANS  - CHARACTER*1.
-*           On entry, TRANS specifies the operation to be performed as
-*           follows:
-*
-*              TRANS = 'N' or 'n'   x := A*x.
-*
-*              TRANS = 'T' or 't'   x := A'*x.
-*
-*              TRANS = 'C' or 'c'   x := conjg( A' )*x.
-*
-*           Unchanged on exit.
-*
-*  DIAG   - CHARACTER*1.
-*           On entry, DIAG specifies whether or not A is unit
-*           triangular as follows:
-*
-*              DIAG = 'U' or 'u'   A is assumed to be unit triangular.
-*
-*              DIAG = 'N' or 'n'   A is not assumed to be unit
-*                                  triangular.
-*
-*           Unchanged on exit.
-*
-*  N      - INTEGER.
-*           On entry, N specifies the order of the matrix A.
-*           N must be at least zero.
-*           Unchanged on exit.
-*
-*  K      - INTEGER.
-*           On entry with UPLO = 'U' or 'u', K specifies the number of
-*           super-diagonals of the matrix A.
-*           On entry with UPLO = 'L' or 'l', K specifies the number of
-*           sub-diagonals of the matrix A.
-*           K must satisfy  0 .le. K.
-*           Unchanged on exit.
-*
-*  A      - COMPLEX          array of DIMENSION ( LDA, n ).
-*           Before entry with UPLO = 'U' or 'u', the leading ( k + 1 )
-*           by n part of the array A must contain the upper triangular
-*           band part of the matrix of coefficients, supplied column by
-*           column, with the leading diagonal of the matrix in row
-*           ( k + 1 ) of the array, the first super-diagonal starting at
-*           position 2 in row k, and so on. The top left k by k triangle
-*           of the array A is not referenced.
-*           The following program segment will transfer an upper
-*           triangular band matrix from conventional full matrix storage
-*           to band storage:
-*
-*                 DO 20, J = 1, N
-*                    M = K + 1 - J
-*                    DO 10, I = MAX( 1, J - K ), J
-*                       A( M + I, J ) = matrix( I, J )
-*              10    CONTINUE
-*              20 CONTINUE
-*
-*           Before entry with UPLO = 'L' or 'l', the leading ( k + 1 )
-*           by n part of the array A must contain the lower triangular
-*           band part of the matrix of coefficients, supplied column by
-*           column, with the leading diagonal of the matrix in row 1 of
-*           the array, the first sub-diagonal starting at position 1 in
-*           row 2, and so on. The bottom right k by k triangle of the
-*           array A is not referenced.
-*           The following program segment will transfer a lower
-*           triangular band matrix from conventional full matrix storage
-*           to band storage:
-*
-*                 DO 20, J = 1, N
-*                    M = 1 - J
-*                    DO 10, I = J, MIN( N, J + K )
-*                       A( M + I, J ) = matrix( I, J )
-*              10    CONTINUE
-*              20 CONTINUE
-*
-*           Note that when DIAG = 'U' or 'u' the elements of the array A
-*           corresponding to the diagonal elements of the matrix are not
-*           referenced, but are assumed to be unity.
-*           Unchanged on exit.
-*
-*  LDA    - INTEGER.
-*           On entry, LDA specifies the first dimension of A as declared
-*           in the calling (sub) program. LDA must be at least
-*           ( k + 1 ).
-*           Unchanged on exit.
-*
-*  X      - COMPLEX          array of dimension at least
-*           ( 1 + ( n - 1 )*abs( INCX ) ).
-*           Before entry, the incremented array X must contain the n
-*           element vector x. On exit, X is overwritten with the
-*           tranformed vector x.
-*
-*  INCX   - INTEGER.
-*           On entry, INCX specifies the increment for the elements of
-*           X. INCX must not be zero.
-*           Unchanged on exit.
-*
-*  Further Details
-*  ===============
-*
-*  Level 2 Blas routine.
-*
-*  -- Written on 22-October-1986.
-*     Jack Dongarra, Argonne National Lab.
-*     Jeremy Du Croz, Nag Central Office.
-*     Sven Hammarling, Nag Central Office.
-*     Richard Hanson, Sandia National Labs.
-*
-*  =====================================================================
-*
-*     .. Parameters ..
-      COMPLEX ZERO
-      PARAMETER (ZERO= (0.0E+0,0.0E+0))
-*     ..
-*     .. Local Scalars ..
-      COMPLEX TEMP
-      INTEGER I,INFO,IX,J,JX,KPLUS1,KX,L
-      LOGICAL NOCONJ,NOUNIT
-*     ..
-*     .. External Functions ..
-      LOGICAL LSAME
-      EXTERNAL LSAME
-*     ..
-*     .. External Subroutines ..
-      EXTERNAL XERBLA
-*     ..
-*     .. Intrinsic Functions ..
-      INTRINSIC CONJG,MAX,MIN
-*     ..
-*
-*     Test the input parameters.
-*
-      INFO = 0
-      IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN
-          INFO = 1
-      ELSE IF (.NOT.LSAME(TRANS,'N') .AND. .NOT.LSAME(TRANS,'T') .AND.
-     +         .NOT.LSAME(TRANS,'C')) THEN
-          INFO = 2
-      ELSE IF (.NOT.LSAME(DIAG,'U') .AND. .NOT.LSAME(DIAG,'N')) THEN
-          INFO = 3
-      ELSE IF (N.LT.0) THEN
-          INFO = 4
-      ELSE IF (K.LT.0) THEN
-          INFO = 5
-      ELSE IF (LDA.LT. (K+1)) THEN
-          INFO = 7
-      ELSE IF (INCX.EQ.0) THEN
-          INFO = 9
-      END IF
-      IF (INFO.NE.0) THEN
-          CALL XERBLA('CTBMV ',INFO)
-          RETURN
-      END IF
-*
-*     Quick return if possible.
-*
-      IF (N.EQ.0) RETURN
-*
-      NOCONJ = LSAME(TRANS,'T')
-      NOUNIT = LSAME(DIAG,'N')
-*
-*     Set up the start point in X if the increment is not unity. This
-*     will be  ( N - 1 )*INCX   too small for descending loops.
-*
-      IF (INCX.LE.0) THEN
-          KX = 1 - (N-1)*INCX
-      ELSE IF (INCX.NE.1) THEN
-          KX = 1
-      END IF
-*
-*     Start the operations. In this version the elements of A are
-*     accessed sequentially with one pass through A.
-*
-      IF (LSAME(TRANS,'N')) THEN
-*
-*         Form  x := A*x.
-*
-          IF (LSAME(UPLO,'U')) THEN
-              KPLUS1 = K + 1
-              IF (INCX.EQ.1) THEN
-                  DO 20 J = 1,N
-                      IF (X(J).NE.ZERO) THEN
-                          TEMP = X(J)
-                          L = KPLUS1 - J
-                          DO 10 I = MAX(1,J-K),J - 1
-                              X(I) = X(I) + TEMP*A(L+I,J)
-   10                     CONTINUE
-                          IF (NOUNIT) X(J) = X(J)*A(KPLUS1,J)
-                      END IF
-   20             CONTINUE
-              ELSE
-                  JX = KX
-                  DO 40 J = 1,N
-                      IF (X(JX).NE.ZERO) THEN
-                          TEMP = X(JX)
-                          IX = KX
-                          L = KPLUS1 - J
-                          DO 30 I = MAX(1,J-K),J - 1
-                              X(IX) = X(IX) + TEMP*A(L+I,J)
-                              IX = IX + INCX
-   30                     CONTINUE
-                          IF (NOUNIT) X(JX) = X(JX)*A(KPLUS1,J)
-                      END IF
-                      JX = JX + INCX
-                      IF (J.GT.K) KX = KX + INCX
-   40             CONTINUE
-              END IF
-          ELSE
-              IF (INCX.EQ.1) THEN
-                  DO 60 J = N,1,-1
-                      IF (X(J).NE.ZERO) THEN
-                          TEMP = X(J)
-                          L = 1 - J
-                          DO 50 I = MIN(N,J+K),J + 1,-1
-                              X(I) = X(I) + TEMP*A(L+I,J)
-   50                     CONTINUE
-                          IF (NOUNIT) X(J) = X(J)*A(1,J)
-                      END IF
-   60             CONTINUE
-              ELSE
-                  KX = KX + (N-1)*INCX
-                  JX = KX
-                  DO 80 J = N,1,-1
-                      IF (X(JX).NE.ZERO) THEN
-                          TEMP = X(JX)
-                          IX = KX
-                          L = 1 - J
-                          DO 70 I = MIN(N,J+K),J + 1,-1
-                              X(IX) = X(IX) + TEMP*A(L+I,J)
-                              IX = IX - INCX
-   70                     CONTINUE
-                          IF (NOUNIT) X(JX) = X(JX)*A(1,J)
-                      END IF
-                      JX = JX - INCX
-                      IF ((N-J).GE.K) KX = KX - INCX
-   80             CONTINUE
-              END IF
-          END IF
-      ELSE
-*
-*        Form  x := A'*x  or  x := conjg( A' )*x.
-*
-          IF (LSAME(UPLO,'U')) THEN
-              KPLUS1 = K + 1
-              IF (INCX.EQ.1) THEN
-                  DO 110 J = N,1,-1
-                      TEMP = X(J)
-                      L = KPLUS1 - J
-                      IF (NOCONJ) THEN
-                          IF (NOUNIT) TEMP = TEMP*A(KPLUS1,J)
-                          DO 90 I = J - 1,MAX(1,J-K),-1
-                              TEMP = TEMP + A(L+I,J)*X(I)
-   90                     CONTINUE
-                      ELSE
-                          IF (NOUNIT) TEMP = TEMP*CONJG(A(KPLUS1,J))
-                          DO 100 I = J - 1,MAX(1,J-K),-1
-                              TEMP = TEMP + CONJG(A(L+I,J))*X(I)
-  100                     CONTINUE
-                      END IF
-                      X(J) = TEMP
-  110             CONTINUE
-              ELSE
-                  KX = KX + (N-1)*INCX
-                  JX = KX
-                  DO 140 J = N,1,-1
-                      TEMP = X(JX)
-                      KX = KX - INCX
-                      IX = KX
-                      L = KPLUS1 - J
-                      IF (NOCONJ) THEN
-                          IF (NOUNIT) TEMP = TEMP*A(KPLUS1,J)
-                          DO 120 I = J - 1,MAX(1,J-K),-1
-                              TEMP = TEMP + A(L+I,J)*X(IX)
-                              IX = IX - INCX
-  120                     CONTINUE
-                      ELSE
-                          IF (NOUNIT) TEMP = TEMP*CONJG(A(KPLUS1,J))
-                          DO 130 I = J - 1,MAX(1,J-K),-1
-                              TEMP = TEMP + CONJG(A(L+I,J))*X(IX)
-                              IX = IX - INCX
-  130                     CONTINUE
-                      END IF
-                      X(JX) = TEMP
-                      JX = JX - INCX
-  140             CONTINUE
-              END IF
-          ELSE
-              IF (INCX.EQ.1) THEN
-                  DO 170 J = 1,N
-                      TEMP = X(J)
-                      L = 1 - J
-                      IF (NOCONJ) THEN
-                          IF (NOUNIT) TEMP = TEMP*A(1,J)
-                          DO 150 I = J + 1,MIN(N,J+K)
-                              TEMP = TEMP + A(L+I,J)*X(I)
-  150                     CONTINUE
-                      ELSE
-                          IF (NOUNIT) TEMP = TEMP*CONJG(A(1,J))
-                          DO 160 I = J + 1,MIN(N,J+K)
-                              TEMP = TEMP + CONJG(A(L+I,J))*X(I)
-  160                     CONTINUE
-                      END IF
-                      X(J) = TEMP
-  170             CONTINUE
-              ELSE
-                  JX = KX
-                  DO 200 J = 1,N
-                      TEMP = X(JX)
-                      KX = KX + INCX
-                      IX = KX
-                      L = 1 - J
-                      IF (NOCONJ) THEN
-                          IF (NOUNIT) TEMP = TEMP*A(1,J)
-                          DO 180 I = J + 1,MIN(N,J+K)
-                              TEMP = TEMP + A(L+I,J)*X(IX)
-                              IX = IX + INCX
-  180                     CONTINUE
-                      ELSE
-                          IF (NOUNIT) TEMP = TEMP*CONJG(A(1,J))
-                          DO 190 I = J + 1,MIN(N,J+K)
-                              TEMP = TEMP + CONJG(A(L+I,J))*X(IX)
-                              IX = IX + INCX
-  190                     CONTINUE
-                      END IF
-                      X(JX) = TEMP
-                      JX = JX + INCX
-  200             CONTINUE
-              END IF
-          END IF
-      END IF
-*
-      RETURN
-*
-*     End of CTBMV .
-*
-      END
diff --git a/blas/fortran/drotm.f b/blas/fortran/drotm.f
deleted file mode 100644
index 63a3b1134..000000000
--- a/blas/fortran/drotm.f
+++ /dev/null
@@ -1,147 +0,0 @@
-      SUBROUTINE DROTM(N,DX,INCX,DY,INCY,DPARAM)
-*     .. Scalar Arguments ..
-      INTEGER INCX,INCY,N
-*     ..
-*     .. Array Arguments ..
-      DOUBLE PRECISION DPARAM(5),DX(*),DY(*)
-*     ..
-*
-*  Purpose
-*  =======
-*
-*     APPLY THE MODIFIED GIVENS TRANSFORMATION, H, TO THE 2 BY N MATRIX
-*
-*     (DX**T) , WHERE **T INDICATES TRANSPOSE. THE ELEMENTS OF DX ARE IN
-*     (DY**T)
-*
-*     DX(LX+I*INCX), I = 0 TO N-1, WHERE LX = 1 IF INCX .GE. 0, ELSE
-*     LX = (-INCX)*N, AND SIMILARLY FOR SY USING LY AND INCY.
-*     WITH DPARAM(1)=DFLAG, H HAS ONE OF THE FOLLOWING FORMS..
-*
-*     DFLAG=-1.D0     DFLAG=0.D0        DFLAG=1.D0     DFLAG=-2.D0
-*
-*       (DH11  DH12)    (1.D0  DH12)    (DH11  1.D0)    (1.D0  0.D0)
-*     H=(          )    (          )    (          )    (          )
-*       (DH21  DH22),   (DH21  1.D0),   (-1.D0 DH22),   (0.D0  1.D0).
-*     SEE DROTMG FOR A DESCRIPTION OF DATA STORAGE IN DPARAM.
-*
-*  Arguments
-*  =========
-*
-*  N      (input) INTEGER
-*         number of elements in input vector(s)
-*
-*  DX     (input/output) DOUBLE PRECISION array, dimension N
-*         double precision vector with N elements
-*
-*  INCX   (input) INTEGER
-*         storage spacing between elements of DX
-*
-*  DY     (input/output) DOUBLE PRECISION array, dimension N
-*         double precision vector with N elements
-*
-*  INCY   (input) INTEGER
-*         storage spacing between elements of DY
-*
-*  DPARAM (input/output)  DOUBLE PRECISION array, dimension 5 
-*     DPARAM(1)=DFLAG
-*     DPARAM(2)=DH11
-*     DPARAM(3)=DH21
-*     DPARAM(4)=DH12
-*     DPARAM(5)=DH22
-*
-*  =====================================================================
-*
-*     .. Local Scalars ..
-      DOUBLE PRECISION DFLAG,DH11,DH12,DH21,DH22,TWO,W,Z,ZERO
-      INTEGER I,KX,KY,NSTEPS
-*     ..
-*     .. Data statements ..
-      DATA ZERO,TWO/0.D0,2.D0/
-*     ..
-*
-      DFLAG = DPARAM(1)
-      IF (N.LE.0 .OR. (DFLAG+TWO.EQ.ZERO)) GO TO 140
-      IF (.NOT. (INCX.EQ.INCY.AND.INCX.GT.0)) GO TO 70
-*
-      NSTEPS = N*INCX
-      IF (DFLAG) 50,10,30
-   10 CONTINUE
-      DH12 = DPARAM(4)
-      DH21 = DPARAM(3)
-      DO 20 I = 1,NSTEPS,INCX
-          W = DX(I)
-          Z = DY(I)
-          DX(I) = W + Z*DH12
-          DY(I) = W*DH21 + Z
-   20 CONTINUE
-      GO TO 140
-   30 CONTINUE
-      DH11 = DPARAM(2)
-      DH22 = DPARAM(5)
-      DO 40 I = 1,NSTEPS,INCX
-          W = DX(I)
-          Z = DY(I)
-          DX(I) = W*DH11 + Z
-          DY(I) = -W + DH22*Z
-   40 CONTINUE
-      GO TO 140
-   50 CONTINUE
-      DH11 = DPARAM(2)
-      DH12 = DPARAM(4)
-      DH21 = DPARAM(3)
-      DH22 = DPARAM(5)
-      DO 60 I = 1,NSTEPS,INCX
-          W = DX(I)
-          Z = DY(I)
-          DX(I) = W*DH11 + Z*DH12
-          DY(I) = W*DH21 + Z*DH22
-   60 CONTINUE
-      GO TO 140
-   70 CONTINUE
-      KX = 1
-      KY = 1
-      IF (INCX.LT.0) KX = 1 + (1-N)*INCX
-      IF (INCY.LT.0) KY = 1 + (1-N)*INCY
-*
-      IF (DFLAG) 120,80,100
-   80 CONTINUE
-      DH12 = DPARAM(4)
-      DH21 = DPARAM(3)
-      DO 90 I = 1,N
-          W = DX(KX)
-          Z = DY(KY)
-          DX(KX) = W + Z*DH12
-          DY(KY) = W*DH21 + Z
-          KX = KX + INCX
-          KY = KY + INCY
-   90 CONTINUE
-      GO TO 140
-  100 CONTINUE
-      DH11 = DPARAM(2)
-      DH22 = DPARAM(5)
-      DO 110 I = 1,N
-          W = DX(KX)
-          Z = DY(KY)
-          DX(KX) = W*DH11 + Z
-          DY(KY) = -W + DH22*Z
-          KX = KX + INCX
-          KY = KY + INCY
-  110 CONTINUE
-      GO TO 140
-  120 CONTINUE
-      DH11 = DPARAM(2)
-      DH12 = DPARAM(4)
-      DH21 = DPARAM(3)
-      DH22 = DPARAM(5)
-      DO 130 I = 1,N
-          W = DX(KX)
-          Z = DY(KY)
-          DX(KX) = W*DH11 + Z*DH12
-          DY(KY) = W*DH21 + Z*DH22
-          KX = KX + INCX
-          KY = KY + INCY
-  130 CONTINUE
-  140 CONTINUE
-      RETURN
-      END
diff --git a/blas/fortran/drotmg.f b/blas/fortran/drotmg.f
deleted file mode 100644
index 3ae647b08..000000000
--- a/blas/fortran/drotmg.f
+++ /dev/null
@@ -1,206 +0,0 @@
-      SUBROUTINE DROTMG(DD1,DD2,DX1,DY1,DPARAM)
-*     .. Scalar Arguments ..
-      DOUBLE PRECISION DD1,DD2,DX1,DY1
-*     ..
-*     .. Array Arguments ..
-      DOUBLE PRECISION DPARAM(5)
-*     ..
-*
-*  Purpose
-*  =======
-*
-*     CONSTRUCT THE MODIFIED GIVENS TRANSFORMATION MATRIX H WHICH ZEROS
-*     THE SECOND COMPONENT OF THE 2-VECTOR  (DSQRT(DD1)*DX1,DSQRT(DD2)*
-*     DY2)**T.
-*     WITH DPARAM(1)=DFLAG, H HAS ONE OF THE FOLLOWING FORMS..
-*
-*     DFLAG=-1.D0     DFLAG=0.D0        DFLAG=1.D0     DFLAG=-2.D0
-*
-*       (DH11  DH12)    (1.D0  DH12)    (DH11  1.D0)    (1.D0  0.D0)
-*     H=(          )    (          )    (          )    (          )
-*       (DH21  DH22),   (DH21  1.D0),   (-1.D0 DH22),   (0.D0  1.D0).
-*     LOCATIONS 2-4 OF DPARAM CONTAIN DH11, DH21, DH12, AND DH22
-*     RESPECTIVELY. (VALUES OF 1.D0, -1.D0, OR 0.D0 IMPLIED BY THE
-*     VALUE OF DPARAM(1) ARE NOT STORED IN DPARAM.)
-*
-*     THE VALUES OF GAMSQ AND RGAMSQ SET IN THE DATA STATEMENT MAY BE
-*     INEXACT.  THIS IS OK AS THEY ARE ONLY USED FOR TESTING THE SIZE
-*     OF DD1 AND DD2.  ALL ACTUAL SCALING OF DATA IS DONE USING GAM.
-*
-*
-*  Arguments
-*  =========
-*
-*  DD1    (input/output) DOUBLE PRECISION
-*
-*  DD2    (input/output) DOUBLE PRECISION 
-*
-*  DX1    (input/output) DOUBLE PRECISION 
-*
-*  DY1    (input) DOUBLE PRECISION
-*
-*  DPARAM (input/output)  DOUBLE PRECISION array, dimension 5
-*     DPARAM(1)=DFLAG
-*     DPARAM(2)=DH11
-*     DPARAM(3)=DH21
-*     DPARAM(4)=DH12
-*     DPARAM(5)=DH22
-*
-*  =====================================================================
-*
-*     .. Local Scalars ..
-      DOUBLE PRECISION DFLAG,DH11,DH12,DH21,DH22,DP1,DP2,DQ1,DQ2,DTEMP,
-     +                 DU,GAM,GAMSQ,ONE,RGAMSQ,TWO,ZERO
-      INTEGER IGO
-*     ..
-*     .. Intrinsic Functions ..
-      INTRINSIC DABS
-*     ..
-*     .. Data statements ..
-*
-      DATA ZERO,ONE,TWO/0.D0,1.D0,2.D0/
-      DATA GAM,GAMSQ,RGAMSQ/4096.D0,16777216.D0,5.9604645D-8/
-*     ..
-
-      IF (.NOT.DD1.LT.ZERO) GO TO 10
-*       GO ZERO-H-D-AND-DX1..
-      GO TO 60
-   10 CONTINUE
-*     CASE-DD1-NONNEGATIVE
-      DP2 = DD2*DY1
-      IF (.NOT.DP2.EQ.ZERO) GO TO 20
-      DFLAG = -TWO
-      GO TO 260
-*     REGULAR-CASE..
-   20 CONTINUE
-      DP1 = DD1*DX1
-      DQ2 = DP2*DY1
-      DQ1 = DP1*DX1
-*
-      IF (.NOT.DABS(DQ1).GT.DABS(DQ2)) GO TO 40
-      DH21 = -DY1/DX1
-      DH12 = DP2/DP1
-*
-      DU = ONE - DH12*DH21
-*
-      IF (.NOT.DU.LE.ZERO) GO TO 30
-*         GO ZERO-H-D-AND-DX1..
-      GO TO 60
-   30 CONTINUE
-      DFLAG = ZERO
-      DD1 = DD1/DU
-      DD2 = DD2/DU
-      DX1 = DX1*DU
-*         GO SCALE-CHECK..
-      GO TO 100
-   40 CONTINUE
-      IF (.NOT.DQ2.LT.ZERO) GO TO 50
-*         GO ZERO-H-D-AND-DX1..
-      GO TO 60
-   50 CONTINUE
-      DFLAG = ONE
-      DH11 = DP1/DP2
-      DH22 = DX1/DY1
-      DU = ONE + DH11*DH22
-      DTEMP = DD2/DU
-      DD2 = DD1/DU
-      DD1 = DTEMP
-      DX1 = DY1*DU
-*         GO SCALE-CHECK
-      GO TO 100
-*     PROCEDURE..ZERO-H-D-AND-DX1..
-   60 CONTINUE
-      DFLAG = -ONE
-      DH11 = ZERO
-      DH12 = ZERO
-      DH21 = ZERO
-      DH22 = ZERO
-*
-      DD1 = ZERO
-      DD2 = ZERO
-      DX1 = ZERO
-*         RETURN..
-      GO TO 220
-*     PROCEDURE..FIX-H..
-   70 CONTINUE
-      IF (.NOT.DFLAG.GE.ZERO) GO TO 90
-*
-      IF (.NOT.DFLAG.EQ.ZERO) GO TO 80
-      DH11 = ONE
-      DH22 = ONE
-      DFLAG = -ONE
-      GO TO 90
-   80 CONTINUE
-      DH21 = -ONE
-      DH12 = ONE
-      DFLAG = -ONE
-   90 CONTINUE
-      GO TO IGO(120,150,180,210)
-*     PROCEDURE..SCALE-CHECK
-  100 CONTINUE
-  110 CONTINUE
-      IF (.NOT.DD1.LE.RGAMSQ) GO TO 130
-      IF (DD1.EQ.ZERO) GO TO 160
-      ASSIGN 120 TO IGO
-*              FIX-H..
-      GO TO 70
-  120 CONTINUE
-      DD1 = DD1*GAM**2
-      DX1 = DX1/GAM
-      DH11 = DH11/GAM
-      DH12 = DH12/GAM
-      GO TO 110
-  130 CONTINUE
-  140 CONTINUE
-      IF (.NOT.DD1.GE.GAMSQ) GO TO 160
-      ASSIGN 150 TO IGO
-*              FIX-H..
-      GO TO 70
-  150 CONTINUE
-      DD1 = DD1/GAM**2
-      DX1 = DX1*GAM
-      DH11 = DH11*GAM
-      DH12 = DH12*GAM
-      GO TO 140
-  160 CONTINUE
-  170 CONTINUE
-      IF (.NOT.DABS(DD2).LE.RGAMSQ) GO TO 190
-      IF (DD2.EQ.ZERO) GO TO 220
-      ASSIGN 180 TO IGO
-*              FIX-H..
-      GO TO 70
-  180 CONTINUE
-      DD2 = DD2*GAM**2
-      DH21 = DH21/GAM
-      DH22 = DH22/GAM
-      GO TO 170
-  190 CONTINUE
-  200 CONTINUE
-      IF (.NOT.DABS(DD2).GE.GAMSQ) GO TO 220
-      ASSIGN 210 TO IGO
-*              FIX-H..
-      GO TO 70
-  210 CONTINUE
-      DD2 = DD2/GAM**2
-      DH21 = DH21*GAM
-      DH22 = DH22*GAM
-      GO TO 200
-  220 CONTINUE
-      IF (DFLAG) 250,230,240
-  230 CONTINUE
-      DPARAM(3) = DH21
-      DPARAM(4) = DH12
-      GO TO 260
-  240 CONTINUE
-      DPARAM(2) = DH11
-      DPARAM(5) = DH22
-      GO TO 260
-  250 CONTINUE
-      DPARAM(2) = DH11
-      DPARAM(3) = DH21
-      DPARAM(4) = DH12
-      DPARAM(5) = DH22
-  260 CONTINUE
-      DPARAM(1) = DFLAG
-      RETURN
-      END
diff --git a/blas/fortran/dsbmv.f b/blas/fortran/dsbmv.f
deleted file mode 100644
index 8c82d1fa1..000000000
--- a/blas/fortran/dsbmv.f
+++ /dev/null
@@ -1,304 +0,0 @@
-      SUBROUTINE DSBMV(UPLO,N,K,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
-*     .. Scalar Arguments ..
-      DOUBLE PRECISION ALPHA,BETA
-      INTEGER INCX,INCY,K,LDA,N
-      CHARACTER UPLO
-*     ..
-*     .. Array Arguments ..
-      DOUBLE PRECISION A(LDA,*),X(*),Y(*)
-*     ..
-*
-*  Purpose
-*  =======
-*
-*  DSBMV  performs the matrix-vector  operation
-*
-*     y := alpha*A*x + beta*y,
-*
-*  where alpha and beta are scalars, x and y are n element vectors and
-*  A is an n by n symmetric band matrix, with k super-diagonals.
-*
-*  Arguments
-*  ==========
-*
-*  UPLO   - CHARACTER*1.
-*           On entry, UPLO specifies whether the upper or lower
-*           triangular part of the band matrix A is being supplied as
-*           follows:
-*
-*              UPLO = 'U' or 'u'   The upper triangular part of A is
-*                                  being supplied.
-*
-*              UPLO = 'L' or 'l'   The lower triangular part of A is
-*                                  being supplied.
-*
-*           Unchanged on exit.
-*
-*  N      - INTEGER.
-*           On entry, N specifies the order of the matrix A.
-*           N must be at least zero.
-*           Unchanged on exit.
-*
-*  K      - INTEGER.
-*           On entry, K specifies the number of super-diagonals of the
-*           matrix A. K must satisfy  0 .le. K.
-*           Unchanged on exit.
-*
-*  ALPHA  - DOUBLE PRECISION.
-*           On entry, ALPHA specifies the scalar alpha.
-*           Unchanged on exit.
-*
-*  A      - DOUBLE PRECISION array of DIMENSION ( LDA, n ).
-*           Before entry with UPLO = 'U' or 'u', the leading ( k + 1 )
-*           by n part of the array A must contain the upper triangular
-*           band part of the symmetric matrix, supplied column by
-*           column, with the leading diagonal of the matrix in row
-*           ( k + 1 ) of the array, the first super-diagonal starting at
-*           position 2 in row k, and so on. The top left k by k triangle
-*           of the array A is not referenced.
-*           The following program segment will transfer the upper
-*           triangular part of a symmetric band matrix from conventional
-*           full matrix storage to band storage:
-*
-*                 DO 20, J = 1, N
-*                    M = K + 1 - J
-*                    DO 10, I = MAX( 1, J - K ), J
-*                       A( M + I, J ) = matrix( I, J )
-*              10    CONTINUE
-*              20 CONTINUE
-*
-*           Before entry with UPLO = 'L' or 'l', the leading ( k + 1 )
-*           by n part of the array A must contain the lower triangular
-*           band part of the symmetric matrix, supplied column by
-*           column, with the leading diagonal of the matrix in row 1 of
-*           the array, the first sub-diagonal starting at position 1 in
-*           row 2, and so on. The bottom right k by k triangle of the
-*           array A is not referenced.
-*           The following program segment will transfer the lower
-*           triangular part of a symmetric band matrix from conventional
-*           full matrix storage to band storage:
-*
-*                 DO 20, J = 1, N
-*                    M = 1 - J
-*                    DO 10, I = J, MIN( N, J + K )
-*                       A( M + I, J ) = matrix( I, J )
-*              10    CONTINUE
-*              20 CONTINUE
-*
-*           Unchanged on exit.
-*
-*  LDA    - INTEGER.
-*           On entry, LDA specifies the first dimension of A as declared
-*           in the calling (sub) program. LDA must be at least
-*           ( k + 1 ).
-*           Unchanged on exit.
-*
-*  X      - DOUBLE PRECISION array of DIMENSION at least
-*           ( 1 + ( n - 1 )*abs( INCX ) ).
-*           Before entry, the incremented array X must contain the
-*           vector x.
-*           Unchanged on exit.
-*
-*  INCX   - INTEGER.
-*           On entry, INCX specifies the increment for the elements of
-*           X. INCX must not be zero.
-*           Unchanged on exit.
-*
-*  BETA   - DOUBLE PRECISION.
-*           On entry, BETA specifies the scalar beta.
-*           Unchanged on exit.
-*
-*  Y      - DOUBLE PRECISION array of DIMENSION at least
-*           ( 1 + ( n - 1 )*abs( INCY ) ).
-*           Before entry, the incremented array Y must contain the
-*           vector y. On exit, Y is overwritten by the updated vector y.
-*
-*  INCY   - INTEGER.
-*           On entry, INCY specifies the increment for the elements of
-*           Y. INCY must not be zero.
-*           Unchanged on exit.
-*
-*
-*  Level 2 Blas routine.
-*
-*  -- Written on 22-October-1986.
-*     Jack Dongarra, Argonne National Lab.
-*     Jeremy Du Croz, Nag Central Office.
-*     Sven Hammarling, Nag Central Office.
-*     Richard Hanson, Sandia National Labs.
-*
-*  =====================================================================
-*
-*     .. Parameters ..
-      DOUBLE PRECISION ONE,ZERO
-      PARAMETER (ONE=1.0D+0,ZERO=0.0D+0)
-*     ..
-*     .. Local Scalars ..
-      DOUBLE PRECISION TEMP1,TEMP2
-      INTEGER I,INFO,IX,IY,J,JX,JY,KPLUS1,KX,KY,L
-*     ..
-*     .. External Functions ..
-      LOGICAL LSAME
-      EXTERNAL LSAME
-*     ..
-*     .. External Subroutines ..
-      EXTERNAL XERBLA
-*     ..
-*     .. Intrinsic Functions ..
-      INTRINSIC MAX,MIN
-*     ..
-*
-*     Test the input parameters.
-*
-      INFO = 0
-      IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN
-          INFO = 1
-      ELSE IF (N.LT.0) THEN
-          INFO = 2
-      ELSE IF (K.LT.0) THEN
-          INFO = 3
-      ELSE IF (LDA.LT. (K+1)) THEN
-          INFO = 6
-      ELSE IF (INCX.EQ.0) THEN
-          INFO = 8
-      ELSE IF (INCY.EQ.0) THEN
-          INFO = 11
-      END IF
-      IF (INFO.NE.0) THEN
-          CALL XERBLA('DSBMV ',INFO)
-          RETURN
-      END IF
-*
-*     Quick return if possible.
-*
-      IF ((N.EQ.0) .OR. ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN
-*
-*     Set up the start points in  X  and  Y.
-*
-      IF (INCX.GT.0) THEN
-          KX = 1
-      ELSE
-          KX = 1 - (N-1)*INCX
-      END IF
-      IF (INCY.GT.0) THEN
-          KY = 1
-      ELSE
-          KY = 1 - (N-1)*INCY
-      END IF
-*
-*     Start the operations. In this version the elements of the array A
-*     are accessed sequentially with one pass through A.
-*
-*     First form  y := beta*y.
-*
-      IF (BETA.NE.ONE) THEN
-          IF (INCY.EQ.1) THEN
-              IF (BETA.EQ.ZERO) THEN
-                  DO 10 I = 1,N
-                      Y(I) = ZERO
-   10             CONTINUE
-              ELSE
-                  DO 20 I = 1,N
-                      Y(I) = BETA*Y(I)
-   20             CONTINUE
-              END IF
-          ELSE
-              IY = KY
-              IF (BETA.EQ.ZERO) THEN
-                  DO 30 I = 1,N
-                      Y(IY) = ZERO
-                      IY = IY + INCY
-   30             CONTINUE
-              ELSE
-                  DO 40 I = 1,N
-                      Y(IY) = BETA*Y(IY)
-                      IY = IY + INCY
-   40             CONTINUE
-              END IF
-          END IF
-      END IF
-      IF (ALPHA.EQ.ZERO) RETURN
-      IF (LSAME(UPLO,'U')) THEN
-*
-*        Form  y  when upper triangle of A is stored.
-*
-          KPLUS1 = K + 1
-          IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN
-              DO 60 J = 1,N
-                  TEMP1 = ALPHA*X(J)
-                  TEMP2 = ZERO
-                  L = KPLUS1 - J
-                  DO 50 I = MAX(1,J-K),J - 1
-                      Y(I) = Y(I) + TEMP1*A(L+I,J)
-                      TEMP2 = TEMP2 + A(L+I,J)*X(I)
-   50             CONTINUE
-                  Y(J) = Y(J) + TEMP1*A(KPLUS1,J) + ALPHA*TEMP2
-   60         CONTINUE
-          ELSE
-              JX = KX
-              JY = KY
-              DO 80 J = 1,N
-                  TEMP1 = ALPHA*X(JX)
-                  TEMP2 = ZERO
-                  IX = KX
-                  IY = KY
-                  L = KPLUS1 - J
-                  DO 70 I = MAX(1,J-K),J - 1
-                      Y(IY) = Y(IY) + TEMP1*A(L+I,J)
-                      TEMP2 = TEMP2 + A(L+I,J)*X(IX)
-                      IX = IX + INCX
-                      IY = IY + INCY
-   70             CONTINUE
-                  Y(JY) = Y(JY) + TEMP1*A(KPLUS1,J) + ALPHA*TEMP2
-                  JX = JX + INCX
-                  JY = JY + INCY
-                  IF (J.GT.K) THEN
-                      KX = KX + INCX
-                      KY = KY + INCY
-                  END IF
-   80         CONTINUE
-          END IF
-      ELSE
-*
-*        Form  y  when lower triangle of A is stored.
-*
-          IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN
-              DO 100 J = 1,N
-                  TEMP1 = ALPHA*X(J)
-                  TEMP2 = ZERO
-                  Y(J) = Y(J) + TEMP1*A(1,J)
-                  L = 1 - J
-                  DO 90 I = J + 1,MIN(N,J+K)
-                      Y(I) = Y(I) + TEMP1*A(L+I,J)
-                      TEMP2 = TEMP2 + A(L+I,J)*X(I)
-   90             CONTINUE
-                  Y(J) = Y(J) + ALPHA*TEMP2
-  100         CONTINUE
-          ELSE
-              JX = KX
-              JY = KY
-              DO 120 J = 1,N
-                  TEMP1 = ALPHA*X(JX)
-                  TEMP2 = ZERO
-                  Y(JY) = Y(JY) + TEMP1*A(1,J)
-                  L = 1 - J
-                  IX = JX
-                  IY = JY
-                  DO 110 I = J + 1,MIN(N,J+K)
-                      IX = IX + INCX
-                      IY = IY + INCY
-                      Y(IY) = Y(IY) + TEMP1*A(L+I,J)
-                      TEMP2 = TEMP2 + A(L+I,J)*X(IX)
-  110             CONTINUE
-                  Y(JY) = Y(JY) + ALPHA*TEMP2
-                  JX = JX + INCX
-                  JY = JY + INCY
-  120         CONTINUE
-          END IF
-      END IF
-*
-      RETURN
-*
-*     End of DSBMV .
-*
-      END
diff --git a/blas/fortran/dspmv.f b/blas/fortran/dspmv.f
deleted file mode 100644
index f6e121e76..000000000
--- a/blas/fortran/dspmv.f
+++ /dev/null
@@ -1,265 +0,0 @@
-      SUBROUTINE DSPMV(UPLO,N,ALPHA,AP,X,INCX,BETA,Y,INCY)
-*     .. Scalar Arguments ..
-      DOUBLE PRECISION ALPHA,BETA
-      INTEGER INCX,INCY,N
-      CHARACTER UPLO
-*     ..
-*     .. Array Arguments ..
-      DOUBLE PRECISION AP(*),X(*),Y(*)
-*     ..
-*
-*  Purpose
-*  =======
-*
-*  DSPMV  performs the matrix-vector operation
-*
-*     y := alpha*A*x + beta*y,
-*
-*  where alpha and beta are scalars, x and y are n element vectors and
-*  A is an n by n symmetric matrix, supplied in packed form.
-*
-*  Arguments
-*  ==========
-*
-*  UPLO   - CHARACTER*1.
-*           On entry, UPLO specifies whether the upper or lower
-*           triangular part of the matrix A is supplied in the packed
-*           array AP as follows:
-*
-*              UPLO = 'U' or 'u'   The upper triangular part of A is
-*                                  supplied in AP.
-*
-*              UPLO = 'L' or 'l'   The lower triangular part of A is
-*                                  supplied in AP.
-*
-*           Unchanged on exit.
-*
-*  N      - INTEGER.
-*           On entry, N specifies the order of the matrix A.
-*           N must be at least zero.
-*           Unchanged on exit.
-*
-*  ALPHA  - DOUBLE PRECISION.
-*           On entry, ALPHA specifies the scalar alpha.
-*           Unchanged on exit.
-*
-*  AP     - DOUBLE PRECISION array of DIMENSION at least
-*           ( ( n*( n + 1 ) )/2 ).
-*           Before entry with UPLO = 'U' or 'u', the array AP must
-*           contain the upper triangular part of the symmetric matrix
-*           packed sequentially, column by column, so that AP( 1 )
-*           contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 )
-*           and a( 2, 2 ) respectively, and so on.
-*           Before entry with UPLO = 'L' or 'l', the array AP must
-*           contain the lower triangular part of the symmetric matrix
-*           packed sequentially, column by column, so that AP( 1 )
-*           contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 )
-*           and a( 3, 1 ) respectively, and so on.
-*           Unchanged on exit.
-*
-*  X      - DOUBLE PRECISION array of dimension at least
-*           ( 1 + ( n - 1 )*abs( INCX ) ).
-*           Before entry, the incremented array X must contain the n
-*           element vector x.
-*           Unchanged on exit.
-*
-*  INCX   - INTEGER.
-*           On entry, INCX specifies the increment for the elements of
-*           X. INCX must not be zero.
-*           Unchanged on exit.
-*
-*  BETA   - DOUBLE PRECISION.
-*           On entry, BETA specifies the scalar beta. When BETA is
-*           supplied as zero then Y need not be set on input.
-*           Unchanged on exit.
-*
-*  Y      - DOUBLE PRECISION array of dimension at least
-*           ( 1 + ( n - 1 )*abs( INCY ) ).
-*           Before entry, the incremented array Y must contain the n
-*           element vector y. On exit, Y is overwritten by the updated
-*           vector y.
-*
-*  INCY   - INTEGER.
-*           On entry, INCY specifies the increment for the elements of
-*           Y. INCY must not be zero.
-*           Unchanged on exit.
-*
-*  Further Details
-*  ===============
-*
-*  Level 2 Blas routine.
-*
-*  -- Written on 22-October-1986.
-*     Jack Dongarra, Argonne National Lab.
-*     Jeremy Du Croz, Nag Central Office.
-*     Sven Hammarling, Nag Central Office.
-*     Richard Hanson, Sandia National Labs.
-*
-*  =====================================================================
-*
-*     .. Parameters ..
-      DOUBLE PRECISION ONE,ZERO
-      PARAMETER (ONE=1.0D+0,ZERO=0.0D+0)
-*     ..
-*     .. Local Scalars ..
-      DOUBLE PRECISION TEMP1,TEMP2
-      INTEGER I,INFO,IX,IY,J,JX,JY,K,KK,KX,KY
-*     ..
-*     .. External Functions ..
-      LOGICAL LSAME
-      EXTERNAL LSAME
-*     ..
-*     .. External Subroutines ..
-      EXTERNAL XERBLA
-*     ..
-*
-*     Test the input parameters.
-*
-      INFO = 0
-      IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN
-          INFO = 1
-      ELSE IF (N.LT.0) THEN
-          INFO = 2
-      ELSE IF (INCX.EQ.0) THEN
-          INFO = 6
-      ELSE IF (INCY.EQ.0) THEN
-          INFO = 9
-      END IF
-      IF (INFO.NE.0) THEN
-          CALL XERBLA('DSPMV ',INFO)
-          RETURN
-      END IF
-*
-*     Quick return if possible.
-*
-      IF ((N.EQ.0) .OR. ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN
-*
-*     Set up the start points in  X  and  Y.
-*
-      IF (INCX.GT.0) THEN
-          KX = 1
-      ELSE
-          KX = 1 - (N-1)*INCX
-      END IF
-      IF (INCY.GT.0) THEN
-          KY = 1
-      ELSE
-          KY = 1 - (N-1)*INCY
-      END IF
-*
-*     Start the operations. In this version the elements of the array AP
-*     are accessed sequentially with one pass through AP.
-*
-*     First form  y := beta*y.
-*
-      IF (BETA.NE.ONE) THEN
-          IF (INCY.EQ.1) THEN
-              IF (BETA.EQ.ZERO) THEN
-                  DO 10 I = 1,N
-                      Y(I) = ZERO
-   10             CONTINUE
-              ELSE
-                  DO 20 I = 1,N
-                      Y(I) = BETA*Y(I)
-   20             CONTINUE
-              END IF
-          ELSE
-              IY = KY
-              IF (BETA.EQ.ZERO) THEN
-                  DO 30 I = 1,N
-                      Y(IY) = ZERO
-                      IY = IY + INCY
-   30             CONTINUE
-              ELSE
-                  DO 40 I = 1,N
-                      Y(IY) = BETA*Y(IY)
-                      IY = IY + INCY
-   40             CONTINUE
-              END IF
-          END IF
-      END IF
-      IF (ALPHA.EQ.ZERO) RETURN
-      KK = 1
-      IF (LSAME(UPLO,'U')) THEN
-*
-*        Form  y  when AP contains the upper triangle.
-*
-          IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN
-              DO 60 J = 1,N
-                  TEMP1 = ALPHA*X(J)
-                  TEMP2 = ZERO
-                  K = KK
-                  DO 50 I = 1,J - 1
-                      Y(I) = Y(I) + TEMP1*AP(K)
-                      TEMP2 = TEMP2 + AP(K)*X(I)
-                      K = K + 1
-   50             CONTINUE
-                  Y(J) = Y(J) + TEMP1*AP(KK+J-1) + ALPHA*TEMP2
-                  KK = KK + J
-   60         CONTINUE
-          ELSE
-              JX = KX
-              JY = KY
-              DO 80 J = 1,N
-                  TEMP1 = ALPHA*X(JX)
-                  TEMP2 = ZERO
-                  IX = KX
-                  IY = KY
-                  DO 70 K = KK,KK + J - 2
-                      Y(IY) = Y(IY) + TEMP1*AP(K)
-                      TEMP2 = TEMP2 + AP(K)*X(IX)
-                      IX = IX + INCX
-                      IY = IY + INCY
-   70             CONTINUE
-                  Y(JY) = Y(JY) + TEMP1*AP(KK+J-1) + ALPHA*TEMP2
-                  JX = JX + INCX
-                  JY = JY + INCY
-                  KK = KK + J
-   80         CONTINUE
-          END IF
-      ELSE
-*
-*        Form  y  when AP contains the lower triangle.
-*
-          IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN
-              DO 100 J = 1,N
-                  TEMP1 = ALPHA*X(J)
-                  TEMP2 = ZERO
-                  Y(J) = Y(J) + TEMP1*AP(KK)
-                  K = KK + 1
-                  DO 90 I = J + 1,N
-                      Y(I) = Y(I) + TEMP1*AP(K)
-                      TEMP2 = TEMP2 + AP(K)*X(I)
-                      K = K + 1
-   90             CONTINUE
-                  Y(J) = Y(J) + ALPHA*TEMP2
-                  KK = KK + (N-J+1)
-  100         CONTINUE
-          ELSE
-              JX = KX
-              JY = KY
-              DO 120 J = 1,N
-                  TEMP1 = ALPHA*X(JX)
-                  TEMP2 = ZERO
-                  Y(JY) = Y(JY) + TEMP1*AP(KK)
-                  IX = JX
-                  IY = JY
-                  DO 110 K = KK + 1,KK + N - J
-                      IX = IX + INCX
-                      IY = IY + INCY
-                      Y(IY) = Y(IY) + TEMP1*AP(K)
-                      TEMP2 = TEMP2 + AP(K)*X(IX)
-  110             CONTINUE
-                  Y(JY) = Y(JY) + ALPHA*TEMP2
-                  JX = JX + INCX
-                  JY = JY + INCY
-                  KK = KK + (N-J+1)
-  120         CONTINUE
-          END IF
-      END IF
-*
-      RETURN
-*
-*     End of DSPMV .
-*
-      END
diff --git a/blas/fortran/dtbmv.f b/blas/fortran/dtbmv.f
deleted file mode 100644
index a87ffdeae..000000000
--- a/blas/fortran/dtbmv.f
+++ /dev/null
@@ -1,335 +0,0 @@
-      SUBROUTINE DTBMV(UPLO,TRANS,DIAG,N,K,A,LDA,X,INCX)
-*     .. Scalar Arguments ..
-      INTEGER INCX,K,LDA,N
-      CHARACTER DIAG,TRANS,UPLO
-*     ..
-*     .. Array Arguments ..
-      DOUBLE PRECISION A(LDA,*),X(*)
-*     ..
-*
-*  Purpose
-*  =======
-*
-*  DTBMV  performs one of the matrix-vector operations
-*
-*     x := A*x,   or   x := A'*x,
-*
-*  where x is an n element vector and  A is an n by n unit, or non-unit,
-*  upper or lower triangular band matrix, with ( k + 1 ) diagonals.
-*
-*  Arguments
-*  ==========
-*
-*  UPLO   - CHARACTER*1.
-*           On entry, UPLO specifies whether the matrix is an upper or
-*           lower triangular matrix as follows:
-*
-*              UPLO = 'U' or 'u'   A is an upper triangular matrix.
-*
-*              UPLO = 'L' or 'l'   A is a lower triangular matrix.
-*
-*           Unchanged on exit.
-*
-*  TRANS  - CHARACTER*1.
-*           On entry, TRANS specifies the operation to be performed as
-*           follows:
-*
-*              TRANS = 'N' or 'n'   x := A*x.
-*
-*              TRANS = 'T' or 't'   x := A'*x.
-*
-*              TRANS = 'C' or 'c'   x := A'*x.
-*
-*           Unchanged on exit.
-*
-*  DIAG   - CHARACTER*1.
-*           On entry, DIAG specifies whether or not A is unit
-*           triangular as follows:
-*
-*              DIAG = 'U' or 'u'   A is assumed to be unit triangular.
-*
-*              DIAG = 'N' or 'n'   A is not assumed to be unit
-*                                  triangular.
-*
-*           Unchanged on exit.
-*
-*  N      - INTEGER.
-*           On entry, N specifies the order of the matrix A.
-*           N must be at least zero.
-*           Unchanged on exit.
-*
-*  K      - INTEGER.
-*           On entry with UPLO = 'U' or 'u', K specifies the number of
-*           super-diagonals of the matrix A.
-*           On entry with UPLO = 'L' or 'l', K specifies the number of
-*           sub-diagonals of the matrix A.
-*           K must satisfy  0 .le. K.
-*           Unchanged on exit.
-*
-*  A      - DOUBLE PRECISION array of DIMENSION ( LDA, n ).
-*           Before entry with UPLO = 'U' or 'u', the leading ( k + 1 )
-*           by n part of the array A must contain the upper triangular
-*           band part of the matrix of coefficients, supplied column by
-*           column, with the leading diagonal of the matrix in row
-*           ( k + 1 ) of the array, the first super-diagonal starting at
-*           position 2 in row k, and so on. The top left k by k triangle
-*           of the array A is not referenced.
-*           The following program segment will transfer an upper
-*           triangular band matrix from conventional full matrix storage
-*           to band storage:
-*
-*                 DO 20, J = 1, N
-*                    M = K + 1 - J
-*                    DO 10, I = MAX( 1, J - K ), J
-*                       A( M + I, J ) = matrix( I, J )
-*              10    CONTINUE
-*              20 CONTINUE
-*
-*           Before entry with UPLO = 'L' or 'l', the leading ( k + 1 )
-*           by n part of the array A must contain the lower triangular
-*           band part of the matrix of coefficients, supplied column by
-*           column, with the leading diagonal of the matrix in row 1 of
-*           the array, the first sub-diagonal starting at position 1 in
-*           row 2, and so on. The bottom right k by k triangle of the
-*           array A is not referenced.
-*           The following program segment will transfer a lower
-*           triangular band matrix from conventional full matrix storage
-*           to band storage:
-*
-*                 DO 20, J = 1, N
-*                    M = 1 - J
-*                    DO 10, I = J, MIN( N, J + K )
-*                       A( M + I, J ) = matrix( I, J )
-*              10    CONTINUE
-*              20 CONTINUE
-*
-*           Note that when DIAG = 'U' or 'u' the elements of the array A
-*           corresponding to the diagonal elements of the matrix are not
-*           referenced, but are assumed to be unity.
-*           Unchanged on exit.
-*
-*  LDA    - INTEGER.
-*           On entry, LDA specifies the first dimension of A as declared
-*           in the calling (sub) program. LDA must be at least
-*           ( k + 1 ).
-*           Unchanged on exit.
-*
-*  X      - DOUBLE PRECISION array of dimension at least
-*           ( 1 + ( n - 1 )*abs( INCX ) ).
-*           Before entry, the incremented array X must contain the n
-*           element vector x. On exit, X is overwritten with the
-*           tranformed vector x.
-*
-*  INCX   - INTEGER.
-*           On entry, INCX specifies the increment for the elements of
-*           X. INCX must not be zero.
-*           Unchanged on exit.
-*
-*  Further Details
-*  ===============
-*
-*  Level 2 Blas routine.
-*
-*  -- Written on 22-October-1986.
-*     Jack Dongarra, Argonne National Lab.
-*     Jeremy Du Croz, Nag Central Office.
-*     Sven Hammarling, Nag Central Office.
-*     Richard Hanson, Sandia National Labs.
-*
-*  =====================================================================
-*
-*     .. Parameters ..
-      DOUBLE PRECISION ZERO
-      PARAMETER (ZERO=0.0D+0)
-*     ..
-*     .. Local Scalars ..
-      DOUBLE PRECISION TEMP
-      INTEGER I,INFO,IX,J,JX,KPLUS1,KX,L
-      LOGICAL NOUNIT
-*     ..
-*     .. External Functions ..
-      LOGICAL LSAME
-      EXTERNAL LSAME
-*     ..
-*     .. External Subroutines ..
-      EXTERNAL XERBLA
-*     ..
-*     .. Intrinsic Functions ..
-      INTRINSIC MAX,MIN
-*     ..
-*
-*     Test the input parameters.
-*
-      INFO = 0
-      IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN
-          INFO = 1
-      ELSE IF (.NOT.LSAME(TRANS,'N') .AND. .NOT.LSAME(TRANS,'T') .AND.
-     +         .NOT.LSAME(TRANS,'C')) THEN
-          INFO = 2
-      ELSE IF (.NOT.LSAME(DIAG,'U') .AND. .NOT.LSAME(DIAG,'N')) THEN
-          INFO = 3
-      ELSE IF (N.LT.0) THEN
-          INFO = 4
-      ELSE IF (K.LT.0) THEN
-          INFO = 5
-      ELSE IF (LDA.LT. (K+1)) THEN
-          INFO = 7
-      ELSE IF (INCX.EQ.0) THEN
-          INFO = 9
-      END IF
-      IF (INFO.NE.0) THEN
-          CALL XERBLA('DTBMV ',INFO)
-          RETURN
-      END IF
-*
-*     Quick return if possible.
-*
-      IF (N.EQ.0) RETURN
-*
-      NOUNIT = LSAME(DIAG,'N')
-*
-*     Set up the start point in X if the increment is not unity. This
-*     will be  ( N - 1 )*INCX   too small for descending loops.
-*
-      IF (INCX.LE.0) THEN
-          KX = 1 - (N-1)*INCX
-      ELSE IF (INCX.NE.1) THEN
-          KX = 1
-      END IF
-*
-*     Start the operations. In this version the elements of A are
-*     accessed sequentially with one pass through A.
-*
-      IF (LSAME(TRANS,'N')) THEN
-*
-*         Form  x := A*x.
-*
-          IF (LSAME(UPLO,'U')) THEN
-              KPLUS1 = K + 1
-              IF (INCX.EQ.1) THEN
-                  DO 20 J = 1,N
-                      IF (X(J).NE.ZERO) THEN
-                          TEMP = X(J)
-                          L = KPLUS1 - J
-                          DO 10 I = MAX(1,J-K),J - 1
-                              X(I) = X(I) + TEMP*A(L+I,J)
-   10                     CONTINUE
-                          IF (NOUNIT) X(J) = X(J)*A(KPLUS1,J)
-                      END IF
-   20             CONTINUE
-              ELSE
-                  JX = KX
-                  DO 40 J = 1,N
-                      IF (X(JX).NE.ZERO) THEN
-                          TEMP = X(JX)
-                          IX = KX
-                          L = KPLUS1 - J
-                          DO 30 I = MAX(1,J-K),J - 1
-                              X(IX) = X(IX) + TEMP*A(L+I,J)
-                              IX = IX + INCX
-   30                     CONTINUE
-                          IF (NOUNIT) X(JX) = X(JX)*A(KPLUS1,J)
-                      END IF
-                      JX = JX + INCX
-                      IF (J.GT.K) KX = KX + INCX
-   40             CONTINUE
-              END IF
-          ELSE
-              IF (INCX.EQ.1) THEN
-                  DO 60 J = N,1,-1
-                      IF (X(J).NE.ZERO) THEN
-                          TEMP = X(J)
-                          L = 1 - J
-                          DO 50 I = MIN(N,J+K),J + 1,-1
-                              X(I) = X(I) + TEMP*A(L+I,J)
-   50                     CONTINUE
-                          IF (NOUNIT) X(J) = X(J)*A(1,J)
-                      END IF
-   60             CONTINUE
-              ELSE
-                  KX = KX + (N-1)*INCX
-                  JX = KX
-                  DO 80 J = N,1,-1
-                      IF (X(JX).NE.ZERO) THEN
-                          TEMP = X(JX)
-                          IX = KX
-                          L = 1 - J
-                          DO 70 I = MIN(N,J+K),J + 1,-1
-                              X(IX) = X(IX) + TEMP*A(L+I,J)
-                              IX = IX - INCX
-   70                     CONTINUE
-                          IF (NOUNIT) X(JX) = X(JX)*A(1,J)
-                      END IF
-                      JX = JX - INCX
-                      IF ((N-J).GE.K) KX = KX - INCX
-   80             CONTINUE
-              END IF
-          END IF
-      ELSE
-*
-*        Form  x := A'*x.
-*
-          IF (LSAME(UPLO,'U')) THEN
-              KPLUS1 = K + 1
-              IF (INCX.EQ.1) THEN
-                  DO 100 J = N,1,-1
-                      TEMP = X(J)
-                      L = KPLUS1 - J
-                      IF (NOUNIT) TEMP = TEMP*A(KPLUS1,J)
-                      DO 90 I = J - 1,MAX(1,J-K),-1
-                          TEMP = TEMP + A(L+I,J)*X(I)
-   90                 CONTINUE
-                      X(J) = TEMP
-  100             CONTINUE
-              ELSE
-                  KX = KX + (N-1)*INCX
-                  JX = KX
-                  DO 120 J = N,1,-1
-                      TEMP = X(JX)
-                      KX = KX - INCX
-                      IX = KX
-                      L = KPLUS1 - J
-                      IF (NOUNIT) TEMP = TEMP*A(KPLUS1,J)
-                      DO 110 I = J - 1,MAX(1,J-K),-1
-                          TEMP = TEMP + A(L+I,J)*X(IX)
-                          IX = IX - INCX
-  110                 CONTINUE
-                      X(JX) = TEMP
-                      JX = JX - INCX
-  120             CONTINUE
-              END IF
-          ELSE
-              IF (INCX.EQ.1) THEN
-                  DO 140 J = 1,N
-                      TEMP = X(J)
-                      L = 1 - J
-                      IF (NOUNIT) TEMP = TEMP*A(1,J)
-                      DO 130 I = J + 1,MIN(N,J+K)
-                          TEMP = TEMP + A(L+I,J)*X(I)
-  130                 CONTINUE
-                      X(J) = TEMP
-  140             CONTINUE
-              ELSE
-                  JX = KX
-                  DO 160 J = 1,N
-                      TEMP = X(JX)
-                      KX = KX + INCX
-                      IX = KX
-                      L = 1 - J
-                      IF (NOUNIT) TEMP = TEMP*A(1,J)
-                      DO 150 I = J + 1,MIN(N,J+K)
-                          TEMP = TEMP + A(L+I,J)*X(IX)
-                          IX = IX + INCX
-  150                 CONTINUE
-                      X(JX) = TEMP
-                      JX = JX + INCX
-  160             CONTINUE
-              END IF
-          END IF
-      END IF
-*
-      RETURN
-*
-*     End of DTBMV .
-*
-      END
diff --git a/blas/fortran/lsame.f b/blas/fortran/lsame.f
deleted file mode 100644
index f53690268..000000000
--- a/blas/fortran/lsame.f
+++ /dev/null
@@ -1,85 +0,0 @@
-      LOGICAL FUNCTION LSAME(CA,CB)
-*
-*  -- LAPACK auxiliary routine (version 3.1) --
-*     Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd..
-*     November 2006
-*
-*     .. Scalar Arguments ..
-      CHARACTER CA,CB
-*     ..
-*
-*  Purpose
-*  =======
-*
-*  LSAME returns .TRUE. if CA is the same letter as CB regardless of
-*  case.
-*
-*  Arguments
-*  =========
-*
-*  CA      (input) CHARACTER*1
-*
-*  CB      (input) CHARACTER*1
-*          CA and CB specify the single characters to be compared.
-*
-* =====================================================================
-*
-*     .. Intrinsic Functions ..
-      INTRINSIC ICHAR
-*     ..
-*     .. Local Scalars ..
-      INTEGER INTA,INTB,ZCODE
-*     ..
-*
-*     Test if the characters are equal
-*
-      LSAME = CA .EQ. CB
-      IF (LSAME) RETURN
-*
-*     Now test for equivalence if both characters are alphabetic.
-*
-      ZCODE = ICHAR('Z')
-*
-*     Use 'Z' rather than 'A' so that ASCII can be detected on Prime
-*     machines, on which ICHAR returns a value with bit 8 set.
-*     ICHAR('A') on Prime machines returns 193 which is the same as
-*     ICHAR('A') on an EBCDIC machine.
-*
-      INTA = ICHAR(CA)
-      INTB = ICHAR(CB)
-*
-      IF (ZCODE.EQ.90 .OR. ZCODE.EQ.122) THEN
-*
-*        ASCII is assumed - ZCODE is the ASCII code of either lower or
-*        upper case 'Z'.
-*
-          IF (INTA.GE.97 .AND. INTA.LE.122) INTA = INTA - 32
-          IF (INTB.GE.97 .AND. INTB.LE.122) INTB = INTB - 32
-*
-      ELSE IF (ZCODE.EQ.233 .OR. ZCODE.EQ.169) THEN
-*
-*        EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or
-*        upper case 'Z'.
-*
-          IF (INTA.GE.129 .AND. INTA.LE.137 .OR.
-     +        INTA.GE.145 .AND. INTA.LE.153 .OR.
-     +        INTA.GE.162 .AND. INTA.LE.169) INTA = INTA + 64
-          IF (INTB.GE.129 .AND. INTB.LE.137 .OR.
-     +        INTB.GE.145 .AND. INTB.LE.153 .OR.
-     +        INTB.GE.162 .AND. INTB.LE.169) INTB = INTB + 64
-*
-      ELSE IF (ZCODE.EQ.218 .OR. ZCODE.EQ.250) THEN
-*
-*        ASCII is assumed, on Prime machines - ZCODE is the ASCII code
-*        plus 128 of either lower or upper case 'Z'.
-*
-          IF (INTA.GE.225 .AND. INTA.LE.250) INTA = INTA - 32
-          IF (INTB.GE.225 .AND. INTB.LE.250) INTB = INTB - 32
-      END IF
-      LSAME = INTA .EQ. INTB
-*
-*     RETURN
-*
-*     End of LSAME
-*
-      END
diff --git a/blas/fortran/srotm.f b/blas/fortran/srotm.f
deleted file mode 100644
index fc5a59333..000000000
--- a/blas/fortran/srotm.f
+++ /dev/null
@@ -1,148 +0,0 @@
-      SUBROUTINE SROTM(N,SX,INCX,SY,INCY,SPARAM)
-*     .. Scalar Arguments ..
-      INTEGER INCX,INCY,N
-*     ..
-*     .. Array Arguments ..
-      REAL SPARAM(5),SX(*),SY(*)
-*     ..
-*
-*  Purpose
-*  =======
-*
-*     APPLY THE MODIFIED GIVENS TRANSFORMATION, H, TO THE 2 BY N MATRIX
-*
-*     (SX**T) , WHERE **T INDICATES TRANSPOSE. THE ELEMENTS OF SX ARE IN
-*     (DX**T)
-*
-*     SX(LX+I*INCX), I = 0 TO N-1, WHERE LX = 1 IF INCX .GE. 0, ELSE
-*     LX = (-INCX)*N, AND SIMILARLY FOR SY USING USING LY AND INCY.
-*     WITH SPARAM(1)=SFLAG, H HAS ONE OF THE FOLLOWING FORMS..
-*
-*     SFLAG=-1.E0     SFLAG=0.E0        SFLAG=1.E0     SFLAG=-2.E0
-*
-*       (SH11  SH12)    (1.E0  SH12)    (SH11  1.E0)    (1.E0  0.E0)
-*     H=(          )    (          )    (          )    (          )
-*       (SH21  SH22),   (SH21  1.E0),   (-1.E0 SH22),   (0.E0  1.E0).
-*     SEE  SROTMG FOR A DESCRIPTION OF DATA STORAGE IN SPARAM.
-*
-*
-*  Arguments
-*  =========
-*
-*  N      (input) INTEGER
-*         number of elements in input vector(s)
-*
-*  SX     (input/output) REAL array, dimension N
-*         double precision vector with N elements
-*
-*  INCX   (input) INTEGER
-*         storage spacing between elements of SX
-*
-*  SY     (input/output) REAL array, dimension N
-*         double precision vector with N elements
-*
-*  INCY   (input) INTEGER
-*         storage spacing between elements of SY
-*
-*  SPARAM (input/output)  REAL array, dimension 5
-*     SPARAM(1)=SFLAG
-*     SPARAM(2)=SH11
-*     SPARAM(3)=SH21
-*     SPARAM(4)=SH12
-*     SPARAM(5)=SH22
-*
-*  =====================================================================
-*
-*     .. Local Scalars ..
-      REAL SFLAG,SH11,SH12,SH21,SH22,TWO,W,Z,ZERO
-      INTEGER I,KX,KY,NSTEPS
-*     ..
-*     .. Data statements ..
-      DATA ZERO,TWO/0.E0,2.E0/
-*     ..
-*
-      SFLAG = SPARAM(1)
-      IF (N.LE.0 .OR. (SFLAG+TWO.EQ.ZERO)) GO TO 140
-      IF (.NOT. (INCX.EQ.INCY.AND.INCX.GT.0)) GO TO 70
-*
-      NSTEPS = N*INCX
-      IF (SFLAG) 50,10,30
-   10 CONTINUE
-      SH12 = SPARAM(4)
-      SH21 = SPARAM(3)
-      DO 20 I = 1,NSTEPS,INCX
-          W = SX(I)
-          Z = SY(I)
-          SX(I) = W + Z*SH12
-          SY(I) = W*SH21 + Z
-   20 CONTINUE
-      GO TO 140
-   30 CONTINUE
-      SH11 = SPARAM(2)
-      SH22 = SPARAM(5)
-      DO 40 I = 1,NSTEPS,INCX
-          W = SX(I)
-          Z = SY(I)
-          SX(I) = W*SH11 + Z
-          SY(I) = -W + SH22*Z
-   40 CONTINUE
-      GO TO 140
-   50 CONTINUE
-      SH11 = SPARAM(2)
-      SH12 = SPARAM(4)
-      SH21 = SPARAM(3)
-      SH22 = SPARAM(5)
-      DO 60 I = 1,NSTEPS,INCX
-          W = SX(I)
-          Z = SY(I)
-          SX(I) = W*SH11 + Z*SH12
-          SY(I) = W*SH21 + Z*SH22
-   60 CONTINUE
-      GO TO 140
-   70 CONTINUE
-      KX = 1
-      KY = 1
-      IF (INCX.LT.0) KX = 1 + (1-N)*INCX
-      IF (INCY.LT.0) KY = 1 + (1-N)*INCY
-*
-      IF (SFLAG) 120,80,100
-   80 CONTINUE
-      SH12 = SPARAM(4)
-      SH21 = SPARAM(3)
-      DO 90 I = 1,N
-          W = SX(KX)
-          Z = SY(KY)
-          SX(KX) = W + Z*SH12
-          SY(KY) = W*SH21 + Z
-          KX = KX + INCX
-          KY = KY + INCY
-   90 CONTINUE
-      GO TO 140
-  100 CONTINUE
-      SH11 = SPARAM(2)
-      SH22 = SPARAM(5)
-      DO 110 I = 1,N
-          W = SX(KX)
-          Z = SY(KY)
-          SX(KX) = W*SH11 + Z
-          SY(KY) = -W + SH22*Z
-          KX = KX + INCX
-          KY = KY + INCY
-  110 CONTINUE
-      GO TO 140
-  120 CONTINUE
-      SH11 = SPARAM(2)
-      SH12 = SPARAM(4)
-      SH21 = SPARAM(3)
-      SH22 = SPARAM(5)
-      DO 130 I = 1,N
-          W = SX(KX)
-          Z = SY(KY)
-          SX(KX) = W*SH11 + Z*SH12
-          SY(KY) = W*SH21 + Z*SH22
-          KX = KX + INCX
-          KY = KY + INCY
-  130 CONTINUE
-  140 CONTINUE
-      RETURN
-      END
diff --git a/blas/fortran/srotmg.f b/blas/fortran/srotmg.f
deleted file mode 100644
index 7b3bd4272..000000000
--- a/blas/fortran/srotmg.f
+++ /dev/null
@@ -1,208 +0,0 @@
-      SUBROUTINE SROTMG(SD1,SD2,SX1,SY1,SPARAM)
-*     .. Scalar Arguments ..
-      REAL SD1,SD2,SX1,SY1
-*     ..
-*     .. Array Arguments ..
-      REAL SPARAM(5)
-*     ..
-*
-*  Purpose
-*  =======
-*
-*     CONSTRUCT THE MODIFIED GIVENS TRANSFORMATION MATRIX H WHICH ZEROS
-*     THE SECOND COMPONENT OF THE 2-VECTOR  (SQRT(SD1)*SX1,SQRT(SD2)*
-*     SY2)**T.
-*     WITH SPARAM(1)=SFLAG, H HAS ONE OF THE FOLLOWING FORMS..
-*
-*     SFLAG=-1.E0     SFLAG=0.E0        SFLAG=1.E0     SFLAG=-2.E0
-*
-*       (SH11  SH12)    (1.E0  SH12)    (SH11  1.E0)    (1.E0  0.E0)
-*     H=(          )    (          )    (          )    (          )
-*       (SH21  SH22),   (SH21  1.E0),   (-1.E0 SH22),   (0.E0  1.E0).
-*     LOCATIONS 2-4 OF SPARAM CONTAIN SH11,SH21,SH12, AND SH22
-*     RESPECTIVELY. (VALUES OF 1.E0, -1.E0, OR 0.E0 IMPLIED BY THE
-*     VALUE OF SPARAM(1) ARE NOT STORED IN SPARAM.)
-*
-*     THE VALUES OF GAMSQ AND RGAMSQ SET IN THE DATA STATEMENT MAY BE
-*     INEXACT.  THIS IS OK AS THEY ARE ONLY USED FOR TESTING THE SIZE
-*     OF SD1 AND SD2.  ALL ACTUAL SCALING OF DATA IS DONE USING GAM.
-*
-*
-*  Arguments
-*  =========
-*
-*
-*  SD1    (input/output) REAL
-*
-*  SD2    (input/output) REAL
-*
-*  SX1    (input/output) REAL
-*
-*  SY1    (input) REAL
-*
-*
-*  SPARAM (input/output)  REAL array, dimension 5
-*     SPARAM(1)=SFLAG
-*     SPARAM(2)=SH11
-*     SPARAM(3)=SH21
-*     SPARAM(4)=SH12
-*     SPARAM(5)=SH22
-*
-*  =====================================================================
-*
-*     .. Local Scalars ..
-      REAL GAM,GAMSQ,ONE,RGAMSQ,SFLAG,SH11,SH12,SH21,SH22,SP1,SP2,SQ1,
-     +     SQ2,STEMP,SU,TWO,ZERO
-      INTEGER IGO
-*     ..
-*     .. Intrinsic Functions ..
-      INTRINSIC ABS
-*     ..
-*     .. Data statements ..
-*
-      DATA ZERO,ONE,TWO/0.E0,1.E0,2.E0/
-      DATA GAM,GAMSQ,RGAMSQ/4096.E0,1.67772E7,5.96046E-8/
-*     ..
-
-      IF (.NOT.SD1.LT.ZERO) GO TO 10
-*       GO ZERO-H-D-AND-SX1..
-      GO TO 60
-   10 CONTINUE
-*     CASE-SD1-NONNEGATIVE
-      SP2 = SD2*SY1
-      IF (.NOT.SP2.EQ.ZERO) GO TO 20
-      SFLAG = -TWO
-      GO TO 260
-*     REGULAR-CASE..
-   20 CONTINUE
-      SP1 = SD1*SX1
-      SQ2 = SP2*SY1
-      SQ1 = SP1*SX1
-*
-      IF (.NOT.ABS(SQ1).GT.ABS(SQ2)) GO TO 40
-      SH21 = -SY1/SX1
-      SH12 = SP2/SP1
-*
-      SU = ONE - SH12*SH21
-*
-      IF (.NOT.SU.LE.ZERO) GO TO 30
-*         GO ZERO-H-D-AND-SX1..
-      GO TO 60
-   30 CONTINUE
-      SFLAG = ZERO
-      SD1 = SD1/SU
-      SD2 = SD2/SU
-      SX1 = SX1*SU
-*         GO SCALE-CHECK..
-      GO TO 100
-   40 CONTINUE
-      IF (.NOT.SQ2.LT.ZERO) GO TO 50
-*         GO ZERO-H-D-AND-SX1..
-      GO TO 60
-   50 CONTINUE
-      SFLAG = ONE
-      SH11 = SP1/SP2
-      SH22 = SX1/SY1
-      SU = ONE + SH11*SH22
-      STEMP = SD2/SU
-      SD2 = SD1/SU
-      SD1 = STEMP
-      SX1 = SY1*SU
-*         GO SCALE-CHECK
-      GO TO 100
-*     PROCEDURE..ZERO-H-D-AND-SX1..
-   60 CONTINUE
-      SFLAG = -ONE
-      SH11 = ZERO
-      SH12 = ZERO
-      SH21 = ZERO
-      SH22 = ZERO
-*
-      SD1 = ZERO
-      SD2 = ZERO
-      SX1 = ZERO
-*         RETURN..
-      GO TO 220
-*     PROCEDURE..FIX-H..
-   70 CONTINUE
-      IF (.NOT.SFLAG.GE.ZERO) GO TO 90
-*
-      IF (.NOT.SFLAG.EQ.ZERO) GO TO 80
-      SH11 = ONE
-      SH22 = ONE
-      SFLAG = -ONE
-      GO TO 90
-   80 CONTINUE
-      SH21 = -ONE
-      SH12 = ONE
-      SFLAG = -ONE
-   90 CONTINUE
-      GO TO IGO(120,150,180,210)
-*     PROCEDURE..SCALE-CHECK
-  100 CONTINUE
-  110 CONTINUE
-      IF (.NOT.SD1.LE.RGAMSQ) GO TO 130
-      IF (SD1.EQ.ZERO) GO TO 160
-      ASSIGN 120 TO IGO
-*              FIX-H..
-      GO TO 70
-  120 CONTINUE
-      SD1 = SD1*GAM**2
-      SX1 = SX1/GAM
-      SH11 = SH11/GAM
-      SH12 = SH12/GAM
-      GO TO 110
-  130 CONTINUE
-  140 CONTINUE
-      IF (.NOT.SD1.GE.GAMSQ) GO TO 160
-      ASSIGN 150 TO IGO
-*              FIX-H..
-      GO TO 70
-  150 CONTINUE
-      SD1 = SD1/GAM**2
-      SX1 = SX1*GAM
-      SH11 = SH11*GAM
-      SH12 = SH12*GAM
-      GO TO 140
-  160 CONTINUE
-  170 CONTINUE
-      IF (.NOT.ABS(SD2).LE.RGAMSQ) GO TO 190
-      IF (SD2.EQ.ZERO) GO TO 220
-      ASSIGN 180 TO IGO
-*              FIX-H..
-      GO TO 70
-  180 CONTINUE
-      SD2 = SD2*GAM**2
-      SH21 = SH21/GAM
-      SH22 = SH22/GAM
-      GO TO 170
-  190 CONTINUE
-  200 CONTINUE
-      IF (.NOT.ABS(SD2).GE.GAMSQ) GO TO 220
-      ASSIGN 210 TO IGO
-*              FIX-H..
-      GO TO 70
-  210 CONTINUE
-      SD2 = SD2/GAM**2
-      SH21 = SH21*GAM
-      SH22 = SH22*GAM
-      GO TO 200
-  220 CONTINUE
-      IF (SFLAG) 250,230,240
-  230 CONTINUE
-      SPARAM(3) = SH21
-      SPARAM(4) = SH12
-      GO TO 260
-  240 CONTINUE
-      SPARAM(2) = SH11
-      SPARAM(5) = SH22
-      GO TO 260
-  250 CONTINUE
-      SPARAM(2) = SH11
-      SPARAM(3) = SH21
-      SPARAM(4) = SH12
-      SPARAM(5) = SH22
-  260 CONTINUE
-      SPARAM(1) = SFLAG
-      RETURN
-      END
diff --git a/blas/fortran/ssbmv.f b/blas/fortran/ssbmv.f
deleted file mode 100644
index 16893a295..000000000
--- a/blas/fortran/ssbmv.f
+++ /dev/null
@@ -1,306 +0,0 @@
-      SUBROUTINE SSBMV(UPLO,N,K,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
-*     .. Scalar Arguments ..
-      REAL ALPHA,BETA
-      INTEGER INCX,INCY,K,LDA,N
-      CHARACTER UPLO
-*     ..
-*     .. Array Arguments ..
-      REAL A(LDA,*),X(*),Y(*)
-*     ..
-*
-*  Purpose
-*  =======
-*
-*  SSBMV  performs the matrix-vector  operation
-*
-*     y := alpha*A*x + beta*y,
-*
-*  where alpha and beta are scalars, x and y are n element vectors and
-*  A is an n by n symmetric band matrix, with k super-diagonals.
-*
-*  Arguments
-*  ==========
-*
-*  UPLO   - CHARACTER*1.
-*           On entry, UPLO specifies whether the upper or lower
-*           triangular part of the band matrix A is being supplied as
-*           follows:
-*
-*              UPLO = 'U' or 'u'   The upper triangular part of A is
-*                                  being supplied.
-*
-*              UPLO = 'L' or 'l'   The lower triangular part of A is
-*                                  being supplied.
-*
-*           Unchanged on exit.
-*
-*  N      - INTEGER.
-*           On entry, N specifies the order of the matrix A.
-*           N must be at least zero.
-*           Unchanged on exit.
-*
-*  K      - INTEGER.
-*           On entry, K specifies the number of super-diagonals of the
-*           matrix A. K must satisfy  0 .le. K.
-*           Unchanged on exit.
-*
-*  ALPHA  - REAL            .
-*           On entry, ALPHA specifies the scalar alpha.
-*           Unchanged on exit.
-*
-*  A      - REAL             array of DIMENSION ( LDA, n ).
-*           Before entry with UPLO = 'U' or 'u', the leading ( k + 1 )
-*           by n part of the array A must contain the upper triangular
-*           band part of the symmetric matrix, supplied column by
-*           column, with the leading diagonal of the matrix in row
-*           ( k + 1 ) of the array, the first super-diagonal starting at
-*           position 2 in row k, and so on. The top left k by k triangle
-*           of the array A is not referenced.
-*           The following program segment will transfer the upper
-*           triangular part of a symmetric band matrix from conventional
-*           full matrix storage to band storage:
-*
-*                 DO 20, J = 1, N
-*                    M = K + 1 - J
-*                    DO 10, I = MAX( 1, J - K ), J
-*                       A( M + I, J ) = matrix( I, J )
-*              10    CONTINUE
-*              20 CONTINUE
-*
-*           Before entry with UPLO = 'L' or 'l', the leading ( k + 1 )
-*           by n part of the array A must contain the lower triangular
-*           band part of the symmetric matrix, supplied column by
-*           column, with the leading diagonal of the matrix in row 1 of
-*           the array, the first sub-diagonal starting at position 1 in
-*           row 2, and so on. The bottom right k by k triangle of the
-*           array A is not referenced.
-*           The following program segment will transfer the lower
-*           triangular part of a symmetric band matrix from conventional
-*           full matrix storage to band storage:
-*
-*                 DO 20, J = 1, N
-*                    M = 1 - J
-*                    DO 10, I = J, MIN( N, J + K )
-*                       A( M + I, J ) = matrix( I, J )
-*              10    CONTINUE
-*              20 CONTINUE
-*
-*           Unchanged on exit.
-*
-*  LDA    - INTEGER.
-*           On entry, LDA specifies the first dimension of A as declared
-*           in the calling (sub) program. LDA must be at least
-*           ( k + 1 ).
-*           Unchanged on exit.
-*
-*  X      - REAL             array of DIMENSION at least
-*           ( 1 + ( n - 1 )*abs( INCX ) ).
-*           Before entry, the incremented array X must contain the
-*           vector x.
-*           Unchanged on exit.
-*
-*  INCX   - INTEGER.
-*           On entry, INCX specifies the increment for the elements of
-*           X. INCX must not be zero.
-*           Unchanged on exit.
-*
-*  BETA   - REAL            .
-*           On entry, BETA specifies the scalar beta.
-*           Unchanged on exit.
-*
-*  Y      - REAL             array of DIMENSION at least
-*           ( 1 + ( n - 1 )*abs( INCY ) ).
-*           Before entry, the incremented array Y must contain the
-*           vector y. On exit, Y is overwritten by the updated vector y.
-*
-*  INCY   - INTEGER.
-*           On entry, INCY specifies the increment for the elements of
-*           Y. INCY must not be zero.
-*           Unchanged on exit.
-*
-*  Further Details
-*  ===============
-*
-*  Level 2 Blas routine.
-*
-*  -- Written on 22-October-1986.
-*     Jack Dongarra, Argonne National Lab.
-*     Jeremy Du Croz, Nag Central Office.
-*     Sven Hammarling, Nag Central Office.
-*     Richard Hanson, Sandia National Labs.
-*
-*  =====================================================================
-*
-*     .. Parameters ..
-      REAL ONE,ZERO
-      PARAMETER (ONE=1.0E+0,ZERO=0.0E+0)
-*     ..
-*     .. Local Scalars ..
-      REAL TEMP1,TEMP2
-      INTEGER I,INFO,IX,IY,J,JX,JY,KPLUS1,KX,KY,L
-*     ..
-*     .. External Functions ..
-      LOGICAL LSAME
-      EXTERNAL LSAME
-*     ..
-*     .. External Subroutines ..
-      EXTERNAL XERBLA
-*     ..
-*     .. Intrinsic Functions ..
-      INTRINSIC MAX,MIN
-*     ..
-*
-*     Test the input parameters.
-*
-      INFO = 0
-      IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN
-          INFO = 1
-      ELSE IF (N.LT.0) THEN
-          INFO = 2
-      ELSE IF (K.LT.0) THEN
-          INFO = 3
-      ELSE IF (LDA.LT. (K+1)) THEN
-          INFO = 6
-      ELSE IF (INCX.EQ.0) THEN
-          INFO = 8
-      ELSE IF (INCY.EQ.0) THEN
-          INFO = 11
-      END IF
-      IF (INFO.NE.0) THEN
-          CALL XERBLA('SSBMV ',INFO)
-          RETURN
-      END IF
-*
-*     Quick return if possible.
-*
-      IF ((N.EQ.0) .OR. ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN
-*
-*     Set up the start points in  X  and  Y.
-*
-      IF (INCX.GT.0) THEN
-          KX = 1
-      ELSE
-          KX = 1 - (N-1)*INCX
-      END IF
-      IF (INCY.GT.0) THEN
-          KY = 1
-      ELSE
-          KY = 1 - (N-1)*INCY
-      END IF
-*
-*     Start the operations. In this version the elements of the array A
-*     are accessed sequentially with one pass through A.
-*
-*     First form  y := beta*y.
-*
-      IF (BETA.NE.ONE) THEN
-          IF (INCY.EQ.1) THEN
-              IF (BETA.EQ.ZERO) THEN
-                  DO 10 I = 1,N
-                      Y(I) = ZERO
-   10             CONTINUE
-              ELSE
-                  DO 20 I = 1,N
-                      Y(I) = BETA*Y(I)
-   20             CONTINUE
-              END IF
-          ELSE
-              IY = KY
-              IF (BETA.EQ.ZERO) THEN
-                  DO 30 I = 1,N
-                      Y(IY) = ZERO
-                      IY = IY + INCY
-   30             CONTINUE
-              ELSE
-                  DO 40 I = 1,N
-                      Y(IY) = BETA*Y(IY)
-                      IY = IY + INCY
-   40             CONTINUE
-              END IF
-          END IF
-      END IF
-      IF (ALPHA.EQ.ZERO) RETURN
-      IF (LSAME(UPLO,'U')) THEN
-*
-*        Form  y  when upper triangle of A is stored.
-*
-          KPLUS1 = K + 1
-          IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN
-              DO 60 J = 1,N
-                  TEMP1 = ALPHA*X(J)
-                  TEMP2 = ZERO
-                  L = KPLUS1 - J
-                  DO 50 I = MAX(1,J-K),J - 1
-                      Y(I) = Y(I) + TEMP1*A(L+I,J)
-                      TEMP2 = TEMP2 + A(L+I,J)*X(I)
-   50             CONTINUE
-                  Y(J) = Y(J) + TEMP1*A(KPLUS1,J) + ALPHA*TEMP2
-   60         CONTINUE
-          ELSE
-              JX = KX
-              JY = KY
-              DO 80 J = 1,N
-                  TEMP1 = ALPHA*X(JX)
-                  TEMP2 = ZERO
-                  IX = KX
-                  IY = KY
-                  L = KPLUS1 - J
-                  DO 70 I = MAX(1,J-K),J - 1
-                      Y(IY) = Y(IY) + TEMP1*A(L+I,J)
-                      TEMP2 = TEMP2 + A(L+I,J)*X(IX)
-                      IX = IX + INCX
-                      IY = IY + INCY
-   70             CONTINUE
-                  Y(JY) = Y(JY) + TEMP1*A(KPLUS1,J) + ALPHA*TEMP2
-                  JX = JX + INCX
-                  JY = JY + INCY
-                  IF (J.GT.K) THEN
-                      KX = KX + INCX
-                      KY = KY + INCY
-                  END IF
-   80         CONTINUE
-          END IF
-      ELSE
-*
-*        Form  y  when lower triangle of A is stored.
-*
-          IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN
-              DO 100 J = 1,N
-                  TEMP1 = ALPHA*X(J)
-                  TEMP2 = ZERO
-                  Y(J) = Y(J) + TEMP1*A(1,J)
-                  L = 1 - J
-                  DO 90 I = J + 1,MIN(N,J+K)
-                      Y(I) = Y(I) + TEMP1*A(L+I,J)
-                      TEMP2 = TEMP2 + A(L+I,J)*X(I)
-   90             CONTINUE
-                  Y(J) = Y(J) + ALPHA*TEMP2
-  100         CONTINUE
-          ELSE
-              JX = KX
-              JY = KY
-              DO 120 J = 1,N
-                  TEMP1 = ALPHA*X(JX)
-                  TEMP2 = ZERO
-                  Y(JY) = Y(JY) + TEMP1*A(1,J)
-                  L = 1 - J
-                  IX = JX
-                  IY = JY
-                  DO 110 I = J + 1,MIN(N,J+K)
-                      IX = IX + INCX
-                      IY = IY + INCY
-                      Y(IY) = Y(IY) + TEMP1*A(L+I,J)
-                      TEMP2 = TEMP2 + A(L+I,J)*X(IX)
-  110             CONTINUE
-                  Y(JY) = Y(JY) + ALPHA*TEMP2
-                  JX = JX + INCX
-                  JY = JY + INCY
-  120         CONTINUE
-          END IF
-      END IF
-*
-      RETURN
-*
-*     End of SSBMV .
-*
-      END
diff --git a/blas/fortran/sspmv.f b/blas/fortran/sspmv.f
deleted file mode 100644
index 0b8449824..000000000
--- a/blas/fortran/sspmv.f
+++ /dev/null
@@ -1,265 +0,0 @@
-      SUBROUTINE SSPMV(UPLO,N,ALPHA,AP,X,INCX,BETA,Y,INCY)
-*     .. Scalar Arguments ..
-      REAL ALPHA,BETA
-      INTEGER INCX,INCY,N
-      CHARACTER UPLO
-*     ..
-*     .. Array Arguments ..
-      REAL AP(*),X(*),Y(*)
-*     ..
-*
-*  Purpose
-*  =======
-*
-*  SSPMV  performs the matrix-vector operation
-*
-*     y := alpha*A*x + beta*y,
-*
-*  where alpha and beta are scalars, x and y are n element vectors and
-*  A is an n by n symmetric matrix, supplied in packed form.
-*
-*  Arguments
-*  ==========
-*
-*  UPLO   - CHARACTER*1.
-*           On entry, UPLO specifies whether the upper or lower
-*           triangular part of the matrix A is supplied in the packed
-*           array AP as follows:
-*
-*              UPLO = 'U' or 'u'   The upper triangular part of A is
-*                                  supplied in AP.
-*
-*              UPLO = 'L' or 'l'   The lower triangular part of A is
-*                                  supplied in AP.
-*
-*           Unchanged on exit.
-*
-*  N      - INTEGER.
-*           On entry, N specifies the order of the matrix A.
-*           N must be at least zero.
-*           Unchanged on exit.
-*
-*  ALPHA  - REAL            .
-*           On entry, ALPHA specifies the scalar alpha.
-*           Unchanged on exit.
-*
-*  AP     - REAL             array of DIMENSION at least
-*           ( ( n*( n + 1 ) )/2 ).
-*           Before entry with UPLO = 'U' or 'u', the array AP must
-*           contain the upper triangular part of the symmetric matrix
-*           packed sequentially, column by column, so that AP( 1 )
-*           contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 )
-*           and a( 2, 2 ) respectively, and so on.
-*           Before entry with UPLO = 'L' or 'l', the array AP must
-*           contain the lower triangular part of the symmetric matrix
-*           packed sequentially, column by column, so that AP( 1 )
-*           contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 )
-*           and a( 3, 1 ) respectively, and so on.
-*           Unchanged on exit.
-*
-*  X      - REAL             array of dimension at least
-*           ( 1 + ( n - 1 )*abs( INCX ) ).
-*           Before entry, the incremented array X must contain the n
-*           element vector x.
-*           Unchanged on exit.
-*
-*  INCX   - INTEGER.
-*           On entry, INCX specifies the increment for the elements of
-*           X. INCX must not be zero.
-*           Unchanged on exit.
-*
-*  BETA   - REAL            .
-*           On entry, BETA specifies the scalar beta. When BETA is
-*           supplied as zero then Y need not be set on input.
-*           Unchanged on exit.
-*
-*  Y      - REAL             array of dimension at least
-*           ( 1 + ( n - 1 )*abs( INCY ) ).
-*           Before entry, the incremented array Y must contain the n
-*           element vector y. On exit, Y is overwritten by the updated
-*           vector y.
-*
-*  INCY   - INTEGER.
-*           On entry, INCY specifies the increment for the elements of
-*           Y. INCY must not be zero.
-*           Unchanged on exit.
-*
-*  Further Details
-*  ===============
-*
-*  Level 2 Blas routine.
-*
-*  -- Written on 22-October-1986.
-*     Jack Dongarra, Argonne National Lab.
-*     Jeremy Du Croz, Nag Central Office.
-*     Sven Hammarling, Nag Central Office.
-*     Richard Hanson, Sandia National Labs.
-*
-*  =====================================================================
-*
-*     .. Parameters ..
-      REAL ONE,ZERO
-      PARAMETER (ONE=1.0E+0,ZERO=0.0E+0)
-*     ..
-*     .. Local Scalars ..
-      REAL TEMP1,TEMP2
-      INTEGER I,INFO,IX,IY,J,JX,JY,K,KK,KX,KY
-*     ..
-*     .. External Functions ..
-      LOGICAL LSAME
-      EXTERNAL LSAME
-*     ..
-*     .. External Subroutines ..
-      EXTERNAL XERBLA
-*     ..
-*
-*     Test the input parameters.
-*
-      INFO = 0
-      IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN
-          INFO = 1
-      ELSE IF (N.LT.0) THEN
-          INFO = 2
-      ELSE IF (INCX.EQ.0) THEN
-          INFO = 6
-      ELSE IF (INCY.EQ.0) THEN
-          INFO = 9
-      END IF
-      IF (INFO.NE.0) THEN
-          CALL XERBLA('SSPMV ',INFO)
-          RETURN
-      END IF
-*
-*     Quick return if possible.
-*
-      IF ((N.EQ.0) .OR. ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN
-*
-*     Set up the start points in  X  and  Y.
-*
-      IF (INCX.GT.0) THEN
-          KX = 1
-      ELSE
-          KX = 1 - (N-1)*INCX
-      END IF
-      IF (INCY.GT.0) THEN
-          KY = 1
-      ELSE
-          KY = 1 - (N-1)*INCY
-      END IF
-*
-*     Start the operations. In this version the elements of the array AP
-*     are accessed sequentially with one pass through AP.
-*
-*     First form  y := beta*y.
-*
-      IF (BETA.NE.ONE) THEN
-          IF (INCY.EQ.1) THEN
-              IF (BETA.EQ.ZERO) THEN
-                  DO 10 I = 1,N
-                      Y(I) = ZERO
-   10             CONTINUE
-              ELSE
-                  DO 20 I = 1,N
-                      Y(I) = BETA*Y(I)
-   20             CONTINUE
-              END IF
-          ELSE
-              IY = KY
-              IF (BETA.EQ.ZERO) THEN
-                  DO 30 I = 1,N
-                      Y(IY) = ZERO
-                      IY = IY + INCY
-   30             CONTINUE
-              ELSE
-                  DO 40 I = 1,N
-                      Y(IY) = BETA*Y(IY)
-                      IY = IY + INCY
-   40             CONTINUE
-              END IF
-          END IF
-      END IF
-      IF (ALPHA.EQ.ZERO) RETURN
-      KK = 1
-      IF (LSAME(UPLO,'U')) THEN
-*
-*        Form  y  when AP contains the upper triangle.
-*
-          IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN
-              DO 60 J = 1,N
-                  TEMP1 = ALPHA*X(J)
-                  TEMP2 = ZERO
-                  K = KK
-                  DO 50 I = 1,J - 1
-                      Y(I) = Y(I) + TEMP1*AP(K)
-                      TEMP2 = TEMP2 + AP(K)*X(I)
-                      K = K + 1
-   50             CONTINUE
-                  Y(J) = Y(J) + TEMP1*AP(KK+J-1) + ALPHA*TEMP2
-                  KK = KK + J
-   60         CONTINUE
-          ELSE
-              JX = KX
-              JY = KY
-              DO 80 J = 1,N
-                  TEMP1 = ALPHA*X(JX)
-                  TEMP2 = ZERO
-                  IX = KX
-                  IY = KY
-                  DO 70 K = KK,KK + J - 2
-                      Y(IY) = Y(IY) + TEMP1*AP(K)
-                      TEMP2 = TEMP2 + AP(K)*X(IX)
-                      IX = IX + INCX
-                      IY = IY + INCY
-   70             CONTINUE
-                  Y(JY) = Y(JY) + TEMP1*AP(KK+J-1) + ALPHA*TEMP2
-                  JX = JX + INCX
-                  JY = JY + INCY
-                  KK = KK + J
-   80         CONTINUE
-          END IF
-      ELSE
-*
-*        Form  y  when AP contains the lower triangle.
-*
-          IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN
-              DO 100 J = 1,N
-                  TEMP1 = ALPHA*X(J)
-                  TEMP2 = ZERO
-                  Y(J) = Y(J) + TEMP1*AP(KK)
-                  K = KK + 1
-                  DO 90 I = J + 1,N
-                      Y(I) = Y(I) + TEMP1*AP(K)
-                      TEMP2 = TEMP2 + AP(K)*X(I)
-                      K = K + 1
-   90             CONTINUE
-                  Y(J) = Y(J) + ALPHA*TEMP2
-                  KK = KK + (N-J+1)
-  100         CONTINUE
-          ELSE
-              JX = KX
-              JY = KY
-              DO 120 J = 1,N
-                  TEMP1 = ALPHA*X(JX)
-                  TEMP2 = ZERO
-                  Y(JY) = Y(JY) + TEMP1*AP(KK)
-                  IX = JX
-                  IY = JY
-                  DO 110 K = KK + 1,KK + N - J
-                      IX = IX + INCX
-                      IY = IY + INCY
-                      Y(IY) = Y(IY) + TEMP1*AP(K)
-                      TEMP2 = TEMP2 + AP(K)*X(IX)
-  110             CONTINUE
-                  Y(JY) = Y(JY) + ALPHA*TEMP2
-                  JX = JX + INCX
-                  JY = JY + INCY
-                  KK = KK + (N-J+1)
-  120         CONTINUE
-          END IF
-      END IF
-*
-      RETURN
-*
-*     End of SSPMV .
-*
-      END
diff --git a/blas/fortran/stbmv.f b/blas/fortran/stbmv.f
deleted file mode 100644
index c0b8f1136..000000000
--- a/blas/fortran/stbmv.f
+++ /dev/null
@@ -1,335 +0,0 @@
-      SUBROUTINE STBMV(UPLO,TRANS,DIAG,N,K,A,LDA,X,INCX)
-*     .. Scalar Arguments ..
-      INTEGER INCX,K,LDA,N
-      CHARACTER DIAG,TRANS,UPLO
-*     ..
-*     .. Array Arguments ..
-      REAL A(LDA,*),X(*)
-*     ..
-*
-*  Purpose
-*  =======
-*
-*  STBMV  performs one of the matrix-vector operations
-*
-*     x := A*x,   or   x := A'*x,
-*
-*  where x is an n element vector and  A is an n by n unit, or non-unit,
-*  upper or lower triangular band matrix, with ( k + 1 ) diagonals.
-*
-*  Arguments
-*  ==========
-*
-*  UPLO   - CHARACTER*1.
-*           On entry, UPLO specifies whether the matrix is an upper or
-*           lower triangular matrix as follows:
-*
-*              UPLO = 'U' or 'u'   A is an upper triangular matrix.
-*
-*              UPLO = 'L' or 'l'   A is a lower triangular matrix.
-*
-*           Unchanged on exit.
-*
-*  TRANS  - CHARACTER*1.
-*           On entry, TRANS specifies the operation to be performed as
-*           follows:
-*
-*              TRANS = 'N' or 'n'   x := A*x.
-*
-*              TRANS = 'T' or 't'   x := A'*x.
-*
-*              TRANS = 'C' or 'c'   x := A'*x.
-*
-*           Unchanged on exit.
-*
-*  DIAG   - CHARACTER*1.
-*           On entry, DIAG specifies whether or not A is unit
-*           triangular as follows:
-*
-*              DIAG = 'U' or 'u'   A is assumed to be unit triangular.
-*
-*              DIAG = 'N' or 'n'   A is not assumed to be unit
-*                                  triangular.
-*
-*           Unchanged on exit.
-*
-*  N      - INTEGER.
-*           On entry, N specifies the order of the matrix A.
-*           N must be at least zero.
-*           Unchanged on exit.
-*
-*  K      - INTEGER.
-*           On entry with UPLO = 'U' or 'u', K specifies the number of
-*           super-diagonals of the matrix A.
-*           On entry with UPLO = 'L' or 'l', K specifies the number of
-*           sub-diagonals of the matrix A.
-*           K must satisfy  0 .le. K.
-*           Unchanged on exit.
-*
-*  A      - REAL             array of DIMENSION ( LDA, n ).
-*           Before entry with UPLO = 'U' or 'u', the leading ( k + 1 )
-*           by n part of the array A must contain the upper triangular
-*           band part of the matrix of coefficients, supplied column by
-*           column, with the leading diagonal of the matrix in row
-*           ( k + 1 ) of the array, the first super-diagonal starting at
-*           position 2 in row k, and so on. The top left k by k triangle
-*           of the array A is not referenced.
-*           The following program segment will transfer an upper
-*           triangular band matrix from conventional full matrix storage
-*           to band storage:
-*
-*                 DO 20, J = 1, N
-*                    M = K + 1 - J
-*                    DO 10, I = MAX( 1, J - K ), J
-*                       A( M + I, J ) = matrix( I, J )
-*              10    CONTINUE
-*              20 CONTINUE
-*
-*           Before entry with UPLO = 'L' or 'l', the leading ( k + 1 )
-*           by n part of the array A must contain the lower triangular
-*           band part of the matrix of coefficients, supplied column by
-*           column, with the leading diagonal of the matrix in row 1 of
-*           the array, the first sub-diagonal starting at position 1 in
-*           row 2, and so on. The bottom right k by k triangle of the
-*           array A is not referenced.
-*           The following program segment will transfer a lower
-*           triangular band matrix from conventional full matrix storage
-*           to band storage:
-*
-*                 DO 20, J = 1, N
-*                    M = 1 - J
-*                    DO 10, I = J, MIN( N, J + K )
-*                       A( M + I, J ) = matrix( I, J )
-*              10    CONTINUE
-*              20 CONTINUE
-*
-*           Note that when DIAG = 'U' or 'u' the elements of the array A
-*           corresponding to the diagonal elements of the matrix are not
-*           referenced, but are assumed to be unity.
-*           Unchanged on exit.
-*
-*  LDA    - INTEGER.
-*           On entry, LDA specifies the first dimension of A as declared
-*           in the calling (sub) program. LDA must be at least
-*           ( k + 1 ).
-*           Unchanged on exit.
-*
-*  X      - REAL             array of dimension at least
-*           ( 1 + ( n - 1 )*abs( INCX ) ).
-*           Before entry, the incremented array X must contain the n
-*           element vector x. On exit, X is overwritten with the
-*           tranformed vector x.
-*
-*  INCX   - INTEGER.
-*           On entry, INCX specifies the increment for the elements of
-*           X. INCX must not be zero.
-*           Unchanged on exit.
-*
-*  Further Details
-*  ===============
-*
-*  Level 2 Blas routine.
-*
-*  -- Written on 22-October-1986.
-*     Jack Dongarra, Argonne National Lab.
-*     Jeremy Du Croz, Nag Central Office.
-*     Sven Hammarling, Nag Central Office.
-*     Richard Hanson, Sandia National Labs.
-*
-*  =====================================================================
-*
-*     .. Parameters ..
-      REAL ZERO
-      PARAMETER (ZERO=0.0E+0)
-*     ..
-*     .. Local Scalars ..
-      REAL TEMP
-      INTEGER I,INFO,IX,J,JX,KPLUS1,KX,L
-      LOGICAL NOUNIT
-*     ..
-*     .. External Functions ..
-      LOGICAL LSAME
-      EXTERNAL LSAME
-*     ..
-*     .. External Subroutines ..
-      EXTERNAL XERBLA
-*     ..
-*     .. Intrinsic Functions ..
-      INTRINSIC MAX,MIN
-*     ..
-*
-*     Test the input parameters.
-*
-      INFO = 0
-      IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN
-          INFO = 1
-      ELSE IF (.NOT.LSAME(TRANS,'N') .AND. .NOT.LSAME(TRANS,'T') .AND.
-     +         .NOT.LSAME(TRANS,'C')) THEN
-          INFO = 2
-      ELSE IF (.NOT.LSAME(DIAG,'U') .AND. .NOT.LSAME(DIAG,'N')) THEN
-          INFO = 3
-      ELSE IF (N.LT.0) THEN
-          INFO = 4
-      ELSE IF (K.LT.0) THEN
-          INFO = 5
-      ELSE IF (LDA.LT. (K+1)) THEN
-          INFO = 7
-      ELSE IF (INCX.EQ.0) THEN
-          INFO = 9
-      END IF
-      IF (INFO.NE.0) THEN
-          CALL XERBLA('STBMV ',INFO)
-          RETURN
-      END IF
-*
-*     Quick return if possible.
-*
-      IF (N.EQ.0) RETURN
-*
-      NOUNIT = LSAME(DIAG,'N')
-*
-*     Set up the start point in X if the increment is not unity. This
-*     will be  ( N - 1 )*INCX   too small for descending loops.
-*
-      IF (INCX.LE.0) THEN
-          KX = 1 - (N-1)*INCX
-      ELSE IF (INCX.NE.1) THEN
-          KX = 1
-      END IF
-*
-*     Start the operations. In this version the elements of A are
-*     accessed sequentially with one pass through A.
-*
-      IF (LSAME(TRANS,'N')) THEN
-*
-*         Form  x := A*x.
-*
-          IF (LSAME(UPLO,'U')) THEN
-              KPLUS1 = K + 1
-              IF (INCX.EQ.1) THEN
-                  DO 20 J = 1,N
-                      IF (X(J).NE.ZERO) THEN
-                          TEMP = X(J)
-                          L = KPLUS1 - J
-                          DO 10 I = MAX(1,J-K),J - 1
-                              X(I) = X(I) + TEMP*A(L+I,J)
-   10                     CONTINUE
-                          IF (NOUNIT) X(J) = X(J)*A(KPLUS1,J)
-                      END IF
-   20             CONTINUE
-              ELSE
-                  JX = KX
-                  DO 40 J = 1,N
-                      IF (X(JX).NE.ZERO) THEN
-                          TEMP = X(JX)
-                          IX = KX
-                          L = KPLUS1 - J
-                          DO 30 I = MAX(1,J-K),J - 1
-                              X(IX) = X(IX) + TEMP*A(L+I,J)
-                              IX = IX + INCX
-   30                     CONTINUE
-                          IF (NOUNIT) X(JX) = X(JX)*A(KPLUS1,J)
-                      END IF
-                      JX = JX + INCX
-                      IF (J.GT.K) KX = KX + INCX
-   40             CONTINUE
-              END IF
-          ELSE
-              IF (INCX.EQ.1) THEN
-                  DO 60 J = N,1,-1
-                      IF (X(J).NE.ZERO) THEN
-                          TEMP = X(J)
-                          L = 1 - J
-                          DO 50 I = MIN(N,J+K),J + 1,-1
-                              X(I) = X(I) + TEMP*A(L+I,J)
-   50                     CONTINUE
-                          IF (NOUNIT) X(J) = X(J)*A(1,J)
-                      END IF
-   60             CONTINUE
-              ELSE
-                  KX = KX + (N-1)*INCX
-                  JX = KX
-                  DO 80 J = N,1,-1
-                      IF (X(JX).NE.ZERO) THEN
-                          TEMP = X(JX)
-                          IX = KX
-                          L = 1 - J
-                          DO 70 I = MIN(N,J+K),J + 1,-1
-                              X(IX) = X(IX) + TEMP*A(L+I,J)
-                              IX = IX - INCX
-   70                     CONTINUE
-                          IF (NOUNIT) X(JX) = X(JX)*A(1,J)
-                      END IF
-                      JX = JX - INCX
-                      IF ((N-J).GE.K) KX = KX - INCX
-   80             CONTINUE
-              END IF
-          END IF
-      ELSE
-*
-*        Form  x := A'*x.
-*
-          IF (LSAME(UPLO,'U')) THEN
-              KPLUS1 = K + 1
-              IF (INCX.EQ.1) THEN
-                  DO 100 J = N,1,-1
-                      TEMP = X(J)
-                      L = KPLUS1 - J
-                      IF (NOUNIT) TEMP = TEMP*A(KPLUS1,J)
-                      DO 90 I = J - 1,MAX(1,J-K),-1
-                          TEMP = TEMP + A(L+I,J)*X(I)
-   90                 CONTINUE
-                      X(J) = TEMP
-  100             CONTINUE
-              ELSE
-                  KX = KX + (N-1)*INCX
-                  JX = KX
-                  DO 120 J = N,1,-1
-                      TEMP = X(JX)
-                      KX = KX - INCX
-                      IX = KX
-                      L = KPLUS1 - J
-                      IF (NOUNIT) TEMP = TEMP*A(KPLUS1,J)
-                      DO 110 I = J - 1,MAX(1,J-K),-1
-                          TEMP = TEMP + A(L+I,J)*X(IX)
-                          IX = IX - INCX
-  110                 CONTINUE
-                      X(JX) = TEMP
-                      JX = JX - INCX
-  120             CONTINUE
-              END IF
-          ELSE
-              IF (INCX.EQ.1) THEN
-                  DO 140 J = 1,N
-                      TEMP = X(J)
-                      L = 1 - J
-                      IF (NOUNIT) TEMP = TEMP*A(1,J)
-                      DO 130 I = J + 1,MIN(N,J+K)
-                          TEMP = TEMP + A(L+I,J)*X(I)
-  130                 CONTINUE
-                      X(J) = TEMP
-  140             CONTINUE
-              ELSE
-                  JX = KX
-                  DO 160 J = 1,N
-                      TEMP = X(JX)
-                      KX = KX + INCX
-                      IX = KX
-                      L = 1 - J
-                      IF (NOUNIT) TEMP = TEMP*A(1,J)
-                      DO 150 I = J + 1,MIN(N,J+K)
-                          TEMP = TEMP + A(L+I,J)*X(IX)
-                          IX = IX + INCX
-  150                 CONTINUE
-                      X(JX) = TEMP
-                      JX = JX + INCX
-  160             CONTINUE
-              END IF
-          END IF
-      END IF
-*
-      RETURN
-*
-*     End of STBMV .
-*
-      END
diff --git a/blas/fortran/zhbmv.f b/blas/fortran/zhbmv.f
deleted file mode 100644
index bca0da5fc..000000000
--- a/blas/fortran/zhbmv.f
+++ /dev/null
@@ -1,310 +0,0 @@
-      SUBROUTINE ZHBMV(UPLO,N,K,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
-*     .. Scalar Arguments ..
-      DOUBLE COMPLEX ALPHA,BETA
-      INTEGER INCX,INCY,K,LDA,N
-      CHARACTER UPLO
-*     ..
-*     .. Array Arguments ..
-      DOUBLE COMPLEX A(LDA,*),X(*),Y(*)
-*     ..
-*
-*  Purpose
-*  =======
-*
-*  ZHBMV  performs the matrix-vector  operation
-*
-*     y := alpha*A*x + beta*y,
-*
-*  where alpha and beta are scalars, x and y are n element vectors and
-*  A is an n by n hermitian band matrix, with k super-diagonals.
-*
-*  Arguments
-*  ==========
-*
-*  UPLO   - CHARACTER*1.
-*           On entry, UPLO specifies whether the upper or lower
-*           triangular part of the band matrix A is being supplied as
-*           follows:
-*
-*              UPLO = 'U' or 'u'   The upper triangular part of A is
-*                                  being supplied.
-*
-*              UPLO = 'L' or 'l'   The lower triangular part of A is
-*                                  being supplied.
-*
-*           Unchanged on exit.
-*
-*  N      - INTEGER.
-*           On entry, N specifies the order of the matrix A.
-*           N must be at least zero.
-*           Unchanged on exit.
-*
-*  K      - INTEGER.
-*           On entry, K specifies the number of super-diagonals of the
-*           matrix A. K must satisfy  0 .le. K.
-*           Unchanged on exit.
-*
-*  ALPHA  - COMPLEX*16      .
-*           On entry, ALPHA specifies the scalar alpha.
-*           Unchanged on exit.
-*
-*  A      - COMPLEX*16       array of DIMENSION ( LDA, n ).
-*           Before entry with UPLO = 'U' or 'u', the leading ( k + 1 )
-*           by n part of the array A must contain the upper triangular
-*           band part of the hermitian matrix, supplied column by
-*           column, with the leading diagonal of the matrix in row
-*           ( k + 1 ) of the array, the first super-diagonal starting at
-*           position 2 in row k, and so on. The top left k by k triangle
-*           of the array A is not referenced.
-*           The following program segment will transfer the upper
-*           triangular part of a hermitian band matrix from conventional
-*           full matrix storage to band storage:
-*
-*                 DO 20, J = 1, N
-*                    M = K + 1 - J
-*                    DO 10, I = MAX( 1, J - K ), J
-*                       A( M + I, J ) = matrix( I, J )
-*              10    CONTINUE
-*              20 CONTINUE
-*
-*           Before entry with UPLO = 'L' or 'l', the leading ( k + 1 )
-*           by n part of the array A must contain the lower triangular
-*           band part of the hermitian matrix, supplied column by
-*           column, with the leading diagonal of the matrix in row 1 of
-*           the array, the first sub-diagonal starting at position 1 in
-*           row 2, and so on. The bottom right k by k triangle of the
-*           array A is not referenced.
-*           The following program segment will transfer the lower
-*           triangular part of a hermitian band matrix from conventional
-*           full matrix storage to band storage:
-*
-*                 DO 20, J = 1, N
-*                    M = 1 - J
-*                    DO 10, I = J, MIN( N, J + K )
-*                       A( M + I, J ) = matrix( I, J )
-*              10    CONTINUE
-*              20 CONTINUE
-*
-*           Note that the imaginary parts of the diagonal elements need
-*           not be set and are assumed to be zero.
-*           Unchanged on exit.
-*
-*  LDA    - INTEGER.
-*           On entry, LDA specifies the first dimension of A as declared
-*           in the calling (sub) program. LDA must be at least
-*           ( k + 1 ).
-*           Unchanged on exit.
-*
-*  X      - COMPLEX*16       array of DIMENSION at least
-*           ( 1 + ( n - 1 )*abs( INCX ) ).
-*           Before entry, the incremented array X must contain the
-*           vector x.
-*           Unchanged on exit.
-*
-*  INCX   - INTEGER.
-*           On entry, INCX specifies the increment for the elements of
-*           X. INCX must not be zero.
-*           Unchanged on exit.
-*
-*  BETA   - COMPLEX*16      .
-*           On entry, BETA specifies the scalar beta.
-*           Unchanged on exit.
-*
-*  Y      - COMPLEX*16       array of DIMENSION at least
-*           ( 1 + ( n - 1 )*abs( INCY ) ).
-*           Before entry, the incremented array Y must contain the
-*           vector y. On exit, Y is overwritten by the updated vector y.
-*
-*  INCY   - INTEGER.
-*           On entry, INCY specifies the increment for the elements of
-*           Y. INCY must not be zero.
-*           Unchanged on exit.
-*
-*  Further Details
-*  ===============
-*
-*  Level 2 Blas routine.
-*
-*  -- Written on 22-October-1986.
-*     Jack Dongarra, Argonne National Lab.
-*     Jeremy Du Croz, Nag Central Office.
-*     Sven Hammarling, Nag Central Office.
-*     Richard Hanson, Sandia National Labs.
-*
-*  =====================================================================
-*
-*     .. Parameters ..
-      DOUBLE COMPLEX ONE
-      PARAMETER (ONE= (1.0D+0,0.0D+0))
-      DOUBLE COMPLEX ZERO
-      PARAMETER (ZERO= (0.0D+0,0.0D+0))
-*     ..
-*     .. Local Scalars ..
-      DOUBLE COMPLEX TEMP1,TEMP2
-      INTEGER I,INFO,IX,IY,J,JX,JY,KPLUS1,KX,KY,L
-*     ..
-*     .. External Functions ..
-      LOGICAL LSAME
-      EXTERNAL LSAME
-*     ..
-*     .. External Subroutines ..
-      EXTERNAL XERBLA
-*     ..
-*     .. Intrinsic Functions ..
-      INTRINSIC DBLE,DCONJG,MAX,MIN
-*     ..
-*
-*     Test the input parameters.
-*
-      INFO = 0
-      IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN
-          INFO = 1
-      ELSE IF (N.LT.0) THEN
-          INFO = 2
-      ELSE IF (K.LT.0) THEN
-          INFO = 3
-      ELSE IF (LDA.LT. (K+1)) THEN
-          INFO = 6
-      ELSE IF (INCX.EQ.0) THEN
-          INFO = 8
-      ELSE IF (INCY.EQ.0) THEN
-          INFO = 11
-      END IF
-      IF (INFO.NE.0) THEN
-          CALL XERBLA('ZHBMV ',INFO)
-          RETURN
-      END IF
-*
-*     Quick return if possible.
-*
-      IF ((N.EQ.0) .OR. ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN
-*
-*     Set up the start points in  X  and  Y.
-*
-      IF (INCX.GT.0) THEN
-          KX = 1
-      ELSE
-          KX = 1 - (N-1)*INCX
-      END IF
-      IF (INCY.GT.0) THEN
-          KY = 1
-      ELSE
-          KY = 1 - (N-1)*INCY
-      END IF
-*
-*     Start the operations. In this version the elements of the array A
-*     are accessed sequentially with one pass through A.
-*
-*     First form  y := beta*y.
-*
-      IF (BETA.NE.ONE) THEN
-          IF (INCY.EQ.1) THEN
-              IF (BETA.EQ.ZERO) THEN
-                  DO 10 I = 1,N
-                      Y(I) = ZERO
-   10             CONTINUE
-              ELSE
-                  DO 20 I = 1,N
-                      Y(I) = BETA*Y(I)
-   20             CONTINUE
-              END IF
-          ELSE
-              IY = KY
-              IF (BETA.EQ.ZERO) THEN
-                  DO 30 I = 1,N
-                      Y(IY) = ZERO
-                      IY = IY + INCY
-   30             CONTINUE
-              ELSE
-                  DO 40 I = 1,N
-                      Y(IY) = BETA*Y(IY)
-                      IY = IY + INCY
-   40             CONTINUE
-              END IF
-          END IF
-      END IF
-      IF (ALPHA.EQ.ZERO) RETURN
-      IF (LSAME(UPLO,'U')) THEN
-*
-*        Form  y  when upper triangle of A is stored.
-*
-          KPLUS1 = K + 1
-          IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN
-              DO 60 J = 1,N
-                  TEMP1 = ALPHA*X(J)
-                  TEMP2 = ZERO
-                  L = KPLUS1 - J
-                  DO 50 I = MAX(1,J-K),J - 1
-                      Y(I) = Y(I) + TEMP1*A(L+I,J)
-                      TEMP2 = TEMP2 + DCONJG(A(L+I,J))*X(I)
-   50             CONTINUE
-                  Y(J) = Y(J) + TEMP1*DBLE(A(KPLUS1,J)) + ALPHA*TEMP2
-   60         CONTINUE
-          ELSE
-              JX = KX
-              JY = KY
-              DO 80 J = 1,N
-                  TEMP1 = ALPHA*X(JX)
-                  TEMP2 = ZERO
-                  IX = KX
-                  IY = KY
-                  L = KPLUS1 - J
-                  DO 70 I = MAX(1,J-K),J - 1
-                      Y(IY) = Y(IY) + TEMP1*A(L+I,J)
-                      TEMP2 = TEMP2 + DCONJG(A(L+I,J))*X(IX)
-                      IX = IX + INCX
-                      IY = IY + INCY
-   70             CONTINUE
-                  Y(JY) = Y(JY) + TEMP1*DBLE(A(KPLUS1,J)) + ALPHA*TEMP2
-                  JX = JX + INCX
-                  JY = JY + INCY
-                  IF (J.GT.K) THEN
-                      KX = KX + INCX
-                      KY = KY + INCY
-                  END IF
-   80         CONTINUE
-          END IF
-      ELSE
-*
-*        Form  y  when lower triangle of A is stored.
-*
-          IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN
-              DO 100 J = 1,N
-                  TEMP1 = ALPHA*X(J)
-                  TEMP2 = ZERO
-                  Y(J) = Y(J) + TEMP1*DBLE(A(1,J))
-                  L = 1 - J
-                  DO 90 I = J + 1,MIN(N,J+K)
-                      Y(I) = Y(I) + TEMP1*A(L+I,J)
-                      TEMP2 = TEMP2 + DCONJG(A(L+I,J))*X(I)
-   90             CONTINUE
-                  Y(J) = Y(J) + ALPHA*TEMP2
-  100         CONTINUE
-          ELSE
-              JX = KX
-              JY = KY
-              DO 120 J = 1,N
-                  TEMP1 = ALPHA*X(JX)
-                  TEMP2 = ZERO
-                  Y(JY) = Y(JY) + TEMP1*DBLE(A(1,J))
-                  L = 1 - J
-                  IX = JX
-                  IY = JY
-                  DO 110 I = J + 1,MIN(N,J+K)
-                      IX = IX + INCX
-                      IY = IY + INCY
-                      Y(IY) = Y(IY) + TEMP1*A(L+I,J)
-                      TEMP2 = TEMP2 + DCONJG(A(L+I,J))*X(IX)
-  110             CONTINUE
-                  Y(JY) = Y(JY) + ALPHA*TEMP2
-                  JX = JX + INCX
-                  JY = JY + INCY
-  120         CONTINUE
-          END IF
-      END IF
-*
-      RETURN
-*
-*     End of ZHBMV .
-*
-      END
diff --git a/blas/fortran/zhpmv.f b/blas/fortran/zhpmv.f
deleted file mode 100644
index b686108b3..000000000
--- a/blas/fortran/zhpmv.f
+++ /dev/null
@@ -1,272 +0,0 @@
-      SUBROUTINE ZHPMV(UPLO,N,ALPHA,AP,X,INCX,BETA,Y,INCY)
-*     .. Scalar Arguments ..
-      DOUBLE COMPLEX ALPHA,BETA
-      INTEGER INCX,INCY,N
-      CHARACTER UPLO
-*     ..
-*     .. Array Arguments ..
-      DOUBLE COMPLEX AP(*),X(*),Y(*)
-*     ..
-*
-*  Purpose
-*  =======
-*
-*  ZHPMV  performs the matrix-vector operation
-*
-*     y := alpha*A*x + beta*y,
-*
-*  where alpha and beta are scalars, x and y are n element vectors and
-*  A is an n by n hermitian matrix, supplied in packed form.
-*
-*  Arguments
-*  ==========
-*
-*  UPLO   - CHARACTER*1.
-*           On entry, UPLO specifies whether the upper or lower
-*           triangular part of the matrix A is supplied in the packed
-*           array AP as follows:
-*
-*              UPLO = 'U' or 'u'   The upper triangular part of A is
-*                                  supplied in AP.
-*
-*              UPLO = 'L' or 'l'   The lower triangular part of A is
-*                                  supplied in AP.
-*
-*           Unchanged on exit.
-*
-*  N      - INTEGER.
-*           On entry, N specifies the order of the matrix A.
-*           N must be at least zero.
-*           Unchanged on exit.
-*
-*  ALPHA  - COMPLEX*16      .
-*           On entry, ALPHA specifies the scalar alpha.
-*           Unchanged on exit.
-*
-*  AP     - COMPLEX*16       array of DIMENSION at least
-*           ( ( n*( n + 1 ) )/2 ).
-*           Before entry with UPLO = 'U' or 'u', the array AP must
-*           contain the upper triangular part of the hermitian matrix
-*           packed sequentially, column by column, so that AP( 1 )
-*           contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 )
-*           and a( 2, 2 ) respectively, and so on.
-*           Before entry with UPLO = 'L' or 'l', the array AP must
-*           contain the lower triangular part of the hermitian matrix
-*           packed sequentially, column by column, so that AP( 1 )
-*           contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 )
-*           and a( 3, 1 ) respectively, and so on.
-*           Note that the imaginary parts of the diagonal elements need
-*           not be set and are assumed to be zero.
-*           Unchanged on exit.
-*
-*  X      - COMPLEX*16       array of dimension at least
-*           ( 1 + ( n - 1 )*abs( INCX ) ).
-*           Before entry, the incremented array X must contain the n
-*           element vector x.
-*           Unchanged on exit.
-*
-*  INCX   - INTEGER.
-*           On entry, INCX specifies the increment for the elements of
-*           X. INCX must not be zero.
-*           Unchanged on exit.
-*
-*  BETA   - COMPLEX*16      .
-*           On entry, BETA specifies the scalar beta. When BETA is
-*           supplied as zero then Y need not be set on input.
-*           Unchanged on exit.
-*
-*  Y      - COMPLEX*16       array of dimension at least
-*           ( 1 + ( n - 1 )*abs( INCY ) ).
-*           Before entry, the incremented array Y must contain the n
-*           element vector y. On exit, Y is overwritten by the updated
-*           vector y.
-*
-*  INCY   - INTEGER.
-*           On entry, INCY specifies the increment for the elements of
-*           Y. INCY must not be zero.
-*           Unchanged on exit.
-*
-*  Further Details
-*  ===============
-*
-*  Level 2 Blas routine.
-*
-*  -- Written on 22-October-1986.
-*     Jack Dongarra, Argonne National Lab.
-*     Jeremy Du Croz, Nag Central Office.
-*     Sven Hammarling, Nag Central Office.
-*     Richard Hanson, Sandia National Labs.
-*
-*  =====================================================================
-*
-*     .. Parameters ..
-      DOUBLE COMPLEX ONE
-      PARAMETER (ONE= (1.0D+0,0.0D+0))
-      DOUBLE COMPLEX ZERO
-      PARAMETER (ZERO= (0.0D+0,0.0D+0))
-*     ..
-*     .. Local Scalars ..
-      DOUBLE COMPLEX TEMP1,TEMP2
-      INTEGER I,INFO,IX,IY,J,JX,JY,K,KK,KX,KY
-*     ..
-*     .. External Functions ..
-      LOGICAL LSAME
-      EXTERNAL LSAME
-*     ..
-*     .. External Subroutines ..
-      EXTERNAL XERBLA
-*     ..
-*     .. Intrinsic Functions ..
-      INTRINSIC DBLE,DCONJG
-*     ..
-*
-*     Test the input parameters.
-*
-      INFO = 0
-      IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN
-          INFO = 1
-      ELSE IF (N.LT.0) THEN
-          INFO = 2
-      ELSE IF (INCX.EQ.0) THEN
-          INFO = 6
-      ELSE IF (INCY.EQ.0) THEN
-          INFO = 9
-      END IF
-      IF (INFO.NE.0) THEN
-          CALL XERBLA('ZHPMV ',INFO)
-          RETURN
-      END IF
-*
-*     Quick return if possible.
-*
-      IF ((N.EQ.0) .OR. ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN
-*
-*     Set up the start points in  X  and  Y.
-*
-      IF (INCX.GT.0) THEN
-          KX = 1
-      ELSE
-          KX = 1 - (N-1)*INCX
-      END IF
-      IF (INCY.GT.0) THEN
-          KY = 1
-      ELSE
-          KY = 1 - (N-1)*INCY
-      END IF
-*
-*     Start the operations. In this version the elements of the array AP
-*     are accessed sequentially with one pass through AP.
-*
-*     First form  y := beta*y.
-*
-      IF (BETA.NE.ONE) THEN
-          IF (INCY.EQ.1) THEN
-              IF (BETA.EQ.ZERO) THEN
-                  DO 10 I = 1,N
-                      Y(I) = ZERO
-   10             CONTINUE
-              ELSE
-                  DO 20 I = 1,N
-                      Y(I) = BETA*Y(I)
-   20             CONTINUE
-              END IF
-          ELSE
-              IY = KY
-              IF (BETA.EQ.ZERO) THEN
-                  DO 30 I = 1,N
-                      Y(IY) = ZERO
-                      IY = IY + INCY
-   30             CONTINUE
-              ELSE
-                  DO 40 I = 1,N
-                      Y(IY) = BETA*Y(IY)
-                      IY = IY + INCY
-   40             CONTINUE
-              END IF
-          END IF
-      END IF
-      IF (ALPHA.EQ.ZERO) RETURN
-      KK = 1
-      IF (LSAME(UPLO,'U')) THEN
-*
-*        Form  y  when AP contains the upper triangle.
-*
-          IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN
-              DO 60 J = 1,N
-                  TEMP1 = ALPHA*X(J)
-                  TEMP2 = ZERO
-                  K = KK
-                  DO 50 I = 1,J - 1
-                      Y(I) = Y(I) + TEMP1*AP(K)
-                      TEMP2 = TEMP2 + DCONJG(AP(K))*X(I)
-                      K = K + 1
-   50             CONTINUE
-                  Y(J) = Y(J) + TEMP1*DBLE(AP(KK+J-1)) + ALPHA*TEMP2
-                  KK = KK + J
-   60         CONTINUE
-          ELSE
-              JX = KX
-              JY = KY
-              DO 80 J = 1,N
-                  TEMP1 = ALPHA*X(JX)
-                  TEMP2 = ZERO
-                  IX = KX
-                  IY = KY
-                  DO 70 K = KK,KK + J - 2
-                      Y(IY) = Y(IY) + TEMP1*AP(K)
-                      TEMP2 = TEMP2 + DCONJG(AP(K))*X(IX)
-                      IX = IX + INCX
-                      IY = IY + INCY
-   70             CONTINUE
-                  Y(JY) = Y(JY) + TEMP1*DBLE(AP(KK+J-1)) + ALPHA*TEMP2
-                  JX = JX + INCX
-                  JY = JY + INCY
-                  KK = KK + J
-   80         CONTINUE
-          END IF
-      ELSE
-*
-*        Form  y  when AP contains the lower triangle.
-*
-          IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN
-              DO 100 J = 1,N
-                  TEMP1 = ALPHA*X(J)
-                  TEMP2 = ZERO
-                  Y(J) = Y(J) + TEMP1*DBLE(AP(KK))
-                  K = KK + 1
-                  DO 90 I = J + 1,N
-                      Y(I) = Y(I) + TEMP1*AP(K)
-                      TEMP2 = TEMP2 + DCONJG(AP(K))*X(I)
-                      K = K + 1
-   90             CONTINUE
-                  Y(J) = Y(J) + ALPHA*TEMP2
-                  KK = KK + (N-J+1)
-  100         CONTINUE
-          ELSE
-              JX = KX
-              JY = KY
-              DO 120 J = 1,N
-                  TEMP1 = ALPHA*X(JX)
-                  TEMP2 = ZERO
-                  Y(JY) = Y(JY) + TEMP1*DBLE(AP(KK))
-                  IX = JX
-                  IY = JY
-                  DO 110 K = KK + 1,KK + N - J
-                      IX = IX + INCX
-                      IY = IY + INCY
-                      Y(IY) = Y(IY) + TEMP1*AP(K)
-                      TEMP2 = TEMP2 + DCONJG(AP(K))*X(IX)
-  110             CONTINUE
-                  Y(JY) = Y(JY) + ALPHA*TEMP2
-                  JX = JX + INCX
-                  JY = JY + INCY
-                  KK = KK + (N-J+1)
-  120         CONTINUE
-          END IF
-      END IF
-*
-      RETURN
-*
-*     End of ZHPMV .
-*
-      END
diff --git a/blas/fortran/ztbmv.f b/blas/fortran/ztbmv.f
deleted file mode 100644
index 7c85c1b55..000000000
--- a/blas/fortran/ztbmv.f
+++ /dev/null
@@ -1,366 +0,0 @@
-      SUBROUTINE ZTBMV(UPLO,TRANS,DIAG,N,K,A,LDA,X,INCX)
-*     .. Scalar Arguments ..
-      INTEGER INCX,K,LDA,N
-      CHARACTER DIAG,TRANS,UPLO
-*     ..
-*     .. Array Arguments ..
-      DOUBLE COMPLEX A(LDA,*),X(*)
-*     ..
-*
-*  Purpose
-*  =======
-*
-*  ZTBMV  performs one of the matrix-vector operations
-*
-*     x := A*x,   or   x := A'*x,   or   x := conjg( A' )*x,
-*
-*  where x is an n element vector and  A is an n by n unit, or non-unit,
-*  upper or lower triangular band matrix, with ( k + 1 ) diagonals.
-*
-*  Arguments
-*  ==========
-*
-*  UPLO   - CHARACTER*1.
-*           On entry, UPLO specifies whether the matrix is an upper or
-*           lower triangular matrix as follows:
-*
-*              UPLO = 'U' or 'u'   A is an upper triangular matrix.
-*
-*              UPLO = 'L' or 'l'   A is a lower triangular matrix.
-*
-*           Unchanged on exit.
-*
-*  TRANS  - CHARACTER*1.
-*           On entry, TRANS specifies the operation to be performed as
-*           follows:
-*
-*              TRANS = 'N' or 'n'   x := A*x.
-*
-*              TRANS = 'T' or 't'   x := A'*x.
-*
-*              TRANS = 'C' or 'c'   x := conjg( A' )*x.
-*
-*           Unchanged on exit.
-*
-*  DIAG   - CHARACTER*1.
-*           On entry, DIAG specifies whether or not A is unit
-*           triangular as follows:
-*
-*              DIAG = 'U' or 'u'   A is assumed to be unit triangular.
-*
-*              DIAG = 'N' or 'n'   A is not assumed to be unit
-*                                  triangular.
-*
-*           Unchanged on exit.
-*
-*  N      - INTEGER.
-*           On entry, N specifies the order of the matrix A.
-*           N must be at least zero.
-*           Unchanged on exit.
-*
-*  K      - INTEGER.
-*           On entry with UPLO = 'U' or 'u', K specifies the number of
-*           super-diagonals of the matrix A.
-*           On entry with UPLO = 'L' or 'l', K specifies the number of
-*           sub-diagonals of the matrix A.
-*           K must satisfy  0 .le. K.
-*           Unchanged on exit.
-*
-*  A      - COMPLEX*16       array of DIMENSION ( LDA, n ).
-*           Before entry with UPLO = 'U' or 'u', the leading ( k + 1 )
-*           by n part of the array A must contain the upper triangular
-*           band part of the matrix of coefficients, supplied column by
-*           column, with the leading diagonal of the matrix in row
-*           ( k + 1 ) of the array, the first super-diagonal starting at
-*           position 2 in row k, and so on. The top left k by k triangle
-*           of the array A is not referenced.
-*           The following program segment will transfer an upper
-*           triangular band matrix from conventional full matrix storage
-*           to band storage:
-*
-*                 DO 20, J = 1, N
-*                    M = K + 1 - J
-*                    DO 10, I = MAX( 1, J - K ), J
-*                       A( M + I, J ) = matrix( I, J )
-*              10    CONTINUE
-*              20 CONTINUE
-*
-*           Before entry with UPLO = 'L' or 'l', the leading ( k + 1 )
-*           by n part of the array A must contain the lower triangular
-*           band part of the matrix of coefficients, supplied column by
-*           column, with the leading diagonal of the matrix in row 1 of
-*           the array, the first sub-diagonal starting at position 1 in
-*           row 2, and so on. The bottom right k by k triangle of the
-*           array A is not referenced.
-*           The following program segment will transfer a lower
-*           triangular band matrix from conventional full matrix storage
-*           to band storage:
-*
-*                 DO 20, J = 1, N
-*                    M = 1 - J
-*                    DO 10, I = J, MIN( N, J + K )
-*                       A( M + I, J ) = matrix( I, J )
-*              10    CONTINUE
-*              20 CONTINUE
-*
-*           Note that when DIAG = 'U' or 'u' the elements of the array A
-*           corresponding to the diagonal elements of the matrix are not
-*           referenced, but are assumed to be unity.
-*           Unchanged on exit.
-*
-*  LDA    - INTEGER.
-*           On entry, LDA specifies the first dimension of A as declared
-*           in the calling (sub) program. LDA must be at least
-*           ( k + 1 ).
-*           Unchanged on exit.
-*
-*  X      - COMPLEX*16       array of dimension at least
-*           ( 1 + ( n - 1 )*abs( INCX ) ).
-*           Before entry, the incremented array X must contain the n
-*           element vector x. On exit, X is overwritten with the
-*           tranformed vector x.
-*
-*  INCX   - INTEGER.
-*           On entry, INCX specifies the increment for the elements of
-*           X. INCX must not be zero.
-*           Unchanged on exit.
-*
-*  Further Details
-*  ===============
-*
-*  Level 2 Blas routine.
-*
-*  -- Written on 22-October-1986.
-*     Jack Dongarra, Argonne National Lab.
-*     Jeremy Du Croz, Nag Central Office.
-*     Sven Hammarling, Nag Central Office.
-*     Richard Hanson, Sandia National Labs.
-*
-*  =====================================================================
-*
-*     .. Parameters ..
-      DOUBLE COMPLEX ZERO
-      PARAMETER (ZERO= (0.0D+0,0.0D+0))
-*     ..
-*     .. Local Scalars ..
-      DOUBLE COMPLEX TEMP
-      INTEGER I,INFO,IX,J,JX,KPLUS1,KX,L
-      LOGICAL NOCONJ,NOUNIT
-*     ..
-*     .. External Functions ..
-      LOGICAL LSAME
-      EXTERNAL LSAME
-*     ..
-*     .. External Subroutines ..
-      EXTERNAL XERBLA
-*     ..
-*     .. Intrinsic Functions ..
-      INTRINSIC DCONJG,MAX,MIN
-*     ..
-*
-*     Test the input parameters.
-*
-      INFO = 0
-      IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN
-          INFO = 1
-      ELSE IF (.NOT.LSAME(TRANS,'N') .AND. .NOT.LSAME(TRANS,'T') .AND.
-     +         .NOT.LSAME(TRANS,'C')) THEN
-          INFO = 2
-      ELSE IF (.NOT.LSAME(DIAG,'U') .AND. .NOT.LSAME(DIAG,'N')) THEN
-          INFO = 3
-      ELSE IF (N.LT.0) THEN
-          INFO = 4
-      ELSE IF (K.LT.0) THEN
-          INFO = 5
-      ELSE IF (LDA.LT. (K+1)) THEN
-          INFO = 7
-      ELSE IF (INCX.EQ.0) THEN
-          INFO = 9
-      END IF
-      IF (INFO.NE.0) THEN
-          CALL XERBLA('ZTBMV ',INFO)
-          RETURN
-      END IF
-*
-*     Quick return if possible.
-*
-      IF (N.EQ.0) RETURN
-*
-      NOCONJ = LSAME(TRANS,'T')
-      NOUNIT = LSAME(DIAG,'N')
-*
-*     Set up the start point in X if the increment is not unity. This
-*     will be  ( N - 1 )*INCX   too small for descending loops.
-*
-      IF (INCX.LE.0) THEN
-          KX = 1 - (N-1)*INCX
-      ELSE IF (INCX.NE.1) THEN
-          KX = 1
-      END IF
-*
-*     Start the operations. In this version the elements of A are
-*     accessed sequentially with one pass through A.
-*
-      IF (LSAME(TRANS,'N')) THEN
-*
-*         Form  x := A*x.
-*
-          IF (LSAME(UPLO,'U')) THEN
-              KPLUS1 = K + 1
-              IF (INCX.EQ.1) THEN
-                  DO 20 J = 1,N
-                      IF (X(J).NE.ZERO) THEN
-                          TEMP = X(J)
-                          L = KPLUS1 - J
-                          DO 10 I = MAX(1,J-K),J - 1
-                              X(I) = X(I) + TEMP*A(L+I,J)
-   10                     CONTINUE
-                          IF (NOUNIT) X(J) = X(J)*A(KPLUS1,J)
-                      END IF
-   20             CONTINUE
-              ELSE
-                  JX = KX
-                  DO 40 J = 1,N
-                      IF (X(JX).NE.ZERO) THEN
-                          TEMP = X(JX)
-                          IX = KX
-                          L = KPLUS1 - J
-                          DO 30 I = MAX(1,J-K),J - 1
-                              X(IX) = X(IX) + TEMP*A(L+I,J)
-                              IX = IX + INCX
-   30                     CONTINUE
-                          IF (NOUNIT) X(JX) = X(JX)*A(KPLUS1,J)
-                      END IF
-                      JX = JX + INCX
-                      IF (J.GT.K) KX = KX + INCX
-   40             CONTINUE
-              END IF
-          ELSE
-              IF (INCX.EQ.1) THEN
-                  DO 60 J = N,1,-1
-                      IF (X(J).NE.ZERO) THEN
-                          TEMP = X(J)
-                          L = 1 - J
-                          DO 50 I = MIN(N,J+K),J + 1,-1
-                              X(I) = X(I) + TEMP*A(L+I,J)
-   50                     CONTINUE
-                          IF (NOUNIT) X(J) = X(J)*A(1,J)
-                      END IF
-   60             CONTINUE
-              ELSE
-                  KX = KX + (N-1)*INCX
-                  JX = KX
-                  DO 80 J = N,1,-1
-                      IF (X(JX).NE.ZERO) THEN
-                          TEMP = X(JX)
-                          IX = KX
-                          L = 1 - J
-                          DO 70 I = MIN(N,J+K),J + 1,-1
-                              X(IX) = X(IX) + TEMP*A(L+I,J)
-                              IX = IX - INCX
-   70                     CONTINUE
-                          IF (NOUNIT) X(JX) = X(JX)*A(1,J)
-                      END IF
-                      JX = JX - INCX
-                      IF ((N-J).GE.K) KX = KX - INCX
-   80             CONTINUE
-              END IF
-          END IF
-      ELSE
-*
-*        Form  x := A'*x  or  x := conjg( A' )*x.
-*
-          IF (LSAME(UPLO,'U')) THEN
-              KPLUS1 = K + 1
-              IF (INCX.EQ.1) THEN
-                  DO 110 J = N,1,-1
-                      TEMP = X(J)
-                      L = KPLUS1 - J
-                      IF (NOCONJ) THEN
-                          IF (NOUNIT) TEMP = TEMP*A(KPLUS1,J)
-                          DO 90 I = J - 1,MAX(1,J-K),-1
-                              TEMP = TEMP + A(L+I,J)*X(I)
-   90                     CONTINUE
-                      ELSE
-                          IF (NOUNIT) TEMP = TEMP*DCONJG(A(KPLUS1,J))
-                          DO 100 I = J - 1,MAX(1,J-K),-1
-                              TEMP = TEMP + DCONJG(A(L+I,J))*X(I)
-  100                     CONTINUE
-                      END IF
-                      X(J) = TEMP
-  110             CONTINUE
-              ELSE
-                  KX = KX + (N-1)*INCX
-                  JX = KX
-                  DO 140 J = N,1,-1
-                      TEMP = X(JX)
-                      KX = KX - INCX
-                      IX = KX
-                      L = KPLUS1 - J
-                      IF (NOCONJ) THEN
-                          IF (NOUNIT) TEMP = TEMP*A(KPLUS1,J)
-                          DO 120 I = J - 1,MAX(1,J-K),-1
-                              TEMP = TEMP + A(L+I,J)*X(IX)
-                              IX = IX - INCX
-  120                     CONTINUE
-                      ELSE
-                          IF (NOUNIT) TEMP = TEMP*DCONJG(A(KPLUS1,J))
-                          DO 130 I = J - 1,MAX(1,J-K),-1
-                              TEMP = TEMP + DCONJG(A(L+I,J))*X(IX)
-                              IX = IX - INCX
-  130                     CONTINUE
-                      END IF
-                      X(JX) = TEMP
-                      JX = JX - INCX
-  140             CONTINUE
-              END IF
-          ELSE
-              IF (INCX.EQ.1) THEN
-                  DO 170 J = 1,N
-                      TEMP = X(J)
-                      L = 1 - J
-                      IF (NOCONJ) THEN
-                          IF (NOUNIT) TEMP = TEMP*A(1,J)
-                          DO 150 I = J + 1,MIN(N,J+K)
-                              TEMP = TEMP + A(L+I,J)*X(I)
-  150                     CONTINUE
-                      ELSE
-                          IF (NOUNIT) TEMP = TEMP*DCONJG(A(1,J))
-                          DO 160 I = J + 1,MIN(N,J+K)
-                              TEMP = TEMP + DCONJG(A(L+I,J))*X(I)
-  160                     CONTINUE
-                      END IF
-                      X(J) = TEMP
-  170             CONTINUE
-              ELSE
-                  JX = KX
-                  DO 200 J = 1,N
-                      TEMP = X(JX)
-                      KX = KX + INCX
-                      IX = KX
-                      L = 1 - J
-                      IF (NOCONJ) THEN
-                          IF (NOUNIT) TEMP = TEMP*A(1,J)
-                          DO 180 I = J + 1,MIN(N,J+K)
-                              TEMP = TEMP + A(L+I,J)*X(IX)
-                              IX = IX + INCX
-  180                     CONTINUE
-                      ELSE
-                          IF (NOUNIT) TEMP = TEMP*DCONJG(A(1,J))
-                          DO 190 I = J + 1,MIN(N,J+K)
-                              TEMP = TEMP + DCONJG(A(L+I,J))*X(IX)
-                              IX = IX + INCX
-  190                     CONTINUE
-                      END IF
-                      X(JX) = TEMP
-                      JX = JX + INCX
-  200             CONTINUE
-              END IF
-          END IF
-      END IF
-*
-      RETURN
-*
-*     End of ZTBMV .
-*
-      END
diff --git a/blas/level1_cplx_impl.h b/blas/level1_cplx_impl.h
index 283b9f827..719f5bac9 100644
--- a/blas/level1_cplx_impl.h
+++ b/blas/level1_cplx_impl.h
@@ -32,45 +32,52 @@ RealScalar EIGEN_CAT(EIGEN_CAT(REAL_SCALAR_SUFFIX,SCALAR_SUFFIX),asum_)(int *n,
 
   if(*n<=0) return 0;
 
-  if(*incx==1)  return vector(x,*n).unaryExpr<scalar_norm1_op>().sum();
-  else          return vector(x,*n,std::abs(*incx)).unaryExpr<scalar_norm1_op>().sum();
+  if(*incx==1)  return make_vector(x,*n).unaryExpr<scalar_norm1_op>().sum();
+  else          return make_vector(x,*n,std::abs(*incx)).unaryExpr<scalar_norm1_op>().sum();
 }
 
 // computes a dot product of a conjugated vector with another vector.
 int EIGEN_BLAS_FUNC(dotcw)(int *n, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar* pres)
 {
 //   std::cerr << "_dotc " << *n << " " << *incx << " " << *incy << "\n";
+  Scalar* res = reinterpret_cast<Scalar*>(pres);
 
-  if(*n<=0) return 0;
+  if(*n<=0)
+  {
+    *res = Scalar(0);
+    return 0;
+  }
 
   Scalar* x = reinterpret_cast<Scalar*>(px);
   Scalar* y = reinterpret_cast<Scalar*>(py);
-  Scalar* res = reinterpret_cast<Scalar*>(pres);
 
-  if(*incx==1 && *incy==1)    *res = (vector(x,*n).dot(vector(y,*n)));
-  else if(*incx>0 && *incy>0) *res = (vector(x,*n,*incx).dot(vector(y,*n,*incy)));
-  else if(*incx<0 && *incy>0) *res = (vector(x,*n,-*incx).reverse().dot(vector(y,*n,*incy)));
-  else if(*incx>0 && *incy<0) *res = (vector(x,*n,*incx).dot(vector(y,*n,-*incy).reverse()));
-  else if(*incx<0 && *incy<0) *res = (vector(x,*n,-*incx).reverse().dot(vector(y,*n,-*incy).reverse()));
+  if(*incx==1 && *incy==1)    *res = (make_vector(x,*n).dot(make_vector(y,*n)));
+  else if(*incx>0 && *incy>0) *res = (make_vector(x,*n,*incx).dot(make_vector(y,*n,*incy)));
+  else if(*incx<0 && *incy>0) *res = (make_vector(x,*n,-*incx).reverse().dot(make_vector(y,*n,*incy)));
+  else if(*incx>0 && *incy<0) *res = (make_vector(x,*n,*incx).dot(make_vector(y,*n,-*incy).reverse()));
+  else if(*incx<0 && *incy<0) *res = (make_vector(x,*n,-*incx).reverse().dot(make_vector(y,*n,-*incy).reverse()));
   return 0;
 }
 
 // computes a vector-vector dot product without complex conjugation.
 int EIGEN_BLAS_FUNC(dotuw)(int *n, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar* pres)
 {
-//   std::cerr << "_dotu " << *n << " " << *incx << " " << *incy << "\n";
+  Scalar* res = reinterpret_cast<Scalar*>(pres);
 
-  if(*n<=0) return 0;
+  if(*n<=0)
+  {
+    *res = Scalar(0);
+    return 0;
+  }
 
   Scalar* x = reinterpret_cast<Scalar*>(px);
   Scalar* y = reinterpret_cast<Scalar*>(py);
-  Scalar* res = reinterpret_cast<Scalar*>(pres);
 
-  if(*incx==1 && *incy==1)    *res = (vector(x,*n).cwiseProduct(vector(y,*n))).sum();
-  else if(*incx>0 && *incy>0) *res = (vector(x,*n,*incx).cwiseProduct(vector(y,*n,*incy))).sum();
-  else if(*incx<0 && *incy>0) *res = (vector(x,*n,-*incx).reverse().cwiseProduct(vector(y,*n,*incy))).sum();
-  else if(*incx>0 && *incy<0) *res = (vector(x,*n,*incx).cwiseProduct(vector(y,*n,-*incy).reverse())).sum();
-  else if(*incx<0 && *incy<0) *res = (vector(x,*n,-*incx).reverse().cwiseProduct(vector(y,*n,-*incy).reverse())).sum();
+  if(*incx==1 && *incy==1)    *res = (make_vector(x,*n).cwiseProduct(make_vector(y,*n))).sum();
+  else if(*incx>0 && *incy>0) *res = (make_vector(x,*n,*incx).cwiseProduct(make_vector(y,*n,*incy))).sum();
+  else if(*incx<0 && *incy>0) *res = (make_vector(x,*n,-*incx).reverse().cwiseProduct(make_vector(y,*n,*incy))).sum();
+  else if(*incx>0 && *incy<0) *res = (make_vector(x,*n,*incx).cwiseProduct(make_vector(y,*n,-*incy).reverse())).sum();
+  else if(*incx<0 && *incy<0) *res = (make_vector(x,*n,-*incx).reverse().cwiseProduct(make_vector(y,*n,-*incy).reverse())).sum();
   return 0;
 }
 
@@ -82,9 +89,9 @@ RealScalar EIGEN_CAT(EIGEN_CAT(REAL_SCALAR_SUFFIX,SCALAR_SUFFIX),nrm2_)(int *n,
   Scalar* x = reinterpret_cast<Scalar*>(px);
 
   if(*incx==1)
-    return vector(x,*n).stableNorm();
+    return make_vector(x,*n).stableNorm();
 
-  return vector(x,*n,*incx).stableNorm();
+  return make_vector(x,*n,*incx).stableNorm();
 }
 
 int EIGEN_CAT(EIGEN_CAT(SCALAR_SUFFIX,REAL_SCALAR_SUFFIX),rot_)(int *n, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar *pc, RealScalar *ps)
@@ -96,8 +103,8 @@ int EIGEN_CAT(EIGEN_CAT(SCALAR_SUFFIX,REAL_SCALAR_SUFFIX),rot_)(int *n, RealScal
   RealScalar c = *pc;
   RealScalar s = *ps;
 
-  StridedVectorType vx(vector(x,*n,std::abs(*incx)));
-  StridedVectorType vy(vector(y,*n,std::abs(*incy)));
+  StridedVectorType vx(make_vector(x,*n,std::abs(*incx)));
+  StridedVectorType vy(make_vector(y,*n,std::abs(*incy)));
 
   Reverse<StridedVectorType> rvx(vx);
   Reverse<StridedVectorType> rvy(vy);
@@ -119,9 +126,8 @@ int EIGEN_CAT(EIGEN_CAT(SCALAR_SUFFIX,REAL_SCALAR_SUFFIX),scal_)(int *n, RealSca
 
 //   std::cerr << "__scal " << *n << " " << alpha << " " << *incx << "\n";
 
-  if(*incx==1)  vector(x,*n) *= alpha;
-  else          vector(x,*n,std::abs(*incx)) *= alpha;
+  if(*incx==1)  make_vector(x,*n) *= alpha;
+  else          make_vector(x,*n,std::abs(*incx)) *= alpha;
 
   return 0;
 }
-
diff --git a/blas/level1_impl.h b/blas/level1_impl.h
index b08c2f6be..f857bfa20 100644
--- a/blas/level1_impl.h
+++ b/blas/level1_impl.h
@@ -9,19 +9,19 @@
 
 #include "common.h"
 
-int EIGEN_BLAS_FUNC(axpy)(int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *py, int *incy)
+int EIGEN_BLAS_FUNC(axpy)(const int *n, const RealScalar *palpha, const RealScalar *px, const int *incx, RealScalar *py, const int *incy)
 {
-  Scalar* x = reinterpret_cast<Scalar*>(px);
+  const Scalar* x = reinterpret_cast<const Scalar*>(px);
   Scalar* y = reinterpret_cast<Scalar*>(py);
-  Scalar alpha  = *reinterpret_cast<Scalar*>(palpha);
+  Scalar alpha  = *reinterpret_cast<const Scalar*>(palpha);
 
   if(*n<=0) return 0;
 
-  if(*incx==1 && *incy==1)    vector(y,*n) += alpha * vector(x,*n);
-  else if(*incx>0 && *incy>0) vector(y,*n,*incy) += alpha * vector(x,*n,*incx);
-  else if(*incx>0 && *incy<0) vector(y,*n,-*incy).reverse() += alpha * vector(x,*n,*incx);
-  else if(*incx<0 && *incy>0) vector(y,*n,*incy) += alpha * vector(x,*n,-*incx).reverse();
-  else if(*incx<0 && *incy<0) vector(y,*n,-*incy).reverse() += alpha * vector(x,*n,-*incx).reverse();
+  if(*incx==1 && *incy==1)    make_vector(y,*n) += alpha * make_vector(x,*n);
+  else if(*incx>0 && *incy>0) make_vector(y,*n,*incy) += alpha * make_vector(x,*n,*incx);
+  else if(*incx>0 && *incy<0) make_vector(y,*n,-*incy).reverse() += alpha * make_vector(x,*n,*incx);
+  else if(*incx<0 && *incy>0) make_vector(y,*n,*incy) += alpha * make_vector(x,*n,-*incx).reverse();
+  else if(*incx<0 && *incy<0) make_vector(y,*n,-*incy).reverse() += alpha * make_vector(x,*n,-*incx).reverse();
 
   return 0;
 }
@@ -35,7 +35,7 @@ int EIGEN_BLAS_FUNC(copy)(int *n, RealScalar *px, int *incx, RealScalar *py, int
 
   // be carefull, *incx==0 is allowed !!
   if(*incx==1 && *incy==1)
-    vector(y,*n) = vector(x,*n);
+    make_vector(y,*n) = make_vector(x,*n);
   else
   {
     if(*incx<0) x = x - (*n-1)*(*incx);
@@ -57,27 +57,27 @@ int EIGEN_CAT(EIGEN_CAT(i,SCALAR_SUFFIX),amax_)(int *n, RealScalar *px, int *inc
   Scalar* x = reinterpret_cast<Scalar*>(px);
 
   DenseIndex ret;
-  if(*incx==1)  vector(x,*n).cwiseAbs().maxCoeff(&ret);
-  else          vector(x,*n,std::abs(*incx)).cwiseAbs().maxCoeff(&ret);
-  return ret+1;
+  if(*incx==1)  make_vector(x,*n).cwiseAbs().maxCoeff(&ret);
+  else          make_vector(x,*n,std::abs(*incx)).cwiseAbs().maxCoeff(&ret);
+  return int(ret)+1;
 }
 
 int EIGEN_CAT(EIGEN_CAT(i,SCALAR_SUFFIX),amin_)(int *n, RealScalar *px, int *incx)
 {
   if(*n<=0) return 0;
   Scalar* x = reinterpret_cast<Scalar*>(px);
-  
+
   DenseIndex ret;
-  if(*incx==1)  vector(x,*n).cwiseAbs().minCoeff(&ret);
-  else          vector(x,*n,std::abs(*incx)).cwiseAbs().minCoeff(&ret);
-  return ret+1;
+  if(*incx==1)  make_vector(x,*n).cwiseAbs().minCoeff(&ret);
+  else          make_vector(x,*n,std::abs(*incx)).cwiseAbs().minCoeff(&ret);
+  return int(ret)+1;
 }
 
 int EIGEN_BLAS_FUNC(rotg)(RealScalar *pa, RealScalar *pb, RealScalar *pc, RealScalar *ps)
 {
   using std::sqrt;
   using std::abs;
-  
+
   Scalar& a = *reinterpret_cast<Scalar*>(pa);
   Scalar& b = *reinterpret_cast<Scalar*>(pb);
   RealScalar* c = pc;
@@ -143,8 +143,8 @@ int EIGEN_BLAS_FUNC(scal)(int *n, RealScalar *palpha, RealScalar *px, int *incx)
   Scalar* x = reinterpret_cast<Scalar*>(px);
   Scalar alpha = *reinterpret_cast<Scalar*>(palpha);
 
-  if(*incx==1)  vector(x,*n) *= alpha;
-  else          vector(x,*n,std::abs(*incx)) *= alpha;
+  if(*incx==1)  make_vector(x,*n) *= alpha;
+  else          make_vector(x,*n,std::abs(*incx)) *= alpha;
 
   return 0;
 }
@@ -156,12 +156,11 @@ int EIGEN_BLAS_FUNC(swap)(int *n, RealScalar *px, int *incx, RealScalar *py, int
   Scalar* x = reinterpret_cast<Scalar*>(px);
   Scalar* y = reinterpret_cast<Scalar*>(py);
 
-  if(*incx==1 && *incy==1)    vector(y,*n).swap(vector(x,*n));
-  else if(*incx>0 && *incy>0) vector(y,*n,*incy).swap(vector(x,*n,*incx));
-  else if(*incx>0 && *incy<0) vector(y,*n,-*incy).reverse().swap(vector(x,*n,*incx));
-  else if(*incx<0 && *incy>0) vector(y,*n,*incy).swap(vector(x,*n,-*incx).reverse());
-  else if(*incx<0 && *incy<0) vector(y,*n,-*incy).reverse().swap(vector(x,*n,-*incx).reverse());
+  if(*incx==1 && *incy==1)    make_vector(y,*n).swap(make_vector(x,*n));
+  else if(*incx>0 && *incy>0) make_vector(y,*n,*incy).swap(make_vector(x,*n,*incx));
+  else if(*incx>0 && *incy<0) make_vector(y,*n,-*incy).reverse().swap(make_vector(x,*n,*incx));
+  else if(*incx<0 && *incy>0) make_vector(y,*n,*incy).swap(make_vector(x,*n,-*incx).reverse());
+  else if(*incx<0 && *incy<0) make_vector(y,*n,-*incy).reverse().swap(make_vector(x,*n,-*incx).reverse());
 
   return 1;
 }
-
diff --git a/blas/level1_real_impl.h b/blas/level1_real_impl.h
index 8acecdfc6..02586d519 100644
--- a/blas/level1_real_impl.h
+++ b/blas/level1_real_impl.h
@@ -19,8 +19,8 @@ RealScalar EIGEN_BLAS_FUNC(asum)(int *n, RealScalar *px, int *incx)
 
   if(*n<=0) return 0;
 
-  if(*incx==1)  return vector(x,*n).cwiseAbs().sum();
-  else          return vector(x,*n,std::abs(*incx)).cwiseAbs().sum();
+  if(*incx==1)  return make_vector(x,*n).cwiseAbs().sum();
+  else          return make_vector(x,*n,std::abs(*incx)).cwiseAbs().sum();
 }
 
 // computes a vector-vector dot product.
@@ -33,11 +33,11 @@ Scalar EIGEN_BLAS_FUNC(dot)(int *n, RealScalar *px, int *incx, RealScalar *py, i
   Scalar* x = reinterpret_cast<Scalar*>(px);
   Scalar* y = reinterpret_cast<Scalar*>(py);
 
-  if(*incx==1 && *incy==1)    return (vector(x,*n).cwiseProduct(vector(y,*n))).sum();
-  else if(*incx>0 && *incy>0) return (vector(x,*n,*incx).cwiseProduct(vector(y,*n,*incy))).sum();
-  else if(*incx<0 && *incy>0) return (vector(x,*n,-*incx).reverse().cwiseProduct(vector(y,*n,*incy))).sum();
-  else if(*incx>0 && *incy<0) return (vector(x,*n,*incx).cwiseProduct(vector(y,*n,-*incy).reverse())).sum();
-  else if(*incx<0 && *incy<0) return (vector(x,*n,-*incx).reverse().cwiseProduct(vector(y,*n,-*incy).reverse())).sum();
+  if(*incx==1 && *incy==1)    return (make_vector(x,*n).cwiseProduct(make_vector(y,*n))).sum();
+  else if(*incx>0 && *incy>0) return (make_vector(x,*n,*incx).cwiseProduct(make_vector(y,*n,*incy))).sum();
+  else if(*incx<0 && *incy>0) return (make_vector(x,*n,-*incx).reverse().cwiseProduct(make_vector(y,*n,*incy))).sum();
+  else if(*incx>0 && *incy<0) return (make_vector(x,*n,*incx).cwiseProduct(make_vector(y,*n,-*incy).reverse())).sum();
+  else if(*incx<0 && *incy<0) return (make_vector(x,*n,-*incx).reverse().cwiseProduct(make_vector(y,*n,-*incy).reverse())).sum();
   else return 0;
 }
 
@@ -50,8 +50,8 @@ Scalar EIGEN_BLAS_FUNC(nrm2)(int *n, RealScalar *px, int *incx)
 
   Scalar* x = reinterpret_cast<Scalar*>(px);
 
-  if(*incx==1)  return vector(x,*n).stableNorm();
-  else          return vector(x,*n,std::abs(*incx)).stableNorm();
+  if(*incx==1)  return make_vector(x,*n).stableNorm();
+  else          return make_vector(x,*n,std::abs(*incx)).stableNorm();
 }
 
 int EIGEN_BLAS_FUNC(rot)(int *n, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar *pc, RealScalar *ps)
@@ -64,8 +64,8 @@ int EIGEN_BLAS_FUNC(rot)(int *n, RealScalar *px, int *incx, RealScalar *py, int
   Scalar c = *reinterpret_cast<Scalar*>(pc);
   Scalar s = *reinterpret_cast<Scalar*>(ps);
 
-  StridedVectorType vx(vector(x,*n,std::abs(*incx)));
-  StridedVectorType vy(vector(y,*n,std::abs(*incy)));
+  StridedVectorType vx(make_vector(x,*n,std::abs(*incx)));
+  StridedVectorType vy(make_vector(y,*n,std::abs(*incy)));
 
   Reverse<StridedVectorType> rvx(vx);
   Reverse<StridedVectorType> rvy(vy);
diff --git a/blas/level2_cplx_impl.h b/blas/level2_cplx_impl.h
index b850b6cd1..e3ce61435 100644
--- a/blas/level2_cplx_impl.h
+++ b/blas/level2_cplx_impl.h
@@ -16,28 +16,22 @@
   *  where alpha and beta are scalars, x and y are n element vectors and
   *  A is an n by n hermitian matrix.
   */
-int EIGEN_BLAS_FUNC(hemv)(char *uplo, int *n, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *px, int *incx, RealScalar *pbeta, RealScalar *py, int *incy)
+int EIGEN_BLAS_FUNC(hemv)(const char *uplo, const int *n, const RealScalar *palpha, const RealScalar *pa, const int *lda,
+                          const RealScalar *px, const int *incx, const RealScalar *pbeta, RealScalar *py, const int *incy)
 {
-  typedef void (*functype)(int, const Scalar*, int, const Scalar*, int, Scalar*, Scalar);
-  static functype func[2];
-
-  static bool init = false;
-  if(!init)
-  {
-    for(int k=0; k<2; ++k)
-      func[k] = 0;
-
-    func[UP] = (internal::selfadjoint_matrix_vector_product<Scalar,int,ColMajor,Upper,false,false>::run);
-    func[LO] = (internal::selfadjoint_matrix_vector_product<Scalar,int,ColMajor,Lower,false,false>::run);
-
-    init = true;
-  }
-
-  Scalar* a = reinterpret_cast<Scalar*>(pa);
-  Scalar* x = reinterpret_cast<Scalar*>(px);
+  typedef void (*functype)(int, const Scalar*, int, const Scalar*, Scalar*, Scalar);
+  static const functype func[2] = {
+    // array index: UP
+    (internal::selfadjoint_matrix_vector_product<Scalar,int,ColMajor,Upper,false,false>::run),
+    // array index: LO
+    (internal::selfadjoint_matrix_vector_product<Scalar,int,ColMajor,Lower,false,false>::run),
+  };
+
+  const Scalar* a = reinterpret_cast<const Scalar*>(pa);
+  const Scalar* x = reinterpret_cast<const Scalar*>(px);
   Scalar* y = reinterpret_cast<Scalar*>(py);
-  Scalar alpha  = *reinterpret_cast<Scalar*>(palpha);
-  Scalar beta   = *reinterpret_cast<Scalar*>(pbeta);
+  Scalar alpha  = *reinterpret_cast<const Scalar*>(palpha);
+  Scalar beta   = *reinterpret_cast<const Scalar*>(pbeta);
 
   // check arguments
   int info = 0;
@@ -52,13 +46,13 @@ int EIGEN_BLAS_FUNC(hemv)(char *uplo, int *n, RealScalar *palpha, RealScalar *pa
   if(*n==0)
     return 1;
 
-  Scalar* actual_x = get_compact_vector(x,*n,*incx);
+  const Scalar* actual_x = get_compact_vector(x,*n,*incx);
   Scalar* actual_y = get_compact_vector(y,*n,*incy);
 
   if(beta!=Scalar(1))
   {
-    if(beta==Scalar(0)) vector(actual_y, *n).setZero();
-    else                vector(actual_y, *n) *= beta;
+    if(beta==Scalar(0)) make_vector(actual_y, *n).setZero();
+    else                make_vector(actual_y, *n) *= beta;
   }
 
   if(alpha!=Scalar(0))
@@ -67,7 +61,7 @@ int EIGEN_BLAS_FUNC(hemv)(char *uplo, int *n, RealScalar *palpha, RealScalar *pa
     if(code>=2 || func[code]==0)
       return 0;
 
-    func[code](*n, a, *lda, actual_x, 1, actual_y, alpha);
+    func[code](*n, a, *lda, actual_x, actual_y, alpha);
   }
 
   if(actual_x!=x) delete[] actual_x;
@@ -111,19 +105,12 @@ int EIGEN_BLAS_FUNC(hemv)(char *uplo, int *n, RealScalar *palpha, RealScalar *pa
 int EIGEN_BLAS_FUNC(hpr)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *pap)
 {
   typedef void (*functype)(int, Scalar*, const Scalar*, RealScalar);
-  static functype func[2];
-
-  static bool init = false;
-  if(!init)
-  {
-    for(int k=0; k<2; ++k)
-      func[k] = 0;
-
-    func[UP] = (internal::selfadjoint_packed_rank1_update<Scalar,int,ColMajor,Upper,false,Conj>::run);
-    func[LO] = (internal::selfadjoint_packed_rank1_update<Scalar,int,ColMajor,Lower,false,Conj>::run);
-
-    init = true;
-  }
+  static const functype func[2] = {
+    // array index: UP
+    (internal::selfadjoint_packed_rank1_update<Scalar,int,ColMajor,Upper,false,Conj>::run),
+    // array index: LO
+    (internal::selfadjoint_packed_rank1_update<Scalar,int,ColMajor,Lower,false,Conj>::run),
+  };
 
   Scalar* x = reinterpret_cast<Scalar*>(px);
   Scalar* ap = reinterpret_cast<Scalar*>(pap);
@@ -162,19 +149,12 @@ int EIGEN_BLAS_FUNC(hpr)(char *uplo, int *n, RealScalar *palpha, RealScalar *px,
 int EIGEN_BLAS_FUNC(hpr2)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar *pap)
 {
   typedef void (*functype)(int, Scalar*, const Scalar*, const Scalar*, Scalar);
-  static functype func[2];
-
-  static bool init = false;
-  if(!init)
-  {
-    for(int k=0; k<2; ++k)
-      func[k] = 0;
-
-    func[UP] = (internal::packed_rank2_update_selector<Scalar,int,Upper>::run);
-    func[LO] = (internal::packed_rank2_update_selector<Scalar,int,Lower>::run);
-
-    init = true;
-  }
+  static const functype func[2] = {
+    // array index: UP
+    (internal::packed_rank2_update_selector<Scalar,int,Upper>::run),
+    // array index: LO
+    (internal::packed_rank2_update_selector<Scalar,int,Lower>::run),
+  };
 
   Scalar* x = reinterpret_cast<Scalar*>(px);
   Scalar* y = reinterpret_cast<Scalar*>(py);
@@ -217,19 +197,12 @@ int EIGEN_BLAS_FUNC(hpr2)(char *uplo, int *n, RealScalar *palpha, RealScalar *px
 int EIGEN_BLAS_FUNC(her)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *pa, int *lda)
 {
   typedef void (*functype)(int, Scalar*, int, const Scalar*, const Scalar*, const Scalar&);
-  static functype func[2];
-
-  static bool init = false;
-  if(!init)
-  {
-    for(int k=0; k<2; ++k)
-      func[k] = 0;
-
-    func[UP] = (selfadjoint_rank1_update<Scalar,int,ColMajor,Upper,false,Conj>::run);
-    func[LO] = (selfadjoint_rank1_update<Scalar,int,ColMajor,Lower,false,Conj>::run);
-
-    init = true;
-  }
+  static const functype func[2] = {
+    // array index: UP
+    (selfadjoint_rank1_update<Scalar,int,ColMajor,Upper,false,Conj>::run),
+    // array index: LO
+    (selfadjoint_rank1_update<Scalar,int,ColMajor,Lower,false,Conj>::run),
+  };
 
   Scalar* x = reinterpret_cast<Scalar*>(px);
   Scalar* a = reinterpret_cast<Scalar*>(pa);
@@ -271,19 +244,12 @@ int EIGEN_BLAS_FUNC(her)(char *uplo, int *n, RealScalar *palpha, RealScalar *px,
 int EIGEN_BLAS_FUNC(her2)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar *pa, int *lda)
 {
   typedef void (*functype)(int, Scalar*, int, const Scalar*, const Scalar*, Scalar);
-  static functype func[2];
-
-  static bool init = false;
-  if(!init)
-  {
-    for(int k=0; k<2; ++k)
-      func[k] = 0;
-
-    func[UP] = (internal::rank2_update_selector<Scalar,int,Upper>::run);
-    func[LO] = (internal::rank2_update_selector<Scalar,int,Lower>::run);
-
-    init = true;
-  }
+  static const functype func[2] = {
+    // array index: UP
+    (internal::rank2_update_selector<Scalar,int,Upper>::run),
+    // array index: LO
+    (internal::rank2_update_selector<Scalar,int,Lower>::run),
+  };
 
   Scalar* x = reinterpret_cast<Scalar*>(px);
   Scalar* y = reinterpret_cast<Scalar*>(py);
diff --git a/blas/level2_impl.h b/blas/level2_impl.h
index 5f3941975..173f40b44 100644
--- a/blas/level2_impl.h
+++ b/blas/level2_impl.h
@@ -9,29 +9,39 @@
 
 #include "common.h"
 
-int EIGEN_BLAS_FUNC(gemv)(char *opa, int *m, int *n, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pb, int *incb, RealScalar *pbeta, RealScalar *pc, int *incc)
+template<typename Index, typename Scalar, int StorageOrder, bool ConjugateLhs, bool ConjugateRhs>
+struct general_matrix_vector_product_wrapper
 {
-  typedef void (*functype)(int, int, const Scalar *, int, const Scalar *, int , Scalar *, int, Scalar);
-  static functype func[4];
-
-  static bool init = false;
-  if(!init)
+  static void run(Index rows, Index cols,const Scalar *lhs, Index lhsStride, const Scalar *rhs, Index rhsIncr, Scalar* res, Index resIncr, Scalar alpha)
   {
-    for(int k=0; k<4; ++k)
-      func[k] = 0;
-
-    func[NOTR] = (internal::general_matrix_vector_product<int,Scalar,ColMajor,false,Scalar,false>::run);
-    func[TR  ] = (internal::general_matrix_vector_product<int,Scalar,RowMajor,false,Scalar,false>::run);
-    func[ADJ ] = (internal::general_matrix_vector_product<int,Scalar,RowMajor,Conj, Scalar,false>::run);
-
-    init = true;
+    typedef internal::const_blas_data_mapper<Scalar,Index,StorageOrder> LhsMapper;
+    typedef internal::const_blas_data_mapper<Scalar,Index,RowMajor> RhsMapper;
+    
+    internal::general_matrix_vector_product
+        <Index,Scalar,LhsMapper,StorageOrder,ConjugateLhs,Scalar,RhsMapper,ConjugateRhs>::run(
+        rows, cols, LhsMapper(lhs, lhsStride), RhsMapper(rhs, rhsIncr), res, resIncr, alpha);
   }
+};
 
-  Scalar* a = reinterpret_cast<Scalar*>(pa);
-  Scalar* b = reinterpret_cast<Scalar*>(pb);
+int EIGEN_BLAS_FUNC(gemv)(const char *opa, const int *m, const int *n, const RealScalar *palpha,
+                          const RealScalar *pa, const int *lda, const RealScalar *pb, const int *incb, const RealScalar *pbeta, RealScalar *pc, const int *incc)
+{
+  typedef void (*functype)(int, int, const Scalar *, int, const Scalar *, int , Scalar *, int, Scalar);
+  static const functype func[4] = {
+    // array index: NOTR
+    (general_matrix_vector_product_wrapper<int,Scalar,ColMajor,false,false>::run),
+    // array index: TR  
+    (general_matrix_vector_product_wrapper<int,Scalar,RowMajor,false,false>::run),
+    // array index: ADJ 
+    (general_matrix_vector_product_wrapper<int,Scalar,RowMajor,Conj ,false>::run),
+    0
+  };
+
+  const Scalar* a = reinterpret_cast<const Scalar*>(pa);
+  const Scalar* b = reinterpret_cast<const Scalar*>(pb);
   Scalar* c = reinterpret_cast<Scalar*>(pc);
-  Scalar alpha  = *reinterpret_cast<Scalar*>(palpha);
-  Scalar beta   = *reinterpret_cast<Scalar*>(pbeta);
+  Scalar alpha  = *reinterpret_cast<const Scalar*>(palpha);
+  Scalar beta   = *reinterpret_cast<const Scalar*>(pbeta);
 
   // check arguments
   int info = 0;
@@ -53,13 +63,13 @@ int EIGEN_BLAS_FUNC(gemv)(char *opa, int *m, int *n, RealScalar *palpha, RealSca
   if(code!=NOTR)
     std::swap(actual_m,actual_n);
 
-  Scalar* actual_b = get_compact_vector(b,actual_n,*incb);
+  const Scalar* actual_b = get_compact_vector(b,actual_n,*incb);
   Scalar* actual_c = get_compact_vector(c,actual_m,*incc);
 
   if(beta!=Scalar(1))
   {
-    if(beta==Scalar(0)) vector(actual_c, actual_m).setZero();
-    else                vector(actual_c, actual_m) *= beta;
+    if(beta==Scalar(0)) make_vector(actual_c, actual_m).setZero();
+    else                make_vector(actual_c, actual_m) *= beta;
   }
 
   if(code>=4 || func[code]==0)
@@ -73,37 +83,41 @@ int EIGEN_BLAS_FUNC(gemv)(char *opa, int *m, int *n, RealScalar *palpha, RealSca
   return 1;
 }
 
-int EIGEN_BLAS_FUNC(trsv)(char *uplo, char *opa, char *diag, int *n, RealScalar *pa, int *lda, RealScalar *pb, int *incb)
+int EIGEN_BLAS_FUNC(trsv)(const char *uplo, const char *opa, const char *diag, const int *n, const RealScalar *pa, const int *lda, RealScalar *pb, const int *incb)
 {
   typedef void (*functype)(int, const Scalar *, int, Scalar *);
-  static functype func[16];
-
-  static bool init = false;
-  if(!init)
-  {
-    for(int k=0; k<16; ++k)
-      func[k] = 0;
-
-    func[NOTR  | (UP << 2) | (NUNIT << 3)] = (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|0,       false,ColMajor>::run);
-    func[TR    | (UP << 2) | (NUNIT << 3)] = (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|0,       false,RowMajor>::run);
-    func[ADJ   | (UP << 2) | (NUNIT << 3)] = (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|0,       Conj, RowMajor>::run);
-
-    func[NOTR  | (LO << 2) | (NUNIT << 3)] = (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|0,       false,ColMajor>::run);
-    func[TR    | (LO << 2) | (NUNIT << 3)] = (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|0,       false,RowMajor>::run);
-    func[ADJ   | (LO << 2) | (NUNIT << 3)] = (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|0,       Conj, RowMajor>::run);
-
-    func[NOTR  | (UP << 2) | (UNIT  << 3)] = (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|UnitDiag,false,ColMajor>::run);
-    func[TR    | (UP << 2) | (UNIT  << 3)] = (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|UnitDiag,false,RowMajor>::run);
-    func[ADJ   | (UP << 2) | (UNIT  << 3)] = (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|UnitDiag,Conj, RowMajor>::run);
-
-    func[NOTR  | (LO << 2) | (UNIT  << 3)] = (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|UnitDiag,false,ColMajor>::run);
-    func[TR    | (LO << 2) | (UNIT  << 3)] = (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|UnitDiag,false,RowMajor>::run);
-    func[ADJ   | (LO << 2) | (UNIT  << 3)] = (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|UnitDiag,Conj, RowMajor>::run);
-
-    init = true;
-  }
-
-  Scalar* a = reinterpret_cast<Scalar*>(pa);
+  static const functype func[16] = {
+    // array index: NOTR  | (UP << 2) | (NUNIT << 3)
+    (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|0,       false,ColMajor>::run),
+    // array index: TR    | (UP << 2) | (NUNIT << 3)
+    (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|0,       false,RowMajor>::run),
+    // array index: ADJ   | (UP << 2) | (NUNIT << 3)
+    (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|0,       Conj, RowMajor>::run),
+    0,
+    // array index: NOTR  | (LO << 2) | (NUNIT << 3)
+    (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|0,       false,ColMajor>::run),
+    // array index: TR    | (LO << 2) | (NUNIT << 3)
+    (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|0,       false,RowMajor>::run),
+    // array index: ADJ   | (LO << 2) | (NUNIT << 3)
+    (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|0,       Conj, RowMajor>::run),
+    0,
+    // array index: NOTR  | (UP << 2) | (UNIT  << 3)
+    (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|UnitDiag,false,ColMajor>::run),
+    // array index: TR    | (UP << 2) | (UNIT  << 3)
+    (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|UnitDiag,false,RowMajor>::run),
+    // array index: ADJ   | (UP << 2) | (UNIT  << 3)
+    (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|UnitDiag,Conj, RowMajor>::run),
+    0,
+    // array index: NOTR  | (LO << 2) | (UNIT  << 3)
+    (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|UnitDiag,false,ColMajor>::run),
+    // array index: TR    | (LO << 2) | (UNIT  << 3)
+    (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|UnitDiag,false,RowMajor>::run),
+    // array index: ADJ   | (LO << 2) | (UNIT  << 3)
+    (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|UnitDiag,Conj, RowMajor>::run),
+    0
+  };
+
+  const Scalar* a = reinterpret_cast<const Scalar*>(pa);
   Scalar* b = reinterpret_cast<Scalar*>(pb);
 
   int info = 0;
@@ -128,37 +142,41 @@ int EIGEN_BLAS_FUNC(trsv)(char *uplo, char *opa, char *diag, int *n, RealScalar
 
 
 
-int EIGEN_BLAS_FUNC(trmv)(char *uplo, char *opa, char *diag, int *n, RealScalar *pa, int *lda, RealScalar *pb, int *incb)
+int EIGEN_BLAS_FUNC(trmv)(const char *uplo, const char *opa, const char *diag, const int *n, const RealScalar *pa, const int *lda, RealScalar *pb, const int *incb)
 {
   typedef void (*functype)(int, int, const Scalar *, int, const Scalar *, int, Scalar *, int, const Scalar&);
-  static functype func[16];
-
-  static bool init = false;
-  if(!init)
-  {
-    for(int k=0; k<16; ++k)
-      func[k] = 0;
-
-    func[NOTR  | (UP << 2) | (NUNIT << 3)] = (internal::triangular_matrix_vector_product<int,Upper|0,       Scalar,false,Scalar,false,ColMajor>::run);
-    func[TR    | (UP << 2) | (NUNIT << 3)] = (internal::triangular_matrix_vector_product<int,Lower|0,       Scalar,false,Scalar,false,RowMajor>::run);
-    func[ADJ   | (UP << 2) | (NUNIT << 3)] = (internal::triangular_matrix_vector_product<int,Lower|0,       Scalar,Conj, Scalar,false,RowMajor>::run);
-
-    func[NOTR  | (LO << 2) | (NUNIT << 3)] = (internal::triangular_matrix_vector_product<int,Lower|0,       Scalar,false,Scalar,false,ColMajor>::run);
-    func[TR    | (LO << 2) | (NUNIT << 3)] = (internal::triangular_matrix_vector_product<int,Upper|0,       Scalar,false,Scalar,false,RowMajor>::run);
-    func[ADJ   | (LO << 2) | (NUNIT << 3)] = (internal::triangular_matrix_vector_product<int,Upper|0,       Scalar,Conj, Scalar,false,RowMajor>::run);
-
-    func[NOTR  | (UP << 2) | (UNIT  << 3)] = (internal::triangular_matrix_vector_product<int,Upper|UnitDiag,Scalar,false,Scalar,false,ColMajor>::run);
-    func[TR    | (UP << 2) | (UNIT  << 3)] = (internal::triangular_matrix_vector_product<int,Lower|UnitDiag,Scalar,false,Scalar,false,RowMajor>::run);
-    func[ADJ   | (UP << 2) | (UNIT  << 3)] = (internal::triangular_matrix_vector_product<int,Lower|UnitDiag,Scalar,Conj, Scalar,false,RowMajor>::run);
-
-    func[NOTR  | (LO << 2) | (UNIT  << 3)] = (internal::triangular_matrix_vector_product<int,Lower|UnitDiag,Scalar,false,Scalar,false,ColMajor>::run);
-    func[TR    | (LO << 2) | (UNIT  << 3)] = (internal::triangular_matrix_vector_product<int,Upper|UnitDiag,Scalar,false,Scalar,false,RowMajor>::run);
-    func[ADJ   | (LO << 2) | (UNIT  << 3)] = (internal::triangular_matrix_vector_product<int,Upper|UnitDiag,Scalar,Conj, Scalar,false,RowMajor>::run);
-
-    init = true;
-  }
-
-  Scalar* a = reinterpret_cast<Scalar*>(pa);
+  static const functype func[16] = {
+    // array index: NOTR  | (UP << 2) | (NUNIT << 3)
+    (internal::triangular_matrix_vector_product<int,Upper|0,       Scalar,false,Scalar,false,ColMajor>::run),
+    // array index: TR    | (UP << 2) | (NUNIT << 3)
+    (internal::triangular_matrix_vector_product<int,Lower|0,       Scalar,false,Scalar,false,RowMajor>::run),
+    // array index: ADJ   | (UP << 2) | (NUNIT << 3)
+    (internal::triangular_matrix_vector_product<int,Lower|0,       Scalar,Conj, Scalar,false,RowMajor>::run),
+    0,
+    // array index: NOTR  | (LO << 2) | (NUNIT << 3)
+    (internal::triangular_matrix_vector_product<int,Lower|0,       Scalar,false,Scalar,false,ColMajor>::run),
+    // array index: TR    | (LO << 2) | (NUNIT << 3)
+    (internal::triangular_matrix_vector_product<int,Upper|0,       Scalar,false,Scalar,false,RowMajor>::run),
+    // array index: ADJ   | (LO << 2) | (NUNIT << 3)
+    (internal::triangular_matrix_vector_product<int,Upper|0,       Scalar,Conj, Scalar,false,RowMajor>::run),
+    0,
+    // array index: NOTR  | (UP << 2) | (UNIT  << 3)
+    (internal::triangular_matrix_vector_product<int,Upper|UnitDiag,Scalar,false,Scalar,false,ColMajor>::run),
+    // array index: TR    | (UP << 2) | (UNIT  << 3)
+    (internal::triangular_matrix_vector_product<int,Lower|UnitDiag,Scalar,false,Scalar,false,RowMajor>::run),
+    // array index: ADJ   | (UP << 2) | (UNIT  << 3)
+    (internal::triangular_matrix_vector_product<int,Lower|UnitDiag,Scalar,Conj, Scalar,false,RowMajor>::run),
+    0,
+    // array index: NOTR  | (LO << 2) | (UNIT  << 3)
+    (internal::triangular_matrix_vector_product<int,Lower|UnitDiag,Scalar,false,Scalar,false,ColMajor>::run),
+    // array index: TR    | (LO << 2) | (UNIT  << 3)
+    (internal::triangular_matrix_vector_product<int,Upper|UnitDiag,Scalar,false,Scalar,false,RowMajor>::run),
+    // array index: ADJ   | (LO << 2) | (UNIT  << 3)
+    (internal::triangular_matrix_vector_product<int,Upper|UnitDiag,Scalar,Conj, Scalar,false,RowMajor>::run),
+    0
+  };
+
+  const Scalar* a = reinterpret_cast<const Scalar*>(pa);
   Scalar* b = reinterpret_cast<Scalar*>(pb);
 
   int info = 0;
@@ -200,13 +218,13 @@ int EIGEN_BLAS_FUNC(trmv)(char *uplo, char *opa, char *diag, int *n, RealScalar
 int EIGEN_BLAS_FUNC(gbmv)(char *trans, int *m, int *n, int *kl, int *ku, RealScalar *palpha, RealScalar *pa, int *lda,
                           RealScalar *px, int *incx, RealScalar *pbeta, RealScalar *py, int *incy)
 {
-  Scalar* a = reinterpret_cast<Scalar*>(pa);
-  Scalar* x = reinterpret_cast<Scalar*>(px);
+  const Scalar* a = reinterpret_cast<const Scalar*>(pa);
+  const Scalar* x = reinterpret_cast<const Scalar*>(px);
   Scalar* y = reinterpret_cast<Scalar*>(py);
-  Scalar alpha = *reinterpret_cast<Scalar*>(palpha);
-  Scalar beta = *reinterpret_cast<Scalar*>(pbeta);
+  Scalar alpha = *reinterpret_cast<const Scalar*>(palpha);
+  Scalar beta = *reinterpret_cast<const Scalar*>(pbeta);
   int coeff_rows = *kl+*ku+1;
-  
+
   int info = 0;
        if(OP(*trans)==INVALID)                                        info = 1;
   else if(*m<0)                                                       info = 2;
@@ -218,26 +236,26 @@ int EIGEN_BLAS_FUNC(gbmv)(char *trans, int *m, int *n, int *kl, int *ku, RealSca
   else if(*incy==0)                                                   info = 13;
   if(info)
     return xerbla_(SCALAR_SUFFIX_UP"GBMV ",&info,6);
-  
+
   if(*m==0 || *n==0 || (alpha==Scalar(0) && beta==Scalar(1)))
     return 0;
-  
+
   int actual_m = *m;
   int actual_n = *n;
   if(OP(*trans)!=NOTR)
     std::swap(actual_m,actual_n);
-  
-  Scalar* actual_x = get_compact_vector(x,actual_n,*incx);
+
+  const Scalar* actual_x = get_compact_vector(x,actual_n,*incx);
   Scalar* actual_y = get_compact_vector(y,actual_m,*incy);
-  
+
   if(beta!=Scalar(1))
   {
-    if(beta==Scalar(0)) vector(actual_y, actual_m).setZero();
-    else                vector(actual_y, actual_m) *= beta;
+    if(beta==Scalar(0)) make_vector(actual_y, actual_m).setZero();
+    else                make_vector(actual_y, actual_m) *= beta;
   }
-  
-  MatrixType mat_coeffs(a,coeff_rows,*n,*lda);
-  
+
+  ConstMatrixType mat_coeffs(a,coeff_rows,*n,*lda);
+
   int nb = std::min(*n,(*m)+(*ku));
   for(int j=0; j<nb; ++j)
   {
@@ -246,16 +264,16 @@ int EIGEN_BLAS_FUNC(gbmv)(char *trans, int *m, int *n, int *kl, int *ku, RealSca
     int len = end - start + 1;
     int offset = (*ku) - j + start;
     if(OP(*trans)==NOTR)
-      vector(actual_y+start,len) += (alpha*actual_x[j]) * mat_coeffs.col(j).segment(offset,len);
+      make_vector(actual_y+start,len) += (alpha*actual_x[j]) * mat_coeffs.col(j).segment(offset,len);
     else if(OP(*trans)==TR)
-      actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).transpose() * vector(actual_x+start,len) ).value();
+      actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).transpose() * make_vector(actual_x+start,len) ).value();
     else
-      actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).adjoint()   * vector(actual_x+start,len) ).value();
-  }    
-  
+      actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).adjoint()   * make_vector(actual_x+start,len) ).value();
+  }
+
   if(actual_x!=x) delete[] actual_x;
   if(actual_y!=y) delete[] copy_back(actual_y,y,actual_m,*incy);
-  
+
   return 0;
 }
 
@@ -272,7 +290,7 @@ int EIGEN_BLAS_FUNC(tbmv)(char *uplo, char *opa, char *diag, int *n, int *k, Rea
   Scalar* a = reinterpret_cast<Scalar*>(pa);
   Scalar* x = reinterpret_cast<Scalar*>(px);
   int coeff_rows = *k + 1;
-  
+
   int info = 0;
        if(UPLO(*uplo)==INVALID)                                       info = 1;
   else if(OP(*opa)==INVALID)                                          info = 2;
@@ -283,37 +301,37 @@ int EIGEN_BLAS_FUNC(tbmv)(char *uplo, char *opa, char *diag, int *n, int *k, Rea
   else if(*incx==0)                                                   info = 9;
   if(info)
     return xerbla_(SCALAR_SUFFIX_UP"TBMV ",&info,6);
-  
+
   if(*n==0)
     return 0;
-  
+
   int actual_n = *n;
-  
+
   Scalar* actual_x = get_compact_vector(x,actual_n,*incx);
-  
+
   MatrixType mat_coeffs(a,coeff_rows,*n,*lda);
-  
+
   int ku = UPLO(*uplo)==UPPER ? *k : 0;
   int kl = UPLO(*uplo)==LOWER ? *k : 0;
-  
+
   for(int j=0; j<*n; ++j)
   {
     int start = std::max(0,j - ku);
     int end = std::min((*m)-1,j + kl);
     int len = end - start + 1;
     int offset = (ku) - j + start;
-    
+
     if(OP(*trans)==NOTR)
-      vector(actual_y+start,len) += (alpha*actual_x[j]) * mat_coeffs.col(j).segment(offset,len);
+      make_vector(actual_y+start,len) += (alpha*actual_x[j]) * mat_coeffs.col(j).segment(offset,len);
     else if(OP(*trans)==TR)
-      actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).transpose() * vector(actual_x+start,len) ).value();
+      actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).transpose() * make_vector(actual_x+start,len) ).value();
     else
-      actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).adjoint()   * vector(actual_x+start,len) ).value();
-  }    
-  
+      actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).adjoint()   * make_vector(actual_x+start,len) ).value();
+  }
+
   if(actual_x!=x) delete[] actual_x;
   if(actual_y!=y) delete[] copy_back(actual_y,y,actual_m,*incy);
-  
+
   return 0;
 }
 #endif
@@ -332,37 +350,41 @@ int EIGEN_BLAS_FUNC(tbmv)(char *uplo, char *opa, char *diag, int *n, int *k, Rea
 int EIGEN_BLAS_FUNC(tbsv)(char *uplo, char *op, char *diag, int *n, int *k, RealScalar *pa, int *lda, RealScalar *px, int *incx)
 {
   typedef void (*functype)(int, int, const Scalar *, int, Scalar *);
-  static functype func[16];
-
-  static bool init = false;
-  if(!init)
-  {
-    for(int k=0; k<16; ++k)
-      func[k] = 0;
-
-    func[NOTR  | (UP << 2) | (NUNIT << 3)] = (internal::band_solve_triangular_selector<int,Upper|0,       Scalar,false,Scalar,ColMajor>::run);
-    func[TR    | (UP << 2) | (NUNIT << 3)] = (internal::band_solve_triangular_selector<int,Lower|0,       Scalar,false,Scalar,RowMajor>::run);
-    func[ADJ   | (UP << 2) | (NUNIT << 3)] = (internal::band_solve_triangular_selector<int,Lower|0,       Scalar,Conj, Scalar,RowMajor>::run);
-
-    func[NOTR  | (LO << 2) | (NUNIT << 3)] = (internal::band_solve_triangular_selector<int,Lower|0,       Scalar,false,Scalar,ColMajor>::run);
-    func[TR    | (LO << 2) | (NUNIT << 3)] = (internal::band_solve_triangular_selector<int,Upper|0,       Scalar,false,Scalar,RowMajor>::run);
-    func[ADJ   | (LO << 2) | (NUNIT << 3)] = (internal::band_solve_triangular_selector<int,Upper|0,       Scalar,Conj, Scalar,RowMajor>::run);
-
-    func[NOTR  | (UP << 2) | (UNIT  << 3)] = (internal::band_solve_triangular_selector<int,Upper|UnitDiag,Scalar,false,Scalar,ColMajor>::run);
-    func[TR    | (UP << 2) | (UNIT  << 3)] = (internal::band_solve_triangular_selector<int,Lower|UnitDiag,Scalar,false,Scalar,RowMajor>::run);
-    func[ADJ   | (UP << 2) | (UNIT  << 3)] = (internal::band_solve_triangular_selector<int,Lower|UnitDiag,Scalar,Conj, Scalar,RowMajor>::run);
-
-    func[NOTR  | (LO << 2) | (UNIT  << 3)] = (internal::band_solve_triangular_selector<int,Lower|UnitDiag,Scalar,false,Scalar,ColMajor>::run);
-    func[TR    | (LO << 2) | (UNIT  << 3)] = (internal::band_solve_triangular_selector<int,Upper|UnitDiag,Scalar,false,Scalar,RowMajor>::run);
-    func[ADJ   | (LO << 2) | (UNIT  << 3)] = (internal::band_solve_triangular_selector<int,Upper|UnitDiag,Scalar,Conj, Scalar,RowMajor>::run);
-
-    init = true;
-  }
+  static const functype func[16] = {
+    // array index: NOTR  | (UP << 2) | (NUNIT << 3)
+    (internal::band_solve_triangular_selector<int,Upper|0,       Scalar,false,Scalar,ColMajor>::run),
+    // array index: TR    | (UP << 2) | (NUNIT << 3)
+    (internal::band_solve_triangular_selector<int,Lower|0,       Scalar,false,Scalar,RowMajor>::run),
+    // array index: ADJ   | (UP << 2) | (NUNIT << 3)
+    (internal::band_solve_triangular_selector<int,Lower|0,       Scalar,Conj, Scalar,RowMajor>::run),
+    0,
+    // array index: NOTR  | (LO << 2) | (NUNIT << 3)
+    (internal::band_solve_triangular_selector<int,Lower|0,       Scalar,false,Scalar,ColMajor>::run),
+    // array index: TR    | (LO << 2) | (NUNIT << 3)
+    (internal::band_solve_triangular_selector<int,Upper|0,       Scalar,false,Scalar,RowMajor>::run),
+    // array index: ADJ   | (LO << 2) | (NUNIT << 3)
+    (internal::band_solve_triangular_selector<int,Upper|0,       Scalar,Conj, Scalar,RowMajor>::run),
+    0,
+    // array index: NOTR  | (UP << 2) | (UNIT  << 3)
+    (internal::band_solve_triangular_selector<int,Upper|UnitDiag,Scalar,false,Scalar,ColMajor>::run),
+    // array index: TR    | (UP << 2) | (UNIT  << 3)
+    (internal::band_solve_triangular_selector<int,Lower|UnitDiag,Scalar,false,Scalar,RowMajor>::run),
+    // array index: ADJ   | (UP << 2) | (UNIT  << 3)
+    (internal::band_solve_triangular_selector<int,Lower|UnitDiag,Scalar,Conj, Scalar,RowMajor>::run),
+    0,
+    // array index: NOTR  | (LO << 2) | (UNIT  << 3)
+    (internal::band_solve_triangular_selector<int,Lower|UnitDiag,Scalar,false,Scalar,ColMajor>::run),
+    // array index: TR    | (LO << 2) | (UNIT  << 3)
+    (internal::band_solve_triangular_selector<int,Upper|UnitDiag,Scalar,false,Scalar,RowMajor>::run),
+    // array index: ADJ   | (LO << 2) | (UNIT  << 3)
+    (internal::band_solve_triangular_selector<int,Upper|UnitDiag,Scalar,Conj, Scalar,RowMajor>::run),
+    0,
+  };
 
   Scalar* a = reinterpret_cast<Scalar*>(pa);
   Scalar* x = reinterpret_cast<Scalar*>(px);
   int coeff_rows = *k+1;
-  
+
   int info = 0;
        if(UPLO(*uplo)==INVALID)                                       info = 1;
   else if(OP(*op)==INVALID)                                           info = 2;
@@ -373,22 +395,22 @@ int EIGEN_BLAS_FUNC(tbsv)(char *uplo, char *op, char *diag, int *n, int *k, Real
   else if(*incx==0)                                                   info = 9;
   if(info)
     return xerbla_(SCALAR_SUFFIX_UP"TBSV ",&info,6);
-  
+
   if(*n==0 || (*k==0 && DIAG(*diag)==UNIT))
     return 0;
-  
+
   int actual_n = *n;
- 
+
   Scalar* actual_x = get_compact_vector(x,actual_n,*incx);
-  
+
   int code = OP(*op) | (UPLO(*uplo) << 2) | (DIAG(*diag) << 3);
   if(code>=16 || func[code]==0)
     return 0;
 
   func[code](*n, *k, a, *lda, actual_x);
-  
+
   if(actual_x!=x) delete[] copy_back(actual_x,x,actual_n,*incx);
-  
+
   return 0;
 }
 
@@ -402,32 +424,36 @@ int EIGEN_BLAS_FUNC(tbsv)(char *uplo, char *op, char *diag, int *n, int *k, Real
 int EIGEN_BLAS_FUNC(tpmv)(char *uplo, char *opa, char *diag, int *n, RealScalar *pap, RealScalar *px, int *incx)
 {
   typedef void (*functype)(int, const Scalar*, const Scalar*, Scalar*, Scalar);
-  static functype func[16];
-
-  static bool init = false;
-  if(!init)
-  {
-    for(int k=0; k<16; ++k)
-      func[k] = 0;
-
-    func[NOTR  | (UP << 2) | (NUNIT << 3)] = (internal::packed_triangular_matrix_vector_product<int,Upper|0,       Scalar,false,Scalar,false,ColMajor>::run);
-    func[TR    | (UP << 2) | (NUNIT << 3)] = (internal::packed_triangular_matrix_vector_product<int,Lower|0,       Scalar,false,Scalar,false,RowMajor>::run);
-    func[ADJ   | (UP << 2) | (NUNIT << 3)] = (internal::packed_triangular_matrix_vector_product<int,Lower|0,       Scalar,Conj, Scalar,false,RowMajor>::run);
-
-    func[NOTR  | (LO << 2) | (NUNIT << 3)] = (internal::packed_triangular_matrix_vector_product<int,Lower|0,       Scalar,false,Scalar,false,ColMajor>::run);
-    func[TR    | (LO << 2) | (NUNIT << 3)] = (internal::packed_triangular_matrix_vector_product<int,Upper|0,       Scalar,false,Scalar,false,RowMajor>::run);
-    func[ADJ   | (LO << 2) | (NUNIT << 3)] = (internal::packed_triangular_matrix_vector_product<int,Upper|0,       Scalar,Conj, Scalar,false,RowMajor>::run);
-
-    func[NOTR  | (UP << 2) | (UNIT  << 3)] = (internal::packed_triangular_matrix_vector_product<int,Upper|UnitDiag,Scalar,false,Scalar,false,ColMajor>::run);
-    func[TR    | (UP << 2) | (UNIT  << 3)] = (internal::packed_triangular_matrix_vector_product<int,Lower|UnitDiag,Scalar,false,Scalar,false,RowMajor>::run);
-    func[ADJ   | (UP << 2) | (UNIT  << 3)] = (internal::packed_triangular_matrix_vector_product<int,Lower|UnitDiag,Scalar,Conj, Scalar,false,RowMajor>::run);
-
-    func[NOTR  | (LO << 2) | (UNIT  << 3)] = (internal::packed_triangular_matrix_vector_product<int,Lower|UnitDiag,Scalar,false,Scalar,false,ColMajor>::run);
-    func[TR    | (LO << 2) | (UNIT  << 3)] = (internal::packed_triangular_matrix_vector_product<int,Upper|UnitDiag,Scalar,false,Scalar,false,RowMajor>::run);
-    func[ADJ   | (LO << 2) | (UNIT  << 3)] = (internal::packed_triangular_matrix_vector_product<int,Upper|UnitDiag,Scalar,Conj, Scalar,false,RowMajor>::run);
-
-    init = true;
-  }
+  static const functype func[16] = {
+    // array index: NOTR  | (UP << 2) | (NUNIT << 3)
+    (internal::packed_triangular_matrix_vector_product<int,Upper|0,       Scalar,false,Scalar,false,ColMajor>::run),
+    // array index: TR    | (UP << 2) | (NUNIT << 3)
+    (internal::packed_triangular_matrix_vector_product<int,Lower|0,       Scalar,false,Scalar,false,RowMajor>::run),
+    // array index: ADJ   | (UP << 2) | (NUNIT << 3)
+    (internal::packed_triangular_matrix_vector_product<int,Lower|0,       Scalar,Conj, Scalar,false,RowMajor>::run),
+    0,
+    // array index: NOTR  | (LO << 2) | (NUNIT << 3)
+    (internal::packed_triangular_matrix_vector_product<int,Lower|0,       Scalar,false,Scalar,false,ColMajor>::run),
+    // array index: TR    | (LO << 2) | (NUNIT << 3)
+    (internal::packed_triangular_matrix_vector_product<int,Upper|0,       Scalar,false,Scalar,false,RowMajor>::run),
+    // array index: ADJ   | (LO << 2) | (NUNIT << 3)
+    (internal::packed_triangular_matrix_vector_product<int,Upper|0,       Scalar,Conj, Scalar,false,RowMajor>::run),
+    0,
+    // array index: NOTR  | (UP << 2) | (UNIT  << 3)
+    (internal::packed_triangular_matrix_vector_product<int,Upper|UnitDiag,Scalar,false,Scalar,false,ColMajor>::run),
+    // array index: TR    | (UP << 2) | (UNIT  << 3)
+    (internal::packed_triangular_matrix_vector_product<int,Lower|UnitDiag,Scalar,false,Scalar,false,RowMajor>::run),
+    // array index: ADJ   | (UP << 2) | (UNIT  << 3)
+    (internal::packed_triangular_matrix_vector_product<int,Lower|UnitDiag,Scalar,Conj, Scalar,false,RowMajor>::run),
+    0,
+    // array index: NOTR  | (LO << 2) | (UNIT  << 3)
+    (internal::packed_triangular_matrix_vector_product<int,Lower|UnitDiag,Scalar,false,Scalar,false,ColMajor>::run),
+    // array index: TR    | (LO << 2) | (UNIT  << 3)
+    (internal::packed_triangular_matrix_vector_product<int,Upper|UnitDiag,Scalar,false,Scalar,false,RowMajor>::run),
+    // array index: ADJ   | (LO << 2) | (UNIT  << 3)
+    (internal::packed_triangular_matrix_vector_product<int,Upper|UnitDiag,Scalar,Conj, Scalar,false,RowMajor>::run),
+    0
+  };
 
   Scalar* ap = reinterpret_cast<Scalar*>(pap);
   Scalar* x = reinterpret_cast<Scalar*>(px);
@@ -473,32 +499,36 @@ int EIGEN_BLAS_FUNC(tpmv)(char *uplo, char *opa, char *diag, int *n, RealScalar
 int EIGEN_BLAS_FUNC(tpsv)(char *uplo, char *opa, char *diag, int *n, RealScalar *pap, RealScalar *px, int *incx)
 {
   typedef void (*functype)(int, const Scalar*, Scalar*);
-  static functype func[16];
-
-  static bool init = false;
-  if(!init)
-  {
-    for(int k=0; k<16; ++k)
-      func[k] = 0;
-
-    func[NOTR  | (UP << 2) | (NUNIT << 3)] = (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|0,       false,ColMajor>::run);
-    func[TR    | (UP << 2) | (NUNIT << 3)] = (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|0,       false,RowMajor>::run);
-    func[ADJ   | (UP << 2) | (NUNIT << 3)] = (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|0,       Conj, RowMajor>::run);
-
-    func[NOTR  | (LO << 2) | (NUNIT << 3)] = (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|0,       false,ColMajor>::run);
-    func[TR    | (LO << 2) | (NUNIT << 3)] = (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|0,       false,RowMajor>::run);
-    func[ADJ   | (LO << 2) | (NUNIT << 3)] = (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|0,       Conj, RowMajor>::run);
-
-    func[NOTR  | (UP << 2) | (UNIT  << 3)] = (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|UnitDiag,false,ColMajor>::run);
-    func[TR    | (UP << 2) | (UNIT  << 3)] = (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|UnitDiag,false,RowMajor>::run);
-    func[ADJ   | (UP << 2) | (UNIT  << 3)] = (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|UnitDiag,Conj, RowMajor>::run);
-
-    func[NOTR  | (LO << 2) | (UNIT  << 3)] = (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|UnitDiag,false,ColMajor>::run);
-    func[TR    | (LO << 2) | (UNIT  << 3)] = (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|UnitDiag,false,RowMajor>::run);
-    func[ADJ   | (LO << 2) | (UNIT  << 3)] = (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|UnitDiag,Conj, RowMajor>::run);
-
-    init = true;
-  }
+  static const functype func[16] = {
+    // array index: NOTR  | (UP << 2) | (NUNIT << 3)
+    (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|0,       false,ColMajor>::run),
+    // array index: TR    | (UP << 2) | (NUNIT << 3)
+    (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|0,       false,RowMajor>::run),
+    // array index: ADJ   | (UP << 2) | (NUNIT << 3)
+    (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|0,       Conj, RowMajor>::run),
+    0,
+    // array index: NOTR  | (LO << 2) | (NUNIT << 3)
+    (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|0,       false,ColMajor>::run),
+    // array index: TR    | (LO << 2) | (NUNIT << 3)
+    (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|0,       false,RowMajor>::run),
+    // array index: ADJ   | (LO << 2) | (NUNIT << 3)
+    (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|0,       Conj, RowMajor>::run),
+    0,
+    // array index: NOTR  | (UP << 2) | (UNIT  << 3)
+    (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|UnitDiag,false,ColMajor>::run),
+    // array index: TR    | (UP << 2) | (UNIT  << 3)
+    (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|UnitDiag,false,RowMajor>::run),
+    // array index: ADJ   | (UP << 2) | (UNIT  << 3)
+    (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|UnitDiag,Conj, RowMajor>::run),
+    0,
+    // array index: NOTR  | (LO << 2) | (UNIT  << 3)
+    (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|UnitDiag,false,ColMajor>::run),
+    // array index: TR    | (LO << 2) | (UNIT  << 3)
+    (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|UnitDiag,false,RowMajor>::run),
+    // array index: ADJ   | (LO << 2) | (UNIT  << 3)
+    (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|UnitDiag,Conj, RowMajor>::run),
+    0
+  };
 
   Scalar* ap = reinterpret_cast<Scalar*>(pap);
   Scalar* x = reinterpret_cast<Scalar*>(px);
@@ -521,4 +551,3 @@ int EIGEN_BLAS_FUNC(tpsv)(char *uplo, char *opa, char *diag, int *n, RealScalar
 
   return 1;
 }
-
diff --git a/blas/level2_real_impl.h b/blas/level2_real_impl.h
index 8d56eaaa1..7620f0a38 100644
--- a/blas/level2_real_impl.h
+++ b/blas/level2_real_impl.h
@@ -10,28 +10,22 @@
 #include "common.h"
 
 // y = alpha*A*x + beta*y
-int EIGEN_BLAS_FUNC(symv) (char *uplo, int *n, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *px, int *incx, RealScalar *pbeta, RealScalar *py, int *incy)
+int EIGEN_BLAS_FUNC(symv) (const char *uplo, const int *n, const RealScalar *palpha, const RealScalar *pa, const int *lda,
+                           const RealScalar *px, const int *incx, const RealScalar *pbeta, RealScalar *py, const int *incy)
 {
-  typedef void (*functype)(int, const Scalar*, int, const Scalar*, int, Scalar*, Scalar);
-  static functype func[2];
-
-  static bool init = false;
-  if(!init)
-  {
-    for(int k=0; k<2; ++k)
-      func[k] = 0;
-
-    func[UP] = (internal::selfadjoint_matrix_vector_product<Scalar,int,ColMajor,Upper,false,false>::run);
-    func[LO] = (internal::selfadjoint_matrix_vector_product<Scalar,int,ColMajor,Lower,false,false>::run);
-
-    init = true;
-  }
-
-  Scalar* a = reinterpret_cast<Scalar*>(pa);
-  Scalar* x = reinterpret_cast<Scalar*>(px);
+  typedef void (*functype)(int, const Scalar*, int, const Scalar*, Scalar*, Scalar);
+  static const functype func[2] = {
+    // array index: UP
+    (internal::selfadjoint_matrix_vector_product<Scalar,int,ColMajor,Upper,false,false>::run),
+    // array index: LO
+    (internal::selfadjoint_matrix_vector_product<Scalar,int,ColMajor,Lower,false,false>::run),
+  };
+
+  const Scalar* a = reinterpret_cast<const Scalar*>(pa);
+  const Scalar* x = reinterpret_cast<const Scalar*>(px);
   Scalar* y = reinterpret_cast<Scalar*>(py);
-  Scalar alpha  = *reinterpret_cast<Scalar*>(palpha);
-  Scalar beta   = *reinterpret_cast<Scalar*>(pbeta);
+  Scalar alpha  = *reinterpret_cast<const Scalar*>(palpha);
+  Scalar beta   = *reinterpret_cast<const Scalar*>(pbeta);
 
   // check arguments
   int info = 0;
@@ -46,20 +40,20 @@ int EIGEN_BLAS_FUNC(symv) (char *uplo, int *n, RealScalar *palpha, RealScalar *p
   if(*n==0)
     return 0;
 
-  Scalar* actual_x = get_compact_vector(x,*n,*incx);
+  const Scalar* actual_x = get_compact_vector(x,*n,*incx);
   Scalar* actual_y = get_compact_vector(y,*n,*incy);
 
   if(beta!=Scalar(1))
   {
-    if(beta==Scalar(0)) vector(actual_y, *n).setZero();
-    else                vector(actual_y, *n) *= beta;
+    if(beta==Scalar(0)) make_vector(actual_y, *n).setZero();
+    else                make_vector(actual_y, *n) *= beta;
   }
 
   int code = UPLO(*uplo);
   if(code>=2 || func[code]==0)
     return 0;
 
-  func[code](*n, a, *lda, actual_x, 1, actual_y, alpha);
+  func[code](*n, a, *lda, actual_x, actual_y, alpha);
 
   if(actual_x!=x) delete[] actual_x;
   if(actual_y!=y) delete[] copy_back(actual_y,y,*n,*incy);
@@ -68,41 +62,20 @@ int EIGEN_BLAS_FUNC(symv) (char *uplo, int *n, RealScalar *palpha, RealScalar *p
 }
 
 // C := alpha*x*x' + C
-int EIGEN_BLAS_FUNC(syr)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *pc, int *ldc)
+int EIGEN_BLAS_FUNC(syr)(const char *uplo, const int *n, const RealScalar *palpha, const RealScalar *px, const int *incx, RealScalar *pc, const int *ldc)
 {
 
-//   typedef void (*functype)(int, const Scalar *, int, Scalar *, int, Scalar);
-//   static functype func[2];
-
-//   static bool init = false;
-//   if(!init)
-//   {
-//     for(int k=0; k<2; ++k)
-//       func[k] = 0;
-//
-//     func[UP] = (internal::selfadjoint_product<Scalar,ColMajor,ColMajor,false,UpperTriangular>::run);
-//     func[LO] = (internal::selfadjoint_product<Scalar,ColMajor,ColMajor,false,LowerTriangular>::run);
-
-//     init = true;
-//   }
   typedef void (*functype)(int, Scalar*, int, const Scalar*, const Scalar*, const Scalar&);
-  static functype func[2];
-
-  static bool init = false;
-  if(!init)
-  {
-    for(int k=0; k<2; ++k)
-      func[k] = 0;
-
-    func[UP] = (selfadjoint_rank1_update<Scalar,int,ColMajor,Upper,false,Conj>::run);
-    func[LO] = (selfadjoint_rank1_update<Scalar,int,ColMajor,Lower,false,Conj>::run);
-
-    init = true;
-  }
-
-  Scalar* x = reinterpret_cast<Scalar*>(px);
+  static const functype func[2] = {
+    // array index: UP
+    (selfadjoint_rank1_update<Scalar,int,ColMajor,Upper,false,Conj>::run),
+    // array index: LO
+    (selfadjoint_rank1_update<Scalar,int,ColMajor,Lower,false,Conj>::run),
+  };
+
+  const Scalar* x = reinterpret_cast<const Scalar*>(px);
   Scalar* c = reinterpret_cast<Scalar*>(pc);
-  Scalar alpha = *reinterpret_cast<Scalar*>(palpha);
+  Scalar alpha = *reinterpret_cast<const Scalar*>(palpha);
 
   int info = 0;
   if(UPLO(*uplo)==INVALID)                                            info = 1;
@@ -115,7 +88,7 @@ int EIGEN_BLAS_FUNC(syr)(char *uplo, int *n, RealScalar *palpha, RealScalar *px,
   if(*n==0 || alpha==Scalar(0)) return 1;
 
   // if the increment is not 1, let's copy it to a temporary vector to enable vectorization
-  Scalar* x_cpy = get_compact_vector(x,*n,*incx);
+  const Scalar* x_cpy = get_compact_vector(x,*n,*incx);
 
   int code = UPLO(*uplo);
   if(code>=2 || func[code]==0)
@@ -129,41 +102,20 @@ int EIGEN_BLAS_FUNC(syr)(char *uplo, int *n, RealScalar *palpha, RealScalar *px,
 }
 
 // C := alpha*x*y' + alpha*y*x' + C
-int EIGEN_BLAS_FUNC(syr2)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar *pc, int *ldc)
+int EIGEN_BLAS_FUNC(syr2)(const char *uplo, const int *n, const RealScalar *palpha, const RealScalar *px, const int *incx, const RealScalar *py, const int *incy, RealScalar *pc, const int *ldc)
 {
-//   typedef void (*functype)(int, const Scalar *, int, const Scalar *, int, Scalar *, int, Scalar);
-//   static functype func[2];
-//
-//   static bool init = false;
-//   if(!init)
-//   {
-//     for(int k=0; k<2; ++k)
-//       func[k] = 0;
-//
-//     func[UP] = (internal::selfadjoint_product<Scalar,ColMajor,ColMajor,false,UpperTriangular>::run);
-//     func[LO] = (internal::selfadjoint_product<Scalar,ColMajor,ColMajor,false,LowerTriangular>::run);
-//
-//     init = true;
-//   }
   typedef void (*functype)(int, Scalar*, int, const Scalar*, const Scalar*, Scalar);
-  static functype func[2];
-
-  static bool init = false;
-  if(!init)
-  {
-    for(int k=0; k<2; ++k)
-      func[k] = 0;
-
-    func[UP] = (internal::rank2_update_selector<Scalar,int,Upper>::run);
-    func[LO] = (internal::rank2_update_selector<Scalar,int,Lower>::run);
-
-    init = true;
-  }
-
-  Scalar* x = reinterpret_cast<Scalar*>(px);
-  Scalar* y = reinterpret_cast<Scalar*>(py);
+  static const functype func[2] = {
+    // array index: UP
+    (internal::rank2_update_selector<Scalar,int,Upper>::run),
+    // array index: LO
+    (internal::rank2_update_selector<Scalar,int,Lower>::run),
+  };
+
+  const Scalar* x = reinterpret_cast<const Scalar*>(px);
+  const Scalar* y = reinterpret_cast<const Scalar*>(py);
   Scalar* c = reinterpret_cast<Scalar*>(pc);
-  Scalar alpha = *reinterpret_cast<Scalar*>(palpha);
+  Scalar alpha = *reinterpret_cast<const Scalar*>(palpha);
 
   int info = 0;
   if(UPLO(*uplo)==INVALID)                                            info = 1;
@@ -177,9 +129,9 @@ int EIGEN_BLAS_FUNC(syr2)(char *uplo, int *n, RealScalar *palpha, RealScalar *px
   if(alpha==Scalar(0))
     return 1;
 
-  Scalar* x_cpy = get_compact_vector(x,*n,*incx);
-  Scalar* y_cpy = get_compact_vector(y,*n,*incy);
-  
+  const Scalar* x_cpy = get_compact_vector(x,*n,*incx);
+  const Scalar* y_cpy = get_compact_vector(y,*n,*incy);
+
   int code = UPLO(*uplo);
   if(code>=2 || func[code]==0)
     return 0;
@@ -234,19 +186,12 @@ int EIGEN_BLAS_FUNC(syr2)(char *uplo, int *n, RealScalar *palpha, RealScalar *px
 int EIGEN_BLAS_FUNC(spr)(char *uplo, int *n, Scalar *palpha, Scalar *px, int *incx, Scalar *pap)
 {
   typedef void (*functype)(int, Scalar*, const Scalar*, Scalar);
-  static functype func[2];
-
-  static bool init = false;
-  if(!init)
-  {
-    for(int k=0; k<2; ++k)
-      func[k] = 0;
-
-    func[UP] = (internal::selfadjoint_packed_rank1_update<Scalar,int,ColMajor,Upper,false,false>::run);
-    func[LO] = (internal::selfadjoint_packed_rank1_update<Scalar,int,ColMajor,Lower,false,false>::run);
-
-    init = true;
-  }
+  static const functype func[2] = {
+    // array index: UP
+    (internal::selfadjoint_packed_rank1_update<Scalar,int,ColMajor,Upper,false,false>::run),
+    // array index: LO
+    (internal::selfadjoint_packed_rank1_update<Scalar,int,ColMajor,Lower,false,false>::run),
+  };
 
   Scalar* x = reinterpret_cast<Scalar*>(px);
   Scalar* ap = reinterpret_cast<Scalar*>(pap);
@@ -285,19 +230,12 @@ int EIGEN_BLAS_FUNC(spr)(char *uplo, int *n, Scalar *palpha, Scalar *px, int *in
 int EIGEN_BLAS_FUNC(spr2)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar *pap)
 {
   typedef void (*functype)(int, Scalar*, const Scalar*, const Scalar*, Scalar);
-  static functype func[2];
-
-  static bool init = false;
-  if(!init)
-  {
-    for(int k=0; k<2; ++k)
-      func[k] = 0;
-
-    func[UP] = (internal::packed_rank2_update_selector<Scalar,int,Upper>::run);
-    func[LO] = (internal::packed_rank2_update_selector<Scalar,int,Lower>::run);
-
-    init = true;
-  }
+  static const functype func[2] = {
+    // array index: UP
+    (internal::packed_rank2_update_selector<Scalar,int,Upper>::run),
+    // array index: LO
+    (internal::packed_rank2_update_selector<Scalar,int,Lower>::run),
+  };
 
   Scalar* x = reinterpret_cast<Scalar*>(px);
   Scalar* y = reinterpret_cast<Scalar*>(py);
@@ -366,5 +304,3 @@ int EIGEN_BLAS_FUNC(ger)(int *m, int *n, Scalar *palpha, Scalar *px, int *incx,
 
   return 1;
 }
-
-
diff --git a/blas/level3_impl.h b/blas/level3_impl.h
index 07dbc22ff..6c802cd5f 100644
--- a/blas/level3_impl.h
+++ b/blas/level3_impl.h
@@ -6,37 +6,43 @@
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
+#include <iostream>
 #include "common.h"
 
-int EIGEN_BLAS_FUNC(gemm)(char *opa, char *opb, int *m, int *n, int *k, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pb, int *ldb, RealScalar *pbeta, RealScalar *pc, int *ldc)
+int EIGEN_BLAS_FUNC(gemm)(const char *opa, const char *opb, const int *m, const int *n, const int *k, const RealScalar *palpha,
+                          const RealScalar *pa, const int *lda, const RealScalar *pb, const int *ldb, const RealScalar *pbeta, RealScalar *pc, const int *ldc)
 {
 //   std::cerr << "in gemm " << *opa << " " << *opb << " " << *m << " " << *n << " " << *k << " " << *lda << " " << *ldb << " " << *ldc << " " << *palpha << " " << *pbeta << "\n";
   typedef void (*functype)(DenseIndex, DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, Scalar, internal::level3_blocking<Scalar,Scalar>&, Eigen::internal::GemmParallelInfo<DenseIndex>*);
-  static functype func[12];
-
-  static bool init = false;
-  if(!init)
-  {
-    for(int k=0; k<12; ++k)
-      func[k] = 0;
-    func[NOTR  | (NOTR << 2)] = (internal::general_matrix_matrix_product<DenseIndex,Scalar,ColMajor,false,Scalar,ColMajor,false,ColMajor>::run);
-    func[TR    | (NOTR << 2)] = (internal::general_matrix_matrix_product<DenseIndex,Scalar,RowMajor,false,Scalar,ColMajor,false,ColMajor>::run);
-    func[ADJ   | (NOTR << 2)] = (internal::general_matrix_matrix_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,ColMajor,false,ColMajor>::run);
-    func[NOTR  | (TR   << 2)] = (internal::general_matrix_matrix_product<DenseIndex,Scalar,ColMajor,false,Scalar,RowMajor,false,ColMajor>::run);
-    func[TR    | (TR   << 2)] = (internal::general_matrix_matrix_product<DenseIndex,Scalar,RowMajor,false,Scalar,RowMajor,false,ColMajor>::run);
-    func[ADJ   | (TR   << 2)] = (internal::general_matrix_matrix_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,RowMajor,false,ColMajor>::run);
-    func[NOTR  | (ADJ  << 2)] = (internal::general_matrix_matrix_product<DenseIndex,Scalar,ColMajor,false,Scalar,RowMajor,Conj, ColMajor>::run);
-    func[TR    | (ADJ  << 2)] = (internal::general_matrix_matrix_product<DenseIndex,Scalar,RowMajor,false,Scalar,RowMajor,Conj, ColMajor>::run);
-    func[ADJ   | (ADJ  << 2)] = (internal::general_matrix_matrix_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,RowMajor,Conj, ColMajor>::run);
-    init = true;
-  }
-
-  Scalar* a = reinterpret_cast<Scalar*>(pa);
-  Scalar* b = reinterpret_cast<Scalar*>(pb);
+  static const functype func[12] = {
+    // array index: NOTR  | (NOTR << 2)
+    (internal::general_matrix_matrix_product<DenseIndex,Scalar,ColMajor,false,Scalar,ColMajor,false,ColMajor>::run),
+    // array index: TR    | (NOTR << 2)
+    (internal::general_matrix_matrix_product<DenseIndex,Scalar,RowMajor,false,Scalar,ColMajor,false,ColMajor>::run),
+    // array index: ADJ   | (NOTR << 2)
+    (internal::general_matrix_matrix_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,ColMajor,false,ColMajor>::run),
+    0,
+    // array index: NOTR  | (TR   << 2)
+    (internal::general_matrix_matrix_product<DenseIndex,Scalar,ColMajor,false,Scalar,RowMajor,false,ColMajor>::run),
+    // array index: TR    | (TR   << 2)
+    (internal::general_matrix_matrix_product<DenseIndex,Scalar,RowMajor,false,Scalar,RowMajor,false,ColMajor>::run),
+    // array index: ADJ   | (TR   << 2)
+    (internal::general_matrix_matrix_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,RowMajor,false,ColMajor>::run),
+    0,
+    // array index: NOTR  | (ADJ  << 2)
+    (internal::general_matrix_matrix_product<DenseIndex,Scalar,ColMajor,false,Scalar,RowMajor,Conj, ColMajor>::run),
+    // array index: TR    | (ADJ  << 2)
+    (internal::general_matrix_matrix_product<DenseIndex,Scalar,RowMajor,false,Scalar,RowMajor,Conj, ColMajor>::run),
+    // array index: ADJ   | (ADJ  << 2)
+    (internal::general_matrix_matrix_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,RowMajor,Conj, ColMajor>::run),
+    0
+  };
+
+  const Scalar* a = reinterpret_cast<const Scalar*>(pa);
+  const Scalar* b = reinterpret_cast<const Scalar*>(pb);
   Scalar* c = reinterpret_cast<Scalar*>(pc);
-  Scalar alpha  = *reinterpret_cast<Scalar*>(palpha);
-  Scalar beta   = *reinterpret_cast<Scalar*>(pbeta);
+  Scalar alpha  = *reinterpret_cast<const Scalar*>(palpha);
+  Scalar beta   = *reinterpret_cast<const Scalar*>(pbeta);
 
   int info = 0;
   if(OP(*opa)==INVALID)                                               info = 1;
@@ -50,70 +56,92 @@ int EIGEN_BLAS_FUNC(gemm)(char *opa, char *opb, int *m, int *n, int *k, RealScal
   if(info)
     return xerbla_(SCALAR_SUFFIX_UP"GEMM ",&info,6);
 
+  if (*m == 0 || *n == 0)
+    return 0;
+
   if(beta!=Scalar(1))
   {
     if(beta==Scalar(0)) matrix(c, *m, *n, *ldc).setZero();
     else                matrix(c, *m, *n, *ldc) *= beta;
   }
 
-  internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic> blocking(*m,*n,*k);
+  if(*k == 0)
+    return 0;
+
+  internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic> blocking(*m,*n,*k,1,true);
 
   int code = OP(*opa) | (OP(*opb) << 2);
   func[code](*m, *n, *k, a, *lda, b, *ldb, c, *ldc, alpha, blocking, 0);
   return 0;
 }
 
-int EIGEN_BLAS_FUNC(trsm)(char *side, char *uplo, char *opa, char *diag, int *m, int *n, RealScalar *palpha,  RealScalar *pa, int *lda, RealScalar *pb, int *ldb)
+int EIGEN_BLAS_FUNC(trsm)(const char *side, const char *uplo, const char *opa, const char *diag, const int *m, const int *n,
+                          const RealScalar *palpha,  const RealScalar *pa, const int *lda, RealScalar *pb, const int *ldb)
 {
 //   std::cerr << "in trsm " << *side << " " << *uplo << " " << *opa << " " << *diag << " " << *m << "," << *n << " " << *palpha << " " << *lda << " " << *ldb<< "\n";
   typedef void (*functype)(DenseIndex, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, internal::level3_blocking<Scalar,Scalar>&);
-  static functype func[32];
-
-  static bool init = false;
-  if(!init)
-  {
-    for(int k=0; k<32; ++k)
-      func[k] = 0;
-
-    func[NOTR  | (LEFT  << 2) | (UP << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Upper|0,          false,ColMajor,ColMajor>::run);
-    func[TR    | (LEFT  << 2) | (UP << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Lower|0,          false,RowMajor,ColMajor>::run);
-    func[ADJ   | (LEFT  << 2) | (UP << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Lower|0,          Conj, RowMajor,ColMajor>::run);
-
-    func[NOTR  | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Upper|0,          false,ColMajor,ColMajor>::run);
-    func[TR    | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Lower|0,          false,RowMajor,ColMajor>::run);
-    func[ADJ   | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Lower|0,          Conj, RowMajor,ColMajor>::run);
-
-    func[NOTR  | (LEFT  << 2) | (LO << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Lower|0,          false,ColMajor,ColMajor>::run);
-    func[TR    | (LEFT  << 2) | (LO << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Upper|0,          false,RowMajor,ColMajor>::run);
-    func[ADJ   | (LEFT  << 2) | (LO << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Upper|0,          Conj, RowMajor,ColMajor>::run);
-
-    func[NOTR  | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Lower|0,          false,ColMajor,ColMajor>::run);
-    func[TR    | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Upper|0,          false,RowMajor,ColMajor>::run);
-    func[ADJ   | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Upper|0,          Conj, RowMajor,ColMajor>::run);
-
-
-    func[NOTR  | (LEFT  << 2) | (UP << 3) | (UNIT  << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Upper|UnitDiag,false,ColMajor,ColMajor>::run);
-    func[TR    | (LEFT  << 2) | (UP << 3) | (UNIT  << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Lower|UnitDiag,false,RowMajor,ColMajor>::run);
-    func[ADJ   | (LEFT  << 2) | (UP << 3) | (UNIT  << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Lower|UnitDiag,Conj, RowMajor,ColMajor>::run);
-
-    func[NOTR  | (RIGHT << 2) | (UP << 3) | (UNIT  << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Upper|UnitDiag,false,ColMajor,ColMajor>::run);
-    func[TR    | (RIGHT << 2) | (UP << 3) | (UNIT  << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Lower|UnitDiag,false,RowMajor,ColMajor>::run);
-    func[ADJ   | (RIGHT << 2) | (UP << 3) | (UNIT  << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Lower|UnitDiag,Conj, RowMajor,ColMajor>::run);
-
-    func[NOTR  | (LEFT  << 2) | (LO << 3) | (UNIT  << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Lower|UnitDiag,false,ColMajor,ColMajor>::run);
-    func[TR    | (LEFT  << 2) | (LO << 3) | (UNIT  << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Upper|UnitDiag,false,RowMajor,ColMajor>::run);
-    func[ADJ   | (LEFT  << 2) | (LO << 3) | (UNIT  << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Upper|UnitDiag,Conj, RowMajor,ColMajor>::run);
-
-    func[NOTR  | (RIGHT << 2) | (LO << 3) | (UNIT  << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Lower|UnitDiag,false,ColMajor,ColMajor>::run);
-    func[TR    | (RIGHT << 2) | (LO << 3) | (UNIT  << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Upper|UnitDiag,false,RowMajor,ColMajor>::run);
-    func[ADJ   | (RIGHT << 2) | (LO << 3) | (UNIT  << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Upper|UnitDiag,Conj, RowMajor,ColMajor>::run);
-
-    init = true;
-  }
-
-  Scalar* a = reinterpret_cast<Scalar*>(pa);
+  static const functype func[32] = {
+    // array index: NOTR  | (LEFT  << 2) | (UP << 3) | (NUNIT << 4)
+    (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Upper|0,          false,ColMajor,ColMajor>::run),
+    // array index: TR    | (LEFT  << 2) | (UP << 3) | (NUNIT << 4)
+    (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Lower|0,          false,RowMajor,ColMajor>::run),
+    // array index: ADJ   | (LEFT  << 2) | (UP << 3) | (NUNIT << 4)
+    (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Lower|0,          Conj, RowMajor,ColMajor>::run),\
+    0,
+    // array index: NOTR  | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)
+    (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Upper|0,          false,ColMajor,ColMajor>::run),
+    // array index: TR    | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)
+    (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Lower|0,          false,RowMajor,ColMajor>::run),
+    // array index: ADJ   | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)
+    (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Lower|0,          Conj, RowMajor,ColMajor>::run),
+    0,
+    // array index: NOTR  | (LEFT  << 2) | (LO << 3) | (NUNIT << 4)
+    (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Lower|0,          false,ColMajor,ColMajor>::run),
+    // array index: TR    | (LEFT  << 2) | (LO << 3) | (NUNIT << 4)
+    (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Upper|0,          false,RowMajor,ColMajor>::run),
+    // array index: ADJ   | (LEFT  << 2) | (LO << 3) | (NUNIT << 4)
+    (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Upper|0,          Conj, RowMajor,ColMajor>::run),
+    0,
+    // array index: NOTR  | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)
+    (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Lower|0,          false,ColMajor,ColMajor>::run),
+    // array index: TR    | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)
+    (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Upper|0,          false,RowMajor,ColMajor>::run),
+    // array index: ADJ   | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)
+    (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Upper|0,          Conj, RowMajor,ColMajor>::run),
+    0,
+    // array index: NOTR  | (LEFT  << 2) | (UP << 3) | (UNIT  << 4)
+    (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Upper|UnitDiag,false,ColMajor,ColMajor>::run),
+    // array index: TR    | (LEFT  << 2) | (UP << 3) | (UNIT  << 4)
+    (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Lower|UnitDiag,false,RowMajor,ColMajor>::run),
+    // array index: ADJ   | (LEFT  << 2) | (UP << 3) | (UNIT  << 4)
+    (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Lower|UnitDiag,Conj, RowMajor,ColMajor>::run),
+    0,
+    // array index: NOTR  | (RIGHT << 2) | (UP << 3) | (UNIT  << 4)
+    (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Upper|UnitDiag,false,ColMajor,ColMajor>::run),
+    // array index: TR    | (RIGHT << 2) | (UP << 3) | (UNIT  << 4)
+    (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Lower|UnitDiag,false,RowMajor,ColMajor>::run),
+    // array index: ADJ   | (RIGHT << 2) | (UP << 3) | (UNIT  << 4)
+    (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Lower|UnitDiag,Conj, RowMajor,ColMajor>::run),
+    0,
+    // array index: NOTR  | (LEFT  << 2) | (LO << 3) | (UNIT  << 4)
+    (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Lower|UnitDiag,false,ColMajor,ColMajor>::run),
+    // array index: TR    | (LEFT  << 2) | (LO << 3) | (UNIT  << 4)
+    (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Upper|UnitDiag,false,RowMajor,ColMajor>::run),
+    // array index: ADJ   | (LEFT  << 2) | (LO << 3) | (UNIT  << 4)
+    (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Upper|UnitDiag,Conj, RowMajor,ColMajor>::run),
+    0,
+    // array index: NOTR  | (RIGHT << 2) | (LO << 3) | (UNIT  << 4)
+    (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Lower|UnitDiag,false,ColMajor,ColMajor>::run),
+    // array index: TR    | (RIGHT << 2) | (LO << 3) | (UNIT  << 4)
+    (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Upper|UnitDiag,false,RowMajor,ColMajor>::run),
+    // array index: ADJ   | (RIGHT << 2) | (LO << 3) | (UNIT  << 4)
+    (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Upper|UnitDiag,Conj, RowMajor,ColMajor>::run),
+    0
+  };
+
+  const Scalar* a = reinterpret_cast<const Scalar*>(pa);
   Scalar* b = reinterpret_cast<Scalar*>(pb);
-  Scalar  alpha = *reinterpret_cast<Scalar*>(palpha);
+  Scalar  alpha = *reinterpret_cast<const Scalar*>(palpha);
 
   int info = 0;
   if(SIDE(*side)==INVALID)                                            info = 1;
@@ -127,16 +155,19 @@ int EIGEN_BLAS_FUNC(trsm)(char *side, char *uplo, char *opa, char *diag, int *m,
   if(info)
     return xerbla_(SCALAR_SUFFIX_UP"TRSM ",&info,6);
 
+  if(*m==0 || *n==0)
+    return 0;
+
   int code = OP(*opa) | (SIDE(*side) << 2) | (UPLO(*uplo) << 3) | (DIAG(*diag) << 4);
-  
+
   if(SIDE(*side)==LEFT)
   {
-    internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*m);
+    internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*m,1,false);
     func[code](*m, *n, a, *lda, b, *ldb, blocking);
   }
   else
   {
-    internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*n);
+    internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*n,1,false);
     func[code](*n, *m, a, *lda, b, *ldb, blocking);
   }
 
@@ -149,55 +180,73 @@ int EIGEN_BLAS_FUNC(trsm)(char *side, char *uplo, char *opa, char *diag, int *m,
 
 // b = alpha*op(a)*b  for side = 'L'or'l'
 // b = alpha*b*op(a)  for side = 'R'or'r'
-int EIGEN_BLAS_FUNC(trmm)(char *side, char *uplo, char *opa, char *diag, int *m, int *n, RealScalar *palpha,  RealScalar *pa, int *lda, RealScalar *pb, int *ldb)
+int EIGEN_BLAS_FUNC(trmm)(const char *side, const char *uplo, const char *opa, const char *diag, const int *m, const int *n,
+                          const RealScalar *palpha, const RealScalar *pa, const int *lda, RealScalar *pb, const int *ldb)
 {
 //   std::cerr << "in trmm " << *side << " " << *uplo << " " << *opa << " " << *diag << " " << *m << " " << *n << " " << *lda << " " << *ldb << " " << *palpha << "\n";
   typedef void (*functype)(DenseIndex, DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, const Scalar&, internal::level3_blocking<Scalar,Scalar>&);
-  static functype func[32];
-  static bool init = false;
-  if(!init)
-  {
-    for(int k=0; k<32; ++k)
-      func[k] = 0;
-
-    func[NOTR  | (LEFT  << 2) | (UP << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|0,          true, ColMajor,false,ColMajor,false,ColMajor>::run);
-    func[TR    | (LEFT  << 2) | (UP << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|0,          true, RowMajor,false,ColMajor,false,ColMajor>::run);
-    func[ADJ   | (LEFT  << 2) | (UP << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|0,          true, RowMajor,Conj, ColMajor,false,ColMajor>::run);
-
-    func[NOTR  | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|0,          false,ColMajor,false,ColMajor,false,ColMajor>::run);
-    func[TR    | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|0,          false,ColMajor,false,RowMajor,false,ColMajor>::run);
-    func[ADJ   | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|0,          false,ColMajor,false,RowMajor,Conj, ColMajor>::run);
-
-    func[NOTR  | (LEFT  << 2) | (LO << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|0,          true, ColMajor,false,ColMajor,false,ColMajor>::run);
-    func[TR    | (LEFT  << 2) | (LO << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|0,          true, RowMajor,false,ColMajor,false,ColMajor>::run);
-    func[ADJ   | (LEFT  << 2) | (LO << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|0,          true, RowMajor,Conj, ColMajor,false,ColMajor>::run);
-
-    func[NOTR  | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|0,          false,ColMajor,false,ColMajor,false,ColMajor>::run);
-    func[TR    | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|0,          false,ColMajor,false,RowMajor,false,ColMajor>::run);
-    func[ADJ   | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|0,          false,ColMajor,false,RowMajor,Conj, ColMajor>::run);
-
-    func[NOTR  | (LEFT  << 2) | (UP << 3) | (UNIT  << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|UnitDiag,true, ColMajor,false,ColMajor,false,ColMajor>::run);
-    func[TR    | (LEFT  << 2) | (UP << 3) | (UNIT  << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|UnitDiag,true, RowMajor,false,ColMajor,false,ColMajor>::run);
-    func[ADJ   | (LEFT  << 2) | (UP << 3) | (UNIT  << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|UnitDiag,true, RowMajor,Conj, ColMajor,false,ColMajor>::run);
-
-    func[NOTR  | (RIGHT << 2) | (UP << 3) | (UNIT  << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|UnitDiag,false,ColMajor,false,ColMajor,false,ColMajor>::run);
-    func[TR    | (RIGHT << 2) | (UP << 3) | (UNIT  << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|UnitDiag,false,ColMajor,false,RowMajor,false,ColMajor>::run);
-    func[ADJ   | (RIGHT << 2) | (UP << 3) | (UNIT  << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|UnitDiag,false,ColMajor,false,RowMajor,Conj, ColMajor>::run);
-
-    func[NOTR  | (LEFT  << 2) | (LO << 3) | (UNIT  << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|UnitDiag,true, ColMajor,false,ColMajor,false,ColMajor>::run);
-    func[TR    | (LEFT  << 2) | (LO << 3) | (UNIT  << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|UnitDiag,true, RowMajor,false,ColMajor,false,ColMajor>::run);
-    func[ADJ   | (LEFT  << 2) | (LO << 3) | (UNIT  << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|UnitDiag,true, RowMajor,Conj, ColMajor,false,ColMajor>::run);
-
-    func[NOTR  | (RIGHT << 2) | (LO << 3) | (UNIT  << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|UnitDiag,false,ColMajor,false,ColMajor,false,ColMajor>::run);
-    func[TR    | (RIGHT << 2) | (LO << 3) | (UNIT  << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|UnitDiag,false,ColMajor,false,RowMajor,false,ColMajor>::run);
-    func[ADJ   | (RIGHT << 2) | (LO << 3) | (UNIT  << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|UnitDiag,false,ColMajor,false,RowMajor,Conj, ColMajor>::run);
-
-    init = true;
-  }
-
-  Scalar* a = reinterpret_cast<Scalar*>(pa);
+  static const functype func[32] = {
+    // array index: NOTR  | (LEFT  << 2) | (UP << 3) | (NUNIT << 4)
+    (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|0,          true, ColMajor,false,ColMajor,false,ColMajor>::run),
+    // array index: TR    | (LEFT  << 2) | (UP << 3) | (NUNIT << 4)
+    (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|0,          true, RowMajor,false,ColMajor,false,ColMajor>::run),
+    // array index: ADJ   | (LEFT  << 2) | (UP << 3) | (NUNIT << 4)
+    (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|0,          true, RowMajor,Conj, ColMajor,false,ColMajor>::run),
+    0,
+    // array index: NOTR  | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)
+    (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|0,          false,ColMajor,false,ColMajor,false,ColMajor>::run),
+    // array index: TR    | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)
+    (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|0,          false,ColMajor,false,RowMajor,false,ColMajor>::run),
+    // array index: ADJ   | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)
+    (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|0,          false,ColMajor,false,RowMajor,Conj, ColMajor>::run),
+    0,
+    // array index: NOTR  | (LEFT  << 2) | (LO << 3) | (NUNIT << 4)
+    (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|0,          true, ColMajor,false,ColMajor,false,ColMajor>::run),
+    // array index: TR    | (LEFT  << 2) | (LO << 3) | (NUNIT << 4)
+    (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|0,          true, RowMajor,false,ColMajor,false,ColMajor>::run),
+    // array index: ADJ   | (LEFT  << 2) | (LO << 3) | (NUNIT << 4)
+    (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|0,          true, RowMajor,Conj, ColMajor,false,ColMajor>::run),
+    0,
+    // array index: NOTR  | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)
+    (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|0,          false,ColMajor,false,ColMajor,false,ColMajor>::run),
+    // array index: TR    | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)
+    (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|0,          false,ColMajor,false,RowMajor,false,ColMajor>::run),
+    // array index: ADJ   | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)
+    (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|0,          false,ColMajor,false,RowMajor,Conj, ColMajor>::run),
+    0,
+    // array index: NOTR  | (LEFT  << 2) | (UP << 3) | (UNIT  << 4)
+    (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|UnitDiag,true, ColMajor,false,ColMajor,false,ColMajor>::run),
+    // array index: TR    | (LEFT  << 2) | (UP << 3) | (UNIT  << 4)
+    (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|UnitDiag,true, RowMajor,false,ColMajor,false,ColMajor>::run),
+    // array index: ADJ   | (LEFT  << 2) | (UP << 3) | (UNIT  << 4)
+    (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|UnitDiag,true, RowMajor,Conj, ColMajor,false,ColMajor>::run),
+    0,
+    // array index: NOTR  | (RIGHT << 2) | (UP << 3) | (UNIT  << 4)
+    (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|UnitDiag,false,ColMajor,false,ColMajor,false,ColMajor>::run),
+    // array index: TR    | (RIGHT << 2) | (UP << 3) | (UNIT  << 4)
+    (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|UnitDiag,false,ColMajor,false,RowMajor,false,ColMajor>::run),
+    // array index: ADJ   | (RIGHT << 2) | (UP << 3) | (UNIT  << 4)
+    (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|UnitDiag,false,ColMajor,false,RowMajor,Conj, ColMajor>::run),
+    0,
+    // array index: NOTR  | (LEFT  << 2) | (LO << 3) | (UNIT  << 4)
+    (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|UnitDiag,true, ColMajor,false,ColMajor,false,ColMajor>::run),
+    // array index: TR    | (LEFT  << 2) | (LO << 3) | (UNIT  << 4)
+    (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|UnitDiag,true, RowMajor,false,ColMajor,false,ColMajor>::run),
+    // array index: ADJ   | (LEFT  << 2) | (LO << 3) | (UNIT  << 4)
+    (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|UnitDiag,true, RowMajor,Conj, ColMajor,false,ColMajor>::run),
+    0,
+    // array index: NOTR  | (RIGHT << 2) | (LO << 3) | (UNIT  << 4)
+    (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|UnitDiag,false,ColMajor,false,ColMajor,false,ColMajor>::run),
+    // array index: TR    | (RIGHT << 2) | (LO << 3) | (UNIT  << 4)
+    (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|UnitDiag,false,ColMajor,false,RowMajor,false,ColMajor>::run),
+    // array index: ADJ   | (RIGHT << 2) | (LO << 3) | (UNIT  << 4)
+    (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|UnitDiag,false,ColMajor,false,RowMajor,Conj, ColMajor>::run),
+    0
+  };
+
+  const Scalar* a = reinterpret_cast<const Scalar*>(pa);
   Scalar* b = reinterpret_cast<Scalar*>(pb);
-  Scalar  alpha = *reinterpret_cast<Scalar*>(palpha);
+  Scalar  alpha = *reinterpret_cast<const Scalar*>(palpha);
 
   int info = 0;
   if(SIDE(*side)==INVALID)                                            info = 1;
@@ -222,12 +271,12 @@ int EIGEN_BLAS_FUNC(trmm)(char *side, char *uplo, char *opa, char *diag, int *m,
 
   if(SIDE(*side)==LEFT)
   {
-    internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*m);
+    internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*m,1,false);
     func[code](*m, *n, *m, a, *lda, tmp.data(), tmp.outerStride(), b, *ldb, alpha, blocking);
   }
   else
   {
-    internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*n);
+    internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*n,1,false);
     func[code](*m, *n, *n, tmp.data(), tmp.outerStride(), a, *lda, b, *ldb, alpha, blocking);
   }
   return 1;
@@ -235,14 +284,15 @@ int EIGEN_BLAS_FUNC(trmm)(char *side, char *uplo, char *opa, char *diag, int *m,
 
 // c = alpha*a*b + beta*c  for side = 'L'or'l'
 // c = alpha*b*a + beta*c  for side = 'R'or'r
-int EIGEN_BLAS_FUNC(symm)(char *side, char *uplo, int *m, int *n, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pb, int *ldb, RealScalar *pbeta, RealScalar *pc, int *ldc)
+int EIGEN_BLAS_FUNC(symm)(const char *side, const char *uplo, const int *m, const int *n, const RealScalar *palpha,
+                          const RealScalar *pa, const int *lda, const RealScalar *pb, const int *ldb, const RealScalar *pbeta, RealScalar *pc, const int *ldc)
 {
 //   std::cerr << "in symm " << *side << " " << *uplo << " " << *m << "x" << *n << " lda:" << *lda << " ldb:" << *ldb << " ldc:" << *ldc << " alpha:" << *palpha << " beta:" << *pbeta << "\n";
-  Scalar* a = reinterpret_cast<Scalar*>(pa);
-  Scalar* b = reinterpret_cast<Scalar*>(pb);
+  const Scalar* a = reinterpret_cast<const Scalar*>(pa);
+  const Scalar* b = reinterpret_cast<const Scalar*>(pb);
   Scalar* c = reinterpret_cast<Scalar*>(pc);
-  Scalar alpha = *reinterpret_cast<Scalar*>(palpha);
-  Scalar beta  = *reinterpret_cast<Scalar*>(pbeta);
+  Scalar alpha = *reinterpret_cast<const Scalar*>(palpha);
+  Scalar beta  = *reinterpret_cast<const Scalar*>(pbeta);
 
   int info = 0;
   if(SIDE(*side)==INVALID)                                            info = 1;
@@ -266,9 +316,9 @@ int EIGEN_BLAS_FUNC(symm)(char *side, char *uplo, int *m, int *n, RealScalar *pa
     return 1;
   }
 
+  int size = (SIDE(*side)==LEFT) ? (*m) : (*n);
   #if ISCOMPLEX
   // FIXME add support for symmetric complex matrix
-  int size = (SIDE(*side)==LEFT) ? (*m) : (*n);
   Matrix<Scalar,Dynamic,Dynamic,ColMajor> matA(size,size);
   if(UPLO(*uplo)==UP)
   {
@@ -285,13 +335,15 @@ int EIGEN_BLAS_FUNC(symm)(char *side, char *uplo, int *m, int *n, RealScalar *pa
   else if(SIDE(*side)==RIGHT)
     matrix(c, *m, *n, *ldc) += alpha * matrix(b, *m, *n, *ldb) * matA;
   #else
+  internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic> blocking(*m,*n,size,1,false);
+
   if(SIDE(*side)==LEFT)
-    if(UPLO(*uplo)==UP)       internal::product_selfadjoint_matrix<Scalar, DenseIndex, RowMajor,true,false, ColMajor,false,false, ColMajor>::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha);
-    else if(UPLO(*uplo)==LO)  internal::product_selfadjoint_matrix<Scalar, DenseIndex, ColMajor,true,false, ColMajor,false,false, ColMajor>::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha);
+    if(UPLO(*uplo)==UP)       internal::product_selfadjoint_matrix<Scalar, DenseIndex, RowMajor,true,false, ColMajor,false,false, ColMajor>::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha, blocking);
+    else if(UPLO(*uplo)==LO)  internal::product_selfadjoint_matrix<Scalar, DenseIndex, ColMajor,true,false, ColMajor,false,false, ColMajor>::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha, blocking);
     else                      return 0;
   else if(SIDE(*side)==RIGHT)
-    if(UPLO(*uplo)==UP)       internal::product_selfadjoint_matrix<Scalar, DenseIndex, ColMajor,false,false, RowMajor,true,false, ColMajor>::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha);
-    else if(UPLO(*uplo)==LO)  internal::product_selfadjoint_matrix<Scalar, DenseIndex, ColMajor,false,false, ColMajor,true,false, ColMajor>::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha);
+    if(UPLO(*uplo)==UP)       internal::product_selfadjoint_matrix<Scalar, DenseIndex, ColMajor,false,false, RowMajor,true,false, ColMajor>::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha, blocking);
+    else if(UPLO(*uplo)==LO)  internal::product_selfadjoint_matrix<Scalar, DenseIndex, ColMajor,false,false, ColMajor,true,false, ColMajor>::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha, blocking);
     else                      return 0;
   else
     return 0;
@@ -302,39 +354,38 @@ int EIGEN_BLAS_FUNC(symm)(char *side, char *uplo, int *m, int *n, RealScalar *pa
 
 // c = alpha*a*a' + beta*c  for op = 'N'or'n'
 // c = alpha*a'*a + beta*c  for op = 'T'or't','C'or'c'
-int EIGEN_BLAS_FUNC(syrk)(char *uplo, char *op, int *n, int *k, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pbeta, RealScalar *pc, int *ldc)
+int EIGEN_BLAS_FUNC(syrk)(const char *uplo, const char *op, const int *n, const int *k,
+                          const RealScalar *palpha, const RealScalar *pa, const int *lda, const RealScalar *pbeta, RealScalar *pc, const int *ldc)
 {
 //   std::cerr << "in syrk " << *uplo << " " << *op << " " << *n << " " << *k << " " << *palpha << " " << *lda << " " << *pbeta << " " << *ldc << "\n";
   #if !ISCOMPLEX
-  typedef void (*functype)(DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, const Scalar&);
-  static functype func[8];
-
-  static bool init = false;
-  if(!init)
-  {
-    for(int k=0; k<8; ++k)
-      func[k] = 0;
-
-    func[NOTR  | (UP << 2)] = (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,ColMajor,false,Scalar,RowMajor,ColMajor,Conj, Upper>::run);
-    func[TR    | (UP << 2)] = (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,RowMajor,false,Scalar,ColMajor,ColMajor,Conj, Upper>::run);
-    func[ADJ   | (UP << 2)] = (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,ColMajor,ColMajor,false,Upper>::run);
-
-    func[NOTR  | (LO << 2)] = (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,ColMajor,false,Scalar,RowMajor,ColMajor,Conj, Lower>::run);
-    func[TR    | (LO << 2)] = (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,RowMajor,false,Scalar,ColMajor,ColMajor,Conj, Lower>::run);
-    func[ADJ   | (LO << 2)] = (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,ColMajor,ColMajor,false,Lower>::run);
-
-    init = true;
-  }
+  typedef void (*functype)(DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, const Scalar&, internal::level3_blocking<Scalar,Scalar>&);
+  static const functype func[8] = {
+    // array index: NOTR  | (UP << 2)
+    (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,ColMajor,false,Scalar,RowMajor,ColMajor,Conj, Upper>::run),
+    // array index: TR    | (UP << 2)
+    (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,RowMajor,false,Scalar,ColMajor,ColMajor,Conj, Upper>::run),
+    // array index: ADJ   | (UP << 2)
+    (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,ColMajor,ColMajor,false,Upper>::run),
+    0,
+    // array index: NOTR  | (LO << 2)
+    (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,ColMajor,false,Scalar,RowMajor,ColMajor,Conj, Lower>::run),
+    // array index: TR    | (LO << 2)
+    (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,RowMajor,false,Scalar,ColMajor,ColMajor,Conj, Lower>::run),
+    // array index: ADJ   | (LO << 2)
+    (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,ColMajor,ColMajor,false,Lower>::run),
+    0
+  };
   #endif
 
-  Scalar* a = reinterpret_cast<Scalar*>(pa);
+  const Scalar* a = reinterpret_cast<const Scalar*>(pa);
   Scalar* c = reinterpret_cast<Scalar*>(pc);
-  Scalar alpha = *reinterpret_cast<Scalar*>(palpha);
-  Scalar beta  = *reinterpret_cast<Scalar*>(pbeta);
+  Scalar alpha = *reinterpret_cast<const Scalar*>(palpha);
+  Scalar beta  = *reinterpret_cast<const Scalar*>(pbeta);
 
   int info = 0;
   if(UPLO(*uplo)==INVALID)                                            info = 1;
-  else if(OP(*op)==INVALID)                                           info = 2;
+  else if(OP(*op)==INVALID || (ISCOMPLEX && OP(*op)==ADJ) )           info = 2;
   else if(*n<0)                                                       info = 3;
   else if(*k<0)                                                       info = 4;
   else if(*lda<std::max(1,(OP(*op)==NOTR)?*n:*k))                     info = 7;
@@ -352,6 +403,9 @@ int EIGEN_BLAS_FUNC(syrk)(char *uplo, char *op, int *n, int *k, RealScalar *palp
       else                matrix(c, *n, *n, *ldc).triangularView<Lower>() *= beta;
   }
 
+  if(*n==0 || *k==0)
+    return 0;
+
   #if ISCOMPLEX
   // FIXME add support for symmetric complex matrix
   if(UPLO(*uplo)==UP)
@@ -369,8 +423,10 @@ int EIGEN_BLAS_FUNC(syrk)(char *uplo, char *op, int *n, int *k, RealScalar *palp
       matrix(c, *n, *n, *ldc).triangularView<Lower>() += alpha * matrix(a,*k,*n,*lda).transpose() * matrix(a,*k,*n,*lda);
   }
   #else
+  internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic> blocking(*n,*n,*k,1,false);
+
   int code = OP(*op) | (UPLO(*uplo) << 2);
-  func[code](*n, *k, a, *lda, a, *lda, c, *ldc, alpha);
+  func[code](*n, *k, a, *lda, a, *lda, c, *ldc, alpha, blocking);
   #endif
 
   return 0;
@@ -378,17 +434,20 @@ int EIGEN_BLAS_FUNC(syrk)(char *uplo, char *op, int *n, int *k, RealScalar *palp
 
 // c = alpha*a*b' + alpha*b*a' + beta*c  for op = 'N'or'n'
 // c = alpha*a'*b + alpha*b'*a + beta*c  for op = 'T'or't'
-int EIGEN_BLAS_FUNC(syr2k)(char *uplo, char *op, int *n, int *k, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pb, int *ldb, RealScalar *pbeta, RealScalar *pc, int *ldc)
+int EIGEN_BLAS_FUNC(syr2k)(const char *uplo, const char *op, const int *n, const int *k, const RealScalar *palpha,
+                           const RealScalar *pa, const int *lda, const RealScalar *pb, const int *ldb, const RealScalar *pbeta, RealScalar *pc, const int *ldc)
 {
-  Scalar* a = reinterpret_cast<Scalar*>(pa);
-  Scalar* b = reinterpret_cast<Scalar*>(pb);
+  const Scalar* a = reinterpret_cast<const Scalar*>(pa);
+  const Scalar* b = reinterpret_cast<const Scalar*>(pb);
   Scalar* c = reinterpret_cast<Scalar*>(pc);
-  Scalar alpha = *reinterpret_cast<Scalar*>(palpha);
-  Scalar beta  = *reinterpret_cast<Scalar*>(pbeta);
+  Scalar alpha = *reinterpret_cast<const Scalar*>(palpha);
+  Scalar beta  = *reinterpret_cast<const Scalar*>(pbeta);
+
+//   std::cerr << "in syr2k " << *uplo << " " << *op << " " << *n << " " << *k << " " << alpha << " " << *lda << " " << *ldb << " " << beta << " " << *ldc << "\n";
 
   int info = 0;
   if(UPLO(*uplo)==INVALID)                                            info = 1;
-  else if(OP(*op)==INVALID)                                           info = 2;
+  else if(OP(*op)==INVALID || (ISCOMPLEX && OP(*op)==ADJ) )           info = 2;
   else if(*n<0)                                                       info = 3;
   else if(*k<0)                                                       info = 4;
   else if(*lda<std::max(1,(OP(*op)==NOTR)?*n:*k))                     info = 7;
@@ -443,13 +502,14 @@ int EIGEN_BLAS_FUNC(syr2k)(char *uplo, char *op, int *n, int *k, RealScalar *pal
 
 // c = alpha*a*b + beta*c  for side = 'L'or'l'
 // c = alpha*b*a + beta*c  for side = 'R'or'r
-int EIGEN_BLAS_FUNC(hemm)(char *side, char *uplo, int *m, int *n, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pb, int *ldb, RealScalar *pbeta, RealScalar *pc, int *ldc)
+int EIGEN_BLAS_FUNC(hemm)(const char *side, const char *uplo, const int *m, const int *n, const RealScalar *palpha,
+                          const RealScalar *pa, const int *lda, const RealScalar *pb, const int *ldb, const RealScalar *pbeta, RealScalar *pc, const int *ldc)
 {
-  Scalar* a = reinterpret_cast<Scalar*>(pa);
-  Scalar* b = reinterpret_cast<Scalar*>(pb);
+  const Scalar* a = reinterpret_cast<const Scalar*>(pa);
+  const Scalar* b = reinterpret_cast<const Scalar*>(pb);
   Scalar* c = reinterpret_cast<Scalar*>(pc);
-  Scalar alpha = *reinterpret_cast<Scalar*>(palpha);
-  Scalar beta  = *reinterpret_cast<Scalar*>(pbeta);
+  Scalar alpha = *reinterpret_cast<const Scalar*>(palpha);
+  Scalar beta  = *reinterpret_cast<const Scalar*>(pbeta);
 
 //   std::cerr << "in hemm " << *side << " " << *uplo << " " << *m << " " << *n << " " << alpha << " " << *lda << " " << beta << " " << *ldc << "\n";
 
@@ -472,20 +532,23 @@ int EIGEN_BLAS_FUNC(hemm)(char *side, char *uplo, int *m, int *n, RealScalar *pa
     return 1;
   }
 
+  int size = (SIDE(*side)==LEFT) ? (*m) : (*n);
+  internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic> blocking(*m,*n,size,1,false);
+
   if(SIDE(*side)==LEFT)
   {
     if(UPLO(*uplo)==UP)       internal::product_selfadjoint_matrix<Scalar,DenseIndex,RowMajor,true,Conj,  ColMajor,false,false, ColMajor>
-                                ::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha);
+                                ::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha, blocking);
     else if(UPLO(*uplo)==LO)  internal::product_selfadjoint_matrix<Scalar,DenseIndex,ColMajor,true,false, ColMajor,false,false, ColMajor>
-                                ::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha);
+                                ::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha, blocking);
     else                      return 0;
   }
   else if(SIDE(*side)==RIGHT)
   {
     if(UPLO(*uplo)==UP)       matrix(c,*m,*n,*ldc) += alpha * matrix(b,*m,*n,*ldb) * matrix(a,*n,*n,*lda).selfadjointView<Upper>();/*internal::product_selfadjoint_matrix<Scalar,DenseIndex,ColMajor,false,false, RowMajor,true,Conj,  ColMajor>
-                                ::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha);*/
+                                ::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha, blocking);*/
     else if(UPLO(*uplo)==LO)  internal::product_selfadjoint_matrix<Scalar,DenseIndex,ColMajor,false,false, ColMajor,true,false, ColMajor>
-                                ::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha);
+                                ::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha, blocking);
     else                      return 0;
   }
   else
@@ -498,27 +561,28 @@ int EIGEN_BLAS_FUNC(hemm)(char *side, char *uplo, int *m, int *n, RealScalar *pa
 
 // c = alpha*a*conj(a') + beta*c  for op = 'N'or'n'
 // c = alpha*conj(a')*a + beta*c  for op  = 'C'or'c'
-int EIGEN_BLAS_FUNC(herk)(char *uplo, char *op, int *n, int *k, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pbeta, RealScalar *pc, int *ldc)
+int EIGEN_BLAS_FUNC(herk)(const char *uplo, const char *op, const int *n, const int *k,
+                          const RealScalar *palpha, const RealScalar *pa, const int *lda, const RealScalar *pbeta, RealScalar *pc, const int *ldc)
 {
-  typedef void (*functype)(DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, const Scalar&);
-  static functype func[8];
-
-  static bool init = false;
-  if(!init)
-  {
-    for(int k=0; k<8; ++k)
-      func[k] = 0;
-
-    func[NOTR  | (UP << 2)] = (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,ColMajor,false,Scalar,RowMajor,Conj, ColMajor,Upper>::run);
-    func[ADJ   | (UP << 2)] = (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,ColMajor,false,ColMajor,Upper>::run);
-
-    func[NOTR  | (LO << 2)] = (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,ColMajor,false,Scalar,RowMajor,Conj, ColMajor,Lower>::run);
-    func[ADJ   | (LO << 2)] = (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,ColMajor,false,ColMajor,Lower>::run);
-
-    init = true;
-  }
-
-  Scalar* a = reinterpret_cast<Scalar*>(pa);
+//   std::cerr << "in herk " << *uplo << " " << *op << " " << *n << " " << *k << " " << *palpha << " " << *lda << " " << *pbeta << " " << *ldc << "\n";
+
+  typedef void (*functype)(DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, const Scalar&, internal::level3_blocking<Scalar,Scalar>&);
+  static const functype func[8] = {
+    // array index: NOTR  | (UP << 2)
+    (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,ColMajor,false,Scalar,RowMajor,Conj, ColMajor,Upper>::run),
+    0,
+    // array index: ADJ   | (UP << 2)
+    (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,ColMajor,false,ColMajor,Upper>::run),
+    0,
+    // array index: NOTR  | (LO << 2)
+    (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,ColMajor,false,Scalar,RowMajor,Conj, ColMajor,Lower>::run),
+    0,
+    // array index: ADJ   | (LO << 2)
+    (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,ColMajor,false,ColMajor,Lower>::run),
+    0
+  };
+
+  const Scalar* a = reinterpret_cast<const Scalar*>(pa);
   Scalar* c = reinterpret_cast<Scalar*>(pc);
   RealScalar alpha = *palpha;
   RealScalar beta  = *pbeta;
@@ -545,7 +609,7 @@ int EIGEN_BLAS_FUNC(herk)(char *uplo, char *op, int *n, int *k, RealScalar *palp
     else
       if(beta==Scalar(0)) matrix(c, *n, *n, *ldc).triangularView<Lower>().setZero();
       else                matrix(c, *n, *n, *ldc).triangularView<StrictlyLower>() *= beta;
-  
+
     if(beta!=Scalar(0))
     {
       matrix(c, *n, *n, *ldc).diagonal().real() *= beta;
@@ -555,7 +619,8 @@ int EIGEN_BLAS_FUNC(herk)(char *uplo, char *op, int *n, int *k, RealScalar *palp
 
   if(*k>0 && alpha!=RealScalar(0))
   {
-    func[code](*n, *k, a, *lda, a, *lda, c, *ldc, alpha);
+    internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic> blocking(*n,*n,*k,1,false);
+    func[code](*n, *k, a, *lda, a, *lda, c, *ldc, alpha, blocking);
     matrix(c, *n, *n, *ldc).diagonal().imag().setZero();
   }
   return 0;
@@ -563,21 +628,24 @@ int EIGEN_BLAS_FUNC(herk)(char *uplo, char *op, int *n, int *k, RealScalar *palp
 
 // c = alpha*a*conj(b') + conj(alpha)*b*conj(a') + beta*c,  for op = 'N'or'n'
 // c = alpha*conj(a')*b + conj(alpha)*conj(b')*a + beta*c,  for op = 'C'or'c'
-int EIGEN_BLAS_FUNC(her2k)(char *uplo, char *op, int *n, int *k, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pb, int *ldb, RealScalar *pbeta, RealScalar *pc, int *ldc)
+int EIGEN_BLAS_FUNC(her2k)(const char *uplo, const char *op, const int *n, const int *k,
+                           const RealScalar *palpha, const RealScalar *pa, const int *lda, const RealScalar *pb, const int *ldb, const RealScalar *pbeta, RealScalar *pc, const int *ldc)
 {
-  Scalar* a = reinterpret_cast<Scalar*>(pa);
-  Scalar* b = reinterpret_cast<Scalar*>(pb);
+  const Scalar* a = reinterpret_cast<const Scalar*>(pa);
+  const Scalar* b = reinterpret_cast<const Scalar*>(pb);
   Scalar* c = reinterpret_cast<Scalar*>(pc);
-  Scalar alpha = *reinterpret_cast<Scalar*>(palpha);
+  Scalar alpha = *reinterpret_cast<const Scalar*>(palpha);
   RealScalar beta  = *pbeta;
 
+//   std::cerr << "in her2k " << *uplo << " " << *op << " " << *n << " " << *k << " " << alpha << " " << *lda << " " << *ldb << " " << beta << " " << *ldc << "\n";
+
   int info = 0;
   if(UPLO(*uplo)==INVALID)                                            info = 1;
   else if((OP(*op)==INVALID) || (OP(*op)==TR))                        info = 2;
   else if(*n<0)                                                       info = 3;
   else if(*k<0)                                                       info = 4;
   else if(*lda<std::max(1,(OP(*op)==NOTR)?*n:*k))                     info = 7;
-  else if(*lda<std::max(1,(OP(*op)==NOTR)?*n:*k))                     info = 9;
+  else if(*ldb<std::max(1,(OP(*op)==NOTR)?*n:*k))                     info = 9;
   else if(*ldc<std::max(1,*n))                                        info = 12;
   if(info)
     return xerbla_(SCALAR_SUFFIX_UP"HER2K",&info,6);
diff --git a/blas/single.cpp b/blas/single.cpp
index 836e3eee2..20ea57d5c 100644
--- a/blas/single.cpp
+++ b/blas/single.cpp
@@ -19,4 +19,4 @@
 #include "level3_impl.h"
 
 float BLASFUNC(sdsdot)(int* n, float* alpha, float* x, int* incx, float* y, int* incy)
-{ return *alpha + BLASFUNC(dsdot)(n, x, incx, y, incy); }
+{ return double(*alpha) + BLASFUNC(dsdot)(n, x, incx, y, incy); }
diff --git a/blas/testing/cblat1.f b/blas/testing/cblat1.f
index a4c996fda..8ca67fb19 100644
--- a/blas/testing/cblat1.f
+++ b/blas/testing/cblat1.f
@@ -1,7 +1,49 @@
+*> \brief \b CBLAT1
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*  Definition:
+*  ===========
+*
+*       PROGRAM CBLAT1
+* 
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*>    Test program for the COMPLEX Level 1 BLAS.
+*>    Based upon the original BLAS test routine together with:
+*>
+*>    F06GAF Example Program Text
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date April 2012
+*
+*> \ingroup complex_blas_testing
+*
+*  =====================================================================
       PROGRAM CBLAT1
-*     Test program for the COMPLEX    Level 1 BLAS.
-*     Based upon the original BLAS test routine together with:
-*     F06GAF Example Program Text
+*
+*  -- Reference BLAS test routine (version 3.4.1) --
+*  -- Reference BLAS is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     April 2012
+*
+*  =====================================================================
+*
 *     .. Parameters ..
       INTEGER          NOUT
       PARAMETER        (NOUT=6)
@@ -114,8 +156,8 @@
      +                  (5.0E0,6.0E0), (5.0E0,6.0E0), (0.1E0,0.1E0),
      +                  (-0.6E0,0.1E0), (0.1E0,-0.3E0), (7.0E0,8.0E0),
      +                  (7.0E0,8.0E0), (7.0E0,8.0E0), (7.0E0,8.0E0),
-     +                  (7.0E0,8.0E0), (0.3E0,0.1E0), (0.1E0,0.4E0),
-     +                  (0.4E0,0.1E0), (0.1E0,0.2E0), (2.0E0,3.0E0),
+     +                  (7.0E0,8.0E0), (0.3E0,0.1E0), (0.5E0,0.0E0),
+     +                  (0.0E0,0.5E0), (0.0E0,0.2E0), (2.0E0,3.0E0),
      +                  (2.0E0,3.0E0), (2.0E0,3.0E0), (2.0E0,3.0E0)/
       DATA              ((CV(I,J,2),I=1,8),J=1,5)/(0.1E0,0.1E0),
      +                  (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0),
@@ -129,10 +171,10 @@
      +                  (3.0E0,6.0E0), (-0.6E0,0.1E0), (4.0E0,7.0E0),
      +                  (0.1E0,-0.3E0), (7.0E0,2.0E0), (7.0E0,2.0E0),
      +                  (7.0E0,2.0E0), (0.3E0,0.1E0), (5.0E0,8.0E0),
-     +                  (0.1E0,0.4E0), (6.0E0,9.0E0), (0.4E0,0.1E0),
-     +                  (8.0E0,3.0E0), (0.1E0,0.2E0), (9.0E0,4.0E0)/
-      DATA              STRUE2/0.0E0, 0.5E0, 0.6E0, 0.7E0, 0.7E0/
-      DATA              STRUE4/0.0E0, 0.7E0, 1.0E0, 1.3E0, 1.7E0/
+     +                  (0.5E0,0.0E0), (6.0E0,9.0E0), (0.0E0,0.5E0),
+     +                  (8.0E0,3.0E0), (0.0E0,0.2E0), (9.0E0,4.0E0)/
+      DATA              STRUE2/0.0E0, 0.5E0, 0.6E0, 0.7E0, 0.8E0/
+      DATA              STRUE4/0.0E0, 0.7E0, 1.0E0, 1.3E0, 1.6E0/
       DATA              ((CTRUE5(I,J,1),I=1,8),J=1,5)/(0.1E0,0.1E0),
      +                  (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0),
      +                  (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0),
@@ -145,8 +187,8 @@
      +                  (0.11E0,-0.03E0), (-0.17E0,0.46E0),
      +                  (-0.17E0,-0.19E0), (7.0E0,8.0E0), (7.0E0,8.0E0),
      +                  (7.0E0,8.0E0), (7.0E0,8.0E0), (7.0E0,8.0E0),
-     +                  (0.19E0,-0.17E0), (0.32E0,0.09E0),
-     +                  (0.23E0,-0.24E0), (0.18E0,0.01E0),
+     +                  (0.19E0,-0.17E0), (0.20E0,-0.35E0),
+     +                  (0.35E0,0.20E0), (0.14E0,0.08E0),
      +                  (2.0E0,3.0E0), (2.0E0,3.0E0), (2.0E0,3.0E0),
      +                  (2.0E0,3.0E0)/
       DATA              ((CTRUE5(I,J,2),I=1,8),J=1,5)/(0.1E0,0.1E0),
@@ -162,9 +204,9 @@
      +                  (-0.17E0,0.46E0), (4.0E0,7.0E0),
      +                  (-0.17E0,-0.19E0), (7.0E0,2.0E0), (7.0E0,2.0E0),
      +                  (7.0E0,2.0E0), (0.19E0,-0.17E0), (5.0E0,8.0E0),
-     +                  (0.32E0,0.09E0), (6.0E0,9.0E0),
-     +                  (0.23E0,-0.24E0), (8.0E0,3.0E0),
-     +                  (0.18E0,0.01E0), (9.0E0,4.0E0)/
+     +                  (0.20E0,-0.35E0), (6.0E0,9.0E0),
+     +                  (0.35E0,0.20E0), (8.0E0,3.0E0),
+     +                  (0.14E0,0.08E0), (9.0E0,4.0E0)/
       DATA              ((CTRUE6(I,J,1),I=1,8),J=1,5)/(0.1E0,0.1E0),
      +                  (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0),
      +                  (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0),
@@ -177,8 +219,8 @@
      +                  (0.03E0,0.03E0), (-0.18E0,0.03E0),
      +                  (0.03E0,-0.09E0), (7.0E0,8.0E0), (7.0E0,8.0E0),
      +                  (7.0E0,8.0E0), (7.0E0,8.0E0), (7.0E0,8.0E0),
-     +                  (0.09E0,0.03E0), (0.03E0,0.12E0),
-     +                  (0.12E0,0.03E0), (0.03E0,0.06E0), (2.0E0,3.0E0),
+     +                  (0.09E0,0.03E0), (0.15E0,0.00E0),
+     +                  (0.00E0,0.15E0), (0.00E0,0.06E0), (2.0E0,3.0E0),
      +                  (2.0E0,3.0E0), (2.0E0,3.0E0), (2.0E0,3.0E0)/
       DATA              ((CTRUE6(I,J,2),I=1,8),J=1,5)/(0.1E0,0.1E0),
      +                  (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0),
@@ -193,8 +235,8 @@
      +                  (-0.18E0,0.03E0), (4.0E0,7.0E0),
      +                  (0.03E0,-0.09E0), (7.0E0,2.0E0), (7.0E0,2.0E0),
      +                  (7.0E0,2.0E0), (0.09E0,0.03E0), (5.0E0,8.0E0),
-     +                  (0.03E0,0.12E0), (6.0E0,9.0E0), (0.12E0,0.03E0),
-     +                  (8.0E0,3.0E0), (0.03E0,0.06E0), (9.0E0,4.0E0)/
+     +                  (0.15E0,0.00E0), (6.0E0,9.0E0), (0.00E0,0.15E0),
+     +                  (8.0E0,3.0E0), (0.00E0,0.06E0), (9.0E0,4.0E0)/
       DATA              ITRUE3/0, 1, 2, 2, 2/
 *     .. Executable Statements ..
       DO 60 INCX = 1, 2
@@ -529,7 +571,8 @@
 *
 *     .. Parameters ..
       INTEGER          NOUT
-      PARAMETER        (NOUT=6)
+      REAL             ZERO
+      PARAMETER        (NOUT=6, ZERO=0.0E0)
 *     .. Scalar Arguments ..
       REAL             SFAC
       INTEGER          LEN
@@ -552,7 +595,7 @@
 *
       DO 40 I = 1, LEN
          SD = SCOMP(I) - STRUE(I)
-         IF (SDIFF(ABS(SSIZE(I))+ABS(SFAC*SD),ABS(SSIZE(I))).EQ.0.0E0)
+         IF (ABS(SFAC*SD) .LE. ABS(SSIZE(I))*EPSILON(ZERO))
      +       GO TO 40
 *
 *                             HERE    SCOMP(I) IS NOT CLOSE TO STRUE(I).
diff --git a/blas/testing/cblat2.f b/blas/testing/cblat2.f
index 20f188100..5833ea81a 100644
--- a/blas/testing/cblat2.f
+++ b/blas/testing/cblat2.f
@@ -1,68 +1,114 @@
+*> \brief \b CBLAT2
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*  Definition:
+*  ===========
+*
+*       PROGRAM CBLAT2
+* 
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> Test program for the COMPLEX          Level 2 Blas.
+*>
+*> The program must be driven by a short data file. The first 18 records
+*> of the file are read using list-directed input, the last 17 records
+*> are read using the format ( A6, L2 ). An annotated example of a data
+*> file can be obtained by deleting the first 3 characters from the
+*> following 35 lines:
+*> 'cblat2.out'      NAME OF SUMMARY OUTPUT FILE
+*> 6                 UNIT NUMBER OF SUMMARY FILE
+*> 'CBLA2T.SNAP'     NAME OF SNAPSHOT OUTPUT FILE
+*> -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
+*> F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
+*> F        LOGICAL FLAG, T TO STOP ON FAILURES.
+*> T        LOGICAL FLAG, T TO TEST ERROR EXITS.
+*> 16.0     THRESHOLD VALUE OF TEST RATIO
+*> 6                 NUMBER OF VALUES OF N
+*> 0 1 2 3 5 9       VALUES OF N
+*> 4                 NUMBER OF VALUES OF K
+*> 0 1 2 4           VALUES OF K
+*> 4                 NUMBER OF VALUES OF INCX AND INCY
+*> 1 2 -1 -2         VALUES OF INCX AND INCY
+*> 3                 NUMBER OF VALUES OF ALPHA
+*> (0.0,0.0) (1.0,0.0) (0.7,-0.9)       VALUES OF ALPHA
+*> 3                 NUMBER OF VALUES OF BETA
+*> (0.0,0.0) (1.0,0.0) (1.3,-1.1)       VALUES OF BETA
+*> CGEMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> CGBMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> CHEMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> CHBMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> CHPMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> CTRMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> CTBMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> CTPMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> CTRSV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> CTBSV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> CTPSV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> CGERC  T PUT F FOR NO TEST. SAME COLUMNS.
+*> CGERU  T PUT F FOR NO TEST. SAME COLUMNS.
+*> CHER   T PUT F FOR NO TEST. SAME COLUMNS.
+*> CHPR   T PUT F FOR NO TEST. SAME COLUMNS.
+*> CHER2  T PUT F FOR NO TEST. SAME COLUMNS.
+*> CHPR2  T PUT F FOR NO TEST. SAME COLUMNS.
+*>
+*> Further Details
+*> ===============
+*>
+*>    See:
+*>
+*>       Dongarra J. J., Du Croz J. J., Hammarling S.  and Hanson R. J..
+*>       An  extended  set of Fortran  Basic Linear Algebra Subprograms.
+*>
+*>       Technical  Memoranda  Nos. 41 (revision 3) and 81,  Mathematics
+*>       and  Computer Science  Division,  Argonne  National Laboratory,
+*>       9700 South Cass Avenue, Argonne, Illinois 60439, US.
+*>
+*>       Or
+*>
+*>       NAG  Technical Reports TR3/87 and TR4/87,  Numerical Algorithms
+*>       Group  Ltd.,  NAG  Central  Office,  256  Banbury  Road, Oxford
+*>       OX2 7DE, UK,  and  Numerical Algorithms Group Inc.,  1101  31st
+*>       Street,  Suite 100,  Downers Grove,  Illinois 60515-1263,  USA.
+*>
+*>
+*> -- Written on 10-August-1987.
+*>    Richard Hanson, Sandia National Labs.
+*>    Jeremy Du Croz, NAG Central Office.
+*>
+*>    10-9-00:  Change STATUS='NEW' to 'UNKNOWN' so that the testers
+*>              can be run multiple times without deleting generated
+*>              output files (susan)
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date April 2012
+*
+*> \ingroup complex_blas_testing
+*
+*  =====================================================================
       PROGRAM CBLAT2
 *
-*  Test program for the COMPLEX          Level 2 Blas.
-*
-*  The program must be driven by a short data file. The first 18 records
-*  of the file are read using list-directed input, the last 17 records
-*  are read using the format ( A6, L2 ). An annotated example of a data
-*  file can be obtained by deleting the first 3 characters from the
-*  following 35 lines:
-*  'CBLAT2.SUMM'     NAME OF SUMMARY OUTPUT FILE
-*  6                 UNIT NUMBER OF SUMMARY FILE
-*  'CBLA2T.SNAP'     NAME OF SNAPSHOT OUTPUT FILE
-*  -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
-*  F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
-*  F        LOGICAL FLAG, T TO STOP ON FAILURES.
-*  T        LOGICAL FLAG, T TO TEST ERROR EXITS.
-*  16.0     THRESHOLD VALUE OF TEST RATIO
-*  6                 NUMBER OF VALUES OF N
-*  0 1 2 3 5 9       VALUES OF N
-*  4                 NUMBER OF VALUES OF K
-*  0 1 2 4           VALUES OF K
-*  4                 NUMBER OF VALUES OF INCX AND INCY
-*  1 2 -1 -2         VALUES OF INCX AND INCY
-*  3                 NUMBER OF VALUES OF ALPHA
-*  (0.0,0.0) (1.0,0.0) (0.7,-0.9)       VALUES OF ALPHA
-*  3                 NUMBER OF VALUES OF BETA
-*  (0.0,0.0) (1.0,0.0) (1.3,-1.1)       VALUES OF BETA
-*  CGEMV  T PUT F FOR NO TEST. SAME COLUMNS.
-*  CGBMV  T PUT F FOR NO TEST. SAME COLUMNS.
-*  CHEMV  T PUT F FOR NO TEST. SAME COLUMNS.
-*  CHBMV  T PUT F FOR NO TEST. SAME COLUMNS.
-*  CHPMV  T PUT F FOR NO TEST. SAME COLUMNS.
-*  CTRMV  T PUT F FOR NO TEST. SAME COLUMNS.
-*  CTBMV  T PUT F FOR NO TEST. SAME COLUMNS.
-*  CTPMV  T PUT F FOR NO TEST. SAME COLUMNS.
-*  CTRSV  T PUT F FOR NO TEST. SAME COLUMNS.
-*  CTBSV  T PUT F FOR NO TEST. SAME COLUMNS.
-*  CTPSV  T PUT F FOR NO TEST. SAME COLUMNS.
-*  CGERC  T PUT F FOR NO TEST. SAME COLUMNS.
-*  CGERU  T PUT F FOR NO TEST. SAME COLUMNS.
-*  CHER   T PUT F FOR NO TEST. SAME COLUMNS.
-*  CHPR   T PUT F FOR NO TEST. SAME COLUMNS.
-*  CHER2  T PUT F FOR NO TEST. SAME COLUMNS.
-*  CHPR2  T PUT F FOR NO TEST. SAME COLUMNS.
-*
-*     See:
-*
-*        Dongarra J. J., Du Croz J. J., Hammarling S.  and Hanson R. J..
-*        An  extended  set of Fortran  Basic Linear Algebra Subprograms.
-*
-*        Technical  Memoranda  Nos. 41 (revision 3) and 81,  Mathematics
-*        and  Computer Science  Division,  Argonne  National Laboratory,
-*        9700 South Cass Avenue, Argonne, Illinois 60439, US.
-*
-*        Or
-*
-*        NAG  Technical Reports TR3/87 and TR4/87,  Numerical Algorithms
-*        Group  Ltd.,  NAG  Central  Office,  256  Banbury  Road, Oxford
-*        OX2 7DE, UK,  and  Numerical Algorithms Group Inc.,  1101  31st
-*        Street,  Suite 100,  Downers Grove,  Illinois 60515-1263,  USA.
+*  -- Reference BLAS test routine (version 3.4.1) --
+*  -- Reference BLAS is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     April 2012
 *
-*
-*  -- Written on 10-August-1987.
-*     Richard Hanson, Sandia National Labs.
-*     Jeremy Du Croz, NAG Central Office.
+*  =====================================================================
 *
 *     .. Parameters ..
       INTEGER            NIN
@@ -71,8 +117,8 @@
       PARAMETER          ( NSUBS = 17 )
       COMPLEX            ZERO, ONE
       PARAMETER          ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) )
-      REAL               RZERO, RHALF, RONE
-      PARAMETER          ( RZERO = 0.0, RHALF = 0.5, RONE = 1.0 )
+      REAL               RZERO
+      PARAMETER          ( RZERO = 0.0 )
       INTEGER            NMAX, INCMAX
       PARAMETER          ( NMAX = 65, INCMAX = 2 )
       INTEGER            NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX
@@ -126,7 +172,7 @@
 *
       READ( NIN, FMT = * )SUMMRY
       READ( NIN, FMT = * )NOUT
-      OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' )
+      OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' )
       NOUTC = NOUT
 *
 *     Read name and unit number for snapshot output file and open file.
@@ -135,7 +181,7 @@
       READ( NIN, FMT = * )NTRA
       TRACE = NTRA.GE.0
       IF( TRACE )THEN
-         OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' )
+         OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' )
       END IF
 *     Read the flag that directs rewinding of the snapshot file.
       READ( NIN, FMT = * )REWI
@@ -240,14 +286,7 @@
 *
 *     Compute EPS (the machine precision).
 *
-      EPS = RONE
-   90 CONTINUE
-      IF( SDIFF( RONE + EPS, RONE ).EQ.RZERO )
-     $   GO TO 100
-      EPS = RHALF*EPS
-      GO TO 90
-  100 CONTINUE
-      EPS = EPS + EPS
+      EPS = EPSILON(RZERO)
       WRITE( NOUT, FMT = 9998 )EPS
 *
 *     Check the reliability of CMVCH using exact data.
@@ -3079,7 +3118,6 @@
    50    CONTINUE
       END IF
 *
-   60 CONTINUE
       LCERES = .TRUE.
       GO TO 80
    70 CONTINUE
diff --git a/blas/testing/cblat3.f b/blas/testing/cblat3.f
index b26be91e6..09f2cb9c5 100644
--- a/blas/testing/cblat3.f
+++ b/blas/testing/cblat3.f
@@ -1,50 +1,96 @@
+*> \brief \b CBLAT3
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*  Definition:
+*  ===========
+*
+*       PROGRAM CBLAT3
+* 
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> Test program for the COMPLEX          Level 3 Blas.
+*>
+*> The program must be driven by a short data file. The first 14 records
+*> of the file are read using list-directed input, the last 9 records
+*> are read using the format ( A6, L2 ). An annotated example of a data
+*> file can be obtained by deleting the first 3 characters from the
+*> following 23 lines:
+*> 'cblat3.out'      NAME OF SUMMARY OUTPUT FILE
+*> 6                 UNIT NUMBER OF SUMMARY FILE
+*> 'CBLAT3.SNAP'     NAME OF SNAPSHOT OUTPUT FILE
+*> -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
+*> F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
+*> F        LOGICAL FLAG, T TO STOP ON FAILURES.
+*> T        LOGICAL FLAG, T TO TEST ERROR EXITS.
+*> 16.0     THRESHOLD VALUE OF TEST RATIO
+*> 6                 NUMBER OF VALUES OF N
+*> 0 1 2 3 5 9       VALUES OF N
+*> 3                 NUMBER OF VALUES OF ALPHA
+*> (0.0,0.0) (1.0,0.0) (0.7,-0.9)       VALUES OF ALPHA
+*> 3                 NUMBER OF VALUES OF BETA
+*> (0.0,0.0) (1.0,0.0) (1.3,-1.1)       VALUES OF BETA
+*> CGEMM  T PUT F FOR NO TEST. SAME COLUMNS.
+*> CHEMM  T PUT F FOR NO TEST. SAME COLUMNS.
+*> CSYMM  T PUT F FOR NO TEST. SAME COLUMNS.
+*> CTRMM  T PUT F FOR NO TEST. SAME COLUMNS.
+*> CTRSM  T PUT F FOR NO TEST. SAME COLUMNS.
+*> CHERK  T PUT F FOR NO TEST. SAME COLUMNS.
+*> CSYRK  T PUT F FOR NO TEST. SAME COLUMNS.
+*> CHER2K T PUT F FOR NO TEST. SAME COLUMNS.
+*> CSYR2K T PUT F FOR NO TEST. SAME COLUMNS.
+*>
+*> Further Details
+*> ===============
+*>
+*> See:
+*>
+*>    Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S.
+*>    A Set of Level 3 Basic Linear Algebra Subprograms.
+*>
+*>    Technical Memorandum No.88 (Revision 1), Mathematics and
+*>    Computer Science Division, Argonne National Laboratory, 9700
+*>    South Cass Avenue, Argonne, Illinois 60439, US.
+*>
+*> -- Written on 8-February-1989.
+*>    Jack Dongarra, Argonne National Laboratory.
+*>    Iain Duff, AERE Harwell.
+*>    Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*>    Sven Hammarling, Numerical Algorithms Group Ltd.
+*>
+*>    10-9-00:  Change STATUS='NEW' to 'UNKNOWN' so that the testers
+*>              can be run multiple times without deleting generated
+*>              output files (susan)
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date April 2012
+*
+*> \ingroup complex_blas_testing
+*
+*  =====================================================================
       PROGRAM CBLAT3
 *
-*  Test program for the COMPLEX          Level 3 Blas.
-*
-*  The program must be driven by a short data file. The first 14 records
-*  of the file are read using list-directed input, the last 9 records
-*  are read using the format ( A6, L2 ). An annotated example of a data
-*  file can be obtained by deleting the first 3 characters from the
-*  following 23 lines:
-*  'CBLAT3.SUMM'     NAME OF SUMMARY OUTPUT FILE
-*  6                 UNIT NUMBER OF SUMMARY FILE
-*  'CBLAT3.SNAP'     NAME OF SNAPSHOT OUTPUT FILE
-*  -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
-*  F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
-*  F        LOGICAL FLAG, T TO STOP ON FAILURES.
-*  T        LOGICAL FLAG, T TO TEST ERROR EXITS.
-*  16.0     THRESHOLD VALUE OF TEST RATIO
-*  6                 NUMBER OF VALUES OF N
-*  0 1 2 3 5 9       VALUES OF N
-*  3                 NUMBER OF VALUES OF ALPHA
-*  (0.0,0.0) (1.0,0.0) (0.7,-0.9)       VALUES OF ALPHA
-*  3                 NUMBER OF VALUES OF BETA
-*  (0.0,0.0) (1.0,0.0) (1.3,-1.1)       VALUES OF BETA
-*  CGEMM  T PUT F FOR NO TEST. SAME COLUMNS.
-*  CHEMM  T PUT F FOR NO TEST. SAME COLUMNS.
-*  CSYMM  T PUT F FOR NO TEST. SAME COLUMNS.
-*  CTRMM  T PUT F FOR NO TEST. SAME COLUMNS.
-*  CTRSM  T PUT F FOR NO TEST. SAME COLUMNS.
-*  CHERK  T PUT F FOR NO TEST. SAME COLUMNS.
-*  CSYRK  T PUT F FOR NO TEST. SAME COLUMNS.
-*  CHER2K T PUT F FOR NO TEST. SAME COLUMNS.
-*  CSYR2K T PUT F FOR NO TEST. SAME COLUMNS.
-*
-*  See:
-*
-*     Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S.
-*     A Set of Level 3 Basic Linear Algebra Subprograms.
-*
-*     Technical Memorandum No.88 (Revision 1), Mathematics and
-*     Computer Science Division, Argonne National Laboratory, 9700
-*     South Cass Avenue, Argonne, Illinois 60439, US.
+*  -- Reference BLAS test routine (version 3.4.1) --
+*  -- Reference BLAS is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     April 2012
 *
-*  -- Written on 8-February-1989.
-*     Jack Dongarra, Argonne National Laboratory.
-*     Iain Duff, AERE Harwell.
-*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
-*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*  =====================================================================
 *
 *     .. Parameters ..
       INTEGER            NIN
@@ -53,8 +99,8 @@
       PARAMETER          ( NSUBS = 9 )
       COMPLEX            ZERO, ONE
       PARAMETER          ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) )
-      REAL               RZERO, RHALF, RONE
-      PARAMETER          ( RZERO = 0.0, RHALF = 0.5, RONE = 1.0 )
+      REAL               RZERO
+      PARAMETER          ( RZERO = 0.0 )
       INTEGER            NMAX
       PARAMETER          ( NMAX = 65 )
       INTEGER            NIDMAX, NALMAX, NBEMAX
@@ -103,7 +149,7 @@
 *
       READ( NIN, FMT = * )SUMMRY
       READ( NIN, FMT = * )NOUT
-      OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' )
+      OPEN( NOUT, FILE = SUMMRY )
       NOUTC = NOUT
 *
 *     Read name and unit number for snapshot output file and open file.
@@ -112,7 +158,7 @@
       READ( NIN, FMT = * )NTRA
       TRACE = NTRA.GE.0
       IF( TRACE )THEN
-         OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' )
+         OPEN( NTRA, FILE = SNAPS )
       END IF
 *     Read the flag that directs rewinding of the snapshot file.
       READ( NIN, FMT = * )REWI
@@ -189,14 +235,7 @@
 *
 *     Compute EPS (the machine precision).
 *
-      EPS = RONE
-   70 CONTINUE
-      IF( SDIFF( RONE + EPS, RONE ).EQ.RZERO )
-     $   GO TO 80
-      EPS = RHALF*EPS
-      GO TO 70
-   80 CONTINUE
-      EPS = EPS + EPS
+      EPS = EPSILON(RZERO)
       WRITE( NOUT, FMT = 9998 )EPS
 *
 *     Check the reliability of CMMCH using exact data.
@@ -1946,7 +1985,7 @@
 *
 *  Tests the error exits from the Level 3 Blas.
 *  Requires a special version of the error-handling routine XERBLA.
-*  ALPHA, RALPHA, BETA, RBETA, A, B and C should not need to be defined.
+*  A, B and C should not need to be defined.
 *
 *  Auxiliary routine for test program for Level 3 Blas.
 *
@@ -1956,12 +1995,19 @@
 *     Jeremy Du Croz, Numerical Algorithms Group Ltd.
 *     Sven Hammarling, Numerical Algorithms Group Ltd.
 *
+*  3-19-92:  Initialize ALPHA, BETA, RALPHA, and RBETA  (eca)
+*  3-19-92:  Fix argument 12 in calls to CSYMM and CHEMM
+*            with INFOT = 9  (eca)
+*
 *     .. Scalar Arguments ..
       INTEGER            ISNUM, NOUT
       CHARACTER*6        SRNAMT
 *     .. Scalars in Common ..
       INTEGER            INFOT, NOUTC
       LOGICAL            LERR, OK
+*     .. Parameters ..
+      REAL               ONE, TWO
+      PARAMETER          ( ONE = 1.0E0, TWO = 2.0E0 )
 *     .. Local Scalars ..
       COMPLEX            ALPHA, BETA
       REAL               RALPHA, RBETA
@@ -1979,6 +2025,14 @@
 *     LERR is set to .TRUE. by the special version of XERBLA each time
 *     it is called, and is then tested and re-set by CHKXER.
       LERR = .FALSE.
+*
+*     Initialize ALPHA, BETA, RALPHA, and RBETA.
+*
+      ALPHA = CMPLX( ONE, -ONE )
+      BETA = CMPLX( TWO, -TWO )
+      RALPHA = ONE
+      RBETA = TWO
+*
       GO TO ( 10, 20, 30, 40, 50, 60, 70, 80,
      $        90 )ISNUM
    10 INFOT = 1
@@ -2205,16 +2259,16 @@
       CALL CHEMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 9
-      CALL CHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 )
+      CALL CHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 9
-      CALL CHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 9
-      CALL CHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 )
+      CALL CHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 9
-      CALL CHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 12
       CALL CHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 )
@@ -2272,16 +2326,16 @@
       CALL CSYMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 9
-      CALL CSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 )
+      CALL CSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 9
-      CALL CSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 9
-      CALL CSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 )
+      CALL CSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 9
-      CALL CSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 12
       CALL CSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 )
@@ -3268,7 +3322,6 @@
    50    CONTINUE
       END IF
 *
-   60 CONTINUE
       LCERES = .TRUE.
       GO TO 80
    70 CONTINUE
diff --git a/blas/testing/dblat2.f b/blas/testing/dblat2.f
index 4002d4368..0fa80afa4 100644
--- a/blas/testing/dblat2.f
+++ b/blas/testing/dblat2.f
@@ -1,75 +1,121 @@
+*> \brief \b DBLAT2
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*  Definition:
+*  ===========
+*
+*       PROGRAM DBLAT2
+* 
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> Test program for the DOUBLE PRECISION Level 2 Blas.
+*>
+*> The program must be driven by a short data file. The first 18 records
+*> of the file are read using list-directed input, the last 16 records
+*> are read using the format ( A6, L2 ). An annotated example of a data
+*> file can be obtained by deleting the first 3 characters from the
+*> following 34 lines:
+*> 'dblat2.out'      NAME OF SUMMARY OUTPUT FILE
+*> 6                 UNIT NUMBER OF SUMMARY FILE
+*> 'DBLAT2.SNAP'     NAME OF SNAPSHOT OUTPUT FILE
+*> -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
+*> F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
+*> F        LOGICAL FLAG, T TO STOP ON FAILURES.
+*> T        LOGICAL FLAG, T TO TEST ERROR EXITS.
+*> 16.0     THRESHOLD VALUE OF TEST RATIO
+*> 6                 NUMBER OF VALUES OF N
+*> 0 1 2 3 5 9       VALUES OF N
+*> 4                 NUMBER OF VALUES OF K
+*> 0 1 2 4           VALUES OF K
+*> 4                 NUMBER OF VALUES OF INCX AND INCY
+*> 1 2 -1 -2         VALUES OF INCX AND INCY
+*> 3                 NUMBER OF VALUES OF ALPHA
+*> 0.0 1.0 0.7       VALUES OF ALPHA
+*> 3                 NUMBER OF VALUES OF BETA
+*> 0.0 1.0 0.9       VALUES OF BETAC
+*> DGEMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> DGBMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> DSYMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> DSBMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> DSPMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> DTRMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> DTBMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> DTPMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> DTRSV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> DTBSV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> DTPSV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> DGER   T PUT F FOR NO TEST. SAME COLUMNS.
+*> DSYR   T PUT F FOR NO TEST. SAME COLUMNS.
+*> DSPR   T PUT F FOR NO TEST. SAME COLUMNS.
+*> DSYR2  T PUT F FOR NO TEST. SAME COLUMNS.
+*> DSPR2  T PUT F FOR NO TEST. SAME COLUMNS.
+*>
+*> Further Details
+*> ===============
+*>
+*>    See:
+*>
+*>       Dongarra J. J., Du Croz J. J., Hammarling S.  and Hanson R. J..
+*>       An  extended  set of Fortran  Basic Linear Algebra Subprograms.
+*>
+*>       Technical  Memoranda  Nos. 41 (revision 3) and 81,  Mathematics
+*>       and  Computer Science  Division,  Argonne  National Laboratory,
+*>       9700 South Cass Avenue, Argonne, Illinois 60439, US.
+*>
+*>       Or
+*>
+*>       NAG  Technical Reports TR3/87 and TR4/87,  Numerical Algorithms
+*>       Group  Ltd.,  NAG  Central  Office,  256  Banbury  Road, Oxford
+*>       OX2 7DE, UK,  and  Numerical Algorithms Group Inc.,  1101  31st
+*>       Street,  Suite 100,  Downers Grove,  Illinois 60515-1263,  USA.
+*>
+*>
+*> -- Written on 10-August-1987.
+*>    Richard Hanson, Sandia National Labs.
+*>    Jeremy Du Croz, NAG Central Office.
+*>
+*>    10-9-00:  Change STATUS='NEW' to 'UNKNOWN' so that the testers
+*>              can be run multiple times without deleting generated
+*>              output files (susan)
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date April 2012
+*
+*> \ingroup double_blas_testing
+*
+*  =====================================================================
       PROGRAM DBLAT2
 *
-*  Test program for the DOUBLE PRECISION Level 2 Blas.
-*
-*  The program must be driven by a short data file. The first 18 records
-*  of the file are read using list-directed input, the last 16 records
-*  are read using the format ( A6, L2 ). An annotated example of a data
-*  file can be obtained by deleting the first 3 characters from the
-*  following 34 lines:
-*  'DBLAT2.SUMM'     NAME OF SUMMARY OUTPUT FILE
-*  6                 UNIT NUMBER OF SUMMARY FILE
-*  'DBLAT2.SNAP'     NAME OF SNAPSHOT OUTPUT FILE
-*  -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
-*  F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
-*  F        LOGICAL FLAG, T TO STOP ON FAILURES.
-*  T        LOGICAL FLAG, T TO TEST ERROR EXITS.
-*  16.0     THRESHOLD VALUE OF TEST RATIO
-*  6                 NUMBER OF VALUES OF N
-*  0 1 2 3 5 9       VALUES OF N
-*  4                 NUMBER OF VALUES OF K
-*  0 1 2 4           VALUES OF K
-*  4                 NUMBER OF VALUES OF INCX AND INCY
-*  1 2 -1 -2         VALUES OF INCX AND INCY
-*  3                 NUMBER OF VALUES OF ALPHA
-*  0.0 1.0 0.7       VALUES OF ALPHA
-*  3                 NUMBER OF VALUES OF BETA
-*  0.0 1.0 0.9       VALUES OF BETA
-*  DGEMV  T PUT F FOR NO TEST. SAME COLUMNS.
-*  DGBMV  T PUT F FOR NO TEST. SAME COLUMNS.
-*  DSYMV  T PUT F FOR NO TEST. SAME COLUMNS.
-*  DSBMV  T PUT F FOR NO TEST. SAME COLUMNS.
-*  DSPMV  T PUT F FOR NO TEST. SAME COLUMNS.
-*  DTRMV  T PUT F FOR NO TEST. SAME COLUMNS.
-*  DTBMV  T PUT F FOR NO TEST. SAME COLUMNS.
-*  DTPMV  T PUT F FOR NO TEST. SAME COLUMNS.
-*  DTRSV  T PUT F FOR NO TEST. SAME COLUMNS.
-*  DTBSV  T PUT F FOR NO TEST. SAME COLUMNS.
-*  DTPSV  T PUT F FOR NO TEST. SAME COLUMNS.
-*  DGER   T PUT F FOR NO TEST. SAME COLUMNS.
-*  DSYR   T PUT F FOR NO TEST. SAME COLUMNS.
-*  DSPR   T PUT F FOR NO TEST. SAME COLUMNS.
-*  DSYR2  T PUT F FOR NO TEST. SAME COLUMNS.
-*  DSPR2  T PUT F FOR NO TEST. SAME COLUMNS.
-*
-*     See:
-*
-*        Dongarra J. J., Du Croz J. J., Hammarling S.  and Hanson R. J..
-*        An  extended  set of Fortran  Basic Linear Algebra Subprograms.
-*
-*        Technical  Memoranda  Nos. 41 (revision 3) and 81,  Mathematics
-*        and  Computer Science  Division,  Argonne  National Laboratory,
-*        9700 South Cass Avenue, Argonne, Illinois 60439, US.
-*
-*        Or
-*
-*        NAG  Technical Reports TR3/87 and TR4/87,  Numerical Algorithms
-*        Group  Ltd.,  NAG  Central  Office,  256  Banbury  Road, Oxford
-*        OX2 7DE, UK,  and  Numerical Algorithms Group Inc.,  1101  31st
-*        Street,  Suite 100,  Downers Grove,  Illinois 60515-1263,  USA.
+*  -- Reference BLAS test routine (version 3.4.1) --
+*  -- Reference BLAS is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     April 2012
 *
-*
-*  -- Written on 10-August-1987.
-*     Richard Hanson, Sandia National Labs.
-*     Jeremy Du Croz, NAG Central Office.
+*  =====================================================================
 *
 *     .. Parameters ..
       INTEGER            NIN
       PARAMETER          ( NIN = 5 )
       INTEGER            NSUBS
       PARAMETER          ( NSUBS = 16 )
-      DOUBLE PRECISION   ZERO, HALF, ONE
-      PARAMETER          ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 )
+      DOUBLE PRECISION   ZERO, ONE
+      PARAMETER          ( ZERO = 0.0D0, ONE = 1.0D0 )
       INTEGER            NMAX, INCMAX
       PARAMETER          ( NMAX = 65, INCMAX = 2 )
       INTEGER            NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX
@@ -121,7 +167,7 @@
 *
       READ( NIN, FMT = * )SUMMRY
       READ( NIN, FMT = * )NOUT
-      OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' )
+      OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' )
       NOUTC = NOUT
 *
 *     Read name and unit number for snapshot output file and open file.
@@ -130,7 +176,7 @@
       READ( NIN, FMT = * )NTRA
       TRACE = NTRA.GE.0
       IF( TRACE )THEN
-         OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' )
+         OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' )
       END IF
 *     Read the flag that directs rewinding of the snapshot file.
       READ( NIN, FMT = * )REWI
@@ -235,14 +281,7 @@
 *
 *     Compute EPS (the machine precision).
 *
-      EPS = ONE
-   90 CONTINUE
-      IF( DDIFF( ONE + EPS, ONE ).EQ.ZERO )
-     $   GO TO 100
-      EPS = HALF*EPS
-      GO TO 90
-  100 CONTINUE
-      EPS = EPS + EPS
+      EPS = EPSILON(ZERO)
       WRITE( NOUT, FMT = 9998 )EPS
 *
 *     Check the reliability of DMVCH using exact data.
@@ -2982,7 +3021,6 @@
    50    CONTINUE
       END IF
 *
-   60 CONTINUE
       LDERES = .TRUE.
       GO TO 80
    70 CONTINUE
diff --git a/blas/testing/dblat3.f b/blas/testing/dblat3.f
index 082e03e5e..8d37c7453 100644
--- a/blas/testing/dblat3.f
+++ b/blas/testing/dblat3.f
@@ -1,55 +1,101 @@
+*> \brief \b DBLAT3
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*  Definition:
+*  ===========
+*
+*       PROGRAM DBLAT3
+* 
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> Test program for the DOUBLE PRECISION Level 3 Blas.
+*>
+*> The program must be driven by a short data file. The first 14 records
+*> of the file are read using list-directed input, the last 6 records
+*> are read using the format ( A6, L2 ). An annotated example of a data
+*> file can be obtained by deleting the first 3 characters from the
+*> following 20 lines:
+*> 'dblat3.out'      NAME OF SUMMARY OUTPUT FILE
+*> 6                 UNIT NUMBER OF SUMMARY FILE
+*> 'DBLAT3.SNAP'     NAME OF SNAPSHOT OUTPUT FILE
+*> -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
+*> F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
+*> F        LOGICAL FLAG, T TO STOP ON FAILURES.
+*> T        LOGICAL FLAG, T TO TEST ERROR EXITS.
+*> 16.0     THRESHOLD VALUE OF TEST RATIO
+*> 6                 NUMBER OF VALUES OF N
+*> 0 1 2 3 5 9       VALUES OF N
+*> 3                 NUMBER OF VALUES OF ALPHA
+*> 0.0 1.0 0.7       VALUES OF ALPHA
+*> 3                 NUMBER OF VALUES OF BETA
+*> 0.0 1.0 1.3       VALUES OF BETA
+*> DGEMM  T PUT F FOR NO TEST. SAME COLUMNS.
+*> DSYMM  T PUT F FOR NO TEST. SAME COLUMNS.
+*> DTRMM  T PUT F FOR NO TEST. SAME COLUMNS.
+*> DTRSM  T PUT F FOR NO TEST. SAME COLUMNS.
+*> DSYRK  T PUT F FOR NO TEST. SAME COLUMNS.
+*> DSYR2K T PUT F FOR NO TEST. SAME COLUMNS.
+*>
+*> Further Details
+*> ===============
+*>
+*> See:
+*>
+*>    Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S.
+*>    A Set of Level 3 Basic Linear Algebra Subprograms.
+*>
+*>    Technical Memorandum No.88 (Revision 1), Mathematics and
+*>    Computer Science Division, Argonne National Laboratory, 9700
+*>    South Cass Avenue, Argonne, Illinois 60439, US.
+*>
+*> -- Written on 8-February-1989.
+*>    Jack Dongarra, Argonne National Laboratory.
+*>    Iain Duff, AERE Harwell.
+*>    Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*>    Sven Hammarling, Numerical Algorithms Group Ltd.
+*>
+*>    10-9-00:  Change STATUS='NEW' to 'UNKNOWN' so that the testers
+*>              can be run multiple times without deleting generated
+*>              output files (susan)
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date April 2012
+*
+*> \ingroup double_blas_testing
+*
+*  =====================================================================
       PROGRAM DBLAT3
 *
-*  Test program for the DOUBLE PRECISION Level 3 Blas.
-*
-*  The program must be driven by a short data file. The first 14 records
-*  of the file are read using list-directed input, the last 6 records
-*  are read using the format ( A6, L2 ). An annotated example of a data
-*  file can be obtained by deleting the first 3 characters from the
-*  following 20 lines:
-*  'DBLAT3.SUMM'     NAME OF SUMMARY OUTPUT FILE
-*  6                 UNIT NUMBER OF SUMMARY FILE
-*  'DBLAT3.SNAP'     NAME OF SNAPSHOT OUTPUT FILE
-*  -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
-*  F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
-*  F        LOGICAL FLAG, T TO STOP ON FAILURES.
-*  T        LOGICAL FLAG, T TO TEST ERROR EXITS.
-*  16.0     THRESHOLD VALUE OF TEST RATIO
-*  6                 NUMBER OF VALUES OF N
-*  0 1 2 3 5 9       VALUES OF N
-*  3                 NUMBER OF VALUES OF ALPHA
-*  0.0 1.0 0.7       VALUES OF ALPHA
-*  3                 NUMBER OF VALUES OF BETA
-*  0.0 1.0 1.3       VALUES OF BETA
-*  DGEMM  T PUT F FOR NO TEST. SAME COLUMNS.
-*  DSYMM  T PUT F FOR NO TEST. SAME COLUMNS.
-*  DTRMM  T PUT F FOR NO TEST. SAME COLUMNS.
-*  DTRSM  T PUT F FOR NO TEST. SAME COLUMNS.
-*  DSYRK  T PUT F FOR NO TEST. SAME COLUMNS.
-*  DSYR2K T PUT F FOR NO TEST. SAME COLUMNS.
-*
-*  See:
-*
-*     Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S.
-*     A Set of Level 3 Basic Linear Algebra Subprograms.
-*
-*     Technical Memorandum No.88 (Revision 1), Mathematics and
-*     Computer Science Division, Argonne National Laboratory, 9700
-*     South Cass Avenue, Argonne, Illinois 60439, US.
+*  -- Reference BLAS test routine (version 3.4.1) --
+*  -- Reference BLAS is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     April 2012
 *
-*  -- Written on 8-February-1989.
-*     Jack Dongarra, Argonne National Laboratory.
-*     Iain Duff, AERE Harwell.
-*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
-*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*  =====================================================================
 *
 *     .. Parameters ..
       INTEGER            NIN
       PARAMETER          ( NIN = 5 )
       INTEGER            NSUBS
       PARAMETER          ( NSUBS = 6 )
-      DOUBLE PRECISION   ZERO, HALF, ONE
-      PARAMETER          ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 )
+      DOUBLE PRECISION   ZERO, ONE
+      PARAMETER          ( ZERO = 0.0D0, ONE = 1.0D0 )
       INTEGER            NMAX
       PARAMETER          ( NMAX = 65 )
       INTEGER            NIDMAX, NALMAX, NBEMAX
@@ -96,7 +142,7 @@
 *
       READ( NIN, FMT = * )SUMMRY
       READ( NIN, FMT = * )NOUT
-      OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' )
+      OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' )
       NOUTC = NOUT
 *
 *     Read name and unit number for snapshot output file and open file.
@@ -105,7 +151,7 @@
       READ( NIN, FMT = * )NTRA
       TRACE = NTRA.GE.0
       IF( TRACE )THEN
-         OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' )
+         OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' )
       END IF
 *     Read the flag that directs rewinding of the snapshot file.
       READ( NIN, FMT = * )REWI
@@ -182,14 +228,7 @@
 *
 *     Compute EPS (the machine precision).
 *
-      EPS = ONE
-   70 CONTINUE
-      IF( DDIFF( ONE + EPS, ONE ).EQ.ZERO )
-     $   GO TO 80
-      EPS = HALF*EPS
-      GO TO 70
-   80 CONTINUE
-      EPS = EPS + EPS
+      EPS = EPSILON(ZERO)
       WRITE( NOUT, FMT = 9998 )EPS
 *
 *     Check the reliability of DMMCH using exact data.
@@ -1802,7 +1841,7 @@
 *
 *  Tests the error exits from the Level 3 Blas.
 *  Requires a special version of the error-handling routine XERBLA.
-*  ALPHA, BETA, A, B and C should not need to be defined.
+*  A, B and C should not need to be defined.
 *
 *  Auxiliary routine for test program for Level 3 Blas.
 *
@@ -1812,12 +1851,18 @@
 *     Jeremy Du Croz, Numerical Algorithms Group Ltd.
 *     Sven Hammarling, Numerical Algorithms Group Ltd.
 *
+*  3-19-92:  Initialize ALPHA and BETA  (eca)
+*  3-19-92:  Fix argument 12 in calls to SSYMM with INFOT = 9  (eca)
+*
 *     .. Scalar Arguments ..
       INTEGER            ISNUM, NOUT
       CHARACTER*6        SRNAMT
 *     .. Scalars in Common ..
       INTEGER            INFOT, NOUTC
       LOGICAL            LERR, OK
+*     .. Parameters ..
+      DOUBLE PRECISION   ONE, TWO
+      PARAMETER          ( ONE = 1.0D0, TWO = 2.0D0 )
 *     .. Local Scalars ..
       DOUBLE PRECISION   ALPHA, BETA
 *     .. Local Arrays ..
@@ -1834,6 +1879,12 @@
 *     LERR is set to .TRUE. by the special version of XERBLA each time
 *     it is called, and is then tested and re-set by CHKXER.
       LERR = .FALSE.
+*
+*     Initialize ALPHA and BETA.
+*
+      ALPHA = ONE
+      BETA = TWO
+*
       GO TO ( 10, 20, 30, 40, 50, 60 )ISNUM
    10 INFOT = 1
       CALL DGEMM( '/', 'N', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
@@ -1963,16 +2014,16 @@
       CALL DSYMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 9
-      CALL DSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 )
+      CALL DSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 9
-      CALL DSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL DSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 9
-      CALL DSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 )
+      CALL DSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 9
-      CALL DSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL DSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 12
       CALL DSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 )
@@ -2660,7 +2711,6 @@
    50    CONTINUE
       END IF
 *
-   60 CONTINUE
       LDERES = .TRUE.
       GO TO 80
    70 CONTINUE
diff --git a/blas/testing/sblat2.f b/blas/testing/sblat2.f
index 057a85429..71605ed31 100644
--- a/blas/testing/sblat2.f
+++ b/blas/testing/sblat2.f
@@ -1,75 +1,121 @@
+*> \brief \b SBLAT2
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*  Definition:
+*  ===========
+*
+*       PROGRAM SBLAT2
+* 
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> Test program for the REAL Level 2 Blas.
+*>
+*> The program must be driven by a short data file. The first 18 records
+*> of the file are read using list-directed input, the last 16 records
+*> are read using the format ( A6, L2 ). An annotated example of a data
+*> file can be obtained by deleting the first 3 characters from the
+*> following 34 lines:
+*> 'sblat2.out'      NAME OF SUMMARY OUTPUT FILE
+*> 6                 UNIT NUMBER OF SUMMARY FILE
+*> 'SBLAT2.SNAP'     NAME OF SNAPSHOT OUTPUT FILE
+*> -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
+*> F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
+*> F        LOGICAL FLAG, T TO STOP ON FAILURES.
+*> T        LOGICAL FLAG, T TO TEST ERROR EXITS.
+*> 16.0     THRESHOLD VALUE OF TEST RATIO
+*> 6                 NUMBER OF VALUES OF N
+*> 0 1 2 3 5 9       VALUES OF N
+*> 4                 NUMBER OF VALUES OF K
+*> 0 1 2 4           VALUES OF K
+*> 4                 NUMBER OF VALUES OF INCX AND INCY
+*> 1 2 -1 -2         VALUES OF INCX AND INCY
+*> 3                 NUMBER OF VALUES OF ALPHA
+*> 0.0 1.0 0.7       VALUES OF ALPHA
+*> 3                 NUMBER OF VALUES OF BETA
+*> 0.0 1.0 0.9       VALUES OF BETA
+*> SGEMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> SGBMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> SSYMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> SSBMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> SSPMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> STRMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> STBMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> STPMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> STRSV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> STBSV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> STPSV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> SGER   T PUT F FOR NO TEST. SAME COLUMNS.
+*> SSYR   T PUT F FOR NO TEST. SAME COLUMNS.
+*> SSPR   T PUT F FOR NO TEST. SAME COLUMNS.
+*> SSYR2  T PUT F FOR NO TEST. SAME COLUMNS.
+*> SSPR2  T PUT F FOR NO TEST. SAME COLUMNS.
+*>
+*> Further Details
+*> ===============
+*>
+*>    See:
+*>
+*>       Dongarra J. J., Du Croz J. J., Hammarling S.  and Hanson R. J..
+*>       An  extended  set of Fortran  Basic Linear Algebra Subprograms.
+*>
+*>       Technical  Memoranda  Nos. 41 (revision 3) and 81,  Mathematics
+*>       and  Computer Science  Division,  Argonne  National Laboratory,
+*>       9700 South Cass Avenue, Argonne, Illinois 60439, US.
+*>
+*>       Or
+*>
+*>       NAG  Technical Reports TR3/87 and TR4/87,  Numerical Algorithms
+*>       Group  Ltd.,  NAG  Central  Office,  256  Banbury  Road, Oxford
+*>       OX2 7DE, UK,  and  Numerical Algorithms Group Inc.,  1101  31st
+*>       Street,  Suite 100,  Downers Grove,  Illinois 60515-1263,  USA.
+*>
+*>
+*> -- Written on 10-August-1987.
+*>    Richard Hanson, Sandia National Labs.
+*>    Jeremy Du Croz, NAG Central Office.
+*>
+*>    10-9-00:  Change STATUS='NEW' to 'UNKNOWN' so that the testers
+*>              can be run multiple times without deleting generated
+*>              output files (susan)
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date April 2012
+*
+*> \ingroup single_blas_testing
+*
+*  =====================================================================
       PROGRAM SBLAT2
 *
-*  Test program for the REAL             Level 2 Blas.
-*
-*  The program must be driven by a short data file. The first 18 records
-*  of the file are read using list-directed input, the last 16 records
-*  are read using the format ( A6, L2 ). An annotated example of a data
-*  file can be obtained by deleting the first 3 characters from the
-*  following 34 lines:
-*  'SBLAT2.SUMM'     NAME OF SUMMARY OUTPUT FILE
-*  6                 UNIT NUMBER OF SUMMARY FILE
-*  'SBLAT2.SNAP'     NAME OF SNAPSHOT OUTPUT FILE
-*  -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
-*  F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
-*  F        LOGICAL FLAG, T TO STOP ON FAILURES.
-*  T        LOGICAL FLAG, T TO TEST ERROR EXITS.
-*  16.0     THRESHOLD VALUE OF TEST RATIO
-*  6                 NUMBER OF VALUES OF N
-*  0 1 2 3 5 9       VALUES OF N
-*  4                 NUMBER OF VALUES OF K
-*  0 1 2 4           VALUES OF K
-*  4                 NUMBER OF VALUES OF INCX AND INCY
-*  1 2 -1 -2         VALUES OF INCX AND INCY
-*  3                 NUMBER OF VALUES OF ALPHA
-*  0.0 1.0 0.7       VALUES OF ALPHA
-*  3                 NUMBER OF VALUES OF BETA
-*  0.0 1.0 0.9       VALUES OF BETA
-*  SGEMV  T PUT F FOR NO TEST. SAME COLUMNS.
-*  SGBMV  T PUT F FOR NO TEST. SAME COLUMNS.
-*  SSYMV  T PUT F FOR NO TEST. SAME COLUMNS.
-*  SSBMV  T PUT F FOR NO TEST. SAME COLUMNS.
-*  SSPMV  T PUT F FOR NO TEST. SAME COLUMNS.
-*  STRMV  T PUT F FOR NO TEST. SAME COLUMNS.
-*  STBMV  T PUT F FOR NO TEST. SAME COLUMNS.
-*  STPMV  T PUT F FOR NO TEST. SAME COLUMNS.
-*  STRSV  T PUT F FOR NO TEST. SAME COLUMNS.
-*  STBSV  T PUT F FOR NO TEST. SAME COLUMNS.
-*  STPSV  T PUT F FOR NO TEST. SAME COLUMNS.
-*  SGER   T PUT F FOR NO TEST. SAME COLUMNS.
-*  SSYR   T PUT F FOR NO TEST. SAME COLUMNS.
-*  SSPR   T PUT F FOR NO TEST. SAME COLUMNS.
-*  SSYR2  T PUT F FOR NO TEST. SAME COLUMNS.
-*  SSPR2  T PUT F FOR NO TEST. SAME COLUMNS.
-*
-*     See:
-*
-*        Dongarra J. J., Du Croz J. J., Hammarling S.  and Hanson R. J..
-*        An  extended  set of Fortran  Basic Linear Algebra Subprograms.
-*
-*        Technical  Memoranda  Nos. 41 (revision 3) and 81,  Mathematics
-*        and  Computer Science  Division,  Argonne  National Laboratory,
-*        9700 South Cass Avenue, Argonne, Illinois 60439, US.
-*
-*        Or
-*
-*        NAG  Technical Reports TR3/87 and TR4/87,  Numerical Algorithms
-*        Group  Ltd.,  NAG  Central  Office,  256  Banbury  Road, Oxford
-*        OX2 7DE, UK,  and  Numerical Algorithms Group Inc.,  1101  31st
-*        Street,  Suite 100,  Downers Grove,  Illinois 60515-1263,  USA.
+*  -- Reference BLAS test routine (version 3.4.1) --
+*  -- Reference BLAS is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     April 2012
 *
-*
-*  -- Written on 10-August-1987.
-*     Richard Hanson, Sandia National Labs.
-*     Jeremy Du Croz, NAG Central Office.
+*  =====================================================================
 *
 *     .. Parameters ..
       INTEGER            NIN
       PARAMETER          ( NIN = 5 )
       INTEGER            NSUBS
       PARAMETER          ( NSUBS = 16 )
-      REAL               ZERO, HALF, ONE
-      PARAMETER          ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 )
+      REAL               ZERO, ONE
+      PARAMETER          ( ZERO = 0.0, ONE = 1.0 )
       INTEGER            NMAX, INCMAX
       PARAMETER          ( NMAX = 65, INCMAX = 2 )
       INTEGER            NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX
@@ -121,7 +167,7 @@
 *
       READ( NIN, FMT = * )SUMMRY
       READ( NIN, FMT = * )NOUT
-      OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' )
+      OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' )
       NOUTC = NOUT
 *
 *     Read name and unit number for snapshot output file and open file.
@@ -130,7 +176,7 @@
       READ( NIN, FMT = * )NTRA
       TRACE = NTRA.GE.0
       IF( TRACE )THEN
-         OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' )
+         OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' )
       END IF
 *     Read the flag that directs rewinding of the snapshot file.
       READ( NIN, FMT = * )REWI
@@ -235,14 +281,7 @@
 *
 *     Compute EPS (the machine precision).
 *
-      EPS = ONE
-   90 CONTINUE
-      IF( SDIFF( ONE + EPS, ONE ).EQ.ZERO )
-     $   GO TO 100
-      EPS = HALF*EPS
-      GO TO 90
-  100 CONTINUE
-      EPS = EPS + EPS
+      EPS = EPSILON(ZERO)
       WRITE( NOUT, FMT = 9998 )EPS
 *
 *     Check the reliability of SMVCH using exact data.
@@ -2982,7 +3021,6 @@
    50    CONTINUE
       END IF
 *
-   60 CONTINUE
       LSERES = .TRUE.
       GO TO 80
    70 CONTINUE
diff --git a/blas/testing/sblat3.f b/blas/testing/sblat3.f
index 325a9eb92..879269633 100644
--- a/blas/testing/sblat3.f
+++ b/blas/testing/sblat3.f
@@ -1,55 +1,101 @@
+*> \brief \b SBLAT3
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*  Definition:
+*  ===========
+*
+*       PROGRAM SBLAT3
+* 
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> Test program for the REAL             Level 3 Blas.
+*>
+*> The program must be driven by a short data file. The first 14 records
+*> of the file are read using list-directed input, the last 6 records
+*> are read using the format ( A6, L2 ). An annotated example of a data
+*> file can be obtained by deleting the first 3 characters from the
+*> following 20 lines:
+*> 'sblat3.out'      NAME OF SUMMARY OUTPUT FILE
+*> 6                 UNIT NUMBER OF SUMMARY FILE
+*> 'SBLAT3.SNAP'     NAME OF SNAPSHOT OUTPUT FILE
+*> -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
+*> F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
+*> F        LOGICAL FLAG, T TO STOP ON FAILURES.
+*> T        LOGICAL FLAG, T TO TEST ERROR EXITS.
+*> 16.0     THRESHOLD VALUE OF TEST RATIO
+*> 6                 NUMBER OF VALUES OF N
+*> 0 1 2 3 5 9       VALUES OF N
+*> 3                 NUMBER OF VALUES OF ALPHA
+*> 0.0 1.0 0.7       VALUES OF ALPHA
+*> 3                 NUMBER OF VALUES OF BETA
+*> 0.0 1.0 1.3       VALUES OF BETA
+*> SGEMM  T PUT F FOR NO TEST. SAME COLUMNS.
+*> SSYMM  T PUT F FOR NO TEST. SAME COLUMNS.
+*> STRMM  T PUT F FOR NO TEST. SAME COLUMNS.
+*> STRSM  T PUT F FOR NO TEST. SAME COLUMNS.
+*> SSYRK  T PUT F FOR NO TEST. SAME COLUMNS.
+*> SSYR2K T PUT F FOR NO TEST. SAME COLUMNS.
+*>
+*> Further Details
+*> ===============
+*>
+*> See:
+*>
+*>    Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S.
+*>    A Set of Level 3 Basic Linear Algebra Subprograms.
+*>
+*>    Technical Memorandum No.88 (Revision 1), Mathematics and
+*>    Computer Science Division, Argonne National Laboratory, 9700
+*>    South Cass Avenue, Argonne, Illinois 60439, US.
+*>
+*> -- Written on 8-February-1989.
+*>    Jack Dongarra, Argonne National Laboratory.
+*>    Iain Duff, AERE Harwell.
+*>    Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*>    Sven Hammarling, Numerical Algorithms Group Ltd.
+*>
+*>    10-9-00:  Change STATUS='NEW' to 'UNKNOWN' so that the testers
+*>              can be run multiple times without deleting generated
+*>              output files (susan)
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date April 2012
+*
+*> \ingroup single_blas_testing
+*
+*  =====================================================================
       PROGRAM SBLAT3
 *
-*  Test program for the REAL             Level 3 Blas.
-*
-*  The program must be driven by a short data file. The first 14 records
-*  of the file are read using list-directed input, the last 6 records
-*  are read using the format ( A6, L2 ). An annotated example of a data
-*  file can be obtained by deleting the first 3 characters from the
-*  following 20 lines:
-*  'SBLAT3.SUMM'     NAME OF SUMMARY OUTPUT FILE
-*  6                 UNIT NUMBER OF SUMMARY FILE
-*  'SBLAT3.SNAP'     NAME OF SNAPSHOT OUTPUT FILE
-*  -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
-*  F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
-*  F        LOGICAL FLAG, T TO STOP ON FAILURES.
-*  T        LOGICAL FLAG, T TO TEST ERROR EXITS.
-*  16.0     THRESHOLD VALUE OF TEST RATIO
-*  6                 NUMBER OF VALUES OF N
-*  0 1 2 3 5 9       VALUES OF N
-*  3                 NUMBER OF VALUES OF ALPHA
-*  0.0 1.0 0.7       VALUES OF ALPHA
-*  3                 NUMBER OF VALUES OF BETA
-*  0.0 1.0 1.3       VALUES OF BETA
-*  SGEMM  T PUT F FOR NO TEST. SAME COLUMNS.
-*  SSYMM  T PUT F FOR NO TEST. SAME COLUMNS.
-*  STRMM  T PUT F FOR NO TEST. SAME COLUMNS.
-*  STRSM  T PUT F FOR NO TEST. SAME COLUMNS.
-*  SSYRK  T PUT F FOR NO TEST. SAME COLUMNS.
-*  SSYR2K T PUT F FOR NO TEST. SAME COLUMNS.
-*
-*  See:
-*
-*     Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S.
-*     A Set of Level 3 Basic Linear Algebra Subprograms.
-*
-*     Technical Memorandum No.88 (Revision 1), Mathematics and
-*     Computer Science Division, Argonne National Laboratory, 9700
-*     South Cass Avenue, Argonne, Illinois 60439, US.
+*  -- Reference BLAS test routine (version 3.4.1) --
+*  -- Reference BLAS is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     April 2012
 *
-*  -- Written on 8-February-1989.
-*     Jack Dongarra, Argonne National Laboratory.
-*     Iain Duff, AERE Harwell.
-*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
-*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*  =====================================================================
 *
 *     .. Parameters ..
       INTEGER            NIN
       PARAMETER          ( NIN = 5 )
       INTEGER            NSUBS
       PARAMETER          ( NSUBS = 6 )
-      REAL               ZERO, HALF, ONE
-      PARAMETER          ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 )
+      REAL               ZERO, ONE
+      PARAMETER          ( ZERO = 0.0, ONE = 1.0 )
       INTEGER            NMAX
       PARAMETER          ( NMAX = 65 )
       INTEGER            NIDMAX, NALMAX, NBEMAX
@@ -96,7 +142,7 @@
 *
       READ( NIN, FMT = * )SUMMRY
       READ( NIN, FMT = * )NOUT
-      OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' )
+      OPEN( NOUT, FILE = SUMMRY )
       NOUTC = NOUT
 *
 *     Read name and unit number for snapshot output file and open file.
@@ -105,7 +151,7 @@
       READ( NIN, FMT = * )NTRA
       TRACE = NTRA.GE.0
       IF( TRACE )THEN
-         OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' )
+         OPEN( NTRA, FILE = SNAPS )
       END IF
 *     Read the flag that directs rewinding of the snapshot file.
       READ( NIN, FMT = * )REWI
@@ -182,14 +228,7 @@
 *
 *     Compute EPS (the machine precision).
 *
-      EPS = ONE
-   70 CONTINUE
-      IF( SDIFF( ONE + EPS, ONE ).EQ.ZERO )
-     $   GO TO 80
-      EPS = HALF*EPS
-      GO TO 70
-   80 CONTINUE
-      EPS = EPS + EPS
+      EPS = EPSILON(ZERO)
       WRITE( NOUT, FMT = 9998 )EPS
 *
 *     Check the reliability of SMMCH using exact data.
@@ -1802,7 +1841,7 @@
 *
 *  Tests the error exits from the Level 3 Blas.
 *  Requires a special version of the error-handling routine XERBLA.
-*  ALPHA, BETA, A, B and C should not need to be defined.
+*  A, B and C should not need to be defined.
 *
 *  Auxiliary routine for test program for Level 3 Blas.
 *
@@ -1812,12 +1851,18 @@
 *     Jeremy Du Croz, Numerical Algorithms Group Ltd.
 *     Sven Hammarling, Numerical Algorithms Group Ltd.
 *
+*  3-19-92:  Initialize ALPHA and BETA  (eca)
+*  3-19-92:  Fix argument 12 in calls to SSYMM with INFOT = 9  (eca)
+*
 *     .. Scalar Arguments ..
       INTEGER            ISNUM, NOUT
       CHARACTER*6        SRNAMT
 *     .. Scalars in Common ..
       INTEGER            INFOT, NOUTC
       LOGICAL            LERR, OK
+*     .. Parameters ..
+      REAL               ONE, TWO
+      PARAMETER          ( ONE = 1.0E0, TWO = 2.0E0 )
 *     .. Local Scalars ..
       REAL               ALPHA, BETA
 *     .. Local Arrays ..
@@ -1834,6 +1879,12 @@
 *     LERR is set to .TRUE. by the special version of XERBLA each time
 *     it is called, and is then tested and re-set by CHKXER.
       LERR = .FALSE.
+*
+*     Initialize ALPHA and BETA.
+*
+      ALPHA = ONE
+      BETA = TWO
+*
       GO TO ( 10, 20, 30, 40, 50, 60 )ISNUM
    10 INFOT = 1
       CALL SGEMM( '/', 'N', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
@@ -1963,16 +2014,16 @@
       CALL SSYMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 9
-      CALL SSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 )
+      CALL SSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 9
-      CALL SSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL SSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 9
-      CALL SSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 )
+      CALL SSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 9
-      CALL SSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL SSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 12
       CALL SSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 )
@@ -2660,7 +2711,6 @@
    50    CONTINUE
       END IF
 *
-   60 CONTINUE
       LSERES = .TRUE.
       GO TO 80
    70 CONTINUE
diff --git a/blas/testing/zblat1.f b/blas/testing/zblat1.f
index e2415e1c4..d30112c63 100644
--- a/blas/testing/zblat1.f
+++ b/blas/testing/zblat1.f
@@ -1,7 +1,49 @@
+*> \brief \b ZBLAT1
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*  Definition:
+*  ===========
+*
+*       PROGRAM ZBLAT1
+* 
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*>    Test program for the COMPLEX*16 Level 1 BLAS.
+*>
+*>    Based upon the original BLAS test routine together with:
+*>    F06GAF Example Program Text
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date April 2012
+*
+*> \ingroup complex16_blas_testing
+*
+*  =====================================================================
       PROGRAM ZBLAT1
-*     Test program for the COMPLEX*16 Level 1 BLAS.
-*     Based upon the original BLAS test routine together with:
-*     F06GAF Example Program Text
+*
+*  -- Reference BLAS test routine (version 3.4.1) --
+*  -- Reference BLAS is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     April 2012
+*
+*  =====================================================================
+*
 *     .. Parameters ..
       INTEGER          NOUT
       PARAMETER        (NOUT=6)
@@ -114,8 +156,8 @@
      +                  (5.0D0,6.0D0), (5.0D0,6.0D0), (0.1D0,0.1D0),
      +                  (-0.6D0,0.1D0), (0.1D0,-0.3D0), (7.0D0,8.0D0),
      +                  (7.0D0,8.0D0), (7.0D0,8.0D0), (7.0D0,8.0D0),
-     +                  (7.0D0,8.0D0), (0.3D0,0.1D0), (0.1D0,0.4D0),
-     +                  (0.4D0,0.1D0), (0.1D0,0.2D0), (2.0D0,3.0D0),
+     +                  (7.0D0,8.0D0), (0.3D0,0.1D0), (0.5D0,0.0D0),
+     +                  (0.0D0,0.5D0), (0.0D0,0.2D0), (2.0D0,3.0D0),
      +                  (2.0D0,3.0D0), (2.0D0,3.0D0), (2.0D0,3.0D0)/
       DATA              ((CV(I,J,2),I=1,8),J=1,5)/(0.1D0,0.1D0),
      +                  (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0),
@@ -129,10 +171,10 @@
      +                  (3.0D0,6.0D0), (-0.6D0,0.1D0), (4.0D0,7.0D0),
      +                  (0.1D0,-0.3D0), (7.0D0,2.0D0), (7.0D0,2.0D0),
      +                  (7.0D0,2.0D0), (0.3D0,0.1D0), (5.0D0,8.0D0),
-     +                  (0.1D0,0.4D0), (6.0D0,9.0D0), (0.4D0,0.1D0),
-     +                  (8.0D0,3.0D0), (0.1D0,0.2D0), (9.0D0,4.0D0)/
-      DATA              STRUE2/0.0D0, 0.5D0, 0.6D0, 0.7D0, 0.7D0/
-      DATA              STRUE4/0.0D0, 0.7D0, 1.0D0, 1.3D0, 1.7D0/
+     +                  (0.5D0,0.0D0), (6.0D0,9.0D0), (0.0D0,0.5D0),
+     +                  (8.0D0,3.0D0), (0.0D0,0.2D0), (9.0D0,4.0D0)/
+      DATA              STRUE2/0.0D0, 0.5D0, 0.6D0, 0.7D0, 0.8D0/
+      DATA              STRUE4/0.0D0, 0.7D0, 1.0D0, 1.3D0, 1.6D0/
       DATA              ((CTRUE5(I,J,1),I=1,8),J=1,5)/(0.1D0,0.1D0),
      +                  (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0),
      +                  (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0),
@@ -145,8 +187,8 @@
      +                  (0.11D0,-0.03D0), (-0.17D0,0.46D0),
      +                  (-0.17D0,-0.19D0), (7.0D0,8.0D0), (7.0D0,8.0D0),
      +                  (7.0D0,8.0D0), (7.0D0,8.0D0), (7.0D0,8.0D0),
-     +                  (0.19D0,-0.17D0), (0.32D0,0.09D0),
-     +                  (0.23D0,-0.24D0), (0.18D0,0.01D0),
+     +                  (0.19D0,-0.17D0), (0.20D0,-0.35D0),
+     +                  (0.35D0,0.20D0), (0.14D0,0.08D0),
      +                  (2.0D0,3.0D0), (2.0D0,3.0D0), (2.0D0,3.0D0),
      +                  (2.0D0,3.0D0)/
       DATA              ((CTRUE5(I,J,2),I=1,8),J=1,5)/(0.1D0,0.1D0),
@@ -162,9 +204,9 @@
      +                  (-0.17D0,0.46D0), (4.0D0,7.0D0),
      +                  (-0.17D0,-0.19D0), (7.0D0,2.0D0), (7.0D0,2.0D0),
      +                  (7.0D0,2.0D0), (0.19D0,-0.17D0), (5.0D0,8.0D0),
-     +                  (0.32D0,0.09D0), (6.0D0,9.0D0),
-     +                  (0.23D0,-0.24D0), (8.0D0,3.0D0),
-     +                  (0.18D0,0.01D0), (9.0D0,4.0D0)/
+     +                  (0.20D0,-0.35D0), (6.0D0,9.0D0),
+     +                  (0.35D0,0.20D0), (8.0D0,3.0D0),
+     +                  (0.14D0,0.08D0), (9.0D0,4.0D0)/
       DATA              ((CTRUE6(I,J,1),I=1,8),J=1,5)/(0.1D0,0.1D0),
      +                  (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0),
      +                  (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0),
@@ -177,8 +219,8 @@
      +                  (0.03D0,0.03D0), (-0.18D0,0.03D0),
      +                  (0.03D0,-0.09D0), (7.0D0,8.0D0), (7.0D0,8.0D0),
      +                  (7.0D0,8.0D0), (7.0D0,8.0D0), (7.0D0,8.0D0),
-     +                  (0.09D0,0.03D0), (0.03D0,0.12D0),
-     +                  (0.12D0,0.03D0), (0.03D0,0.06D0), (2.0D0,3.0D0),
+     +                  (0.09D0,0.03D0), (0.15D0,0.00D0),
+     +                  (0.00D0,0.15D0), (0.00D0,0.06D0), (2.0D0,3.0D0),
      +                  (2.0D0,3.0D0), (2.0D0,3.0D0), (2.0D0,3.0D0)/
       DATA              ((CTRUE6(I,J,2),I=1,8),J=1,5)/(0.1D0,0.1D0),
      +                  (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0),
@@ -193,8 +235,8 @@
      +                  (-0.18D0,0.03D0), (4.0D0,7.0D0),
      +                  (0.03D0,-0.09D0), (7.0D0,2.0D0), (7.0D0,2.0D0),
      +                  (7.0D0,2.0D0), (0.09D0,0.03D0), (5.0D0,8.0D0),
-     +                  (0.03D0,0.12D0), (6.0D0,9.0D0), (0.12D0,0.03D0),
-     +                  (8.0D0,3.0D0), (0.03D0,0.06D0), (9.0D0,4.0D0)/
+     +                  (0.15D0,0.00D0), (6.0D0,9.0D0), (0.00D0,0.15D0),
+     +                  (8.0D0,3.0D0), (0.00D0,0.06D0), (9.0D0,4.0D0)/
       DATA              ITRUE3/0, 1, 2, 2, 2/
 *     .. Executable Statements ..
       DO 60 INCX = 1, 2
@@ -529,7 +571,8 @@
 *
 *     .. Parameters ..
       INTEGER          NOUT
-      PARAMETER        (NOUT=6)
+      DOUBLE PRECISION ZERO
+      PARAMETER        (NOUT=6, ZERO=0.0D0)
 *     .. Scalar Arguments ..
       DOUBLE PRECISION SFAC
       INTEGER          LEN
@@ -552,7 +595,7 @@
 *
       DO 40 I = 1, LEN
          SD = SCOMP(I) - STRUE(I)
-         IF (SDIFF(ABS(SSIZE(I))+ABS(SFAC*SD),ABS(SSIZE(I))).EQ.0.0D0)
+         IF (ABS(SFAC*SD) .LE. ABS(SSIZE(I))*EPSILON(ZERO))
      +       GO TO 40
 *
 *                             HERE    SCOMP(I) IS NOT CLOSE TO STRUE(I).
diff --git a/blas/testing/zblat2.f b/blas/testing/zblat2.f
index e65cdcc70..53129a11e 100644
--- a/blas/testing/zblat2.f
+++ b/blas/testing/zblat2.f
@@ -1,68 +1,114 @@
+*> \brief \b ZBLAT2
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*  Definition:
+*  ===========
+*
+*       PROGRAM ZBLAT2
+* 
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> Test program for the COMPLEX*16       Level 2 Blas.
+*>
+*> The program must be driven by a short data file. The first 18 records
+*> of the file are read using list-directed input, the last 17 records
+*> are read using the format ( A6, L2 ). An annotated example of a data
+*> file can be obtained by deleting the first 3 characters from the
+*> following 35 lines:
+*> 'zblat2.out'      NAME OF SUMMARY OUTPUT FILE
+*> 6                 UNIT NUMBER OF SUMMARY FILE
+*> 'CBLA2T.SNAP'     NAME OF SNAPSHOT OUTPUT FILE
+*> -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
+*> F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
+*> F        LOGICAL FLAG, T TO STOP ON FAILURES.
+*> T        LOGICAL FLAG, T TO TEST ERROR EXITS.
+*> 16.0     THRESHOLD VALUE OF TEST RATIO
+*> 6                 NUMBER OF VALUES OF N
+*> 0 1 2 3 5 9       VALUES OF N
+*> 4                 NUMBER OF VALUES OF K
+*> 0 1 2 4           VALUES OF K
+*> 4                 NUMBER OF VALUES OF INCX AND INCY
+*> 1 2 -1 -2         VALUES OF INCX AND INCY
+*> 3                 NUMBER OF VALUES OF ALPHA
+*> (0.0,0.0) (1.0,0.0) (0.7,-0.9)       VALUES OF ALPHA
+*> 3                 NUMBER OF VALUES OF BETA
+*> (0.0,0.0) (1.0,0.0) (1.3,-1.1)       VALUES OF BETA
+*> ZGEMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> ZGBMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> ZHEMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> ZHBMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> ZHPMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> ZTRMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> ZTBMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> ZTPMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> ZTRSV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> ZTBSV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> ZTPSV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> ZGERC  T PUT F FOR NO TEST. SAME COLUMNS.
+*> ZGERU  T PUT F FOR NO TEST. SAME COLUMNS.
+*> ZHER   T PUT F FOR NO TEST. SAME COLUMNS.
+*> ZHPR   T PUT F FOR NO TEST. SAME COLUMNS.
+*> ZHER2  T PUT F FOR NO TEST. SAME COLUMNS.
+*> ZHPR2  T PUT F FOR NO TEST. SAME COLUMNS.
+*>
+*> Further Details
+*> ===============
+*>
+*>    See:
+*>
+*>       Dongarra J. J., Du Croz J. J., Hammarling S.  and Hanson R. J..
+*>       An  extended  set of Fortran  Basic Linear Algebra Subprograms.
+*>
+*>       Technical  Memoranda  Nos. 41 (revision 3) and 81,  Mathematics
+*>       and  Computer Science  Division,  Argonne  National Laboratory,
+*>       9700 South Cass Avenue, Argonne, Illinois 60439, US.
+*>
+*>       Or
+*>
+*>       NAG  Technical Reports TR3/87 and TR4/87,  Numerical Algorithms
+*>       Group  Ltd.,  NAG  Central  Office,  256  Banbury  Road, Oxford
+*>       OX2 7DE, UK,  and  Numerical Algorithms Group Inc.,  1101  31st
+*>       Street,  Suite 100,  Downers Grove,  Illinois 60515-1263,  USA.
+*>
+*>
+*> -- Written on 10-August-1987.
+*>    Richard Hanson, Sandia National Labs.
+*>    Jeremy Du Croz, NAG Central Office.
+*>
+*>    10-9-00:  Change STATUS='NEW' to 'UNKNOWN' so that the testers
+*>              can be run multiple times without deleting generated
+*>              output files (susan)
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date April 2012
+*
+*> \ingroup complex16_blas_testing
+*
+*  =====================================================================
       PROGRAM ZBLAT2
 *
-*  Test program for the COMPLEX*16       Level 2 Blas.
-*
-*  The program must be driven by a short data file. The first 18 records
-*  of the file are read using list-directed input, the last 17 records
-*  are read using the format ( A6, L2 ). An annotated example of a data
-*  file can be obtained by deleting the first 3 characters from the
-*  following 35 lines:
-*  'ZBLAT2.SUMM'     NAME OF SUMMARY OUTPUT FILE
-*  6                 UNIT NUMBER OF SUMMARY FILE
-*  'CBLA2T.SNAP'     NAME OF SNAPSHOT OUTPUT FILE
-*  -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
-*  F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
-*  F        LOGICAL FLAG, T TO STOP ON FAILURES.
-*  T        LOGICAL FLAG, T TO TEST ERROR EXITS.
-*  16.0     THRESHOLD VALUE OF TEST RATIO
-*  6                 NUMBER OF VALUES OF N
-*  0 1 2 3 5 9       VALUES OF N
-*  4                 NUMBER OF VALUES OF K
-*  0 1 2 4           VALUES OF K
-*  4                 NUMBER OF VALUES OF INCX AND INCY
-*  1 2 -1 -2         VALUES OF INCX AND INCY
-*  3                 NUMBER OF VALUES OF ALPHA
-*  (0.0,0.0) (1.0,0.0) (0.7,-0.9)       VALUES OF ALPHA
-*  3                 NUMBER OF VALUES OF BETA
-*  (0.0,0.0) (1.0,0.0) (1.3,-1.1)       VALUES OF BETA
-*  ZGEMV  T PUT F FOR NO TEST. SAME COLUMNS.
-*  ZGBMV  T PUT F FOR NO TEST. SAME COLUMNS.
-*  ZHEMV  T PUT F FOR NO TEST. SAME COLUMNS.
-*  ZHBMV  T PUT F FOR NO TEST. SAME COLUMNS.
-*  ZHPMV  T PUT F FOR NO TEST. SAME COLUMNS.
-*  ZTRMV  T PUT F FOR NO TEST. SAME COLUMNS.
-*  ZTBMV  T PUT F FOR NO TEST. SAME COLUMNS.
-*  ZTPMV  T PUT F FOR NO TEST. SAME COLUMNS.
-*  ZTRSV  T PUT F FOR NO TEST. SAME COLUMNS.
-*  ZTBSV  T PUT F FOR NO TEST. SAME COLUMNS.
-*  ZTPSV  T PUT F FOR NO TEST. SAME COLUMNS.
-*  ZGERC  T PUT F FOR NO TEST. SAME COLUMNS.
-*  ZGERU  T PUT F FOR NO TEST. SAME COLUMNS.
-*  ZHER   T PUT F FOR NO TEST. SAME COLUMNS.
-*  ZHPR   T PUT F FOR NO TEST. SAME COLUMNS.
-*  ZHER2  T PUT F FOR NO TEST. SAME COLUMNS.
-*  ZHPR2  T PUT F FOR NO TEST. SAME COLUMNS.
-*
-*     See:
-*
-*        Dongarra J. J., Du Croz J. J., Hammarling S.  and Hanson R. J..
-*        An  extended  set of Fortran  Basic Linear Algebra Subprograms.
-*
-*        Technical  Memoranda  Nos. 41 (revision 3) and 81,  Mathematics
-*        and  Computer Science  Division,  Argonne  National Laboratory,
-*        9700 South Cass Avenue, Argonne, Illinois 60439, US.
-*
-*        Or
-*
-*        NAG  Technical Reports TR3/87 and TR4/87,  Numerical Algorithms
-*        Group  Ltd.,  NAG  Central  Office,  256  Banbury  Road, Oxford
-*        OX2 7DE, UK,  and  Numerical Algorithms Group Inc.,  1101  31st
-*        Street,  Suite 100,  Downers Grove,  Illinois 60515-1263,  USA.
+*  -- Reference BLAS test routine (version 3.4.1) --
+*  -- Reference BLAS is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     April 2012
 *
-*
-*  -- Written on 10-August-1987.
-*     Richard Hanson, Sandia National Labs.
-*     Jeremy Du Croz, NAG Central Office.
+*  =====================================================================
 *
 *     .. Parameters ..
       INTEGER            NIN
@@ -72,8 +118,8 @@
       COMPLEX*16         ZERO, ONE
       PARAMETER          ( ZERO = ( 0.0D0, 0.0D0 ),
      $                   ONE = ( 1.0D0, 0.0D0 ) )
-      DOUBLE PRECISION   RZERO, RHALF, RONE
-      PARAMETER          ( RZERO = 0.0D0, RHALF = 0.5D0, RONE = 1.0D0 )
+      DOUBLE PRECISION   RZERO
+      PARAMETER          ( RZERO = 0.0D0 )
       INTEGER            NMAX, INCMAX
       PARAMETER          ( NMAX = 65, INCMAX = 2 )
       INTEGER            NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX
@@ -127,7 +173,7 @@
 *
       READ( NIN, FMT = * )SUMMRY
       READ( NIN, FMT = * )NOUT
-      OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' )
+      OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' )
       NOUTC = NOUT
 *
 *     Read name and unit number for snapshot output file and open file.
@@ -136,7 +182,7 @@
       READ( NIN, FMT = * )NTRA
       TRACE = NTRA.GE.0
       IF( TRACE )THEN
-         OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' )
+         OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' )
       END IF
 *     Read the flag that directs rewinding of the snapshot file.
       READ( NIN, FMT = * )REWI
@@ -241,14 +287,7 @@
 *
 *     Compute EPS (the machine precision).
 *
-      EPS = RONE
-   90 CONTINUE
-      IF( DDIFF( RONE + EPS, RONE ).EQ.RZERO )
-     $   GO TO 100
-      EPS = RHALF*EPS
-      GO TO 90
-  100 CONTINUE
-      EPS = EPS + EPS
+      EPS = EPSILON(RZERO)
       WRITE( NOUT, FMT = 9998 )EPS
 *
 *     Check the reliability of ZMVCH using exact data.
@@ -3087,7 +3126,6 @@
    50    CONTINUE
       END IF
 *
-   60 CONTINUE
       LZERES = .TRUE.
       GO TO 80
    70 CONTINUE
diff --git a/blas/testing/zblat3.f b/blas/testing/zblat3.f
index d6a522f2a..59ca24145 100644
--- a/blas/testing/zblat3.f
+++ b/blas/testing/zblat3.f
@@ -1,50 +1,97 @@
+*> \brief \b ZBLAT3
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*  Definition:
+*  ===========
+*
+*       PROGRAM ZBLAT3
+* 
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> Test program for the COMPLEX*16       Level 3 Blas.
+*>
+*> The program must be driven by a short data file. The first 14 records
+*> of the file are read using list-directed input, the last 9 records
+*> are read using the format ( A6, L2 ). An annotated example of a data
+*> file can be obtained by deleting the first 3 characters from the
+*> following 23 lines:
+*> 'zblat3.out'      NAME OF SUMMARY OUTPUT FILE
+*> 6                 UNIT NUMBER OF SUMMARY FILE
+*> 'ZBLAT3.SNAP'     NAME OF SNAPSHOT OUTPUT FILE
+*> -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
+*> F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
+*> F        LOGICAL FLAG, T TO STOP ON FAILURES.
+*> T        LOGICAL FLAG, T TO TEST ERROR EXITS.
+*> 16.0     THRESHOLD VALUE OF TEST RATIO
+*> 6                 NUMBER OF VALUES OF N
+*> 0 1 2 3 5 9       VALUES OF N
+*> 3                 NUMBER OF VALUES OF ALPHA
+*> (0.0,0.0) (1.0,0.0) (0.7,-0.9)       VALUES OF ALPHA
+*> 3                 NUMBER OF VALUES OF BETA
+*> (0.0,0.0) (1.0,0.0) (1.3,-1.1)       VALUES OF BETA
+*> ZGEMM  T PUT F FOR NO TEST. SAME COLUMNS.
+*> ZHEMM  T PUT F FOR NO TEST. SAME COLUMNS.
+*> ZSYMM  T PUT F FOR NO TEST. SAME COLUMNS.
+*> ZTRMM  T PUT F FOR NO TEST. SAME COLUMNS.
+*> ZTRSM  T PUT F FOR NO TEST. SAME COLUMNS.
+*> ZHERK  T PUT F FOR NO TEST. SAME COLUMNS.
+*> ZSYRK  T PUT F FOR NO TEST. SAME COLUMNS.
+*> ZHER2K T PUT F FOR NO TEST. SAME COLUMNS.
+*> ZSYR2K T PUT F FOR NO TEST. SAME COLUMNS.
+*>
+*> 
+*> Further Details
+*> ===============
+*>
+*> See:
+*>
+*>    Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S.
+*>    A Set of Level 3 Basic Linear Algebra Subprograms.
+*>
+*>    Technical Memorandum No.88 (Revision 1), Mathematics and
+*>    Computer Science Division, Argonne National Laboratory, 9700
+*>    South Cass Avenue, Argonne, Illinois 60439, US.
+*>
+*> -- Written on 8-February-1989.
+*>    Jack Dongarra, Argonne National Laboratory.
+*>    Iain Duff, AERE Harwell.
+*>    Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*>    Sven Hammarling, Numerical Algorithms Group Ltd.
+*>
+*>    10-9-00:  Change STATUS='NEW' to 'UNKNOWN' so that the testers
+*>              can be run multiple times without deleting generated
+*>              output files (susan)
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date April 2012
+*
+*> \ingroup complex16_blas_testing
+*
+*  =====================================================================
       PROGRAM ZBLAT3
 *
-*  Test program for the COMPLEX*16       Level 3 Blas.
-*
-*  The program must be driven by a short data file. The first 14 records
-*  of the file are read using list-directed input, the last 9 records
-*  are read using the format ( A6, L2 ). An annotated example of a data
-*  file can be obtained by deleting the first 3 characters from the
-*  following 23 lines:
-*  'ZBLAT3.SUMM'     NAME OF SUMMARY OUTPUT FILE
-*  6                 UNIT NUMBER OF SUMMARY FILE
-*  'ZBLAT3.SNAP'     NAME OF SNAPSHOT OUTPUT FILE
-*  -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
-*  F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
-*  F        LOGICAL FLAG, T TO STOP ON FAILURES.
-*  T        LOGICAL FLAG, T TO TEST ERROR EXITS.
-*  16.0     THRESHOLD VALUE OF TEST RATIO
-*  6                 NUMBER OF VALUES OF N
-*  0 1 2 3 5 9       VALUES OF N
-*  3                 NUMBER OF VALUES OF ALPHA
-*  (0.0,0.0) (1.0,0.0) (0.7,-0.9)       VALUES OF ALPHA
-*  3                 NUMBER OF VALUES OF BETA
-*  (0.0,0.0) (1.0,0.0) (1.3,-1.1)       VALUES OF BETA
-*  ZGEMM  T PUT F FOR NO TEST. SAME COLUMNS.
-*  ZHEMM  T PUT F FOR NO TEST. SAME COLUMNS.
-*  ZSYMM  T PUT F FOR NO TEST. SAME COLUMNS.
-*  ZTRMM  T PUT F FOR NO TEST. SAME COLUMNS.
-*  ZTRSM  T PUT F FOR NO TEST. SAME COLUMNS.
-*  ZHERK  T PUT F FOR NO TEST. SAME COLUMNS.
-*  ZSYRK  T PUT F FOR NO TEST. SAME COLUMNS.
-*  ZHER2K T PUT F FOR NO TEST. SAME COLUMNS.
-*  ZSYR2K T PUT F FOR NO TEST. SAME COLUMNS.
-*
-*  See:
-*
-*     Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S.
-*     A Set of Level 3 Basic Linear Algebra Subprograms.
-*
-*     Technical Memorandum No.88 (Revision 1), Mathematics and
-*     Computer Science Division, Argonne National Laboratory, 9700
-*     South Cass Avenue, Argonne, Illinois 60439, US.
+*  -- Reference BLAS test routine (version 3.4.1) --
+*  -- Reference BLAS is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     April 2012
 *
-*  -- Written on 8-February-1989.
-*     Jack Dongarra, Argonne National Laboratory.
-*     Iain Duff, AERE Harwell.
-*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
-*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*  =====================================================================
 *
 *     .. Parameters ..
       INTEGER            NIN
@@ -54,8 +101,8 @@
       COMPLEX*16         ZERO, ONE
       PARAMETER          ( ZERO = ( 0.0D0, 0.0D0 ),
      $                   ONE = ( 1.0D0, 0.0D0 ) )
-      DOUBLE PRECISION   RZERO, RHALF, RONE
-      PARAMETER          ( RZERO = 0.0D0, RHALF = 0.5D0, RONE = 1.0D0 )
+      DOUBLE PRECISION   RZERO
+      PARAMETER          ( RZERO = 0.0D0 )
       INTEGER            NMAX
       PARAMETER          ( NMAX = 65 )
       INTEGER            NIDMAX, NALMAX, NBEMAX
@@ -104,7 +151,7 @@
 *
       READ( NIN, FMT = * )SUMMRY
       READ( NIN, FMT = * )NOUT
-      OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' )
+      OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' )
       NOUTC = NOUT
 *
 *     Read name and unit number for snapshot output file and open file.
@@ -113,7 +160,7 @@
       READ( NIN, FMT = * )NTRA
       TRACE = NTRA.GE.0
       IF( TRACE )THEN
-         OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' )
+         OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' )
       END IF
 *     Read the flag that directs rewinding of the snapshot file.
       READ( NIN, FMT = * )REWI
@@ -190,14 +237,7 @@
 *
 *     Compute EPS (the machine precision).
 *
-      EPS = RONE
-   70 CONTINUE
-      IF( DDIFF( RONE + EPS, RONE ).EQ.RZERO )
-     $   GO TO 80
-      EPS = RHALF*EPS
-      GO TO 70
-   80 CONTINUE
-      EPS = EPS + EPS
+      EPS = EPSILON(RZERO)
       WRITE( NOUT, FMT = 9998 )EPS
 *
 *     Check the reliability of ZMMCH using exact data.
@@ -1949,7 +1989,7 @@
 *
 *  Tests the error exits from the Level 3 Blas.
 *  Requires a special version of the error-handling routine XERBLA.
-*  ALPHA, RALPHA, BETA, RBETA, A, B and C should not need to be defined.
+*  A, B and C should not need to be defined.
 *
 *  Auxiliary routine for test program for Level 3 Blas.
 *
@@ -1959,12 +1999,20 @@
 *     Jeremy Du Croz, Numerical Algorithms Group Ltd.
 *     Sven Hammarling, Numerical Algorithms Group Ltd.
 *
+*  3-19-92:  Initialize ALPHA, BETA, RALPHA, and RBETA  (eca)
+*  3-19-92:  Fix argument 12 in calls to ZSYMM and ZHEMM
+*            with INFOT = 9  (eca)
+*  10-9-00:  Declared INTRINSIC DCMPLX (susan)
+*
 *     .. Scalar Arguments ..
       INTEGER            ISNUM, NOUT
       CHARACTER*6        SRNAMT
 *     .. Scalars in Common ..
       INTEGER            INFOT, NOUTC
       LOGICAL            LERR, OK
+*     .. Parameters ..
+      REAL               ONE, TWO
+      PARAMETER          ( ONE = 1.0D0, TWO = 2.0D0 )
 *     .. Local Scalars ..
       COMPLEX*16         ALPHA, BETA
       DOUBLE PRECISION   RALPHA, RBETA
@@ -1973,6 +2021,8 @@
 *     .. External Subroutines ..
       EXTERNAL           ZGEMM, ZHEMM, ZHER2K, ZHERK, CHKXER, ZSYMM,
      $                   ZSYR2K, ZSYRK, ZTRMM, ZTRSM
+*     .. Intrinsic Functions ..
+      INTRINSIC          DCMPLX
 *     .. Common blocks ..
       COMMON             /INFOC/INFOT, NOUTC, OK, LERR
 *     .. Executable Statements ..
@@ -1982,6 +2032,14 @@
 *     LERR is set to .TRUE. by the special version of XERBLA each time
 *     it is called, and is then tested and re-set by CHKXER.
       LERR = .FALSE.
+*
+*     Initialize ALPHA, BETA, RALPHA, and RBETA.
+*
+      ALPHA = DCMPLX( ONE, -ONE )
+      BETA = DCMPLX( TWO, -TWO )
+      RALPHA = ONE
+      RBETA = TWO
+*
       GO TO ( 10, 20, 30, 40, 50, 60, 70, 80,
      $        90 )ISNUM
    10 INFOT = 1
@@ -2208,16 +2266,16 @@
       CALL ZHEMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 9
-      CALL ZHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 )
+      CALL ZHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 9
-      CALL ZHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL ZHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 9
-      CALL ZHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 )
+      CALL ZHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 9
-      CALL ZHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL ZHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 12
       CALL ZHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 )
@@ -2275,16 +2333,16 @@
       CALL ZSYMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 9
-      CALL ZSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 )
+      CALL ZSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 9
-      CALL ZSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL ZSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 9
-      CALL ZSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 )
+      CALL ZSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 9
-      CALL ZSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL ZSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 12
       CALL ZSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 )
@@ -3274,7 +3332,6 @@
    50    CONTINUE
       END IF
 *
-   60 CONTINUE
       LZERES = .TRUE.
       GO TO 80
    70 CONTINUE
diff --git a/blas/xerbla.cpp b/blas/xerbla.cpp
index dd39a5244..c373e8699 100644
--- a/blas/xerbla.cpp
+++ b/blas/xerbla.cpp
@@ -1,5 +1,5 @@
 
-#include <iostream>
+#include <stdio.h>
 
 #if (defined __GNUC__) && (!defined __MINGW32__) && (!defined __CYGWIN__)
 #define EIGEN_WEAK_LINKING __attribute__ ((weak))
@@ -14,7 +14,7 @@ extern "C"
 
 EIGEN_WEAK_LINKING int xerbla_(const char * msg, int *info, int)
 {
-  std::cerr << "Eigen BLAS ERROR #" << *info << ": " << msg << "\n";
+  printf("Eigen BLAS ERROR #%i: %s\n", *info, msg );
   return 0;
 }
 
diff --git a/cmake/Eigen3Config.cmake.in b/cmake/Eigen3Config.cmake.in
new file mode 100644
index 000000000..c5c546887
--- /dev/null
+++ b/cmake/Eigen3Config.cmake.in
@@ -0,0 +1,21 @@
+# This file exports the Eigen3::Eigen CMake target which should be passed to the
+# target_link_libraries command.
+
+@PACKAGE_INIT@
+
+include ("${CMAKE_CURRENT_LIST_DIR}/Eigen3Targets.cmake")
+
+# Legacy variables, do *not* use. May be removed in the future.
+
+set (EIGEN3_FOUND 1)
+set (EIGEN3_USE_FILE    "${CMAKE_CURRENT_LIST_DIR}/UseEigen3.cmake")
+
+set (EIGEN3_DEFINITIONS  "@EIGEN_DEFINITIONS@")
+set (EIGEN3_INCLUDE_DIR  "@PACKAGE_EIGEN_INCLUDE_DIR@")
+set (EIGEN3_INCLUDE_DIRS "@PACKAGE_EIGEN_INCLUDE_DIR@")
+set (EIGEN3_ROOT_DIR     "@PACKAGE_EIGEN_ROOT_DIR@")
+
+set (EIGEN3_VERSION_STRING "@EIGEN_VERSION_STRING@")
+set (EIGEN3_VERSION_MAJOR  "@EIGEN_VERSION_MAJOR@")
+set (EIGEN3_VERSION_MINOR  "@EIGEN_VERSION_MINOR@")
+set (EIGEN3_VERSION_PATCH  "@EIGEN_VERSION_PATCH@")
diff --git a/cmake/Eigen3ConfigLegacy.cmake.in b/cmake/Eigen3ConfigLegacy.cmake.in
new file mode 100644
index 000000000..62d722469
--- /dev/null
+++ b/cmake/Eigen3ConfigLegacy.cmake.in
@@ -0,0 +1,30 @@
+#                                               -*- cmake -*-
+#
+#  Eigen3Config.cmake(.in)
+
+# Use the following variables to compile and link against Eigen:
+#  EIGEN3_FOUND              - True if Eigen was found on your system
+#  EIGEN3_USE_FILE           - The file making Eigen usable
+#  EIGEN3_DEFINITIONS        - Definitions needed to build with Eigen
+#  EIGEN3_INCLUDE_DIR        - Directory where signature_of_eigen3_matrix_library can be found
+#  EIGEN3_INCLUDE_DIRS       - List of directories of Eigen and it's dependencies
+#  EIGEN3_ROOT_DIR           - The base directory of Eigen
+#  EIGEN3_VERSION_STRING     - A human-readable string containing the version
+#  EIGEN3_VERSION_MAJOR      - The major version of Eigen
+#  EIGEN3_VERSION_MINOR      - The minor version of Eigen
+#  EIGEN3_VERSION_PATCH      - The patch version of Eigen
+
+@PACKAGE_INIT@
+
+set ( EIGEN3_FOUND 1 )
+set ( EIGEN3_USE_FILE     "${CMAKE_CURRENT_LIST_DIR}/UseEigen3.cmake" )
+
+set ( EIGEN3_DEFINITIONS  "@EIGEN_DEFINITIONS@" )
+set ( EIGEN3_INCLUDE_DIR  "@PACKAGE_EIGEN_INCLUDE_DIR@" )
+set ( EIGEN3_INCLUDE_DIRS "@PACKAGE_EIGEN_INCLUDE_DIR@" )
+set ( EIGEN3_ROOT_DIR     "@PACKAGE_EIGEN_ROOT_DIR@" )
+
+set ( EIGEN3_VERSION_STRING "@EIGEN_VERSION_STRING@" )
+set ( EIGEN3_VERSION_MAJOR  "@EIGEN_VERSION_MAJOR@" )
+set ( EIGEN3_VERSION_MINOR  "@EIGEN_VERSION_MINOR@" )
+set ( EIGEN3_VERSION_PATCH  "@EIGEN_VERSION_PATCH@" )
diff --git a/cmake/EigenConfigureTesting.cmake b/cmake/EigenConfigureTesting.cmake
index 11ecc9585..afc24b5e9 100644
--- a/cmake/EigenConfigureTesting.cmake
+++ b/cmake/EigenConfigureTesting.cmake
@@ -14,41 +14,23 @@ add_dependencies(check buildtests)
 # check whether /bin/bash exists
 find_file(EIGEN_BIN_BASH_EXISTS "/bin/bash" PATHS "/" NO_DEFAULT_PATH)
 
-# CMake/Ctest does not allow us to change the build command,
-# so we have to workaround by directly editing the generated DartConfiguration.tcl file
-# save CMAKE_MAKE_PROGRAM
-set(CMAKE_MAKE_PROGRAM_SAVE ${CMAKE_MAKE_PROGRAM})
-# and set a fake one
-set(CMAKE_MAKE_PROGRAM "@EIGEN_MAKECOMMAND_PLACEHOLDER@")
-
 # This call activates testing and generates the DartConfiguration.tcl
 include(CTest)
 
-set(EIGEN_TEST_BUILD_FLAGS " " CACHE STRING "Options passed to the build command of unit tests")
+set(EIGEN_TEST_BUILD_FLAGS "" CACHE STRING "Options passed to the build command of unit tests")
 
-# overwrite default DartConfiguration.tcl
-# The worarounds are different for each version of the MSVC IDE
-if(MSVC_IDE)
-  if(CMAKE_MAKE_PROGRAM_SAVE MATCHES "devenv") # devenv
-    set(EIGEN_MAKECOMMAND_PLACEHOLDER "${CMAKE_MAKE_PROGRAM_SAVE} Eigen.sln /build \"Release\" /project buildtests ${EIGEN_TEST_BUILD_FLAGS} \n# ")    
-  else() # msbuild
-    set(EIGEN_MAKECOMMAND_PLACEHOLDER "${CMAKE_MAKE_PROGRAM_SAVE} buildtests.vcxproj /p:Configuration=\${CTEST_CONFIGURATION_TYPE}  ${EIGEN_TEST_BUILD_FLAGS}\n# ")
-  endif()
-else()
-  # for make and nmake
-  set(EIGEN_MAKECOMMAND_PLACEHOLDER "${CMAKE_MAKE_PROGRAM_SAVE} buildtests ${EIGEN_TEST_BUILD_FLAGS}")
+# Overwrite default DartConfiguration.tcl such that ctest can build our unit tests.
+# Recall that our unit tests are not in the "all" target, so we have to explicitely ask ctest to build our custom 'buildtests' target.
+# At this stage, we can also add custom flags to the build tool through the user defined EIGEN_TEST_BUILD_FLAGS variable.
+file(READ  "${CMAKE_CURRENT_BINARY_DIR}/DartConfiguration.tcl" EIGEN_DART_CONFIG_FILE)
+# try to grab the default flags
+string(REGEX MATCH "MakeCommand:.*-- (.*)\nDefaultCTestConfigurationType" EIGEN_DUMMY ${EIGEN_DART_CONFIG_FILE})
+if(NOT CMAKE_MATCH_1)
+string(REGEX MATCH "MakeCommand:.*[^c]make (.*)\nDefaultCTestConfigurationType" EIGEN_DUMMY ${EIGEN_DART_CONFIG_FILE})
 endif()
-
-# copy ctest properties, which currently
-# o raise the warning levels
-configure_file(${CMAKE_CURRENT_BINARY_DIR}/DartConfiguration.tcl ${CMAKE_BINARY_DIR}/DartConfiguration.tcl)
-
-# restore default CMAKE_MAKE_PROGRAM
-set(CMAKE_MAKE_PROGRAM ${CMAKE_MAKE_PROGRAM_SAVE})
-# un-set temporary variables so that it is like they never existed. 
-# CMake 2.6.3 introduces the more logical unset() syntax for this.
-set(CMAKE_MAKE_PROGRAM_SAVE) 
-set(EIGEN_MAKECOMMAND_PLACEHOLDER)
+string(REGEX REPLACE "MakeCommand:.*DefaultCTestConfigurationType" "MakeCommand: ${CMAKE_COMMAND} --build . --target buildtests --config \"\${CTEST_CONFIGURATION_TYPE}\" -- ${CMAKE_MATCH_1} ${EIGEN_TEST_BUILD_FLAGS}\nDefaultCTestConfigurationType"
+       EIGEN_DART_CONFIG_FILE2 ${EIGEN_DART_CONFIG_FILE})
+file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/DartConfiguration.tcl" ${EIGEN_DART_CONFIG_FILE2})
 
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/CTestCustom.cmake.in ${CMAKE_BINARY_DIR}/CTestCustom.cmake)
 
@@ -64,18 +46,16 @@ if(CMAKE_COMPILER_IS_GNUCXX)
   if(EIGEN_COVERAGE_TESTING)
     set(COVERAGE_FLAGS "-fprofile-arcs -ftest-coverage")
     set(CTEST_CUSTOM_COVERAGE_EXCLUDE "/test/")
-  else(EIGEN_COVERAGE_TESTING)
-    set(COVERAGE_FLAGS "")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COVERAGE_FLAGS}")
   endif(EIGEN_COVERAGE_TESTING)
-  if(EIGEN_TEST_C++0x)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=gnu++0x")
-  endif(EIGEN_TEST_C++0x)
-  if(CMAKE_SYSTEM_NAME MATCHES Linux)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COVERAGE_FLAGS} -g2")
-    set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} ${COVERAGE_FLAGS} -O2 -g2")
-    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} ${COVERAGE_FLAGS} -fno-inline-functions")
-    set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} ${COVERAGE_FLAGS} -O0 -g3")
-  endif(CMAKE_SYSTEM_NAME MATCHES Linux)
+  
 elseif(MSVC)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /D_CRT_SECURE_NO_WARNINGS /D_SCL_SECURE_NO_WARNINGS")
 endif(CMAKE_COMPILER_IS_GNUCXX)
+
+
+check_cxx_compiler_flag("-std=c++11" EIGEN_COMPILER_SUPPORT_CXX11)
+
+if(EIGEN_TEST_CXX11 AND EIGEN_COMPILER_SUPPORT_CXX11)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+endif()
diff --git a/cmake/EigenDetermineOSVersion.cmake b/cmake/EigenDetermineOSVersion.cmake
index 3c48d4c37..9246fa67c 100644
--- a/cmake/EigenDetermineOSVersion.cmake
+++ b/cmake/EigenDetermineOSVersion.cmake
@@ -26,7 +26,7 @@ function(DetermineShortWindowsName WIN_VERSION win_num_version)
 endfunction()
 
 function(DetermineOSVersion OS_VERSION)
-  if (WIN32)
+  if (WIN32 AND CMAKE_HOST_SYSTEM_NAME MATCHES Windows)
     file (TO_NATIVE_PATH "$ENV{COMSPEC}" SHELL)
     exec_program( ${SHELL} ARGS "/c" "ver" OUTPUT_VARIABLE ver_output)
 				
diff --git a/cmake/EigenDetermineVSServicePack.cmake b/cmake/EigenDetermineVSServicePack.cmake
index 8e5546a85..fed78194d 100644
--- a/cmake/EigenDetermineVSServicePack.cmake
+++ b/cmake/EigenDetermineVSServicePack.cmake
@@ -4,7 +4,6 @@ include(CMakeDetermineVSServicePack)
 # _DetermineVSServicePack_FastCheckVersionWithCompiler which lead to errors on some systems.
 function(EigenDetermineVSServicePack _pack)
     if(NOT DETERMINED_VS_SERVICE_PACK OR NOT ${_pack})
-
         if(NOT DETERMINED_VS_SERVICE_PACK)
             _DetermineVSServicePack_CheckVersionWithTryCompile(DETERMINED_VS_SERVICE_PACK _cl_version)
             if(NOT DETERMINED_VS_SERVICE_PACK)
@@ -13,10 +12,25 @@ function(EigenDetermineVSServicePack _pack)
         endif()
 
         if(DETERMINED_VS_SERVICE_PACK)
-
             if(_cl_version)
                 # Call helper function to determine VS version
                 _DetermineVSServicePackFromCompiler(_sp "${_cl_version}")
+              
+                # temporary fix, until CMake catches up
+                if (NOT _sp)
+                    if(${_cl_version} VERSION_EQUAL "17.00.50727.1")
+                        set(_sp "vc110")
+                    elseif(${_cl_version} VERSION_EQUAL "17.00.51106.1")
+                        set(_sp "vc110sp1")
+                    elseif(${_cl_version} VERSION_EQUAL "17.00.60315.1")
+                        set(_sp "vc110sp2")
+                    elseif(${_cl_version} VERSION_EQUAL "17.00.60610.1")
+                        set(_sp "vc110sp3")
+                    else()
+                        set(_sp ${CMAKE_CXX_COMPILER_VERSION})
+                    endif()
+                endif()
+                
                 if(_sp)
                     set(${_pack} ${_sp} CACHE INTERNAL
                         "The Visual Studio Release with Service Pack")
diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake
index f4796ca41..a92a2978b 100644
--- a/cmake/EigenTesting.cmake
+++ b/cmake/EigenTesting.cmake
@@ -1,19 +1,48 @@
 
 macro(ei_add_property prop value)
-  get_property(previous GLOBAL PROPERTY ${prop})  
+  get_property(previous GLOBAL PROPERTY ${prop})
   if ((NOT previous) OR (previous STREQUAL ""))
     set_property(GLOBAL PROPERTY ${prop} "${value}")
   else()
     set_property(GLOBAL PROPERTY ${prop} "${previous} ${value}")
-  endif()  
+  endif()
 endmacro(ei_add_property)
 
 #internal. See documentation of ei_add_test for details.
 macro(ei_add_test_internal testname testname_with_suffix)
   set(targetname ${testname_with_suffix})
 
-  set(filename ${testname}.cpp)
-  add_executable(${targetname} ${filename})
+  if(EIGEN_ADD_TEST_FILENAME_EXTENSION)
+    set(filename ${testname}.${EIGEN_ADD_TEST_FILENAME_EXTENSION})
+  else()
+    set(filename ${testname}.cpp)
+  endif()
+
+  if(EIGEN_ADD_TEST_FILENAME_EXTENSION STREQUAL cu)
+    if(EIGEN_TEST_CUDA_CLANG)
+      set_source_files_properties(${filename} PROPERTIES LANGUAGE CXX)
+      if(CUDA_64_BIT_DEVICE_CODE)
+        link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib64")
+      else()
+        link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib")
+      endif()
+      if (${ARGC} GREATER 2)
+        add_executable(${targetname} ${filename})
+      else()
+        add_executable(${targetname} ${filename} OPTIONS ${ARGV2})
+      endif()
+      target_link_libraries(${targetname} "cudart_static" "cuda" "dl" "rt" "pthread")
+    else()
+      if (${ARGC} GREATER 2)
+        cuda_add_executable(${targetname} ${filename} OPTIONS ${ARGV2})
+      else()
+        cuda_add_executable(${targetname} ${filename})
+      endif()
+    endif()
+  else()
+    add_executable(${targetname} ${filename})
+  endif()
+
   if (targetname MATCHES "^eigen2_")
     add_dependencies(eigen2_buildtests ${targetname})
   else()
@@ -27,20 +56,20 @@ macro(ei_add_test_internal testname testname_with_suffix)
       ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_DEBUG_ASSERTS=1")
     endif(EIGEN_DEBUG_ASSERTS)
   endif(EIGEN_NO_ASSERTION_CHECKING)
-  
+
   ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}")
 
   ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_TEST_FUNC=${testname}")
-  
-  if(MSVC AND NOT EIGEN_SPLIT_LARGE_TESTS)
+
+  if(MSVC)
     ei_add_target_property(${targetname} COMPILE_FLAGS "/bigobj")
-  endif()  
+  endif()
 
   # let the user pass flags.
   if(${ARGC} GREATER 2)
     ei_add_target_property(${targetname} COMPILE_FLAGS "${ARGV2}")
   endif(${ARGC} GREATER 2)
-  
+
   if(EIGEN_TEST_CUSTOM_CXX_FLAGS)
     ei_add_target_property(${targetname} COMPILE_FLAGS "${EIGEN_TEST_CUSTOM_CXX_FLAGS}")
   endif()
@@ -66,16 +95,12 @@ macro(ei_add_test_internal testname testname_with_suffix)
       # notice: no double quotes around ${libs_to_link} here. It may be a list.
       target_link_libraries(${targetname} ${libs_to_link})
     endif()
-  endif() 
-
-  if(EIGEN_BIN_BASH_EXISTS)
-    add_test(${testname_with_suffix} "${Eigen_SOURCE_DIR}/test/runtest.sh" "${testname_with_suffix}")
-  else()
-    add_test(${testname_with_suffix} "${targetname}")
   endif()
-  
+
+  add_test(${testname_with_suffix} "${targetname}")
+
   # Specify target and test labels accoirding to EIGEN_CURRENT_SUBPROJECT
-  get_property(current_subproject GLOBAL PROPERTY EIGEN_CURRENT_SUBPROJECT)  
+  get_property(current_subproject GLOBAL PROPERTY EIGEN_CURRENT_SUBPROJECT)
   if ((current_subproject) AND (NOT (current_subproject STREQUAL "")))
     set_property(TARGET ${targetname} PROPERTY LABELS "Build${current_subproject}")
     add_dependencies("Build${current_subproject}" ${targetname})
@@ -84,6 +109,103 @@ macro(ei_add_test_internal testname testname_with_suffix)
 
 endmacro(ei_add_test_internal)
 
+# SYCL
+macro(ei_add_test_internal_sycl testname testname_with_suffix)
+  include_directories( SYSTEM ${COMPUTECPP_PACKAGE_ROOT_DIR}/include)
+  set(targetname ${testname_with_suffix})
+
+  if(EIGEN_ADD_TEST_FILENAME_EXTENSION)
+    set(filename ${testname}.${EIGEN_ADD_TEST_FILENAME_EXTENSION})
+  else()
+    set(filename ${testname}.cpp)
+  endif()
+
+  set( include_file ${CMAKE_CURRENT_BINARY_DIR}/inc_${filename})
+  set( bc_file ${CMAKE_CURRENT_BINARY_DIR}/${filename})
+  set( host_file ${CMAKE_CURRENT_SOURCE_DIR}/${filename})
+
+  ADD_CUSTOM_COMMAND(
+    OUTPUT ${include_file}
+    COMMAND ${CMAKE_COMMAND} -E echo "\\#include \\\"${host_file}\\\"" > ${include_file}
+    COMMAND ${CMAKE_COMMAND} -E echo "\\#include \\\"${bc_file}.sycl\\\"" >> ${include_file}
+    DEPENDS ${filename} ${bc_file}.sycl
+    COMMENT "Building ComputeCpp integration header file ${include_file}"
+  )
+  # Add a custom target for the generated integration header
+  add_custom_target(${testname}_integration_header_sycl DEPENDS ${include_file})
+
+  add_executable(${targetname} ${include_file})
+  add_dependencies(${targetname} ${testname}_integration_header_sycl)
+  add_sycl_to_target(${targetname} ${filename} ${CMAKE_CURRENT_BINARY_DIR})
+
+  if (targetname MATCHES "^eigen2_")
+    add_dependencies(eigen2_buildtests ${targetname})
+  else()
+    add_dependencies(buildtests ${targetname})
+  endif()
+
+  if(EIGEN_NO_ASSERTION_CHECKING)
+    ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_NO_ASSERTION_CHECKING=1")
+  else(EIGEN_NO_ASSERTION_CHECKING)
+    if(EIGEN_DEBUG_ASSERTS)
+      ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_DEBUG_ASSERTS=1")
+    endif(EIGEN_DEBUG_ASSERTS)
+  endif(EIGEN_NO_ASSERTION_CHECKING)
+
+  ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}")
+
+  ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_TEST_FUNC=${testname}")
+
+  if(MSVC AND NOT EIGEN_SPLIT_LARGE_TESTS)
+    ei_add_target_property(${targetname} COMPILE_FLAGS "/bigobj")
+  endif()
+
+  # let the user pass flags.
+  if(${ARGC} GREATER 2)
+    ei_add_target_property(${targetname} COMPILE_FLAGS "${ARGV2}")
+  endif(${ARGC} GREATER 2)
+
+  if(EIGEN_TEST_CUSTOM_CXX_FLAGS)
+    ei_add_target_property(${targetname} COMPILE_FLAGS "${EIGEN_TEST_CUSTOM_CXX_FLAGS}")
+  endif()
+
+  if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO)
+    target_link_libraries(${targetname} ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO})
+  endif()
+  if(EXTERNAL_LIBS)
+    target_link_libraries(${targetname} ${EXTERNAL_LIBS})
+  endif()
+  if(EIGEN_TEST_CUSTOM_LINKER_FLAGS)
+    target_link_libraries(${targetname} ${EIGEN_TEST_CUSTOM_LINKER_FLAGS})
+  endif()
+
+  if(${ARGC} GREATER 3)
+    set(libs_to_link ${ARGV3})
+    # it could be that some cmake module provides a bad library string " "  (just spaces),
+    # and that severely breaks target_link_libraries ("can't link to -l-lstdc++" errors).
+    # so we check for strings containing only spaces.
+    string(STRIP "${libs_to_link}" libs_to_link_stripped)
+    string(LENGTH "${libs_to_link_stripped}" libs_to_link_stripped_length)
+    if(${libs_to_link_stripped_length} GREATER 0)
+      # notice: no double quotes around ${libs_to_link} here. It may be a list.
+      target_link_libraries(${targetname} ${libs_to_link})
+    endif()
+  endif()
+
+  add_test(${testname_with_suffix} "${targetname}")
+
+  # Specify target and test labels according to EIGEN_CURRENT_SUBPROJECT
+  get_property(current_subproject GLOBAL PROPERTY EIGEN_CURRENT_SUBPROJECT)
+  if ((current_subproject) AND (NOT (current_subproject STREQUAL "")))
+    set_property(TARGET ${targetname} PROPERTY LABELS "Build${current_subproject}")
+    add_dependencies("Build${current_subproject}" ${targetname})
+    set_property(TEST ${testname_with_suffix} PROPERTY LABELS "${current_subproject}")
+  endif()
+
+
+endmacro(ei_add_test_internal_sycl)
+
+
 # Macro to add a test
 #
 # the unique mandatory parameter testname must correspond to a file
@@ -131,7 +253,13 @@ macro(ei_add_test testname)
   set(EIGEN_TESTS_LIST "${EIGEN_TESTS_LIST}${testname}\n")
   set_property(GLOBAL PROPERTY EIGEN_TESTS_LIST "${EIGEN_TESTS_LIST}")
 
-  file(READ "${testname}.cpp" test_source)
+  if(EIGEN_ADD_TEST_FILENAME_EXTENSION)
+    set(filename ${testname}.${EIGEN_ADD_TEST_FILENAME_EXTENSION})
+  else()
+    set(filename ${testname}.cpp)
+  endif()
+
+  file(READ "${filename}" test_source)
   set(parts 0)
   string(REGEX MATCHALL "CALL_SUBTEST_[0-9]+|EIGEN_TEST_PART_[0-9]+|EIGEN_SUFFIXES(;[0-9]+)+"
          occurences "${test_source}")
@@ -154,6 +282,39 @@ macro(ei_add_test testname)
   endif(EIGEN_SPLIT_LARGE_TESTS AND suffixes)
 endmacro(ei_add_test)
 
+macro(ei_add_test_sycl testname)
+  get_property(EIGEN_TESTS_LIST GLOBAL PROPERTY EIGEN_TESTS_LIST)
+  set(EIGEN_TESTS_LIST "${EIGEN_TESTS_LIST}${testname}\n")
+  set_property(GLOBAL PROPERTY EIGEN_TESTS_LIST "${EIGEN_TESTS_LIST}")
+
+  if(EIGEN_ADD_TEST_FILENAME_EXTENSION)
+    set(filename ${testname}.${EIGEN_ADD_TEST_FILENAME_EXTENSION})
+  else()
+    set(filename ${testname}.cpp)
+  endif()
+
+  file(READ "${filename}" test_source)
+  set(parts 0)
+  string(REGEX MATCHALL "CALL_SUBTEST_[0-9]+|EIGEN_TEST_PART_[0-9]+|EIGEN_SUFFIXES(;[0-9]+)+"
+         occurences "${test_source}")
+  string(REGEX REPLACE "CALL_SUBTEST_|EIGEN_TEST_PART_|EIGEN_SUFFIXES" "" suffixes "${occurences}")
+  list(REMOVE_DUPLICATES suffixes)
+  if(EIGEN_SPLIT_LARGE_TESTS AND suffixes)
+    add_custom_target(${testname})
+    foreach(suffix ${suffixes})
+      ei_add_test_internal_sycl(${testname} ${testname}_${suffix}
+        "${ARGV1} -DEIGEN_TEST_PART_${suffix}=1" "${ARGV2}")
+      add_dependencies(${testname} ${testname}_${suffix})
+    endforeach(suffix)
+  else(EIGEN_SPLIT_LARGE_TESTS AND suffixes)
+    set(symbols_to_enable_all_parts "")
+    foreach(suffix ${suffixes})
+      set(symbols_to_enable_all_parts
+        "${symbols_to_enable_all_parts} -DEIGEN_TEST_PART_${suffix}=1")
+    endforeach(suffix)
+    ei_add_test_internal_sycl(${testname} ${testname} "${ARGV1} ${symbols_to_enable_all_parts}" "${ARGV2}")
+  endif(EIGEN_SPLIT_LARGE_TESTS AND suffixes)
+endmacro(ei_add_test_sycl)
 
 # adds a failtest, i.e. a test that succeed if the program fails to compile
 # note that the test runner for these is CMake itself, when passed -DEIGEN_FAILTEST=ON
@@ -218,7 +379,7 @@ macro(ei_testing_print_summary)
   elseif(EIGEN_TEST_NO_EXPLICIT_VECTORIZATION)
     message(STATUS "Explicit vectorization disabled (alignment kept enabled)")
   else()
-  
+
   message(STATUS "Maximal matrix/vector size: ${EIGEN_TEST_MAX_SIZE}")
 
     if(EIGEN_TEST_SSE2)
@@ -251,18 +412,75 @@ macro(ei_testing_print_summary)
       message(STATUS "SSE4.2:            Using architecture defaults")
     endif()
 
+    if(EIGEN_TEST_AVX)
+      message(STATUS "AVX:               ON")
+    else()
+      message(STATUS "AVX:               Using architecture defaults")
+    endif()
+
+    if(EIGEN_TEST_FMA)
+      message(STATUS "FMA:               ON")
+    else()
+      message(STATUS "FMA:               Using architecture defaults")
+    endif()
+
+    if(EIGEN_TEST_AVX512)
+      message(STATUS "AVX512:            ON")
+    else()
+      message(STATUS "AVX512:            Using architecture defaults")
+    endif()
+
     if(EIGEN_TEST_ALTIVEC)
       message(STATUS "Altivec:           ON")
     else()
       message(STATUS "Altivec:           Using architecture defaults")
     endif()
 
+    if(EIGEN_TEST_VSX)
+      message(STATUS "VSX:               ON")
+    else()
+      message(STATUS "VSX:               Using architecture defaults")
+    endif()
+
     if(EIGEN_TEST_NEON)
       message(STATUS "ARM NEON:          ON")
     else()
       message(STATUS "ARM NEON:          Using architecture defaults")
     endif()
 
+    if(EIGEN_TEST_NEON64)
+      message(STATUS "ARMv8 NEON:        ON")
+    else()
+      message(STATUS "ARMv8 NEON:        Using architecture defaults")
+    endif()
+
+    if(EIGEN_TEST_ZVECTOR)
+      message(STATUS "S390X ZVECTOR:     ON")
+    else()
+      message(STATUS "S390X ZVECTOR:     Using architecture defaults")
+    endif()
+
+    if(EIGEN_TEST_CXX11)
+      message(STATUS "C++11:             ON")
+    else()
+      message(STATUS "C++11:             OFF")
+    endif()
+
+    if(EIGEN_TEST_SYCL)
+      message(STATUS "SYCL:              ON")
+    else()
+      message(STATUS "SYCL:              OFF")
+    endif()
+    if(EIGEN_TEST_CUDA)
+      if(EIGEN_TEST_CUDA_CLANG)
+        message(STATUS "CUDA:              ON (using clang)")
+      else()
+        message(STATUS "CUDA:              ON (using nvcc)")
+      endif()
+    else()
+      message(STATUS "CUDA:              OFF")
+    endif()
+
   endif() # vectorization / alignment options
 
   message(STATUS "\n${EIGEN_TESTING_SUMMARY}")
@@ -287,7 +505,7 @@ macro(ei_init_testing)
 
   set_property(GLOBAL PROPERTY EIGEN_FAILTEST_FAILURE_COUNT "0")
   set_property(GLOBAL PROPERTY EIGEN_FAILTEST_COUNT "0")
-  
+
   # uncomment anytime you change the ei_get_compilerver_from_cxx_version_string macro
   # ei_test_get_compilerver_from_cxx_version_string()
 endmacro(ei_init_testing)
@@ -296,55 +514,47 @@ macro(ei_set_sitename)
   # if the sitename is not yet set, try to set it
   if(NOT ${SITE} OR ${SITE} STREQUAL "")
     set(eigen_computername $ENV{COMPUTERNAME})
-	set(eigen_hostname $ENV{HOSTNAME})
+    set(eigen_hostname $ENV{HOSTNAME})
     if(eigen_hostname)
       set(SITE ${eigen_hostname})
-	elseif(eigen_computername)
-	  set(SITE ${eigen_computername})
+    elseif(eigen_computername)
+      set(SITE ${eigen_computername})
     endif()
   endif()
   # in case it is already set, enforce lower case
   if(SITE)
     string(TOLOWER ${SITE} SITE)
-  endif()  
+  endif()
 endmacro(ei_set_sitename)
 
 macro(ei_get_compilerver VAR)
-  if(MSVC)
-    # on windows system, we use a modified CMake script  
-    include(EigenDetermineVSServicePack)
-    EigenDetermineVSServicePack( my_service_pack )
+    if(MSVC)
+      # on windows system, we use a modified CMake script
+      include(EigenDetermineVSServicePack)
+      EigenDetermineVSServicePack( my_service_pack )
 
-    if( my_service_pack )
-      set(${VAR} ${my_service_pack})
+      if( my_service_pack )
+        set(${VAR} ${my_service_pack})
+      else()
+        set(${VAR} "na")
+      endif()
     else()
-      set(${VAR} "na")
-    endif()
-  else()
     # on all other system we rely on ${CMAKE_CXX_COMPILER}
     # supporting a "--version" or "/version" flag
-    
-    if(WIN32 AND NOT CYGWIN)
+
+    if(WIN32 AND ${CMAKE_CXX_COMPILER_ID} EQUAL "Intel")
       set(EIGEN_CXX_FLAG_VERSION "/version")
     else()
       set(EIGEN_CXX_FLAG_VERSION "--version")
     endif()
-    
-    # check whether the head command exists
-    find_program(HEAD_EXE head NO_CMAKE_ENVIRONMENT_PATH NO_CMAKE_PATH NO_CMAKE_SYSTEM_PATH)
-    if(HEAD_EXE)
-      execute_process(COMMAND ${CMAKE_CXX_COMPILER} ${EIGEN_CXX_FLAG_VERSION}
-                      COMMAND head -n 1
-                      OUTPUT_VARIABLE eigen_cxx_compiler_version_string OUTPUT_STRIP_TRAILING_WHITESPACE)
-    else()
-      execute_process(COMMAND ${CMAKE_CXX_COMPILER}  ${EIGEN_CXX_FLAG_VERSION}
-                      OUTPUT_VARIABLE eigen_cxx_compiler_version_string OUTPUT_STRIP_TRAILING_WHITESPACE)
-      string(REGEX REPLACE "[\n\r].*"  ""  eigen_cxx_compiler_version_string  ${eigen_cxx_compiler_version_string})
-    endif()
-    
+
+    execute_process(COMMAND ${CMAKE_CXX_COMPILER} ${EIGEN_CXX_FLAG_VERSION}
+                    OUTPUT_VARIABLE eigen_cxx_compiler_version_string OUTPUT_STRIP_TRAILING_WHITESPACE)
+    string(REGEX REPLACE "[\n\r].*"  ""  eigen_cxx_compiler_version_string  ${eigen_cxx_compiler_version_string})
+
     ei_get_compilerver_from_cxx_version_string("${eigen_cxx_compiler_version_string}" CNAME CVER)
     set(${VAR} "${CNAME}-${CVER}")
-    
+
   endif()
 endmacro(ei_get_compilerver)
 
@@ -353,18 +563,20 @@ endmacro(ei_get_compilerver)
 # the testing macro call in ei_init_testing() of the EigenTesting.cmake file.
 # See also the ei_test_get_compilerver_from_cxx_version_string macro at the end of the file
 macro(ei_get_compilerver_from_cxx_version_string VERSTRING CNAME CVER)
-  # extract possible compiler names  
+  # extract possible compiler names
   string(REGEX MATCH "g\\+\\+"      ei_has_gpp    ${VERSTRING})
   string(REGEX MATCH "llvm|LLVM"    ei_has_llvm   ${VERSTRING})
   string(REGEX MATCH "gcc|GCC"      ei_has_gcc    ${VERSTRING})
   string(REGEX MATCH "icpc|ICC"     ei_has_icpc   ${VERSTRING})
   string(REGEX MATCH "clang|CLANG"  ei_has_clang  ${VERSTRING})
-  
+
   # combine them
   if((ei_has_llvm) AND (ei_has_gpp OR ei_has_gcc))
     set(${CNAME} "llvm-g++")
   elseif((ei_has_llvm) AND (ei_has_clang))
     set(${CNAME} "llvm-clang++")
+  elseif(ei_has_clang)
+    set(${CNAME} "clang++")
   elseif(ei_has_icpc)
     set(${CNAME} "icpc")
   elseif(ei_has_gpp OR ei_has_gcc)
@@ -372,7 +584,7 @@ macro(ei_get_compilerver_from_cxx_version_string VERSTRING CNAME CVER)
   else()
     set(${CNAME} "_")
   endif()
-  
+
   # extract possible version numbers
   # first try to extract 3 isolated numbers:
   string(REGEX MATCH " [0-9]+\\.[0-9]+\\.[0-9]+" eicver ${VERSTRING})
@@ -390,9 +602,9 @@ macro(ei_get_compilerver_from_cxx_version_string VERSTRING CNAME CVER)
       endif()
     endif()
   endif()
-  
+
   string(REGEX REPLACE ".(.*)" "\\1" ${CVER} ${eicver})
-  
+
 endmacro(ei_get_compilerver_from_cxx_version_string)
 
 macro(ei_get_cxxflags VAR)
@@ -400,8 +612,18 @@ macro(ei_get_cxxflags VAR)
   ei_is_64bit_env(IS_64BIT_ENV)
   if(EIGEN_TEST_NEON)
     set(${VAR} NEON)
+  elseif(EIGEN_TEST_NEON64)
+    set(${VAR} NEON)
+  elseif(EIGEN_TEST_ZVECTOR)
+    set(${VAR} ZVECTOR)
+  elseif(EIGEN_TEST_VSX)
+    set(${VAR} VSX)
   elseif(EIGEN_TEST_ALTIVEC)
     set(${VAR} ALVEC)
+  elseif(EIGEN_TEST_FMA)
+    set(${VAR} FMA)
+  elseif(EIGEN_TEST_AVX)
+    set(${VAR} AVX)
   elseif(EIGEN_TEST_SSE4_2)
     set(${VAR} SSE42)
   elseif(EIGEN_TEST_SSE4_1)
@@ -411,30 +633,30 @@ macro(ei_get_cxxflags VAR)
   elseif(EIGEN_TEST_SSE3)
     set(${VAR} SSE3)
   elseif(EIGEN_TEST_SSE2 OR IS_64BIT_ENV)
-    set(${VAR} SSE2)  
+    set(${VAR} SSE2)
   endif()
 
   if(EIGEN_TEST_OPENMP)
     if (${VAR} STREQUAL "")
-	  set(${VAR} OMP)
-	else()
-	  set(${VAR} ${${VAR}}-OMP)
-	endif()
+      set(${VAR} OMP)
+    else()
+      set(${VAR} ${${VAR}}-OMP)
+    endif()
   endif()
-  
+
   if(EIGEN_DEFAULT_TO_ROW_MAJOR)
     if (${VAR} STREQUAL "")
-	  set(${VAR} ROW)
-	else()
-	  set(${VAR} ${${VAR}}-ROWMAJ)
-	endif()  
+      set(${VAR} ROW)
+    else()
+      set(${VAR} ${${VAR}}-ROWMAJ)
+    endif()
   endif()
 endmacro(ei_get_cxxflags)
 
 macro(ei_set_build_string)
   ei_get_compilerver(LOCAL_COMPILER_VERSION)
   ei_get_cxxflags(LOCAL_COMPILER_FLAGS)
-  
+
   include(EigenDetermineOSVersion)
   DetermineOSVersion(OS_VERSION)
 
@@ -450,7 +672,11 @@ macro(ei_set_build_string)
   else()
     set(TMP_BUILD_STRING ${TMP_BUILD_STRING}-64bit)
   endif()
-  
+
+  if(EIGEN_TEST_CXX11)
+    set(TMP_BUILD_STRING ${TMP_BUILD_STRING}-cxx11)
+  endif()
+
   if(EIGEN_BUILD_STRING_SUFFIX)
     set(TMP_BUILD_STRING ${TMP_BUILD_STRING}-${EIGEN_BUILD_STRING_SUFFIX})
   endif()
diff --git a/cmake/EigenUninstall.cmake b/cmake/EigenUninstall.cmake
new file mode 100644
index 000000000..4dae8c85c
--- /dev/null
+++ b/cmake/EigenUninstall.cmake
@@ -0,0 +1,40 @@
+################ CMake Uninstall Template #######################
+# CMake Template file for uninstallation of files
+# mentioned in 'install_manifest.txt'
+#
+# Used by uinstall target
+#################################################################
+
+set(MANIFEST "${CMAKE_CURRENT_BINARY_DIR}/install_manifest.txt")
+
+if(EXISTS ${MANIFEST})
+  message(STATUS "============== Uninstalling Eigen  ===================")
+
+  file(STRINGS ${MANIFEST} files)
+  foreach(file ${files})
+    if(EXISTS ${file})
+      message(STATUS "Removing file: '${file}'")
+
+      execute_process(
+        COMMAND ${CMAKE_COMMAND} -E remove ${file}
+        OUTPUT_VARIABLE rm_out
+        RESULT_VARIABLE rm_retval
+        )
+
+      if(NOT "${rm_retval}" STREQUAL 0)
+        message(FATAL_ERROR "Failed to remove file: '${file}'.")
+      endif()
+    else()
+      message(STATUS "File '${file}' does not exist.")
+    endif()
+  endforeach(file)
+
+  message(STATUS "========== Finished Uninstalling Eigen  ==============")
+else()
+  message(STATUS "Cannot find install manifest: '${MANIFEST}'")
+  message(STATUS "Probably make install has not been performed")
+  message(STATUS "   or install_manifest.txt has been deleted.")
+endif()
+
+
+
diff --git a/cmake/FindAdolc.cmake b/cmake/FindAdolc.cmake
index 1a7ff3628..937e54990 100644
--- a/cmake/FindAdolc.cmake
+++ b/cmake/FindAdolc.cmake
@@ -5,7 +5,7 @@ endif (ADOLC_INCLUDES AND ADOLC_LIBRARIES)
 
 find_path(ADOLC_INCLUDES
   NAMES
-  adolc/adouble.h
+  adolc/adtl.h
   PATHS
   $ENV{ADOLCDIR}
   ${INCLUDE_INSTALL_DIR}
diff --git a/cmake/FindComputeCpp.cmake b/cmake/FindComputeCpp.cmake
new file mode 100644
index 000000000..07ebed61b
--- /dev/null
+++ b/cmake/FindComputeCpp.cmake
@@ -0,0 +1,245 @@
+#.rst:
+# FindComputeCpp
+#---------------
+#
+#   Copyright 2016 Codeplay Software Ltd.
+#
+#   Licensed under the Apache License, Version 2.0 (the "License");
+#   you may not use these files except in compliance with the License.
+#   You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the License is distributed on an "AS IS" BASIS,
+#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   See the License for the specific language governing permissions and
+#   limitations under the License.
+
+#########################
+#  FindComputeCpp.cmake
+#########################
+#
+#  Tools for finding and building with ComputeCpp.
+#
+#  User must define COMPUTECPP_PACKAGE_ROOT_DIR pointing to the ComputeCpp
+#   installation.
+#
+#  Latest version of this file can be found at:
+#    https://github.com/codeplaysoftware/computecpp-sdk
+
+# Require CMake version 3.2.2 or higher
+cmake_minimum_required(VERSION 3.2.2)
+
+# Check that a supported host compiler can be found
+if(CMAKE_COMPILER_IS_GNUCXX)
+    # Require at least gcc 4.8
+    if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.8)
+      message(FATAL_ERROR
+        "host compiler - Not found! (gcc version must be at least 4.8)")
+    # Require the GCC dual ABI to be disabled for 5.1 or higher
+    elseif (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5.1)
+      set(COMPUTECPP_DISABLE_GCC_DUAL_ABI "True")
+      message(STATUS
+        "host compiler - gcc ${CMAKE_CXX_COMPILER_VERSION} (note pre 5.1 gcc ABI enabled)")
+    else()
+      message(STATUS "host compiler - gcc ${CMAKE_CXX_COMPILER_VERSION}")
+    endif()
+elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+    # Require at least clang 3.6
+    if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.6)
+      message(FATAL_ERROR
+        "host compiler - Not found! (clang version must be at least 3.6)")
+    else()
+      message(STATUS "host compiler - clang ${CMAKE_CXX_COMPILER_VERSION}")
+    endif()
+else()
+  message(WARNING
+    "host compiler - Not found! (ComputeCpp supports GCC and Clang, see readme)")
+endif()
+
+set(COMPUTECPP_64_BIT_DEFAULT ON)
+option(COMPUTECPP_64_BIT_CODE "Compile device code in 64 bit mode"
+        ${COMPUTECPP_64_BIT_DEFAULT})
+mark_as_advanced(COMPUTECPP_64_BIT_CODE)
+
+# Find OpenCL package
+find_package(OpenCL REQUIRED)
+
+# Find ComputeCpp packagee
+if(NOT COMPUTECPP_PACKAGE_ROOT_DIR)
+  message(FATAL_ERROR
+    "ComputeCpp package - Not found! (please set COMPUTECPP_PACKAGE_ROOT_DIR")
+else()
+  message(STATUS "ComputeCpp package - Found")
+endif()
+option(COMPUTECPP_PACKAGE_ROOT_DIR "Path to the ComputeCpp Package")
+
+# Obtain the path to compute++
+find_program(COMPUTECPP_DEVICE_COMPILER compute++ PATHS
+  ${COMPUTECPP_PACKAGE_ROOT_DIR} PATH_SUFFIXES bin)
+if (EXISTS ${COMPUTECPP_DEVICE_COMPILER})
+  mark_as_advanced(COMPUTECPP_DEVICE_COMPILER)
+  message(STATUS "compute++ - Found")
+else()
+  message(FATAL_ERROR "compute++ - Not found! (${COMPUTECPP_DEVICE_COMPILER})")
+endif()
+
+# Obtain the path to computecpp_info
+find_program(COMPUTECPP_INFO_TOOL computecpp_info PATHS
+  ${COMPUTECPP_PACKAGE_ROOT_DIR} PATH_SUFFIXES bin)
+if (EXISTS ${COMPUTECPP_INFO_TOOL})
+  mark_as_advanced(${COMPUTECPP_INFO_TOOL})
+  message(STATUS "computecpp_info - Found")
+else()
+  message(FATAL_ERROR "computecpp_info - Not found! (${COMPUTECPP_INFO_TOOL})")
+endif()
+
+# Obtain the path to the ComputeCpp runtime library
+find_library(COMPUTECPP_RUNTIME_LIBRARY ComputeCpp PATHS ${COMPUTECPP_PACKAGE_ROOT_DIR}
+  HINTS ${COMPUTECPP_PACKAGE_ROOT_DIR}/lib PATH_SUFFIXES lib
+  DOC "ComputeCpp Runtime Library" NO_DEFAULT_PATH)
+
+if (EXISTS ${COMPUTECPP_RUNTIME_LIBRARY})
+  mark_as_advanced(COMPUTECPP_RUNTIME_LIBRARY)
+  message(STATUS "libComputeCpp.so - Found")
+else()
+  message(FATAL_ERROR "libComputeCpp.so - Not found!")
+endif()
+
+# Obtain the ComputeCpp include directory
+set(COMPUTECPP_INCLUDE_DIRECTORY ${COMPUTECPP_PACKAGE_ROOT_DIR}/include/)
+if (NOT EXISTS ${COMPUTECPP_INCLUDE_DIRECTORY})
+  message(FATAL_ERROR "ComputeCpp includes - Not found!")
+else()
+  message(STATUS "ComputeCpp includes - Found")
+endif()
+
+# Obtain the package version
+execute_process(COMMAND ${COMPUTECPP_INFO_TOOL} "--dump-version"
+  OUTPUT_VARIABLE COMPUTECPP_PACKAGE_VERSION
+  RESULT_VARIABLE COMPUTECPP_INFO_TOOL_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE)
+if(NOT COMPUTECPP_INFO_TOOL_RESULT EQUAL "0")
+  message(FATAL_ERROR "Package version - Error obtaining version!")
+else()
+  mark_as_advanced(COMPUTECPP_PACKAGE_VERSION)
+  message(STATUS "Package version - ${COMPUTECPP_PACKAGE_VERSION}")
+endif()
+
+# Obtain the device compiler flags
+execute_process(COMMAND ${COMPUTECPP_INFO_TOOL} "--dump-device-compiler-flags"
+  OUTPUT_VARIABLE COMPUTECPP_DEVICE_COMPILER_FLAGS
+  RESULT_VARIABLE COMPUTECPP_INFO_TOOL_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE)
+if(NOT COMPUTECPP_INFO_TOOL_RESULT EQUAL "0")
+  message(FATAL_ERROR "compute++ flags - Error obtaining compute++ flags!")
+else()
+  mark_as_advanced(COMPUTECPP_COMPILER_FLAGS)
+  message(STATUS "compute++ flags - ${COMPUTECPP_DEVICE_COMPILER_FLAGS}")
+endif()
+
+set(COMPUTECPP_DEVICE_COMPILER_FLAGS ${COMPUTECPP_DEVICE_COMPILER_FLAGS} -sycl-compress-name -no-serial-memop -DEIGEN_NO_ASSERTION_CHECKING=1)
+
+# Check if the platform is supported
+execute_process(COMMAND ${COMPUTECPP_INFO_TOOL} "--dump-is-supported"
+  OUTPUT_VARIABLE COMPUTECPP_PLATFORM_IS_SUPPORTED
+  RESULT_VARIABLE COMPUTECPP_INFO_TOOL_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE)
+if(NOT COMPUTECPP_INFO_TOOL_RESULT EQUAL "0")
+  message(FATAL_ERROR "platform - Error checking platform support!")
+else()
+  mark_as_advanced(COMPUTECPP_PLATFORM_IS_SUPPORTED)
+  if (COMPUTECPP_PLATFORM_IS_SUPPORTED)
+    message(STATUS "platform - your system can support ComputeCpp")
+  else()
+    message(STATUS "platform - your system CANNOT support ComputeCpp")
+  endif()
+endif()
+
+####################
+#   __build_sycl
+####################
+#
+#  Adds a custom target for running compute++ and adding a dependency for the
+#  resulting integration header.
+#
+#  targetName : Name of the target.
+#  sourceFile : Source file to be compiled.
+#  binaryDir : Intermediate directory to output the integration header.
+#
+function(__build_spir targetName sourceFile binaryDir)
+
+  # Retrieve source file name.
+  get_filename_component(sourceFileName ${sourceFile} NAME)
+
+  # Set the path to the Sycl file.
+  set(outputSyclFile ${binaryDir}/${sourceFileName}.sycl)
+
+  # Add any user-defined include to the device compiler
+  get_property(includeDirectories DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY
+    INCLUDE_DIRECTORIES)
+  set(device_compiler_includes "")
+  foreach(directory ${includeDirectories})
+    set(device_compiler_includes "-I${directory}" ${device_compiler_includes})
+  endforeach()
+  if (CMAKE_INCLUDE_PATH)
+    foreach(directory ${CMAKE_INCLUDE_PATH})
+      set(device_compiler_includes "-I${directory}"
+        ${device_compiler_includes})
+    endforeach()
+  endif()
+
+  # Convert argument list format
+  separate_arguments(COMPUTECPP_DEVICE_COMPILER_FLAGS)
+
+  # Add custom command for running compute++
+  add_custom_command(
+    OUTPUT ${outputSyclFile}
+    COMMAND ${COMPUTECPP_DEVICE_COMPILER}
+            ${COMPUTECPP_DEVICE_COMPILER_FLAGS}
+            -isystem ${COMPUTECPP_INCLUDE_DIRECTORY}
+            ${COMPUTECPP_PLATFORM_SPECIFIC_ARGS}
+            ${device_compiler_includes}
+            -o ${outputSyclFile}
+            -c ${CMAKE_CURRENT_SOURCE_DIR}/${sourceFile}
+    DEPENDS ${sourceFile}
+    WORKING_DIRECTORY ${binaryDir}
+  COMMENT "Building ComputeCpp integration header file ${outputSyclFile}")
+
+  # Add a custom target for the generated integration header
+  add_custom_target(${targetName}_integration_header DEPENDS ${outputSyclFile})
+
+  # Add a dependency on the integration header
+  add_dependencies(${targetName} ${targetName}_integration_header)
+
+  # Set the host compiler C++ standard to C++11
+  set_property(TARGET ${targetName} PROPERTY CXX_STANDARD 11)
+
+  # Disable GCC dual ABI on GCC 5.1 and higher
+  if(COMPUTECPP_DISABLE_GCC_DUAL_ABI)
+    set_property(TARGET ${targetName} APPEND PROPERTY COMPILE_DEFINITIONS
+      "_GLIBCXX_USE_CXX11_ABI=0")
+  endif()
+
+endfunction()
+
+#######################
+#  add_sycl_to_target
+#######################
+#
+#  Adds a SYCL compilation custom command associated with an existing
+#  target and sets a dependancy on that new command.
+#
+#  targetName : Name of the target to add a SYCL to.
+#  sourceFile : Source file to be compiled for SYCL.
+#  binaryDir : Intermediate directory to output the integration header.
+#
+function(add_sycl_to_target targetName sourceFile binaryDir)
+
+  # Add custom target to run compute++ and generate the integration header
+  __build_spir(${targetName} ${sourceFile} ${binaryDir})
+
+  # Link with the ComputeCpp runtime library
+  target_link_libraries(${targetName} PUBLIC ${COMPUTECPP_RUNTIME_LIBRARY}
+                        PUBLIC ${OpenCL_LIBRARIES})
+
+endfunction(add_sycl_to_target)
diff --git a/cmake/FindEigen3.cmake b/cmake/FindEigen3.cmake
index 9c546a05d..9e9697860 100644
--- a/cmake/FindEigen3.cmake
+++ b/cmake/FindEigen3.cmake
@@ -9,6 +9,12 @@
 #  EIGEN3_FOUND - system has eigen lib with correct version
 #  EIGEN3_INCLUDE_DIR - the eigen include directory
 #  EIGEN3_VERSION - eigen version
+#
+# This module reads hints about search locations from 
+# the following enviroment variables:
+#
+# EIGEN3_ROOT
+# EIGEN3_ROOT_DIR
 
 # Copyright (c) 2006, 2007 Montel Laurent, <montel@kde.org>
 # Copyright (c) 2008, 2009 Gael Guennebaud, <g.gael@free.fr>
@@ -60,13 +66,23 @@ if (EIGEN3_INCLUDE_DIR)
   set(EIGEN3_FOUND ${EIGEN3_VERSION_OK})
 
 else (EIGEN3_INCLUDE_DIR)
-
-  find_path(EIGEN3_INCLUDE_DIR NAMES signature_of_eigen3_matrix_library
-      PATHS
-      ${CMAKE_INSTALL_PREFIX}/include
-      ${KDE4_INCLUDE_DIR}
-      PATH_SUFFIXES eigen3 eigen
-    )
+  
+  # search first if an Eigen3Config.cmake is available in the system,
+  # if successful this would set EIGEN3_INCLUDE_DIR and the rest of
+  # the script will work as usual
+  find_package(Eigen3 ${Eigen3_FIND_VERSION} NO_MODULE QUIET)
+
+  if(NOT EIGEN3_INCLUDE_DIR)
+    find_path(EIGEN3_INCLUDE_DIR NAMES signature_of_eigen3_matrix_library
+        HINTS
+        ENV EIGEN3_ROOT 
+        ENV EIGEN3_ROOT_DIR
+        PATHS
+        ${CMAKE_INSTALL_PREFIX}/include
+        ${KDE4_INCLUDE_DIR}
+        PATH_SUFFIXES eigen3 eigen
+      )
+  endif(NOT EIGEN3_INCLUDE_DIR)
 
   if(EIGEN3_INCLUDE_DIR)
     _eigen3_check_version()
diff --git a/cmake/FindSPQR.cmake b/cmake/FindSPQR.cmake
index 794c212af..1e958c3c1 100644
--- a/cmake/FindSPQR.cmake
+++ b/cmake/FindSPQR.cmake
@@ -26,7 +26,12 @@ if(SPQR_LIBRARIES)
   find_library(SUITESPARSE_LIBRARY SuiteSparse PATHS $ENV{SPQRDIR} ${LIB_INSTALL_DIR})
   if (SUITESPARSE_LIBRARY)
     set(SPQR_LIBRARIES ${SPQR_LIBRARIES} ${SUITESPARSE_LIBRARY})
-  endif (SUITESPARSE_LIBRARY)
+  endif()
+
+  find_library(CHOLMOD_LIBRARY cholmod PATHS $ENV{UMFPACK_LIBDIR} $ENV{UMFPACKDIR} ${LIB_INSTALL_DIR})
+  if(CHOLMOD_LIBRARY)
+    set(SPQR_LIBRARIES ${SPQR_LIBRARIES} ${CHOLMOD_LIBRARY})
+  endif()
   
 endif(SPQR_LIBRARIES)
 
diff --git a/cmake/FindSuperLU.cmake b/cmake/FindSuperLU.cmake
index 8a3df3666..f38146e06 100644
--- a/cmake/FindSuperLU.cmake
+++ b/cmake/FindSuperLU.cmake
@@ -17,10 +17,81 @@ find_path(SUPERLU_INCLUDES
   SRC
 )
 
-find_library(SUPERLU_LIBRARIES superlu PATHS $ENV{SUPERLUDIR} ${LIB_INSTALL_DIR} PATH_SUFFIXES lib)
-  
+find_library(SUPERLU_LIBRARIES
+  NAMES "superlu_5.2.1" "superlu_5.2" "superlu_5.1.1" "superlu_5.1" "superlu_5.0" "superlu_4.3" "superlu_4.2" "superlu_4.1" "superlu_4.0" "superlu_3.1" "superlu_3.0" "superlu"
+  PATHS $ENV{SUPERLUDIR} ${LIB_INSTALL_DIR}
+  PATH_SUFFIXES lib)
+
+if(SUPERLU_INCLUDES AND SUPERLU_LIBRARIES)
+
+include(CheckCXXSourceCompiles)
+include(CMakePushCheckState)
+cmake_push_check_state()
+
+set(CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES} ${SUPERLU_INCLUDES})
+
+# check whether struct mem_usage_t is globally defined
+check_cxx_source_compiles("
+typedef int int_t;
+#include <supermatrix.h>
+#include <slu_util.h>
+int main() {
+  mem_usage_t mem;
+  return 0;
+}"
+SUPERLU_HAS_GLOBAL_MEM_USAGE_T)
+
+
+check_cxx_source_compiles("
+typedef int int_t;
+#include <supermatrix.h>
+#include <superlu_enum_consts.h>
+int main() {
+  return SLU_SINGLE;
+}"
+SUPERLU_HAS_CLEAN_ENUMS)
+
+check_cxx_source_compiles("
+typedef int int_t;
+#include <supermatrix.h>
+#include <slu_util.h>
+int main(void)
+{
+  GlobalLU_t glu;
+  return 0;
+}"
+SUPERLU_HAS_GLOBALLU_T)
+
+if(SUPERLU_HAS_GLOBALLU_T)
+  # at least 5.0
+  set(SUPERLU_VERSION_VAR "5.0")
+elseif(SUPERLU_HAS_CLEAN_ENUMS)
+  # at least 4.3
+  set(SUPERLU_VERSION_VAR "4.3")
+elseif(SUPERLU_HAS_GLOBAL_MEM_USAGE_T)
+  # at least 4.0
+  set(SUPERLU_VERSION_VAR "4.0")
+else()
+  set(SUPERLU_VERSION_VAR "3.0")
+endif()
+
+cmake_pop_check_state()
+
+if(SuperLU_FIND_VERSION)
+  if(${SUPERLU_VERSION_VAR} VERSION_LESS ${SuperLU_FIND_VERSION})
+    set(SUPERLU_VERSION_OK FALSE)
+  else()
+    set(SUPERLU_VERSION_OK TRUE)
+  endif()
+else()
+  set(SUPERLU_VERSION_OK TRUE)
+endif()
+
+endif()
+
 include(FindPackageHandleStandardArgs)
-find_package_handle_standard_args(SUPERLU DEFAULT_MSG
-                                  SUPERLU_INCLUDES SUPERLU_LIBRARIES)
+find_package_handle_standard_args(SUPERLU
+                                  REQUIRED_VARS SUPERLU_INCLUDES SUPERLU_LIBRARIES SUPERLU_VERSION_OK
+                                  VERSION_VAR SUPERLU_VERSION_VAR)
 
 mark_as_advanced(SUPERLU_INCLUDES SUPERLU_LIBRARIES)
diff --git a/cmake/FindUmfpack.cmake b/cmake/FindUmfpack.cmake
index 16b046cd6..53cf0b49b 100644
--- a/cmake/FindUmfpack.cmake
+++ b/cmake/FindUmfpack.cmake
@@ -20,24 +20,29 @@ find_library(UMFPACK_LIBRARIES umfpack PATHS $ENV{UMFPACKDIR} ${LIB_INSTALL_DIR}
 
 if(UMFPACK_LIBRARIES)
 
-  if (NOT UMFPACK_LIBDIR)
+  if(NOT UMFPACK_LIBDIR)
     get_filename_component(UMFPACK_LIBDIR ${UMFPACK_LIBRARIES} PATH)
   endif(NOT UMFPACK_LIBDIR)
 
   find_library(COLAMD_LIBRARY colamd PATHS ${UMFPACK_LIBDIR} $ENV{UMFPACKDIR} ${LIB_INSTALL_DIR})
-  if (COLAMD_LIBRARY)
+  if(COLAMD_LIBRARY)
     set(UMFPACK_LIBRARIES ${UMFPACK_LIBRARIES} ${COLAMD_LIBRARY})
-  endif (COLAMD_LIBRARY)
+  endif ()
   
   find_library(AMD_LIBRARY amd PATHS ${UMFPACK_LIBDIR} $ENV{UMFPACKDIR} ${LIB_INSTALL_DIR})
-  if (AMD_LIBRARY)
+  if(AMD_LIBRARY)
     set(UMFPACK_LIBRARIES ${UMFPACK_LIBRARIES} ${AMD_LIBRARY})
-  endif (AMD_LIBRARY)
+  endif ()
 
   find_library(SUITESPARSE_LIBRARY SuiteSparse PATHS ${UMFPACK_LIBDIR} $ENV{UMFPACKDIR} ${LIB_INSTALL_DIR})
-  if (SUITESPARSE_LIBRARY)
+  if(SUITESPARSE_LIBRARY)
     set(UMFPACK_LIBRARIES ${UMFPACK_LIBRARIES} ${SUITESPARSE_LIBRARY})
-  endif (SUITESPARSE_LIBRARY)
+  endif ()
+
+  find_library(CHOLMOD_LIBRARY cholmod PATHS $ENV{UMFPACK_LIBDIR} $ENV{UMFPACKDIR} ${LIB_INSTALL_DIR})
+  if(CHOLMOD_LIBRARY)
+    set(UMFPACK_LIBRARIES ${UMFPACK_LIBRARIES} ${CHOLMOD_LIBRARY})
+  endif()
 
 endif(UMFPACK_LIBRARIES)
 
@@ -45,4 +50,4 @@ include(FindPackageHandleStandardArgs)
 find_package_handle_standard_args(UMFPACK DEFAULT_MSG
                                   UMFPACK_INCLUDES UMFPACK_LIBRARIES)
 
-mark_as_advanced(UMFPACK_INCLUDES UMFPACK_LIBRARIES AMD_LIBRARY COLAMD_LIBRARY SUITESPARSE_LIBRARY)
+mark_as_advanced(UMFPACK_INCLUDES UMFPACK_LIBRARIES AMD_LIBRARY COLAMD_LIBRARY CHOLMOD_LIBRARY SUITESPARSE_LIBRARY)
diff --git a/cmake/UseEigen3.cmake b/cmake/UseEigen3.cmake
new file mode 100644
index 000000000..a38bac82d
--- /dev/null
+++ b/cmake/UseEigen3.cmake
@@ -0,0 +1,6 @@
+#                                               -*- cmake -*-
+#
+#  UseEigen3.cmake
+
+add_definitions     ( ${EIGEN3_DEFINITIONS} )
+include_directories ( ${EIGEN3_INCLUDE_DIRS} )
diff --git a/cmake/language_support.cmake b/cmake/language_support.cmake
index 231f7ff70..2f14f30b8 100644
--- a/cmake/language_support.cmake
+++ b/cmake/language_support.cmake
@@ -43,7 +43,7 @@ function(workaround_9220 language language_works)
   if(return_code EQUAL 0)
     # Second run
     execute_process (
-      COMMAND ${CMAKE_COMMAND} .
+      COMMAND ${CMAKE_COMMAND} . -G "${CMAKE_GENERATOR}"
       WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/language_tests/${language}
       RESULT_VARIABLE return_code
       OUTPUT_QUIET
@@ -64,3 +64,4 @@ endfunction(workaround_9220)
 #message("CXX_language_works = ${CXX_language_works}")
 #workaround_9220(CXXp CXXp_language_works)
 #message("CXXp_language_works = ${CXXp_language_works}")
+
diff --git a/debug/gdb/printers.py b/debug/gdb/printers.py
index 86996a4f9..0d67a5f99 100644
--- a/debug/gdb/printers.py
+++ b/debug/gdb/printers.py
@@ -49,7 +49,7 @@ class EigenMatrixPrinter:
 		regex = re.compile('\<.*\>')
 		m = regex.findall(tag)[0][1:-1]
 		template_params = m.split(',')
-		template_params = map(lambda x:x.replace(" ", ""), template_params)
+		template_params = [x.replace(" ", "") for x in template_params]
 		
 		if template_params[1] == '-0x00000000000000001' or template_params[1] == '-0x000000001' or template_params[1] == '-1':
 			self.rows = val['m_storage']['m_rows']
@@ -88,8 +88,11 @@ class EigenMatrixPrinter:
 			
 		def __iter__ (self):
 			return self
-			
+
 		def next(self):
+                        return self.__next__()  # Python 2.x compatibility
+
+		def __next__(self):
 			
 			row = self.currentRow
 			col = self.currentCol
@@ -151,8 +154,11 @@ class EigenQuaternionPrinter:
 			
 		def __iter__ (self):
 			return self
-			
+	
 		def next(self):
+                        return self.__next__()  # Python 2.x compatibility
+
+		def __next__(self):
 			element = self.currentElement
 			
 			if self.currentElement >= 4: #there are 4 elements in a quanternion
diff --git a/demos/opengl/quaternion_demo.cpp b/demos/opengl/quaternion_demo.cpp
index 04165619b..dd323a4c9 100644
--- a/demos/opengl/quaternion_demo.cpp
+++ b/demos/opengl/quaternion_demo.cpp
@@ -234,7 +234,7 @@ void RenderingWidget::drawScene()
   gpu.drawVector(Vector3f::Zero(), length*Vector3f::UnitZ(), Color(0,0,1,1));
 
   // draw the fractal object
-  float sqrt3 = internal::sqrt(3.);
+  float sqrt3 = std::sqrt(3.);
   glLightfv(GL_LIGHT0, GL_AMBIENT, Vector4f(0.5,0.5,0.5,1).data());
   glLightfv(GL_LIGHT0, GL_DIFFUSE, Vector4f(0.5,1,0.5,1).data());
   glLightfv(GL_LIGHT0, GL_SPECULAR, Vector4f(1,1,1,1).data());
diff --git a/demos/opengl/trackball.cpp b/demos/opengl/trackball.cpp
index 77ac790c8..7c2da8e96 100644
--- a/demos/opengl/trackball.cpp
+++ b/demos/opengl/trackball.cpp
@@ -23,7 +23,7 @@ void Trackball::track(const Vector2i& point2D)
   {
     Vector3f axis = mLastPoint3D.cross(newPoint3D).normalized();
     float cos_angle = mLastPoint3D.dot(newPoint3D);
-    if ( internal::abs(cos_angle) < 1.0 )
+    if ( std::abs(cos_angle) < 1.0 )
     {
       float angle = 2. * acos(cos_angle);
       if (mMode==Around)
diff --git a/doc/A05_PortingFrom2To3.dox b/doc/A05_PortingFrom2To3.dox
index 4d5f3ae1f..51555f996 100644
--- a/doc/A05_PortingFrom2To3.dox
+++ b/doc/A05_PortingFrom2To3.dox
@@ -2,8 +2,6 @@ namespace Eigen {
 
 /** \page Eigen2ToEigen3 Porting from Eigen2 to Eigen3
 
-<div class="bigwarning">Eigen2 support is deprecated in Eigen 3.2.x and it will be removed in Eigen 3.3.</div>
-
 This page lists the most important API changes between Eigen2 and Eigen3,
 and gives tips to help porting your application from Eigen2 to Eigen3.
 
@@ -11,11 +9,8 @@ and gives tips to help porting your application from Eigen2 to Eigen3.
 
 \section CompatibilitySupport Eigen2 compatibility support
 
-In order to ease the switch from Eigen2 to Eigen3, Eigen3 features \subpage Eigen2SupportModes "Eigen2 support modes".
-
-The quick way to enable this is to define the \c EIGEN2_SUPPORT preprocessor token \b before including any Eigen header (typically it should be set in your project options).
-
-A more powerful, \em staged migration path is also provided, which may be useful to migrate larger projects from Eigen2 to Eigen3. This is explained in the \ref Eigen2SupportModes "Eigen 2 support modes" page. 
+Up to version 3.2 %Eigen provides <a href="http://eigen.tuxfamily.org/dox-3.2/Eigen2SupportModes.html">Eigen2 support modes</a>. These are removed now, because they were barely used anymore and became hard to maintain after internal re-designs.
+You can still use them by first <a href="http://eigen.tuxfamily.org/dox-3.2/Eigen2ToEigen3.html">porting your code to Eigen 3.2</a>.
 
 \section Using The USING_PART_OF_NAMESPACE_EIGEN macro
 
@@ -228,7 +223,7 @@ triangular part to work on</td></tr>
 
 \section GeometryModule Changes in the Geometry module
 
-The Geometry module is the one that changed the most. If you rely heavily on it, it's probably a good idea to use the \ref Eigen2SupportModes "Eigen 2 support modes" to perform your migration.
+The Geometry module is the one that changed the most. If you rely heavily on it, it's probably a good idea to use the <a href="http://eigen.tuxfamily.org/dox-3.2/Eigen2SupportModes.html">"Eigen 2 support modes"</a> to perform your migration.
 
 \section Transform The Transform class
 
@@ -266,7 +261,7 @@ use it unless you are sure of what you are doing, i.e., you have rigourosly meas
 
 The EIGEN_ALIGN_128 macro has been renamed to EIGEN_ALIGN16. Don't be surprised, it's just that we switched to counting in bytes ;-)
 
-The EIGEN_DONT_ALIGN option still exists in Eigen 3, but it has a new cousin: EIGEN_DONT_ALIGN_STATICALLY. It allows to get rid of all static alignment issues while keeping alignment of dynamic-size heap-allocated arrays, thus keeping vectorization for dynamic-size objects.
+The \link TopicPreprocessorDirectivesPerformance EIGEN_DONT_ALIGN \endlink option still exists in Eigen 3, but it has a new cousin: \link TopicPreprocessorDirectivesPerformance  EIGEN_DONT_ALIGN_STATICALLY.\endlink It allows to get rid of all static alignment issues while keeping alignment of dynamic-size heap-allocated arrays. Vectorization of statically allocated arrays is still preserved (unless you define \link TopicPreprocessorDirectivesPerformance EIGEN_UNALIGNED_VECTORIZE \endlink =0), at the cost of unaligned memory stores.
 
 \section AlignedMap Aligned Map objects
 
@@ -283,7 +278,7 @@ result = Vector4f::MapAligned(some_aligned_array);
 
 \section StdContainers STL Containers
 
-In Eigen2, <tt>#include<Eigen/StdVector></tt> tweaked std::vector to automatically align elements. The problem was that that was quite invasive. In Eigen3, we only override standard behavior if you use Eigen::aligned_allocator<T> as your allocator type. So for example, if you use std::vector<Matrix4f>, you need to do the following change (note that aligned_allocator is under namespace Eigen):
+In Eigen2, <tt>\#include\<Eigen/StdVector\></tt> tweaked std::vector to automatically align elements. The problem was that that was quite invasive. In Eigen3, we only override standard behavior if you use Eigen::aligned_allocator<T> as your allocator type. So for example, if you use std::vector<Matrix4f>, you need to do the following change (note that aligned_allocator is under namespace Eigen):
 
 <table class="manual">
 <tr><th>Eigen 2</th><th>Eigen 3</th></tr>
diff --git a/doc/A10_Eigen2SupportModes.dox b/doc/A10_Eigen2SupportModes.dox
deleted file mode 100644
index f3df91515..000000000
--- a/doc/A10_Eigen2SupportModes.dox
+++ /dev/null
@@ -1,95 +0,0 @@
-namespace Eigen {
-
-/** \page Eigen2SupportModes Eigen 2 support modes
-
-<div class="bigwarning">Eigen2 support is deprecated in Eigen 3.2.x and it will be removed in Eigen 3.3.</div>
-
-This page documents the Eigen2 support modes, a powerful tool to help migrating your project from Eigen 2 to Eigen 3.
-Don't miss our page on \ref Eigen2ToEigen3 "API changes" between Eigen 2 and Eigen 3.
-
-\eigenAutoToc
-
-\section EIGEN2_SUPPORT_Macro The quick way: define EIGEN2_SUPPORT
-
-By defining EIGEN2_SUPPORT before including any Eigen 3 header, you get back a large part of the Eigen 2 API, while keeping the Eigen 3 API and ABI unchanged.
-
-This defaults to the \ref Stage30 "stage 30" described below.
-
-The rest of this page describes an optional, more powerful \em staged migration path.
-
-\section StagedMigrationPathOverview Overview of the staged migration path
-
-The primary reason why EIGEN2_SUPPORT alone may not be enough to migrate a large project from Eigen 2 to Eigen 3 is that some of the Eigen 2 API is inherently incompatible with the Eigen 3 API. This happens when the same identifier is used in Eigen 2 and in Eigen 3 with different meanings. To help migrate projects that rely on such API, we provide a staged migration path allowing to perform the migration \em incrementally.
-
-It goes as follows:
-\li Step 0: start with a project using Eigen 2.
-\li Step 1: build your project against Eigen 3 with \ref Stage10 "Eigen 2 support stage 10". This mode enables maximum compatibility with the Eigen 2 API, with just a few exceptions.
-\li Step 2: build your project against Eigen 3 with \ref Stage20 "Eigen 2 support stage 20". This mode forces you to add eigen2_ prefixes to the Eigen2 identifiers that conflict with Eigen 3 API.
-\li Step 3: build your project against Eigen 3 with \ref Stage30 "Eigen 2 support stage 30". This mode enables the full Eigen 3 API.
-\li Step 4: build your project against Eigen 3 with \ref Stage40 "Eigen 2 support stage 40". This mode enables the full Eigen 3 strictness on matters, such as const-correctness, where Eigen 2 was looser.
-\li Step 5: build your project against Eigen 3 without any Eigen 2 support mode.
-
-\section Stage10 Stage 10: define EIGEN2_SUPPORT_STAGE10_FULL_EIGEN2_API
-
-Enable this mode by defining the EIGEN2_SUPPORT_STAGE10_FULL_EIGEN2_API preprocessor macro before including any Eigen 3 header.
-
-This mode maximizes support for the Eigen 2 API. As a result, it does not offer the full Eigen 3 API. Also, it doesn't offer quite 100% of the Eigen 2 API.
-
-The part of the Eigen 3 API that is not present in this mode, is Eigen 3's Geometry module. Indeed, this mode completely replaces it by a copy of Eigen 2's Geometry module.
-
-The parts of the API that are still not 100% Eigen 2 compatible in this mode are:
-\li Dot products over complex numbers. Eigen 2's dot product was linear in the first variable. Eigen 3's dot product is linear in the second variable. In other words, the Eigen 2 code \code x.dot(y) \endcode is equivalent to the Eigen 3 code \code y.dot(x) \endcode In yet other words, dot products are complex-conjugated in Eigen 3 compared to Eigen 2. The switch to the new convention was commanded by common usage, especially with the notation \f$ x^Ty \f$ for dot products of column-vectors.
-\li The Sparse module.
-\li Certain fine details of linear algebraic decompositions. For example, LDLT decomposition is now pivoting in Eigen 3 whereas it wasn't in Eigen 2, so code that was relying on its underlying matrix structure will break.
-\li Usage of Eigen types in STL containers, \ref Eigen2ToEigen3 "as explained on this page".
-
-\section Stage20 Stage 20: define EIGEN2_SUPPORT_STAGE20_RESOLVE_API_CONFLICTS
-
-Enable this mode by defining the EIGEN2_SUPPORT_STAGE10_FULL_EIGEN2_API preprocessor macro before including any Eigen 3 header.
-
-This mode removes the Eigen 2 API that is directly conflicting with Eigen 3 API. Instead, these bits of Eigen 2 API remain available with eigen2_ prefixes. The main examples of such API are:
-\li the whole Geometry module. For example, replace \c Quaternion by \c eigen2_Quaternion, replace \c Transform3f by \c eigen2_Transform3f, etc.
-\li the lu() method to obtain a LU decomposition. Replace by eigen2_lu().
-
-There is also one more eigen2_-prefixed identifier that you should know about, even though its use is not checked at compile time by this mode: the dot() method. As was discussed above, over complex numbers, its meaning is different between Eigen 2 and Eigen 3. You can use eigen2_dot() to get the Eigen 2 behavior.
-
-\section Stage30 Stage 30: define EIGEN2_SUPPORT_STAGE30_FULL_EIGEN3_API
-
-Enable this mode by defining the EIGEN2_SUPPORT_STAGE30_FULL_EIGEN3_API preprocessor macro before including any Eigen 3 header. Also, this mode is what you get by default when you just define EIGEN2_SUPPORT.
-
-This mode gives you the full unaltered Eigen 3 API, while still keeping as much support as possible for the Eigen 2 API.
-
-The eigen2_-prefixed identifiers are still available, but at this stage you should now replace them by Eigen 3 identifiers. Have a look at our page on \ref Eigen2ToEigen3 "API changes" between Eigen 2 and Eigen 3.
-
-\section Stage40 Stage 40: define EIGEN2_SUPPORT_STAGE40_FULL_EIGEN3_STRICTNESS
-
-Enable this mode by defining the EIGEN2_SUPPORT_STAGE40_FULL_EIGEN3_STRICTNESS preprocessor macro before including any Eigen 3 header.
-
-This mode tightens the last bits of strictness, especially const-correctness, that had to be loosened to support what Eigen 2 allowed. For example, this code compiled in Eigen 2:
-\code
-const float array[4];
-x = Map<Vector4f>(array);
-\endcode
-That allowed to circumvent constness. This is no longer allowed in Eigen 3. If you have to map const data in Eigen 3, map it as a const-qualified type. However, rather than explictly constructing Map objects, we strongly encourage you to use the static Map methods instead, as they take care of all of this for you:
-\code
-const float array[4];
-x = Vector4f::Map(array);
-\endcode
-This lets Eigen do the right thing for you and works equally well in Eigen 2 and in Eigen 3.
-
-\section FinallyDropAllEigen2Support Finally drop all Eigen 2 support
-
-Stage 40 is the first where it's "comfortable" to stay for a little longer period, since it preserves 100% Eigen 3 compatibility. However, we still encourage you to complete your migration as quickly as possible. While we do run the Eigen 2 test suite against Eigen 3's stage 10 support mode, we can't guarantee the same level of support and quality assurance for Eigen 2 support as we do for Eigen 3 itself, especially not in the long term. \ref Eigen2ToEigen3 "This page" describes a large part of the changes that you may need to perform.
-
-\section ABICompatibility What about ABI compatibility?
-
-It goes as follows:
-\li Stage 10 already is ABI compatible with Eigen 3 for the basic (Matrix, Array, SparseMatrix...) types. However, since this stage uses a copy of Eigen 2's Geometry module instead of Eigen 3's own Geometry module, the ABI in the Geometry module is not Eigen 3 compatible.
-\li Stage 20 removes the Eigen 3-incompatible Eigen 2 Geometry module (it remains available with eigen2_ prefix). So at this stage, all the identifiers that exist in Eigen 3 have the Eigen 3 ABI (and API).
-\li Stage 30 introduces the remaining Eigen 3 identifiers. So at this stage, you have the full Eigen 3 ABI.
-\li Stage 40 is no different than Stage 30 in these matters.
-
-
-*/
-
-}
diff --git a/doc/AsciiQuickReference.txt b/doc/AsciiQuickReference.txt
index b9f497f87..8409f8850 100644
--- a/doc/AsciiQuickReference.txt
+++ b/doc/AsciiQuickReference.txt
@@ -32,17 +32,19 @@ A << 1, 2, 3,     // Initialize A. The elements can also be
 B << A, A, A;     // B is three horizontally stacked A's.
 A.fill(10);       // Fill A with all 10's.
 
-// Eigen                            // Matlab
-MatrixXd::Identity(rows,cols)       // eye(rows,cols)
-C.setIdentity(rows,cols)            // C = eye(rows,cols)
-MatrixXd::Zero(rows,cols)           // zeros(rows,cols)
-C.setZero(rows,cols)                // C = ones(rows,cols)
-MatrixXd::Ones(rows,cols)           // ones(rows,cols)
-C.setOnes(rows,cols)                // C = ones(rows,cols)
-MatrixXd::Random(rows,cols)         // rand(rows,cols)*2-1        // MatrixXd::Random returns uniform random numbers in (-1, 1).
-C.setRandom(rows,cols)              // C = rand(rows,cols)*2-1
-VectorXd::LinSpaced(size,low,high)   // linspace(low,high,size)'
-v.setLinSpaced(size,low,high)        // v = linspace(low,high,size)'
+// Eigen                                    // Matlab
+MatrixXd::Identity(rows,cols)               // eye(rows,cols)
+C.setIdentity(rows,cols)                    // C = eye(rows,cols)
+MatrixXd::Zero(rows,cols)                   // zeros(rows,cols)
+C.setZero(rows,cols)                        // C = zeros(rows,cols)
+MatrixXd::Ones(rows,cols)                   // ones(rows,cols)
+C.setOnes(rows,cols)                        // C = ones(rows,cols)
+MatrixXd::Random(rows,cols)                 // rand(rows,cols)*2-1            // MatrixXd::Random returns uniform random numbers in (-1, 1).
+C.setRandom(rows,cols)                      // C = rand(rows,cols)*2-1
+VectorXd::LinSpaced(size,low,high)          // linspace(low,high,size)'
+v.setLinSpaced(size,low,high)               // v = linspace(low,high,size)'
+VectorXi::LinSpaced(((hi-low)/step)+1,      // low:step:hi
+                    low,low+step*(size-1))  //
 
 
 // Matrix slicing and blocks. All expressions listed here are read/write.
@@ -82,17 +84,20 @@ P.bottomRightCorner<rows,cols>()   // P(end-rows+1:end, end-cols+1:end)
 
 // Of particular note is Eigen's swap function which is highly optimized.
 // Eigen                           // Matlab
-R.row(i) = P.col(j);               // R(i, :) = P(:, i)
+R.row(i) = P.col(j);               // R(i, :) = P(:, j)
 R.col(j1).swap(mat1.col(j2));      // R(:, [j1 j2]) = R(:, [j2, j1])
 
-// Views, transpose, etc; all read-write except for .adjoint().
+// Views, transpose, etc;
 // Eigen                           // Matlab
 R.adjoint()                        // R'
-R.transpose()                      // R.' or conj(R')
-R.diagonal()                       // diag(R)
+R.transpose()                      // R.' or conj(R')       // Read-write
+R.diagonal()                       // diag(R)               // Read-write
 x.asDiagonal()                     // diag(x)
-R.transpose().colwise().reverse(); // rot90(R)
-R.conjugate()                      // conj(R)
+R.transpose().colwise().reverse()  // rot90(R)              // Read-write
+R.rowwise().reverse()              // fliplr(R)
+R.colwise().reverse()              // flipud(R)
+R.replicate(i,j)                   // repmat(P,i,j)
+
 
 // All the same as Matlab, but matlab doesn't have *= style operators.
 // Matrix-vector.  Matrix-matrix.   Matrix-scalar.
@@ -104,37 +109,40 @@ a *= M;            R  = P + Q;      R  = P/s;
                    R -= Q;          R /= s;
 
 // Vectorized operations on each element independently
-// Eigen                  // Matlab
-R = P.cwiseProduct(Q);    // R = P .* Q
-R = P.array() * s.array();// R = P .* s
-R = P.cwiseQuotient(Q);   // R = P ./ Q
-R = P.array() / Q.array();// R = P ./ Q
-R = P.array() + s.array();// R = P + s
-R = P.array() - s.array();// R = P - s
-R.array() += s;           // R = R + s
-R.array() -= s;           // R = R - s
-R.array() < Q.array();    // R < Q
-R.array() <= Q.array();   // R <= Q
-R.cwiseInverse();         // 1 ./ P
-R.array().inverse();      // 1 ./ P
-R.array().sin()           // sin(P)
-R.array().cos()           // cos(P)
-R.array().pow(s)          // P .^ s
-R.array().square()        // P .^ 2
-R.array().cube()          // P .^ 3
-R.cwiseSqrt()             // sqrt(P)
-R.array().sqrt()          // sqrt(P)
-R.array().exp()           // exp(P)
-R.array().log()           // log(P)
-R.cwiseMax(P)             // max(R, P)
-R.array().max(P.array())  // max(R, P)
-R.cwiseMin(P)             // min(R, P)
-R.array().min(P.array())  // min(R, P)
-R.cwiseAbs()              // abs(P)
-R.array().abs()           // abs(P)
-R.cwiseAbs2()             // abs(P.^2)
-R.array().abs2()          // abs(P.^2)
-(R.array() < s).select(P,Q);  // (R < s ? P : Q)
+// Eigen                       // Matlab
+R = P.cwiseProduct(Q);         // R = P .* Q
+R = P.array() * s.array();     // R = P .* s
+R = P.cwiseQuotient(Q);        // R = P ./ Q
+R = P.array() / Q.array();     // R = P ./ Q
+R = P.array() + s.array();     // R = P + s
+R = P.array() - s.array();     // R = P - s
+R.array() += s;                // R = R + s
+R.array() -= s;                // R = R - s
+R.array() < Q.array();         // R < Q
+R.array() <= Q.array();        // R <= Q
+R.cwiseInverse();              // 1 ./ P
+R.array().inverse();           // 1 ./ P
+R.array().sin()                // sin(P)
+R.array().cos()                // cos(P)
+R.array().pow(s)               // P .^ s
+R.array().square()             // P .^ 2
+R.array().cube()               // P .^ 3
+R.cwiseSqrt()                  // sqrt(P)
+R.array().sqrt()               // sqrt(P)
+R.array().exp()                // exp(P)
+R.array().log()                // log(P)
+R.cwiseMax(P)                  // max(R, P)
+R.array().max(P.array())       // max(R, P)
+R.cwiseMin(P)                  // min(R, P)
+R.array().min(P.array())       // min(R, P)
+R.cwiseAbs()                   // abs(P)
+R.array().abs()                // abs(P)
+R.cwiseAbs2()                  // abs(P.^2)
+R.array().abs2()               // abs(P.^2)
+(R.array() < s).select(P,Q );  // (R < s ? P : Q)
+R = (Q.array()==0).select(P,A) // R(Q==0) = P(Q==0)
+R = P.unaryExpr(ptr_fun(func)) // R = arrayfun(func, P)   // with: scalar func(const scalar &x);
+
 
 // Reductions.
 int r, c;
@@ -165,12 +173,12 @@ x.dot(y)                  // dot(x, y)
 x.cross(y)                // cross(x, y) Requires #include <Eigen/Geometry>
 
 //// Type conversion
-// Eigen                           // Matlab
-A.cast<double>();                  // double(A)
-A.cast<float>();                   // single(A)
-A.cast<int>();                     // int32(A)
-A.real();                          // real(A)
-A.imag();                          // imag(A)
+// Eigen                  // Matlab
+A.cast<double>();         // double(A)
+A.cast<float>();          // single(A)
+A.cast<int>();            // int32(A)
+A.real();                 // real(A)
+A.imag();                 // imag(A)
 // if the original type equals destination type, no work is done
 
 // Note that for most operations Eigen requires all operands to have the same type:
diff --git a/doc/B01_Experimental.dox b/doc/B01_Experimental.dox
index 5fc0ccd60..e1f031db8 100644
--- a/doc/B01_Experimental.dox
+++ b/doc/B01_Experimental.dox
@@ -4,7 +4,7 @@ namespace Eigen {
 
 \eigenAutoToc
 
-\section summary Summary
+\section Experimental_summary Summary
 
 With the 2.0 release, Eigen's API is, to a large extent, stable. However, we wish to retain the freedom to make API incompatible changes. To that effect, we call many parts of Eigen "experimental" which means that they are not subject to API stability guarantee.
 
@@ -17,7 +17,7 @@ Experimental features may at any time:
 \li be subject to an API incompatible change;
 \li introduce API or ABI incompatible changes in your own code if you let them affect your API or ABI.
 
-\section modules Experimental modules
+\section Experimental_modules Experimental modules
 
 The following modules are considered entirely experimental, and we make no firm API stability guarantee about them for the time being:
 \li SVD
@@ -26,7 +26,7 @@ The following modules are considered entirely experimental, and we make no firm
 \li Sparse
 \li Geometry (this one should be mostly stable, but it's a little too early to make a formal guarantee)
 
-\section core Experimental parts of the Core module
+\section Experimental_core Experimental parts of the Core module
 
 In the Core module, the only classes subject to ABI stability guarantee (meaning that you can use it for data members in your public ABI) is:
 \li Matrix
diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
index 2fc2a0dfc..db413bc65 100644
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -10,12 +10,20 @@ if(CMAKE_COMPILER_IS_GNUCXX)
   endif(CMAKE_SYSTEM_NAME MATCHES Linux)
 endif(CMAKE_COMPILER_IS_GNUCXX)
 
+option(EIGEN_INTERNAL_DOCUMENTATION "Build internal documentation" OFF)
+
+
 # Set some Doxygen flags
 set(EIGEN_DOXY_PROJECT_NAME             "Eigen")
 set(EIGEN_DOXY_OUTPUT_DIRECTORY_SUFFIX  "")
 set(EIGEN_DOXY_INPUT                    "\"${Eigen_SOURCE_DIR}/Eigen\" \"${Eigen_SOURCE_DIR}/doc\"")
 set(EIGEN_DOXY_HTML_COLORSTYLE_HUE      "220")
 set(EIGEN_DOXY_TAGFILES                 "")
+if(EIGEN_INTERNAL_DOCUMENTATION)
+  set(EIGEN_DOXY_INTERNAL                 "YES")
+else(EIGEN_INTERNAL_DOCUMENTATION)
+  set(EIGEN_DOXY_INTERNAL                 "NO")
+endif(EIGEN_INTERNAL_DOCUMENTATION)
 
 configure_file(
   ${CMAKE_CURRENT_SOURCE_DIR}/Doxyfile.in
@@ -70,6 +78,8 @@ add_custom_target(
   COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/html/
   COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/eigen_navtree_hacks.js           ${CMAKE_CURRENT_BINARY_DIR}/html/
   COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/Eigen_Silly_Professor_64x64.png  ${CMAKE_CURRENT_BINARY_DIR}/html/
+  COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/ftv2pnode.png                    ${CMAKE_CURRENT_BINARY_DIR}/html/
+  COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/ftv2node.png                     ${CMAKE_CURRENT_BINARY_DIR}/html/
   COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/AsciiQuickReference.txt          ${CMAKE_CURRENT_BINARY_DIR}/html/
   WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
 )
@@ -80,6 +90,8 @@ add_custom_target(
   COMMAND ${CMAKE_COMMAND} -E make_directory ${Eigen_BINARY_DIR}/doc/html/unsupported
   COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/eigen_navtree_hacks.js           ${CMAKE_CURRENT_BINARY_DIR}/html/unsupported/
   COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/Eigen_Silly_Professor_64x64.png  ${CMAKE_CURRENT_BINARY_DIR}/html/unsupported/
+  COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/ftv2pnode.png                    ${CMAKE_CURRENT_BINARY_DIR}/html/unsupported/
+  COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/ftv2node.png                     ${CMAKE_CURRENT_BINARY_DIR}/html/unsupported/
   WORKING_DIRECTORY ${Eigen_BINARY_DIR}/doc
 )
 
@@ -89,6 +101,7 @@ add_dependencies(doc-unsupported-prerequisites unsupported_snippets unsupported_
 add_custom_target(doc ALL
   COMMAND doxygen
   COMMAND doxygen Doxyfile-unsupported
+  COMMAND ${CMAKE_COMMAND} -E copy ${Eigen_BINARY_DIR}/doc/html/group__TopicUnalignedArrayAssert.html ${Eigen_BINARY_DIR}/doc/html/TopicUnalignedArrayAssert.html
   COMMAND ${CMAKE_COMMAND} -E rename html eigen-doc
   COMMAND ${CMAKE_COMMAND} -E remove eigen-doc/eigen-doc.tgz
   COMMAND ${CMAKE_COMMAND} -E tar cfz eigen-doc.tgz eigen-doc
diff --git a/doc/CoeffwiseMathFunctionsTable.dox b/doc/CoeffwiseMathFunctionsTable.dox
new file mode 100644
index 000000000..3ae9420dc
--- /dev/null
+++ b/doc/CoeffwiseMathFunctionsTable.dox
@@ -0,0 +1,525 @@
+namespace Eigen {
+
+/** \eigenManualPage CoeffwiseMathFunctions Catalog of coefficient-wise math functions
+
+
+<!-- <span style="font-size:300%; color:red; font-weight: 900;">!WORK IN PROGRESS!</span> -->
+
+This table presents a catalog of the coefficient-wise math functions supported by %Eigen.
+In this table, \c a, \c b, refer to Array objects or expressions, and \c m refers to a linear algebra Matrix/Vector object. Standard scalar types are abbreviated as follows:
+  - \c int: \c i32
+  - \c float: \c f
+  - \c double: \c d
+  - \c std::complex<float>: \c cf
+  - \c std::complex<double>: \c cd
+
+For each row, the first column list the equivalent calls for arrays, and matrices when supported. Of course, all functions are available for matrices by first casting it as an array: \c m.array().
+
+The third column gives some hints in the underlying scalar implementation. In most cases, %Eigen does not implement itself the math function but relies on the STL for standard scalar types, or user-provided functions for custom scalar types.
+For instance, some simply calls the respective function of the STL while preserving <a href="http://en.cppreference.com/w/cpp/language/adl">argument-dependent lookup</a> for custom types.
+The following:
+\code
+using std::foo;
+foo(a[i]);
+\endcode
+means that the STL's function \c std::foo will be potentially called if it is compatible with the underlying scalar type. If not, then the user must ensure that an overload of the function foo is available for the given scalar type (usually defined in the same namespace as the given scalar type).
+This also means that, unless specified, if the function \c std::foo is available only in some recent c++ versions (e.g., c++11), then the respective %Eigen's function/method will be usable on standard types only if the compiler support the required c++ version.
+
+<table class="manual-hl">
+<tr>
+<th>API</th><th>Description</th><th>Default scalar implementation</th><th>SIMD</th>
+</tr>
+<tr><td colspan="4"></td></tr>
+<tr><th colspan="4">Basic operations</th></tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_abs
+  a.\link ArrayBase::abs abs\endlink(); \n
+  \link Eigen::abs abs\endlink(a); \n
+  m.\link MatrixBase::cwiseAbs cwiseAbs\endlink();
+  </td>
+  <td>absolute value (\f$ |a_i| \f$) </td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/fabs">std::abs</a>; \n
+  abs(a[i]);
+  </td>
+  <td>SSE2, AVX (i32,f,d)</td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_inverse
+  a.\link ArrayBase::inverse inverse\endlink(); \n
+  \link Eigen::inverse inverse\endlink(a); \n
+  m.\link MatrixBase::cwiseInverse cwiseInverse\endlink();
+  </td>
+  <td>inverse value (\f$ 1/a_i \f$) </td>
+  <td class="code">
+  1/a[i];
+  </td>
+  <td>All engines (f,d,fc,fd)</td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_conj
+  a.\link ArrayBase::conjugate conjugate\endlink(); \n
+  \link Eigen::conj conj\endlink(a); \n
+  m.\link MatrixBase::conjugate conjugate();
+  </td>
+  <td><a href="https://en.wikipedia.org/wiki/Complex_conjugate">complex conjugate</a> (\f$ \bar{a_i} \f$),\n
+  no-op for real </td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/complex/conj">std::conj</a>; \n
+  conj(a[i]);
+  </td>
+  <td>All engines (fc,fd)</td>
+</tr>
+<tr>
+<th colspan="4">Exponential functions</th>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_exp
+  a.\link ArrayBase::exp exp\endlink(); \n
+  \link Eigen::exp exp\endlink(a);
+  </td>
+  <td>\f$ e \f$ raised to the given power (\f$ e^{a_i} \f$) </td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/exp">std::exp</a>; \n
+  exp(a[i]);
+  </td>
+  <td>SSE2, AVX (f,d)</td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_log
+  a.\link ArrayBase::log log\endlink(); \n
+  \link Eigen::log log\endlink(a);
+  </td>
+  <td>natural (base \f$ e \f$) logarithm (\f$ \ln({a_i}) \f$)</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/log">std::log</a>; \n
+  log(a[i]);
+  </td>
+  <td>SSE2, AVX (f)</td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_log1p
+  a.\link ArrayBase::log1p log1p\endlink(); \n
+  \link Eigen::log1p log1p\endlink(a);
+  </td>
+  <td>natural (base \f$ e \f$) logarithm of 1 plus \n the given number (\f$ \ln({1+a_i}) \f$)</td>
+  <td>built-in generic implementation based on \c log,\n
+  plus \c using <a href="http://en.cppreference.com/w/cpp/numeric/math/log1p">\c std::log1p </a>; \cpp11</td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_log10
+  a.\link ArrayBase::log10 log10\endlink(); \n
+  \link Eigen::log10 log10\endlink(a);
+  </td>
+  <td>base 10 logarithm (\f$ \log_{10}({a_i}) \f$)</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/log10">std::log10</a>; \n
+  log10(a[i]);
+  </td>
+  <td></td>
+</tr>
+<tr>
+<th colspan="4">Power functions</th>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_pow
+  a.\link ArrayBase::pow pow\endlink(b); \n
+  \link Eigen::pow pow\endlink(a,b);
+  </td>
+  <td>raises a number to the given power (\f$ a_i ^ {b_i} \f$) \n \c a and \c b can be either an array or scalar.</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/pow">std::pow</a>; \n
+  pow(a[i],b[i]);\n
+  (plus builtin for integer types)</td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_sqrt
+  a.\link ArrayBase::sqrt sqrt\endlink(); \n
+  \link Eigen::sqrt sqrt\endlink(a);\n
+  m.\link MatrixBase::cwiseSqrt cwiseSqrt\endlink();
+  </td>
+  <td>computes square root (\f$ \sqrt a_i \f$)</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/sqrt">std::sqrt</a>; \n
+  sqrt(a[i]);</td>
+  <td>SSE2, AVX (f,d)</td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_rsqrt
+  a.\link ArrayBase::rsqrt rsqrt\endlink(); \n
+  \link Eigen::rsqrt rsqrt\endlink(a);
+  </td>
+  <td><a href="https://en.wikipedia.org/wiki/Fast_inverse_square_root">reciprocal square root</a> (\f$ 1/{\sqrt a_i} \f$)</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/sqrt">std::sqrt</a>; \n
+  1/sqrt(a[i]); \n
+  </td>
+  <td>SSE2, AVX, AltiVec, ZVector (f,d)\n
+  (approx + 1 Newton iteration)</td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_square
+  a.\link ArrayBase::square square\endlink(); \n
+  \link Eigen::square square\endlink(a);
+  </td>
+  <td>computes square power (\f$ a_i^2 \f$)</td>
+  <td class="code">
+  a[i]*a[i]</td>
+  <td>All (i32,f,d,cf,cd)</td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_cube
+  a.\link ArrayBase::cube cube\endlink(); \n
+  \link Eigen::cube cube\endlink(a);
+  </td>
+  <td>computes cubic power (\f$ a_i^3 \f$)</td>
+  <td class="code">
+  a[i]*a[i]*a[i]</td>
+  <td>All (i32,f,d,cf,cd)</td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_abs2
+  a.\link ArrayBase::abs2 abs2\endlink(); \n
+  \link Eigen::abs2 abs2\endlink(a);\n
+  m.\link MatrixBase::cwiseAbs2 cwiseAbs2\endlink();
+  </td>
+  <td>computes the squared absolute value (\f$ |a_i|^2 \f$)</td>
+  <td class="code">
+  real:    a[i]*a[i] \n
+  complex:  real(a[i])*real(a[i]) \n
+  &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; + imag(a[i])*imag(a[i])</td>
+  <td>All (i32,f,d)</td>
+</tr>
+<tr>
+<th colspan="4">Trigonometric functions</th>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_sin
+  a.\link ArrayBase::sin sin\endlink(); \n
+  \link Eigen::sin sin\endlink(a);
+  </td>
+  <td>computes sine</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/sin">std::sin</a>; \n
+  sin(a[i]);</td>
+  <td>SSE2, AVX (f)</td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_cos
+  a.\link ArrayBase::cos cos\endlink(); \n
+  \link Eigen::cos cos\endlink(a);
+  </td>
+  <td>computes cosine</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/cos">std::cos</a>; \n
+  cos(a[i]);</td>
+  <td>SSE2, AVX (f)</td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_tan
+  a.\link ArrayBase::tan tan\endlink(); \n
+  \link Eigen::tan tan\endlink(a);
+  </td>
+  <td>computes tangent</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/tan">std::tan</a>; \n
+  tan(a[i]);</td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_asin
+  a.\link ArrayBase::asin asin\endlink(); \n
+  \link Eigen::asin asin\endlink(a);
+  </td>
+  <td>computes arc sine (\f$ \sin^{-1} a_i \f$)</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/asin">std::asin</a>; \n
+  asin(a[i]);</td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_acos
+  a.\link ArrayBase::acos acos\endlink(); \n
+  \link Eigen::acos acos\endlink(a);
+  </td>
+  <td>computes arc cosine  (\f$ \cos^{-1} a_i \f$)</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/acos">std::acos</a>; \n
+  acos(a[i]);</td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_atan
+  a.\link ArrayBase::atan tan\endlink(); \n
+  \link Eigen::atan atan\endlink(a);
+  </td>
+  <td>computes arc tangent (\f$ \tan^{-1} a_i \f$)</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/atan">std::atan</a>; \n
+  atan(a[i]);</td>
+  <td></td>
+</tr>
+<tr>
+<th colspan="4">Hyperbolic functions</th>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_sinh
+  a.\link ArrayBase::sinh sinh\endlink(); \n
+  \link Eigen::sinh sinh\endlink(a);
+  </td>
+  <td>computes hyperbolic sine</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/sinh">std::sinh</a>; \n
+  sinh(a[i]);</td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_cosh
+  a.\link ArrayBase::cosh cohs\endlink(); \n
+  \link Eigen::cosh cosh\endlink(a);
+  </td>
+  <td>computes hyperbolic cosine</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/cosh">std::cosh</a>; \n
+  cosh(a[i]);</td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_tanh
+  a.\link ArrayBase::tanh tanh\endlink(); \n
+  \link Eigen::tanh tanh\endlink(a);
+  </td>
+  <td>computes hyperbolic tangent</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/tanh">std::tanh</a>; \n
+  tanh(a[i]);</td>
+  <td></td>
+</tr>
+<tr>
+<th colspan="4">Nearest integer floating point operations</th>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_ceil
+  a.\link ArrayBase::ceil ceil\endlink(); \n
+  \link Eigen::ceil ceil\endlink(a);
+  </td>
+  <td>nearest integer not less than the given value</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/ceil">std::ceil</a>; \n
+  ceil(a[i]);</td>
+  <td>SSE4,AVX,ZVector (f,d)</td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_floor
+  a.\link ArrayBase::floor floor\endlink(); \n
+  \link Eigen::floor floor\endlink(a);
+  </td>
+  <td>nearest integer not greater than the given value</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/floor">std::floor</a>; \n
+  floor(a[i]);</td>
+  <td>SSE4,AVX,ZVector (f,d)</td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_round
+  a.\link ArrayBase::round round\endlink(); \n
+  \link Eigen::round round\endlink(a);
+  </td>
+  <td>nearest integer, \n rounding away from zero in halfway cases</td>
+  <td>built-in generic implementation \n based on \c floor and \c ceil,\n
+  plus \c using <a href="http://en.cppreference.com/w/cpp/numeric/math/round">\c std::round </a>; \cpp11</td>
+  <td>SSE4,AVX,ZVector (f,d)</td>
+</tr>
+<tr>
+<th colspan="4">Floating point manipulation functions</th>
+</tr>
+<tr>
+<th colspan="4">Classification and comparison</th>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_isfinite
+  a.\link ArrayBase::isFinite isFinite\endlink(); \n
+  \link Eigen::isfinite isfinite\endlink(a);
+  </td>
+  <td>checks if the given number has finite value</td>
+  <td>built-in generic implementation,\n
+  plus \c using <a href="http://en.cppreference.com/w/cpp/numeric/math/isfinite">\c std::isfinite </a>; \cpp11</td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_isinf
+  a.\link ArrayBase::isInf isInf\endlink(); \n
+  \link Eigen::isinf isinf\endlink(a);
+  </td>
+  <td>checks if the given number is infinite</td>
+  <td>built-in generic implementation,\n
+  plus \c using <a href="http://en.cppreference.com/w/cpp/numeric/math/isinf">\c std::isinf </a>; \cpp11</td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_isnan
+  a.\link ArrayBase::isNaN isNaN\endlink(); \n
+  \link Eigen::isnan isnan\endlink(a);
+  </td>
+  <td>checks if the given number is not a number</td>
+  <td>built-in generic implementation,\n
+  plus \c using <a href="http://en.cppreference.com/w/cpp/numeric/math/isnan">\c std::isnan </a>; \cpp11</td>
+  <td></td>
+</tr>
+<tr>
+<th colspan="4">Error and gamma functions</th>
+</tr>
+<tr> <td colspan="4">  Require \c \#include \c <unsupported/Eigen/SpecialFunctions> </td></tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_erf
+  a.\link ArrayBase::erf erf\endlink(); \n
+  \link Eigen::erf erf\endlink(a);
+  </td>
+  <td>error function</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/erf">std::erf</a>; \cpp11 \n
+  erf(a[i]);
+  </td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_erfc
+  a.\link ArrayBase::erfc erfc\endlink(); \n
+  \link Eigen::erfc erfc\endlink(a);
+  </td>
+  <td>complementary error function</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/erfc">std::erfc</a>; \cpp11 \n
+  erfc(a[i]);
+  </td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_lgamma
+  a.\link ArrayBase::lgamma lgamma\endlink(); \n
+  \link Eigen::lgamma lgamma\endlink(a);
+  </td>
+  <td>natural logarithm of the gamma function</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/lgamma">std::lgamma</a>; \cpp11 \n
+  lgamma(a[i]);
+  </td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_digamma
+  a.\link ArrayBase::digamma digamma\endlink(); \n
+  \link Eigen::digamma digamma\endlink(a);
+  </td>
+  <td><a href="https://en.wikipedia.org/wiki/Digamma_function">logarithmic derivative of the gamma function</a></td>
+  <td>
+  built-in for float and double
+  </td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_igamma
+  \link Eigen::igamma igamma\endlink(a,x);
+  </td>
+  <td><a href="https://en.wikipedia.org/wiki/Incomplete_gamma_function">lower incomplete gamma integral</a>
+  \n \f$ \gamma(a_i,x_i)= \frac{1}{|a_i|} \int_{0}^{x_i}e^{\text{-}t} t^{a_i-1} \mathrm{d} t \f$</td>
+  <td>
+  built-in for float and double,\n but requires \cpp11
+  </td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_igammac
+  \link Eigen::igammac igammac\endlink(a,x);
+  </td>
+  <td><a href="https://en.wikipedia.org/wiki/Incomplete_gamma_function">upper incomplete gamma integral</a>
+  \n \f$ \Gamma(a_i,x_i) = \frac{1}{|a_i|} \int_{x_i}^{\infty}e^{\text{-}t} t^{a_i-1} \mathrm{d} t \f$</td>
+  <td>
+  built-in for float and double,\n but requires \cpp11
+  </td>
+  <td></td>
+</tr>
+<tr>
+<th colspan="4">Special functions</th>
+</tr>
+<tr> <td colspan="4">  Require \c \#include \c <unsupported/Eigen/SpecialFunctions> </td></tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_polygamma
+  \link Eigen::polygamma polygamma\endlink(n,x);
+  </td>
+  <td><a href="https://en.wikipedia.org/wiki/Polygamma_function">n-th derivative of digamma at x</a></td>
+  <td>
+  built-in generic based on\n <a href="#cwisetable_lgamma">\c lgamma </a>,
+  <a href="#cwisetable_digamma"> \c digamma </a>
+  and <a href="#cwisetable_zeta">\c zeta </a>.
+  </td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_betainc
+  \link Eigen::betainc betainc\endlink(a,b,x);
+  </td>
+  <td><a href="https://en.wikipedia.org/wiki/Beta_function#Incomplete_beta_function">Incomplete beta function</a></td>
+  <td>
+  built-in for float and double,\n but requires \cpp11
+  </td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_zeta
+  \link Eigen::zeta zeta\endlink(a,b);
+  </td>
+  <td><a href="https://en.wikipedia.org/wiki/Hurwitz_zeta_function">Hurwitz zeta function</a>
+  \n \f$ \zeta(a_i,b_i)=\sum_{k=0}^{\infty}(b_i+k)^{\text{-}a_i} \f$</td>
+  <td>
+  built-in for float and double
+  </td>
+  <td></td>
+</tr>
+<tr><td colspan="4"></td></tr>
+</table>
+
+\n
+
+*/
+
+}
diff --git a/doc/CustomizingEigen.dox b/doc/CustomizingEigen.dox
deleted file mode 100644
index 5a0890ea9..000000000
--- a/doc/CustomizingEigen.dox
+++ /dev/null
@@ -1,188 +0,0 @@
-namespace Eigen {
-
-/** \page TopicCustomizingEigen Customizing/Extending Eigen
-
-Eigen can be extended in several ways, for instance, by defining global methods, \ref ExtendingMatrixBase "by adding custom methods to MatrixBase", adding support to \ref CustomScalarType "custom types" etc.
-
-\eigenAutoToc
-
-\section ExtendingMatrixBase Extending MatrixBase (and other classes)
-
-In this section we will see how to add custom methods to MatrixBase. Since all expressions and matrix types inherit MatrixBase, adding a method to MatrixBase make it immediately available to all expressions ! A typical use case is, for instance, to make Eigen compatible with another API.
-
-You certainly know that in C++ it is not possible to add methods to an existing class. So how that's possible ? Here the trick is to include in the declaration of MatrixBase a file defined by the preprocessor token \c EIGEN_MATRIXBASE_PLUGIN:
-\code
-class MatrixBase {
-  // ...
-  #ifdef EIGEN_MATRIXBASE_PLUGIN
-  #include EIGEN_MATRIXBASE_PLUGIN
-  #endif
-};
-\endcode
-Therefore to extend MatrixBase with your own methods you just have to create a file with your method declaration and define EIGEN_MATRIXBASE_PLUGIN before you include any Eigen's header file.
-
-You can extend many of the other classes used in Eigen by defining similarly named preprocessor symbols. For instance, define \c EIGEN_ARRAYBASE_PLUGIN if you want to extend the ArrayBase class. A full list of classes that can be extended in this way and the corresponding preprocessor symbols can be found on our page \ref TopicPreprocessorDirectives.
-
-Here is an example of an extension file for adding methods to MatrixBase: \n
-\b MatrixBaseAddons.h
-\code
-inline Scalar at(uint i, uint j) const { return this->operator()(i,j); }
-inline Scalar& at(uint i, uint j) { return this->operator()(i,j); }
-inline Scalar at(uint i) const { return this->operator[](i); }
-inline Scalar& at(uint i) { return this->operator[](i); }
-
-inline RealScalar squaredLength() const { return squaredNorm(); }
-inline RealScalar length() const { return norm(); }
-inline RealScalar invLength(void) const { return fast_inv_sqrt(squaredNorm()); }
-
-template<typename OtherDerived>
-inline Scalar squaredDistanceTo(const MatrixBase<OtherDerived>& other) const
-{ return (derived() - other.derived()).squaredNorm(); }
-
-template<typename OtherDerived>
-inline RealScalar distanceTo(const MatrixBase<OtherDerived>& other) const
-{ return internal::sqrt(derived().squaredDistanceTo(other)); }
-
-inline void scaleTo(RealScalar l) { RealScalar vl = norm(); if (vl>1e-9) derived() *= (l/vl); }
-
-inline Transpose<Derived> transposed() {return this->transpose();}
-inline const Transpose<Derived> transposed() const {return this->transpose();}
-
-inline uint minComponentId(void) const  { int i; this->minCoeff(&i); return i; }
-inline uint maxComponentId(void) const  { int i; this->maxCoeff(&i); return i; }
-
-template<typename OtherDerived>
-void makeFloor(const MatrixBase<OtherDerived>& other) { derived() = derived().cwiseMin(other.derived()); }
-template<typename OtherDerived>
-void makeCeil(const MatrixBase<OtherDerived>& other) { derived() = derived().cwiseMax(other.derived()); }
-
-const CwiseUnaryOp<internal::scalar_add_op<Scalar>, Derived>
-operator+(const Scalar& scalar) const
-{ return CwiseUnaryOp<internal::scalar_add_op<Scalar>, Derived>(derived(), internal::scalar_add_op<Scalar>(scalar)); }
-
-friend const CwiseUnaryOp<internal::scalar_add_op<Scalar>, Derived>
-operator+(const Scalar& scalar, const MatrixBase<Derived>& mat)
-{ return CwiseUnaryOp<internal::scalar_add_op<Scalar>, Derived>(mat.derived(), internal::scalar_add_op<Scalar>(scalar)); }
-\endcode
-
-Then one can the following declaration in the config.h or whatever prerequisites header file of his project:
-\code
-#define EIGEN_MATRIXBASE_PLUGIN "MatrixBaseAddons.h"
-\endcode
-
-\section InheritingFromMatrix Inheriting from Matrix
-
-Before inheriting from Matrix, be really, i mean REALLY sure that using
-EIGEN_MATRIX_PLUGIN is not what you really want (see previous section).
-If you just need to add few members to Matrix, this is the way to go.
-
-An example of when you actually need to inherit Matrix, is when you have
-several layers of heritage such as  MyVerySpecificVector1,MyVerySpecificVector1 -> MyVector1 -> Matrix and.
-MyVerySpecificVector3,MyVerySpecificVector4 -> MyVector2 -> Matrix.
-
-In order for your object to work within the %Eigen framework, you need to
-define a few members in your inherited class.
-
-Here is a minimalistic example:\n
-\code
-class MyVectorType : public  Eigen::VectorXd
-{
-public:
-    MyVectorType(void):Eigen::VectorXd() {}
-
-    typedef Eigen::VectorXd Base;
-
-    // This constructor allows you to construct MyVectorType from Eigen expressions
-    template<typename OtherDerived>
-    MyVectorType(const Eigen::MatrixBase<OtherDerived>& other)
-        : Eigen::Vector3d(other)
-    { }
-
-    // This method allows you to assign Eigen expressions to MyVectorType
-    template<typename OtherDerived>
-    MyVectorType & operator= (const Eigen::MatrixBase <OtherDerived>& other)
-    {
-        this->Base::operator=(other);
-        return *this;
-    }
-};
-\endcode
-
-This is the kind of error you can get if you don't provide those methods
-\code
-error: no match for ‘operator=’ in ‘delta =
-(((Eigen::MatrixBase<Eigen::Matrix<std::complex<float>, 10000, 1, 2, 10000,
-1> >*)(& delta)) + 8u)->Eigen::MatrixBase<Derived>::cwise [with Derived =
-Eigen::Matrix<std::complex<float>, 10000, 1, 2, 10000,
-1>]().Eigen::Cwise<ExpressionType>::operator* [with OtherDerived =
-Eigen::Matrix<std::complex<float>, 10000, 1, 2, 10000, 1>, ExpressionType =
-Eigen::Matrix<std::complex<float>, 10000, 1, 2, 10000, 1>](((const
-Eigen::MatrixBase<Eigen::Matrix<std::complex<float>, 10000, 1, 2, 10000, 1>
->&)(((const Eigen::MatrixBase<Eigen::Matrix<std::complex<float>, 10000, 1,
->2, 10000, 1> >*)((const spectral1d*)where)) + 8u)))’                                                    
-\endcode
-
-\anchor user_defined_scalars \section CustomScalarType Using custom scalar types
-
-By default, Eigen currently supports standard floating-point types (\c float, \c double, \c std::complex<float>, \c std::complex<double>, \c long \c double), as well as all native integer types (e.g., \c int, \c unsigned \c int, \c short, etc.), and \c bool.
-On x86-64 systems, \c long \c double permits to locally enforces the use of x87 registers with extended accuracy (in comparison to SSE).
-
-In order to add support for a custom type \c T you need:
--# make sure the common operator (+,-,*,/,etc.) are supported by the type \c T
--# add a specialization of struct Eigen::NumTraits<T> (see \ref NumTraits)
--# define the math functions that makes sense for your type. This includes standard ones like sqrt, pow, sin, tan, conj, real, imag, etc, as well as abs2 which is Eigen specific.
-     (see the file Eigen/src/Core/MathFunctions.h)
-
-The math function should be defined in the same namespace than \c T, or in the \c std namespace though that second approach is not recommended.
-
-Here is a concrete example adding support for the Adolc's \c adouble type. <a href="https://projects.coin-or.org/ADOL-C">Adolc</a> is an automatic differentiation library. The type \c adouble is basically a real value tracking the values of any number of partial derivatives.
-
-\code
-#ifndef ADOLCSUPPORT_H
-#define ADOLCSUPPORT_H
-
-#define ADOLC_TAPELESS
-#include <adolc/adouble.h>
-#include <Eigen/Core>
-
-namespace Eigen {
-
-template<> struct NumTraits<adtl::adouble>
- : NumTraits<double> // permits to get the epsilon, dummy_precision, lowest, highest functions
-{
-  typedef adtl::adouble Real;
-  typedef adtl::adouble NonInteger;
-  typedef adtl::adouble Nested;
-
-  enum {
-    IsComplex = 0,
-    IsInteger = 0,
-    IsSigned = 1,
-    RequireInitialization = 1,
-    ReadCost = 1,
-    AddCost = 3,
-    MulCost = 3
-  };
-};
-
-}
-
-namespace adtl {
-
-inline const adouble& conj(const adouble& x)  { return x; }
-inline const adouble& real(const adouble& x)  { return x; }
-inline adouble imag(const adouble&)    { return 0.; }
-inline adouble abs(const adouble&  x)  { return fabs(x); }
-inline adouble abs2(const adouble& x)  { return x*x; }
-
-}
-
-#endif // ADOLCSUPPORT_H
-\endcode
-
-
-\sa \ref TopicPreprocessorDirectives
-
-*/
-
-}
diff --git a/doc/CustomizingEigen_CustomScalar.dox b/doc/CustomizingEigen_CustomScalar.dox
new file mode 100644
index 000000000..1ee78cbe5
--- /dev/null
+++ b/doc/CustomizingEigen_CustomScalar.dox
@@ -0,0 +1,120 @@
+namespace Eigen {
+
+/** \page TopicCustomizing_CustomScalar Using custom scalar types
+\anchor user_defined_scalars
+
+By default, Eigen currently supports standard floating-point types (\c float, \c double, \c std::complex<float>, \c std::complex<double>, \c long \c double), as well as all native integer types (e.g., \c int, \c unsigned \c int, \c short, etc.), and \c bool.
+On x86-64 systems, \c long \c double permits to locally enforces the use of x87 registers with extended accuracy (in comparison to SSE).
+
+In order to add support for a custom type \c T you need:
+-# make sure the common operator (+,-,*,/,etc.) are supported by the type \c T
+-# add a specialization of struct Eigen::NumTraits<T> (see \ref NumTraits)
+-# define the math functions that makes sense for your type. This includes standard ones like sqrt, pow, sin, tan, conj, real, imag, etc, as well as abs2 which is Eigen specific.
+     (see the file Eigen/src/Core/MathFunctions.h)
+
+The math function should be defined in the same namespace than \c T, or in the \c std namespace though that second approach is not recommended.
+
+Here is a concrete example adding support for the Adolc's \c adouble type. <a href="https://projects.coin-or.org/ADOL-C">Adolc</a> is an automatic differentiation library. The type \c adouble is basically a real value tracking the values of any number of partial derivatives.
+
+\code
+#ifndef ADOLCSUPPORT_H
+#define ADOLCSUPPORT_H
+
+#define ADOLC_TAPELESS
+#include <adolc/adouble.h>
+#include <Eigen/Core>
+
+namespace Eigen {
+
+template<> struct NumTraits<adtl::adouble>
+ : NumTraits<double> // permits to get the epsilon, dummy_precision, lowest, highest functions
+{
+  typedef adtl::adouble Real;
+  typedef adtl::adouble NonInteger;
+  typedef adtl::adouble Nested;
+
+  enum {
+    IsComplex = 0,
+    IsInteger = 0,
+    IsSigned = 1,
+    RequireInitialization = 1,
+    ReadCost = 1,
+    AddCost = 3,
+    MulCost = 3
+  };
+};
+
+}
+
+namespace adtl {
+
+inline const adouble& conj(const adouble& x)  { return x; }
+inline const adouble& real(const adouble& x)  { return x; }
+inline adouble imag(const adouble&)    { return 0.; }
+inline adouble abs(const adouble&  x)  { return fabs(x); }
+inline adouble abs2(const adouble& x)  { return x*x; }
+
+}
+
+#endif // ADOLCSUPPORT_H
+\endcode
+
+This other example adds support for the \c mpq_class type from <a href="https://gmplib.org/">GMP</a>. It shows in particular how to change the way Eigen picks the best pivot during LU factorization. It selects the coefficient with the highest score, where the score is by default the absolute value of a number, but we can define a different score, for instance to prefer pivots with a more compact representation (this is an example, not a recommendation). Note that the scores should always be non-negative and only zero is allowed to have a score of zero. Also, this can interact badly with thresholds for inexact scalar types.
+
+\code
+#include <gmpxx.h>
+#include <Eigen/Core>
+#include <boost/operators.hpp>
+
+namespace Eigen {
+  template<> struct NumTraits<mpq_class> : GenericNumTraits<mpq_class>
+  {
+    typedef mpq_class Real;
+    typedef mpq_class NonInteger;
+    typedef mpq_class Nested;
+
+    static inline Real epsilon() { return 0; }
+    static inline Real dummy_precision() { return 0; }
+    static inline Real digits10() { return 0; }
+
+    enum {
+      IsInteger = 0,
+      IsSigned = 1,
+      IsComplex = 0,
+      RequireInitialization = 1,
+      ReadCost = 6,
+      AddCost = 150,
+      MulCost = 100
+    };
+  };
+
+  namespace internal {
+
+    template<> struct scalar_score_coeff_op<mpq_class> {
+      struct result_type : boost::totally_ordered1<result_type> {
+        std::size_t len;
+        result_type(int i = 0) : len(i) {} // Eigen uses Score(0) and Score()
+        result_type(mpq_class const& q) :
+          len(mpz_size(q.get_num_mpz_t())+
+              mpz_size(q.get_den_mpz_t())-1) {}
+        friend bool operator<(result_type x, result_type y) {
+          // 0 is the worst possible pivot
+          if (x.len == 0) return y.len > 0;
+          if (y.len == 0) return false;
+          // Prefer a pivot with a small representation
+          return x.len > y.len;
+        }
+        friend bool operator==(result_type x, result_type y) {
+          // Only used to test if the score is 0
+          return x.len == y.len;
+        }
+      };
+      result_type operator()(mpq_class const& x) const { return x; }
+    };
+  }
+}
+\endcode
+
+*/
+
+}
diff --git a/doc/CustomizingEigen_InheritingMatrix.dox b/doc/CustomizingEigen_InheritingMatrix.dox
new file mode 100644
index 000000000..b21e55433
--- /dev/null
+++ b/doc/CustomizingEigen_InheritingMatrix.dox
@@ -0,0 +1,34 @@
+namespace Eigen {
+
+/** \page TopicCustomizing_InheritingMatrix Inheriting from Matrix
+
+Before inheriting from Matrix, be really, I mean REALLY, sure that using
+EIGEN_MATRIX_PLUGIN is not what you really want (see previous section).
+If you just need to add few members to Matrix, this is the way to go.
+
+An example of when you actually need to inherit Matrix, is when you
+have several layers of heritage such as 
+MyVerySpecificVector1, MyVerySpecificVector2 -> MyVector1 -> Matrix and
+MyVerySpecificVector3, MyVerySpecificVector4 -> MyVector2 -> Matrix.
+
+In order for your object to work within the %Eigen framework, you need to
+define a few members in your inherited class.
+
+Here is a minimalistic example:
+
+\include CustomizingEigen_Inheritance.cpp
+
+Output: \verbinclude CustomizingEigen_Inheritance.out
+
+This is the kind of error you can get if you don't provide those methods
+\verbatim
+error: no match for ‘operator=’ in ‘v = Eigen::operator*(
+const Eigen::MatrixBase<Eigen::Matrix<double, -0x000000001, 1, 0, -0x000000001, 1> >::Scalar&, 
+const Eigen::MatrixBase<Eigen::Matrix<double, -0x000000001, 1> >::StorageBaseType&)
+(((const Eigen::MatrixBase<Eigen::Matrix<double, -0x000000001, 1> >::StorageBaseType&)
+((const Eigen::MatrixBase<Eigen::Matrix<double, -0x000000001, 1> >::StorageBaseType*)(& v))))’
+\endverbatim
+
+*/
+
+}
diff --git a/doc/CustomizingEigen_NullaryExpr.dox b/doc/CustomizingEigen_NullaryExpr.dox
new file mode 100644
index 000000000..37c8dcd89
--- /dev/null
+++ b/doc/CustomizingEigen_NullaryExpr.dox
@@ -0,0 +1,86 @@
+namespace Eigen {
+
+/** \page TopicCustomizing_NullaryExpr Matrix manipulation via nullary-expressions
+
+
+The main purpose of the class CwiseNullaryOp is to define \em procedural matrices such as constant or random matrices as returned by the Ones(), Zero(), Constant(), Identity() and Random() methods.
+Nevertheless, with some imagination it is possible to accomplish very sophisticated matrix manipulation with minimal efforts such that \ref TopicNewExpressionType "implementing new expression" is rarely needed.
+
+\section NullaryExpr_Circulant Example 1: circulant matrix
+
+To explore these possibilities let us start with the  \em circulant example of the \ref TopicNewExpressionType "implementing new expression" topic.
+Let us recall that a circulant matrix is a matrix where each column is the same as the
+column to the left, except that it is cyclically shifted downwards.
+For example, here is a 4-by-4 circulant matrix:
+\f[ \begin{bmatrix}
+    1 & 8 & 4 & 2 \\
+    2 & 1 & 8 & 4 \\
+    4 & 2 & 1 & 8 \\
+    8 & 4 & 2 & 1
+\end{bmatrix} \f]
+A circulant matrix is uniquely determined by its first column. We wish
+to write a function \c makeCirculant which, given the first column,
+returns an expression representing the circulant matrix.
+
+For this exercise, the return type of \c makeCirculant will be a CwiseNullaryOp that we need to instantiate with:
+1 - a proper \c circulant_functor storing the input vector and implementing the adequate coefficient accessor \c operator(i,j)
+2 - a template instantiation of class Matrix conveying compile-time information such as the scalar type, sizes, and preferred storage layout.
+
+Calling \c ArgType the type of the input vector, we can construct the equivalent squared Matrix type as follows:
+
+\snippet make_circulant2.cpp square
+
+This little helper structure will help us to implement our \c makeCirculant function as follows:
+
+\snippet make_circulant2.cpp makeCirculant
+
+As usual, our function takes as argument a \c MatrixBase (see this \ref TopicFunctionTakingEigenTypes "page" for more details).
+Then, the CwiseNullaryOp object is constructed through the DenseBase::NullaryExpr static method with the adequate runtime sizes.
+
+Then, we need to implement our \c circulant_functor, which is a straightforward exercise:
+
+\snippet make_circulant2.cpp circulant_func
+
+We are now all set to try our new feature:
+
+\snippet make_circulant2.cpp main
+
+
+If all the fragments are combined, the following output is produced,
+showing that the program works as expected:
+
+\include make_circulant2.out
+
+This implementation of \c makeCirculant is much simpler than \ref TopicNewExpressionType "defining a new expression" from scratch.
+
+
+\section NullaryExpr_Indexing Example 2: indexing rows and columns
+
+The goal here is to mimic MatLab's ability to index a matrix through two vectors of indices referencing the rows and columns to be picked respectively, like this:
+
+\snippet nullary_indexing.out main1
+
+To this end, let us first write a nullary-functor storing references to the input matrix and to the two arrays of indices, and implementing the required \c operator()(i,j):
+
+\snippet nullary_indexing.cpp functor
+
+Then, let's create an \c indexing(A,rows,cols) function creating the nullary expression:
+
+\snippet nullary_indexing.cpp function
+
+Finally, here is an example of how this function can be used:
+
+\snippet nullary_indexing.cpp main1
+
+This straightforward implementation is already quite powerful as the row or column index arrays can also be expressions to perform offsetting, modulo, striding, reverse, etc.
+
+\snippet nullary_indexing.cpp main2
+
+and the output is:
+
+\snippet nullary_indexing.out main2
+
+*/
+
+}
+
diff --git a/doc/CustomizingEigen_Plugins.dox b/doc/CustomizingEigen_Plugins.dox
new file mode 100644
index 000000000..d88f2409b
--- /dev/null
+++ b/doc/CustomizingEigen_Plugins.dox
@@ -0,0 +1,69 @@
+namespace Eigen {
+
+/** \page TopicCustomizing_Plugins Extending MatrixBase (and other classes)
+
+In this section we will see how to add custom methods to MatrixBase. Since all expressions and matrix types inherit MatrixBase, adding a method to MatrixBase make it immediately available to all expressions ! A typical use case is, for instance, to make Eigen compatible with another API.
+
+You certainly know that in C++ it is not possible to add methods to an existing class. So how that's possible ? Here the trick is to include in the declaration of MatrixBase a file defined by the preprocessor token \c EIGEN_MATRIXBASE_PLUGIN:
+\code
+class MatrixBase {
+  // ...
+  #ifdef EIGEN_MATRIXBASE_PLUGIN
+  #include EIGEN_MATRIXBASE_PLUGIN
+  #endif
+};
+\endcode
+Therefore to extend MatrixBase with your own methods you just have to create a file with your method declaration and define EIGEN_MATRIXBASE_PLUGIN before you include any Eigen's header file.
+
+You can extend many of the other classes used in Eigen by defining similarly named preprocessor symbols. For instance, define \c EIGEN_ARRAYBASE_PLUGIN if you want to extend the ArrayBase class. A full list of classes that can be extended in this way and the corresponding preprocessor symbols can be found on our page \ref TopicPreprocessorDirectives.
+
+Here is an example of an extension file for adding methods to MatrixBase: \n
+\b MatrixBaseAddons.h
+\code
+inline Scalar at(uint i, uint j) const { return this->operator()(i,j); }
+inline Scalar& at(uint i, uint j) { return this->operator()(i,j); }
+inline Scalar at(uint i) const { return this->operator[](i); }
+inline Scalar& at(uint i) { return this->operator[](i); }
+
+inline RealScalar squaredLength() const { return squaredNorm(); }
+inline RealScalar length() const { return norm(); }
+inline RealScalar invLength(void) const { return fast_inv_sqrt(squaredNorm()); }
+
+template<typename OtherDerived>
+inline Scalar squaredDistanceTo(const MatrixBase<OtherDerived>& other) const
+{ return (derived() - other.derived()).squaredNorm(); }
+
+template<typename OtherDerived>
+inline RealScalar distanceTo(const MatrixBase<OtherDerived>& other) const
+{ return internal::sqrt(derived().squaredDistanceTo(other)); }
+
+inline void scaleTo(RealScalar l) { RealScalar vl = norm(); if (vl>1e-9) derived() *= (l/vl); }
+
+inline Transpose<Derived> transposed() {return this->transpose();}
+inline const Transpose<Derived> transposed() const {return this->transpose();}
+
+inline uint minComponentId(void) const  { int i; this->minCoeff(&i); return i; }
+inline uint maxComponentId(void) const  { int i; this->maxCoeff(&i); return i; }
+
+template<typename OtherDerived>
+void makeFloor(const MatrixBase<OtherDerived>& other) { derived() = derived().cwiseMin(other.derived()); }
+template<typename OtherDerived>
+void makeCeil(const MatrixBase<OtherDerived>& other) { derived() = derived().cwiseMax(other.derived()); }
+
+const CwiseBinaryOp<internal::scalar_sum_op<Scalar>, const Derived, const ConstantReturnType>
+operator+(const Scalar& scalar) const
+{ return CwiseBinaryOp<internal::scalar_sum_op<Scalar>, const Derived, const ConstantReturnType>(derived(), Constant(rows(),cols(),scalar)); }
+
+friend const CwiseBinaryOp<internal::scalar_sum_op<Scalar>, const ConstantReturnType, Derived>
+operator+(const Scalar& scalar, const MatrixBase<Derived>& mat)
+{ return CwiseBinaryOp<internal::scalar_sum_op<Scalar>, const ConstantReturnType, Derived>(Constant(rows(),cols(),scalar), mat.derived()); }
+\endcode
+
+Then one can the following declaration in the config.h or whatever prerequisites header file of his project:
+\code
+#define EIGEN_MATRIXBASE_PLUGIN "MatrixBaseAddons.h"
+\endcode
+
+*/
+
+}
diff --git a/doc/DenseDecompositionBenchmark.dox b/doc/DenseDecompositionBenchmark.dox
new file mode 100644
index 000000000..7be9c70cd
--- /dev/null
+++ b/doc/DenseDecompositionBenchmark.dox
@@ -0,0 +1,42 @@
+namespace Eigen {
+
+/** \eigenManualPage DenseDecompositionBenchmark Benchmark of dense decompositions
+
+This page presents a speed comparison of the dense matrix decompositions offered by %Eigen for a wide range of square matrices and overconstrained problems.
+
+For a more general overview on the features and numerical robustness of linear solvers and decompositions, check this \link TopicLinearAlgebraDecompositions table \endlink.
+
+This benchmark has been run on a laptop equipped with an Intel core i7 \@ 2,6 GHz, and compiled with clang with \b AVX and \b FMA instruction sets enabled but without multi-threading.
+It uses \b single \b precision \b float numbers. For double, you can get a good estimate by multiplying the timings by a factor 2.
+
+The square matrices are symmetric, and for the overconstrained matrices, the reported timmings include the cost to compute the symmetric covariance matrix \f$ A^T A \f$ for the first four solvers based on Cholesky and LU, as denoted by the \b * symbol (top-right corner part of the table).
+Timings are in \b milliseconds, and factors are relative to the LLT decomposition which is the fastest but also the least general and robust.
+
+<table class="manual">
+<tr><th>solver/size</th>
+  <th>8x8</th>  <th>100x100</th>  <th>1000x1000</th>  <th>4000x4000</th>  <th>10000x8</th>  <th>10000x100</th>  <th>10000x1000</th>  <th>10000x4000</th></tr>
+<tr><td>LLT</td><td>0.05</td><td>0.42</td><td>5.83</td><td>374.55</td><td>6.79 <sup><a href="#note_ls">*</a></sup></td><td>30.15 <sup><a href="#note_ls">*</a></sup></td><td>236.34 <sup><a href="#note_ls">*</a></sup></td><td>3847.17 <sup><a href="#note_ls">*</a></sup></td></tr>
+<tr class="alt"><td>LDLT</td><td>0.07 (x1.3)</td><td>0.65 (x1.5)</td><td>26.86 (x4.6)</td><td>2361.18 (x6.3)</td><td>6.81 (x1) <sup><a href="#note_ls">*</a></sup></td><td>31.91 (x1.1) <sup><a href="#note_ls">*</a></sup></td><td>252.61 (x1.1) <sup><a href="#note_ls">*</a></sup></td><td>5807.66 (x1.5) <sup><a href="#note_ls">*</a></sup></td></tr>
+<tr><td>PartialPivLU</td><td>0.08 (x1.5)</td><td>0.69 (x1.6)</td><td>15.63 (x2.7)</td><td>709.32 (x1.9)</td><td>6.81 (x1) <sup><a href="#note_ls">*</a></sup></td><td>31.32 (x1) <sup><a href="#note_ls">*</a></sup></td><td>241.68 (x1) <sup><a href="#note_ls">*</a></sup></td><td>4270.48 (x1.1) <sup><a href="#note_ls">*</a></sup></td></tr>
+<tr class="alt"><td>FullPivLU</td><td>0.1 (x1.9)</td><td>4.48 (x10.6)</td><td>281.33 (x48.2)</td><td>-</td><td>6.83 (x1) <sup><a href="#note_ls">*</a></sup></td><td>32.67 (x1.1) <sup><a href="#note_ls">*</a></sup></td><td>498.25 (x2.1) <sup><a href="#note_ls">*</a></sup></td><td>-</td></tr>
+<tr><td>HouseholderQR</td><td>0.19 (x3.5)</td><td>2.18 (x5.2)</td><td>23.42 (x4)</td><td>1337.52 (x3.6)</td><td>34.26 (x5)</td><td>129.01 (x4.3)</td><td>377.37 (x1.6)</td><td>4839.1 (x1.3)</td></tr>
+<tr class="alt"><td>ColPivHouseholderQR</td><td>0.23 (x4.3)</td><td>2.23 (x5.3)</td><td>103.34 (x17.7)</td><td>9987.16 (x26.7)</td><td>36.05 (x5.3)</td><td>163.18 (x5.4)</td><td>2354.08 (x10)</td><td>37860.5 (x9.8)</td></tr>
+<tr><td>CompleteOrthogonalDecomposition</td><td>0.23 (x4.3)</td><td>2.22 (x5.2)</td><td>99.44 (x17.1)</td><td>10555.3 (x28.2)</td><td>35.75 (x5.3)</td><td>169.39 (x5.6)</td><td>2150.56 (x9.1)</td><td>36981.8 (x9.6)</td></tr>
+<tr class="alt"><td>FullPivHouseholderQR</td><td>0.23 (x4.3)</td><td>4.64 (x11)</td><td>289.1 (x49.6)</td><td>-</td><td>69.38 (x10.2)</td><td>446.73 (x14.8)</td><td>4852.12 (x20.5)</td><td>-</td></tr>
+<tr><td>JacobiSVD</td><td>1.01 (x18.6)</td><td>71.43 (x168.4)</td><td>-</td><td>-</td><td>113.81 (x16.7)</td><td>1179.66 (x39.1)</td><td>-</td><td>-</td></tr>
+<tr class="alt"><td>BDCSVD</td><td>1.07 (x19.7)</td><td>21.83 (x51.5)</td><td>331.77 (x56.9)</td><td>18587.9 (x49.6)</td><td>110.53 (x16.3)</td><td>397.67 (x13.2)</td><td>2975 (x12.6)</td><td>48593.2 (x12.6)</td></tr>
+</table>
+
+<a name="note_ls">\b *: </a> This decomposition do not support direct least-square solving for over-constrained problems, and the reported timing include the cost to form the symmetric covariance matrix \f$ A^T A \f$.
+
+\b Observations:
+ + LLT is always the fastest solvers.
+ + For largely over-constrained problems, the cost of Cholesky/LU decompositions is dominated by the computation of the symmetric covariance matrix.
+ + For large problem sizes, only the decomposition implementing a cache-friendly blocking strategy scale well. Those include LLT, PartialPivLU, HouseholderQR, and BDCSVD. This explain why for a 4k x 4k matrix, HouseholderQR is faster than LDLT. In the future, LDLT and ColPivHouseholderQR will also implement blocking strategies.
+ + CompleteOrthogonalDecomposition is based on ColPivHouseholderQR and they thus achieve the same level of performance.
+
+The above table has been generated by the <a href="https://bitbucket.org/eigen/eigen/raw/default/bench/dense_solvers.cpp">bench/dense_solvers.cpp</a> file, feel-free to hack it to generate a table matching your hardware, compiler, and favorite problem sizes.
+
+*/
+
+}
diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in
index 696dd2af1..48bb0a8ec 100644
--- a/doc/Doxyfile.in
+++ b/doc/Doxyfile.in
@@ -125,7 +125,7 @@ ALWAYS_DETAILED_SEC    = NO
 # members were ordinary class members. Constructors, destructors and assignment
 # operators of the base classes will not be shown.
 
-INLINE_INHERITED_MEMB  = YES
+INLINE_INHERITED_MEMB  = NO
 
 # If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full
 # path before files name in the file list and in the header files. If set
@@ -206,6 +206,7 @@ TAB_SIZE               = 8
 # You can put \n's in the value part of an alias to insert newlines.
 
 ALIASES                = "only_for_vectors=This is only for vectors (either row-vectors or column-vectors), i.e. matrices which are known at compile-time to have either one row or one column." \
+                         "not_reentrant=\warning This function is not re-entrant." \
                          "array_module=This is defined in the %Array module. \code #include <Eigen/Array> \endcode" \
                          "cholesky_module=This is defined in the %Cholesky module. \code #include <Eigen/Cholesky> \endcode" \
                          "eigenvalues_module=This is defined in the %Eigenvalues module. \code #include <Eigen/Eigenvalues> \endcode" \
@@ -215,6 +216,7 @@ ALIASES                = "only_for_vectors=This is only for vectors (either row-
                          "lu_module=This is defined in the %LU module. \code #include <Eigen/LU> \endcode" \
                          "qr_module=This is defined in the %QR module. \code #include <Eigen/QR> \endcode" \
                          "svd_module=This is defined in the %SVD module. \code #include <Eigen/SVD> \endcode" \
+                         "specialfunctions_module=This is defined in the \b unsupported SpecialFunctions module. \code #include <Eigen/SpecialFunctions> \endcode" \
                          "label=\bug" \
                          "matrixworld=<a href='#matrixonly' style='color:green;text-decoration: none;'>*</a>" \
                          "arrayworld=<a href='#arrayonly' style='color:blue;text-decoration: none;'>*</a>" \
@@ -222,7 +224,13 @@ ALIASES                = "only_for_vectors=This is only for vectors (either row-
                          "note_about_using_kernel_to_study_multiple_solutions=If you need a complete analysis of the space of solutions, take the one solution obtained by this method and add to it elements of the kernel, as determined by kernel()." \
                          "note_about_checking_solutions=This method just tries to find as good a solution as possible. If you want to check whether a solution exists or if it is accurate, just call this function to get a result and then compute the error of this result, or use MatrixBase::isApprox() directly, for instance like this: \code bool a_solution_exists = (A*result).isApprox(b, precision); \endcode This method avoids dividing by zero, so that the non-existence of a solution doesn't by itself mean that you'll get \c inf or \c nan values." \
                          "note_try_to_help_rvo=This function returns the result by value. In order to make that efficient, it is implemented as just a return statement using a special constructor, hopefully allowing the compiler to perform a RVO (return value optimization)." \
-                         "nonstableyet=\warning This is not considered to be part of the stable public API yet. Changes may happen in future releases. See \ref Experimental \"Experimental parts of Eigen\""
+                         "nonstableyet=\warning This is not considered to be part of the stable public API yet. Changes may happen in future releases. See \ref Experimental \"Experimental parts of Eigen\"" \
+                         "implsparsesolverconcept=This class follows the \link TutorialSparseSolverConcept sparse solver concept \endlink." \
+                         "blank= " \
+                         "cpp11=<span class='cpp11'>[c++11]</span>" \
+                         "cpp14=<span class='cpp14'>[c++14]</span>" \
+                         "cpp17=<span class='cpp17'>[c++17]</span>"
+                         
 
 ALIASES += "eigenAutoToc=  "
 ALIASES += "eigenManualPage=\defgroup"
@@ -270,7 +278,7 @@ OPTIMIZE_OUTPUT_VHDL   = NO
 # (default is Fortran), use: inc=Fortran f=C. Note that for custom extensions
 # you also need to set FILE_PATTERNS otherwise the files are not read by doxygen.
 
-EXTENSION_MAPPING      =
+EXTENSION_MAPPING      = .h=C++ no_extension=C++
 
 # If MARKDOWN_SUPPORT is enabled (the default) then doxygen pre-processes all
 # comments according to the Markdown format, which allows for more readable
@@ -458,7 +466,7 @@ HIDE_IN_BODY_DOCS      = NO
 # to NO (the default) then the documentation will be excluded.
 # Set it to YES to include the internal documentation.
 
-INTERNAL_DOCS          = NO
+INTERNAL_DOCS          = ${EIGEN_DOXY_INTERNAL}
 
 # If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate
 # file names in lower-case letters. If set to YES upper-case letters are also
@@ -472,13 +480,13 @@ CASE_SENSE_NAMES       = YES
 # will show members with their full class and namespace scopes in the
 # documentation. If set to YES the scope will be hidden.
 
-HIDE_SCOPE_NAMES       = YES
+HIDE_SCOPE_NAMES       = NO
 
 # If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen
 # will put a list of the files that are included by a file in the documentation
 # of that file.
 
-SHOW_INCLUDE_FILES     = NO
+SHOW_INCLUDE_FILES     = ${EIGEN_DOXY_INTERNAL}
 
 # If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen
 # will list include files with double quotes in the documentation
@@ -544,7 +552,7 @@ STRICT_PROTO_MATCHING  = NO
 # disable (NO) the todo list. This list is created by putting \todo
 # commands in the documentation.
 
-GENERATE_TODOLIST      = NO
+GENERATE_TODOLIST      = ${EIGEN_DOXY_INTERNAL}
 
 # The GENERATE_TESTLIST tag can be used to enable (YES) or
 # disable (NO) the test list. This list is created by putting \test
@@ -556,7 +564,7 @@ GENERATE_TESTLIST      = NO
 # disable (NO) the bug list. This list is created by putting \bug
 # commands in the documentation.
 
-GENERATE_BUGLIST       = NO
+GENERATE_BUGLIST       = ${EIGEN_DOXY_INTERNAL}
 
 # The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or
 # disable (NO) the deprecated list. This list is created by putting
@@ -719,7 +727,8 @@ RECURSIVE              = YES
 # Note that relative paths are relative to the directory from which doxygen is
 # run.
 
-EXCLUDE                = "${Eigen_SOURCE_DIR}/Eigen/Eigen2Support" \
+EXCLUDE                = "${Eigen_SOURCE_DIR}/Eigen/src/Core/products" \
+                         "${Eigen_SOURCE_DIR}/Eigen/Eigen2Support" \
                          "${Eigen_SOURCE_DIR}/Eigen/src/Eigen2Support" \
                          "${Eigen_SOURCE_DIR}/doc/examples" \
                          "${Eigen_SOURCE_DIR}/doc/special_examples" \
@@ -800,7 +809,7 @@ EXAMPLE_RECURSIVE      = NO
 # directories that contain image that are included in the documentation (see
 # the \image command).
 
-IMAGE_PATH             =
+IMAGE_PATH             = ${Eigen_BINARY_DIR}/doc/html
 
 # The INPUT_FILTER tag can be used to specify a program that doxygen should
 # invoke to filter for each input file. Doxygen will invoke the filter program
@@ -864,13 +873,13 @@ STRIP_CODE_COMMENTS    = YES
 # then for each documented function all documented
 # functions referencing it will be listed.
 
-REFERENCED_BY_RELATION = YES
+REFERENCED_BY_RELATION = NO
 
 # If the REFERENCES_RELATION tag is set to YES
 # then for each documented function all documented entities
 # called/used by that function will be listed.
 
-REFERENCES_RELATION    = YES
+REFERENCES_RELATION    = NO
 
 # If the REFERENCES_LINK_SOURCE tag is set to YES (the default)
 # and SOURCE_BROWSER tag is set to YES, then the hyperlinks from
@@ -1581,9 +1590,14 @@ PREDEFINED             = EIGEN_EMPTY_STRUCT \
                          EIGEN_VECTORIZE \
                          EIGEN_QT_SUPPORT \
                          EIGEN_STRONG_INLINE=inline \
-                         "EIGEN2_SUPPORT_STAGE=99" \
+                         EIGEN_DEVICE_FUNC= \
                          "EIGEN_MAKE_CWISE_BINARY_OP(METHOD,FUNCTOR)=template<typename OtherDerived> const CwiseBinaryOp<FUNCTOR<Scalar>, const Derived, const OtherDerived> METHOD(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const;" \
-                         "EIGEN_CWISE_PRODUCT_RETURN_TYPE(LHS,RHS)=CwiseBinaryOp<internal::scalar_product_op<typename LHS::Scalar, typename RHS::Scalar >, const LHS, const RHS>"
+                         "EIGEN_CWISE_PRODUCT_RETURN_TYPE(LHS,RHS)=CwiseBinaryOp<internal::scalar_product_op<LHS::Scalar,RHS::Scalar>, const LHS, const RHS>"\
+                         "EIGEN_CAT2(a,b)= a ## b"\
+                         "EIGEN_CAT(a,b)=EIGEN_CAT2(a,b)"\
+                         "EIGEN_CWISE_BINARY_RETURN_TYPE(LHS,RHS,OPNAME)=CwiseBinaryOp<EIGEN_CAT(EIGEN_CAT(internal::scalar_,OPNAME),_op)<LHS::Scalar, RHS::Scalar>, const LHS, const RHS>"\
+                         DOXCOMMA=,
+
 
 # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then
 # this tag can be used to specify a list of macro names that should be expanded.
@@ -1599,7 +1613,15 @@ EXPAND_AS_DEFINED      = EIGEN_MAKE_TYPEDEFS \
                          EIGEN_CURRENT_STORAGE_BASE_CLASS \
                          EIGEN_MATHFUNC_IMPL \
                          _EIGEN_GENERIC_PUBLIC_INTERFACE \
-                         EIGEN2_SUPPORT
+                         EIGEN_ARRAY_DECLARE_GLOBAL_UNARY \
+                         EIGEN_EMPTY \
+                         EIGEN_EULER_ANGLES_TYPEDEFS \
+                         EIGEN_EULER_ANGLES_SINGLE_TYPEDEF \
+                         EIGEN_EULER_SYSTEM_TYPEDEF \
+                         EIGEN_DOC_UNARY_ADDONS \
+                         EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL \
+                         EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF
+
 
 # If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then
 # doxygen's preprocessor will remove all references to function-like macros
diff --git a/doc/FixedSizeVectorizable.dox b/doc/FixedSizeVectorizable.dox
index 8ae135173..49e38af76 100644
--- a/doc/FixedSizeVectorizable.dox
+++ b/doc/FixedSizeVectorizable.dox
@@ -4,7 +4,7 @@ namespace Eigen {
 
 The goal of this page is to explain what we mean by "fixed-size vectorizable".
 
-\section summary Executive Summary
+\section FixedSizeVectorizable_summary Executive Summary
 
 An Eigen object is called "fixed-size vectorizable" if it has fixed size and that size is a multiple of 16 bytes.
 
@@ -21,7 +21,7 @@ Examples include:
 \li Eigen::Quaterniond
 \li Eigen::Quaternionf
 
-\section explanation Explanation
+\section FixedSizeVectorizable_explanation Explanation
 
 First, "fixed-size" should be clear: an Eigen object has fixed size if its number of rows and its number of columns are fixed at compile-time. So for example Matrix3f has fixed size, but MatrixXf doesn't (the opposite of fixed-size is dynamic-size).
 
diff --git a/doc/InplaceDecomposition.dox b/doc/InplaceDecomposition.dox
new file mode 100644
index 000000000..cb1c6d413
--- /dev/null
+++ b/doc/InplaceDecomposition.dox
@@ -0,0 +1,115 @@
+namespace Eigen {
+
+/** \eigenManualPage InplaceDecomposition Inplace matrix decompositions
+
+Starting from %Eigen 3.3, the LU, Cholesky, and QR decompositions can operate \em inplace, that is, directly within the given input matrix.
+This feature is especially useful when dealing with huge matrices, and or when the available memory is very limited (embedded systems).
+
+To this end, the respective decomposition class must be instantiated with a Ref<> matrix type, and the decomposition object must be constructed with the input matrix as argument. As an example, let us consider an inplace LU decomposition with partial pivoting.
+
+Let's start with the basic inclusions, and declaration of a 2x2 matrix \c A:
+
+<table class="example">
+<tr><th>code</th><th>output</th></tr>
+<tr>
+  <td>\snippet TutorialInplaceLU.cpp init
+  </td>
+  <td>\snippet TutorialInplaceLU.out init
+  </td>
+</tr>
+</table>
+
+No surprise here! Then, let's declare our inplace LU object \c lu, and check the content of the matrix \c A:
+
+<table class="example">
+<tr>
+  <td>\snippet TutorialInplaceLU.cpp declaration
+  </td>
+  <td>\snippet TutorialInplaceLU.out declaration
+  </td>
+</tr>
+</table>
+
+Here, the \c lu object computes and stores the \c L and \c U factors within the memory held by the matrix \c A.
+The coefficients of \c A have thus been destroyed during the factorization, and replaced by the L and U factors as one can verify:
+
+<table class="example">
+<tr>
+  <td>\snippet TutorialInplaceLU.cpp matrixLU
+  </td>
+  <td>\snippet TutorialInplaceLU.out matrixLU
+  </td>
+</tr>
+</table>
+
+Then, one can use the \c lu object as usual, for instance to solve the Ax=b problem:
+<table class="example">
+<tr>
+  <td>\snippet TutorialInplaceLU.cpp solve
+  </td>
+  <td>\snippet TutorialInplaceLU.out solve
+  </td>
+</tr>
+</table>
+
+Here, since the content of the original matrix \c A has been lost, we had to declared a new matrix \c A0 to verify the result.
+
+Since the memory is shared between \c A and \c lu, modifying the matrix \c A will make \c lu invalid.
+This can easily be verified by modifying the content of \c A and trying to solve the initial problem again:
+
+<table class="example">
+<tr>
+  <td>\snippet TutorialInplaceLU.cpp modifyA
+  </td>
+  <td>\snippet TutorialInplaceLU.out modifyA
+  </td>
+</tr>
+</table>
+
+Note that there is no shared pointer under the hood, it is the \b responsibility \b of \b the \b user to keep the input matrix \c A in life as long as \c lu is living.
+
+If one wants to update the factorization with the modified A, one has to call the compute method as usual:
+<table class="example">
+<tr>
+  <td>\snippet TutorialInplaceLU.cpp recompute
+  </td>
+  <td>\snippet TutorialInplaceLU.out recompute
+  </td>
+</tr>
+</table>
+
+Note that calling compute does not change the memory which is referenced by the \c lu object. Therefore, if the compute method is called with another matrix \c A1 different than \c A, then the content of \c A1 won't be modified. This is still the content of \c A that will be used to store the L and U factors of the matrix \c A1.
+This can easily be verified as follows:
+<table class="example">
+<tr>
+  <td>\snippet TutorialInplaceLU.cpp recompute_bis0
+ </td>
+  <td>\snippet TutorialInplaceLU.out recompute_bis0
+ </td>
+</tr>
+</table>
+The matrix \c A1 is unchanged, and one can thus solve A1*x=b, and directly check the residual without any copy of \c A1:
+<table class="example">
+<tr>
+  <td>\snippet TutorialInplaceLU.cpp recompute_bis1
+  </td>
+  <td>\snippet TutorialInplaceLU.out recompute_bis1
+ </td>
+</tr>
+</table>
+
+
+Here is the list of matrix decompositions supporting this inplace mechanism:
+
+- class LLT
+- class LDLT
+- class PartialPivLU
+- class FullPivLU
+- class HouseholderQR
+- class ColPivHouseholderQR
+- class FullPivHouseholderQR
+- class CompleteOrthogonalDecomposition
+
+*/
+
+}
+\ No newline at end of file
diff --git a/doc/LeastSquares.dox b/doc/LeastSquares.dox
new file mode 100644
index 000000000..e2191a22f
--- /dev/null
+++ b/doc/LeastSquares.dox
@@ -0,0 +1,70 @@
+namespace Eigen {
+
+/** \eigenManualPage LeastSquares Solving linear least squares systems
+
+This page describes how to solve linear least squares systems using %Eigen. An overdetermined system
+of equations, say \a Ax = \a b, has no solutions. In this case, it makes sense to search for the
+vector \a x which is closest to being a solution, in the sense that the difference \a Ax - \a b is
+as small as possible. This \a x is called the least square solution (if the Euclidean norm is used).
+
+The three methods discussed on this page are the SVD decomposition, the QR decomposition and normal
+equations. Of these, the SVD decomposition is generally the most accurate but the slowest, normal
+equations is the fastest but least accurate, and the QR decomposition is in between.
+
+\eigenAutoToc
+
+
+\section LeastSquaresSVD Using the SVD decomposition
+
+The \link JacobiSVD::solve() solve() \endlink method in the JacobiSVD class can be directly used to
+solve linear squares systems. It is not enough to compute only the singular values (the default for
+this class); you also need the singular vectors but the thin SVD decomposition suffices for
+computing least squares solutions:
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr>
+  <td>\include TutorialLinAlgSVDSolve.cpp </td>
+  <td>\verbinclude TutorialLinAlgSVDSolve.out </td>
+</tr>
+</table>
+
+This is example from the page \link TutorialLinearAlgebra Linear algebra and decompositions \endlink.
+
+
+\section LeastSquaresQR Using the QR decomposition
+
+The solve() method in QR decomposition classes also computes the least squares solution. There are
+three QR decomposition classes: HouseholderQR (no pivoting, so fast but unstable),
+ColPivHouseholderQR (column pivoting, thus a bit slower but more accurate) and FullPivHouseholderQR
+(full pivoting, so slowest and most stable). Here is an example with column pivoting:
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr>
+  <td>\include LeastSquaresQR.cpp </td>
+  <td>\verbinclude LeastSquaresQR.out </td>
+</tr>
+</table>
+
+
+\section LeastSquaresNormalEquations Using normal equations
+
+Finding the least squares solution of \a Ax = \a b is equivalent to solving the normal equation
+<i>A</i><sup>T</sup><i>Ax</i> = <i>A</i><sup>T</sup><i>b</i>. This leads to the following code
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr>
+  <td>\include LeastSquaresNormalEquations.cpp </td>
+  <td>\verbinclude LeastSquaresNormalEquations.out </td>
+</tr>
+</table>
+
+If the matrix \a A is ill-conditioned, then this is not a good method, because the condition number
+of <i>A</i><sup>T</sup><i>A</i> is the square of the condition number of \a A. This means that you
+lose twice as many digits using normal equation than if you use the other methods.
+
+*/
+
+}
+\ No newline at end of file
diff --git a/doc/Manual.dox b/doc/Manual.dox
index 3367982ca..342b145fd 100644
--- a/doc/Manual.dox
+++ b/doc/Manual.dox
@@ -3,18 +3,32 @@
 
 namespace Eigen {
 
+/** \page UserManual_CustomizingEigen Extending/Customizing Eigen
+  %Eigen can be extended in several ways, for instance, by defining global methods, by inserting custom methods within main %Eigen's classes through the \ref TopicCustomizing_Plugins "plugin" mechanism, by adding support to \ref TopicCustomizing_CustomScalar "custom scalar types" etc. See below for the respective sub-topics.
+  - \subpage TopicCustomizing_Plugins
+  - \subpage TopicCustomizing_InheritingMatrix
+  - \subpage TopicCustomizing_CustomScalar
+  - \subpage TopicCustomizing_NullaryExpr
+  - \subpage TopicNewExpressionType
+  \sa \ref TopicPreprocessorDirectives
+*/
+
+
 /** \page UserManual_Generalities General topics
   - \subpage Eigen2ToEigen3
   - \subpage TopicFunctionTakingEigenTypes
   - \subpage TopicPreprocessorDirectives
   - \subpage TopicAssertions
-  - \subpage TopicCustomizingEigen
   - \subpage TopicMultiThreading
+  - \subpage TopicUsingBlasLapack
   - \subpage TopicUsingIntelMKL
+  - \subpage TopicCUDA
+  - \subpage TopicPitfalls
   - \subpage TopicTemplateKeyword
   - \subpage UserManual_UnderstandingEigen
+  - \subpage TopicCMakeGuide
 */
-  
+
 /** \page UserManual_UnderstandingEigen Understanding Eigen
   - \subpage TopicInsideEigenExample
   - \subpage TopicClassHierarchy
@@ -56,6 +70,8 @@ namespace Eigen {
     \ingroup DenseMatrixManipulation_chapter */
 /** \addtogroup TutorialMapClass
     \ingroup DenseMatrixManipulation_chapter */
+/** \addtogroup TutorialReshapeSlicing
+    \ingroup DenseMatrixManipulation_chapter */
 /** \addtogroup TopicAliasing
     \ingroup DenseMatrixManipulation_chapter */
 /** \addtogroup TopicStorageOrders
@@ -85,6 +101,9 @@ namespace Eigen {
 /** \addtogroup Householder_Module
     \ingroup DenseMatrixManipulation_Reference */ 
 
+/** \addtogroup CoeffwiseMathFunctions
+    \ingroup DenseMatrixManipulation_chapter */
+
 /** \addtogroup QuickRefPage
     \ingroup DenseMatrixManipulation_chapter */
 
@@ -96,6 +115,12 @@ namespace Eigen {
     \ingroup DenseLinearSolvers_chapter */
 /** \addtogroup TopicLinearAlgebraDecompositions
     \ingroup DenseLinearSolvers_chapter */
+/** \addtogroup LeastSquares 
+    \ingroup DenseLinearSolvers_chapter */
+/** \addtogroup InplaceDecomposition
+    \ingroup DenseLinearSolvers_chapter */
+/** \addtogroup DenseDecompositionBenchmark
+    \ingroup DenseLinearSolvers_chapter */
 
 /** \addtogroup DenseLinearSolvers_Reference
     \ingroup DenseLinearSolvers_chapter */
@@ -120,6 +145,8 @@ namespace Eigen {
     \ingroup Sparse_chapter */
 /** \addtogroup TopicSparseSystems
     \ingroup Sparse_chapter */
+/** \addtogroup MatrixfreeSolverExample
+    \ingroup Sparse_chapter */
 
 /** \addtogroup Sparse_Reference
     \ingroup Sparse_chapter */
@@ -156,4 +183,7 @@ namespace Eigen {
     \ingroup Geometry_Reference */
 /** \addtogroup Splines_Module
     \ingroup Geometry_Reference */
+
+/** \internal \brief Namespace containing low-level routines from the %Eigen library. */
+namespace internal {}
 }
diff --git a/doc/MatrixfreeSolverExample.dox b/doc/MatrixfreeSolverExample.dox
new file mode 100644
index 000000000..3efa292b5
--- /dev/null
+++ b/doc/MatrixfreeSolverExample.dox
@@ -0,0 +1,20 @@
+
+namespace Eigen {
+
+/**
+
+\eigenManualPage MatrixfreeSolverExample Matrix-free solvers
+
+Iterative solvers such as ConjugateGradient and BiCGSTAB can be used in a matrix free context. To this end, user must provide a wrapper class inheriting EigenBase<> and implementing the following methods:
+ - \c Index \c rows() and \c Index \c cols(): returns number of rows and columns respectively
+ - \c operator* with your type and an %Eigen dense column vector (its actual implementation goes in a specialization of the internal::generic_product_impl class)
+
+\c Eigen::internal::traits<> must also be specialized for the wrapper type.
+
+Here is a complete example wrapping an Eigen::SparseMatrix:
+\include matrixfree_cg.cpp
+Output: \verbinclude matrixfree_cg.out
+
+*/
+
+}
+\ No newline at end of file
diff --git a/doc/NewExpressionType.dox b/doc/NewExpressionType.dox
new file mode 100644
index 000000000..c2f243312
--- /dev/null
+++ b/doc/NewExpressionType.dox
@@ -0,0 +1,143 @@
+namespace Eigen {
+
+/** \page TopicNewExpressionType Adding a new expression type
+
+<!--<span style="font-size:130%; color:red; font-weight: 900;"></span>-->
+\warning
+Disclaimer: this page is tailored to very advanced users who are not afraid of dealing with some %Eigen's internal aspects.
+In most cases, a custom expression can be avoided by either using custom \ref MatrixBase::unaryExpr "unary" or \ref MatrixBase::binaryExpr "binary" functors,
+while extremely complex matrix manipulations can be achieved by a nullary functors as described in the \ref TopicCustomizing_NullaryExpr "previous page".
+
+This page describes with the help of an example how to implement a new
+light-weight expression type in %Eigen. This consists of three parts:
+the expression type itself, a traits class containing compile-time
+information about the expression, and the evaluator class which is
+used to evaluate the expression to a matrix.
+
+\b TO \b DO: Write a page explaining the design, with details on
+vectorization etc., and refer to that page here.
+
+
+\eigenAutoToc
+
+\section TopicSetting The setting
+
+A circulant matrix is a matrix where each column is the same as the
+column to the left, except that it is cyclically shifted downwards.
+For example, here is a 4-by-4 circulant matrix:
+\f[ \begin{bmatrix} 
+    1 & 8 & 4 & 2 \\ 
+    2 & 1 & 8 & 4 \\
+    4 & 2 & 1 & 8 \\
+    8 & 4 & 2 & 1
+\end{bmatrix} \f]
+A circulant matrix is uniquely determined by its first column. We wish
+to write a function \c makeCirculant which, given the first column,
+returns an expression representing the circulant matrix.
+
+For simplicity, we restrict the \c makeCirculant function to dense
+matrices. It may make sense to also allow arrays, or sparse matrices,
+but we will not do so here. We also do not want to support
+vectorization.
+
+
+\section TopicPreamble Getting started
+
+We will present the file implementing the \c makeCirculant function
+part by part. We start by including the appropriate header files and
+forward declaring the expression class, which we will call
+\c Circulant. The \c makeCirculant function will return an object of
+this type. The class \c Circulant is in fact a class template; the
+template argument \c ArgType refers to the type of the vector passed
+to the \c makeCirculant function.
+
+\include make_circulant.cpp.preamble
+
+
+\section TopicTraits The traits class
+
+For every expression class \c X, there should be a traits class 
+\c Traits<X> in the \c Eigen::internal namespace containing
+information about \c X known as compile time.
+
+As explained in \ref TopicSetting, we designed the \c Circulant
+expression class to refer to dense matrices. The entries of the
+circulant matrix have the same type as the entries of the vector
+passed to the \c makeCirculant function. The type used to index the
+entries is also the same. Again for simplicity, we will only return
+column-major matrices. Finally, the circulant matrix is a square
+matrix (number of rows equals number of columns), and the number of
+rows equals the number of rows of the column vector passed to the
+\c makeCirculant function. If this is a dynamic-size vector, then the
+size of the circulant matrix is not known at compile-time.
+
+This leads to the following code:
+
+\include make_circulant.cpp.traits
+
+
+\section TopicExpression The expression class
+
+The next step is to define the expression class itself. In our case,
+we want to inherit from \c MatrixBase in order to expose the interface
+for dense matrices. In the constructor, we check that we are passed a
+column vector (see \ref TopicAssertions) and we store the vector from
+which we are going to build the circulant matrix in the member
+variable \c m_arg. Finally, the expression class should compute the
+size of the corresponding circulant matrix. As explained above, this
+is a square matrix with as many columns as the vector used to
+construct the matrix.
+
+\b TO \b DO: What about the \c Nested typedef? It seems to be
+necessary; is this only temporary?
+
+\include make_circulant.cpp.expression
+
+
+\section TopicEvaluator The evaluator
+
+The last big fragment implements the evaluator for the \c Circulant
+expression. The evaluator computes the entries of the circulant
+matrix; this is done in the \c .coeff() member function. The entries
+are computed by finding the corresponding entry of the vector from
+which the circulant matrix is constructed. Getting this entry may
+actually be non-trivial when the circulant matrix is constructed from
+a vector which is given by a complicated expression, so we use the
+evaluator which corresponds to the vector.
+
+The \c CoeffReadCost constant records the cost of computing an entry
+of the circulant matrix; we ignore the index computation and say that
+this is the same as the cost of computing an entry of the vector from
+which the circulant matrix is constructed.
+
+In the constructor, we save the evaluator for the column vector which
+defined the circulant matrix. We also save the size of that vector;
+remember that we can query an expression object to find the size but
+not the evaluator. 
+
+\include make_circulant.cpp.evaluator
+
+
+\section TopicEntry The entry point
+
+After all this, the \c makeCirculant function is very simple. It
+simply creates an expression object and returns it.
+
+\include make_circulant.cpp.entry
+
+
+\section TopicMain A simple main function for testing
+
+Finally, a short \c main function that shows how the \c makeCirculant
+function can be called.
+
+\include make_circulant.cpp.main
+
+If all the fragments are combined, the following output is produced,
+showing that the program works as expected:
+
+\include make_circulant.out
+
+*/
+}
+
diff --git a/doc/Overview.dox b/doc/Overview.dox
index 9ab96233a..dbb49bd21 100644
--- a/doc/Overview.dox
+++ b/doc/Overview.dox
@@ -17,7 +17,9 @@ You're a MatLab user? There is also a <a href="AsciiQuickReference.txt">short AS
 The \b main \b documentation is organized into \em chapters covering different domains of features.
 They are themselves composed of \em user \em manual pages describing the different features in a comprehensive way, and \em reference pages that gives you access to the API documentation through the related Eigen's \em modules and \em classes.
 
-Under the  \subpage UserManual_Generalities section, you will find documentation on more general topics such as preprocessor directives, controlling assertions, multi-threading, MKL support, some Eigen's internal insights, and much more...
+Under the \subpage UserManual_CustomizingEigen section, you will find discussions and examples on extending %Eigen's features and supporting custom scalar types.
+
+Under the \subpage UserManual_Generalities section, you will find documentation on more general topics such as preprocessor directives, controlling assertions, multi-threading, MKL support, some Eigen's internal insights, and much more...
 
 Finally, do not miss the search engine, useful to quickly get to the documentation of a given class or function.
 
diff --git a/doc/Pitfalls.dox b/doc/Pitfalls.dox
new file mode 100644
index 000000000..cf42effef
--- /dev/null
+++ b/doc/Pitfalls.dox
@@ -0,0 +1,38 @@
+namespace Eigen {
+
+/** \page TopicPitfalls Common pitfalls
+
+\section TopicPitfalls_template_keyword Compilation error with template methods
+
+See this \link TopicTemplateKeyword page \endlink.
+
+\section TopicPitfalls_auto_keyword C++11 and the auto keyword
+
+In short: do not use the auto keywords with Eigen's expressions, unless you are 100% sure about what you are doing. In particular, do not use the auto keyword as a replacement for a Matrix<> type. Here is an example:
+
+\code
+MatrixXd A, B;
+auto C = A*B;
+for(...) { ... w = C * v;  ...}
+\endcode
+
+In this example, the type of C is not a MatrixXd but an abstract expression representing a matrix product and storing references to A and B. Therefore, the product of A*B will be carried out multiple times, once per iteration of the for loop. Moreover, if the coefficients of A or B change during the iteration, then C will evaluate to different values.
+
+Here is another example leading to a segfault:
+\code
+auto C = ((A+B).eval()).transpose();
+// do something with C
+\endcode
+The problem is that eval() returns a temporary object (in this case a MatrixXd) which is then referenced by the Transpose<> expression. However, this temporary is deleted right after the first line, and there the C expression reference a dead object. The same issue might occur when sub expressions are automatically evaluated by Eigen as in the following example:
+\code
+VectorXd u, v;
+auto C = u + (A*v).normalized();
+// do something with C
+\endcode
+where the normalized() method has to evaluate the expensive product A*v to avoid evaluating it twice. On the other hand, the following example is perfectly fine:
+\code
+auto C = (u + (A*v).normalized()).eval();
+\endcode
+In this case, C will be a regular VectorXd object.
+*/
+}
diff --git a/doc/PreprocessorDirectives.dox b/doc/PreprocessorDirectives.dox
index 8a2968ebb..f01b39aec 100644
--- a/doc/PreprocessorDirectives.dox
+++ b/doc/PreprocessorDirectives.dox
@@ -5,7 +5,7 @@ namespace Eigen {
 You can control some aspects of %Eigen by defining the preprocessor tokens using \c \#define. These macros
 should be defined before any %Eigen headers are included. Often they are best set in the project options.
 
-This page lists the preprocesor tokens recognised by %Eigen.
+This page lists the preprocessor tokens recognized by %Eigen.
 
 \eigenAutoToc
 
@@ -18,25 +18,67 @@ one option, and other parts (or libraries that you use) are compiled with anothe
 fail to link or exhibit subtle bugs. Nevertheless, these options can be useful for people who know what they
 are doing.
 
- - \b EIGEN2_SUPPORT - if defined, enables the Eigen2 compatibility mode. This is meant to ease the transition
-   of Eigen2 to Eigen3 (see \ref Eigen2ToEigen3). Not defined by default.
- - \b EIGEN2_SUPPORT_STAGEnn_xxx (for various values of nn and xxx) - staged migration path from Eigen2 to
-   Eigen3; see \ref Eigen2SupportModes.
+ - \b EIGEN2_SUPPORT and \b EIGEN2_SUPPORT_STAGEnn_xxx are disabled starting from the 3.3 release.
+   Defining one of these will raise a compile-error. If you need to compile Eigen2 code,
+   <a href="http://eigen.tuxfamily.org/index.php?title=Eigen2">check this site</a>.
  - \b EIGEN_DEFAULT_DENSE_INDEX_TYPE - the type for column and row indices in matrices, vectors and array
    (DenseBase::Index). Set to \c std::ptrdiff_t by default.
  - \b EIGEN_DEFAULT_IO_FORMAT - the IOFormat to use when printing a matrix if no %IOFormat is specified.
    Defaults to the %IOFormat constructed by the default constructor IOFormat::IOFormat().
  - \b EIGEN_INITIALIZE_MATRICES_BY_ZERO - if defined, all entries of newly constructed matrices and arrays are
    initialized to zero, as are new entries in matrices and arrays after resizing. Not defined by default.
+   \warning The unary (resp. binary) constructor of \c 1x1 (resp. \c 2x1 or \c 1x2) fixed size matrices is
+   always interpreted as an initialization constructor where the argument(s) are the coefficient values
+   and not the sizes. For instance, \code Vector2d v(2,1); \endcode will create a vector with coeficients [2,1],
+   and \b not a \c 2x1 vector initialized with zeros (i.e., [0,0]). If such cases might occur, then it is
+   recommended to use the default constructor with a explicit call to resize:
+   \code
+   Matrix<?,SizeAtCompileTime,1> v;
+   v.resize(size);
+   Matrix<?,RowsAtCompileTime,ColsAtCompileTime> m;
+   m.resize(rows,cols);
+   \endcode
  - \b EIGEN_INITIALIZE_MATRICES_BY_NAN - if defined, all entries of newly constructed matrices and arrays are
    initialized to NaN, as are new entries in matrices and arrays after resizing. This option is especially
    useful for debugging purpose, though a memory tool like <a href="http://valgrind.org/">valgrind</a> is
    preferable. Not defined by default.
+   \warning See the documentation of \c EIGEN_INITIALIZE_MATRICES_BY_ZERO for a discussion on a limitations
+   of these macros when applied to \c 1x1, \c 1x2, and \c 2x1 fixed-size matrices.
  - \b EIGEN_NO_AUTOMATIC_RESIZING - if defined, the matrices (or arrays) on both sides of an assignment 
    <tt>a = b</tt> have to be of the same size; otherwise, %Eigen automatically resizes \c a so that it is of
    the correct size. Not defined by default.
 
 
+\section TopicPreprocessorDirectivesCppVersion C++ standard features
+
+By default, %Eigen strive to automatically detect and enable langage features at compile-time based on
+the information provided by the compiler.
+
+ - \b EIGEN_MAX_CPP_VER - disables usage of C++ features requiring a version greater than EIGEN_MAX_CPP_VER.
+   Possible values are: 03, 11, 14, 17, etc. If not defined (the default), %Eigen enables all features supported
+   by the compiler.
+
+Individual features can be explicitly enabled or disabled by defining the following token to 0 or 1 respectively.
+For instance, one might limit the C++ version to C++03 by defining EIGEN_MAX_CPP_VER=03, but still enable C99 math
+functions by defining EIGEN_HAS_C99_MATH=1.
+
+ - \b EIGEN_HAS_C99_MATH - controls the usage of C99 math functions such as erf, erfc, lgamma, etc.
+   Automatic detection disabled if EIGEN_MAX_CPP_VER<11.
+ - \b EIGEN_HAS_CXX11_MATH - controls the implementation of some functions such as round, logp1, isinf, isnan, etc.
+   Automatic detection disabled if EIGEN_MAX_CPP_VER<11.
+ - \b EIGEN_HAS_RVALUE_REFERENCES - defines whetehr rvalue references are supported
+   Automatic detection disabled if EIGEN_MAX_CPP_VER<11.
+ - \b EIGEN_HAS_STD_RESULT_OF - defines whether std::result_of is supported
+   Automatic detection disabled if EIGEN_MAX_CPP_VER<11.
+ - \b EIGEN_HAS_VARIADIC_TEMPLATES - defines whether variadic templates are supported
+   Automatic detection disabled if EIGEN_MAX_CPP_VER<11.
+ - \b EIGEN_HAS_CONSTEXPR - defines whether relaxed const expression are supported
+   Automatic detection disabled if EIGEN_MAX_CPP_VER<14.
+ - \b EIGEN_HAS_CXX11_CONTAINERS - defines whether STL's containers follows C++11 specifications
+   Automatic detection disabled if EIGEN_MAX_CPP_VER<11.
+ - \b EIGEN_HAS_CXX11_NOEXCEPT - defines whether noexcept is supported
+   Automatic detection disabled if EIGEN_MAX_CPP_VER<11.
+
 \section TopicPreprocessorDirectivesAssertions Assertions
 
 The %Eigen library contains many assertions to guard against programming errors, both at compile time and at
@@ -55,32 +97,39 @@ run time. However, these assertions do cost time and can thus be turned off.
 
 \section TopicPreprocessorDirectivesPerformance Alignment, vectorization and performance tweaking
 
- - \b EIGEN_MALLOC_ALREADY_ALIGNED - Can be set to 0 or 1 to tell whether default system malloc already
+ - \b \c EIGEN_MALLOC_ALREADY_ALIGNED - Can be set to 0 or 1 to tell whether default system \c malloc already
    returns aligned buffers. In not defined, then this information is automatically deduced from the compiler
    and system preprocessor tokens.
- - \b EIGEN_DONT_ALIGN - disables alignment completely. %Eigen will not try to align its objects and does not
-   expect that any objects passed to it are aligned. This will turn off vectorization. Not defined by default.
- - \b EIGEN_DONT_ALIGN_STATICALLY - disables alignment of arrays on the stack. Not defined by default, unless
-   \c EIGEN_DONT_ALIGN is defined.
- - \b EIGEN_DONT_PARALLELIZE - if defined, this disables multi-threading. This is only relevant if you enabled OpenMP.
+ - \b \c EIGEN_MAX_ALIGN_BYTES - Must be a power of two, or 0. Defines an upper bound on the memory boundary in bytes on which dynamically and statically allocated data may be aligned by %Eigen. If not defined, a default value is automatically computed based on architecture, compiler, and OS.
+ This option is typically used to enforce binary compatibility between code/libraries compiled with different SIMD options. For instance, one may compile AVX code and enforce ABI compatibility with existing SSE code by defining \c EIGEN_MAX_ALIGN_BYTES=16. In the other way round, since by default AVX implies 32 bytes alignment for best performance, one can compile SSE code to be ABI compatible with AVX code by defining \c EIGEN_MAX_ALIGN_BYTES=32.
+ - \b \c EIGEN_MAX_STATIC_ALIGN_BYTES - Same as \c EIGEN_MAX_ALIGN_BYTES but for statically allocated data only. By default, if only  \c EIGEN_MAX_ALIGN_BYTES is defined, then \c EIGEN_MAX_STATIC_ALIGN_BYTES == \c EIGEN_MAX_ALIGN_BYTES, otherwise a default value is automatically computed based on architecture, compiler, and OS (can be smaller than the default value of EIGEN_MAX_ALIGN_BYTES on architectures that do not support stack alignment).
+ Let us emphasize that \c EIGEN_MAX_*_ALIGN_BYTES define only a diserable upper bound. In practice data is aligned to largest power-of-two common divisor of \c EIGEN_MAX_STATIC_ALIGN_BYTES and the size of the data, such that memory is not wasted.
+ - \b \c EIGEN_DONT_PARALLELIZE - if defined, this disables multi-threading. This is only relevant if you enabled OpenMP.
    See \ref TopicMultiThreading for details.
  - \b EIGEN_DONT_VECTORIZE - disables explicit vectorization when defined. Not defined by default, unless 
    alignment is disabled by %Eigen's platform test or the user defining \c EIGEN_DONT_ALIGN.
- - \b EIGEN_FAST_MATH - enables some optimizations which might affect the accuracy of the result. This currently
+ - \b \c EIGEN_UNALIGNED_VECTORIZE - disables/enables vectorization with unaligned stores. Default is 1 (enabled).
+   If set to 0 (disabled), then expression for which the destination cannot be aligned are not vectorized (e.g., unaligned
+   small fixed size vectors or matrices)
+ - \b \c EIGEN_FAST_MATH - enables some optimizations which might affect the accuracy of the result. This currently
    enables the SSE vectorization of sin() and cos(), and speedups sqrt() for single precision. Defined to 1 by default.
    Define it to 0 to disable.
- - \b EIGEN_UNROLLING_LIMIT - defines the size of a loop to enable meta unrolling. Set it to zero to disable
+ - \b \c EIGEN_UNROLLING_LIMIT - defines the size of a loop to enable meta unrolling. Set it to zero to disable
    unrolling. The size of a loop here is expressed in %Eigen's own notion of "number of FLOPS", it does not
    correspond to the number of iterations or the number of instructions. The default is value 100.
- - \b EIGEN_STACK_ALLOCATION_LIMIT - defines the maximum bytes for a buffer to be allocated on the stack. For internal
+ - \b \c EIGEN_STACK_ALLOCATION_LIMIT - defines the maximum bytes for a buffer to be allocated on the stack. For internal
    temporary buffers, dynamic memory allocation is employed as a fall back. For fixed-size matrices or arrays, exceeding
    this threshold raises a compile time assertion. Use 0 to set no limit. Default is 128 KB.
 
 
+ - \c EIGEN_DONT_ALIGN - Deprecated, it is a synonym for \c EIGEN_MAX_ALIGN_BYTES=0. It disables alignment completely. %Eigen will not try to align its objects and does not expect that any objects passed to it are aligned. This will turn off vectorization if \b EIGEN_UNALIGNED_VECTORIZE=1. Not defined by default.
+ - \c EIGEN_DONT_ALIGN_STATICALLY - Deprecated, it is a synonym for \c EIGEN_MAX_STATIC_ALIGN_BYTES=0. It disables alignment of arrays on the stack. Not defined by default, unless \c EIGEN_DONT_ALIGN is defined.
+
+
 \section TopicPreprocessorDirectivesPlugins Plugins
 
 It is possible to add new methods to many fundamental classes in %Eigen by writing a plugin. As explained in
-the section \ref ExtendingMatrixBase, the plugin is specified by defining a \c EIGEN_xxx_PLUGIN macro. The
+the section \ref TopicCustomizing_Plugins, the plugin is specified by defining a \c EIGEN_xxx_PLUGIN macro. The
 following macros are supported; none of them are defined by default.
 
  - \b EIGEN_ARRAY_PLUGIN - filename of plugin for extending the Array class.
@@ -91,6 +140,8 @@ following macros are supported; none of them are defined by default.
  - \b EIGEN_MATRIX_PLUGIN - filename of plugin for extending the Matrix class.
  - \b EIGEN_MATRIXBASE_PLUGIN - filename of plugin for extending the MatrixBase class.
  - \b EIGEN_PLAINOBJECTBASE_PLUGIN - filename of plugin for extending the PlainObjectBase class.
+ - \b EIGEN_MAPBASE_PLUGIN - filename of plugin for extending the MapBase class.
+ - \b EIGEN_QUATERNION_PLUGIN - filename of plugin for extending the Quaternion class.
  - \b EIGEN_QUATERNIONBASE_PLUGIN - filename of plugin for extending the QuaternionBase class.
  - \b EIGEN_SPARSEMATRIX_PLUGIN - filename of plugin for extending the SparseMatrix class.
  - \b EIGEN_SPARSEMATRIXBASE_PLUGIN - filename of plugin for extending the SparseMatrixBase class.
diff --git a/doc/QuickReference.dox b/doc/QuickReference.dox
index a4be0f68a..44f5410db 100644
--- a/doc/QuickReference.dox
+++ b/doc/QuickReference.dox
@@ -13,17 +13,17 @@ The Eigen library is divided in a Core module and several additional modules. Ea
 
 <table class="manual">
 <tr><th>Module</th><th>Header file</th><th>Contents</th></tr>
-<tr><td>\link Core_Module Core \endlink</td><td>\code#include <Eigen/Core>\endcode</td><td>Matrix and Array classes, basic linear algebra (including triangular and selfadjoint products), array manipulation</td></tr>
+<tr            ><td>\link Core_Module Core \endlink</td><td>\code#include <Eigen/Core>\endcode</td><td>Matrix and Array classes, basic linear algebra (including triangular and selfadjoint products), array manipulation</td></tr>
 <tr class="alt"><td>\link Geometry_Module Geometry \endlink</td><td>\code#include <Eigen/Geometry>\endcode</td><td>Transform, Translation, Scaling, Rotation2D and 3D rotations (Quaternion, AngleAxis)</td></tr>
-<tr><td>\link LU_Module LU \endlink</td><td>\code#include <Eigen/LU>\endcode</td><td>Inverse, determinant, LU decompositions with solver (FullPivLU, PartialPivLU)</td></tr>
-<tr><td>\link Cholesky_Module Cholesky \endlink</td><td>\code#include <Eigen/Cholesky>\endcode</td><td>LLT and LDLT Cholesky factorization with solver</td></tr>
-<tr class="alt"><td>\link Householder_Module Householder \endlink</td><td>\code#include <Eigen/Householder>\endcode</td><td>Householder transformations; this module is used by several linear algebra modules</td></tr>
-<tr><td>\link SVD_Module SVD \endlink</td><td>\code#include <Eigen/SVD>\endcode</td><td>SVD decomposition with least-squares solver (JacobiSVD)</td></tr>
-<tr class="alt"><td>\link QR_Module QR \endlink</td><td>\code#include <Eigen/QR>\endcode</td><td>QR decomposition with solver (HouseholderQR, ColPivHouseholderQR, FullPivHouseholderQR)</td></tr>
-<tr><td>\link Eigenvalues_Module Eigenvalues \endlink</td><td>\code#include <Eigen/Eigenvalues>\endcode</td><td>Eigenvalue, eigenvector decompositions (EigenSolver, SelfAdjointEigenSolver, ComplexEigenSolver)</td></tr>
-<tr class="alt"><td>\link Sparse_modules Sparse \endlink</td><td>\code#include <Eigen/Sparse>\endcode</td><td>%Sparse matrix storage and related basic linear algebra (SparseMatrix, DynamicSparseMatrix, SparseVector)</td></tr>
-<tr><td></td><td>\code#include <Eigen/Dense>\endcode</td><td>Includes Core, Geometry, LU, Cholesky, SVD, QR, and Eigenvalues header files</td></tr>
-<tr class="alt"><td></td><td>\code#include <Eigen/Eigen>\endcode</td><td>Includes %Dense and %Sparse header files (the whole Eigen library)</td></tr>
+<tr            ><td>\link LU_Module LU \endlink</td><td>\code#include <Eigen/LU>\endcode</td><td>Inverse, determinant, LU decompositions with solver (FullPivLU, PartialPivLU)</td></tr>
+<tr class="alt"><td>\link Cholesky_Module Cholesky \endlink</td><td>\code#include <Eigen/Cholesky>\endcode</td><td>LLT and LDLT Cholesky factorization with solver</td></tr>
+<tr            ><td>\link Householder_Module Householder \endlink</td><td>\code#include <Eigen/Householder>\endcode</td><td>Householder transformations; this module is used by several linear algebra modules</td></tr>
+<tr class="alt"><td>\link SVD_Module SVD \endlink</td><td>\code#include <Eigen/SVD>\endcode</td><td>SVD decompositions with least-squares solver (JacobiSVD, BDCSVD)</td></tr>
+<tr            ><td>\link QR_Module QR \endlink</td><td>\code#include <Eigen/QR>\endcode</td><td>QR decomposition with solver (HouseholderQR, ColPivHouseholderQR, FullPivHouseholderQR)</td></tr>
+<tr class="alt"><td>\link Eigenvalues_Module Eigenvalues \endlink</td><td>\code#include <Eigen/Eigenvalues>\endcode</td><td>Eigenvalue, eigenvector decompositions (EigenSolver, SelfAdjointEigenSolver, ComplexEigenSolver)</td></tr>
+<tr            ><td>\link Sparse_Module Sparse \endlink</td><td>\code#include <Eigen/Sparse>\endcode</td><td>%Sparse matrix storage and related basic linear algebra (SparseMatrix, SparseVector) \n (see \ref SparseQuickRefPage for details on sparse modules)</td></tr>
+<tr class="alt"><td></td><td>\code#include <Eigen/Dense>\endcode</td><td>Includes Core, Geometry, LU, Cholesky, SVD, QR, and Eigenvalues header files</td></tr>
+<tr            ><td></td><td>\code#include <Eigen/Eigen>\endcode</td><td>Includes %Dense and %Sparse header files (the whole Eigen library)</td></tr>
 </table>
 
 <a href="#" class="top">top</a>
@@ -340,7 +340,7 @@ mat1 = mat2.adjoint();        mat1.adjointInPlace();
 \endcode
 </td></tr>
 <tr><td>
-\link MatrixBase::dot() dot \endlink product \n inner product \matrixworld</td><td>\code
+\link MatrixBase::dot dot \endlink product \n inner product \matrixworld</td><td>\code
 scalar = vec1.dot(vec2);
 scalar = col1.adjoint() * col2;
 scalar = (col1.adjoint() * col2).value();\endcode
@@ -364,32 +364,10 @@ vec3 = vec1.cross(vec2);\endcode</td></tr>
 
 <a href="#" class="top">top</a>
 \section QuickRef_Coeffwise Coefficient-wise \& Array operators
-Coefficient-wise operators for matrices and vectors:
-<table class="manual">
-<tr><th>Matrix API \matrixworld</th><th>Via Array conversions</th></tr>
-<tr><td>\code
-mat1.cwiseMin(mat2)
-mat1.cwiseMax(mat2)
-mat1.cwiseAbs2()
-mat1.cwiseAbs()
-mat1.cwiseSqrt()
-mat1.cwiseProduct(mat2)
-mat1.cwiseQuotient(mat2)\endcode
-</td><td>\code
-mat1.array().min(mat2.array())
-mat1.array().max(mat2.array())
-mat1.array().abs2()
-mat1.array().abs()
-mat1.array().sqrt()
-mat1.array() * mat2.array()
-mat1.array() / mat2.array()
-\endcode</td></tr>
-</table>
-
-It is also very simple to apply any user defined function \c foo using DenseBase::unaryExpr together with std::ptr_fun:
-\code mat1.unaryExpr(std::ptr_fun(foo))\endcode
 
-Array operators:\arrayworld
+In addition to the aforementioned operators, Eigen supports numerous coefficient-wise operator and functions.
+Most of them unambiguously makes sense in array-world\arrayworld. The following operators are readily available for arrays,
+or available through .array() for vectors and matrices:
 
 <table class="manual">
 <tr><td>Arithmetic operators</td><td>\code
@@ -400,28 +378,108 @@ array1 + scalar     array1 - scalar     array1 += scalar    array1 -= scalar
 array1 < array2     array1 > array2     array1 < scalar     array1 > scalar
 array1 <= array2    array1 >= array2    array1 <= scalar    array1 >= scalar
 array1 == array2    array1 != array2    array1 == scalar    array1 != scalar
+array1.min(array2)  array1.max(array2)  array1.min(scalar)  array1.max(scalar)
 \endcode</td></tr>
-<tr><td>Trigo, power, and \n misc functions \n and the STL variants</td><td>\code
-array1.min(array2)            
-array1.max(array2)            
+<tr><td>Trigo, power, and \n misc functions \n and the STL-like variants</td><td>\code
 array1.abs2()
 array1.abs()                  abs(array1)
 array1.sqrt()                 sqrt(array1)
 array1.log()                  log(array1)
+array1.log10()                log10(array1)
 array1.exp()                  exp(array1)
-array1.pow(exponent)          pow(array1,exponent)
+array1.pow(array2)            pow(array1,array2)
+array1.pow(scalar)            pow(array1,scalar)
+                              pow(scalar,array2)
 array1.square()
 array1.cube()
 array1.inverse()
+
 array1.sin()                  sin(array1)
 array1.cos()                  cos(array1)
 array1.tan()                  tan(array1)
 array1.asin()                 asin(array1)
 array1.acos()                 acos(array1)
+array1.atan()                 atan(array1)
+array1.sinh()                 sinh(array1)
+array1.cosh()                 cosh(array1)
+array1.tanh()                 tanh(array1)
+array1.arg()                  arg(array1)
+
+array1.floor()                floor(array1)
+array1.ceil()                 ceil(array1)
+array1.round()                round(aray1)
+
+array1.isFinite()             isfinite(array1)
+array1.isInf()                isinf(array1)
+array1.isNaN()                isnan(array1)
+\endcode
+</td></tr>
+</table>
+
+
+The following coefficient-wise operators are available for all kind of expressions (matrices, vectors, and arrays), and for both real or complex scalar types:
+
+<table class="manual">
+<tr><th>Eigen's API</th><th>STL-like APIs\arrayworld </th><th>Comments</th></tr>
+<tr><td>\code
+mat1.real()
+mat1.imag()
+mat1.conjugate()
+\endcode
+</td><td>\code
+real(array1)
+imag(array1)
+conj(array1)
+\endcode
+</td><td>
+\code
+ // read-write, no-op for real expressions
+ // read-only for real, read-write for complexes
+ // no-op for real expressions
 \endcode
 </td></tr>
 </table>
 
+Some coefficient-wise operators are readily available for for matrices and vectors through the following cwise* methods:
+<table class="manual">
+<tr><th>Matrix API \matrixworld</th><th>Via Array conversions</th></tr>
+<tr><td>\code
+mat1.cwiseMin(mat2)         mat1.cwiseMin(scalar)
+mat1.cwiseMax(mat2)         mat1.cwiseMax(scalar)
+mat1.cwiseAbs2()
+mat1.cwiseAbs()
+mat1.cwiseSqrt()
+mat1.cwiseInverse()
+mat1.cwiseProduct(mat2)
+mat1.cwiseQuotient(mat2)
+mat1.cwiseEqual(mat2)       mat1.cwiseEqual(scalar)
+mat1.cwiseNotEqual(mat2)
+\endcode
+</td><td>\code
+mat1.array().min(mat2.array())    mat1.array().min(scalar)
+mat1.array().max(mat2.array())    mat1.array().max(scalar)
+mat1.array().abs2()
+mat1.array().abs()
+mat1.array().sqrt()
+mat1.array().inverse()
+mat1.array() * mat2.array()
+mat1.array() / mat2.array()
+mat1.array() == mat2.array()      mat1.array() == scalar
+mat1.array() != mat2.array()
+\endcode</td></tr>
+</table>
+The main difference between the two API is that the one based on cwise* methods returns an expression in the matrix world,
+while the second one (based on .array()) returns an array expression.
+Recall that .array() has no cost, it only changes the available API and interpretation of the data.
+
+It is also very simple to apply any user defined function \c foo using DenseBase::unaryExpr together with <a href="http://en.cppreference.com/w/cpp/utility/functional/ptr_fun">std::ptr_fun</a> (c++03), <a href="http://en.cppreference.com/w/cpp/utility/functional/ref">std::ref</a> (c++11), or <a href="http://en.cppreference.com/w/cpp/language/lambda">lambdas</a> (c++11):
+\code
+mat1.unaryExpr(std::ptr_fun(foo));
+mat1.unaryExpr(std::ref(foo));
+mat1.unaryExpr([](double x) { return foo(x); });
+\endcode
+
+
 <a href="#" class="top">top</a>
 \section QuickRef_Reductions Reductions
 
diff --git a/doc/SparseLinearSystems.dox b/doc/SparseLinearSystems.dox
index c00be10d3..fc33b93e7 100644
--- a/doc/SparseLinearSystems.dox
+++ b/doc/SparseLinearSystems.dox
@@ -4,52 +4,87 @@ In Eigen, there are several methods available to solve linear systems when the c
 
 \eigenAutoToc
 
-\section TutorialSparseDirectSolvers Sparse solvers
+\section TutorialSparseSolverList List of sparse solvers
 
-%Eigen currently provides a limited set of built-in solvers, as well as wrappers to external solver libraries.
-They are summarized in the following table:
+%Eigen currently provides a wide set of built-in solvers, as well as wrappers to external solver libraries.
+They are summarized in the following tables:
+
+\subsection TutorialSparseSolverList_Direct Built-in direct solvers
 
 <table class="manual">
-<tr><th>Class</th><th>Module</th><th>Solver kind</th><th>Matrix kind</th><th>Features related to performance</th>
-    <th>Dependencies,License</th><th class="width20em"><p>Notes</p></th></tr>
-<tr><td>SimplicialLLT    </td><td>\link SparseCholesky_Module SparseCholesky \endlink</td><td>Direct LLt factorization</td><td>SPD</td><td>Fill-in reducing</td>
-    <td>built-in, LGPL</td>
+<tr><th>Class</th><th>Solver kind</th><th>Matrix kind</th><th>Features related to performance</th>
+    <th>License</th><th class="width20em"><p>Notes</p></th></tr>
+
+<tr><td>SimplicialLLT \n <tt>\#include<Eigen/\link SparseCholesky_Module SparseCholesky\endlink></tt></td><td>Direct LLt factorization</td><td>SPD</td><td>Fill-in reducing</td>
+    <td>LGPL</td>
     <td>SimplicialLDLT is often preferable</td></tr>
-<tr><td>SimplicialLDLT   </td><td>\link SparseCholesky_Module SparseCholesky \endlink</td><td>Direct LDLt factorization</td><td>SPD</td><td>Fill-in reducing</td>
-    <td>built-in, LGPL</td>
+
+<tr><td>SimplicialLDLT \n <tt>\#include<Eigen/\link SparseCholesky_Module SparseCholesky\endlink></tt></td><td>Direct LDLt factorization</td><td>SPD</td><td>Fill-in reducing</td>
+    <td>LGPL</td>
     <td>Recommended for very sparse and not too large problems (e.g., 2D Poisson eq.)</td></tr>
-<tr><td>ConjugateGradient</td><td>\link IterativeLinearSolvers_Module IterativeLinearSolvers \endlink</td><td>Classic iterative CG</td><td>SPD</td><td>Preconditionning</td>
-    <td>built-in, MPL2</td>
-    <td>Recommended for large symmetric problems (e.g., 3D Poisson eq.)</td></tr>
-<tr><td>BiCGSTAB</td><td>\link IterativeLinearSolvers_Module IterativeLinearSolvers \endlink</td><td>Iterative stabilized bi-conjugate gradient</td><td>Square</td><td>Preconditionning</td>
-    <td>built-in, MPL2</td>
-    <td>To speedup the convergence, try it with the \ref IncompleteLUT preconditioner.</td></tr>
-<tr><td>SparseLU</td> <td>\link SparseLU_Module SparseLU \endlink </td> <td>LU factorization </td>
+
+<tr><td>SparseLU \n <tt>\#include<Eigen/\link SparseLU_Module SparseLU\endlink></tt></td> <td>LU factorization </td>
     <td>Square </td><td>Fill-in reducing, Leverage fast dense algebra</td>
-    <td> built-in, MPL2</td> <td>optimized for small and large problems with irregular patterns </td></tr>
-<tr><td>SparseQR</td> <td>\link SparseQR_Module SparseQR \endlink</td> <td> QR factorization</td>
+    <td>MPL2</td>
+    <td>optimized for small and large problems with irregular patterns </td></tr>
+
+<tr><td>SparseQR \n <tt>\#include<Eigen/\link SparseQR_Module SparseQR\endlink></tt></td> <td> QR factorization</td>
     <td>Any, rectangular</td><td> Fill-in reducing</td>
-    <td>built-in, MPL2</td><td>recommended for least-square problems, has a basic rank-revealing feature</td></tr>
-<tr> <th colspan="7"> Wrappers to external solvers </th></tr>
+    <td>MPL2</td>
+    <td>recommended for least-square problems, has a basic rank-revealing feature</td></tr>
+ </table>
+
+\subsection TutorialSparseSolverList_Iterative Built-in iterative solvers
+
+<table class="manual">
+<tr><th>Class</th><th>Solver kind</th><th>Matrix kind</th><th>Supported preconditioners, [default]</th>
+    <th>License</th><th class="width20em"><p>Notes</p></th></tr>
+
+<tr><td>ConjugateGradient \n <tt>\#include<Eigen/\link IterativeLinearSolvers_Module IterativeLinearSolvers\endlink></tt></td> <td>Classic iterative CG</td><td>SPD</td>
+    <td>IdentityPreconditioner, [DiagonalPreconditioner], IncompleteCholesky</td>
+    <td>MPL2</td>
+    <td>Recommended for large symmetric problems (e.g., 3D Poisson eq.)</td></tr>
+
+<tr><td>LeastSquaresConjugateGradient \n <tt>\#include<Eigen/\link IterativeLinearSolvers_Module IterativeLinearSolvers\endlink></tt></td><td>CG for rectangular least-square problem</td><td>Rectangular</td>
+    <td>IdentityPreconditioner, [LeastSquareDiagonalPreconditioner]</td>
+    <td>MPL2</td>
+    <td>Solve for min |A'Ax-b|^2 without forming A'A</td></tr>
+
+<tr><td>BiCGSTAB \n <tt>\#include<Eigen/\link IterativeLinearSolvers_Module IterativeLinearSolvers\endlink></tt></td><td>Iterative stabilized bi-conjugate gradient</td><td>Square</td>
+    <td>IdentityPreconditioner, [DiagonalPreconditioner], IncompleteLUT</td>
+    <td>MPL2</td>
+    <td>To speedup the convergence, try it with the \ref IncompleteLUT preconditioner.</td></tr>
+</table>
+
+\subsection TutorialSparseSolverList_Wrapper Wrappers to external solvers
+
+<table class="manual">
+<tr><th>Class</th><th>Module</th><th>Solver kind</th><th>Matrix kind</th><th>Features related to performance</th>
+    <th>Dependencies,License</th><th class="width20em"><p>Notes</p></th></tr>
 <tr><td>PastixLLT \n PastixLDLT \n PastixLU</td><td>\link PaStiXSupport_Module PaStiXSupport \endlink</td><td>Direct LLt, LDLt, LU factorizations</td><td>SPD \n SPD \n Square</td><td>Fill-in reducing, Leverage fast dense algebra, Multithreading</td>
     <td>Requires the <a href="http://pastix.gforge.inria.fr">PaStiX</a> package, \b CeCILL-C </td>
     <td>optimized for tough problems and symmetric patterns</td></tr>
 <tr><td>CholmodSupernodalLLT</td><td>\link CholmodSupport_Module CholmodSupport \endlink</td><td>Direct LLt factorization</td><td>SPD</td><td>Fill-in reducing, Leverage fast dense algebra</td>
-    <td>Requires the <a href="http://www.cise.ufl.edu/research/sparse/SuiteSparse/">SuiteSparse</a> package, \b GPL </td>
+    <td>Requires the <a href="http://www.suitesparse.com">SuiteSparse</a> package, \b GPL </td>
     <td></td></tr>
 <tr><td>UmfPackLU</td><td>\link UmfPackSupport_Module UmfPackSupport \endlink</td><td>Direct LU factorization</td><td>Square</td><td>Fill-in reducing, Leverage fast dense algebra</td>
-    <td>Requires the <a href="http://www.cise.ufl.edu/research/sparse/SuiteSparse/">SuiteSparse</a> package, \b GPL </td>
+    <td>Requires the <a href="http://www.suitesparse.com">SuiteSparse</a> package, \b GPL </td>
     <td></td></tr>
 <tr><td>SuperLU</td><td>\link SuperLUSupport_Module SuperLUSupport \endlink</td><td>Direct LU factorization</td><td>Square</td><td>Fill-in reducing, Leverage fast dense algebra</td>
     <td>Requires the <a href="http://crd-legacy.lbl.gov/~xiaoye/SuperLU/">SuperLU</a> library, (BSD-like)</td>
     <td></td></tr>
 <tr><td>SPQR</td><td>\link SPQRSupport_Module SPQRSupport \endlink  </td> <td> QR factorization </td> 
     <td> Any, rectangular</td><td>fill-in reducing, multithreaded, fast dense algebra</td>
-    <td> requires the <a href="http://www.cise.ufl.edu/research/sparse/SuiteSparse/">SuiteSparse</a> package, \b GPL </td><td>recommended for linear least-squares problems, has a rank-revealing feature</tr>
+    <td> requires the <a href="http://www.suitesparse.com">SuiteSparse</a> package, \b GPL </td><td>recommended for linear least-squares problems, has a rank-revealing feature</tr>
+<tr><td>PardisoLLT \n PardisoLDLT \n PardisoLU</td><td>\link PardisoSupport_Module PardisoSupport \endlink</td><td>Direct LLt, LDLt, LU factorizations</td><td>SPD \n SPD \n Square</td><td>Fill-in reducing, Leverage fast dense algebra, Multithreading</td>
+    <td>Requires the <a href="http://eigen.tuxfamily.org/Counter/redirect_to_mkl.php">Intel MKL</a> package, \b Proprietary </td>
+    <td>optimized for tough problems patterns, see also \link TopicUsingIntelMKL using MKL with Eigen \endlink</td></tr>
 </table>
 
 Here \c SPD means symmetric positive definite.
 
+\section TutorialSparseSolverConcept Sparse solver concept
+
 All these solvers follow the same general concept.
 Here is a typical and general example:
 \code
@@ -101,8 +136,10 @@ x2 = solver.solve(b2);
 \endcode
 The compute() method is equivalent to calling both analyzePattern() and factorize().
 
-Finally, each solver provides some specific features, such as determinant, access to the factors, controls of the iterations, and so on.
-More details are availble in the documentations of the respective classes.
+Each solver provides some specific features, such as determinant, access to the factors, controls of the iterations, and so on.
+More details are available in the documentations of the respective classes.
+
+Finally, most of the iterative solvers, can also be used in a \b matrix-free context, see the following \link MatrixfreeSolverExample example \endlink.
 
 \section TheSparseCompute The Compute Step
 In the compute() function, the matrix is generally factorized: LLT for self-adjoint matrices, LDLT for general hermitian matrices, LU for non hermitian matrices and QR for rectangular matrices. These are the results of using direct solvers. For this class of solvers precisely, the compute step is further subdivided into analyzePattern() and factorize(). 
@@ -140,7 +177,16 @@ x2 = solver.solve(b2);
 For direct methods, the solution are computed at the machine precision. Sometimes, the solution need not be too accurate. In this case, the iterative methods are more suitable and the desired accuracy can be set before the solve step using \b setTolerance(). For all the available functions, please, refer to the documentation of the \link IterativeLinearSolvers_Module Iterative solvers module \endlink. 
 
 \section BenchmarkRoutine
-Most of the time, all you need is to know how much time it will take to qolve your system, and hopefully, what is the most suitable solver. In Eigen, we provide a benchmark routine that can be used for this purpose. It is very easy to use. In the build directory, navigate to bench/spbench and compile the routine by typing \b make \e spbenchsolver. Run it with --help option to get the list of all available options. Basically, the matrices to test should be in <a href="http://math.nist.gov/MatrixMarket/formats.html">MatrixMarket Coordinate format</a>, and the routine returns the statistics from all available solvers in Eigen. 
+Most of the time, all you need is to know how much time it will take to solve your system, and hopefully, what is the most suitable solver. In Eigen, we provide a benchmark routine that can be used for this purpose. It is very easy to use. In the build directory, navigate to bench/spbench and compile the routine by typing \b make \e spbenchsolver. Run it with --help option to get the list of all available options. Basically, the matrices to test should be in <a href="http://math.nist.gov/MatrixMarket/formats.html">MatrixMarket Coordinate format</a>, and the routine returns the statistics from all available solvers in Eigen.
+
+To export your matrices and right-hand-side vectors in the matrix-market format, you can the the unsupported SparseExtra module:
+\code
+#include <unsupported/Eigen/SparseExtra>
+...
+Eigen::saveMarket(A, "filename.mtx");
+Eigen::saveMarket(A, "filename_SPD.mtx", Eigen::Symmetric); // if A is symmetric-positive-definite
+Eigen::saveMarketVector(B, "filename_b.mtx");
+\endcode
 
 The following table gives an example of XML statistics from several Eigen built-in and external solvers. 
 <TABLE border="1">
diff --git a/doc/SparseQuickReference.dox b/doc/SparseQuickReference.dox
index d04ac35c5..a25622e80 100644
--- a/doc/SparseQuickReference.dox
+++ b/doc/SparseQuickReference.dox
@@ -21,7 +21,7 @@ i.e either row major or column major. The default is column major. Most arithmet
 <td> Resize/Reserve</td>
 <td> 
  \code
-    sm1.resize(m,n);      //Change sm1 to a m x n matrix. 
+    sm1.resize(m,n);      // Change sm1 to a m x n matrix.
     sm1.reserve(nnz);     // Allocate room for nnz nonzeros elements.   
   \endcode 
 </td>
@@ -151,10 +151,10 @@ It is easy to perform arithmetic operations on sparse matrices provided that the
 <td> Permutation </td>
 <td> 
 \code 
-perm.indices(); // Reference to the vector of indices
+perm.indices();      // Reference to the vector of indices
 sm1.twistedBy(perm); // Permute rows and columns
-sm2 = sm1 * perm; //Permute the columns
-sm2 = perm * sm1; // Permute the columns
+sm2 = sm1 * perm;    // Permute the columns
+sm2 = perm * sm1;    // Permute the columns
 \endcode 
 </td>
 <td> 
@@ -181,9 +181,9 @@ sm2 = perm * sm1; // Permute the columns
 
 \section sparseotherops Other supported operations
 <table class="manual">
-<tr><th>Operations</th> <th> Code </th> <th> Notes</th> </tr>
+<tr><th style="min-width:initial"> Code </th> <th> Notes</th> </tr>
+<tr><td colspan="2">Sub-matrices</td></tr>
 <tr>
-<td>Sub-matrices</td> 
 <td> 
 \code 
   sm1.block(startRow, startCol, rows, cols); 
@@ -193,25 +193,31 @@ sm2 = perm * sm1; // Permute the columns
   sm1.bottomLeftCorner( rows, cols);
   sm1.bottomRightCorner( rows, cols);
   \endcode
-</td> <td>  </td>
+</td><td>
+Contrary to dense matrices, here <strong>all these methods are read-only</strong>.\n
+See \ref TutorialSparse_SubMatrices and below for read-write sub-matrices.
+</td>
 </tr>
-<tr> 
-<td> Range </td>
+<tr class="alt"><td colspan="2"> Range </td></tr>
+<tr class="alt">
 <td> 
 \code 
-  sm1.innerVector(outer); 
-  sm1.innerVectors(start, size);
-  sm1.leftCols(size);
-  sm2.rightCols(size);
-  sm1.middleRows(start, numRows);
-  sm1.middleCols(start, numCols);
-  sm1.col(j);
+  sm1.innerVector(outer);           // RW
+  sm1.innerVectors(start, size);    // RW
+  sm1.leftCols(size);               // RW
+  sm2.rightCols(size);              // RO because sm2 is row-major
+  sm1.middleRows(start, numRows);   // RO because sm1 is column-major
+  sm1.middleCols(start, numCols);   // RW
+  sm1.col(j);                       // RW
 \endcode
 </td>
-<td>A inner vector is either a row (for row-major) or a column (for column-major). As stated earlier, the evaluation can be done in a matrix with different storage order </td>
+<td>
+A inner vector is either a row (for row-major) or a column (for column-major).\n
+As stated earlier, for a read-write sub-matrix (RW), the evaluation can be done in a matrix with different storage order.
+</td>
 </tr>
+<tr><td colspan="2"> Triangular and selfadjoint views</td></tr>
 <tr>
-<td> Triangular and selfadjoint views</td>
 <td> 
 \code
   sm2 = sm1.triangularview<Lower>();
@@ -222,26 +228,44 @@ sm2 = perm * sm1; // Permute the columns
 \code 
   \endcode </td>
 </tr>
-<tr> 
-<td>Triangular solve </td>
+<tr class="alt"><td colspan="2">Triangular solve </td></tr>
+<tr class="alt">
 <td> 
 \code 
  dv2 = sm1.triangularView<Upper>().solve(dv1);
- dv2 = sm1.topLeftCorner(size, size).triangularView<Lower>().solve(dv1);
+ dv2 = sm1.topLeftCorner(size, size)
+          .triangularView<Lower>().solve(dv1);
 \endcode 
 </td>
 <td> For general sparse solve, Use any suitable module described at \ref TopicSparseSystems </td>
 </tr>
+<tr><td colspan="2"> Low-level API</td></tr>
 <tr>
-<td> Low-level API</td>
 <td>
 \code
-sm1.valuePtr(); // Pointer to the values
-sm1.innerIndextr(); // Pointer to the indices.
-sm1.outerIndexPtr(); //Pointer to the beginning of each inner vector
+sm1.valuePtr();      // Pointer to the values
+sm1.innerIndextr();  // Pointer to the indices.
+sm1.outerIndexPtr(); // Pointer to the beginning of each inner vector
+\endcode
+</td>
+<td>
+If the matrix is not in compressed form, makeCompressed() should be called before.\n
+Note that these functions are mostly provided for interoperability purposes with external libraries.\n
+A better access to the values of the matrix is done by using the InnerIterator class as described in \link TutorialSparse the Tutorial Sparse \endlink section</td>
+</tr>
+<tr class="alt"><td colspan="2">Mapping external buffers</td></tr>
+<tr class="alt">
+<td>
+\code
+int outerIndexPtr[cols+1];
+int innerIndices[nnz];
+double values[nnz];
+Map<SparseMatrix<double> > sm1(rows,cols,nnz,outerIndexPtr, // read-write
+                               innerIndices,values);
+Map<const SparseMatrix<double> > sm2(...);                  // read-only
 \endcode
 </td>
-<td> If the matrix is not in compressed form, makeCompressed() should be called before. Note that these functions are mostly provided for interoperability purposes with external libraries. A better access to the values of the matrix is done by using the InnerIterator class as described in \link TutorialSparse the Tutorial Sparse \endlink section</td>
+<td>As for dense matrices, class Map<SparseMatrixType> can be used to see external buffers as an %Eigen's SparseMatrix object. </td>
 </tr>
 </table>
 */
diff --git a/doc/StlContainers.dox b/doc/StlContainers.dox
index d8d0d529c..e0f8714a9 100644
--- a/doc/StlContainers.dox
+++ b/doc/StlContainers.dox
@@ -4,7 +4,7 @@ namespace Eigen {
 
 \eigenAutoToc
 
-\section summary Executive summary
+\section StlContainers_summary Executive summary
 
 Using STL containers on \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types", or classes having members of such types, requires taking the following two steps:
 
@@ -28,7 +28,7 @@ std::map<int, Eigen::Vector4f, std::less<int>,
 \endcode
 Note that the third parameter "std::less<int>" is just the default value, but we have to include it because we want to specify the fourth parameter, which is the allocator type.
 
-\section vector The case of std::vector
+\section StlContainers_vector The case of std::vector
 
 The situation with std::vector was even worse (explanation below) so we had to specialize it for the Eigen::aligned_allocator type. In practice you \b must use the Eigen::aligned_allocator (not another aligned allocator), \b and \#include <Eigen/StdVector>.
 
diff --git a/doc/StructHavingEigenMembers.dox b/doc/StructHavingEigenMembers.dox
index 74a8d5217..7fbed0eb0 100644
--- a/doc/StructHavingEigenMembers.dox
+++ b/doc/StructHavingEigenMembers.dox
@@ -4,11 +4,11 @@ namespace Eigen {
 
 \eigenAutoToc
 
-\section summary Executive Summary
+\section StructHavingEigenMembers_summary Executive Summary
 
-If you define a structure having members of \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types", you must overload its "operator new" so that it generates 16-bytes-aligned pointers. Fortunately, Eigen provides you with a macro EIGEN_MAKE_ALIGNED_OPERATOR_NEW that does that for you.
+If you define a structure having members of \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types", you must overload its "operator new" so that it generates 16-bytes-aligned pointers. Fortunately, %Eigen provides you with a macro EIGEN_MAKE_ALIGNED_OPERATOR_NEW that does that for you.
 
-\section what What kind of code needs to be changed?
+\section StructHavingEigenMembers_what What kind of code needs to be changed?
 
 The kind of code that needs to be changed is this:
 
@@ -27,7 +27,7 @@ Foo *foo = new Foo;
 
 In other words: you have a class that has as a member a \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen object", and then you dynamically create an object of that class.
 
-\section how How should such code be modified?
+\section StructHavingEigenMembers_how How should such code be modified?
 
 Very easy, you just need to put a EIGEN_MAKE_ALIGNED_OPERATOR_NEW macro in a public part of your class, like this:
 
@@ -48,9 +48,9 @@ Foo *foo = new Foo;
 
 This macro makes "new Foo" always return an aligned pointer.
 
-If this approach is too intrusive, see also the \ref othersolutions.
+If this approach is too intrusive, see also the \ref StructHavingEigenMembers_othersolutions "other solutions".
 
-\section why Why is this needed?
+\section StructHavingEigenMembers_why Why is this needed?
 
 OK let's say that your code looks like this:
 
@@ -67,7 +67,7 @@ class Foo
 Foo *foo = new Foo;
 \endcode
 
-A Eigen::Vector2d consists of 2 doubles, which is 128 bits. Which is exactly the size of a SSE packet, which makes it possible to use SSE for all sorts of operations on this vector. But SSE instructions (at least the ones that Eigen uses, which are the fast ones) require 128-bit alignment. Otherwise you get a segmentation fault.
+A Eigen::Vector2d consists of 2 doubles, which is 128 bits. Which is exactly the size of a SSE packet, which makes it possible to use SSE for all sorts of operations on this vector. But SSE instructions (at least the ones that %Eigen uses, which are the fast ones) require 128-bit alignment. Otherwise you get a segmentation fault.
 
 For this reason, Eigen takes care by itself to require 128-bit alignment for Eigen::Vector2d, by doing two things:
 \li Eigen requires 128-bit alignment for the Eigen::Vector2d's array (of 2 doubles). With GCC, this is done with a __attribute__ ((aligned(16))).
@@ -81,7 +81,7 @@ The alignment attribute of the member v is then relative to the start of the cla
 
 The solution is to let class Foo have an aligned "operator new", as we showed in the previous section.
 
-\section movetotop Should I then put all the members of Eigen types at the beginning of my class?
+\section StructHavingEigenMembers_movetotop Should I then put all the members of Eigen types at the beginning of my class?
 
 That's not required. Since Eigen takes care of declaring 128-bit alignment, all members that need it are automatically 128-bit aligned relatively to the class. So code like this works fine:
 
@@ -95,15 +95,15 @@ public:
 };
 \endcode
 
-\section dynamicsize What about dynamic-size matrices and vectors?
+\section StructHavingEigenMembers_dynamicsize What about dynamic-size matrices and vectors?
 
 Dynamic-size matrices and vectors, such as Eigen::VectorXd, allocate dynamically their own array of coefficients, so they take care of requiring absolute alignment automatically. So they don't cause this issue. The issue discussed here is only with \ref TopicFixedSizeVectorizable  "fixed-size vectorizable matrices and vectors".
 
-\section bugineigen So is this a bug in Eigen?
+\section StructHavingEigenMembers_bugineigen So is this a bug in Eigen?
 
 No, it's not our bug. It's more like an inherent problem of the C++98 language specification, and seems to be taken care of in the upcoming language revision: <a href="http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2007/n2341.pdf">see this document</a>.
 
-\section conditional What if I want to do this conditionnally (depending on template parameters) ?
+\section StructHavingEigenMembers_conditional What if I want to do this conditionnally (depending on template parameters) ?
 
 For this situation, we offer the macro EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign). It will generate aligned operators like EIGEN_MAKE_ALIGNED_OPERATOR_NEW if NeedsToAlign is true. It will generate operators with the default alignment if NeedsToAlign is false.
 
@@ -128,7 +128,7 @@ Foo<3> *foo3 = new Foo<3>; // foo3 has only the system default alignment guarant
 \endcode
 
 
-\section othersolutions Other solutions
+\section StructHavingEigenMembers_othersolutions Other solutions
 
 In case putting the EIGEN_MAKE_ALIGNED_OPERATOR_NEW macro everywhere is too intrusive, there exists at least two other solutions.
 
diff --git a/doc/TemplateKeyword.dox b/doc/TemplateKeyword.dox
index f4e4f237e..b84cfdae9 100644
--- a/doc/TemplateKeyword.dox
+++ b/doc/TemplateKeyword.dox
@@ -5,7 +5,8 @@ namespace Eigen {
 There are two uses for the \c template and \c typename keywords in C++. One of them is fairly well known
 amongst programmers: to define templates. The other use is more obscure: to specify that an expression refers
 to a template function or a type. This regularly trips up programmers that use the %Eigen library, often
-leading to error messages from the compiler that are difficult to understand.
+leading to error messages from the compiler that are difficult to understand, such as "expected expression" or
+"no match for operator<".
 
 \eigenAutoToc
 
@@ -72,23 +73,23 @@ for operator<".
 The reason that the \c template keyword is necessary in the last example has to do with the rules for how
 templates are supposed to be compiled in C++. The compiler has to check the code for correct syntax at the
 point where the template is defined, without knowing the actual value of the template arguments (\c Derived1
-and \c Derived2 in the example). That means that the compiler cannot know that <tt>dst.triangularPart</tt> is
+and \c Derived2 in the example). That means that the compiler cannot know that <tt>dst.triangularView</tt> is
 a member template and that the following &lt; symbol is part of the delimiter for the template
-parameter. Another possibility would be that <tt>dst.triangularPart</tt> is a member variable with the &lt;
+parameter. Another possibility would be that <tt>dst.triangularView</tt> is a member variable with the &lt;
 symbol refering to the <tt>operator&lt;()</tt> function. In fact, the compiler should choose the second
-possibility, according to the standard. If <tt>dst.triangularPart</tt> is a member template (as in our case),
+possibility, according to the standard. If <tt>dst.triangularView</tt> is a member template (as in our case),
 the programmer should specify this explicitly with the \c template keyword and write <tt>dst.template
-triangularPart</tt>.
+triangularView</tt>.
 
 The precise rules are rather complicated, but ignoring some subtleties we can summarize them as follows:
 - A <em>dependent name</em> is name that depends (directly or indirectly) on a template parameter. In the
   example, \c dst is a dependent name because it is of type <tt>MatrixBase&lt;Derived1&gt;</tt> which depends
   on the template parameter \c Derived1.
-- If the code contains either one of the contructions <tt>xxx.yyy</tt> or <tt>xxx-&gt;yyy</tt> and \c xxx is a
+- If the code contains either one of the constructs <tt>xxx.yyy</tt> or <tt>xxx-&gt;yyy</tt> and \c xxx is a
   dependent name and \c yyy refers to a member template, then the \c template keyword must be used before 
   \c yyy, leading to <tt>xxx.template yyy</tt> or <tt>xxx-&gt;template yyy</tt>.
-- If the code contains the contruction <tt>xxx::yyy</tt> and \c xxx is a dependent name and \c yyy refers to a
-  member typedef, then the \c typename keyword must be used before the whole construction, leading to
+- If the code contains the construct <tt>xxx::yyy</tt> and \c xxx is a dependent name and \c yyy refers to a
+  member typedef, then the \c typename keyword must be used before the whole construct, leading to
   <tt>typename xxx::yyy</tt>.
 
 As an example where the \c typename keyword is required, consider the following code in \ref TutorialSparse
diff --git a/doc/TopicAliasing.dox b/doc/TopicAliasing.dox
index c2654aed2..a8f164428 100644
--- a/doc/TopicAliasing.dox
+++ b/doc/TopicAliasing.dox
@@ -153,10 +153,11 @@ not necessary to evaluate the right-hand side explicitly.
 
 \section TopicAliasingMatrixMult Aliasing and matrix multiplication
 
-Matrix multiplication is the only operation in %Eigen that assumes aliasing by default. Thus, if \c matA is a
-matrix, then the statement <tt>matA = matA * matA;</tt> is safe. All other operations in %Eigen assume that
-there are no aliasing problems, either because the result is assigned to a different matrix or because it is a
-component-wise operation.
+Matrix multiplication is the only operation in %Eigen that assumes aliasing by default, <strong>under the
+condition that the destination matrix is not resized</strong>.
+Thus, if \c matA is a \b squared matrix, then the statement <tt>matA = matA * matA;</tt> is safe.
+All other operations in %Eigen assume that there are no aliasing problems,
+either because the result is assigned to a different matrix or because it is a component-wise operation.
 
 <table class="example">
 <tr><th>Example</th><th>Output</th></tr>
@@ -198,6 +199,27 @@ may get wrong results:
 \verbinclude TopicAliasing_mult3.out
 </td></tr></table>
 
+Moreover, starting in Eigen 3.3, aliasing is \b not assumed if the destination matrix is resized and the product is not directly assigned to the destination.
+Therefore, the following example is also wrong:
+
+<table class="example">
+<tr><th>Example</th><th>Output</th></tr>
+<tr><td>
+\include TopicAliasing_mult4.cpp
+</td>
+<td>
+\verbinclude TopicAliasing_mult4.out
+</td></tr></table>
+
+As for any aliasing issue, you can resolve it by explicitly evaluating the expression prior to assignment:
+<table class="example">
+<tr><th>Example</th><th>Output</th></tr>
+<tr><td>
+\include TopicAliasing_mult5.cpp
+</td>
+<td>
+\verbinclude TopicAliasing_mult5.out
+</td></tr></table>
 
 \section TopicAliasingSummary Summary
 
diff --git a/doc/TopicAssertions.dox b/doc/TopicAssertions.dox
index 4ead40174..c8b4d84f2 100644
--- a/doc/TopicAssertions.dox
+++ b/doc/TopicAssertions.dox
@@ -16,7 +16,7 @@ Both eigen_assert and eigen_plain_assert are defined in Macros.h. Defining eigen
 #include <stdexcept>
 #undef eigen_assert
 #define eigen_assert(x) \
-  if (!x) { throw (std::runtime_error("Put your message here")); }
+  if (!(x)) { throw (std::runtime_error("Put your message here")); }
 \endcode
 
 \subsection DisableAssert Disabling assertions
diff --git a/doc/TopicCMakeGuide.dox b/doc/TopicCMakeGuide.dox
new file mode 100644
index 000000000..896cfa831
--- /dev/null
+++ b/doc/TopicCMakeGuide.dox
@@ -0,0 +1,52 @@
+namespace Eigen {
+
+/**
+
+\page TopicCMakeGuide Using %Eigen in CMake Projects
+
+%Eigen provides native CMake support which allows the library to be easily
+used in CMake projects.
+
+\note %CMake 3.0 (or later) is required to enable this functionality.
+
+%Eigen exports a CMake target called `Eigen3::Eigen` which can be imported
+using the `find_package` CMake command and used by calling
+`target_link_libraries` as in the following example:
+\code{.cmake}
+cmake_minimum_required (VERSION 3.0)
+project (myproject)
+
+find_package (Eigen3 3.3 REQUIRED NO_MODULE)
+
+add_executable (example example.cpp)
+target_link_libraries (example Eigen3::Eigen)
+\endcode
+
+The above code snippet must be placed in a file called `CMakeLists.txt` alongside
+`example.cpp`. After running
+\code{.sh}
+$ cmake path-to-example-directory
+\endcode
+CMake will produce project files that generate an executable called `example`
+which requires at least version 3.3 of %Eigen. Here, `path-to-example-directory`
+is the path to the directory that contains both `CMakeLists.txt` and
+`example.cpp`.
+
+If you have multiple installed version of %Eigen, you can pick your favorite one by setting the \c Eigen3_DIR cmake's variable to the respective path containing the \c Eigen3*.cmake files. For instance:
+\code
+cmake path-to-example-directory -DEigen3_DIR=$HOME/mypackages/share/eigen3/cmake/
+\endcode
+
+If the `REQUIRED` option is omitted when locating %Eigen using
+`find_package`, one can check whether the package was found as follows:
+\code{.cmake}
+find_package (Eigen3 3.3 NO_MODULE)
+
+if (TARGET Eigen3::Eigen)
+  # Use the imported target
+endif (TARGET Eigen3::Eigen)
+\endcode
+
+*/
+
+}
diff --git a/doc/TopicLazyEvaluation.dox b/doc/TopicLazyEvaluation.dox
index 393bc41d8..101ef8c72 100644
--- a/doc/TopicLazyEvaluation.dox
+++ b/doc/TopicLazyEvaluation.dox
@@ -36,7 +36,7 @@ Here is now a more involved example:
 
 Eigen chooses lazy evaluation at every stage in that example, which is clearly the correct choice. In fact, lazy evaluation is the "default choice" and Eigen will choose it except in a few circumstances.
 
-<b>The first circumstance</b> in which Eigen chooses immediate evaluation, is when it sees an assignment <tt>a = b;</tt> and the expression \c b has the evaluate-before-assigning \link flags flag\endlink. The most important example of such an expression is the \link GeneralProduct matrix product expression\endlink. For example, when you do
+<b>The first circumstance</b> in which Eigen chooses immediate evaluation, is when it sees an assignment <tt>a = b;</tt> and the expression \c b has the evaluate-before-assigning \link flags flag\endlink. The most important example of such an expression is the \link Product matrix product expression\endlink. For example, when you do
 
 \code matrix = matrix * matrix; \endcode
 
@@ -48,7 +48,7 @@ What if you know that the result does no alias the operand of the product and wa
 
 Here, since we know that matrix2 is not the same matrix as matrix1, we know that lazy evaluation is not dangerous, so we may force lazy evaluation. Concretely, the effect of noalias() here is to bypass the evaluate-before-assigning \link flags flag\endlink.
 
-<b>The second circumstance</b> in which Eigen chooses immediate evaluation, is when it sees a nested expression such as <tt>a + b</tt> where \c b is already an expression having the evaluate-before-nesting \link flags flag\endlink. Again, the most important example of such an expression is the \link GeneralProduct matrix product expression\endlink. For example, when you do
+<b>The second circumstance</b> in which Eigen chooses immediate evaluation, is when it sees a nested expression such as <tt>a + b</tt> where \c b is already an expression having the evaluate-before-nesting \link flags flag\endlink. Again, the most important example of such an expression is the \link Product matrix product expression\endlink. For example, when you do
 
 \code matrix1 = matrix2 + matrix3 * matrix4; \endcode
 
diff --git a/doc/TopicLinearAlgebraDecompositions.dox b/doc/TopicLinearAlgebraDecompositions.dox
index 8649cc27b..491470627 100644
--- a/doc/TopicLinearAlgebraDecompositions.dox
+++ b/doc/TopicLinearAlgebraDecompositions.dox
@@ -4,6 +4,7 @@ namespace Eigen {
 
 This page presents a catalogue of the dense matrix decompositions offered by Eigen.
 For an introduction on linear solvers and decompositions, check this \link TutorialLinearAlgebra page \endlink.
+To get an overview of the true relative speed of the different decomposition, check this \link DenseDecompositionBenchmark benchmark \endlink.
 
 \section TopicLinAlgBigTable Catalogue of decompositions offered by Eigen
 
@@ -116,7 +117,7 @@ For an introduction on linear solvers and decompositions, check this \link Tutor
         <td>JacobiSVD (two-sided)</td>
         <td>-</td>
         <td>Slow (but fast for small matrices)</td>
-        <td>Excellent-Proven<sup><a href="#note3">3</a></sup></td>
+        <td>Proven<sup><a href="#note3">3</a></sup></td>
         <td>Yes</td>
         <td>Singular values/vectors, least squares</td>
         <td>Yes (and does least squares)</td>
@@ -132,7 +133,7 @@ For an introduction on linear solvers and decompositions, check this \link Tutor
         <td>Yes</td>
         <td>Eigenvalues/vectors</td>
         <td>-</td>
-        <td>Good</td>
+        <td>Excellent</td>
         <td><em>Closed forms for 2x2 and 3x3</em></td>
     </tr>
 
@@ -249,13 +250,14 @@ For an introduction on linear solvers and decompositions, check this \link Tutor
   <dt><b>Implicit Multi Threading (MT)</b></dt>
     <dd>Means the algorithm can take advantage of multicore processors via OpenMP. "Implicit" means the algortihm itself is not parallelized, but that it relies on parallelized matrix-matrix product rountines.</dd>
   <dt><b>Explicit Multi Threading (MT)</b></dt>
-    <dd>Means the algorithm is explicitely parallelized to take advantage of multicore processors via OpenMP.</dd>
+    <dd>Means the algorithm is explicitly parallelized to take advantage of multicore processors via OpenMP.</dd>
   <dt><b>Meta-unroller</b></dt>
     <dd>Means the algorithm is automatically and explicitly unrolled for very small fixed size matrices.</dd>
   <dt><b></b></dt>
     <dd></dd>
 </dl>
 
+
 */
 
 }
diff --git a/doc/TopicMultithreading.dox b/doc/TopicMultithreading.dox
index 7db2b0e07..47c9b261f 100644
--- a/doc/TopicMultithreading.dox
+++ b/doc/TopicMultithreading.dox
@@ -8,13 +8,13 @@ Some Eigen's algorithms can exploit the multiple cores present in your hardware.
  * GCC: \c -fopenmp
  * ICC: \c -openmp
  * MSVC: check the respective option in the build properties.
-You can control the number of thread that will be used using either the OpenMP API or Eiegn's API using the following priority:
+You can control the number of thread that will be used using either the OpenMP API or Eigen's API using the following priority:
 \code
  OMP_NUM_THREADS=n ./my_program
  omp_set_num_threads(n);
  Eigen::setNbThreads(n);
 \endcode
-Unless setNbThreads has been called, Eigen uses the number of threads specified by OpenMP. You can restore this bahavior by calling \code setNbThreads(0); \endcode
+Unless setNbThreads has been called, Eigen uses the number of threads specified by OpenMP. You can restore this behavior by calling \code setNbThreads(0); \endcode
 You can query the number of threads that will be used with:
 \code
 n = Eigen::nbThreads( );
@@ -22,8 +22,12 @@ n = Eigen::nbThreads( );
 You can disable Eigen's multi threading at compile time by defining the EIGEN_DONT_PARALLELIZE preprocessor token.
 
 Currently, the following algorithms can make use of multi-threading:
- * general matrix - matrix products
- * PartialPivLU
+ - general dense matrix - matrix products
+ - PartialPivLU
+ - row-major-sparse * dense vector/matrix products
+ - ConjugateGradient with \c Lower|Upper as the \c UpLo template parameter.
+ - BiCGSTAB with a row-major sparse matrix format.
+ - LeastSquaresConjugateGradient
 
 \section TopicMultiThreading_UsingEigenWithMT Using Eigen in a multi-threaded application
 
@@ -39,6 +43,10 @@ int main(int argc, char** argv)
 }
 \endcode
 
+\note With Eigen 3.3, and a fully C++11 compliant compiler (i.e., <a href="http://en.cppreference.com/w/cpp/language/storage_duration#Static_local_variables">thread-safe static local variable initialization</a>), then calling \c initParallel() is optional.
+
+\warning note that all functions generating random matrices are \b not re-entrant nor thread-safe. Those include DenseBase::Random(), and DenseBase::setRandom() despite a call to Eigen::initParallel(). This is because these functions are based on std::rand which is not re-entrant. For thread-safe random generator, we recommend the use of boost::random or c++11 random feature.
+
 In the case your application is parallelized with OpenMP, you might want to disable Eigen's own parallization as detailed in the previous section.
 
 */
diff --git a/doc/TutorialArrayClass.dox b/doc/TutorialArrayClass.dox
index 6432684aa..f6f351091 100644
--- a/doc/TutorialArrayClass.dox
+++ b/doc/TutorialArrayClass.dox
@@ -157,7 +157,7 @@ The following example shows how to use array operations on a Matrix object by em
 * to multiply them coefficient-wise and assigns the result to the matrix variable \c result (this is legal
 because Eigen allows assigning array expressions to matrix variables). 
 
-As a matter of fact, this usage case is so common that Eigen provides a \link MatrixBase::cwiseProduct() const
+As a matter of fact, this usage case is so common that Eigen provides a \link MatrixBase::cwiseProduct const
 .cwiseProduct(.) \endlink method for matrices to compute the coefficient-wise product. This is also shown in
 the example program.
 
diff --git a/doc/TutorialGeometry.dox b/doc/TutorialGeometry.dox
index 372a275de..2e1420f98 100644
--- a/doc/TutorialGeometry.dox
+++ b/doc/TutorialGeometry.dox
@@ -126,11 +126,12 @@ Apply the transformation to a \b vector </td><td>\code
 VectorNf vec1, vec2;
 vec2 = t.linear() * vec1;\endcode</td></tr>
 <tr><td>
-Apply a \em general transformation \n to a \b normal \b vector
-(<a href="http://femto.cs.uiuc.edu/faqs/cga-faq.html#S5.27">explanations</a>)</td><td>\code
+Apply a \em general transformation \n to a \b normal \b vector \n
+</td><td>\code
 VectorNf n1, n2;
 MatrixNf normalMatrix = t.linear().inverse().transpose();
 n2 = (normalMatrix * n1).normalized();\endcode</td></tr>
+<tr><td colspan="2">(See subject 5.27 of this <a href="http://www.faqs.org/faqs/graphics/algorithms-faq">faq</a> for the explanations)</td></tr>
 <tr class="alt"><td>
 Apply a transformation with \em pure \em rotation \n to a \b normal \b vector
 (no scaling, no shear)</td><td>\code
@@ -231,8 +232,8 @@ On the other hand, since there exist 24 different conventions, they are pretty c
 to create a rotation matrix according to the 2-1-2 convention.</td><td>\code
 Matrix3f m;
 m = AngleAxisf(angle1, Vector3f::UnitZ())
-*  * AngleAxisf(angle2, Vector3f::UnitY())
-*  * AngleAxisf(angle3, Vector3f::UnitZ());
+  * AngleAxisf(angle2, Vector3f::UnitY())
+  * AngleAxisf(angle3, Vector3f::UnitZ());
 \endcode</td></tr>
 </table>
 
diff --git a/doc/TutorialLinearAlgebra.dox b/doc/TutorialLinearAlgebra.dox
index b09f3543e..cb92ceeae 100644
--- a/doc/TutorialLinearAlgebra.dox
+++ b/doc/TutorialLinearAlgebra.dox
@@ -40,8 +40,9 @@ depending on your matrix and the trade-off you want to make:
     <tr>
         <th>Decomposition</th>
         <th>Method</th>
-        <th>Requirements on the matrix</th>
-        <th>Speed</th>
+        <th>Requirements<br/>on the matrix</th>
+        <th>Speed<br/> (small-to-medium)</th>
+        <th>Speed<br/> (large)</th>
         <th>Accuracy</th>
     </tr>
     <tr>
@@ -49,6 +50,7 @@ depending on your matrix and the trade-off you want to make:
         <td>partialPivLu()</td>
         <td>Invertible</td>
         <td>++</td>
+        <td>++</td>
         <td>+</td>
     </tr>
     <tr class="alt">
@@ -56,6 +58,7 @@ depending on your matrix and the trade-off you want to make:
         <td>fullPivLu()</td>
         <td>None</td>
         <td>-</td>
+        <td>- -</td>
         <td>+++</td>
     </tr>
     <tr>
@@ -63,20 +66,23 @@ depending on your matrix and the trade-off you want to make:
         <td>householderQr()</td>
         <td>None</td>
         <td>++</td>
+        <td>++</td>
         <td>+</td>
     </tr>
     <tr class="alt">
         <td>ColPivHouseholderQR</td>
         <td>colPivHouseholderQr()</td>
         <td>None</td>
-        <td>+</td>
         <td>++</td>
+        <td>-</td>
+        <td>+++</td>
     </tr>
     <tr>
         <td>FullPivHouseholderQR</td>
         <td>fullPivHouseholderQr()</td>
         <td>None</td>
         <td>-</td>
+        <td>- -</td>
         <td>+++</td>
     </tr>
     <tr class="alt">
@@ -84,21 +90,31 @@ depending on your matrix and the trade-off you want to make:
         <td>llt()</td>
         <td>Positive definite</td>
         <td>+++</td>
+        <td>+++</td>
         <td>+</td>
     </tr>
     <tr>
         <td>LDLT</td>
         <td>ldlt()</td>
-        <td>Positive or negative semidefinite</td>
+        <td>Positive or negative<br/> semidefinite</td>
         <td>+++</td>
+        <td>+</td>
         <td>++</td>
     </tr>
+    <tr class="alt">
+        <td>JacobiSVD</td>
+        <td>jacobiSvd()</td>
+        <td>None</td>
+        <td>- -</td>
+        <td>- - -</td>
+        <td>+++</td>
+    </tr>
 </table>
 
 All of these decompositions offer a solve() method that works as in the above example.
 
 For example, if your matrix is positive definite, the above table says that a very good
-choice is then the LDLT decomposition. Here's an example, also demonstrating that using a general
+choice is then the LLT or LDLT decomposition. Here's an example, also demonstrating that using a general
 matrix (not a vector) as right hand side is possible.
 
 <table class="example">
@@ -167,8 +183,8 @@ Here is an example:
 
 \section TutorialLinAlgLeastsquares Least squares solving
 
-The best way to do least squares solving is with a SVD decomposition. Eigen provides one as the JacobiSVD class, and its solve()
-is doing least-squares solving.
+The most accurate method to do least squares solving is with a SVD decomposition. Eigen provides one
+as the JacobiSVD class, and its solve() is doing least-squares solving.
 
 Here is an example:
 <table class="example">
@@ -179,9 +195,10 @@ Here is an example:
 </tr>
 </table>
 
-Another way, potentially faster but less reliable, is to use a LDLT decomposition
-of the normal matrix. In any case, just read any reference text on least squares, and it will be very easy for you
-to implement any linear least squares computation on top of Eigen.
+Another methods, potentially faster but less reliable, are to use a Cholesky decomposition of the
+normal matrix or a QR decomposition. Our page on \link LeastSquares least squares solving \endlink
+has more details.
+
 
 \section TutorialLinAlgSeparateComputation Separating the computation from the construction
 
diff --git a/doc/TutorialReductionsVisitorsBroadcasting.dox b/doc/TutorialReductionsVisitorsBroadcasting.dox
index 992cf6f34..f5322b4a6 100644
--- a/doc/TutorialReductionsVisitorsBroadcasting.dox
+++ b/doc/TutorialReductionsVisitorsBroadcasting.dox
@@ -32,7 +32,7 @@ Eigen also provides the \link MatrixBase::norm() norm() \endlink method, which r
 
 These operations can also operate on matrices; in that case, a n-by-p matrix is seen as a vector of size (n*p), so for example the \link MatrixBase::norm() norm() \endlink method returns the "Frobenius" or "Hilbert-Schmidt" norm. We refrain from speaking of the \f$\ell^2\f$ norm of a matrix because that can mean different things.
 
-If you want other \f$\ell^p\f$ norms, use the \link MatrixBase::lpNorm() lpNnorm<p>() \endlink method. The template parameter \a p can take the special value \a Infinity if you want the \f$\ell^\infty\f$ norm, which is the maximum of the absolute values of the coefficients.
+If you want other coefficient-wise \f$\ell^p\f$ norms, use the \link MatrixBase::lpNorm lpNorm<p>() \endlink method. The template parameter \a p can take the special value \a Infinity if you want the \f$\ell^\infty\f$ norm, which is the maximum of the absolute values of the coefficients.
 
 The following example demonstrates these methods.
 
@@ -45,6 +45,17 @@ The following example demonstrates these methods.
 \verbinclude Tutorial_ReductionsVisitorsBroadcasting_reductions_norm.out
 </td></tr></table>
 
+\b Operator \b norm: The 1-norm and \f$\infty\f$-norm <a href="https://en.wikipedia.org/wiki/Operator_norm">matrix operator norms</a> can easily be computed as follows:
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Tutorial_ReductionsVisitorsBroadcasting_reductions_operatornorm.cpp
+</td>
+<td>
+\verbinclude Tutorial_ReductionsVisitorsBroadcasting_reductions_operatornorm.out
+</td></tr></table>
+See below for more explanations on the syntax of these expressions.
+
 \subsection TutorialReductionsVisitorsBroadcastingReductionsBool Boolean reductions
 
 The following reductions operate on boolean values:
@@ -79,7 +90,7 @@ Array.
 
 The arguments passed to a visitor are pointers to the variables where the
 row and column position are to be stored. These variables should be of type
-\link DenseBase::Index Index \endlink, as shown below:
+\link Eigen::Index Index \endlink, as shown below:
 
 <table class="example">
 <tr><th>Example:</th><th>Output:</th></tr>
@@ -90,17 +101,16 @@ row and column position are to be stored. These variables should be of type
 \verbinclude Tutorial_ReductionsVisitorsBroadcasting_visitors.out
 </td></tr></table>
 
-Note that both functions also return the value of the minimum or maximum coefficient if needed,
-as if it was a typical reduction operation.
+Both functions also return the value of the minimum or maximum coefficient.
 
 \section TutorialReductionsVisitorsBroadcastingPartialReductions Partial reductions
 Partial reductions are reductions that can operate column- or row-wise on a Matrix or 
 Array, applying the reduction operation on each column or row and 
-returning a column or row-vector with the corresponding values. Partial reductions are applied 
+returning a column or row vector with the corresponding values. Partial reductions are applied 
 with \link DenseBase::colwise() colwise() \endlink or \link DenseBase::rowwise() rowwise() \endlink.
 
 A simple example is obtaining the maximum of the elements 
-in each column in a given matrix, storing the result in a row-vector:
+in each column in a given matrix, storing the result in a row vector:
 
 <table class="example">
 <tr><th>Example:</th><th>Output:</th></tr>
@@ -122,8 +132,7 @@ The same operation can be performed row-wise:
 \verbinclude Tutorial_ReductionsVisitorsBroadcasting_rowwise.out
 </td></tr></table>
 
-<b>Note that column-wise operations return a 'row-vector' while row-wise operations
-return a 'column-vector'</b>
+<b>Note that column-wise operations return a row vector, while row-wise operations return a column vector.</b>
 
 \subsection TutorialReductionsVisitorsBroadcastingPartialReductionsCombined Combining partial reductions with other operations
 It is also possible to use the result of a partial reduction to do further processing.
@@ -165,7 +174,7 @@ The concept behind broadcasting is similar to partial reductions, with the diffe
 constructs an expression where a vector (column or row) is interpreted as a matrix by replicating it in 
 one direction.
 
-A simple example is to add a certain column-vector to each column in a matrix. 
+A simple example is to add a certain column vector to each column in a matrix. 
 This can be accomplished with:
 
 <table class="example">
@@ -242,7 +251,7 @@ is a new matrix whose size is the same as matrix <tt>m</tt>: \f[
 \f]
 
   - <tt>(m.colwise() - v).colwise().squaredNorm()</tt> is a partial reduction, computing the squared norm column-wise. The result of
-this operation is a row-vector where each coefficient is the squared Euclidean distance between each column in <tt>m</tt> and <tt>v</tt>: \f[
+this operation is a row vector where each coefficient is the squared Euclidean distance between each column in <tt>m</tt> and <tt>v</tt>: \f[
   \mbox{(m.colwise() - v).colwise().squaredNorm()} =
   \begin{bmatrix}
      1 & 505 & 32 & 50
diff --git a/doc/TutorialReshapeSlicing.dox b/doc/TutorialReshapeSlicing.dox
new file mode 100644
index 000000000..3730a5de6
--- /dev/null
+++ b/doc/TutorialReshapeSlicing.dox
@@ -0,0 +1,65 @@
+namespace Eigen {
+
+/** \eigenManualPage TutorialReshapeSlicing Reshape and Slicing
+
+%Eigen does not expose convenient methods to take slices or to reshape a matrix yet.
+Nonetheless, such features can easily be emulated using the Map class.
+
+\eigenAutoToc
+
+\section TutorialReshape Reshape
+
+A reshape operation consists in modifying the sizes of a matrix while keeping the same coefficients.
+Instead of modifying the input matrix itself, which is not possible for compile-time sizes, the approach consist in creating a different \em view on the storage using class Map.
+Here is a typical example creating a 1D linear view of a matrix:
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Tutorial_ReshapeMat2Vec.cpp
+</td>
+<td>
+\verbinclude Tutorial_ReshapeMat2Vec.out
+</td></tr></table>
+
+Remark how the storage order of the input matrix modifies the order of the coefficients in the linear view.
+Here is another example reshaping a 2x6 matrix to a 6x2 one:
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Tutorial_ReshapeMat2Mat.cpp
+</td>
+<td>
+\verbinclude Tutorial_ReshapeMat2Mat.out
+</td></tr></table>
+
+
+
+\section TutorialSlicing Slicing
+
+Slicing consists in taking a set of rows, columns, or elements, uniformly spaced within a matrix.
+Again, the class Map allows to easily mimic this feature.
+
+For instance, one can skip every P elements in a vector:
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Tutorial_SlicingVec.cpp
+</td>
+<td>
+\verbinclude Tutorial_SlicingVec.out
+</td></tr></table>
+
+One can olso take one column over three using an adequate outer-stride or inner-stride depending on the actual storage order:
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Tutorial_SlicingCol.cpp
+</td>
+<td>
+\verbinclude Tutorial_SlicingCol.out
+</td></tr></table>
+
+*/
+
+}
diff --git a/doc/TutorialSparse.dox b/doc/TutorialSparse.dox
index fa2a3ad8b..352907408 100644
--- a/doc/TutorialSparse.dox
+++ b/doc/TutorialSparse.dox
@@ -83,7 +83,7 @@ There is no notion of compressed/uncompressed mode for a SparseVector.
 
 \section TutorialSparseExample First example
 
-Before describing each individual class, let's start with the following typical example: solving the Laplace equation \f$ \nabla u = 0 \f$ on a regular 2D grid using a finite difference scheme and Dirichlet boundary conditions.
+Before describing each individual class, let's start with the following typical example: solving the Laplace equation \f$ \Delta u = 0 \f$ on a regular 2D grid using a finite difference scheme and Dirichlet boundary conditions.
 Such problem can be mathematically expressed as a linear problem of the form \f$ Ax=b \f$ where \f$ x \f$ is the vector of \c m unknowns (in our case, the values of the pixels), \f$ b \f$ is the right hand side vector resulting from the boundary conditions, and \f$ A \f$ is an \f$ m \times m \f$ matrix containing only a few non-zero elements resulting from the discretization of the Laplacian operator.
 
 <table class="manual">
@@ -241,11 +241,11 @@ In the following \em sm denotes a sparse matrix, \em sv a sparse vector, \em dm
 sm1.real()   sm1.imag()   -sm1                    0.5*sm1
 sm1+sm2      sm1-sm2      sm1.cwiseProduct(sm2)
 \endcode
-However, a strong restriction is that the storage orders must match. For instance, in the following example:
+However, <strong>a strong restriction is that the storage orders must match</strong>. For instance, in the following example:
 \code
 sm4 = sm1 + sm2 + sm3;
 \endcode
-sm1, sm2, and sm3 must all be row-major or all column major.
+sm1, sm2, and sm3 must all be row-major or all column-major.
 On the other hand, there is no restriction on the target matrix sm4.
 For instance, this means that for computing \f$ A^T + A \f$, the matrix \f$ A^T \f$ must be evaluated into a temporary matrix of compatible storage order:
 \code
@@ -253,15 +253,19 @@ SparseMatrix<double> A, B;
 B = SparseMatrix<double>(A.transpose()) + A;
 \endcode
 
-Some binary coefficient-wise operators can also mix sparse and dense expressions:
+Binary coefficient wise operators can also mix sparse and dense expressions:
 \code
 sm2 = sm1.cwiseProduct(dm1);
-dm1 += sm1;
+dm2 = sm1 + dm1;
+dm2 = dm1 - sm1;
 \endcode
+Performance-wise, the adding/subtracting sparse and dense matrices is better performed in two steps. For instance, instead of doing <tt>dm2 = sm1 + dm1</tt>, better write:
+\code
+dm2 = dm1;
+dm2 += sm1;
+\endcode
+This version has the advantage to fully exploit the higher performance of dense storage (no indirection, SIMD, etc.), and to pay the cost of slow sparse evaluation on the few non-zeros of the sparse matrix only.
 
-However, it is not yet possible to add a sparse and a dense matrix as in <tt>dm2 = sm1 + dm1</tt>.
-Please write this as the equivalent <tt>dm2 = dm1; dm2 += sm1</tt> (we plan to lift this restriction
-in the next release of %Eigen).
 
 %Sparse expressions also support transposition:
 \code
@@ -307,6 +311,26 @@ sm2 = sm1.transpose() * P;
     \endcode
 
 
+\subsection TutorialSparse_SubMatrices Block operations
+
+Regarding read-access, sparse matrices expose the same API than for dense matrices to access to sub-matrices such as blocks, columns, and rows. See \ref TutorialBlockOperations for a detailed introduction.
+However, for performance reasons, writing to a sub-sparse-matrix is much more limited, and currently only contiguous sets of columns (resp. rows) of a column-major (resp. row-major) SparseMatrix are writable. Moreover, this information has to be known at compile-time, leaving out methods such as <tt>block(...)</tt> and <tt>corner*(...)</tt>. The available API for write-access to a SparseMatrix are summarized below:
+\code
+SparseMatrix<double,ColMajor> sm1;
+sm1.col(j) = ...;
+sm1.leftCols(ncols) = ...;
+sm1.middleCols(j,ncols) = ...;
+sm1.rightCols(ncols) = ...;
+
+SparseMatrix<double,RowMajor> sm2;
+sm2.row(i) = ...;
+sm2.topRows(nrows) = ...;
+sm2.middleRows(i,nrows) = ...;
+sm2.bottomRows(nrows) = ...;
+\endcode
+
+In addition, sparse matrices expose the SparseMatrixBase::innerVector() and SparseMatrixBase::innerVectors() methods, which are aliases to the col/middleCols methods for a column-major storage, and to the row/middleRows methods for a row-major storage.
+
 \subsection TutorialSparse_TriangularSelfadjoint Triangular and selfadjoint views
 
 Just as with dense matrices, the triangularView() function can be used to address a triangular part of the matrix, and perform triangular solves with a dense right hand side:
diff --git a/doc/UnalignedArrayAssert.dox b/doc/UnalignedArrayAssert.dox
index 8c97d7874..95d95a2d5 100644
--- a/doc/UnalignedArrayAssert.dox
+++ b/doc/UnalignedArrayAssert.dox
@@ -7,8 +7,8 @@ Hello! You are seeing this webpage because your program terminated on an asserti
 my_program: path/to/eigen/Eigen/src/Core/DenseStorage.h:44:
 Eigen::internal::matrix_array<T, Size, MatrixOptions, Align>::internal::matrix_array()
 [with T = double, int Size = 2, int MatrixOptions = 2, bool Align = true]:
-Assertion `(reinterpret_cast<size_t>(array) & 0xf) == 0 && "this assertion
-is explained here: http://eigen.tuxfamily.org/dox/UnalignedArrayAssert.html
+Assertion `(reinterpret_cast<size_t>(array) & (sizemask)) == 0 && "this assertion
+is explained here: http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArrayAssert.html
 **** READ THIS WEB PAGE !!! ****"' failed.
 </pre>
 
@@ -46,9 +46,9 @@ then you need to read this separate page: \ref TopicStructHavingEigenMembers "St
 
 Note that here, Eigen::Vector2d is only used as an example, more generally the issue arises for all \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types".
 
-\section c2 Cause 2: STL Containers
+\section c2 Cause 2: STL Containers or manual memory allocation
 
-If you use STL Containers such as std::vector, std::map, ..., with Eigen objects, or with classes containing Eigen objects, like this,
+If you use STL Containers such as std::vector, std::map, ..., with %Eigen objects, or with classes containing %Eigen objects, like this,
 
 \code
 std::vector<Eigen::Matrix2f> my_vector;
@@ -60,6 +60,8 @@ then you need to read this separate page: \ref TopicStlContainers "Using STL Con
 
 Note that here, Eigen::Matrix2f is only used as an example, more generally the issue arises for all \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types" and \ref TopicStructHavingEigenMembers "structures having such Eigen objects as member".
 
+The same issue will be exhibited by any classes/functions by-passing operator new to allocate memory, that is, by performing custom memory allocation followed by calls to the placement new operator. This is for instance typically the case of \c std::make_shared or \c std::allocate_shared for which is the solution is to use an \ref aligned_allocator "aligned allocator" as detailed in the \ref TopicStlContainers "solution for STL containers".
+
 \section c3 Cause 3: Passing Eigen objects by value
 
 If some function in your code is getting an Eigen object passed by value, like this,
@@ -90,24 +92,28 @@ Note that here, Eigen::Quaternionf is only used as an example, more generally th
 
 \section explanation General explanation of this assertion
 
-\ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen objects" must absolutely be created at 16-byte-aligned locations, otherwise SIMD instructions adressing them will crash.
+\ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen objects" must absolutely be created at 16-byte-aligned locations, otherwise SIMD instructions addressing them will crash.
 
 Eigen normally takes care of these alignment issues for you, by setting an alignment attribute on them and by overloading their "operator new".
 
 However there are a few corner cases where these alignment settings get overridden: they are the possible causes for this assertion.
 
-\section getrid I don't care about vectorization, how do I get rid of that stuff?
+\section getrid I don't care about optimal vectorization, how do I get rid of that stuff?
 
-Two possibilities:
+Three possibilities:
 <ul>
-  <li>Define EIGEN_DONT_ALIGN_STATICALLY. That disables all 128-bit static alignment code, while keeping 128-bit heap alignment. This has the effect of
-      disabling vectorization for fixed-size objects (like Matrix4d) while keeping vectorization of dynamic-size objects
-      (like MatrixXd). But do note that this breaks ABI compatibility with the default behavior of 128-bit static alignment.</li>
-  <li>Or define both EIGEN_DONT_VECTORIZE and EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT. This keeps the
-      128-bit alignment code and thus preserves ABI compatibility, but completely disables vectorization.</li>
+  <li>Use the \c DontAlign option to Matrix, Array, Quaternion, etc. objects that gives you trouble. This way Eigen won't try to align them, and thus won"t assume any special alignment. On the down side, you will pay the cost of unaligned loads/stores for them, but on modern CPUs, the overhead is either null or marginal. See \link StructHavingEigenMembers_othersolutions here \endlink for an example.</li>
+  <li>Define \link TopicPreprocessorDirectivesPerformance EIGEN_DONT_ALIGN_STATICALLY \endlink. That disables all 16-byte (and above) static alignment code, while keeping 16-byte (or above) heap alignment. This has the effect of
+      vectorizing fixed-size objects (like Matrix4d) through unaligned stores (as controlled by \link TopicPreprocessorDirectivesPerformance EIGEN_UNALIGNED_VECTORIZE \endlink), while keeping unchanged the vectorization of dynamic-size objects
+      (like MatrixXd). But do note that this breaks ABI compatibility with the default behavior of static alignment.</li>
+  <li>Or define both \link TopicPreprocessorDirectivesPerformance  EIGEN_DONT_VECTORIZE \endlink and EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT. This keeps the
+      16-byte alignment code and thus preserves ABI compatibility, but completely disables vectorization.</li>
 </ul>
 
-For more information, see <a href="http://eigen.tuxfamily.org/index.php?title=FAQ#I_disabled_vectorization.2C_but_I.27m_still_getting_annoyed_about_alignment_issues.21">this FAQ</a>.
+If you want to know why defining EIGEN_DONT_VECTORIZE does not by itself disable 16-byte alignment and the assertion, here's the explanation:
+
+It doesn't disable the assertion, because otherwise code that runs fine without vectorization would suddenly crash when enabling vectorization.
+It doesn't disable 16-byte alignment, because that would mean that vectorized and non-vectorized code are not mutually ABI-compatible. This ABI compatibility is very important, even for people who develop only an in-house application, as for instance one may want to have in the same application a vectorized path and a non-vectorized path.
 
 */
 
diff --git a/doc/UsingBlasLapackBackends.dox b/doc/UsingBlasLapackBackends.dox
new file mode 100644
index 000000000..caa597122
--- /dev/null
+++ b/doc/UsingBlasLapackBackends.dox
@@ -0,0 +1,133 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+ Copyright (C) 2011-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Documentation on the use of BLAS/LAPACK libraries through Eigen
+ ********************************************************************************
+*/
+
+namespace Eigen {
+
+/** \page TopicUsingBlasLapack Using BLAS/LAPACK from %Eigen
+
+
+Since %Eigen version 3.3 and later, any F77 compatible BLAS or LAPACK libraries can be used as backends for dense matrix products and dense matrix decompositions.
+For instance, one can use <a href="http://eigen.tuxfamily.org/Counter/redirect_to_mkl.php">Intel® MKL</a>, Apple's Accelerate framework on OSX, <a href="http://www.openblas.net/">OpenBLAS</a>, <a href="http://www.netlib.org/lapack">Netlib LAPACK</a>, etc.
+
+Do not miss this \link TopicUsingIntelMKL page \endlink for further discussions on the specific use of Intel® MKL (also includes VML, PARDISO, etc.)
+
+In order to use an external BLAS and/or LAPACK library, you must link you own application to the respective libraries and their dependencies.
+For LAPACK, you must also link to the standard <a href="http://www.netlib.org/lapack/lapacke.html">Lapacke</a> library, which is used as a convenient think layer between %Eigen's C++ code and LAPACK F77 interface. Then you must activate their usage by defining one or multiple of the following macros (\b before including any %Eigen's header):
+
+\note For Mac users, in order to use the lapack version shipped with the Accelerate framework, you also need the lapacke library.
+Using <a href="https://www.macports.org/">MacPorts</a>, this is as easy as:
+\code
+sudo port install lapack
+\endcode
+and then use the following link flags: \c -framework \c Accelerate \c /opt/local/lib/lapack/liblapacke.dylib
+
+<table class="manual">
+<tr><td>\c EIGEN_USE_BLAS </td><td>Enables the use of external BLAS level 2 and 3 routines (compatible with any F77 BLAS interface)</td></tr>
+<tr class="alt"><td>\c EIGEN_USE_LAPACKE </td><td>Enables the use of external Lapack routines via the <a href="http://www.netlib.org/lapack/lapacke.html">Lapacke</a> C interface to Lapack (compatible with any F77 LAPACK interface)</td></tr>
+<tr><td>\c EIGEN_USE_LAPACKE_STRICT </td><td>Same as \c EIGEN_USE_LAPACKE but algorithms of lower numerical robustness are disabled. \n This currently concerns only JacobiSVD which otherwise would be replaced by \c gesvd that is less robust than Jacobi rotations.</td></tr>
+</table>
+
+When doing so, a number of %Eigen's algorithms are silently substituted with calls to BLAS or LAPACK routines.
+These substitutions apply only for \b Dynamic \b or \b large enough objects with one of the following four standard scalar types: \c float, \c double, \c complex<float>, and \c complex<double>.
+Operations on other scalar types or mixing reals and complexes will continue to use the built-in algorithms.
+
+The breadth of %Eigen functionality that can be substituted is listed in the table below.
+<table class="manual">
+<tr><th>Functional domain</th><th>Code example</th><th>BLAS/LAPACK routines</th></tr>
+<tr><td>Matrix-matrix operations \n \c EIGEN_USE_BLAS </td><td>\code
+m1*m2.transpose();
+m1.selfadjointView<Lower>()*m2;
+m1*m2.triangularView<Upper>();
+m1.selfadjointView<Lower>().rankUpdate(m2,1.0);
+\endcode</td><td>\code
+?gemm
+?symm/?hemm
+?trmm
+dsyrk/ssyrk
+\endcode</td></tr>
+<tr class="alt"><td>Matrix-vector operations \n \c EIGEN_USE_BLAS </td><td>\code
+m1.adjoint()*b;
+m1.selfadjointView<Lower>()*b;
+m1.triangularView<Upper>()*b;
+\endcode</td><td>\code
+?gemv
+?symv/?hemv
+?trmv
+\endcode</td></tr>
+<tr><td>LU decomposition \n \c EIGEN_USE_LAPACKE \n \c EIGEN_USE_LAPACKE_STRICT </td><td>\code
+v1 = m1.lu().solve(v2);
+\endcode</td><td>\code
+?getrf
+\endcode</td></tr>
+<tr class="alt"><td>Cholesky decomposition \n \c EIGEN_USE_LAPACKE \n \c EIGEN_USE_LAPACKE_STRICT </td><td>\code
+v1 = m2.selfadjointView<Upper>().llt().solve(v2);
+\endcode</td><td>\code
+?potrf
+\endcode</td></tr>
+<tr><td>QR decomposition \n \c EIGEN_USE_LAPACKE \n \c EIGEN_USE_LAPACKE_STRICT </td><td>\code
+m1.householderQr();
+m1.colPivHouseholderQr();
+\endcode</td><td>\code
+?geqrf
+?geqp3
+\endcode</td></tr>
+<tr class="alt"><td>Singular value decomposition \n \c EIGEN_USE_LAPACKE </td><td>\code
+JacobiSVD<MatrixXd> svd;
+svd.compute(m1, ComputeThinV);
+\endcode</td><td>\code
+?gesvd
+\endcode</td></tr>
+<tr><td>Eigen-value decompositions \n \c EIGEN_USE_LAPACKE \n \c EIGEN_USE_LAPACKE_STRICT </td><td>\code
+EigenSolver<MatrixXd> es(m1);
+ComplexEigenSolver<MatrixXcd> ces(m1);
+SelfAdjointEigenSolver<MatrixXd> saes(m1+m1.transpose());
+GeneralizedSelfAdjointEigenSolver<MatrixXd>
+    gsaes(m1+m1.transpose(),m2+m2.transpose());
+\endcode</td><td>\code
+?gees
+?gees
+?syev/?heev
+?syev/?heev,
+?potrf
+\endcode</td></tr>
+<tr class="alt"><td>Schur decomposition \n \c EIGEN_USE_LAPACKE \n \c EIGEN_USE_LAPACKE_STRICT </td><td>\code
+RealSchur<MatrixXd> schurR(m1);
+ComplexSchur<MatrixXcd> schurC(m1);
+\endcode</td><td>\code
+?gees
+\endcode</td></tr>
+</table>
+In the examples, m1 and m2 are dense matrices and v1 and v2 are dense vectors.
+
+*/
+
+}
diff --git a/doc/UsingIntelMKL.dox b/doc/UsingIntelMKL.dox
index 4b624a156..a1a3a18f2 100644
--- a/doc/UsingIntelMKL.dox
+++ b/doc/UsingIntelMKL.dox
@@ -32,106 +32,45 @@
 
 namespace Eigen {
 
-/** \page TopicUsingIntelMKL Using Intel® Math Kernel Library from Eigen
+/** \page TopicUsingIntelMKL Using Intel® MKL from %Eigen
 
-\section TopicUsingIntelMKL_Intro Eigen and Intel® Math Kernel Library (Intel® MKL)
+<!-- \section TopicUsingIntelMKL_Intro Eigen and Intel® Math Kernel Library (Intel® MKL) -->
+
+Since %Eigen version 3.1 and later, users can benefit from built-in Intel® Math Kernel Library (MKL) optimizations with an installed copy of Intel MKL 10.3 (or later).
 
-Since Eigen version 3.1 and later, users can benefit from built-in Intel MKL optimizations with an installed copy of Intel MKL 10.3 (or later).
 <a href="http://eigen.tuxfamily.org/Counter/redirect_to_mkl.php"> Intel MKL </a> provides highly optimized multi-threaded mathematical routines for x86-compatible architectures.
 Intel MKL is available on Linux, Mac and Windows for both Intel64 and IA32 architectures.
 
-\warning Be aware that Intel® MKL is a proprietary software. It is the responsibility of the users to buy MKL licenses for their products. Moreover, the license of the user product has to allow linking to proprietary software that excludes any unmodified versions of the GPL.
+\note
+Intel® MKL is a proprietary software and it is the responsibility of users to buy or register for community (free) Intel MKL licenses for their products. Moreover, the license of the user product has to allow linking to proprietary software that excludes any unmodified versions of the GPL.
 
-Using Intel MKL through Eigen is easy:
--# define the \c EIGEN_USE_MKL_ALL macro before including any Eigen's header
+Using Intel MKL through %Eigen is easy:
+-# define the \c EIGEN_USE_MKL_ALL macro before including any %Eigen's header
 -# link your program to MKL libraries (see the <a href="http://software.intel.com/en-us/articles/intel-mkl-link-line-advisor/">MKL linking advisor</a>)
 -# on a 64bits system, you must use the LP64 interface (not the ILP64 one)
 
-When doing so, a number of Eigen's algorithms are silently substituted with calls to Intel MKL routines.
+When doing so, a number of %Eigen's algorithms are silently substituted with calls to Intel MKL routines.
 These substitutions apply only for \b Dynamic \b or \b large enough objects with one of the following four standard scalar types: \c float, \c double, \c complex<float>, and \c complex<double>.
 Operations on other scalar types or mixing reals and complexes will continue to use the built-in algorithms.
 
-In addition you can coarsely select choose which parts will be substituted by defining one or multiple of the following macros:
+In addition you can choose which parts will be substituted by defining one or multiple of the following macros:
 
 <table class="manual">
-<tr><td>\c EIGEN_USE_BLAS </td><td>Enables the use of external BLAS level 2 and 3 routines (currently works with Intel MKL only)</td></tr>
-<tr class="alt"><td>\c EIGEN_USE_LAPACKE </td><td>Enables the use of external Lapack routines via the <a href="http://www.netlib.org/lapack/lapacke.html">Intel Lapacke</a> C interface to Lapack (currently works with Intel MKL only)</td></tr>
-<tr><td>\c EIGEN_USE_LAPACKE_STRICT </td><td>Same as \c EIGEN_USE_LAPACKE but algorithm of lower robustness are disabled. This currently concerns only JacobiSVD which otherwise would be replaced by \c gesvd that is less robust than Jacobi rotations.</td></tr>
+<tr><td>\c EIGEN_USE_BLAS </td><td>Enables the use of external BLAS level 2 and 3 routines</td></tr>
+<tr class="alt"><td>\c EIGEN_USE_LAPACKE </td><td>Enables the use of external Lapack routines via the <a href="http://www.netlib.org/lapack/lapacke.html">Lapacke</a> C interface to Lapack</td></tr>
+<tr><td>\c EIGEN_USE_LAPACKE_STRICT </td><td>Same as \c EIGEN_USE_LAPACKE but algorithm of lower robustness are disabled. \n This currently concerns only JacobiSVD which otherwise would be replaced by \c gesvd that is less robust than Jacobi rotations.</td></tr>
 <tr class="alt"><td>\c EIGEN_USE_MKL_VML </td><td>Enables the use of Intel VML (vector operations)</td></tr>
 <tr><td>\c EIGEN_USE_MKL_ALL </td><td>Defines \c EIGEN_USE_BLAS, \c EIGEN_USE_LAPACKE, and \c EIGEN_USE_MKL_VML </td></tr>
 </table>
 
-Finally, the PARDISO sparse solver shipped with Intel MKL can be used through the \ref PardisoLU, \ref PardisoLLT and \ref PardisoLDLT classes of the \ref PardisoSupport_Module.
-
+Note that the BLAS and LAPACKE backends can be enabled for any F77 compatible BLAS and LAPACK libraries. See this \link TopicUsingBlasLapack page \endlink for the details.
 
-\section TopicUsingIntelMKL_SupportedFeatures List of supported features
+Finally, the PARDISO sparse solver shipped with Intel MKL can be used through the \ref PardisoLU, \ref PardisoLLT and \ref PardisoLDLT classes of the \ref PardisoSupport_Module.
 
-The breadth of Eigen functionality covered by Intel MKL is listed in the table below.
+The following table summarizes the list of functions covered by \c EIGEN_USE_MKL_VML:
 <table class="manual">
-<tr><th>Functional domain</th><th>Code example</th><th>MKL routines</th></tr>
-<tr><td>Matrix-matrix operations \n \c EIGEN_USE_BLAS </td><td>\code
-m1*m2.transpose();
-m1.selfadjointView<Lower>()*m2;
-m1*m2.triangularView<Upper>();
-m1.selfadjointView<Lower>().rankUpdate(m2,1.0);
-\endcode</td><td>\code
-?gemm
-?symm/?hemm
-?trmm
-dsyrk/ssyrk
-\endcode</td></tr>
-<tr class="alt"><td>Matrix-vector operations \n \c EIGEN_USE_BLAS </td><td>\code
-m1.adjoint()*b;
-m1.selfadjointView<Lower>()*b;
-m1.triangularView<Upper>()*b;
-\endcode</td><td>\code
-?gemv
-?symv/?hemv
-?trmv
-\endcode</td></tr>
-<tr><td>LU decomposition \n \c EIGEN_USE_LAPACKE \n \c EIGEN_USE_LAPACKE_STRICT </td><td>\code
-v1 = m1.lu().solve(v2);
-\endcode</td><td>\code
-?getrf
-\endcode</td></tr>
-<tr class="alt"><td>Cholesky decomposition \n \c EIGEN_USE_LAPACKE \n \c EIGEN_USE_LAPACKE_STRICT </td><td>\code
-v1 = m2.selfadjointView<Upper>().llt().solve(v2);
-\endcode</td><td>\code
-?potrf
-\endcode</td></tr>
-<tr><td>QR decomposition \n \c EIGEN_USE_LAPACKE \n \c EIGEN_USE_LAPACKE_STRICT </td><td>\code
-m1.householderQr();
-m1.colPivHouseholderQr();
-\endcode</td><td>\code
-?geqrf
-?geqp3
-\endcode</td></tr>
-<tr class="alt"><td>Singular value decomposition \n \c EIGEN_USE_LAPACKE </td><td>\code
-JacobiSVD<MatrixXd> svd;
-svd.compute(m1, ComputeThinV);
-\endcode</td><td>\code
-?gesvd
-\endcode</td></tr>
-<tr><td>Eigen-value decompositions \n \c EIGEN_USE_LAPACKE \n \c EIGEN_USE_LAPACKE_STRICT </td><td>\code
-EigenSolver<MatrixXd> es(m1);
-ComplexEigenSolver<MatrixXcd> ces(m1);
-SelfAdjointEigenSolver<MatrixXd> saes(m1+m1.transpose());
-GeneralizedSelfAdjointEigenSolver<MatrixXd>
-    gsaes(m1+m1.transpose(),m2+m2.transpose());
-\endcode</td><td>\code
-?gees
-?gees
-?syev/?heev
-?syev/?heev,
-?potrf
-\endcode</td></tr>
-<tr class="alt"><td>Schur decomposition \n \c EIGEN_USE_LAPACKE \n \c EIGEN_USE_LAPACKE_STRICT </td><td>\code
-RealSchur<MatrixXd> schurR(m1);
-ComplexSchur<MatrixXcd> schurC(m1);
-\endcode</td><td>\code
-?gees
-\endcode</td></tr>
-<tr><td>Vector Math \n \c EIGEN_USE_MKL_VML </td><td>\code
+<tr><th>Code example</th><th>MKL routines</th></tr>
+<tr><td>\code
 v2=v1.array().sin();
 v2=v1.array().asin();
 v2=v1.array().cos();
@@ -155,7 +94,7 @@ v?Sqr
 v?Powx
 \endcode</td></tr>
 </table>
-In the examples, m1 and m2 are dense matrices and v1 and v2 are dense vectors.
+In the examples, v1 and v2 are dense vectors.
 
 
 \section TopicUsingIntelMKL_Links Links
diff --git a/doc/UsingNVCC.dox b/doc/UsingNVCC.dox
new file mode 100644
index 000000000..f8e755b79
--- /dev/null
+++ b/doc/UsingNVCC.dox
@@ -0,0 +1,32 @@
+
+namespace Eigen {
+
+/** \page TopicCUDA Using Eigen in CUDA kernels
+
+\b Disclaimer: this page is about an \b experimental feature in %Eigen.
+
+Staring from CUDA 5.0, the CUDA compiler, \c nvcc, is able to properly parse %Eigen's code (almost).
+A few adaptations of the %Eigen's code already allows to use some parts of %Eigen in your own CUDA kernels.
+To this end you need the devel branch of %Eigen, CUDA 5.0 or greater with GCC.
+
+Known issues:
+
+ - \c nvcc with MS Visual Studio does not work (patch welcome)
+ 
+ - \c nvcc with \c clang does not work (patch welcome)
+ 
+ - \c nvcc 5.5 with gcc-4.7 (or greater) has issues with the standard \c \<limits\> header file. To workaround this, you can add the following before including any other files:
+   \code
+    // workaround issue between gcc >= 4.7 and cuda 5.5
+    #if (defined __GNUC__) && (__GNUC__>4 || __GNUC_MINOR__>=7)
+      #undef _GLIBCXX_ATOMIC_BUILTINS
+      #undef _GLIBCXX_USE_INT128
+    #endif
+   \endcode
+   
+ - On 64bits system Eigen uses \c long \c int as the default type for indexes and sizes. On CUDA device, it would make sense to default to 32 bits \c int.
+   However, to keep host and CUDA code compatible, this cannot be done automatically by %Eigen, and the user is thus required to define \c EIGEN_DEFAULT_DENSE_INDEX_TYPE to \c int throughout his code (or only for CUDA code if there is no interaction between host and CUDA code through %Eigen's object).
+
+*/
+
+}
diff --git a/doc/eigendoxy.css b/doc/eigendoxy.css
index efaeb46dc..6274e6c70 100644
--- a/doc/eigendoxy.css
+++ b/doc/eigendoxy.css
@@ -45,7 +45,7 @@ pre.fragment {
 
 /* Common style for all Eigen's tables */
 
-table.example, table.manual, table.manual-vl {
+table.example, table.manual, table.manual-vl, table.manual-hl {
     max-width:100%;
     border-collapse: collapse;
     border-style: solid;
@@ -58,7 +58,7 @@ table.example, table.manual, table.manual-vl {
     -webkit-box-shadow: 5px 5px 5px rgba(0, 0, 0, 0.15);
 }
 
-table.example th, table.manual th, table.manual-vl th {
+table.example th, table.manual th, table.manual-vl th, table.manual-hl th {
   padding: 0.5em 0.5em 0.5em 0.5em;
   text-align: left;
   padding-right: 1em;
@@ -70,7 +70,7 @@ table.example th, table.manual th, table.manual-vl th {
   filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#FFFFFF', endColorstr='#F4F4E5');
 }
 
-table.example td, table.manual td, table.manual-vl td {
+table.example td, table.manual td, table.manual-vl td, table.manual-hl td {
   vertical-align:top;
   border-width: 1px;
   border-color: #cccccc;
@@ -108,15 +108,15 @@ table.example td {
 
 /* standard class for the manual */
 
-table.manual, table.manual-vl {
+table.manual, table.manual-vl, table.manual-hl {
     padding: 0.2em 0em 0.5em 0em;
 }
 
-table.manual th, table.manual-vl th {
+table.manual th, table.manual-vl th, table.manual-hl th {
   margin: 0em 0em 0.3em 0em;
 }
 
-table.manual td, table.manual-vl td {
+table.manual td, table.manual-vl td, table.manual-hl td {
   padding: 0.3em 0.5em 0.3em 0.5em;
   vertical-align:top;
   border-width: 1px;
@@ -136,6 +136,16 @@ table.manual-vl th.inter {
   border-style: solid solid solid solid;
 }
 
+table.manual-hl td {
+  border-color: #cccccc;
+  border-width: 1px;
+  border-style: solid none solid none;
+}
+
+table td.code {
+  font-family: monospace;
+}
+
 h2 {
   margin-top:2em;
   border-style: none none solid none;
@@ -166,6 +176,11 @@ div.toc ul {
   margin: 0.2em 0 0.4em 0.5em;
 }
 
+span.cpp11,span.cpp14,span.cpp17 {
+  color: #119911;
+  font-weight: bold;
+}
+
 /**** old Eigen's styles ****/
 
 
@@ -177,8 +192,8 @@ table.tutorial_code td {
 
 
 /* Whenever doxygen meets a '\n' or a '<BR/>', it will put 
- * the text containing the characted into a <p class="starttd">.
- * This little hack togehter with table.tutorial_code td.note
+ * the text containing the character into a <p class="starttd">.
+ * This little hack together with table.tutorial_code td.note
  * aims at fixing this issue. */
 table.tutorial_code td.note p.starttd {
   margin: 0px;
@@ -199,13 +214,3 @@ h3.version {
 td.width20em p.endtd {
   width:  20em;
 }
-
-.bigwarning {
-  font-size:2em;
-  font-weight:bold;
-  margin:1em;
-  padding:1em;
-  color:red;
-  border:solid;
-}
-
diff --git a/doc/examples/CMakeLists.txt b/doc/examples/CMakeLists.txt
index 08cf8efd7..f7a19055f 100644
--- a/doc/examples/CMakeLists.txt
+++ b/doc/examples/CMakeLists.txt
@@ -14,3 +14,8 @@ foreach(example_src ${examples_SRCS})
   )
   add_dependencies(all_examples ${example})
 endforeach(example_src)
+
+check_cxx_compiler_flag("-std=c++11" EIGEN_COMPILER_SUPPORT_CPP11)
+if(EIGEN_COMPILER_SUPPORT_CPP11)
+ei_add_target_property(nullary_indexing COMPILE_FLAGS "-std=c++11")
+endif()
+\ No newline at end of file
diff --git a/doc/examples/CustomizingEigen_Inheritance.cpp b/doc/examples/CustomizingEigen_Inheritance.cpp
new file mode 100644
index 000000000..48df64ee3
--- /dev/null
+++ b/doc/examples/CustomizingEigen_Inheritance.cpp
@@ -0,0 +1,30 @@
+#include <Eigen/Core>
+#include <iostream>
+
+class MyVectorType : public Eigen::VectorXd
+{
+public:
+    MyVectorType(void):Eigen::VectorXd() {}
+
+    // This constructor allows you to construct MyVectorType from Eigen expressions
+    template<typename OtherDerived>
+    MyVectorType(const Eigen::MatrixBase<OtherDerived>& other)
+        : Eigen::VectorXd(other)
+    { }
+
+    // This method allows you to assign Eigen expressions to MyVectorType
+    template<typename OtherDerived>
+    MyVectorType& operator=(const Eigen::MatrixBase <OtherDerived>& other)
+    {
+        this->Eigen::VectorXd::operator=(other);
+        return *this;
+    }
+};
+
+int main()
+{
+  MyVectorType v = MyVectorType::Ones(4);
+  v(2) += 10;
+  v = 2 * v;
+  std::cout << v.transpose() << std::endl;
+}
diff --git a/doc/examples/Cwise_erf.cpp b/doc/examples/Cwise_erf.cpp
new file mode 100644
index 000000000..e7cd2c1c0
--- /dev/null
+++ b/doc/examples/Cwise_erf.cpp
@@ -0,0 +1,9 @@
+#include <Eigen/Core>
+#include <unsupported/Eigen/SpecialFunctions>
+#include <iostream>
+using namespace Eigen;
+int main()
+{
+  Array4d v(-0.5,2,0,-7);
+  std::cout << v.erf() << std::endl;
+}
diff --git a/doc/examples/Cwise_erfc.cpp b/doc/examples/Cwise_erfc.cpp
new file mode 100644
index 000000000..d8bb04c30
--- /dev/null
+++ b/doc/examples/Cwise_erfc.cpp
@@ -0,0 +1,9 @@
+#include <Eigen/Core>
+#include <unsupported/Eigen/SpecialFunctions>
+#include <iostream>
+using namespace Eigen;
+int main()
+{
+  Array4d v(-0.5,2,0,-7);
+  std::cout << v.erfc() << std::endl;
+}
diff --git a/doc/examples/Cwise_lgamma.cpp b/doc/examples/Cwise_lgamma.cpp
new file mode 100644
index 000000000..f1c4f503e
--- /dev/null
+++ b/doc/examples/Cwise_lgamma.cpp
@@ -0,0 +1,9 @@
+#include <Eigen/Core>
+#include <unsupported/Eigen/SpecialFunctions>
+#include <iostream>
+using namespace Eigen;
+int main()
+{
+  Array4d v(0.5,10,0,-1);
+  std::cout << v.lgamma() << std::endl;
+}
+\ No newline at end of file
diff --git a/doc/examples/MatrixBase_cwise_const.cpp b/doc/examples/MatrixBase_cwise_const.cpp
deleted file mode 100644
index 23700e0b6..000000000
--- a/doc/examples/MatrixBase_cwise_const.cpp
+++ /dev/null
@@ -1,18 +0,0 @@
-#define EIGEN2_SUPPORT
-#include <Eigen/Core>
-#include <iostream>
-
-using namespace Eigen;
-using namespace std;
-
-int main()
-{
-  Matrix3i m = Matrix3i::Random();
-  cout << "Here is the matrix m:" << endl << m << endl;
-  Matrix3i n = Matrix3i::Random();
-  cout << "And here is the matrix n:" << endl << n << endl;
-  cout << "The coefficient-wise product of m and n is:" << endl;
-  cout << m.cwise() * n << endl;
-  cout << "Taking the cube of the coefficients of m yields:" << endl;
-  cout << m.cwise().pow(3) << endl;
-}
diff --git a/doc/examples/TutorialInplaceLU.cpp b/doc/examples/TutorialInplaceLU.cpp
new file mode 100644
index 000000000..cb9c59b60
--- /dev/null
+++ b/doc/examples/TutorialInplaceLU.cpp
@@ -0,0 +1,61 @@
+#include <iostream>
+struct init {
+  init() { std::cout << "[" << "init" << "]" << std::endl; }
+};
+init init_obj;
+// [init]
+#include <iostream>
+#include <Eigen/Dense>
+
+using namespace std;
+using namespace Eigen;
+
+int main()
+{
+  MatrixXd A(2,2);
+  A << 2, -1, 1, 3;
+  cout << "Here is the input matrix A before decomposition:\n" << A << endl;
+cout << "[init]" << endl;
+
+cout << "[declaration]" << endl;
+  PartialPivLU<Ref<MatrixXd> > lu(A);
+  cout << "Here is the input matrix A after decomposition:\n" << A << endl;
+cout << "[declaration]" << endl;
+
+cout << "[matrixLU]" << endl;
+  cout << "Here is the matrix storing the L and U factors:\n" << lu.matrixLU() << endl;
+cout << "[matrixLU]" << endl;
+
+cout << "[solve]" << endl;
+  MatrixXd A0(2,2); A0 << 2, -1, 1, 3;
+  VectorXd b(2);    b << 1, 2;
+  VectorXd x = lu.solve(b);
+  cout << "Residual: " << (A0 * x - b).norm() << endl;
+cout << "[solve]" << endl;
+
+cout << "[modifyA]" << endl;
+  A << 3, 4, -2, 1;
+  x = lu.solve(b);
+  cout << "Residual: " << (A0 * x - b).norm() << endl;
+cout << "[modifyA]" << endl;
+
+cout << "[recompute]" << endl;
+  A0 = A; // save A
+  lu.compute(A);
+  x = lu.solve(b);
+  cout << "Residual: " << (A0 * x - b).norm() << endl;
+cout << "[recompute]" << endl;
+
+cout << "[recompute_bis0]" << endl;
+  MatrixXd A1(2,2);
+  A1 << 5,-2,3,4;
+  lu.compute(A1);
+  cout << "Here is the input matrix A1 after decomposition:\n" << A1 << endl;
+cout << "[recompute_bis0]" << endl;
+
+cout << "[recompute_bis1]" << endl;
+  x = lu.solve(b);
+  cout << "Residual: " << (A1 * x - b).norm() << endl;
+cout << "[recompute_bis1]" << endl;
+
+}
diff --git a/doc/examples/TutorialLinAlgInverseDeterminant.cpp b/doc/examples/TutorialLinAlgInverseDeterminant.cpp
index 43970ff05..14dde5b35 100644
--- a/doc/examples/TutorialLinAlgInverseDeterminant.cpp
+++ b/doc/examples/TutorialLinAlgInverseDeterminant.cpp
@@ -13,4 +13,4 @@ int main()
    cout << "Here is the matrix A:\n" << A << endl;
    cout << "The determinant of A is " << A.determinant() << endl;
    cout << "The inverse of A is:\n" << A.inverse() << endl;
-}
-\ No newline at end of file
+}
diff --git a/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_reductions_operatornorm.cpp b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_reductions_operatornorm.cpp
new file mode 100644
index 000000000..62e28fc31
--- /dev/null
+++ b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_reductions_operatornorm.cpp
@@ -0,0 +1,18 @@
+#include <Eigen/Dense>
+#include <iostream>
+
+using namespace Eigen;
+using namespace std;
+
+int main()
+{
+  MatrixXf m(2,2);
+  m << 1,-2,
+       -3,4;
+
+  cout << "1-norm(m)     = " << m.cwiseAbs().colwise().sum().maxCoeff()
+       << " == "             << m.colwise().lpNorm<1>().maxCoeff() << endl;
+
+  cout << "infty-norm(m) = " << m.cwiseAbs().rowwise().sum().maxCoeff()
+       << " == "             << m.rowwise().lpNorm<1>().maxCoeff() << endl;
+}
diff --git a/doc/examples/make_circulant.cpp b/doc/examples/make_circulant.cpp
new file mode 100644
index 000000000..92e6aaa2b
--- /dev/null
+++ b/doc/examples/make_circulant.cpp
@@ -0,0 +1,11 @@
+/*
+This program is presented in several fragments in the doc page.
+Every fragment is in its own file; this file simply combines them.
+*/
+
+#include "make_circulant.cpp.preamble"
+#include "make_circulant.cpp.traits"
+#include "make_circulant.cpp.expression"
+#include "make_circulant.cpp.evaluator"
+#include "make_circulant.cpp.entry"
+#include "make_circulant.cpp.main"
diff --git a/doc/examples/make_circulant.cpp.entry b/doc/examples/make_circulant.cpp.entry
new file mode 100644
index 000000000..f9d2eb8a9
--- /dev/null
+++ b/doc/examples/make_circulant.cpp.entry
@@ -0,0 +1,5 @@
+template <class ArgType>
+Circulant<ArgType> makeCirculant(const Eigen::MatrixBase<ArgType>& arg)
+{
+  return Circulant<ArgType>(arg.derived());
+}
diff --git a/doc/examples/make_circulant.cpp.evaluator b/doc/examples/make_circulant.cpp.evaluator
new file mode 100644
index 000000000..2ba79e783
--- /dev/null
+++ b/doc/examples/make_circulant.cpp.evaluator
@@ -0,0 +1,32 @@
+namespace Eigen {
+  namespace internal {
+    template<typename ArgType>
+    struct evaluator<Circulant<ArgType> >
+      : evaluator_base<Circulant<ArgType> >
+    {
+      typedef Circulant<ArgType> XprType;
+      typedef typename nested_eval<ArgType, XprType::ColsAtCompileTime>::type ArgTypeNested;
+      typedef typename remove_all<ArgTypeNested>::type ArgTypeNestedCleaned;
+      typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+      enum { 
+        CoeffReadCost = evaluator<ArgTypeNestedCleaned>::CoeffReadCost,
+        Flags = Eigen::ColMajor 
+      };
+      
+      evaluator(const XprType& xpr)
+        : m_argImpl(xpr.m_arg), m_rows(xpr.rows())
+      { }
+
+      CoeffReturnType coeff(Index row, Index col) const
+      {
+        Index index = row - col;
+        if (index < 0) index += m_rows;
+        return m_argImpl.coeff(index);
+      }
+
+      evaluator<ArgTypeNestedCleaned> m_argImpl;
+      const Index m_rows;
+    };
+  }
+}
diff --git a/doc/examples/make_circulant.cpp.expression b/doc/examples/make_circulant.cpp.expression
new file mode 100644
index 000000000..380cd4450
--- /dev/null
+++ b/doc/examples/make_circulant.cpp.expression
@@ -0,0 +1,20 @@
+template <class ArgType>
+class Circulant : public Eigen::MatrixBase<Circulant<ArgType> >
+{
+public:
+  Circulant(const ArgType& arg)
+    : m_arg(arg)
+  { 
+    EIGEN_STATIC_ASSERT(ArgType::ColsAtCompileTime == 1,
+                        YOU_TRIED_CALLING_A_VECTOR_METHOD_ON_A_MATRIX);
+  }
+
+  typedef typename Eigen::internal::ref_selector<Circulant>::type Nested; 
+
+  typedef Eigen::Index Index;
+  Index rows() const { return m_arg.rows(); }
+  Index cols() const { return m_arg.rows(); }
+
+  typedef typename Eigen::internal::ref_selector<ArgType>::type ArgTypeNested;
+  ArgTypeNested m_arg;
+};
diff --git a/doc/examples/make_circulant.cpp.main b/doc/examples/make_circulant.cpp.main
new file mode 100644
index 000000000..877f97f62
--- /dev/null
+++ b/doc/examples/make_circulant.cpp.main
@@ -0,0 +1,8 @@
+int main()
+{
+  Eigen::VectorXd vec(4);
+  vec << 1, 2, 4, 8;
+  Eigen::MatrixXd mat;
+  mat = makeCirculant(vec);
+  std::cout << mat << std::endl;
+}
diff --git a/doc/examples/make_circulant.cpp.preamble b/doc/examples/make_circulant.cpp.preamble
new file mode 100644
index 000000000..e575cce14
--- /dev/null
+++ b/doc/examples/make_circulant.cpp.preamble
@@ -0,0 +1,4 @@
+#include <Eigen/Core>
+#include <iostream>
+
+template <class ArgType> class Circulant;
diff --git a/doc/examples/make_circulant.cpp.traits b/doc/examples/make_circulant.cpp.traits
new file mode 100644
index 000000000..4e04535d3
--- /dev/null
+++ b/doc/examples/make_circulant.cpp.traits
@@ -0,0 +1,19 @@
+namespace Eigen {
+  namespace internal {
+    template <class ArgType>
+    struct traits<Circulant<ArgType> >
+    {
+      typedef Eigen::Dense StorageKind;
+      typedef Eigen::MatrixXpr XprKind;
+      typedef typename ArgType::StorageIndex StorageIndex;
+      typedef typename ArgType::Scalar Scalar;
+      enum { 
+        Flags = Eigen::ColMajor,
+        RowsAtCompileTime = ArgType::RowsAtCompileTime,
+        ColsAtCompileTime = ArgType::RowsAtCompileTime,
+        MaxRowsAtCompileTime = ArgType::MaxRowsAtCompileTime,
+        MaxColsAtCompileTime = ArgType::MaxRowsAtCompileTime
+      };
+    };
+  }
+}
diff --git a/doc/examples/make_circulant2.cpp b/doc/examples/make_circulant2.cpp
new file mode 100644
index 000000000..95d3dd31a
--- /dev/null
+++ b/doc/examples/make_circulant2.cpp
@@ -0,0 +1,52 @@
+#include <Eigen/Core>
+#include <iostream>
+
+using namespace Eigen;
+
+// [circulant_func]
+template<class ArgType>
+class circulant_functor {
+  const ArgType &m_vec;
+public:
+  circulant_functor(const ArgType& arg) : m_vec(arg) {}
+
+  const typename ArgType::Scalar& operator() (Index row, Index col) const {
+    Index index = row - col;
+    if (index < 0) index += m_vec.size();
+    return m_vec(index);
+  }
+};
+// [circulant_func]
+
+// [square]
+template<class ArgType>
+struct circulant_helper {
+  typedef Matrix<typename ArgType::Scalar,
+                 ArgType::SizeAtCompileTime,
+                 ArgType::SizeAtCompileTime,
+                 ColMajor,
+                 ArgType::MaxSizeAtCompileTime,
+                 ArgType::MaxSizeAtCompileTime> MatrixType;
+};
+// [square]
+
+// [makeCirculant]
+template <class ArgType>
+CwiseNullaryOp<circulant_functor<ArgType>, typename circulant_helper<ArgType>::MatrixType>
+makeCirculant(const Eigen::MatrixBase<ArgType>& arg)
+{
+  typedef typename circulant_helper<ArgType>::MatrixType MatrixType;
+  return MatrixType::NullaryExpr(arg.size(), arg.size(), circulant_functor<ArgType>(arg.derived()));
+}
+// [makeCirculant]
+
+// [main]
+int main()
+{
+  Eigen::VectorXd vec(4);
+  vec << 1, 2, 4, 8;
+  Eigen::MatrixXd mat;
+  mat = makeCirculant(vec);
+  std::cout << mat << std::endl;
+}
+// [main]
diff --git a/doc/examples/matrixfree_cg.cpp b/doc/examples/matrixfree_cg.cpp
new file mode 100644
index 000000000..6a205aea3
--- /dev/null
+++ b/doc/examples/matrixfree_cg.cpp
@@ -0,0 +1,128 @@
+#include <iostream>
+#include <Eigen/Core>
+#include <Eigen/Dense>
+#include <Eigen/IterativeLinearSolvers>
+#include <unsupported/Eigen/IterativeSolvers>
+
+class MatrixReplacement;
+using Eigen::SparseMatrix;
+
+namespace Eigen {
+namespace internal {
+  // MatrixReplacement looks-like a SparseMatrix, so let's inherits its traits:
+  template<>
+  struct traits<MatrixReplacement> :  public Eigen::internal::traits<Eigen::SparseMatrix<double> >
+  {};
+}
+}
+
+// Example of a matrix-free wrapper from a user type to Eigen's compatible type
+// For the sake of simplicity, this example simply wrap a Eigen::SparseMatrix.
+class MatrixReplacement : public Eigen::EigenBase<MatrixReplacement> {
+public:
+  // Required typedefs, constants, and method:
+  typedef double Scalar;
+  typedef double RealScalar;
+  typedef int StorageIndex;
+  enum {
+    ColsAtCompileTime = Eigen::Dynamic,
+    MaxColsAtCompileTime = Eigen::Dynamic,
+    IsRowMajor = false
+  };
+
+  Index rows() const { return mp_mat->rows(); }
+  Index cols() const { return mp_mat->cols(); }
+
+  template<typename Rhs>
+  Eigen::Product<MatrixReplacement,Rhs,Eigen::AliasFreeProduct> operator*(const Eigen::MatrixBase<Rhs>& x) const {
+    return Eigen::Product<MatrixReplacement,Rhs,Eigen::AliasFreeProduct>(*this, x.derived());
+  }
+
+  // Custom API:
+  MatrixReplacement() : mp_mat(0) {}
+
+  void attachMyMatrix(const SparseMatrix<double> &mat) {
+    mp_mat = &mat;
+  }
+  const SparseMatrix<double> my_matrix() const { return *mp_mat; }
+
+private:
+  const SparseMatrix<double> *mp_mat;
+};
+
+
+// Implementation of MatrixReplacement * Eigen::DenseVector though a specialization of internal::generic_product_impl:
+namespace Eigen {
+namespace internal {
+
+  template<typename Rhs>
+  struct generic_product_impl<MatrixReplacement, Rhs, SparseShape, DenseShape, GemvProduct> // GEMV stands for matrix-vector
+  : generic_product_impl_base<MatrixReplacement,Rhs,generic_product_impl<MatrixReplacement,Rhs> >
+  {
+    typedef typename Product<MatrixReplacement,Rhs>::Scalar Scalar;
+
+    template<typename Dest>
+    static void scaleAndAddTo(Dest& dst, const MatrixReplacement& lhs, const Rhs& rhs, const Scalar& alpha)
+    {
+      // This method should implement "dst += alpha * lhs * rhs" inplace,
+      // however, for iterative solvers, alpha is always equal to 1, so let's not bother about it.
+      assert(alpha==Scalar(1) && "scaling is not implemented");
+
+      // Here we could simply call dst.noalias() += lhs.my_matrix() * rhs,
+      // but let's do something fancier (and less efficient):
+      for(Index i=0; i<lhs.cols(); ++i)
+        dst += rhs(i) * lhs.my_matrix().col(i);
+    }
+  };
+
+}
+}
+
+int main()
+{
+  int n = 10;
+  Eigen::SparseMatrix<double> S = Eigen::MatrixXd::Random(n,n).sparseView(0.5,1);
+  S = S.transpose()*S;
+
+  MatrixReplacement A;
+  A.attachMyMatrix(S);
+
+  Eigen::VectorXd b(n), x;
+  b.setRandom();
+
+  // Solve Ax = b using various iterative solver with matrix-free version:
+  {
+    Eigen::ConjugateGradient<MatrixReplacement, Eigen::Lower|Eigen::Upper, Eigen::IdentityPreconditioner> cg;
+    cg.compute(A);
+    x = cg.solve(b);
+    std::cout << "CG:       #iterations: " << cg.iterations() << ", estimated error: " << cg.error() << std::endl;
+  }
+
+  {
+    Eigen::BiCGSTAB<MatrixReplacement, Eigen::IdentityPreconditioner> bicg;
+    bicg.compute(A);
+    x = bicg.solve(b);
+    std::cout << "BiCGSTAB: #iterations: " << bicg.iterations() << ", estimated error: " << bicg.error() << std::endl;
+  }
+
+  {
+    Eigen::GMRES<MatrixReplacement, Eigen::IdentityPreconditioner> gmres;
+    gmres.compute(A);
+    x = gmres.solve(b);
+    std::cout << "GMRES:    #iterations: " << gmres.iterations() << ", estimated error: " << gmres.error() << std::endl;
+  }
+
+  {
+    Eigen::DGMRES<MatrixReplacement, Eigen::IdentityPreconditioner> gmres;
+    gmres.compute(A);
+    x = gmres.solve(b);
+    std::cout << "DGMRES:   #iterations: " << gmres.iterations() << ", estimated error: " << gmres.error() << std::endl;
+  }
+
+  {
+    Eigen::MINRES<MatrixReplacement, Eigen::Lower|Eigen::Upper, Eigen::IdentityPreconditioner> minres;
+    minres.compute(A);
+    x = minres.solve(b);
+    std::cout << "MINRES:   #iterations: " << minres.iterations() << ", estimated error: " << minres.error() << std::endl;
+  }
+}
diff --git a/doc/examples/nullary_indexing.cpp b/doc/examples/nullary_indexing.cpp
new file mode 100644
index 000000000..e27c3585a
--- /dev/null
+++ b/doc/examples/nullary_indexing.cpp
@@ -0,0 +1,66 @@
+#include <Eigen/Core>
+#include <iostream>
+
+using namespace Eigen;
+
+// [functor]
+template<class ArgType, class RowIndexType, class ColIndexType>
+class indexing_functor {
+  const ArgType &m_arg;
+  const RowIndexType &m_rowIndices;
+  const ColIndexType &m_colIndices;
+public:
+  typedef Matrix<typename ArgType::Scalar,
+                 RowIndexType::SizeAtCompileTime,
+                 ColIndexType::SizeAtCompileTime,
+                 ArgType::Flags&RowMajorBit?RowMajor:ColMajor,
+                 RowIndexType::MaxSizeAtCompileTime,
+                 ColIndexType::MaxSizeAtCompileTime> MatrixType;
+
+  indexing_functor(const ArgType& arg, const RowIndexType& row_indices, const ColIndexType& col_indices)
+    : m_arg(arg), m_rowIndices(row_indices), m_colIndices(col_indices)
+  {}
+
+  const typename ArgType::Scalar& operator() (Index row, Index col) const {
+    return m_arg(m_rowIndices[row], m_colIndices[col]);
+  }
+};
+// [functor]
+
+// [function]
+template <class ArgType, class RowIndexType, class ColIndexType>
+CwiseNullaryOp<indexing_functor<ArgType,RowIndexType,ColIndexType>, typename indexing_functor<ArgType,RowIndexType,ColIndexType>::MatrixType>
+indexing(const Eigen::MatrixBase<ArgType>& arg, const RowIndexType& row_indices, const ColIndexType& col_indices)
+{
+  typedef indexing_functor<ArgType,RowIndexType,ColIndexType> Func;
+  typedef typename Func::MatrixType MatrixType;
+  return MatrixType::NullaryExpr(row_indices.size(), col_indices.size(), Func(arg.derived(), row_indices, col_indices));
+}
+// [function]
+
+
+int main()
+{
+  std::cout << "[main1]\n";
+  Eigen::MatrixXi A = Eigen::MatrixXi::Random(4,4);
+  Array3i ri(1,2,1);
+  ArrayXi ci(6); ci << 3,2,1,0,0,2;
+  Eigen::MatrixXi B = indexing(A, ri, ci);
+  std::cout << "A =" << std::endl;
+  std::cout << A << std::endl << std::endl;
+  std::cout << "A([" << ri.transpose() << "], [" << ci.transpose() << "]) =" << std::endl;
+  std::cout << B << std::endl;
+  std::cout << "[main1]\n";
+
+  std::cout << "[main2]\n";
+  B =  indexing(A, ri+1, ci);
+  std::cout << "A(ri+1,ci) =" << std::endl;
+  std::cout << B << std::endl << std::endl;
+#if __cplusplus >= 201103L
+  B =  indexing(A, ArrayXi::LinSpaced(13,0,12).unaryExpr([](int x){return x%4;}), ArrayXi::LinSpaced(4,0,3));
+  std::cout << "A(ArrayXi::LinSpaced(13,0,12).unaryExpr([](int x){return x%4;}), ArrayXi::LinSpaced(4,0,3)) =" << std::endl;
+  std::cout << B << std::endl << std::endl;
+#endif
+  std::cout << "[main2]\n";
+}
+
diff --git a/doc/ftv2node.png b/doc/ftv2node.png
new file mode 100644
index 000000000..63c605bb4
--- /dev/null
+++ b/doc/ftv2node.png
diff --git a/doc/ftv2pnode.png b/doc/ftv2pnode.png
new file mode 100644
index 000000000..c6ee22f93
--- /dev/null
+++ b/doc/ftv2pnode.png
diff --git a/doc/snippets/BiCGSTAB_simple.cpp b/doc/snippets/BiCGSTAB_simple.cpp
new file mode 100644
index 000000000..5520f4f1f
--- /dev/null
+++ b/doc/snippets/BiCGSTAB_simple.cpp
@@ -0,0 +1,11 @@
+  int n = 10000;
+  VectorXd x(n), b(n);
+  SparseMatrix<double> A(n,n);
+  /* ... fill A and b ... */ 
+  BiCGSTAB<SparseMatrix<double> > solver;
+  solver.compute(A);
+  x = solver.solve(b);
+  std::cout << "#iterations:     " << solver.iterations() << std::endl;
+  std::cout << "estimated error: " << solver.error()      << std::endl;
+  /* ... update b ... */
+  x = solver.solve(b); // solve again
+\ No newline at end of file
diff --git a/doc/snippets/BiCGSTAB_step_by_step.cpp b/doc/snippets/BiCGSTAB_step_by_step.cpp
new file mode 100644
index 000000000..06147bb81
--- /dev/null
+++ b/doc/snippets/BiCGSTAB_step_by_step.cpp
@@ -0,0 +1,14 @@
+  int n = 10000;
+  VectorXd x(n), b(n);
+  SparseMatrix<double> A(n,n);
+  /* ... fill A and b ... */ 
+  BiCGSTAB<SparseMatrix<double> > solver(A);
+  // start from a random solution
+  x = VectorXd::Random(n);
+  solver.setMaxIterations(1);
+  int i = 0;
+  do {
+    x = solver.solveWithGuess(b,x);
+    std::cout << i << " : " << solver.error() << std::endl;
+    ++i;
+  } while (solver.info()!=Success && i<100);
+\ No newline at end of file
diff --git a/doc/snippets/CMakeLists.txt b/doc/snippets/CMakeLists.txt
index 1135900cf..1baf32fba 100644
--- a/doc/snippets/CMakeLists.txt
+++ b/doc/snippets/CMakeLists.txt
@@ -24,5 +24,3 @@ foreach(snippet_src ${snippets_SRCS})
   set_source_files_properties(${CMAKE_CURRENT_BINARY_DIR}/${compile_snippet_src}
                               PROPERTIES OBJECT_DEPENDS ${snippet_src})
 endforeach(snippet_src)
-
-ei_add_target_property(compile_tut_arithmetic_transpose_aliasing COMPILE_FLAGS -DEIGEN_NO_DEBUG)
diff --git a/doc/snippets/Cwise_arg.cpp b/doc/snippets/Cwise_arg.cpp
new file mode 100644
index 000000000..3f45133b6
--- /dev/null
+++ b/doc/snippets/Cwise_arg.cpp
@@ -0,0 +1,3 @@
+ArrayXcf v = ArrayXcf::Random(3);
+cout << v << endl << endl;
+cout << arg(v) << endl;
diff --git a/doc/snippets/Cwise_array_power_array.cpp b/doc/snippets/Cwise_array_power_array.cpp
new file mode 100644
index 000000000..432a76ee5
--- /dev/null
+++ b/doc/snippets/Cwise_array_power_array.cpp
@@ -0,0 +1,4 @@
+Array<double,1,3> x(8,25,3),
+                  e(1./3.,0.5,2.);
+cout << "[" << x << "]^[" << e << "] = " << x.pow(e) << endl; // using ArrayBase::pow
+cout << "[" << x << "]^[" << e << "] = " << pow(x,e) << endl; // using Eigen::pow
diff --git a/doc/snippets/Cwise_atan.cpp b/doc/snippets/Cwise_atan.cpp
new file mode 100644
index 000000000..446844726
--- /dev/null
+++ b/doc/snippets/Cwise_atan.cpp
@@ -0,0 +1,2 @@
+ArrayXd v = ArrayXd::LinSpaced(5,0,1);
+cout << v.atan() << endl;
diff --git a/doc/snippets/Cwise_boolean_not.cpp b/doc/snippets/Cwise_boolean_not.cpp
new file mode 100644
index 000000000..40009f15a
--- /dev/null
+++ b/doc/snippets/Cwise_boolean_not.cpp
@@ -0,0 +1,5 @@
+Array3d v(1,2,3);
+v(1) *= 0.0/0.0;
+v(2) /= 0.0;
+cout << v << endl << endl;
+cout << !isfinite(v) << endl;
diff --git a/doc/snippets/Cwise_boolean_xor.cpp b/doc/snippets/Cwise_boolean_xor.cpp
new file mode 100644
index 000000000..fafbec806
--- /dev/null
+++ b/doc/snippets/Cwise_boolean_xor.cpp
@@ -0,0 +1,2 @@
+Array3d v(-1,2,1), w(-3,2,3);
+cout << ((v<w) ^ (v<0)) << endl;
diff --git a/doc/snippets/Cwise_ceil.cpp b/doc/snippets/Cwise_ceil.cpp
new file mode 100644
index 000000000..76cf661f4
--- /dev/null
+++ b/doc/snippets/Cwise_ceil.cpp
@@ -0,0 +1,3 @@
+ArrayXd v = ArrayXd::LinSpaced(7,-2,2);
+cout << v << endl << endl;
+cout << ceil(v) << endl;
diff --git a/doc/snippets/Cwise_cosh.cpp b/doc/snippets/Cwise_cosh.cpp
new file mode 100644
index 000000000..80ee75da5
--- /dev/null
+++ b/doc/snippets/Cwise_cosh.cpp
@@ -0,0 +1,2 @@
+ArrayXd v = ArrayXd::LinSpaced(5,0,1);
+cout << cosh(v) << endl;
diff --git a/doc/snippets/Cwise_floor.cpp b/doc/snippets/Cwise_floor.cpp
new file mode 100644
index 000000000..73756b41c
--- /dev/null
+++ b/doc/snippets/Cwise_floor.cpp
@@ -0,0 +1,3 @@
+ArrayXd v = ArrayXd::LinSpaced(7,-2,2);
+cout << v << endl << endl;
+cout << floor(v) << endl;
diff --git a/doc/snippets/Cwise_isFinite.cpp b/doc/snippets/Cwise_isFinite.cpp
new file mode 100644
index 000000000..1da55fd16
--- /dev/null
+++ b/doc/snippets/Cwise_isFinite.cpp
@@ -0,0 +1,5 @@
+Array3d v(1,2,3);
+v(1) *= 0.0/0.0;
+v(2) /= 0.0;
+cout << v << endl << endl;
+cout << isfinite(v) << endl;
diff --git a/doc/snippets/Cwise_isInf.cpp b/doc/snippets/Cwise_isInf.cpp
new file mode 100644
index 000000000..be793081c
--- /dev/null
+++ b/doc/snippets/Cwise_isInf.cpp
@@ -0,0 +1,5 @@
+Array3d v(1,2,3);
+v(1) *= 0.0/0.0;
+v(2) /= 0.0;
+cout << v << endl << endl;
+cout << isinf(v) << endl;
diff --git a/doc/snippets/Cwise_isNaN.cpp b/doc/snippets/Cwise_isNaN.cpp
new file mode 100644
index 000000000..7b2a93082
--- /dev/null
+++ b/doc/snippets/Cwise_isNaN.cpp
@@ -0,0 +1,5 @@
+Array3d v(1,2,3);
+v(1) *= 0.0/0.0;
+v(2) /= 0.0;
+cout << v << endl << endl;
+cout << isnan(v) << endl;
diff --git a/doc/snippets/Cwise_log10.cpp b/doc/snippets/Cwise_log10.cpp
new file mode 100644
index 000000000..b7ae4a834
--- /dev/null
+++ b/doc/snippets/Cwise_log10.cpp
@@ -0,0 +1,2 @@
+Array4d v(-1,0,1,2);
+cout << log10(v) << endl;
diff --git a/doc/snippets/Cwise_round.cpp b/doc/snippets/Cwise_round.cpp
new file mode 100644
index 000000000..e5c88230b
--- /dev/null
+++ b/doc/snippets/Cwise_round.cpp
@@ -0,0 +1,3 @@
+ArrayXd v = ArrayXd::LinSpaced(7,-2,2);
+cout << v << endl << endl;
+cout << round(v) << endl;
diff --git a/doc/snippets/Cwise_scalar_power_array.cpp b/doc/snippets/Cwise_scalar_power_array.cpp
new file mode 100644
index 000000000..c968b2c84
--- /dev/null
+++ b/doc/snippets/Cwise_scalar_power_array.cpp
@@ -0,0 +1,2 @@
+Array<double,1,3> e(2,-3,1./3.);
+cout << "10^[" << e << "] = " << pow(10,e) << endl;
diff --git a/doc/snippets/Cwise_sign.cpp b/doc/snippets/Cwise_sign.cpp
new file mode 100644
index 000000000..49920e4f1
--- /dev/null
+++ b/doc/snippets/Cwise_sign.cpp
@@ -0,0 +1,2 @@
+Array3d v(-3,5,0);
+cout << v.sign() << endl;
diff --git a/doc/snippets/Cwise_sinh.cpp b/doc/snippets/Cwise_sinh.cpp
new file mode 100644
index 000000000..fac9b19a8
--- /dev/null
+++ b/doc/snippets/Cwise_sinh.cpp
@@ -0,0 +1,2 @@
+ArrayXd v = ArrayXd::LinSpaced(5,0,1);
+cout << sinh(v) << endl;
diff --git a/doc/snippets/Cwise_tanh.cpp b/doc/snippets/Cwise_tanh.cpp
new file mode 100644
index 000000000..30cd0450d
--- /dev/null
+++ b/doc/snippets/Cwise_tanh.cpp
@@ -0,0 +1,2 @@
+ArrayXd v = ArrayXd::LinSpaced(5,0,1);
+cout << tanh(v) << endl;
diff --git a/doc/snippets/DenseBase_LinSpacedInt.cpp b/doc/snippets/DenseBase_LinSpacedInt.cpp
new file mode 100644
index 000000000..0d7ae068e
--- /dev/null
+++ b/doc/snippets/DenseBase_LinSpacedInt.cpp
@@ -0,0 +1,8 @@
+cout << "Even spacing inputs:" << endl;
+cout << VectorXi::LinSpaced(8,1,4).transpose() << endl;
+cout << VectorXi::LinSpaced(8,1,8).transpose() << endl;
+cout << VectorXi::LinSpaced(8,1,15).transpose() << endl;
+cout << "Uneven spacing inputs:" << endl;
+cout << VectorXi::LinSpaced(8,1,7).transpose() << endl;
+cout << VectorXi::LinSpaced(8,1,9).transpose() << endl;
+cout << VectorXi::LinSpaced(8,1,16).transpose() << endl;
diff --git a/doc/snippets/DirectionWise_hnormalized.cpp b/doc/snippets/DirectionWise_hnormalized.cpp
new file mode 100644
index 000000000..3410790a8
--- /dev/null
+++ b/doc/snippets/DirectionWise_hnormalized.cpp
@@ -0,0 +1,7 @@
+typedef Matrix<double,4,Dynamic> Matrix4Xd;
+Matrix4Xd M = Matrix4Xd::Random(4,5);
+Projective3d P(Matrix4d::Random());
+cout << "The matrix M is:" << endl << M << endl << endl;
+cout << "M.colwise().hnormalized():" << endl << M.colwise().hnormalized() << endl << endl;
+cout << "P*M:" << endl << P*M << endl << endl;
+cout << "(P*M).colwise().hnormalized():" << endl << (P*M).colwise().hnormalized() << endl << endl;
+\ No newline at end of file
diff --git a/doc/snippets/EigenSolver_eigenvectors.cpp b/doc/snippets/EigenSolver_eigenvectors.cpp
index 0fad4dadb..8355f76c9 100644
--- a/doc/snippets/EigenSolver_eigenvectors.cpp
+++ b/doc/snippets/EigenSolver_eigenvectors.cpp
@@ -1,4 +1,4 @@
 MatrixXd ones = MatrixXd::Ones(3,3);
 EigenSolver<MatrixXd> es(ones);
-cout << "The first eigenvector of the 3x3 matrix of ones is:" 
-     << endl << es.eigenvectors().col(1) << endl;
+cout << "The first eigenvector of the 3x3 matrix of ones is:"
+     << endl << es.eigenvectors().col(0) << endl;
diff --git a/doc/snippets/LeastSquaresNormalEquations.cpp b/doc/snippets/LeastSquaresNormalEquations.cpp
new file mode 100644
index 000000000..997cf1715
--- /dev/null
+++ b/doc/snippets/LeastSquaresNormalEquations.cpp
@@ -0,0 +1,4 @@
+MatrixXf A = MatrixXf::Random(3, 2);
+VectorXf b = VectorXf::Random(3);
+cout << "The solution using normal equations is:\n"
+     << (A.transpose() * A).ldlt().solve(A.transpose() * b) << endl;
diff --git a/doc/snippets/LeastSquaresQR.cpp b/doc/snippets/LeastSquaresQR.cpp
new file mode 100644
index 000000000..6c9704547
--- /dev/null
+++ b/doc/snippets/LeastSquaresQR.cpp
@@ -0,0 +1,4 @@
+MatrixXf A = MatrixXf::Random(3, 2);
+VectorXf b = VectorXf::Random(3);
+cout << "The solution using the QR decomposition is:\n"
+     << A.colPivHouseholderQr().solve(b) << endl;
diff --git a/doc/snippets/MatrixBase_cwiseSign.cpp b/doc/snippets/MatrixBase_cwiseSign.cpp
new file mode 100644
index 000000000..efd717955
--- /dev/null
+++ b/doc/snippets/MatrixBase_cwiseSign.cpp
@@ -0,0 +1,4 @@
+MatrixXd m(2,3);
+m <<  2, -4, 6,
+     -5,  1, 0;
+cout << m.cwiseSign() << endl;
diff --git a/doc/snippets/MatrixBase_hnormalized.cpp b/doc/snippets/MatrixBase_hnormalized.cpp
new file mode 100644
index 000000000..652cd77c0
--- /dev/null
+++ b/doc/snippets/MatrixBase_hnormalized.cpp
@@ -0,0 +1,6 @@
+Vector4d v = Vector4d::Random();
+Projective3d P(Matrix4d::Random());
+cout << "v                   = " << v.transpose() << "]^T" << endl;
+cout << "v.hnormalized()     = " << v.hnormalized().transpose() << "]^T" << endl;
+cout << "P*v                 = " << (P*v).transpose() << "]^T" << endl;
+cout << "(P*v).hnormalized() = " << (P*v).hnormalized().transpose() << "]^T" << endl;
+\ No newline at end of file
diff --git a/doc/snippets/MatrixBase_homogeneous.cpp b/doc/snippets/MatrixBase_homogeneous.cpp
new file mode 100644
index 000000000..457c28f91
--- /dev/null
+++ b/doc/snippets/MatrixBase_homogeneous.cpp
@@ -0,0 +1,6 @@
+Vector3d v = Vector3d::Random(), w;
+Projective3d P(Matrix4d::Random());
+cout << "v                                   = [" << v.transpose() << "]^T" << endl;
+cout << "h.homogeneous()                     = [" << v.homogeneous().transpose() << "]^T" << endl;
+cout << "(P * v.homogeneous())               = [" << (P * v.homogeneous()).transpose() << "]^T" << endl;
+cout << "(P * v.homogeneous()).hnormalized() = [" << (P * v.homogeneous()).eval().hnormalized().transpose() << "]^T" << endl;
+\ No newline at end of file
diff --git a/doc/snippets/MatrixBase_marked.cpp b/doc/snippets/MatrixBase_marked.cpp
deleted file mode 100644
index f60712178..000000000
--- a/doc/snippets/MatrixBase_marked.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef _MSC_VER
-  #warning deprecated
-#endif
-/*
-Matrix3d m = Matrix3d::Zero();
-m.part<Eigen::UpperTriangular>().setOnes();
-cout << "Here is the matrix m:" << endl << m << endl;
-Matrix3d n = Matrix3d::Ones();
-n.part<Eigen::LowerTriangular>() *= 2;
-cout << "Here is the matrix n:" << endl << n << endl;
-cout << "And now here is m.inverse()*n, taking advantage of the fact that"
-        " m is upper-triangular:" << endl
-     << m.marked<Eigen::UpperTriangular>().solveTriangular(n);
-*/
-\ No newline at end of file
diff --git a/doc/snippets/MatrixBase_part.cpp b/doc/snippets/MatrixBase_part.cpp
deleted file mode 100644
index d3e7f482e..000000000
--- a/doc/snippets/MatrixBase_part.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef _MSC_VER
-  #warning deprecated
-#endif
-/*
-Matrix3d m = Matrix3d::Zero();
-m.part<Eigen::StrictlyUpperTriangular>().setOnes();
-cout << "Here is the matrix m:" << endl << m << endl;
-cout << "And let us now compute m*m.adjoint() in a very optimized way" << endl
-     << "taking advantage of the symmetry." << endl;
-Matrix3d n;
-n.part<Eigen::SelfAdjoint>() = (m*m.adjoint()).lazy();
-cout << "The result is:" << endl << n << endl;
-*/
-\ No newline at end of file
diff --git a/doc/snippets/MatrixBase_selfadjointView.cpp b/doc/snippets/MatrixBase_selfadjointView.cpp
new file mode 100644
index 000000000..4bd3c7eeb
--- /dev/null
+++ b/doc/snippets/MatrixBase_selfadjointView.cpp
@@ -0,0 +1,6 @@
+Matrix3i m = Matrix3i::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here is the symmetric matrix extracted from the upper part of m:" << endl
+     << Matrix3i(m.selfadjointView<Upper>()) << endl;
+cout << "Here is the symmetric matrix extracted from the lower part of m:" << endl
+     << Matrix3i(m.selfadjointView<Lower>()) << endl;
diff --git a/doc/snippets/MatrixBase_extract.cpp b/doc/snippets/MatrixBase_triangularView.cpp
index c96220f72..03aa303f0 100644
--- a/doc/snippets/MatrixBase_extract.cpp
+++ b/doc/snippets/MatrixBase_triangularView.cpp
@@ -1,13 +1,9 @@
-#ifndef _MSC_VER
-  #warning deprecated
-#endif
-/* deprecated
 Matrix3i m = Matrix3i::Random();
 cout << "Here is the matrix m:" << endl << m << endl;
 cout << "Here is the upper-triangular matrix extracted from m:" << endl
-     << m.part<Eigen::UpperTriangular>() << endl;
+     << Matrix3i(m.triangularView<Eigen::Upper>()) << endl;
 cout << "Here is the strictly-upper-triangular matrix extracted from m:" << endl
-     << m.part<Eigen::StrictlyUpperTriangular>() << endl;
+     << Matrix3i(m.triangularView<Eigen::StrictlyUpper>()) << endl;
 cout << "Here is the unit-lower-triangular matrix extracted from m:" << endl
-     << m.part<Eigen::UnitLowerTriangular>() << endl;
-*/
-\ No newline at end of file
+     << Matrix3i(m.triangularView<Eigen::UnitLower>()) << endl;
+// FIXME need to implement output for triangularViews (Bug 885)
diff --git a/doc/snippets/PartialRedux_count.cpp b/doc/snippets/PartialRedux_count.cpp
index c7b3097e4..1c3b3a28f 100644
--- a/doc/snippets/PartialRedux_count.cpp
+++ b/doc/snippets/PartialRedux_count.cpp
@@ -1,3 +1,5 @@
 Matrix3d m = Matrix3d::Random();
 cout << "Here is the matrix m:" << endl << m << endl;
-cout << "Here is the count of elements larger or equal than 0.5 of each row:" << endl << (m.array() >= 0.5).rowwise().count() << endl;
+Matrix<ptrdiff_t, 3, 1> res = (m.array() >= 0.5).rowwise().count();
+cout << "Here is the count of elements larger or equal than 0.5 of each row:" << endl;
+cout << res << endl;
diff --git a/doc/snippets/SparseMatrix_coeffs.cpp b/doc/snippets/SparseMatrix_coeffs.cpp
new file mode 100644
index 000000000..f71a69b07
--- /dev/null
+++ b/doc/snippets/SparseMatrix_coeffs.cpp
@@ -0,0 +1,9 @@
+SparseMatrix<double> A(3,3);
+A.insert(1,2) = 0;
+A.insert(0,1) = 1;
+A.insert(2,0) = 2;
+A.makeCompressed();
+cout << "The matrix A is:" << endl << MatrixXd(A) << endl;
+cout << "it has " << A.nonZeros() << " stored non zero coefficients that are: " << A.coeffs().transpose() << endl;
+A.coeffs() += 10;
+cout << "After adding 10 to every stored non zero coefficient, the matrix A is:" << endl << MatrixXd(A) << endl;
diff --git a/doc/snippets/TopicAliasing_mult4.cpp b/doc/snippets/TopicAliasing_mult4.cpp
new file mode 100644
index 000000000..8a8992f6c
--- /dev/null
+++ b/doc/snippets/TopicAliasing_mult4.cpp
@@ -0,0 +1,5 @@
+MatrixXf A(2,2), B(3,2);
+B << 2, 0,  0, 3, 1, 1;
+A << 2, 0, 0, -2;
+A = (B * A).cwiseAbs();
+cout << A;
+\ No newline at end of file
diff --git a/doc/snippets/TopicAliasing_mult5.cpp b/doc/snippets/TopicAliasing_mult5.cpp
new file mode 100644
index 000000000..1a36defde
--- /dev/null
+++ b/doc/snippets/TopicAliasing_mult5.cpp
@@ -0,0 +1,5 @@
+MatrixXf A(2,2), B(3,2);
+B << 2, 0,  0, 3, 1, 1;
+A << 2, 0, 0, -2;
+A = (B * A).eval().cwiseAbs();
+cout << A;
diff --git a/doc/snippets/Triangular_solve.cpp b/doc/snippets/Triangular_solve.cpp
new file mode 100644
index 000000000..548442467
--- /dev/null
+++ b/doc/snippets/Triangular_solve.cpp
@@ -0,0 +1,11 @@
+Matrix3d m = Matrix3d::Zero();
+m.triangularView<Eigen::Upper>().setOnes();
+cout << "Here is the matrix m:\n" << m << endl;
+Matrix3d n = Matrix3d::Ones();
+n.triangularView<Eigen::Lower>() *= 2;
+cout << "Here is the matrix n:\n" << n << endl;
+cout << "And now here is m.inverse()*n, taking advantage of the fact that"
+        " m is upper-triangular:\n"
+     << m.triangularView<Eigen::Upper>().solve(n) << endl;
+cout << "And this is n*m.inverse():\n"
+     << m.triangularView<Eigen::Upper>().solve<Eigen::OnTheRight>(n);
diff --git a/doc/snippets/Tutorial_AdvancedInitialization_Join.cpp b/doc/snippets/Tutorial_AdvancedInitialization_Join.cpp
index 84e8715cb..55a21539d 100644
--- a/doc/snippets/Tutorial_AdvancedInitialization_Join.cpp
+++ b/doc/snippets/Tutorial_AdvancedInitialization_Join.cpp
@@ -3,7 +3,7 @@ vec1 << 1, 2, 3;
 std::cout << "vec1 = " << vec1 << std::endl;
 
 RowVectorXd vec2(4);
-vec2 << 1, 4, 9, 16;;
+vec2 << 1, 4, 9, 16;
 std::cout << "vec2 = " << vec2 << std::endl;
 
 RowVectorXd joined(7);
diff --git a/doc/snippets/Tutorial_ReshapeMat2Mat.cpp b/doc/snippets/Tutorial_ReshapeMat2Mat.cpp
new file mode 100644
index 000000000..f84d6e76d
--- /dev/null
+++ b/doc/snippets/Tutorial_ReshapeMat2Mat.cpp
@@ -0,0 +1,6 @@
+MatrixXf M1(2,6);    // Column-major storage
+M1 << 1, 2, 3,  4,  5,  6,
+      7, 8, 9, 10, 11, 12;
+
+Map<MatrixXf> M2(M1.data(), 6,2);
+cout << "M2:" << endl << M2 << endl;
+\ No newline at end of file
diff --git a/doc/snippets/Tutorial_ReshapeMat2Vec.cpp b/doc/snippets/Tutorial_ReshapeMat2Vec.cpp
new file mode 100644
index 000000000..95bd4e0e6
--- /dev/null
+++ b/doc/snippets/Tutorial_ReshapeMat2Vec.cpp
@@ -0,0 +1,11 @@
+MatrixXf M1(3,3);    // Column-major storage
+M1 << 1, 2, 3,
+      4, 5, 6,
+      7, 8, 9;
+
+Map<RowVectorXf> v1(M1.data(), M1.size());
+cout << "v1:" << endl << v1 << endl;
+
+Matrix<float,Dynamic,Dynamic,RowMajor> M2(M1);
+Map<RowVectorXf> v2(M2.data(), M2.size());
+cout << "v2:" << endl << v2 << endl;
+\ No newline at end of file
diff --git a/doc/snippets/Tutorial_SlicingCol.cpp b/doc/snippets/Tutorial_SlicingCol.cpp
new file mode 100644
index 000000000..f667ff689
--- /dev/null
+++ b/doc/snippets/Tutorial_SlicingCol.cpp
@@ -0,0 +1,11 @@
+MatrixXf M1 = MatrixXf::Random(3,8);
+cout << "Column major input:" << endl << M1 << "\n";
+Map<MatrixXf,0,OuterStride<> > M2(M1.data(), M1.rows(), (M1.cols()+2)/3, OuterStride<>(M1.outerStride()*3));
+cout << "1 column over 3:" << endl << M2 << "\n";
+
+typedef Matrix<float,Dynamic,Dynamic,RowMajor> RowMajorMatrixXf;
+RowMajorMatrixXf M3(M1);
+cout << "Row major input:" << endl << M3 << "\n";
+Map<RowMajorMatrixXf,0,Stride<Dynamic,3> > M4(M3.data(), M3.rows(), (M3.cols()+2)/3,
+                                              Stride<Dynamic,3>(M3.outerStride(),3));
+cout << "1 column over 3:" << endl << M4 << "\n";
+\ No newline at end of file
diff --git a/doc/snippets/Tutorial_SlicingVec.cpp b/doc/snippets/Tutorial_SlicingVec.cpp
new file mode 100644
index 000000000..07e10bf69
--- /dev/null
+++ b/doc/snippets/Tutorial_SlicingVec.cpp
@@ -0,0 +1,4 @@
+RowVectorXf v = RowVectorXf::LinSpaced(20,0,19);
+cout << "Input:" << endl << v << endl;
+Map<RowVectorXf,0,InnerStride<2> > v2(v.data(), v.size()/2);
+cout << "Even:" << v2 << endl;
+\ No newline at end of file
diff --git a/doc/snippets/VectorwiseOp_homogeneous.cpp b/doc/snippets/VectorwiseOp_homogeneous.cpp
new file mode 100644
index 000000000..aba4fed0e
--- /dev/null
+++ b/doc/snippets/VectorwiseOp_homogeneous.cpp
@@ -0,0 +1,7 @@
+typedef Matrix<double,3,Dynamic> Matrix3Xd;
+Matrix3Xd M = Matrix3Xd::Random(3,5);
+Projective3d P(Matrix4d::Random());
+cout << "The matrix M is:" << endl << M << endl << endl;
+cout << "M.colwise().homogeneous():" << endl << M.colwise().homogeneous() << endl << endl;
+cout << "P * M.colwise().homogeneous():" << endl << P * M.colwise().homogeneous() << endl << endl;
+cout << "P * M.colwise().homogeneous().hnormalized(): " << endl << (P * M.colwise().homogeneous()).colwise().hnormalized() << endl << endl;
+\ No newline at end of file
diff --git a/doc/snippets/compile_snippet.cpp.in b/doc/snippets/compile_snippet.cpp.in
index 894cd526c..d63f371a3 100644
--- a/doc/snippets/compile_snippet.cpp.in
+++ b/doc/snippets/compile_snippet.cpp.in
@@ -1,5 +1,13 @@
-#include <Eigen/Dense>
+static bool eigen_did_assert = false;
+#define eigen_assert(X) if(!eigen_did_assert && !(X)){ std::cout << "### Assertion raised in " << __FILE__ << ":" << __LINE__ << ":\n" #X << "\n### The following would happen without assertions:\n"; eigen_did_assert = true;}
+
 #include <iostream>
+#include <Eigen/Eigen>
+
+#ifndef M_PI
+#define M_PI 3.1415926535897932384626433832795
+#endif
+
 
 using namespace Eigen;
 using namespace std;
diff --git a/doc/special_examples/CMakeLists.txt b/doc/special_examples/CMakeLists.txt
index 3ab75dbfe..101fbc5f9 100644
--- a/doc/special_examples/CMakeLists.txt
+++ b/doc/special_examples/CMakeLists.txt
@@ -19,3 +19,17 @@ if(QT4_FOUND)
   add_dependencies(all_examples Tutorial_sparse_example)
 endif(QT4_FOUND)
 
+check_cxx_compiler_flag("-std=c++11" EIGEN_COMPILER_SUPPORT_CPP11)
+if(EIGEN_COMPILER_SUPPORT_CPP11)
+  add_executable(random_cpp11 random_cpp11.cpp)
+  target_link_libraries(random_cpp11 ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO})
+  add_dependencies(all_examples random_cpp11)
+  ei_add_target_property(random_cpp11 COMPILE_FLAGS "-std=c++11")
+
+  add_custom_command(
+    TARGET random_cpp11
+    POST_BUILD
+    COMMAND random_cpp11
+    ARGS >${CMAKE_CURRENT_BINARY_DIR}/random_cpp11.out
+  )
+endif()
diff --git a/doc/special_examples/Tutorial_sparse_example.cpp b/doc/special_examples/Tutorial_sparse_example.cpp
index 002f19f01..830e196ea 100644
--- a/doc/special_examples/Tutorial_sparse_example.cpp
+++ b/doc/special_examples/Tutorial_sparse_example.cpp
@@ -9,6 +9,8 @@ void saveAsBitmap(const Eigen::VectorXd& x, int n, const char* filename);
 
 int main(int argc, char** argv)
 {
+  assert(argc==2);
+  
   int n = 300;  // size of the image
   int m = n*n;  // number of unknows (=number of pixels)
 
diff --git a/doc/special_examples/Tutorial_sparse_example_details.cpp b/doc/special_examples/Tutorial_sparse_example_details.cpp
index 7d820b44a..bc18b0188 100644
--- a/doc/special_examples/Tutorial_sparse_example_details.cpp
+++ b/doc/special_examples/Tutorial_sparse_example_details.cpp
@@ -8,7 +8,7 @@ typedef Eigen::Triplet<double> T;
 void insertCoefficient(int id, int i, int j, double w, std::vector<T>& coeffs,
                        Eigen::VectorXd& b, const Eigen::VectorXd& boundary)
 {
-  int n = boundary.size();
+  int n = int(boundary.size());
   int id1 = i+j*n;
 
         if(i==-1 || i==n) b(id) -= w * boundary(j); // constrained coefficient
diff --git a/doc/special_examples/random_cpp11.cpp b/doc/special_examples/random_cpp11.cpp
new file mode 100644
index 000000000..33744c051
--- /dev/null
+++ b/doc/special_examples/random_cpp11.cpp
@@ -0,0 +1,14 @@
+#include <Eigen/Core>
+#include <iostream>
+#include <random>
+
+using namespace Eigen;
+
+int main() {
+  std::default_random_engine generator;
+  std::poisson_distribution<int> distribution(4.1);
+  auto poisson = [&] () {return distribution(generator);};
+
+  RowVectorXi v = RowVectorXi::NullaryExpr(10, poisson );
+  std::cout << v << "\n";
+}
diff --git a/eigen3.pc.in b/eigen3.pc.in
index c5855de33..3368a3aa1 100644
--- a/eigen3.pc.in
+++ b/eigen3.pc.in
@@ -1,6 +1,9 @@
+prefix=@CMAKE_INSTALL_PREFIX@
+exec_prefix=${prefix}
+
 Name: Eigen3
 Description: A C++ template library for linear algebra: vectors, matrices, and related algorithms
 Requires:
-Version: ${EIGEN_VERSION_NUMBER}
+Version: @EIGEN_VERSION_NUMBER@
 Libs:
-Cflags: -I${INCLUDE_INSTALL_DIR}
+Cflags: -I${prefix}/@INCLUDE_INSTALL_DIR@
diff --git a/failtest/CMakeLists.txt b/failtest/CMakeLists.txt
index cadc6a255..1a73f05e6 100644
--- a/failtest/CMakeLists.txt
+++ b/failtest/CMakeLists.txt
@@ -7,6 +7,9 @@ ei_add_failtest("block_nonconst_ctor_on_const_xpr_1")
 ei_add_failtest("block_nonconst_ctor_on_const_xpr_2")
 ei_add_failtest("transpose_nonconst_ctor_on_const_xpr")
 ei_add_failtest("diagonal_nonconst_ctor_on_const_xpr")
+ei_add_failtest("cwiseunaryview_nonconst_ctor_on_const_xpr")
+ei_add_failtest("triangularview_nonconst_ctor_on_const_xpr")
+ei_add_failtest("selfadjointview_nonconst_ctor_on_const_xpr")
 
 ei_add_failtest("const_qualified_block_method_retval_0")
 ei_add_failtest("const_qualified_block_method_retval_1")
@@ -25,6 +28,9 @@ ei_add_failtest("block_on_const_type_actually_const_0")
 ei_add_failtest("block_on_const_type_actually_const_1")
 ei_add_failtest("transpose_on_const_type_actually_const")
 ei_add_failtest("diagonal_on_const_type_actually_const")
+ei_add_failtest("cwiseunaryview_on_const_type_actually_const")
+ei_add_failtest("triangularview_on_const_type_actually_const")
+ei_add_failtest("selfadjointview_on_const_type_actually_const")
 
 ei_add_failtest("ref_1")
 ei_add_failtest("ref_2")
@@ -32,6 +38,20 @@ ei_add_failtest("ref_3")
 ei_add_failtest("ref_4")
 ei_add_failtest("ref_5")
 
+ei_add_failtest("swap_1")
+ei_add_failtest("swap_2")
+
+ei_add_failtest("ternary_1")
+ei_add_failtest("ternary_2")
+
+ei_add_failtest("sparse_ref_1")
+ei_add_failtest("sparse_ref_2")
+ei_add_failtest("sparse_ref_3")
+ei_add_failtest("sparse_ref_4")
+ei_add_failtest("sparse_ref_5")
+
+ei_add_failtest("sparse_storage_mismatch")
+
 ei_add_failtest("partialpivlu_int")
 ei_add_failtest("fullpivlu_int")
 ei_add_failtest("llt_int")
@@ -40,6 +60,7 @@ ei_add_failtest("qr_int")
 ei_add_failtest("colpivqr_int")
 ei_add_failtest("fullpivqr_int")
 ei_add_failtest("jacobisvd_int")
+ei_add_failtest("bdcsvd_int")
 ei_add_failtest("eigensolver_int")
 ei_add_failtest("eigensolver_cplx")
 
diff --git a/failtest/bdcsvd_int.cpp b/failtest/bdcsvd_int.cpp
new file mode 100644
index 000000000..670752cf5
--- /dev/null
+++ b/failtest/bdcsvd_int.cpp
@@ -0,0 +1,14 @@
+#include "../Eigen/SVD"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define SCALAR int
+#else
+#define SCALAR float
+#endif
+
+using namespace Eigen;
+
+int main()
+{
+  BDCSVD<Matrix<SCALAR,Dynamic,Dynamic> > qr(Matrix<SCALAR,Dynamic,Dynamic>::Random(10,10));
+}
diff --git a/failtest/cwiseunaryview_nonconst_ctor_on_const_xpr.cpp b/failtest/cwiseunaryview_nonconst_ctor_on_const_xpr.cpp
new file mode 100644
index 000000000..e23cf8fd8
--- /dev/null
+++ b/failtest/cwiseunaryview_nonconst_ctor_on_const_xpr.cpp
@@ -0,0 +1,15 @@
+#include "../Eigen/Core"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define CV_QUALIFIER const
+#else
+#define CV_QUALIFIER
+#endif
+
+using namespace Eigen;
+
+void foo(CV_QUALIFIER Matrix3d &m){
+    CwiseUnaryView<internal::scalar_real_ref_op<double>,Matrix3d> t(m);
+}
+
+int main() {}
diff --git a/failtest/cwiseunaryview_on_const_type_actually_const.cpp b/failtest/cwiseunaryview_on_const_type_actually_const.cpp
new file mode 100644
index 000000000..fcd41dfdb
--- /dev/null
+++ b/failtest/cwiseunaryview_on_const_type_actually_const.cpp
@@ -0,0 +1,16 @@
+#include "../Eigen/Core"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define CV_QUALIFIER const
+#else
+#define CV_QUALIFIER
+#endif
+
+using namespace Eigen;
+
+void foo(){
+    MatrixXf m;
+    CwiseUnaryView<internal::scalar_real_ref_op<double>,CV_QUALIFIER MatrixXf>(m).coeffRef(0, 0) = 1.0f;
+}
+
+int main() {}
diff --git a/failtest/selfadjointview_nonconst_ctor_on_const_xpr.cpp b/failtest/selfadjointview_nonconst_ctor_on_const_xpr.cpp
new file mode 100644
index 000000000..a240f8184
--- /dev/null
+++ b/failtest/selfadjointview_nonconst_ctor_on_const_xpr.cpp
@@ -0,0 +1,15 @@
+#include "../Eigen/Core"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define CV_QUALIFIER const
+#else
+#define CV_QUALIFIER
+#endif
+
+using namespace Eigen;
+
+void foo(CV_QUALIFIER Matrix3d &m){
+    SelfAdjointView<Matrix3d,Upper> t(m);
+}
+
+int main() {}
diff --git a/failtest/selfadjointview_on_const_type_actually_const.cpp b/failtest/selfadjointview_on_const_type_actually_const.cpp
new file mode 100644
index 000000000..19aaad6d0
--- /dev/null
+++ b/failtest/selfadjointview_on_const_type_actually_const.cpp
@@ -0,0 +1,16 @@
+#include "../Eigen/Core"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define CV_QUALIFIER const
+#else
+#define CV_QUALIFIER
+#endif
+
+using namespace Eigen;
+
+void foo(){
+    MatrixXf m;
+    SelfAdjointView<CV_QUALIFIER MatrixXf,Upper>(m).coeffRef(0, 0) = 1.0f;
+}
+
+int main() {}
diff --git a/failtest/sparse_ref_1.cpp b/failtest/sparse_ref_1.cpp
new file mode 100644
index 000000000..d78d1f9b1
--- /dev/null
+++ b/failtest/sparse_ref_1.cpp
@@ -0,0 +1,18 @@
+#include "../Eigen/Sparse"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define CV_QUALIFIER const
+#else
+#define CV_QUALIFIER
+#endif
+
+using namespace Eigen;
+
+void call_ref(Ref<SparseMatrix<float> > a) { }
+
+int main()
+{
+  SparseMatrix<float> a(10,10);
+  CV_QUALIFIER SparseMatrix<float>& ac(a);
+  call_ref(ac);
+}
diff --git a/failtest/sparse_ref_2.cpp b/failtest/sparse_ref_2.cpp
new file mode 100644
index 000000000..46c9440c2
--- /dev/null
+++ b/failtest/sparse_ref_2.cpp
@@ -0,0 +1,15 @@
+#include "../Eigen/Sparse"
+
+using namespace Eigen;
+
+void call_ref(Ref<SparseMatrix<float> > a) { }
+
+int main()
+{
+  SparseMatrix<float> A(10,10);
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+  call_ref(A.row(3));
+#else
+  call_ref(A.col(3));
+#endif
+}
diff --git a/failtest/sparse_ref_3.cpp b/failtest/sparse_ref_3.cpp
new file mode 100644
index 000000000..a9949b552
--- /dev/null
+++ b/failtest/sparse_ref_3.cpp
@@ -0,0 +1,15 @@
+#include "../Eigen/Sparse"
+
+using namespace Eigen;
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+void call_ref(Ref<SparseMatrix<float> > a) { }
+#else
+void call_ref(const Ref<const SparseMatrix<float> > &a) { }
+#endif
+
+int main()
+{
+  SparseMatrix<float> a(10,10);
+  call_ref(a+a);
+}
diff --git a/failtest/sparse_ref_4.cpp b/failtest/sparse_ref_4.cpp
new file mode 100644
index 000000000..57bb6a1fc
--- /dev/null
+++ b/failtest/sparse_ref_4.cpp
@@ -0,0 +1,15 @@
+#include "../Eigen/Sparse"
+
+using namespace Eigen;
+
+void call_ref(Ref<SparseMatrix<float> > a) {}
+
+int main()
+{
+  SparseMatrix<float> A(10,10);
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+  call_ref(A.transpose());
+#else
+  call_ref(A);
+#endif
+}
diff --git a/failtest/sparse_ref_5.cpp b/failtest/sparse_ref_5.cpp
new file mode 100644
index 000000000..4478f6f2f
--- /dev/null
+++ b/failtest/sparse_ref_5.cpp
@@ -0,0 +1,16 @@
+#include "../Eigen/Sparse"
+
+using namespace Eigen;
+
+void call_ref(Ref<SparseMatrix<float> > a) { }
+
+int main()
+{
+  SparseMatrix<float> a(10,10);
+  SparseMatrixBase<SparseMatrix<float> > &ac(a);
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+  call_ref(ac);
+#else
+  call_ref(ac.derived());
+#endif
+}
diff --git a/failtest/sparse_storage_mismatch.cpp b/failtest/sparse_storage_mismatch.cpp
new file mode 100644
index 000000000..51840d416
--- /dev/null
+++ b/failtest/sparse_storage_mismatch.cpp
@@ -0,0 +1,16 @@
+#include "../Eigen/Sparse"
+using namespace Eigen;
+
+typedef SparseMatrix<double,ColMajor> Mat1;
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+typedef SparseMatrix<double,RowMajor> Mat2;
+#else
+typedef SparseMatrix<double,ColMajor> Mat2;
+#endif
+
+int main()
+{
+  Mat1 a(10,10);
+  Mat2 b(10,10);
+  a += b;
+}
diff --git a/failtest/swap_1.cpp b/failtest/swap_1.cpp
new file mode 100644
index 000000000..106379720
--- /dev/null
+++ b/failtest/swap_1.cpp
@@ -0,0 +1,14 @@
+#include "../Eigen/Core"
+
+using namespace Eigen;
+
+int main()
+{
+  VectorXf a(10), b(10);
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+  const DenseBase<VectorXf> &ac(a);
+#else
+  DenseBase<VectorXf> &ac(a);
+#endif
+  b.swap(ac);
+}
diff --git a/failtest/swap_2.cpp b/failtest/swap_2.cpp
new file mode 100644
index 000000000..c130ba6e4
--- /dev/null
+++ b/failtest/swap_2.cpp
@@ -0,0 +1,14 @@
+#include "../Eigen/Core"
+
+using namespace Eigen;
+
+int main()
+{
+  VectorXf a(10), b(10);
+  VectorXf const &ac(a);
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+  b.swap(ac);
+#else
+  b.swap(ac.const_cast_derived());
+#endif
+}
+\ No newline at end of file
diff --git a/failtest/ternary_1.cpp b/failtest/ternary_1.cpp
new file mode 100644
index 000000000..b40bcb0cc
--- /dev/null
+++ b/failtest/ternary_1.cpp
@@ -0,0 +1,13 @@
+#include "../Eigen/Core"
+
+using namespace Eigen;
+
+int main(int argc,char **)
+{
+  VectorXf a(10), b(10);
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+  b = argc>1 ? 2*a : -a;
+#else
+  b = argc>1 ? 2*a : VectorXf(-a);
+#endif
+}
diff --git a/failtest/ternary_2.cpp b/failtest/ternary_2.cpp
new file mode 100644
index 000000000..a46b12b2b
--- /dev/null
+++ b/failtest/ternary_2.cpp
@@ -0,0 +1,13 @@
+#include "../Eigen/Core"
+
+using namespace Eigen;
+
+int main(int argc,char **)
+{
+  VectorXf a(10), b(10);
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+  b = argc>1 ? 2*a : a+a;
+#else
+  b = argc>1 ? VectorXf(2*a) : VectorXf(a+a);
+#endif
+}
diff --git a/failtest/triangularview_nonconst_ctor_on_const_xpr.cpp b/failtest/triangularview_nonconst_ctor_on_const_xpr.cpp
new file mode 100644
index 000000000..807447e4b
--- /dev/null
+++ b/failtest/triangularview_nonconst_ctor_on_const_xpr.cpp
@@ -0,0 +1,15 @@
+#include "../Eigen/Core"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define CV_QUALIFIER const
+#else
+#define CV_QUALIFIER
+#endif
+
+using namespace Eigen;
+
+void foo(CV_QUALIFIER Matrix3d &m){
+  TriangularView<Matrix3d,Upper> t(m);
+}
+
+int main() {}
diff --git a/failtest/triangularview_on_const_type_actually_const.cpp b/failtest/triangularview_on_const_type_actually_const.cpp
new file mode 100644
index 000000000..0a381a612
--- /dev/null
+++ b/failtest/triangularview_on_const_type_actually_const.cpp
@@ -0,0 +1,16 @@
+#include "../Eigen/Core"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define CV_QUALIFIER const
+#else
+#define CV_QUALIFIER
+#endif
+
+using namespace Eigen;
+
+void foo(){
+    MatrixXf m;
+    TriangularView<CV_QUALIFIER MatrixXf,Upper>(m).coeffRef(0, 0) = 1.0f;
+}
+
+int main() {}
diff --git a/lapack/complex_double.cpp b/lapack/complex_double.cpp
index 424d2b8ca..c9c575273 100644
--- a/lapack/complex_double.cpp
+++ b/lapack/complex_double.cpp
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2009-2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2009-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -15,3 +15,4 @@
 
 #include "cholesky.cpp"
 #include "lu.cpp"
+#include "svd.cpp"
diff --git a/lapack/complex_single.cpp b/lapack/complex_single.cpp
index c0b2d29ae..6d11b26cd 100644
--- a/lapack/complex_single.cpp
+++ b/lapack/complex_single.cpp
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2009-2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2009-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -15,3 +15,4 @@
 
 #include "cholesky.cpp"
 #include "lu.cpp"
+#include "svd.cpp"
diff --git a/lapack/double.cpp b/lapack/double.cpp
index d86549e19..ea78bb662 100644
--- a/lapack/double.cpp
+++ b/lapack/double.cpp
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2009-2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2009-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -15,3 +15,4 @@
 #include "cholesky.cpp"
 #include "lu.cpp"
 #include "eigenvalues.cpp"
+#include "svd.cpp"
diff --git a/lapack/eigenvalues.cpp b/lapack/eigenvalues.cpp
index a1526ebcd..921c51569 100644
--- a/lapack/eigenvalues.cpp
+++ b/lapack/eigenvalues.cpp
@@ -7,10 +7,10 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-#include "common.h"
+#include "lapack_common.h"
 #include <Eigen/Eigenvalues>
 
-// computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges
+// computes eigen values and vectors of a general N-by-N matrix A
 EIGEN_LAPACK_FUNC(syev,(char *jobz, char *uplo, int* n, Scalar* a, int *lda, Scalar* w, Scalar* /*work*/, int* lwork, int *info))
 {
   // TODO exploit the work buffer
@@ -22,24 +22,7 @@ EIGEN_LAPACK_FUNC(syev,(char *jobz, char *uplo, int* n, Scalar* a, int *lda, Sca
   else  if(*n<0)                                        *info = -3;
   else  if(*lda<std::max(1,*n))                         *info = -5;
   else  if((!query_size) && *lwork<std::max(1,3**n-1))  *info = -8;
-  
-//   if(*info==0)
-//   {
-//     int nb = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 )
-//          LWKOPT = MAX( 1, ( NB+2 )*N )
-//          WORK( 1 ) = LWKOPT
-// *
-//          IF( LWORK.LT.MAX( 1, 3*N-1 ) .AND. .NOT.LQUERY )
-//      $      INFO = -8
-//       END IF
-// *
-//       IF( INFO.NE.0 ) THEN
-//          CALL XERBLA( 'SSYEV ', -INFO )
-//          RETURN
-//       ELSE IF( LQUERY ) THEN
-//          RETURN
-//       END IF
-  
+    
   if(*info!=0)
   {
     int e = -*info;
@@ -64,14 +47,14 @@ EIGEN_LAPACK_FUNC(syev,(char *jobz, char *uplo, int* n, Scalar* a, int *lda, Sca
   
   if(eig.info()==NoConvergence)
   {
-    vector(w,*n).setZero();
+    make_vector(w,*n).setZero();
     if(computeVectors)
       matrix(a,*n,*n,*lda).setIdentity();
     //*info = 1;
     return 0;
   }
   
-  vector(w,*n) = eig.eigenvalues();
+  make_vector(w,*n) = eig.eigenvalues();
   if(computeVectors)
     matrix(a,*n,*n,*lda) = eig.eigenvectors();
   
diff --git a/lapack/lapack_common.h b/lapack/lapack_common.h
index e558c1409..c872a813e 100644
--- a/lapack/lapack_common.h
+++ b/lapack/lapack_common.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2010-2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2010-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -11,6 +11,7 @@
 #define EIGEN_LAPACK_COMMON_H
 
 #include "../blas/common.h"
+#include "../Eigen/src/misc/lapack.h"
 
 #define EIGEN_LAPACK_FUNC(FUNC,ARGLIST)               \
   extern "C" { int EIGEN_BLAS_FUNC(FUNC) ARGLIST; }   \
@@ -18,6 +19,11 @@
 
 typedef Eigen::Map<Eigen::Transpositions<Eigen::Dynamic,Eigen::Dynamic,int> > PivotsType;
 
+#if ISCOMPLEX
+#define EIGEN_LAPACK_ARG_IF_COMPLEX(X) X,
+#else
+#define EIGEN_LAPACK_ARG_IF_COMPLEX(X)
+#endif
 
 
 #endif // EIGEN_LAPACK_COMMON_H
diff --git a/lapack/single.cpp b/lapack/single.cpp
index a64ed44e1..c7da3effa 100644
--- a/lapack/single.cpp
+++ b/lapack/single.cpp
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2009-2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2009-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -15,3 +15,4 @@
 #include "cholesky.cpp"
 #include "lu.cpp"
 #include "eigenvalues.cpp"
+#include "svd.cpp"
diff --git a/lapack/svd.cpp b/lapack/svd.cpp
new file mode 100644
index 000000000..77b302b6b
--- /dev/null
+++ b/lapack/svd.cpp
@@ -0,0 +1,138 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "lapack_common.h"
+#include <Eigen/SVD>
+
+// computes the singular values/vectors a general M-by-N matrix A using divide-and-conquer
+EIGEN_LAPACK_FUNC(gesdd,(char *jobz, int *m, int* n, Scalar* a, int *lda, RealScalar *s, Scalar *u, int *ldu, Scalar *vt, int *ldvt, Scalar* /*work*/, int* lwork,
+                         EIGEN_LAPACK_ARG_IF_COMPLEX(RealScalar */*rwork*/) int * /*iwork*/, int *info))
+{
+  // TODO exploit the work buffer
+  bool query_size = *lwork==-1;
+  int diag_size = (std::min)(*m,*n);
+  
+  *info = 0;
+        if(*jobz!='A' && *jobz!='S' && *jobz!='O' && *jobz!='N')  *info = -1;
+  else  if(*m<0)                                                  *info = -2;
+  else  if(*n<0)                                                  *info = -3;
+  else  if(*lda<std::max(1,*m))                                   *info = -5;
+  else  if(*lda<std::max(1,*m))                                   *info = -8;
+  else  if(*ldu <1 || (*jobz=='A' && *ldu <*m)
+                   || (*jobz=='O' && *m<*n && *ldu<*m))           *info = -8;
+  else  if(*ldvt<1 || (*jobz=='A' && *ldvt<*n)
+                   || (*jobz=='S' && *ldvt<diag_size)
+                   || (*jobz=='O' && *m>=*n && *ldvt<*n))         *info = -10;
+  
+  if(*info!=0)
+  {
+    int e = -*info;
+    return xerbla_(SCALAR_SUFFIX_UP"GESDD ", &e, 6);
+  }
+  
+  if(query_size)
+  {
+    *lwork = 0;
+    return 0;
+  }
+  
+  if(*n==0 || *m==0)
+    return 0;
+  
+  PlainMatrixType mat(*m,*n);
+  mat = matrix(a,*m,*n,*lda);
+  
+  int option = *jobz=='A' ? ComputeFullU|ComputeFullV
+             : *jobz=='S' ? ComputeThinU|ComputeThinV
+             : *jobz=='O' ? ComputeThinU|ComputeThinV
+             : 0;
+
+  BDCSVD<PlainMatrixType> svd(mat,option);
+  
+  make_vector(s,diag_size) = svd.singularValues().head(diag_size);
+
+  if(*jobz=='A')
+  {
+    matrix(u,*m,*m,*ldu)   = svd.matrixU();
+    matrix(vt,*n,*n,*ldvt) = svd.matrixV().adjoint();
+  }
+  else if(*jobz=='S')
+  {
+    matrix(u,*m,diag_size,*ldu)   = svd.matrixU();
+    matrix(vt,diag_size,*n,*ldvt) = svd.matrixV().adjoint();
+  }
+  else if(*jobz=='O' && *m>=*n)
+  {
+    matrix(a,*m,*n,*lda)   = svd.matrixU();
+    matrix(vt,*n,*n,*ldvt) = svd.matrixV().adjoint();
+  }
+  else if(*jobz=='O')
+  {
+    matrix(u,*m,*m,*ldu)        = svd.matrixU();
+    matrix(a,diag_size,*n,*lda) = svd.matrixV().adjoint();
+  }
+    
+  return 0;
+}
+
+// computes the singular values/vectors a general M-by-N matrix A using two sided jacobi algorithm
+EIGEN_LAPACK_FUNC(gesvd,(char *jobu, char *jobv, int *m, int* n, Scalar* a, int *lda, RealScalar *s, Scalar *u, int *ldu, Scalar *vt, int *ldvt, Scalar* /*work*/, int* lwork,
+                         EIGEN_LAPACK_ARG_IF_COMPLEX(RealScalar */*rwork*/) int *info))
+{
+  // TODO exploit the work buffer
+  bool query_size = *lwork==-1;
+  int diag_size = (std::min)(*m,*n);
+  
+  *info = 0;
+        if( *jobu!='A' && *jobu!='S' && *jobu!='O' && *jobu!='N') *info = -1;
+  else  if((*jobv!='A' && *jobv!='S' && *jobv!='O' && *jobv!='N')
+           || (*jobu=='O' && *jobv=='O'))                         *info = -2;
+  else  if(*m<0)                                                  *info = -3;
+  else  if(*n<0)                                                  *info = -4;
+  else  if(*lda<std::max(1,*m))                                   *info = -6;
+  else  if(*ldu <1 || ((*jobu=='A' || *jobu=='S') && *ldu<*m))    *info = -9;
+  else  if(*ldvt<1 || (*jobv=='A' && *ldvt<*n)
+                   || (*jobv=='S' && *ldvt<diag_size))            *info = -11;
+  
+  if(*info!=0)
+  {
+    int e = -*info;
+    return xerbla_(SCALAR_SUFFIX_UP"GESVD ", &e, 6);
+  }
+  
+  if(query_size)
+  {
+    *lwork = 0;
+    return 0;
+  }
+  
+  if(*n==0 || *m==0)
+    return 0;
+  
+  PlainMatrixType mat(*m,*n);
+  mat = matrix(a,*m,*n,*lda);
+  
+  int option = (*jobu=='A' ? ComputeFullU : *jobu=='S' || *jobu=='O' ? ComputeThinU : 0)
+             | (*jobv=='A' ? ComputeFullV : *jobv=='S' || *jobv=='O' ? ComputeThinV : 0);
+  
+  JacobiSVD<PlainMatrixType> svd(mat,option);
+  
+  make_vector(s,diag_size) = svd.singularValues().head(diag_size);
+  {
+        if(*jobu=='A') matrix(u,*m,*m,*ldu)           = svd.matrixU();
+  else  if(*jobu=='S') matrix(u,*m,diag_size,*ldu)    = svd.matrixU();
+  else  if(*jobu=='O') matrix(a,*m,diag_size,*lda)    = svd.matrixU();
+  }
+  {
+        if(*jobv=='A') matrix(vt,*n,*n,*ldvt)         = svd.matrixV().adjoint();
+  else  if(*jobv=='S') matrix(vt,diag_size,*n,*ldvt)  = svd.matrixV().adjoint();
+  else  if(*jobv=='O') matrix(a,diag_size,*n,*lda)    = svd.matrixV().adjoint();
+  }
+  return 0;
+}
diff --git a/scripts/buildtests.in b/scripts/buildtests.in
index 7026373cf..526d5b74b 100755
--- a/scripts/buildtests.in
+++ b/scripts/buildtests.in
@@ -2,7 +2,7 @@
 
 if [[ $# != 1 || $1 == *help ]]
 then
-  echo "usage: ./check regexp"
+  echo "usage: $0 regexp"
   echo "  Builds tests matching the regexp."
   echo "  The EIGEN_MAKE_ARGS environment variable allows to pass args to 'make'."
   echo "    For example, to launch 5 concurrent builds, use EIGEN_MAKE_ARGS='-j5'"
@@ -14,9 +14,9 @@ targets_to_make=`echo "$TESTSLIST" | egrep "$1" | xargs echo`
 
 if [ -n "${EIGEN_MAKE_ARGS:+x}" ]
 then
-  make $targets_to_make ${EIGEN_MAKE_ARGS}
+  @CMAKE_MAKE_PROGRAM@ $targets_to_make ${EIGEN_MAKE_ARGS}
 else
-  make $targets_to_make
+  @CMAKE_MAKE_PROGRAM@ $targets_to_make @EIGEN_TEST_BUILD_FLAGS@
 fi
 exit $?
 
diff --git a/scripts/check.in b/scripts/check.in
index a90061a57..7717e2d93 100755
--- a/scripts/check.in
+++ b/scripts/check.in
@@ -3,7 +3,7 @@
 
 if [[ $# != 1 || $1 == *help ]]
 then
-  echo "usage: ./check regexp"
+  echo "usage: $0 regexp"
   echo "  Builds and runs tests matching the regexp."
   echo "  The EIGEN_MAKE_ARGS environment variable allows to pass args to 'make'."
   echo "    For example, to launch 5 concurrent builds, use EIGEN_MAKE_ARGS='-j5'"
diff --git a/scripts/eigen_gen_docs b/scripts/eigen_gen_docs
index 0e6f9ada2..787dcb325 100644
--- a/scripts/eigen_gen_docs
+++ b/scripts/eigen_gen_docs
@@ -4,7 +4,7 @@
 # You should call this script with USER set as you want, else some default
 # will be used
 USER=${USER:-'orzel'}
-UPLOAD_DIR=dox
+UPLOAD_DIR=dox-devel
 
 #ulimit -v 1024000
 
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index f5f208a37..2141d07c2 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -1,22 +1,38 @@
-
-# generate split test header file
-message(STATUS ${CMAKE_CURRENT_BINARY_DIR})
-file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h "")
-foreach(i RANGE 1 999)
-  file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h
-    "#ifdef EIGEN_TEST_PART_${i}\n"
-    "#define CALL_SUBTEST_${i}(FUNC) CALL_SUBTEST(FUNC)\n"
-    "#else\n"
-    "#define CALL_SUBTEST_${i}(FUNC)\n"
-    "#endif\n\n"
+# generate split test header file only if it does not yet exist
+# in order to prevent a rebuild everytime cmake is configured
+if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h)  
+  file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h "")
+  foreach(i RANGE 1 999)
+    file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h
+      "#ifdef EIGEN_TEST_PART_${i}\n"
+      "#define CALL_SUBTEST_${i}(FUNC) CALL_SUBTEST(FUNC)\n"
+      "#else\n"
+      "#define CALL_SUBTEST_${i}(FUNC)\n"
+      "#endif\n\n"
     )
-endforeach()
+  endforeach()
+endif()
+
+# check if we have a Fortran compiler
+include("../cmake/language_support.cmake")
+
+workaround_9220(Fortran EIGEN_Fortran_COMPILER_WORKS)
+
+if(EIGEN_Fortran_COMPILER_WORKS)
+  enable_language(Fortran OPTIONAL)
+  if(NOT CMAKE_Fortran_COMPILER)
+    set(EIGEN_Fortran_COMPILER_WORKS OFF)
+  endif()
+endif()
+
+if(NOT EIGEN_Fortran_COMPILER_WORKS)
+  # search for a default Lapack library to complete Eigen's one
+  find_package(LAPACK)
+endif()
 
 # configure blas/lapack (use Eigen's ones)
-set(BLAS_FOUND TRUE)
-set(LAPACK_FOUND TRUE)
-set(BLAS_LIBRARIES eigen_blas)
-set(LAPACK_LIBRARIES eigen_lapack)
+set(EIGEN_BLAS_LIBRARIES eigen_blas)
+set(EIGEN_LAPACK_LIBRARIES eigen_lapack)
 
 set(EIGEN_TEST_MATRIX_DIR "" CACHE STRING "Enable testing of realword sparse matrices contained in the specified path")
 if(EIGEN_TEST_MATRIX_DIR)
@@ -31,33 +47,33 @@ endif(EIGEN_TEST_MATRIX_DIR)
 set(SPARSE_LIBS " ")
 
 find_package(Cholmod)
-if(CHOLMOD_FOUND AND BLAS_FOUND AND LAPACK_FOUND)
+if(CHOLMOD_FOUND)
   add_definitions("-DEIGEN_CHOLMOD_SUPPORT")
   include_directories(${CHOLMOD_INCLUDES})
-  set(SPARSE_LIBS ${SPARSE_LIBS} ${CHOLMOD_LIBRARIES} ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES})
-  set(CHOLMOD_ALL_LIBS  ${CHOLMOD_LIBRARIES} ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES})
+  set(SPARSE_LIBS ${SPARSE_LIBS} ${CHOLMOD_LIBRARIES} ${EIGEN_BLAS_LIBRARIES} ${EIGEN_LAPACK_LIBRARIES})
+  set(CHOLMOD_ALL_LIBS  ${CHOLMOD_LIBRARIES} ${EIGEN_BLAS_LIBRARIES} ${EIGEN_LAPACK_LIBRARIES})
   ei_add_property(EIGEN_TESTED_BACKENDS "Cholmod, ")
 else()
   ei_add_property(EIGEN_MISSING_BACKENDS "Cholmod, ")
 endif()
 
 find_package(Umfpack)
-if(UMFPACK_FOUND AND BLAS_FOUND)
+if(UMFPACK_FOUND)
   add_definitions("-DEIGEN_UMFPACK_SUPPORT")
   include_directories(${UMFPACK_INCLUDES})
-  set(SPARSE_LIBS ${SPARSE_LIBS} ${UMFPACK_LIBRARIES} ${BLAS_LIBRARIES})
-  set(UMFPACK_ALL_LIBS ${UMFPACK_LIBRARIES} ${BLAS_LIBRARIES})
+  set(SPARSE_LIBS ${SPARSE_LIBS} ${UMFPACK_LIBRARIES} ${EIGEN_BLAS_LIBRARIES})
+  set(UMFPACK_ALL_LIBS ${UMFPACK_LIBRARIES} ${EIGEN_BLAS_LIBRARIES})
   ei_add_property(EIGEN_TESTED_BACKENDS "UmfPack, ")
 else()
   ei_add_property(EIGEN_MISSING_BACKENDS "UmfPack, ")
 endif()
 
-find_package(SuperLU)
-if(SUPERLU_FOUND AND BLAS_FOUND)
+find_package(SuperLU 4.0)
+if(SUPERLU_FOUND)
   add_definitions("-DEIGEN_SUPERLU_SUPPORT")
   include_directories(${SUPERLU_INCLUDES})
-  set(SPARSE_LIBS ${SPARSE_LIBS} ${SUPERLU_LIBRARIES} ${BLAS_LIBRARIES})
-  set(SUPERLU_ALL_LIBS ${SUPERLU_LIBRARIES} ${BLAS_LIBRARIES})
+  set(SPARSE_LIBS ${SPARSE_LIBS} ${SUPERLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES})
+  set(SUPERLU_ALL_LIBS ${SUPERLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES})
   ei_add_property(EIGEN_TESTED_BACKENDS  "SuperLU, ")
 else()
   ei_add_property(EIGEN_MISSING_BACKENDS  "SuperLU, ")
@@ -67,7 +83,7 @@ endif()
 find_package(Pastix)
 find_package(Scotch)
 find_package(Metis 5.0 REQUIRED)
-if(PASTIX_FOUND AND BLAS_FOUND)
+if(PASTIX_FOUND)
   add_definitions("-DEIGEN_PASTIX_SUPPORT")
   include_directories(${PASTIX_INCLUDES})
   if(SCOTCH_FOUND)
@@ -79,8 +95,8 @@ if(PASTIX_FOUND AND BLAS_FOUND)
   else(SCOTCH_FOUND)
     ei_add_property(EIGEN_MISSING_BACKENDS  "PaStiX, ")
   endif(SCOTCH_FOUND)
-  set(SPARSE_LIBS ${SPARSE_LIBS} ${PASTIX_LIBRARIES} ${ORDERING_LIBRARIES} ${BLAS_LIBRARIES})
-  set(PASTIX_ALL_LIBS ${PASTIX_LIBRARIES} ${BLAS_LIBRARIES})
+  set(SPARSE_LIBS ${SPARSE_LIBS} ${PASTIX_LIBRARIES} ${ORDERING_LIBRARIES} ${EIGEN_BLAS_LIBRARIES})
+  set(PASTIX_ALL_LIBS ${PASTIX_LIBRARIES} ${EIGEN_BLAS_LIBRARIES})
   ei_add_property(EIGEN_TESTED_BACKENDS  "PaStiX, ")
 else()
   ei_add_property(EIGEN_MISSING_BACKENDS  "PaStiX, ")
@@ -95,16 +111,14 @@ else()
 endif()
 
 find_package(SPQR)
-if(SPQR_FOUND AND BLAS_FOUND AND LAPACK_FOUND)
-  if(CHOLMOD_FOUND)
-    add_definitions("-DEIGEN_SPQR_SUPPORT")
-    include_directories(${SPQR_INCLUDES})
-    set(SPQR_ALL_LIBS ${SPQR_LIBRARIES} ${CHOLMOD_LIBRARIES} ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES})
-    set(SPARSE_LIBS ${SPARSE_LIBS} ${SPQR_ALL_LIBS})
-    ei_add_property(EIGEN_TESTED_BACKENDS "SPQR, ")
-  else(CHOLMOD_FOUND)
-    ei_add_property(EIGEN_MISSING_BACKENDS "SPQR, ")
-  endif(CHOLMOD_FOUND)
+if(SPQR_FOUND AND CHOLMOD_FOUND AND (EIGEN_Fortran_COMPILER_WORKS OR LAPACK_FOUND) )
+  add_definitions("-DEIGEN_SPQR_SUPPORT")
+  include_directories(${SPQR_INCLUDES})
+  set(SPQR_ALL_LIBS ${SPQR_LIBRARIES} ${CHOLMOD_LIBRARIES} ${EIGEN_LAPACK_LIBRARIES} ${EIGEN_BLAS_LIBRARIES} ${LAPACK_LIBRARIES})
+  set(SPARSE_LIBS ${SPARSE_LIBS} ${SPQR_ALL_LIBS})
+  ei_add_property(EIGEN_TESTED_BACKENDS "SPQR, ")
+else()
+  ei_add_property(EIGEN_MISSING_BACKENDS "SPQR, ")
 endif()
 
 option(EIGEN_TEST_NOQT "Disable Qt support in unit tests" OFF)
@@ -125,25 +139,32 @@ endif(TEST_LIB)
 set_property(GLOBAL PROPERTY EIGEN_CURRENT_SUBPROJECT "Official")
 add_custom_target(BuildOfficial)
 
+ei_add_test(rand)
 ei_add_test(meta)
 ei_add_test(sizeof)
 ei_add_test(dynalloc)
 ei_add_test(nomalloc)
 ei_add_test(first_aligned)
+ei_add_test(nullary)
 ei_add_test(mixingtypes)
-ei_add_test(packetmath)
+ei_add_test(packetmath "-DEIGEN_FAST_MATH=1")
 ei_add_test(unalignedassert)
 ei_add_test(vectorization_logic)
 ei_add_test(basicstuff)
+ei_add_test(constructor)
 ei_add_test(linearstructure)
 ei_add_test(integer_types)
-ei_add_test(cwiseop)
 ei_add_test(unalignedcount)
-ei_add_test(exceptions)
+if(NOT EIGEN_TEST_NO_EXCEPTIONS)
+  ei_add_test(exceptions)
+endif()
 ei_add_test(redux)
 ei_add_test(visitor)
 ei_add_test(block)
 ei_add_test(corners)
+ei_add_test(swap)
+ei_add_test(resize)
+ei_add_test(conservative_resize)
 ei_add_test(product_small)
 ei_add_test(product_large)
 ei_add_test(product_extra)
@@ -161,6 +182,7 @@ ei_add_test(array_for_matrix)
 ei_add_test(array_replicate)
 ei_add_test(array_reverse)
 ei_add_test(ref)
+ei_add_test(is_same_dense)
 ei_add_test(triangular)
 ei_add_test(selfadjoint)
 ei_add_test(product_selfadjoint)
@@ -172,6 +194,7 @@ ei_add_test(product_trsolve)
 ei_add_test(product_mmtr)
 ei_add_test(product_notemporary)
 ei_add_test(stable_norm)
+ei_add_test(permutationmatrices)
 ei_add_test(bandmatrix)
 ei_add_test(cholesky)
 ei_add_test(lu)
@@ -191,52 +214,75 @@ ei_add_test(real_qz)
 ei_add_test(eigensolver_generalized_real)
 ei_add_test(jacobi)
 ei_add_test(jacobisvd)
+ei_add_test(bdcsvd)
+ei_add_test(householder)
 ei_add_test(geo_orthomethods)
-ei_add_test(geo_homogeneous)
 ei_add_test(geo_quaternion)
-ei_add_test(geo_transformations)
 ei_add_test(geo_eulerangles)
-ei_add_test(geo_hyperplane)
 ei_add_test(geo_parametrizedline)
 ei_add_test(geo_alignedbox)
+ei_add_test(geo_hyperplane)
+ei_add_test(geo_transformations)
+ei_add_test(geo_homogeneous)
 ei_add_test(stdvector)
 ei_add_test(stdvector_overload)
 ei_add_test(stdlist)
+ei_add_test(stdlist_overload)
 ei_add_test(stddeque)
-ei_add_test(resize)
-ei_add_test(sparse_vector)
+ei_add_test(stddeque_overload)
 ei_add_test(sparse_basic)
+ei_add_test(sparse_block)
+ei_add_test(sparse_vector)
 ei_add_test(sparse_product)
+ei_add_test(sparse_ref)
 ei_add_test(sparse_solvers)
-ei_add_test(umeyama)
-ei_add_test(householder)
-ei_add_test(swap)
-ei_add_test(conservative_resize)
-ei_add_test(permutationmatrices)
 ei_add_test(sparse_permutations)
-ei_add_test(nullary)
+ei_add_test(simplicial_cholesky)
+ei_add_test(conjugate_gradient)
+ei_add_test(incomplete_cholesky)
+ei_add_test(bicgstab)
+ei_add_test(lscg)
+ei_add_test(sparselu)
+ei_add_test(sparseqr)
+ei_add_test(umeyama)
 ei_add_test(nesting_ops "${CMAKE_CXX_FLAGS_DEBUG}")
 ei_add_test(zerosized)
 ei_add_test(dontalign)
-ei_add_test(sizeoverflow)
+ei_add_test(evaluators)
+if(NOT EIGEN_TEST_NO_EXCEPTIONS)
+  ei_add_test(sizeoverflow)
+endif()
 ei_add_test(prec_inverse_4x4)
 ei_add_test(vectorwiseop)
 ei_add_test(special_numbers)
+ei_add_test(rvalue_types)
+ei_add_test(dense_storage)
+ei_add_test(ctorleak)
+ei_add_test(mpl2only)
+ei_add_test(inplace_decomposition)
+ei_add_test(half_float)
+ei_add_test(array_of_string)
 
-ei_add_test(simplicial_cholesky)
-ei_add_test(conjugate_gradient)
-ei_add_test(bicgstab)
-ei_add_test(sparselu)
-ei_add_test(sparseqr)
+add_executable(bug1213 bug1213.cpp bug1213_main.cpp)
+
+check_cxx_compiler_flag("-ffast-math" COMPILER_SUPPORT_FASTMATH)
+if(COMPILER_SUPPORT_FASTMATH)
+  set(EIGEN_FASTMATH_FLAGS "-ffast-math")
+else()
+  check_cxx_compiler_flag("/fp:fast" COMPILER_SUPPORT_FPFAST)
+  if(COMPILER_SUPPORT_FPFAST)
+    set(EIGEN_FASTMATH_FLAGS "/fp:fast")
+  endif()
+endif()
 
-# ei_add_test(denseLM)
+ei_add_test(fastmath " ${EIGEN_FASTMATH_FLAGS} ")
+
+# # ei_add_test(denseLM)
 
 if(QT4_FOUND)
   ei_add_test(qtvector "" "${QT_QTCORE_LIBRARY}")
 endif(QT4_FOUND)
 
-ei_add_test(eigen2support)
-
 if(UMFPACK_FOUND)
   ei_add_test(umfpack_support "" "${UMFPACK_ALL_LIBS}")
 endif()
@@ -281,9 +327,54 @@ ei_add_property(EIGEN_TESTING_SUMMARY "Sparse lib flags:  ${SPARSE_LIBS}\n")
 option(EIGEN_TEST_EIGEN2 "Run whole Eigen2 test suite against EIGEN2_SUPPORT" OFF)
 mark_as_advanced(EIGEN_TEST_EIGEN2)
 if(EIGEN_TEST_EIGEN2)
-  add_subdirectory(eigen2)
+  message(WARNING "The Eigen2 test suite has been removed")
+endif()
+
+# boost MP unit test
+find_package(Boost)
+if(Boost_FOUND)
+  include_directories(${Boost_INCLUDE_DIRS})
+  ei_add_test(boostmultiprec "" "${Boost_LIBRARIES}")
+  ei_add_property(EIGEN_TESTED_BACKENDS "Boost.Multiprecision, ")
+else()
+  ei_add_property(EIGEN_MISSING_BACKENDS "Boost.Multiprecision, ")
+endif()
+
+
+# CUDA unit tests
+option(EIGEN_TEST_CUDA "Enable CUDA support in unit tests" OFF)
+option(EIGEN_TEST_CUDA_CLANG "Use clang instead of nvcc to compile the CUDA tests" OFF)
+
+if(EIGEN_TEST_CUDA_CLANG AND NOT CMAKE_CXX_COMPILER MATCHES "clang")
+  message(WARNING "EIGEN_TEST_CUDA_CLANG is set, but CMAKE_CXX_COMPILER does not appear to be clang.")
 endif()
 
+if(EIGEN_TEST_CUDA)
+
+find_package(CUDA 5.0)
+if(CUDA_FOUND)
+  
+  set(CUDA_PROPAGATE_HOST_FLAGS OFF)
+  if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") 
+    set(CUDA_NVCC_FLAGS "-ccbin ${CMAKE_C_COMPILER}" CACHE STRING "nvcc flags" FORCE)
+  endif()
+  if(EIGEN_TEST_CUDA_CLANG)
+   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 --cuda-gpu-arch=sm_30")
+  endif()
+  cuda_include_directories(${CMAKE_CURRENT_BINARY_DIR})
+  set(EIGEN_ADD_TEST_FILENAME_EXTENSION  "cu")
+  
+  ei_add_test(cuda_basic)
+  
+  unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
+
+endif(CUDA_FOUND)
+
+endif(EIGEN_TEST_CUDA)
+
+
+file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/failtests)    
+add_test(NAME failtests WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/failtests COMMAND ${CMAKE_COMMAND} ${Eigen_SOURCE_DIR} -G "${CMAKE_GENERATOR}" -DEIGEN_FAILTEST=ON)
 
 option(EIGEN_TEST_BUILD_DOCUMENTATION "Test building the doxygen documentation" OFF)
 IF(EIGEN_TEST_BUILD_DOCUMENTATION)
diff --git a/test/adjoint.cpp b/test/adjoint.cpp
index ea36f7841..bdea51c10 100644
--- a/test/adjoint.cpp
+++ b/test/adjoint.cpp
@@ -42,6 +42,17 @@ template<> struct adjoint_specific<false> {
     VERIFY_IS_APPROX(v1, v1.norm() * v3);
     VERIFY_IS_APPROX(v3, v1.normalized());
     VERIFY_IS_APPROX(v3.norm(), RealScalar(1));
+
+    // check null inputs
+    VERIFY_IS_APPROX((v1*0).normalized(), (v1*0));
+#if (!EIGEN_ARCH_i386) || defined(EIGEN_VECTORIZE)
+    RealScalar very_small = (std::numeric_limits<RealScalar>::min)();
+    VERIFY( (v1*very_small).norm() == 0 );
+    VERIFY_IS_APPROX((v1*very_small).normalized(), (v1*very_small));
+    v3 = v1*very_small;
+    v3.normalize();
+    VERIFY_IS_APPROX(v3, (v1*very_small));
+#endif
     
     // check compatibility of dot and adjoint
     ref = NumTraits<Scalar>::IsInteger ? 0 : (std::max)((std::max)(v1.norm(),v2.norm()),(std::max)((square * v2).norm(),(square.adjoint() * v1).norm()));
@@ -64,6 +75,7 @@ template<typename MatrixType> void adjoint(const MatrixType& m)
   typedef typename NumTraits<Scalar>::Real RealScalar;
   typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
   typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::RowsAtCompileTime> SquareMatrixType;
+  const Index PacketSize = internal::packet_traits<Scalar>::size;
   
   Index rows = m.rows();
   Index cols = m.cols();
@@ -108,6 +120,17 @@ template<typename MatrixType> void adjoint(const MatrixType& m)
   VERIFY_IS_APPROX(m3,m1.transpose());
   m3.transposeInPlace();
   VERIFY_IS_APPROX(m3,m1);
+  
+  if(PacketSize<m3.rows() && PacketSize<m3.cols())
+  {
+    m3 = m1;
+    Index i = internal::random<Index>(0,m3.rows()-PacketSize);
+    Index j = internal::random<Index>(0,m3.cols()-PacketSize);
+    m3.template block<PacketSize,PacketSize>(i,j).transposeInPlace();
+    VERIFY_IS_APPROX( (m3.template block<PacketSize,PacketSize>(i,j)), (m1.template block<PacketSize,PacketSize>(i,j).transpose()) );
+    m3.template block<PacketSize,PacketSize>(i,j).transposeInPlace();
+    VERIFY_IS_APPROX(m3,m1);
+  }
 
   // check inplace adjoint
   m3 = m1;
@@ -129,14 +152,24 @@ void test_adjoint()
     CALL_SUBTEST_1( adjoint(Matrix<float, 1, 1>()) );
     CALL_SUBTEST_2( adjoint(Matrix3d()) );
     CALL_SUBTEST_3( adjoint(Matrix4f()) );
+    
     CALL_SUBTEST_4( adjoint(MatrixXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2), internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2))) );
     CALL_SUBTEST_5( adjoint(MatrixXi(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
     CALL_SUBTEST_6( adjoint(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    
+    // Complement for 128 bits vectorization:
+    CALL_SUBTEST_8( adjoint(Matrix2d()) );
+    CALL_SUBTEST_9( adjoint(Matrix<int,4,4>()) );
+    
+    // 256 bits vectorization:
+    CALL_SUBTEST_10( adjoint(Matrix<float,8,8>()) );
+    CALL_SUBTEST_11( adjoint(Matrix<double,4,4>()) );
+    CALL_SUBTEST_12( adjoint(Matrix<int,8,8>()) );
   }
   // test a large static matrix only once
   CALL_SUBTEST_7( adjoint(Matrix<float, 100, 100>()) );
 
-#ifdef EIGEN_TEST_PART_4
+#ifdef EIGEN_TEST_PART_13
   {
     MatrixXcf a(10,10), b(10,10);
     VERIFY_RAISES_ASSERT(a = a.transpose());
@@ -154,6 +187,13 @@ void test_adjoint()
     a.transpose() = a.adjoint();
     a.transpose() += a.adjoint();
     a.transpose() += a.adjoint() + b;
+
+    // regression tests for check_for_aliasing
+    MatrixXd c(10,10);
+    c = 1.0 * MatrixXd::Ones(10,10) + c;
+    c = MatrixXd::Ones(10,10) * 1.0 + c;
+    c = c + MatrixXd::Ones(10,10) .cwiseProduct( MatrixXd::Zero(10,10) );
+    c = MatrixXd::Ones(10,10) * MatrixXd::Zero(10,10);
   }
 #endif
 }
diff --git a/test/array.cpp b/test/array.cpp
index 68f6b3d7a..15c3266a9 100644
--- a/test/array.cpp
+++ b/test/array.cpp
@@ -13,6 +13,7 @@ template<typename ArrayType> void array(const ArrayType& m)
 {
   typedef typename ArrayType::Index Index;
   typedef typename ArrayType::Scalar Scalar;
+  typedef typename ArrayType::RealScalar RealScalar;
   typedef Array<Scalar, ArrayType::RowsAtCompileTime, 1> ColVectorType;
   typedef Array<Scalar, 1, ArrayType::ColsAtCompileTime> RowVectorType;
 
@@ -22,6 +23,8 @@ template<typename ArrayType> void array(const ArrayType& m)
   ArrayType m1 = ArrayType::Random(rows, cols),
              m2 = ArrayType::Random(rows, cols),
              m3(rows, cols);
+  ArrayType m4 = m1; // copy constructor
+  VERIFY_IS_APPROX(m1, m4);
 
   ColVectorType cv1 = ColVectorType::Random(rows);
   RowVectorType rv1 = RowVectorType::Random(cols);
@@ -70,7 +73,7 @@ template<typename ArrayType> void array(const ArrayType& m)
   VERIFY_IS_MUCH_SMALLER_THAN(abs(m1.rowwise().sum().sum() - m1.sum()), m1.abs().sum());
   if (!internal::isMuchSmallerThan(abs(m1.sum() - (m1+m2).sum()), m1.abs().sum(), test_precision<Scalar>()))
       VERIFY_IS_NOT_APPROX(((m1+m2).rowwise().sum()).sum(), m1.sum());
-  VERIFY_IS_APPROX(m1.colwise().sum(), m1.colwise().redux(internal::scalar_sum_op<Scalar>()));
+  VERIFY_IS_APPROX(m1.colwise().sum(), m1.colwise().redux(internal::scalar_sum_op<Scalar,Scalar>()));
 
   // vector-wise ops
   m3 = m1;
@@ -81,6 +84,47 @@ template<typename ArrayType> void array(const ArrayType& m)
   VERIFY_IS_APPROX(m3.rowwise() += rv1, m1.rowwise() + rv1);
   m3 = m1;
   VERIFY_IS_APPROX(m3.rowwise() -= rv1, m1.rowwise() - rv1);
+  
+  // Conversion from scalar
+  VERIFY_IS_APPROX((m3 = s1), ArrayType::Constant(rows,cols,s1));
+  VERIFY_IS_APPROX((m3 = 1),  ArrayType::Constant(rows,cols,1));
+  VERIFY_IS_APPROX((m3.topLeftCorner(rows,cols) = 1),  ArrayType::Constant(rows,cols,1));
+  typedef Array<Scalar,
+                ArrayType::RowsAtCompileTime==Dynamic?2:ArrayType::RowsAtCompileTime,
+                ArrayType::ColsAtCompileTime==Dynamic?2:ArrayType::ColsAtCompileTime,
+                ArrayType::Options> FixedArrayType;
+  FixedArrayType f1(s1);
+  VERIFY_IS_APPROX(f1, FixedArrayType::Constant(s1));
+  FixedArrayType f2(numext::real(s1));
+  VERIFY_IS_APPROX(f2, FixedArrayType::Constant(numext::real(s1)));
+  FixedArrayType f3((int)100*numext::real(s1));
+  VERIFY_IS_APPROX(f3, FixedArrayType::Constant((int)100*numext::real(s1)));
+  f1.setRandom();
+  FixedArrayType f4(f1.data());
+  VERIFY_IS_APPROX(f4, f1);
+  
+  // pow
+  VERIFY_IS_APPROX(m1.pow(2), m1.square());
+  VERIFY_IS_APPROX(pow(m1,2), m1.square());
+  VERIFY_IS_APPROX(m1.pow(3), m1.cube());
+  VERIFY_IS_APPROX(pow(m1,3), m1.cube());
+  VERIFY_IS_APPROX((-m1).pow(3), -m1.cube());
+  VERIFY_IS_APPROX(pow(2*m1,3), 8*m1.cube());
+  ArrayType exponents = ArrayType::Constant(rows, cols, RealScalar(2));
+  VERIFY_IS_APPROX(Eigen::pow(m1,exponents), m1.square());
+  VERIFY_IS_APPROX(m1.pow(exponents), m1.square());
+  VERIFY_IS_APPROX(Eigen::pow(2*m1,exponents), 4*m1.square());
+  VERIFY_IS_APPROX((2*m1).pow(exponents), 4*m1.square());
+  VERIFY_IS_APPROX(Eigen::pow(m1,2*exponents), m1.square().square());
+  VERIFY_IS_APPROX(m1.pow(2*exponents), m1.square().square());
+  VERIFY_IS_APPROX(Eigen::pow(m1(0,0), exponents), ArrayType::Constant(rows,cols,m1(0,0)*m1(0,0)));
+
+  // Check possible conflicts with 1D ctor
+  typedef Array<Scalar, Dynamic, 1> OneDArrayType;
+  OneDArrayType o1(rows);
+  VERIFY(o1.size()==rows);
+  OneDArrayType o4((int)rows);
+  VERIFY(o4.size()==rows);
 }
 
 template<typename ArrayType> void comparisons(const ArrayType& m)
@@ -97,8 +141,11 @@ template<typename ArrayType> void comparisons(const ArrayType& m)
         c = internal::random<Index>(0, cols-1);
 
   ArrayType m1 = ArrayType::Random(rows, cols),
-             m2 = ArrayType::Random(rows, cols),
-             m3(rows, cols);            
+            m2 = ArrayType::Random(rows, cols),
+            m3(rows, cols),
+            m4 = m1;
+  
+  m4 = (m4.abs()==Scalar(0)).select(1,m4);
 
   VERIFY(((m1 + Scalar(1)) > m1).all());
   VERIFY(((m1 - Scalar(1)) < m1).all());
@@ -112,11 +159,17 @@ template<typename ArrayType> void comparisons(const ArrayType& m)
   VERIFY(!(m1 > m2 && m1 < m2).any());
   VERIFY((m1 <= m2 || m1 >= m2).all());
 
-  // comparisons to scalar
+  // comparisons array to scalar
   VERIFY( (m1 != (m1(r,c)+1) ).any() );
-  VERIFY( (m1 > (m1(r,c)-1) ).any() );
-  VERIFY( (m1 < (m1(r,c)+1) ).any() );
-  VERIFY( (m1 == m1(r,c) ).any() );
+  VERIFY( (m1 >  (m1(r,c)-1) ).any() );
+  VERIFY( (m1 <  (m1(r,c)+1) ).any() );
+  VERIFY( (m1 ==  m1(r,c)    ).any() );
+
+  // comparisons scalar to array
+  VERIFY( ( (m1(r,c)+1) != m1).any() );
+  VERIFY( ( (m1(r,c)-1) <  m1).any() );
+  VERIFY( ( (m1(r,c)+1) >  m1).any() );
+  VERIFY( (  m1(r,c)    == m1).any() );
 
   // test Select
   VERIFY_IS_APPROX( (m1<m2).select(m1,m2), m1.cwiseMin(m2) );
@@ -164,21 +217,69 @@ template<typename ArrayType> void array_real(const ArrayType& m)
 
   ArrayType m1 = ArrayType::Random(rows, cols),
             m2 = ArrayType::Random(rows, cols),
-            m3(rows, cols);
+            m3(rows, cols),
+            m4 = m1;
+
+  m4 = (m4.abs()==Scalar(0)).select(1,m4);
 
   Scalar  s1 = internal::random<Scalar>();
 
-  // these tests are mostly to check possible compilation issues.
+  // these tests are mostly to check possible compilation issues with free-functions.
   VERIFY_IS_APPROX(m1.sin(), sin(m1));
   VERIFY_IS_APPROX(m1.cos(), cos(m1));
+  VERIFY_IS_APPROX(m1.tan(), tan(m1));
   VERIFY_IS_APPROX(m1.asin(), asin(m1));
   VERIFY_IS_APPROX(m1.acos(), acos(m1));
-  VERIFY_IS_APPROX(m1.tan(), tan(m1));
-  
+  VERIFY_IS_APPROX(m1.atan(), atan(m1));
+  VERIFY_IS_APPROX(m1.sinh(), sinh(m1));
+  VERIFY_IS_APPROX(m1.cosh(), cosh(m1));
+  VERIFY_IS_APPROX(m1.tanh(), tanh(m1));
+
+  VERIFY_IS_APPROX(m1.arg(), arg(m1));
+  VERIFY_IS_APPROX(m1.round(), round(m1));
+  VERIFY_IS_APPROX(m1.floor(), floor(m1));
+  VERIFY_IS_APPROX(m1.ceil(), ceil(m1));
+  VERIFY((m1.isNaN() == (Eigen::isnan)(m1)).all());
+  VERIFY((m1.isInf() == (Eigen::isinf)(m1)).all());
+  VERIFY((m1.isFinite() == (Eigen::isfinite)(m1)).all());
+  VERIFY_IS_APPROX(m1.inverse(), inverse(m1));
+  VERIFY_IS_APPROX(m1.abs(), abs(m1));
+  VERIFY_IS_APPROX(m1.abs2(), abs2(m1));
+  VERIFY_IS_APPROX(m1.square(), square(m1));
+  VERIFY_IS_APPROX(m1.cube(), cube(m1));
   VERIFY_IS_APPROX(cos(m1+RealScalar(3)*m2), cos((m1+RealScalar(3)*m2).eval()));
+  VERIFY_IS_APPROX(m1.sign(), sign(m1));
+
 
-  VERIFY_IS_APPROX(m1.abs().sqrt(), sqrt(abs(m1)));
-  VERIFY_IS_APPROX(m1.abs(), sqrt(numext::abs2(m1)));
+  // avoid NaNs with abs() so verification doesn't fail
+  m3 = m1.abs();
+  VERIFY_IS_APPROX(m3.sqrt(), sqrt(abs(m1)));
+  VERIFY_IS_APPROX(m3.rsqrt(), Scalar(1)/sqrt(abs(m1)));
+  VERIFY_IS_APPROX(rsqrt(m3), Scalar(1)/sqrt(abs(m1)));
+  VERIFY_IS_APPROX(m3.log(), log(m3));
+  VERIFY_IS_APPROX(m3.log1p(), log1p(m3));
+  VERIFY_IS_APPROX(m3.log10(), log10(m3));
+
+
+  VERIFY((!(m1>m2) == (m1<=m2)).all());
+
+  VERIFY_IS_APPROX(sin(m1.asin()), m1);
+  VERIFY_IS_APPROX(cos(m1.acos()), m1);
+  VERIFY_IS_APPROX(tan(m1.atan()), m1);
+  VERIFY_IS_APPROX(sinh(m1), 0.5*(exp(m1)-exp(-m1)));
+  VERIFY_IS_APPROX(cosh(m1), 0.5*(exp(m1)+exp(-m1)));
+  VERIFY_IS_APPROX(tanh(m1), (0.5*(exp(m1)-exp(-m1)))/(0.5*(exp(m1)+exp(-m1))));
+  VERIFY_IS_APPROX(arg(m1), ((m1<0).template cast<Scalar>())*std::acos(-1.0));
+  VERIFY((round(m1) <= ceil(m1) && round(m1) >= floor(m1)).all());
+  VERIFY((Eigen::isnan)((m1*0.0)/0.0).all());
+  VERIFY((Eigen::isinf)(m4/0.0).all());
+  VERIFY(((Eigen::isfinite)(m1) && (!(Eigen::isfinite)(m1*0.0/0.0)) && (!(Eigen::isfinite)(m4/0.0))).all());
+  VERIFY_IS_APPROX(inverse(inverse(m1)),m1);
+  VERIFY((abs(m1) == m1 || abs(m1) == -m1).all());
+  VERIFY_IS_APPROX(m3, sqrt(abs2(m1)));
+  VERIFY_IS_APPROX( m1.sign(), -(-m1).sign() );
+  VERIFY_IS_APPROX( m1*m1.sign(),m1.abs());
+  VERIFY_IS_APPROX(m1.sign() * m1.abs(), m1);
 
   VERIFY_IS_APPROX(numext::abs2(numext::real(m1)) + numext::abs2(numext::imag(m1)), numext::abs2(m1));
   VERIFY_IS_APPROX(numext::abs2(real(m1)) + numext::abs2(imag(m1)), numext::abs2(m1));
@@ -187,52 +288,138 @@ template<typename ArrayType> void array_real(const ArrayType& m)
 
   // shift argument of logarithm so that it is not zero
   Scalar smallNumber = NumTraits<Scalar>::dummy_precision();
-  VERIFY_IS_APPROX((m1.abs() + smallNumber).log() , log(abs(m1) + smallNumber));
+  VERIFY_IS_APPROX((m3 + smallNumber).log() , log(abs(m1) + smallNumber));
+  VERIFY_IS_APPROX((m3 + smallNumber + 1).log() , log1p(abs(m1) + smallNumber));
 
   VERIFY_IS_APPROX(m1.exp() * m2.exp(), exp(m1+m2));
   VERIFY_IS_APPROX(m1.exp(), exp(m1));
   VERIFY_IS_APPROX(m1.exp() / m2.exp(),(m1-m2).exp());
 
-  VERIFY_IS_APPROX(m1.pow(2), m1.square());
-  VERIFY_IS_APPROX(pow(m1,2), m1.square());
-
-  ArrayType exponents = ArrayType::Constant(rows, cols, RealScalar(2));
-  VERIFY_IS_APPROX(Eigen::pow(m1,exponents), m1.square());
-
-  m3 = m1.abs();
   VERIFY_IS_APPROX(m3.pow(RealScalar(0.5)), m3.sqrt());
   VERIFY_IS_APPROX(pow(m3,RealScalar(0.5)), m3.sqrt());
 
+  VERIFY_IS_APPROX(m3.pow(RealScalar(-0.5)), m3.rsqrt());
+  VERIFY_IS_APPROX(pow(m3,RealScalar(-0.5)), m3.rsqrt());
+
+  VERIFY_IS_APPROX(log10(m3), log(m3)/log(10));
+
   // scalar by array division
   const RealScalar tiny = sqrt(std::numeric_limits<RealScalar>::epsilon());
   s1 += Scalar(tiny);
   m1 += ArrayType::Constant(rows,cols,Scalar(tiny));
   VERIFY_IS_APPROX(s1/m1, s1 * m1.inverse());
-  
+
   // check inplace transpose
   m3 = m1;
   m3.transposeInPlace();
-  VERIFY_IS_APPROX(m3,m1.transpose());
+  VERIFY_IS_APPROX(m3, m1.transpose());
   m3.transposeInPlace();
-  VERIFY_IS_APPROX(m3,m1);
+  VERIFY_IS_APPROX(m3, m1);
 }
 
 template<typename ArrayType> void array_complex(const ArrayType& m)
 {
   typedef typename ArrayType::Index Index;
+  typedef typename ArrayType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
 
   Index rows = m.rows();
   Index cols = m.cols();
 
   ArrayType m1 = ArrayType::Random(rows, cols),
-            m2(rows, cols);
+            m2(rows, cols),
+            m4 = m1;
+  
+  m4.real() = (m4.real().abs()==RealScalar(0)).select(RealScalar(1),m4.real());
+  m4.imag() = (m4.imag().abs()==RealScalar(0)).select(RealScalar(1),m4.imag());
+
+  Array<RealScalar, -1, -1> m3(rows, cols);
 
   for (Index i = 0; i < m.rows(); ++i)
     for (Index j = 0; j < m.cols(); ++j)
       m2(i,j) = sqrt(m1(i,j));
 
-  VERIFY_IS_APPROX(m1.sqrt(), m2);
-  VERIFY_IS_APPROX(m1.sqrt(), Eigen::sqrt(m1));
+  // these tests are mostly to check possible compilation issues with free-functions.
+  VERIFY_IS_APPROX(m1.sin(), sin(m1));
+  VERIFY_IS_APPROX(m1.cos(), cos(m1));
+  VERIFY_IS_APPROX(m1.tan(), tan(m1));
+  VERIFY_IS_APPROX(m1.sinh(), sinh(m1));
+  VERIFY_IS_APPROX(m1.cosh(), cosh(m1));
+  VERIFY_IS_APPROX(m1.tanh(), tanh(m1));
+  VERIFY_IS_APPROX(m1.arg(), arg(m1));
+  VERIFY((m1.isNaN() == (Eigen::isnan)(m1)).all());
+  VERIFY((m1.isInf() == (Eigen::isinf)(m1)).all());
+  VERIFY((m1.isFinite() == (Eigen::isfinite)(m1)).all());
+  VERIFY_IS_APPROX(m1.inverse(), inverse(m1));
+  VERIFY_IS_APPROX(m1.log(), log(m1));
+  VERIFY_IS_APPROX(m1.log10(), log10(m1));
+  VERIFY_IS_APPROX(m1.abs(), abs(m1));
+  VERIFY_IS_APPROX(m1.abs2(), abs2(m1));
+  VERIFY_IS_APPROX(m1.sqrt(), sqrt(m1));
+  VERIFY_IS_APPROX(m1.square(), square(m1));
+  VERIFY_IS_APPROX(m1.cube(), cube(m1));
+  VERIFY_IS_APPROX(cos(m1+RealScalar(3)*m2), cos((m1+RealScalar(3)*m2).eval()));
+  VERIFY_IS_APPROX(m1.sign(), sign(m1));
+
+
+  VERIFY_IS_APPROX(m1.exp() * m2.exp(), exp(m1+m2));
+  VERIFY_IS_APPROX(m1.exp(), exp(m1));
+  VERIFY_IS_APPROX(m1.exp() / m2.exp(),(m1-m2).exp());
+
+  VERIFY_IS_APPROX(sinh(m1), 0.5*(exp(m1)-exp(-m1)));
+  VERIFY_IS_APPROX(cosh(m1), 0.5*(exp(m1)+exp(-m1)));
+  VERIFY_IS_APPROX(tanh(m1), (0.5*(exp(m1)-exp(-m1)))/(0.5*(exp(m1)+exp(-m1))));
+
+  for (Index i = 0; i < m.rows(); ++i)
+    for (Index j = 0; j < m.cols(); ++j)
+      m3(i,j) = std::atan2(imag(m1(i,j)), real(m1(i,j)));
+  VERIFY_IS_APPROX(arg(m1), m3);
+
+  std::complex<RealScalar> zero(0.0,0.0);
+  VERIFY((Eigen::isnan)(m1*zero/zero).all());
+#if EIGEN_COMP_MSVC
+  // msvc complex division is not robust
+  VERIFY((Eigen::isinf)(m4/RealScalar(0)).all());
+#else
+#if EIGEN_COMP_CLANG
+  // clang's complex division is notoriously broken too
+  if((numext::isinf)(m4(0,0)/RealScalar(0))) {
+#endif
+    VERIFY((Eigen::isinf)(m4/zero).all());
+#if EIGEN_COMP_CLANG
+  }
+  else
+  {
+    VERIFY((Eigen::isinf)(m4.real()/zero.real()).all());
+  }
+#endif
+#endif // MSVC
+
+  VERIFY(((Eigen::isfinite)(m1) && (!(Eigen::isfinite)(m1*zero/zero)) && (!(Eigen::isfinite)(m1/zero))).all());
+
+  VERIFY_IS_APPROX(inverse(inverse(m1)),m1);
+  VERIFY_IS_APPROX(conj(m1.conjugate()), m1);
+  VERIFY_IS_APPROX(abs(m1), sqrt(square(real(m1))+square(imag(m1))));
+  VERIFY_IS_APPROX(abs(m1), sqrt(abs2(m1)));
+  VERIFY_IS_APPROX(log10(m1), log(m1)/log(10));
+
+  VERIFY_IS_APPROX( m1.sign(), -(-m1).sign() );
+  VERIFY_IS_APPROX( m1.sign() * m1.abs(), m1);
+
+  // scalar by array division
+  Scalar  s1 = internal::random<Scalar>();
+  const RealScalar tiny = std::sqrt(std::numeric_limits<RealScalar>::epsilon());
+  s1 += Scalar(tiny);
+  m1 += ArrayType::Constant(rows,cols,Scalar(tiny));
+  VERIFY_IS_APPROX(s1/m1, s1 * m1.inverse());
+
+  // check inplace transpose
+  m2 = m1;
+  m2.transposeInPlace();
+  VERIFY_IS_APPROX(m2, m1.transpose());
+  m2.transposeInPlace();
+  VERIFY_IS_APPROX(m2, m1);
+
 }
 
 template<typename ArrayType> void min_max(const ArrayType& m)
@@ -301,7 +488,7 @@ void test_array()
   VERIFY((internal::is_same< internal::global_math_functions_filtering_base<int>::type, int >::value));
   VERIFY((internal::is_same< internal::global_math_functions_filtering_base<float>::type, float >::value));
   VERIFY((internal::is_same< internal::global_math_functions_filtering_base<Array2i>::type, ArrayBase<Array2i> >::value));
-  typedef CwiseUnaryOp<internal::scalar_sum_op<double>, ArrayXd > Xpr;
+  typedef CwiseUnaryOp<internal::scalar_abs_op<double>, ArrayXd > Xpr;
   VERIFY((internal::is_same< internal::global_math_functions_filtering_base<Xpr>::type,
                            ArrayBase<Xpr>
                          >::value));
diff --git a/test/array_for_matrix.cpp b/test/array_for_matrix.cpp
index 9667e1f14..c1501947b 100644
--- a/test/array_for_matrix.cpp
+++ b/test/array_for_matrix.cpp
@@ -45,7 +45,7 @@ template<typename MatrixType> void array_for_matrix(const MatrixType& m)
   VERIFY_IS_MUCH_SMALLER_THAN(m1.rowwise().sum().sum() - m1.sum(), m1.squaredNorm());
   VERIFY_IS_MUCH_SMALLER_THAN(m1.colwise().sum() + m2.colwise().sum() - (m1+m2).colwise().sum(), (m1+m2).squaredNorm());
   VERIFY_IS_MUCH_SMALLER_THAN(m1.rowwise().sum() - m2.rowwise().sum() - (m1-m2).rowwise().sum(), (m1-m2).squaredNorm());
-  VERIFY_IS_APPROX(m1.colwise().sum(), m1.colwise().redux(internal::scalar_sum_op<Scalar>()));
+  VERIFY_IS_APPROX(m1.colwise().sum(), m1.colwise().redux(internal::scalar_sum_op<Scalar,Scalar>()));
 
   // vector-wise ops
   m3 = m1;
@@ -68,6 +68,16 @@ template<typename MatrixType> void array_for_matrix(const MatrixType& m)
   const Scalar& ref_a2 = m.array().matrix().coeffRef(0,0);
   VERIFY(&ref_a1 == &ref_m1);
   VERIFY(&ref_a2 == &ref_m2);
+
+  // Check write accessors:
+  m1.array().coeffRef(0,0) = 1;
+  VERIFY_IS_APPROX(m1(0,0),Scalar(1));
+  m1.array()(0,0) = 2;
+  VERIFY_IS_APPROX(m1(0,0),Scalar(2));
+  m1.array().matrix().coeffRef(0,0) = 3;
+  VERIFY_IS_APPROX(m1(0,0),Scalar(3));
+  m1.array().matrix()(0,0) = 4;
+  VERIFY_IS_APPROX(m1(0,0),Scalar(4));
 }
 
 template<typename MatrixType> void comparisons(const MatrixType& m)
@@ -124,6 +134,12 @@ template<typename MatrixType> void comparisons(const MatrixType& m)
   // count
   VERIFY(((m1.array().abs()+1)>RealScalar(0.1)).count() == rows*cols);
 
+  // and/or
+  VERIFY( ((m1.array()<RealScalar(0)).matrix() && (m1.array()>RealScalar(0)).matrix()).count() == 0);
+  VERIFY( ((m1.array()<RealScalar(0)).matrix() || (m1.array()>=RealScalar(0)).matrix()).count() == rows*cols);
+  RealScalar a = m1.cwiseAbs().mean();
+  VERIFY( ((m1.array()<-a).matrix() || (m1.array()>a).matrix()).count() == (m1.cwiseAbs().array()>a).count());
+
   typedef Matrix<typename MatrixType::Index, Dynamic, 1> VectorOfIndices;
 
   // TODO allows colwise/rowwise for array
@@ -134,9 +150,21 @@ template<typename MatrixType> void comparisons(const MatrixType& m)
 template<typename VectorType> void lpNorm(const VectorType& v)
 {
   using std::sqrt;
+  typedef typename VectorType::RealScalar RealScalar;
   VectorType u = VectorType::Random(v.size());
 
-  VERIFY_IS_APPROX(u.template lpNorm<Infinity>(), u.cwiseAbs().maxCoeff());
+  if(v.size()==0)
+  {
+    VERIFY_IS_APPROX(u.template lpNorm<Infinity>(), RealScalar(0));
+    VERIFY_IS_APPROX(u.template lpNorm<1>(), RealScalar(0));
+    VERIFY_IS_APPROX(u.template lpNorm<2>(), RealScalar(0));
+    VERIFY_IS_APPROX(u.template lpNorm<5>(), RealScalar(0));
+  }
+  else
+  {
+    VERIFY_IS_APPROX(u.template lpNorm<Infinity>(), u.cwiseAbs().maxCoeff());
+  }
+
   VERIFY_IS_APPROX(u.template lpNorm<1>(), u.cwiseAbs().sum());
   VERIFY_IS_APPROX(u.template lpNorm<2>(), sqrt(u.array().abs().square().sum()));
   VERIFY_IS_APPROX(numext::pow(u.template lpNorm<5>(), typename VectorType::RealScalar(5)), u.array().abs().pow(5).sum());
@@ -245,6 +273,8 @@ void test_array_for_matrix()
     CALL_SUBTEST_5( lpNorm(VectorXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
     CALL_SUBTEST_4( lpNorm(VectorXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
   }
+  CALL_SUBTEST_5( lpNorm(VectorXf(0)) );
+  CALL_SUBTEST_4( lpNorm(VectorXcf(0)) );
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_4( resize(MatrixXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
     CALL_SUBTEST_5( resize(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
diff --git a/test/array_of_string.cpp b/test/array_of_string.cpp
new file mode 100644
index 000000000..e23b7c59e
--- /dev/null
+++ b/test/array_of_string.cpp
@@ -0,0 +1,32 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+void test_array_of_string()
+{
+  typedef Array<std::string,1,Dynamic> ArrayXs;
+  ArrayXs a1(3), a2(3), a3(3), a3ref(3);
+  a1 << "one", "two", "three";
+  a2 << "1", "2", "3";
+  a3ref << "one (1)", "two (2)", "three (3)";
+  std::stringstream s1;
+  s1 << a1;
+  VERIFY_IS_EQUAL(s1.str(), std::string("  one    two  three"));
+  a3 = a1 + std::string(" (") + a2 + std::string(")");
+  VERIFY((a3==a3ref).all());
+
+  a3 = a1;
+  a3 += std::string(" (") + a2 + std::string(")");
+  VERIFY((a3==a3ref).all());
+
+  a1.swap(a3);
+  VERIFY((a1==a3ref).all());
+  VERIFY((a3!=a3ref).all());
+}
diff --git a/test/array_replicate.cpp b/test/array_replicate.cpp
index f412d1aed..779c8fc2f 100644
--- a/test/array_replicate.cpp
+++ b/test/array_replicate.cpp
@@ -44,6 +44,19 @@ template<typename MatrixType> void replicate(const MatrixType& m)
   x2 << m2, m2, m2,
         m2, m2, m2;
   VERIFY_IS_APPROX(x2, (m2.template replicate<2,3>()));
+  
+  x2.resize(rows,3*cols);
+  x2 << m2, m2, m2;
+  VERIFY_IS_APPROX(x2, (m2.template replicate<1,3>()));
+  
+  vx1.resize(3*rows,cols);
+  vx1 << m2, m2, m2;
+  VERIFY_IS_APPROX(vx1+vx1, vx1+(m2.template replicate<3,1>()));
+  
+  vx1=m2+(m2.colwise().replicate(1));
+  
+  if(m2.cols()==1)
+    VERIFY_IS_APPROX(m2.coeff(0), (m2.template replicate<3,1>().coeff(m2.rows())));
 
   x2.resize(rows,f1);
   for (int j=0; j<f1; ++j)
diff --git a/test/array_reverse.cpp b/test/array_reverse.cpp
index fbe7a9901..c9d9f90c3 100644
--- a/test/array_reverse.cpp
+++ b/test/array_reverse.cpp
@@ -24,7 +24,7 @@ template<typename MatrixType> void reverse(const MatrixType& m)
 
   // this test relies a lot on Random.h, and there's not much more that we can do
   // to test it, hence I consider that we will have tested Random.h
-  MatrixType m1 = MatrixType::Random(rows, cols);
+  MatrixType m1 = MatrixType::Random(rows, cols), m2;
   VectorType v1 = VectorType::Random(rows);
 
   MatrixType m1_r = m1.reverse();
@@ -96,14 +96,32 @@ template<typename MatrixType> void reverse(const MatrixType& m)
 
   m1.reverse()(r, c) = x;
   VERIFY_IS_APPROX(x, m1(rows - 1 - r, cols - 1 - c));
+  
+  m2 = m1;
+  m2.reverseInPlace();
+  VERIFY_IS_APPROX(m2,m1.reverse().eval());
+  
+  m2 = m1;
+  m2.col(0).reverseInPlace();
+  VERIFY_IS_APPROX(m2.col(0),m1.col(0).reverse().eval());
+  
+  m2 = m1;
+  m2.row(0).reverseInPlace();
+  VERIFY_IS_APPROX(m2.row(0),m1.row(0).reverse().eval());
+  
+  m2 = m1;
+  m2.rowwise().reverseInPlace();
+  VERIFY_IS_APPROX(m2,m1.rowwise().reverse().eval());
+  
+  m2 = m1;
+  m2.colwise().reverseInPlace();
+  VERIFY_IS_APPROX(m2,m1.colwise().reverse().eval());
 
-  /*
   m1.colwise().reverse()(r, c) = x;
   VERIFY_IS_APPROX(x, m1(rows - 1 - r, c));
 
   m1.rowwise().reverse()(r, c) = x;
   VERIFY_IS_APPROX(x, m1(r, cols - 1 - c));
-  */
 }
 
 void test_array_reverse()
@@ -113,11 +131,11 @@ void test_array_reverse()
     CALL_SUBTEST_2( reverse(Matrix2f()) );
     CALL_SUBTEST_3( reverse(Matrix4f()) );
     CALL_SUBTEST_4( reverse(Matrix4d()) );
-    CALL_SUBTEST_5( reverse(MatrixXcf(3, 3)) );
-    CALL_SUBTEST_6( reverse(MatrixXi(6, 3)) );
-    CALL_SUBTEST_7( reverse(MatrixXcd(20, 20)) );
+    CALL_SUBTEST_5( reverse(MatrixXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_6( reverse(MatrixXi(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_7( reverse(MatrixXcd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
     CALL_SUBTEST_8( reverse(Matrix<float, 100, 100>()) );
-    CALL_SUBTEST_9( reverse(Matrix<float,Dynamic,Dynamic,RowMajor>(6,3)) );
+    CALL_SUBTEST_9( reverse(Matrix<float,Dynamic,Dynamic,RowMajor>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
   }
 #ifdef EIGEN_TEST_PART_3
   Vector4f x; x << 1, 2, 3, 4;
diff --git a/test/bandmatrix.cpp b/test/bandmatrix.cpp
index 5e4e8e07b..f8c38f7c3 100644
--- a/test/bandmatrix.cpp
+++ b/test/bandmatrix.cpp
@@ -11,7 +11,6 @@
 
 template<typename MatrixType> void bandmatrix(const MatrixType& _m)
 {
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::Scalar Scalar;
   typedef typename NumTraits<Scalar>::Real RealScalar;
   typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrixType;
@@ -62,8 +61,6 @@ using Eigen::internal::BandMatrix;
 
 void test_bandmatrix()
 {
-  typedef BandMatrix<float>::Index Index;
-
   for(int i = 0; i < 10*g_repeat ; i++) {
     Index rows = internal::random<Index>(1,10);
     Index cols = internal::random<Index>(1,10);
diff --git a/test/basicstuff.cpp b/test/basicstuff.cpp
index 8c0621ecd..99d91f9da 100644
--- a/test/basicstuff.cpp
+++ b/test/basicstuff.cpp
@@ -126,6 +126,20 @@ template<typename MatrixType> void basicStuff(const MatrixType& m)
   for(typename MatrixType::Index i=0;i<rows;++i)
     sm2.col(i).noalias() -= sm1.row(i);
   VERIFY_IS_APPROX(sm2,-sm1.transpose());
+  
+  // check ternary usage
+  {
+    bool b = internal::random<int>(0,10)>5;
+    m3 = b ? m1 : m2;
+    if(b) VERIFY_IS_APPROX(m3,m1);
+    else  VERIFY_IS_APPROX(m3,m2);
+    m3 = b ? -m1 : m2;
+    if(b) VERIFY_IS_APPROX(m3,-m1);
+    else  VERIFY_IS_APPROX(m3,m2);
+    m3 = b ? m1 : -m2;
+    if(b) VERIFY_IS_APPROX(m3,m1);
+    else  VERIFY_IS_APPROX(m3,-m2);
+  }
 }
 
 template<typename MatrixType> void basicStuffComplex(const MatrixType& m)
@@ -180,15 +194,64 @@ void casting()
 template <typename Scalar>
 void fixedSizeMatrixConstruction()
 {
-  const Scalar raw[3] = {1,2,3};
-  Matrix<Scalar,3,1> m(raw);
-  Array<Scalar,3,1> a(raw);
-  VERIFY(m(0) == 1);
-  VERIFY(m(1) == 2);
-  VERIFY(m(2) == 3);
-  VERIFY(a(0) == 1);
-  VERIFY(a(1) == 2);
-  VERIFY(a(2) == 3);  
+  Scalar raw[4];
+  for(int k=0; k<4; ++k)
+    raw[k] = internal::random<Scalar>();
+  
+  {
+    Matrix<Scalar,4,1> m(raw);
+    Array<Scalar,4,1> a(raw);
+    for(int k=0; k<4; ++k) VERIFY(m(k) == raw[k]);
+    for(int k=0; k<4; ++k) VERIFY(a(k) == raw[k]);    
+    VERIFY_IS_EQUAL(m,(Matrix<Scalar,4,1>(raw[0],raw[1],raw[2],raw[3])));
+    VERIFY((a==(Array<Scalar,4,1>(raw[0],raw[1],raw[2],raw[3]))).all());
+  }
+  {
+    Matrix<Scalar,3,1> m(raw);
+    Array<Scalar,3,1> a(raw);
+    for(int k=0; k<3; ++k) VERIFY(m(k) == raw[k]);
+    for(int k=0; k<3; ++k) VERIFY(a(k) == raw[k]);
+    VERIFY_IS_EQUAL(m,(Matrix<Scalar,3,1>(raw[0],raw[1],raw[2])));
+    VERIFY((a==Array<Scalar,3,1>(raw[0],raw[1],raw[2])).all());
+  }
+  {
+    Matrix<Scalar,2,1> m(raw), m2( (DenseIndex(raw[0])), (DenseIndex(raw[1])) );
+    Array<Scalar,2,1> a(raw),  a2( (DenseIndex(raw[0])), (DenseIndex(raw[1])) );
+    for(int k=0; k<2; ++k) VERIFY(m(k) == raw[k]);
+    for(int k=0; k<2; ++k) VERIFY(a(k) == raw[k]);
+    VERIFY_IS_EQUAL(m,(Matrix<Scalar,2,1>(raw[0],raw[1])));
+    VERIFY((a==Array<Scalar,2,1>(raw[0],raw[1])).all());
+    for(int k=0; k<2; ++k) VERIFY(m2(k) == DenseIndex(raw[k]));
+    for(int k=0; k<2; ++k) VERIFY(a2(k) == DenseIndex(raw[k]));
+  }
+  {
+    Matrix<Scalar,1,2> m(raw),
+                       m2( (DenseIndex(raw[0])), (DenseIndex(raw[1])) ),
+                       m3( (int(raw[0])), (int(raw[1])) ),
+                       m4( (float(raw[0])), (float(raw[1])) );
+    Array<Scalar,1,2> a(raw),  a2( (DenseIndex(raw[0])), (DenseIndex(raw[1])) );
+    for(int k=0; k<2; ++k) VERIFY(m(k) == raw[k]);
+    for(int k=0; k<2; ++k) VERIFY(a(k) == raw[k]);
+    VERIFY_IS_EQUAL(m,(Matrix<Scalar,1,2>(raw[0],raw[1])));
+    VERIFY((a==Array<Scalar,1,2>(raw[0],raw[1])).all());
+    for(int k=0; k<2; ++k) VERIFY(m2(k) == DenseIndex(raw[k]));
+    for(int k=0; k<2; ++k) VERIFY(a2(k) == DenseIndex(raw[k]));
+    for(int k=0; k<2; ++k) VERIFY(m3(k) == int(raw[k]));
+    for(int k=0; k<2; ++k) VERIFY((m4(k)) == Scalar(float(raw[k])));
+  }
+  {
+    Matrix<Scalar,1,1> m(raw), m1(raw[0]), m2( (DenseIndex(raw[0])) ), m3( (int(raw[0])) );
+    Array<Scalar,1,1> a(raw), a1(raw[0]), a2( (DenseIndex(raw[0])) );
+    VERIFY(m(0) == raw[0]);
+    VERIFY(a(0) == raw[0]);
+    VERIFY(m1(0) == raw[0]);
+    VERIFY(a1(0) == raw[0]);
+    VERIFY(m2(0) == DenseIndex(raw[0]));
+    VERIFY(a2(0) == DenseIndex(raw[0]));
+    VERIFY(m3(0) == int(raw[0]));
+    VERIFY_IS_EQUAL(m,(Matrix<Scalar,1,1>(raw[0])));
+    VERIFY((a==Array<Scalar,1,1>(raw[0])).all());
+  }
 }
 
 void test_basicstuff()
@@ -207,8 +270,11 @@ void test_basicstuff()
   }
 
   CALL_SUBTEST_1(fixedSizeMatrixConstruction<unsigned char>());
+  CALL_SUBTEST_1(fixedSizeMatrixConstruction<float>());
   CALL_SUBTEST_1(fixedSizeMatrixConstruction<double>());
-  CALL_SUBTEST_1(fixedSizeMatrixConstruction<double>());
+  CALL_SUBTEST_1(fixedSizeMatrixConstruction<int>());
+  CALL_SUBTEST_1(fixedSizeMatrixConstruction<long int>());
+  CALL_SUBTEST_1(fixedSizeMatrixConstruction<std::ptrdiff_t>());
 
   CALL_SUBTEST_2(casting());
 }
diff --git a/test/bdcsvd.cpp b/test/bdcsvd.cpp
new file mode 100644
index 000000000..f9f687aac
--- /dev/null
+++ b/test/bdcsvd.cpp
@@ -0,0 +1,111 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Gauthier Brun <brun.gauthier@gmail.com>
+// Copyright (C) 2013 Nicolas Carre <nicolas.carre@ensimag.fr>
+// Copyright (C) 2013 Jean Ceccato <jean.ceccato@ensimag.fr>
+// Copyright (C) 2013 Pierre Zoppitelli <pierre.zoppitelli@ensimag.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/
+
+// discard stack allocation as that too bypasses malloc
+#define EIGEN_STACK_ALLOCATION_LIMIT 0
+#define EIGEN_RUNTIME_NO_MALLOC
+
+#include "main.h"
+#include <Eigen/SVD>
+#include <iostream>
+#include <Eigen/LU>
+
+
+#define SVD_DEFAULT(M) BDCSVD<M>
+#define SVD_FOR_MIN_NORM(M) BDCSVD<M>
+#include "svd_common.h"
+
+// Check all variants of JacobiSVD
+template<typename MatrixType>
+void bdcsvd(const MatrixType& a = MatrixType(), bool pickrandom = true)
+{
+  MatrixType m = a;
+  if(pickrandom)
+    svd_fill_random(m);
+
+  CALL_SUBTEST(( svd_test_all_computation_options<BDCSVD<MatrixType> >(m, false)  ));
+}
+
+template<typename MatrixType>
+void bdcsvd_method()
+{
+  enum { Size = MatrixType::RowsAtCompileTime };
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef Matrix<RealScalar, Size, 1> RealVecType;
+  MatrixType m = MatrixType::Identity();
+  VERIFY_IS_APPROX(m.bdcSvd().singularValues(), RealVecType::Ones());
+  VERIFY_RAISES_ASSERT(m.bdcSvd().matrixU());
+  VERIFY_RAISES_ASSERT(m.bdcSvd().matrixV());
+  VERIFY_IS_APPROX(m.bdcSvd(ComputeFullU|ComputeFullV).solve(m), m);
+}
+
+// compare the Singular values returned with Jacobi and Bdc
+template<typename MatrixType> 
+void compare_bdc_jacobi(const MatrixType& a = MatrixType(), unsigned int computationOptions = 0)
+{
+  MatrixType m = MatrixType::Random(a.rows(), a.cols());
+  BDCSVD<MatrixType> bdc_svd(m);
+  JacobiSVD<MatrixType> jacobi_svd(m);
+  VERIFY_IS_APPROX(bdc_svd.singularValues(), jacobi_svd.singularValues());
+  if(computationOptions & ComputeFullU) VERIFY_IS_APPROX(bdc_svd.matrixU(), jacobi_svd.matrixU());
+  if(computationOptions & ComputeThinU) VERIFY_IS_APPROX(bdc_svd.matrixU(), jacobi_svd.matrixU());
+  if(computationOptions & ComputeFullV) VERIFY_IS_APPROX(bdc_svd.matrixV(), jacobi_svd.matrixV());
+  if(computationOptions & ComputeThinV) VERIFY_IS_APPROX(bdc_svd.matrixV(), jacobi_svd.matrixV());
+}
+
+void test_bdcsvd()
+{
+  CALL_SUBTEST_3(( svd_verify_assert<BDCSVD<Matrix3f>  >(Matrix3f()) ));
+  CALL_SUBTEST_4(( svd_verify_assert<BDCSVD<Matrix4d>  >(Matrix4d()) ));
+  CALL_SUBTEST_7(( svd_verify_assert<BDCSVD<MatrixXf>  >(MatrixXf(10,12)) ));
+  CALL_SUBTEST_8(( svd_verify_assert<BDCSVD<MatrixXcd> >(MatrixXcd(7,5)) ));
+  
+  CALL_SUBTEST_101(( svd_all_trivial_2x2(bdcsvd<Matrix2cd>) ));
+  CALL_SUBTEST_102(( svd_all_trivial_2x2(bdcsvd<Matrix2d>) ));
+
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_3(( bdcsvd<Matrix3f>() ));
+    CALL_SUBTEST_4(( bdcsvd<Matrix4d>() ));
+    CALL_SUBTEST_5(( bdcsvd<Matrix<float,3,5> >() ));
+
+    int r = internal::random<int>(1, EIGEN_TEST_MAX_SIZE/2),
+        c = internal::random<int>(1, EIGEN_TEST_MAX_SIZE/2);
+    
+    TEST_SET_BUT_UNUSED_VARIABLE(r)
+    TEST_SET_BUT_UNUSED_VARIABLE(c)
+    
+    CALL_SUBTEST_6((  bdcsvd(Matrix<double,Dynamic,2>(r,2)) ));
+    CALL_SUBTEST_7((  bdcsvd(MatrixXf(r,c)) ));
+    CALL_SUBTEST_7((  compare_bdc_jacobi(MatrixXf(r,c)) ));
+    CALL_SUBTEST_10(( bdcsvd(MatrixXd(r,c)) ));
+    CALL_SUBTEST_10(( compare_bdc_jacobi(MatrixXd(r,c)) ));
+    CALL_SUBTEST_8((  bdcsvd(MatrixXcd(r,c)) ));
+    CALL_SUBTEST_8((  compare_bdc_jacobi(MatrixXcd(r,c)) ));
+
+    // Test on inf/nan matrix
+    CALL_SUBTEST_7(  (svd_inf_nan<BDCSVD<MatrixXf>, MatrixXf>()) );
+    CALL_SUBTEST_10( (svd_inf_nan<BDCSVD<MatrixXd>, MatrixXd>()) );
+  }
+
+  // test matrixbase method
+  CALL_SUBTEST_1(( bdcsvd_method<Matrix2cd>() ));
+  CALL_SUBTEST_3(( bdcsvd_method<Matrix3f>() ));
+
+  // Test problem size constructors
+  CALL_SUBTEST_7( BDCSVD<MatrixXf>(10,10) );
+
+  // Check that preallocation avoids subsequent mallocs
+  CALL_SUBTEST_9( svd_preallocate<void>() );
+
+  CALL_SUBTEST_2( svd_underoverflow<void>() );
+}
+
diff --git a/test/bicgstab.cpp b/test/bicgstab.cpp
index f327e2fac..4cc0dd31c 100644
--- a/test/bicgstab.cpp
+++ b/test/bicgstab.cpp
@@ -10,13 +10,16 @@
 #include "sparse_solver.h"
 #include <Eigen/IterativeLinearSolvers>
 
-template<typename T> void test_bicgstab_T()
+template<typename T, typename I> void test_bicgstab_T()
 {
-  BiCGSTAB<SparseMatrix<T>, DiagonalPreconditioner<T> > bicgstab_colmajor_diag;
-  BiCGSTAB<SparseMatrix<T>, IdentityPreconditioner    > bicgstab_colmajor_I;
-  BiCGSTAB<SparseMatrix<T>, IncompleteLUT<T> >           bicgstab_colmajor_ilut;
+  BiCGSTAB<SparseMatrix<T,0,I>, DiagonalPreconditioner<T> >     bicgstab_colmajor_diag;
+  BiCGSTAB<SparseMatrix<T,0,I>, IdentityPreconditioner    >     bicgstab_colmajor_I;
+  BiCGSTAB<SparseMatrix<T,0,I>, IncompleteLUT<T,I> >              bicgstab_colmajor_ilut;
   //BiCGSTAB<SparseMatrix<T>, SSORPreconditioner<T> >     bicgstab_colmajor_ssor;
 
+  bicgstab_colmajor_diag.setTolerance(NumTraits<T>::epsilon()*4);
+  bicgstab_colmajor_ilut.setTolerance(NumTraits<T>::epsilon()*4);
+  
   CALL_SUBTEST( check_sparse_square_solving(bicgstab_colmajor_diag)  );
 //   CALL_SUBTEST( check_sparse_square_solving(bicgstab_colmajor_I)     );
   CALL_SUBTEST( check_sparse_square_solving(bicgstab_colmajor_ilut)     );
@@ -25,6 +28,7 @@ template<typename T> void test_bicgstab_T()
 
 void test_bicgstab()
 {
-  CALL_SUBTEST_1(test_bicgstab_T<double>());
-  CALL_SUBTEST_2(test_bicgstab_T<std::complex<double> >());
+  CALL_SUBTEST_1((test_bicgstab_T<double,int>()) );
+  CALL_SUBTEST_2((test_bicgstab_T<std::complex<double>, int>()));
+  CALL_SUBTEST_3((test_bicgstab_T<double,long int>()));
 }
diff --git a/test/block.cpp b/test/block.cpp
index 9ed5d7bc5..39565af83 100644
--- a/test/block.cpp
+++ b/test/block.cpp
@@ -130,6 +130,14 @@ template<typename MatrixType> void block(const MatrixType& m)
 
   VERIFY(numext::real(ones.col(c1).dot(ones.col(c2))) == RealScalar(rows));
   VERIFY(numext::real(ones.row(r1).dot(ones.row(r2))) == RealScalar(cols));
+  
+  // chekc that linear acccessors works on blocks
+  m1 = m1_copy;
+  if((MatrixType::Flags&RowMajorBit)==0)
+    VERIFY_IS_EQUAL(m1.leftCols(c1).coeff(r1+c1*rows), m1(r1,c1));
+  else
+    VERIFY_IS_EQUAL(m1.topRows(r1).coeff(c1+r1*cols), m1(r1,c1));
+  
 
   // now test some block-inside-of-block.
   
@@ -141,11 +149,11 @@ template<typename MatrixType> void block(const MatrixType& m)
   VERIFY_IS_EQUAL( (m1.transpose().block(c1,r1,c2-c1+1,r2-r1+1).col(0)) , (m1.row(r1).segment(c1,c2-c1+1)).transpose() );
 
   // expressions without direct access
-  VERIFY_IS_EQUAL( ((m1+m2).block(r1,c1,rows-r1,cols-c1).block(r2-r1,c2-c1,rows-r2,cols-c2)) , ((m1+m2).block(r2,c2,rows-r2,cols-c2)) );
-  VERIFY_IS_EQUAL( ((m1+m2).block(r1,c1,r2-r1+1,c2-c1+1).row(0)) , ((m1+m2).row(r1).segment(c1,c2-c1+1)) );
-  VERIFY_IS_EQUAL( ((m1+m2).block(r1,c1,r2-r1+1,c2-c1+1).col(0)) , ((m1+m2).col(c1).segment(r1,r2-r1+1)) );
-  VERIFY_IS_EQUAL( ((m1+m2).block(r1,c1,r2-r1+1,c2-c1+1).transpose().col(0)) , ((m1+m2).row(r1).segment(c1,c2-c1+1)).transpose() );
-  VERIFY_IS_EQUAL( ((m1+m2).transpose().block(c1,r1,c2-c1+1,r2-r1+1).col(0)) , ((m1+m2).row(r1).segment(c1,c2-c1+1)).transpose() );
+  VERIFY_IS_APPROX( ((m1+m2).block(r1,c1,rows-r1,cols-c1).block(r2-r1,c2-c1,rows-r2,cols-c2)) , ((m1+m2).block(r2,c2,rows-r2,cols-c2)) );
+  VERIFY_IS_APPROX( ((m1+m2).block(r1,c1,r2-r1+1,c2-c1+1).row(0)) , ((m1+m2).row(r1).segment(c1,c2-c1+1)) );
+  VERIFY_IS_APPROX( ((m1+m2).block(r1,c1,r2-r1+1,c2-c1+1).col(0)) , ((m1+m2).col(c1).segment(r1,r2-r1+1)) );
+  VERIFY_IS_APPROX( ((m1+m2).block(r1,c1,r2-r1+1,c2-c1+1).transpose().col(0)) , ((m1+m2).row(r1).segment(c1,c2-c1+1)).transpose() );
+  VERIFY_IS_APPROX( ((m1+m2).transpose().block(c1,r1,c2-c1+1,r2-r1+1).col(0)) , ((m1+m2).row(r1).segment(c1,c2-c1+1)).transpose() );
 
   // evaluation into plain matrices from expressions with direct access (stress MapBase)
   DynamicMatrixType dm;
@@ -173,6 +181,19 @@ template<typename MatrixType> void block(const MatrixType& m)
   dm = m1.row(r1).segment(c1,c2-c1+1).transpose();
   dv = m1.transpose().block(c1,r1,c2-c1+1,r2-r1+1).col(0);
   VERIFY_IS_EQUAL(dv, dm);
+
+  VERIFY_IS_EQUAL( (m1.template block<Dynamic,1>(1,0,0,1)), m1.block(1,0,0,1));
+  VERIFY_IS_EQUAL( (m1.template block<1,Dynamic>(0,1,1,0)), m1.block(0,1,1,0));
+  VERIFY_IS_EQUAL( ((m1*1).template block<Dynamic,1>(1,0,0,1)), m1.block(1,0,0,1));
+  VERIFY_IS_EQUAL( ((m1*1).template block<1,Dynamic>(0,1,1,0)), m1.block(0,1,1,0));
+
+  if (rows>=2 && cols>=2)
+  {
+    VERIFY_RAISES_ASSERT( m1 += m1.col(0) );
+    VERIFY_RAISES_ASSERT( m1 -= m1.col(0) );
+    VERIFY_RAISES_ASSERT( m1.array() *= m1.col(0).array() );
+    VERIFY_RAISES_ASSERT( m1.array() /= m1.col(0).array() );
+  }
 }
 
 
diff --git a/test/boostmultiprec.cpp b/test/boostmultiprec.cpp
new file mode 100644
index 000000000..e06e9bdaf
--- /dev/null
+++ b/test/boostmultiprec.cpp
@@ -0,0 +1,201 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include <sstream>
+
+#ifdef EIGEN_TEST_MAX_SIZE
+#undef EIGEN_TEST_MAX_SIZE
+#endif
+
+#define EIGEN_TEST_MAX_SIZE 50
+
+#ifdef EIGEN_TEST_PART_1
+#include "cholesky.cpp"
+#endif
+
+#ifdef EIGEN_TEST_PART_2
+#include "lu.cpp"
+#endif
+
+#ifdef EIGEN_TEST_PART_3
+#include "qr.cpp"
+#endif
+
+#ifdef EIGEN_TEST_PART_4
+#include "qr_colpivoting.cpp"
+#endif
+
+#ifdef EIGEN_TEST_PART_5
+#include "qr_fullpivoting.cpp"
+#endif
+
+#ifdef EIGEN_TEST_PART_6
+#include "eigensolver_selfadjoint.cpp"
+#endif
+
+#ifdef EIGEN_TEST_PART_7
+#include "eigensolver_generic.cpp"
+#endif
+
+#ifdef EIGEN_TEST_PART_8
+#include "eigensolver_generalized_real.cpp"
+#endif
+
+#ifdef EIGEN_TEST_PART_9
+#include "jacobisvd.cpp"
+#endif
+
+#ifdef EIGEN_TEST_PART_10
+#include "bdcsvd.cpp"
+#endif
+
+#include <Eigen/Dense>
+
+#undef min
+#undef max
+#undef isnan
+#undef isinf
+#undef isfinite
+
+#include <boost/multiprecision/cpp_dec_float.hpp>
+#include <boost/multiprecision/number.hpp>
+#include <boost/math/special_functions.hpp>
+#include <boost/math/complex.hpp>
+
+namespace mp = boost::multiprecision;
+typedef mp::number<mp::cpp_dec_float<100>, mp::et_on> Real;
+
+namespace Eigen {
+  template<> struct NumTraits<Real> : GenericNumTraits<Real> {
+    static inline Real dummy_precision() { return 1e-50; }
+  };
+
+  template<typename T1,typename T2,typename T3,typename T4,typename T5>
+  struct NumTraits<boost::multiprecision::detail::expression<T1,T2,T3,T4,T5> > : NumTraits<Real> {};
+
+  template<>
+  Real test_precision<Real>() { return 1e-50; }
+
+  // needed in C++93 mode where number does not support explicit cast.
+  namespace internal {
+    template<typename NewType>
+    struct cast_impl<Real,NewType> {
+      static inline NewType run(const Real& x) {
+        return x.template convert_to<NewType>();
+      }
+    };
+
+    template<>
+    struct cast_impl<Real,std::complex<Real> > {
+      static inline std::complex<Real>  run(const Real& x) {
+        return std::complex<Real>(x);
+      }
+    };
+  }
+}
+
+namespace boost {
+namespace multiprecision {
+  // to make ADL works as expected:
+  using boost::math::isfinite;
+  using boost::math::isnan;
+  using boost::math::isinf;
+  using boost::math::copysign;
+  using boost::math::hypot;
+
+  // The following is needed for std::complex<Real>:
+  Real fabs(const Real& a) { return abs EIGEN_NOT_A_MACRO (a); }
+  Real fmax(const Real& a, const Real& b) { using std::max; return max(a,b); }
+
+  // some specialization for the unit tests:
+  inline bool test_isMuchSmallerThan(const Real& a, const Real& b) {
+    return internal::isMuchSmallerThan(a, b, test_precision<Real>());
+  }
+
+  inline bool test_isApprox(const Real& a, const Real& b) {
+    return internal::isApprox(a, b, test_precision<Real>());
+  }
+
+  inline bool test_isApproxOrLessThan(const Real& a, const Real& b) {
+    return internal::isApproxOrLessThan(a, b, test_precision<Real>());
+  }
+
+  Real get_test_precision(const Real&) {
+    return test_precision<Real>();
+  }
+
+  Real test_relative_error(const Real &a, const Real &b) {
+    using Eigen::numext::abs2;
+    return sqrt(abs2<Real>(a-b)/Eigen::numext::mini<Real>(abs2(a),abs2(b)));
+  }
+}
+}
+
+namespace Eigen {
+
+}
+
+void test_boostmultiprec()
+{
+  typedef Matrix<Real,Dynamic,Dynamic> Mat;
+  typedef Matrix<std::complex<Real>,Dynamic,Dynamic> MatC;
+
+  std::cout << "NumTraits<Real>::epsilon()         = " << NumTraits<Real>::epsilon() << std::endl;
+  std::cout << "NumTraits<Real>::dummy_precision() = " << NumTraits<Real>::dummy_precision() << std::endl;
+  std::cout << "NumTraits<Real>::lowest()          = " << NumTraits<Real>::lowest() << std::endl;
+  std::cout << "NumTraits<Real>::highest()         = " << NumTraits<Real>::highest() << std::endl;
+  std::cout << "NumTraits<Real>::digits10()        = " << NumTraits<Real>::digits10() << std::endl;
+
+  // chekc stream output
+  {
+    Mat A(10,10);
+    A.setRandom();
+    std::stringstream ss;
+    ss << A;
+  }
+  {
+    MatC A(10,10);
+    A.setRandom();
+    std::stringstream ss;
+    ss << A;
+  }
+
+  for(int i = 0; i < g_repeat; i++) {
+    int s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE);
+
+    CALL_SUBTEST_1( cholesky(Mat(s,s)) );
+
+    CALL_SUBTEST_2( lu_non_invertible<Mat>() );
+    CALL_SUBTEST_2( lu_invertible<Mat>() );
+    CALL_SUBTEST_2( lu_non_invertible<MatC>() );
+    CALL_SUBTEST_2( lu_invertible<MatC>() );
+
+    CALL_SUBTEST_3( qr(Mat(internal::random<int>(1,EIGEN_TEST_MAX_SIZE),internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_3( qr_invertible<Mat>() );
+
+    CALL_SUBTEST_4( qr<Mat>() );
+    CALL_SUBTEST_4( cod<Mat>() );
+    CALL_SUBTEST_4( qr_invertible<Mat>() );
+
+    CALL_SUBTEST_5( qr<Mat>() );
+    CALL_SUBTEST_5( qr_invertible<Mat>() );
+
+    CALL_SUBTEST_6( selfadjointeigensolver(Mat(s,s)) );
+
+    CALL_SUBTEST_7( eigensolver(Mat(s,s)) );
+
+    CALL_SUBTEST_8( generalized_eigensolver_real(Mat(s,s)) );
+
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
+  }
+
+  CALL_SUBTEST_9(( jacobisvd(Mat(internal::random<int>(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE), internal::random<int>(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE/2))) ));
+  CALL_SUBTEST_10(( bdcsvd(Mat(internal::random<int>(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE), internal::random<int>(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE/2))) ));
+}
+
diff --git a/test/bug1213.cpp b/test/bug1213.cpp
new file mode 100644
index 000000000..581760c1a
--- /dev/null
+++ b/test/bug1213.cpp
@@ -0,0 +1,13 @@
+
+// This anonymous enum is essential to trigger the linking issue
+enum {
+  Foo
+};
+
+#include "bug1213.h"
+
+bool bug1213_1(const Eigen::Vector3f& x)
+{
+  return bug1213_2(x);
+}
+
diff --git a/test/bug1213.h b/test/bug1213.h
new file mode 100644
index 000000000..040e5a470
--- /dev/null
+++ b/test/bug1213.h
@@ -0,0 +1,8 @@
+
+#include <Eigen/Core>
+
+template<typename T, int dim>
+bool bug1213_2(const Eigen::Matrix<T,dim,1>& x);
+
+bool bug1213_1(const Eigen::Vector3f& x);
+
diff --git a/test/bug1213_main.cpp b/test/bug1213_main.cpp
new file mode 100644
index 000000000..4802c0003
--- /dev/null
+++ b/test/bug1213_main.cpp
@@ -0,0 +1,18 @@
+
+// This is a regression unit regarding a weird linking issue with gcc.
+
+#include "bug1213.h"
+
+int main()
+{
+  return 0;
+}
+
+
+template<typename T, int dim>
+bool bug1213_2(const Eigen::Matrix<T,dim,1>& )
+{
+  return true;
+}
+
+template bool bug1213_2<float,3>(const Eigen::Vector3f&);
diff --git a/test/cholesky.cpp b/test/cholesky.cpp
index 56885deb7..8ad5ac639 100644
--- a/test/cholesky.cpp
+++ b/test/cholesky.cpp
@@ -11,20 +11,17 @@
 #define EIGEN_NO_ASSERTION_CHECKING
 #endif
 
-static int nb_temporaries;
-
-#define EIGEN_DENSE_STORAGE_CTOR_PLUGIN { if(size!=0) nb_temporaries++; }
+#define TEST_ENABLE_TEMPORARY_TRACKING
 
 #include "main.h"
 #include <Eigen/Cholesky>
 #include <Eigen/QR>
 
-#define VERIFY_EVALUATION_COUNT(XPR,N) {\
-    nb_temporaries = 0; \
-    XPR; \
-    if(nb_temporaries!=N) std::cerr << "nb_temporaries == " << nb_temporaries << "\n"; \
-    VERIFY( (#XPR) && nb_temporaries==N ); \
-  }
+template<typename MatrixType, int UpLo>
+typename MatrixType::RealScalar matrix_l1_norm(const MatrixType& m) {
+  MatrixType symm = m.template selfadjointView<UpLo>();
+  return symm.cwiseAbs().colwise().sum().maxCoeff();
+}
 
 template<typename MatrixType,template <typename,int> class CholType> void test_chol_update(const MatrixType& symm)
 {
@@ -83,14 +80,10 @@ template<typename MatrixType> void cholesky(const MatrixType& m)
     symm += a1 * a1.adjoint();
   }
 
-  // to test if really Cholesky only uses the upper triangular part, uncomment the following
-  // FIXME: currently that fails !!
-  //symm.template part<StrictlyLower>().setZero();
-
   {
     SquareMatrixType symmUp = symm.template triangularView<Upper>();
     SquareMatrixType symmLo = symm.template triangularView<Lower>();
-    
+
     LLT<SquareMatrixType,Lower> chollo(symmLo);
     VERIFY_IS_APPROX(symm, chollo.reconstructedMatrix());
     vecX = chollo.solve(vecB);
@@ -98,6 +91,14 @@ template<typename MatrixType> void cholesky(const MatrixType& m)
     matX = chollo.solve(matB);
     VERIFY_IS_APPROX(symm * matX, matB);
 
+    const MatrixType symmLo_inverse = chollo.solve(MatrixType::Identity(rows,cols));
+    RealScalar rcond = (RealScalar(1) / matrix_l1_norm<MatrixType, Lower>(symmLo)) /
+                             matrix_l1_norm<MatrixType, Lower>(symmLo_inverse);
+    RealScalar rcond_est = chollo.rcond();
+    // Verify that the estimated condition number is within a factor of 10 of the
+    // truth.
+    VERIFY(rcond_est > rcond / 10 && rcond_est < rcond * 10);
+
     // test the upper mode
     LLT<SquareMatrixType,Upper> cholup(symmUp);
     VERIFY_IS_APPROX(symm, cholup.reconstructedMatrix());
@@ -106,6 +107,15 @@ template<typename MatrixType> void cholesky(const MatrixType& m)
     matX = cholup.solve(matB);
     VERIFY_IS_APPROX(symm * matX, matB);
 
+    // Verify that the estimated condition number is within a factor of 10 of the
+    // truth.
+    const MatrixType symmUp_inverse = cholup.solve(MatrixType::Identity(rows,cols));
+    rcond = (RealScalar(1) / matrix_l1_norm<MatrixType, Upper>(symmUp)) /
+                             matrix_l1_norm<MatrixType, Upper>(symmUp_inverse);
+    rcond_est = cholup.rcond();
+    VERIFY(rcond_est > rcond / 10 && rcond_est < rcond * 10);
+
+
     MatrixType neg = -symmLo;
     chollo.compute(neg);
     VERIFY(chollo.info()==NumericalIssue);
@@ -114,7 +124,7 @@ template<typename MatrixType> void cholesky(const MatrixType& m)
     VERIFY_IS_APPROX(MatrixType(chollo.matrixU().transpose().conjugate()), MatrixType(chollo.matrixL()));
     VERIFY_IS_APPROX(MatrixType(cholup.matrixL().transpose().conjugate()), MatrixType(cholup.matrixU()));
     VERIFY_IS_APPROX(MatrixType(cholup.matrixU().transpose().conjugate()), MatrixType(cholup.matrixL()));
-    
+
     // test some special use cases of SelfCwiseBinaryOp:
     MatrixType m1 = MatrixType::Random(rows,cols), m2(rows,cols);
     m2 = m1;
@@ -144,19 +154,38 @@ template<typename MatrixType> void cholesky(const MatrixType& m)
     SquareMatrixType symmLo = symm.template triangularView<Lower>();
 
     LDLT<SquareMatrixType,Lower> ldltlo(symmLo);
+    VERIFY(ldltlo.info()==Success);
     VERIFY_IS_APPROX(symm, ldltlo.reconstructedMatrix());
     vecX = ldltlo.solve(vecB);
     VERIFY_IS_APPROX(symm * vecX, vecB);
     matX = ldltlo.solve(matB);
     VERIFY_IS_APPROX(symm * matX, matB);
 
+    const MatrixType symmLo_inverse = ldltlo.solve(MatrixType::Identity(rows,cols));
+    RealScalar rcond = (RealScalar(1) / matrix_l1_norm<MatrixType, Lower>(symmLo)) /
+                             matrix_l1_norm<MatrixType, Lower>(symmLo_inverse);
+    RealScalar rcond_est = ldltlo.rcond();
+    // Verify that the estimated condition number is within a factor of 10 of the
+    // truth.
+    VERIFY(rcond_est > rcond / 10 && rcond_est < rcond * 10);
+
+
     LDLT<SquareMatrixType,Upper> ldltup(symmUp);
+    VERIFY(ldltup.info()==Success);
     VERIFY_IS_APPROX(symm, ldltup.reconstructedMatrix());
     vecX = ldltup.solve(vecB);
     VERIFY_IS_APPROX(symm * vecX, vecB);
     matX = ldltup.solve(matB);
     VERIFY_IS_APPROX(symm * matX, matB);
 
+    // Verify that the estimated condition number is within a factor of 10 of the
+    // truth.
+    const MatrixType symmUp_inverse = ldltup.solve(MatrixType::Identity(rows,cols));
+    rcond = (RealScalar(1) / matrix_l1_norm<MatrixType, Upper>(symmUp)) /
+                             matrix_l1_norm<MatrixType, Upper>(symmUp_inverse);
+    rcond_est = ldltup.rcond();
+    VERIFY(rcond_est > rcond / 10 && rcond_est < rcond * 10);
+
     VERIFY_IS_APPROX(MatrixType(ldltlo.matrixL().transpose().conjugate()), MatrixType(ldltlo.matrixU()));
     VERIFY_IS_APPROX(MatrixType(ldltlo.matrixU().transpose().conjugate()), MatrixType(ldltlo.matrixL()));
     VERIFY_IS_APPROX(MatrixType(ldltup.matrixL().transpose().conjugate()), MatrixType(ldltup.matrixU()));
@@ -185,7 +214,7 @@ template<typename MatrixType> void cholesky(const MatrixType& m)
     if(rows>=3)
     {
       SquareMatrixType A = symm;
-      int c = internal::random<int>(0,rows-2);
+      Index c = internal::random<Index>(0,rows-2);
       A.bottomRightCorner(c,c).setZero();
       // Make sure a solution exists:
       vecX.setRandom();
@@ -196,11 +225,11 @@ template<typename MatrixType> void cholesky(const MatrixType& m)
       vecX = ldltlo.solve(vecB);
       VERIFY_IS_APPROX(A * vecX, vecB);
     }
-    
+
     // check non-full rank matrices
     if(rows>=3)
     {
-      int r = internal::random<int>(1,rows-1);
+      Index r = internal::random<Index>(1,rows-1);
       Matrix<Scalar,Dynamic,Dynamic> a = Matrix<Scalar,Dynamic,Dynamic>::Random(rows,r);
       SquareMatrixType A = a * a.adjoint();
       // Make sure a solution exists:
@@ -212,15 +241,17 @@ template<typename MatrixType> void cholesky(const MatrixType& m)
       vecX = ldltlo.solve(vecB);
       VERIFY_IS_APPROX(A * vecX, vecB);
     }
-    
+
     // check matrices with a wide spectrum
     if(rows>=3)
     {
+      using std::pow;
+      using std::sqrt;
       RealScalar s = (std::min)(16,std::numeric_limits<RealScalar>::max_exponent10/8);
       Matrix<Scalar,Dynamic,Dynamic> a = Matrix<Scalar,Dynamic,Dynamic>::Random(rows,rows);
       Matrix<RealScalar,Dynamic,1> d =  Matrix<RealScalar,Dynamic,1>::Random(rows);
-      for(int k=0; k<rows; ++k)
-        d(k) = d(k)*std::pow(RealScalar(10),internal::random<RealScalar>(-s,s));
+      for(Index k=0; k<rows; ++k)
+        d(k) = d(k)*pow(RealScalar(10),internal::random<RealScalar>(-s,s));
       SquareMatrixType A = a * d.asDiagonal() * a.adjoint();
       // Make sure a solution exists:
       vecX.setRandom();
@@ -229,7 +260,20 @@ template<typename MatrixType> void cholesky(const MatrixType& m)
       ldltlo.compute(A);
       VERIFY_IS_APPROX(A, ldltlo.reconstructedMatrix());
       vecX = ldltlo.solve(vecB);
-      VERIFY_IS_APPROX(A * vecX, vecB);
+
+      if(ldltlo.vectorD().real().cwiseAbs().minCoeff()>RealScalar(0))
+      {
+        VERIFY_IS_APPROX(A * vecX,vecB);
+      }
+      else
+      {
+        RealScalar large_tol =  sqrt(test_precision<RealScalar>());
+        VERIFY((A * vecX).isApprox(vecB, large_tol));
+
+        ++g_test_level;
+        VERIFY_IS_APPROX(A * vecX,vecB);
+        --g_test_level;
+      }
     }
   }
 
@@ -289,6 +333,7 @@ template<typename MatrixType> void cholesky_cplx(const MatrixType& m)
     RealMatrixType symmLo = symm.template triangularView<Lower>();
 
     LDLT<RealMatrixType,Lower> ldltlo(symmLo);
+    VERIFY(ldltlo.info()==Success);
     VERIFY_IS_APPROX(symm, ldltlo.reconstructedMatrix());
     vecX = ldltlo.solve(vecB);
     VERIFY_IS_APPROX(symm * vecX, vecB);
@@ -314,46 +359,101 @@ template<typename MatrixType> void cholesky_bug241(const MatrixType& m)
 }
 
 // LDLT is not guaranteed to work for indefinite matrices, but happens to work fine if matrix is diagonal.
-// This test checks that LDLT reports correctly that matrix is indefinite. 
+// This test checks that LDLT reports correctly that matrix is indefinite.
 // See http://forum.kde.org/viewtopic.php?f=74&t=106942 and bug 736
 template<typename MatrixType> void cholesky_definiteness(const MatrixType& m)
 {
   eigen_assert(m.rows() == 2 && m.cols() == 2);
   MatrixType mat;
   LDLT<MatrixType> ldlt(2);
-  
+
   {
     mat << 1, 0, 0, -1;
     ldlt.compute(mat);
+    VERIFY(ldlt.info()==Success);
     VERIFY(!ldlt.isNegative());
     VERIFY(!ldlt.isPositive());
   }
   {
     mat << 1, 2, 2, 1;
     ldlt.compute(mat);
+    VERIFY(ldlt.info()==Success);
     VERIFY(!ldlt.isNegative());
     VERIFY(!ldlt.isPositive());
   }
   {
     mat << 0, 0, 0, 0;
     ldlt.compute(mat);
+    VERIFY(ldlt.info()==Success);
     VERIFY(ldlt.isNegative());
     VERIFY(ldlt.isPositive());
   }
   {
     mat << 0, 0, 0, 1;
     ldlt.compute(mat);
+    VERIFY(ldlt.info()==Success);
     VERIFY(!ldlt.isNegative());
     VERIFY(ldlt.isPositive());
   }
   {
     mat << -1, 0, 0, 0;
     ldlt.compute(mat);
+    VERIFY(ldlt.info()==Success);
     VERIFY(ldlt.isNegative());
     VERIFY(!ldlt.isPositive());
   }
 }
 
+template<typename>
+void cholesky_faillure_cases()
+{
+  MatrixXd mat;
+  LDLT<MatrixXd> ldlt;
+
+  {
+    mat.resize(2,2);
+    mat << 0, 1, 1, 0;
+    ldlt.compute(mat);
+    VERIFY_IS_NOT_APPROX(mat,ldlt.reconstructedMatrix());
+    VERIFY(ldlt.info()==NumericalIssue);
+  }
+#if (!EIGEN_ARCH_i386) || defined(EIGEN_VECTORIZE_SSE2)
+  {
+    mat.resize(3,3);
+    mat << -1, -3, 3,
+           -3, -8.9999999999999999999, 1,
+            3, 1, 0;
+    ldlt.compute(mat);
+    VERIFY(ldlt.info()==NumericalIssue);
+    VERIFY_IS_NOT_APPROX(mat,ldlt.reconstructedMatrix());
+  }
+#endif
+  {
+    mat.resize(3,3);
+    mat <<  1, 2, 3,
+            2, 4, 1,
+            3, 1, 0;
+    ldlt.compute(mat);
+    VERIFY(ldlt.info()==NumericalIssue);
+    VERIFY_IS_NOT_APPROX(mat,ldlt.reconstructedMatrix());
+  }
+
+  {
+    mat.resize(8,8);
+    mat <<  0.1, 0, -0.1, 0, 0, 0, 1, 0,
+            0, 4.24667, 0, 2.00333, 0, 0, 0, 0,
+            -0.1, 0, 0.2, 0, -0.1, 0, 0, 0,
+            0, 2.00333, 0, 8.49333, 0, 2.00333, 0, 0,
+            0, 0, -0.1, 0, 0.1, 0, 0, 1,
+            0, 0, 0, 2.00333, 0, 4.24667, 0, 0,
+            1, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 1, 0, 0, 0;
+    ldlt.compute(mat);
+    VERIFY(ldlt.info()==NumericalIssue);
+    VERIFY_IS_NOT_APPROX(mat,ldlt.reconstructedMatrix());
+  }
+}
+
 template<typename MatrixType> void cholesky_verify_assert()
 {
   MatrixType tmp;
@@ -384,10 +484,14 @@ void test_cholesky()
     CALL_SUBTEST_3( cholesky_definiteness(Matrix2d()) );
     CALL_SUBTEST_4( cholesky(Matrix3f()) );
     CALL_SUBTEST_5( cholesky(Matrix4d()) );
+
     s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE);
     CALL_SUBTEST_2( cholesky(MatrixXd(s,s)) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
+
     s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2);
     CALL_SUBTEST_6( cholesky_cplx(MatrixXcd(s,s)) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
   }
 
   CALL_SUBTEST_4( cholesky_verify_assert<Matrix3f>() );
@@ -398,7 +502,8 @@ void test_cholesky()
   // Test problem size constructors
   CALL_SUBTEST_9( LLT<MatrixXf>(10) );
   CALL_SUBTEST_9( LDLT<MatrixXf>(10) );
-  
-  TEST_SET_BUT_UNUSED_VARIABLE(s)
+
+  CALL_SUBTEST_2( cholesky_faillure_cases<void>() );
+
   TEST_SET_BUT_UNUSED_VARIABLE(nb_temporaries)
 }
diff --git a/test/cholmod_support.cpp b/test/cholmod_support.cpp
index 8f8be3c0e..a7eda28f7 100644
--- a/test/cholmod_support.cpp
+++ b/test/cholmod_support.cpp
@@ -7,6 +7,7 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+#define EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS
 #include "sparse_solver.h"
 
 #include <Eigen/CholmodSupport>
@@ -40,13 +41,13 @@ template<typename T> void test_cholmod_T()
   check_sparse_spd_solving(llt_colmajor_upper);
   check_sparse_spd_solving(ldlt_colmajor_lower);
   check_sparse_spd_solving(ldlt_colmajor_upper);
-  
-//  check_sparse_spd_determinant(chol_colmajor_lower);
-//  check_sparse_spd_determinant(chol_colmajor_upper);
-//  check_sparse_spd_determinant(llt_colmajor_lower);
-//  check_sparse_spd_determinant(llt_colmajor_upper);
-//  check_sparse_spd_determinant(ldlt_colmajor_lower);
-//  check_sparse_spd_determinant(ldlt_colmajor_upper);
+
+  check_sparse_spd_determinant(chol_colmajor_lower);
+  check_sparse_spd_determinant(chol_colmajor_upper);
+  check_sparse_spd_determinant(llt_colmajor_lower);
+  check_sparse_spd_determinant(llt_colmajor_upper);
+  check_sparse_spd_determinant(ldlt_colmajor_lower);
+  check_sparse_spd_determinant(ldlt_colmajor_upper);
 }
 
 void test_cholmod_support()
diff --git a/test/commainitializer.cpp b/test/commainitializer.cpp
index 99102b966..9844adbd2 100644
--- a/test/commainitializer.cpp
+++ b/test/commainitializer.cpp
@@ -9,6 +9,62 @@
 
 #include "main.h"
 
+
+template<int M1, int M2, int N1, int N2>
+void test_blocks()
+{
+  Matrix<int, M1+M2, N1+N2> m_fixed;
+  MatrixXi m_dynamic(M1+M2, N1+N2);
+
+  Matrix<int, M1, N1> mat11; mat11.setRandom();
+  Matrix<int, M1, N2> mat12; mat12.setRandom();
+  Matrix<int, M2, N1> mat21; mat21.setRandom();
+  Matrix<int, M2, N2> mat22; mat22.setRandom();
+
+  MatrixXi matx11 = mat11, matx12 = mat12, matx21 = mat21, matx22 = mat22;
+
+  {
+    VERIFY_IS_EQUAL((m_fixed << mat11, mat12, mat21, matx22).finished(), (m_dynamic << mat11, matx12, mat21, matx22).finished());
+    VERIFY_IS_EQUAL((m_fixed.template topLeftCorner<M1,N1>()), mat11);
+    VERIFY_IS_EQUAL((m_fixed.template topRightCorner<M1,N2>()), mat12);
+    VERIFY_IS_EQUAL((m_fixed.template bottomLeftCorner<M2,N1>()), mat21);
+    VERIFY_IS_EQUAL((m_fixed.template bottomRightCorner<M2,N2>()), mat22);
+    VERIFY_IS_EQUAL((m_fixed << mat12, mat11, matx21, mat22).finished(), (m_dynamic << mat12, matx11, matx21, mat22).finished());
+  }
+
+  if(N1 > 0)
+  {
+    VERIFY_RAISES_ASSERT((m_fixed << mat11, mat12, mat11, mat21, mat22));
+    VERIFY_RAISES_ASSERT((m_fixed << mat11, mat12, mat21, mat21, mat22));
+  }
+  else
+  {
+    // allow insertion of zero-column blocks:
+    VERIFY_IS_EQUAL((m_fixed << mat11, mat12, mat11, mat11, mat21, mat21, mat22).finished(), (m_dynamic << mat12, mat22).finished());
+  }
+  if(M1 != M2)
+  {
+    VERIFY_RAISES_ASSERT((m_fixed << mat11, mat21, mat12, mat22));
+  }
+}
+
+
+template<int N>
+struct test_block_recursion
+{
+  static void run()
+  {
+    test_blocks<(N>>6)&3, (N>>4)&3, (N>>2)&3, N & 3>();
+    test_block_recursion<N-1>::run();
+  }
+};
+
+template<>
+struct test_block_recursion<-1>
+{
+  static void run() { }
+};
+
 void test_commainitializer()
 {
   Matrix3d m3;
@@ -43,4 +99,8 @@ void test_commainitializer()
         4, 5, 6,
         vec[2].transpose();
   VERIFY_IS_APPROX(m3, ref);
+
+
+  // recursively test all block-sizes from 0 to 3:
+  test_block_recursion<(1<<8) - 1>();
 }
diff --git a/test/conjugate_gradient.cpp b/test/conjugate_gradient.cpp
index 019cc4d64..9622fd86d 100644
--- a/test/conjugate_gradient.cpp
+++ b/test/conjugate_gradient.cpp
@@ -10,13 +10,14 @@
 #include "sparse_solver.h"
 #include <Eigen/IterativeLinearSolvers>
 
-template<typename T> void test_conjugate_gradient_T()
+template<typename T, typename I> void test_conjugate_gradient_T()
 {
-  ConjugateGradient<SparseMatrix<T>, Lower      > cg_colmajor_lower_diag;
-  ConjugateGradient<SparseMatrix<T>, Upper      > cg_colmajor_upper_diag;
-  ConjugateGradient<SparseMatrix<T>, Lower|Upper> cg_colmajor_loup_diag;
-  ConjugateGradient<SparseMatrix<T>, Lower, IdentityPreconditioner> cg_colmajor_lower_I;
-  ConjugateGradient<SparseMatrix<T>, Upper, IdentityPreconditioner> cg_colmajor_upper_I;
+  typedef SparseMatrix<T,0,I> SparseMatrixType;
+  ConjugateGradient<SparseMatrixType, Lower      > cg_colmajor_lower_diag;
+  ConjugateGradient<SparseMatrixType, Upper      > cg_colmajor_upper_diag;
+  ConjugateGradient<SparseMatrixType, Lower|Upper> cg_colmajor_loup_diag;
+  ConjugateGradient<SparseMatrixType, Lower, IdentityPreconditioner> cg_colmajor_lower_I;
+  ConjugateGradient<SparseMatrixType, Upper, IdentityPreconditioner> cg_colmajor_upper_I;
 
   CALL_SUBTEST( check_sparse_spd_solving(cg_colmajor_lower_diag)  );
   CALL_SUBTEST( check_sparse_spd_solving(cg_colmajor_upper_diag)  );
@@ -27,6 +28,7 @@ template<typename T> void test_conjugate_gradient_T()
 
 void test_conjugate_gradient()
 {
-  CALL_SUBTEST_1(test_conjugate_gradient_T<double>());
-  CALL_SUBTEST_2(test_conjugate_gradient_T<std::complex<double> >());
+  CALL_SUBTEST_1(( test_conjugate_gradient_T<double,int>() ));
+  CALL_SUBTEST_2(( test_conjugate_gradient_T<std::complex<double>, int>() ));
+  CALL_SUBTEST_3(( test_conjugate_gradient_T<double,long int>() ));
 }
diff --git a/test/constructor.cpp b/test/constructor.cpp
new file mode 100644
index 000000000..eec9e2192
--- /dev/null
+++ b/test/constructor.cpp
@@ -0,0 +1,84 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#define TEST_ENABLE_TEMPORARY_TRACKING
+
+#include "main.h"
+
+template<typename MatrixType> struct Wrapper
+{
+  MatrixType m_mat;
+  inline Wrapper(const MatrixType &x) : m_mat(x) {}
+  inline operator const MatrixType& () const { return m_mat; }
+  inline operator MatrixType& () { return m_mat; }
+};
+
+template<typename MatrixType> void ctor_init1(const MatrixType& m)
+{
+  // Check logic in PlainObjectBase::_init1
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  MatrixType m0 = MatrixType::Random(rows,cols);
+
+  VERIFY_EVALUATION_COUNT( MatrixType m1(m0), 1);
+  VERIFY_EVALUATION_COUNT( MatrixType m2(m0+m0), 1);
+  VERIFY_EVALUATION_COUNT( MatrixType m2(m0.block(0,0,rows,cols)) , 1);
+
+  Wrapper<MatrixType> wrapper(m0);
+  VERIFY_EVALUATION_COUNT( MatrixType m3(wrapper) , 1);
+}
+
+
+void test_constructor()
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( ctor_init1(Matrix<float, 1, 1>()) );
+    CALL_SUBTEST_1( ctor_init1(Matrix4d()) );
+    CALL_SUBTEST_1( ctor_init1(MatrixXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_1( ctor_init1(MatrixXi(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+  }
+  {
+    Matrix<Index,1,1> a(123);
+    VERIFY_IS_EQUAL(a[0], 123);
+  }
+  {
+    Matrix<Index,1,1> a(123.0);
+    VERIFY_IS_EQUAL(a[0], 123);
+  }
+  {
+    Matrix<float,1,1> a(123);
+    VERIFY_IS_EQUAL(a[0], 123.f);
+  }
+  {
+    Array<Index,1,1> a(123);
+    VERIFY_IS_EQUAL(a[0], 123);
+  }
+  {
+    Array<Index,1,1> a(123.0);
+    VERIFY_IS_EQUAL(a[0], 123);
+  }
+  {
+    Array<float,1,1> a(123);
+    VERIFY_IS_EQUAL(a[0], 123.f);
+  }
+  {
+    Array<Index,3,3> a(123);
+    VERIFY_IS_EQUAL(a(4), 123);
+  }
+  {
+    Array<Index,3,3> a(123.0);
+    VERIFY_IS_EQUAL(a(4), 123);
+  }
+  {
+    Array<float,3,3> a(123);
+    VERIFY_IS_EQUAL(a(4), 123.f);
+  }
+}
diff --git a/test/ctorleak.cpp b/test/ctorleak.cpp
new file mode 100644
index 000000000..c158f5e4e
--- /dev/null
+++ b/test/ctorleak.cpp
@@ -0,0 +1,69 @@
+#include "main.h"
+
+#include <exception>  // std::exception
+
+struct Foo
+{
+  static Index object_count;
+  static Index object_limit;
+  int dummy;
+
+  Foo()
+  {
+#ifdef EIGEN_EXCEPTIONS
+    // TODO: Is this the correct way to handle this?
+    if (Foo::object_count > Foo::object_limit) { std::cout << "\nThrow!\n"; throw Foo::Fail(); }
+#endif
+	  std::cout << '+';
+    ++Foo::object_count;
+  }
+
+  ~Foo()
+  {
+	  std::cout << '-';
+    --Foo::object_count;
+  }
+
+  class Fail : public std::exception {};
+};
+
+Index Foo::object_count = 0;
+Index Foo::object_limit = 0;
+
+#undef EIGEN_TEST_MAX_SIZE
+#define EIGEN_TEST_MAX_SIZE 3
+
+void test_ctorleak()
+{
+  typedef Matrix<Foo, Dynamic, Dynamic> MatrixX;
+  typedef Matrix<Foo, Dynamic, 1> VectorX;
+  Foo::object_count = 0;
+  for(int i = 0; i < g_repeat; i++) {
+    Index rows = internal::random<Index>(2,EIGEN_TEST_MAX_SIZE), cols = internal::random<Index>(2,EIGEN_TEST_MAX_SIZE);
+    Foo::object_limit = internal::random<Index>(0, rows*cols - 2);
+    std::cout << "object_limit =" << Foo::object_limit << std::endl;
+#ifdef EIGEN_EXCEPTIONS
+    try
+    {
+#endif
+    	std::cout <<       "\nMatrixX m(" << rows << ", " << cols << ");\n";
+      MatrixX m(rows, cols);
+#ifdef EIGEN_EXCEPTIONS
+      VERIFY(false);  // not reached if exceptions are enabled
+    }
+    catch (const Foo::Fail&) { /* ignore */ }
+#endif
+    VERIFY_IS_EQUAL(Index(0), Foo::object_count);
+
+    {
+      Foo::object_limit = (rows+1)*(cols+1);
+      MatrixX A(rows, cols);
+      VERIFY_IS_EQUAL(Foo::object_count, rows*cols);
+      VectorX v=A.row(0);
+      VERIFY_IS_EQUAL(Foo::object_count, (rows+1)*cols);
+      v = A.col(0);
+      VERIFY_IS_EQUAL(Foo::object_count, rows*(cols+1));
+    }
+    VERIFY_IS_EQUAL(Index(0), Foo::object_count);
+  }
+}
diff --git a/test/cuda_basic.cu b/test/cuda_basic.cu
new file mode 100644
index 000000000..cb2e4167a
--- /dev/null
+++ b/test/cuda_basic.cu
@@ -0,0 +1,173 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// workaround issue between gcc >= 4.7 and cuda 5.5
+#if (defined __GNUC__) && (__GNUC__>4 || __GNUC_MINOR__>=7)
+  #undef _GLIBCXX_ATOMIC_BUILTINS
+  #undef _GLIBCXX_USE_INT128
+#endif
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cuda_basic
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+
+#include <math_constants.h>
+#include <cuda.h>
+#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+#include <cuda_fp16.h>
+#endif
+#include "main.h"
+#include "cuda_common.h"
+
+// Check that dense modules can be properly parsed by nvcc
+#include <Eigen/Dense>
+
+// struct Foo{
+//   EIGEN_DEVICE_FUNC
+//   void operator()(int i, const float* mats, float* vecs) const {
+//     using namespace Eigen;
+//   //   Matrix3f M(data);
+//   //   Vector3f x(data+9);
+//   //   Map<Vector3f>(data+9) = M.inverse() * x;
+//     Matrix3f M(mats+i/16);
+//     Vector3f x(vecs+i*3);
+//   //   using std::min;
+//   //   using std::sqrt;
+//     Map<Vector3f>(vecs+i*3) << x.minCoeff(), 1, 2;// / x.dot(x);//(M.inverse() *  x) / x.x();
+//     //x = x*2 + x.y() * x + x * x.maxCoeff() - x / x.sum();
+//   }
+// };
+
+template<typename T>
+struct coeff_wise {
+  EIGEN_DEVICE_FUNC
+  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
+  {
+    using namespace Eigen;
+    T x1(in+i);
+    T x2(in+i+1);
+    T x3(in+i+2);
+    Map<T> res(out+i*T::MaxSizeAtCompileTime);
+    
+    res.array() += (in[0] * x1 + x2).array() * x3.array();
+  }
+};
+
+template<typename T>
+struct replicate {
+  EIGEN_DEVICE_FUNC
+  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
+  {
+    using namespace Eigen;
+    T x1(in+i);
+    int step   = x1.size() * 4;
+    int stride = 3 * step;
+    
+    typedef Map<Array<typename T::Scalar,Dynamic,Dynamic> > MapType;
+    MapType(out+i*stride+0*step, x1.rows()*2, x1.cols()*2) = x1.replicate(2,2);
+    MapType(out+i*stride+1*step, x1.rows()*3, x1.cols()) = in[i] * x1.colwise().replicate(3);
+    MapType(out+i*stride+2*step, x1.rows(), x1.cols()*3) = in[i] * x1.rowwise().replicate(3);
+  }
+};
+
+template<typename T>
+struct redux {
+  EIGEN_DEVICE_FUNC
+  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
+  {
+    using namespace Eigen;
+    int N = 10;
+    T x1(in+i);
+    out[i*N+0] = x1.minCoeff();
+    out[i*N+1] = x1.maxCoeff();
+    out[i*N+2] = x1.sum();
+    out[i*N+3] = x1.prod();
+    out[i*N+4] = x1.matrix().squaredNorm();
+    out[i*N+5] = x1.matrix().norm();
+    out[i*N+6] = x1.colwise().sum().maxCoeff();
+    out[i*N+7] = x1.rowwise().maxCoeff().sum();
+    out[i*N+8] = x1.matrix().colwise().squaredNorm().sum();
+  }
+};
+
+template<typename T1, typename T2>
+struct prod_test {
+  EIGEN_DEVICE_FUNC
+  void operator()(int i, const typename T1::Scalar* in, typename T1::Scalar* out) const
+  {
+    using namespace Eigen;
+    typedef Matrix<typename T1::Scalar, T1::RowsAtCompileTime, T2::ColsAtCompileTime> T3;
+    T1 x1(in+i);
+    T2 x2(in+i+1);
+    Map<T3> res(out+i*T3::MaxSizeAtCompileTime);
+    res += in[i] * x1 * x2;
+  }
+};
+
+template<typename T1, typename T2>
+struct diagonal {
+  EIGEN_DEVICE_FUNC
+  void operator()(int i, const typename T1::Scalar* in, typename T1::Scalar* out) const
+  {
+    using namespace Eigen;
+    T1 x1(in+i);
+    Map<T2> res(out+i*T2::MaxSizeAtCompileTime);
+    res += x1.diagonal();
+  }
+};
+
+template<typename T>
+struct eigenvalues {
+  EIGEN_DEVICE_FUNC
+  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
+  {
+    using namespace Eigen;
+    typedef Matrix<typename T::Scalar, T::RowsAtCompileTime, 1> Vec;
+    T M(in+i);
+    Map<Vec> res(out+i*Vec::MaxSizeAtCompileTime);
+    T A = M*M.adjoint();
+    SelfAdjointEigenSolver<T> eig;
+    eig.computeDirect(M);
+    res = eig.eigenvalues();
+  }
+};
+
+void test_cuda_basic()
+{
+  ei_test_init_cuda();
+  
+  int nthreads = 100;
+  Eigen::VectorXf in, out;
+  
+  #ifndef __CUDA_ARCH__
+  int data_size = nthreads * 512;
+  in.setRandom(data_size);
+  out.setRandom(data_size);
+  #endif
+  
+  CALL_SUBTEST( run_and_compare_to_cuda(coeff_wise<Vector3f>(), nthreads, in, out) );
+  CALL_SUBTEST( run_and_compare_to_cuda(coeff_wise<Array44f>(), nthreads, in, out) );
+  
+  CALL_SUBTEST( run_and_compare_to_cuda(replicate<Array4f>(), nthreads, in, out) );
+  CALL_SUBTEST( run_and_compare_to_cuda(replicate<Array33f>(), nthreads, in, out) );
+  
+  CALL_SUBTEST( run_and_compare_to_cuda(redux<Array4f>(), nthreads, in, out) );
+  CALL_SUBTEST( run_and_compare_to_cuda(redux<Matrix3f>(), nthreads, in, out) );
+  
+  CALL_SUBTEST( run_and_compare_to_cuda(prod_test<Matrix3f,Matrix3f>(), nthreads, in, out) );
+  CALL_SUBTEST( run_and_compare_to_cuda(prod_test<Matrix4f,Vector4f>(), nthreads, in, out) );
+  
+  CALL_SUBTEST( run_and_compare_to_cuda(diagonal<Matrix3f,Vector3f>(), nthreads, in, out) );
+  CALL_SUBTEST( run_and_compare_to_cuda(diagonal<Matrix4f,Vector4f>(), nthreads, in, out) );
+  
+  CALL_SUBTEST( run_and_compare_to_cuda(eigenvalues<Matrix3f>(), nthreads, in, out) );
+  CALL_SUBTEST( run_and_compare_to_cuda(eigenvalues<Matrix2f>(), nthreads, in, out) );
+
+}
diff --git a/test/cuda_common.h b/test/cuda_common.h
new file mode 100644
index 000000000..9737693ac
--- /dev/null
+++ b/test/cuda_common.h
@@ -0,0 +1,101 @@
+
+#ifndef EIGEN_TEST_CUDA_COMMON_H
+#define EIGEN_TEST_CUDA_COMMON_H
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cuda_runtime_api.h>
+#include <iostream>
+
+#ifndef __CUDACC__
+dim3 threadIdx, blockDim, blockIdx;
+#endif
+
+template<typename Kernel, typename Input, typename Output>
+void run_on_cpu(const Kernel& ker, int n, const Input& in, Output& out)
+{
+  for(int i=0; i<n; i++)
+    ker(i, in.data(), out.data());
+}
+
+
+template<typename Kernel, typename Input, typename Output>
+__global__
+void run_on_cuda_meta_kernel(const Kernel ker, int n, const Input* in, Output* out)
+{
+  int i = threadIdx.x + blockIdx.x*blockDim.x;
+  if(i<n) {
+    ker(i, in, out);
+  }
+}
+
+
+template<typename Kernel, typename Input, typename Output>
+void run_on_cuda(const Kernel& ker, int n, const Input& in, Output& out)
+{
+  typename Input::Scalar*  d_in;
+  typename Output::Scalar* d_out;
+  std::ptrdiff_t in_bytes  = in.size()  * sizeof(typename Input::Scalar);
+  std::ptrdiff_t out_bytes = out.size() * sizeof(typename Output::Scalar);
+  
+  cudaMalloc((void**)(&d_in),  in_bytes);
+  cudaMalloc((void**)(&d_out), out_bytes);
+  
+  cudaMemcpy(d_in,  in.data(),  in_bytes,  cudaMemcpyHostToDevice);
+  cudaMemcpy(d_out, out.data(), out_bytes, cudaMemcpyHostToDevice);
+  
+  // Simple and non-optimal 1D mapping assuming n is not too large
+  // That's only for unit testing!
+  dim3 Blocks(128);
+  dim3 Grids( (n+int(Blocks.x)-1)/int(Blocks.x) );
+
+  cudaThreadSynchronize();
+  run_on_cuda_meta_kernel<<<Grids,Blocks>>>(ker, n, d_in, d_out);
+  cudaThreadSynchronize();
+  
+  // check inputs have not been modified
+  cudaMemcpy(const_cast<typename Input::Scalar*>(in.data()),  d_in,  in_bytes,  cudaMemcpyDeviceToHost);
+  cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost);
+  
+  cudaFree(d_in);
+  cudaFree(d_out);
+}
+
+
+template<typename Kernel, typename Input, typename Output>
+void run_and_compare_to_cuda(const Kernel& ker, int n, const Input& in, Output& out)
+{
+  Input  in_ref,  in_cuda;
+  Output out_ref, out_cuda;
+  #ifndef __CUDA_ARCH__
+  in_ref = in_cuda = in;
+  out_ref = out_cuda = out;
+  #endif
+  run_on_cpu (ker, n, in_ref,  out_ref);
+  run_on_cuda(ker, n, in_cuda, out_cuda);
+  #ifndef __CUDA_ARCH__
+  VERIFY_IS_APPROX(in_ref, in_cuda);
+  VERIFY_IS_APPROX(out_ref, out_cuda);
+  #endif
+}
+
+
+void ei_test_init_cuda()
+{
+  int device = 0;
+  cudaDeviceProp deviceProp;
+  cudaGetDeviceProperties(&deviceProp, device);
+  std::cout << "CUDA device info:\n";
+  std::cout << "  name:                        " << deviceProp.name << "\n";
+  std::cout << "  capability:                  " << deviceProp.major << "." << deviceProp.minor << "\n";
+  std::cout << "  multiProcessorCount:         " << deviceProp.multiProcessorCount << "\n";
+  std::cout << "  maxThreadsPerMultiProcessor: " << deviceProp.maxThreadsPerMultiProcessor << "\n";
+  std::cout << "  warpSize:                    " << deviceProp.warpSize << "\n";
+  std::cout << "  regsPerBlock:                " << deviceProp.regsPerBlock << "\n";
+  std::cout << "  concurrentKernels:           " << deviceProp.concurrentKernels << "\n";
+  std::cout << "  clockRate:                   " << deviceProp.clockRate << "\n";
+  std::cout << "  canMapHostMemory:            " << deviceProp.canMapHostMemory << "\n";
+  std::cout << "  computeMode:                 " << deviceProp.computeMode << "\n";
+}
+
+#endif // EIGEN_TEST_CUDA_COMMON_H
diff --git a/test/cwiseop.cpp b/test/cwiseop.cpp
deleted file mode 100644
index e3361da17..000000000
--- a/test/cwiseop.cpp
+++ /dev/null
@@ -1,184 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
-// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#define EIGEN2_SUPPORT
-#define EIGEN_NO_EIGEN2_DEPRECATED_WARNING
-
-#define EIGEN_NO_STATIC_ASSERT
-#include "main.h"
-#include <functional>
-
-#ifdef min
-#undef min
-#endif
-
-#ifdef max
-#undef max
-#endif
-
-using namespace std;
-
-template<typename Scalar> struct AddIfNull {
-    const Scalar operator() (const Scalar a, const Scalar b) const {return a<=1e-3 ? b : a;}
-    enum { Cost = NumTraits<Scalar>::AddCost };
-};
-
-template<typename MatrixType>
-typename Eigen::internal::enable_if<!NumTraits<typename MatrixType::Scalar>::IsInteger,typename MatrixType::Scalar>::type
-cwiseops_real_only(MatrixType& m1, MatrixType& m2, MatrixType& m3, MatrixType& mones)
-{
-  typedef typename MatrixType::Scalar Scalar;
-  typedef typename NumTraits<Scalar>::Real RealScalar;
-  
-  VERIFY_IS_APPROX(m1.cwise() / m2,    m1.cwise() * (m2.cwise().inverse()));
-  m3 = m1.cwise().abs().cwise().sqrt();
-  VERIFY_IS_APPROX(m3.cwise().square(), m1.cwise().abs());
-  VERIFY_IS_APPROX(m1.cwise().square().cwise().sqrt(), m1.cwise().abs());
-  VERIFY_IS_APPROX(m1.cwise().abs().cwise().log().cwise().exp() , m1.cwise().abs());
-
-  VERIFY_IS_APPROX(m1.cwise().pow(2), m1.cwise().square());
-  m3 = (m1.cwise().abs().cwise()<=RealScalar(0.01)).select(mones,m1);
-  VERIFY_IS_APPROX(m3.cwise().pow(-1), m3.cwise().inverse());
-  m3 = m1.cwise().abs();
-  VERIFY_IS_APPROX(m3.cwise().pow(RealScalar(0.5)), m3.cwise().sqrt());
-
-//   VERIFY_IS_APPROX(m1.cwise().tan(), m1.cwise().sin().cwise() / m1.cwise().cos());
-  VERIFY_IS_APPROX(mones, m1.cwise().sin().cwise().square() + m1.cwise().cos().cwise().square());
-  m3 = m1;
-  m3.cwise() /= m2;
-  VERIFY_IS_APPROX(m3, m1.cwise() / m2);
-  
-  return Scalar(0);
-}
-
-template<typename MatrixType>
-typename Eigen::internal::enable_if<NumTraits<typename MatrixType::Scalar>::IsInteger,typename MatrixType::Scalar>::type
-cwiseops_real_only(MatrixType& , MatrixType& , MatrixType& , MatrixType& )
-{
-  return 0;
-}
-
-template<typename MatrixType> void cwiseops(const MatrixType& m)
-{
-  typedef typename MatrixType::Index Index;
-  typedef typename MatrixType::Scalar Scalar;
-  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
-
-  Index rows = m.rows();
-  Index cols = m.cols();
-
-  MatrixType m1 = MatrixType::Random(rows, cols),
-             m1bis = m1,
-             m2 = MatrixType::Random(rows, cols),
-             m3(rows, cols),
-             m4(rows, cols),
-             mzero = MatrixType::Zero(rows, cols),
-             mones = MatrixType::Ones(rows, cols),
-             identity = Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::RowsAtCompileTime>
-                              ::Identity(rows, rows);
-  VectorType vzero = VectorType::Zero(rows),
-             vones = VectorType::Ones(rows),
-             v3(rows);
-
-  Index r = internal::random<Index>(0, rows-1),
-        c = internal::random<Index>(0, cols-1);
-
-  Scalar s1 = internal::random<Scalar>();
-
-  // test Zero, Ones, Constant, and the set* variants
-  m3 = MatrixType::Constant(rows, cols, s1);
-  for (int j=0; j<cols; ++j)
-    for (int i=0; i<rows; ++i)
-    {
-      VERIFY_IS_APPROX(mzero(i,j), Scalar(0));
-      VERIFY_IS_APPROX(mones(i,j), Scalar(1));
-      VERIFY_IS_APPROX(m3(i,j), s1);
-    }
-  VERIFY(mzero.isZero());
-  VERIFY(mones.isOnes());
-  VERIFY(m3.isConstant(s1));
-  VERIFY(identity.isIdentity());
-  VERIFY_IS_APPROX(m4.setConstant(s1), m3);
-  VERIFY_IS_APPROX(m4.setConstant(rows,cols,s1), m3);
-  VERIFY_IS_APPROX(m4.setZero(), mzero);
-  VERIFY_IS_APPROX(m4.setZero(rows,cols), mzero);
-  VERIFY_IS_APPROX(m4.setOnes(), mones);
-  VERIFY_IS_APPROX(m4.setOnes(rows,cols), mones);
-  m4.fill(s1);
-  VERIFY_IS_APPROX(m4, m3);
-
-  VERIFY_IS_APPROX(v3.setConstant(rows, s1), VectorType::Constant(rows,s1));
-  VERIFY_IS_APPROX(v3.setZero(rows), vzero);
-  VERIFY_IS_APPROX(v3.setOnes(rows), vones);
-
-  m2 = m2.template binaryExpr<AddIfNull<Scalar> >(mones);
-
-  VERIFY_IS_APPROX(m1.cwise().pow(2), m1.cwise().abs2());
-  VERIFY_IS_APPROX(m1.cwise().pow(2), m1.cwise().square());
-  VERIFY_IS_APPROX(m1.cwise().pow(3), m1.cwise().cube());
-
-  VERIFY_IS_APPROX(m1 + mones, m1.cwise()+Scalar(1));
-  VERIFY_IS_APPROX(m1 - mones, m1.cwise()-Scalar(1));
-  m3 = m1; m3.cwise() += 1;
-  VERIFY_IS_APPROX(m1 + mones, m3);
-  m3 = m1; m3.cwise() -= 1;
-  VERIFY_IS_APPROX(m1 - mones, m3);
-
-  VERIFY_IS_APPROX(m2, m2.cwise() * mones);
-  VERIFY_IS_APPROX(m1.cwise() * m2,  m2.cwise() * m1);
-  m3 = m1;
-  m3.cwise() *= m2;
-  VERIFY_IS_APPROX(m3, m1.cwise() * m2);
-
-  VERIFY_IS_APPROX(mones,    m2.cwise()/m2);
-
-  // check min
-  VERIFY_IS_APPROX( m1.cwise().min(m2), m2.cwise().min(m1) );
-  VERIFY_IS_APPROX( m1.cwise().min(m1+mones), m1 );
-  VERIFY_IS_APPROX( m1.cwise().min(m1-mones), m1-mones );
-
-  // check max
-  VERIFY_IS_APPROX( m1.cwise().max(m2), m2.cwise().max(m1) );
-  VERIFY_IS_APPROX( m1.cwise().max(m1-mones), m1 );
-  VERIFY_IS_APPROX( m1.cwise().max(m1+mones), m1+mones );
-
-  VERIFY( (m1.cwise() == m1).all() );
-  VERIFY( (m1.cwise() != m2).any() );
-  VERIFY(!(m1.cwise() == (m1+mones)).any() );
-  if (rows*cols>1)
-  {
-    m3 = m1;
-    m3(r,c) += 1;
-    VERIFY( (m1.cwise() == m3).any() );
-    VERIFY( !(m1.cwise() == m3).all() );
-  }
-  VERIFY( (m1.cwise().min(m2).cwise() <= m2).all() );
-  VERIFY( (m1.cwise().max(m2).cwise() >= m2).all() );
-  VERIFY( (m1.cwise().min(m2).cwise() < (m1+mones)).all() );
-  VERIFY( (m1.cwise().max(m2).cwise() > (m1-mones)).all() );
-
-  VERIFY( (m1.cwise()<m1.unaryExpr(bind2nd(plus<Scalar>(), Scalar(1)))).all() );
-  VERIFY( !(m1.cwise()<m1bis.unaryExpr(bind2nd(minus<Scalar>(), Scalar(1)))).all() );
-  VERIFY( !(m1.cwise()>m1bis.unaryExpr(bind2nd(plus<Scalar>(), Scalar(1)))).any() );
-  
-  cwiseops_real_only(m1, m2, m3, mones);
-}
-
-void test_cwiseop()
-{
-  for(int i = 0; i < g_repeat ; i++) {
-    CALL_SUBTEST_1( cwiseops(Matrix<float, 1, 1>()) );
-    CALL_SUBTEST_2( cwiseops(Matrix4d()) );
-    CALL_SUBTEST_3( cwiseops(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
-    CALL_SUBTEST_4( cwiseops(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
-    CALL_SUBTEST_5( cwiseops(MatrixXi(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
-    CALL_SUBTEST_6( cwiseops(MatrixXd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
-  }
-}
diff --git a/test/dense_storage.cpp b/test/dense_storage.cpp
new file mode 100644
index 000000000..e63712b1a
--- /dev/null
+++ b/test/dense_storage.cpp
@@ -0,0 +1,76 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Hauke Heibel <hauke.heibel@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/Core>
+
+template <typename T, int Rows, int Cols>
+void dense_storage_copy()
+{
+  static const int Size = ((Rows==Dynamic || Cols==Dynamic) ? Dynamic : Rows*Cols);
+  typedef DenseStorage<T,Size, Rows,Cols, 0> DenseStorageType;
+  
+  const int rows = (Rows==Dynamic) ? 4 : Rows;
+  const int cols = (Cols==Dynamic) ? 3 : Cols;
+  const int size = rows*cols;
+  DenseStorageType reference(size, rows, cols);
+  T* raw_reference = reference.data();
+  for (int i=0; i<size; ++i)
+    raw_reference[i] = static_cast<T>(i);
+    
+  DenseStorageType copied_reference(reference);
+  const T* raw_copied_reference = copied_reference.data();
+  for (int i=0; i<size; ++i)
+    VERIFY_IS_EQUAL(raw_reference[i], raw_copied_reference[i]);
+}
+
+template <typename T, int Rows, int Cols>
+void dense_storage_assignment()
+{
+  static const int Size = ((Rows==Dynamic || Cols==Dynamic) ? Dynamic : Rows*Cols);
+  typedef DenseStorage<T,Size, Rows,Cols, 0> DenseStorageType;
+  
+  const int rows = (Rows==Dynamic) ? 4 : Rows;
+  const int cols = (Cols==Dynamic) ? 3 : Cols;
+  const int size = rows*cols;
+  DenseStorageType reference(size, rows, cols);
+  T* raw_reference = reference.data();
+  for (int i=0; i<size; ++i)
+    raw_reference[i] = static_cast<T>(i);
+    
+  DenseStorageType copied_reference;
+  copied_reference = reference;
+  const T* raw_copied_reference = copied_reference.data();
+  for (int i=0; i<size; ++i)
+    VERIFY_IS_EQUAL(raw_reference[i], raw_copied_reference[i]);
+}
+
+void test_dense_storage()
+{
+  dense_storage_copy<int,Dynamic,Dynamic>();  
+  dense_storage_copy<int,Dynamic,3>();
+  dense_storage_copy<int,4,Dynamic>();
+  dense_storage_copy<int,4,3>();
+
+  dense_storage_copy<float,Dynamic,Dynamic>();
+  dense_storage_copy<float,Dynamic,3>();
+  dense_storage_copy<float,4,Dynamic>();  
+  dense_storage_copy<float,4,3>();
+  
+  dense_storage_assignment<int,Dynamic,Dynamic>();  
+  dense_storage_assignment<int,Dynamic,3>();
+  dense_storage_assignment<int,4,Dynamic>();
+  dense_storage_assignment<int,4,3>();
+
+  dense_storage_assignment<float,Dynamic,Dynamic>();
+  dense_storage_assignment<float,Dynamic,3>();
+  dense_storage_assignment<float,4,Dynamic>();  
+  dense_storage_assignment<float,4,3>();  
+}
diff --git a/test/diagonal.cpp b/test/diagonal.cpp
index 53814a588..c1546e97d 100644
--- a/test/diagonal.cpp
+++ b/test/diagonal.cpp
@@ -20,6 +20,8 @@ template<typename MatrixType> void diagonal(const MatrixType& m)
   MatrixType m1 = MatrixType::Random(rows, cols),
              m2 = MatrixType::Random(rows, cols);
 
+  Scalar s1 = internal::random<Scalar>();
+
   //check diagonal()
   VERIFY_IS_APPROX(m1.diagonal(), m1.transpose().diagonal());
   m2.diagonal() = 2 * m1.diagonal();
@@ -58,6 +60,26 @@ template<typename MatrixType> void diagonal(const MatrixType& m)
     VERIFY_IS_APPROX(m2.template diagonal<N2>(), static_cast<Scalar>(2) * m1.diagonal(N2));
     m2.diagonal(N2)[0] *= 3;
     VERIFY_IS_APPROX(m2.diagonal(N2)[0], static_cast<Scalar>(6) * m1.diagonal(N2)[0]);
+
+    m2.diagonal(N2).x() = s1;
+    VERIFY_IS_APPROX(m2.diagonal(N2).x(), s1);
+    m2.diagonal(N2).coeffRef(0) = Scalar(2)*s1;
+    VERIFY_IS_APPROX(m2.diagonal(N2).coeff(0), Scalar(2)*s1);
+  }
+}
+
+template<typename MatrixType> void diagonal_assert(const MatrixType& m) {
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  MatrixType m1 = MatrixType::Random(rows, cols);
+
+  if (rows>=2 && cols>=2)
+  {
+    VERIFY_RAISES_ASSERT( m1 += m1.diagonal() );
+    VERIFY_RAISES_ASSERT( m1 -= m1.diagonal() );
+    VERIFY_RAISES_ASSERT( m1.array() *= m1.diagonal().array() );
+    VERIFY_RAISES_ASSERT( m1.array() /= m1.diagonal().array() );
   }
 }
 
@@ -74,4 +96,6 @@ void test_diagonal()
     CALL_SUBTEST_1( diagonal(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
     CALL_SUBTEST_1( diagonal(Matrix<float,Dynamic,4>(3, 4)) );
   }
+
+  CALL_SUBTEST_1( diagonal_assert(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
 }
diff --git a/test/diagonalmatrices.cpp b/test/diagonalmatrices.cpp
index 149f1db2f..cd6dc8cf0 100644
--- a/test/diagonalmatrices.cpp
+++ b/test/diagonalmatrices.cpp
@@ -17,6 +17,7 @@ template<typename MatrixType> void diagonalmatrices(const MatrixType& m)
   typedef Matrix<Scalar, Rows, 1> VectorType;
   typedef Matrix<Scalar, 1, Cols> RowVectorType;
   typedef Matrix<Scalar, Rows, Rows> SquareMatrixType;
+  typedef Matrix<Scalar, Dynamic, Dynamic> DynMatrixType;
   typedef DiagonalMatrix<Scalar, Rows> LeftDiagonalMatrix;
   typedef DiagonalMatrix<Scalar, Cols> RightDiagonalMatrix;
   typedef Matrix<Scalar, Rows==Dynamic?Dynamic:2*Rows, Cols==Dynamic?Dynamic:2*Cols> BigMatrix;
@@ -64,6 +65,13 @@ template<typename MatrixType> void diagonalmatrices(const MatrixType& m)
   VERIFY_IS_APPROX( (((v1+v2).asDiagonal() * (m1+m2))(i,j))  , (v1+v2)(i) * (m1+m2)(i,j) );
   VERIFY_IS_APPROX( ((m1 * (rv1+rv2).asDiagonal())(i,j))  , (rv1+rv2)(j) * m1(i,j) );
   VERIFY_IS_APPROX( (((m1+m2) * (rv1+rv2).asDiagonal())(i,j))  , (rv1+rv2)(j) * (m1+m2)(i,j) );
+  
+  if(rows>1)
+  {
+    DynMatrixType tmp = m1.topRows(rows/2), res;
+    VERIFY_IS_APPROX( (res = m1.topRows(rows/2) * rv1.asDiagonal()), tmp * rv1.asDiagonal() );
+    VERIFY_IS_APPROX( (res = v1.head(rows/2).asDiagonal()*m1.topRows(rows/2)), v1.head(rows/2).asDiagonal()*tmp );
+  }
 
   BigMatrix big;
   big.setZero(2*rows, 2*cols);
@@ -84,6 +92,24 @@ template<typename MatrixType> void diagonalmatrices(const MatrixType& m)
   
   VERIFY_IS_APPROX(m1 * (rdm1 * s1), (m1 * rdm1) * s1);
   VERIFY_IS_APPROX(m1 * (s1 * rdm1), (m1 * rdm1) * s1);
+  
+  // Diagonal to dense
+  sq_m1.setRandom();
+  sq_m2 = sq_m1;
+  VERIFY_IS_APPROX( (sq_m1 += (s1*v1).asDiagonal()), sq_m2 += (s1*v1).asDiagonal().toDenseMatrix() );
+  VERIFY_IS_APPROX( (sq_m1 -= (s1*v1).asDiagonal()), sq_m2 -= (s1*v1).asDiagonal().toDenseMatrix() );
+  VERIFY_IS_APPROX( (sq_m1 = (s1*v1).asDiagonal()), (s1*v1).asDiagonal().toDenseMatrix() );
+}
+
+template<int>
+void bug987()
+{
+  Matrix3Xd points = Matrix3Xd::Random(3, 3);
+  Vector2d diag = Vector2d::Random();
+  Matrix2Xd tmp1 = points.topRows<2>(), res1, res2;
+  VERIFY_IS_APPROX( res1 = diag.asDiagonal() * points.topRows<2>(), res2 = diag.asDiagonal() * tmp1 );
+  Matrix2d tmp2 = points.topLeftCorner<2,2>();
+  VERIFY_IS_APPROX(( res1 = points.topLeftCorner<2,2>()*diag.asDiagonal()) , res2 = tmp2*diag.asDiagonal() );
 }
 
 void test_diagonalmatrices()
@@ -99,4 +125,5 @@ void test_diagonalmatrices()
     CALL_SUBTEST_8( diagonalmatrices(Matrix<double,Dynamic,Dynamic,RowMajor>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
     CALL_SUBTEST_9( diagonalmatrices(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
   }
+  CALL_SUBTEST_10( bug987<0>() );
 }
diff --git a/test/dynalloc.cpp b/test/dynalloc.cpp
index 7e41bfa97..f1cc70bee 100644
--- a/test/dynalloc.cpp
+++ b/test/dynalloc.cpp
@@ -9,18 +9,20 @@
 
 #include "main.h"
 
-#if EIGEN_ALIGN
-#define ALIGNMENT 16
+#if EIGEN_MAX_ALIGN_BYTES>0
+#define ALIGNMENT EIGEN_MAX_ALIGN_BYTES
 #else
 #define ALIGNMENT 1
 #endif
 
+typedef Matrix<float,8,1> Vector8f;
+
 void check_handmade_aligned_malloc()
 {
   for(int i = 1; i < 1000; i++)
   {
     char *p = (char*)internal::handmade_aligned_malloc(i);
-    VERIFY(size_t(p)%ALIGNMENT==0);
+    VERIFY(internal::UIntPtr(p)%ALIGNMENT==0);
     // if the buffer is wrongly allocated this will give a bad write --> check with valgrind
     for(int j = 0; j < i; j++) p[j]=0;
     internal::handmade_aligned_free(p);
@@ -29,10 +31,10 @@ void check_handmade_aligned_malloc()
 
 void check_aligned_malloc()
 {
-  for(int i = 1; i < 1000; i++)
+  for(int i = ALIGNMENT; i < 1000; i++)
   {
     char *p = (char*)internal::aligned_malloc(i);
-    VERIFY(size_t(p)%ALIGNMENT==0);
+    VERIFY(internal::UIntPtr(p)%ALIGNMENT==0);
     // if the buffer is wrongly allocated this will give a bad write --> check with valgrind
     for(int j = 0; j < i; j++) p[j]=0;
     internal::aligned_free(p);
@@ -41,10 +43,10 @@ void check_aligned_malloc()
 
 void check_aligned_new()
 {
-  for(int i = 1; i < 1000; i++)
+  for(int i = ALIGNMENT; i < 1000; i++)
   {
     float *p = internal::aligned_new<float>(i);
-    VERIFY(size_t(p)%ALIGNMENT==0);
+    VERIFY(internal::UIntPtr(p)%ALIGNMENT==0);
     // if the buffer is wrongly allocated this will give a bad write --> check with valgrind
     for(int j = 0; j < i; j++) p[j]=0;
     internal::aligned_delete(p,i);
@@ -53,10 +55,10 @@ void check_aligned_new()
 
 void check_aligned_stack_alloc()
 {
-  for(int i = 1; i < 400; i++)
+  for(int i = ALIGNMENT; i < 400; i++)
   {
     ei_declare_aligned_stack_constructed_variable(float,p,i,0);
-    VERIFY(size_t(p)%ALIGNMENT==0);
+    VERIFY(internal::UIntPtr(p)%ALIGNMENT==0);
     // if the buffer is wrongly allocated this will give a bad write --> check with valgrind
     for(int j = 0; j < i; j++) p[j]=0;
   }
@@ -68,7 +70,7 @@ struct MyStruct
 {
   EIGEN_MAKE_ALIGNED_OPERATOR_NEW
   char dummychar;
-  Vector4f avec;
+  Vector8f avec;
 };
 
 class MyClassA
@@ -76,15 +78,45 @@ class MyClassA
   public:
     EIGEN_MAKE_ALIGNED_OPERATOR_NEW
     char dummychar;
-    Vector4f avec;
+    Vector8f avec;
 };
 
 template<typename T> void check_dynaligned()
 {
-  T* obj = new T;
-  VERIFY(T::NeedsToAlign==1);
-  VERIFY(size_t(obj)%ALIGNMENT==0);
-  delete obj;
+  // TODO have to be updated once we support multiple alignment values
+  if(T::SizeAtCompileTime % ALIGNMENT == 0)
+  {
+    T* obj = new T;
+    VERIFY(T::NeedsToAlign==1);
+    VERIFY(internal::UIntPtr(obj)%ALIGNMENT==0);
+    delete obj;
+  }
+}
+
+template<typename T> void check_custom_new_delete()
+{
+  {
+    T* t = new T;
+    delete t;
+  }
+  
+  {
+    std::size_t N = internal::random<std::size_t>(1,10);
+    T* t = new T[N];
+    delete[] t;
+  }
+  
+#if EIGEN_MAX_ALIGN_BYTES>0
+  {
+    T* t = static_cast<T *>((T::operator new)(sizeof(T)));
+    (T::operator delete)(t, sizeof(T));
+  }
+  
+  {
+    T* t = static_cast<T *>((T::operator new)(sizeof(T)));
+    (T::operator delete)(t);
+  }
+#endif
 }
 
 void test_dynalloc()
@@ -97,25 +129,34 @@ void test_dynalloc()
 
   for (int i=0; i<g_repeat*100; ++i)
   {
+    CALL_SUBTEST( check_custom_new_delete<Vector4f>() );
+    CALL_SUBTEST( check_custom_new_delete<Vector2f>() );
+    CALL_SUBTEST( check_custom_new_delete<Matrix4f>() );
+    CALL_SUBTEST( check_custom_new_delete<MatrixXi>() );
+  }
+  
+  // check static allocation, who knows ?
+  #if EIGEN_MAX_STATIC_ALIGN_BYTES
+  for (int i=0; i<g_repeat*100; ++i)
+  {
     CALL_SUBTEST(check_dynaligned<Vector4f>() );
     CALL_SUBTEST(check_dynaligned<Vector2d>() );
     CALL_SUBTEST(check_dynaligned<Matrix4f>() );
     CALL_SUBTEST(check_dynaligned<Vector4d>() );
     CALL_SUBTEST(check_dynaligned<Vector4i>() );
+    CALL_SUBTEST(check_dynaligned<Vector8f>() );
   }
-  
-  // check static allocation, who knows ?
-  #if EIGEN_ALIGN_STATICALLY
+
   {
-    MyStruct foo0;  VERIFY(size_t(foo0.avec.data())%ALIGNMENT==0);
-    MyClassA fooA;  VERIFY(size_t(fooA.avec.data())%ALIGNMENT==0);
+    MyStruct foo0;  VERIFY(internal::UIntPtr(foo0.avec.data())%ALIGNMENT==0);
+    MyClassA fooA;  VERIFY(internal::UIntPtr(fooA.avec.data())%ALIGNMENT==0);
   }
   
   // dynamic allocation, single object
   for (int i=0; i<g_repeat*100; ++i)
   {
-    MyStruct *foo0 = new MyStruct();  VERIFY(size_t(foo0->avec.data())%ALIGNMENT==0);
-    MyClassA *fooA = new MyClassA();  VERIFY(size_t(fooA->avec.data())%ALIGNMENT==0);
+    MyStruct *foo0 = new MyStruct();  VERIFY(internal::UIntPtr(foo0->avec.data())%ALIGNMENT==0);
+    MyClassA *fooA = new MyClassA();  VERIFY(internal::UIntPtr(fooA->avec.data())%ALIGNMENT==0);
     delete foo0;
     delete fooA;
   }
@@ -124,8 +165,8 @@ void test_dynalloc()
   const int N = 10;
   for (int i=0; i<g_repeat*100; ++i)
   {
-    MyStruct *foo0 = new MyStruct[N];  VERIFY(size_t(foo0->avec.data())%ALIGNMENT==0);
-    MyClassA *fooA = new MyClassA[N];  VERIFY(size_t(fooA->avec.data())%ALIGNMENT==0);
+    MyStruct *foo0 = new MyStruct[N];  VERIFY(internal::UIntPtr(foo0->avec.data())%ALIGNMENT==0);
+    MyClassA *fooA = new MyClassA[N];  VERIFY(internal::UIntPtr(fooA->avec.data())%ALIGNMENT==0);
     delete[] foo0;
     delete[] fooA;
   }
diff --git a/test/eigen2/CMakeLists.txt b/test/eigen2/CMakeLists.txt
deleted file mode 100644
index 9615a6026..000000000
--- a/test/eigen2/CMakeLists.txt
+++ /dev/null
@@ -1,61 +0,0 @@
-add_custom_target(eigen2_buildtests)
-add_custom_target(eigen2_check COMMAND "ctest -R eigen2")
-add_dependencies(eigen2_check eigen2_buildtests)
-add_dependencies(buildtests eigen2_buildtests)
-
-add_definitions("-DEIGEN2_SUPPORT_STAGE10_FULL_EIGEN2_API")
-add_definitions("-DEIGEN_NO_EIGEN2_DEPRECATED_WARNING")
-
-ei_add_test(eigen2_meta)
-ei_add_test(eigen2_sizeof)
-ei_add_test(eigen2_dynalloc)
-ei_add_test(eigen2_nomalloc)
-#ei_add_test(eigen2_first_aligned)
-ei_add_test(eigen2_mixingtypes)
-#ei_add_test(eigen2_packetmath)
-ei_add_test(eigen2_unalignedassert)
-#ei_add_test(eigen2_vectorization_logic)
-ei_add_test(eigen2_basicstuff)
-ei_add_test(eigen2_linearstructure)
-ei_add_test(eigen2_cwiseop)
-ei_add_test(eigen2_sum)
-ei_add_test(eigen2_product_small)
-ei_add_test(eigen2_product_large ${EI_OFLAG})
-ei_add_test(eigen2_adjoint)
-ei_add_test(eigen2_submatrices)
-ei_add_test(eigen2_miscmatrices)
-ei_add_test(eigen2_commainitializer)
-ei_add_test(eigen2_smallvectors)
-ei_add_test(eigen2_map)
-ei_add_test(eigen2_array)
-ei_add_test(eigen2_triangular)
-ei_add_test(eigen2_cholesky " " "${GSL_LIBRARIES}")
-ei_add_test(eigen2_lu ${EI_OFLAG})
-ei_add_test(eigen2_determinant ${EI_OFLAG})
-ei_add_test(eigen2_inverse)
-ei_add_test(eigen2_qr)
-ei_add_test(eigen2_eigensolver " " "${GSL_LIBRARIES}")
-ei_add_test(eigen2_svd)
-ei_add_test(eigen2_geometry)
-ei_add_test(eigen2_geometry_with_eigen2_prefix)
-ei_add_test(eigen2_hyperplane)
-ei_add_test(eigen2_parametrizedline)
-ei_add_test(eigen2_alignedbox)
-ei_add_test(eigen2_regression)
-ei_add_test(eigen2_stdvector)
-ei_add_test(eigen2_newstdvector)
-if(QT4_FOUND)
-  ei_add_test(eigen2_qtvector " " "${QT_QTCORE_LIBRARY}")
-endif(QT4_FOUND)
-# no support for eigen2 sparse module
-# if(NOT EIGEN_DEFAULT_TO_ROW_MAJOR)
-#   ei_add_test(eigen2_sparse_vector)
-#   ei_add_test(eigen2_sparse_basic)
-#   ei_add_test(eigen2_sparse_solvers " " "${SPARSE_LIBS}")
-#   ei_add_test(eigen2_sparse_product)
-# endif()
-ei_add_test(eigen2_swap)
-ei_add_test(eigen2_visitor)
-ei_add_test(eigen2_bug_132)
-
-ei_add_test(eigen2_prec_inverse_4x4 ${EI_OFLAG})
diff --git a/test/eigen2/eigen2_adjoint.cpp b/test/eigen2/eigen2_adjoint.cpp
deleted file mode 100644
index c0f811459..000000000
--- a/test/eigen2/eigen2_adjoint.cpp
+++ /dev/null
@@ -1,99 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#include "main.h"
-
-template<typename MatrixType> void adjoint(const MatrixType& m)
-{
-  /* this test covers the following files:
-     Transpose.h Conjugate.h Dot.h
-  */
-
-  typedef typename MatrixType::Scalar Scalar;
-  typedef typename NumTraits<Scalar>::Real RealScalar;
-  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
-  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::RowsAtCompileTime> SquareMatrixType;
-  int rows = m.rows();
-  int cols = m.cols();
-
-  RealScalar largerEps = test_precision<RealScalar>();
-  if (ei_is_same_type<RealScalar,float>::ret)
-    largerEps = RealScalar(1e-3f);
-
-  MatrixType m1 = MatrixType::Random(rows, cols),
-             m2 = MatrixType::Random(rows, cols),
-             m3(rows, cols),
-             square = SquareMatrixType::Random(rows, rows);
-  VectorType v1 = VectorType::Random(rows),
-             v2 = VectorType::Random(rows),
-             v3 = VectorType::Random(rows),
-             vzero = VectorType::Zero(rows);
-
-  Scalar s1 = ei_random<Scalar>(),
-         s2 = ei_random<Scalar>();
-
-  // check basic compatibility of adjoint, transpose, conjugate
-  VERIFY_IS_APPROX(m1.transpose().conjugate().adjoint(),    m1);
-  VERIFY_IS_APPROX(m1.adjoint().conjugate().transpose(),    m1);
-
-  // check multiplicative behavior
-  VERIFY_IS_APPROX((m1.adjoint() * m2).adjoint(),           m2.adjoint() * m1);
-  VERIFY_IS_APPROX((s1 * m1).adjoint(),                     ei_conj(s1) * m1.adjoint());
-
-  // check basic properties of dot, norm, norm2
-  typedef typename NumTraits<Scalar>::Real RealScalar;
-  VERIFY(ei_isApprox((s1 * v1 + s2 * v2).eigen2_dot(v3),   s1 * v1.eigen2_dot(v3) + s2 * v2.eigen2_dot(v3), largerEps));
-  VERIFY(ei_isApprox(v3.eigen2_dot(s1 * v1 + s2 * v2),     ei_conj(s1)*v3.eigen2_dot(v1)+ei_conj(s2)*v3.eigen2_dot(v2), largerEps));
-  VERIFY_IS_APPROX(ei_conj(v1.eigen2_dot(v2)),               v2.eigen2_dot(v1));
-  VERIFY_IS_APPROX(ei_real(v1.eigen2_dot(v1)),               v1.squaredNorm());
-  if(NumTraits<Scalar>::HasFloatingPoint)
-    VERIFY_IS_APPROX(v1.squaredNorm(),                      v1.norm() * v1.norm());
-  VERIFY_IS_MUCH_SMALLER_THAN(ei_abs(vzero.eigen2_dot(v1)),  static_cast<RealScalar>(1));
-  if(NumTraits<Scalar>::HasFloatingPoint)
-    VERIFY_IS_MUCH_SMALLER_THAN(vzero.norm(),         static_cast<RealScalar>(1));
-
-  // check compatibility of dot and adjoint
-  VERIFY(ei_isApprox(v1.eigen2_dot(square * v2), (square.adjoint() * v1).eigen2_dot(v2), largerEps));
-
-  // like in testBasicStuff, test operator() to check const-qualification
-  int r = ei_random<int>(0, rows-1),
-      c = ei_random<int>(0, cols-1);
-  VERIFY_IS_APPROX(m1.conjugate()(r,c), ei_conj(m1(r,c)));
-  VERIFY_IS_APPROX(m1.adjoint()(c,r), ei_conj(m1(r,c)));
-
-  if(NumTraits<Scalar>::HasFloatingPoint)
-  {
-    // check that Random().normalized() works: tricky as the random xpr must be evaluated by
-    // normalized() in order to produce a consistent result.
-    VERIFY_IS_APPROX(VectorType::Random(rows).normalized().norm(), RealScalar(1));
-  }
-
-  // check inplace transpose
-  m3 = m1;
-  m3.transposeInPlace();
-  VERIFY_IS_APPROX(m3,m1.transpose());
-  m3.transposeInPlace();
-  VERIFY_IS_APPROX(m3,m1);
-  
-}
-
-void test_eigen2_adjoint()
-{
-  for(int i = 0; i < g_repeat; i++) {
-    CALL_SUBTEST_1( adjoint(Matrix<float, 1, 1>()) );
-    CALL_SUBTEST_2( adjoint(Matrix3d()) );
-    CALL_SUBTEST_3( adjoint(Matrix4f()) );
-    CALL_SUBTEST_4( adjoint(MatrixXcf(4, 4)) );
-    CALL_SUBTEST_5( adjoint(MatrixXi(8, 12)) );
-    CALL_SUBTEST_6( adjoint(MatrixXf(21, 21)) );
-  }
-  // test a large matrix only once
-  CALL_SUBTEST_7( adjoint(Matrix<float, 100, 100>()) );
-}
-
diff --git a/test/eigen2/eigen2_alignedbox.cpp b/test/eigen2/eigen2_alignedbox.cpp
deleted file mode 100644
index 35043b958..000000000
--- a/test/eigen2/eigen2_alignedbox.cpp
+++ /dev/null
@@ -1,60 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#include "main.h"
-#include <Eigen/Geometry>
-#include <Eigen/LU>
-#include <Eigen/QR>
-
-template<typename BoxType> void alignedbox(const BoxType& _box)
-{
-  /* this test covers the following files:
-     AlignedBox.h
-  */
-
-  const int dim = _box.dim();
-  typedef typename BoxType::Scalar Scalar;
-  typedef typename NumTraits<Scalar>::Real RealScalar;
-  typedef Matrix<Scalar, BoxType::AmbientDimAtCompileTime, 1> VectorType;
-
-  VectorType p0 = VectorType::Random(dim);
-  VectorType p1 = VectorType::Random(dim);
-  RealScalar s1 = ei_random<RealScalar>(0,1);
-
-  BoxType b0(dim);
-  BoxType b1(VectorType::Random(dim),VectorType::Random(dim));
-  BoxType b2;
-
-  b0.extend(p0);
-  b0.extend(p1);
-  VERIFY(b0.contains(p0*s1+(Scalar(1)-s1)*p1));
-  VERIFY(!b0.contains(p0 + (1+s1)*(p1-p0)));
-
-  (b2 = b0).extend(b1);
-  VERIFY(b2.contains(b0));
-  VERIFY(b2.contains(b1));
-  VERIFY_IS_APPROX(b2.clamp(b0), b0);
-
-  // casting
-  const int Dim = BoxType::AmbientDimAtCompileTime;
-  typedef typename GetDifferentType<Scalar>::type OtherScalar;
-  AlignedBox<OtherScalar,Dim> hp1f = b0.template cast<OtherScalar>();
-  VERIFY_IS_APPROX(hp1f.template cast<Scalar>(),b0);
-  AlignedBox<Scalar,Dim> hp1d = b0.template cast<Scalar>();
-  VERIFY_IS_APPROX(hp1d.template cast<Scalar>(),b0);
-}
-
-void test_eigen2_alignedbox()
-{
-  for(int i = 0; i < g_repeat; i++) {
-    CALL_SUBTEST_1( alignedbox(AlignedBox<float,2>()) );
-    CALL_SUBTEST_2( alignedbox(AlignedBox<float,3>()) );
-    CALL_SUBTEST_3( alignedbox(AlignedBox<double,4>()) );
-  }
-}
diff --git a/test/eigen2/eigen2_array.cpp b/test/eigen2/eigen2_array.cpp
deleted file mode 100644
index c1ff40ce7..000000000
--- a/test/eigen2/eigen2_array.cpp
+++ /dev/null
@@ -1,142 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#include "main.h"
-#include <Eigen/Array>
-
-template<typename MatrixType> void array(const MatrixType& m)
-{
-  /* this test covers the following files:
-     Array.cpp
-  */
-
-  typedef typename MatrixType::Scalar Scalar;
-  typedef typename NumTraits<Scalar>::Real RealScalar;
-  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
-
-  int rows = m.rows();
-  int cols = m.cols();
-
-  MatrixType m1 = MatrixType::Random(rows, cols),
-             m2 = MatrixType::Random(rows, cols),
-             m3(rows, cols);
-
-  Scalar  s1 = ei_random<Scalar>(),
-          s2 = ei_random<Scalar>();
-
-  // scalar addition
-  VERIFY_IS_APPROX(m1.cwise() + s1, s1 + m1.cwise());
-  VERIFY_IS_APPROX(m1.cwise() + s1, MatrixType::Constant(rows,cols,s1) + m1);
-  VERIFY_IS_APPROX((m1*Scalar(2)).cwise() - s2, (m1+m1) - MatrixType::Constant(rows,cols,s2) );
-  m3 = m1;
-  m3.cwise() += s2;
-  VERIFY_IS_APPROX(m3, m1.cwise() + s2);
-  m3 = m1;
-  m3.cwise() -= s1;
-  VERIFY_IS_APPROX(m3, m1.cwise() - s1);
-
-  // reductions
-  VERIFY_IS_APPROX(m1.colwise().sum().sum(), m1.sum());
-  VERIFY_IS_APPROX(m1.rowwise().sum().sum(), m1.sum());
-  if (!ei_isApprox(m1.sum(), (m1+m2).sum()))
-    VERIFY_IS_NOT_APPROX(((m1+m2).rowwise().sum()).sum(), m1.sum());
-  VERIFY_IS_APPROX(m1.colwise().sum(), m1.colwise().redux(internal::scalar_sum_op<Scalar>()));
-}
-
-template<typename MatrixType> void comparisons(const MatrixType& m)
-{
-  typedef typename MatrixType::Scalar Scalar;
-  typedef typename NumTraits<Scalar>::Real RealScalar;
-  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
-
-  int rows = m.rows();
-  int cols = m.cols();
-
-  int r = ei_random<int>(0, rows-1),
-      c = ei_random<int>(0, cols-1);
-
-  MatrixType m1 = MatrixType::Random(rows, cols),
-             m2 = MatrixType::Random(rows, cols),
-             m3(rows, cols);
-
-  VERIFY(((m1.cwise() + Scalar(1)).cwise() > m1).all());
-  VERIFY(((m1.cwise() - Scalar(1)).cwise() < m1).all());
-  if (rows*cols>1)
-  {
-    m3 = m1;
-    m3(r,c) += 1;
-    VERIFY(! (m1.cwise() < m3).all() );
-    VERIFY(! (m1.cwise() > m3).all() );
-  }
-  
-  // comparisons to scalar
-  VERIFY( (m1.cwise() != (m1(r,c)+1) ).any() );
-  VERIFY( (m1.cwise() > (m1(r,c)-1) ).any() );
-  VERIFY( (m1.cwise() < (m1(r,c)+1) ).any() );
-  VERIFY( (m1.cwise() == m1(r,c) ).any() );
-  
-  // test Select
-  VERIFY_IS_APPROX( (m1.cwise()<m2).select(m1,m2), m1.cwise().min(m2) );
-  VERIFY_IS_APPROX( (m1.cwise()>m2).select(m1,m2), m1.cwise().max(m2) );
-  Scalar mid = (m1.cwise().abs().minCoeff() + m1.cwise().abs().maxCoeff())/Scalar(2);
-  for (int j=0; j<cols; ++j)
-  for (int i=0; i<rows; ++i)
-    m3(i,j) = ei_abs(m1(i,j))<mid ? 0 : m1(i,j);
-  VERIFY_IS_APPROX( (m1.cwise().abs().cwise()<MatrixType::Constant(rows,cols,mid))
-                        .select(MatrixType::Zero(rows,cols),m1), m3);
-  // shorter versions:
-  VERIFY_IS_APPROX( (m1.cwise().abs().cwise()<MatrixType::Constant(rows,cols,mid))
-                        .select(0,m1), m3);
-  VERIFY_IS_APPROX( (m1.cwise().abs().cwise()>=MatrixType::Constant(rows,cols,mid))
-                        .select(m1,0), m3);
-  // even shorter version:
-  VERIFY_IS_APPROX( (m1.cwise().abs().cwise()<mid).select(0,m1), m3);
-  
-  // count
-  VERIFY(((m1.cwise().abs().cwise()+1).cwise()>RealScalar(0.1)).count() == rows*cols);
-  VERIFY_IS_APPROX(((m1.cwise().abs().cwise()+1).cwise()>RealScalar(0.1)).colwise().count().template cast<int>(), RowVectorXi::Constant(cols,rows));
-  VERIFY_IS_APPROX(((m1.cwise().abs().cwise()+1).cwise()>RealScalar(0.1)).rowwise().count().template cast<int>(), VectorXi::Constant(rows, cols));
-}
-
-template<typename VectorType> void lpNorm(const VectorType& v)
-{
-  VectorType u = VectorType::Random(v.size());
-
-  VERIFY_IS_APPROX(u.template lpNorm<Infinity>(), u.cwise().abs().maxCoeff());
-  VERIFY_IS_APPROX(u.template lpNorm<1>(), u.cwise().abs().sum());
-  VERIFY_IS_APPROX(u.template lpNorm<2>(), ei_sqrt(u.cwise().abs().cwise().square().sum()));
-  VERIFY_IS_APPROX(ei_pow(u.template lpNorm<5>(), typename VectorType::RealScalar(5)), u.cwise().abs().cwise().pow(5).sum());
-}
-
-void test_eigen2_array()
-{
-  for(int i = 0; i < g_repeat; i++) {
-    CALL_SUBTEST_1( array(Matrix<float, 1, 1>()) );
-    CALL_SUBTEST_2( array(Matrix2f()) );
-    CALL_SUBTEST_3( array(Matrix4d()) );
-    CALL_SUBTEST_4( array(MatrixXcf(3, 3)) );
-    CALL_SUBTEST_5( array(MatrixXf(8, 12)) );
-    CALL_SUBTEST_6( array(MatrixXi(8, 12)) );
-  }
-  for(int i = 0; i < g_repeat; i++) {
-    CALL_SUBTEST_1( comparisons(Matrix<float, 1, 1>()) );
-    CALL_SUBTEST_2( comparisons(Matrix2f()) );
-    CALL_SUBTEST_3( comparisons(Matrix4d()) );
-    CALL_SUBTEST_5( comparisons(MatrixXf(8, 12)) );
-    CALL_SUBTEST_6( comparisons(MatrixXi(8, 12)) );
-  }
-  for(int i = 0; i < g_repeat; i++) {
-    CALL_SUBTEST_1( lpNorm(Matrix<float, 1, 1>()) );
-    CALL_SUBTEST_2( lpNorm(Vector2f()) );
-    CALL_SUBTEST_3( lpNorm(Vector3d()) );
-    CALL_SUBTEST_4( lpNorm(Vector4f()) );
-    CALL_SUBTEST_5( lpNorm(VectorXf(16)) );
-    CALL_SUBTEST_7( lpNorm(VectorXcd(10)) );
-  }
-}
diff --git a/test/eigen2/eigen2_basicstuff.cpp b/test/eigen2/eigen2_basicstuff.cpp
deleted file mode 100644
index dd2dec1ef..000000000
--- a/test/eigen2/eigen2_basicstuff.cpp
+++ /dev/null
@@ -1,105 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#include "main.h"
-
-template<typename MatrixType> void basicStuff(const MatrixType& m)
-{
-  typedef typename MatrixType::Scalar Scalar;
-  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
-
-  int rows = m.rows();
-  int cols = m.cols();
-
-  // this test relies a lot on Random.h, and there's not much more that we can do
-  // to test it, hence I consider that we will have tested Random.h
-  MatrixType m1 = MatrixType::Random(rows, cols),
-             m2 = MatrixType::Random(rows, cols),
-             m3(rows, cols),
-             mzero = MatrixType::Zero(rows, cols),
-             square = Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::RowsAtCompileTime>::Random(rows, rows);
-  VectorType v1 = VectorType::Random(rows),
-             vzero = VectorType::Zero(rows);
-
-  Scalar x = ei_random<Scalar>();
-
-  int r = ei_random<int>(0, rows-1),
-      c = ei_random<int>(0, cols-1);
-
-  m1.coeffRef(r,c) = x;
-  VERIFY_IS_APPROX(x, m1.coeff(r,c));
-  m1(r,c) = x;
-  VERIFY_IS_APPROX(x, m1(r,c));
-  v1.coeffRef(r) = x;
-  VERIFY_IS_APPROX(x, v1.coeff(r));
-  v1(r) = x;
-  VERIFY_IS_APPROX(x, v1(r));
-  v1[r] = x;
-  VERIFY_IS_APPROX(x, v1[r]);
-
-  VERIFY_IS_APPROX(               v1,    v1);
-  VERIFY_IS_NOT_APPROX(           v1,    2*v1);
-  VERIFY_IS_MUCH_SMALLER_THAN(    vzero, v1);
-  if(NumTraits<Scalar>::HasFloatingPoint)
-    VERIFY_IS_MUCH_SMALLER_THAN(  vzero, v1.norm());
-  VERIFY_IS_NOT_MUCH_SMALLER_THAN(v1,    v1);
-  VERIFY_IS_APPROX(               vzero, v1-v1);
-  VERIFY_IS_APPROX(               m1,    m1);
-  VERIFY_IS_NOT_APPROX(           m1,    2*m1);
-  VERIFY_IS_MUCH_SMALLER_THAN(    mzero, m1);
-  VERIFY_IS_NOT_MUCH_SMALLER_THAN(m1,    m1);
-  VERIFY_IS_APPROX(               mzero, m1-m1);
-
-  // always test operator() on each read-only expression class,
-  // in order to check const-qualifiers.
-  // indeed, if an expression class (here Zero) is meant to be read-only,
-  // hence has no _write() method, the corresponding MatrixBase method (here zero())
-  // should return a const-qualified object so that it is the const-qualified
-  // operator() that gets called, which in turn calls _read().
-  VERIFY_IS_MUCH_SMALLER_THAN(MatrixType::Zero(rows,cols)(r,c), static_cast<Scalar>(1));
-
-  // now test copying a row-vector into a (column-)vector and conversely.
-  square.col(r) = square.row(r).eval();
-  Matrix<Scalar, 1, MatrixType::RowsAtCompileTime> rv(rows);
-  Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> cv(rows);
-  rv = square.row(r);
-  cv = square.col(r);
-  VERIFY_IS_APPROX(rv, cv.transpose());
-
-  if(cols!=1 && rows!=1 && MatrixType::SizeAtCompileTime!=Dynamic)
-  {
-    VERIFY_RAISES_ASSERT(m1 = (m2.block(0,0, rows-1, cols-1)));
-  }
-
-  VERIFY_IS_APPROX(m3 = m1,m1);
-  MatrixType m4;
-  VERIFY_IS_APPROX(m4 = m1,m1);
-
-  // test swap
-  m3 = m1;
-  m1.swap(m2);
-  VERIFY_IS_APPROX(m3, m2);
-  if(rows*cols>=3)
-  {
-    VERIFY_IS_NOT_APPROX(m3, m1);
-  }
-}
-
-void test_eigen2_basicstuff()
-{
-  for(int i = 0; i < g_repeat; i++) {
-    CALL_SUBTEST_1( basicStuff(Matrix<float, 1, 1>()) );
-    CALL_SUBTEST_2( basicStuff(Matrix4d()) );
-    CALL_SUBTEST_3( basicStuff(MatrixXcf(3, 3)) );
-    CALL_SUBTEST_4( basicStuff(MatrixXi(8, 12)) );
-    CALL_SUBTEST_5( basicStuff(MatrixXcd(20, 20)) );
-    CALL_SUBTEST_6( basicStuff(Matrix<float, 100, 100>()) );
-    CALL_SUBTEST_7( basicStuff(Matrix<long double,Dynamic,Dynamic>(10,10)) );
-  }
-}
diff --git a/test/eigen2/eigen2_bug_132.cpp b/test/eigen2/eigen2_bug_132.cpp
deleted file mode 100644
index 7fe3610ce..000000000
--- a/test/eigen2/eigen2_bug_132.cpp
+++ /dev/null
@@ -1,26 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2010 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#include "main.h"
-
-void test_eigen2_bug_132() {
-  int size = 100;
-  MatrixXd A(size, size);
-  VectorXd b(size), c(size);
-  {
-    VectorXd y = A.transpose() * (b-c);  // bug 132: infinite recursion in coeffRef
-    VectorXd z = (b-c).transpose() * A;  // bug 132: infinite recursion in coeffRef
-  }
-
-  // the following ones weren't failing, but let's include them for completeness:
-  {
-    VectorXd y = A * (b-c);
-    VectorXd z = (b-c).transpose() * A.transpose();
-  }
-}
diff --git a/test/eigen2/eigen2_cholesky.cpp b/test/eigen2/eigen2_cholesky.cpp
deleted file mode 100644
index 9c4b6f561..000000000
--- a/test/eigen2/eigen2_cholesky.cpp
+++ /dev/null
@@ -1,113 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#define EIGEN_NO_ASSERTION_CHECKING
-#include "main.h"
-#include <Eigen/Cholesky>
-#include <Eigen/LU>
-
-#ifdef HAS_GSL
-#include "gsl_helper.h"
-#endif
-
-template<typename MatrixType> void cholesky(const MatrixType& m)
-{
-  /* this test covers the following files:
-     LLT.h LDLT.h
-  */
-  int rows = m.rows();
-  int cols = m.cols();
-
-  typedef typename MatrixType::Scalar Scalar;
-  typedef typename NumTraits<Scalar>::Real RealScalar;
-  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::RowsAtCompileTime> SquareMatrixType;
-  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
-
-  MatrixType a0 = MatrixType::Random(rows,cols);
-  VectorType vecB = VectorType::Random(rows), vecX(rows);
-  MatrixType matB = MatrixType::Random(rows,cols), matX(rows,cols);
-  SquareMatrixType symm =  a0 * a0.adjoint();
-  // let's make sure the matrix is not singular or near singular
-  MatrixType a1 = MatrixType::Random(rows,cols);
-  symm += a1 * a1.adjoint();
-
-  #ifdef HAS_GSL
-  if (ei_is_same_type<RealScalar,double>::ret)
-  {
-    typedef GslTraits<Scalar> Gsl;
-    typename Gsl::Matrix gMatA=0, gSymm=0;
-    typename Gsl::Vector gVecB=0, gVecX=0;
-    convert<MatrixType>(symm, gSymm);
-    convert<MatrixType>(symm, gMatA);
-    convert<VectorType>(vecB, gVecB);
-    convert<VectorType>(vecB, gVecX);
-    Gsl::cholesky(gMatA);
-    Gsl::cholesky_solve(gMatA, gVecB, gVecX);
-    VectorType vecX(rows), _vecX, _vecB;
-    convert(gVecX, _vecX);
-    symm.llt().solve(vecB, &vecX);
-    Gsl::prod(gSymm, gVecX, gVecB);
-    convert(gVecB, _vecB);
-    // test gsl itself !
-    VERIFY_IS_APPROX(vecB, _vecB);
-    VERIFY_IS_APPROX(vecX, _vecX);
-
-    Gsl::free(gMatA);
-    Gsl::free(gSymm);
-    Gsl::free(gVecB);
-    Gsl::free(gVecX);
-  }
-  #endif
-
-  {
-    LDLT<SquareMatrixType> ldlt(symm);
-    VERIFY(ldlt.isPositiveDefinite());
-    // in eigen3, LDLT is pivoting
-    //VERIFY_IS_APPROX(symm, ldlt.matrixL() * ldlt.vectorD().asDiagonal() * ldlt.matrixL().adjoint());
-    ldlt.solve(vecB, &vecX);
-    VERIFY_IS_APPROX(symm * vecX, vecB);
-    ldlt.solve(matB, &matX);
-    VERIFY_IS_APPROX(symm * matX, matB);
-  }
-
-  {
-    LLT<SquareMatrixType> chol(symm);
-    VERIFY(chol.isPositiveDefinite());
-    VERIFY_IS_APPROX(symm, chol.matrixL() * chol.matrixL().adjoint());
-    chol.solve(vecB, &vecX);
-    VERIFY_IS_APPROX(symm * vecX, vecB);
-    chol.solve(matB, &matX);
-    VERIFY_IS_APPROX(symm * matX, matB);
-  }
-
-#if 0 // cholesky is not rank-revealing anyway
-  // test isPositiveDefinite on non definite matrix
-  if (rows>4)
-  {
-    SquareMatrixType symm =  a0.block(0,0,rows,cols-4) * a0.block(0,0,rows,cols-4).adjoint();
-    LLT<SquareMatrixType> chol(symm);
-    VERIFY(!chol.isPositiveDefinite());
-    LDLT<SquareMatrixType> cholnosqrt(symm);
-    VERIFY(!cholnosqrt.isPositiveDefinite());
-  }
-#endif
-}
-
-void test_eigen2_cholesky()
-{
-  for(int i = 0; i < g_repeat; i++) {
-    CALL_SUBTEST_1( cholesky(Matrix<double,1,1>()) );
-    CALL_SUBTEST_2( cholesky(Matrix2d()) );
-    CALL_SUBTEST_3( cholesky(Matrix3f()) );
-    CALL_SUBTEST_4( cholesky(Matrix4d()) );
-    CALL_SUBTEST_5( cholesky(MatrixXcd(7,7)) );
-    CALL_SUBTEST_6( cholesky(MatrixXf(17,17)) );
-    CALL_SUBTEST_7( cholesky(MatrixXd(33,33)) );
-  }
-}
diff --git a/test/eigen2/eigen2_commainitializer.cpp b/test/eigen2/eigen2_commainitializer.cpp
deleted file mode 100644
index e0f901e0b..000000000
--- a/test/eigen2/eigen2_commainitializer.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#include "main.h"
-
-void test_eigen2_commainitializer()
-{
-  Matrix3d m3;
-  Matrix4d m4;
-
-  VERIFY_RAISES_ASSERT( (m3 << 1, 2, 3, 4, 5, 6, 7, 8) );
-  
-  #ifndef _MSC_VER
-  VERIFY_RAISES_ASSERT( (m3 << 1, 2, 3, 4, 5, 6, 7, 8, 9, 10) );
-  #endif
-
-  double data[] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
-  Matrix3d ref = Map<Matrix<double,3,3,RowMajor> >(data);
-
-  m3 = Matrix3d::Random();
-  m3 << 1, 2, 3, 4, 5, 6, 7, 8, 9;
-  VERIFY_IS_APPROX(m3, ref );
-
-  Vector3d vec[3];
-  vec[0] << 1, 4, 7;
-  vec[1] << 2, 5, 8;
-  vec[2] << 3, 6, 9;
-  m3 = Matrix3d::Random();
-  m3 << vec[0], vec[1], vec[2];
-  VERIFY_IS_APPROX(m3, ref);
-
-  vec[0] << 1, 2, 3;
-  vec[1] << 4, 5, 6;
-  vec[2] << 7, 8, 9;
-  m3 = Matrix3d::Random();
-  m3 << vec[0].transpose(),
-        4, 5, 6,
-        vec[2].transpose();
-  VERIFY_IS_APPROX(m3, ref);
-}
diff --git a/test/eigen2/eigen2_cwiseop.cpp b/test/eigen2/eigen2_cwiseop.cpp
deleted file mode 100644
index 22e1cc342..000000000
--- a/test/eigen2/eigen2_cwiseop.cpp
+++ /dev/null
@@ -1,155 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#include "main.h"
-#include <functional>
-#include <Eigen/Array>
-
-using namespace std;
-
-template<typename Scalar> struct AddIfNull {
-    const Scalar operator() (const Scalar a, const Scalar b) const {return a<=1e-3 ? b : a;}
-    enum { Cost = NumTraits<Scalar>::AddCost };
-};
-
-template<typename MatrixType> void cwiseops(const MatrixType& m)
-{
-  typedef typename MatrixType::Scalar Scalar;
-  typedef typename NumTraits<Scalar>::Real RealScalar;
-  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
-
-  int rows = m.rows();
-  int cols = m.cols();
-
-  MatrixType m1 = MatrixType::Random(rows, cols),
-             m2 = MatrixType::Random(rows, cols),
-             m3(rows, cols),
-             m4(rows, cols),
-             mzero = MatrixType::Zero(rows, cols),
-             mones = MatrixType::Ones(rows, cols),
-             identity = Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::RowsAtCompileTime>
-                              ::Identity(rows, rows);
-  VectorType vzero = VectorType::Zero(rows),
-             vones = VectorType::Ones(rows),
-             v3(rows);
-
-  int r = ei_random<int>(0, rows-1),
-      c = ei_random<int>(0, cols-1);
-  
-  Scalar s1 = ei_random<Scalar>();
-  
-  // test Zero, Ones, Constant, and the set* variants
-  m3 = MatrixType::Constant(rows, cols, s1);
-  for (int j=0; j<cols; ++j)
-    for (int i=0; i<rows; ++i)
-    {
-      VERIFY_IS_APPROX(mzero(i,j), Scalar(0));
-      VERIFY_IS_APPROX(mones(i,j), Scalar(1));
-      VERIFY_IS_APPROX(m3(i,j), s1);
-    }
-  VERIFY(mzero.isZero());
-  VERIFY(mones.isOnes());
-  VERIFY(m3.isConstant(s1));
-  VERIFY(identity.isIdentity());
-  VERIFY_IS_APPROX(m4.setConstant(s1), m3);
-  VERIFY_IS_APPROX(m4.setConstant(rows,cols,s1), m3);
-  VERIFY_IS_APPROX(m4.setZero(), mzero);
-  VERIFY_IS_APPROX(m4.setZero(rows,cols), mzero);
-  VERIFY_IS_APPROX(m4.setOnes(), mones);
-  VERIFY_IS_APPROX(m4.setOnes(rows,cols), mones);
-  m4.fill(s1);
-  VERIFY_IS_APPROX(m4, m3);
-  
-  VERIFY_IS_APPROX(v3.setConstant(rows, s1), VectorType::Constant(rows,s1));
-  VERIFY_IS_APPROX(v3.setZero(rows), vzero);
-  VERIFY_IS_APPROX(v3.setOnes(rows), vones);
-
-  m2 = m2.template binaryExpr<AddIfNull<Scalar> >(mones);
-
-  VERIFY_IS_APPROX(m1.cwise().pow(2), m1.cwise().abs2());
-  VERIFY_IS_APPROX(m1.cwise().pow(2), m1.cwise().square());
-  VERIFY_IS_APPROX(m1.cwise().pow(3), m1.cwise().cube());
-
-  VERIFY_IS_APPROX(m1 + mones, m1.cwise()+Scalar(1));
-  VERIFY_IS_APPROX(m1 - mones, m1.cwise()-Scalar(1));
-  m3 = m1; m3.cwise() += 1;
-  VERIFY_IS_APPROX(m1 + mones, m3);
-  m3 = m1; m3.cwise() -= 1;
-  VERIFY_IS_APPROX(m1 - mones, m3);
-
-  VERIFY_IS_APPROX(m2, m2.cwise() * mones);
-  VERIFY_IS_APPROX(m1.cwise() * m2,  m2.cwise() * m1);
-  m3 = m1;
-  m3.cwise() *= m2;
-  VERIFY_IS_APPROX(m3, m1.cwise() * m2);
-  
-  VERIFY_IS_APPROX(mones,    m2.cwise()/m2);
-  if(NumTraits<Scalar>::HasFloatingPoint)
-  {
-    VERIFY_IS_APPROX(m1.cwise() / m2,    m1.cwise() * (m2.cwise().inverse()));
-    m3 = m1.cwise().abs().cwise().sqrt();
-    VERIFY_IS_APPROX(m3.cwise().square(), m1.cwise().abs());
-    VERIFY_IS_APPROX(m1.cwise().square().cwise().sqrt(), m1.cwise().abs());
-    VERIFY_IS_APPROX(m1.cwise().abs().cwise().log().cwise().exp() , m1.cwise().abs());
-
-    VERIFY_IS_APPROX(m1.cwise().pow(2), m1.cwise().square());
-    m3 = (m1.cwise().abs().cwise()<=RealScalar(0.01)).select(mones,m1);
-    VERIFY_IS_APPROX(m3.cwise().pow(-1), m3.cwise().inverse());
-    m3 = m1.cwise().abs();
-    VERIFY_IS_APPROX(m3.cwise().pow(RealScalar(0.5)), m3.cwise().sqrt());
-    
-//     VERIFY_IS_APPROX(m1.cwise().tan(), m1.cwise().sin().cwise() / m1.cwise().cos());
-    VERIFY_IS_APPROX(mones, m1.cwise().sin().cwise().square() + m1.cwise().cos().cwise().square());
-    m3 = m1;
-    m3.cwise() /= m2;
-    VERIFY_IS_APPROX(m3, m1.cwise() / m2);
-  }
-
-  // check min
-  VERIFY_IS_APPROX( m1.cwise().min(m2), m2.cwise().min(m1) );
-  VERIFY_IS_APPROX( m1.cwise().min(m1+mones), m1 );
-  VERIFY_IS_APPROX( m1.cwise().min(m1-mones), m1-mones );
-
-  // check max
-  VERIFY_IS_APPROX( m1.cwise().max(m2), m2.cwise().max(m1) );
-  VERIFY_IS_APPROX( m1.cwise().max(m1-mones), m1 );
-  VERIFY_IS_APPROX( m1.cwise().max(m1+mones), m1+mones );
-  
-  VERIFY( (m1.cwise() == m1).all() );
-  VERIFY( (m1.cwise() != m2).any() );
-  VERIFY(!(m1.cwise() == (m1+mones)).any() );
-  if (rows*cols>1)
-  {
-    m3 = m1;
-    m3(r,c) += 1;
-    VERIFY( (m1.cwise() == m3).any() );
-    VERIFY( !(m1.cwise() == m3).all() );
-  }
-  VERIFY( (m1.cwise().min(m2).cwise() <= m2).all() );
-  VERIFY( (m1.cwise().max(m2).cwise() >= m2).all() );
-  VERIFY( (m1.cwise().min(m2).cwise() < (m1+mones)).all() );
-  VERIFY( (m1.cwise().max(m2).cwise() > (m1-mones)).all() );
-
-  VERIFY( (m1.cwise()<m1.unaryExpr(bind2nd(plus<Scalar>(), Scalar(1)))).all() );
-  VERIFY( !(m1.cwise()<m1.unaryExpr(bind2nd(minus<Scalar>(), Scalar(1)))).all() );
-  VERIFY( !(m1.cwise()>m1.unaryExpr(bind2nd(plus<Scalar>(), Scalar(1)))).any() );
-}
-
-void test_eigen2_cwiseop()
-{
-  for(int i = 0; i < g_repeat ; i++) {
-    CALL_SUBTEST_1( cwiseops(Matrix<float, 1, 1>()) );
-    CALL_SUBTEST_2( cwiseops(Matrix4d()) );
-    CALL_SUBTEST_3( cwiseops(MatrixXf(3, 3)) );
-    CALL_SUBTEST_3( cwiseops(MatrixXf(22, 22)) );
-    CALL_SUBTEST_4( cwiseops(MatrixXi(8, 12)) );
-    CALL_SUBTEST_5( cwiseops(MatrixXd(20, 20)) );
-  }
-}
diff --git a/test/eigen2/eigen2_determinant.cpp b/test/eigen2/eigen2_determinant.cpp
deleted file mode 100644
index c7b4ad053..000000000
--- a/test/eigen2/eigen2_determinant.cpp
+++ /dev/null
@@ -1,61 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#include "main.h"
-#include <Eigen/LU>
-
-template<typename MatrixType> void determinant(const MatrixType& m)
-{
-  /* this test covers the following files:
-     Determinant.h
-  */
-  int size = m.rows();
-
-  MatrixType m1(size, size), m2(size, size);
-  m1.setRandom();
-  m2.setRandom();
-  typedef typename MatrixType::Scalar Scalar;
-  Scalar x = ei_random<Scalar>();
-  VERIFY_IS_APPROX(MatrixType::Identity(size, size).determinant(), Scalar(1));
-  VERIFY_IS_APPROX((m1*m2).determinant(), m1.determinant() * m2.determinant());
-  if(size==1) return;
-  int i = ei_random<int>(0, size-1);
-  int j;
-  do {
-    j = ei_random<int>(0, size-1);
-  } while(j==i);
-  m2 = m1;
-  m2.row(i).swap(m2.row(j));
-  VERIFY_IS_APPROX(m2.determinant(), -m1.determinant());
-  m2 = m1;
-  m2.col(i).swap(m2.col(j));
-  VERIFY_IS_APPROX(m2.determinant(), -m1.determinant());
-  VERIFY_IS_APPROX(m2.determinant(), m2.transpose().determinant());
-  VERIFY_IS_APPROX(ei_conj(m2.determinant()), m2.adjoint().determinant());
-  m2 = m1;
-  m2.row(i) += x*m2.row(j);
-  VERIFY_IS_APPROX(m2.determinant(), m1.determinant());
-  m2 = m1;
-  m2.row(i) *= x;
-  VERIFY_IS_APPROX(m2.determinant(), m1.determinant() * x);
-}
-
-void test_eigen2_determinant()
-{
-  for(int i = 0; i < g_repeat; i++) {
-    CALL_SUBTEST_1( determinant(Matrix<float, 1, 1>()) );
-    CALL_SUBTEST_2( determinant(Matrix<double, 2, 2>()) );
-    CALL_SUBTEST_3( determinant(Matrix<double, 3, 3>()) );
-    CALL_SUBTEST_4( determinant(Matrix<double, 4, 4>()) );
-    CALL_SUBTEST_5( determinant(Matrix<std::complex<double>, 10, 10>()) );
-    CALL_SUBTEST_6( determinant(MatrixXd(20, 20)) );
-  }
-  CALL_SUBTEST_6( determinant(MatrixXd(200, 200)) );
-}
diff --git a/test/eigen2/eigen2_dynalloc.cpp b/test/eigen2/eigen2_dynalloc.cpp
deleted file mode 100644
index 1891a9e33..000000000
--- a/test/eigen2/eigen2_dynalloc.cpp
+++ /dev/null
@@ -1,131 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#include "main.h"
-
-#if EIGEN_ARCH_WANTS_ALIGNMENT
-#define ALIGNMENT 16
-#else
-#define ALIGNMENT 1
-#endif
-
-void check_handmade_aligned_malloc()
-{
-  for(int i = 1; i < 1000; i++)
-  {
-    char *p = (char*)ei_handmade_aligned_malloc(i);
-    VERIFY(std::size_t(p)%ALIGNMENT==0);
-    // if the buffer is wrongly allocated this will give a bad write --> check with valgrind
-    for(int j = 0; j < i; j++) p[j]=0;
-    ei_handmade_aligned_free(p);
-  }
-}
-
-void check_aligned_malloc()
-{
-  for(int i = 1; i < 1000; i++)
-  {
-    char *p = (char*)ei_aligned_malloc(i);
-    VERIFY(std::size_t(p)%ALIGNMENT==0);
-    // if the buffer is wrongly allocated this will give a bad write --> check with valgrind
-    for(int j = 0; j < i; j++) p[j]=0;
-    ei_aligned_free(p);
-  }
-}
-
-void check_aligned_new()
-{
-  for(int i = 1; i < 1000; i++)
-  {
-    float *p = ei_aligned_new<float>(i);
-    VERIFY(std::size_t(p)%ALIGNMENT==0);
-    // if the buffer is wrongly allocated this will give a bad write --> check with valgrind
-    for(int j = 0; j < i; j++) p[j]=0;
-    ei_aligned_delete(p,i);
-  }
-}
-
-void check_aligned_stack_alloc()
-{
-  for(int i = 1; i < 1000; i++)
-  {
-    ei_declare_aligned_stack_constructed_variable(float, p, i, 0);
-    VERIFY(std::size_t(p)%ALIGNMENT==0);
-    // if the buffer is wrongly allocated this will give a bad write --> check with valgrind
-    for(int j = 0; j < i; j++) p[j]=0;
-  }
-}
-
-
-// test compilation with both a struct and a class...
-struct MyStruct
-{
-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW
-  char dummychar;
-  Vector4f avec;
-};
-
-class MyClassA
-{
-  public:
-    EIGEN_MAKE_ALIGNED_OPERATOR_NEW
-    char dummychar;
-    Vector4f avec;
-};
-
-template<typename T> void check_dynaligned()
-{
-  T* obj = new T;
-  VERIFY(std::size_t(obj)%ALIGNMENT==0);
-  delete obj;
-}
-
-void test_eigen2_dynalloc()
-{
-  // low level dynamic memory allocation
-  CALL_SUBTEST(check_handmade_aligned_malloc());
-  CALL_SUBTEST(check_aligned_malloc());
-  CALL_SUBTEST(check_aligned_new());
-  CALL_SUBTEST(check_aligned_stack_alloc());
-
-  for (int i=0; i<g_repeat*100; ++i)
-  {
-    CALL_SUBTEST( check_dynaligned<Vector4f>() );
-    CALL_SUBTEST( check_dynaligned<Vector2d>() );
-    CALL_SUBTEST( check_dynaligned<Matrix4f>() );
-    CALL_SUBTEST( check_dynaligned<Vector4d>() );
-    CALL_SUBTEST( check_dynaligned<Vector4i>() );
-  }
-  
-  // check static allocation, who knows ?
-  {
-    MyStruct foo0;  VERIFY(std::size_t(foo0.avec.data())%ALIGNMENT==0);
-    MyClassA fooA;  VERIFY(std::size_t(fooA.avec.data())%ALIGNMENT==0);
-  }
-
-  // dynamic allocation, single object
-  for (int i=0; i<g_repeat*100; ++i)
-  {
-    MyStruct *foo0 = new MyStruct();  VERIFY(std::size_t(foo0->avec.data())%ALIGNMENT==0);
-    MyClassA *fooA = new MyClassA();  VERIFY(std::size_t(fooA->avec.data())%ALIGNMENT==0);
-    delete foo0;
-    delete fooA;
-  }
-
-  // dynamic allocation, array
-  const int N = 10;
-  for (int i=0; i<g_repeat*100; ++i)
-  {
-    MyStruct *foo0 = new MyStruct[N];  VERIFY(std::size_t(foo0->avec.data())%ALIGNMENT==0);
-    MyClassA *fooA = new MyClassA[N];  VERIFY(std::size_t(fooA->avec.data())%ALIGNMENT==0);
-    delete[] foo0;
-    delete[] fooA;
-  }
-  
-}
diff --git a/test/eigen2/eigen2_eigensolver.cpp b/test/eigen2/eigen2_eigensolver.cpp
deleted file mode 100644
index 48b4ace43..000000000
--- a/test/eigen2/eigen2_eigensolver.cpp
+++ /dev/null
@@ -1,146 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#include "main.h"
-#include <Eigen/QR>
-
-#ifdef HAS_GSL
-#include "gsl_helper.h"
-#endif
-
-template<typename MatrixType> void selfadjointeigensolver(const MatrixType& m)
-{
-  /* this test covers the following files:
-     EigenSolver.h, SelfAdjointEigenSolver.h (and indirectly: Tridiagonalization.h)
-  */
-  int rows = m.rows();
-  int cols = m.cols();
-
-  typedef typename MatrixType::Scalar Scalar;
-  typedef typename NumTraits<Scalar>::Real RealScalar;
-  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
-  typedef Matrix<RealScalar, MatrixType::RowsAtCompileTime, 1> RealVectorType;
-  typedef typename std::complex<typename NumTraits<typename MatrixType::Scalar>::Real> Complex;
-
-  RealScalar largerEps = 10*test_precision<RealScalar>();
-
-  MatrixType a = MatrixType::Random(rows,cols);
-  MatrixType a1 = MatrixType::Random(rows,cols);
-  MatrixType symmA =  a.adjoint() * a + a1.adjoint() * a1;
-
-  MatrixType b = MatrixType::Random(rows,cols);
-  MatrixType b1 = MatrixType::Random(rows,cols);
-  MatrixType symmB = b.adjoint() * b + b1.adjoint() * b1;
-
-  SelfAdjointEigenSolver<MatrixType> eiSymm(symmA);
-  // generalized eigen pb
-  SelfAdjointEigenSolver<MatrixType> eiSymmGen(symmA, symmB);
-
-  #ifdef HAS_GSL
-  if (ei_is_same_type<RealScalar,double>::ret)
-  {
-    typedef GslTraits<Scalar> Gsl;
-    typename Gsl::Matrix gEvec=0, gSymmA=0, gSymmB=0;
-    typename GslTraits<RealScalar>::Vector gEval=0;
-    RealVectorType _eval;
-    MatrixType _evec;
-    convert<MatrixType>(symmA, gSymmA);
-    convert<MatrixType>(symmB, gSymmB);
-    convert<MatrixType>(symmA, gEvec);
-    gEval = GslTraits<RealScalar>::createVector(rows);
-
-    Gsl::eigen_symm(gSymmA, gEval, gEvec);
-    convert(gEval, _eval);
-    convert(gEvec, _evec);
-
-    // test gsl itself !
-    VERIFY((symmA * _evec).isApprox(_evec * _eval.asDiagonal(), largerEps));
-
-    // compare with eigen
-    VERIFY_IS_APPROX(_eval, eiSymm.eigenvalues());
-    VERIFY_IS_APPROX(_evec.cwise().abs(), eiSymm.eigenvectors().cwise().abs());
-
-    // generalized pb
-    Gsl::eigen_symm_gen(gSymmA, gSymmB, gEval, gEvec);
-    convert(gEval, _eval);
-    convert(gEvec, _evec);
-    // test GSL itself:
-    VERIFY((symmA * _evec).isApprox(symmB * (_evec * _eval.asDiagonal()), largerEps));
-
-    // compare with eigen
-    MatrixType normalized_eivec = eiSymmGen.eigenvectors()*eiSymmGen.eigenvectors().colwise().norm().asDiagonal().inverse();
-    VERIFY_IS_APPROX(_eval, eiSymmGen.eigenvalues());
-    VERIFY_IS_APPROX(_evec.cwiseAbs(), normalized_eivec.cwiseAbs());
-
-    Gsl::free(gSymmA);
-    Gsl::free(gSymmB);
-    GslTraits<RealScalar>::free(gEval);
-    Gsl::free(gEvec);
-  }
-  #endif
-
-  VERIFY((symmA * eiSymm.eigenvectors()).isApprox(
-          eiSymm.eigenvectors() * eiSymm.eigenvalues().asDiagonal(), largerEps));
-
-  // generalized eigen problem Ax = lBx
-  VERIFY((symmA * eiSymmGen.eigenvectors()).isApprox(
-          symmB * (eiSymmGen.eigenvectors() * eiSymmGen.eigenvalues().asDiagonal()), largerEps));
-
-  MatrixType sqrtSymmA = eiSymm.operatorSqrt();
-  VERIFY_IS_APPROX(symmA, sqrtSymmA*sqrtSymmA);
-  VERIFY_IS_APPROX(sqrtSymmA, symmA*eiSymm.operatorInverseSqrt());
-}
-
-template<typename MatrixType> void eigensolver(const MatrixType& m)
-{
-  /* this test covers the following files:
-     EigenSolver.h
-  */
-  int rows = m.rows();
-  int cols = m.cols();
-
-  typedef typename MatrixType::Scalar Scalar;
-  typedef typename NumTraits<Scalar>::Real RealScalar;
-  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
-  typedef Matrix<RealScalar, MatrixType::RowsAtCompileTime, 1> RealVectorType;
-  typedef typename std::complex<typename NumTraits<typename MatrixType::Scalar>::Real> Complex;
-
-  // RealScalar largerEps = 10*test_precision<RealScalar>();
-
-  MatrixType a = MatrixType::Random(rows,cols);
-  MatrixType a1 = MatrixType::Random(rows,cols);
-  MatrixType symmA =  a.adjoint() * a + a1.adjoint() * a1;
-
-  EigenSolver<MatrixType> ei0(symmA);
-  VERIFY_IS_APPROX(symmA * ei0.pseudoEigenvectors(), ei0.pseudoEigenvectors() * ei0.pseudoEigenvalueMatrix());
-  VERIFY_IS_APPROX((symmA.template cast<Complex>()) * (ei0.pseudoEigenvectors().template cast<Complex>()),
-    (ei0.pseudoEigenvectors().template cast<Complex>()) * (ei0.eigenvalues().asDiagonal()));
-
-  EigenSolver<MatrixType> ei1(a);
-  VERIFY_IS_APPROX(a * ei1.pseudoEigenvectors(), ei1.pseudoEigenvectors() * ei1.pseudoEigenvalueMatrix());
-  VERIFY_IS_APPROX(a.template cast<Complex>() * ei1.eigenvectors(),
-                   ei1.eigenvectors() * ei1.eigenvalues().asDiagonal());
-
-}
-
-void test_eigen2_eigensolver()
-{
-  for(int i = 0; i < g_repeat; i++) {
-    // very important to test a 3x3 matrix since we provide a special path for it
-    CALL_SUBTEST_1( selfadjointeigensolver(Matrix3f()) );
-    CALL_SUBTEST_2( selfadjointeigensolver(Matrix4d()) );
-    CALL_SUBTEST_3( selfadjointeigensolver(MatrixXf(7,7)) );
-    CALL_SUBTEST_4( selfadjointeigensolver(MatrixXcd(5,5)) );
-    CALL_SUBTEST_5( selfadjointeigensolver(MatrixXd(19,19)) );
-
-    CALL_SUBTEST_6( eigensolver(Matrix4f()) );
-    CALL_SUBTEST_5( eigensolver(MatrixXd(17,17)) );
-  }
-}
-
diff --git a/test/eigen2/eigen2_first_aligned.cpp b/test/eigen2/eigen2_first_aligned.cpp
deleted file mode 100644
index 51bb3cad1..000000000
--- a/test/eigen2/eigen2_first_aligned.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#include "main.h"
-
-template<typename Scalar>
-void test_eigen2_first_aligned_helper(Scalar *array, int size)
-{
-  const int packet_size = sizeof(Scalar) * ei_packet_traits<Scalar>::size;
-  VERIFY(((std::size_t(array) + sizeof(Scalar) * ei_alignmentOffset(array, size)) % packet_size) == 0);
-}
-
-template<typename Scalar>
-void test_eigen2_none_aligned_helper(Scalar *array, int size)
-{
-  VERIFY(ei_packet_traits<Scalar>::size == 1 || ei_alignmentOffset(array, size) == size);
-}
-
-struct some_non_vectorizable_type { float x; };
-
-void test_eigen2_first_aligned()
-{
-  EIGEN_ALIGN_128 float array_float[100];
-  test_first_aligned_helper(array_float, 50);
-  test_first_aligned_helper(array_float+1, 50);
-  test_first_aligned_helper(array_float+2, 50);
-  test_first_aligned_helper(array_float+3, 50);
-  test_first_aligned_helper(array_float+4, 50);
-  test_first_aligned_helper(array_float+5, 50);
-  
-  EIGEN_ALIGN_128 double array_double[100];
-  test_first_aligned_helper(array_double, 50);
-  test_first_aligned_helper(array_double+1, 50);
-  test_first_aligned_helper(array_double+2, 50);
-  
-  double *array_double_plus_4_bytes = (double*)(std::size_t(array_double)+4);
-  test_none_aligned_helper(array_double_plus_4_bytes, 50);
-  test_none_aligned_helper(array_double_plus_4_bytes+1, 50);
-  
-  some_non_vectorizable_type array_nonvec[100];
-  test_first_aligned_helper(array_nonvec, 100);
-  test_none_aligned_helper(array_nonvec, 100);
-}
diff --git a/test/eigen2/eigen2_geometry.cpp b/test/eigen2/eigen2_geometry.cpp
deleted file mode 100644
index 514040774..000000000
--- a/test/eigen2/eigen2_geometry.cpp
+++ /dev/null
@@ -1,432 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#include "main.h"
-#include <Eigen/Geometry>
-#include <Eigen/LU>
-#include <Eigen/SVD>
-
-template<typename Scalar> void geometry(void)
-{
-  /* this test covers the following files:
-     Cross.h Quaternion.h, Transform.cpp
-  */
-
-  typedef Matrix<Scalar,2,2> Matrix2;
-  typedef Matrix<Scalar,3,3> Matrix3;
-  typedef Matrix<Scalar,4,4> Matrix4;
-  typedef Matrix<Scalar,2,1> Vector2;
-  typedef Matrix<Scalar,3,1> Vector3;
-  typedef Matrix<Scalar,4,1> Vector4;
-  typedef Quaternion<Scalar> Quaternionx;
-  typedef AngleAxis<Scalar> AngleAxisx;
-  typedef Transform<Scalar,2> Transform2;
-  typedef Transform<Scalar,3> Transform3;
-  typedef Scaling<Scalar,2> Scaling2;
-  typedef Scaling<Scalar,3> Scaling3;
-  typedef Translation<Scalar,2> Translation2;
-  typedef Translation<Scalar,3> Translation3;
-
-  Scalar largeEps = test_precision<Scalar>();
-  if (ei_is_same_type<Scalar,float>::ret)
-    largeEps = 1e-2f;
-
-  Vector3 v0 = Vector3::Random(),
-    v1 = Vector3::Random(),
-    v2 = Vector3::Random();
-  Vector2 u0 = Vector2::Random();
-  Matrix3 matrot1;
-
-  Scalar a = ei_random<Scalar>(-Scalar(M_PI), Scalar(M_PI));
-
-  // cross product
-  VERIFY_IS_MUCH_SMALLER_THAN(v1.cross(v2).eigen2_dot(v1), Scalar(1));
-  Matrix3 m;
-  m << v0.normalized(),
-      (v0.cross(v1)).normalized(),
-      (v0.cross(v1).cross(v0)).normalized();
-  VERIFY(m.isUnitary());
-
-  // Quaternion: Identity(), setIdentity();
-  Quaternionx q1, q2;
-  q2.setIdentity();
-  VERIFY_IS_APPROX(Quaternionx(Quaternionx::Identity()).coeffs(), q2.coeffs());
-  q1.coeffs().setRandom();
-  VERIFY_IS_APPROX(q1.coeffs(), (q1*q2).coeffs());
-
-  // unitOrthogonal
-  VERIFY_IS_MUCH_SMALLER_THAN(u0.unitOrthogonal().eigen2_dot(u0), Scalar(1));
-  VERIFY_IS_MUCH_SMALLER_THAN(v0.unitOrthogonal().eigen2_dot(v0), Scalar(1));
-  VERIFY_IS_APPROX(u0.unitOrthogonal().norm(), Scalar(1));
-  VERIFY_IS_APPROX(v0.unitOrthogonal().norm(), Scalar(1));
-
-
-  VERIFY_IS_APPROX(v0, AngleAxisx(a, v0.normalized()) * v0);
-  VERIFY_IS_APPROX(-v0, AngleAxisx(Scalar(M_PI), v0.unitOrthogonal()) * v0);
-  VERIFY_IS_APPROX(ei_cos(a)*v0.squaredNorm(), v0.eigen2_dot(AngleAxisx(a, v0.unitOrthogonal()) * v0));
-  m = AngleAxisx(a, v0.normalized()).toRotationMatrix().adjoint();
-  VERIFY_IS_APPROX(Matrix3::Identity(), m * AngleAxisx(a, v0.normalized()));
-  VERIFY_IS_APPROX(Matrix3::Identity(), AngleAxisx(a, v0.normalized()) * m);
-
-  q1 = AngleAxisx(a, v0.normalized());
-  q2 = AngleAxisx(a, v1.normalized());
-
-  // angular distance
-  Scalar refangle = ei_abs(AngleAxisx(q1.inverse()*q2).angle());
-  if (refangle>Scalar(M_PI))
-    refangle = Scalar(2)*Scalar(M_PI) - refangle;
-  
-  if((q1.coeffs()-q2.coeffs()).norm() > 10*largeEps)
-  {
-    VERIFY(ei_isApprox(q1.angularDistance(q2), refangle, largeEps));
-  }
-
-  // rotation matrix conversion
-  VERIFY_IS_APPROX(q1 * v2, q1.toRotationMatrix() * v2);
-  VERIFY_IS_APPROX(q1 * q2 * v2,
-    q1.toRotationMatrix() * q2.toRotationMatrix() * v2);
-
-  VERIFY( (q2*q1).isApprox(q1*q2, largeEps) || !(q2 * q1 * v2).isApprox(
-    q1.toRotationMatrix() * q2.toRotationMatrix() * v2));
-
-  q2 = q1.toRotationMatrix();
-  VERIFY_IS_APPROX(q1*v1,q2*v1);
-
-  matrot1 = AngleAxisx(Scalar(0.1), Vector3::UnitX())
-          * AngleAxisx(Scalar(0.2), Vector3::UnitY())
-          * AngleAxisx(Scalar(0.3), Vector3::UnitZ());
-  VERIFY_IS_APPROX(matrot1 * v1,
-       AngleAxisx(Scalar(0.1), Vector3(1,0,0)).toRotationMatrix()
-    * (AngleAxisx(Scalar(0.2), Vector3(0,1,0)).toRotationMatrix()
-    * (AngleAxisx(Scalar(0.3), Vector3(0,0,1)).toRotationMatrix() * v1)));
-
-  // angle-axis conversion
-  AngleAxisx aa = q1;
-  VERIFY_IS_APPROX(q1 * v1, Quaternionx(aa) * v1);
-  VERIFY_IS_NOT_APPROX(q1 * v1, Quaternionx(AngleAxisx(aa.angle()*2,aa.axis())) * v1);
-
-  // from two vector creation
-  VERIFY_IS_APPROX(v2.normalized(),(q2.setFromTwoVectors(v1,v2)*v1).normalized());
-  VERIFY_IS_APPROX(v2.normalized(),(q2.setFromTwoVectors(v1,v2)*v1).normalized());
-
-  // inverse and conjugate
-  VERIFY_IS_APPROX(q1 * (q1.inverse() * v1), v1);
-  VERIFY_IS_APPROX(q1 * (q1.conjugate() * v1), v1);
-
-  // AngleAxis
-  VERIFY_IS_APPROX(AngleAxisx(a,v1.normalized()).toRotationMatrix(),
-    Quaternionx(AngleAxisx(a,v1.normalized())).toRotationMatrix());
-
-  AngleAxisx aa1;
-  m = q1.toRotationMatrix();
-  aa1 = m;
-  VERIFY_IS_APPROX(AngleAxisx(m).toRotationMatrix(),
-    Quaternionx(m).toRotationMatrix());
-
-  // Transform
-  // TODO complete the tests !
-  a = 0;
-  while (ei_abs(a)<Scalar(0.1))
-    a = ei_random<Scalar>(-Scalar(0.4)*Scalar(M_PI), Scalar(0.4)*Scalar(M_PI));
-  q1 = AngleAxisx(a, v0.normalized());
-  Transform3 t0, t1, t2;
-  // first test setIdentity() and Identity()
-  t0.setIdentity();
-  VERIFY_IS_APPROX(t0.matrix(), Transform3::MatrixType::Identity());
-  t0.matrix().setZero();
-  t0 = Transform3::Identity();
-  VERIFY_IS_APPROX(t0.matrix(), Transform3::MatrixType::Identity());
-
-  t0.linear() = q1.toRotationMatrix();
-  t1.setIdentity();
-  t1.linear() = q1.toRotationMatrix();
-
-  v0 << 50, 2, 1;//= ei_random_matrix<Vector3>().cwiseProduct(Vector3(10,2,0.5));
-  t0.scale(v0);
-  t1.prescale(v0);
-
-  VERIFY_IS_APPROX( (t0 * Vector3(1,0,0)).norm(), v0.x());
-  //VERIFY(!ei_isApprox((t1 * Vector3(1,0,0)).norm(), v0.x()));
-
-  t0.setIdentity();
-  t1.setIdentity();
-  v1 << 1, 2, 3;
-  t0.linear() = q1.toRotationMatrix();
-  t0.pretranslate(v0);
-  t0.scale(v1);
-  t1.linear() = q1.conjugate().toRotationMatrix();
-  t1.prescale(v1.cwise().inverse());
-  t1.translate(-v0);
-
-  VERIFY((t0.matrix() * t1.matrix()).isIdentity(test_precision<Scalar>()));
-
-  t1.fromPositionOrientationScale(v0, q1, v1);
-  VERIFY_IS_APPROX(t1.matrix(), t0.matrix());
-  VERIFY_IS_APPROX(t1*v1, t0*v1);
-
-  t0.setIdentity(); t0.scale(v0).rotate(q1.toRotationMatrix());
-  t1.setIdentity(); t1.scale(v0).rotate(q1);
-  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
-
-  t0.setIdentity(); t0.scale(v0).rotate(AngleAxisx(q1));
-  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
-
-  VERIFY_IS_APPROX(t0.scale(a).matrix(), t1.scale(Vector3::Constant(a)).matrix());
-  VERIFY_IS_APPROX(t0.prescale(a).matrix(), t1.prescale(Vector3::Constant(a)).matrix());
-
-  // More transform constructors, operator=, operator*=
-
-  Matrix3 mat3 = Matrix3::Random();
-  Matrix4 mat4;
-  mat4 << mat3 , Vector3::Zero() , Vector4::Zero().transpose();
-  Transform3 tmat3(mat3), tmat4(mat4);
-  tmat4.matrix()(3,3) = Scalar(1);
-  VERIFY_IS_APPROX(tmat3.matrix(), tmat4.matrix());
-
-  Scalar a3 = ei_random<Scalar>(-Scalar(M_PI), Scalar(M_PI));
-  Vector3 v3 = Vector3::Random().normalized();
-  AngleAxisx aa3(a3, v3);
-  Transform3 t3(aa3);
-  Transform3 t4;
-  t4 = aa3;
-  VERIFY_IS_APPROX(t3.matrix(), t4.matrix());
-  t4.rotate(AngleAxisx(-a3,v3));
-  VERIFY_IS_APPROX(t4.matrix(), Matrix4::Identity());
-  t4 *= aa3;
-  VERIFY_IS_APPROX(t3.matrix(), t4.matrix());
-
-  v3 = Vector3::Random();
-  Translation3 tv3(v3);
-  Transform3 t5(tv3);
-  t4 = tv3;
-  VERIFY_IS_APPROX(t5.matrix(), t4.matrix());
-  t4.translate(-v3);
-  VERIFY_IS_APPROX(t4.matrix(), Matrix4::Identity());
-  t4 *= tv3;
-  VERIFY_IS_APPROX(t5.matrix(), t4.matrix());
-
-  Scaling3 sv3(v3);
-  Transform3 t6(sv3);
-  t4 = sv3;
-  VERIFY_IS_APPROX(t6.matrix(), t4.matrix());
-  t4.scale(v3.cwise().inverse());
-  VERIFY_IS_APPROX(t4.matrix(), Matrix4::Identity());
-  t4 *= sv3;
-  VERIFY_IS_APPROX(t6.matrix(), t4.matrix());
-
-  // matrix * transform
-  VERIFY_IS_APPROX(Transform3(t3.matrix()*t4).matrix(), Transform3(t3*t4).matrix());
-
-  // chained Transform product
-  VERIFY_IS_APPROX(((t3*t4)*t5).matrix(), (t3*(t4*t5)).matrix());
-
-  // check that Transform product doesn't have aliasing problems
-  t5 = t4;
-  t5 = t5*t5;
-  VERIFY_IS_APPROX(t5, t4*t4);
-
-  // 2D transformation
-  Transform2 t20, t21;
-  Vector2 v20 = Vector2::Random();
-  Vector2 v21 = Vector2::Random();
-  for (int k=0; k<2; ++k)
-    if (ei_abs(v21[k])<Scalar(1e-3)) v21[k] = Scalar(1e-3);
-  t21.setIdentity();
-  t21.linear() = Rotation2D<Scalar>(a).toRotationMatrix();
-  VERIFY_IS_APPROX(t20.fromPositionOrientationScale(v20,a,v21).matrix(),
-    t21.pretranslate(v20).scale(v21).matrix());
-
-  t21.setIdentity();
-  t21.linear() = Rotation2D<Scalar>(-a).toRotationMatrix();
-  VERIFY( (t20.fromPositionOrientationScale(v20,a,v21)
-        * (t21.prescale(v21.cwise().inverse()).translate(-v20))).matrix().isIdentity(test_precision<Scalar>()) );
-
-  // Transform - new API
-  // 3D
-  t0.setIdentity();
-  t0.rotate(q1).scale(v0).translate(v0);
-  // mat * scaling and mat * translation
-  t1 = (Matrix3(q1) * Scaling3(v0)) * Translation3(v0);
-  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
-  // mat * transformation and scaling * translation
-  t1 = Matrix3(q1) * (Scaling3(v0) * Translation3(v0));
-  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
-
-  t0.setIdentity();
-  t0.prerotate(q1).prescale(v0).pretranslate(v0);
-  // translation * scaling and transformation * mat
-  t1 = (Translation3(v0) * Scaling3(v0)) * Matrix3(q1);
-  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
-  // scaling * mat and translation * mat
-  t1 = Translation3(v0) * (Scaling3(v0) * Matrix3(q1));
-  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
-
-  t0.setIdentity();
-  t0.scale(v0).translate(v0).rotate(q1);
-  // translation * mat and scaling * transformation
-  t1 = Scaling3(v0) * (Translation3(v0) * Matrix3(q1));
-  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
-  // transformation * scaling
-  t0.scale(v0);
-  t1 = t1 * Scaling3(v0);
-  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
-  // transformation * translation
-  t0.translate(v0);
-  t1 = t1 * Translation3(v0);
-  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
-  // translation * transformation
-  t0.pretranslate(v0);
-  t1 = Translation3(v0) * t1;
-  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
-
-  // transform * quaternion
-  t0.rotate(q1);
-  t1 = t1 * q1;
-  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
-
-  // translation * quaternion
-  t0.translate(v1).rotate(q1);
-  t1 = t1 * (Translation3(v1) * q1);
-  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
-
-  // scaling * quaternion
-  t0.scale(v1).rotate(q1);
-  t1 = t1 * (Scaling3(v1) * q1);
-  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
-
-  // quaternion * transform
-  t0.prerotate(q1);
-  t1 = q1 * t1;
-  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
-
-  // quaternion * translation
-  t0.rotate(q1).translate(v1);
-  t1 = t1 * (q1 * Translation3(v1));
-  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
-
-  // quaternion * scaling
-  t0.rotate(q1).scale(v1);
-  t1 = t1 * (q1 * Scaling3(v1));
-  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
-
-  // translation * vector
-  t0.setIdentity();
-  t0.translate(v0);
-  VERIFY_IS_APPROX(t0 * v1, Translation3(v0) * v1);
-
-  // scaling * vector
-  t0.setIdentity();
-  t0.scale(v0);
-  VERIFY_IS_APPROX(t0 * v1, Scaling3(v0) * v1);
-
-  // test transform inversion
-  t0.setIdentity();
-  t0.translate(v0);
-  t0.linear().setRandom();
-  VERIFY_IS_APPROX(t0.inverse(Affine), t0.matrix().inverse());
-  t0.setIdentity();
-  t0.translate(v0).rotate(q1);
-  VERIFY_IS_APPROX(t0.inverse(Isometry), t0.matrix().inverse());
-
-  // test extract rotation and scaling
-  t0.setIdentity();
-  t0.translate(v0).rotate(q1).scale(v1);
-  VERIFY_IS_APPROX(t0.rotation() * v1, Matrix3(q1) * v1);
-
-  Matrix3 mat_rotation, mat_scaling;
-  t0.setIdentity();
-  t0.translate(v0).rotate(q1).scale(v1);
-  t0.computeRotationScaling(&mat_rotation, &mat_scaling);
-  VERIFY_IS_APPROX(t0.linear(), mat_rotation * mat_scaling);
-  VERIFY_IS_APPROX(mat_rotation*mat_rotation.adjoint(), Matrix3::Identity());
-  VERIFY_IS_APPROX(mat_rotation.determinant(), Scalar(1));
-  t0.computeScalingRotation(&mat_scaling, &mat_rotation);
-  VERIFY_IS_APPROX(t0.linear(), mat_scaling * mat_rotation);
-  VERIFY_IS_APPROX(mat_rotation*mat_rotation.adjoint(), Matrix3::Identity());
-  VERIFY_IS_APPROX(mat_rotation.determinant(), Scalar(1));
-
-  // test casting
-  Transform<float,3> t1f = t1.template cast<float>();
-  VERIFY_IS_APPROX(t1f.template cast<Scalar>(),t1);
-  Transform<double,3> t1d = t1.template cast<double>();
-  VERIFY_IS_APPROX(t1d.template cast<Scalar>(),t1);
-
-  Translation3 tr1(v0);
-  Translation<float,3> tr1f = tr1.template cast<float>();
-  VERIFY_IS_APPROX(tr1f.template cast<Scalar>(),tr1);
-  Translation<double,3> tr1d = tr1.template cast<double>();
-  VERIFY_IS_APPROX(tr1d.template cast<Scalar>(),tr1);
-
-  Scaling3 sc1(v0);
-  Scaling<float,3> sc1f = sc1.template cast<float>();
-  VERIFY_IS_APPROX(sc1f.template cast<Scalar>(),sc1);
-  Scaling<double,3> sc1d = sc1.template cast<double>();
-  VERIFY_IS_APPROX(sc1d.template cast<Scalar>(),sc1);
-
-  Quaternion<float> q1f = q1.template cast<float>();
-  VERIFY_IS_APPROX(q1f.template cast<Scalar>(),q1);
-  Quaternion<double> q1d = q1.template cast<double>();
-  VERIFY_IS_APPROX(q1d.template cast<Scalar>(),q1);
-
-  AngleAxis<float> aa1f = aa1.template cast<float>();
-  VERIFY_IS_APPROX(aa1f.template cast<Scalar>(),aa1);
-  AngleAxis<double> aa1d = aa1.template cast<double>();
-  VERIFY_IS_APPROX(aa1d.template cast<Scalar>(),aa1);
-
-  Rotation2D<Scalar> r2d1(ei_random<Scalar>());
-  Rotation2D<float> r2d1f = r2d1.template cast<float>();
-  VERIFY_IS_APPROX(r2d1f.template cast<Scalar>(),r2d1);
-  Rotation2D<double> r2d1d = r2d1.template cast<double>();
-  VERIFY_IS_APPROX(r2d1d.template cast<Scalar>(),r2d1);
-
-  m = q1;
-//   m.col(1) = Vector3(0,ei_random<Scalar>(),ei_random<Scalar>()).normalized();
-//   m.col(0) = Vector3(-1,0,0).normalized();
-//   m.col(2) = m.col(0).cross(m.col(1));
-  #define VERIFY_EULER(I,J,K, X,Y,Z) { \
-    Vector3 ea = m.eulerAngles(I,J,K); \
-    Matrix3 m1 = Matrix3(AngleAxisx(ea[0], Vector3::Unit##X()) * AngleAxisx(ea[1], Vector3::Unit##Y()) * AngleAxisx(ea[2], Vector3::Unit##Z())); \
-    VERIFY_IS_APPROX(m, m1); \
-    VERIFY_IS_APPROX(m,  Matrix3(AngleAxisx(ea[0], Vector3::Unit##X()) * AngleAxisx(ea[1], Vector3::Unit##Y()) * AngleAxisx(ea[2], Vector3::Unit##Z()))); \
-  }
-  VERIFY_EULER(0,1,2, X,Y,Z);
-  VERIFY_EULER(0,1,0, X,Y,X);
-  VERIFY_EULER(0,2,1, X,Z,Y);
-  VERIFY_EULER(0,2,0, X,Z,X);
-
-  VERIFY_EULER(1,2,0, Y,Z,X);
-  VERIFY_EULER(1,2,1, Y,Z,Y);
-  VERIFY_EULER(1,0,2, Y,X,Z);
-  VERIFY_EULER(1,0,1, Y,X,Y);
-
-  VERIFY_EULER(2,0,1, Z,X,Y);
-  VERIFY_EULER(2,0,2, Z,X,Z);
-  VERIFY_EULER(2,1,0, Z,Y,X);
-  VERIFY_EULER(2,1,2, Z,Y,Z);
-
-  // colwise/rowwise cross product
-  mat3.setRandom();
-  Vector3 vec3 = Vector3::Random();
-  Matrix3 mcross;
-  int i = ei_random<int>(0,2);
-  mcross = mat3.colwise().cross(vec3);
-  VERIFY_IS_APPROX(mcross.col(i), mat3.col(i).cross(vec3));
-  mcross = mat3.rowwise().cross(vec3);
-  VERIFY_IS_APPROX(mcross.row(i), mat3.row(i).cross(vec3));
-
-
-}
-
-void test_eigen2_geometry()
-{
-  for(int i = 0; i < g_repeat; i++) {
-    CALL_SUBTEST_1( geometry<float>() );
-    CALL_SUBTEST_2( geometry<double>() );
-  }
-}
diff --git a/test/eigen2/eigen2_geometry_with_eigen2_prefix.cpp b/test/eigen2/eigen2_geometry_with_eigen2_prefix.cpp
deleted file mode 100644
index 12d4a71c3..000000000
--- a/test/eigen2/eigen2_geometry_with_eigen2_prefix.cpp
+++ /dev/null
@@ -1,435 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#define EIGEN2_SUPPORT_STAGE15_RESOLVE_API_CONFLICTS_WARN
-
-#include "main.h"
-#include <Eigen/Geometry>
-#include <Eigen/LU>
-#include <Eigen/SVD>
-
-template<typename Scalar> void geometry(void)
-{
-  /* this test covers the following files:
-     Cross.h Quaternion.h, Transform.cpp
-  */
-
-  typedef Matrix<Scalar,2,2> Matrix2;
-  typedef Matrix<Scalar,3,3> Matrix3;
-  typedef Matrix<Scalar,4,4> Matrix4;
-  typedef Matrix<Scalar,2,1> Vector2;
-  typedef Matrix<Scalar,3,1> Vector3;
-  typedef Matrix<Scalar,4,1> Vector4;
-  typedef eigen2_Quaternion<Scalar> Quaternionx;
-  typedef eigen2_AngleAxis<Scalar> AngleAxisx;
-  typedef eigen2_Transform<Scalar,2> Transform2;
-  typedef eigen2_Transform<Scalar,3> Transform3;
-  typedef eigen2_Scaling<Scalar,2> Scaling2;
-  typedef eigen2_Scaling<Scalar,3> Scaling3;
-  typedef eigen2_Translation<Scalar,2> Translation2;
-  typedef eigen2_Translation<Scalar,3> Translation3;
-
-  Scalar largeEps = test_precision<Scalar>();
-  if (ei_is_same_type<Scalar,float>::ret)
-    largeEps = 1e-2f;
-
-  Vector3 v0 = Vector3::Random(),
-    v1 = Vector3::Random(),
-    v2 = Vector3::Random();
-  Vector2 u0 = Vector2::Random();
-  Matrix3 matrot1;
-
-  Scalar a = ei_random<Scalar>(-Scalar(M_PI), Scalar(M_PI));
-
-  // cross product
-  VERIFY_IS_MUCH_SMALLER_THAN(v1.cross(v2).eigen2_dot(v1), Scalar(1));
-  Matrix3 m;
-  m << v0.normalized(),
-      (v0.cross(v1)).normalized(),
-      (v0.cross(v1).cross(v0)).normalized();
-  VERIFY(m.isUnitary());
-
-  // Quaternion: Identity(), setIdentity();
-  Quaternionx q1, q2;
-  q2.setIdentity();
-  VERIFY_IS_APPROX(Quaternionx(Quaternionx::Identity()).coeffs(), q2.coeffs());
-  q1.coeffs().setRandom();
-  VERIFY_IS_APPROX(q1.coeffs(), (q1*q2).coeffs());
-
-  // unitOrthogonal
-  VERIFY_IS_MUCH_SMALLER_THAN(u0.unitOrthogonal().eigen2_dot(u0), Scalar(1));
-  VERIFY_IS_MUCH_SMALLER_THAN(v0.unitOrthogonal().eigen2_dot(v0), Scalar(1));
-  VERIFY_IS_APPROX(u0.unitOrthogonal().norm(), Scalar(1));
-  VERIFY_IS_APPROX(v0.unitOrthogonal().norm(), Scalar(1));
-
-
-  VERIFY_IS_APPROX(v0, AngleAxisx(a, v0.normalized()) * v0);
-  VERIFY_IS_APPROX(-v0, AngleAxisx(Scalar(M_PI), v0.unitOrthogonal()) * v0);
-  VERIFY_IS_APPROX(ei_cos(a)*v0.squaredNorm(), v0.eigen2_dot(AngleAxisx(a, v0.unitOrthogonal()) * v0));
-  m = AngleAxisx(a, v0.normalized()).toRotationMatrix().adjoint();
-  VERIFY_IS_APPROX(Matrix3::Identity(), m * AngleAxisx(a, v0.normalized()));
-  VERIFY_IS_APPROX(Matrix3::Identity(), AngleAxisx(a, v0.normalized()) * m);
-
-  q1 = AngleAxisx(a, v0.normalized());
-  q2 = AngleAxisx(a, v1.normalized());
-
-  // angular distance
-  Scalar refangle = ei_abs(AngleAxisx(q1.inverse()*q2).angle());
-  if (refangle>Scalar(M_PI))
-    refangle = Scalar(2)*Scalar(M_PI) - refangle;
-  
-  if((q1.coeffs()-q2.coeffs()).norm() > 10*largeEps)
-  {
-    VERIFY(ei_isApprox(q1.angularDistance(q2), refangle, largeEps));
-  }
-
-  // rotation matrix conversion
-  VERIFY_IS_APPROX(q1 * v2, q1.toRotationMatrix() * v2);
-  VERIFY_IS_APPROX(q1 * q2 * v2,
-    q1.toRotationMatrix() * q2.toRotationMatrix() * v2);
-
-  VERIFY( (q2*q1).isApprox(q1*q2, largeEps) || !(q2 * q1 * v2).isApprox(
-    q1.toRotationMatrix() * q2.toRotationMatrix() * v2));
-
-  q2 = q1.toRotationMatrix();
-  VERIFY_IS_APPROX(q1*v1,q2*v1);
-
-  matrot1 = AngleAxisx(Scalar(0.1), Vector3::UnitX())
-          * AngleAxisx(Scalar(0.2), Vector3::UnitY())
-          * AngleAxisx(Scalar(0.3), Vector3::UnitZ());
-  VERIFY_IS_APPROX(matrot1 * v1,
-       AngleAxisx(Scalar(0.1), Vector3(1,0,0)).toRotationMatrix()
-    * (AngleAxisx(Scalar(0.2), Vector3(0,1,0)).toRotationMatrix()
-    * (AngleAxisx(Scalar(0.3), Vector3(0,0,1)).toRotationMatrix() * v1)));
-
-  // angle-axis conversion
-  AngleAxisx aa = q1;
-  VERIFY_IS_APPROX(q1 * v1, Quaternionx(aa) * v1);
-  VERIFY_IS_NOT_APPROX(q1 * v1, Quaternionx(AngleAxisx(aa.angle()*2,aa.axis())) * v1);
-
-  // from two vector creation
-  VERIFY_IS_APPROX(v2.normalized(),(q2.setFromTwoVectors(v1,v2)*v1).normalized());
-  VERIFY_IS_APPROX(v2.normalized(),(q2.setFromTwoVectors(v1,v2)*v1).normalized());
-
-  // inverse and conjugate
-  VERIFY_IS_APPROX(q1 * (q1.inverse() * v1), v1);
-  VERIFY_IS_APPROX(q1 * (q1.conjugate() * v1), v1);
-
-  // AngleAxis
-  VERIFY_IS_APPROX(AngleAxisx(a,v1.normalized()).toRotationMatrix(),
-    Quaternionx(AngleAxisx(a,v1.normalized())).toRotationMatrix());
-
-  AngleAxisx aa1;
-  m = q1.toRotationMatrix();
-  aa1 = m;
-  VERIFY_IS_APPROX(AngleAxisx(m).toRotationMatrix(),
-    Quaternionx(m).toRotationMatrix());
-
-  // Transform
-  // TODO complete the tests !
-  a = 0;
-  while (ei_abs(a)<Scalar(0.1))
-    a = ei_random<Scalar>(-Scalar(0.4)*Scalar(M_PI), Scalar(0.4)*Scalar(M_PI));
-  q1 = AngleAxisx(a, v0.normalized());
-  Transform3 t0, t1, t2;
-  // first test setIdentity() and Identity()
-  t0.setIdentity();
-  VERIFY_IS_APPROX(t0.matrix(), Transform3::MatrixType::Identity());
-  t0.matrix().setZero();
-  t0 = Transform3::Identity();
-  VERIFY_IS_APPROX(t0.matrix(), Transform3::MatrixType::Identity());
-
-  t0.linear() = q1.toRotationMatrix();
-  t1.setIdentity();
-  t1.linear() = q1.toRotationMatrix();
-
-  v0 << 50, 2, 1;//= ei_random_matrix<Vector3>().cwiseProduct(Vector3(10,2,0.5));
-  t0.scale(v0);
-  t1.prescale(v0);
-
-  VERIFY_IS_APPROX( (t0 * Vector3(1,0,0)).norm(), v0.x());
-  //VERIFY(!ei_isApprox((t1 * Vector3(1,0,0)).norm(), v0.x()));
-
-  t0.setIdentity();
-  t1.setIdentity();
-  v1 << 1, 2, 3;
-  t0.linear() = q1.toRotationMatrix();
-  t0.pretranslate(v0);
-  t0.scale(v1);
-  t1.linear() = q1.conjugate().toRotationMatrix();
-  t1.prescale(v1.cwise().inverse());
-  t1.translate(-v0);
-
-  VERIFY((t0.matrix() * t1.matrix()).isIdentity(test_precision<Scalar>()));
-
-  t1.fromPositionOrientationScale(v0, q1, v1);
-  VERIFY_IS_APPROX(t1.matrix(), t0.matrix());
-  VERIFY_IS_APPROX(t1*v1, t0*v1);
-
-  t0.setIdentity(); t0.scale(v0).rotate(q1.toRotationMatrix());
-  t1.setIdentity(); t1.scale(v0).rotate(q1);
-  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
-
-  t0.setIdentity(); t0.scale(v0).rotate(AngleAxisx(q1));
-  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
-
-  VERIFY_IS_APPROX(t0.scale(a).matrix(), t1.scale(Vector3::Constant(a)).matrix());
-  VERIFY_IS_APPROX(t0.prescale(a).matrix(), t1.prescale(Vector3::Constant(a)).matrix());
-
-  // More transform constructors, operator=, operator*=
-
-  Matrix3 mat3 = Matrix3::Random();
-  Matrix4 mat4;
-  mat4 << mat3 , Vector3::Zero() , Vector4::Zero().transpose();
-  Transform3 tmat3(mat3), tmat4(mat4);
-  tmat4.matrix()(3,3) = Scalar(1);
-  VERIFY_IS_APPROX(tmat3.matrix(), tmat4.matrix());
-
-  Scalar a3 = ei_random<Scalar>(-Scalar(M_PI), Scalar(M_PI));
-  Vector3 v3 = Vector3::Random().normalized();
-  AngleAxisx aa3(a3, v3);
-  Transform3 t3(aa3);
-  Transform3 t4;
-  t4 = aa3;
-  VERIFY_IS_APPROX(t3.matrix(), t4.matrix());
-  t4.rotate(AngleAxisx(-a3,v3));
-  VERIFY_IS_APPROX(t4.matrix(), Matrix4::Identity());
-  t4 *= aa3;
-  VERIFY_IS_APPROX(t3.matrix(), t4.matrix());
-
-  v3 = Vector3::Random();
-  Translation3 tv3(v3);
-  Transform3 t5(tv3);
-  t4 = tv3;
-  VERIFY_IS_APPROX(t5.matrix(), t4.matrix());
-  t4.translate(-v3);
-  VERIFY_IS_APPROX(t4.matrix(), Matrix4::Identity());
-  t4 *= tv3;
-  VERIFY_IS_APPROX(t5.matrix(), t4.matrix());
-
-  Scaling3 sv3(v3);
-  Transform3 t6(sv3);
-  t4 = sv3;
-  VERIFY_IS_APPROX(t6.matrix(), t4.matrix());
-  t4.scale(v3.cwise().inverse());
-  VERIFY_IS_APPROX(t4.matrix(), Matrix4::Identity());
-  t4 *= sv3;
-  VERIFY_IS_APPROX(t6.matrix(), t4.matrix());
-
-  // matrix * transform
-  VERIFY_IS_APPROX(Transform3(t3.matrix()*t4).matrix(), Transform3(t3*t4).matrix());
-
-  // chained Transform product
-  VERIFY_IS_APPROX(((t3*t4)*t5).matrix(), (t3*(t4*t5)).matrix());
-
-  // check that Transform product doesn't have aliasing problems
-  t5 = t4;
-  t5 = t5*t5;
-  VERIFY_IS_APPROX(t5, t4*t4);
-
-  // 2D transformation
-  Transform2 t20, t21;
-  Vector2 v20 = Vector2::Random();
-  Vector2 v21 = Vector2::Random();
-  for (int k=0; k<2; ++k)
-    if (ei_abs(v21[k])<Scalar(1e-3)) v21[k] = Scalar(1e-3);
-  t21.setIdentity();
-  t21.linear() = Rotation2D<Scalar>(a).toRotationMatrix();
-  VERIFY_IS_APPROX(t20.fromPositionOrientationScale(v20,a,v21).matrix(),
-    t21.pretranslate(v20).scale(v21).matrix());
-
-  t21.setIdentity();
-  t21.linear() = Rotation2D<Scalar>(-a).toRotationMatrix();
-  VERIFY( (t20.fromPositionOrientationScale(v20,a,v21)
-        * (t21.prescale(v21.cwise().inverse()).translate(-v20))).matrix().isIdentity(test_precision<Scalar>()) );
-
-  // Transform - new API
-  // 3D
-  t0.setIdentity();
-  t0.rotate(q1).scale(v0).translate(v0);
-  // mat * scaling and mat * translation
-  t1 = (Matrix3(q1) * Scaling3(v0)) * Translation3(v0);
-  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
-  // mat * transformation and scaling * translation
-  t1 = Matrix3(q1) * (Scaling3(v0) * Translation3(v0));
-  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
-
-  t0.setIdentity();
-  t0.prerotate(q1).prescale(v0).pretranslate(v0);
-  // translation * scaling and transformation * mat
-  t1 = (Translation3(v0) * Scaling3(v0)) * Matrix3(q1);
-  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
-  // scaling * mat and translation * mat
-  t1 = Translation3(v0) * (Scaling3(v0) * Matrix3(q1));
-  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
-
-  t0.setIdentity();
-  t0.scale(v0).translate(v0).rotate(q1);
-  // translation * mat and scaling * transformation
-  t1 = Scaling3(v0) * (Translation3(v0) * Matrix3(q1));
-  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
-  // transformation * scaling
-  t0.scale(v0);
-  t1 = t1 * Scaling3(v0);
-  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
-  // transformation * translation
-  t0.translate(v0);
-  t1 = t1 * Translation3(v0);
-  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
-  // translation * transformation
-  t0.pretranslate(v0);
-  t1 = Translation3(v0) * t1;
-  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
-
-  // transform * quaternion
-  t0.rotate(q1);
-  t1 = t1 * q1;
-  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
-
-  // translation * quaternion
-  t0.translate(v1).rotate(q1);
-  t1 = t1 * (Translation3(v1) * q1);
-  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
-
-  // scaling * quaternion
-  t0.scale(v1).rotate(q1);
-  t1 = t1 * (Scaling3(v1) * q1);
-  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
-
-  // quaternion * transform
-  t0.prerotate(q1);
-  t1 = q1 * t1;
-  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
-
-  // quaternion * translation
-  t0.rotate(q1).translate(v1);
-  t1 = t1 * (q1 * Translation3(v1));
-  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
-
-  // quaternion * scaling
-  t0.rotate(q1).scale(v1);
-  t1 = t1 * (q1 * Scaling3(v1));
-  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
-
-  // translation * vector
-  t0.setIdentity();
-  t0.translate(v0);
-  VERIFY_IS_APPROX(t0 * v1, Translation3(v0) * v1);
-
-  // scaling * vector
-  t0.setIdentity();
-  t0.scale(v0);
-  VERIFY_IS_APPROX(t0 * v1, Scaling3(v0) * v1);
-
-  // test transform inversion
-  t0.setIdentity();
-  t0.translate(v0);
-  t0.linear().setRandom();
-  VERIFY_IS_APPROX(t0.inverse(Affine), t0.matrix().inverse());
-  t0.setIdentity();
-  t0.translate(v0).rotate(q1);
-  VERIFY_IS_APPROX(t0.inverse(Isometry), t0.matrix().inverse());
-
-  // test extract rotation and scaling
-  t0.setIdentity();
-  t0.translate(v0).rotate(q1).scale(v1);
-  VERIFY_IS_APPROX(t0.rotation() * v1, Matrix3(q1) * v1);
-
-  Matrix3 mat_rotation, mat_scaling;
-  t0.setIdentity();
-  t0.translate(v0).rotate(q1).scale(v1);
-  t0.computeRotationScaling(&mat_rotation, &mat_scaling);
-  VERIFY_IS_APPROX(t0.linear(), mat_rotation * mat_scaling);
-  VERIFY_IS_APPROX(mat_rotation*mat_rotation.adjoint(), Matrix3::Identity());
-  VERIFY_IS_APPROX(mat_rotation.determinant(), Scalar(1));
-  t0.computeScalingRotation(&mat_scaling, &mat_rotation);
-  VERIFY_IS_APPROX(t0.linear(), mat_scaling * mat_rotation);
-  VERIFY_IS_APPROX(mat_rotation*mat_rotation.adjoint(), Matrix3::Identity());
-  VERIFY_IS_APPROX(mat_rotation.determinant(), Scalar(1));
-
-  // test casting
-  eigen2_Transform<float,3> t1f = t1.template cast<float>();
-  VERIFY_IS_APPROX(t1f.template cast<Scalar>(),t1);
-  eigen2_Transform<double,3> t1d = t1.template cast<double>();
-  VERIFY_IS_APPROX(t1d.template cast<Scalar>(),t1);
-
-  Translation3 tr1(v0);
-  eigen2_Translation<float,3> tr1f = tr1.template cast<float>();
-  VERIFY_IS_APPROX(tr1f.template cast<Scalar>(),tr1);
-  eigen2_Translation<double,3> tr1d = tr1.template cast<double>();
-  VERIFY_IS_APPROX(tr1d.template cast<Scalar>(),tr1);
-
-  Scaling3 sc1(v0);
-  eigen2_Scaling<float,3> sc1f = sc1.template cast<float>();
-  VERIFY_IS_APPROX(sc1f.template cast<Scalar>(),sc1);
-  eigen2_Scaling<double,3> sc1d = sc1.template cast<double>();
-  VERIFY_IS_APPROX(sc1d.template cast<Scalar>(),sc1);
-
-  eigen2_Quaternion<float> q1f = q1.template cast<float>();
-  VERIFY_IS_APPROX(q1f.template cast<Scalar>(),q1);
-  eigen2_Quaternion<double> q1d = q1.template cast<double>();
-  VERIFY_IS_APPROX(q1d.template cast<Scalar>(),q1);
-
-  eigen2_AngleAxis<float> aa1f = aa1.template cast<float>();
-  VERIFY_IS_APPROX(aa1f.template cast<Scalar>(),aa1);
-  eigen2_AngleAxis<double> aa1d = aa1.template cast<double>();
-  VERIFY_IS_APPROX(aa1d.template cast<Scalar>(),aa1);
-
-  eigen2_Rotation2D<Scalar> r2d1(ei_random<Scalar>());
-  eigen2_Rotation2D<float> r2d1f = r2d1.template cast<float>();
-  VERIFY_IS_APPROX(r2d1f.template cast<Scalar>(),r2d1);
-  eigen2_Rotation2D<double> r2d1d = r2d1.template cast<double>();
-  VERIFY_IS_APPROX(r2d1d.template cast<Scalar>(),r2d1);
-
-  m = q1;
-//   m.col(1) = Vector3(0,ei_random<Scalar>(),ei_random<Scalar>()).normalized();
-//   m.col(0) = Vector3(-1,0,0).normalized();
-//   m.col(2) = m.col(0).cross(m.col(1));
-  #define VERIFY_EULER(I,J,K, X,Y,Z) { \
-    Vector3 ea = m.eulerAngles(I,J,K); \
-    Matrix3 m1 = Matrix3(AngleAxisx(ea[0], Vector3::Unit##X()) * AngleAxisx(ea[1], Vector3::Unit##Y()) * AngleAxisx(ea[2], Vector3::Unit##Z())); \
-    VERIFY_IS_APPROX(m, m1); \
-    VERIFY_IS_APPROX(m,  Matrix3(AngleAxisx(ea[0], Vector3::Unit##X()) * AngleAxisx(ea[1], Vector3::Unit##Y()) * AngleAxisx(ea[2], Vector3::Unit##Z()))); \
-  }
-  VERIFY_EULER(0,1,2, X,Y,Z);
-  VERIFY_EULER(0,1,0, X,Y,X);
-  VERIFY_EULER(0,2,1, X,Z,Y);
-  VERIFY_EULER(0,2,0, X,Z,X);
-
-  VERIFY_EULER(1,2,0, Y,Z,X);
-  VERIFY_EULER(1,2,1, Y,Z,Y);
-  VERIFY_EULER(1,0,2, Y,X,Z);
-  VERIFY_EULER(1,0,1, Y,X,Y);
-
-  VERIFY_EULER(2,0,1, Z,X,Y);
-  VERIFY_EULER(2,0,2, Z,X,Z);
-  VERIFY_EULER(2,1,0, Z,Y,X);
-  VERIFY_EULER(2,1,2, Z,Y,Z);
-
-  // colwise/rowwise cross product
-  mat3.setRandom();
-  Vector3 vec3 = Vector3::Random();
-  Matrix3 mcross;
-  int i = ei_random<int>(0,2);
-  mcross = mat3.colwise().cross(vec3);
-  VERIFY_IS_APPROX(mcross.col(i), mat3.col(i).cross(vec3));
-  mcross = mat3.rowwise().cross(vec3);
-  VERIFY_IS_APPROX(mcross.row(i), mat3.row(i).cross(vec3));
-
-
-}
-
-void test_eigen2_geometry_with_eigen2_prefix()
-{
-  std::cout << "eigen2 support: " << EIGEN2_SUPPORT_STAGE << std::endl;
-  for(int i = 0; i < g_repeat; i++) {
-    CALL_SUBTEST_1( geometry<float>() );
-    CALL_SUBTEST_2( geometry<double>() );
-  }
-}
diff --git a/test/eigen2/eigen2_hyperplane.cpp b/test/eigen2/eigen2_hyperplane.cpp
deleted file mode 100644
index f3f85e14d..000000000
--- a/test/eigen2/eigen2_hyperplane.cpp
+++ /dev/null
@@ -1,126 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#include "main.h"
-#include <Eigen/Geometry>
-#include <Eigen/LU>
-#include <Eigen/QR>
-
-template<typename HyperplaneType> void hyperplane(const HyperplaneType& _plane)
-{
-  /* this test covers the following files:
-     Hyperplane.h
-  */
-
-  const int dim = _plane.dim();
-  typedef typename HyperplaneType::Scalar Scalar;
-  typedef typename NumTraits<Scalar>::Real RealScalar;
-  typedef Matrix<Scalar, HyperplaneType::AmbientDimAtCompileTime, 1> VectorType;
-  typedef Matrix<Scalar, HyperplaneType::AmbientDimAtCompileTime,
-                         HyperplaneType::AmbientDimAtCompileTime> MatrixType;
-
-  VectorType p0 = VectorType::Random(dim);
-  VectorType p1 = VectorType::Random(dim);
-
-  VectorType n0 = VectorType::Random(dim).normalized();
-  VectorType n1 = VectorType::Random(dim).normalized();
-
-  HyperplaneType pl0(n0, p0);
-  HyperplaneType pl1(n1, p1);
-  HyperplaneType pl2 = pl1;
-
-  Scalar s0 = ei_random<Scalar>();
-  Scalar s1 = ei_random<Scalar>();
-
-  VERIFY_IS_APPROX( n1.eigen2_dot(n1), Scalar(1) );
-
-  VERIFY_IS_MUCH_SMALLER_THAN( pl0.absDistance(p0), Scalar(1) );
-  VERIFY_IS_APPROX( pl1.signedDistance(p1 + n1 * s0), s0 );
-  VERIFY_IS_MUCH_SMALLER_THAN( pl1.signedDistance(pl1.projection(p0)), Scalar(1) );
-  VERIFY_IS_MUCH_SMALLER_THAN( pl1.absDistance(p1 +  pl1.normal().unitOrthogonal() * s1), Scalar(1) );
-
-  // transform
-  if (!NumTraits<Scalar>::IsComplex)
-  {
-    MatrixType rot = MatrixType::Random(dim,dim).qr().matrixQ();
-    Scaling<Scalar,HyperplaneType::AmbientDimAtCompileTime> scaling(VectorType::Random());
-    Translation<Scalar,HyperplaneType::AmbientDimAtCompileTime> translation(VectorType::Random());
-
-    pl2 = pl1;
-    VERIFY_IS_MUCH_SMALLER_THAN( pl2.transform(rot).absDistance(rot * p1), Scalar(1) );
-    pl2 = pl1;
-    VERIFY_IS_MUCH_SMALLER_THAN( pl2.transform(rot,Isometry).absDistance(rot * p1), Scalar(1) );
-    pl2 = pl1;
-    VERIFY_IS_MUCH_SMALLER_THAN( pl2.transform(rot*scaling).absDistance((rot*scaling) * p1), Scalar(1) );
-    pl2 = pl1;
-    VERIFY_IS_MUCH_SMALLER_THAN( pl2.transform(rot*scaling*translation)
-                                 .absDistance((rot*scaling*translation) * p1), Scalar(1) );
-    pl2 = pl1;
-    VERIFY_IS_MUCH_SMALLER_THAN( pl2.transform(rot*translation,Isometry)
-                                 .absDistance((rot*translation) * p1), Scalar(1) );
-  }
-
-  // casting
-  const int Dim = HyperplaneType::AmbientDimAtCompileTime;
-  typedef typename GetDifferentType<Scalar>::type OtherScalar;
-  Hyperplane<OtherScalar,Dim> hp1f = pl1.template cast<OtherScalar>();
-  VERIFY_IS_APPROX(hp1f.template cast<Scalar>(),pl1);
-  Hyperplane<Scalar,Dim> hp1d = pl1.template cast<Scalar>();
-  VERIFY_IS_APPROX(hp1d.template cast<Scalar>(),pl1);
-}
-
-template<typename Scalar> void lines()
-{
-  typedef Hyperplane<Scalar, 2> HLine;
-  typedef ParametrizedLine<Scalar, 2> PLine;
-  typedef Matrix<Scalar,2,1> Vector;
-  typedef Matrix<Scalar,3,1> CoeffsType;
-
-  for(int i = 0; i < 10; i++)
-  {
-    Vector center = Vector::Random();
-    Vector u = Vector::Random();
-    Vector v = Vector::Random();
-    Scalar a = ei_random<Scalar>();
-    while (ei_abs(a-1) < 1e-4) a = ei_random<Scalar>();
-    while (u.norm() < 1e-4) u = Vector::Random();
-    while (v.norm() < 1e-4) v = Vector::Random();
-
-    HLine line_u = HLine::Through(center + u, center + a*u);
-    HLine line_v = HLine::Through(center + v, center + a*v);
-
-    // the line equations should be normalized so that a^2+b^2=1
-    VERIFY_IS_APPROX(line_u.normal().norm(), Scalar(1));
-    VERIFY_IS_APPROX(line_v.normal().norm(), Scalar(1));
-
-    Vector result = line_u.intersection(line_v);
-
-    // the lines should intersect at the point we called "center"
-    VERIFY_IS_APPROX(result, center);
-
-    // check conversions between two types of lines
-    PLine pl(line_u); // gcc 3.3 will commit suicide if we don't name this variable
-    CoeffsType converted_coeffs(HLine(pl).coeffs());
-    converted_coeffs *= line_u.coeffs()(0)/converted_coeffs(0);
-    VERIFY(line_u.coeffs().isApprox(converted_coeffs));
-  }
-}
-
-void test_eigen2_hyperplane()
-{
-  for(int i = 0; i < g_repeat; i++) {
-    CALL_SUBTEST_1( hyperplane(Hyperplane<float,2>()) );
-    CALL_SUBTEST_2( hyperplane(Hyperplane<float,3>()) );
-    CALL_SUBTEST_3( hyperplane(Hyperplane<double,4>()) );
-    CALL_SUBTEST_4( hyperplane(Hyperplane<std::complex<double>,5>()) );
-    CALL_SUBTEST_5( lines<float>() );
-    CALL_SUBTEST_6( lines<double>() );
-  }
-}
diff --git a/test/eigen2/eigen2_inverse.cpp b/test/eigen2/eigen2_inverse.cpp
deleted file mode 100644
index ccd24a194..000000000
--- a/test/eigen2/eigen2_inverse.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#include "main.h"
-#include <Eigen/LU>
-
-template<typename MatrixType> void inverse(const MatrixType& m)
-{
-  /* this test covers the following files:
-     Inverse.h
-  */
-  int rows = m.rows();
-  int cols = m.cols();
-
-  typedef typename MatrixType::Scalar Scalar;
-  typedef typename NumTraits<Scalar>::Real RealScalar;
-  typedef Matrix<Scalar, MatrixType::ColsAtCompileTime, 1> VectorType;
-
-  MatrixType m1 = MatrixType::Random(rows, cols),
-             m2(rows, cols),
-             identity = MatrixType::Identity(rows, rows);
-
-  while(ei_abs(m1.determinant()) < RealScalar(0.1) && rows <= 8)
-  {
-    m1 = MatrixType::Random(rows, cols);
-  }
-
-  m2 = m1.inverse();
-  VERIFY_IS_APPROX(m1, m2.inverse() );
-
-  m1.computeInverse(&m2);
-  VERIFY_IS_APPROX(m1, m2.inverse() );
-
-  VERIFY_IS_APPROX((Scalar(2)*m2).inverse(), m2.inverse()*Scalar(0.5));
-
-  VERIFY_IS_APPROX(identity, m1.inverse() * m1 );
-  VERIFY_IS_APPROX(identity, m1 * m1.inverse() );
-
-  VERIFY_IS_APPROX(m1, m1.inverse().inverse() );
-
-  // since for the general case we implement separately row-major and col-major, test that
-  VERIFY_IS_APPROX(m1.transpose().inverse(), m1.inverse().transpose());
-}
-
-void test_eigen2_inverse()
-{
-  for(int i = 0; i < g_repeat; i++) {
-    CALL_SUBTEST_1( inverse(Matrix<double,1,1>()) );
-    CALL_SUBTEST_2( inverse(Matrix2d()) );
-    CALL_SUBTEST_3( inverse(Matrix3f()) );
-    CALL_SUBTEST_4( inverse(Matrix4f()) );
-    CALL_SUBTEST_5( inverse(MatrixXf(8,8)) );
-    CALL_SUBTEST_6( inverse(MatrixXcd(7,7)) );
-  }
-}
diff --git a/test/eigen2/eigen2_linearstructure.cpp b/test/eigen2/eigen2_linearstructure.cpp
deleted file mode 100644
index 488f4c485..000000000
--- a/test/eigen2/eigen2_linearstructure.cpp
+++ /dev/null
@@ -1,83 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#include "main.h"
-
-template<typename MatrixType> void linearStructure(const MatrixType& m)
-{
-  /* this test covers the following files:
-     Sum.h Difference.h Opposite.h ScalarMultiple.h
-  */
-
-  typedef typename MatrixType::Scalar Scalar;
-  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
-
-  int rows = m.rows();
-  int cols = m.cols();
-
-  // this test relies a lot on Random.h, and there's not much more that we can do
-  // to test it, hence I consider that we will have tested Random.h
-  MatrixType m1 = MatrixType::Random(rows, cols),
-             m2 = MatrixType::Random(rows, cols),
-             m3(rows, cols);
-
-  Scalar s1 = ei_random<Scalar>();
-  while (ei_abs(s1)<1e-3) s1 = ei_random<Scalar>();
-
-  int r = ei_random<int>(0, rows-1),
-      c = ei_random<int>(0, cols-1);
-
-  VERIFY_IS_APPROX(-(-m1),                  m1);
-  VERIFY_IS_APPROX(m1+m1,                   2*m1);
-  VERIFY_IS_APPROX(m1+m2-m1,                m2);
-  VERIFY_IS_APPROX(-m2+m1+m2,               m1);
-  VERIFY_IS_APPROX(m1*s1,                   s1*m1);
-  VERIFY_IS_APPROX((m1+m2)*s1,              s1*m1+s1*m2);
-  VERIFY_IS_APPROX((-m1+m2)*s1,             -s1*m1+s1*m2);
-  m3 = m2; m3 += m1;
-  VERIFY_IS_APPROX(m3,                      m1+m2);
-  m3 = m2; m3 -= m1;
-  VERIFY_IS_APPROX(m3,                      m2-m1);
-  m3 = m2; m3 *= s1;
-  VERIFY_IS_APPROX(m3,                      s1*m2);
-  if(NumTraits<Scalar>::HasFloatingPoint)
-  {
-    m3 = m2; m3 /= s1;
-    VERIFY_IS_APPROX(m3,                    m2/s1);
-  }
-
-  // again, test operator() to check const-qualification
-  VERIFY_IS_APPROX((-m1)(r,c), -(m1(r,c)));
-  VERIFY_IS_APPROX((m1-m2)(r,c), (m1(r,c))-(m2(r,c)));
-  VERIFY_IS_APPROX((m1+m2)(r,c), (m1(r,c))+(m2(r,c)));
-  VERIFY_IS_APPROX((s1*m1)(r,c), s1*(m1(r,c)));
-  VERIFY_IS_APPROX((m1*s1)(r,c), (m1(r,c))*s1);
-  if(NumTraits<Scalar>::HasFloatingPoint)
-    VERIFY_IS_APPROX((m1/s1)(r,c), (m1(r,c))/s1);
-
-  // use .block to disable vectorization and compare to the vectorized version
-  VERIFY_IS_APPROX(m1+m1.block(0,0,rows,cols), m1+m1);
-  VERIFY_IS_APPROX(m1.cwise() * m1.block(0,0,rows,cols), m1.cwise() * m1);
-  VERIFY_IS_APPROX(m1 - m1.block(0,0,rows,cols), m1 - m1);
-  VERIFY_IS_APPROX(m1.block(0,0,rows,cols) * s1, m1 * s1);
-}
-
-void test_eigen2_linearstructure()
-{
-  for(int i = 0; i < g_repeat; i++) {
-    CALL_SUBTEST_1( linearStructure(Matrix<float, 1, 1>()) );
-    CALL_SUBTEST_2( linearStructure(Matrix2f()) );
-    CALL_SUBTEST_3( linearStructure(Vector3d()) );
-    CALL_SUBTEST_4( linearStructure(Matrix4d()) );
-    CALL_SUBTEST_5( linearStructure(MatrixXcf(3, 3)) );
-    CALL_SUBTEST_6( linearStructure(MatrixXf(8, 12)) );
-    CALL_SUBTEST_7( linearStructure(MatrixXi(8, 12)) );
-    CALL_SUBTEST_8( linearStructure(MatrixXcd(20, 20)) );
-  }
-}
diff --git a/test/eigen2/eigen2_lu.cpp b/test/eigen2/eigen2_lu.cpp
deleted file mode 100644
index e993b1c7c..000000000
--- a/test/eigen2/eigen2_lu.cpp
+++ /dev/null
@@ -1,122 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#include "main.h"
-#include <Eigen/LU>
-
-template<typename Derived>
-void doSomeRankPreservingOperations(Eigen::MatrixBase<Derived>& m)
-{
-  typedef typename Derived::RealScalar RealScalar;
-  for(int a = 0; a < 3*(m.rows()+m.cols()); a++)
-  {
-    RealScalar d = Eigen::ei_random<RealScalar>(-1,1);
-    int i = Eigen::ei_random<int>(0,m.rows()-1); // i is a random row number
-    int j;
-    do {
-      j = Eigen::ei_random<int>(0,m.rows()-1);
-    } while (i==j); // j is another one (must be different)
-    m.row(i) += d * m.row(j);
-
-    i = Eigen::ei_random<int>(0,m.cols()-1); // i is a random column number
-    do {
-      j = Eigen::ei_random<int>(0,m.cols()-1);
-    } while (i==j); // j is another one (must be different)
-    m.col(i) += d * m.col(j);
-  }
-}
-
-template<typename MatrixType> void lu_non_invertible()
-{
-  /* this test covers the following files:
-     LU.h
-  */
-  // NOTE there seems to be a problem with too small sizes -- could easily lie in the doSomeRankPreservingOperations function
-  int rows = ei_random<int>(20,200), cols = ei_random<int>(20,200), cols2 = ei_random<int>(20,200);
-  int rank = ei_random<int>(1, std::min(rows, cols)-1);
-
-  MatrixType m1(rows, cols), m2(cols, cols2), m3(rows, cols2), k(1,1);
-  m1 = MatrixType::Random(rows,cols);
-  if(rows <= cols)
-    for(int i = rank; i < rows; i++) m1.row(i).setZero();
-  else
-    for(int i = rank; i < cols; i++) m1.col(i).setZero();
-  doSomeRankPreservingOperations(m1);
-
-  LU<MatrixType> lu(m1);
-  typename LU<MatrixType>::KernelResultType m1kernel = lu.kernel();
-  typename LU<MatrixType>::ImageResultType m1image = lu.image();
-
-  VERIFY(rank == lu.rank());
-  VERIFY(cols - lu.rank() == lu.dimensionOfKernel());
-  VERIFY(!lu.isInjective());
-  VERIFY(!lu.isInvertible());
-  VERIFY(lu.isSurjective() == (lu.rank() == rows));
-  VERIFY((m1 * m1kernel).isMuchSmallerThan(m1));
-  VERIFY(m1image.lu().rank() == rank);
-  MatrixType sidebyside(m1.rows(), m1.cols() + m1image.cols());
-  sidebyside << m1, m1image;
-  VERIFY(sidebyside.lu().rank() == rank);
-  m2 = MatrixType::Random(cols,cols2);
-  m3 = m1*m2;
-  m2 = MatrixType::Random(cols,cols2);
-  lu.solve(m3, &m2);
-  VERIFY_IS_APPROX(m3, m1*m2);
-  /* solve now always returns true
-  m3 = MatrixType::Random(rows,cols2);
-  VERIFY(!lu.solve(m3, &m2));
-  */
-}
-
-template<typename MatrixType> void lu_invertible()
-{
-  /* this test covers the following files:
-     LU.h
-  */
-  typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
-  int size = ei_random<int>(10,200);
-
-  MatrixType m1(size, size), m2(size, size), m3(size, size);
-  m1 = MatrixType::Random(size,size);
-
-  if (ei_is_same_type<RealScalar,float>::ret)
-  {
-    // let's build a matrix more stable to inverse
-    MatrixType a = MatrixType::Random(size,size*2);
-    m1 += a * a.adjoint();
-  }
-
-  LU<MatrixType> lu(m1);
-  VERIFY(0 == lu.dimensionOfKernel());
-  VERIFY(size == lu.rank());
-  VERIFY(lu.isInjective());
-  VERIFY(lu.isSurjective());
-  VERIFY(lu.isInvertible());
-  VERIFY(lu.image().lu().isInvertible());
-  m3 = MatrixType::Random(size,size);
-  lu.solve(m3, &m2);
-  VERIFY_IS_APPROX(m3, m1*m2);
-  VERIFY_IS_APPROX(m2, lu.inverse()*m3);
-  m3 = MatrixType::Random(size,size);
-  VERIFY(lu.solve(m3, &m2));
-}
-
-void test_eigen2_lu()
-{
-  for(int i = 0; i < g_repeat; i++) {
-    CALL_SUBTEST_1( lu_non_invertible<MatrixXf>() );
-    CALL_SUBTEST_2( lu_non_invertible<MatrixXd>() );
-    CALL_SUBTEST_3( lu_non_invertible<MatrixXcf>() );
-    CALL_SUBTEST_4( lu_non_invertible<MatrixXcd>() );
-    CALL_SUBTEST_1( lu_invertible<MatrixXf>() );
-    CALL_SUBTEST_2( lu_invertible<MatrixXd>() );
-    CALL_SUBTEST_3( lu_invertible<MatrixXcf>() );
-    CALL_SUBTEST_4( lu_invertible<MatrixXcd>() );
-  }
-}
diff --git a/test/eigen2/eigen2_map.cpp b/test/eigen2/eigen2_map.cpp
deleted file mode 100644
index 4a1c4e11a..000000000
--- a/test/eigen2/eigen2_map.cpp
+++ /dev/null
@@ -1,114 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2007-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#include "main.h"
-
-template<typename VectorType> void map_class_vector(const VectorType& m)
-{
-  typedef typename VectorType::Scalar Scalar;
-
-  int size = m.size();
-
-  // test Map.h
-  Scalar* array1 = ei_aligned_new<Scalar>(size);
-  Scalar* array2 = ei_aligned_new<Scalar>(size);
-  Scalar* array3 = new Scalar[size+1];
-  Scalar* array3unaligned = std::size_t(array3)%16 == 0 ? array3+1 : array3;
-  
-  Map<VectorType, Aligned>(array1, size) = VectorType::Random(size);
-  Map<VectorType>(array2, size) = Map<VectorType>(array1, size);
-  Map<VectorType>(array3unaligned, size) = Map<VectorType>((const Scalar*)array1, size); // test non-const-correctness support in eigen2
-  VectorType ma1 = Map<VectorType>(array1, size);
-  VectorType ma2 = Map<VectorType, Aligned>(array2, size);
-  VectorType ma3 = Map<VectorType>(array3unaligned, size);
-  VERIFY_IS_APPROX(ma1, ma2);
-  VERIFY_IS_APPROX(ma1, ma3);
-  
-  ei_aligned_delete(array1, size);
-  ei_aligned_delete(array2, size);
-  delete[] array3;
-}
-
-template<typename MatrixType> void map_class_matrix(const MatrixType& m)
-{
-  typedef typename MatrixType::Scalar Scalar;
-
-  int rows = m.rows(), cols = m.cols(), size = rows*cols;
-
-  // test Map.h
-  Scalar* array1 = ei_aligned_new<Scalar>(size);
-  for(int i = 0; i < size; i++) array1[i] = Scalar(1);
-  Scalar* array2 = ei_aligned_new<Scalar>(size);
-  for(int i = 0; i < size; i++) array2[i] = Scalar(1);
-  Scalar* array3 = new Scalar[size+1];
-  for(int i = 0; i < size+1; i++) array3[i] = Scalar(1);
-  Scalar* array3unaligned = std::size_t(array3)%16 == 0 ? array3+1 : array3;
-  Map<MatrixType, Aligned>(array1, rows, cols) = MatrixType::Ones(rows,cols);
-  Map<MatrixType>(array2, rows, cols) = Map<MatrixType>((const Scalar*)array1, rows, cols); // test non-const-correctness support in eigen2
-  Map<MatrixType>(array3unaligned, rows, cols) = Map<MatrixType>(array1, rows, cols);
-  MatrixType ma1 = Map<MatrixType>(array1, rows, cols);
-  MatrixType ma2 = Map<MatrixType, Aligned>(array2, rows, cols);
-  VERIFY_IS_APPROX(ma1, ma2);
-  MatrixType ma3 = Map<MatrixType>(array3unaligned, rows, cols);
-  VERIFY_IS_APPROX(ma1, ma3);
-  
-  ei_aligned_delete(array1, size);
-  ei_aligned_delete(array2, size);
-  delete[] array3;
-}
-
-template<typename VectorType> void map_static_methods(const VectorType& m)
-{
-  typedef typename VectorType::Scalar Scalar;
-
-  int size = m.size();
-
-  // test Map.h
-  Scalar* array1 = ei_aligned_new<Scalar>(size);
-  Scalar* array2 = ei_aligned_new<Scalar>(size);
-  Scalar* array3 = new Scalar[size+1];
-  Scalar* array3unaligned = std::size_t(array3)%16 == 0 ? array3+1 : array3;
-  
-  VectorType::MapAligned(array1, size) = VectorType::Random(size);
-  VectorType::Map(array2, size) = VectorType::Map(array1, size);
-  VectorType::Map(array3unaligned, size) = VectorType::Map(array1, size);
-  VectorType ma1 = VectorType::Map(array1, size);
-  VectorType ma2 = VectorType::MapAligned(array2, size);
-  VectorType ma3 = VectorType::Map(array3unaligned, size);
-  VERIFY_IS_APPROX(ma1, ma2);
-  VERIFY_IS_APPROX(ma1, ma3);
-  
-  ei_aligned_delete(array1, size);
-  ei_aligned_delete(array2, size);
-  delete[] array3;
-}
-
-
-void test_eigen2_map()
-{
-  for(int i = 0; i < g_repeat; i++) {
-    CALL_SUBTEST_1( map_class_vector(Matrix<float, 1, 1>()) );
-    CALL_SUBTEST_2( map_class_vector(Vector4d()) );
-    CALL_SUBTEST_3( map_class_vector(RowVector4f()) );
-    CALL_SUBTEST_4( map_class_vector(VectorXcf(8)) );
-    CALL_SUBTEST_5( map_class_vector(VectorXi(12)) );
-
-    CALL_SUBTEST_1( map_class_matrix(Matrix<float, 1, 1>()) );
-    CALL_SUBTEST_2( map_class_matrix(Matrix4d()) );
-    CALL_SUBTEST_6( map_class_matrix(Matrix<float,3,5>()) );
-    CALL_SUBTEST_4( map_class_matrix(MatrixXcf(ei_random<int>(1,10),ei_random<int>(1,10))) );
-    CALL_SUBTEST_5( map_class_matrix(MatrixXi(ei_random<int>(1,10),ei_random<int>(1,10))) );
-
-    CALL_SUBTEST_1( map_static_methods(Matrix<double, 1, 1>()) );
-    CALL_SUBTEST_2( map_static_methods(Vector3f()) );
-    CALL_SUBTEST_7( map_static_methods(RowVector3d()) );
-    CALL_SUBTEST_4( map_static_methods(VectorXcd(8)) );
-    CALL_SUBTEST_5( map_static_methods(VectorXf(12)) );
-  }
-}
diff --git a/test/eigen2/eigen2_meta.cpp b/test/eigen2/eigen2_meta.cpp
deleted file mode 100644
index 1d01bd84d..000000000
--- a/test/eigen2/eigen2_meta.cpp
+++ /dev/null
@@ -1,60 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#include "main.h"
-
-void test_eigen2_meta()
-{
-  typedef float & FloatRef;
-  typedef const float & ConstFloatRef;
-  
-  VERIFY((ei_meta_if<(3<4),ei_meta_true, ei_meta_false>::ret::ret));
-  VERIFY(( ei_is_same_type<float,float>::ret));
-  VERIFY((!ei_is_same_type<float,double>::ret));
-  VERIFY((!ei_is_same_type<float,float&>::ret));
-  VERIFY((!ei_is_same_type<float,const float&>::ret));
-  
-  VERIFY(( ei_is_same_type<float,ei_cleantype<const float&>::type >::ret));
-  VERIFY(( ei_is_same_type<float,ei_cleantype<const float*>::type >::ret));
-  VERIFY(( ei_is_same_type<float,ei_cleantype<const float*&>::type >::ret));
-  VERIFY(( ei_is_same_type<float,ei_cleantype<float**>::type >::ret));
-  VERIFY(( ei_is_same_type<float,ei_cleantype<float**&>::type >::ret));
-  VERIFY(( ei_is_same_type<float,ei_cleantype<float* const *&>::type >::ret));
-  VERIFY(( ei_is_same_type<float,ei_cleantype<float* const>::type >::ret));
-
-  VERIFY(( ei_is_same_type<float*,ei_unconst<const float*>::type >::ret));
-  VERIFY(( ei_is_same_type<float&,ei_unconst<const float&>::type >::ret));
-  VERIFY(( ei_is_same_type<float&,ei_unconst<ConstFloatRef>::type >::ret));
-  
-  VERIFY(( ei_is_same_type<float&,ei_unconst<float&>::type >::ret));
-  VERIFY(( ei_is_same_type<float,ei_unref<float&>::type >::ret));
-  VERIFY(( ei_is_same_type<const float,ei_unref<const float&>::type >::ret));
-  VERIFY(( ei_is_same_type<float,ei_unpointer<float*>::type >::ret));
-  VERIFY(( ei_is_same_type<const float,ei_unpointer<const float*>::type >::ret));
-  VERIFY(( ei_is_same_type<float,ei_unpointer<float* const >::type >::ret));
-  
-  VERIFY(ei_meta_sqrt<1>::ret == 1);
-  #define VERIFY_META_SQRT(X) VERIFY(ei_meta_sqrt<X>::ret == int(ei_sqrt(double(X))))
-  VERIFY_META_SQRT(2);
-  VERIFY_META_SQRT(3);
-  VERIFY_META_SQRT(4);
-  VERIFY_META_SQRT(5);
-  VERIFY_META_SQRT(6);
-  VERIFY_META_SQRT(8);
-  VERIFY_META_SQRT(9);
-  VERIFY_META_SQRT(15);
-  VERIFY_META_SQRT(16);
-  VERIFY_META_SQRT(17);
-  VERIFY_META_SQRT(255);
-  VERIFY_META_SQRT(256);
-  VERIFY_META_SQRT(257);
-  VERIFY_META_SQRT(1023);
-  VERIFY_META_SQRT(1024);
-  VERIFY_META_SQRT(1025);
-}
diff --git a/test/eigen2/eigen2_miscmatrices.cpp b/test/eigen2/eigen2_miscmatrices.cpp
deleted file mode 100644
index 8bbb20cc8..000000000
--- a/test/eigen2/eigen2_miscmatrices.cpp
+++ /dev/null
@@ -1,48 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#include "main.h"
-
-template<typename MatrixType> void miscMatrices(const MatrixType& m)
-{
-  /* this test covers the following files:
-     DiagonalMatrix.h Ones.h
-  */
-
-  typedef typename MatrixType::Scalar Scalar;
-  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
-  typedef Matrix<Scalar, 1, MatrixType::ColsAtCompileTime> RowVectorType;
-  int rows = m.rows();
-  int cols = m.cols();
-
-  int r = ei_random<int>(0, rows-1), r2 = ei_random<int>(0, rows-1), c = ei_random<int>(0, cols-1);
-  VERIFY_IS_APPROX(MatrixType::Ones(rows,cols)(r,c), static_cast<Scalar>(1));
-  MatrixType m1 = MatrixType::Ones(rows,cols);
-  VERIFY_IS_APPROX(m1(r,c), static_cast<Scalar>(1));
-  VectorType v1 = VectorType::Random(rows);
-  v1[0];
-  Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::RowsAtCompileTime>
-  square = v1.asDiagonal();
-  if(r==r2) VERIFY_IS_APPROX(square(r,r2), v1[r]);
-  else VERIFY_IS_MUCH_SMALLER_THAN(square(r,r2), static_cast<Scalar>(1));
-  square = MatrixType::Zero(rows, rows);
-  square.diagonal() = VectorType::Ones(rows);
-  VERIFY_IS_APPROX(square, MatrixType::Identity(rows, rows));
-}
-
-void test_eigen2_miscmatrices()
-{
-  for(int i = 0; i < g_repeat; i++) {
-    CALL_SUBTEST_1( miscMatrices(Matrix<float, 1, 1>()) );
-    CALL_SUBTEST_2( miscMatrices(Matrix4d()) );
-    CALL_SUBTEST_3( miscMatrices(MatrixXcf(3, 3)) );
-    CALL_SUBTEST_4( miscMatrices(MatrixXi(8, 12)) );
-    CALL_SUBTEST_5( miscMatrices(MatrixXcd(20, 20)) );
-  }
-}
diff --git a/test/eigen2/eigen2_mixingtypes.cpp b/test/eigen2/eigen2_mixingtypes.cpp
deleted file mode 100644
index fb5ac5dda..000000000
--- a/test/eigen2/eigen2_mixingtypes.cpp
+++ /dev/null
@@ -1,77 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_NO_STATIC_ASSERT
-#define EIGEN_NO_STATIC_ASSERT // turn static asserts into runtime asserts in order to check them
-#endif
-
-#ifndef EIGEN_DONT_VECTORIZE
-#define EIGEN_DONT_VECTORIZE // SSE intrinsics aren't designed to allow mixing types
-#endif
-
-#include "main.h"
-
-
-template<int SizeAtCompileType> void mixingtypes(int size = SizeAtCompileType)
-{
-  typedef Matrix<float, SizeAtCompileType, SizeAtCompileType> Mat_f;
-  typedef Matrix<double, SizeAtCompileType, SizeAtCompileType> Mat_d;
-  typedef Matrix<std::complex<float>, SizeAtCompileType, SizeAtCompileType> Mat_cf;
-  typedef Matrix<std::complex<double>, SizeAtCompileType, SizeAtCompileType> Mat_cd;
-  typedef Matrix<float, SizeAtCompileType, 1> Vec_f;
-  typedef Matrix<double, SizeAtCompileType, 1> Vec_d;
-  typedef Matrix<std::complex<float>, SizeAtCompileType, 1> Vec_cf;
-  typedef Matrix<std::complex<double>, SizeAtCompileType, 1> Vec_cd;
-
-  Mat_f mf(size,size);
-  Mat_d md(size,size);
-  Mat_cf mcf(size,size);
-  Mat_cd mcd(size,size);
-  Vec_f vf(size,1);
-  Vec_d vd(size,1);
-  Vec_cf vcf(size,1);
-  Vec_cd vcd(size,1);
-
-  mf+mf;
-  VERIFY_RAISES_ASSERT(mf+md);
-  VERIFY_RAISES_ASSERT(mf+mcf);
-  VERIFY_RAISES_ASSERT(vf=vd);
-  VERIFY_RAISES_ASSERT(vf+=vd);
-  VERIFY_RAISES_ASSERT(mcd=md);
-
-  mf*mf;
-  md*mcd;
-  mcd*md;
-  mf*vcf;
-  mcf*vf;
-  mcf *= mf;
-  vcd = md*vcd;
-  vcf = mcf*vf;
-#if 0
-  // these are know generating hard build errors in eigen3
-  VERIFY_RAISES_ASSERT(mf*md);
-  VERIFY_RAISES_ASSERT(mcf*mcd);
-  VERIFY_RAISES_ASSERT(mcf*vcd);
-  VERIFY_RAISES_ASSERT(vcf = mf*vf);
-
-  vf.eigen2_dot(vf);
-  VERIFY_RAISES_ASSERT(vd.eigen2_dot(vf));
-  VERIFY_RAISES_ASSERT(vcf.eigen2_dot(vf)); // yeah eventually we should allow this but i'm too lazy to make that change now in Dot.h
-  // especially as that might be rewritten as cwise product .sum() which would make that automatic.
-#endif
-}
-
-void test_eigen2_mixingtypes()
-{
-  // check that our operator new is indeed called:
-  CALL_SUBTEST_1(mixingtypes<3>());
-  CALL_SUBTEST_2(mixingtypes<4>());
-  CALL_SUBTEST_3(mixingtypes<Dynamic>(20));
-}
diff --git a/test/eigen2/eigen2_nomalloc.cpp b/test/eigen2/eigen2_nomalloc.cpp
deleted file mode 100644
index d34a69999..000000000
--- a/test/eigen2/eigen2_nomalloc.cpp
+++ /dev/null
@@ -1,53 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// this hack is needed to make this file compiles with -pedantic (gcc)
-#ifdef __GNUC__
-#define throw(X)
-#endif
-// discard stack allocation as that too bypasses malloc
-#define EIGEN_STACK_ALLOCATION_LIMIT 0
-// any heap allocation will raise an assert
-#define EIGEN_NO_MALLOC
-
-#include "main.h"
-
-template<typename MatrixType> void nomalloc(const MatrixType& m)
-{
-  /* this test check no dynamic memory allocation are issued with fixed-size matrices
-  */
-
-  typedef typename MatrixType::Scalar Scalar;
-
-  int rows = m.rows();
-  int cols = m.cols();
-
-  MatrixType m1 = MatrixType::Random(rows, cols),
-             m2 = MatrixType::Random(rows, cols);
-
-  Scalar s1 = ei_random<Scalar>();
-
-  int r = ei_random<int>(0, rows-1),
-      c = ei_random<int>(0, cols-1);
-
-  VERIFY_IS_APPROX((m1+m2)*s1,              s1*m1+s1*m2);
-  VERIFY_IS_APPROX((m1+m2)(r,c), (m1(r,c))+(m2(r,c)));
-  VERIFY_IS_APPROX(m1.cwise() * m1.block(0,0,rows,cols), m1.cwise() * m1);
-  VERIFY_IS_APPROX((m1*m1.transpose())*m2,  m1*(m1.transpose()*m2));
-}
-
-void test_eigen2_nomalloc()
-{
-  // check that our operator new is indeed called:
-  VERIFY_RAISES_ASSERT(MatrixXd dummy = MatrixXd::Random(3,3));
-  CALL_SUBTEST_1( nomalloc(Matrix<float, 1, 1>()) );
-  CALL_SUBTEST_2( nomalloc(Matrix4d()) );
-  CALL_SUBTEST_3( nomalloc(Matrix<float,32,32>()) );
-}
diff --git a/test/eigen2/eigen2_packetmath.cpp b/test/eigen2/eigen2_packetmath.cpp
deleted file mode 100644
index b1f325fe7..000000000
--- a/test/eigen2/eigen2_packetmath.cpp
+++ /dev/null
@@ -1,132 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#include "main.h"
-
-// using namespace Eigen;
-
-template<typename Scalar> bool areApprox(const Scalar* a, const Scalar* b, int size)
-{
-  for (int i=0; i<size; ++i)
-    if (!ei_isApprox(a[i],b[i])) return false;
-  return true;
-}
-
-#define CHECK_CWISE(REFOP, POP) { \
-  for (int i=0; i<PacketSize; ++i) \
-    ref[i] = REFOP(data1[i], data1[i+PacketSize]); \
-  ei_pstore(data2, POP(ei_pload(data1), ei_pload(data1+PacketSize))); \
-  VERIFY(areApprox(ref, data2, PacketSize) && #POP); \
-}
-
-#define REF_ADD(a,b) ((a)+(b))
-#define REF_SUB(a,b) ((a)-(b))
-#define REF_MUL(a,b) ((a)*(b))
-#define REF_DIV(a,b) ((a)/(b))
-
-namespace std {
-
-template<> const complex<float>& min(const complex<float>& a, const complex<float>& b)
-{ return a.real() < b.real() ? a : b; }
-
-template<> const complex<float>& max(const complex<float>& a, const complex<float>& b)
-{ return a.real() < b.real() ? b : a; }
-
-}
-
-template<typename Scalar> void packetmath()
-{
-  typedef typename ei_packet_traits<Scalar>::type Packet;
-  const int PacketSize = ei_packet_traits<Scalar>::size;
-
-  const int size = PacketSize*4;
-  EIGEN_ALIGN_128 Scalar data1[ei_packet_traits<Scalar>::size*4];
-  EIGEN_ALIGN_128 Scalar data2[ei_packet_traits<Scalar>::size*4];
-  EIGEN_ALIGN_128 Packet packets[PacketSize*2];
-  EIGEN_ALIGN_128 Scalar ref[ei_packet_traits<Scalar>::size*4];
-  for (int i=0; i<size; ++i)
-  {
-    data1[i] = ei_random<Scalar>();
-    data2[i] = ei_random<Scalar>();
-  }
-
-  ei_pstore(data2, ei_pload(data1));
-  VERIFY(areApprox(data1, data2, PacketSize) && "aligned load/store");
-
-  for (int offset=0; offset<PacketSize; ++offset)
-  {
-    ei_pstore(data2, ei_ploadu(data1+offset));
-    VERIFY(areApprox(data1+offset, data2, PacketSize) && "ei_ploadu");
-  }
-
-  for (int offset=0; offset<PacketSize; ++offset)
-  {
-    ei_pstoreu(data2+offset, ei_pload(data1));
-    VERIFY(areApprox(data1, data2+offset, PacketSize) && "ei_pstoreu");
-  }
-
-  for (int offset=0; offset<PacketSize; ++offset)
-  {
-    packets[0] = ei_pload(data1);
-    packets[1] = ei_pload(data1+PacketSize);
-         if (offset==0) ei_palign<0>(packets[0], packets[1]);
-    else if (offset==1) ei_palign<1>(packets[0], packets[1]);
-    else if (offset==2) ei_palign<2>(packets[0], packets[1]);
-    else if (offset==3) ei_palign<3>(packets[0], packets[1]);
-    ei_pstore(data2, packets[0]);
-
-    for (int i=0; i<PacketSize; ++i)
-      ref[i] = data1[i+offset];
-
-    typedef Matrix<Scalar, PacketSize, 1> Vector;
-    VERIFY(areApprox(ref, data2, PacketSize) && "ei_palign");
-  }
-
-  CHECK_CWISE(REF_ADD,  ei_padd);
-  CHECK_CWISE(REF_SUB,  ei_psub);
-  CHECK_CWISE(REF_MUL,  ei_pmul);
-  #ifndef EIGEN_VECTORIZE_ALTIVEC
-  if (!ei_is_same_type<Scalar,int>::ret)
-    CHECK_CWISE(REF_DIV,  ei_pdiv);
-  #endif
-  CHECK_CWISE(std::min, ei_pmin);
-  CHECK_CWISE(std::max, ei_pmax);
-
-  for (int i=0; i<PacketSize; ++i)
-    ref[i] = data1[0];
-  ei_pstore(data2, ei_pset1(data1[0]));
-  VERIFY(areApprox(ref, data2, PacketSize) && "ei_pset1");
-
-  VERIFY(ei_isApprox(data1[0], ei_pfirst(ei_pload(data1))) && "ei_pfirst");
-
-  ref[0] = 0;
-  for (int i=0; i<PacketSize; ++i)
-    ref[0] += data1[i];
-  VERIFY(ei_isApprox(ref[0], ei_predux(ei_pload(data1))) && "ei_predux");
-
-  for (int j=0; j<PacketSize; ++j)
-  {
-    ref[j] = 0;
-    for (int i=0; i<PacketSize; ++i)
-      ref[j] += data1[i+j*PacketSize];
-    packets[j] = ei_pload(data1+j*PacketSize);
-  }
-  ei_pstore(data2, ei_preduxp(packets));
-  VERIFY(areApprox(ref, data2, PacketSize) && "ei_preduxp");
-}
-
-void test_eigen2_packetmath()
-{
-  for(int i = 0; i < g_repeat; i++) {
-    CALL_SUBTEST_1( packetmath<float>() );
-    CALL_SUBTEST_2( packetmath<double>() );
-    CALL_SUBTEST_3( packetmath<int>() );
-    CALL_SUBTEST_4( packetmath<std::complex<float> >() );
-  }
-}
diff --git a/test/eigen2/eigen2_parametrizedline.cpp b/test/eigen2/eigen2_parametrizedline.cpp
deleted file mode 100644
index 814728870..000000000
--- a/test/eigen2/eigen2_parametrizedline.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#include "main.h"
-#include <Eigen/Geometry>
-#include <Eigen/LU>
-#include <Eigen/QR>
-
-template<typename LineType> void parametrizedline(const LineType& _line)
-{
-  /* this test covers the following files:
-     ParametrizedLine.h
-  */
-
-  const int dim = _line.dim();
-  typedef typename LineType::Scalar Scalar;
-  typedef typename NumTraits<Scalar>::Real RealScalar;
-  typedef Matrix<Scalar, LineType::AmbientDimAtCompileTime, 1> VectorType;
-  typedef Matrix<Scalar, LineType::AmbientDimAtCompileTime,
-                         LineType::AmbientDimAtCompileTime> MatrixType;
-
-  VectorType p0 = VectorType::Random(dim);
-  VectorType p1 = VectorType::Random(dim);
-
-  VectorType d0 = VectorType::Random(dim).normalized();
-
-  LineType l0(p0, d0);
-
-  Scalar s0 = ei_random<Scalar>();
-  Scalar s1 = ei_abs(ei_random<Scalar>());
-
-  VERIFY_IS_MUCH_SMALLER_THAN( l0.distance(p0), RealScalar(1) );
-  VERIFY_IS_MUCH_SMALLER_THAN( l0.distance(p0+s0*d0), RealScalar(1) );
-  VERIFY_IS_APPROX( (l0.projection(p1)-p1).norm(), l0.distance(p1) );
-  VERIFY_IS_MUCH_SMALLER_THAN( l0.distance(l0.projection(p1)), RealScalar(1) );
-  VERIFY_IS_APPROX( Scalar(l0.distance((p0+s0*d0) + d0.unitOrthogonal() * s1)), s1 );
-
-  // casting
-  const int Dim = LineType::AmbientDimAtCompileTime;
-  typedef typename GetDifferentType<Scalar>::type OtherScalar;
-  ParametrizedLine<OtherScalar,Dim> hp1f = l0.template cast<OtherScalar>();
-  VERIFY_IS_APPROX(hp1f.template cast<Scalar>(),l0);
-  ParametrizedLine<Scalar,Dim> hp1d = l0.template cast<Scalar>();
-  VERIFY_IS_APPROX(hp1d.template cast<Scalar>(),l0);
-}
-
-void test_eigen2_parametrizedline()
-{
-  for(int i = 0; i < g_repeat; i++) {
-    CALL_SUBTEST_1( parametrizedline(ParametrizedLine<float,2>()) );
-    CALL_SUBTEST_2( parametrizedline(ParametrizedLine<float,3>()) );
-    CALL_SUBTEST_3( parametrizedline(ParametrizedLine<double,4>()) );
-    CALL_SUBTEST_4( parametrizedline(ParametrizedLine<std::complex<double>,5>()) );
-  }
-}
diff --git a/test/eigen2/eigen2_prec_inverse_4x4.cpp b/test/eigen2/eigen2_prec_inverse_4x4.cpp
deleted file mode 100644
index 8bfa55694..000000000
--- a/test/eigen2/eigen2_prec_inverse_4x4.cpp
+++ /dev/null
@@ -1,84 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#include "main.h"
-#include <Eigen/LU>
-#include <algorithm>
-
-template<typename T> std::string type_name() { return "other"; }
-template<> std::string type_name<float>() { return "float"; }
-template<> std::string type_name<double>() { return "double"; }
-template<> std::string type_name<int>() { return "int"; }
-template<> std::string type_name<std::complex<float> >() { return "complex<float>"; }
-template<> std::string type_name<std::complex<double> >() { return "complex<double>"; }
-template<> std::string type_name<std::complex<int> >() { return "complex<int>"; }
-
-#define EIGEN_DEBUG_VAR(x) std::cerr << #x << " = " << x << std::endl;
-
-template<typename T> inline typename NumTraits<T>::Real epsilon()
-{
- return std::numeric_limits<typename NumTraits<T>::Real>::epsilon();
-}
-
-template<typename MatrixType> void inverse_permutation_4x4()
-{
-  typedef typename MatrixType::Scalar Scalar;
-  typedef typename MatrixType::RealScalar RealScalar;
-  Vector4i indices(0,1,2,3);
-  for(int i = 0; i < 24; ++i)
-  {
-    MatrixType m = MatrixType::Zero();
-    m(indices(0),0) = 1;
-    m(indices(1),1) = 1;
-    m(indices(2),2) = 1;
-    m(indices(3),3) = 1;
-    MatrixType inv = m.inverse();
-    double error = double( (m*inv-MatrixType::Identity()).norm() / epsilon<Scalar>() );
-    VERIFY(error == 0.0);
-    std::next_permutation(indices.data(),indices.data()+4);
-  }
-}
-
-template<typename MatrixType> void inverse_general_4x4(int repeat)
-{
-  typedef typename MatrixType::Scalar Scalar;
-  typedef typename MatrixType::RealScalar RealScalar;
-  double error_sum = 0., error_max = 0.;
-  for(int i = 0; i < repeat; ++i)
-  {
-    MatrixType m;
-    RealScalar absdet;
-    do {
-      m = MatrixType::Random();
-      absdet = ei_abs(m.determinant());
-    } while(absdet < 10 * epsilon<Scalar>());
-    MatrixType inv = m.inverse();
-    double error = double( (m*inv-MatrixType::Identity()).norm() * absdet / epsilon<Scalar>() );
-    error_sum += error;
-    error_max = std::max(error_max, error);
-  }
-  std::cerr << "inverse_general_4x4, Scalar = " << type_name<Scalar>() << std::endl;
-  double error_avg = error_sum / repeat;
-  EIGEN_DEBUG_VAR(error_avg);
-  EIGEN_DEBUG_VAR(error_max);
-  VERIFY(error_avg < (NumTraits<Scalar>::IsComplex ? 8.0 : 1.25));
-  VERIFY(error_max < (NumTraits<Scalar>::IsComplex ? 64.0 : 20.0));
-}
-
-void test_eigen2_prec_inverse_4x4()
-{
-  CALL_SUBTEST_1((inverse_permutation_4x4<Matrix4f>()));
-  CALL_SUBTEST_1(( inverse_general_4x4<Matrix4f>(200000 * g_repeat) ));
-
-  CALL_SUBTEST_2((inverse_permutation_4x4<Matrix<double,4,4,RowMajor> >()));
-  CALL_SUBTEST_2(( inverse_general_4x4<Matrix<double,4,4,RowMajor> >(200000 * g_repeat) ));
-
-  CALL_SUBTEST_3((inverse_permutation_4x4<Matrix4cf>()));
-  CALL_SUBTEST_3((inverse_general_4x4<Matrix4cf>(50000 * g_repeat)));
-}
diff --git a/test/eigen2/eigen2_product_large.cpp b/test/eigen2/eigen2_product_large.cpp
deleted file mode 100644
index 5149ef748..000000000
--- a/test/eigen2/eigen2_product_large.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#include "product.h"
-
-void test_eigen2_product_large()
-{
-  for(int i = 0; i < g_repeat; i++) {
-    CALL_SUBTEST_1( product(MatrixXf(ei_random<int>(1,320), ei_random<int>(1,320))) );
-    CALL_SUBTEST_2( product(MatrixXd(ei_random<int>(1,320), ei_random<int>(1,320))) );
-    CALL_SUBTEST_3( product(MatrixXi(ei_random<int>(1,320), ei_random<int>(1,320))) );
-    CALL_SUBTEST_4( product(MatrixXcf(ei_random<int>(1,50), ei_random<int>(1,50))) );
-    CALL_SUBTEST_5( product(Matrix<float,Dynamic,Dynamic,RowMajor>(ei_random<int>(1,320), ei_random<int>(1,320))) );
-  }
-
-#ifdef EIGEN_TEST_PART_6
-  {
-    // test a specific issue in DiagonalProduct
-    int N = 1000000;
-    VectorXf v = VectorXf::Ones(N);
-    MatrixXf m = MatrixXf::Ones(N,3);
-    m = (v+v).asDiagonal() * m;
-    VERIFY_IS_APPROX(m, MatrixXf::Constant(N,3,2));
-  }
-
-  {
-    // test deferred resizing in Matrix::operator=
-    MatrixXf a = MatrixXf::Random(10,4), b = MatrixXf::Random(4,10), c = a;
-    VERIFY_IS_APPROX((a = a * b), (c * b).eval());
-  }
-
-  {
-    MatrixXf mat1(10,10); mat1.setRandom();
-    MatrixXf mat2(32,10); mat2.setRandom();
-    MatrixXf result = mat1.row(2)*mat2.transpose();
-    VERIFY_IS_APPROX(result, (mat1.row(2)*mat2.transpose()).eval());
-  }
-#endif
-}
diff --git a/test/eigen2/eigen2_product_small.cpp b/test/eigen2/eigen2_product_small.cpp
deleted file mode 100644
index 4cd8c102f..000000000
--- a/test/eigen2/eigen2_product_small.cpp
+++ /dev/null
@@ -1,22 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#define EIGEN_NO_STATIC_ASSERT
-#include "product.h"
-
-void test_eigen2_product_small()
-{
-  for(int i = 0; i < g_repeat; i++) {
-    CALL_SUBTEST_1( product(Matrix<float, 3, 2>()) );
-    CALL_SUBTEST_2( product(Matrix<int, 3, 5>()) );
-    CALL_SUBTEST_3( product(Matrix3d()) );
-    CALL_SUBTEST_4( product(Matrix4d()) );
-    CALL_SUBTEST_5( product(Matrix4f()) );
-  }
-}
diff --git a/test/eigen2/eigen2_qr.cpp b/test/eigen2/eigen2_qr.cpp
deleted file mode 100644
index 76977e4c1..000000000
--- a/test/eigen2/eigen2_qr.cpp
+++ /dev/null
@@ -1,69 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#include "main.h"
-#include <Eigen/QR>
-
-template<typename MatrixType> void qr(const MatrixType& m)
-{
-  /* this test covers the following files:
-     QR.h
-  */
-  int rows = m.rows();
-  int cols = m.cols();
-
-  typedef typename MatrixType::Scalar Scalar;
-  typedef Matrix<Scalar, MatrixType::ColsAtCompileTime, MatrixType::ColsAtCompileTime> SquareMatrixType;
-  typedef Matrix<Scalar, MatrixType::ColsAtCompileTime, 1> VectorType;
-
-  MatrixType a = MatrixType::Random(rows,cols);
-  QR<MatrixType> qrOfA(a);
-  VERIFY_IS_APPROX(a, qrOfA.matrixQ() * qrOfA.matrixR());
-  VERIFY_IS_NOT_APPROX(a+MatrixType::Identity(rows, cols), qrOfA.matrixQ() * qrOfA.matrixR());
-
-  #if 0 // eigenvalues module not yet ready
-  SquareMatrixType b = a.adjoint() * a;
-
-  // check tridiagonalization
-  Tridiagonalization<SquareMatrixType> tridiag(b);
-  VERIFY_IS_APPROX(b, tridiag.matrixQ() * tridiag.matrixT() * tridiag.matrixQ().adjoint());
-
-  // check hessenberg decomposition
-  HessenbergDecomposition<SquareMatrixType> hess(b);
-  VERIFY_IS_APPROX(b, hess.matrixQ() * hess.matrixH() * hess.matrixQ().adjoint());
-  VERIFY_IS_APPROX(tridiag.matrixT(), hess.matrixH());
-  b = SquareMatrixType::Random(cols,cols);
-  hess.compute(b);
-  VERIFY_IS_APPROX(b, hess.matrixQ() * hess.matrixH() * hess.matrixQ().adjoint());
-  #endif
-}
-
-void test_eigen2_qr()
-{
-  for(int i = 0; i < 1; i++) {
-    CALL_SUBTEST_1( qr(Matrix2f()) );
-    CALL_SUBTEST_2( qr(Matrix4d()) );
-    CALL_SUBTEST_3( qr(MatrixXf(12,8)) );
-    CALL_SUBTEST_4( qr(MatrixXcd(5,5)) );
-    CALL_SUBTEST_4( qr(MatrixXcd(7,3)) );
-  }
-
-#ifdef EIGEN_TEST_PART_5
-  // small isFullRank test
-  {
-    Matrix3d mat;
-    mat << 1, 45, 1, 2, 2, 2, 1, 2, 3;
-    VERIFY(mat.qr().isFullRank());
-    mat << 1, 1, 1, 2, 2, 2, 1, 2, 3;
-    //always returns true in eigen2support
-    //VERIFY(!mat.qr().isFullRank());
-  }
-
-#endif
-}
diff --git a/test/eigen2/eigen2_qtvector.cpp b/test/eigen2/eigen2_qtvector.cpp
deleted file mode 100644
index 6cfb58a26..000000000
--- a/test/eigen2/eigen2_qtvector.cpp
+++ /dev/null
@@ -1,158 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#define EIGEN_WORK_AROUND_QT_BUG_CALLING_WRONG_OPERATOR_NEW_FIXED_IN_QT_4_5
-
-#include "main.h"
-
-#include <Eigen/Geometry>
-#include <Eigen/QtAlignedMalloc>
-
-#include <QtCore/QVector>
-
-template<typename MatrixType>
-void check_qtvector_matrix(const MatrixType& m)
-{
-  int rows = m.rows();
-  int cols = m.cols();
-  MatrixType x = MatrixType::Random(rows,cols), y = MatrixType::Random(rows,cols);
-  QVector<MatrixType> v(10, MatrixType(rows,cols)), w(20, y);
-  for(int i = 0; i < 20; i++)
-  {
-    VERIFY_IS_APPROX(w[i], y);
-  }
-  v[5] = x;
-  w[6] = v[5];
-  VERIFY_IS_APPROX(w[6], v[5]);
-  v = w;
-  for(int i = 0; i < 20; i++)
-  {
-    VERIFY_IS_APPROX(w[i], v[i]);
-  }
-
-  v.resize(21);
-  v[20] = x;
-  VERIFY_IS_APPROX(v[20], x);
-  v.fill(y,22);
-  VERIFY_IS_APPROX(v[21], y);
-  v.push_back(x);
-  VERIFY_IS_APPROX(v[22], x);
-  VERIFY((size_t)&(v[22]) == (size_t)&(v[21]) + sizeof(MatrixType));
-
-  // do a lot of push_back such that the vector gets internally resized
-  // (with memory reallocation)
-  MatrixType* ref = &w[0];
-  for(int i=0; i<30 || ((ref==&w[0]) && i<300); ++i)
-    v.push_back(w[i%w.size()]);
-  for(int i=23; i<v.size(); ++i)
-  {
-    VERIFY(v[i]==w[(i-23)%w.size()]);
-  }
-}
-
-template<typename TransformType>
-void check_qtvector_transform(const TransformType&)
-{
-  typedef typename TransformType::MatrixType MatrixType;
-  TransformType x(MatrixType::Random()), y(MatrixType::Random());
-  QVector<TransformType> v(10), w(20, y);
-  v[5] = x;
-  w[6] = v[5];
-  VERIFY_IS_APPROX(w[6], v[5]);
-  v = w;
-  for(int i = 0; i < 20; i++)
-  {
-    VERIFY_IS_APPROX(w[i], v[i]);
-  }
-
-  v.resize(21);
-  v[20] = x;
-  VERIFY_IS_APPROX(v[20], x);
-  v.fill(y,22);
-  VERIFY_IS_APPROX(v[21], y);
-  v.push_back(x);
-  VERIFY_IS_APPROX(v[22], x);
-  VERIFY((size_t)&(v[22]) == (size_t)&(v[21]) + sizeof(TransformType));
-
-  // do a lot of push_back such that the vector gets internally resized
-  // (with memory reallocation)
-  TransformType* ref = &w[0];
-  for(int i=0; i<30 || ((ref==&w[0]) && i<300); ++i)
-    v.push_back(w[i%w.size()]);
-  for(unsigned int i=23; int(i)<v.size(); ++i)
-  {
-    VERIFY(v[i].matrix()==w[(i-23)%w.size()].matrix());
-  }
-}
-
-template<typename QuaternionType>
-void check_qtvector_quaternion(const QuaternionType&)
-{
-  typedef typename QuaternionType::Coefficients Coefficients;
-  QuaternionType x(Coefficients::Random()), y(Coefficients::Random());
-  QVector<QuaternionType> v(10), w(20, y);
-  v[5] = x;
-  w[6] = v[5];
-  VERIFY_IS_APPROX(w[6], v[5]);
-  v = w;
-  for(int i = 0; i < 20; i++)
-  {
-    VERIFY_IS_APPROX(w[i], v[i]);
-  }
-
-  v.resize(21);
-  v[20] = x;
-  VERIFY_IS_APPROX(v[20], x);
-  v.fill(y,22);
-  VERIFY_IS_APPROX(v[21], y);
-  v.push_back(x);
-  VERIFY_IS_APPROX(v[22], x);
-  VERIFY((size_t)&(v[22]) == (size_t)&(v[21]) + sizeof(QuaternionType));
-
-  // do a lot of push_back such that the vector gets internally resized
-  // (with memory reallocation)
-  QuaternionType* ref = &w[0];
-  for(int i=0; i<30 || ((ref==&w[0]) && i<300); ++i)
-    v.push_back(w[i%w.size()]);
-  for(unsigned int i=23; int(i)<v.size(); ++i)
-  {
-    VERIFY(v[i].coeffs()==w[(i-23)%w.size()].coeffs());
-  }
-}
-
-void test_eigen2_qtvector()
-{
-  // some non vectorizable fixed sizes
-  CALL_SUBTEST_1(check_qtvector_matrix(Vector2f()));
-  CALL_SUBTEST_1(check_qtvector_matrix(Matrix3f()));
-  CALL_SUBTEST_1(check_qtvector_matrix(Matrix3d()));
-
-  // some vectorizable fixed sizes
-  CALL_SUBTEST_2(check_qtvector_matrix(Matrix2f()));
-  CALL_SUBTEST_2(check_qtvector_matrix(Vector4f()));
-  CALL_SUBTEST_2(check_qtvector_matrix(Matrix4f()));
-  CALL_SUBTEST_2(check_qtvector_matrix(Matrix4d()));
-
-  // some dynamic sizes
-  CALL_SUBTEST_3(check_qtvector_matrix(MatrixXd(1,1)));
-  CALL_SUBTEST_3(check_qtvector_matrix(VectorXd(20)));
-  CALL_SUBTEST_3(check_qtvector_matrix(RowVectorXf(20)));
-  CALL_SUBTEST_3(check_qtvector_matrix(MatrixXcf(10,10)));
-
-  // some Transform
-  CALL_SUBTEST_4(check_qtvector_transform(Transform2f()));
-  CALL_SUBTEST_4(check_qtvector_transform(Transform3f()));
-  CALL_SUBTEST_4(check_qtvector_transform(Transform3d()));
-  //CALL_SUBTEST_4(check_qtvector_transform(Transform4d()));
-
-  // some Quaternion
-  CALL_SUBTEST_5(check_qtvector_quaternion(Quaternionf()));
-  CALL_SUBTEST_5(check_qtvector_quaternion(Quaternionf()));
-}
diff --git a/test/eigen2/eigen2_regression.cpp b/test/eigen2/eigen2_regression.cpp
deleted file mode 100644
index c68b58da8..000000000
--- a/test/eigen2/eigen2_regression.cpp
+++ /dev/null
@@ -1,136 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#include "main.h"
-#include <Eigen/LeastSquares>
-
-template<typename VectorType,
-         typename HyperplaneType>
-void makeNoisyCohyperplanarPoints(int numPoints,
-                                  VectorType **points,
-                                  HyperplaneType *hyperplane,
-                                  typename VectorType::Scalar noiseAmplitude)
-{
-  typedef typename VectorType::Scalar Scalar;
-  const int size = points[0]->size();
-  // pick a random hyperplane, store the coefficients of its equation
-  hyperplane->coeffs().resize(size + 1);
-  for(int j = 0; j < size + 1; j++)
-  {
-    do {
-      hyperplane->coeffs().coeffRef(j) = ei_random<Scalar>();
-    } while(ei_abs(hyperplane->coeffs().coeff(j)) < 0.5);
-  }
-
-  // now pick numPoints random points on this hyperplane
-  for(int i = 0; i < numPoints; i++)
-  {
-    VectorType& cur_point = *(points[i]);
-    do
-    {
-      cur_point = VectorType::Random(size)/*.normalized()*/;
-      // project cur_point onto the hyperplane
-      Scalar x = - (hyperplane->coeffs().start(size).cwise()*cur_point).sum();
-      cur_point *= hyperplane->coeffs().coeff(size) / x;
-    } while( cur_point.norm() < 0.5
-          || cur_point.norm() > 2.0 );
-  }
-
-  // add some noise to these points
-  for(int i = 0; i < numPoints; i++ )
-    *(points[i]) += noiseAmplitude * VectorType::Random(size);
-}
-
-template<typename VectorType>
-void check_linearRegression(int numPoints,
-                            VectorType **points,
-                            const VectorType& original,
-                            typename VectorType::Scalar tolerance)
-{
-  int size = points[0]->size();
-  assert(size==2);
-  VectorType result(size);
-  linearRegression(numPoints, points, &result, 1);
-  typename VectorType::Scalar error = (result - original).norm() / original.norm();
-  VERIFY(ei_abs(error) < ei_abs(tolerance));
-}
-
-template<typename VectorType,
-         typename HyperplaneType>
-void check_fitHyperplane(int numPoints,
-                         VectorType **points,
-                         const HyperplaneType& original,
-                         typename VectorType::Scalar tolerance)
-{
-  int size = points[0]->size();
-  HyperplaneType result(size);
-  fitHyperplane(numPoints, points, &result);
-  result.coeffs() *= original.coeffs().coeff(size)/result.coeffs().coeff(size);
-  typename VectorType::Scalar error = (result.coeffs() - original.coeffs()).norm() / original.coeffs().norm();
-  std::cout << ei_abs(error) << "  xxx   " << ei_abs(tolerance) << std::endl;
-  VERIFY(ei_abs(error) < ei_abs(tolerance));
-}
-
-void test_eigen2_regression()
-{
-  for(int i = 0; i < g_repeat; i++)
-  {
-#ifdef EIGEN_TEST_PART_1
-    {
-      Vector2f points2f [1000];
-      Vector2f *points2f_ptrs [1000];
-      for(int i = 0; i < 1000; i++) points2f_ptrs[i] = &(points2f[i]);
-      Vector2f coeffs2f;
-      Hyperplane<float,2> coeffs3f;
-      makeNoisyCohyperplanarPoints(1000, points2f_ptrs, &coeffs3f, 0.01f);
-      coeffs2f[0] = -coeffs3f.coeffs()[0]/coeffs3f.coeffs()[1];
-      coeffs2f[1] = -coeffs3f.coeffs()[2]/coeffs3f.coeffs()[1];
-      CALL_SUBTEST(check_linearRegression(10, points2f_ptrs, coeffs2f, 0.05f));
-      CALL_SUBTEST(check_linearRegression(100, points2f_ptrs, coeffs2f, 0.01f));
-      CALL_SUBTEST(check_linearRegression(1000, points2f_ptrs, coeffs2f, 0.002f));
-    }
-#endif
-#ifdef EIGEN_TEST_PART_2
-    {
-      Vector2f points2f [1000];
-      Vector2f *points2f_ptrs [1000];
-      for(int i = 0; i < 1000; i++) points2f_ptrs[i] = &(points2f[i]);
-      Hyperplane<float,2> coeffs3f;
-      makeNoisyCohyperplanarPoints(1000, points2f_ptrs, &coeffs3f, 0.01f);
-      CALL_SUBTEST(check_fitHyperplane(10, points2f_ptrs, coeffs3f, 0.05f));
-      CALL_SUBTEST(check_fitHyperplane(100, points2f_ptrs, coeffs3f, 0.01f));
-      CALL_SUBTEST(check_fitHyperplane(1000, points2f_ptrs, coeffs3f, 0.002f));
-    }
-#endif
-#ifdef EIGEN_TEST_PART_3
-    {
-      Vector4d points4d [1000];
-      Vector4d *points4d_ptrs [1000];
-      for(int i = 0; i < 1000; i++) points4d_ptrs[i] = &(points4d[i]);
-      Hyperplane<double,4> coeffs5d;
-      makeNoisyCohyperplanarPoints(1000, points4d_ptrs, &coeffs5d, 0.01);
-      CALL_SUBTEST(check_fitHyperplane(10, points4d_ptrs, coeffs5d, 0.05));
-      CALL_SUBTEST(check_fitHyperplane(100, points4d_ptrs, coeffs5d, 0.01));
-      CALL_SUBTEST(check_fitHyperplane(1000, points4d_ptrs, coeffs5d, 0.002));
-    }
-#endif
-#ifdef EIGEN_TEST_PART_4
-    {
-      VectorXcd *points11cd_ptrs[1000];
-      for(int i = 0; i < 1000; i++) points11cd_ptrs[i] = new VectorXcd(11);
-      Hyperplane<std::complex<double>,Dynamic> *coeffs12cd = new Hyperplane<std::complex<double>,Dynamic>(11);
-      makeNoisyCohyperplanarPoints(1000, points11cd_ptrs, coeffs12cd, 0.01);
-      CALL_SUBTEST(check_fitHyperplane(100, points11cd_ptrs, *coeffs12cd, 0.025));
-      CALL_SUBTEST(check_fitHyperplane(1000, points11cd_ptrs, *coeffs12cd, 0.006));
-      delete coeffs12cd;
-      for(int i = 0; i < 1000; i++) delete points11cd_ptrs[i];
-    }
-#endif
-  }
-}
diff --git a/test/eigen2/eigen2_sizeof.cpp b/test/eigen2/eigen2_sizeof.cpp
deleted file mode 100644
index ec1af5a06..000000000
--- a/test/eigen2/eigen2_sizeof.cpp
+++ /dev/null
@@ -1,31 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#include "main.h"
-
-template<typename MatrixType> void verifySizeOf(const MatrixType&)
-{
-  typedef typename MatrixType::Scalar Scalar;
-  if (MatrixType::RowsAtCompileTime!=Dynamic && MatrixType::ColsAtCompileTime!=Dynamic)
-    VERIFY(sizeof(MatrixType)==sizeof(Scalar)*MatrixType::SizeAtCompileTime);
-  else
-    VERIFY(sizeof(MatrixType)==sizeof(Scalar*) + 2 * sizeof(typename MatrixType::Index));
-}
-
-void test_eigen2_sizeof()
-{
-  CALL_SUBTEST( verifySizeOf(Matrix<float, 1, 1>()) );
-  CALL_SUBTEST( verifySizeOf(Matrix4d()) );
-  CALL_SUBTEST( verifySizeOf(Matrix<double, 4, 2>()) );
-  CALL_SUBTEST( verifySizeOf(Matrix<bool, 7, 5>()) );
-  CALL_SUBTEST( verifySizeOf(MatrixXcf(3, 3)) );
-  CALL_SUBTEST( verifySizeOf(MatrixXi(8, 12)) );
-  CALL_SUBTEST( verifySizeOf(MatrixXcd(20, 20)) );
-  CALL_SUBTEST( verifySizeOf(Matrix<float, 100, 100>()) );
-}
diff --git a/test/eigen2/eigen2_smallvectors.cpp b/test/eigen2/eigen2_smallvectors.cpp
deleted file mode 100644
index 03962b17d..000000000
--- a/test/eigen2/eigen2_smallvectors.cpp
+++ /dev/null
@@ -1,42 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#include "main.h"
-
-template<typename Scalar> void smallVectors()
-{
-  typedef Matrix<Scalar, 1, 2> V2;
-  typedef Matrix<Scalar, 3, 1> V3;
-  typedef Matrix<Scalar, 1, 4> V4;
-  Scalar x1 = ei_random<Scalar>(),
-         x2 = ei_random<Scalar>(),
-         x3 = ei_random<Scalar>(),
-         x4 = ei_random<Scalar>();
-  V2 v2(x1, x2);
-  V3 v3(x1, x2, x3);
-  V4 v4(x1, x2, x3, x4);
-  VERIFY_IS_APPROX(x1, v2.x());
-  VERIFY_IS_APPROX(x1, v3.x());
-  VERIFY_IS_APPROX(x1, v4.x());
-  VERIFY_IS_APPROX(x2, v2.y());
-  VERIFY_IS_APPROX(x2, v3.y());
-  VERIFY_IS_APPROX(x2, v4.y());
-  VERIFY_IS_APPROX(x3, v3.z());
-  VERIFY_IS_APPROX(x3, v4.z());
-  VERIFY_IS_APPROX(x4, v4.w());
-}
-
-void test_eigen2_smallvectors()
-{
-  for(int i = 0; i < g_repeat; i++) {
-    CALL_SUBTEST( smallVectors<int>() );
-    CALL_SUBTEST( smallVectors<float>() );
-    CALL_SUBTEST( smallVectors<double>() );
-  }
-}
diff --git a/test/eigen2/eigen2_sparse_basic.cpp b/test/eigen2/eigen2_sparse_basic.cpp
deleted file mode 100644
index 049077670..000000000
--- a/test/eigen2/eigen2_sparse_basic.cpp
+++ /dev/null
@@ -1,317 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2008 Daniel Gomez Ferro <dgomezferro@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#include "sparse.h"
-
-template<typename SetterType,typename DenseType, typename Scalar, int Options>
-bool test_random_setter(SparseMatrix<Scalar,Options>& sm, const DenseType& ref, const std::vector<Vector2i>& nonzeroCoords)
-{
-  typedef SparseMatrix<Scalar,Options> SparseType;
-  {
-    sm.setZero();
-    SetterType w(sm);
-    std::vector<Vector2i> remaining = nonzeroCoords;
-    while(!remaining.empty())
-    {
-      int i = ei_random<int>(0,remaining.size()-1);
-      w(remaining[i].x(),remaining[i].y()) = ref.coeff(remaining[i].x(),remaining[i].y());
-      remaining[i] = remaining.back();
-      remaining.pop_back();
-    }
-  }
-  return sm.isApprox(ref);
-}
-
-template<typename SetterType,typename DenseType, typename T>
-bool test_random_setter(DynamicSparseMatrix<T>& sm, const DenseType& ref, const std::vector<Vector2i>& nonzeroCoords)
-{
-  sm.setZero();
-  std::vector<Vector2i> remaining = nonzeroCoords;
-  while(!remaining.empty())
-  {
-    int i = ei_random<int>(0,remaining.size()-1);
-    sm.coeffRef(remaining[i].x(),remaining[i].y()) = ref.coeff(remaining[i].x(),remaining[i].y());
-    remaining[i] = remaining.back();
-    remaining.pop_back();
-  }
-  return sm.isApprox(ref);
-}
-
-template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& ref)
-{
-  const int rows = ref.rows();
-  const int cols = ref.cols();
-  typedef typename SparseMatrixType::Scalar Scalar;
-  enum { Flags = SparseMatrixType::Flags };
-  
-  double density = std::max(8./(rows*cols), 0.01);
-  typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
-  typedef Matrix<Scalar,Dynamic,1> DenseVector;
-  Scalar eps = 1e-6;
-
-  SparseMatrixType m(rows, cols);
-  DenseMatrix refMat = DenseMatrix::Zero(rows, cols);
-  DenseVector vec1 = DenseVector::Random(rows);
-  Scalar s1 = ei_random<Scalar>();
-
-  std::vector<Vector2i> zeroCoords;
-  std::vector<Vector2i> nonzeroCoords;
-  initSparse<Scalar>(density, refMat, m, 0, &zeroCoords, &nonzeroCoords);
-  
-  if (zeroCoords.size()==0 || nonzeroCoords.size()==0)
-    return;
-
-  // test coeff and coeffRef
-  for (int i=0; i<(int)zeroCoords.size(); ++i)
-  {
-    VERIFY_IS_MUCH_SMALLER_THAN( m.coeff(zeroCoords[i].x(),zeroCoords[i].y()), eps );
-    if(ei_is_same_type<SparseMatrixType,SparseMatrix<Scalar,Flags> >::ret)
-      VERIFY_RAISES_ASSERT( m.coeffRef(zeroCoords[0].x(),zeroCoords[0].y()) = 5 );
-  }
-  VERIFY_IS_APPROX(m, refMat);
-
-  m.coeffRef(nonzeroCoords[0].x(), nonzeroCoords[0].y()) = Scalar(5);
-  refMat.coeffRef(nonzeroCoords[0].x(), nonzeroCoords[0].y()) = Scalar(5);
-
-  VERIFY_IS_APPROX(m, refMat);
-  /*
-  // test InnerIterators and Block expressions
-  for (int t=0; t<10; ++t)
-  {
-    int j = ei_random<int>(0,cols-1);
-    int i = ei_random<int>(0,rows-1);
-    int w = ei_random<int>(1,cols-j-1);
-    int h = ei_random<int>(1,rows-i-1);
-
-//     VERIFY_IS_APPROX(m.block(i,j,h,w), refMat.block(i,j,h,w));
-    for(int c=0; c<w; c++)
-    {
-      VERIFY_IS_APPROX(m.block(i,j,h,w).col(c), refMat.block(i,j,h,w).col(c));
-      for(int r=0; r<h; r++)
-      {
-//         VERIFY_IS_APPROX(m.block(i,j,h,w).col(c).coeff(r), refMat.block(i,j,h,w).col(c).coeff(r));
-      }
-    }
-//     for(int r=0; r<h; r++)
-//     {
-//       VERIFY_IS_APPROX(m.block(i,j,h,w).row(r), refMat.block(i,j,h,w).row(r));
-//       for(int c=0; c<w; c++)
-//       {
-//         VERIFY_IS_APPROX(m.block(i,j,h,w).row(r).coeff(c), refMat.block(i,j,h,w).row(r).coeff(c));
-//       }
-//     }
-  }
-
-  for(int c=0; c<cols; c++)
-  {
-    VERIFY_IS_APPROX(m.col(c) + m.col(c), (m + m).col(c));
-    VERIFY_IS_APPROX(m.col(c) + m.col(c), refMat.col(c) + refMat.col(c));
-  }
-
-  for(int r=0; r<rows; r++)
-  {
-    VERIFY_IS_APPROX(m.row(r) + m.row(r), (m + m).row(r));
-    VERIFY_IS_APPROX(m.row(r) + m.row(r), refMat.row(r) + refMat.row(r));
-  }
-  */
-
-  // test SparseSetters
-  // coherent setter
-  // TODO extend the MatrixSetter
-//   {
-//     m.setZero();
-//     VERIFY_IS_NOT_APPROX(m, refMat);
-//     SparseSetter<SparseMatrixType, FullyCoherentAccessPattern> w(m);
-//     for (int i=0; i<nonzeroCoords.size(); ++i)
-//     {
-//       w->coeffRef(nonzeroCoords[i].x(),nonzeroCoords[i].y()) = refMat.coeff(nonzeroCoords[i].x(),nonzeroCoords[i].y());
-//     }
-//   }
-//   VERIFY_IS_APPROX(m, refMat);
-
-  // random setter
-//   {
-//     m.setZero();
-//     VERIFY_IS_NOT_APPROX(m, refMat);
-//     SparseSetter<SparseMatrixType, RandomAccessPattern> w(m);
-//     std::vector<Vector2i> remaining = nonzeroCoords;
-//     while(!remaining.empty())
-//     {
-//       int i = ei_random<int>(0,remaining.size()-1);
-//       w->coeffRef(remaining[i].x(),remaining[i].y()) = refMat.coeff(remaining[i].x(),remaining[i].y());
-//       remaining[i] = remaining.back();
-//       remaining.pop_back();
-//     }
-//   }
-//   VERIFY_IS_APPROX(m, refMat);
-
-    VERIFY(( test_random_setter<RandomSetter<SparseMatrixType, StdMapTraits> >(m,refMat,nonzeroCoords) ));
-    #ifdef EIGEN_UNORDERED_MAP_SUPPORT
-    VERIFY(( test_random_setter<RandomSetter<SparseMatrixType, StdUnorderedMapTraits> >(m,refMat,nonzeroCoords) ));
-    #endif
-    #ifdef _DENSE_HASH_MAP_H_
-    VERIFY(( test_random_setter<RandomSetter<SparseMatrixType, GoogleDenseHashMapTraits> >(m,refMat,nonzeroCoords) ));
-    #endif
-    #ifdef _SPARSE_HASH_MAP_H_
-    VERIFY(( test_random_setter<RandomSetter<SparseMatrixType, GoogleSparseHashMapTraits> >(m,refMat,nonzeroCoords) ));
-    #endif
-
-    // test fillrand
-    {
-      DenseMatrix m1(rows,cols);
-      m1.setZero();
-      SparseMatrixType m2(rows,cols);
-      m2.startFill();
-      for (int j=0; j<cols; ++j)
-      {
-        for (int k=0; k<rows/2; ++k)
-        {
-          int i = ei_random<int>(0,rows-1);
-          if (m1.coeff(i,j)==Scalar(0))
-            m2.fillrand(i,j) = m1(i,j) = ei_random<Scalar>();
-        }
-      }
-      m2.endFill();
-      VERIFY_IS_APPROX(m2,m1);
-    }
-  
-  // test RandomSetter
-  /*{
-    SparseMatrixType m1(rows,cols), m2(rows,cols);
-    DenseMatrix refM1 = DenseMatrix::Zero(rows, rows);
-    initSparse<Scalar>(density, refM1, m1);
-    {
-      Eigen::RandomSetter<SparseMatrixType > setter(m2);
-      for (int j=0; j<m1.outerSize(); ++j)
-        for (typename SparseMatrixType::InnerIterator i(m1,j); i; ++i)
-          setter(i.index(), j) = i.value();
-    }
-    VERIFY_IS_APPROX(m1, m2);
-  }*/
-//   std::cerr << m.transpose() << "\n\n"  << refMat.transpose() << "\n\n";
-//   VERIFY_IS_APPROX(m, refMat);
-
-  // test basic computations
-  {
-    DenseMatrix refM1 = DenseMatrix::Zero(rows, rows);
-    DenseMatrix refM2 = DenseMatrix::Zero(rows, rows);
-    DenseMatrix refM3 = DenseMatrix::Zero(rows, rows);
-    DenseMatrix refM4 = DenseMatrix::Zero(rows, rows);
-    SparseMatrixType m1(rows, rows);
-    SparseMatrixType m2(rows, rows);
-    SparseMatrixType m3(rows, rows);
-    SparseMatrixType m4(rows, rows);
-    initSparse<Scalar>(density, refM1, m1);
-    initSparse<Scalar>(density, refM2, m2);
-    initSparse<Scalar>(density, refM3, m3);
-    initSparse<Scalar>(density, refM4, m4);
-
-    VERIFY_IS_APPROX(m1+m2, refM1+refM2);
-    VERIFY_IS_APPROX(m1+m2+m3, refM1+refM2+refM3);
-    VERIFY_IS_APPROX(m3.cwise()*(m1+m2), refM3.cwise()*(refM1+refM2));
-    VERIFY_IS_APPROX(m1*s1-m2, refM1*s1-refM2);
-
-    VERIFY_IS_APPROX(m1*=s1, refM1*=s1);
-    VERIFY_IS_APPROX(m1/=s1, refM1/=s1);
-    
-    VERIFY_IS_APPROX(m1+=m2, refM1+=refM2);
-    VERIFY_IS_APPROX(m1-=m2, refM1-=refM2);
-    
-    VERIFY_IS_APPROX(m1.col(0).eigen2_dot(refM2.row(0)), refM1.col(0).eigen2_dot(refM2.row(0)));
-    
-    refM4.setRandom();
-    // sparse cwise* dense
-    VERIFY_IS_APPROX(m3.cwise()*refM4, refM3.cwise()*refM4);
-//     VERIFY_IS_APPROX(m3.cwise()/refM4, refM3.cwise()/refM4);
-  }
-
-  // test innerVector()
-  {
-    DenseMatrix refMat2 = DenseMatrix::Zero(rows, rows);
-    SparseMatrixType m2(rows, rows);
-    initSparse<Scalar>(density, refMat2, m2);
-    int j0 = ei_random(0,rows-1);
-    int j1 = ei_random(0,rows-1);
-    VERIFY_IS_APPROX(m2.innerVector(j0), refMat2.col(j0));
-    VERIFY_IS_APPROX(m2.innerVector(j0)+m2.innerVector(j1), refMat2.col(j0)+refMat2.col(j1));
-    //m2.innerVector(j0) = 2*m2.innerVector(j1);
-    //refMat2.col(j0) = 2*refMat2.col(j1);
-    //VERIFY_IS_APPROX(m2, refMat2);
-  }
-  
-  // test innerVectors()
-  {
-    DenseMatrix refMat2 = DenseMatrix::Zero(rows, rows);
-    SparseMatrixType m2(rows, rows);
-    initSparse<Scalar>(density, refMat2, m2);
-    int j0 = ei_random(0,rows-2);
-    int j1 = ei_random(0,rows-2);
-    int n0 = ei_random<int>(1,rows-std::max(j0,j1));
-    VERIFY_IS_APPROX(m2.innerVectors(j0,n0), refMat2.block(0,j0,rows,n0));
-    VERIFY_IS_APPROX(m2.innerVectors(j0,n0)+m2.innerVectors(j1,n0),
-                     refMat2.block(0,j0,rows,n0)+refMat2.block(0,j1,rows,n0));
-    //m2.innerVectors(j0,n0) = m2.innerVectors(j0,n0) + m2.innerVectors(j1,n0);
-    //refMat2.block(0,j0,rows,n0) = refMat2.block(0,j0,rows,n0) + refMat2.block(0,j1,rows,n0);
-  }
-
-  // test transpose
-  {
-    DenseMatrix refMat2 = DenseMatrix::Zero(rows, rows);
-    SparseMatrixType m2(rows, rows);
-    initSparse<Scalar>(density, refMat2, m2);
-    VERIFY_IS_APPROX(m2.transpose().eval(), refMat2.transpose().eval());
-    VERIFY_IS_APPROX(m2.transpose(), refMat2.transpose());
-  }
-  
-  // test prune
-  {
-    SparseMatrixType m2(rows, rows);
-    DenseMatrix refM2(rows, rows);
-    refM2.setZero();
-    int countFalseNonZero = 0;
-    int countTrueNonZero = 0;
-    m2.startFill();
-    for (int j=0; j<m2.outerSize(); ++j)
-      for (int i=0; i<m2.innerSize(); ++i)
-      {
-        float x = ei_random<float>(0,1);
-        if (x<0.1)
-        {
-          // do nothing
-        }
-        else if (x<0.5)
-        {
-          countFalseNonZero++;
-          m2.fill(i,j) = Scalar(0);
-        }
-        else
-        {
-          countTrueNonZero++;
-          m2.fill(i,j) = refM2(i,j) = Scalar(1);
-        }
-      }
-    m2.endFill();
-    VERIFY(countFalseNonZero+countTrueNonZero == m2.nonZeros());
-    VERIFY_IS_APPROX(m2, refM2);
-    m2.prune(1);
-    VERIFY(countTrueNonZero==m2.nonZeros());
-    VERIFY_IS_APPROX(m2, refM2);
-  }
-}
-
-void test_eigen2_sparse_basic()
-{
-  for(int i = 0; i < g_repeat; i++) {
-    CALL_SUBTEST_1( sparse_basic(SparseMatrix<double>(8, 8)) );
-    CALL_SUBTEST_2( sparse_basic(SparseMatrix<std::complex<double> >(16, 16)) );
-    CALL_SUBTEST_1( sparse_basic(SparseMatrix<double>(33, 33)) );
-    
-    CALL_SUBTEST_3( sparse_basic(DynamicSparseMatrix<double>(8, 8)) );
-  }
-}
diff --git a/test/eigen2/eigen2_sparse_product.cpp b/test/eigen2/eigen2_sparse_product.cpp
deleted file mode 100644
index d28e76dff..000000000
--- a/test/eigen2/eigen2_sparse_product.cpp
+++ /dev/null
@@ -1,115 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2008 Daniel Gomez Ferro <dgomezferro@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#include "sparse.h"
-
-template<typename SparseMatrixType> void sparse_product(const SparseMatrixType& ref)
-{
-  const int rows = ref.rows();
-  const int cols = ref.cols();
-  typedef typename SparseMatrixType::Scalar Scalar;
-  enum { Flags = SparseMatrixType::Flags };
-
-  double density = std::max(8./(rows*cols), 0.01);
-  typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
-  typedef Matrix<Scalar,Dynamic,1> DenseVector;
-
-  // test matrix-matrix product
-  {
-    DenseMatrix refMat2 = DenseMatrix::Zero(rows, rows);
-    DenseMatrix refMat3 = DenseMatrix::Zero(rows, rows);
-    DenseMatrix refMat4 = DenseMatrix::Zero(rows, rows);
-    DenseMatrix dm4 = DenseMatrix::Zero(rows, rows);
-    SparseMatrixType m2(rows, rows);
-    SparseMatrixType m3(rows, rows);
-    SparseMatrixType m4(rows, rows);
-    initSparse<Scalar>(density, refMat2, m2);
-    initSparse<Scalar>(density, refMat3, m3);
-    initSparse<Scalar>(density, refMat4, m4);
-    VERIFY_IS_APPROX(m4=m2*m3, refMat4=refMat2*refMat3);
-    VERIFY_IS_APPROX(m4=m2.transpose()*m3, refMat4=refMat2.transpose()*refMat3);
-    VERIFY_IS_APPROX(m4=m2.transpose()*m3.transpose(), refMat4=refMat2.transpose()*refMat3.transpose());
-    VERIFY_IS_APPROX(m4=m2*m3.transpose(), refMat4=refMat2*refMat3.transpose());
-
-    // sparse * dense
-    VERIFY_IS_APPROX(dm4=m2*refMat3, refMat4=refMat2*refMat3);
-    VERIFY_IS_APPROX(dm4=m2*refMat3.transpose(), refMat4=refMat2*refMat3.transpose());
-    VERIFY_IS_APPROX(dm4=m2.transpose()*refMat3, refMat4=refMat2.transpose()*refMat3);
-    VERIFY_IS_APPROX(dm4=m2.transpose()*refMat3.transpose(), refMat4=refMat2.transpose()*refMat3.transpose());
-
-    // dense * sparse
-    VERIFY_IS_APPROX(dm4=refMat2*m3, refMat4=refMat2*refMat3);
-    VERIFY_IS_APPROX(dm4=refMat2*m3.transpose(), refMat4=refMat2*refMat3.transpose());
-    VERIFY_IS_APPROX(dm4=refMat2.transpose()*m3, refMat4=refMat2.transpose()*refMat3);
-    VERIFY_IS_APPROX(dm4=refMat2.transpose()*m3.transpose(), refMat4=refMat2.transpose()*refMat3.transpose());
-
-    VERIFY_IS_APPROX(m3=m3*m3, refMat3=refMat3*refMat3);
-  }
-
-  // test matrix - diagonal product
-  if(false) // it compiles, but the precision is terrible. probably doesn't matter in this branch....
-  {
-    DenseMatrix refM2 = DenseMatrix::Zero(rows, rows);
-    DenseMatrix refM3 = DenseMatrix::Zero(rows, rows);
-    DiagonalMatrix<DenseVector> d1(DenseVector::Random(rows));
-    SparseMatrixType m2(rows, rows);
-    SparseMatrixType m3(rows, rows);
-    initSparse<Scalar>(density, refM2, m2);
-    initSparse<Scalar>(density, refM3, m3);
-    VERIFY_IS_APPROX(m3=m2*d1, refM3=refM2*d1);
-    VERIFY_IS_APPROX(m3=m2.transpose()*d1, refM3=refM2.transpose()*d1);
-    VERIFY_IS_APPROX(m3=d1*m2, refM3=d1*refM2);
-    VERIFY_IS_APPROX(m3=d1*m2.transpose(), refM3=d1 * refM2.transpose());
-  }
-
-  // test self adjoint products
-  {
-    DenseMatrix b = DenseMatrix::Random(rows, rows);
-    DenseMatrix x = DenseMatrix::Random(rows, rows);
-    DenseMatrix refX = DenseMatrix::Random(rows, rows);
-    DenseMatrix refUp = DenseMatrix::Zero(rows, rows);
-    DenseMatrix refLo = DenseMatrix::Zero(rows, rows);
-    DenseMatrix refS = DenseMatrix::Zero(rows, rows);
-    SparseMatrixType mUp(rows, rows);
-    SparseMatrixType mLo(rows, rows);
-    SparseMatrixType mS(rows, rows);
-    do {
-      initSparse<Scalar>(density, refUp, mUp, ForceRealDiag|/*ForceNonZeroDiag|*/MakeUpperTriangular);
-    } while (refUp.isZero());
-    refLo = refUp.transpose().conjugate();
-    mLo = mUp.transpose().conjugate();
-    refS = refUp + refLo;
-    refS.diagonal() *= 0.5;
-    mS = mUp + mLo;
-    for (int k=0; k<mS.outerSize(); ++k)
-      for (typename SparseMatrixType::InnerIterator it(mS,k); it; ++it)
-        if (it.index() == k)
-          it.valueRef() *= 0.5;
-
-    VERIFY_IS_APPROX(refS.adjoint(), refS);
-    VERIFY_IS_APPROX(mS.transpose().conjugate(), mS);
-    VERIFY_IS_APPROX(mS, refS);
-    VERIFY_IS_APPROX(x=mS*b, refX=refS*b);
-    VERIFY_IS_APPROX(x=mUp.template marked<UpperTriangular|SelfAdjoint>()*b, refX=refS*b);
-    VERIFY_IS_APPROX(x=mLo.template marked<LowerTriangular|SelfAdjoint>()*b, refX=refS*b);
-    VERIFY_IS_APPROX(x=mS.template marked<SelfAdjoint>()*b, refX=refS*b);
-  }
-
-}
-
-void test_eigen2_sparse_product()
-{
-  for(int i = 0; i < g_repeat; i++) {
-    CALL_SUBTEST_1( sparse_product(SparseMatrix<double>(8, 8)) );
-    CALL_SUBTEST_2( sparse_product(SparseMatrix<std::complex<double> >(16, 16)) );
-    CALL_SUBTEST_1( sparse_product(SparseMatrix<double>(33, 33)) );
-
-    CALL_SUBTEST_3( sparse_product(DynamicSparseMatrix<double>(8, 8)) );
-  }
-}
diff --git a/test/eigen2/eigen2_sparse_solvers.cpp b/test/eigen2/eigen2_sparse_solvers.cpp
deleted file mode 100644
index 3aef27ab4..000000000
--- a/test/eigen2/eigen2_sparse_solvers.cpp
+++ /dev/null
@@ -1,200 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2008 Daniel Gomez Ferro <dgomezferro@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#include "sparse.h"
-
-template<typename Scalar> void
-initSPD(double density,
-        Matrix<Scalar,Dynamic,Dynamic>& refMat,
-        SparseMatrix<Scalar>& sparseMat)
-{
-  Matrix<Scalar,Dynamic,Dynamic> aux(refMat.rows(),refMat.cols());
-  initSparse(density,refMat,sparseMat);
-  refMat = refMat * refMat.adjoint();
-  for (int k=0; k<2; ++k)
-  {
-    initSparse(density,aux,sparseMat,ForceNonZeroDiag);
-    refMat += aux * aux.adjoint();
-  }
-  sparseMat.startFill();
-  for (int j=0 ; j<sparseMat.cols(); ++j)
-    for (int i=j ; i<sparseMat.rows(); ++i)
-      if (refMat(i,j)!=Scalar(0))
-        sparseMat.fill(i,j) = refMat(i,j);
-  sparseMat.endFill();
-}
-
-template<typename Scalar> void sparse_solvers(int rows, int cols)
-{
-  double density = std::max(8./(rows*cols), 0.01);
-  typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
-  typedef Matrix<Scalar,Dynamic,1> DenseVector;
-  // Scalar eps = 1e-6;
-
-  DenseVector vec1 = DenseVector::Random(rows);
-
-  std::vector<Vector2i> zeroCoords;
-  std::vector<Vector2i> nonzeroCoords;
-
-  // test triangular solver
-  {
-    DenseVector vec2 = vec1, vec3 = vec1;
-    SparseMatrix<Scalar> m2(rows, cols);
-    DenseMatrix refMat2 = DenseMatrix::Zero(rows, cols);
-
-    // lower
-    initSparse<Scalar>(density, refMat2, m2, ForceNonZeroDiag|MakeLowerTriangular, &zeroCoords, &nonzeroCoords);
-    VERIFY_IS_APPROX(refMat2.template marked<LowerTriangular>().solveTriangular(vec2),
-                     m2.template marked<LowerTriangular>().solveTriangular(vec3));
-
-    // lower - transpose
-    initSparse<Scalar>(density, refMat2, m2, ForceNonZeroDiag|MakeLowerTriangular, &zeroCoords, &nonzeroCoords);
-    VERIFY_IS_APPROX(refMat2.template marked<LowerTriangular>().transpose().solveTriangular(vec2),
-                     m2.template marked<LowerTriangular>().transpose().solveTriangular(vec3));
-
-    // upper
-    initSparse<Scalar>(density, refMat2, m2, ForceNonZeroDiag|MakeUpperTriangular, &zeroCoords, &nonzeroCoords);
-    VERIFY_IS_APPROX(refMat2.template marked<UpperTriangular>().solveTriangular(vec2),
-                     m2.template marked<UpperTriangular>().solveTriangular(vec3));
-
-    // upper - transpose
-    initSparse<Scalar>(density, refMat2, m2, ForceNonZeroDiag|MakeUpperTriangular, &zeroCoords, &nonzeroCoords);
-    VERIFY_IS_APPROX(refMat2.template marked<UpperTriangular>().transpose().solveTriangular(vec2),
-                     m2.template marked<UpperTriangular>().transpose().solveTriangular(vec3));
-  }
-
-  // test LLT
-  {
-    // TODO fix the issue with complex (see SparseLLT::solveInPlace)
-    SparseMatrix<Scalar> m2(rows, cols);
-    DenseMatrix refMat2(rows, cols);
-
-    DenseVector b = DenseVector::Random(cols);
-    DenseVector refX(cols), x(cols);
-
-    initSPD(density, refMat2, m2);
-
-    refMat2.llt().solve(b, &refX);
-    typedef SparseMatrix<Scalar,LowerTriangular|SelfAdjoint> SparseSelfAdjointMatrix;
-    if (!NumTraits<Scalar>::IsComplex)
-    {
-      x = b;
-      SparseLLT<SparseSelfAdjointMatrix> (m2).solveInPlace(x);
-      VERIFY(refX.isApprox(x,test_precision<Scalar>()) && "LLT: default");
-    }
-    #ifdef EIGEN_CHOLMOD_SUPPORT
-    x = b;
-    SparseLLT<SparseSelfAdjointMatrix,Cholmod>(m2).solveInPlace(x);
-    VERIFY(refX.isApprox(x,test_precision<Scalar>()) && "LLT: cholmod");
-    #endif
-    if (!NumTraits<Scalar>::IsComplex)
-    {
-      #ifdef EIGEN_TAUCS_SUPPORT
-      x = b;
-      SparseLLT<SparseSelfAdjointMatrix,Taucs>(m2,IncompleteFactorization).solveInPlace(x);
-      VERIFY(refX.isApprox(x,test_precision<Scalar>()) && "LLT: taucs (IncompleteFactorization)");
-      x = b;
-      SparseLLT<SparseSelfAdjointMatrix,Taucs>(m2,SupernodalMultifrontal).solveInPlace(x);
-      VERIFY(refX.isApprox(x,test_precision<Scalar>()) && "LLT: taucs (SupernodalMultifrontal)");
-      x = b;
-      SparseLLT<SparseSelfAdjointMatrix,Taucs>(m2,SupernodalLeftLooking).solveInPlace(x);
-      VERIFY(refX.isApprox(x,test_precision<Scalar>()) && "LLT: taucs (SupernodalLeftLooking)");
-      #endif
-    }
-  }
-
-  // test LDLT
-  if (!NumTraits<Scalar>::IsComplex)
-  {
-    // TODO fix the issue with complex (see SparseLDLT::solveInPlace)
-    SparseMatrix<Scalar> m2(rows, cols);
-    DenseMatrix refMat2(rows, cols);
-
-    DenseVector b = DenseVector::Random(cols);
-    DenseVector refX(cols), x(cols);
-
-    //initSPD(density, refMat2, m2);
-    initSparse<Scalar>(density, refMat2, m2, ForceNonZeroDiag|MakeUpperTriangular, 0, 0);
-    refMat2 += refMat2.adjoint();
-    refMat2.diagonal() *= 0.5;
-
-    refMat2.ldlt().solve(b, &refX);
-    typedef SparseMatrix<Scalar,UpperTriangular|SelfAdjoint> SparseSelfAdjointMatrix;
-    x = b;
-    SparseLDLT<SparseSelfAdjointMatrix> ldlt(m2);
-    if (ldlt.succeeded())
-      ldlt.solveInPlace(x);
-    VERIFY(refX.isApprox(x,test_precision<Scalar>()) && "LDLT: default");
-  }
-
-  // test LU
-  {
-    static int count = 0;
-    SparseMatrix<Scalar> m2(rows, cols);
-    DenseMatrix refMat2(rows, cols);
-
-    DenseVector b = DenseVector::Random(cols);
-    DenseVector refX(cols), x(cols);
-
-    initSparse<Scalar>(density, refMat2, m2, ForceNonZeroDiag, &zeroCoords, &nonzeroCoords);
-
-    LU<DenseMatrix> refLu(refMat2);
-    refLu.solve(b, &refX);
-    #if defined(EIGEN_SUPERLU_SUPPORT) || defined(EIGEN_UMFPACK_SUPPORT)
-    Scalar refDet = refLu.determinant();
-    #endif
-    x.setZero();
-    // // SparseLU<SparseMatrix<Scalar> > (m2).solve(b,&x);
-    // // VERIFY(refX.isApprox(x,test_precision<Scalar>()) && "LU: default");
-    #ifdef EIGEN_SUPERLU_SUPPORT
-    {
-      x.setZero();
-      SparseLU<SparseMatrix<Scalar>,SuperLU> slu(m2);
-      if (slu.succeeded())
-      {
-        if (slu.solve(b,&x)) {
-          VERIFY(refX.isApprox(x,test_precision<Scalar>()) && "LU: SuperLU");
-        }
-        // std::cerr << refDet << " == " << slu.determinant() << "\n";
-        if (count==0) {
-          VERIFY_IS_APPROX(refDet,slu.determinant()); // FIXME det is not very stable for complex
-        }
-      }
-    }
-    #endif
-    #ifdef EIGEN_UMFPACK_SUPPORT
-    {
-      // check solve
-      x.setZero();
-      SparseLU<SparseMatrix<Scalar>,UmfPack> slu(m2);
-      if (slu.succeeded()) {
-        if (slu.solve(b,&x)) {
-          if (count==0) {
-            VERIFY(refX.isApprox(x,test_precision<Scalar>()) && "LU: umfpack");  // FIXME solve is not very stable for complex
-          }
-        }
-        VERIFY_IS_APPROX(refDet,slu.determinant());
-        // TODO check the extracted data
-        //std::cerr << slu.matrixL() << "\n";
-      }
-    }
-    #endif
-    count++;
-  }
-
-}
-
-void test_eigen2_sparse_solvers()
-{
-  for(int i = 0; i < g_repeat; i++) {
-    CALL_SUBTEST_1( sparse_solvers<double>(8, 8) );
-    CALL_SUBTEST_2( sparse_solvers<std::complex<double> >(16, 16) );
-    CALL_SUBTEST_1( sparse_solvers<double>(101, 101) );
-  }
-}
diff --git a/test/eigen2/eigen2_sparse_vector.cpp b/test/eigen2/eigen2_sparse_vector.cpp
deleted file mode 100644
index e6d2d77a1..000000000
--- a/test/eigen2/eigen2_sparse_vector.cpp
+++ /dev/null
@@ -1,84 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2008 Daniel Gomez Ferro <dgomezferro@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#include "sparse.h"
-
-template<typename Scalar> void sparse_vector(int rows, int cols)
-{
-  double densityMat = std::max(8./(rows*cols), 0.01);
-  double densityVec = std::max(8./float(rows), 0.1);
-  typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
-  typedef Matrix<Scalar,Dynamic,1> DenseVector;
-  typedef SparseVector<Scalar> SparseVectorType;
-  typedef SparseMatrix<Scalar> SparseMatrixType;
-  Scalar eps = 1e-6;
-
-  SparseMatrixType m1(rows,cols);
-  SparseVectorType v1(rows), v2(rows), v3(rows);
-  DenseMatrix refM1 = DenseMatrix::Zero(rows, cols);
-  DenseVector refV1 = DenseVector::Random(rows),
-    refV2 = DenseVector::Random(rows),
-    refV3 = DenseVector::Random(rows);
-
-  std::vector<int> zerocoords, nonzerocoords;
-  initSparse<Scalar>(densityVec, refV1, v1, &zerocoords, &nonzerocoords);
-  initSparse<Scalar>(densityMat, refM1, m1);
-
-  initSparse<Scalar>(densityVec, refV2, v2);
-  initSparse<Scalar>(densityVec, refV3, v3);
-
-  Scalar s1 = ei_random<Scalar>();
-
-  // test coeff and coeffRef
-  for (unsigned int i=0; i<zerocoords.size(); ++i)
-  {
-    VERIFY_IS_MUCH_SMALLER_THAN( v1.coeff(zerocoords[i]), eps );
-    //VERIFY_RAISES_ASSERT( v1.coeffRef(zerocoords[i]) = 5 );
-  }
-  {
-    VERIFY(int(nonzerocoords.size()) == v1.nonZeros());
-    int j=0;
-    for (typename SparseVectorType::InnerIterator it(v1); it; ++it,++j)
-    {
-      VERIFY(nonzerocoords[j]==it.index());
-      VERIFY(it.value()==v1.coeff(it.index()));
-      VERIFY(it.value()==refV1.coeff(it.index()));
-    }
-  }
-  VERIFY_IS_APPROX(v1, refV1);
-
-  v1.coeffRef(nonzerocoords[0]) = Scalar(5);
-  refV1.coeffRef(nonzerocoords[0]) = Scalar(5);
-  VERIFY_IS_APPROX(v1, refV1);
-
-  VERIFY_IS_APPROX(v1+v2, refV1+refV2);
-  VERIFY_IS_APPROX(v1+v2+v3, refV1+refV2+refV3);
-
-  VERIFY_IS_APPROX(v1*s1-v2, refV1*s1-refV2);
-
-  VERIFY_IS_APPROX(v1*=s1, refV1*=s1);
-  VERIFY_IS_APPROX(v1/=s1, refV1/=s1);
-  
-  VERIFY_IS_APPROX(v1+=v2, refV1+=refV2);
-  VERIFY_IS_APPROX(v1-=v2, refV1-=refV2);
-
-  VERIFY_IS_APPROX(v1.eigen2_dot(v2), refV1.eigen2_dot(refV2));
-  VERIFY_IS_APPROX(v1.eigen2_dot(refV2), refV1.eigen2_dot(refV2));
-
-}
-
-void test_eigen2_sparse_vector()
-{
-  for(int i = 0; i < g_repeat; i++) {
-    CALL_SUBTEST_1( sparse_vector<double>(8, 8) );
-    CALL_SUBTEST_2( sparse_vector<std::complex<double> >(16, 16) );
-    CALL_SUBTEST_1( sparse_vector<double>(299, 535) );
-  }
-}
-
diff --git a/test/eigen2/eigen2_stdvector.cpp b/test/eigen2/eigen2_stdvector.cpp
deleted file mode 100644
index 6ab05c20a..000000000
--- a/test/eigen2/eigen2_stdvector.cpp
+++ /dev/null
@@ -1,148 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#include <Eigen/StdVector>
-#include "main.h"
-#include <Eigen/Geometry>
-
-template<typename MatrixType>
-void check_stdvector_matrix(const MatrixType& m)
-{
-  int rows = m.rows();
-  int cols = m.cols();
-  MatrixType x = MatrixType::Random(rows,cols), y = MatrixType::Random(rows,cols);
-  std::vector<MatrixType, aligned_allocator<MatrixType> > v(10, MatrixType(rows,cols)), w(20, y);
-  v[5] = x;
-  w[6] = v[5];
-  VERIFY_IS_APPROX(w[6], v[5]);
-  v = w;
-  for(int i = 0; i < 20; i++)
-  {
-    VERIFY_IS_APPROX(w[i], v[i]);
-  }
-
-  v.resize(21);
-  v[20] = x;
-  VERIFY_IS_APPROX(v[20], x);
-  v.resize(22,y);
-  VERIFY_IS_APPROX(v[21], y);
-  v.push_back(x);
-  VERIFY_IS_APPROX(v[22], x);
-  VERIFY((std::size_t)&(v[22]) == (std::size_t)&(v[21]) + sizeof(MatrixType));
-
-  // do a lot of push_back such that the vector gets internally resized
-  // (with memory reallocation)
-  MatrixType* ref = &w[0];
-  for(int i=0; i<30 || ((ref==&w[0]) && i<300); ++i)
-    v.push_back(w[i%w.size()]);
-  for(unsigned int i=23; i<v.size(); ++i)
-  {
-    VERIFY(v[i]==w[(i-23)%w.size()]);
-  }
-}
-
-template<typename TransformType>
-void check_stdvector_transform(const TransformType&)
-{
-  typedef typename TransformType::MatrixType MatrixType;
-  TransformType x(MatrixType::Random()), y(MatrixType::Random());
-  std::vector<TransformType, aligned_allocator<TransformType> > v(10), w(20, y);
-  v[5] = x;
-  w[6] = v[5];
-  VERIFY_IS_APPROX(w[6], v[5]);
-  v = w;
-  for(int i = 0; i < 20; i++)
-  {
-    VERIFY_IS_APPROX(w[i], v[i]);
-  }
-
-  v.resize(21);
-  v[20] = x;
-  VERIFY_IS_APPROX(v[20], x);
-  v.resize(22,y);
-  VERIFY_IS_APPROX(v[21], y);
-  v.push_back(x);
-  VERIFY_IS_APPROX(v[22], x);
-  VERIFY((std::size_t)&(v[22]) == (std::size_t)&(v[21]) + sizeof(TransformType));
-
-  // do a lot of push_back such that the vector gets internally resized
-  // (with memory reallocation)
-  TransformType* ref = &w[0];
-  for(int i=0; i<30 || ((ref==&w[0]) && i<300); ++i)
-    v.push_back(w[i%w.size()]);
-  for(unsigned int i=23; i<v.size(); ++i)
-  {
-    VERIFY(v[i].matrix()==w[(i-23)%w.size()].matrix());
-  }
-}
-
-template<typename QuaternionType>
-void check_stdvector_quaternion(const QuaternionType&)
-{
-  typedef typename QuaternionType::Coefficients Coefficients;
-  QuaternionType x(Coefficients::Random()), y(Coefficients::Random());
-  std::vector<QuaternionType, aligned_allocator<QuaternionType> > v(10), w(20, y);
-  v[5] = x;
-  w[6] = v[5];
-  VERIFY_IS_APPROX(w[6], v[5]);
-  v = w;
-  for(int i = 0; i < 20; i++)
-  {
-    VERIFY_IS_APPROX(w[i], v[i]);
-  }
-
-  v.resize(21);
-  v[20] = x;
-  VERIFY_IS_APPROX(v[20], x);
-  v.resize(22,y);
-  VERIFY_IS_APPROX(v[21], y);
-  v.push_back(x);
-  VERIFY_IS_APPROX(v[22], x);
-  VERIFY((std::size_t)&(v[22]) == (std::size_t)&(v[21]) + sizeof(QuaternionType));
-
-  // do a lot of push_back such that the vector gets internally resized
-  // (with memory reallocation)
-  QuaternionType* ref = &w[0];
-  for(int i=0; i<30 || ((ref==&w[0]) && i<300); ++i)
-    v.push_back(w[i%w.size()]);
-  for(unsigned int i=23; i<v.size(); ++i)
-  {
-    VERIFY(v[i].coeffs()==w[(i-23)%w.size()].coeffs());
-  }
-}
-
-void test_eigen2_stdvector()
-{
-  // some non vectorizable fixed sizes
-  CALL_SUBTEST_1(check_stdvector_matrix(Vector2f()));
-  CALL_SUBTEST_1(check_stdvector_matrix(Matrix3f()));
-  CALL_SUBTEST_1(check_stdvector_matrix(Matrix3d()));
-
-  // some vectorizable fixed sizes
-  CALL_SUBTEST_2(check_stdvector_matrix(Matrix2f()));
-  CALL_SUBTEST_2(check_stdvector_matrix(Vector4f()));
-  CALL_SUBTEST_2(check_stdvector_matrix(Matrix4f()));
-  CALL_SUBTEST_2(check_stdvector_matrix(Matrix4d()));
-
-  // some dynamic sizes
-  CALL_SUBTEST_3(check_stdvector_matrix(MatrixXd(1,1)));
-  CALL_SUBTEST_3(check_stdvector_matrix(VectorXd(20)));
-  CALL_SUBTEST_3(check_stdvector_matrix(RowVectorXf(20)));
-  CALL_SUBTEST_3(check_stdvector_matrix(MatrixXcf(10,10)));
-
-  // some Transform
-  CALL_SUBTEST_4(check_stdvector_transform(Transform2f()));
-  CALL_SUBTEST_4(check_stdvector_transform(Transform3f()));
-  CALL_SUBTEST_4(check_stdvector_transform(Transform3d()));
-  //CALL_SUBTEST_4(check_stdvector_transform(Transform4d()));
-
-  // some Quaternion
-  CALL_SUBTEST_5(check_stdvector_quaternion(Quaternionf()));
-  CALL_SUBTEST_5(check_stdvector_quaternion(Quaternionf()));
-}
diff --git a/test/eigen2/eigen2_submatrices.cpp b/test/eigen2/eigen2_submatrices.cpp
deleted file mode 100644
index dee970b63..000000000
--- a/test/eigen2/eigen2_submatrices.cpp
+++ /dev/null
@@ -1,142 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#include "main.h"
-
-// check minor separately in order to avoid the possible creation of a zero-sized
-// array. Comes from a compilation error with gcc-3.4 or gcc-4 with -ansi -pedantic.
-// Another solution would be to declare the array like this: T m_data[Size==0?1:Size]; in ei_matrix_storage
-// but this is probably not bad to raise such an error at compile time...
-template<typename Scalar, int _Rows, int _Cols> struct CheckMinor
-{
-    typedef Matrix<Scalar, _Rows, _Cols> MatrixType;
-    CheckMinor(MatrixType& m1, int r1, int c1)
-    {
-        int rows = m1.rows();
-        int cols = m1.cols();
-
-        Matrix<Scalar, Dynamic, Dynamic> mi = m1.minor(0,0).eval();
-        VERIFY_IS_APPROX(mi, m1.block(1,1,rows-1,cols-1));
-        mi = m1.minor(r1,c1);
-        VERIFY_IS_APPROX(mi.transpose(), m1.transpose().minor(c1,r1));
-        //check operator(), both constant and non-constant, on minor()
-        m1.minor(r1,c1)(0,0) = m1.minor(0,0)(0,0);
-    }
-};
-
-template<typename Scalar> struct CheckMinor<Scalar,1,1>
-{
-    typedef Matrix<Scalar, 1, 1> MatrixType;
-    CheckMinor(MatrixType&, int, int) {}
-};
-
-template<typename MatrixType> void submatrices(const MatrixType& m)
-{
-  /* this test covers the following files:
-     Row.h Column.h Block.h Minor.h DiagonalCoeffs.h
-  */
-  typedef typename MatrixType::Scalar Scalar;
-  typedef typename MatrixType::RealScalar RealScalar;
-  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
-  typedef Matrix<Scalar, 1, MatrixType::ColsAtCompileTime> RowVectorType;
-  int rows = m.rows();
-  int cols = m.cols();
-
-  MatrixType m1 = MatrixType::Random(rows, cols),
-             m2 = MatrixType::Random(rows, cols),
-             m3(rows, cols),
-             ones = MatrixType::Ones(rows, cols),
-             square = Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::RowsAtCompileTime>
-                              ::Random(rows, rows);
-  VectorType v1 = VectorType::Random(rows);
-
-  Scalar s1 = ei_random<Scalar>();
-
-  int r1 = ei_random<int>(0,rows-1);
-  int r2 = ei_random<int>(r1,rows-1);
-  int c1 = ei_random<int>(0,cols-1);
-  int c2 = ei_random<int>(c1,cols-1);
-
-  //check row() and col()
-  VERIFY_IS_APPROX(m1.col(c1).transpose(), m1.transpose().row(c1));
-  VERIFY_IS_APPROX(square.row(r1).eigen2_dot(m1.col(c1)), (square.lazy() * m1.conjugate())(r1,c1));
-  //check operator(), both constant and non-constant, on row() and col()
-  m1.row(r1) += s1 * m1.row(r2);
-  m1.col(c1) += s1 * m1.col(c2);
-
-  //check block()
-  Matrix<Scalar,Dynamic,Dynamic> b1(1,1); b1(0,0) = m1(r1,c1);
-  RowVectorType br1(m1.block(r1,0,1,cols));
-  VectorType bc1(m1.block(0,c1,rows,1));
-  VERIFY_IS_APPROX(b1, m1.block(r1,c1,1,1));
-  VERIFY_IS_APPROX(m1.row(r1), br1);
-  VERIFY_IS_APPROX(m1.col(c1), bc1);
-  //check operator(), both constant and non-constant, on block()
-  m1.block(r1,c1,r2-r1+1,c2-c1+1) = s1 * m2.block(0, 0, r2-r1+1,c2-c1+1);
-  m1.block(r1,c1,r2-r1+1,c2-c1+1)(r2-r1,c2-c1) = m2.block(0, 0, r2-r1+1,c2-c1+1)(0,0);
-
-  //check minor()
-  CheckMinor<Scalar, MatrixType::RowsAtCompileTime, MatrixType::ColsAtCompileTime> checkminor(m1,r1,c1);
-
-  //check diagonal()
-  VERIFY_IS_APPROX(m1.diagonal(), m1.transpose().diagonal());
-  m2.diagonal() = 2 * m1.diagonal();
-  m2.diagonal()[0] *= 3;
-  VERIFY_IS_APPROX(m2.diagonal()[0], static_cast<Scalar>(6) * m1.diagonal()[0]);
-
-  enum {
-    BlockRows = EIGEN_SIZE_MIN_PREFER_FIXED(MatrixType::RowsAtCompileTime,2),
-    BlockCols = EIGEN_SIZE_MIN_PREFER_FIXED(MatrixType::ColsAtCompileTime,5)
-  };
-  if (rows>=5 && cols>=8)
-  {
-    // test fixed block() as lvalue
-    m1.template block<BlockRows,BlockCols>(1,1) *= s1;
-    // test operator() on fixed block() both as constant and non-constant
-    m1.template block<BlockRows,BlockCols>(1,1)(0, 3) = m1.template block<2,5>(1,1)(1,2);
-    // check that fixed block() and block() agree
-    Matrix<Scalar,Dynamic,Dynamic> b = m1.template block<BlockRows,BlockCols>(3,3);
-    VERIFY_IS_APPROX(b, m1.block(3,3,BlockRows,BlockCols));
-  }
-
-  if (rows>2)
-  {
-    // test sub vectors
-    VERIFY_IS_APPROX(v1.template start<2>(), v1.block(0,0,2,1));
-    VERIFY_IS_APPROX(v1.template start<2>(), v1.start(2));
-    VERIFY_IS_APPROX(v1.template start<2>(), v1.segment(0,2));
-    VERIFY_IS_APPROX(v1.template start<2>(), v1.template segment<2>(0));
-    int i = rows-2;
-    VERIFY_IS_APPROX(v1.template end<2>(), v1.block(i,0,2,1));
-    VERIFY_IS_APPROX(v1.template end<2>(), v1.end(2));
-    VERIFY_IS_APPROX(v1.template end<2>(), v1.segment(i,2));
-    VERIFY_IS_APPROX(v1.template end<2>(), v1.template segment<2>(i));
-    i = ei_random(0,rows-2);
-    VERIFY_IS_APPROX(v1.segment(i,2), v1.template segment<2>(i));
-  }
-
-  // stress some basic stuffs with block matrices
-  VERIFY(ei_real(ones.col(c1).sum()) == RealScalar(rows));
-  VERIFY(ei_real(ones.row(r1).sum()) == RealScalar(cols));
-
-  VERIFY(ei_real(ones.col(c1).eigen2_dot(ones.col(c2))) == RealScalar(rows));
-  VERIFY(ei_real(ones.row(r1).eigen2_dot(ones.row(r2))) == RealScalar(cols));
-}
-
-void test_eigen2_submatrices()
-{
-  for(int i = 0; i < g_repeat; i++) {
-    CALL_SUBTEST_1( submatrices(Matrix<float, 1, 1>()) );
-    CALL_SUBTEST_2( submatrices(Matrix4d()) );
-    CALL_SUBTEST_3( submatrices(MatrixXcf(3, 3)) );
-    CALL_SUBTEST_4( submatrices(MatrixXi(8, 12)) );
-    CALL_SUBTEST_5( submatrices(MatrixXcd(20, 20)) );
-    CALL_SUBTEST_6( submatrices(MatrixXf(20, 20)) );
-  }
-}
diff --git a/test/eigen2/eigen2_sum.cpp b/test/eigen2/eigen2_sum.cpp
deleted file mode 100644
index b47057caa..000000000
--- a/test/eigen2/eigen2_sum.cpp
+++ /dev/null
@@ -1,71 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#include "main.h"
-
-template<typename MatrixType> void matrixSum(const MatrixType& m)
-{
-  typedef typename MatrixType::Scalar Scalar;
-
-  int rows = m.rows();
-  int cols = m.cols();
-
-  MatrixType m1 = MatrixType::Random(rows, cols);
-
-  VERIFY_IS_MUCH_SMALLER_THAN(MatrixType::Zero(rows, cols).sum(), Scalar(1));
-  VERIFY_IS_APPROX(MatrixType::Ones(rows, cols).sum(), Scalar(float(rows*cols))); // the float() here to shut up excessive MSVC warning about int->complex conversion being lossy
-  Scalar x = Scalar(0);
-  for(int i = 0; i < rows; i++) for(int j = 0; j < cols; j++) x += m1(i,j);
-  VERIFY_IS_APPROX(m1.sum(), x);
-}
-
-template<typename VectorType> void vectorSum(const VectorType& w)
-{
-  typedef typename VectorType::Scalar Scalar;
-  int size = w.size();
-
-  VectorType v = VectorType::Random(size);
-  for(int i = 1; i < size; i++)
-  {
-    Scalar s = Scalar(0);
-    for(int j = 0; j < i; j++) s += v[j];
-    VERIFY_IS_APPROX(s, v.start(i).sum());
-  }
-
-  for(int i = 0; i < size-1; i++)
-  {
-    Scalar s = Scalar(0);
-    for(int j = i; j < size; j++) s += v[j];
-    VERIFY_IS_APPROX(s, v.end(size-i).sum());
-  }
-
-  for(int i = 0; i < size/2; i++)
-  {
-    Scalar s = Scalar(0);
-    for(int j = i; j < size-i; j++) s += v[j];
-    VERIFY_IS_APPROX(s, v.segment(i, size-2*i).sum());
-  }
-}
-
-void test_eigen2_sum()
-{
-  for(int i = 0; i < g_repeat; i++) {
-    CALL_SUBTEST_1( matrixSum(Matrix<float, 1, 1>()) );
-    CALL_SUBTEST_2( matrixSum(Matrix2f()) );
-    CALL_SUBTEST_3( matrixSum(Matrix4d()) );
-    CALL_SUBTEST_4( matrixSum(MatrixXcf(3, 3)) );
-    CALL_SUBTEST_5( matrixSum(MatrixXf(8, 12)) );
-    CALL_SUBTEST_6( matrixSum(MatrixXi(8, 12)) );
-  }
-  for(int i = 0; i < g_repeat; i++) {
-    CALL_SUBTEST_5( vectorSum(VectorXf(5)) );
-    CALL_SUBTEST_7( vectorSum(VectorXd(10)) );
-    CALL_SUBTEST_5( vectorSum(VectorXf(33)) );
-  }
-}
diff --git a/test/eigen2/eigen2_svd.cpp b/test/eigen2/eigen2_svd.cpp
deleted file mode 100644
index d4689a56f..000000000
--- a/test/eigen2/eigen2_svd.cpp
+++ /dev/null
@@ -1,87 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#include "main.h"
-#include <Eigen/SVD>
-
-template<typename MatrixType> void svd(const MatrixType& m)
-{
-  /* this test covers the following files:
-     SVD.h
-  */
-  int rows = m.rows();
-  int cols = m.cols();
-
-  typedef typename MatrixType::Scalar Scalar;
-  typedef typename NumTraits<Scalar>::Real RealScalar;
-  MatrixType a = MatrixType::Random(rows,cols);
-  Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> b =
-    Matrix<Scalar, MatrixType::RowsAtCompileTime, 1>::Random(rows,1);
-  Matrix<Scalar, MatrixType::ColsAtCompileTime, 1> x(cols,1), x2(cols,1);
-
-  RealScalar largerEps = test_precision<RealScalar>();
-  if (ei_is_same_type<RealScalar,float>::ret)
-    largerEps = 1e-3f;
-
-  {
-    SVD<MatrixType> svd(a);
-    MatrixType sigma = MatrixType::Zero(rows,cols);
-    MatrixType matU  = MatrixType::Zero(rows,rows);
-    sigma.block(0,0,cols,cols) = svd.singularValues().asDiagonal();
-    matU.block(0,0,rows,cols) = svd.matrixU();
-    VERIFY_IS_APPROX(a, matU * sigma * svd.matrixV().transpose());
-  }
-
-
-  if (rows==cols)
-  {
-    if (ei_is_same_type<RealScalar,float>::ret)
-    {
-      MatrixType a1 = MatrixType::Random(rows,cols);
-      a += a * a.adjoint() + a1 * a1.adjoint();
-    }
-    SVD<MatrixType> svd(a);
-    svd.solve(b, &x);
-    VERIFY_IS_APPROX(a * x,b);
-  }
-
-
-  if(rows==cols)
-  {
-    SVD<MatrixType> svd(a);
-    MatrixType unitary, positive;
-    svd.computeUnitaryPositive(&unitary, &positive);
-    VERIFY_IS_APPROX(unitary * unitary.adjoint(), MatrixType::Identity(unitary.rows(),unitary.rows()));
-    VERIFY_IS_APPROX(positive, positive.adjoint());
-    for(int i = 0; i < rows; i++) VERIFY(positive.diagonal()[i] >= 0); // cheap necessary (not sufficient) condition for positivity
-    VERIFY_IS_APPROX(unitary*positive, a);
-
-    svd.computePositiveUnitary(&positive, &unitary);
-    VERIFY_IS_APPROX(unitary * unitary.adjoint(), MatrixType::Identity(unitary.rows(),unitary.rows()));
-    VERIFY_IS_APPROX(positive, positive.adjoint());
-    for(int i = 0; i < rows; i++) VERIFY(positive.diagonal()[i] >= 0); // cheap necessary (not sufficient) condition for positivity
-    VERIFY_IS_APPROX(positive*unitary, a);
-  }
-}
-
-void test_eigen2_svd()
-{
-  for(int i = 0; i < g_repeat; i++) {
-    CALL_SUBTEST_1( svd(Matrix3f()) );
-    CALL_SUBTEST_2( svd(Matrix4d()) );
-    CALL_SUBTEST_3( svd(MatrixXf(7,7)) );
-    CALL_SUBTEST_4( svd(MatrixXd(14,7)) );
-    // complex are not implemented yet
-//     CALL_SUBTEST( svd(MatrixXcd(6,6)) );
-//     CALL_SUBTEST( svd(MatrixXcf(3,3)) );
-    SVD<MatrixXf> s;
-    MatrixXf m = MatrixXf::Random(10,1);
-    s.compute(m);
-  }
-}
diff --git a/test/eigen2/eigen2_swap.cpp b/test/eigen2/eigen2_swap.cpp
deleted file mode 100644
index f3a8846d9..000000000
--- a/test/eigen2/eigen2_swap.cpp
+++ /dev/null
@@ -1,83 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#define EIGEN_NO_STATIC_ASSERT
-#include "main.h"
-
-template<typename T>
-struct other_matrix_type
-{
-  typedef int type;
-};
-
-template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
-struct other_matrix_type<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
-{
-  typedef Matrix<_Scalar, _Rows, _Cols, _Options^RowMajor, _MaxRows, _MaxCols> type;
-};
-
-template<typename MatrixType> void swap(const MatrixType& m)
-{
-  typedef typename other_matrix_type<MatrixType>::type OtherMatrixType;
-  typedef typename MatrixType::Scalar Scalar;
-
-  ei_assert((!ei_is_same_type<MatrixType,OtherMatrixType>::ret));
-  int rows = m.rows();
-  int cols = m.cols();
-  
-  // construct 3 matrix guaranteed to be distinct
-  MatrixType m1 = MatrixType::Random(rows,cols);
-  MatrixType m2 = MatrixType::Random(rows,cols) + Scalar(100) * MatrixType::Identity(rows,cols);
-  OtherMatrixType m3 = OtherMatrixType::Random(rows,cols) + Scalar(200) * OtherMatrixType::Identity(rows,cols);
-  
-  MatrixType m1_copy = m1;
-  MatrixType m2_copy = m2;
-  OtherMatrixType m3_copy = m3;
-  
-  // test swapping 2 matrices of same type
-  m1.swap(m2);
-  VERIFY_IS_APPROX(m1,m2_copy);
-  VERIFY_IS_APPROX(m2,m1_copy);
-  m1 = m1_copy;
-  m2 = m2_copy;
-  
-  // test swapping 2 matrices of different types
-  m1.swap(m3);
-  VERIFY_IS_APPROX(m1,m3_copy);
-  VERIFY_IS_APPROX(m3,m1_copy);
-  m1 = m1_copy;
-  m3 = m3_copy;
-  
-  // test swapping matrix with expression
-  m1.swap(m2.block(0,0,rows,cols));
-  VERIFY_IS_APPROX(m1,m2_copy);
-  VERIFY_IS_APPROX(m2,m1_copy);
-  m1 = m1_copy;
-  m2 = m2_copy;
-
-  // test swapping two expressions of different types
-  m1.transpose().swap(m3.transpose());
-  VERIFY_IS_APPROX(m1,m3_copy);
-  VERIFY_IS_APPROX(m3,m1_copy);
-  m1 = m1_copy;
-  m3 = m3_copy;
-  
-  // test assertion on mismatching size -- matrix case
-  VERIFY_RAISES_ASSERT(m1.swap(m1.row(0)));
-  // test assertion on mismatching size -- xpr case
-  VERIFY_RAISES_ASSERT(m1.row(0).swap(m1));
-}
-
-void test_eigen2_swap()
-{
-  CALL_SUBTEST_1( swap(Matrix3f()) ); // fixed size, no vectorization 
-  CALL_SUBTEST_1( swap(Matrix4d()) ); // fixed size, possible vectorization 
-  CALL_SUBTEST_1( swap(MatrixXd(3,3)) ); // dyn size, no vectorization 
-  CALL_SUBTEST_1( swap(MatrixXf(30,30)) ); // dyn size, possible vectorization 
-}
diff --git a/test/eigen2/eigen2_triangular.cpp b/test/eigen2/eigen2_triangular.cpp
deleted file mode 100644
index 6f17b7dff..000000000
--- a/test/eigen2/eigen2_triangular.cpp
+++ /dev/null
@@ -1,148 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#include "main.h"
-
-template<typename MatrixType> void triangular(const MatrixType& m)
-{
-  typedef typename MatrixType::Scalar Scalar;
-  typedef typename NumTraits<Scalar>::Real RealScalar;
-
-  RealScalar largerEps = 10*test_precision<RealScalar>();
-
-  int rows = m.rows();
-  int cols = m.cols();
-
-  MatrixType m1 = MatrixType::Random(rows, cols),
-             m2 = MatrixType::Random(rows, cols),
-             m3(rows, cols),
-             m4(rows, cols),
-             r1(rows, cols),
-             r2(rows, cols);
-
-  MatrixType m1up = m1.template part<Eigen::UpperTriangular>();
-  MatrixType m2up = m2.template part<Eigen::UpperTriangular>();
-
-  if (rows*cols>1)
-  {
-    VERIFY(m1up.isUpperTriangular());
-    VERIFY(m2up.transpose().isLowerTriangular());
-    VERIFY(!m2.isLowerTriangular());
-  }
-
-//   VERIFY_IS_APPROX(m1up.transpose() * m2, m1.upper().transpose().lower() * m2);
-
-  // test overloaded operator+=
-  r1.setZero();
-  r2.setZero();
-  r1.template part<Eigen::UpperTriangular>() +=  m1;
-  r2 += m1up;
-  VERIFY_IS_APPROX(r1,r2);
-
-  // test overloaded operator=
-  m1.setZero();
-  m1.template part<Eigen::UpperTriangular>() = (m2.transpose() * m2).lazy();
-  m3 = m2.transpose() * m2;
-  VERIFY_IS_APPROX(m3.template part<Eigen::LowerTriangular>().transpose(), m1);
-
-  // test overloaded operator=
-  m1.setZero();
-  m1.template part<Eigen::LowerTriangular>() = (m2.transpose() * m2).lazy();
-  VERIFY_IS_APPROX(m3.template part<Eigen::LowerTriangular>(), m1);
-
-  VERIFY_IS_APPROX(m3.template part<Diagonal>(), m3.diagonal().asDiagonal());
-
-  m1 = MatrixType::Random(rows, cols);
-  for (int i=0; i<rows; ++i)
-    while (ei_abs2(m1(i,i))<1e-3) m1(i,i) = ei_random<Scalar>();
-
-  Transpose<MatrixType> trm4(m4);
-  // test back and forward subsitution
-  m3 = m1.template part<Eigen::LowerTriangular>();
-  VERIFY(m3.template marked<Eigen::LowerTriangular>().solveTriangular(m3).cwise().abs().isIdentity(test_precision<RealScalar>()));
-  VERIFY(m3.transpose().template marked<Eigen::UpperTriangular>()
-    .solveTriangular(m3.transpose()).cwise().abs().isIdentity(test_precision<RealScalar>()));
-  // check M * inv(L) using in place API
-  m4 = m3;
-  m3.transpose().template marked<Eigen::UpperTriangular>().solveTriangularInPlace(trm4);
-  VERIFY(m4.cwise().abs().isIdentity(test_precision<RealScalar>()));
-
-  m3 = m1.template part<Eigen::UpperTriangular>();
-  VERIFY(m3.template marked<Eigen::UpperTriangular>().solveTriangular(m3).cwise().abs().isIdentity(test_precision<RealScalar>()));
-  VERIFY(m3.transpose().template marked<Eigen::LowerTriangular>()
-    .solveTriangular(m3.transpose()).cwise().abs().isIdentity(test_precision<RealScalar>()));
-  // check M * inv(U) using in place API
-  m4 = m3;
-  m3.transpose().template marked<Eigen::LowerTriangular>().solveTriangularInPlace(trm4);
-  VERIFY(m4.cwise().abs().isIdentity(test_precision<RealScalar>()));
-
-  m3 = m1.template part<Eigen::UpperTriangular>();
-  VERIFY(m2.isApprox(m3 * (m3.template marked<Eigen::UpperTriangular>().solveTriangular(m2)), largerEps));
-  m3 = m1.template part<Eigen::LowerTriangular>();
-  VERIFY(m2.isApprox(m3 * (m3.template marked<Eigen::LowerTriangular>().solveTriangular(m2)), largerEps));
-
-  VERIFY((m1.template part<Eigen::UpperTriangular>() * m2.template part<Eigen::UpperTriangular>()).isUpperTriangular());
-
-  // test swap
-  m1.setOnes();
-  m2.setZero();
-  m2.template part<Eigen::UpperTriangular>().swap(m1);
-  m3.setZero();
-  m3.template part<Eigen::UpperTriangular>().setOnes();
-  VERIFY_IS_APPROX(m2,m3);
-
-}
-
-void selfadjoint()
-{
-  Matrix2i m;
-  m << 1, 2,
-       3, 4;
-
-  Matrix2i m1 = Matrix2i::Zero();
-  m1.part<SelfAdjoint>() = m;
-  Matrix2i ref1;
-  ref1 << 1, 2,
-          2, 4;
-  VERIFY(m1 == ref1);
-  
-  Matrix2i m2 = Matrix2i::Zero();
-  m2.part<SelfAdjoint>() = m.part<UpperTriangular>();
-  Matrix2i ref2;
-  ref2 << 1, 2,
-          2, 4;
-  VERIFY(m2 == ref2);
- 
-  Matrix2i m3 = Matrix2i::Zero();
-  m3.part<SelfAdjoint>() = m.part<LowerTriangular>();
-  Matrix2i ref3;
-  ref3 << 1, 0,
-          0, 4;
-  VERIFY(m3 == ref3);
-  
-  // example inspired from bug 159
-  int array[] = {1, 2, 3, 4};
-  Matrix2i::Map(array).part<SelfAdjoint>() = Matrix2i::Random().part<LowerTriangular>();
-  
-  std::cout << "hello\n" << array << std::endl;
-}
-
-void test_eigen2_triangular()
-{
-  CALL_SUBTEST_8( selfadjoint() );
-  for(int i = 0; i < g_repeat ; i++) {
-    CALL_SUBTEST_1( triangular(Matrix<float, 1, 1>()) );
-    CALL_SUBTEST_2( triangular(Matrix<float, 2, 2>()) );
-    CALL_SUBTEST_3( triangular(Matrix3d()) );
-    CALL_SUBTEST_4( triangular(MatrixXcf(4, 4)) );
-    CALL_SUBTEST_5( triangular(Matrix<std::complex<float>,8, 8>()) );
-    CALL_SUBTEST_6( triangular(MatrixXd(17,17)) );
-    CALL_SUBTEST_7( triangular(Matrix<float,Dynamic,Dynamic,RowMajor>(5, 5)) );
-  }
-}
diff --git a/test/eigen2/eigen2_unalignedassert.cpp b/test/eigen2/eigen2_unalignedassert.cpp
deleted file mode 100644
index d10b6f538..000000000
--- a/test/eigen2/eigen2_unalignedassert.cpp
+++ /dev/null
@@ -1,116 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#include "main.h"
-
-struct Good1
-{
-  MatrixXd m; // good: m will allocate its own array, taking care of alignment.
-  Good1() : m(20,20) {}
-};
-
-struct Good2
-{
-  Matrix3d m; // good: m's size isn't a multiple of 16 bytes, so m doesn't have to be aligned
-};
-
-struct Good3
-{
-  Vector2f m; // good: same reason
-};
-
-struct Bad4
-{
-  Vector2d m; // bad: sizeof(m)%16==0 so alignment is required
-};
-
-struct Bad5
-{
-  Matrix<float, 2, 6> m; // bad: same reason
-};
-
-struct Bad6
-{
-  Matrix<double, 3, 4> m; // bad: same reason
-};
-
-struct Good7
-{
-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW
-  Vector2d m;
-  float f; // make the struct have sizeof%16!=0 to make it a little more tricky when we allow an array of 2 such objects
-};
-
-struct Good8
-{
-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW
-  float f; // try the f at first -- the EIGEN_ALIGN_128 attribute of m should make that still work
-  Matrix4f m;
-};
-
-struct Good9
-{
-  Matrix<float,2,2,DontAlign> m; // good: no alignment requested
-  float f;
-};
-
-template<bool Align> struct Depends
-{
-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(Align)
-  Vector2d m;
-  float f;
-};
-
-template<typename T>
-void check_unalignedassert_good()
-{
-  T *x, *y;
-  x = new T;
-  delete x;
-  y = new T[2];
-  delete[] y;
-}
-
-#if EIGEN_ARCH_WANTS_ALIGNMENT
-template<typename T>
-void check_unalignedassert_bad()
-{
-  float buf[sizeof(T)+16];
-  float *unaligned = buf;
-  while((reinterpret_cast<std::size_t>(unaligned)&0xf)==0) ++unaligned; // make sure unaligned is really unaligned
-  T *x = ::new(static_cast<void*>(unaligned)) T;
-  x->~T();
-}
-#endif
-
-void unalignedassert()
-{
-  check_unalignedassert_good<Good1>();
-  check_unalignedassert_good<Good2>();
-  check_unalignedassert_good<Good3>();
-#if EIGEN_ARCH_WANTS_ALIGNMENT
-  VERIFY_RAISES_ASSERT(check_unalignedassert_bad<Bad4>());
-  VERIFY_RAISES_ASSERT(check_unalignedassert_bad<Bad5>());
-  VERIFY_RAISES_ASSERT(check_unalignedassert_bad<Bad6>());
-#endif
-
-  check_unalignedassert_good<Good7>();
-  check_unalignedassert_good<Good8>();
-  check_unalignedassert_good<Good9>();
-  check_unalignedassert_good<Depends<true> >();
-
-#if EIGEN_ARCH_WANTS_ALIGNMENT
-  VERIFY_RAISES_ASSERT(check_unalignedassert_bad<Depends<false> >());
-#endif
-}
-
-void test_eigen2_unalignedassert()
-{
-  CALL_SUBTEST(unalignedassert());
-}
diff --git a/test/eigen2/eigen2_visitor.cpp b/test/eigen2/eigen2_visitor.cpp
deleted file mode 100644
index 4781991de..000000000
--- a/test/eigen2/eigen2_visitor.cpp
+++ /dev/null
@@ -1,116 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#include "main.h"
-
-template<typename MatrixType> void matrixVisitor(const MatrixType& p)
-{
-  typedef typename MatrixType::Scalar Scalar;
-
-  int rows = p.rows();
-  int cols = p.cols();
-
-  // construct a random matrix where all coefficients are different
-  MatrixType m;
-  m = MatrixType::Random(rows, cols);
-  for(int i = 0; i < m.size(); i++)
-    for(int i2 = 0; i2 < i; i2++)
-      while(m(i) == m(i2)) // yes, ==
-        m(i) = ei_random<Scalar>();
-  
-  Scalar minc = Scalar(1000), maxc = Scalar(-1000);
-  int minrow=0,mincol=0,maxrow=0,maxcol=0;
-  for(int j = 0; j < cols; j++)
-  for(int i = 0; i < rows; i++)
-  {
-    if(m(i,j) < minc)
-    {
-      minc = m(i,j);
-      minrow = i;
-      mincol = j;
-    }
-    if(m(i,j) > maxc)
-    {
-      maxc = m(i,j);
-      maxrow = i;
-      maxcol = j;
-    }
-  }
-  int eigen_minrow, eigen_mincol, eigen_maxrow, eigen_maxcol;
-  Scalar eigen_minc, eigen_maxc;
-  eigen_minc = m.minCoeff(&eigen_minrow,&eigen_mincol);
-  eigen_maxc = m.maxCoeff(&eigen_maxrow,&eigen_maxcol);
-  VERIFY(minrow == eigen_minrow);
-  VERIFY(maxrow == eigen_maxrow);
-  VERIFY(mincol == eigen_mincol);
-  VERIFY(maxcol == eigen_maxcol);
-  VERIFY_IS_APPROX(minc, eigen_minc);
-  VERIFY_IS_APPROX(maxc, eigen_maxc);
-  VERIFY_IS_APPROX(minc, m.minCoeff());
-  VERIFY_IS_APPROX(maxc, m.maxCoeff());
-}
-
-template<typename VectorType> void vectorVisitor(const VectorType& w)
-{
-  typedef typename VectorType::Scalar Scalar;
-
-  int size = w.size();
-
-  // construct a random vector where all coefficients are different
-  VectorType v;
-  v = VectorType::Random(size);
-  for(int i = 0; i < size; i++)
-    for(int i2 = 0; i2 < i; i2++)
-      while(v(i) == v(i2)) // yes, ==
-        v(i) = ei_random<Scalar>();
-  
-  Scalar minc = Scalar(1000), maxc = Scalar(-1000);
-  int minidx=0,maxidx=0;
-  for(int i = 0; i < size; i++)
-  {
-    if(v(i) < minc)
-    {
-      minc = v(i);
-      minidx = i;
-    }
-    if(v(i) > maxc)
-    {
-      maxc = v(i);
-      maxidx = i;
-    }
-  }
-  int eigen_minidx, eigen_maxidx;
-  Scalar eigen_minc, eigen_maxc;
-  eigen_minc = v.minCoeff(&eigen_minidx);
-  eigen_maxc = v.maxCoeff(&eigen_maxidx);
-  VERIFY(minidx == eigen_minidx);
-  VERIFY(maxidx == eigen_maxidx);
-  VERIFY_IS_APPROX(minc, eigen_minc);
-  VERIFY_IS_APPROX(maxc, eigen_maxc);
-  VERIFY_IS_APPROX(minc, v.minCoeff());
-  VERIFY_IS_APPROX(maxc, v.maxCoeff());
-}
-
-void test_eigen2_visitor()
-{
-  for(int i = 0; i < g_repeat; i++) {
-    CALL_SUBTEST_1( matrixVisitor(Matrix<float, 1, 1>()) );
-    CALL_SUBTEST_2( matrixVisitor(Matrix2f()) );
-    CALL_SUBTEST_3( matrixVisitor(Matrix4d()) );
-    CALL_SUBTEST_4( matrixVisitor(MatrixXd(8, 12)) );
-    CALL_SUBTEST_5( matrixVisitor(Matrix<double,Dynamic,Dynamic,RowMajor>(20, 20)) );
-    CALL_SUBTEST_6( matrixVisitor(MatrixXi(8, 12)) );
-  }
-  for(int i = 0; i < g_repeat; i++) {
-    CALL_SUBTEST_7( vectorVisitor(Vector4f()) );
-    CALL_SUBTEST_4( vectorVisitor(VectorXd(10)) );
-    CALL_SUBTEST_4( vectorVisitor(RowVectorXd(10)) );
-    CALL_SUBTEST_8( vectorVisitor(VectorXf(33)) );
-  }
-}
diff --git a/test/eigen2/gsl_helper.h b/test/eigen2/gsl_helper.h
deleted file mode 100644
index d1d854533..000000000
--- a/test/eigen2/gsl_helper.h
+++ /dev/null
@@ -1,175 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_GSL_HELPER
-#define EIGEN_GSL_HELPER
-
-#include <Eigen/Core>
-
-#include <gsl/gsl_blas.h>
-#include <gsl/gsl_multifit.h>
-#include <gsl/gsl_eigen.h>
-#include <gsl/gsl_linalg.h>
-#include <gsl/gsl_complex.h>
-#include <gsl/gsl_complex_math.h>
-
-namespace Eigen {
-
-template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex> struct GslTraits
-{
-  typedef gsl_matrix* Matrix;
-  typedef gsl_vector* Vector;
-  static Matrix createMatrix(int rows, int cols) { return gsl_matrix_alloc(rows,cols); }
-  static Vector createVector(int size) { return gsl_vector_alloc(size); }
-  static void free(Matrix& m) { gsl_matrix_free(m); m=0; }
-  static void free(Vector& m) { gsl_vector_free(m); m=0; }
-  static void prod(const Matrix& m, const Vector& v, Vector& x) { gsl_blas_dgemv(CblasNoTrans,1,m,v,0,x); }
-  static void cholesky(Matrix& m) { gsl_linalg_cholesky_decomp(m); }
-  static void cholesky_solve(const Matrix& m, const Vector& b, Vector& x) { gsl_linalg_cholesky_solve(m,b,x); }
-  static void eigen_symm(const Matrix& m, Vector& eval, Matrix& evec)
-  {
-    gsl_eigen_symmv_workspace * w = gsl_eigen_symmv_alloc(m->size1);
-    Matrix a = createMatrix(m->size1, m->size2);
-    gsl_matrix_memcpy(a, m);
-    gsl_eigen_symmv(a,eval,evec,w);
-    gsl_eigen_symmv_sort(eval, evec, GSL_EIGEN_SORT_VAL_ASC);
-    gsl_eigen_symmv_free(w);
-    free(a);
-  }
-  static void eigen_symm_gen(const Matrix& m, const Matrix& _b, Vector& eval, Matrix& evec)
-  {
-    gsl_eigen_gensymmv_workspace * w = gsl_eigen_gensymmv_alloc(m->size1);
-    Matrix a = createMatrix(m->size1, m->size2);
-    Matrix b = createMatrix(_b->size1, _b->size2);
-    gsl_matrix_memcpy(a, m);
-    gsl_matrix_memcpy(b, _b);
-    gsl_eigen_gensymmv(a,b,eval,evec,w);
-    gsl_eigen_symmv_sort(eval, evec, GSL_EIGEN_SORT_VAL_ASC);
-    gsl_eigen_gensymmv_free(w);
-    free(a);
-  }
-};
-
-template<typename Scalar> struct GslTraits<Scalar,true>
-{
-  typedef gsl_matrix_complex* Matrix;
-  typedef gsl_vector_complex* Vector;
-  static Matrix createMatrix(int rows, int cols) { return gsl_matrix_complex_alloc(rows,cols); }
-  static Vector createVector(int size) { return gsl_vector_complex_alloc(size); }
-  static void free(Matrix& m) { gsl_matrix_complex_free(m); m=0; }
-  static void free(Vector& m) { gsl_vector_complex_free(m); m=0; }
-  static void cholesky(Matrix& m) { gsl_linalg_complex_cholesky_decomp(m); }
-  static void cholesky_solve(const Matrix& m, const Vector& b, Vector& x) { gsl_linalg_complex_cholesky_solve(m,b,x); }
-  static void prod(const Matrix& m, const Vector& v, Vector& x)
-  { gsl_blas_zgemv(CblasNoTrans,gsl_complex_rect(1,0),m,v,gsl_complex_rect(0,0),x); }
-  static void eigen_symm(const Matrix& m, gsl_vector* &eval, Matrix& evec)
-  {
-    gsl_eigen_hermv_workspace * w = gsl_eigen_hermv_alloc(m->size1);
-    Matrix a = createMatrix(m->size1, m->size2);
-    gsl_matrix_complex_memcpy(a, m);
-    gsl_eigen_hermv(a,eval,evec,w);
-    gsl_eigen_hermv_sort(eval, evec, GSL_EIGEN_SORT_VAL_ASC);
-    gsl_eigen_hermv_free(w);
-    free(a);
-  }
-  static void eigen_symm_gen(const Matrix& m, const Matrix& _b, gsl_vector* &eval, Matrix& evec)
-  {
-    gsl_eigen_genhermv_workspace * w = gsl_eigen_genhermv_alloc(m->size1);
-    Matrix a = createMatrix(m->size1, m->size2);
-    Matrix b = createMatrix(_b->size1, _b->size2);
-    gsl_matrix_complex_memcpy(a, m);
-    gsl_matrix_complex_memcpy(b, _b);
-    gsl_eigen_genhermv(a,b,eval,evec,w);
-    gsl_eigen_hermv_sort(eval, evec, GSL_EIGEN_SORT_VAL_ASC);
-    gsl_eigen_genhermv_free(w);
-    free(a);
-  }
-};
-
-template<typename MatrixType>
-void convert(const MatrixType& m, gsl_matrix* &res)
-{
-//   if (res)
-//     gsl_matrix_free(res);
-  res = gsl_matrix_alloc(m.rows(), m.cols());
-  for (int i=0 ; i<m.rows() ; ++i)
-    for (int j=0 ; j<m.cols(); ++j)
-      gsl_matrix_set(res, i, j, m(i,j));
-}
-
-template<typename MatrixType>
-void convert(const gsl_matrix* m, MatrixType& res)
-{
-  res.resize(int(m->size1), int(m->size2));
-  for (int i=0 ; i<res.rows() ; ++i)
-    for (int j=0 ; j<res.cols(); ++j)
-      res(i,j) = gsl_matrix_get(m,i,j);
-}
-
-template<typename VectorType>
-void convert(const VectorType& m, gsl_vector* &res)
-{
-  if (res) gsl_vector_free(res);
-  res = gsl_vector_alloc(m.size());
-  for (int i=0 ; i<m.size() ; ++i)
-      gsl_vector_set(res, i, m[i]);
-}
-
-template<typename VectorType>
-void convert(const gsl_vector* m, VectorType& res)
-{
-  res.resize (m->size);
-  for (int i=0 ; i<res.rows() ; ++i)
-    res[i] = gsl_vector_get(m, i);
-}
-
-template<typename MatrixType>
-void convert(const MatrixType& m, gsl_matrix_complex* &res)
-{
-  res = gsl_matrix_complex_alloc(m.rows(), m.cols());
-  for (int i=0 ; i<m.rows() ; ++i)
-    for (int j=0 ; j<m.cols(); ++j)
-    {
-      gsl_matrix_complex_set(res, i, j,
-        gsl_complex_rect(m(i,j).real(), m(i,j).imag()));
-    }
-}
-
-template<typename MatrixType>
-void convert(const gsl_matrix_complex* m, MatrixType& res)
-{
-  res.resize(int(m->size1), int(m->size2));
-  for (int i=0 ; i<res.rows() ; ++i)
-    for (int j=0 ; j<res.cols(); ++j)
-      res(i,j) = typename MatrixType::Scalar(
-        GSL_REAL(gsl_matrix_complex_get(m,i,j)),
-        GSL_IMAG(gsl_matrix_complex_get(m,i,j)));
-}
-
-template<typename VectorType>
-void convert(const VectorType& m, gsl_vector_complex* &res)
-{
-  res = gsl_vector_complex_alloc(m.size());
-  for (int i=0 ; i<m.size() ; ++i)
-      gsl_vector_complex_set(res, i, gsl_complex_rect(m[i].real(), m[i].imag()));
-}
-
-template<typename VectorType>
-void convert(const gsl_vector_complex* m, VectorType& res)
-{
-  res.resize(m->size);
-  for (int i=0 ; i<res.rows() ; ++i)
-    res[i] = typename VectorType::Scalar(
-        GSL_REAL(gsl_vector_complex_get(m, i)),
-        GSL_IMAG(gsl_vector_complex_get(m, i)));
-}
-
-}
-
-#endif // EIGEN_GSL_HELPER
diff --git a/test/eigen2/main.h b/test/eigen2/main.h
deleted file mode 100644
index ad2ba1994..000000000
--- a/test/eigen2/main.h
+++ /dev/null
@@ -1,399 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#include <cstdlib>
-#include <ctime>
-#include <iostream>
-#include <string>
-#include <vector>
-
-#ifndef EIGEN_TEST_FUNC
-#error EIGEN_TEST_FUNC must be defined
-#endif
-
-#define DEFAULT_REPEAT 10
-
-namespace Eigen
-{
-  static std::vector<std::string> g_test_stack;
-  static int g_repeat;
-}
-
-#define EI_PP_MAKE_STRING2(S) #S
-#define EI_PP_MAKE_STRING(S) EI_PP_MAKE_STRING2(S)
-
-#define EI_PP_CAT2(a,b) a ## b
-#define EI_PP_CAT(a,b) EI_PP_CAT2(a,b)
-
-#ifndef EIGEN_NO_ASSERTION_CHECKING
-
-  namespace Eigen
-  {
-    static const bool should_raise_an_assert = false;
-
-    // Used to avoid to raise two exceptions at a time in which
-    // case the exception is not properly caught.
-    // This may happen when a second exceptions is raise in a destructor.
-    static bool no_more_assert = false;
-
-    struct eigen_assert_exception
-    {
-      eigen_assert_exception(void) {}
-      ~eigen_assert_exception() { Eigen::no_more_assert = false; }
-    };
-  }
-
-  // If EIGEN_DEBUG_ASSERTS is defined and if no assertion is raised while
-  // one should have been, then the list of excecuted assertions is printed out.
-  //
-  // EIGEN_DEBUG_ASSERTS is not enabled by default as it
-  // significantly increases the compilation time
-  // and might even introduce side effects that would hide
-  // some memory errors.
-  #ifdef EIGEN_DEBUG_ASSERTS
-
-    namespace Eigen
-    {
-      static bool ei_push_assert = false;
-      static std::vector<std::string> eigen_assert_list;
-    }
-
-    #define eigen_assert(a)                       \
-      if( (!(a)) && (!no_more_assert) )     \
-      {                                     \
-        Eigen::no_more_assert = true;       \
-        throw Eigen::eigen_assert_exception(); \
-      }                                     \
-      else if (Eigen::ei_push_assert)       \
-      {                                     \
-        eigen_assert_list.push_back(std::string(EI_PP_MAKE_STRING(__FILE__)" ("EI_PP_MAKE_STRING(__LINE__)") : "#a) ); \
-      }
-
-    #define VERIFY_RAISES_ASSERT(a)                                               \
-      {                                                                           \
-        Eigen::no_more_assert = false;                                            \
-        try {                                                                     \
-          Eigen::eigen_assert_list.clear();                                          \
-          Eigen::ei_push_assert = true;                                           \
-          a;                                                                      \
-          Eigen::ei_push_assert = false;                                          \
-          std::cerr << "One of the following asserts should have been raised:\n"; \
-          for (uint ai=0 ; ai<eigen_assert_list.size() ; ++ai)                       \
-            std::cerr << "  " << eigen_assert_list[ai] << "\n";                      \
-          VERIFY(Eigen::should_raise_an_assert && # a);                           \
-        } catch (Eigen::eigen_assert_exception e) {                                  \
-          Eigen::ei_push_assert = false; VERIFY(true);                            \
-        }                                                                         \
-      }
-
-  #else // EIGEN_DEBUG_ASSERTS
-
-    #undef eigen_assert
-
-    // see bug 89. The copy_bool here is working around a bug in gcc <= 4.3
-    #define eigen_assert(a) \
-      if( (!Eigen::internal::copy_bool(a)) && (!no_more_assert) )	\
-      {                                     \
-        Eigen::no_more_assert = true;       \
-        throw Eigen::eigen_assert_exception(); \
-      }
-
-    #define VERIFY_RAISES_ASSERT(a) {                             \
-        Eigen::no_more_assert = false;                            \
-        try { a; VERIFY(Eigen::should_raise_an_assert && # a); }  \
-        catch (Eigen::eigen_assert_exception e) { VERIFY(true); }    \
-      }
-
-  #endif // EIGEN_DEBUG_ASSERTS
-
-  #define EIGEN_USE_CUSTOM_ASSERT
-
-#else // EIGEN_NO_ASSERTION_CHECKING
-
-  #define VERIFY_RAISES_ASSERT(a) {}
-
-#endif // EIGEN_NO_ASSERTION_CHECKING
-
-
-#define EIGEN_INTERNAL_DEBUGGING
-#define EIGEN_NICE_RANDOM
-#include <Eigen/Array>
-
-
-#define VERIFY(a) do { if (!(a)) { \
-    std::cerr << "Test " << g_test_stack.back() << " failed in "EI_PP_MAKE_STRING(__FILE__) << " (" << EI_PP_MAKE_STRING(__LINE__) << ")" \
-      << std::endl << "    " << EI_PP_MAKE_STRING(a) << std::endl << std::endl; \
-    abort(); \
-  } } while (0)
-
-#define VERIFY_IS_APPROX(a, b) VERIFY(test_ei_isApprox(a, b))
-#define VERIFY_IS_NOT_APPROX(a, b) VERIFY(!test_ei_isApprox(a, b))
-#define VERIFY_IS_MUCH_SMALLER_THAN(a, b) VERIFY(test_ei_isMuchSmallerThan(a, b))
-#define VERIFY_IS_NOT_MUCH_SMALLER_THAN(a, b) VERIFY(!test_ei_isMuchSmallerThan(a, b))
-#define VERIFY_IS_APPROX_OR_LESS_THAN(a, b) VERIFY(test_ei_isApproxOrLessThan(a, b))
-#define VERIFY_IS_NOT_APPROX_OR_LESS_THAN(a, b) VERIFY(!test_ei_isApproxOrLessThan(a, b))
-
-#define CALL_SUBTEST(FUNC) do { \
-    g_test_stack.push_back(EI_PP_MAKE_STRING(FUNC)); \
-    FUNC; \
-    g_test_stack.pop_back(); \
-  } while (0)
-
-namespace Eigen {
-
-template<typename T> inline typename NumTraits<T>::Real test_precision();
-template<> inline int test_precision<int>() { return 0; }
-template<> inline float test_precision<float>() { return 1e-3f; }
-template<> inline double test_precision<double>() { return 1e-6; }
-template<> inline float test_precision<std::complex<float> >() { return test_precision<float>(); }
-template<> inline double test_precision<std::complex<double> >() { return test_precision<double>(); }
-template<> inline long double test_precision<long double>() { return 1e-6; }
-
-inline bool test_ei_isApprox(const int& a, const int& b)
-{ return ei_isApprox(a, b, test_precision<int>()); }
-inline bool test_ei_isMuchSmallerThan(const int& a, const int& b)
-{ return ei_isMuchSmallerThan(a, b, test_precision<int>()); }
-inline bool test_ei_isApproxOrLessThan(const int& a, const int& b)
-{ return ei_isApproxOrLessThan(a, b, test_precision<int>()); }
-
-inline bool test_ei_isApprox(const float& a, const float& b)
-{ return ei_isApprox(a, b, test_precision<float>()); }
-inline bool test_ei_isMuchSmallerThan(const float& a, const float& b)
-{ return ei_isMuchSmallerThan(a, b, test_precision<float>()); }
-inline bool test_ei_isApproxOrLessThan(const float& a, const float& b)
-{ return ei_isApproxOrLessThan(a, b, test_precision<float>()); }
-
-inline bool test_ei_isApprox(const double& a, const double& b)
-{ return ei_isApprox(a, b, test_precision<double>()); }
-inline bool test_ei_isMuchSmallerThan(const double& a, const double& b)
-{ return ei_isMuchSmallerThan(a, b, test_precision<double>()); }
-inline bool test_ei_isApproxOrLessThan(const double& a, const double& b)
-{ return ei_isApproxOrLessThan(a, b, test_precision<double>()); }
-
-inline bool test_ei_isApprox(const std::complex<float>& a, const std::complex<float>& b)
-{ return ei_isApprox(a, b, test_precision<std::complex<float> >()); }
-inline bool test_ei_isMuchSmallerThan(const std::complex<float>& a, const std::complex<float>& b)
-{ return ei_isMuchSmallerThan(a, b, test_precision<std::complex<float> >()); }
-
-inline bool test_ei_isApprox(const std::complex<double>& a, const std::complex<double>& b)
-{ return ei_isApprox(a, b, test_precision<std::complex<double> >()); }
-inline bool test_ei_isMuchSmallerThan(const std::complex<double>& a, const std::complex<double>& b)
-{ return ei_isMuchSmallerThan(a, b, test_precision<std::complex<double> >()); }
-
-inline bool test_ei_isApprox(const long double& a, const long double& b)
-{ return ei_isApprox(a, b, test_precision<long double>()); }
-inline bool test_ei_isMuchSmallerThan(const long double& a, const long double& b)
-{ return ei_isMuchSmallerThan(a, b, test_precision<long double>()); }
-inline bool test_ei_isApproxOrLessThan(const long double& a, const long double& b)
-{ return ei_isApproxOrLessThan(a, b, test_precision<long double>()); }
-
-template<typename Type1, typename Type2>
-inline bool test_ei_isApprox(const Type1& a, const Type2& b)
-{
-  return a.isApprox(b, test_precision<typename Type1::Scalar>());
-}
-
-template<typename Derived1, typename Derived2>
-inline bool test_ei_isMuchSmallerThan(const MatrixBase<Derived1>& m1,
-                                   const MatrixBase<Derived2>& m2)
-{
-  return m1.isMuchSmallerThan(m2, test_precision<typename ei_traits<Derived1>::Scalar>());
-}
-
-template<typename Derived>
-inline bool test_ei_isMuchSmallerThan(const MatrixBase<Derived>& m,
-                                   const typename NumTraits<typename ei_traits<Derived>::Scalar>::Real& s)
-{
-  return m.isMuchSmallerThan(s, test_precision<typename ei_traits<Derived>::Scalar>());
-}
-
-} // end namespace Eigen
-
-template<typename T> struct GetDifferentType;
-
-template<> struct GetDifferentType<float> { typedef double type; };
-template<> struct GetDifferentType<double> { typedef float type; };
-template<typename T> struct GetDifferentType<std::complex<T> >
-{ typedef std::complex<typename GetDifferentType<T>::type> type; };
-
-// forward declaration of the main test function
-void EI_PP_CAT(test_,EIGEN_TEST_FUNC)();
-
-using namespace Eigen;
-
-#ifdef EIGEN_TEST_PART_1
-#define CALL_SUBTEST_1(FUNC) CALL_SUBTEST(FUNC)
-#else
-#define CALL_SUBTEST_1(FUNC)
-#endif
-
-#ifdef EIGEN_TEST_PART_2
-#define CALL_SUBTEST_2(FUNC) CALL_SUBTEST(FUNC)
-#else
-#define CALL_SUBTEST_2(FUNC)
-#endif
-
-#ifdef EIGEN_TEST_PART_3
-#define CALL_SUBTEST_3(FUNC) CALL_SUBTEST(FUNC)
-#else
-#define CALL_SUBTEST_3(FUNC)
-#endif
-
-#ifdef EIGEN_TEST_PART_4
-#define CALL_SUBTEST_4(FUNC) CALL_SUBTEST(FUNC)
-#else
-#define CALL_SUBTEST_4(FUNC)
-#endif
-
-#ifdef EIGEN_TEST_PART_5
-#define CALL_SUBTEST_5(FUNC) CALL_SUBTEST(FUNC)
-#else
-#define CALL_SUBTEST_5(FUNC)
-#endif
-
-#ifdef EIGEN_TEST_PART_6
-#define CALL_SUBTEST_6(FUNC) CALL_SUBTEST(FUNC)
-#else
-#define CALL_SUBTEST_6(FUNC)
-#endif
-
-#ifdef EIGEN_TEST_PART_7
-#define CALL_SUBTEST_7(FUNC) CALL_SUBTEST(FUNC)
-#else
-#define CALL_SUBTEST_7(FUNC)
-#endif
-
-#ifdef EIGEN_TEST_PART_8
-#define CALL_SUBTEST_8(FUNC) CALL_SUBTEST(FUNC)
-#else
-#define CALL_SUBTEST_8(FUNC)
-#endif
-
-#ifdef EIGEN_TEST_PART_9
-#define CALL_SUBTEST_9(FUNC) CALL_SUBTEST(FUNC)
-#else
-#define CALL_SUBTEST_9(FUNC)
-#endif
-
-#ifdef EIGEN_TEST_PART_10
-#define CALL_SUBTEST_10(FUNC) CALL_SUBTEST(FUNC)
-#else
-#define CALL_SUBTEST_10(FUNC)
-#endif
-
-#ifdef EIGEN_TEST_PART_11
-#define CALL_SUBTEST_11(FUNC) CALL_SUBTEST(FUNC)
-#else
-#define CALL_SUBTEST_11(FUNC)
-#endif
-
-#ifdef EIGEN_TEST_PART_12
-#define CALL_SUBTEST_12(FUNC) CALL_SUBTEST(FUNC)
-#else
-#define CALL_SUBTEST_12(FUNC)
-#endif
-
-#ifdef EIGEN_TEST_PART_13
-#define CALL_SUBTEST_13(FUNC) CALL_SUBTEST(FUNC)
-#else
-#define CALL_SUBTEST_13(FUNC)
-#endif
-
-#ifdef EIGEN_TEST_PART_14
-#define CALL_SUBTEST_14(FUNC) CALL_SUBTEST(FUNC)
-#else
-#define CALL_SUBTEST_14(FUNC)
-#endif
-
-#ifdef EIGEN_TEST_PART_15
-#define CALL_SUBTEST_15(FUNC) CALL_SUBTEST(FUNC)
-#else
-#define CALL_SUBTEST_15(FUNC)
-#endif
-
-#ifdef EIGEN_TEST_PART_16
-#define CALL_SUBTEST_16(FUNC) CALL_SUBTEST(FUNC)
-#else
-#define CALL_SUBTEST_16(FUNC)
-#endif
-
-
-
-int main(int argc, char *argv[])
-{
-    bool has_set_repeat = false;
-    bool has_set_seed = false;
-    bool need_help = false;
-    unsigned int seed = 0;
-    int repeat = DEFAULT_REPEAT;
-
-    for(int i = 1; i < argc; i++)
-    {
-      if(argv[i][0] == 'r')
-      {
-        if(has_set_repeat)
-        {
-          std::cout << "Argument " << argv[i] << " conflicting with a former argument" << std::endl;
-          return 1;
-        }
-        repeat = std::atoi(argv[i]+1);
-        has_set_repeat = true;
-        if(repeat <= 0)
-        {
-          std::cout << "Invalid \'repeat\' value " << argv[i]+1 << std::endl;
-          return 1;
-        }
-      }
-      else if(argv[i][0] == 's')
-      {
-        if(has_set_seed)
-        {
-          std::cout << "Argument " << argv[i] << " conflicting with a former argument" << std::endl;
-          return 1;
-        }
-        seed = int(std::strtoul(argv[i]+1, 0, 10));
-        has_set_seed = true;
-        bool ok = seed!=0;
-        if(!ok)
-        {
-          std::cout << "Invalid \'seed\' value " << argv[i]+1 << std::endl;
-          return 1;
-        }
-      }
-      else
-      {
-        need_help = true;
-      }
-    }
-
-    if(need_help)
-    {
-      std::cout << "This test application takes the following optional arguments:" << std::endl;
-      std::cout << "  rN     Repeat each test N times (default: " << DEFAULT_REPEAT << ")" << std::endl;
-      std::cout << "  sN     Use N as seed for random numbers (default: based on current time)" << std::endl;
-      return 1;
-    }
-
-    if(!has_set_seed) seed = (unsigned int) std::time(NULL);
-    if(!has_set_repeat) repeat = DEFAULT_REPEAT;
-
-    std::cout << "Initializing random number generator with seed " << seed << std::endl;
-	std::srand(seed);
-    std::cout << "Repeating each test " << repeat << " times" << std::endl;
-
-    Eigen::g_repeat = repeat;
-    Eigen::g_test_stack.push_back(EI_PP_MAKE_STRING(EIGEN_TEST_FUNC));
-
-    EI_PP_CAT(test_,EIGEN_TEST_FUNC)();
-    return 0;
-}
-
-
-
diff --git a/test/eigen2/product.h b/test/eigen2/product.h
deleted file mode 100644
index ae1b4bae4..000000000
--- a/test/eigen2/product.h
+++ /dev/null
@@ -1,129 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#include "main.h"
-#include <Eigen/Array>
-#include <Eigen/QR>
-
-template<typename Derived1, typename Derived2>
-bool areNotApprox(const MatrixBase<Derived1>& m1, const MatrixBase<Derived2>& m2, typename Derived1::RealScalar epsilon = precision<typename Derived1::RealScalar>())
-{
-  return !((m1-m2).cwise().abs2().maxCoeff() < epsilon * epsilon
-                          * std::max(m1.cwise().abs2().maxCoeff(), m2.cwise().abs2().maxCoeff()));
-}
-
-template<typename MatrixType> void product(const MatrixType& m)
-{
-  /* this test covers the following files:
-     Identity.h Product.h
-  */
-
-  typedef typename MatrixType::Scalar Scalar;
-  typedef typename NumTraits<Scalar>::FloatingPoint FloatingPoint;
-  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> RowVectorType;
-  typedef Matrix<Scalar, MatrixType::ColsAtCompileTime, 1> ColVectorType;
-  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::RowsAtCompileTime> RowSquareMatrixType;
-  typedef Matrix<Scalar, MatrixType::ColsAtCompileTime, MatrixType::ColsAtCompileTime> ColSquareMatrixType;
-  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::ColsAtCompileTime,
-                         MatrixType::Options^RowMajor> OtherMajorMatrixType;
-
-  int rows = m.rows();
-  int cols = m.cols();
-
-  // this test relies a lot on Random.h, and there's not much more that we can do
-  // to test it, hence I consider that we will have tested Random.h
-  MatrixType m1 = MatrixType::Random(rows, cols),
-             m2 = MatrixType::Random(rows, cols),
-             m3(rows, cols);
-  RowSquareMatrixType
-             identity = RowSquareMatrixType::Identity(rows, rows),
-             square = RowSquareMatrixType::Random(rows, rows),
-             res = RowSquareMatrixType::Random(rows, rows);
-  ColSquareMatrixType
-             square2 = ColSquareMatrixType::Random(cols, cols),
-             res2 = ColSquareMatrixType::Random(cols, cols);
-  RowVectorType v1 = RowVectorType::Random(rows);
-  ColVectorType vc2 = ColVectorType::Random(cols), vcres(cols);
-  OtherMajorMatrixType tm1 = m1;
-
-  Scalar s1 = ei_random<Scalar>();
-
-  int r = ei_random<int>(0, rows-1),
-      c = ei_random<int>(0, cols-1);
-
-  // begin testing Product.h: only associativity for now
-  // (we use Transpose.h but this doesn't count as a test for it)
-
-  VERIFY_IS_APPROX((m1*m1.transpose())*m2,  m1*(m1.transpose()*m2));
-  m3 = m1;
-  m3 *= m1.transpose() * m2;
-  VERIFY_IS_APPROX(m3,                      m1 * (m1.transpose()*m2));
-  VERIFY_IS_APPROX(m3,                      m1.lazy() * (m1.transpose()*m2));
-
-  // continue testing Product.h: distributivity
-  VERIFY_IS_APPROX(square*(m1 + m2),        square*m1+square*m2);
-  VERIFY_IS_APPROX(square*(m1 - m2),        square*m1-square*m2);
-
-  // continue testing Product.h: compatibility with ScalarMultiple.h
-  VERIFY_IS_APPROX(s1*(square*m1),          (s1*square)*m1);
-  VERIFY_IS_APPROX(s1*(square*m1),          square*(m1*s1));
-
-  // again, test operator() to check const-qualification
-  s1 += (square.lazy() * m1)(r,c);
-
-  // test Product.h together with Identity.h
-  VERIFY_IS_APPROX(v1,                      identity*v1);
-  VERIFY_IS_APPROX(v1.transpose(),          v1.transpose() * identity);
-  // again, test operator() to check const-qualification
-  VERIFY_IS_APPROX(MatrixType::Identity(rows, cols)(r,c), static_cast<Scalar>(r==c));
-
-  if (rows!=cols)
-     VERIFY_RAISES_ASSERT(m3 = m1*m1);
-
-  // test the previous tests were not screwed up because operator* returns 0
-  // (we use the more accurate default epsilon)
-  if (NumTraits<Scalar>::HasFloatingPoint && std::min(rows,cols)>1)
-  {
-    VERIFY(areNotApprox(m1.transpose()*m2,m2.transpose()*m1));
-  }
-
-  // test optimized operator+= path
-  res = square;
-  res += (m1 * m2.transpose()).lazy();
-  VERIFY_IS_APPROX(res, square + m1 * m2.transpose());
-  if (NumTraits<Scalar>::HasFloatingPoint && std::min(rows,cols)>1)
-  {
-    VERIFY(areNotApprox(res,square + m2 * m1.transpose()));
-  }
-  vcres = vc2;
-  vcres += (m1.transpose() * v1).lazy();
-  VERIFY_IS_APPROX(vcres, vc2 + m1.transpose() * v1);
-  tm1 = m1;
-  VERIFY_IS_APPROX(tm1.transpose() * v1, m1.transpose() * v1);
-  VERIFY_IS_APPROX(v1.transpose() * tm1, v1.transpose() * m1);
-
-  // test submatrix and matrix/vector product
-  for (int i=0; i<rows; ++i)
-    res.row(i) = m1.row(i) * m2.transpose();
-  VERIFY_IS_APPROX(res, m1 * m2.transpose());
-  // the other way round:
-  for (int i=0; i<rows; ++i)
-    res.col(i) = m1 * m2.transpose().col(i);
-  VERIFY_IS_APPROX(res, m1 * m2.transpose());
-
-  res2 = square2;
-  res2 += (m1.transpose() * m2).lazy();
-  VERIFY_IS_APPROX(res2, square2 + m1.transpose() * m2);
-
-  if (NumTraits<Scalar>::HasFloatingPoint && std::min(rows,cols)>1)
-  {
-    VERIFY(areNotApprox(res2,square2 + m2.transpose() * m1));
-  }
-}
-
diff --git a/test/eigen2/runtest.sh b/test/eigen2/runtest.sh
deleted file mode 100755
index bc693af13..000000000
--- a/test/eigen2/runtest.sh
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/bin/bash
-
-black='\E[30m'
-red='\E[31m'
-green='\E[32m'
-yellow='\E[33m'
-blue='\E[34m'
-magenta='\E[35m'
-cyan='\E[36m'
-white='\E[37m'
-
-if make test_$1 > /dev/null  2> .runtest.log ; then
-  if ! ./test_$1 r20 > /dev/null 2> .runtest.log ; then
-    echo -e  $red Test $1 failed: $black
-    echo -e $blue
-    cat .runtest.log
-    echo -e $black
-    exit 1
-  else
-  echo -e $green Test $1 passed$black
-  fi
-else
-  echo -e  $red Build of target $1 failed: $black
-  echo -e $blue
-  cat .runtest.log
-  echo -e $black
-  exit 1
-fi
diff --git a/test/eigen2/sparse.h b/test/eigen2/sparse.h
deleted file mode 100644
index e12f89990..000000000
--- a/test/eigen2/sparse.h
+++ /dev/null
@@ -1,154 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2008 Daniel Gomez Ferro <dgomezferro@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_TESTSPARSE_H
-
-#include "main.h"
-
-#if EIGEN_GNUC_AT_LEAST(4,0) && !defined __ICC
-#include <tr1/unordered_map>
-#define EIGEN_UNORDERED_MAP_SUPPORT
-namespace std {
-  using std::tr1::unordered_map;
-}
-#endif
-
-#ifdef EIGEN_GOOGLEHASH_SUPPORT
-  #include <google/sparse_hash_map>
-#endif
-
-#include <Eigen/Cholesky>
-#include <Eigen/LU>
-#include <Eigen/Sparse>
-
-enum {
-  ForceNonZeroDiag = 1,
-  MakeLowerTriangular = 2,
-  MakeUpperTriangular = 4,
-  ForceRealDiag = 8
-};
-
-/* Initializes both a sparse and dense matrix with same random values,
- * and a ratio of \a density non zero entries.
- * \param flags is a union of ForceNonZeroDiag, MakeLowerTriangular and MakeUpperTriangular
- *        allowing to control the shape of the matrix.
- * \param zeroCoords and nonzeroCoords allows to get the coordinate lists of the non zero,
- *        and zero coefficients respectively.
- */
-template<typename Scalar> void
-initSparse(double density,
-           Matrix<Scalar,Dynamic,Dynamic>& refMat,
-           SparseMatrix<Scalar>& sparseMat,
-           int flags = 0,
-           std::vector<Vector2i>* zeroCoords = 0,
-           std::vector<Vector2i>* nonzeroCoords = 0)
-{
-  sparseMat.startFill(int(refMat.rows()*refMat.cols()*density));
-  for(int j=0; j<refMat.cols(); j++)
-  {
-    for(int i=0; i<refMat.rows(); i++)
-    {
-      Scalar v = (ei_random<double>(0,1) < density) ? ei_random<Scalar>() : Scalar(0);
-      if ((flags&ForceNonZeroDiag) && (i==j))
-      {
-        v = ei_random<Scalar>()*Scalar(3.);
-        v = v*v + Scalar(5.);
-      }
-      if ((flags & MakeLowerTriangular) && j>i)
-        v = Scalar(0);
-      else if ((flags & MakeUpperTriangular) && j<i)
-        v = Scalar(0);
-      
-      if ((flags&ForceRealDiag) && (i==j))
-        v = ei_real(v);
-        
-      if (v!=Scalar(0))
-      {
-        sparseMat.fill(i,j) = v;
-        if (nonzeroCoords)
-          nonzeroCoords->push_back(Vector2i(i,j));
-      }
-      else if (zeroCoords)
-      {
-        zeroCoords->push_back(Vector2i(i,j));
-      }
-      refMat(i,j) = v;
-    }
-  }
-  sparseMat.endFill();
-}
-
-template<typename Scalar> void
-initSparse(double density,
-           Matrix<Scalar,Dynamic,Dynamic>& refMat,
-           DynamicSparseMatrix<Scalar>& sparseMat,
-           int flags = 0,
-           std::vector<Vector2i>* zeroCoords = 0,
-           std::vector<Vector2i>* nonzeroCoords = 0)
-{
-  sparseMat.startFill(int(refMat.rows()*refMat.cols()*density));
-  for(int j=0; j<refMat.cols(); j++)
-  {
-    for(int i=0; i<refMat.rows(); i++)
-    {
-      Scalar v = (ei_random<double>(0,1) < density) ? ei_random<Scalar>() : Scalar(0);
-      if ((flags&ForceNonZeroDiag) && (i==j))
-      {
-        v = ei_random<Scalar>()*Scalar(3.);
-        v = v*v + Scalar(5.);
-      }
-      if ((flags & MakeLowerTriangular) && j>i)
-        v = Scalar(0);
-      else if ((flags & MakeUpperTriangular) && j<i)
-        v = Scalar(0);
-      
-      if ((flags&ForceRealDiag) && (i==j))
-        v = ei_real(v);
-        
-      if (v!=Scalar(0))
-      {
-        sparseMat.fill(i,j) = v;
-        if (nonzeroCoords)
-          nonzeroCoords->push_back(Vector2i(i,j));
-      }
-      else if (zeroCoords)
-      {
-        zeroCoords->push_back(Vector2i(i,j));
-      }
-      refMat(i,j) = v;
-    }
-  }
-  sparseMat.endFill();
-}
-
-template<typename Scalar> void
-initSparse(double density,
-           Matrix<Scalar,Dynamic,1>& refVec,
-           SparseVector<Scalar>& sparseVec,
-           std::vector<int>* zeroCoords = 0,
-           std::vector<int>* nonzeroCoords = 0)
-{
-  sparseVec.reserve(int(refVec.size()*density));
-  sparseVec.setZero();
-  for(int i=0; i<refVec.size(); i++)
-  {
-    Scalar v = (ei_random<double>(0,1) < density) ? ei_random<Scalar>() : Scalar(0);
-    if (v!=Scalar(0))
-    {
-      sparseVec.fill(i) = v;
-      if (nonzeroCoords)
-        nonzeroCoords->push_back(i);
-    }
-    else if (zeroCoords)
-        zeroCoords->push_back(i);
-    refVec[i] = v;
-  }
-}
-
-#endif // EIGEN_TESTSPARSE_H
diff --git a/test/eigen2/testsuite.cmake b/test/eigen2/testsuite.cmake
deleted file mode 100644
index 12b6bfa2e..000000000
--- a/test/eigen2/testsuite.cmake
+++ /dev/null
@@ -1,197 +0,0 @@
-
-####################################################################
-#
-# Usage:
-#  - create a new folder, let's call it cdash
-#  - in that folder, do:
-#    ctest -S path/to/eigen2/test/testsuite.cmake[,option1=value1[,option2=value2]]
-#
-# Options:
-#  - EIGEN_CXX: compiler, eg.: g++-4.2
-#      default: default c++ compiler
-#  - EIGEN_SITE: eg, INRIA-Bdx_pc-gael, or the name of the contributor, etc.
-#      default: hostname
-#  - EIGEN_BUILD_STRING: a string which identify the system/compiler. It should be formed like that:
-#        <OS_name>-<OS_version>-<arch>-<compiler-version>
-#      with:
-#        <OS_name> = opensuse, debian, osx, windows, cygwin, freebsd, solaris, etc.
-#        <OS_version> = 11.1, XP, vista, leopard, etc.
-#        <arch> = i386, x86_64, ia64, powerpc, etc.
-#        <compiler-version> = gcc-4.3.2, icc-11.0, MSVC-2008, etc.
-#  - EIGEN_EXPLICIT_VECTORIZATION: novec, SSE2, Altivec
-#       default: SSE2 for x86_64 systems, novec otherwise
-#       Its value is automatically appended to EIGEN_BUILD_STRING
-#  - EIGEN_CMAKE_DIR: path to cmake executable
-#  - EIGEN_MODE: dashboard model, can be Experimental, Nightly, or Continuous
-#      default: Nightly
-#  - EIGEN_WORK_DIR: directory used to download the source files and make the builds
-#      default: folder which contains this script
-#  - EIGEN_CMAKE_ARGS: additional arguments passed to cmake
-#  - CTEST_SOURCE_DIRECTORY: path to eigen's src (use a new and empty folder, not the one you are working on)
-#      default: <EIGEN_WORK_DIR>/src
-#  - CTEST_BINARY_DIRECTORY: build directory
-#      default: <EIGEN_WORK_DIR>/nightly-<EIGEN_CXX>
-#
-# Here is an example running several compilers on a linux system:
-# #!/bin/bash
-# ARCH=`uname -m`
-# SITE=`hostname`
-# VERSION=opensuse-11.1
-# WORK_DIR=/home/gael/Coding/eigen2/cdash
-# # get the last version of the script
-# wget http://bitbucket.org/eigen/eigen/raw/tip/test/testsuite.cmake -o $WORK_DIR/testsuite.cmake
-# COMMON="ctest -S $WORK_DIR/testsuite.cmake,EIGEN_WORK_DIR=$WORK_DIR,EIGEN_SITE=$SITE,EIGEN_MODE=$1,EIGEN_BUILD_STRING=$OS_VERSION-$ARCH"
-# $COMMON-gcc-3.4.6,EIGEN_CXX=g++-3.4
-# $COMMON-gcc-4.0.1,EIGEN_CXX=g++-4.0.1
-# $COMMON-gcc-4.3.2,EIGEN_CXX=g++-4.3,EIGEN_EXPLICIT_VECTORIZATION=novec
-# $COMMON-gcc-4.3.2,EIGEN_CXX=g++-4.3,EIGEN_EXPLICIT_VECTORIZATION=SSE2
-# $COMMON-icc-11.0,EIGEN_CXX=icpc
-#
-####################################################################
-
-# process the arguments
-
-set(ARGLIST ${CTEST_SCRIPT_ARG})
-while(${ARGLIST} MATCHES  ".+.*")
-
-  # pick first
-  string(REGEX MATCH "([^,]*)(,.*)?" DUMMY ${ARGLIST})
-  SET(TOP ${CMAKE_MATCH_1})
-
-  # remove first
-  string(REGEX MATCHALL "[^,]*,(.*)" DUMMY ${ARGLIST})
-  SET(ARGLIST ${CMAKE_MATCH_1})
-
-  # decompose as a pair key=value
-  string(REGEX MATCH "([^=]*)(=.*)?" DUMMY ${TOP})
-  SET(KEY ${CMAKE_MATCH_1})
-
-  string(REGEX MATCH "[^=]*=(.*)" DUMMY ${TOP})
-  SET(VALUE ${CMAKE_MATCH_1})
-
-  # set the variable to the specified value
-  if(VALUE)
-    SET(${KEY} ${VALUE})
-  else(VALUE)
-    SET(${KEY} ON)
-  endif(VALUE)
-
-endwhile(${ARGLIST} MATCHES ".+.*")
-
-####################################################################
-# Automatically set some user variables if they have not been defined manually
-####################################################################
-cmake_minimum_required(VERSION 2.6 FATAL_ERROR)
-
-if(NOT EIGEN_SITE)
-  site_name(EIGEN_SITE)
-endif(NOT EIGEN_SITE)
-
-if(NOT EIGEN_CMAKE_DIR)
-  SET(EIGEN_CMAKE_DIR "")
-endif(NOT EIGEN_CMAKE_DIR)
-
-if(NOT EIGEN_BUILD_STRING)
-
-  # let's try to find all information we need to make the build string ourself
-
-  # OS
-  build_name(EIGEN_OS_VERSION)
-
-  # arch
-  set(EIGEN_ARCH ${CMAKE_SYSTEM_PROCESSOR})
-  if(WIN32)
-    set(EIGEN_ARCH $ENV{PROCESSOR_ARCHITECTURE})
-  else(WIN32)
-    execute_process(COMMAND uname -m OUTPUT_VARIABLE EIGEN_ARCH OUTPUT_STRIP_TRAILING_WHITESPACE)
-  endif(WIN32)
-
-  set(EIGEN_BUILD_STRING ${EIGEN_OS_VERSION}${EIGEN_ARCH}-${EIGEN_CXX})
-
-endif(NOT EIGEN_BUILD_STRING)
-
-if(DEFINED EIGEN_EXPLICIT_VECTORIZATION)
-  set(EIGEN_BUILD_STRING ${EIGEN_BUILD_STRING}-${EIGEN_EXPLICIT_VECTORIZATION})
-endif(DEFINED EIGEN_EXPLICIT_VECTORIZATION)
-
-if(NOT EIGEN_WORK_DIR)
-  set(EIGEN_WORK_DIR ${CTEST_SCRIPT_DIRECTORY})
-endif(NOT EIGEN_WORK_DIR)
-
-if(NOT CTEST_SOURCE_DIRECTORY)
-  SET (CTEST_SOURCE_DIRECTORY "${EIGEN_WORK_DIR}/src")
-endif(NOT CTEST_SOURCE_DIRECTORY)
-
-if(NOT CTEST_BINARY_DIRECTORY)
-  SET (CTEST_BINARY_DIRECTORY "${EIGEN_WORK_DIR}/nightly_${EIGEN_CXX}")
-endif(NOT CTEST_BINARY_DIRECTORY)
-
-if(NOT EIGEN_MODE)
-  set(EIGEN_MODE Nightly)
-endif(NOT EIGEN_MODE)
-
-## mandatory variables (the default should be ok in most cases):
-
-SET (CTEST_CVS_COMMAND "hg")
-SET (CTEST_CVS_CHECKOUT "${CTEST_CVS_COMMAND} clone -r 2.0 http://bitbucket.org/eigen/eigen \"${CTEST_SOURCE_DIRECTORY}\"")
-
-# which ctest command to use for running the dashboard
-SET (CTEST_COMMAND "${EIGEN_CMAKE_DIR}ctest -D ${EIGEN_MODE}")
-
-# what cmake command to use for configuring this dashboard
-SET (CTEST_CMAKE_COMMAND "${EIGEN_CMAKE_DIR}cmake -DEIGEN_BUILD_TESTS=on ")
-
-####################################################################
-# The values in this section are optional you can either
-# have them or leave them commented out
-####################################################################
-
-# this make sure we get consistent outputs
-SET($ENV{LC_MESSAGES} "en_EN")
-
-# should ctest wipe the binary tree before running
-SET(CTEST_START_WITH_EMPTY_BINARY_DIRECTORY TRUE)
-SET(CTEST_BACKUP_AND_RESTORE TRUE)
-
-# this is the initial cache to use for the binary tree, be careful to escape
-# any quotes inside of this string if you use it
-if(WIN32 AND NOT UNIX)
-  #message(SEND_ERROR "win32")
-  set(CTEST_CMAKE_COMMAND "${CTEST_CMAKE_COMMAND} -G \"NMake Makefiles\" -DCMAKE_MAKE_PROGRAM=nmake")
-  SET (CTEST_INITIAL_CACHE "
-    MAKECOMMAND:STRING=nmake -i
-    CMAKE_MAKE_PROGRAM:FILEPATH=nmake
-    CMAKE_GENERATOR:INTERNAL=NMake Makefiles
-    BUILDNAME:STRING=${EIGEN_BUILD_STRING}
-    SITE:STRING=${EIGEN_SITE}
-  ")
-else(WIN32 AND NOT UNIX)
-  SET (CTEST_INITIAL_CACHE "
-    BUILDNAME:STRING=${EIGEN_BUILD_STRING}
-    SITE:STRING=${EIGEN_SITE}
-  ")
-endif(WIN32 AND NOT UNIX)
-
-# set any extra environment variables to use during the execution of the script here:
-
-if(EIGEN_CXX)
-  set(CTEST_ENVIRONMENT "CXX=${EIGEN_CXX}")
-endif(EIGEN_CXX)
-
-if(DEFINED EIGEN_EXPLICIT_VECTORIZATION)
-  if(EIGEN_EXPLICIT_VECTORIZATION MATCHES SSE2)
-    set(CTEST_CMAKE_COMMAND "${CTEST_CMAKE_COMMAND} -DEIGEN_TEST_SSE2=ON")
-  elseif(EIGEN_EXPLICIT_VECTORIZATION MATCHES SSE3)
-    set(CTEST_CMAKE_COMMAND "${CTEST_CMAKE_COMMAND} -DEIGEN_TEST_SSE2=ON -DEIGEN_TEST_SSE3=ON")
-  elseif(EIGEN_EXPLICIT_VECTORIZATION MATCHES Altivec)
-    set(CTEST_CMAKE_COMMAND "${CTEST_CMAKE_COMMAND} -DEIGEN_TEST_ALTIVEC=ON")
-  elseif(EIGEN_EXPLICIT_VECTORIZATION MATCHES novec)
-    set(CTEST_CMAKE_COMMAND "${CTEST_CMAKE_COMMAND} -DEIGEN_TEST_NO_EXPLICIT_VECTORIZATION=ON")
-  else(EIGEN_EXPLICIT_VECTORIZATION MATCHES SSE2)
-    message(FATAL_ERROR "Invalid value for EIGEN_EXPLICIT_VECTORIZATION (${EIGEN_EXPLICIT_VECTORIZATION}), must be: novec, SSE2, SSE3, Altivec")
-  endif(EIGEN_EXPLICIT_VECTORIZATION MATCHES SSE2)
-endif(DEFINED EIGEN_EXPLICIT_VECTORIZATION)
-
-if(DEFINED EIGEN_CMAKE_ARGS)
-  set(CTEST_CMAKE_COMMAND "${CTEST_CMAKE_COMMAND} ${EIGEN_CMAKE_ARGS}")
-endif(DEFINED EIGEN_CMAKE_ARGS)
diff --git a/test/eigen2support.cpp b/test/eigen2support.cpp
index 1fa49a8c8..ad1d98091 100644
--- a/test/eigen2support.cpp
+++ b/test/eigen2support.cpp
@@ -8,7 +8,6 @@
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 #define EIGEN2_SUPPORT
-#define EIGEN_NO_EIGEN2_DEPRECATED_WARNING
 
 #include "main.h"
 
diff --git a/test/eigensolver_complex.cpp b/test/eigensolver_complex.cpp
index c9d8c0877..293b1b265 100644
--- a/test/eigensolver_complex.cpp
+++ b/test/eigensolver_complex.cpp
@@ -13,20 +13,59 @@
 #include <Eigen/Eigenvalues>
 #include <Eigen/LU>
 
-/* Check that two column vectors are approximately equal upto permutations,
-   by checking that the k-th power sums are equal for k = 1, ..., vec1.rows() */
+template<typename MatrixType> bool find_pivot(typename MatrixType::Scalar tol, MatrixType &diffs, Index col=0)
+{
+  bool match = diffs.diagonal().sum() <= tol;
+  if(match || col==diffs.cols())
+  {
+    return match;
+  }
+  else
+  {
+    Index n = diffs.cols();
+    std::vector<std::pair<Index,Index> > transpositions;
+    for(Index i=col; i<n; ++i)
+    {
+      Index best_index(0);
+      if(diffs.col(col).segment(col,n-i).minCoeff(&best_index) > tol)
+        break;
+      
+      best_index += col;
+      
+      diffs.row(col).swap(diffs.row(best_index));
+      if(find_pivot(tol,diffs,col+1)) return true;
+      diffs.row(col).swap(diffs.row(best_index));
+      
+      // move current pivot to the end
+      diffs.row(n-(i-col)-1).swap(diffs.row(best_index));
+      transpositions.push_back(std::pair<Index,Index>(n-(i-col)-1,best_index));
+    }
+    // restore
+    for(Index k=transpositions.size()-1; k>=0; --k)
+      diffs.row(transpositions[k].first).swap(diffs.row(transpositions[k].second));
+  }
+  return false;
+}
+
+/* Check that two column vectors are approximately equal upto permutations.
+ * Initially, this method checked that the k-th power sums are equal for all k = 1, ..., vec1.rows(),
+ * however this strategy is numerically inacurate because of numerical cancellation issues.
+ */
 template<typename VectorType>
 void verify_is_approx_upto_permutation(const VectorType& vec1, const VectorType& vec2)
 {
-  typedef typename NumTraits<typename VectorType::Scalar>::Real RealScalar;
+  typedef typename VectorType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
 
   VERIFY(vec1.cols() == 1);
   VERIFY(vec2.cols() == 1);
   VERIFY(vec1.rows() == vec2.rows());
-  for (int k = 1; k <= vec1.rows(); ++k)
-  {
-    VERIFY_IS_APPROX(vec1.array().pow(RealScalar(k)).sum(), vec2.array().pow(RealScalar(k)).sum());
-  }
+  
+  Index n = vec1.rows();
+  RealScalar tol = test_precision<RealScalar>()*test_precision<RealScalar>()*numext::maxi(vec1.squaredNorm(),vec2.squaredNorm());
+  Matrix<RealScalar,Dynamic,Dynamic> diffs = (vec1.rowwise().replicate(n) - vec2.rowwise().replicate(n).transpose()).cwiseAbs2();
+  
+  VERIFY( find_pivot(tol, diffs) );
 }
 
 
@@ -79,13 +118,28 @@ template<typename MatrixType> void eigensolver(const MatrixType& m)
   MatrixType id = MatrixType::Identity(rows, cols);
   VERIFY_IS_APPROX(id.operatorNorm(), RealScalar(1));
 
-  if (rows > 1)
+  if (rows > 1 && rows < 20)
   {
     // Test matrix with NaN
     a(0,0) = std::numeric_limits<typename MatrixType::RealScalar>::quiet_NaN();
     ComplexEigenSolver<MatrixType> eiNaN(a);
     VERIFY_IS_EQUAL(eiNaN.info(), NoConvergence);
   }
+
+  // regression test for bug 1098
+  {
+    ComplexEigenSolver<MatrixType> eig(a.adjoint() * a);
+    eig.compute(a.adjoint() * a);
+  }
+
+  // regression test for bug 478
+  {
+    a.setZero();
+    ComplexEigenSolver<MatrixType> ei3(a);
+    VERIFY_IS_EQUAL(ei3.info(), Success);
+    VERIFY_IS_MUCH_SMALLER_THAN(ei3.eigenvalues().norm(),RealScalar(1));
+    VERIFY((ei3.eigenvectors().transpose()*ei3.eigenvectors().transpose()).eval().isIdentity());
+  }
 }
 
 template<typename MatrixType> void eigensolver_verify_assert(const MatrixType& m)
@@ -108,6 +162,7 @@ void test_eigensolver_complex()
     CALL_SUBTEST_2( eigensolver(MatrixXcd(s,s)) );
     CALL_SUBTEST_3( eigensolver(Matrix<std::complex<float>, 1, 1>()) );
     CALL_SUBTEST_4( eigensolver(Matrix3f()) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
   }
   CALL_SUBTEST_1( eigensolver_verify_assert(Matrix4cf()) );
   s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/4);
diff --git a/test/eigensolver_generalized_real.cpp b/test/eigensolver_generalized_real.cpp
index 566a4bdc6..9c0838ba4 100644
--- a/test/eigensolver_generalized_real.cpp
+++ b/test/eigensolver_generalized_real.cpp
@@ -1,15 +1,17 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2012 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2012-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+#define EIGEN_RUNTIME_NO_MALLOC
 #include "main.h"
 #include <limits>
 #include <Eigen/Eigenvalues>
+#include <Eigen/LU>
 
 template<typename MatrixType> void generalized_eigensolver_real(const MatrixType& m)
 {
@@ -21,6 +23,7 @@ template<typename MatrixType> void generalized_eigensolver_real(const MatrixType
   Index cols = m.cols();
 
   typedef typename MatrixType::Scalar Scalar;
+  typedef std::complex<Scalar> ComplexScalar;
   typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
 
   MatrixType a = MatrixType::Random(rows,cols);
@@ -31,14 +34,49 @@ template<typename MatrixType> void generalized_eigensolver_real(const MatrixType
   MatrixType spdB =  b.adjoint() * b + b1.adjoint() * b1;
 
   // lets compare to GeneralizedSelfAdjointEigenSolver
-  GeneralizedSelfAdjointEigenSolver<MatrixType> symmEig(spdA, spdB);
-  GeneralizedEigenSolver<MatrixType> eig(spdA, spdB);
+  {
+    GeneralizedSelfAdjointEigenSolver<MatrixType> symmEig(spdA, spdB);
+    GeneralizedEigenSolver<MatrixType> eig(spdA, spdB);
 
-  VERIFY_IS_EQUAL(eig.eigenvalues().imag().cwiseAbs().maxCoeff(), 0);
+    VERIFY_IS_EQUAL(eig.eigenvalues().imag().cwiseAbs().maxCoeff(), 0);
 
-  VectorType realEigenvalues = eig.eigenvalues().real();
-  std::sort(realEigenvalues.data(), realEigenvalues.data()+realEigenvalues.size());
-  VERIFY_IS_APPROX(realEigenvalues, symmEig.eigenvalues());
+    VectorType realEigenvalues = eig.eigenvalues().real();
+    std::sort(realEigenvalues.data(), realEigenvalues.data()+realEigenvalues.size());
+    VERIFY_IS_APPROX(realEigenvalues, symmEig.eigenvalues());
+
+    // check eigenvectors
+    typename GeneralizedEigenSolver<MatrixType>::EigenvectorsType D = eig.eigenvalues().asDiagonal();
+    typename GeneralizedEigenSolver<MatrixType>::EigenvectorsType V = eig.eigenvectors();
+    VERIFY_IS_APPROX(spdA*V, spdB*V*D);
+  }
+
+  // non symmetric case:
+  {
+    GeneralizedEigenSolver<MatrixType> eig(rows);
+    // TODO enable full-prealocation of required memory, this probably requires an in-place mode for HessenbergDecomposition
+    //Eigen::internal::set_is_malloc_allowed(false);
+    eig.compute(a,b);
+    //Eigen::internal::set_is_malloc_allowed(true);
+    for(Index k=0; k<cols; ++k)
+    {
+      Matrix<ComplexScalar,Dynamic,Dynamic> tmp = (eig.betas()(k)*a).template cast<ComplexScalar>() - eig.alphas()(k)*b;
+      if(tmp.size()>1 && tmp.norm()>(std::numeric_limits<Scalar>::min)())
+        tmp /= tmp.norm();
+      VERIFY_IS_MUCH_SMALLER_THAN( std::abs(tmp.determinant()), Scalar(1) );
+    }
+    // check eigenvectors
+    typename GeneralizedEigenSolver<MatrixType>::EigenvectorsType D = eig.eigenvalues().asDiagonal();
+    typename GeneralizedEigenSolver<MatrixType>::EigenvectorsType V = eig.eigenvectors();
+    VERIFY_IS_APPROX(a*V, b*V*D);
+  }
+
+  // regression test for bug 1098
+  {
+    GeneralizedSelfAdjointEigenSolver<MatrixType> eig1(a.adjoint() * a,b.adjoint() * b);
+    eig1.compute(a.adjoint() * a,b.adjoint() * b);
+    GeneralizedEigenSolver<MatrixType> eig2(a.adjoint() * a,b.adjoint() * b);
+    eig2.compute(a.adjoint() * a,b.adjoint() * b);
+  }
 }
 
 void test_eigensolver_generalized_real()
@@ -49,7 +87,7 @@ void test_eigensolver_generalized_real()
     s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/4);
     CALL_SUBTEST_2( generalized_eigensolver_real(MatrixXd(s,s)) );
 
-    // some trivial but implementation-wise tricky cases
+    // some trivial but implementation-wise special cases
     CALL_SUBTEST_2( generalized_eigensolver_real(MatrixXd(1,1)) );
     CALL_SUBTEST_2( generalized_eigensolver_real(MatrixXd(2,2)) );
     CALL_SUBTEST_3( generalized_eigensolver_real(Matrix<double,1,1>()) );
diff --git a/test/eigensolver_generic.cpp b/test/eigensolver_generic.cpp
index 005af81eb..d0e644d4b 100644
--- a/test/eigensolver_generic.cpp
+++ b/test/eigensolver_generic.cpp
@@ -63,13 +63,28 @@ template<typename MatrixType> void eigensolver(const MatrixType& m)
   MatrixType id = MatrixType::Identity(rows, cols);
   VERIFY_IS_APPROX(id.operatorNorm(), RealScalar(1));
 
-  if (rows > 2)
+  if (rows > 2 && rows < 20)
   {
     // Test matrix with NaN
     a(0,0) = std::numeric_limits<typename MatrixType::RealScalar>::quiet_NaN();
     EigenSolver<MatrixType> eiNaN(a);
     VERIFY_IS_EQUAL(eiNaN.info(), NoConvergence);
   }
+
+  // regression test for bug 1098
+  {
+    EigenSolver<MatrixType> eig(a.adjoint() * a);
+    eig.compute(a.adjoint() * a);
+  }
+
+  // regression test for bug 478
+  {
+    a.setZero();
+    EigenSolver<MatrixType> ei3(a);
+    VERIFY_IS_EQUAL(ei3.info(), Success);
+    VERIFY_IS_MUCH_SMALLER_THAN(ei3.eigenvalues().norm(),RealScalar(1));
+    VERIFY((ei3.eigenvectors().transpose()*ei3.eigenvectors().transpose()).eval().isIdentity());
+  }
 }
 
 template<typename MatrixType> void eigensolver_verify_assert(const MatrixType& m)
@@ -93,6 +108,7 @@ void test_eigensolver_generic()
     CALL_SUBTEST_1( eigensolver(Matrix4f()) );
     s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/4);
     CALL_SUBTEST_2( eigensolver(MatrixXd(s,s)) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
 
     // some trivial but implementation-wise tricky cases
     CALL_SUBTEST_2( eigensolver(MatrixXd(1,1)) );
@@ -114,12 +130,37 @@ void test_eigensolver_generic()
   CALL_SUBTEST_2(
   {
      MatrixXd A(1,1);
-     A(0,0) = std::sqrt(-1.);
+     A(0,0) = std::sqrt(-1.); // is Not-a-Number
      Eigen::EigenSolver<MatrixXd> solver(A);
-     MatrixXd V(1, 1);
-     V(0,0) = solver.eigenvectors()(0,0).real();
+     VERIFY_IS_EQUAL(solver.info(), NumericalIssue);
   }
   );
   
+#ifdef EIGEN_TEST_PART_2
+  {
+    // regression test for bug 793
+    MatrixXd a(3,3);
+    a << 0,  0,  1,
+        1,  1, 1,
+        1, 1e+200,  1;
+    Eigen::EigenSolver<MatrixXd> eig(a);
+    double scale = 1e-200; // scale to avoid overflow during the comparisons
+    VERIFY_IS_APPROX(a * eig.pseudoEigenvectors()*scale, eig.pseudoEigenvectors() * eig.pseudoEigenvalueMatrix()*scale);
+    VERIFY_IS_APPROX(a * eig.eigenvectors()*scale, eig.eigenvectors() * eig.eigenvalues().asDiagonal()*scale);
+  }
+  {
+    // check a case where all eigenvalues are null.
+    MatrixXd a(2,2);
+    a << 1,  1,
+        -1, -1;
+    Eigen::EigenSolver<MatrixXd> eig(a);
+    VERIFY_IS_APPROX(eig.pseudoEigenvectors().squaredNorm(), 2.);
+    VERIFY_IS_APPROX((a * eig.pseudoEigenvectors()).norm()+1., 1.);
+    VERIFY_IS_APPROX((eig.pseudoEigenvectors() * eig.pseudoEigenvalueMatrix()).norm()+1., 1.);
+    VERIFY_IS_APPROX((a * eig.eigenvectors()).norm()+1., 1.);
+    VERIFY_IS_APPROX((eig.eigenvectors() * eig.eigenvalues().asDiagonal()).norm()+1., 1.);
+  }
+#endif
+  
   TEST_SET_BUT_UNUSED_VARIABLE(s)
 }
diff --git a/test/eigensolver_selfadjoint.cpp b/test/eigensolver_selfadjoint.cpp
index 38689cfbf..39ad4130e 100644
--- a/test/eigensolver_selfadjoint.cpp
+++ b/test/eigensolver_selfadjoint.cpp
@@ -9,8 +9,62 @@
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 #include "main.h"
+#include "svd_fill.h"
 #include <limits>
 #include <Eigen/Eigenvalues>
+#include <Eigen/SparseCore>
+
+
+template<typename MatrixType> void selfadjointeigensolver_essential_check(const MatrixType& m)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  RealScalar eival_eps = numext::mini<RealScalar>(test_precision<RealScalar>(),  NumTraits<Scalar>::dummy_precision()*20000);
+  
+  SelfAdjointEigenSolver<MatrixType> eiSymm(m);
+  VERIFY_IS_EQUAL(eiSymm.info(), Success);
+
+  RealScalar scaling = m.cwiseAbs().maxCoeff();
+
+  if(scaling<(std::numeric_limits<RealScalar>::min)())
+  {
+    VERIFY(eiSymm.eigenvalues().cwiseAbs().maxCoeff() <= (std::numeric_limits<RealScalar>::min)());
+  }
+  else
+  {
+    VERIFY_IS_APPROX((m.template selfadjointView<Lower>() * eiSymm.eigenvectors())/scaling,
+                     (eiSymm.eigenvectors() * eiSymm.eigenvalues().asDiagonal())/scaling);
+  }
+  VERIFY_IS_APPROX(m.template selfadjointView<Lower>().eigenvalues(), eiSymm.eigenvalues());
+  VERIFY_IS_UNITARY(eiSymm.eigenvectors());
+
+  if(m.cols()<=4)
+  {
+    SelfAdjointEigenSolver<MatrixType> eiDirect;
+    eiDirect.computeDirect(m);  
+    VERIFY_IS_EQUAL(eiDirect.info(), Success);
+    if(! eiSymm.eigenvalues().isApprox(eiDirect.eigenvalues(), eival_eps) )
+    {
+      std::cerr << "reference eigenvalues: " << eiSymm.eigenvalues().transpose() << "\n"
+                << "obtained eigenvalues:  " << eiDirect.eigenvalues().transpose() << "\n"
+                << "diff:                  " << (eiSymm.eigenvalues()-eiDirect.eigenvalues()).transpose() << "\n"
+                << "error (eps):           " << (eiSymm.eigenvalues()-eiDirect.eigenvalues()).norm() / eiSymm.eigenvalues().norm() << "  (" << eival_eps << ")\n";
+    }
+    if(scaling<(std::numeric_limits<RealScalar>::min)())
+    {
+      VERIFY(eiDirect.eigenvalues().cwiseAbs().maxCoeff() <= (std::numeric_limits<RealScalar>::min)());
+    }
+    else
+    {
+      VERIFY_IS_APPROX(eiSymm.eigenvalues()/scaling, eiDirect.eigenvalues()/scaling);
+      VERIFY_IS_APPROX((m.template selfadjointView<Lower>() * eiDirect.eigenvectors())/scaling,
+                       (eiDirect.eigenvectors() * eiDirect.eigenvalues().asDiagonal())/scaling);
+      VERIFY_IS_APPROX(m.template selfadjointView<Lower>().eigenvalues()/scaling, eiDirect.eigenvalues()/scaling);
+    }
+
+    VERIFY_IS_UNITARY(eiDirect.eigenvectors());
+  }
+}
 
 template<typename MatrixType> void selfadjointeigensolver(const MatrixType& m)
 {
@@ -31,17 +85,8 @@ template<typename MatrixType> void selfadjointeigensolver(const MatrixType& m)
   MatrixType symmA =  a.adjoint() * a + a1.adjoint() * a1;
   MatrixType symmC = symmA;
   
-  // randomly nullify some rows/columns
-  {
-    Index count = 1;//internal::random<Index>(-cols,cols);
-    for(Index k=0; k<count; ++k)
-    {
-      Index i = internal::random<Index>(0,cols-1);
-      symmA.row(i).setZero();
-      symmA.col(i).setZero();
-    }
-  }
-  
+  svd_fill_random(symmA,Symmetric);
+
   symmA.template triangularView<StrictlyUpper>().setZero();
   symmC.template triangularView<StrictlyUpper>().setZero();
 
@@ -49,23 +94,13 @@ template<typename MatrixType> void selfadjointeigensolver(const MatrixType& m)
   MatrixType b1 = MatrixType::Random(rows,cols);
   MatrixType symmB = b.adjoint() * b + b1.adjoint() * b1;
   symmB.template triangularView<StrictlyUpper>().setZero();
+  
+  CALL_SUBTEST( selfadjointeigensolver_essential_check(symmA) );
 
   SelfAdjointEigenSolver<MatrixType> eiSymm(symmA);
-  SelfAdjointEigenSolver<MatrixType> eiDirect;
-  eiDirect.computeDirect(symmA);
   // generalized eigen pb
   GeneralizedSelfAdjointEigenSolver<MatrixType> eiSymmGen(symmC, symmB);
 
-  VERIFY_IS_EQUAL(eiSymm.info(), Success);
-  VERIFY((symmA.template selfadjointView<Lower>() * eiSymm.eigenvectors()).isApprox(
-          eiSymm.eigenvectors() * eiSymm.eigenvalues().asDiagonal(), largerEps));
-  VERIFY_IS_APPROX(symmA.template selfadjointView<Lower>().eigenvalues(), eiSymm.eigenvalues());
-  
-  VERIFY_IS_EQUAL(eiDirect.info(), Success);
-  VERIFY((symmA.template selfadjointView<Lower>() * eiDirect.eigenvectors()).isApprox(
-          eiDirect.eigenvectors() * eiDirect.eigenvalues().asDiagonal(), largerEps));
-  VERIFY_IS_APPROX(symmA.template selfadjointView<Lower>().eigenvalues(), eiDirect.eigenvalues());
-
   SelfAdjointEigenSolver<MatrixType> eiSymmNoEivecs(symmA, false);
   VERIFY_IS_EQUAL(eiSymmNoEivecs.info(), Success);
   VERIFY_IS_APPROX(eiSymm.eigenvalues(), eiSymmNoEivecs.eigenvalues());
@@ -111,37 +146,111 @@ template<typename MatrixType> void selfadjointeigensolver(const MatrixType& m)
 
   // test Tridiagonalization's methods
   Tridiagonalization<MatrixType> tridiag(symmC);
-  // FIXME tridiag.matrixQ().adjoint() does not work
+  VERIFY_IS_APPROX(tridiag.diagonal(), tridiag.matrixT().diagonal());
+  VERIFY_IS_APPROX(tridiag.subDiagonal(), tridiag.matrixT().template diagonal<-1>());
+  Matrix<RealScalar,Dynamic,Dynamic> T = tridiag.matrixT();
+  if(rows>1 && cols>1) {
+    // FIXME check that upper and lower part are 0:
+    //VERIFY(T.topRightCorner(rows-2, cols-2).template triangularView<Upper>().isZero());
+  }
+  VERIFY_IS_APPROX(tridiag.diagonal(), T.diagonal());
+  VERIFY_IS_APPROX(tridiag.subDiagonal(), T.template diagonal<1>());
   VERIFY_IS_APPROX(MatrixType(symmC.template selfadjointView<Lower>()), tridiag.matrixQ() * tridiag.matrixT().eval() * MatrixType(tridiag.matrixQ()).adjoint());
+  VERIFY_IS_APPROX(MatrixType(symmC.template selfadjointView<Lower>()), tridiag.matrixQ() * tridiag.matrixT() * tridiag.matrixQ().adjoint());
   
-  if (rows > 1)
+  // Test computation of eigenvalues from tridiagonal matrix
+  if(rows > 1)
+  {
+    SelfAdjointEigenSolver<MatrixType> eiSymmTridiag;
+    eiSymmTridiag.computeFromTridiagonal(tridiag.matrixT().diagonal(), tridiag.matrixT().diagonal(-1), ComputeEigenvectors);
+    VERIFY_IS_APPROX(eiSymm.eigenvalues(), eiSymmTridiag.eigenvalues());
+    VERIFY_IS_APPROX(tridiag.matrixT(), eiSymmTridiag.eigenvectors().real() * eiSymmTridiag.eigenvalues().asDiagonal() * eiSymmTridiag.eigenvectors().real().transpose());
+  }
+
+  if (rows > 1 && rows < 20)
   {
     // Test matrix with NaN
     symmC(0,0) = std::numeric_limits<typename MatrixType::RealScalar>::quiet_NaN();
     SelfAdjointEigenSolver<MatrixType> eiSymmNaN(symmC);
     VERIFY_IS_EQUAL(eiSymmNaN.info(), NoConvergence);
   }
+
+  // regression test for bug 1098
+  {
+    SelfAdjointEigenSolver<MatrixType> eig(a.adjoint() * a);
+    eig.compute(a.adjoint() * a);
+  }
+
+  // regression test for bug 478
+  {
+    a.setZero();
+    SelfAdjointEigenSolver<MatrixType> ei3(a);
+    VERIFY_IS_EQUAL(ei3.info(), Success);
+    VERIFY_IS_MUCH_SMALLER_THAN(ei3.eigenvalues().norm(),RealScalar(1));
+    VERIFY((ei3.eigenvectors().transpose()*ei3.eigenvectors().transpose()).eval().isIdentity());
+  }
+}
+
+template<int>
+void bug_854()
+{
+  Matrix3d m;
+  m << 850.961, 51.966, 0,
+       51.966, 254.841, 0,
+            0,       0, 0;
+  selfadjointeigensolver_essential_check(m);
+}
+
+template<int>
+void bug_1014()
+{
+  Matrix3d m;
+  m <<        0.11111111111111114658, 0, 0,
+       0,     0.11111111111111109107, 0,
+       0, 0,  0.11111111111111107719;
+  selfadjointeigensolver_essential_check(m);
+}
+
+template<int>
+void bug_1225()
+{
+  Matrix3d m1, m2;
+  m1.setRandom();
+  m1 = m1*m1.transpose();
+  m2 = m1.triangularView<Upper>();
+  SelfAdjointEigenSolver<Matrix3d> eig1(m1);
+  SelfAdjointEigenSolver<Matrix3d> eig2(m2.selfadjointView<Upper>());
+  VERIFY_IS_APPROX(eig1.eigenvalues(), eig2.eigenvalues());
+}
+
+template<int>
+void bug_1204()
+{
+  SparseMatrix<double> A(2,2);
+  A.setIdentity();
+  SelfAdjointEigenSolver<Eigen::SparseMatrix<double> > eig(A);
 }
 
 void test_eigensolver_selfadjoint()
 {
   int s = 0;
   for(int i = 0; i < g_repeat; i++) {
+    // trivial test for 1x1 matrices:
+    CALL_SUBTEST_1( selfadjointeigensolver(Matrix<float, 1, 1>()));
+    CALL_SUBTEST_1( selfadjointeigensolver(Matrix<double, 1, 1>()));
     // very important to test 3x3 and 2x2 matrices since we provide special paths for them
-    CALL_SUBTEST_1( selfadjointeigensolver(Matrix2f()) );
-    CALL_SUBTEST_1( selfadjointeigensolver(Matrix2d()) );
-    CALL_SUBTEST_1( selfadjointeigensolver(Matrix3f()) );
-    CALL_SUBTEST_1( selfadjointeigensolver(Matrix3d()) );
+    CALL_SUBTEST_12( selfadjointeigensolver(Matrix2f()) );
+    CALL_SUBTEST_12( selfadjointeigensolver(Matrix2d()) );
+    CALL_SUBTEST_13( selfadjointeigensolver(Matrix3f()) );
+    CALL_SUBTEST_13( selfadjointeigensolver(Matrix3d()) );
     CALL_SUBTEST_2( selfadjointeigensolver(Matrix4d()) );
+    
     s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/4);
     CALL_SUBTEST_3( selfadjointeigensolver(MatrixXf(s,s)) );
-    s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/4);
     CALL_SUBTEST_4( selfadjointeigensolver(MatrixXd(s,s)) );
-    s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/4);
     CALL_SUBTEST_5( selfadjointeigensolver(MatrixXcd(s,s)) );
-    
-    s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/4);
     CALL_SUBTEST_9( selfadjointeigensolver(Matrix<std::complex<double>,Dynamic,Dynamic,RowMajor>(s,s)) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
 
     // some trivial but implementation-wise tricky cases
     CALL_SUBTEST_4( selfadjointeigensolver(MatrixXd(1,1)) );
@@ -149,6 +258,11 @@ void test_eigensolver_selfadjoint()
     CALL_SUBTEST_6( selfadjointeigensolver(Matrix<double,1,1>()) );
     CALL_SUBTEST_7( selfadjointeigensolver(Matrix<double,2,2>()) );
   }
+  
+  CALL_SUBTEST_13( bug_854<0>() );
+  CALL_SUBTEST_13( bug_1014<0>() );
+  CALL_SUBTEST_13( bug_1204<0>() );
+  CALL_SUBTEST_13( bug_1225<0>() );
 
   // Test problem size constructors
   s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/4);
diff --git a/test/evaluator_common.h b/test/evaluator_common.h
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/test/evaluator_common.h
diff --git a/test/evaluators.cpp b/test/evaluators.cpp
new file mode 100644
index 000000000..aed5a05a7
--- /dev/null
+++ b/test/evaluators.cpp
@@ -0,0 +1,499 @@
+
+#include "main.h"
+
+namespace Eigen {
+
+  template<typename Lhs,typename Rhs>
+  const Product<Lhs,Rhs>
+  prod(const Lhs& lhs, const Rhs& rhs)
+  {
+    return Product<Lhs,Rhs>(lhs,rhs);
+  }
+
+  template<typename Lhs,typename Rhs>
+  const Product<Lhs,Rhs,LazyProduct>
+  lazyprod(const Lhs& lhs, const Rhs& rhs)
+  {
+    return Product<Lhs,Rhs,LazyProduct>(lhs,rhs);
+  }
+  
+  template<typename DstXprType, typename SrcXprType>
+  EIGEN_STRONG_INLINE
+  DstXprType& copy_using_evaluator(const EigenBase<DstXprType> &dst, const SrcXprType &src)
+  {
+    call_assignment(dst.const_cast_derived(), src.derived(), internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar>());
+    return dst.const_cast_derived();
+  }
+  
+  template<typename DstXprType, template <typename> class StorageBase, typename SrcXprType>
+  EIGEN_STRONG_INLINE
+  const DstXprType& copy_using_evaluator(const NoAlias<DstXprType, StorageBase>& dst, const SrcXprType &src)
+  {
+    call_assignment(dst, src.derived(), internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar>());
+    return dst.expression();
+  }
+  
+  template<typename DstXprType, typename SrcXprType>
+  EIGEN_STRONG_INLINE
+  DstXprType& copy_using_evaluator(const PlainObjectBase<DstXprType> &dst, const SrcXprType &src)
+  {
+    #ifdef EIGEN_NO_AUTOMATIC_RESIZING
+    eigen_assert((dst.size()==0 || (IsVectorAtCompileTime ? (dst.size() == src.size())
+                                                          : (dst.rows() == src.rows() && dst.cols() == src.cols())))
+                && "Size mismatch. Automatic resizing is disabled because EIGEN_NO_AUTOMATIC_RESIZING is defined");
+  #else
+    dst.const_cast_derived().resizeLike(src.derived());
+  #endif
+    
+    call_assignment(dst.const_cast_derived(), src.derived(), internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar>());
+    return dst.const_cast_derived();
+  }
+
+  template<typename DstXprType, typename SrcXprType>
+  void add_assign_using_evaluator(const DstXprType& dst, const SrcXprType& src)
+  {
+    typedef typename DstXprType::Scalar Scalar;
+    call_assignment(const_cast<DstXprType&>(dst), src.derived(), internal::add_assign_op<Scalar,typename SrcXprType::Scalar>());
+  }
+
+  template<typename DstXprType, typename SrcXprType>
+  void subtract_assign_using_evaluator(const DstXprType& dst, const SrcXprType& src)
+  {
+    typedef typename DstXprType::Scalar Scalar;
+    call_assignment(const_cast<DstXprType&>(dst), src.derived(), internal::sub_assign_op<Scalar,typename SrcXprType::Scalar>());
+  }
+
+  template<typename DstXprType, typename SrcXprType>
+  void multiply_assign_using_evaluator(const DstXprType& dst, const SrcXprType& src)
+  {
+    typedef typename DstXprType::Scalar Scalar;
+    call_assignment(dst.const_cast_derived(), src.derived(), internal::mul_assign_op<Scalar,typename SrcXprType::Scalar>());
+  }
+
+  template<typename DstXprType, typename SrcXprType>
+  void divide_assign_using_evaluator(const DstXprType& dst, const SrcXprType& src)
+  {
+    typedef typename DstXprType::Scalar Scalar;
+    call_assignment(dst.const_cast_derived(), src.derived(), internal::div_assign_op<Scalar,typename SrcXprType::Scalar>());
+  }
+  
+  template<typename DstXprType, typename SrcXprType>
+  void swap_using_evaluator(const DstXprType& dst, const SrcXprType& src)
+  {
+    typedef typename DstXprType::Scalar Scalar;
+    call_assignment(dst.const_cast_derived(), src.const_cast_derived(), internal::swap_assign_op<Scalar>());
+  }
+
+  namespace internal {
+    template<typename Dst, template <typename> class StorageBase, typename Src, typename Func>
+    EIGEN_DEVICE_FUNC void call_assignment(const NoAlias<Dst,StorageBase>& dst, const Src& src, const Func& func)
+    {
+      call_assignment_no_alias(dst.expression(), src, func);
+    }
+  }
+  
+}
+
+template<typename XprType> long get_cost(const XprType& ) { return Eigen::internal::evaluator<XprType>::CoeffReadCost; }
+
+using namespace std;
+
+#define VERIFY_IS_APPROX_EVALUATOR(DEST,EXPR) VERIFY_IS_APPROX(copy_using_evaluator(DEST,(EXPR)), (EXPR).eval());
+#define VERIFY_IS_APPROX_EVALUATOR2(DEST,EXPR,REF) VERIFY_IS_APPROX(copy_using_evaluator(DEST,(EXPR)), (REF).eval());
+
+void test_evaluators()
+{
+  // Testing Matrix evaluator and Transpose
+  Vector2d v = Vector2d::Random();
+  const Vector2d v_const(v);
+  Vector2d v2;
+  RowVector2d w;
+
+  VERIFY_IS_APPROX_EVALUATOR(v2, v);
+  VERIFY_IS_APPROX_EVALUATOR(v2, v_const);
+
+  // Testing Transpose
+  VERIFY_IS_APPROX_EVALUATOR(w, v.transpose()); // Transpose as rvalue
+  VERIFY_IS_APPROX_EVALUATOR(w, v_const.transpose());
+
+  copy_using_evaluator(w.transpose(), v); // Transpose as lvalue
+  VERIFY_IS_APPROX(w,v.transpose().eval());
+
+  copy_using_evaluator(w.transpose(), v_const);
+  VERIFY_IS_APPROX(w,v_const.transpose().eval());
+
+  // Testing Array evaluator
+  {
+    ArrayXXf a(2,3);
+    ArrayXXf b(3,2);
+    a << 1,2,3, 4,5,6;
+    const ArrayXXf a_const(a);
+
+    VERIFY_IS_APPROX_EVALUATOR(b, a.transpose());
+
+    VERIFY_IS_APPROX_EVALUATOR(b, a_const.transpose());
+
+    // Testing CwiseNullaryOp evaluator
+    copy_using_evaluator(w, RowVector2d::Random());
+    VERIFY((w.array() >= -1).all() && (w.array() <= 1).all()); // not easy to test ...
+
+    VERIFY_IS_APPROX_EVALUATOR(w, RowVector2d::Zero());
+
+    VERIFY_IS_APPROX_EVALUATOR(w, RowVector2d::Constant(3));
+    
+    // mix CwiseNullaryOp and transpose
+    VERIFY_IS_APPROX_EVALUATOR(w, Vector2d::Zero().transpose());
+  }
+
+  {
+    // test product expressions
+    int s = internal::random<int>(1,100);
+    MatrixXf a(s,s), b(s,s), c(s,s), d(s,s);
+    a.setRandom();
+    b.setRandom();
+    c.setRandom();
+    d.setRandom();
+    VERIFY_IS_APPROX_EVALUATOR(d, (a + b));
+    VERIFY_IS_APPROX_EVALUATOR(d, (a + b).transpose());
+    VERIFY_IS_APPROX_EVALUATOR2(d, prod(a,b), a*b);
+    VERIFY_IS_APPROX_EVALUATOR2(d.noalias(), prod(a,b), a*b);
+    VERIFY_IS_APPROX_EVALUATOR2(d, prod(a,b) + c, a*b + c);
+    VERIFY_IS_APPROX_EVALUATOR2(d, s * prod(a,b), s * a*b);
+    VERIFY_IS_APPROX_EVALUATOR2(d, prod(a,b).transpose(), (a*b).transpose());
+    VERIFY_IS_APPROX_EVALUATOR2(d, prod(a,b) + prod(b,c), a*b + b*c);
+
+    // check that prod works even with aliasing present
+    c = a*a;
+    copy_using_evaluator(a, prod(a,a));
+    VERIFY_IS_APPROX(a,c);
+
+    // check compound assignment of products
+    d = c;
+    add_assign_using_evaluator(c.noalias(), prod(a,b));
+    d.noalias() += a*b;
+    VERIFY_IS_APPROX(c, d);
+
+    d = c;
+    subtract_assign_using_evaluator(c.noalias(), prod(a,b));
+    d.noalias() -= a*b;
+    VERIFY_IS_APPROX(c, d);
+  }
+
+  {
+    // test product with all possible sizes
+    int s = internal::random<int>(1,100);
+    Matrix<float,      1,      1> m11, res11;  m11.setRandom(1,1);
+    Matrix<float,      1,      4> m14, res14;  m14.setRandom(1,4);
+    Matrix<float,      1,Dynamic> m1X, res1X;  m1X.setRandom(1,s);
+    Matrix<float,      4,      1> m41, res41;  m41.setRandom(4,1);
+    Matrix<float,      4,      4> m44, res44;  m44.setRandom(4,4);
+    Matrix<float,      4,Dynamic> m4X, res4X;  m4X.setRandom(4,s);
+    Matrix<float,Dynamic,      1> mX1, resX1;  mX1.setRandom(s,1);
+    Matrix<float,Dynamic,      4> mX4, resX4;  mX4.setRandom(s,4);
+    Matrix<float,Dynamic,Dynamic> mXX, resXX;  mXX.setRandom(s,s);
+
+    VERIFY_IS_APPROX_EVALUATOR2(res11, prod(m11,m11), m11*m11);
+    VERIFY_IS_APPROX_EVALUATOR2(res11, prod(m14,m41), m14*m41);
+    VERIFY_IS_APPROX_EVALUATOR2(res11, prod(m1X,mX1), m1X*mX1);
+    VERIFY_IS_APPROX_EVALUATOR2(res14, prod(m11,m14), m11*m14);
+    VERIFY_IS_APPROX_EVALUATOR2(res14, prod(m14,m44), m14*m44);
+    VERIFY_IS_APPROX_EVALUATOR2(res14, prod(m1X,mX4), m1X*mX4);
+    VERIFY_IS_APPROX_EVALUATOR2(res1X, prod(m11,m1X), m11*m1X);
+    VERIFY_IS_APPROX_EVALUATOR2(res1X, prod(m14,m4X), m14*m4X);
+    VERIFY_IS_APPROX_EVALUATOR2(res1X, prod(m1X,mXX), m1X*mXX);
+    VERIFY_IS_APPROX_EVALUATOR2(res41, prod(m41,m11), m41*m11);
+    VERIFY_IS_APPROX_EVALUATOR2(res41, prod(m44,m41), m44*m41);
+    VERIFY_IS_APPROX_EVALUATOR2(res41, prod(m4X,mX1), m4X*mX1);
+    VERIFY_IS_APPROX_EVALUATOR2(res44, prod(m41,m14), m41*m14);
+    VERIFY_IS_APPROX_EVALUATOR2(res44, prod(m44,m44), m44*m44);
+    VERIFY_IS_APPROX_EVALUATOR2(res44, prod(m4X,mX4), m4X*mX4);
+    VERIFY_IS_APPROX_EVALUATOR2(res4X, prod(m41,m1X), m41*m1X);
+    VERIFY_IS_APPROX_EVALUATOR2(res4X, prod(m44,m4X), m44*m4X);
+    VERIFY_IS_APPROX_EVALUATOR2(res4X, prod(m4X,mXX), m4X*mXX);
+    VERIFY_IS_APPROX_EVALUATOR2(resX1, prod(mX1,m11), mX1*m11);
+    VERIFY_IS_APPROX_EVALUATOR2(resX1, prod(mX4,m41), mX4*m41);
+    VERIFY_IS_APPROX_EVALUATOR2(resX1, prod(mXX,mX1), mXX*mX1);
+    VERIFY_IS_APPROX_EVALUATOR2(resX4, prod(mX1,m14), mX1*m14);
+    VERIFY_IS_APPROX_EVALUATOR2(resX4, prod(mX4,m44), mX4*m44);
+    VERIFY_IS_APPROX_EVALUATOR2(resX4, prod(mXX,mX4), mXX*mX4);
+    VERIFY_IS_APPROX_EVALUATOR2(resXX, prod(mX1,m1X), mX1*m1X);
+    VERIFY_IS_APPROX_EVALUATOR2(resXX, prod(mX4,m4X), mX4*m4X);
+    VERIFY_IS_APPROX_EVALUATOR2(resXX, prod(mXX,mXX), mXX*mXX);
+  }
+
+  {
+    ArrayXXf a(2,3);
+    ArrayXXf b(3,2);
+    a << 1,2,3, 4,5,6;
+    const ArrayXXf a_const(a);
+    
+    // this does not work because Random is eval-before-nested: 
+    // copy_using_evaluator(w, Vector2d::Random().transpose());
+
+    // test CwiseUnaryOp
+    VERIFY_IS_APPROX_EVALUATOR(v2, 3 * v);
+    VERIFY_IS_APPROX_EVALUATOR(w, (3 * v).transpose());
+    VERIFY_IS_APPROX_EVALUATOR(b, (a + 3).transpose());
+    VERIFY_IS_APPROX_EVALUATOR(b, (2 * a_const + 3).transpose());
+
+    // test CwiseBinaryOp
+    VERIFY_IS_APPROX_EVALUATOR(v2, v + Vector2d::Ones());
+    VERIFY_IS_APPROX_EVALUATOR(w, (v + Vector2d::Ones()).transpose().cwiseProduct(RowVector2d::Constant(3)));
+
+    // dynamic matrices and arrays
+    MatrixXd mat1(6,6), mat2(6,6);
+    VERIFY_IS_APPROX_EVALUATOR(mat1, MatrixXd::Identity(6,6));
+    VERIFY_IS_APPROX_EVALUATOR(mat2, mat1);
+    copy_using_evaluator(mat2.transpose(), mat1);
+    VERIFY_IS_APPROX(mat2.transpose(), mat1);
+
+    ArrayXXd arr1(6,6), arr2(6,6);
+    VERIFY_IS_APPROX_EVALUATOR(arr1, ArrayXXd::Constant(6,6, 3.0));
+    VERIFY_IS_APPROX_EVALUATOR(arr2, arr1);
+    
+    // test automatic resizing
+    mat2.resize(3,3);
+    VERIFY_IS_APPROX_EVALUATOR(mat2, mat1);
+    arr2.resize(9,9);
+    VERIFY_IS_APPROX_EVALUATOR(arr2, arr1);
+
+    // test direct traversal
+    Matrix3f m3;
+    Array33f a3;
+    VERIFY_IS_APPROX_EVALUATOR(m3, Matrix3f::Identity());  // matrix, nullary
+    // TODO: find a way to test direct traversal with array
+    VERIFY_IS_APPROX_EVALUATOR(m3.transpose(), Matrix3f::Identity().transpose());  // transpose
+    VERIFY_IS_APPROX_EVALUATOR(m3, 2 * Matrix3f::Identity());  // unary
+    VERIFY_IS_APPROX_EVALUATOR(m3, Matrix3f::Identity() + Matrix3f::Zero());  // binary
+    VERIFY_IS_APPROX_EVALUATOR(m3.block(0,0,2,2), Matrix3f::Identity().block(1,1,2,2));  // block
+
+    // test linear traversal
+    VERIFY_IS_APPROX_EVALUATOR(m3, Matrix3f::Zero());  // matrix, nullary
+    VERIFY_IS_APPROX_EVALUATOR(a3, Array33f::Zero());  // array
+    VERIFY_IS_APPROX_EVALUATOR(m3.transpose(), Matrix3f::Zero().transpose());  // transpose
+    VERIFY_IS_APPROX_EVALUATOR(m3, 2 * Matrix3f::Zero());  // unary
+    VERIFY_IS_APPROX_EVALUATOR(m3, Matrix3f::Zero() + m3);  // binary  
+
+    // test inner vectorization
+    Matrix4f m4, m4src = Matrix4f::Random();
+    Array44f a4, a4src = Matrix4f::Random();
+    VERIFY_IS_APPROX_EVALUATOR(m4, m4src);  // matrix
+    VERIFY_IS_APPROX_EVALUATOR(a4, a4src);  // array
+    VERIFY_IS_APPROX_EVALUATOR(m4.transpose(), m4src.transpose());  // transpose
+    // TODO: find out why Matrix4f::Zero() does not allow inner vectorization
+    VERIFY_IS_APPROX_EVALUATOR(m4, 2 * m4src);  // unary
+    VERIFY_IS_APPROX_EVALUATOR(m4, m4src + m4src);  // binary
+
+    // test linear vectorization
+    MatrixXf mX(6,6), mXsrc = MatrixXf::Random(6,6);
+    ArrayXXf aX(6,6), aXsrc = ArrayXXf::Random(6,6);
+    VERIFY_IS_APPROX_EVALUATOR(mX, mXsrc);  // matrix
+    VERIFY_IS_APPROX_EVALUATOR(aX, aXsrc);  // array
+    VERIFY_IS_APPROX_EVALUATOR(mX.transpose(), mXsrc.transpose());  // transpose
+    VERIFY_IS_APPROX_EVALUATOR(mX, MatrixXf::Zero(6,6));  // nullary
+    VERIFY_IS_APPROX_EVALUATOR(mX, 2 * mXsrc);  // unary
+    VERIFY_IS_APPROX_EVALUATOR(mX, mXsrc + mXsrc);  // binary
+
+    // test blocks and slice vectorization
+    VERIFY_IS_APPROX_EVALUATOR(m4, (mXsrc.block<4,4>(1,0)));
+    VERIFY_IS_APPROX_EVALUATOR(aX, ArrayXXf::Constant(10, 10, 3.0).block(2, 3, 6, 6));
+
+    Matrix4f m4ref = m4;
+    copy_using_evaluator(m4.block(1, 1, 2, 3), m3.bottomRows(2));
+    m4ref.block(1, 1, 2, 3) = m3.bottomRows(2);
+    VERIFY_IS_APPROX(m4, m4ref);
+
+    mX.setIdentity(20,20);
+    MatrixXf mXref = MatrixXf::Identity(20,20);
+    mXsrc = MatrixXf::Random(9,12);
+    copy_using_evaluator(mX.block(4, 4, 9, 12), mXsrc);
+    mXref.block(4, 4, 9, 12) = mXsrc;
+    VERIFY_IS_APPROX(mX, mXref);
+
+    // test Map
+    const float raw[3] = {1,2,3};
+    float buffer[3] = {0,0,0};
+    Vector3f v3;
+    Array3f a3f;
+    VERIFY_IS_APPROX_EVALUATOR(v3, Map<const Vector3f>(raw));
+    VERIFY_IS_APPROX_EVALUATOR(a3f, Map<const Array3f>(raw));
+    Vector3f::Map(buffer) = 2*v3;
+    VERIFY(buffer[0] == 2);
+    VERIFY(buffer[1] == 4);
+    VERIFY(buffer[2] == 6);
+
+    // test CwiseUnaryView
+    mat1.setRandom();
+    mat2.setIdentity();
+    MatrixXcd matXcd(6,6), matXcd_ref(6,6);
+    copy_using_evaluator(matXcd.real(), mat1);
+    copy_using_evaluator(matXcd.imag(), mat2);
+    matXcd_ref.real() = mat1;
+    matXcd_ref.imag() = mat2;
+    VERIFY_IS_APPROX(matXcd, matXcd_ref);
+
+    // test Select
+    VERIFY_IS_APPROX_EVALUATOR(aX, (aXsrc > 0).select(aXsrc, -aXsrc));
+
+    // test Replicate
+    mXsrc = MatrixXf::Random(6, 6);
+    VectorXf vX = VectorXf::Random(6);
+    mX.resize(6, 6);
+    VERIFY_IS_APPROX_EVALUATOR(mX, mXsrc.colwise() + vX);
+    matXcd.resize(12, 12);
+    VERIFY_IS_APPROX_EVALUATOR(matXcd, matXcd_ref.replicate(2,2));
+    VERIFY_IS_APPROX_EVALUATOR(matXcd, (matXcd_ref.replicate<2,2>()));
+
+    // test partial reductions
+    VectorXd vec1(6);
+    VERIFY_IS_APPROX_EVALUATOR(vec1, mat1.rowwise().sum());
+    VERIFY_IS_APPROX_EVALUATOR(vec1, mat1.colwise().sum().transpose());
+
+    // test MatrixWrapper and ArrayWrapper
+    mat1.setRandom(6,6);
+    arr1.setRandom(6,6);
+    VERIFY_IS_APPROX_EVALUATOR(mat2, arr1.matrix());
+    VERIFY_IS_APPROX_EVALUATOR(arr2, mat1.array());
+    VERIFY_IS_APPROX_EVALUATOR(mat2, (arr1 + 2).matrix());
+    VERIFY_IS_APPROX_EVALUATOR(arr2, mat1.array() + 2);
+    mat2.array() = arr1 * arr1;
+    VERIFY_IS_APPROX(mat2, (arr1 * arr1).matrix());
+    arr2.matrix() = MatrixXd::Identity(6,6);
+    VERIFY_IS_APPROX(arr2, MatrixXd::Identity(6,6).array());
+
+    // test Reverse
+    VERIFY_IS_APPROX_EVALUATOR(arr2, arr1.reverse());
+    VERIFY_IS_APPROX_EVALUATOR(arr2, arr1.colwise().reverse());
+    VERIFY_IS_APPROX_EVALUATOR(arr2, arr1.rowwise().reverse());
+    arr2.reverse() = arr1;
+    VERIFY_IS_APPROX(arr2, arr1.reverse());
+    mat2.array() = mat1.array().reverse();
+    VERIFY_IS_APPROX(mat2.array(), mat1.array().reverse());
+
+    // test Diagonal
+    VERIFY_IS_APPROX_EVALUATOR(vec1, mat1.diagonal());
+    vec1.resize(5);
+    VERIFY_IS_APPROX_EVALUATOR(vec1, mat1.diagonal(1));
+    VERIFY_IS_APPROX_EVALUATOR(vec1, mat1.diagonal<-1>());
+    vec1.setRandom();
+
+    mat2 = mat1;
+    copy_using_evaluator(mat1.diagonal(1), vec1);
+    mat2.diagonal(1) = vec1;
+    VERIFY_IS_APPROX(mat1, mat2);
+
+    copy_using_evaluator(mat1.diagonal<-1>(), mat1.diagonal(1));
+    mat2.diagonal<-1>() = mat2.diagonal(1);
+    VERIFY_IS_APPROX(mat1, mat2);
+  }
+  
+  {
+    // test swapping
+    MatrixXd mat1, mat2, mat1ref, mat2ref;
+    mat1ref = mat1 = MatrixXd::Random(6, 6);
+    mat2ref = mat2 = 2 * mat1 + MatrixXd::Identity(6, 6);
+    swap_using_evaluator(mat1, mat2);
+    mat1ref.swap(mat2ref);
+    VERIFY_IS_APPROX(mat1, mat1ref);
+    VERIFY_IS_APPROX(mat2, mat2ref);
+
+    swap_using_evaluator(mat1.block(0, 0, 3, 3), mat2.block(3, 3, 3, 3));
+    mat1ref.block(0, 0, 3, 3).swap(mat2ref.block(3, 3, 3, 3));
+    VERIFY_IS_APPROX(mat1, mat1ref);
+    VERIFY_IS_APPROX(mat2, mat2ref);
+
+    swap_using_evaluator(mat1.row(2), mat2.col(3).transpose());
+    mat1.row(2).swap(mat2.col(3).transpose());
+    VERIFY_IS_APPROX(mat1, mat1ref);
+    VERIFY_IS_APPROX(mat2, mat2ref);
+  }
+
+  {
+    // test compound assignment
+    const Matrix4d mat_const = Matrix4d::Random(); 
+    Matrix4d mat, mat_ref;
+    mat = mat_ref = Matrix4d::Identity();
+    add_assign_using_evaluator(mat, mat_const);
+    mat_ref += mat_const;
+    VERIFY_IS_APPROX(mat, mat_ref);
+
+    subtract_assign_using_evaluator(mat.row(1), 2*mat.row(2));
+    mat_ref.row(1) -= 2*mat_ref.row(2);
+    VERIFY_IS_APPROX(mat, mat_ref);
+
+    const ArrayXXf arr_const = ArrayXXf::Random(5,3); 
+    ArrayXXf arr, arr_ref;
+    arr = arr_ref = ArrayXXf::Constant(5, 3, 0.5);
+    multiply_assign_using_evaluator(arr, arr_const);
+    arr_ref *= arr_const;
+    VERIFY_IS_APPROX(arr, arr_ref);
+
+    divide_assign_using_evaluator(arr.row(1), arr.row(2) + 1);
+    arr_ref.row(1) /= (arr_ref.row(2) + 1);
+    VERIFY_IS_APPROX(arr, arr_ref);
+  }
+  
+  {
+    // test triangular shapes
+    MatrixXd A = MatrixXd::Random(6,6), B(6,6), C(6,6), D(6,6);
+    A.setRandom();B.setRandom();
+    VERIFY_IS_APPROX_EVALUATOR2(B, A.triangularView<Upper>(), MatrixXd(A.triangularView<Upper>()));
+    
+    A.setRandom();B.setRandom();
+    VERIFY_IS_APPROX_EVALUATOR2(B, A.triangularView<UnitLower>(), MatrixXd(A.triangularView<UnitLower>()));
+    
+    A.setRandom();B.setRandom();
+    VERIFY_IS_APPROX_EVALUATOR2(B, A.triangularView<UnitUpper>(), MatrixXd(A.triangularView<UnitUpper>()));
+    
+    A.setRandom();B.setRandom();
+    C = B; C.triangularView<Upper>() = A;
+    copy_using_evaluator(B.triangularView<Upper>(), A);
+    VERIFY(B.isApprox(C) && "copy_using_evaluator(B.triangularView<Upper>(), A)");
+    
+    A.setRandom();B.setRandom();
+    C = B; C.triangularView<Lower>() = A.triangularView<Lower>();
+    copy_using_evaluator(B.triangularView<Lower>(), A.triangularView<Lower>());
+    VERIFY(B.isApprox(C) && "copy_using_evaluator(B.triangularView<Lower>(), A.triangularView<Lower>())");
+    
+    
+    A.setRandom();B.setRandom();
+    C = B; C.triangularView<Lower>() = A.triangularView<Upper>().transpose();
+    copy_using_evaluator(B.triangularView<Lower>(), A.triangularView<Upper>().transpose());
+    VERIFY(B.isApprox(C) && "copy_using_evaluator(B.triangularView<Lower>(), A.triangularView<Lower>().transpose())");
+    
+    
+    A.setRandom();B.setRandom(); C = B; D = A;
+    C.triangularView<Upper>().swap(D.triangularView<Upper>());
+    swap_using_evaluator(B.triangularView<Upper>(), A.triangularView<Upper>());
+    VERIFY(B.isApprox(C) && "swap_using_evaluator(B.triangularView<Upper>(), A.triangularView<Upper>())");
+    
+    
+    VERIFY_IS_APPROX_EVALUATOR2(B, prod(A.triangularView<Upper>(),A), MatrixXd(A.triangularView<Upper>()*A));
+    
+    VERIFY_IS_APPROX_EVALUATOR2(B, prod(A.selfadjointView<Upper>(),A), MatrixXd(A.selfadjointView<Upper>()*A));
+  }
+
+  {
+    // test diagonal shapes
+    VectorXd d = VectorXd::Random(6);
+    MatrixXd A = MatrixXd::Random(6,6), B(6,6);
+    A.setRandom();B.setRandom();
+    
+    VERIFY_IS_APPROX_EVALUATOR2(B, lazyprod(d.asDiagonal(),A), MatrixXd(d.asDiagonal()*A));
+    VERIFY_IS_APPROX_EVALUATOR2(B, lazyprod(A,d.asDiagonal()), MatrixXd(A*d.asDiagonal()));
+  }
+
+  {
+    // test CoeffReadCost
+    Matrix4d a, b;
+    VERIFY_IS_EQUAL( get_cost(a), 1 );
+    VERIFY_IS_EQUAL( get_cost(a+b), 3);
+    VERIFY_IS_EQUAL( get_cost(2*a+b), 4);
+    VERIFY_IS_EQUAL( get_cost(a*b), 1);
+    VERIFY_IS_EQUAL( get_cost(a.lazyProduct(b)), 15);
+    VERIFY_IS_EQUAL( get_cost(a*(a*b)), 1);
+    VERIFY_IS_EQUAL( get_cost(a.lazyProduct(a*b)), 15);
+    VERIFY_IS_EQUAL( get_cost(a*(a+b)), 1);
+    VERIFY_IS_EQUAL( get_cost(a.lazyProduct(a+b)), 15);
+  }
+}
diff --git a/test/fastmath.cpp b/test/fastmath.cpp
new file mode 100644
index 000000000..cc5db0746
--- /dev/null
+++ b/test/fastmath.cpp
@@ -0,0 +1,99 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+void check(bool b, bool ref)
+{
+  std::cout << b;
+  if(b==ref)
+    std::cout << " OK  ";
+  else
+    std::cout << " BAD ";
+}
+
+#if EIGEN_COMP_MSVC && EIGEN_COMP_MSVC < 1800
+namespace std {
+  template<typename T> bool (isfinite)(T x) { return _finite(x); }
+  template<typename T> bool (isnan)(T x) { return _isnan(x); }
+  template<typename T> bool (isinf)(T x) { return _fpclass(x)==_FPCLASS_NINF || _fpclass(x)==_FPCLASS_PINF; }
+}
+#endif
+
+template<typename T>
+void check_inf_nan(bool dryrun) {
+  Matrix<T,Dynamic,1> m(10);
+  m.setRandom();
+  m(3) = std::numeric_limits<T>::quiet_NaN();
+
+  if(dryrun)
+  {
+    std::cout << "std::isfinite(" << m(3) << ") = "; check((std::isfinite)(m(3)),false); std::cout << "  ; numext::isfinite = "; check((numext::isfinite)(m(3)), false); std::cout << "\n";
+    std::cout << "std::isinf(" << m(3) << ")    = "; check((std::isinf)(m(3)),false);    std::cout << "  ; numext::isinf    = "; check((numext::isinf)(m(3)), false); std::cout << "\n";
+    std::cout << "std::isnan(" << m(3) << ")    = "; check((std::isnan)(m(3)),true);     std::cout << "  ; numext::isnan    = "; check((numext::isnan)(m(3)), true); std::cout << "\n";
+    std::cout << "allFinite: "; check(m.allFinite(), 0); std::cout << "\n";
+    std::cout << "hasNaN:    "; check(m.hasNaN(), 1);    std::cout << "\n";
+    std::cout << "\n";
+  }
+  else
+  {
+    VERIFY( !(numext::isfinite)(m(3)) );
+    VERIFY( !(numext::isinf)(m(3)) );
+    VERIFY(  (numext::isnan)(m(3)) );
+    VERIFY( !m.allFinite() );
+    VERIFY(  m.hasNaN() );
+  }
+  T hidden_zero = (std::numeric_limits<T>::min)()*(std::numeric_limits<T>::min)();
+  m(4) /= hidden_zero;
+  if(dryrun)
+  {
+    std::cout << "std::isfinite(" << m(4) << ") = "; check((std::isfinite)(m(4)),false); std::cout << "  ; numext::isfinite = "; check((numext::isfinite)(m(4)), false); std::cout << "\n";
+    std::cout << "std::isinf(" << m(4) << ")    = "; check((std::isinf)(m(4)),true);     std::cout << "  ; numext::isinf    = "; check((numext::isinf)(m(4)), true); std::cout << "\n";
+    std::cout << "std::isnan(" << m(4) << ")    = "; check((std::isnan)(m(4)),false);    std::cout << "  ; numext::isnan    = "; check((numext::isnan)(m(4)), false); std::cout << "\n";
+    std::cout << "allFinite: "; check(m.allFinite(), 0); std::cout << "\n";
+    std::cout << "hasNaN:    "; check(m.hasNaN(), 1);    std::cout << "\n";
+    std::cout << "\n";
+  }
+  else
+  {
+    VERIFY( !(numext::isfinite)(m(4)) );
+    VERIFY(  (numext::isinf)(m(4)) );
+    VERIFY( !(numext::isnan)(m(4)) );
+    VERIFY( !m.allFinite() );
+    VERIFY(  m.hasNaN() );
+  }
+  m(3) = 0;
+  if(dryrun)
+  {
+    std::cout << "std::isfinite(" << m(3) << ") = "; check((std::isfinite)(m(3)),true); std::cout << "  ; numext::isfinite = "; check((numext::isfinite)(m(3)), true); std::cout << "\n";
+    std::cout << "std::isinf(" << m(3) << ")    = "; check((std::isinf)(m(3)),false);    std::cout << "  ; numext::isinf    = "; check((numext::isinf)(m(3)), false); std::cout << "\n";
+    std::cout << "std::isnan(" << m(3) << ")    = "; check((std::isnan)(m(3)),false);     std::cout << "  ; numext::isnan    = "; check((numext::isnan)(m(3)), false); std::cout << "\n";
+    std::cout << "allFinite: "; check(m.allFinite(), 0); std::cout << "\n";
+    std::cout << "hasNaN:    "; check(m.hasNaN(), 0);    std::cout << "\n";
+    std::cout << "\n\n";
+  }
+  else
+  {
+    VERIFY(  (numext::isfinite)(m(3)) );
+    VERIFY( !(numext::isinf)(m(3)) );
+    VERIFY( !(numext::isnan)(m(3)) );
+    VERIFY( !m.allFinite() );
+    VERIFY( !m.hasNaN() );
+  }
+}
+
+void test_fastmath() {
+  std::cout << "*** float *** \n\n"; check_inf_nan<float>(true);
+  std::cout << "*** double ***\n\n"; check_inf_nan<double>(true);
+  std::cout << "*** long double *** \n\n"; check_inf_nan<long double>(true);
+
+  check_inf_nan<float>(false);
+  check_inf_nan<double>(false);
+  check_inf_nan<long double>(false);
+}
diff --git a/test/first_aligned.cpp b/test/first_aligned.cpp
index 467f94510..ae2d4bc42 100644
--- a/test/first_aligned.cpp
+++ b/test/first_aligned.cpp
@@ -13,7 +13,7 @@ template<typename Scalar>
 void test_first_aligned_helper(Scalar *array, int size)
 {
   const int packet_size = sizeof(Scalar) * internal::packet_traits<Scalar>::size;
-  VERIFY(((size_t(array) + sizeof(Scalar) * internal::first_aligned(array, size)) % packet_size) == 0);
+  VERIFY(((size_t(array) + sizeof(Scalar) * internal::first_default_aligned(array, size)) % packet_size) == 0);
 }
 
 template<typename Scalar>
@@ -21,7 +21,7 @@ void test_none_aligned_helper(Scalar *array, int size)
 {
   EIGEN_UNUSED_VARIABLE(array);
   EIGEN_UNUSED_VARIABLE(size);
-  VERIFY(internal::packet_traits<Scalar>::size == 1 || internal::first_aligned(array, size) == size);
+  VERIFY(internal::packet_traits<Scalar>::size == 1 || internal::first_default_aligned(array, size) == size);
 }
 
 struct some_non_vectorizable_type { float x; };
@@ -41,7 +41,7 @@ void test_first_aligned()
   test_first_aligned_helper(array_double+1, 50);
   test_first_aligned_helper(array_double+2, 50);
   
-  double *array_double_plus_4_bytes = (double*)(size_t(array_double)+4);
+  double *array_double_plus_4_bytes = (double*)(internal::UIntPtr(array_double)+4);
   test_none_aligned_helper(array_double_plus_4_bytes, 50);
   test_none_aligned_helper(array_double_plus_4_bytes+1, 50);
   
diff --git a/test/geo_alignedbox.cpp b/test/geo_alignedbox.cpp
index 8e36adbe3..d2339a651 100644
--- a/test/geo_alignedbox.cpp
+++ b/test/geo_alignedbox.cpp
@@ -16,7 +16,7 @@
 using namespace std;
 
 template<typename T> EIGEN_DONT_INLINE
-void kill_extra_precision(T& x) { eigen_assert(&x != 0); }
+void kill_extra_precision(T& x) { eigen_assert((void*)(&x) != (void*)0); }
 
 
 template<typename BoxType> void alignedbox(const BoxType& _box)
@@ -48,12 +48,21 @@ template<typename BoxType> void alignedbox(const BoxType& _box)
   b0.extend(p0);
   b0.extend(p1);
   VERIFY(b0.contains(p0*s1+(Scalar(1)-s1)*p1));
+  VERIFY(b0.contains(b0.center()));
+  VERIFY_IS_APPROX(b0.center(),(p0+p1)/Scalar(2));
 
   (b2 = b0).extend(b1);
   VERIFY(b2.contains(b0));
   VERIFY(b2.contains(b1));
   VERIFY_IS_APPROX(b2.clamp(b0), b0);
 
+  // intersection
+  BoxType box1(VectorType::Random(dim));
+  box1.extend(VectorType::Random(dim));
+  BoxType box2(VectorType::Random(dim));
+  box2.extend(VectorType::Random(dim));
+
+  VERIFY(box1.intersects(box2) == !box1.intersection(box2).isEmpty()); 
 
   // alignment -- make sure there is no memory alignment assertion
   BoxType *bp0 = new BoxType(dim);
@@ -172,6 +181,8 @@ void test_geo_alignedbox()
     CALL_SUBTEST_9( alignedbox(AlignedBox1i()) );
     CALL_SUBTEST_10( alignedbox(AlignedBox2i()) );
     CALL_SUBTEST_11( alignedbox(AlignedBox3i()) );
+
+    CALL_SUBTEST_14( alignedbox(AlignedBox<double,Dynamic>(4)) );
   }
   CALL_SUBTEST_12( specificTest1() );
   CALL_SUBTEST_13( specificTest2() );
diff --git a/test/geo_eulerangles.cpp b/test/geo_eulerangles.cpp
index b4830bd41..932ebe773 100644
--- a/test/geo_eulerangles.cpp
+++ b/test/geo_eulerangles.cpp
@@ -26,16 +26,16 @@ void verify_euler(const Matrix<Scalar,3,1>& ea, int i, int j, int k)
   VERIFY_IS_APPROX(m,  mbis); 
   /* If I==K, and ea[1]==0, then there no unique solution. */ 
   /* The remark apply in the case where I!=K, and |ea[1]| is close to pi/2. */ 
-  if( (i!=k || ea[1]!=0) && (i==k || !internal::isApprox(abs(ea[1]),Scalar(M_PI/2),test_precision<Scalar>())) ) 
+  if( (i!=k || ea[1]!=0) && (i==k || !internal::isApprox(abs(ea[1]),Scalar(EIGEN_PI/2),test_precision<Scalar>())) ) 
     VERIFY((ea-eabis).norm() <= test_precision<Scalar>());
   
   // approx_or_less_than does not work for 0
   VERIFY(0 < eabis[0] || test_isMuchSmallerThan(eabis[0], Scalar(1)));
-  VERIFY_IS_APPROX_OR_LESS_THAN(eabis[0], Scalar(M_PI));
-  VERIFY_IS_APPROX_OR_LESS_THAN(-Scalar(M_PI), eabis[1]);
-  VERIFY_IS_APPROX_OR_LESS_THAN(eabis[1], Scalar(M_PI));
-  VERIFY_IS_APPROX_OR_LESS_THAN(-Scalar(M_PI), eabis[2]);
-  VERIFY_IS_APPROX_OR_LESS_THAN(eabis[2], Scalar(M_PI));
+  VERIFY_IS_APPROX_OR_LESS_THAN(eabis[0], Scalar(EIGEN_PI));
+  VERIFY_IS_APPROX_OR_LESS_THAN(-Scalar(EIGEN_PI), eabis[1]);
+  VERIFY_IS_APPROX_OR_LESS_THAN(eabis[1], Scalar(EIGEN_PI));
+  VERIFY_IS_APPROX_OR_LESS_THAN(-Scalar(EIGEN_PI), eabis[2]);
+  VERIFY_IS_APPROX_OR_LESS_THAN(eabis[2], Scalar(EIGEN_PI));
 }
 
 template<typename Scalar> void check_all_var(const Matrix<Scalar,3,1>& ea)
@@ -64,7 +64,7 @@ template<typename Scalar> void eulerangles()
   typedef Quaternion<Scalar> Quaternionx;
   typedef AngleAxis<Scalar> AngleAxisx;
 
-  Scalar a = internal::random<Scalar>(-Scalar(M_PI), Scalar(M_PI));
+  Scalar a = internal::random<Scalar>(-Scalar(EIGEN_PI), Scalar(EIGEN_PI));
   Quaternionx q1;
   q1 = AngleAxisx(a, Vector3::Random().normalized());
   Matrix3 m;
@@ -84,13 +84,13 @@ template<typename Scalar> void eulerangles()
   check_all_var(ea);
   
   // Check with random angles in range [0:pi]x[-pi:pi]x[-pi:pi].
-  ea = (Array3::Random() + Array3(1,0,0))*Scalar(M_PI)*Array3(0.5,1,1);
+  ea = (Array3::Random() + Array3(1,0,0))*Scalar(EIGEN_PI)*Array3(0.5,1,1);
   check_all_var(ea);
   
-  ea[2] = ea[0] = internal::random<Scalar>(0,Scalar(M_PI));
+  ea[2] = ea[0] = internal::random<Scalar>(0,Scalar(EIGEN_PI));
   check_all_var(ea);
   
-  ea[0] = ea[1] = internal::random<Scalar>(0,Scalar(M_PI));
+  ea[0] = ea[1] = internal::random<Scalar>(0,Scalar(EIGEN_PI));
   check_all_var(ea);
   
   ea[1] = 0;
diff --git a/test/geo_homogeneous.cpp b/test/geo_homogeneous.cpp
index c91bde819..2187c7bf9 100644
--- a/test/geo_homogeneous.cpp
+++ b/test/geo_homogeneous.cpp
@@ -38,6 +38,10 @@ template<typename Scalar,int Size> void homogeneous(void)
   hv0 << v0, 1;
   VERIFY_IS_APPROX(v0.homogeneous(), hv0);
   VERIFY_IS_APPROX(v0, hv0.hnormalized());
+  
+  VERIFY_IS_APPROX(v0.homogeneous().sum(), hv0.sum());
+  VERIFY_IS_APPROX(v0.homogeneous().minCoeff(), hv0.minCoeff());
+  VERIFY_IS_APPROX(v0.homogeneous().maxCoeff(), hv0.maxCoeff());
 
   hm0 << m0, ones.transpose();
   VERIFY_IS_APPROX(m0.colwise().homogeneous(), hm0);
@@ -54,10 +58,11 @@ template<typename Scalar,int Size> void homogeneous(void)
   T2MatrixType t2 = T2MatrixType::Random();
   VERIFY_IS_APPROX(t2 * (v0.homogeneous().eval()), t2 * v0.homogeneous());
   VERIFY_IS_APPROX(t2 * (m0.colwise().homogeneous().eval()), t2 * m0.colwise().homogeneous());
+  VERIFY_IS_APPROX(t2 * (v0.homogeneous().asDiagonal()), t2 * hv0.asDiagonal());
+  VERIFY_IS_APPROX((v0.homogeneous().asDiagonal()) * t2, hv0.asDiagonal() * t2);
 
   VERIFY_IS_APPROX((v0.transpose().rowwise().homogeneous().eval()) * t2,
                     v0.transpose().rowwise().homogeneous() * t2);
-                    m0.transpose().rowwise().homogeneous().eval();
   VERIFY_IS_APPROX((m0.transpose().rowwise().homogeneous().eval()) * t2,
                     m0.transpose().rowwise().homogeneous() * t2);
 
@@ -82,7 +87,7 @@ template<typename Scalar,int Size> void homogeneous(void)
   VERIFY_IS_APPROX(aff  * pts.colwise().homogeneous(), (aff  * pts1).colwise().hnormalized());
   VERIFY_IS_APPROX(caff * pts.colwise().homogeneous(), (caff * pts1).colwise().hnormalized());
   VERIFY_IS_APPROX(proj * pts.colwise().homogeneous(), (proj * pts1));
-  
+
   VERIFY_IS_APPROX((aff  * pts1).colwise().hnormalized(),  aff  * pts);
   VERIFY_IS_APPROX((caff * pts1).colwise().hnormalized(), caff * pts);
   
@@ -91,6 +96,23 @@ template<typename Scalar,int Size> void homogeneous(void)
   VERIFY_IS_APPROX((aff  * pts2).colwise().hnormalized(), aff  * pts2.colwise().hnormalized());
   VERIFY_IS_APPROX((caff * pts2).colwise().hnormalized(), caff * pts2.colwise().hnormalized());
   VERIFY_IS_APPROX((proj * pts2).colwise().hnormalized(), (proj * pts2.colwise().hnormalized().colwise().homogeneous()).colwise().hnormalized());
+  
+  // Test combination of homogeneous
+  
+  VERIFY_IS_APPROX( (t2 * v0.homogeneous()).hnormalized(),
+                       (t2.template topLeftCorner<Size,Size>() * v0 + t2.template topRightCorner<Size,1>())
+                     / ((t2.template bottomLeftCorner<1,Size>()*v0).value() + t2(Size,Size)) );
+  
+  VERIFY_IS_APPROX( (t2 * pts.colwise().homogeneous()).colwise().hnormalized(),
+                    (Matrix<Scalar, Size+1, Dynamic>(t2 * pts1).colwise().hnormalized()) );
+  
+  VERIFY_IS_APPROX( (t2 .lazyProduct( v0.homogeneous() )).hnormalized(), (t2 * v0.homogeneous()).hnormalized() );
+  VERIFY_IS_APPROX( (t2 .lazyProduct  ( pts.colwise().homogeneous() )).colwise().hnormalized(), (t2 * pts1).colwise().hnormalized() );
+  
+  VERIFY_IS_APPROX( (v0.transpose().homogeneous() .lazyProduct( t2 )).hnormalized(), (v0.transpose().homogeneous()*t2).hnormalized() );
+  VERIFY_IS_APPROX( (pts.transpose().rowwise().homogeneous() .lazyProduct( t2 )).rowwise().hnormalized(), (pts1.transpose()*t2).rowwise().hnormalized() );
+
+  VERIFY_IS_APPROX( (t2.template triangularView<Lower>() * v0.homogeneous()).eval(), (t2.template triangularView<Lower>()*hv0) );
 }
 
 void test_geo_homogeneous()
diff --git a/test/geo_hyperplane.cpp b/test/geo_hyperplane.cpp
index 327537801..27892850d 100644
--- a/test/geo_hyperplane.cpp
+++ b/test/geo_hyperplane.cpp
@@ -18,10 +18,12 @@ template<typename HyperplaneType> void hyperplane(const HyperplaneType& _plane)
   /* this test covers the following files:
      Hyperplane.h
   */
+  using std::abs;
   typedef typename HyperplaneType::Index Index;
   const Index dim = _plane.dim();
   enum { Options = HyperplaneType::Options };
   typedef typename HyperplaneType::Scalar Scalar;
+  typedef typename HyperplaneType::RealScalar RealScalar;
   typedef Matrix<Scalar, HyperplaneType::AmbientDimAtCompileTime, 1> VectorType;
   typedef Matrix<Scalar, HyperplaneType::AmbientDimAtCompileTime,
                          HyperplaneType::AmbientDimAtCompileTime> MatrixType;
@@ -42,7 +44,10 @@ template<typename HyperplaneType> void hyperplane(const HyperplaneType& _plane)
   VERIFY_IS_APPROX( n1.dot(n1), Scalar(1) );
 
   VERIFY_IS_MUCH_SMALLER_THAN( pl0.absDistance(p0), Scalar(1) );
-  VERIFY_IS_APPROX( pl1.signedDistance(p1 + n1 * s0), s0 );
+  if(numext::abs2(s0)>RealScalar(1e-6))
+    VERIFY_IS_APPROX( pl1.signedDistance(p1 + n1 * s0), s0);
+  else
+    VERIFY_IS_MUCH_SMALLER_THAN( abs(pl1.signedDistance(p1 + n1 * s0) - s0), Scalar(1) );
   VERIFY_IS_MUCH_SMALLER_THAN( pl1.signedDistance(pl1.projection(p0)), Scalar(1) );
   VERIFY_IS_MUCH_SMALLER_THAN( pl1.absDistance(p1 +  pl1.normal().unitOrthogonal() * s1), Scalar(1) );
 
@@ -52,6 +57,8 @@ template<typename HyperplaneType> void hyperplane(const HyperplaneType& _plane)
     MatrixType rot = MatrixType::Random(dim,dim).householderQr().householderQ();
     DiagonalMatrix<Scalar,HyperplaneType::AmbientDimAtCompileTime> scaling(VectorType::Random());
     Translation<Scalar,HyperplaneType::AmbientDimAtCompileTime> translation(VectorType::Random());
+    
+    while(scaling.diagonal().cwiseAbs().minCoeff()<RealScalar(1e-4)) scaling.diagonal() = VectorType::Random();
 
     pl2 = pl1;
     VERIFY_IS_MUCH_SMALLER_THAN( pl2.transform(rot).absDistance(rot * p1), Scalar(1) );
@@ -59,12 +66,15 @@ template<typename HyperplaneType> void hyperplane(const HyperplaneType& _plane)
     VERIFY_IS_MUCH_SMALLER_THAN( pl2.transform(rot,Isometry).absDistance(rot * p1), Scalar(1) );
     pl2 = pl1;
     VERIFY_IS_MUCH_SMALLER_THAN( pl2.transform(rot*scaling).absDistance((rot*scaling) * p1), Scalar(1) );
+    VERIFY_IS_APPROX( pl2.normal().norm(), RealScalar(1) );
     pl2 = pl1;
     VERIFY_IS_MUCH_SMALLER_THAN( pl2.transform(rot*scaling*translation)
-                                 .absDistance((rot*scaling*translation) * p1), Scalar(1) );
+                                  .absDistance((rot*scaling*translation) * p1), Scalar(1) );
+    VERIFY_IS_APPROX( pl2.normal().norm(), RealScalar(1) );
     pl2 = pl1;
     VERIFY_IS_MUCH_SMALLER_THAN( pl2.transform(rot*translation,Isometry)
                                  .absDistance((rot*translation) * p1), Scalar(1) );
+    VERIFY_IS_APPROX( pl2.normal().norm(), RealScalar(1) );
   }
 
   // casting
@@ -90,9 +100,9 @@ template<typename Scalar> void lines()
     Vector u = Vector::Random();
     Vector v = Vector::Random();
     Scalar a = internal::random<Scalar>();
-    while (abs(a-1) < 1e-4) a = internal::random<Scalar>();
-    while (u.norm() < 1e-4) u = Vector::Random();
-    while (v.norm() < 1e-4) v = Vector::Random();
+    while (abs(a-1) < Scalar(1e-4)) a = internal::random<Scalar>();
+    while (u.norm() < Scalar(1e-4)) u = Vector::Random();
+    while (v.norm() < Scalar(1e-4)) v = Vector::Random();
 
     HLine line_u = HLine::Through(center + u, center + a*u);
     HLine line_v = HLine::Through(center + v, center + a*v);
@@ -104,12 +114,15 @@ template<typename Scalar> void lines()
     Vector result = line_u.intersection(line_v);
 
     // the lines should intersect at the point we called "center"
-    VERIFY_IS_APPROX(result, center);
+    if(abs(a-1) > Scalar(1e-2) && abs(v.normalized().dot(u.normalized()))<Scalar(0.9))
+      VERIFY_IS_APPROX(result, center);
 
     // check conversions between two types of lines
     PLine pl(line_u); // gcc 3.3 will commit suicide if we don't name this variable
-    CoeffsType converted_coeffs = HLine(pl).coeffs();
-    converted_coeffs *= (line_u.coeffs()[0])/(converted_coeffs[0]);
+    HLine line_u2(pl);
+    CoeffsType converted_coeffs = line_u2.coeffs();
+    if(line_u2.normal().dot(line_u.normal())<Scalar(0))
+      converted_coeffs = -line_u2.coeffs();
     VERIFY(line_u.coeffs().isApprox(converted_coeffs));
   }
 }
@@ -145,9 +158,9 @@ template<typename Scalar> void hyperplane_alignment()
   typedef Hyperplane<Scalar,3,AutoAlign> Plane3a;
   typedef Hyperplane<Scalar,3,DontAlign> Plane3u;
 
-  EIGEN_ALIGN16 Scalar array1[4];
-  EIGEN_ALIGN16 Scalar array2[4];
-  EIGEN_ALIGN16 Scalar array3[4+1];
+  EIGEN_ALIGN_MAX Scalar array1[4];
+  EIGEN_ALIGN_MAX Scalar array2[4];
+  EIGEN_ALIGN_MAX Scalar array3[4+1];
   Scalar* array3u = array3+1;
 
   Plane3a *p1 = ::new(reinterpret_cast<void*>(array1)) Plane3a;
@@ -161,8 +174,8 @@ template<typename Scalar> void hyperplane_alignment()
   VERIFY_IS_APPROX(p1->coeffs(), p2->coeffs());
   VERIFY_IS_APPROX(p1->coeffs(), p3->coeffs());
   
-  #if defined(EIGEN_VECTORIZE) && EIGEN_ALIGN_STATICALLY
-  if(internal::packet_traits<Scalar>::Vectorizable)
+  #if defined(EIGEN_VECTORIZE) && EIGEN_MAX_STATIC_ALIGN_BYTES > 0
+  if(internal::packet_traits<Scalar>::Vectorizable && internal::packet_traits<Scalar>::size<=4)
     VERIFY_RAISES_ASSERT((::new(reinterpret_cast<void*>(array3u)) Plane3a));
   #endif
 }
diff --git a/test/geo_orthomethods.cpp b/test/geo_orthomethods.cpp
index c836dae40..e178df257 100644
--- a/test/geo_orthomethods.cpp
+++ b/test/geo_orthomethods.cpp
@@ -33,12 +33,16 @@ template<typename Scalar> void orthomethods_3()
   VERIFY_IS_MUCH_SMALLER_THAN(v1.dot(v1.cross(v2)), Scalar(1));
   VERIFY_IS_MUCH_SMALLER_THAN(v1.cross(v2).dot(v2), Scalar(1));
   VERIFY_IS_MUCH_SMALLER_THAN(v2.dot(v1.cross(v2)), Scalar(1));
+  VERIFY_IS_MUCH_SMALLER_THAN(v1.cross(Vector3::Random()).dot(v1), Scalar(1));
   Matrix3 mat3;
   mat3 << v0.normalized(),
          (v0.cross(v1)).normalized(),
          (v0.cross(v1).cross(v0)).normalized();
   VERIFY(mat3.isUnitary());
-
+  
+  mat3.setRandom();
+  VERIFY_IS_APPROX(v0.cross(mat3*v1), -(mat3*v1).cross(v0));
+  VERIFY_IS_APPROX(v0.cross(mat3.lazyProduct(v1)), -(mat3.lazyProduct(v1)).cross(v0));
 
   // colwise/rowwise cross product
   mat3.setRandom();
@@ -47,6 +51,13 @@ template<typename Scalar> void orthomethods_3()
   int i = internal::random<int>(0,2);
   mcross = mat3.colwise().cross(vec3);
   VERIFY_IS_APPROX(mcross.col(i), mat3.col(i).cross(vec3));
+  
+  VERIFY_IS_MUCH_SMALLER_THAN((mat3.adjoint() * mat3.colwise().cross(vec3)).diagonal().cwiseAbs().sum(), Scalar(1));
+  VERIFY_IS_MUCH_SMALLER_THAN((mat3.adjoint() * mat3.colwise().cross(Vector3::Random())).diagonal().cwiseAbs().sum(), Scalar(1));
+  
+  VERIFY_IS_MUCH_SMALLER_THAN((vec3.adjoint() * mat3.colwise().cross(vec3)).cwiseAbs().sum(), Scalar(1));
+  VERIFY_IS_MUCH_SMALLER_THAN((vec3.adjoint() * Matrix3::Random().colwise().cross(vec3)).cwiseAbs().sum(), Scalar(1));
+  
   mcross = mat3.rowwise().cross(vec3);
   VERIFY_IS_APPROX(mcross.row(i), mat3.row(i).cross(vec3));
 
@@ -57,6 +68,7 @@ template<typename Scalar> void orthomethods_3()
   v40.w() = v41.w() = v42.w() = 0;
   v42.template head<3>() = v40.template head<3>().cross(v41.template head<3>());
   VERIFY_IS_APPROX(v40.cross3(v41), v42);
+  VERIFY_IS_MUCH_SMALLER_THAN(v40.cross3(Vector4::Random()).dot(v40), Scalar(1));
   
   // check mixed product
   typedef Matrix<RealScalar, 3, 1> RealVector3;
diff --git a/test/geo_parametrizedline.cpp b/test/geo_parametrizedline.cpp
index f0462d40a..9bf5f3c1d 100644
--- a/test/geo_parametrizedline.cpp
+++ b/test/geo_parametrizedline.cpp
@@ -66,9 +66,9 @@ template<typename Scalar> void parametrizedline_alignment()
   typedef ParametrizedLine<Scalar,4,AutoAlign> Line4a;
   typedef ParametrizedLine<Scalar,4,DontAlign> Line4u;
 
-  EIGEN_ALIGN16 Scalar array1[8];
-  EIGEN_ALIGN16 Scalar array2[8];
-  EIGEN_ALIGN16 Scalar array3[8+1];
+  EIGEN_ALIGN_MAX Scalar array1[16];
+  EIGEN_ALIGN_MAX Scalar array2[16];
+  EIGEN_ALIGN_MAX Scalar array3[16+1];
   Scalar* array3u = array3+1;
 
   Line4a *p1 = ::new(reinterpret_cast<void*>(array1)) Line4a;
@@ -85,8 +85,8 @@ template<typename Scalar> void parametrizedline_alignment()
   VERIFY_IS_APPROX(p1->direction(), p2->direction());
   VERIFY_IS_APPROX(p1->direction(), p3->direction());
   
-  #if defined(EIGEN_VECTORIZE) && EIGEN_ALIGN_STATICALLY
-  if(internal::packet_traits<Scalar>::Vectorizable)
+  #if defined(EIGEN_VECTORIZE) && EIGEN_MAX_STATIC_ALIGN_BYTES>0
+  if(internal::packet_traits<Scalar>::Vectorizable && internal::packet_traits<Scalar>::size<=4)
     VERIFY_RAISES_ASSERT((::new(reinterpret_cast<void*>(array3u)) Line4a));
   #endif
 }
diff --git a/test/geo_quaternion.cpp b/test/geo_quaternion.cpp
index 1694b32c7..96889e722 100644
--- a/test/geo_quaternion.cpp
+++ b/test/geo_quaternion.cpp
@@ -30,8 +30,8 @@ template<typename QuatType> void check_slerp(const QuatType& q0, const QuatType&
   Scalar largeEps = test_precision<Scalar>();
 
   Scalar theta_tot = AA(q1*q0.inverse()).angle();
-  if(theta_tot>M_PI)
-    theta_tot = Scalar(2.*M_PI)-theta_tot;
+  if(theta_tot>Scalar(EIGEN_PI))
+    theta_tot = Scalar(2.)*Scalar(EIGEN_PI)-theta_tot;
   for(Scalar t=0; t<=Scalar(1.001); t+=Scalar(0.1))
   {
     QuatType q = q0.slerp(t,q1);
@@ -49,13 +49,13 @@ template<typename Scalar, int Options> void quaternion(void)
   */
   using std::abs;
   typedef Matrix<Scalar,3,1> Vector3;
-  typedef Matrix<Scalar,4,1> Vector4;
+  typedef Matrix<Scalar,3,3> Matrix3;
   typedef Quaternion<Scalar,Options> Quaternionx;
   typedef AngleAxis<Scalar> AngleAxisx;
 
   Scalar largeEps = test_precision<Scalar>();
   if (internal::is_same<Scalar,float>::value)
-    largeEps = 1e-3f;
+    largeEps = Scalar(1e-3);
 
   Scalar eps = internal::random<Scalar>() * Scalar(1e-2);
 
@@ -64,8 +64,8 @@ template<typename Scalar, int Options> void quaternion(void)
           v2 = Vector3::Random(),
           v3 = Vector3::Random();
 
-  Scalar  a = internal::random<Scalar>(-Scalar(M_PI), Scalar(M_PI)),
-          b = internal::random<Scalar>(-Scalar(M_PI), Scalar(M_PI));
+  Scalar  a = internal::random<Scalar>(-Scalar(EIGEN_PI), Scalar(EIGEN_PI)),
+          b = internal::random<Scalar>(-Scalar(EIGEN_PI), Scalar(EIGEN_PI));
 
   // Quaternion: Identity(), setIdentity();
   Quaternionx q1, q2;
@@ -82,8 +82,8 @@ template<typename Scalar, int Options> void quaternion(void)
 
   // angular distance
   Scalar refangle = abs(AngleAxisx(q1.inverse()*q2).angle());
-  if (refangle>Scalar(M_PI))
-    refangle = Scalar(2)*Scalar(M_PI) - refangle;
+  if (refangle>Scalar(EIGEN_PI))
+    refangle = Scalar(2)*Scalar(EIGEN_PI) - refangle;
 
   if((q1.coeffs()-q2.coeffs()).norm() > 10*largeEps)
   {
@@ -101,6 +101,11 @@ template<typename Scalar, int Options> void quaternion(void)
   q2 = q1.toRotationMatrix();
   VERIFY_IS_APPROX(q1*v1,q2*v1);
 
+  Matrix3 rot1(q1);
+  VERIFY_IS_APPROX(q1*v1,rot1*v1);
+  Quaternionx q3(rot1.transpose()*rot1);
+  VERIFY_IS_APPROX(q3*v1,v1);
+
 
   // angle-axis conversion
   AngleAxisx aa = AngleAxisx(q1);
@@ -109,8 +114,8 @@ template<typename Scalar, int Options> void quaternion(void)
   // Do not execute the test if the rotation angle is almost zero, or
   // the rotation axis and v1 are almost parallel.
   if (abs(aa.angle()) > 5*test_precision<Scalar>()
-      && (aa.axis() - v1.normalized()).norm() < 1.99
-      && (aa.axis() + v1.normalized()).norm() < 1.99) 
+      && (aa.axis() - v1.normalized()).norm() < Scalar(1.99)
+      && (aa.axis() + v1.normalized()).norm() < Scalar(1.99))
   {
     VERIFY_IS_NOT_APPROX(q1 * v1, Quaternionx(AngleAxisx(aa.angle()*2,aa.axis())) * v1);
   }
@@ -151,19 +156,19 @@ template<typename Scalar, int Options> void quaternion(void)
   Quaternionx *q = new Quaternionx;
   delete q;
 
-  q1 = AngleAxisx(a, v0.normalized());
-  q2 = AngleAxisx(b, v1.normalized());
+  q1 = Quaternionx::UnitRandom();
+  q2 = Quaternionx::UnitRandom();
   check_slerp(q1,q2);
 
   q1 = AngleAxisx(b, v1.normalized());
-  q2 = AngleAxisx(b+Scalar(M_PI), v1.normalized());
+  q2 = AngleAxisx(b+Scalar(EIGEN_PI), v1.normalized());
   check_slerp(q1,q2);
 
   q1 = AngleAxisx(b,  v1.normalized());
   q2 = AngleAxisx(-b, -v1.normalized());
   check_slerp(q1,q2);
 
-  q1.coeffs() = Vector4::Random().normalized();
+  q1 = Quaternionx::UnitRandom();
   q2.coeffs() = -q1.coeffs();
   check_slerp(q1,q2);
 }
@@ -179,11 +184,11 @@ template<typename Scalar> void mapQuaternion(void){
   
   Vector3 v0 = Vector3::Random(),
           v1 = Vector3::Random();
-  Scalar  a = internal::random<Scalar>(-Scalar(M_PI), Scalar(M_PI));
+  Scalar  a = internal::random<Scalar>(-Scalar(EIGEN_PI), Scalar(EIGEN_PI));
 
-  EIGEN_ALIGN16 Scalar array1[4];
-  EIGEN_ALIGN16 Scalar array2[4];
-  EIGEN_ALIGN16 Scalar array3[4+1];
+  EIGEN_ALIGN_MAX Scalar array1[4];
+  EIGEN_ALIGN_MAX Scalar array2[4];
+  EIGEN_ALIGN_MAX Scalar array3[4+1];
   Scalar* array3unaligned = array3+1;
   
   MQuaternionA    mq1(array1);
@@ -232,9 +237,9 @@ template<typename Scalar> void quaternionAlignment(void){
   typedef Quaternion<Scalar,AutoAlign> QuaternionA;
   typedef Quaternion<Scalar,DontAlign> QuaternionUA;
 
-  EIGEN_ALIGN16 Scalar array1[4];
-  EIGEN_ALIGN16 Scalar array2[4];
-  EIGEN_ALIGN16 Scalar array3[4+1];
+  EIGEN_ALIGN_MAX Scalar array1[4];
+  EIGEN_ALIGN_MAX Scalar array2[4];
+  EIGEN_ALIGN_MAX Scalar array3[4+1];
   Scalar* arrayunaligned = array3+1;
 
   QuaternionA *q1 = ::new(reinterpret_cast<void*>(array1)) QuaternionA;
@@ -247,8 +252,8 @@ template<typename Scalar> void quaternionAlignment(void){
 
   VERIFY_IS_APPROX(q1->coeffs(), q2->coeffs());
   VERIFY_IS_APPROX(q1->coeffs(), q3->coeffs());
-  #if defined(EIGEN_VECTORIZE) && EIGEN_ALIGN_STATICALLY
-  if(internal::packet_traits<Scalar>::Vectorizable)
+  #if defined(EIGEN_VECTORIZE) && EIGEN_MAX_STATIC_ALIGN_BYTES>0
+  if(internal::packet_traits<Scalar>::Vectorizable && internal::packet_traits<Scalar>::size<=4)
     VERIFY_RAISES_ASSERT((::new(reinterpret_cast<void*>(arrayunaligned)) QuaternionA));
   #endif
 }
diff --git a/test/geo_transformations.cpp b/test/geo_transformations.cpp
index 4ad3793d8..278e527c2 100644..100755
--- a/test/geo_transformations.cpp
+++ b/test/geo_transformations.cpp
@@ -12,6 +12,17 @@
 #include <Eigen/LU>
 #include <Eigen/SVD>
 
+template<typename T>
+Matrix<T,2,1> angleToVec(T a)
+{
+  return Matrix<T,2,1>(std::cos(a), std::sin(a));
+}
+
+// This permits to workaround a bug in clang/llvm code generation.
+template<typename T>
+EIGEN_DONT_INLINE
+void dont_over_optimize(T& x) { volatile typename T::Scalar tmp = x(0); x(0) = tmp; }
+
 template<typename Scalar, int Mode, int Options> void non_projective_only()
 {
     /* this test covers the following files:
@@ -29,7 +40,7 @@ template<typename Scalar, int Mode, int Options> void non_projective_only()
 
   Transform3 t0, t1, t2;
 
-  Scalar a = internal::random<Scalar>(-Scalar(M_PI), Scalar(M_PI));
+  Scalar a = internal::random<Scalar>(-Scalar(EIGEN_PI), Scalar(EIGEN_PI));
 
   Quaternionx q1, q2;
 
@@ -97,16 +108,14 @@ template<typename Scalar, int Mode, int Options> void transformations()
           v1 = Vector3::Random();
   Matrix3 matrot1, m;
 
-  Scalar a = internal::random<Scalar>(-Scalar(M_PI), Scalar(M_PI));
-  Scalar s0 = internal::random<Scalar>(),
-         s1 = internal::random<Scalar>();
+  Scalar a = internal::random<Scalar>(-Scalar(EIGEN_PI), Scalar(EIGEN_PI));
+  Scalar s0 = internal::random<Scalar>(), s1 = internal::random<Scalar>();
   
   while(v0.norm() < test_precision<Scalar>()) v0 = Vector3::Random();
   while(v1.norm() < test_precision<Scalar>()) v1 = Vector3::Random();
-    
 
   VERIFY_IS_APPROX(v0, AngleAxisx(a, v0.normalized()) * v0);
-  VERIFY_IS_APPROX(-v0, AngleAxisx(Scalar(M_PI), v0.unitOrthogonal()) * v0);
+  VERIFY_IS_APPROX(-v0, AngleAxisx(Scalar(EIGEN_PI), v0.unitOrthogonal()) * v0);
   if(abs(cos(a)) > test_precision<Scalar>())
   {
     VERIFY_IS_APPROX(cos(a)*v0.squaredNorm(), v0.dot(AngleAxisx(a, v0.unitOrthogonal()) * v0));
@@ -132,14 +141,16 @@ template<typename Scalar, int Mode, int Options> void transformations()
   AngleAxisx aa = AngleAxisx(q1);
   VERIFY_IS_APPROX(q1 * v1, Quaternionx(aa) * v1);
   
-  if(abs(aa.angle()) > NumTraits<Scalar>::dummy_precision())
+  // The following test is stable only if 2*angle != angle and v1 is not colinear with axis
+  if( (abs(aa.angle()) > test_precision<Scalar>()) && (abs(aa.axis().dot(v1.normalized()))<(Scalar(1)-Scalar(4)*test_precision<Scalar>())) )
   {
     VERIFY( !(q1 * v1).isApprox(Quaternionx(AngleAxisx(aa.angle()*2,aa.axis())) * v1) );
   }
 
   aa.fromRotationMatrix(aa.toRotationMatrix());
   VERIFY_IS_APPROX(q1 * v1, Quaternionx(aa) * v1);
-  if(abs(aa.angle()) > NumTraits<Scalar>::dummy_precision())
+  // The following test is stable only if 2*angle != angle and v1 is not colinear with axis
+  if( (abs(aa.angle()) > test_precision<Scalar>()) && (abs(aa.axis().dot(v1.normalized()))<(Scalar(1)-Scalar(4)*test_precision<Scalar>())) )
   {
     VERIFY( !(q1 * v1).isApprox(Quaternionx(AngleAxisx(aa.angle()*2,aa.axis())) * v1) );
   }
@@ -158,7 +169,7 @@ template<typename Scalar, int Mode, int Options> void transformations()
   // TODO complete the tests !
   a = 0;
   while (abs(a)<Scalar(0.1))
-    a = internal::random<Scalar>(-Scalar(0.4)*Scalar(M_PI), Scalar(0.4)*Scalar(M_PI));
+    a = internal::random<Scalar>(-Scalar(0.4)*Scalar(EIGEN_PI), Scalar(0.4)*Scalar(EIGEN_PI));
   q1 = AngleAxisx(a, v0.normalized());
   Transform3 t0, t1, t2;
 
@@ -204,7 +215,7 @@ template<typename Scalar, int Mode, int Options> void transformations()
     tmat4.matrix()(3,3) = Scalar(1);
   VERIFY_IS_APPROX(tmat3.matrix(), tmat4.matrix());
 
-  Scalar a3 = internal::random<Scalar>(-Scalar(M_PI), Scalar(M_PI));
+  Scalar a3 = internal::random<Scalar>(-Scalar(EIGEN_PI), Scalar(EIGEN_PI));
   Vector3 v3 = Vector3::Random().normalized();
   AngleAxisx aa3(a3, v3);
   Transform3 t3(aa3);
@@ -216,12 +227,15 @@ template<typename Scalar, int Mode, int Options> void transformations()
   t4 *= aa3;
   VERIFY_IS_APPROX(t3.matrix(), t4.matrix());
 
-  v3 = Vector3::Random();
+  do {
+    v3 = Vector3::Random();
+    dont_over_optimize(v3);
+  } while (v3.cwiseAbs().minCoeff()<NumTraits<Scalar>::epsilon());
   Translation3 tv3(v3);
   Transform3 t5(tv3);
   t4 = tv3;
   VERIFY_IS_APPROX(t5.matrix(), t4.matrix());
-  t4.translate(-v3);
+  t4.translate((-v3).eval());
   VERIFY_IS_APPROX(t4.matrix(), MatrixType::Identity());
   t4 *= tv3;
   VERIFY_IS_APPROX(t5.matrix(), t4.matrix());
@@ -413,12 +427,28 @@ template<typename Scalar, int Mode, int Options> void transformations()
   VERIFY_IS_APPROX(r2d1f.template cast<Scalar>(),r2d1);
   Rotation2D<double> r2d1d = r2d1.template cast<double>();
   VERIFY_IS_APPROX(r2d1d.template cast<Scalar>(),r2d1);
-
-  t20 = Translation2(v20) * (Rotation2D<Scalar>(s0) * Eigen::Scaling(s0));
-  t21 = Translation2(v20) * Rotation2D<Scalar>(s0) * Eigen::Scaling(s0);
-  VERIFY_IS_APPROX(t20,t21);
   
+  for(int k=0; k<100; ++k)
+  {
+    Scalar angle = internal::random<Scalar>(-100,100);
+    Rotation2D<Scalar> rot2(angle);
+    VERIFY( rot2.smallestPositiveAngle() >= 0 );
+    VERIFY( rot2.smallestPositiveAngle() <= Scalar(2)*Scalar(EIGEN_PI) );
+    VERIFY_IS_APPROX( angleToVec(rot2.smallestPositiveAngle()), angleToVec(rot2.angle()) );
+    
+    VERIFY( rot2.smallestAngle() >= -Scalar(EIGEN_PI) );
+    VERIFY( rot2.smallestAngle() <=  Scalar(EIGEN_PI) );
+    VERIFY_IS_APPROX( angleToVec(rot2.smallestAngle()), angleToVec(rot2.angle()) );
+
+    Matrix<Scalar,2,2> rot2_as_mat(rot2);
+    Rotation2D<Scalar> rot3(rot2_as_mat);
+    VERIFY_IS_APPROX( angleToVec(rot2.smallestAngle()),  angleToVec(rot3.angle()) );
+  }
+
+  s0 = internal::random<Scalar>(-100,100);
+  s1 = internal::random<Scalar>(-100,100);
   Rotation2D<Scalar> R0(s0), R1(s1);
+  
   t20 = Translation2(v20) * (R0 * Eigen::Scaling(s0));
   t21 = Translation2(v20) * R0 * Eigen::Scaling(s0);
   VERIFY_IS_APPROX(t20,t21);
@@ -428,9 +458,24 @@ template<typename Scalar, int Mode, int Options> void transformations()
   VERIFY_IS_APPROX(t20,t21);
   
   VERIFY_IS_APPROX(s0, (R0.slerp(0, R1)).angle());
-  VERIFY_IS_APPROX(s1, (R0.slerp(1, R1)).angle());
-  VERIFY_IS_APPROX(s0, (R0.slerp(0.5, R0)).angle());
-  VERIFY_IS_APPROX(Scalar(0), (R0.slerp(0.5, R0.inverse())).angle());
+  VERIFY_IS_APPROX( angleToVec(R1.smallestPositiveAngle()), angleToVec((R0.slerp(1, R1)).smallestPositiveAngle()) );
+  VERIFY_IS_APPROX(R0.smallestPositiveAngle(), (R0.slerp(0.5, R0)).smallestPositiveAngle());
+
+  if(std::cos(s0)>0)
+    VERIFY_IS_MUCH_SMALLER_THAN((R0.slerp(0.5, R0.inverse())).smallestAngle(), Scalar(1));
+  else
+    VERIFY_IS_APPROX(Scalar(EIGEN_PI), (R0.slerp(0.5, R0.inverse())).smallestPositiveAngle());
+  
+  // Check path length
+  Scalar l = 0;
+  int path_steps = 100;
+  for(int k=0; k<path_steps; ++k)
+  {
+    Scalar a1 = R0.slerp(Scalar(k)/Scalar(path_steps), R1).angle();
+    Scalar a2 = R0.slerp(Scalar(k+1)/Scalar(path_steps), R1).angle();
+    l += std::abs(a2-a1);
+  }
+  VERIFY(l<=Scalar(EIGEN_PI)*(Scalar(1)+NumTraits<Scalar>::epsilon()*Scalar(path_steps/2)));
   
   // check basic features
   {
@@ -520,9 +565,9 @@ template<typename Scalar> void transform_alignment()
   typedef Transform<Scalar,3,Projective,AutoAlign> Projective3a;
   typedef Transform<Scalar,3,Projective,DontAlign> Projective3u;
 
-  EIGEN_ALIGN16 Scalar array1[16];
-  EIGEN_ALIGN16 Scalar array2[16];
-  EIGEN_ALIGN16 Scalar array3[16+1];
+  EIGEN_ALIGN_MAX Scalar array1[16];
+  EIGEN_ALIGN_MAX Scalar array2[16];
+  EIGEN_ALIGN_MAX Scalar array3[16+1];
   Scalar* array3u = array3+1;
 
   Projective3a *p1 = ::new(reinterpret_cast<void*>(array1)) Projective3a;
@@ -538,7 +583,7 @@ template<typename Scalar> void transform_alignment()
   
   VERIFY_IS_APPROX( (*p1) * (*p1), (*p2)*(*p3));
   
-  #if defined(EIGEN_VECTORIZE) && EIGEN_ALIGN_STATICALLY
+  #if defined(EIGEN_VECTORIZE) && EIGEN_MAX_STATIC_ALIGN_BYTES>0
   if(internal::packet_traits<Scalar>::Vectorizable)
     VERIFY_RAISES_ASSERT((::new(reinterpret_cast<void*>(array3u)) Projective3a));
   #endif
diff --git a/test/half_float.cpp b/test/half_float.cpp
new file mode 100644
index 000000000..f8d438e2f
--- /dev/null
+++ b/test/half_float.cpp
@@ -0,0 +1,252 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include <sstream>
+
+#include "main.h"
+
+#include <Eigen/src/Core/arch/CUDA/Half.h>
+
+// Make sure it's possible to forward declare Eigen::half
+namespace Eigen {
+struct half;
+}
+
+using Eigen::half;
+
+void test_conversion()
+{
+  using Eigen::half_impl::__half;
+
+  // Conversion from float.
+  VERIFY_IS_EQUAL(half(1.0f).x, 0x3c00);
+  VERIFY_IS_EQUAL(half(0.5f).x, 0x3800);
+  VERIFY_IS_EQUAL(half(0.33333f).x, 0x3555);
+  VERIFY_IS_EQUAL(half(0.0f).x, 0x0000);
+  VERIFY_IS_EQUAL(half(-0.0f).x, 0x8000);
+  VERIFY_IS_EQUAL(half(65504.0f).x, 0x7bff);
+  VERIFY_IS_EQUAL(half(65536.0f).x, 0x7c00);  // Becomes infinity.
+
+  // Denormals.
+  VERIFY_IS_EQUAL(half(-5.96046e-08f).x, 0x8001);
+  VERIFY_IS_EQUAL(half(5.96046e-08f).x, 0x0001);
+  VERIFY_IS_EQUAL(half(1.19209e-07f).x, 0x0002);
+
+  // Verify round-to-nearest-even behavior.
+  float val1 = float(half(__half(0x3c00)));
+  float val2 = float(half(__half(0x3c01)));
+  float val3 = float(half(__half(0x3c02)));
+  VERIFY_IS_EQUAL(half(0.5f * (val1 + val2)).x, 0x3c00);
+  VERIFY_IS_EQUAL(half(0.5f * (val2 + val3)).x, 0x3c02);
+
+  // Conversion from int.
+  VERIFY_IS_EQUAL(half(-1).x, 0xbc00);
+  VERIFY_IS_EQUAL(half(0).x, 0x0000);
+  VERIFY_IS_EQUAL(half(1).x, 0x3c00);
+  VERIFY_IS_EQUAL(half(2).x, 0x4000);
+  VERIFY_IS_EQUAL(half(3).x, 0x4200);
+
+  // Conversion from bool.
+  VERIFY_IS_EQUAL(half(false).x, 0x0000);
+  VERIFY_IS_EQUAL(half(true).x, 0x3c00);
+
+  // Conversion to float.
+  VERIFY_IS_EQUAL(float(half(__half(0x0000))), 0.0f);
+  VERIFY_IS_EQUAL(float(half(__half(0x3c00))), 1.0f);
+
+  // Denormals.
+  VERIFY_IS_APPROX(float(half(__half(0x8001))), -5.96046e-08f);
+  VERIFY_IS_APPROX(float(half(__half(0x0001))), 5.96046e-08f);
+  VERIFY_IS_APPROX(float(half(__half(0x0002))), 1.19209e-07f);
+
+  // NaNs and infinities.
+  VERIFY(!(numext::isinf)(float(half(65504.0f))));  // Largest finite number.
+  VERIFY(!(numext::isnan)(float(half(0.0f))));
+  VERIFY((numext::isinf)(float(half(__half(0xfc00)))));
+  VERIFY((numext::isnan)(float(half(__half(0xfc01)))));
+  VERIFY((numext::isinf)(float(half(__half(0x7c00)))));
+  VERIFY((numext::isnan)(float(half(__half(0x7c01)))));
+
+#if !EIGEN_COMP_MSVC
+  // Visual Studio errors out on divisions by 0
+  VERIFY((numext::isnan)(float(half(0.0 / 0.0))));
+  VERIFY((numext::isinf)(float(half(1.0 / 0.0))));
+  VERIFY((numext::isinf)(float(half(-1.0 / 0.0))));
+#endif
+
+  // Exactly same checks as above, just directly on the half representation.
+  VERIFY(!(numext::isinf)(half(__half(0x7bff))));
+  VERIFY(!(numext::isnan)(half(__half(0x0000))));
+  VERIFY((numext::isinf)(half(__half(0xfc00))));
+  VERIFY((numext::isnan)(half(__half(0xfc01))));
+  VERIFY((numext::isinf)(half(__half(0x7c00))));
+  VERIFY((numext::isnan)(half(__half(0x7c01))));
+
+#if !EIGEN_COMP_MSVC
+  // Visual Studio errors out on divisions by 0
+  VERIFY((numext::isnan)(half(0.0 / 0.0)));
+  VERIFY((numext::isinf)(half(1.0 / 0.0)));
+  VERIFY((numext::isinf)(half(-1.0 / 0.0)));
+#endif
+}
+
+void test_numtraits()
+{
+  std::cout << "epsilon  = " << NumTraits<half>::epsilon() << std::endl;
+  std::cout << "highest  = " << NumTraits<half>::highest() << std::endl;
+  std::cout << "lowest   = " << NumTraits<half>::lowest() << std::endl;
+  std::cout << "inifinty = " << NumTraits<half>::infinity() << std::endl;
+  std::cout << "nan      = " << NumTraits<half>::quiet_NaN() << std::endl;
+
+}
+
+void test_arithmetic()
+{
+  VERIFY_IS_EQUAL(float(half(2) + half(2)), 4);
+  VERIFY_IS_EQUAL(float(half(2) + half(-2)), 0);
+  VERIFY_IS_APPROX(float(half(0.33333f) + half(0.66667f)), 1.0f);
+  VERIFY_IS_EQUAL(float(half(2.0f) * half(-5.5f)), -11.0f);
+  VERIFY_IS_APPROX(float(half(1.0f) / half(3.0f)), 0.33333f);
+  VERIFY_IS_EQUAL(float(-half(4096.0f)), -4096.0f);
+  VERIFY_IS_EQUAL(float(-half(-4096.0f)), 4096.0f);
+}
+
+void test_comparison()
+{
+  VERIFY(half(1.0f) > half(0.5f));
+  VERIFY(half(0.5f) < half(1.0f));
+  VERIFY(!(half(1.0f) < half(0.5f)));
+  VERIFY(!(half(0.5f) > half(1.0f)));
+
+  VERIFY(!(half(4.0f) > half(4.0f)));
+  VERIFY(!(half(4.0f) < half(4.0f)));
+
+  VERIFY(!(half(0.0f) < half(-0.0f)));
+  VERIFY(!(half(-0.0f) < half(0.0f)));
+  VERIFY(!(half(0.0f) > half(-0.0f)));
+  VERIFY(!(half(-0.0f) > half(0.0f)));
+
+  VERIFY(half(0.2f) > half(-1.0f));
+  VERIFY(half(-1.0f) < half(0.2f));
+  VERIFY(half(-16.0f) < half(-15.0f));
+
+  VERIFY(half(1.0f) == half(1.0f));
+  VERIFY(half(1.0f) != half(2.0f));
+
+  // Comparisons with NaNs and infinities.
+#if !EIGEN_COMP_MSVC
+  // Visual Studio errors out on divisions by 0
+  VERIFY(!(half(0.0 / 0.0) == half(0.0 / 0.0)));
+  VERIFY(half(0.0 / 0.0) != half(0.0 / 0.0));
+
+  VERIFY(!(half(1.0) == half(0.0 / 0.0)));
+  VERIFY(!(half(1.0) < half(0.0 / 0.0)));
+  VERIFY(!(half(1.0) > half(0.0 / 0.0)));
+  VERIFY(half(1.0) != half(0.0 / 0.0));
+
+  VERIFY(half(1.0) < half(1.0 / 0.0));
+  VERIFY(half(1.0) > half(-1.0 / 0.0));
+#endif
+}
+
+void test_basic_functions()
+{
+  VERIFY_IS_EQUAL(float(numext::abs(half(3.5f))), 3.5f);
+  VERIFY_IS_EQUAL(float(abs(half(3.5f))), 3.5f);
+  VERIFY_IS_EQUAL(float(numext::abs(half(-3.5f))), 3.5f);
+  VERIFY_IS_EQUAL(float(abs(half(-3.5f))), 3.5f);
+
+  VERIFY_IS_EQUAL(float(numext::floor(half(3.5f))), 3.0f);
+  VERIFY_IS_EQUAL(float(floor(half(3.5f))), 3.0f);
+  VERIFY_IS_EQUAL(float(numext::floor(half(-3.5f))), -4.0f);
+  VERIFY_IS_EQUAL(float(floor(half(-3.5f))), -4.0f);
+
+  VERIFY_IS_EQUAL(float(numext::ceil(half(3.5f))), 4.0f);
+  VERIFY_IS_EQUAL(float(ceil(half(3.5f))), 4.0f);
+  VERIFY_IS_EQUAL(float(numext::ceil(half(-3.5f))), -3.0f);
+  VERIFY_IS_EQUAL(float(ceil(half(-3.5f))), -3.0f);
+
+  VERIFY_IS_APPROX(float(numext::sqrt(half(0.0f))), 0.0f);
+  VERIFY_IS_APPROX(float(sqrt(half(0.0f))), 0.0f);
+  VERIFY_IS_APPROX(float(numext::sqrt(half(4.0f))), 2.0f);
+  VERIFY_IS_APPROX(float(sqrt(half(4.0f))), 2.0f);
+
+  VERIFY_IS_APPROX(float(numext::pow(half(0.0f), half(1.0f))), 0.0f);
+  VERIFY_IS_APPROX(float(pow(half(0.0f), half(1.0f))), 0.0f);
+  VERIFY_IS_APPROX(float(numext::pow(half(2.0f), half(2.0f))), 4.0f);
+  VERIFY_IS_APPROX(float(pow(half(2.0f), half(2.0f))), 4.0f);
+
+  VERIFY_IS_EQUAL(float(numext::exp(half(0.0f))), 1.0f);
+  VERIFY_IS_EQUAL(float(exp(half(0.0f))), 1.0f);
+  VERIFY_IS_APPROX(float(numext::exp(half(EIGEN_PI))), 20.f + float(EIGEN_PI));
+  VERIFY_IS_APPROX(float(exp(half(EIGEN_PI))), 20.f + float(EIGEN_PI));
+
+  VERIFY_IS_EQUAL(float(numext::log(half(1.0f))), 0.0f);
+  VERIFY_IS_EQUAL(float(log(half(1.0f))), 0.0f);
+  VERIFY_IS_APPROX(float(numext::log(half(10.0f))), 2.30273f);
+  VERIFY_IS_APPROX(float(log(half(10.0f))), 2.30273f);
+
+  VERIFY_IS_EQUAL(float(numext::log1p(half(0.0f))), 0.0f);
+  VERIFY_IS_EQUAL(float(log1p(half(0.0f))), 0.0f);
+  VERIFY_IS_APPROX(float(numext::log1p(half(10.0f))), 2.3978953f);
+  VERIFY_IS_APPROX(float(log1p(half(10.0f))), 2.3978953f);
+}
+
+void test_trigonometric_functions()
+{
+  VERIFY_IS_APPROX(numext::cos(half(0.0f)), half(cosf(0.0f)));
+  VERIFY_IS_APPROX(cos(half(0.0f)), half(cosf(0.0f)));
+  VERIFY_IS_APPROX(numext::cos(half(EIGEN_PI)), half(cosf(EIGEN_PI)));
+  //VERIFY_IS_APPROX(numext::cos(half(EIGEN_PI/2)), half(cosf(EIGEN_PI/2)));
+  //VERIFY_IS_APPROX(numext::cos(half(3*EIGEN_PI/2)), half(cosf(3*EIGEN_PI/2)));
+  VERIFY_IS_APPROX(numext::cos(half(3.5f)), half(cosf(3.5f)));
+
+  VERIFY_IS_APPROX(numext::sin(half(0.0f)), half(sinf(0.0f)));
+  VERIFY_IS_APPROX(sin(half(0.0f)), half(sinf(0.0f)));
+  //  VERIFY_IS_APPROX(numext::sin(half(EIGEN_PI)), half(sinf(EIGEN_PI)));
+  VERIFY_IS_APPROX(numext::sin(half(EIGEN_PI/2)), half(sinf(EIGEN_PI/2)));
+  VERIFY_IS_APPROX(numext::sin(half(3*EIGEN_PI/2)), half(sinf(3*EIGEN_PI/2)));
+  VERIFY_IS_APPROX(numext::sin(half(3.5f)), half(sinf(3.5f)));
+
+  VERIFY_IS_APPROX(numext::tan(half(0.0f)), half(tanf(0.0f)));
+  VERIFY_IS_APPROX(tan(half(0.0f)), half(tanf(0.0f)));
+  //  VERIFY_IS_APPROX(numext::tan(half(EIGEN_PI)), half(tanf(EIGEN_PI)));
+  //  VERIFY_IS_APPROX(numext::tan(half(EIGEN_PI/2)), half(tanf(EIGEN_PI/2)));
+  //VERIFY_IS_APPROX(numext::tan(half(3*EIGEN_PI/2)), half(tanf(3*EIGEN_PI/2)));
+  VERIFY_IS_APPROX(numext::tan(half(3.5f)), half(tanf(3.5f)));
+}
+
+void test_array()
+{
+  typedef Array<half,1,Dynamic> ArrayXh;
+  Index size = internal::random<Index>(1,10);
+  Index i = internal::random<Index>(0,size-1);
+  ArrayXh a1 = ArrayXh::Random(size), a2 = ArrayXh::Random(size);
+  VERIFY_IS_APPROX( a1+a1, half(2)*a1 );
+  VERIFY( (a1.abs() >= half(0)).all() );
+  VERIFY_IS_APPROX( (a1*a1).sqrt(), a1.abs() );
+
+  VERIFY( ((a1.min)(a2) <= (a1.max)(a2)).all() );
+  a1(i) = half(-10.);
+  VERIFY_IS_EQUAL( a1.minCoeff(), half(-10.) );
+  a1(i) = half(10.);
+  VERIFY_IS_EQUAL( a1.maxCoeff(), half(10.) );
+
+  std::stringstream ss;
+  ss << a1;
+}
+
+void test_half_float()
+{
+  CALL_SUBTEST(test_conversion());
+  CALL_SUBTEST(test_numtraits());
+  CALL_SUBTEST(test_arithmetic());
+  CALL_SUBTEST(test_comparison());
+  CALL_SUBTEST(test_basic_functions());
+  CALL_SUBTEST(test_trigonometric_functions());
+  CALL_SUBTEST(test_array());
+}
diff --git a/test/incomplete_cholesky.cpp b/test/incomplete_cholesky.cpp
new file mode 100644
index 000000000..59ffe9259
--- /dev/null
+++ b/test/incomplete_cholesky.cpp
@@ -0,0 +1,65 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+// #define EIGEN_DONT_VECTORIZE
+// #define EIGEN_MAX_ALIGN_BYTES 0
+#include "sparse_solver.h"
+#include <Eigen/IterativeLinearSolvers>
+#include <unsupported/Eigen/IterativeSolvers>
+
+template<typename T, typename I> void test_incomplete_cholesky_T()
+{
+  typedef SparseMatrix<T,0,I> SparseMatrixType;
+  ConjugateGradient<SparseMatrixType, Lower, IncompleteCholesky<T, Lower, AMDOrdering<I> > >        cg_illt_lower_amd;
+  ConjugateGradient<SparseMatrixType, Lower, IncompleteCholesky<T, Lower, NaturalOrdering<I> > >    cg_illt_lower_nat;
+  ConjugateGradient<SparseMatrixType, Upper, IncompleteCholesky<T, Upper, AMDOrdering<I> > >        cg_illt_upper_amd;
+  ConjugateGradient<SparseMatrixType, Upper, IncompleteCholesky<T, Upper, NaturalOrdering<I> > >    cg_illt_upper_nat;
+  ConjugateGradient<SparseMatrixType, Upper|Lower, IncompleteCholesky<T, Lower, AMDOrdering<I> > >  cg_illt_uplo_amd;
+  
+
+  CALL_SUBTEST( check_sparse_spd_solving(cg_illt_lower_amd) );
+  CALL_SUBTEST( check_sparse_spd_solving(cg_illt_lower_nat) );
+  CALL_SUBTEST( check_sparse_spd_solving(cg_illt_upper_amd) );
+  CALL_SUBTEST( check_sparse_spd_solving(cg_illt_upper_nat) );
+  CALL_SUBTEST( check_sparse_spd_solving(cg_illt_uplo_amd) );
+}
+
+void test_incomplete_cholesky()
+{
+  CALL_SUBTEST_1(( test_incomplete_cholesky_T<double,int>() ));
+  CALL_SUBTEST_2(( test_incomplete_cholesky_T<std::complex<double>, int>() ));
+  CALL_SUBTEST_3(( test_incomplete_cholesky_T<double,long int>() ));
+
+#ifdef EIGEN_TEST_PART_1
+    // regression for bug 1150
+  for(int N = 1; N<20; ++N)
+  {
+    Eigen::MatrixXd b( N, N );
+    b.setOnes();
+
+    Eigen::SparseMatrix<double> m( N, N );
+    m.reserve(Eigen::VectorXi::Constant(N,4));
+    for( int i = 0; i < N; ++i )
+    {
+        m.insert( i, i ) = 1;
+        m.coeffRef( i, i / 2 ) = 2;
+        m.coeffRef( i, i / 3 ) = 2;
+        m.coeffRef( i, i / 4 ) = 2;
+    }
+
+    Eigen::SparseMatrix<double> A;
+    A = m * m.transpose();
+
+    Eigen::ConjugateGradient<Eigen::SparseMatrix<double>,
+        Eigen::Lower | Eigen::Upper,
+        Eigen::IncompleteCholesky<double> > solver( A );
+    VERIFY(solver.preconditioner().info() == Eigen::Success);
+    VERIFY(solver.info() == Eigen::Success);
+  }
+#endif
+}
diff --git a/test/inplace_decomposition.cpp b/test/inplace_decomposition.cpp
new file mode 100644
index 000000000..92d0d91b6
--- /dev/null
+++ b/test/inplace_decomposition.cpp
@@ -0,0 +1,110 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <Eigen/LU>
+#include <Eigen/Cholesky>
+#include <Eigen/QR>
+
+// This file test inplace decomposition through Ref<>, as supported by Cholesky, LU, and QR decompositions.
+
+template<typename DecType,typename MatrixType> void inplace(bool square = false, bool SPD = false)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> RhsType;
+  typedef Matrix<Scalar, MatrixType::ColsAtCompileTime, 1> ResType;
+
+  Index rows = MatrixType::RowsAtCompileTime==Dynamic ? internal::random<Index>(2,EIGEN_TEST_MAX_SIZE/2) : Index(MatrixType::RowsAtCompileTime);
+  Index cols = MatrixType::ColsAtCompileTime==Dynamic ? (square?rows:internal::random<Index>(2,rows))    : Index(MatrixType::ColsAtCompileTime);
+
+  MatrixType A = MatrixType::Random(rows,cols);
+  RhsType b = RhsType::Random(rows);
+  ResType x(cols);
+
+  if(SPD)
+  {
+    assert(square);
+    A.topRows(cols) = A.topRows(cols).adjoint() * A.topRows(cols);
+    A.diagonal().array() += 1e-3;
+  }
+
+  MatrixType A0 = A;
+  MatrixType A1 = A;
+
+  DecType dec(A);
+
+  // Check that the content of A has been modified
+  VERIFY_IS_NOT_APPROX( A, A0 );
+
+  // Check that the decomposition is correct:
+  if(rows==cols)
+  {
+    VERIFY_IS_APPROX( A0 * (x = dec.solve(b)), b );
+  }
+  else
+  {
+    VERIFY_IS_APPROX( A0.transpose() * A0 * (x = dec.solve(b)), A0.transpose() * b );
+  }
+
+  // Check that modifying A breaks the current dec:
+  A.setRandom();
+  if(rows==cols)
+  {
+    VERIFY_IS_NOT_APPROX( A0 * (x = dec.solve(b)), b );
+  }
+  else
+  {
+    VERIFY_IS_NOT_APPROX( A0.transpose() * A0 * (x = dec.solve(b)), A0.transpose() * b );
+  }
+
+  // Check that calling compute(A1) does not modify A1:
+  A = A0;
+  dec.compute(A1);
+  VERIFY_IS_EQUAL(A0,A1);
+  VERIFY_IS_NOT_APPROX( A, A0 );
+  if(rows==cols)
+  {
+    VERIFY_IS_APPROX( A0 * (x = dec.solve(b)), b );
+  }
+  else
+  {
+    VERIFY_IS_APPROX( A0.transpose() * A0 * (x = dec.solve(b)), A0.transpose() * b );
+  }
+}
+
+
+void test_inplace_decomposition()
+{
+  EIGEN_UNUSED typedef Matrix<double,4,3> Matrix43d;
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1(( inplace<LLT<Ref<MatrixXd> >, MatrixXd>(true,true) ));
+    CALL_SUBTEST_1(( inplace<LLT<Ref<Matrix4d> >, Matrix4d>(true,true) ));
+
+    CALL_SUBTEST_2(( inplace<LDLT<Ref<MatrixXd> >, MatrixXd>(true,true) ));
+    CALL_SUBTEST_2(( inplace<LDLT<Ref<Matrix4d> >, Matrix4d>(true,true) ));
+
+    CALL_SUBTEST_3(( inplace<PartialPivLU<Ref<MatrixXd> >, MatrixXd>(true,false) ));
+    CALL_SUBTEST_3(( inplace<PartialPivLU<Ref<Matrix4d> >, Matrix4d>(true,false) ));
+
+    CALL_SUBTEST_4(( inplace<FullPivLU<Ref<MatrixXd> >, MatrixXd>(true,false) ));
+    CALL_SUBTEST_4(( inplace<FullPivLU<Ref<Matrix4d> >, Matrix4d>(true,false) ));
+
+    CALL_SUBTEST_5(( inplace<HouseholderQR<Ref<MatrixXd> >, MatrixXd>(false,false) ));
+    CALL_SUBTEST_5(( inplace<HouseholderQR<Ref<Matrix43d> >, Matrix43d>(false,false) ));
+
+    CALL_SUBTEST_6(( inplace<ColPivHouseholderQR<Ref<MatrixXd> >, MatrixXd>(false,false) ));
+    CALL_SUBTEST_6(( inplace<ColPivHouseholderQR<Ref<Matrix43d> >, Matrix43d>(false,false) ));
+
+    CALL_SUBTEST_7(( inplace<FullPivHouseholderQR<Ref<MatrixXd> >, MatrixXd>(false,false) ));
+    CALL_SUBTEST_7(( inplace<FullPivHouseholderQR<Ref<Matrix43d> >, Matrix43d>(false,false) ));
+
+    CALL_SUBTEST_8(( inplace<CompleteOrthogonalDecomposition<Ref<MatrixXd> >, MatrixXd>(false,false) ));
+    CALL_SUBTEST_8(( inplace<CompleteOrthogonalDecomposition<Ref<Matrix43d> >, Matrix43d>(false,false) ));
+  }
+}
diff --git a/test/integer_types.cpp b/test/integer_types.cpp
index 950f8e9be..a21f73a81 100644
--- a/test/integer_types.cpp
+++ b/test/integer_types.cpp
@@ -158,4 +158,12 @@ void test_integer_types()
 
     CALL_SUBTEST_8( integer_type_tests(Matrix<unsigned long long, Dynamic, 5>(1, 5)) );
   }
+#ifdef EIGEN_TEST_PART_9
+  VERIFY_IS_EQUAL(internal::scalar_div_cost<int>::value, 8);
+  VERIFY_IS_EQUAL(internal::scalar_div_cost<unsigned int>::value, 8);
+  if(sizeof(long)>sizeof(int)) {
+    VERIFY(internal::scalar_div_cost<long>::value > internal::scalar_div_cost<int>::value);
+    VERIFY(internal::scalar_div_cost<unsigned long>::value > internal::scalar_div_cost<int>::value);
+  }
+#endif
 }
diff --git a/test/inverse.cpp b/test/inverse.cpp
index 8187b088d..5c6777a18 100644
--- a/test/inverse.cpp
+++ b/test/inverse.cpp
@@ -68,6 +68,15 @@ template<typename MatrixType> void inverse(const MatrixType& m)
   VERIFY_IS_MUCH_SMALLER_THAN(abs(det-m3.determinant()), RealScalar(1));
   m3.computeInverseWithCheck(m4, invertible);
   VERIFY( rows==1 ? invertible : !invertible );
+  
+  // check with submatrices
+  {
+    Matrix<Scalar, MatrixType::RowsAtCompileTime+1, MatrixType::RowsAtCompileTime+1, MatrixType::Options> m5;
+    m5.setRandom();
+    m5.topLeftCorner(rows,rows) = m1;
+    m2 = m5.template topLeftCorner<MatrixType::RowsAtCompileTime,MatrixType::ColsAtCompileTime>().inverse();
+    VERIFY_IS_APPROX( (m5.template topLeftCorner<MatrixType::RowsAtCompileTime,MatrixType::ColsAtCompileTime>()), m2.inverse() );
+  }
 #endif
 
   // check in-place inversion
@@ -93,12 +102,16 @@ void test_inverse()
     CALL_SUBTEST_3( inverse(Matrix3f()) );
     CALL_SUBTEST_4( inverse(Matrix4f()) );
     CALL_SUBTEST_4( inverse(Matrix<float,4,4,DontAlign>()) );
+    
     s = internal::random<int>(50,320); 
     CALL_SUBTEST_5( inverse(MatrixXf(s,s)) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
+    
     s = internal::random<int>(25,100);
     CALL_SUBTEST_6( inverse(MatrixXcd(s,s)) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
+    
     CALL_SUBTEST_7( inverse(Matrix4d()) );
     CALL_SUBTEST_7( inverse(Matrix<double,4,4,DontAlign>()) );
   }
-  TEST_SET_BUT_UNUSED_VARIABLE(s)
 }
diff --git a/test/is_same_dense.cpp b/test/is_same_dense.cpp
new file mode 100644
index 000000000..2c7838ce9
--- /dev/null
+++ b/test/is_same_dense.cpp
@@ -0,0 +1,33 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+using internal::is_same_dense;
+
+void test_is_same_dense()
+{
+  typedef Matrix<double,Dynamic,Dynamic,ColMajor> ColMatrixXd;
+  ColMatrixXd m1(10,10);
+  Ref<ColMatrixXd> ref_m1(m1);
+  Ref<const ColMatrixXd> const_ref_m1(m1);
+  VERIFY(is_same_dense(m1,m1));
+  VERIFY(is_same_dense(m1,ref_m1));
+  VERIFY(is_same_dense(const_ref_m1,m1));
+  VERIFY(is_same_dense(const_ref_m1,ref_m1));
+  
+  VERIFY(is_same_dense(m1.block(0,0,m1.rows(),m1.cols()),m1));
+  VERIFY(!is_same_dense(m1.row(0),m1.col(0)));
+  
+  Ref<const ColMatrixXd> const_ref_m1_row(m1.row(1));
+  VERIFY(!is_same_dense(m1.row(1),const_ref_m1_row));
+  
+  Ref<const ColMatrixXd> const_ref_m1_col(m1.col(1));
+  VERIFY(is_same_dense(m1.col(1),const_ref_m1_col));
+}
diff --git a/test/jacobisvd.cpp b/test/jacobisvd.cpp
index 12c556e59..7f5f71562 100644
--- a/test/jacobisvd.cpp
+++ b/test/jacobisvd.cpp
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -14,279 +14,47 @@
 #include "main.h"
 #include <Eigen/SVD>
 
-template<typename MatrixType, int QRPreconditioner>
-void jacobisvd_check_full(const MatrixType& m, const JacobiSVD<MatrixType, QRPreconditioner>& svd)
-{
-  typedef typename MatrixType::Index Index;
-  Index rows = m.rows();
-  Index cols = m.cols();
-
-  enum {
-    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-    ColsAtCompileTime = MatrixType::ColsAtCompileTime
-  };
-
-  typedef typename MatrixType::Scalar Scalar;
-  typedef Matrix<Scalar, RowsAtCompileTime, RowsAtCompileTime> MatrixUType;
-  typedef Matrix<Scalar, ColsAtCompileTime, ColsAtCompileTime> MatrixVType;
-
-  MatrixType sigma = MatrixType::Zero(rows,cols);
-  sigma.diagonal() = svd.singularValues().template cast<Scalar>();
-  MatrixUType u = svd.matrixU();
-  MatrixVType v = svd.matrixV();
-
-  VERIFY_IS_APPROX(m, u * sigma * v.adjoint());
-  VERIFY_IS_UNITARY(u);
-  VERIFY_IS_UNITARY(v);
-}
-
-template<typename MatrixType, int QRPreconditioner>
-void jacobisvd_compare_to_full(const MatrixType& m,
-                               unsigned int computationOptions,
-                               const JacobiSVD<MatrixType, QRPreconditioner>& referenceSvd)
-{
-  typedef typename MatrixType::Index Index;
-  Index rows = m.rows();
-  Index cols = m.cols();
-  Index diagSize = (std::min)(rows, cols);
-
-  JacobiSVD<MatrixType, QRPreconditioner> svd(m, computationOptions);
-
-  VERIFY_IS_APPROX(svd.singularValues(), referenceSvd.singularValues());
-  if(computationOptions & ComputeFullU)
-    VERIFY_IS_APPROX(svd.matrixU(), referenceSvd.matrixU());
-  if(computationOptions & ComputeThinU)
-    VERIFY_IS_APPROX(svd.matrixU(), referenceSvd.matrixU().leftCols(diagSize));
-  if(computationOptions & ComputeFullV)
-    VERIFY_IS_APPROX(svd.matrixV(), referenceSvd.matrixV());
-  if(computationOptions & ComputeThinV)
-    VERIFY_IS_APPROX(svd.matrixV(), referenceSvd.matrixV().leftCols(diagSize));
-}
-
-template<typename MatrixType, int QRPreconditioner>
-void jacobisvd_solve(const MatrixType& m, unsigned int computationOptions)
-{
-  typedef typename MatrixType::Scalar Scalar;
-  typedef typename MatrixType::RealScalar RealScalar;
-  typedef typename MatrixType::Index Index;
-  Index rows = m.rows();
-  Index cols = m.cols();
-
-  enum {
-    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-    ColsAtCompileTime = MatrixType::ColsAtCompileTime
-  };
-
-  typedef Matrix<Scalar, RowsAtCompileTime, Dynamic> RhsType;
-  typedef Matrix<Scalar, ColsAtCompileTime, Dynamic> SolutionType;
-
-  RhsType rhs = RhsType::Random(rows, internal::random<Index>(1, cols));
-  JacobiSVD<MatrixType, QRPreconditioner> svd(m, computationOptions);
-
-       if(internal::is_same<RealScalar,double>::value) svd.setThreshold(1e-8);
-  else if(internal::is_same<RealScalar,float>::value)  svd.setThreshold(1e-4);
-  
-  SolutionType x = svd.solve(rhs);
-  
-  RealScalar residual = (m*x-rhs).norm();
-  // Check that there is no significantly better solution in the neighborhood of x
-  if(!test_isMuchSmallerThan(residual,rhs.norm()))
-  {
-    // If the residual is very small, then we have an exact solution, so we are already good.
-    for(int k=0;k<x.rows();++k)
-    {
-      SolutionType y(x);
-      y.row(k).array() += 2*NumTraits<RealScalar>::epsilon();
-      RealScalar residual_y = (m*y-rhs).norm();
-      VERIFY( test_isApprox(residual_y,residual) || residual < residual_y );
-      
-      y.row(k) = x.row(k).array() - 2*NumTraits<RealScalar>::epsilon();
-      residual_y = (m*y-rhs).norm();
-      VERIFY( test_isApprox(residual_y,residual) || residual < residual_y );
-    }
-  }
-  
-  // evaluate normal equation which works also for least-squares solutions
-  if(internal::is_same<RealScalar,double>::value)
-  {
-    // This test is not stable with single precision.
-    // This is probably because squaring m signicantly affects the precision.
-    VERIFY_IS_APPROX(m.adjoint()*m*x,m.adjoint()*rhs);
-  }
-  
-  // check minimal norm solutions
-  {
-    // generate a full-rank m x n problem with m<n
-    enum {
-      RankAtCompileTime2 = ColsAtCompileTime==Dynamic ? Dynamic : (ColsAtCompileTime)/2+1,
-      RowsAtCompileTime3 = ColsAtCompileTime==Dynamic ? Dynamic : ColsAtCompileTime+1
-    };
-    typedef Matrix<Scalar, RankAtCompileTime2, ColsAtCompileTime> MatrixType2;
-    typedef Matrix<Scalar, RankAtCompileTime2, 1> RhsType2;
-    typedef Matrix<Scalar, ColsAtCompileTime, RankAtCompileTime2> MatrixType2T;
-    Index rank = RankAtCompileTime2==Dynamic ? internal::random<Index>(1,cols) : Index(RankAtCompileTime2);
-    MatrixType2 m2(rank,cols);
-    int guard = 0;
-    do {
-      m2.setRandom();
-    } while(m2.jacobiSvd().setThreshold(test_precision<Scalar>()).rank()!=rank && (++guard)<10);
-    VERIFY(guard<10);
-    RhsType2 rhs2 = RhsType2::Random(rank);
-    // use QR to find a reference minimal norm solution
-    HouseholderQR<MatrixType2T> qr(m2.adjoint());
-    Matrix<Scalar,Dynamic,1> tmp = qr.matrixQR().topLeftCorner(rank,rank).template triangularView<Upper>().adjoint().solve(rhs2);
-    tmp.conservativeResize(cols);
-    tmp.tail(cols-rank).setZero();
-    SolutionType x21 = qr.householderQ() * tmp;
-    // now check with SVD
-    JacobiSVD<MatrixType2, ColPivHouseholderQRPreconditioner> svd2(m2, computationOptions);
-    SolutionType x22 = svd2.solve(rhs2);
-    VERIFY_IS_APPROX(m2*x21, rhs2);
-    VERIFY_IS_APPROX(m2*x22, rhs2);
-    VERIFY_IS_APPROX(x21, x22);
-    
-    // Now check with a rank deficient matrix
-    typedef Matrix<Scalar, RowsAtCompileTime3, ColsAtCompileTime> MatrixType3;
-    typedef Matrix<Scalar, RowsAtCompileTime3, 1> RhsType3;
-    Index rows3 = RowsAtCompileTime3==Dynamic ? internal::random<Index>(rank+1,2*cols) : Index(RowsAtCompileTime3);
-    Matrix<Scalar,RowsAtCompileTime3,Dynamic> C = Matrix<Scalar,RowsAtCompileTime3,Dynamic>::Random(rows3,rank);
-    MatrixType3 m3 = C * m2;
-    RhsType3 rhs3 = C * rhs2;
-    JacobiSVD<MatrixType3, ColPivHouseholderQRPreconditioner> svd3(m3, computationOptions);
-    SolutionType x3 = svd3.solve(rhs3);
-    if(svd3.rank()!=rank) {
-      std::cout << m3 << "\n\n";
-      std::cout << svd3.singularValues().transpose() << "\n";
-    std::cout << svd3.rank() << " == " << rank << "\n";
-    std::cout << x21.norm() << " == " << x3.norm() << "\n";
-    }
-//     VERIFY_IS_APPROX(m3*x3, rhs3);
-    VERIFY_IS_APPROX(m3*x21, rhs3);
-    VERIFY_IS_APPROX(m2*x3, rhs2);
-    
-    VERIFY_IS_APPROX(x21, x3);
-  }
-}
-
-template<typename MatrixType, int QRPreconditioner>
-void jacobisvd_test_all_computation_options(const MatrixType& m)
-{
-  if (QRPreconditioner == NoQRPreconditioner && m.rows() != m.cols())
-    return;
-  JacobiSVD<MatrixType, QRPreconditioner> fullSvd(m, ComputeFullU|ComputeFullV);
-  CALL_SUBTEST(( jacobisvd_check_full(m, fullSvd) ));
-  CALL_SUBTEST(( jacobisvd_solve<MatrixType, QRPreconditioner>(m, ComputeFullU | ComputeFullV) ));
-  
-  #if defined __INTEL_COMPILER
-  // remark #111: statement is unreachable
-  #pragma warning disable 111
-  #endif
-  if(QRPreconditioner == FullPivHouseholderQRPreconditioner)
-    return;
-
-  CALL_SUBTEST(( jacobisvd_compare_to_full(m, ComputeFullU, fullSvd) ));
-  CALL_SUBTEST(( jacobisvd_compare_to_full(m, ComputeFullV, fullSvd) ));
-  CALL_SUBTEST(( jacobisvd_compare_to_full(m, 0, fullSvd) ));
-
-  if (MatrixType::ColsAtCompileTime == Dynamic) {
-    // thin U/V are only available with dynamic number of columns
-    CALL_SUBTEST(( jacobisvd_compare_to_full(m, ComputeFullU|ComputeThinV, fullSvd) ));
-    CALL_SUBTEST(( jacobisvd_compare_to_full(m,              ComputeThinV, fullSvd) ));
-    CALL_SUBTEST(( jacobisvd_compare_to_full(m, ComputeThinU|ComputeFullV, fullSvd) ));
-    CALL_SUBTEST(( jacobisvd_compare_to_full(m, ComputeThinU             , fullSvd) ));
-    CALL_SUBTEST(( jacobisvd_compare_to_full(m, ComputeThinU|ComputeThinV, fullSvd) ));
-    CALL_SUBTEST(( jacobisvd_solve<MatrixType, QRPreconditioner>(m, ComputeFullU | ComputeThinV) ));
-    CALL_SUBTEST(( jacobisvd_solve<MatrixType, QRPreconditioner>(m, ComputeThinU | ComputeFullV) ));
-    CALL_SUBTEST(( jacobisvd_solve<MatrixType, QRPreconditioner>(m, ComputeThinU | ComputeThinV) ));
-
-    // test reconstruction
-    typedef typename MatrixType::Index Index;
-    Index diagSize = (std::min)(m.rows(), m.cols());
-    JacobiSVD<MatrixType, QRPreconditioner> svd(m, ComputeThinU | ComputeThinV);
-    VERIFY_IS_APPROX(m, svd.matrixU().leftCols(diagSize) * svd.singularValues().asDiagonal() * svd.matrixV().leftCols(diagSize).adjoint());
-  }
-}
+#define SVD_DEFAULT(M) JacobiSVD<M>
+#define SVD_FOR_MIN_NORM(M) JacobiSVD<M,ColPivHouseholderQRPreconditioner>
+#include "svd_common.h"
 
+// Check all variants of JacobiSVD
 template<typename MatrixType>
 void jacobisvd(const MatrixType& a = MatrixType(), bool pickrandom = true)
 {
   MatrixType m = a;
   if(pickrandom)
-  {
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
-    Index diagSize = (std::min)(a.rows(), a.cols());
-    RealScalar s = std::numeric_limits<RealScalar>::max_exponent10/4;
-    s = internal::random<RealScalar>(1,s);
-    Matrix<RealScalar,Dynamic,1> d =  Matrix<RealScalar,Dynamic,1>::Random(diagSize);
-    for(Index k=0; k<diagSize; ++k)
-      d(k) = d(k)*std::pow(RealScalar(10),internal::random<RealScalar>(-s,s));
-    m = Matrix<Scalar,Dynamic,Dynamic>::Random(a.rows(),diagSize) * d.asDiagonal() * Matrix<Scalar,Dynamic,Dynamic>::Random(diagSize,a.cols());
-    // cancel some coeffs
-    Index n  = internal::random<Index>(0,m.size()-1);
-    for(Index i=0; i<n; ++i)
-      m(internal::random<Index>(0,m.rows()-1), internal::random<Index>(0,m.cols()-1)) = Scalar(0);
-  }
+    svd_fill_random(m);
 
-  CALL_SUBTEST(( jacobisvd_test_all_computation_options<MatrixType, FullPivHouseholderQRPreconditioner>(m) ));
-  CALL_SUBTEST(( jacobisvd_test_all_computation_options<MatrixType, ColPivHouseholderQRPreconditioner>(m) ));
-  CALL_SUBTEST(( jacobisvd_test_all_computation_options<MatrixType, HouseholderQRPreconditioner>(m) ));
-  CALL_SUBTEST(( jacobisvd_test_all_computation_options<MatrixType, NoQRPreconditioner>(m) ));
+  CALL_SUBTEST(( svd_test_all_computation_options<JacobiSVD<MatrixType, FullPivHouseholderQRPreconditioner> >(m, true)  )); // check full only
+  CALL_SUBTEST(( svd_test_all_computation_options<JacobiSVD<MatrixType, ColPivHouseholderQRPreconditioner>  >(m, false) ));
+  CALL_SUBTEST(( svd_test_all_computation_options<JacobiSVD<MatrixType, HouseholderQRPreconditioner>        >(m, false) ));
+  if(m.rows()==m.cols())
+    CALL_SUBTEST(( svd_test_all_computation_options<JacobiSVD<MatrixType, NoQRPreconditioner>               >(m, false) ));
 }
 
 template<typename MatrixType> void jacobisvd_verify_assert(const MatrixType& m)
 {
-  typedef typename MatrixType::Scalar Scalar;
+  svd_verify_assert<JacobiSVD<MatrixType> >(m);
   typedef typename MatrixType::Index Index;
   Index rows = m.rows();
   Index cols = m.cols();
 
   enum {
-    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
     ColsAtCompileTime = MatrixType::ColsAtCompileTime
   };
 
-  typedef Matrix<Scalar, RowsAtCompileTime, 1> RhsType;
-
-  RhsType rhs(rows);
-
-  JacobiSVD<MatrixType> svd;
-  VERIFY_RAISES_ASSERT(svd.matrixU())
-  VERIFY_RAISES_ASSERT(svd.singularValues())
-  VERIFY_RAISES_ASSERT(svd.matrixV())
-  VERIFY_RAISES_ASSERT(svd.solve(rhs))
 
   MatrixType a = MatrixType::Zero(rows, cols);
   a.setZero();
-  svd.compute(a, 0);
-  VERIFY_RAISES_ASSERT(svd.matrixU())
-  VERIFY_RAISES_ASSERT(svd.matrixV())
-  svd.singularValues();
-  VERIFY_RAISES_ASSERT(svd.solve(rhs))
 
   if (ColsAtCompileTime == Dynamic)
   {
-    svd.compute(a, ComputeThinU);
-    svd.matrixU();
-    VERIFY_RAISES_ASSERT(svd.matrixV())
-    VERIFY_RAISES_ASSERT(svd.solve(rhs))
-
-    svd.compute(a, ComputeThinV);
-    svd.matrixV();
-    VERIFY_RAISES_ASSERT(svd.matrixU())
-    VERIFY_RAISES_ASSERT(svd.solve(rhs))
-
     JacobiSVD<MatrixType, FullPivHouseholderQRPreconditioner> svd_fullqr;
     VERIFY_RAISES_ASSERT(svd_fullqr.compute(a, ComputeFullU|ComputeThinV))
     VERIFY_RAISES_ASSERT(svd_fullqr.compute(a, ComputeThinU|ComputeThinV))
     VERIFY_RAISES_ASSERT(svd_fullqr.compute(a, ComputeThinU|ComputeFullV))
   }
-  else
-  {
-    VERIFY_RAISES_ASSERT(svd.compute(a, ComputeThinU))
-    VERIFY_RAISES_ASSERT(svd.compute(a, ComputeThinV))
-  }
 }
 
 template<typename MatrixType>
@@ -302,126 +70,17 @@ void jacobisvd_method()
   VERIFY_IS_APPROX(m.jacobiSvd(ComputeFullU|ComputeFullV).solve(m), m);
 }
 
-// work around stupid msvc error when constructing at compile time an expression that involves
-// a division by zero, even if the numeric type has floating point
-template<typename Scalar>
-EIGEN_DONT_INLINE Scalar zero() { return Scalar(0); }
-
-// workaround aggressive optimization in ICC
-template<typename T> EIGEN_DONT_INLINE  T sub(T a, T b) { return a - b; }
-
-template<typename MatrixType>
-void jacobisvd_inf_nan()
-{
-  // all this function does is verify we don't iterate infinitely on nan/inf values
-
-  JacobiSVD<MatrixType> svd;
-  typedef typename MatrixType::Scalar Scalar;
-  Scalar some_inf = Scalar(1) / zero<Scalar>();
-  VERIFY(sub(some_inf, some_inf) != sub(some_inf, some_inf));
-  svd.compute(MatrixType::Constant(10,10,some_inf), ComputeFullU | ComputeFullV);
-
-  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
-  VERIFY(nan != nan);
-  svd.compute(MatrixType::Constant(10,10,nan), ComputeFullU | ComputeFullV);
-
-  MatrixType m = MatrixType::Zero(10,10);
-  m(internal::random<int>(0,9), internal::random<int>(0,9)) = some_inf;
-  svd.compute(m, ComputeFullU | ComputeFullV);
-
-  m = MatrixType::Zero(10,10);
-  m(internal::random<int>(0,9), internal::random<int>(0,9)) = nan;
-  svd.compute(m, ComputeFullU | ComputeFullV);
-  
-  // regression test for bug 791
-  m.resize(3,3);
-  m << 0,    2*NumTraits<Scalar>::epsilon(),  0.5,
-       0,   -0.5,                             0,
-       nan,  0,                               0;
-  svd.compute(m, ComputeFullU | ComputeFullV);
-}
-
-// Regression test for bug 286: JacobiSVD loops indefinitely with some
-// matrices containing denormal numbers.
-void jacobisvd_bug286()
-{
-#if defined __INTEL_COMPILER
-// shut up warning #239: floating point underflow
-#pragma warning push
-#pragma warning disable 239
-#endif
-  Matrix2d M;
-  M << -7.90884e-313, -4.94e-324,
-                 0, 5.60844e-313;
-#if defined __INTEL_COMPILER
-#pragma warning pop
-#endif
-  JacobiSVD<Matrix2d> svd;
-  svd.compute(M); // just check we don't loop indefinitely
-}
-
-void jacobisvd_preallocate()
-{
-  Vector3f v(3.f, 2.f, 1.f);
-  MatrixXf m = v.asDiagonal();
-
-  internal::set_is_malloc_allowed(false);
-  VERIFY_RAISES_ASSERT(VectorXf tmp(10);)
-  JacobiSVD<MatrixXf> svd;
-  internal::set_is_malloc_allowed(true);
-  svd.compute(m);
-  VERIFY_IS_APPROX(svd.singularValues(), v);
-
-  JacobiSVD<MatrixXf> svd2(3,3);
-  internal::set_is_malloc_allowed(false);
-  svd2.compute(m);
-  internal::set_is_malloc_allowed(true);
-  VERIFY_IS_APPROX(svd2.singularValues(), v);
-  VERIFY_RAISES_ASSERT(svd2.matrixU());
-  VERIFY_RAISES_ASSERT(svd2.matrixV());
-  svd2.compute(m, ComputeFullU | ComputeFullV);
-  VERIFY_IS_APPROX(svd2.matrixU(), Matrix3f::Identity());
-  VERIFY_IS_APPROX(svd2.matrixV(), Matrix3f::Identity());
-  internal::set_is_malloc_allowed(false);
-  svd2.compute(m);
-  internal::set_is_malloc_allowed(true);
-
-  JacobiSVD<MatrixXf> svd3(3,3,ComputeFullU|ComputeFullV);
-  internal::set_is_malloc_allowed(false);
-  svd2.compute(m);
-  internal::set_is_malloc_allowed(true);
-  VERIFY_IS_APPROX(svd2.singularValues(), v);
-  VERIFY_IS_APPROX(svd2.matrixU(), Matrix3f::Identity());
-  VERIFY_IS_APPROX(svd2.matrixV(), Matrix3f::Identity());
-  internal::set_is_malloc_allowed(false);
-  svd2.compute(m, ComputeFullU|ComputeFullV);
-  internal::set_is_malloc_allowed(true);
-}
-
 void test_jacobisvd()
 {
   CALL_SUBTEST_3(( jacobisvd_verify_assert(Matrix3f()) ));
   CALL_SUBTEST_4(( jacobisvd_verify_assert(Matrix4d()) ));
   CALL_SUBTEST_7(( jacobisvd_verify_assert(MatrixXf(10,12)) ));
   CALL_SUBTEST_8(( jacobisvd_verify_assert(MatrixXcd(7,5)) ));
+  
+  CALL_SUBTEST_11(svd_all_trivial_2x2(jacobisvd<Matrix2cd>));
+  CALL_SUBTEST_12(svd_all_trivial_2x2(jacobisvd<Matrix2d>));
 
   for(int i = 0; i < g_repeat; i++) {
-    Matrix2cd m;
-    m << 0, 1,
-         0, 1;
-    CALL_SUBTEST_1(( jacobisvd(m, false) ));
-    m << 1, 0,
-         1, 0;
-    CALL_SUBTEST_1(( jacobisvd(m, false) ));
-
-    Matrix2d n;
-    n << 0, 0,
-         0, 0;
-    CALL_SUBTEST_2(( jacobisvd(n, false) ));
-    n << 0, 0,
-         0, 1;
-    CALL_SUBTEST_2(( jacobisvd(n, false) ));
-    
     CALL_SUBTEST_3(( jacobisvd<Matrix3f>() ));
     CALL_SUBTEST_4(( jacobisvd<Matrix4d>() ));
     CALL_SUBTEST_5(( jacobisvd<Matrix<float,3,5> >() ));
@@ -440,8 +99,14 @@ void test_jacobisvd()
     (void) c;
 
     // Test on inf/nan matrix
-    CALL_SUBTEST_7( jacobisvd_inf_nan<MatrixXf>() );
-    CALL_SUBTEST_10( jacobisvd_inf_nan<MatrixXd>() );
+    CALL_SUBTEST_7(  (svd_inf_nan<JacobiSVD<MatrixXf>, MatrixXf>()) );
+    CALL_SUBTEST_10( (svd_inf_nan<JacobiSVD<MatrixXd>, MatrixXd>()) );
+
+    // bug1395 test compile-time vectors as input
+    CALL_SUBTEST_13(( jacobisvd_verify_assert(Matrix<double,6,1>()) ));
+    CALL_SUBTEST_13(( jacobisvd_verify_assert(Matrix<double,1,6>()) ));
+    CALL_SUBTEST_13(( jacobisvd_verify_assert(Matrix<double,Dynamic,1>(r)) ));
+    CALL_SUBTEST_13(( jacobisvd_verify_assert(Matrix<double,1,Dynamic>(c)) ));
   }
 
   CALL_SUBTEST_7(( jacobisvd<MatrixXf>(MatrixXf(internal::random<int>(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE/2), internal::random<int>(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE/2))) ));
@@ -455,8 +120,7 @@ void test_jacobisvd()
   CALL_SUBTEST_7( JacobiSVD<MatrixXf>(10,10) );
 
   // Check that preallocation avoids subsequent mallocs
-  CALL_SUBTEST_9( jacobisvd_preallocate() );
+  CALL_SUBTEST_9( svd_preallocate<void>() );
 
-  // Regression check for bug 286
-  CALL_SUBTEST_2( jacobisvd_bug286() );
+  CALL_SUBTEST_2( svd_underoverflow<void>() );
 }
diff --git a/test/linearstructure.cpp b/test/linearstructure.cpp
index 618984d5c..17474af10 100644
--- a/test/linearstructure.cpp
+++ b/test/linearstructure.cpp
@@ -2,11 +2,15 @@
 // for linear algebra.
 //
 // Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+static bool g_called;
+#define EIGEN_SCALAR_BINARY_OP_PLUGIN { g_called |= (!internal::is_same<LhsScalar,RhsScalar>::value); }
+
 #include "main.h"
 
 template<typename MatrixType> void linearStructure(const MatrixType& m)
@@ -17,6 +21,7 @@ template<typename MatrixType> void linearStructure(const MatrixType& m)
   */
   typedef typename MatrixType::Index Index;
   typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
 
   Index rows = m.rows();
   Index cols = m.cols();
@@ -28,7 +33,7 @@ template<typename MatrixType> void linearStructure(const MatrixType& m)
              m3(rows, cols);
 
   Scalar s1 = internal::random<Scalar>();
-  while (abs(s1)<1e-3) s1 = internal::random<Scalar>();
+  while (abs(s1)<RealScalar(1e-3)) s1 = internal::random<Scalar>();
 
   Index r = internal::random<Index>(0, rows-1),
         c = internal::random<Index>(0, cols-1);
@@ -68,8 +73,48 @@ template<typename MatrixType> void linearStructure(const MatrixType& m)
   VERIFY_IS_APPROX(m1.block(0,0,rows,cols) * s1, m1 * s1);
 }
 
+// Make sure that complex * real and real * complex are properly optimized
+template<typename MatrixType> void real_complex(DenseIndex rows = MatrixType::RowsAtCompileTime, DenseIndex cols = MatrixType::ColsAtCompileTime)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  
+  RealScalar s = internal::random<RealScalar>();
+  MatrixType m1 = MatrixType::Random(rows, cols);
+  
+  g_called = false;
+  VERIFY_IS_APPROX(s*m1, Scalar(s)*m1);
+  VERIFY(g_called && "real * matrix<complex> not properly optimized");
+  
+  g_called = false;
+  VERIFY_IS_APPROX(m1*s, m1*Scalar(s));
+  VERIFY(g_called && "matrix<complex> * real not properly optimized");
+  
+  g_called = false;
+  VERIFY_IS_APPROX(m1/s, m1/Scalar(s));
+  VERIFY(g_called && "matrix<complex> / real not properly optimized");
+
+  g_called = false;
+  VERIFY_IS_APPROX(s+m1.array(), Scalar(s)+m1.array());
+  VERIFY(g_called && "real + matrix<complex> not properly optimized");
+
+  g_called = false;
+  VERIFY_IS_APPROX(m1.array()+s, m1.array()+Scalar(s));
+  VERIFY(g_called && "matrix<complex> + real not properly optimized");
+
+  g_called = false;
+  VERIFY_IS_APPROX(s-m1.array(), Scalar(s)-m1.array());
+  VERIFY(g_called && "real - matrix<complex> not properly optimized");
+
+  g_called = false;
+  VERIFY_IS_APPROX(m1.array()-s, m1.array()-Scalar(s));
+  VERIFY(g_called && "matrix<complex> - real not properly optimized");
+}
+
 void test_linearstructure()
 {
+  g_called = true;
+  VERIFY(g_called); // avoid `unneeded-internal-declaration` warning.
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( linearStructure(Matrix<float, 1, 1>()) );
     CALL_SUBTEST_2( linearStructure(Matrix2f()) );
@@ -80,5 +125,25 @@ void test_linearstructure()
     CALL_SUBTEST_7( linearStructure(MatrixXi (internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
     CALL_SUBTEST_8( linearStructure(MatrixXcd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2), internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2))) );
     CALL_SUBTEST_9( linearStructure(ArrayXXf (internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_10( linearStructure(ArrayXXcf (internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    
+    CALL_SUBTEST_11( real_complex<Matrix4cd>() );
+    CALL_SUBTEST_11( real_complex<MatrixXcf>(10,10) );
+    CALL_SUBTEST_11( real_complex<ArrayXXcf>(10,10) );
+  }
+  
+#ifdef EIGEN_TEST_PART_4
+  {
+    // make sure that /=scalar and /scalar do not overflow
+    // rational: 1.0/4.94e-320 overflow, but m/4.94e-320 should not
+    Matrix4d m2, m3;
+    m3 = m2 =  Matrix4d::Random()*1e-20;
+    m2 = m2 / 4.9e-320;
+    VERIFY_IS_APPROX(m2.cwiseQuotient(m2), Matrix4d::Ones());
+    m3 /= 4.9e-320;
+    VERIFY_IS_APPROX(m3.cwiseQuotient(m3), Matrix4d::Ones());
+    
+    
   }
+#endif
 }
diff --git a/test/lscg.cpp b/test/lscg.cpp
new file mode 100644
index 000000000..daa62a954
--- /dev/null
+++ b/test/lscg.cpp
@@ -0,0 +1,29 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Gael Guennebaud <g.gael@free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "sparse_solver.h"
+#include <Eigen/IterativeLinearSolvers>
+
+template<typename T> void test_lscg_T()
+{
+  LeastSquaresConjugateGradient<SparseMatrix<T> > lscg_colmajor_diag;
+  LeastSquaresConjugateGradient<SparseMatrix<T>, IdentityPreconditioner> lscg_colmajor_I;
+
+  CALL_SUBTEST( check_sparse_square_solving(lscg_colmajor_diag)  );
+  CALL_SUBTEST( check_sparse_square_solving(lscg_colmajor_I)     );
+  
+  CALL_SUBTEST( check_sparse_leastsquare_solving(lscg_colmajor_diag)  );
+  CALL_SUBTEST( check_sparse_leastsquare_solving(lscg_colmajor_I)     );
+}
+
+void test_lscg()
+{
+  CALL_SUBTEST_1(test_lscg_T<double>());
+  CALL_SUBTEST_2(test_lscg_T<std::complex<double> >());
+}
diff --git a/test/lu.cpp b/test/lu.cpp
index 374652694..9787f4d86 100644
--- a/test/lu.cpp
+++ b/test/lu.cpp
@@ -11,6 +11,11 @@
 #include <Eigen/LU>
 using namespace std;
 
+template<typename MatrixType>
+typename MatrixType::RealScalar matrix_l1_norm(const MatrixType& m) {
+  return m.cwiseAbs().colwise().sum().maxCoeff();
+}
+
 template<typename MatrixType> void lu_non_invertible()
 {
   typedef typename MatrixType::Index Index;
@@ -92,6 +97,26 @@ template<typename MatrixType> void lu_non_invertible()
   // test that the code, which does resize(), may be applied to an xpr
   m2.block(0,0,m2.rows(),m2.cols()) = lu.solve(m3);
   VERIFY_IS_APPROX(m3, m1*m2);
+
+  // test solve with transposed
+  m3 = MatrixType::Random(rows,cols2);
+  m2 = m1.transpose()*m3;
+  m3 = MatrixType::Random(rows,cols2);
+  lu.template _solve_impl_transposed<false>(m2, m3);
+  VERIFY_IS_APPROX(m2, m1.transpose()*m3);
+  m3 = MatrixType::Random(rows,cols2);
+  m3 = lu.transpose().solve(m2);
+  VERIFY_IS_APPROX(m2, m1.transpose()*m3);
+
+  // test solve with conjugate transposed
+  m3 = MatrixType::Random(rows,cols2);
+  m2 = m1.adjoint()*m3;
+  m3 = MatrixType::Random(rows,cols2);
+  lu.template _solve_impl_transposed<true>(m2, m3);
+  VERIFY_IS_APPROX(m2, m1.adjoint()*m3);
+  m3 = MatrixType::Random(rows,cols2);
+  m3 = lu.adjoint().solve(m2);
+  VERIFY_IS_APPROX(m2, m1.adjoint()*m3);
 }
 
 template<typename MatrixType> void lu_invertible()
@@ -100,9 +125,9 @@ template<typename MatrixType> void lu_invertible()
      LU.h
   */
   typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
-  DenseIndex size = MatrixType::RowsAtCompileTime;
+  Index size = MatrixType::RowsAtCompileTime;
   if( size==Dynamic)
-    size = internal::random<DenseIndex>(1,EIGEN_TEST_MAX_SIZE);
+    size = internal::random<Index>(1,EIGEN_TEST_MAX_SIZE);
 
   MatrixType m1(size, size), m2(size, size), m3(size, size);
   FullPivLU<MatrixType> lu;
@@ -123,7 +148,28 @@ template<typename MatrixType> void lu_invertible()
   m3 = MatrixType::Random(size,size);
   m2 = lu.solve(m3);
   VERIFY_IS_APPROX(m3, m1*m2);
-  VERIFY_IS_APPROX(m2, lu.inverse()*m3);
+  MatrixType m1_inverse = lu.inverse();
+  VERIFY_IS_APPROX(m2, m1_inverse*m3);
+
+  RealScalar rcond = (RealScalar(1) / matrix_l1_norm(m1)) / matrix_l1_norm(m1_inverse);
+  const RealScalar rcond_est = lu.rcond();
+  // Verify that the estimated condition number is within a factor of 10 of the
+  // truth.
+  VERIFY(rcond_est > rcond / 10 && rcond_est < rcond * 10);
+
+  // test solve with transposed
+  lu.template _solve_impl_transposed<false>(m3, m2);
+  VERIFY_IS_APPROX(m3, m1.transpose()*m2);
+  m3 = MatrixType::Random(size,size);
+  m3 = lu.transpose().solve(m2);
+  VERIFY_IS_APPROX(m2, m1.transpose()*m3);
+
+  // test solve with conjugate transposed
+  lu.template _solve_impl_transposed<true>(m3, m2);
+  VERIFY_IS_APPROX(m3, m1.adjoint()*m2);
+  m3 = MatrixType::Random(size,size);
+  m3 = lu.adjoint().solve(m2);
+  VERIFY_IS_APPROX(m2, m1.adjoint()*m3);
 
   // Regression test for Bug 302
   MatrixType m4 = MatrixType::Random(size,size);
@@ -136,14 +182,39 @@ template<typename MatrixType> void lu_partial_piv()
      PartialPivLU.h
   */
   typedef typename MatrixType::Index Index;
-  Index rows = internal::random<Index>(1,4);
-  Index cols = rows;
+  typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
+  Index size = internal::random<Index>(1,4);
 
-  MatrixType m1(cols, rows);
+  MatrixType m1(size, size), m2(size, size), m3(size, size);
   m1.setRandom();
   PartialPivLU<MatrixType> plu(m1);
 
   VERIFY_IS_APPROX(m1, plu.reconstructedMatrix());
+
+  m3 = MatrixType::Random(size,size);
+  m2 = plu.solve(m3);
+  VERIFY_IS_APPROX(m3, m1*m2);
+  MatrixType m1_inverse = plu.inverse();
+  VERIFY_IS_APPROX(m2, m1_inverse*m3);
+
+  RealScalar rcond = (RealScalar(1) / matrix_l1_norm(m1)) / matrix_l1_norm(m1_inverse);
+  const RealScalar rcond_est = plu.rcond();
+  // Verify that the estimate is within a factor of 10 of the truth.
+  VERIFY(rcond_est > rcond / 10 && rcond_est < rcond * 10);
+
+  // test solve with transposed
+  plu.template _solve_impl_transposed<false>(m3, m2);
+  VERIFY_IS_APPROX(m3, m1.transpose()*m2);
+  m3 = MatrixType::Random(size,size);
+  m3 = plu.transpose().solve(m2);
+  VERIFY_IS_APPROX(m2, m1.transpose()*m3);
+
+  // test solve with conjugate transposed
+  plu.template _solve_impl_transposed<true>(m3, m2);
+  VERIFY_IS_APPROX(m3, m1.adjoint()*m2);
+  m3 = MatrixType::Random(size,size);
+  m3 = plu.adjoint().solve(m2);
+  VERIFY_IS_APPROX(m2, m1.adjoint()*m3);
 }
 
 template<typename MatrixType> void lu_verify_assert()
diff --git a/test/main.h b/test/main.h
index 664204866..25d2dcf43 100644
--- a/test/main.h
+++ b/test/main.h
@@ -41,7 +41,14 @@
 #include <complex>
 #include <deque>
 #include <queue>
+#include <cassert>
 #include <list>
+#if __cplusplus >= 201103L
+#include <random>
+#ifdef EIGEN_USE_THREADS
+#include <future>
+#endif
+#endif
 
 // To test that all calls from Eigen code to std::min() and std::max() are
 // protected by parenthesis against macro expansion, the min()/max() macros
@@ -49,14 +56,48 @@
 // compiler error.
 #define min(A,B) please_protect_your_min_with_parentheses
 #define max(A,B) please_protect_your_max_with_parentheses
+#define isnan(X) please_protect_your_isnan_with_parentheses
+#define isinf(X) please_protect_your_isinf_with_parentheses
+#define isfinite(X) please_protect_your_isfinite_with_parentheses
+#ifdef M_PI
+#undef M_PI
+#endif
+#define M_PI please_use_EIGEN_PI_instead_of_M_PI
 
 #define FORBIDDEN_IDENTIFIER (this_identifier_is_forbidden_to_avoid_clashes) this_identifier_is_forbidden_to_avoid_clashes
 // B0 is defined in POSIX header termios.h
 #define B0 FORBIDDEN_IDENTIFIER
 
+// Unit tests calling Eigen's blas library must preserve the default blocking size
+// to avoid troubles.
+#ifndef EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS
+#define EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
+#endif
 
 // shuts down ICC's remark #593: variable "XXX" was set but never used
-#define TEST_SET_BUT_UNUSED_VARIABLE(X) X = X + 0;
+#define TEST_SET_BUT_UNUSED_VARIABLE(X) EIGEN_UNUSED_VARIABLE(X)
+
+#ifdef TEST_ENABLE_TEMPORARY_TRACKING
+
+static long int nb_temporaries;
+static long int nb_temporaries_on_assert = -1;
+
+inline void on_temporary_creation(long int size) {
+  // here's a great place to set a breakpoint when debugging failures in this test!
+  if(size!=0) nb_temporaries++;
+  if(nb_temporaries_on_assert>0) assert(nb_temporaries<nb_temporaries_on_assert);
+}
+
+#define EIGEN_DENSE_STORAGE_CTOR_PLUGIN { on_temporary_creation(size); }
+
+#define VERIFY_EVALUATION_COUNT(XPR,N) {\
+    nb_temporaries = 0; \
+    XPR; \
+    if(nb_temporaries!=N) { std::cerr << "nb_temporaries == " << nb_temporaries << "\n"; }\
+    VERIFY( (#XPR) && nb_temporaries==N ); \
+  }
+  
+#endif
 
 // the following file is automatically generated by cmake
 #include "split_test_helper.h"
@@ -71,7 +112,7 @@
 #endif
 
 // bounds integer values for AltiVec
-#ifdef __ALTIVEC__
+#if defined(__ALTIVEC__) || defined(__VSX__)
 #define EIGEN_MAKING_DOCS
 #endif
 
@@ -84,16 +125,26 @@
 namespace Eigen
 {
   static std::vector<std::string> g_test_stack;
+  // level == 0 <=> abort if test fail
+  // level >= 1 <=> warning message to std::cerr if test fail
+  static int g_test_level = 0;
   static int g_repeat;
   static unsigned int g_seed;
   static bool g_has_set_repeat, g_has_set_seed;
 }
 
+#define TRACK std::cerr << __FILE__ << " " << __LINE__ << std::endl
+// #define TRACK while()
+
 #define EI_PP_MAKE_STRING2(S) #S
 #define EI_PP_MAKE_STRING(S) EI_PP_MAKE_STRING2(S)
 
 #define EIGEN_DEFAULT_IO_FORMAT IOFormat(4, 0, "  ", "\n", "", "", "", "")
 
+#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__)
+  #define EIGEN_EXCEPTIONS
+#endif
+
 #ifndef EIGEN_NO_ASSERTION_CHECKING
 
   namespace Eigen
@@ -135,33 +186,35 @@ namespace Eigen
         if(report_on_cerr_on_assert_failure) \
           std::cerr <<  #a << " " __FILE__ << "(" << __LINE__ << ")\n"; \
         Eigen::no_more_assert = true;       \
-        throw Eigen::eigen_assert_exception(); \
+        EIGEN_THROW_X(Eigen::eigen_assert_exception()); \
       }                                     \
       else if (Eigen::internal::push_assert)       \
       {                                     \
         eigen_assert_list.push_back(std::string(EI_PP_MAKE_STRING(__FILE__) " (" EI_PP_MAKE_STRING(__LINE__) ") : " #a) ); \
       }
 
+    #ifdef EIGEN_EXCEPTIONS
     #define VERIFY_RAISES_ASSERT(a)                                                   \
       {                                                                               \
         Eigen::no_more_assert = false;                                                \
-        Eigen::eigen_assert_list.clear();                                                \
-        Eigen::internal::push_assert = true;                                                 \
+        Eigen::eigen_assert_list.clear();                                             \
+        Eigen::internal::push_assert = true;                                          \
         Eigen::report_on_cerr_on_assert_failure = false;                              \
         try {                                                                         \
           a;                                                                          \
           std::cerr << "One of the following asserts should have been triggered:\n";  \
-          for (uint ai=0 ; ai<eigen_assert_list.size() ; ++ai)                           \
-            std::cerr << "  " << eigen_assert_list[ai] << "\n";                          \
+          for (uint ai=0 ; ai<eigen_assert_list.size() ; ++ai)                        \
+            std::cerr << "  " << eigen_assert_list[ai] << "\n";                       \
           VERIFY(Eigen::should_raise_an_assert && # a);                               \
-        } catch (Eigen::eigen_assert_exception) {                                        \
-          Eigen::internal::push_assert = false; VERIFY(true);                                \
+        } catch (Eigen::eigen_assert_exception) {                                     \
+          Eigen::internal::push_assert = false; VERIFY(true);                         \
         }                                                                             \
         Eigen::report_on_cerr_on_assert_failure = true;                               \
-        Eigen::internal::push_assert = false;                                                \
+        Eigen::internal::push_assert = false;                                         \
       }
+    #endif //EIGEN_EXCEPTIONS
 
-  #else // EIGEN_DEBUG_ASSERTS
+  #elif !defined(__CUDACC__) // EIGEN_DEBUG_ASSERTS
     // see bug 89. The copy_bool here is working around a bug in gcc <= 4.3
     #define eigen_assert(a) \
       if( (!Eigen::internal::copy_bool(a)) && (!no_more_assert) )\
@@ -170,22 +223,30 @@ namespace Eigen
         if(report_on_cerr_on_assert_failure)  \
           eigen_plain_assert(a);              \
         else                                  \
-          throw Eigen::eigen_assert_exception(); \
+          EIGEN_THROW_X(Eigen::eigen_assert_exception()); \
       }
-    #define VERIFY_RAISES_ASSERT(a) {                             \
+    #ifdef EIGEN_EXCEPTIONS
+      #define VERIFY_RAISES_ASSERT(a) {                           \
         Eigen::no_more_assert = false;                            \
         Eigen::report_on_cerr_on_assert_failure = false;          \
         try {                                                     \
           a;                                                      \
           VERIFY(Eigen::should_raise_an_assert && # a);           \
         }                                                         \
-        catch (Eigen::eigen_assert_exception&) { VERIFY(true); }     \
+        catch (Eigen::eigen_assert_exception&) { VERIFY(true); }  \
         Eigen::report_on_cerr_on_assert_failure = true;           \
       }
-
+    #endif //EIGEN_EXCEPTIONS
   #endif // EIGEN_DEBUG_ASSERTS
 
+#ifndef VERIFY_RAISES_ASSERT
+  #define VERIFY_RAISES_ASSERT(a) \
+    std::cout << "Can't VERIFY_RAISES_ASSERT( " #a " ) with exceptions disabled\n";
+#endif
+    
+  #if !defined(__CUDACC__)
   #define EIGEN_USE_CUSTOM_ASSERT
+  #endif
 
 #else // EIGEN_NO_ASSERTION_CHECKING
 
@@ -201,6 +262,8 @@ inline void verify_impl(bool condition, const char *testname, const char *file,
 {
   if (!condition)
   {
+    if(Eigen::g_test_level>0)
+      std::cerr << "WARNING: ";
     std::cerr << "Test " << testname << " failed in " << file << " (" << line << ")"
       << std::endl << "    " << condition_as_string << std::endl;
     std::cerr << "Stack:\n";
@@ -208,14 +271,20 @@ inline void verify_impl(bool condition, const char *testname, const char *file,
     for(int i=test_stack_size-1; i>=0; --i)
       std::cerr << "  - " << Eigen::g_test_stack[i] << "\n";
     std::cerr << "\n";
-    abort();
+    if(Eigen::g_test_level==0)
+      abort();
   }
 }
 
 #define VERIFY(a) ::verify_impl(a, g_test_stack.back().c_str(), __FILE__, __LINE__, EI_PP_MAKE_STRING(a))
 
-#define VERIFY_IS_EQUAL(a, b) VERIFY(test_is_equal(a, b))
-#define VERIFY_IS_APPROX(a, b) VERIFY(test_isApprox(a, b))
+#define VERIFY_GE(a, b) ::verify_impl(a >= b, g_test_stack.back().c_str(), __FILE__, __LINE__, EI_PP_MAKE_STRING(a >= b))
+#define VERIFY_LE(a, b) ::verify_impl(a <= b, g_test_stack.back().c_str(), __FILE__, __LINE__, EI_PP_MAKE_STRING(a <= b))
+
+
+#define VERIFY_IS_EQUAL(a, b) VERIFY(test_is_equal(a, b, true))
+#define VERIFY_IS_NOT_EQUAL(a, b) VERIFY(test_is_equal(a, b, false))
+#define VERIFY_IS_APPROX(a, b) VERIFY(verifyIsApprox(a, b))
 #define VERIFY_IS_NOT_APPROX(a, b) VERIFY(!test_isApprox(a, b))
 #define VERIFY_IS_MUCH_SMALLER_THAN(a, b) VERIFY(test_isMuchSmallerThan(a, b))
 #define VERIFY_IS_NOT_MUCH_SMALLER_THAN(a, b) VERIFY(!test_isMuchSmallerThan(a, b))
@@ -236,9 +305,10 @@ namespace Eigen {
 template<typename T> inline typename NumTraits<T>::Real test_precision() { return NumTraits<T>::dummy_precision(); }
 template<> inline float test_precision<float>() { return 1e-3f; }
 template<> inline double test_precision<double>() { return 1e-6; }
+template<> inline long double test_precision<long double>() { return 1e-6l; }
 template<> inline float test_precision<std::complex<float> >() { return test_precision<float>(); }
 template<> inline double test_precision<std::complex<double> >() { return test_precision<double>(); }
-template<> inline long double test_precision<long double>() { return 1e-6; }
+template<> inline long double test_precision<std::complex<long double> >() { return test_precision<long double>(); }
 
 inline bool test_isApprox(const int& a, const int& b)
 { return internal::isApprox(a, b, test_precision<int>()); }
@@ -253,14 +323,15 @@ inline bool test_isMuchSmallerThan(const float& a, const float& b)
 { return internal::isMuchSmallerThan(a, b, test_precision<float>()); }
 inline bool test_isApproxOrLessThan(const float& a, const float& b)
 { return internal::isApproxOrLessThan(a, b, test_precision<float>()); }
+
 inline bool test_isApprox(const double& a, const double& b)
 { return internal::isApprox(a, b, test_precision<double>()); }
-
 inline bool test_isMuchSmallerThan(const double& a, const double& b)
 { return internal::isMuchSmallerThan(a, b, test_precision<double>()); }
 inline bool test_isApproxOrLessThan(const double& a, const double& b)
 { return internal::isApproxOrLessThan(a, b, test_precision<double>()); }
 
+#ifndef EIGEN_TEST_NO_COMPLEX
 inline bool test_isApprox(const std::complex<float>& a, const std::complex<float>& b)
 { return internal::isApprox(a, b, test_precision<std::complex<float> >()); }
 inline bool test_isMuchSmallerThan(const std::complex<float>& a, const std::complex<float>& b)
@@ -271,6 +342,15 @@ inline bool test_isApprox(const std::complex<double>& a, const std::complex<doub
 inline bool test_isMuchSmallerThan(const std::complex<double>& a, const std::complex<double>& b)
 { return internal::isMuchSmallerThan(a, b, test_precision<std::complex<double> >()); }
 
+#ifndef EIGEN_TEST_NO_LONGDOUBLE
+inline bool test_isApprox(const std::complex<long double>& a, const std::complex<long double>& b)
+{ return internal::isApprox(a, b, test_precision<std::complex<long double> >()); }
+inline bool test_isMuchSmallerThan(const std::complex<long double>& a, const std::complex<long double>& b)
+{ return internal::isMuchSmallerThan(a, b, test_precision<std::complex<long double> >()); }
+#endif
+#endif
+
+#ifndef EIGEN_TEST_NO_LONGDOUBLE
 inline bool test_isApprox(const long double& a, const long double& b)
 {
     bool ret = internal::isApprox(a, b, test_precision<long double>());
@@ -284,13 +364,127 @@ inline bool test_isMuchSmallerThan(const long double& a, const long double& b)
 { return internal::isMuchSmallerThan(a, b, test_precision<long double>()); }
 inline bool test_isApproxOrLessThan(const long double& a, const long double& b)
 { return internal::isApproxOrLessThan(a, b, test_precision<long double>()); }
+#endif // EIGEN_TEST_NO_LONGDOUBLE
+
+inline bool test_isApprox(const half& a, const half& b)
+{ return internal::isApprox(a, b, test_precision<half>()); }
+inline bool test_isMuchSmallerThan(const half& a, const half& b)
+{ return internal::isMuchSmallerThan(a, b, test_precision<half>()); }
+inline bool test_isApproxOrLessThan(const half& a, const half& b)
+{ return internal::isApproxOrLessThan(a, b, test_precision<half>()); }
+
+// test_relative_error returns the relative difference between a and b as a real scalar as used in isApprox.
+template<typename T1,typename T2>
+typename NumTraits<typename T1::RealScalar>::NonInteger test_relative_error(const EigenBase<T1> &a, const EigenBase<T2> &b)
+{
+  using std::sqrt;
+  typedef typename NumTraits<typename T1::RealScalar>::NonInteger RealScalar;
+  typename internal::nested_eval<T1,2>::type ea(a.derived());
+  typename internal::nested_eval<T2,2>::type eb(b.derived());
+  return sqrt(RealScalar((ea-eb).cwiseAbs2().sum()) / RealScalar((std::min)(eb.cwiseAbs2().sum(),ea.cwiseAbs2().sum())));
+}
+
+template<typename T1,typename T2>
+typename T1::RealScalar test_relative_error(const T1 &a, const T2 &b, const typename T1::Coefficients* = 0)
+{
+  return test_relative_error(a.coeffs(), b.coeffs());
+}
+
+template<typename T1,typename T2>
+typename T1::Scalar test_relative_error(const T1 &a, const T2 &b, const typename T1::MatrixType* = 0)
+{
+  return test_relative_error(a.matrix(), b.matrix());
+}
+
+template<typename S, int D>
+S test_relative_error(const Translation<S,D> &a, const Translation<S,D> &b)
+{
+  return test_relative_error(a.vector(), b.vector());
+}
+
+template <typename S, int D, int O>
+S test_relative_error(const ParametrizedLine<S,D,O> &a, const ParametrizedLine<S,D,O> &b)
+{
+  return (std::max)(test_relative_error(a.origin(), b.origin()), test_relative_error(a.origin(), b.origin()));
+}
+
+template <typename S, int D>
+S test_relative_error(const AlignedBox<S,D> &a, const AlignedBox<S,D> &b)
+{
+  return (std::max)(test_relative_error((a.min)(), (b.min)()), test_relative_error((a.max)(), (b.max)()));
+}
+
+template<typename Derived> class SparseMatrixBase;
+template<typename T1,typename T2>
+typename T1::RealScalar test_relative_error(const MatrixBase<T1> &a, const SparseMatrixBase<T2> &b)
+{
+  return test_relative_error(a,b.toDense());
+}
+
+template<typename Derived> class SparseMatrixBase;
+template<typename T1,typename T2>
+typename T1::RealScalar test_relative_error(const SparseMatrixBase<T1> &a, const MatrixBase<T2> &b)
+{
+  return test_relative_error(a.toDense(),b);
+}
+
+template<typename Derived> class SparseMatrixBase;
+template<typename T1,typename T2>
+typename T1::RealScalar test_relative_error(const SparseMatrixBase<T1> &a, const SparseMatrixBase<T2> &b)
+{
+  return test_relative_error(a.toDense(),b.toDense());
+}
+
+template<typename T1,typename T2>
+typename NumTraits<typename NumTraits<T1>::Real>::NonInteger test_relative_error(const T1 &a, const T2 &b, typename internal::enable_if<internal::is_arithmetic<typename NumTraits<T1>::Real>::value, T1>::type* = 0)
+{
+  typedef typename NumTraits<typename NumTraits<T1>::Real>::NonInteger RealScalar;
+  return numext::sqrt(RealScalar(numext::abs2(a-b))/RealScalar((numext::mini)(numext::abs2(a),numext::abs2(b))));
+}
+
+template<typename T>
+T test_relative_error(const Rotation2D<T> &a, const Rotation2D<T> &b)
+{
+  return test_relative_error(a.angle(), b.angle());
+}
+
+template<typename T>
+T test_relative_error(const AngleAxis<T> &a, const AngleAxis<T> &b)
+{
+  return (std::max)(test_relative_error(a.angle(), b.angle()), test_relative_error(a.axis(), b.axis()));
+}
 
 template<typename Type1, typename Type2>
-inline bool test_isApprox(const Type1& a, const Type2& b)
+inline bool test_isApprox(const Type1& a, const Type2& b, typename Type1::Scalar* = 0) // Enabled for Eigen's type only
 {
   return a.isApprox(b, test_precision<typename Type1::Scalar>());
 }
 
+// get_test_precision is a small wrapper to test_precision allowing to return the scalar precision for either scalars or expressions
+template<typename T>
+typename NumTraits<typename T::Scalar>::Real get_test_precision(const T&, const typename T::Scalar* = 0)
+{
+  return test_precision<typename NumTraits<typename T::Scalar>::Real>();
+}
+
+template<typename T>
+typename NumTraits<T>::Real get_test_precision(const T&,typename internal::enable_if<internal::is_arithmetic<typename NumTraits<T>::Real>::value, T>::type* = 0)
+{
+  return test_precision<typename NumTraits<T>::Real>();
+}
+
+// verifyIsApprox is a wrapper to test_isApprox that outputs the relative difference magnitude if the test fails.
+template<typename Type1, typename Type2>
+inline bool verifyIsApprox(const Type1& a, const Type2& b)
+{
+  bool ret = test_isApprox(a,b);
+  if(!ret)
+  {
+    std::cerr << "Difference too large wrt tolerance " << get_test_precision(a)  << ", relative error is: " << test_relative_error(a,b) << std::endl;
+  }
+  return ret;
+}
+
 // The idea behind this function is to compare the two scalars a and b where
 // the scalar ref is a hint about the expected order of magnitude of a and b.
 // WARNING: the scalar a and b must be positive
@@ -326,17 +520,17 @@ inline bool test_isUnitary(const MatrixBase<Derived>& m)
 
 // Forward declaration to avoid ICC warning
 template<typename T, typename U>
-bool test_is_equal(const T& actual, const U& expected);
+bool test_is_equal(const T& actual, const U& expected, bool expect_equal=true);
 
 template<typename T, typename U>
-bool test_is_equal(const T& actual, const U& expected)
+bool test_is_equal(const T& actual, const U& expected, bool expect_equal)
 {
-    if (actual==expected)
+    if ((actual==expected) == expect_equal)
         return true;
     // false:
     std::cerr
-        << std::endl << "    actual   = " << actual
-        << std::endl << "    expected = " << expected << std::endl << std::endl;
+        << "\n    actual   = " << actual
+        << "\n    expected " << (expect_equal ? "= " : "!=") << expected << "\n\n";
     return false;
 }
 
@@ -347,11 +541,10 @@ bool test_is_equal(const T& actual, const U& expected)
   */
 // Forward declaration to avoid ICC warning
 template<typename MatrixType>
-void createRandomPIMatrixOfRank(typename MatrixType::Index desired_rank, typename MatrixType::Index rows, typename MatrixType::Index cols, MatrixType& m);
+void createRandomPIMatrixOfRank(Index desired_rank, Index rows, Index cols, MatrixType& m);
 template<typename MatrixType>
-void createRandomPIMatrixOfRank(typename MatrixType::Index desired_rank, typename MatrixType::Index rows, typename MatrixType::Index cols, MatrixType& m)
+void createRandomPIMatrixOfRank(Index desired_rank, Index rows, Index cols, MatrixType& m)
 {
-  typedef typename internal::traits<MatrixType>::Index Index;
   typedef typename internal::traits<MatrixType>::Scalar Scalar;
   enum { Rows = MatrixType::RowsAtCompileTime, Cols = MatrixType::ColsAtCompileTime };
 
@@ -388,11 +581,10 @@ void createRandomPIMatrixOfRank(typename MatrixType::Index desired_rank, typenam
 
 // Forward declaration to avoid ICC warning
 template<typename PermutationVectorType>
-void randomPermutationVector(PermutationVectorType& v, typename PermutationVectorType::Index size);
+void randomPermutationVector(PermutationVectorType& v, Index size);
 template<typename PermutationVectorType>
-void randomPermutationVector(PermutationVectorType& v, typename PermutationVectorType::Index size)
+void randomPermutationVector(PermutationVectorType& v, Index size)
 {
-  typedef typename PermutationVectorType::Index Index;
   typedef typename PermutationVectorType::Scalar Scalar;
   v.resize(size);
   for(Index i = 0; i < size; ++i) v(i) = Scalar(i);
@@ -411,12 +603,7 @@ template<typename T> bool isNotNaN(const T& x)
   return x==x;
 }
 
-template<typename T> bool isNaN(const T& x)
-{
-  return x!=x;
-}
-
-template<typename T> bool isInf(const T& x)
+template<typename T> bool isPlusInf(const T& x)
 {
   return x > NumTraits<T>::highest();
 }
@@ -437,13 +624,15 @@ template<typename T> struct GetDifferentType<std::complex<T> >
 
 // Forward declaration to avoid ICC warning
 template<typename T> std::string type_name();
-template<typename T> std::string type_name()              { return "other"; }
-template<> std::string type_name<float>()                 { return "float"; }
-template<> std::string type_name<double>()                { return "double"; }
-template<> std::string type_name<int>()                   { return "int"; }
-template<> std::string type_name<std::complex<float> >()  { return "complex<float>"; }
-template<> std::string type_name<std::complex<double> >() { return "complex<double>"; }
-template<> std::string type_name<std::complex<int> >()    { return "complex<int>"; }
+template<typename T> std::string type_name()                    { return "other"; }
+template<> std::string type_name<float>()                       { return "float"; }
+template<> std::string type_name<double>()                      { return "double"; }
+template<> std::string type_name<long double>()                 { return "long double"; }
+template<> std::string type_name<int>()                         { return "int"; }
+template<> std::string type_name<std::complex<float> >()        { return "complex<float>"; }
+template<> std::string type_name<std::complex<double> >()       { return "complex<double>"; }
+template<> std::string type_name<std::complex<long double> >()  { return "complex<long double>"; }
+template<> std::string type_name<std::complex<int> >()          { return "complex<int>"; }
 
 // forward declaration of the main test function
 void EIGEN_CAT(test_,EIGEN_TEST_FUNC)();
@@ -550,3 +739,8 @@ int main(int argc, char *argv[])
   // remark #1572: floating-point equality and inequality comparisons are unreliable
   #pragma warning disable 279 383 1418 1572
 #endif
+
+#ifdef _MSC_VER
+  // 4503 - decorated name length exceeded, name was truncated
+  #pragma warning( disable : 4503)
+#endif
diff --git a/test/mapped_matrix.cpp b/test/mapped_matrix.cpp
index 58904fa37..6a84c5897 100644
--- a/test/mapped_matrix.cpp
+++ b/test/mapped_matrix.cpp
@@ -13,6 +13,8 @@
 
 #include "main.h"
 
+#define EIGEN_TESTMAP_MAX_SIZE 256
+
 template<typename VectorType> void map_class_vector(const VectorType& m)
 {
   typedef typename VectorType::Index Index;
@@ -20,23 +22,26 @@ template<typename VectorType> void map_class_vector(const VectorType& m)
 
   Index size = m.size();
 
-  // test Map.h
   Scalar* array1 = internal::aligned_new<Scalar>(size);
   Scalar* array2 = internal::aligned_new<Scalar>(size);
   Scalar* array3 = new Scalar[size+1];
-  Scalar* array3unaligned = size_t(array3)%16 == 0 ? array3+1 : array3;
+  Scalar* array3unaligned = (internal::UIntPtr(array3)%EIGEN_MAX_ALIGN_BYTES) == 0 ? array3+1 : array3;
+  Scalar  array4[EIGEN_TESTMAP_MAX_SIZE];
 
-  Map<VectorType, Aligned>(array1, size) = VectorType::Random(size);
-  Map<VectorType, Aligned>(array2, size) = Map<VectorType,Aligned>(array1, size);
+  Map<VectorType, AlignedMax>(array1, size) = VectorType::Random(size);
+  Map<VectorType, AlignedMax>(array2, size) = Map<VectorType,AlignedMax>(array1, size);
   Map<VectorType>(array3unaligned, size) = Map<VectorType>(array1, size);
-  VectorType ma1 = Map<VectorType, Aligned>(array1, size);
-  VectorType ma2 = Map<VectorType, Aligned>(array2, size);
+  Map<VectorType>(array4, size)          = Map<VectorType,AlignedMax>(array1, size);
+  VectorType ma1 = Map<VectorType, AlignedMax>(array1, size);
+  VectorType ma2 = Map<VectorType, AlignedMax>(array2, size);
   VectorType ma3 = Map<VectorType>(array3unaligned, size);
+  VectorType ma4 = Map<VectorType>(array4, size);
   VERIFY_IS_EQUAL(ma1, ma2);
   VERIFY_IS_EQUAL(ma1, ma3);
+  VERIFY_IS_EQUAL(ma1, ma4);
   #ifdef EIGEN_VECTORIZE
-  if(internal::packet_traits<Scalar>::Vectorizable)
-    VERIFY_RAISES_ASSERT((Map<VectorType,Aligned>(array3unaligned, size)))
+  if(internal::packet_traits<Scalar>::Vectorizable && size>=AlignedMax)
+    VERIFY_RAISES_ASSERT((Map<VectorType,AlignedMax>(array3unaligned, size)))
   #endif
 
   internal::aligned_delete(array1, size);
@@ -50,23 +55,64 @@ template<typename MatrixType> void map_class_matrix(const MatrixType& m)
   typedef typename MatrixType::Scalar Scalar;
 
   Index rows = m.rows(), cols = m.cols(), size = rows*cols;
+  Scalar s1 = internal::random<Scalar>();
 
-  // test Map.h
+  // array1 and array2 -> aligned heap allocation
   Scalar* array1 = internal::aligned_new<Scalar>(size);
   for(int i = 0; i < size; i++) array1[i] = Scalar(1);
   Scalar* array2 = internal::aligned_new<Scalar>(size);
   for(int i = 0; i < size; i++) array2[i] = Scalar(1);
+  // array3unaligned -> unaligned pointer to heap
   Scalar* array3 = new Scalar[size+1];
   for(int i = 0; i < size+1; i++) array3[i] = Scalar(1);
-  Scalar* array3unaligned = size_t(array3)%16 == 0 ? array3+1 : array3;
-  Map<MatrixType, Aligned>(array1, rows, cols) = MatrixType::Ones(rows,cols);
-  Map<MatrixType>(array2, rows, cols) = Map<MatrixType>(array1, rows, cols);
-  Map<MatrixType>(array3unaligned, rows, cols) = Map<MatrixType>(array1, rows, cols);
-  MatrixType ma1 = Map<MatrixType>(array1, rows, cols);
-  MatrixType ma2 = Map<MatrixType, Aligned>(array2, rows, cols);
+  Scalar* array3unaligned = internal::UIntPtr(array3)%EIGEN_MAX_ALIGN_BYTES == 0 ? array3+1 : array3;
+  Scalar array4[256];
+  if(size<=256)
+    for(int i = 0; i < size; i++) array4[i] = Scalar(1);
+  
+  Map<MatrixType> map1(array1, rows, cols);
+  Map<MatrixType, AlignedMax> map2(array2, rows, cols);
+  Map<MatrixType> map3(array3unaligned, rows, cols);
+  Map<MatrixType> map4(array4, rows, cols);
+  
+  VERIFY_IS_EQUAL(map1, MatrixType::Ones(rows,cols));
+  VERIFY_IS_EQUAL(map2, MatrixType::Ones(rows,cols));
+  VERIFY_IS_EQUAL(map3, MatrixType::Ones(rows,cols));
+  map1 = MatrixType::Random(rows,cols);
+  map2 = map1;
+  map3 = map1;
+  MatrixType ma1 = map1;
+  MatrixType ma2 = map2;
+  MatrixType ma3 = map3;
+  VERIFY_IS_EQUAL(map1, map2);
+  VERIFY_IS_EQUAL(map1, map3);
   VERIFY_IS_EQUAL(ma1, ma2);
-  MatrixType ma3 = Map<MatrixType>(array3unaligned, rows, cols);
   VERIFY_IS_EQUAL(ma1, ma3);
+  VERIFY_IS_EQUAL(ma1, map3);
+  
+  VERIFY_IS_APPROX(s1*map1, s1*map2);
+  VERIFY_IS_APPROX(s1*ma1, s1*ma2);
+  VERIFY_IS_EQUAL(s1*ma1, s1*ma3);
+  VERIFY_IS_APPROX(s1*map1, s1*map3);
+  
+  map2 *= s1;
+  map3 *= s1;
+  VERIFY_IS_APPROX(s1*map1, map2);
+  VERIFY_IS_APPROX(s1*map1, map3);
+  
+  if(size<=256)
+  {
+    VERIFY_IS_EQUAL(map4, MatrixType::Ones(rows,cols));
+    map4 = map1;
+    MatrixType ma4 = map4;
+    VERIFY_IS_EQUAL(map1, map4);
+    VERIFY_IS_EQUAL(ma1, map4);
+    VERIFY_IS_EQUAL(ma1, ma4);
+    VERIFY_IS_APPROX(s1*map1, s1*map4);
+    
+    map4 *= s1;
+    VERIFY_IS_APPROX(s1*map1, map4);
+  }
 
   internal::aligned_delete(array1, size);
   internal::aligned_delete(array2, size);
@@ -80,11 +126,10 @@ template<typename VectorType> void map_static_methods(const VectorType& m)
 
   Index size = m.size();
 
-  // test Map.h
   Scalar* array1 = internal::aligned_new<Scalar>(size);
   Scalar* array2 = internal::aligned_new<Scalar>(size);
   Scalar* array3 = new Scalar[size+1];
-  Scalar* array3unaligned = size_t(array3)%16 == 0 ? array3+1 : array3;
+  Scalar* array3unaligned = internal::UIntPtr(array3)%EIGEN_MAX_ALIGN_BYTES == 0 ? array3+1 : array3;
 
   VectorType::MapAligned(array1, size) = VectorType::Random(size);
   VectorType::Map(array2, size) = VectorType::Map(array1, size);
@@ -109,9 +154,9 @@ template<typename PlainObjectType> void check_const_correctness(const PlainObjec
   // verify that map-to-const don't have LvalueBit
   typedef typename internal::add_const<PlainObjectType>::type ConstPlainObjectType;
   VERIFY( !(internal::traits<Map<ConstPlainObjectType> >::Flags & LvalueBit) );
-  VERIFY( !(internal::traits<Map<ConstPlainObjectType, Aligned> >::Flags & LvalueBit) );
+  VERIFY( !(internal::traits<Map<ConstPlainObjectType, AlignedMax> >::Flags & LvalueBit) );
   VERIFY( !(Map<ConstPlainObjectType>::Flags & LvalueBit) );
-  VERIFY( !(Map<ConstPlainObjectType, Aligned>::Flags & LvalueBit) );
+  VERIFY( !(Map<ConstPlainObjectType, AlignedMax>::Flags & LvalueBit) );
 }
 
 template<typename Scalar>
@@ -142,6 +187,7 @@ void test_mapped_matrix()
     CALL_SUBTEST_1( map_class_vector(Matrix<float, 1, 1>()) );
     CALL_SUBTEST_1( check_const_correctness(Matrix<float, 1, 1>()) );
     CALL_SUBTEST_2( map_class_vector(Vector4d()) );
+    CALL_SUBTEST_2( map_class_vector(VectorXd(13)) );
     CALL_SUBTEST_2( check_const_correctness(Matrix4d()) );
     CALL_SUBTEST_3( map_class_vector(RowVector4f()) );
     CALL_SUBTEST_4( map_class_vector(VectorXcf(8)) );
diff --git a/test/mapstaticmethods.cpp b/test/mapstaticmethods.cpp
index 5b512bde4..06272d106 100644
--- a/test/mapstaticmethods.cpp
+++ b/test/mapstaticmethods.cpp
@@ -69,7 +69,8 @@ struct mapstaticmethods_impl<PlainObjectType, true, false>
 {
   static void run(const PlainObjectType& m)
   {
-    int rows = m.rows(), cols = m.cols();
+    typedef typename PlainObjectType::Index Index;
+    Index rows = m.rows(), cols = m.cols();
 
     int i = internal::random<int>(2,5), j = internal::random<int>(2,5);
 
@@ -115,7 +116,8 @@ struct mapstaticmethods_impl<PlainObjectType, true, true>
 {
   static void run(const PlainObjectType& v)
   {
-    int size = v.size();
+    typedef typename PlainObjectType::Index Index;
+    Index size = v.size();
 
     int i = internal::random<int>(2,5);
 
diff --git a/test/mapstride.cpp b/test/mapstride.cpp
index b1dc9de2a..4858f8fea 100644
--- a/test/mapstride.cpp
+++ b/test/mapstride.cpp
@@ -23,7 +23,7 @@ template<int Alignment,typename VectorType> void map_class_vector(const VectorTy
   Scalar* a_array = internal::aligned_new<Scalar>(arraysize+1);
   Scalar* array = a_array;
   if(Alignment!=Aligned)
-    array = (Scalar*)(ptrdiff_t(a_array) + (internal::packet_traits<Scalar>::AlignedOnScalar?sizeof(Scalar):sizeof(typename NumTraits<Scalar>::Real)));
+    array = (Scalar*)(internal::IntPtr(a_array) + (internal::packet_traits<Scalar>::AlignedOnScalar?sizeof(Scalar):sizeof(typename NumTraits<Scalar>::Real)));
 
   {
     Map<VectorType, Alignment, InnerStride<3> > map(array, size);
@@ -56,16 +56,30 @@ template<int Alignment,typename MatrixType> void map_class_matrix(const MatrixTy
   Index rows = _m.rows(), cols = _m.cols();
 
   MatrixType m = MatrixType::Random(rows,cols);
+  Scalar s1 = internal::random<Scalar>();
 
   Index arraysize = 2*(rows+4)*(cols+4);
 
-  Scalar* a_array = internal::aligned_new<Scalar>(arraysize+1);
-  Scalar* array = a_array;
+  Scalar* a_array1 = internal::aligned_new<Scalar>(arraysize+1);
+  Scalar* array1 = a_array1;
   if(Alignment!=Aligned)
-    array = (Scalar*)(ptrdiff_t(a_array) + (internal::packet_traits<Scalar>::AlignedOnScalar?sizeof(Scalar):sizeof(typename NumTraits<Scalar>::Real)));
+    array1 = (Scalar*)(internal::IntPtr(a_array1) + (internal::packet_traits<Scalar>::AlignedOnScalar?sizeof(Scalar):sizeof(typename NumTraits<Scalar>::Real)));
 
+  Scalar a_array2[256];
+  Scalar* array2 = a_array2;
+  if(Alignment!=Aligned)
+    array2 = (Scalar*)(internal::IntPtr(a_array2) + (internal::packet_traits<Scalar>::AlignedOnScalar?sizeof(Scalar):sizeof(typename NumTraits<Scalar>::Real)));
+  else
+    array2 = (Scalar*)(((internal::UIntPtr(a_array2)+EIGEN_MAX_ALIGN_BYTES-1)/EIGEN_MAX_ALIGN_BYTES)*EIGEN_MAX_ALIGN_BYTES);
+  Index maxsize2 = a_array2 - array2 + 256;
+  
   // test no inner stride and some dynamic outer stride
+  for(int k=0; k<2; ++k)
   {
+    if(k==1 && (m.innerSize()+1)*m.outerSize() > maxsize2)
+      break;
+    Scalar* array = (k==0 ? array1 : array2);
+    
     Map<MatrixType, Alignment, OuterStride<Dynamic> > map(array, rows, cols, OuterStride<Dynamic>(m.innerSize()+1));
     map = m;
     VERIFY(map.outerStride() == map.innerSize()+1);
@@ -75,11 +89,19 @@ template<int Alignment,typename MatrixType> void map_class_matrix(const MatrixTy
         VERIFY(array[map.outerStride()*i+j] == m.coeffByOuterInner(i,j));
         VERIFY(map.coeffByOuterInner(i,j) == m.coeffByOuterInner(i,j));
       }
+    VERIFY_IS_APPROX(s1*map,s1*m);
+    map *= s1;
+    VERIFY_IS_APPROX(map,s1*m);
   }
 
   // test no inner stride and an outer stride of +4. This is quite important as for fixed-size matrices,
   // this allows to hit the special case where it's vectorizable.
+  for(int k=0; k<2; ++k)
   {
+    if(k==1 && (m.innerSize()+4)*m.outerSize() > maxsize2)
+      break;
+    Scalar* array = (k==0 ? array1 : array2);
+    
     enum {
       InnerSize = MatrixType::InnerSizeAtCompileTime,
       OuterStrideAtCompileTime = InnerSize==Dynamic ? Dynamic : InnerSize+4
@@ -94,10 +116,18 @@ template<int Alignment,typename MatrixType> void map_class_matrix(const MatrixTy
         VERIFY(array[map.outerStride()*i+j] == m.coeffByOuterInner(i,j));
         VERIFY(map.coeffByOuterInner(i,j) == m.coeffByOuterInner(i,j));
       }
+    VERIFY_IS_APPROX(s1*map,s1*m);
+    map *= s1;
+    VERIFY_IS_APPROX(map,s1*m);
   }
 
   // test both inner stride and outer stride
+  for(int k=0; k<2; ++k)
   {
+    if(k==1 && (2*m.innerSize()+1)*(m.outerSize()*2) > maxsize2)
+      break;
+    Scalar* array = (k==0 ? array1 : array2);
+    
     Map<MatrixType, Alignment, Stride<Dynamic,Dynamic> > map(array, rows, cols, Stride<Dynamic,Dynamic>(2*m.innerSize()+1, 2));
     map = m;
     VERIFY(map.outerStride() == 2*map.innerSize()+1);
@@ -108,9 +138,12 @@ template<int Alignment,typename MatrixType> void map_class_matrix(const MatrixTy
         VERIFY(array[map.outerStride()*i+map.innerStride()*j] == m.coeffByOuterInner(i,j));
         VERIFY(map.coeffByOuterInner(i,j) == m.coeffByOuterInner(i,j));
       }
+    VERIFY_IS_APPROX(s1*map,s1*m);
+    map *= s1;
+    VERIFY_IS_APPROX(map,s1*m);
   }
 
-  internal::aligned_delete(a_array, arraysize+1);
+  internal::aligned_delete(a_array1, arraysize+1);
 }
 
 void test_mapstride()
diff --git a/test/meta.cpp b/test/meta.cpp
index 3302c5887..b8dea68e8 100644
--- a/test/meta.cpp
+++ b/test/meta.cpp
@@ -9,6 +9,12 @@
 
 #include "main.h"
 
+template<typename From, typename To>
+bool check_is_convertible(const From&, const To&)
+{
+  return internal::is_convertible<From,To>::value;
+}
+
 void test_meta()
 {
   VERIFY((internal::conditional<(3<4),internal::true_type, internal::false_type>::type::value));
@@ -52,6 +58,24 @@ void test_meta()
   VERIFY(( internal::is_same<const float,internal::remove_pointer<const float*>::type >::value));
   VERIFY(( internal::is_same<float,internal::remove_pointer<float* const >::type >::value));
   
+  VERIFY(( internal::is_convertible<float,double>::value ));
+  VERIFY(( internal::is_convertible<int,double>::value ));
+  VERIFY(( internal::is_convertible<double,int>::value ));
+  VERIFY((!internal::is_convertible<std::complex<double>,double>::value ));
+  VERIFY(( internal::is_convertible<Array33f,Matrix3f>::value ));
+//   VERIFY((!internal::is_convertible<Matrix3f,Matrix3d>::value )); //does not work because the conversion is prevented by a static assertion
+  VERIFY((!internal::is_convertible<Array33f,int>::value ));
+  VERIFY((!internal::is_convertible<MatrixXf,float>::value ));
+  {
+    float f;
+    MatrixXf A, B;
+    VectorXf a, b;
+    VERIFY(( check_is_convertible(a.dot(b), f) ));
+    VERIFY(( check_is_convertible(a.transpose()*b, f) ));
+    VERIFY((!check_is_convertible(A*B, f) ));
+    VERIFY(( check_is_convertible(A*B, A) ));
+  }
+  
   VERIFY(internal::meta_sqrt<1>::ret == 1);
   #define VERIFY_META_SQRT(X) VERIFY(internal::meta_sqrt<X>::ret == int(std::sqrt(double(X))))
   VERIFY_META_SQRT(2);
diff --git a/test/metis_support.cpp b/test/metis_support.cpp
index 932b04074..d87c56a13 100644
--- a/test/metis_support.cpp
+++ b/test/metis_support.cpp
@@ -3,24 +3,10 @@
 //
 // Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam@inria.fr>
 //
-// Eigen is free software; you can redistribute it and/or
-// modify it under the terms of the GNU Lesser General Public
-// License as published by the Free Software Foundation; either
-// version 3 of the License, or (at your option) any later version.
-//
-// Alternatively, you can redistribute it and/or
-// modify it under the terms of the GNU General Public License as
-// published by the Free Software Foundation; either version 2 of
-// the License, or (at your option) any later version.
-//
-// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
-// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU Lesser General Public
-// License and a copy of the GNU General Public License along with
-// Eigen. If not, see <http://www.gnu.org/licenses/>.
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #include "sparse_solver.h"
 #include <Eigen/SparseLU>
 #include <Eigen/MetisSupport>
diff --git a/test/mixingtypes.cpp b/test/mixingtypes.cpp
index 6c2f74875..ad9c2c652 100644
--- a/test/mixingtypes.cpp
+++ b/test/mixingtypes.cpp
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -15,14 +15,26 @@
 #define EIGEN_NO_STATIC_ASSERT // turn static asserts into runtime asserts in order to check them
 #endif
 
-// #ifndef EIGEN_DONT_VECTORIZE
-// #define EIGEN_DONT_VECTORIZE // SSE intrinsics aren't designed to allow mixing types
-// #endif
+#if defined(EIGEN_TEST_PART_1) || defined(EIGEN_TEST_PART_2) || defined(EIGEN_TEST_PART_3)
+
+#ifndef EIGEN_DONT_VECTORIZE
+#define EIGEN_DONT_VECTORIZE
+#endif
+
+#endif
+
+static bool g_called;
+#define EIGEN_SCALAR_BINARY_OP_PLUGIN { g_called |= (!internal::is_same<LhsScalar,RhsScalar>::value); }
 
 #include "main.h"
 
 using namespace std;
 
+#define VERIFY_MIX_SCALAR(XPR,REF) \
+  g_called = false; \
+  VERIFY_IS_APPROX(XPR,REF); \
+  VERIFY( g_called && #XPR" not properly optimized");
+
 template<int SizeAtCompileType> void mixingtypes(int size = SizeAtCompileType)
 {
   typedef std::complex<float>   CF;
@@ -38,8 +50,10 @@ template<int SizeAtCompileType> void mixingtypes(int size = SizeAtCompileType)
 
   Mat_f mf    = Mat_f::Random(size,size);
   Mat_d md    = mf.template cast<double>();
+  //Mat_d rd    = md;
   Mat_cf mcf  = Mat_cf::Random(size,size);
   Mat_cd mcd  = mcf.template cast<complex<double> >();
+  Mat_cd rcd = mcd;
   Vec_f vf    = Vec_f::Random(size,1);
   Vec_d vd    = vf.template cast<double>();
   Vec_cf vcf  = Vec_cf::Random(size,1);
@@ -49,19 +63,59 @@ template<int SizeAtCompileType> void mixingtypes(int size = SizeAtCompileType)
   complex<float>  scf = internal::random<complex<float> >();
   complex<double> scd = internal::random<complex<double> >();
 
-
   mf+mf;
-  VERIFY_RAISES_ASSERT(mf+md);
-  VERIFY_RAISES_ASSERT(mf+mcf);
+
+  float  epsf = std::sqrt(std::numeric_limits<float> ::min EIGEN_EMPTY ());
+  double epsd = std::sqrt(std::numeric_limits<double>::min EIGEN_EMPTY ());
+
+  while(std::abs(sf )<epsf) sf  = internal::random<float>();
+  while(std::abs(sd )<epsd) sf  = internal::random<double>();
+  while(std::abs(scf)<epsf) scf = internal::random<CF>();
+  while(std::abs(scd)<epsd) scd = internal::random<CD>();
+
+//   VERIFY_RAISES_ASSERT(mf+md); // does not even compile
+
+#ifdef EIGEN_DONT_VECTORIZE
   VERIFY_RAISES_ASSERT(vf=vd);
   VERIFY_RAISES_ASSERT(vf+=vd);
-  VERIFY_RAISES_ASSERT(mcd=md);
-
+#endif
+  
   // check scalar products
-  VERIFY_IS_APPROX(vcf * sf , vcf * complex<float>(sf));
-  VERIFY_IS_APPROX(sd * vcd, complex<double>(sd) * vcd);
-  VERIFY_IS_APPROX(vf * scf , vf.template cast<complex<float> >() * scf);
-  VERIFY_IS_APPROX(scd * vd, scd * vd.template cast<complex<double> >());
+  VERIFY_MIX_SCALAR(vcf * sf , vcf * complex<float>(sf));
+  VERIFY_MIX_SCALAR(sd * vcd , complex<double>(sd) * vcd);
+  VERIFY_MIX_SCALAR(vf * scf , vf.template cast<complex<float> >() * scf);
+  VERIFY_MIX_SCALAR(scd * vd , scd * vd.template cast<complex<double> >());
+
+  VERIFY_MIX_SCALAR(vcf * 2 , vcf * complex<float>(2));
+  VERIFY_MIX_SCALAR(vcf * 2.1 , vcf * complex<float>(2.1));
+  VERIFY_MIX_SCALAR(2 * vcf, vcf * complex<float>(2));
+  VERIFY_MIX_SCALAR(2.1 * vcf , vcf * complex<float>(2.1));
+
+  // check scalar quotients
+  VERIFY_MIX_SCALAR(vcf / sf , vcf / complex<float>(sf));
+  VERIFY_MIX_SCALAR(vf / scf , vf.template cast<complex<float> >() / scf);
+  VERIFY_MIX_SCALAR(vf.array()  / scf, vf.template cast<complex<float> >().array() / scf);
+  VERIFY_MIX_SCALAR(scd / vd.array() , scd / vd.template cast<complex<double> >().array());
+
+  // check scalar increment
+  VERIFY_MIX_SCALAR(vcf.array() + sf , vcf.array() + complex<float>(sf));
+  VERIFY_MIX_SCALAR(sd  + vcd.array(), complex<double>(sd) + vcd.array());
+  VERIFY_MIX_SCALAR(vf.array()  + scf, vf.template cast<complex<float> >().array() + scf);
+  VERIFY_MIX_SCALAR(scd + vd.array() , scd + vd.template cast<complex<double> >().array());
+
+  // check scalar subtractions
+  VERIFY_MIX_SCALAR(vcf.array() - sf , vcf.array() - complex<float>(sf));
+  VERIFY_MIX_SCALAR(sd  - vcd.array(), complex<double>(sd) - vcd.array());
+  VERIFY_MIX_SCALAR(vf.array()  - scf, vf.template cast<complex<float> >().array() - scf);
+  VERIFY_MIX_SCALAR(scd - vd.array() , scd - vd.template cast<complex<double> >().array());
+
+  // check scalar powers
+  VERIFY_MIX_SCALAR( pow(vcf.array(), sf),        Eigen::pow(vcf.array(), complex<float>(sf)) );
+  VERIFY_MIX_SCALAR( vcf.array().pow(sf) ,        Eigen::pow(vcf.array(), complex<float>(sf)) );
+  VERIFY_MIX_SCALAR( pow(sd, vcd.array()),        Eigen::pow(complex<double>(sd), vcd.array()) );
+  VERIFY_MIX_SCALAR( Eigen::pow(vf.array(), scf), Eigen::pow(vf.template cast<complex<float> >().array(), scf) );
+  VERIFY_MIX_SCALAR( vf.array().pow(scf) ,        Eigen::pow(vf.template cast<complex<float> >().array(), scf) );
+  VERIFY_MIX_SCALAR( Eigen::pow(scd, vd.array()), Eigen::pow(scd, vd.template cast<complex<double> >().array()) );
 
   // check dot product
   vf.dot(vf);
@@ -75,6 +129,7 @@ template<int SizeAtCompileType> void mixingtypes(int size = SizeAtCompileType)
   VERIFY_IS_APPROX(vcd.asDiagonal() * md, vcd.asDiagonal() * md.template cast<complex<double> >());
   VERIFY_IS_APPROX(mcf * vf.asDiagonal(), mcf * vf.template cast<complex<float> >().asDiagonal());
   VERIFY_IS_APPROX(md * vcd.asDiagonal(), md.template cast<complex<double> >() * vcd.asDiagonal());
+
 //   vd.asDiagonal() * mf;    // does not even compile
 //   vcd.asDiagonal() * mf;   // does not even compile
 
@@ -92,7 +147,6 @@ template<int SizeAtCompileType> void mixingtypes(int size = SizeAtCompileType)
   VERIFY_IS_APPROX(mcd.array() *= md.array(), mcd2.array() *= md.array().template cast<std::complex<double> >());
   
   // check matrix-matrix products
-
   VERIFY_IS_APPROX(sd*md*mcd, (sd*md).template cast<CD>().eval()*mcd);
   VERIFY_IS_APPROX(sd*mcd*md, sd*mcd*md.template cast<CD>());
   VERIFY_IS_APPROX(scd*md*mcd, scd*md.template cast<CD>().eval()*mcd);
@@ -103,6 +157,20 @@ template<int SizeAtCompileType> void mixingtypes(int size = SizeAtCompileType)
   VERIFY_IS_APPROX(scf*mf*mcf, scf*mf.template cast<CF>()*mcf);
   VERIFY_IS_APPROX(scf*mcf*mf, scf*mcf*mf.template cast<CF>());
 
+  VERIFY_IS_APPROX(sd*md.adjoint()*mcd, (sd*md).template cast<CD>().eval().adjoint()*mcd);
+  VERIFY_IS_APPROX(sd*mcd.adjoint()*md, sd*mcd.adjoint()*md.template cast<CD>());
+  VERIFY_IS_APPROX(sd*md.adjoint()*mcd.adjoint(), (sd*md).template cast<CD>().eval().adjoint()*mcd.adjoint());
+  VERIFY_IS_APPROX(sd*mcd.adjoint()*md.adjoint(), sd*mcd.adjoint()*md.template cast<CD>().adjoint());
+  VERIFY_IS_APPROX(sd*md*mcd.adjoint(), (sd*md).template cast<CD>().eval()*mcd.adjoint());
+  VERIFY_IS_APPROX(sd*mcd*md.adjoint(), sd*mcd*md.template cast<CD>().adjoint());
+
+  VERIFY_IS_APPROX(sf*mf.adjoint()*mcf, (sf*mf).template cast<CF>().eval().adjoint()*mcf);
+  VERIFY_IS_APPROX(sf*mcf.adjoint()*mf, sf*mcf.adjoint()*mf.template cast<CF>());
+  VERIFY_IS_APPROX(sf*mf.adjoint()*mcf.adjoint(), (sf*mf).template cast<CF>().eval().adjoint()*mcf.adjoint());
+  VERIFY_IS_APPROX(sf*mcf.adjoint()*mf.adjoint(), sf*mcf.adjoint()*mf.template cast<CF>().adjoint());
+  VERIFY_IS_APPROX(sf*mf*mcf.adjoint(), (sf*mf).template cast<CF>().eval()*mcf.adjoint());
+  VERIFY_IS_APPROX(sf*mcf*mf.adjoint(), sf*mcf*mf.template cast<CF>().adjoint());
+
   VERIFY_IS_APPROX(sf*mf*vcf, (sf*mf).template cast<CF>().eval()*vcf);
   VERIFY_IS_APPROX(scf*mf*vcf,(scf*mf.template cast<CF>()).eval()*vcf);
   VERIFY_IS_APPROX(sf*mcf*vf, sf*mcf*vf.template cast<CF>());
@@ -122,11 +190,111 @@ template<int SizeAtCompileType> void mixingtypes(int size = SizeAtCompileType)
   VERIFY_IS_APPROX(scd*vcd.adjoint()*md, scd*vcd.adjoint()*md.template cast<CD>().eval());
   VERIFY_IS_APPROX(sd*vd.adjoint()*mcd,  sd*vd.adjoint().template cast<CD>().eval()*mcd);
   VERIFY_IS_APPROX(scd*vd.adjoint()*mcd, scd*vd.adjoint().template cast<CD>().eval()*mcd);
+
+  VERIFY_IS_APPROX( sd*vcd.adjoint()*md.template triangularView<Upper>(),  sd*vcd.adjoint()*md.template cast<CD>().eval().template triangularView<Upper>());
+  VERIFY_IS_APPROX(scd*vcd.adjoint()*md.template triangularView<Lower>(), scd*vcd.adjoint()*md.template cast<CD>().eval().template triangularView<Lower>());
+  VERIFY_IS_APPROX( sd*vcd.adjoint()*md.transpose().template triangularView<Upper>(),  sd*vcd.adjoint()*md.transpose().template cast<CD>().eval().template triangularView<Upper>());
+  VERIFY_IS_APPROX(scd*vcd.adjoint()*md.transpose().template triangularView<Lower>(), scd*vcd.adjoint()*md.transpose().template cast<CD>().eval().template triangularView<Lower>());
+  VERIFY_IS_APPROX( sd*vd.adjoint()*mcd.template triangularView<Lower>(),  sd*vd.adjoint().template cast<CD>().eval()*mcd.template triangularView<Lower>());
+  VERIFY_IS_APPROX(scd*vd.adjoint()*mcd.template triangularView<Upper>(), scd*vd.adjoint().template cast<CD>().eval()*mcd.template triangularView<Upper>());
+  VERIFY_IS_APPROX( sd*vd.adjoint()*mcd.transpose().template triangularView<Lower>(),  sd*vd.adjoint().template cast<CD>().eval()*mcd.transpose().template triangularView<Lower>());
+  VERIFY_IS_APPROX(scd*vd.adjoint()*mcd.transpose().template triangularView<Upper>(), scd*vd.adjoint().template cast<CD>().eval()*mcd.transpose().template triangularView<Upper>());
+
+  // Not supported yet: trmm
+//   VERIFY_IS_APPROX(sd*mcd*md.template triangularView<Lower>(),  sd*mcd*md.template cast<CD>().eval().template triangularView<Lower>());
+//   VERIFY_IS_APPROX(scd*mcd*md.template triangularView<Upper>(), scd*mcd*md.template cast<CD>().eval().template triangularView<Upper>());
+//   VERIFY_IS_APPROX(sd*md*mcd.template triangularView<Lower>(),  sd*md.template cast<CD>().eval()*mcd.template triangularView<Lower>());
+//   VERIFY_IS_APPROX(scd*md*mcd.template triangularView<Upper>(), scd*md.template cast<CD>().eval()*mcd.template triangularView<Upper>());
+
+  // Not supported yet: symv
+//   VERIFY_IS_APPROX(sd*vcd.adjoint()*md.template selfadjointView<Upper>(),  sd*vcd.adjoint()*md.template cast<CD>().eval().template selfadjointView<Upper>());
+//   VERIFY_IS_APPROX(scd*vcd.adjoint()*md.template selfadjointView<Lower>(), scd*vcd.adjoint()*md.template cast<CD>().eval().template selfadjointView<Lower>());
+//   VERIFY_IS_APPROX(sd*vd.adjoint()*mcd.template selfadjointView<Lower>(),  sd*vd.adjoint().template cast<CD>().eval()*mcd.template selfadjointView<Lower>());
+//   VERIFY_IS_APPROX(scd*vd.adjoint()*mcd.template selfadjointView<Upper>(), scd*vd.adjoint().template cast<CD>().eval()*mcd.template selfadjointView<Upper>());
+
+  // Not supported yet: symm
+//   VERIFY_IS_APPROX(sd*vcd.adjoint()*md.template selfadjointView<Upper>(),  sd*vcd.adjoint()*md.template cast<CD>().eval().template selfadjointView<Upper>());
+//   VERIFY_IS_APPROX(scd*vcd.adjoint()*md.template selfadjointView<Upper>(), scd*vcd.adjoint()*md.template cast<CD>().eval().template selfadjointView<Upper>());
+//   VERIFY_IS_APPROX(sd*vd.adjoint()*mcd.template selfadjointView<Upper>(),  sd*vd.adjoint().template cast<CD>().eval()*mcd.template selfadjointView<Upper>());
+//   VERIFY_IS_APPROX(scd*vd.adjoint()*mcd.template selfadjointView<Upper>(), scd*vd.adjoint().template cast<CD>().eval()*mcd.template selfadjointView<Upper>());
+
+  rcd.setZero();
+  VERIFY_IS_APPROX(Mat_cd(rcd.template triangularView<Upper>() = sd * mcd * md),
+                   Mat_cd((sd * mcd * md.template cast<CD>().eval()).template triangularView<Upper>()));
+  VERIFY_IS_APPROX(Mat_cd(rcd.template triangularView<Upper>() = sd * md * mcd),
+                   Mat_cd((sd * md.template cast<CD>().eval() * mcd).template triangularView<Upper>()));
+  VERIFY_IS_APPROX(Mat_cd(rcd.template triangularView<Upper>() = scd * mcd * md),
+                   Mat_cd((scd * mcd * md.template cast<CD>().eval()).template triangularView<Upper>()));
+  VERIFY_IS_APPROX(Mat_cd(rcd.template triangularView<Upper>() = scd * md * mcd),
+                   Mat_cd((scd * md.template cast<CD>().eval() * mcd).template triangularView<Upper>()));
+
+
+  VERIFY_IS_APPROX( md.array()  * mcd.array(), md.template cast<CD>().eval().array() * mcd.array() );
+  VERIFY_IS_APPROX( mcd.array() * md.array(),  mcd.array() * md.template cast<CD>().eval().array() );
+
+  VERIFY_IS_APPROX( md.array()  + mcd.array(), md.template cast<CD>().eval().array() + mcd.array() );
+  VERIFY_IS_APPROX( mcd.array() + md.array(),  mcd.array() + md.template cast<CD>().eval().array() );
+
+  VERIFY_IS_APPROX( md.array()  - mcd.array(), md.template cast<CD>().eval().array() - mcd.array() );
+  VERIFY_IS_APPROX( mcd.array() - md.array(),  mcd.array() - md.template cast<CD>().eval().array() );
+
+  if(mcd.array().abs().minCoeff()>epsd)
+  {
+    VERIFY_IS_APPROX( md.array() / mcd.array(), md.template cast<CD>().eval().array() / mcd.array() );
+  }
+  if(md.array().abs().minCoeff()>epsd)
+  {
+    VERIFY_IS_APPROX( mcd.array() / md.array(), mcd.array() / md.template cast<CD>().eval().array() );
+  }
+
+  if(md.array().abs().minCoeff()>epsd || mcd.array().abs().minCoeff()>epsd)
+  {
+    VERIFY_IS_APPROX( md.array().pow(mcd.array()), md.template cast<CD>().eval().array().pow(mcd.array()) );
+    VERIFY_IS_APPROX( mcd.array().pow(md.array()),  mcd.array().pow(md.template cast<CD>().eval().array()) );
+
+    VERIFY_IS_APPROX( pow(md.array(),mcd.array()), md.template cast<CD>().eval().array().pow(mcd.array()) );
+    VERIFY_IS_APPROX( pow(mcd.array(),md.array()),  mcd.array().pow(md.template cast<CD>().eval().array()) );
+  }
+
+  rcd = mcd;
+  VERIFY_IS_APPROX( rcd = md, md.template cast<CD>().eval() );
+  rcd = mcd;
+  VERIFY_IS_APPROX( rcd += md, mcd + md.template cast<CD>().eval() );
+  rcd = mcd;
+  VERIFY_IS_APPROX( rcd -= md, mcd - md.template cast<CD>().eval() );
+  rcd = mcd;
+  VERIFY_IS_APPROX( rcd.array() *= md.array(), mcd.array() * md.template cast<CD>().eval().array() );
+  rcd = mcd;
+  if(md.array().abs().minCoeff()>epsd)
+  {
+    VERIFY_IS_APPROX( rcd.array() /= md.array(), mcd.array() / md.template cast<CD>().eval().array() );
+  }
+
+  rcd = mcd;
+  VERIFY_IS_APPROX( rcd.noalias() += md + mcd*md, mcd + (md.template cast<CD>().eval()) + mcd*(md.template cast<CD>().eval()));
+
+  VERIFY_IS_APPROX( rcd.noalias()  = md*md,       ((md*md).eval().template cast<CD>()) );
+  rcd = mcd;
+  VERIFY_IS_APPROX( rcd.noalias() += md*md, mcd + ((md*md).eval().template cast<CD>()) );
+  rcd = mcd;
+  VERIFY_IS_APPROX( rcd.noalias() -= md*md, mcd - ((md*md).eval().template cast<CD>()) );
+
+  VERIFY_IS_APPROX( rcd.noalias()  = mcd + md*md,       mcd + ((md*md).eval().template cast<CD>()) );
+  rcd = mcd;
+  VERIFY_IS_APPROX( rcd.noalias() += mcd + md*md, mcd + mcd + ((md*md).eval().template cast<CD>()) );
+  rcd = mcd;
+  VERIFY_IS_APPROX( rcd.noalias() -= mcd + md*md,           - ((md*md).eval().template cast<CD>()) );
 }
 
 void test_mixingtypes()
 {
-  CALL_SUBTEST_1(mixingtypes<3>());
-  CALL_SUBTEST_2(mixingtypes<4>());
-  CALL_SUBTEST_3(mixingtypes<Dynamic>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE)));
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1(mixingtypes<3>());
+    CALL_SUBTEST_2(mixingtypes<4>());
+    CALL_SUBTEST_3(mixingtypes<Dynamic>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE)));
+
+    CALL_SUBTEST_4(mixingtypes<3>());
+    CALL_SUBTEST_5(mixingtypes<4>());
+    CALL_SUBTEST_6(mixingtypes<Dynamic>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE)));
+  }
 }
diff --git a/test/mpl2only.cpp b/test/mpl2only.cpp
new file mode 100644
index 000000000..7d04d6bba
--- /dev/null
+++ b/test/mpl2only.cpp
@@ -0,0 +1,22 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_MPL2_ONLY
+#include <Eigen/Dense>
+#include <Eigen/SparseCore>
+#include <Eigen/SparseLU>
+#include <Eigen/SparseQR>
+#include <Eigen/Sparse>
+#include <Eigen/IterativeLinearSolvers>
+#include <Eigen/Eigen>
+
+int main()
+{
+  return 0;
+}
diff --git a/test/nesting_ops.cpp b/test/nesting_ops.cpp
index 1e8523283..a419b0e44 100644
--- a/test/nesting_ops.cpp
+++ b/test/nesting_ops.cpp
@@ -2,16 +2,37 @@
 // for linear algebra.
 //
 // Copyright (C) 2010 Hauke Heibel <hauke.heibel@gmail.com>
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+#define TEST_ENABLE_TEMPORARY_TRACKING
+
 #include "main.h"
 
-template <typename MatrixType> void run_nesting_ops(const MatrixType& _m)
+template <int N, typename XprType>
+void use_n_times(const XprType &xpr)
 {
-  typename MatrixType::Nested m(_m);
+  typename internal::nested_eval<XprType,N>::type mat(xpr);
+  typename XprType::PlainObject res(mat.rows(), mat.cols());
+  nb_temporaries--; // remove res
+  res.setZero();
+  for(int i=0; i<N; ++i)
+    res += mat;
+}
+
+template <int N, typename ReferenceType, typename XprType>
+bool verify_eval_type(const XprType &, const ReferenceType&)
+{
+  typedef typename internal::nested_eval<XprType,N>::type EvalType;
+  return internal::is_same<typename internal::remove_all<EvalType>::type, typename internal::remove_all<ReferenceType>::type>::value;
+}
+
+template <typename MatrixType> void run_nesting_ops_1(const MatrixType& _m)
+{
+  typename internal::nested_eval<MatrixType,2>::type m(_m);
 
   // Make really sure that we are in debug mode!
   VERIFY_RAISES_ASSERT(eigen_assert(false));
@@ -24,10 +45,63 @@ template <typename MatrixType> void run_nesting_ops(const MatrixType& _m)
   VERIFY_IS_APPROX( (m.transpose() * m).array().abs().sum(), (m.transpose() * m).array().abs().sum() );
 }
 
+template <typename MatrixType> void run_nesting_ops_2(const MatrixType& _m)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  Index rows = _m.rows();
+  Index cols = _m.cols();
+  MatrixType m1 = MatrixType::Random(rows,cols);
+  Matrix<Scalar,MatrixType::RowsAtCompileTime,MatrixType::ColsAtCompileTime,ColMajor> m2;
+
+  if((MatrixType::SizeAtCompileTime==Dynamic))
+  {
+    VERIFY_EVALUATION_COUNT( use_n_times<1>(m1 + m1*m1), 1 );
+    VERIFY_EVALUATION_COUNT( use_n_times<10>(m1 + m1*m1), 1 );
+
+    VERIFY_EVALUATION_COUNT( use_n_times<1>(m1.template triangularView<Lower>().solve(m1.col(0))), 1 );
+    VERIFY_EVALUATION_COUNT( use_n_times<10>(m1.template triangularView<Lower>().solve(m1.col(0))), 1 );
+
+    VERIFY_EVALUATION_COUNT( use_n_times<1>(Scalar(2)*m1.template triangularView<Lower>().solve(m1.col(0))), 2 ); // FIXME could be one by applying the scaling in-place on the solve result
+    VERIFY_EVALUATION_COUNT( use_n_times<1>(m1.col(0)+m1.template triangularView<Lower>().solve(m1.col(0))), 2 ); // FIXME could be one by adding m1.col() inplace
+    VERIFY_EVALUATION_COUNT( use_n_times<10>(m1.col(0)+m1.template triangularView<Lower>().solve(m1.col(0))), 2 );
+  }
+
+  {
+    VERIFY( verify_eval_type<10>(m1, m1) );
+    if(!NumTraits<Scalar>::IsComplex)
+    {
+      VERIFY( verify_eval_type<3>(2*m1, 2*m1) );
+      VERIFY( verify_eval_type<4>(2*m1, m1) );
+    }
+    else
+    {
+      VERIFY( verify_eval_type<2>(2*m1, 2*m1) );
+      VERIFY( verify_eval_type<3>(2*m1, m1) );
+    }
+    VERIFY( verify_eval_type<2>(m1+m1, m1+m1) );
+    VERIFY( verify_eval_type<3>(m1+m1, m1) );
+    VERIFY( verify_eval_type<1>(m1*m1.transpose(), m2) );
+    VERIFY( verify_eval_type<1>(m1*(m1+m1).transpose(), m2) );
+    VERIFY( verify_eval_type<2>(m1*m1.transpose(), m2) );
+    VERIFY( verify_eval_type<1>(m1+m1*m1, m1) );
+
+    VERIFY( verify_eval_type<1>(m1.template triangularView<Lower>().solve(m1), m1) );
+    VERIFY( verify_eval_type<1>(m1+m1.template triangularView<Lower>().solve(m1), m1) );
+  }
+}
+
+
 void test_nesting_ops()
 {
-  CALL_SUBTEST_1(run_nesting_ops(MatrixXf::Random(25,25)));
-  CALL_SUBTEST_2(run_nesting_ops(MatrixXd::Random(25,25)));
-  CALL_SUBTEST_3(run_nesting_ops(Matrix4f::Random()));
-  CALL_SUBTEST_4(run_nesting_ops(Matrix4d::Random()));
+  CALL_SUBTEST_1(run_nesting_ops_1(MatrixXf::Random(25,25)));
+  CALL_SUBTEST_2(run_nesting_ops_1(MatrixXcd::Random(25,25)));
+  CALL_SUBTEST_3(run_nesting_ops_1(Matrix4f::Random()));
+  CALL_SUBTEST_4(run_nesting_ops_1(Matrix2d::Random()));
+
+  Index s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE);
+  CALL_SUBTEST_1( run_nesting_ops_2(MatrixXf(s,s)) );
+  CALL_SUBTEST_2( run_nesting_ops_2(MatrixXcd(s,s)) );
+  CALL_SUBTEST_3( run_nesting_ops_2(Matrix4f()) );
+  CALL_SUBTEST_4( run_nesting_ops_2(Matrix2d()) );
+  TEST_SET_BUT_UNUSED_VARIABLE(s)
 }
diff --git a/test/nomalloc.cpp b/test/nomalloc.cpp
index 8e0402358..50756c2fb 100644
--- a/test/nomalloc.cpp
+++ b/test/nomalloc.cpp
@@ -8,20 +8,10 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-// this hack is needed to make this file compiles with -pedantic (gcc)
-#ifdef __GNUC__
-#define throw(X)
-#endif
-
-#ifdef __INTEL_COMPILER
-  // disable "warning #76: argument to macro is empty" produced by the above hack
-  #pragma warning disable 76
-#endif
-
 // discard stack allocation as that too bypasses malloc
 #define EIGEN_STACK_ALLOCATION_LIMIT 0
-// any heap allocation will raise an assert
-#define EIGEN_NO_MALLOC
+// heap allocation will raise an assert if enabled at runtime
+#define EIGEN_RUNTIME_NO_MALLOC
 
 #include "main.h"
 #include <Eigen/Cholesky>
@@ -88,14 +78,15 @@ template<typename MatrixType> void nomalloc(const MatrixType& m)
   VERIFY_IS_APPROX(m2,m2);
   
   m2.template selfadjointView<Lower>().rankUpdate(m1.col(0),-1);
-  m2.template selfadjointView<Lower>().rankUpdate(m1.row(0),-1);
+  m2.template selfadjointView<Upper>().rankUpdate(m1.row(0),-1);
+  m2.template selfadjointView<Lower>().rankUpdate(m1.col(0), m1.col(0)); // rank-2
 
   // The following fancy matrix-matrix products are not safe yet regarding static allocation
-//   m1 += m1.template triangularView<Upper>() * m2.col(;
-//   m1.template selfadjointView<Lower>().rankUpdate(m2);
-//   m1 += m1.template triangularView<Upper>() * m2;
-//   m1 += m1.template selfadjointView<Lower>() * m2;
-//   VERIFY_IS_APPROX(m1,m1);
+  m2.template selfadjointView<Lower>().rankUpdate(m1);
+  m2 += m2.template triangularView<Upper>() * m1;
+  m2.template triangularView<Upper>() = m2 * m2;
+  m1 += m1.template selfadjointView<Lower>() * m2;
+  VERIFY_IS_APPROX(m2,m2);
 }
 
 template<typename Scalar>
@@ -171,7 +162,7 @@ void test_zerosized() {
   Eigen::VectorXd v;
   // explicit zero-sized:
   Eigen::ArrayXXd A0(0,0);
-  Eigen::ArrayXd v0(std::ptrdiff_t(0)); // FIXME ArrayXd(0) is ambiguous
+  Eigen::ArrayXd v0(0);
 
   // assigning empty objects to each other:
   A=A0;
@@ -183,9 +174,11 @@ template<typename MatrixType> void test_reference(const MatrixType& m) {
   enum { Flag          =  MatrixType::IsRowMajor ? Eigen::RowMajor : Eigen::ColMajor};
   enum { TransposeFlag = !MatrixType::IsRowMajor ? Eigen::RowMajor : Eigen::ColMajor};
   typename MatrixType::Index rows = m.rows(), cols=m.cols();
+  typedef Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, Flag         > MatrixX;
+  typedef Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, TransposeFlag> MatrixXT;
   // Dynamic reference:
-  typedef Eigen::Ref<const Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, Flag         > > Ref;
-  typedef Eigen::Ref<const Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, TransposeFlag> > RefT;
+  typedef Eigen::Ref<const MatrixX  > Ref;
+  typedef Eigen::Ref<const MatrixXT > RefT;
 
   Ref r1(m);
   Ref r2(m.block(rows/3, cols/4, rows/2, cols/2));
@@ -195,10 +188,30 @@ template<typename MatrixType> void test_reference(const MatrixType& m) {
   VERIFY_RAISES_ASSERT(RefT r5(m));
   VERIFY_RAISES_ASSERT(Ref r6(m.transpose()));
   VERIFY_RAISES_ASSERT(Ref r7(Scalar(2) * m));
+
+  // Copy constructors shall also never malloc
+  Ref r8 = r1;
+  RefT r9 = r3;
+
+  // Initializing from a compatible Ref shall also never malloc
+  Eigen::Ref<const MatrixX, Unaligned, Stride<Dynamic, Dynamic> > r10=r8, r11=m;
+
+  // Initializing from an incompatible Ref will malloc:
+  typedef Eigen::Ref<const MatrixX, Aligned> RefAligned;
+  VERIFY_RAISES_ASSERT(RefAligned r12=r10);
+  VERIFY_RAISES_ASSERT(Ref r13=r10); // r10 has more dynamic strides
+
 }
 
 void test_nomalloc()
 {
+  // create some dynamic objects
+  Eigen::MatrixXd M1 = MatrixXd::Random(3,3);
+  Ref<const MatrixXd> R1 = 2.0*M1; // Ref requires temporary
+
+  // from here on prohibit malloc:
+  Eigen::internal::set_is_malloc_allowed(false);
+
   // check that our operator new is indeed called:
   VERIFY_RAISES_ASSERT(MatrixXd dummy(MatrixXd::Random(3,3)));
   CALL_SUBTEST_1(nomalloc(Matrix<float, 1, 1>()) );
@@ -207,6 +220,10 @@ void test_nomalloc()
   
   // Check decomposition modules with dynamic matrices that have a known compile-time max size (ctms)
   CALL_SUBTEST_4(ctms_decompositions<float>());
+
   CALL_SUBTEST_5(test_zerosized());
+
   CALL_SUBTEST_6(test_reference(Matrix<float,32,32>()));
+  CALL_SUBTEST_7(test_reference(R1));
+  CALL_SUBTEST_8(Ref<MatrixXd> R2 = M1.topRows<2>(); test_reference(R2));
 }
diff --git a/test/nullary.cpp b/test/nullary.cpp
index fbc721a1a..acd55506e 100644
--- a/test/nullary.cpp
+++ b/test/nullary.cpp
@@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2010-2011 Jitse Niesen <jitse@maths.leeds.ac.uk>
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -12,7 +13,6 @@
 template<typename MatrixType>
 bool equalsIdentity(const MatrixType& A)
 {
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::Scalar Scalar;
   Scalar zero = static_cast<Scalar>(0);
 
@@ -30,13 +30,41 @@ bool equalsIdentity(const MatrixType& A)
 
   bool diagOK = (A.diagonal().array() == 1).all();
   return offDiagOK && diagOK;
+
+}
+
+template<typename VectorType>
+void check_extremity_accuracy(const VectorType &v, const typename VectorType::Scalar &low, const typename VectorType::Scalar &high)
+{
+  typedef typename VectorType::Scalar Scalar;
+  typedef typename VectorType::RealScalar RealScalar;
+
+  RealScalar prec = internal::is_same<RealScalar,float>::value ? NumTraits<RealScalar>::dummy_precision()*10 : NumTraits<RealScalar>::dummy_precision()/10;
+  Index size = v.size();
+
+  if(size<20)
+    return;
+
+  for (int i=0; i<size; ++i)
+  {
+    if(i<5 || i>size-6)
+    {
+      Scalar ref = (low*RealScalar(size-i-1))/RealScalar(size-1) + (high*RealScalar(i))/RealScalar(size-1);
+      if(std::abs(ref)>1)
+      {
+        if(!internal::isApprox(v(i), ref, prec))
+          std::cout << v(i) << " != " << ref << "  ; relative error: " << std::abs((v(i)-ref)/ref) << "  ; required precision: " << prec << "  ; range: " << low << "," << high << "  ; i: " << i << "\n";
+        VERIFY(internal::isApprox(v(i), (low*RealScalar(size-i-1))/RealScalar(size-1) + (high*RealScalar(i))/RealScalar(size-1), prec));
+      }
+    }
+  }
 }
 
 template<typename VectorType>
 void testVectorType(const VectorType& base)
 {
-  typedef typename internal::traits<VectorType>::Index Index;
-  typedef typename internal::traits<VectorType>::Scalar Scalar;
+  typedef typename VectorType::Scalar Scalar;
+  typedef typename VectorType::RealScalar RealScalar;
 
   const Index size = base.size();
   
@@ -44,36 +72,61 @@ void testVectorType(const VectorType& base)
   Scalar low = (size == 1 ? high : internal::random<Scalar>(-500,500));
   if (low>high) std::swap(low,high);
 
+  // check low==high
+  if(internal::random<float>(0.f,1.f)<0.05f)
+    low = high;
+  // check abs(low) >> abs(high)
+  else if(size>2 && std::numeric_limits<RealScalar>::max_exponent10>0 && internal::random<float>(0.f,1.f)<0.1f)
+    low = -internal::random<Scalar>(1,2) * RealScalar(std::pow(RealScalar(10),std::numeric_limits<RealScalar>::max_exponent10/2));
+
   const Scalar step = ((size == 1) ? 1 : (high-low)/(size-1));
 
   // check whether the result yields what we expect it to do
   VectorType m(base);
   m.setLinSpaced(size,low,high);
 
-  VectorType n(size);
-  for (int i=0; i<size; ++i)
-    n(i) = low+i*step;
+  if(!NumTraits<Scalar>::IsInteger)
+  {
+    VectorType n(size);
+    for (int i=0; i<size; ++i)
+      n(i) = low+i*step;
+    VERIFY_IS_APPROX(m,n);
 
-  VERIFY_IS_APPROX(m,n);
+    CALL_SUBTEST( check_extremity_accuracy(m, low, high) );
+  }
 
-  // random access version
-  m = VectorType::LinSpaced(size,low,high);
-  VERIFY_IS_APPROX(m,n);
+  if((!NumTraits<Scalar>::IsInteger) || ((high-low)>=size && (Index(high-low)%(size-1))==0) || (Index(high-low+1)<size && (size%Index(high-low+1))==0))
+  {
+    VectorType n(size);
+    if((!NumTraits<Scalar>::IsInteger) || (high-low>=size))
+      for (int i=0; i<size; ++i)
+        n(i) = size==1 ? low : (low + ((high-low)*Scalar(i))/(size-1));
+    else
+      for (int i=0; i<size; ++i)
+        n(i) = size==1 ? low : low + Scalar((double(high-low+1)*double(i))/double(size));
+    VERIFY_IS_APPROX(m,n);
 
-  // Assignment of a RowVectorXd to a MatrixXd (regression test for bug #79).
-  VERIFY( (MatrixXd(RowVectorXd::LinSpaced(3, 0, 1)) - RowVector3d(0, 0.5, 1)).norm() < std::numeric_limits<Scalar>::epsilon() );
+    // random access version
+    m = VectorType::LinSpaced(size,low,high);
+    VERIFY_IS_APPROX(m,n);
+    VERIFY( internal::isApprox(m(m.size()-1),high) );
+    VERIFY( size==1 || internal::isApprox(m(0),low) );
+    VERIFY_IS_EQUAL(m(m.size()-1) , high);
+    if(!NumTraits<Scalar>::IsInteger)
+      CALL_SUBTEST( check_extremity_accuracy(m, low, high) );
+  }
 
-  // These guys sometimes fail! This is not good. Any ideas how to fix them!?
-  //VERIFY( m(m.size()-1) == high );
-  //VERIFY( m(0) == low );
+  VERIFY( m(m.size()-1) <= high );
+  VERIFY( (m.array() <= high).all() );
+  VERIFY( (m.array() >= low).all() );
 
-  // sequential access version
-  m = VectorType::LinSpaced(Sequential,size,low,high);
-  VERIFY_IS_APPROX(m,n);
 
-  // These guys sometimes fail! This is not good. Any ideas how to fix them!?
-  //VERIFY( m(m.size()-1) == high );
-  //VERIFY( m(0) == low );
+  VERIFY( m(m.size()-1) >= low );
+  if(size>=1)
+  {
+    VERIFY( internal::isApprox(m(0),low) );
+    VERIFY_IS_EQUAL(m(0) , low);
+  }
 
   // check whether everything works with row and col major vectors
   Matrix<Scalar,Dynamic,1> row_vector(size);
@@ -95,23 +148,77 @@ void testVectorType(const VectorType& base)
   VERIFY_IS_APPROX( ScalarMatrix::LinSpaced(1,low,high), ScalarMatrix::Constant(high) );
 
   // regression test for bug 526 (linear vectorized transversal)
-  if (size > 1) {
+  if (size > 1 && (!NumTraits<Scalar>::IsInteger)) {
     m.tail(size-1).setLinSpaced(low, high);
     VERIFY_IS_APPROX(m(size-1), high);
   }
+
+  // regression test for bug 1383 (LinSpaced with empty size/range)
+  {
+    Index n0 = VectorType::SizeAtCompileTime==Dynamic ? 0 : VectorType::SizeAtCompileTime;
+    low = internal::random<Scalar>();
+    m = VectorType::LinSpaced(n0,low,low-1);
+    VERIFY(m.size()==n0);
+
+    if(VectorType::SizeAtCompileTime==Dynamic)
+    {
+      VERIFY_IS_EQUAL(VectorType::LinSpaced(n0,0,Scalar(n0-1)).sum(),Scalar(0));
+      VERIFY_IS_EQUAL(VectorType::LinSpaced(n0,low,low-1).sum(),Scalar(0));
+    }
+
+    m.setLinSpaced(n0,0,Scalar(n0-1));
+    VERIFY(m.size()==n0);
+    m.setLinSpaced(n0,low,low-1);
+    VERIFY(m.size()==n0);
+
+    // empty range only:
+    VERIFY_IS_APPROX(VectorType::LinSpaced(size,low,low),VectorType::Constant(size,low));
+    m.setLinSpaced(size,low,low);
+    VERIFY_IS_APPROX(m,VectorType::Constant(size,low));
+
+    if(NumTraits<Scalar>::IsInteger)
+    {
+      VERIFY_IS_APPROX( VectorType::LinSpaced(size,low,Scalar(low+size-1)), VectorType::LinSpaced(size,Scalar(low+size-1),low).reverse() );
+
+      if(VectorType::SizeAtCompileTime==Dynamic)
+      {
+        // Check negative multiplicator path:
+        for(Index k=1; k<5; ++k)
+          VERIFY_IS_APPROX( VectorType::LinSpaced(size,low,Scalar(low+(size-1)*k)), VectorType::LinSpaced(size,Scalar(low+(size-1)*k),low).reverse() );
+        // Check negative divisor path:
+        for(Index k=1; k<5; ++k)
+          VERIFY_IS_APPROX( VectorType::LinSpaced(size*k,low,Scalar(low+size-1)), VectorType::LinSpaced(size*k,Scalar(low+size-1),low).reverse() );
+      }
+    }
+  }
 }
 
 template<typename MatrixType>
 void testMatrixType(const MatrixType& m)
 {
-  typedef typename MatrixType::Index Index;
+  using std::abs;
   const Index rows = m.rows();
   const Index cols = m.cols();
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+
+  Scalar s1;
+  do {
+    s1 = internal::random<Scalar>();
+  } while(abs(s1)<RealScalar(1e-5) && (!NumTraits<Scalar>::IsInteger));
 
   MatrixType A;
   A.setIdentity(rows, cols);
   VERIFY(equalsIdentity(A));
   VERIFY(equalsIdentity(MatrixType::Identity(rows, cols)));
+
+
+  A = MatrixType::Constant(rows,cols,s1);
+  Index i = internal::random<Index>(0,rows-1);
+  Index j = internal::random<Index>(0,cols-1);
+  VERIFY_IS_APPROX( MatrixType::Constant(rows,cols,s1)(i,j), s1 );
+  VERIFY_IS_APPROX( MatrixType::Constant(rows,cols,s1).coeff(i,j), s1 );
+  VERIFY_IS_APPROX( A(i,j), s1 );
 }
 
 void test_nullary()
@@ -120,12 +227,78 @@ void test_nullary()
   CALL_SUBTEST_2( testMatrixType(MatrixXcf(internal::random<int>(1,300),internal::random<int>(1,300))) );
   CALL_SUBTEST_3( testMatrixType(MatrixXf(internal::random<int>(1,300),internal::random<int>(1,300))) );
   
-  for(int i = 0; i < g_repeat; i++) {
-    CALL_SUBTEST_4( testVectorType(VectorXd(internal::random<int>(1,300))) );
+  for(int i = 0; i < g_repeat*10; i++) {
+    CALL_SUBTEST_4( testVectorType(VectorXd(internal::random<int>(1,30000))) );
     CALL_SUBTEST_5( testVectorType(Vector4d()) );  // regression test for bug 232
     CALL_SUBTEST_6( testVectorType(Vector3d()) );
-    CALL_SUBTEST_7( testVectorType(VectorXf(internal::random<int>(1,300))) );
+    CALL_SUBTEST_7( testVectorType(VectorXf(internal::random<int>(1,30000))) );
     CALL_SUBTEST_8( testVectorType(Vector3f()) );
+    CALL_SUBTEST_8( testVectorType(Vector4f()) );
+    CALL_SUBTEST_8( testVectorType(Matrix<float,8,1>()) );
     CALL_SUBTEST_8( testVectorType(Matrix<float,1,1>()) );
+
+    CALL_SUBTEST_9( testVectorType(VectorXi(internal::random<int>(1,10))) );
+    CALL_SUBTEST_9( testVectorType(VectorXi(internal::random<int>(9,300))) );
+    CALL_SUBTEST_9( testVectorType(Matrix<int,1,1>()) );
+  }
+
+#ifdef EIGEN_TEST_PART_6
+  // Assignment of a RowVectorXd to a MatrixXd (regression test for bug #79).
+  VERIFY( (MatrixXd(RowVectorXd::LinSpaced(3, 0, 1)) - RowVector3d(0, 0.5, 1)).norm() < std::numeric_limits<double>::epsilon() );
+#endif
+
+#ifdef EIGEN_TEST_PART_9
+  // Check possible overflow issue
+  {
+    int n = 60000;
+    ArrayXi a1(n), a2(n);
+    a1.setLinSpaced(n, 0, n-1);
+    for(int i=0; i<n; ++i)
+      a2(i) = i;
+    VERIFY_IS_APPROX(a1,a2);
+  }
+#endif
+
+#ifdef EIGEN_TEST_PART_10
+  // check some internal logic
+  VERIFY((  internal::has_nullary_operator<internal::scalar_constant_op<double> >::value ));
+  VERIFY(( !internal::has_unary_operator<internal::scalar_constant_op<double> >::value ));
+  VERIFY(( !internal::has_binary_operator<internal::scalar_constant_op<double> >::value ));
+  VERIFY((  internal::functor_has_linear_access<internal::scalar_constant_op<double> >::ret ));
+
+  VERIFY(( !internal::has_nullary_operator<internal::scalar_identity_op<double> >::value ));
+  VERIFY(( !internal::has_unary_operator<internal::scalar_identity_op<double> >::value ));
+  VERIFY((  internal::has_binary_operator<internal::scalar_identity_op<double> >::value ));
+  VERIFY(( !internal::functor_has_linear_access<internal::scalar_identity_op<double> >::ret ));
+
+  VERIFY(( !internal::has_nullary_operator<internal::linspaced_op<float,float> >::value ));
+  VERIFY((  internal::has_unary_operator<internal::linspaced_op<float,float> >::value ));
+  VERIFY(( !internal::has_binary_operator<internal::linspaced_op<float,float> >::value ));
+  VERIFY((  internal::functor_has_linear_access<internal::linspaced_op<float,float> >::ret ));
+
+  // Regression unit test for a weird MSVC bug.
+  // Search "nullary_wrapper_workaround_msvc" in CoreEvaluators.h for the details.
+  // See also traits<Ref>::match.
+  {
+    MatrixXf A = MatrixXf::Random(3,3);
+    Ref<const MatrixXf> R = 2.0*A;
+    VERIFY_IS_APPROX(R, A+A);
+
+    Ref<const MatrixXf> R1 = MatrixXf::Random(3,3)+A;
+
+    VectorXi V = VectorXi::Random(3);
+    Ref<const VectorXi> R2 = VectorXi::LinSpaced(3,1,3)+V;
+    VERIFY_IS_APPROX(R2, V+Vector3i(1,2,3));
+
+    VERIFY((  internal::has_nullary_operator<internal::scalar_constant_op<float> >::value ));
+    VERIFY(( !internal::has_unary_operator<internal::scalar_constant_op<float> >::value ));
+    VERIFY(( !internal::has_binary_operator<internal::scalar_constant_op<float> >::value ));
+    VERIFY((  internal::functor_has_linear_access<internal::scalar_constant_op<float> >::ret ));
+
+    VERIFY(( !internal::has_nullary_operator<internal::linspaced_op<int,int> >::value ));
+    VERIFY((  internal::has_unary_operator<internal::linspaced_op<int,int> >::value ));
+    VERIFY(( !internal::has_binary_operator<internal::linspaced_op<int,int> >::value ));
+    VERIFY((  internal::functor_has_linear_access<internal::linspaced_op<int,int> >::ret ));
   }
+#endif
 }
diff --git a/test/packetmath.cpp b/test/packetmath.cpp
index 38aa256ce..7821a1738 100644
--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp
@@ -9,16 +9,28 @@
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 #include "main.h"
+#include "unsupported/Eigen/SpecialFunctions"
 
+#if defined __GNUC__ && __GNUC__>=6
+  #pragma GCC diagnostic ignored "-Wignored-attributes"
+#endif
 // using namespace Eigen;
 
+#ifdef EIGEN_VECTORIZE_SSE
+const bool g_vectorize_sse = true;
+#else
+const bool g_vectorize_sse = false;
+#endif
+
 namespace Eigen {
 namespace internal {
 template<typename T> T negate(const T& x) { return -x; }
 }
 }
 
-template<typename Scalar> bool isApproxAbs(const Scalar& a, const Scalar& b, const typename NumTraits<Scalar>::Real& refvalue)
+// NOTE: we disbale inlining for this function to workaround a GCC issue when using -O3 and the i387 FPU.
+template<typename Scalar> EIGEN_DONT_INLINE
+bool isApproxAbs(const Scalar& a, const Scalar& b, const typename NumTraits<Scalar>::Real& refvalue)
 {
   return internal::isMuchSmallerThan(a-b, refvalue);
 }
@@ -29,7 +41,7 @@ template<typename Scalar> bool areApproxAbs(const Scalar* a, const Scalar* b, in
   {
     if (!isApproxAbs(a[i],b[i],refvalue))
     {
-      std::cout << "[" << Map<const Matrix<Scalar,1,Dynamic> >(a,size) << "]" << " != " << Map<const Matrix<Scalar,1,Dynamic> >(b,size) << "\n";
+      std::cout << "ref: [" << Map<const Matrix<Scalar,1,Dynamic> >(a,size) << "]" << " != vec: [" << Map<const Matrix<Scalar,1,Dynamic> >(b,size) << "]\n";
       return false;
     }
   }
@@ -42,21 +54,13 @@ template<typename Scalar> bool areApprox(const Scalar* a, const Scalar* b, int s
   {
     if (a[i]!=b[i] && !internal::isApprox(a[i],b[i]))
     {
-      std::cout << "[" << Map<const Matrix<Scalar,1,Dynamic> >(a,size) << "]" << " != " << Map<const Matrix<Scalar,1,Dynamic> >(b,size) << "\n";
+      std::cout << "ref: [" << Map<const Matrix<Scalar,1,Dynamic> >(a,size) << "]" << " != vec: [" << Map<const Matrix<Scalar,1,Dynamic> >(b,size) << "]\n";
       return false;
     }
   }
   return true;
 }
 
-
-#define CHECK_CWISE2(REFOP, POP) { \
-  for (int i=0; i<PacketSize; ++i) \
-    ref[i] = REFOP(data1[i], data1[i+PacketSize]); \
-  internal::pstore(data2, POP(internal::pload<Packet>(data1), internal::pload<Packet>(data1+PacketSize))); \
-  VERIFY(areApprox(ref, data2, PacketSize) && #POP); \
-}
-
 #define CHECK_CWISE1(REFOP, POP) { \
   for (int i=0; i<PacketSize; ++i) \
     ref[i] = REFOP(data1[i]); \
@@ -92,6 +96,14 @@ struct packet_helper<false,Packet>
   VERIFY(areApprox(ref, data2, PacketSize) && #POP); \
 }
 
+#define CHECK_CWISE2_IF(COND, REFOP, POP) if(COND) { \
+  packet_helper<COND,Packet> h; \
+  for (int i=0; i<PacketSize; ++i) \
+    ref[i] = REFOP(data1[i], data1[i+PacketSize]); \
+  h.store(data2, POP(h.load(data1),h.load(data1+PacketSize))); \
+  VERIFY(areApprox(ref, data2, PacketSize) && #POP); \
+}
+
 #define REF_ADD(a,b) ((a)+(b))
 #define REF_SUB(a,b) ((a)-(b))
 #define REF_MUL(a,b) ((a)*(b))
@@ -100,15 +112,17 @@ struct packet_helper<false,Packet>
 template<typename Scalar> void packetmath()
 {
   using std::abs;
-  typedef typename internal::packet_traits<Scalar>::type Packet;
-  const int PacketSize = internal::packet_traits<Scalar>::size;
+  typedef internal::packet_traits<Scalar> PacketTraits;
+  typedef typename PacketTraits::type Packet;
+  const int PacketSize = PacketTraits::size;
   typedef typename NumTraits<Scalar>::Real RealScalar;
 
-  const int size = PacketSize*4;
-  EIGEN_ALIGN16 Scalar data1[internal::packet_traits<Scalar>::size*4];
-  EIGEN_ALIGN16 Scalar data2[internal::packet_traits<Scalar>::size*4];
-  EIGEN_ALIGN16 Packet packets[PacketSize*2];
-  EIGEN_ALIGN16 Scalar ref[internal::packet_traits<Scalar>::size*4];
+  const int max_size = PacketSize > 4 ? PacketSize : 4;
+  const int size = PacketSize*max_size;
+  EIGEN_ALIGN_MAX Scalar data1[size];
+  EIGEN_ALIGN_MAX Scalar data2[size];
+  EIGEN_ALIGN_MAX Packet packets[PacketSize*2];
+  EIGEN_ALIGN_MAX Scalar ref[size];
   RealScalar refvalue = 0;
   for (int i=0; i<size; ++i)
   {
@@ -140,6 +154,18 @@ template<typename Scalar> void packetmath()
     else if (offset==1) internal::palign<1>(packets[0], packets[1]);
     else if (offset==2) internal::palign<2>(packets[0], packets[1]);
     else if (offset==3) internal::palign<3>(packets[0], packets[1]);
+    else if (offset==4) internal::palign<4>(packets[0], packets[1]);
+    else if (offset==5) internal::palign<5>(packets[0], packets[1]);
+    else if (offset==6) internal::palign<6>(packets[0], packets[1]);
+    else if (offset==7) internal::palign<7>(packets[0], packets[1]);
+    else if (offset==8) internal::palign<8>(packets[0], packets[1]);
+    else if (offset==9) internal::palign<9>(packets[0], packets[1]);
+    else if (offset==10) internal::palign<10>(packets[0], packets[1]);
+    else if (offset==11) internal::palign<11>(packets[0], packets[1]);
+    else if (offset==12) internal::palign<12>(packets[0], packets[1]);
+    else if (offset==13) internal::palign<13>(packets[0], packets[1]);
+    else if (offset==14) internal::palign<14>(packets[0], packets[1]);
+    else if (offset==15) internal::palign<15>(packets[0], packets[1]);
     internal::pstore(data2, packets[0]);
 
     for (int i=0; i<PacketSize; ++i)
@@ -148,13 +174,17 @@ template<typename Scalar> void packetmath()
     VERIFY(areApprox(ref, data2, PacketSize) && "internal::palign");
   }
 
-  CHECK_CWISE2(REF_ADD,  internal::padd);
-  CHECK_CWISE2(REF_SUB,  internal::psub);
-  CHECK_CWISE2(REF_MUL,  internal::pmul);
-  #ifndef EIGEN_VECTORIZE_ALTIVEC
-  if (!internal::is_same<Scalar,int>::value)
-    CHECK_CWISE2(REF_DIV,  internal::pdiv);
-  #endif
+  VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasAdd);
+  VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasSub);
+  VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasMul);
+  VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasNegate);
+  VERIFY((internal::is_same<Scalar,int>::value) || (!PacketTraits::Vectorizable) || PacketTraits::HasDiv);
+
+  CHECK_CWISE2_IF(PacketTraits::HasAdd, REF_ADD,  internal::padd);
+  CHECK_CWISE2_IF(PacketTraits::HasSub, REF_SUB,  internal::psub);
+  CHECK_CWISE2_IF(PacketTraits::HasMul, REF_MUL,  internal::pmul);
+  CHECK_CWISE2_IF(PacketTraits::HasDiv, REF_DIV, internal::pdiv);
+
   CHECK_CWISE1(internal::negate, internal::pnegate);
   CHECK_CWISE1(numext::conj, internal::pconj);
 
@@ -165,9 +195,31 @@ template<typename Scalar> void packetmath()
     internal::pstore(data2, internal::pset1<Packet>(data1[offset]));
     VERIFY(areApprox(ref, data2, PacketSize) && "internal::pset1");
   }
-  
+
+  {
+    for (int i=0; i<PacketSize*4; ++i)
+      ref[i] = data1[i/PacketSize];
+    Packet A0, A1, A2, A3;
+    internal::pbroadcast4<Packet>(data1, A0, A1, A2, A3);
+    internal::pstore(data2+0*PacketSize, A0);
+    internal::pstore(data2+1*PacketSize, A1);
+    internal::pstore(data2+2*PacketSize, A2);
+    internal::pstore(data2+3*PacketSize, A3);
+    VERIFY(areApprox(ref, data2, 4*PacketSize) && "internal::pbroadcast4");
+  }
+
+  {
+    for (int i=0; i<PacketSize*2; ++i)
+      ref[i] = data1[i/PacketSize];
+    Packet A0, A1;
+    internal::pbroadcast2<Packet>(data1, A0, A1);
+    internal::pstore(data2+0*PacketSize, A0);
+    internal::pstore(data2+1*PacketSize, A1);
+    VERIFY(areApprox(ref, data2, 2*PacketSize) && "internal::pbroadcast2");
+  }
+
   VERIFY(internal::isApprox(data1[0], internal::pfirst(internal::pload<Packet>(data1))) && "internal::pfirst");
-  
+
   if(PacketSize>1)
   {
     for(int offset=0;offset<4;++offset)
@@ -179,11 +231,31 @@ template<typename Scalar> void packetmath()
     }
   }
 
+  if(PacketSize>2)
+  {
+    for(int offset=0;offset<4;++offset)
+    {
+      for(int i=0;i<PacketSize/4;++i)
+        ref[4*i+0] = ref[4*i+1] = ref[4*i+2] = ref[4*i+3] = data1[offset+i];
+      internal::pstore(data2,internal::ploadquad<Packet>(data1+offset));
+      VERIFY(areApprox(ref, data2, PacketSize) && "ploadquad");
+    }
+  }
+
   ref[0] = 0;
   for (int i=0; i<PacketSize; ++i)
     ref[0] += data1[i];
   VERIFY(isApproxAbs(ref[0], internal::predux(internal::pload<Packet>(data1)), refvalue) && "internal::predux");
 
+  {
+    for (int i=0; i<4; ++i)
+      ref[i] = 0;
+    for (int i=0; i<PacketSize; ++i)
+      ref[i%4] += data1[i];
+    internal::pstore(data2, internal::predux_downto4(internal::pload<Packet>(data1)));
+    VERIFY(areApprox(ref, data2, PacketSize>4?PacketSize/2:PacketSize) && "internal::predux_downto4");
+  }
+
   ref[0] = 1;
   for (int i=0; i<PacketSize; ++i)
     ref[0] *= data1[i];
@@ -203,116 +275,258 @@ template<typename Scalar> void packetmath()
     ref[i] = data1[PacketSize-i-1];
   internal::pstore(data2, internal::preverse(internal::pload<Packet>(data1)));
   VERIFY(areApprox(ref, data2, PacketSize) && "internal::preverse");
+
+  internal::PacketBlock<Packet> kernel;
+  for (int i=0; i<PacketSize; ++i) {
+    kernel.packet[i] = internal::pload<Packet>(data1+i*PacketSize);
+  }
+  ptranspose(kernel);
+  for (int i=0; i<PacketSize; ++i) {
+    internal::pstore(data2, kernel.packet[i]);
+    for (int j = 0; j < PacketSize; ++j) {
+      VERIFY(isApproxAbs(data2[j], data1[i+j*PacketSize], refvalue) && "ptranspose");
+    }
+  }
+
+  if (PacketTraits::HasBlend) {
+    Packet thenPacket = internal::pload<Packet>(data1);
+    Packet elsePacket = internal::pload<Packet>(data2);
+    EIGEN_ALIGN_MAX internal::Selector<PacketSize> selector;
+    for (int i = 0; i < PacketSize; ++i) {
+      selector.select[i] = i;
+    }
+
+    Packet blend = internal::pblend(selector, thenPacket, elsePacket);
+    EIGEN_ALIGN_MAX Scalar result[size];
+    internal::pstore(result, blend);
+    for (int i = 0; i < PacketSize; ++i) {
+      VERIFY(isApproxAbs(result[i], (selector.select[i] ? data1[i] : data2[i]), refvalue));
+    }
+  }
+
+  if (PacketTraits::HasBlend || g_vectorize_sse) {
+    // pinsertfirst
+    for (int i=0; i<PacketSize; ++i)
+      ref[i] = data1[i];
+    Scalar s = internal::random<Scalar>();
+    ref[0] = s;
+    internal::pstore(data2, internal::pinsertfirst(internal::pload<Packet>(data1),s));
+    VERIFY(areApprox(ref, data2, PacketSize) && "internal::pinsertfirst");
+  }
+
+  if (PacketTraits::HasBlend || g_vectorize_sse) {
+    // pinsertlast
+    for (int i=0; i<PacketSize; ++i)
+      ref[i] = data1[i];
+    Scalar s = internal::random<Scalar>();
+    ref[PacketSize-1] = s;
+    internal::pstore(data2, internal::pinsertlast(internal::pload<Packet>(data1),s));
+    VERIFY(areApprox(ref, data2, PacketSize) && "internal::pinsertlast");
+  }
 }
 
 template<typename Scalar> void packetmath_real()
 {
   using std::abs;
-  typedef typename internal::packet_traits<Scalar>::type Packet;
-  const int PacketSize = internal::packet_traits<Scalar>::size;
+  typedef internal::packet_traits<Scalar> PacketTraits;
+  typedef typename PacketTraits::type Packet;
+  const int PacketSize = PacketTraits::size;
 
   const int size = PacketSize*4;
-  EIGEN_ALIGN16 Scalar data1[internal::packet_traits<Scalar>::size*4];
-  EIGEN_ALIGN16 Scalar data2[internal::packet_traits<Scalar>::size*4];
-  EIGEN_ALIGN16 Scalar ref[internal::packet_traits<Scalar>::size*4];
+  EIGEN_ALIGN_MAX Scalar data1[PacketTraits::size*4];
+  EIGEN_ALIGN_MAX Scalar data2[PacketTraits::size*4];
+  EIGEN_ALIGN_MAX Scalar ref[PacketTraits::size*4];
 
   for (int i=0; i<size; ++i)
   {
     data1[i] = internal::random<Scalar>(-1,1) * std::pow(Scalar(10), internal::random<Scalar>(-3,3));
     data2[i] = internal::random<Scalar>(-1,1) * std::pow(Scalar(10), internal::random<Scalar>(-3,3));
   }
-  CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasSin, std::sin, internal::psin);
-  CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasCos, std::cos, internal::pcos);
-  CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasTan, std::tan, internal::ptan);
-  
+  CHECK_CWISE1_IF(PacketTraits::HasSin, std::sin, internal::psin);
+  CHECK_CWISE1_IF(PacketTraits::HasCos, std::cos, internal::pcos);
+  CHECK_CWISE1_IF(PacketTraits::HasTan, std::tan, internal::ptan);
+
+  CHECK_CWISE1_IF(PacketTraits::HasRound, numext::round, internal::pround);
+  CHECK_CWISE1_IF(PacketTraits::HasCeil, numext::ceil, internal::pceil);
+  CHECK_CWISE1_IF(PacketTraits::HasFloor, numext::floor, internal::pfloor);
+
   for (int i=0; i<size; ++i)
   {
     data1[i] = internal::random<Scalar>(-1,1);
     data2[i] = internal::random<Scalar>(-1,1);
   }
-  CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasASin, std::asin, internal::pasin);
-  CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasACos, std::acos, internal::pacos);
+  CHECK_CWISE1_IF(PacketTraits::HasASin, std::asin, internal::pasin);
+  CHECK_CWISE1_IF(PacketTraits::HasACos, std::acos, internal::pacos);
 
   for (int i=0; i<size; ++i)
   {
     data1[i] = internal::random<Scalar>(-87,88);
     data2[i] = internal::random<Scalar>(-87,88);
   }
-  CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasExp, std::exp, internal::pexp);
+  CHECK_CWISE1_IF(PacketTraits::HasExp, std::exp, internal::pexp);
+  for (int i=0; i<size; ++i)
+  {
+    data1[i] = internal::random<Scalar>(-1,1) * std::pow(Scalar(10), internal::random<Scalar>(-6,6));
+    data2[i] = internal::random<Scalar>(-1,1) * std::pow(Scalar(10), internal::random<Scalar>(-6,6));
+  }
+  CHECK_CWISE1_IF(PacketTraits::HasTanh, std::tanh, internal::ptanh);
+  if(PacketTraits::HasExp && PacketTraits::size>=2)
+  {
+    data1[0] = std::numeric_limits<Scalar>::quiet_NaN();
+    data1[1] = std::numeric_limits<Scalar>::epsilon();
+    packet_helper<PacketTraits::HasExp,Packet> h;
+    h.store(data2, internal::pexp(h.load(data1)));
+    VERIFY((numext::isnan)(data2[0]));
+    VERIFY_IS_EQUAL(std::exp(std::numeric_limits<Scalar>::epsilon()), data2[1]);
+
+    data1[0] = -std::numeric_limits<Scalar>::epsilon();
+    data1[1] = 0;
+    h.store(data2, internal::pexp(h.load(data1)));
+    VERIFY_IS_EQUAL(std::exp(-std::numeric_limits<Scalar>::epsilon()), data2[0]);
+    VERIFY_IS_EQUAL(std::exp(Scalar(0)), data2[1]);
+
+    data1[0] = (std::numeric_limits<Scalar>::min)();
+    data1[1] = -(std::numeric_limits<Scalar>::min)();
+    h.store(data2, internal::pexp(h.load(data1)));
+    VERIFY_IS_EQUAL(std::exp((std::numeric_limits<Scalar>::min)()), data2[0]);
+    VERIFY_IS_EQUAL(std::exp(-(std::numeric_limits<Scalar>::min)()), data2[1]);
+
+    data1[0] = std::numeric_limits<Scalar>::denorm_min();
+    data1[1] = -std::numeric_limits<Scalar>::denorm_min();
+    h.store(data2, internal::pexp(h.load(data1)));
+    VERIFY_IS_EQUAL(std::exp(std::numeric_limits<Scalar>::denorm_min()), data2[0]);
+    VERIFY_IS_EQUAL(std::exp(-std::numeric_limits<Scalar>::denorm_min()), data2[1]);
+  }
+
+  if (PacketTraits::HasTanh) {
+    // NOTE this test migh fail with GCC prior to 6.3, see MathFunctionsImpl.h for details.
+    data1[0] = std::numeric_limits<Scalar>::quiet_NaN();
+    packet_helper<internal::packet_traits<Scalar>::HasTanh,Packet> h;
+    h.store(data2, internal::ptanh(h.load(data1)));
+    VERIFY((numext::isnan)(data2[0]));
+  }
+
+#if EIGEN_HAS_C99_MATH
+  {
+    data1[0] = std::numeric_limits<Scalar>::quiet_NaN();
+    packet_helper<internal::packet_traits<Scalar>::HasLGamma,Packet> h;
+    h.store(data2, internal::plgamma(h.load(data1)));
+    VERIFY((numext::isnan)(data2[0]));
+  }
   {
     data1[0] = std::numeric_limits<Scalar>::quiet_NaN();
-    packet_helper<internal::packet_traits<Scalar>::HasExp,Packet> h;
-    h.store(data2, internal::pexp(h.load(data1))); 
-    VERIFY(isNaN(data2[0]));
+    packet_helper<internal::packet_traits<Scalar>::HasErf,Packet> h;
+    h.store(data2, internal::perf(h.load(data1)));
+    VERIFY((numext::isnan)(data2[0]));
   }
+  {
+    data1[0] = std::numeric_limits<Scalar>::quiet_NaN();
+    packet_helper<internal::packet_traits<Scalar>::HasErfc,Packet> h;
+    h.store(data2, internal::perfc(h.load(data1)));
+    VERIFY((numext::isnan)(data2[0]));
+  }
+#endif  // EIGEN_HAS_C99_MATH
 
   for (int i=0; i<size; ++i)
   {
     data1[i] = internal::random<Scalar>(0,1) * std::pow(Scalar(10), internal::random<Scalar>(-6,6));
     data2[i] = internal::random<Scalar>(0,1) * std::pow(Scalar(10), internal::random<Scalar>(-6,6));
   }
-  if(internal::random<float>(0,1)<0.1)
+
+  if(internal::random<float>(0,1)<0.1f)
     data1[internal::random<int>(0, PacketSize)] = 0;
-  CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasSqrt, std::sqrt, internal::psqrt);
-  CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasLog, std::log, internal::plog);
+  CHECK_CWISE1_IF(PacketTraits::HasSqrt, std::sqrt, internal::psqrt);
+  CHECK_CWISE1_IF(PacketTraits::HasLog, std::log, internal::plog);
+#if EIGEN_HAS_C99_MATH && (__cplusplus > 199711L)
+  CHECK_CWISE1_IF(PacketTraits::HasLog1p, std::log1p, internal::plog1p);
+  CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasLGamma, std::lgamma, internal::plgamma);
+  CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasErf, std::erf, internal::perf);
+  CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasErfc, std::erfc, internal::perfc);
+#endif
+
+  if(PacketTraits::HasLog && PacketTraits::size>=2)
   {
     data1[0] = std::numeric_limits<Scalar>::quiet_NaN();
-    packet_helper<internal::packet_traits<Scalar>::HasLog,Packet> h;
+    data1[1] = std::numeric_limits<Scalar>::epsilon();
+    packet_helper<PacketTraits::HasLog,Packet> h;
     h.store(data2, internal::plog(h.load(data1)));
-    VERIFY(isNaN(data2[0]));
-    data1[0] = -1.0f;
+    VERIFY((numext::isnan)(data2[0]));
+    VERIFY_IS_EQUAL(std::log(std::numeric_limits<Scalar>::epsilon()), data2[1]);
+
+    data1[0] = -std::numeric_limits<Scalar>::epsilon();
+    data1[1] = 0;
+    h.store(data2, internal::plog(h.load(data1)));
+    VERIFY((numext::isnan)(data2[0]));
+    VERIFY_IS_EQUAL(std::log(Scalar(0)), data2[1]);
+
+    data1[0] = (std::numeric_limits<Scalar>::min)();
+    data1[1] = -(std::numeric_limits<Scalar>::min)();
+    h.store(data2, internal::plog(h.load(data1)));
+    VERIFY_IS_EQUAL(std::log((std::numeric_limits<Scalar>::min)()), data2[0]);
+    VERIFY((numext::isnan)(data2[1]));
+
+    data1[0] = std::numeric_limits<Scalar>::denorm_min();
+    data1[1] = -std::numeric_limits<Scalar>::denorm_min();
+    h.store(data2, internal::plog(h.load(data1)));
+    // VERIFY_IS_EQUAL(std::log(std::numeric_limits<Scalar>::denorm_min()), data2[0]);
+    VERIFY((numext::isnan)(data2[1]));
+
+    data1[0] = Scalar(-1.0f);
     h.store(data2, internal::plog(h.load(data1)));
-    VERIFY(isNaN(data2[0]));
-#if !EIGEN_FAST_MATH
+    VERIFY((numext::isnan)(data2[0]));
     h.store(data2, internal::psqrt(h.load(data1)));
-    VERIFY(isNaN(data2[0]));
-    VERIFY(isNaN(data2[1]));
-#endif
+    VERIFY((numext::isnan)(data2[0]));
+    VERIFY((numext::isnan)(data2[1]));
   }
 }
 
 template<typename Scalar> void packetmath_notcomplex()
 {
   using std::abs;
-  typedef typename internal::packet_traits<Scalar>::type Packet;
-  const int PacketSize = internal::packet_traits<Scalar>::size;
+  typedef internal::packet_traits<Scalar> PacketTraits;
+  typedef typename PacketTraits::type Packet;
+  const int PacketSize = PacketTraits::size;
+
+  EIGEN_ALIGN_MAX Scalar data1[PacketTraits::size*4];
+  EIGEN_ALIGN_MAX Scalar data2[PacketTraits::size*4];
+  EIGEN_ALIGN_MAX Scalar ref[PacketTraits::size*4];
 
-  EIGEN_ALIGN16 Scalar data1[internal::packet_traits<Scalar>::size*4];
-  EIGEN_ALIGN16 Scalar data2[internal::packet_traits<Scalar>::size*4];
-  EIGEN_ALIGN16 Scalar ref[internal::packet_traits<Scalar>::size*4];
-  
-  Array<Scalar,Dynamic,1>::Map(data1, internal::packet_traits<Scalar>::size*4).setRandom();
+  Array<Scalar,Dynamic,1>::Map(data1, PacketTraits::size*4).setRandom();
 
   ref[0] = data1[0];
   for (int i=0; i<PacketSize; ++i)
     ref[0] = (std::min)(ref[0],data1[i]);
   VERIFY(internal::isApprox(ref[0], internal::predux_min(internal::pload<Packet>(data1))) && "internal::predux_min");
 
-  CHECK_CWISE2((std::min), internal::pmin);
-  CHECK_CWISE2((std::max), internal::pmax);
+  VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasMin);
+  VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasMax);
+
+  CHECK_CWISE2_IF(PacketTraits::HasMin, (std::min), internal::pmin);
+  CHECK_CWISE2_IF(PacketTraits::HasMax, (std::max), internal::pmax);
   CHECK_CWISE1(abs, internal::pabs);
 
   ref[0] = data1[0];
   for (int i=0; i<PacketSize; ++i)
     ref[0] = (std::max)(ref[0],data1[i]);
   VERIFY(internal::isApprox(ref[0], internal::predux_max(internal::pload<Packet>(data1))) && "internal::predux_max");
-  
+
   for (int i=0; i<PacketSize; ++i)
     ref[i] = data1[0]+Scalar(i);
-  internal::pstore(data2, internal::plset(data1[0]));
+  internal::pstore(data2, internal::plset<Packet>(data1[0]));
   VERIFY(areApprox(ref, data2, PacketSize) && "internal::plset");
 }
 
 template<typename Scalar,bool ConjLhs,bool ConjRhs> void test_conj_helper(Scalar* data1, Scalar* data2, Scalar* ref, Scalar* pval)
 {
-  typedef typename internal::packet_traits<Scalar>::type Packet;
-  const int PacketSize = internal::packet_traits<Scalar>::size;
-  
+  typedef internal::packet_traits<Scalar> PacketTraits;
+  typedef typename PacketTraits::type Packet;
+  const int PacketSize = PacketTraits::size;
+
   internal::conj_if<ConjLhs> cj0;
   internal::conj_if<ConjRhs> cj1;
   internal::conj_helper<Scalar,Scalar,ConjLhs,ConjRhs> cj;
   internal::conj_helper<Packet,Packet,ConjLhs,ConjRhs> pcj;
-  
+
   for(int i=0;i<PacketSize;++i)
   {
     ref[i] = cj0(data1[i]) * cj1(data2[i]);
@@ -320,7 +534,7 @@ template<typename Scalar,bool ConjLhs,bool ConjRhs> void test_conj_helper(Scalar
   }
   internal::pstore(pval,pcj.pmul(internal::pload<Packet>(data1),internal::pload<Packet>(data2)));
   VERIFY(areApprox(ref, pval, PacketSize) && "conj_helper pmul");
-  
+
   for(int i=0;i<PacketSize;++i)
   {
     Scalar tmp = ref[i];
@@ -333,34 +547,70 @@ template<typename Scalar,bool ConjLhs,bool ConjRhs> void test_conj_helper(Scalar
 
 template<typename Scalar> void packetmath_complex()
 {
-  typedef typename internal::packet_traits<Scalar>::type Packet;
-  const int PacketSize = internal::packet_traits<Scalar>::size;
+  typedef internal::packet_traits<Scalar> PacketTraits;
+  typedef typename PacketTraits::type Packet;
+  const int PacketSize = PacketTraits::size;
 
   const int size = PacketSize*4;
-  EIGEN_ALIGN16 Scalar data1[PacketSize*4];
-  EIGEN_ALIGN16 Scalar data2[PacketSize*4];
-  EIGEN_ALIGN16 Scalar ref[PacketSize*4];
-  EIGEN_ALIGN16 Scalar pval[PacketSize*4];
+  EIGEN_ALIGN_MAX Scalar data1[PacketSize*4];
+  EIGEN_ALIGN_MAX Scalar data2[PacketSize*4];
+  EIGEN_ALIGN_MAX Scalar ref[PacketSize*4];
+  EIGEN_ALIGN_MAX Scalar pval[PacketSize*4];
 
   for (int i=0; i<size; ++i)
   {
     data1[i] = internal::random<Scalar>() * Scalar(1e2);
     data2[i] = internal::random<Scalar>() * Scalar(1e2);
   }
-  
+
   test_conj_helper<Scalar,false,false> (data1,data2,ref,pval);
   test_conj_helper<Scalar,false,true>  (data1,data2,ref,pval);
   test_conj_helper<Scalar,true,false>  (data1,data2,ref,pval);
   test_conj_helper<Scalar,true,true>   (data1,data2,ref,pval);
-  
+
   {
     for(int i=0;i<PacketSize;++i)
       ref[i] = Scalar(std::imag(data1[i]),std::real(data1[i]));
     internal::pstore(pval,internal::pcplxflip(internal::pload<Packet>(data1)));
     VERIFY(areApprox(ref, pval, PacketSize) && "pcplxflip");
   }
-  
-  
+}
+
+template<typename Scalar> void packetmath_scatter_gather()
+{
+  typedef internal::packet_traits<Scalar> PacketTraits;
+  typedef typename PacketTraits::type Packet;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  const int PacketSize = PacketTraits::size;
+  EIGEN_ALIGN_MAX Scalar data1[PacketSize];
+  RealScalar refvalue = 0;
+  for (int i=0; i<PacketSize; ++i) {
+    data1[i] = internal::random<Scalar>()/RealScalar(PacketSize);
+  }
+
+  int stride = internal::random<int>(1,20);
+
+  EIGEN_ALIGN_MAX Scalar buffer[PacketSize*20];
+  memset(buffer, 0, 20*PacketSize*sizeof(Scalar));
+  Packet packet = internal::pload<Packet>(data1);
+  internal::pscatter<Scalar, Packet>(buffer, packet, stride);
+
+  for (int i = 0; i < PacketSize*20; ++i) {
+    if ((i%stride) == 0 && i<stride*PacketSize) {
+      VERIFY(isApproxAbs(buffer[i], data1[i/stride], refvalue) && "pscatter");
+    } else {
+      VERIFY(isApproxAbs(buffer[i], Scalar(0), refvalue) && "pscatter");
+    }
+  }
+
+  for (int i=0; i<PacketSize*7; ++i) {
+    buffer[i] = internal::random<Scalar>()/RealScalar(PacketSize);
+  }
+  packet = internal::pgather<Scalar, Packet>(buffer, 7);
+  internal::pstore(data1, packet);
+  for (int i = 0; i < PacketSize; ++i) {
+    VERIFY(isApproxAbs(data1[i], buffer[i*7], refvalue) && "pgather");
+  }
 }
 
 void test_packetmath()
@@ -369,17 +619,23 @@ void test_packetmath()
     CALL_SUBTEST_1( packetmath<float>() );
     CALL_SUBTEST_2( packetmath<double>() );
     CALL_SUBTEST_3( packetmath<int>() );
-    CALL_SUBTEST_1( packetmath<std::complex<float> >() );
-    CALL_SUBTEST_2( packetmath<std::complex<double> >() );
+    CALL_SUBTEST_4( packetmath<std::complex<float> >() );
+    CALL_SUBTEST_5( packetmath<std::complex<double> >() );
 
     CALL_SUBTEST_1( packetmath_notcomplex<float>() );
     CALL_SUBTEST_2( packetmath_notcomplex<double>() );
     CALL_SUBTEST_3( packetmath_notcomplex<int>() );
-    
+
     CALL_SUBTEST_1( packetmath_real<float>() );
     CALL_SUBTEST_2( packetmath_real<double>() );
 
-    CALL_SUBTEST_1( packetmath_complex<std::complex<float> >() );
-    CALL_SUBTEST_2( packetmath_complex<std::complex<double> >() );
+    CALL_SUBTEST_4( packetmath_complex<std::complex<float> >() );
+    CALL_SUBTEST_5( packetmath_complex<std::complex<double> >() );
+
+    CALL_SUBTEST_1( packetmath_scatter_gather<float>() );
+    CALL_SUBTEST_2( packetmath_scatter_gather<double>() );
+    CALL_SUBTEST_3( packetmath_scatter_gather<int>() );
+    CALL_SUBTEST_4( packetmath_scatter_gather<std::complex<float> >() );
+    CALL_SUBTEST_5( packetmath_scatter_gather<std::complex<double> >() );
   }
 }
diff --git a/test/pastix_support.cpp b/test/pastix_support.cpp
index 14da0944b..b62f85739 100644
--- a/test/pastix_support.cpp
+++ b/test/pastix_support.cpp
@@ -7,6 +7,8 @@
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS
 #include "sparse_solver.h"
 #include <Eigen/PaStiXSupport>
 #include <unsupported/Eigen/SparseExtra>
@@ -25,6 +27,14 @@ template<typename T> void test_pastix_T()
   check_sparse_spd_solving(pastix_llt_upper);
   check_sparse_spd_solving(pastix_ldlt_upper);
   check_sparse_square_solving(pastix_lu);
+
+  // Some compilation check:
+  pastix_llt_lower.iparm();
+  pastix_llt_lower.dparm();
+  pastix_ldlt_lower.iparm();
+  pastix_ldlt_lower.dparm();
+  pastix_lu.iparm();
+  pastix_lu.dparm();
 }
 
 // There is no support for selfadjoint matrices with PaStiX. 
diff --git a/test/permutationmatrices.cpp b/test/permutationmatrices.cpp
index 7b0dbc763..db1266579 100644
--- a/test/permutationmatrices.cpp
+++ b/test/permutationmatrices.cpp
@@ -7,6 +7,8 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+#define TEST_ENABLE_TEMPORARY_TRACKING
+  
 #include "main.h"
 
 using namespace std;
@@ -33,7 +35,9 @@ template<typename MatrixType> void permutationmatrices(const MatrixType& m)
   RightPermutationVectorType rv;
   randomPermutationVector(rv, cols);
   RightPermutationType rp(rv);
-  MatrixType m_permuted = lp * m_original * rp;
+  MatrixType m_permuted = MatrixType::Random(rows,cols);
+  
+  VERIFY_EVALUATION_COUNT(m_permuted = lp * m_original * rp, 1); // 1 temp for sub expression "lp * m_original"
 
   for (int i=0; i<rows; i++)
     for (int j=0; j<cols; j++)
@@ -43,7 +47,11 @@ template<typename MatrixType> void permutationmatrices(const MatrixType& m)
   Matrix<Scalar,Cols,Cols> rm(rp);
 
   VERIFY_IS_APPROX(m_permuted, lm*m_original*rm);
-
+  
+  m_permuted = m_original;
+  VERIFY_EVALUATION_COUNT(m_permuted = lp * m_permuted * rp, 1);
+  VERIFY_IS_APPROX(m_permuted, lm*m_original*rm);
+  
   VERIFY_IS_APPROX(lp.inverse()*m_permuted*rp.inverse(), m_original);
   VERIFY_IS_APPROX(lv.asPermutation().inverse()*m_permuted*rv.asPermutation().inverse(), m_original);
   VERIFY_IS_APPROX(MapLeftPerm(lv.data(),lv.size()).inverse()*m_permuted*MapRightPerm(rv.data(),rv.size()).inverse(), m_original);
@@ -63,22 +71,22 @@ template<typename MatrixType> void permutationmatrices(const MatrixType& m)
   LeftPermutationType identityp;
   identityp.setIdentity(rows);
   VERIFY_IS_APPROX(m_original, identityp*m_original);
-
+  
   // check inplace permutations
   m_permuted = m_original;
-  m_permuted = lp.inverse() * m_permuted;
+  VERIFY_EVALUATION_COUNT(m_permuted.noalias()= lp.inverse() * m_permuted, 1); // 1 temp to allocate the mask
   VERIFY_IS_APPROX(m_permuted, lp.inverse()*m_original);
-
+  
   m_permuted = m_original;
-  m_permuted = m_permuted * rp.inverse();
+  VERIFY_EVALUATION_COUNT(m_permuted.noalias() = m_permuted * rp.inverse(), 1); // 1 temp to allocate the mask
   VERIFY_IS_APPROX(m_permuted, m_original*rp.inverse());
-
+  
   m_permuted = m_original;
-  m_permuted = lp * m_permuted;
+  VERIFY_EVALUATION_COUNT(m_permuted.noalias() = lp * m_permuted, 1); // 1 temp to allocate the mask
   VERIFY_IS_APPROX(m_permuted, lp*m_original);
-
+  
   m_permuted = m_original;
-  m_permuted = m_permuted * rp;
+  VERIFY_EVALUATION_COUNT(m_permuted.noalias() = m_permuted * rp, 1); // 1 temp to allocate the mask
   VERIFY_IS_APPROX(m_permuted, m_original*rp);
 
   if(rows>1 && cols>1)
@@ -99,7 +107,38 @@ template<typename MatrixType> void permutationmatrices(const MatrixType& m)
     rm = rp;
     rm.col(i).swap(rm.col(j));
     VERIFY_IS_APPROX(rm, rp2.toDenseMatrix().template cast<Scalar>());
-  }  
+  }
+
+  {
+    // simple compilation check
+    Matrix<Scalar, Cols, Cols> A = rp;
+    Matrix<Scalar, Cols, Cols> B = rp.transpose();
+    VERIFY_IS_APPROX(A, B.transpose());
+  }
+}
+
+template<typename T>
+void bug890()
+{
+  typedef Matrix<T, Dynamic, Dynamic> MatrixType;
+  typedef Matrix<T, Dynamic, 1> VectorType;
+  typedef Stride<Dynamic,Dynamic> S;
+  typedef Map<MatrixType, Aligned, S> MapType;
+  typedef PermutationMatrix<Dynamic> Perm;
+  
+  VectorType v1(2), v2(2), op(4), rhs(2);
+  v1 << 666,667;
+  op << 1,0,0,1;
+  rhs << 42,42;
+  
+  Perm P(2);
+  P.indices() << 1, 0;
+
+  MapType(v1.data(),2,1,S(1,1)) = P * MapType(rhs.data(),2,1,S(1,1));
+  VERIFY_IS_APPROX(v1, (P * rhs).eval());
+  
+  MapType(v1.data(),2,1,S(1,1)) = P.inverse() * MapType(rhs.data(),2,1,S(1,1));
+  VERIFY_IS_APPROX(v1, (P.inverse() * rhs).eval());
 }
 
 void test_permutationmatrices()
@@ -113,4 +152,5 @@ void test_permutationmatrices()
     CALL_SUBTEST_6( permutationmatrices(Matrix<double,Dynamic,Dynamic,RowMajor>(20, 30)) );
     CALL_SUBTEST_7( permutationmatrices(MatrixXcf(15, 10)) );
   }
+  CALL_SUBTEST_5( bug890<double>() );
 }
diff --git a/test/prec_inverse_4x4.cpp b/test/prec_inverse_4x4.cpp
index c4ef2d4bd..eb6ad18c9 100644
--- a/test/prec_inverse_4x4.cpp
+++ b/test/prec_inverse_4x4.cpp
@@ -53,14 +53,29 @@ template<typename MatrixType> void inverse_general_4x4(int repeat)
    // FIXME that 1.25 used to be 1.2 until we tested gcc 4.1 on 30 June 2010 and got 1.21.
   VERIFY(error_avg < (NumTraits<Scalar>::IsComplex ? 8.0 : 1.25));
   VERIFY(error_max < (NumTraits<Scalar>::IsComplex ? 64.0 : 20.0));
+
+  {
+    int s = 5;//internal::random<int>(4,10);
+    int i = 0;//internal::random<int>(0,s-4);
+    int j = 0;//internal::random<int>(0,s-4);
+    Matrix<Scalar,5,5> mat(s,s);
+    mat.setRandom();
+    MatrixType submat = mat.template block<4,4>(i,j);
+    MatrixType mat_inv = mat.template block<4,4>(i,j).inverse();
+    VERIFY_IS_APPROX(mat_inv, submat.inverse());
+    mat.template block<4,4>(i,j) = submat.inverse();
+    VERIFY_IS_APPROX(mat_inv, (mat.template block<4,4>(i,j)));
+  }
 }
 
 void test_prec_inverse_4x4()
 {
   CALL_SUBTEST_1((inverse_permutation_4x4<Matrix4f>()));
   CALL_SUBTEST_1(( inverse_general_4x4<Matrix4f>(200000 * g_repeat) ));
+  CALL_SUBTEST_1(( inverse_general_4x4<Matrix<float,4,4,RowMajor> >(200000 * g_repeat) ));
 
   CALL_SUBTEST_2((inverse_permutation_4x4<Matrix<double,4,4,RowMajor> >()));
+  CALL_SUBTEST_2(( inverse_general_4x4<Matrix<double,4,4,ColMajor> >(200000 * g_repeat) ));
   CALL_SUBTEST_2(( inverse_general_4x4<Matrix<double,4,4,RowMajor> >(200000 * g_repeat) ));
 
   CALL_SUBTEST_3((inverse_permutation_4x4<Matrix4cf>()));
diff --git a/test/product.h b/test/product.h
index 0b3abe402..3b6511270 100644
--- a/test/product.h
+++ b/test/product.h
@@ -22,7 +22,6 @@ template<typename MatrixType> void product(const MatrixType& m)
   /* this test covers the following files:
      Identity.h Product.h
   */
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::Scalar Scalar;
   typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> RowVectorType;
   typedef Matrix<Scalar, MatrixType::ColsAtCompileTime, 1> ColVectorType;
@@ -112,6 +111,23 @@ template<typename MatrixType> void product(const MatrixType& m)
   vcres.noalias() -= m1.transpose() * v1;
   VERIFY_IS_APPROX(vcres, vc2 - m1.transpose() * v1);
 
+  // test d ?= a+b*c rules
+  res.noalias() = square + m1 * m2.transpose();
+  VERIFY_IS_APPROX(res, square + m1 * m2.transpose());
+  res.noalias() += square + m1 * m2.transpose();
+  VERIFY_IS_APPROX(res, 2*(square + m1 * m2.transpose()));
+  res.noalias() -= square + m1 * m2.transpose();
+  VERIFY_IS_APPROX(res, square + m1 * m2.transpose());
+
+  // test d ?= a-b*c rules
+  res.noalias() = square - m1 * m2.transpose();
+  VERIFY_IS_APPROX(res, square - m1 * m2.transpose());
+  res.noalias() += square - m1 * m2.transpose();
+  VERIFY_IS_APPROX(res, 2*(square - m1 * m2.transpose()));
+  res.noalias() -= square - m1 * m2.transpose();
+  VERIFY_IS_APPROX(res, square - m1 * m2.transpose());
+
+
   tm1 = m1;
   VERIFY_IS_APPROX(tm1.transpose() * v1, m1.transpose() * v1);
   VERIFY_IS_APPROX(v1.transpose() * tm1, v1.transpose() * m1);
@@ -136,15 +152,80 @@ template<typename MatrixType> void product(const MatrixType& m)
   VERIFY_IS_APPROX(res.col(r).noalias() = square.adjoint() * square.col(r), (square.adjoint() * square.col(r)).eval());
   VERIFY_IS_APPROX(res.col(r).noalias() = square * square.col(r), (square * square.col(r)).eval());
 
+  // vector at runtime (see bug 1166)
+  {
+    RowSquareMatrixType ref(square);
+    ColSquareMatrixType ref2(square2);
+    ref = res = square;
+    VERIFY_IS_APPROX(res.block(0,0,1,rows).noalias() = m1.col(0).transpose() * square.transpose(),            (ref.row(0) = m1.col(0).transpose() * square.transpose()));
+    VERIFY_IS_APPROX(res.block(0,0,1,rows).noalias() = m1.block(0,0,rows,1).transpose() * square.transpose(), (ref.row(0) = m1.col(0).transpose() * square.transpose()));
+    VERIFY_IS_APPROX(res.block(0,0,1,rows).noalias() = m1.col(0).transpose() * square,                        (ref.row(0) = m1.col(0).transpose() * square));
+    VERIFY_IS_APPROX(res.block(0,0,1,rows).noalias() = m1.block(0,0,rows,1).transpose() * square,             (ref.row(0) = m1.col(0).transpose() * square));
+    ref2 = res2 = square2;
+    VERIFY_IS_APPROX(res2.block(0,0,1,cols).noalias() = m1.row(0) * square2.transpose(),                      (ref2.row(0) = m1.row(0) * square2.transpose()));
+    VERIFY_IS_APPROX(res2.block(0,0,1,cols).noalias() = m1.block(0,0,1,cols) * square2.transpose(),           (ref2.row(0) = m1.row(0) * square2.transpose()));
+    VERIFY_IS_APPROX(res2.block(0,0,1,cols).noalias() = m1.row(0) * square2,                                  (ref2.row(0) = m1.row(0) * square2));
+    VERIFY_IS_APPROX(res2.block(0,0,1,cols).noalias() = m1.block(0,0,1,cols) * square2,                       (ref2.row(0) = m1.row(0) * square2));
+  }
+
+  // vector.block() (see bug 1283)
+  {
+    RowVectorType w1(rows);
+    VERIFY_IS_APPROX(square * v1.block(0,0,rows,1), square * v1);
+    VERIFY_IS_APPROX(w1.noalias() = square * v1.block(0,0,rows,1), square * v1);
+    VERIFY_IS_APPROX(w1.block(0,0,rows,1).noalias() = square * v1.block(0,0,rows,1), square * v1);
+
+    Matrix<Scalar,1,MatrixType::ColsAtCompileTime> w2(cols);
+    VERIFY_IS_APPROX(vc2.block(0,0,cols,1).transpose() * square2, vc2.transpose() * square2);
+    VERIFY_IS_APPROX(w2.noalias() = vc2.block(0,0,cols,1).transpose() * square2, vc2.transpose() * square2);
+    VERIFY_IS_APPROX(w2.block(0,0,1,cols).noalias() = vc2.block(0,0,cols,1).transpose() * square2, vc2.transpose() * square2);
+
+    vc2 = square2.block(0,0,1,cols).transpose();
+    VERIFY_IS_APPROX(square2.block(0,0,1,cols) * square2, vc2.transpose() * square2);
+    VERIFY_IS_APPROX(w2.noalias() = square2.block(0,0,1,cols) * square2, vc2.transpose() * square2);
+    VERIFY_IS_APPROX(w2.block(0,0,1,cols).noalias() = square2.block(0,0,1,cols) * square2, vc2.transpose() * square2);
+
+    vc2 = square2.block(0,0,cols,1);
+    VERIFY_IS_APPROX(square2.block(0,0,cols,1).transpose() * square2, vc2.transpose() * square2);
+    VERIFY_IS_APPROX(w2.noalias() = square2.block(0,0,cols,1).transpose() * square2, vc2.transpose() * square2);
+    VERIFY_IS_APPROX(w2.block(0,0,1,cols).noalias() = square2.block(0,0,cols,1).transpose() * square2, vc2.transpose() * square2);
+  }
+
   // inner product
-  Scalar x = square2.row(c) * square2.col(c2);
-  VERIFY_IS_APPROX(x, square2.row(c).transpose().cwiseProduct(square2.col(c2)).sum());
-  
+  {
+    Scalar x = square2.row(c) * square2.col(c2);
+    VERIFY_IS_APPROX(x, square2.row(c).transpose().cwiseProduct(square2.col(c2)).sum());
+  }
+
   // outer product
-  VERIFY_IS_APPROX(m1.col(c) * m1.row(r), m1.block(0,c,rows,1) * m1.block(r,0,1,cols));
-  VERIFY_IS_APPROX(m1.row(r).transpose() * m1.col(c).transpose(), m1.block(r,0,1,cols).transpose() * m1.block(0,c,rows,1).transpose());
-  VERIFY_IS_APPROX(m1.block(0,c,rows,1) * m1.row(r), m1.block(0,c,rows,1) * m1.block(r,0,1,cols));
-  VERIFY_IS_APPROX(m1.col(c) * m1.block(r,0,1,cols), m1.block(0,c,rows,1) * m1.block(r,0,1,cols));
-  VERIFY_IS_APPROX(m1.leftCols(1) * m1.row(r), m1.block(0,0,rows,1) * m1.block(r,0,1,cols));
-  VERIFY_IS_APPROX(m1.col(c) * m1.topRows(1), m1.block(0,c,rows,1) * m1.block(0,0,1,cols));  
+  {
+    VERIFY_IS_APPROX(m1.col(c) * m1.row(r), m1.block(0,c,rows,1) * m1.block(r,0,1,cols));
+    VERIFY_IS_APPROX(m1.row(r).transpose() * m1.col(c).transpose(), m1.block(r,0,1,cols).transpose() * m1.block(0,c,rows,1).transpose());
+    VERIFY_IS_APPROX(m1.block(0,c,rows,1) * m1.row(r), m1.block(0,c,rows,1) * m1.block(r,0,1,cols));
+    VERIFY_IS_APPROX(m1.col(c) * m1.block(r,0,1,cols), m1.block(0,c,rows,1) * m1.block(r,0,1,cols));
+    VERIFY_IS_APPROX(m1.leftCols(1) * m1.row(r), m1.block(0,0,rows,1) * m1.block(r,0,1,cols));
+    VERIFY_IS_APPROX(m1.col(c) * m1.topRows(1), m1.block(0,c,rows,1) * m1.block(0,0,1,cols));
+  }
+
+  // Aliasing
+  {
+    ColVectorType x(cols); x.setRandom();
+    ColVectorType z(x);
+    ColVectorType y(cols); y.setZero();
+    ColSquareMatrixType A(cols,cols); A.setRandom();
+    // CwiseBinaryOp
+    VERIFY_IS_APPROX(x = y + A*x, A*z);
+    x = z;
+    // CwiseUnaryOp
+    VERIFY_IS_APPROX(x = Scalar(1.)*(A*x), A*z);
+  }
+
+  // regression for blas_trais
+  {
+    VERIFY_IS_APPROX(square * (square*square).transpose(), square * square.transpose() * square.transpose());
+    VERIFY_IS_APPROX(square * (-(square*square)), -square * square * square);
+    VERIFY_IS_APPROX(square * (s1*(square*square)), s1 * square * square * square);
+    VERIFY_IS_APPROX(square * (square*square).conjugate(), square * square.conjugate() * square.conjugate());
+  }
+
 }
diff --git a/test/product_extra.cpp b/test/product_extra.cpp
index ea2486937..e2b855bff 100644
--- a/test/product_extra.cpp
+++ b/test/product_extra.cpp
@@ -98,6 +98,16 @@ template<typename MatrixType> void product_extra(const MatrixType& m)
   // regression test
   MatrixType tmp = m1 * m1.adjoint() * s1;
   VERIFY_IS_APPROX(tmp, m1 * m1.adjoint() * s1);
+
+  // regression test for bug 1343, assignment to arrays
+  Array<Scalar,Dynamic,1> a1 = m1 * vc2;
+  VERIFY_IS_APPROX(a1.matrix(),m1*vc2);
+  Array<Scalar,Dynamic,1> a2 = s1 * (m1 * vc2);
+  VERIFY_IS_APPROX(a2.matrix(),s1*m1*vc2);
+  Array<Scalar,1,Dynamic> a3 = v1 * m1;
+  VERIFY_IS_APPROX(a3.matrix(),v1*m1);
+  Array<Scalar,Dynamic,Dynamic> a4 = m1 * m2.adjoint();
+  VERIFY_IS_APPROX(a4.matrix(),m1*m2.adjoint());
 }
 
 // Regression test for bug reported at http://forum.kde.org/viewtopic.php?f=74&t=96947
@@ -116,8 +126,8 @@ void zero_sized_objects(const MatrixType& m)
   typedef typename MatrixType::Scalar Scalar;
   const int PacketSize  = internal::packet_traits<Scalar>::size;
   const int PacketSize1 = PacketSize>1 ?  PacketSize-1 : 1;
-  DenseIndex rows = m.rows();
-  DenseIndex cols = m.cols();
+  Index rows = m.rows();
+  Index cols = m.cols();
   
   {
     MatrixType res, a(rows,0), b(0,cols);
@@ -169,6 +179,7 @@ void zero_sized_objects(const MatrixType& m)
   }
 }
 
+template<int>
 void bug_127()
 {
   // Bug 127
@@ -193,6 +204,16 @@ void bug_127()
   a*b;
 }
 
+template<int> void bug_817()
+{
+  ArrayXXf B = ArrayXXf::Random(10,10), C;
+  VectorXf x = VectorXf::Random(10);
+  C = (x.transpose()*B.matrix());
+  B = (x.transpose()*B.matrix());
+  VERIFY_IS_APPROX(B,C);
+}
+
+template<int>
 void unaligned_objects()
 {
   // Regression test for the bug reported here:
@@ -222,6 +243,116 @@ void unaligned_objects()
   }
 }
 
+template<typename T>
+EIGEN_DONT_INLINE
+Index test_compute_block_size(Index m, Index n, Index k)
+{
+  Index mc(m), nc(n), kc(k);
+  internal::computeProductBlockingSizes<T,T>(kc, mc, nc);
+  return kc+mc+nc;
+}
+
+template<typename T>
+Index compute_block_size()
+{
+  Index ret = 0;
+  ret += test_compute_block_size<T>(0,1,1);
+  ret += test_compute_block_size<T>(1,0,1);
+  ret += test_compute_block_size<T>(1,1,0);
+  ret += test_compute_block_size<T>(0,0,1);
+  ret += test_compute_block_size<T>(0,1,0);
+  ret += test_compute_block_size<T>(1,0,0);
+  ret += test_compute_block_size<T>(0,0,0);
+  return ret;
+}
+
+template<typename>
+void aliasing_with_resize()
+{
+  Index m = internal::random<Index>(10,50);
+  Index n = internal::random<Index>(10,50);
+  MatrixXd A, B, C(m,n), D(m,m);
+  VectorXd a, b, c(n);
+  C.setRandom();
+  D.setRandom();
+  c.setRandom();
+  double s = internal::random<double>(1,10);
+
+  A = C;
+  B = A * A.transpose();
+  A = A * A.transpose();
+  VERIFY_IS_APPROX(A,B);
+
+  A = C;
+  B = (A * A.transpose())/s;
+  A = (A * A.transpose())/s;
+  VERIFY_IS_APPROX(A,B);
+
+  A = C;
+  B = (A * A.transpose()) + D;
+  A = (A * A.transpose()) + D;
+  VERIFY_IS_APPROX(A,B);
+
+  A = C;
+  B = D + (A * A.transpose());
+  A = D + (A * A.transpose());
+  VERIFY_IS_APPROX(A,B);
+
+  A = C;
+  B = s * (A * A.transpose());
+  A = s * (A * A.transpose());
+  VERIFY_IS_APPROX(A,B);
+
+  A = C;
+  a = c;
+  b = (A * a)/s;
+  a = (A * a)/s;
+  VERIFY_IS_APPROX(a,b);
+}
+
+template<int>
+void bug_1308()
+{
+  int n = 10;
+  MatrixXd r(n,n);
+  VectorXd v = VectorXd::Random(n);
+  r = v * RowVectorXd::Ones(n);
+  VERIFY_IS_APPROX(r, v.rowwise().replicate(n));
+  r = VectorXd::Ones(n) * v.transpose();
+  VERIFY_IS_APPROX(r, v.rowwise().replicate(n).transpose());
+
+  Matrix4d ones44 = Matrix4d::Ones();
+  Matrix4d m44 = Matrix4d::Ones() * Matrix4d::Ones();
+  VERIFY_IS_APPROX(m44,Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(m44.noalias()=ones44*Matrix4d::Ones(), Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(m44.noalias()=ones44.transpose()*Matrix4d::Ones(), Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(m44.noalias()=Matrix4d::Ones()*ones44, Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(m44.noalias()=Matrix4d::Ones()*ones44.transpose(), Matrix4d::Constant(4));
+
+  typedef Matrix<double,4,4,RowMajor> RMatrix4d;
+  RMatrix4d r44 = Matrix4d::Ones() * Matrix4d::Ones();
+  VERIFY_IS_APPROX(r44,Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(r44.noalias()=ones44*Matrix4d::Ones(), Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(r44.noalias()=ones44.transpose()*Matrix4d::Ones(), Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(r44.noalias()=Matrix4d::Ones()*ones44, Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(r44.noalias()=Matrix4d::Ones()*ones44.transpose(), Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(r44.noalias()=ones44*RMatrix4d::Ones(), Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(r44.noalias()=ones44.transpose()*RMatrix4d::Ones(), Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(r44.noalias()=RMatrix4d::Ones()*ones44, Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(r44.noalias()=RMatrix4d::Ones()*ones44.transpose(), Matrix4d::Constant(4));
+
+//   RowVector4d r4;
+  m44.setOnes();
+  r44.setZero();
+  VERIFY_IS_APPROX(r44.noalias() += m44.row(0).transpose() * RowVector4d::Ones(), ones44);
+  r44.setZero();
+  VERIFY_IS_APPROX(r44.noalias() += m44.col(0) * RowVector4d::Ones(), ones44);
+  r44.setZero();
+  VERIFY_IS_APPROX(r44.noalias() += Vector4d::Ones() * m44.row(0), ones44);
+  r44.setZero();
+  VERIFY_IS_APPROX(r44.noalias() += Vector4d::Ones() * m44.col(0).transpose(), ones44);
+}
+
 void test_product_extra()
 {
   for(int i = 0; i < g_repeat; i++) {
@@ -232,6 +363,13 @@ void test_product_extra()
     CALL_SUBTEST_4( product_extra(MatrixXcd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2), internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2))) );
     CALL_SUBTEST_1( zero_sized_objects(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
   }
-  CALL_SUBTEST_5( bug_127() );
-  CALL_SUBTEST_6( unaligned_objects() );
+  CALL_SUBTEST_5( bug_127<0>() );
+  CALL_SUBTEST_5( bug_817<0>() );
+  CALL_SUBTEST_5( bug_1308<0>() );
+  CALL_SUBTEST_6( unaligned_objects<0>() );
+  CALL_SUBTEST_7( compute_block_size<float>() );
+  CALL_SUBTEST_7( compute_block_size<double>() );
+  CALL_SUBTEST_7( compute_block_size<std::complex<double> >() );
+  CALL_SUBTEST_8( aliasing_with_resize<void>() );
+
 }
diff --git a/test/product_large.cpp b/test/product_large.cpp
index 03d7bd8ed..845cd40ca 100644
--- a/test/product_large.cpp
+++ b/test/product_large.cpp
@@ -9,6 +9,27 @@
 
 #include "product.h"
 
+template<typename T>
+void test_aliasing()
+{
+  int rows = internal::random<int>(1,12);
+  int cols = internal::random<int>(1,12);
+  typedef Matrix<T,Dynamic,Dynamic> MatrixType;
+  typedef Matrix<T,Dynamic,1> VectorType;
+  VectorType x(cols); x.setRandom();
+  VectorType z(x);
+  VectorType y(rows); y.setZero();
+  MatrixType A(rows,cols); A.setRandom();
+  // CwiseBinaryOp
+  VERIFY_IS_APPROX(x = y + A*x, A*z);     // OK because "y + A*x" is marked as "assume-aliasing"
+  x = z;
+  // CwiseUnaryOp
+  VERIFY_IS_APPROX(x = T(1.)*(A*x), A*z); // OK because 1*(A*x) is replaced by (1*A*x) which is a Product<> expression
+  x = z;
+  // VERIFY_IS_APPROX(x = y-A*x, -A*z);   // Not OK in 3.3 because x is resized before A*x gets evaluated
+  x = z;
+}
+
 void test_product_large()
 {
   for(int i = 0; i < g_repeat; i++) {
@@ -17,6 +38,8 @@ void test_product_large()
     CALL_SUBTEST_3( product(MatrixXi(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
     CALL_SUBTEST_4( product(MatrixXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2), internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2))) );
     CALL_SUBTEST_5( product(Matrix<float,Dynamic,Dynamic,RowMajor>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+
+    CALL_SUBTEST_1( test_aliasing<float>() );
   }
 
 #if defined EIGEN_TEST_PART_6
@@ -39,15 +62,16 @@ void test_product_large()
     // check the functions to setup blocking sizes compile and do not segfault
     // FIXME check they do what they are supposed to do !!
     std::ptrdiff_t l1 = internal::random<int>(10000,20000);
-    std::ptrdiff_t l2 = internal::random<int>(1000000,2000000);
-    setCpuCacheSizes(l1,l2);
+    std::ptrdiff_t l2 = internal::random<int>(100000,200000);
+    std::ptrdiff_t l3 = internal::random<int>(1000000,2000000);
+    setCpuCacheSizes(l1,l2,l3);
     VERIFY(l1==l1CacheSize());
     VERIFY(l2==l2CacheSize());
     std::ptrdiff_t k1 = internal::random<int>(10,100)*16;
     std::ptrdiff_t m1 = internal::random<int>(10,100)*16;
     std::ptrdiff_t n1 = internal::random<int>(10,100)*16;
     // only makes sure it compiles fine
-    internal::computeProductBlockingSizes<float,float>(k1,m1,n1);
+    internal::computeProductBlockingSizes<float,float,std::ptrdiff_t>(k1,m1,n1,1);
   }
 
   {
@@ -60,5 +84,24 @@ void test_product_large()
     MatrixXf r2 = mat1.row(2)*mat2;
     VERIFY_IS_APPROX(r2, (mat1.row(2)*mat2).eval());
   }
+
+  {
+    Eigen::MatrixXd A(10,10), B, C;
+    A.setRandom();
+    C = A;
+    for(int k=0; k<79; ++k)
+      C = C * A;
+    B.noalias() = (((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)) * ((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)))
+                * (((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)) * ((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)));
+    VERIFY_IS_APPROX(B,C);
+  }
+#endif
+
+  // Regression test for bug 714:
+#if defined EIGEN_HAS_OPENMP
+  omp_set_dynamic(1);
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_6( product(Matrix<float,Dynamic,Dynamic>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+  }
 #endif
 }
diff --git a/test/product_mmtr.cpp b/test/product_mmtr.cpp
index aeba009f4..f6e4bb1ae 100644
--- a/test/product_mmtr.cpp
+++ b/test/product_mmtr.cpp
@@ -13,7 +13,8 @@
     ref2 = ref1 = DEST;                               \
     DEST.template triangularView<TRI>() OP;           \
     ref1 OP;                                          \
-    ref2.template triangularView<TRI>() = ref1;       \
+    ref2.template triangularView<TRI>()               \
+      = ref1.template triangularView<TRI>();          \
     VERIFY_IS_APPROX(DEST,ref2);                      \
   }
 
@@ -32,6 +33,8 @@ template<typename Scalar> void mmtr(int size)
   MatrixColMaj osc(othersize,size); osc.setRandom();
   MatrixRowMaj sor(size,othersize); sor.setRandom();
   MatrixRowMaj osr(othersize,size); osr.setRandom();
+  MatrixColMaj sqc(size,size); sqc.setRandom();
+  MatrixRowMaj sqr(size,size); sqr.setRandom();
   
   Scalar s = internal::random<Scalar>();
   
@@ -49,6 +52,29 @@ template<typename Scalar> void mmtr(int size)
   CHECK_MMTR(matc, Upper, -= s*(osc.transpose()*osc.conjugate()));
   CHECK_MMTR(matr, Lower, -= s*soc*soc.adjoint());
   CHECK_MMTR(matr, Upper, -= soc*(s*soc.adjoint()));
+  
+  CHECK_MMTR(matc, Lower, -= s*sqr*sqc.template triangularView<Upper>());
+  CHECK_MMTR(matc, Upper, = s*sqc*sqr.template triangularView<Upper>());
+  CHECK_MMTR(matc, Lower, += s*sqr*sqc.template triangularView<Lower>());
+  CHECK_MMTR(matc, Upper, = s*sqc*sqc.template triangularView<Lower>());
+  
+  CHECK_MMTR(matc, Lower, = (s*sqr).template triangularView<Upper>()*sqc);
+  CHECK_MMTR(matc, Upper, -= (s*sqc).template triangularView<Upper>()*sqc);
+  CHECK_MMTR(matc, Lower, = (s*sqr).template triangularView<Lower>()*sqc);
+  CHECK_MMTR(matc, Upper, += (s*sqc).template triangularView<Lower>()*sqc);
+
+  // check aliasing
+  ref2 = ref1 = matc;
+  ref1 = sqc.adjoint() * matc * sqc;
+  ref2.template triangularView<Upper>() = ref1.template triangularView<Upper>();
+  matc.template triangularView<Upper>() = sqc.adjoint() * matc * sqc;
+  VERIFY_IS_APPROX(matc, ref2);
+
+  ref2 = ref1 = matc;
+  ref1 = sqc * matc * sqc.adjoint();
+  ref2.template triangularView<Lower>() = ref1.template triangularView<Lower>();
+  matc.template triangularView<Lower>() = sqc * matc * sqc.adjoint();
+  VERIFY_IS_APPROX(matc, ref2);
 }
 
 void test_product_mmtr()
diff --git a/test/product_notemporary.cpp b/test/product_notemporary.cpp
index 258d238e2..8bf71b4f2 100644
--- a/test/product_notemporary.cpp
+++ b/test/product_notemporary.cpp
@@ -7,25 +7,10 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-static int nb_temporaries;
-
-inline void on_temporary_creation(int size) {
-  // here's a great place to set a breakpoint when debugging failures in this test!
-  if(size!=0) nb_temporaries++;
-}
-  
-
-#define EIGEN_DENSE_STORAGE_CTOR_PLUGIN { on_temporary_creation(size); }
+#define TEST_ENABLE_TEMPORARY_TRACKING
 
 #include "main.h"
 
-#define VERIFY_EVALUATION_COUNT(XPR,N) {\
-    nb_temporaries = 0; \
-    XPR; \
-    if(nb_temporaries!=N) std::cerr << "nb_temporaries == " << nb_temporaries << "\n"; \
-    VERIFY( (#XPR) && nb_temporaries==N ); \
-  }
-
 template<typename MatrixType> void product_notemporary(const MatrixType& m)
 {
   /* This test checks the number of temporaries created
@@ -58,10 +43,23 @@ template<typename MatrixType> void product_notemporary(const MatrixType& m)
         r1 = internal::random<Index>(8,rows-r0);
 
   VERIFY_EVALUATION_COUNT( m3 = (m1 * m2.adjoint()), 1);
+  VERIFY_EVALUATION_COUNT( m3 = (m1 * m2.adjoint()).transpose(), 1);
   VERIFY_EVALUATION_COUNT( m3.noalias() = m1 * m2.adjoint(), 0);
 
+  VERIFY_EVALUATION_COUNT( m3 = s1 * (m1 * m2.transpose()), 1);
+//   VERIFY_EVALUATION_COUNT( m3 = m3 + s1 * (m1 * m2.transpose()), 1);
   VERIFY_EVALUATION_COUNT( m3.noalias() = s1 * (m1 * m2.transpose()), 0);
 
+  VERIFY_EVALUATION_COUNT( m3 = m3 + (m1 * m2.adjoint()), 1);
+
+  VERIFY_EVALUATION_COUNT( m3 = m3 + (m1 * m2.adjoint()).transpose(), 1);
+  VERIFY_EVALUATION_COUNT( m3.noalias() = m3 + m1 * m2.transpose(), 0);
+  VERIFY_EVALUATION_COUNT( m3.noalias() += m3 + m1 * m2.transpose(), 0);
+  VERIFY_EVALUATION_COUNT( m3.noalias() -= m3 + m1 * m2.transpose(), 0);
+  VERIFY_EVALUATION_COUNT( m3.noalias() =  m3 - m1 * m2.transpose(), 0);
+  VERIFY_EVALUATION_COUNT( m3.noalias() += m3 - m1 * m2.transpose(), 0);
+  VERIFY_EVALUATION_COUNT( m3.noalias() -= m3 - m1 * m2.transpose(), 0);
+
   VERIFY_EVALUATION_COUNT( m3.noalias() = s1 * m1 * s2 * m2.adjoint(), 0);
   VERIFY_EVALUATION_COUNT( m3.noalias() = s1 * m1 * s2 * (m1*s3+m2*s2).adjoint(), 1);
   VERIFY_EVALUATION_COUNT( m3.noalias() = (s1 * m1).adjoint() * s2 * m2, 0);
@@ -77,7 +75,7 @@ template<typename MatrixType> void product_notemporary(const MatrixType& m)
   VERIFY_EVALUATION_COUNT( m3.noalias() -= (s1 * m1).template triangularView<Lower>() * m2, 0);
   VERIFY_EVALUATION_COUNT( rm3.noalias() = (s1 * m1.adjoint()).template triangularView<Upper>() * (m2+m2), 1);
   VERIFY_EVALUATION_COUNT( rm3.noalias() = (s1 * m1.adjoint()).template triangularView<UnitUpper>() * m2.adjoint(), 0);
-  
+
   VERIFY_EVALUATION_COUNT( m3.template triangularView<Upper>() = (m1 * m2.adjoint()), 0);
   VERIFY_EVALUATION_COUNT( m3.template triangularView<Upper>() -= (m1 * m2.adjoint()), 0);
 
@@ -114,8 +112,7 @@ template<typename MatrixType> void product_notemporary(const MatrixType& m)
   VERIFY_EVALUATION_COUNT( Scalar tmp = 0; tmp += Scalar(RealScalar(1)) /  (m3.transpose() * m3).diagonal().array().abs().sum(), 0 );
 
   // Zero temporaries for ... CoeffBasedProductMode
-  // - does not work with GCC because of the <..>, we'ld need variadic macros ...
-  //VERIFY_EVALUATION_COUNT( m3.col(0).head<5>() * m3.col(0).transpose() + m3.col(0).head<5>() * m3.col(0).transpose(), 0 );
+  VERIFY_EVALUATION_COUNT( m3.col(0).template head<5>() * m3.col(0).transpose() + m3.col(0).template head<5>() * m3.col(0).transpose(), 0 );
 
   // Check matrix * vectors
   VERIFY_EVALUATION_COUNT( cvres.noalias() = m1 * cv1, 0 );
@@ -123,6 +120,26 @@ template<typename MatrixType> void product_notemporary(const MatrixType& m)
   VERIFY_EVALUATION_COUNT( cvres.noalias() -= m1 * m2.col(0), 0 );
   VERIFY_EVALUATION_COUNT( cvres.noalias() -= m1 * rv1.adjoint(), 0 );
   VERIFY_EVALUATION_COUNT( cvres.noalias() -= m1 * m2.row(0).transpose(), 0 );
+
+  VERIFY_EVALUATION_COUNT( cvres.noalias() = (m1+m1) * cv1, 0 );
+  VERIFY_EVALUATION_COUNT( cvres.noalias() = (rm3+rm3) * cv1, 0 );
+  VERIFY_EVALUATION_COUNT( cvres.noalias() = (m1+m1) * (m1*cv1), 1 );
+  VERIFY_EVALUATION_COUNT( cvres.noalias() = (rm3+rm3) * (m1*cv1), 1 );
+
+  // Check outer products
+  m3 = cv1 * rv1;
+  VERIFY_EVALUATION_COUNT( m3.noalias() = cv1 * rv1, 0 );
+  VERIFY_EVALUATION_COUNT( m3.noalias() = (cv1+cv1) * (rv1+rv1), 1 );
+  VERIFY_EVALUATION_COUNT( m3.noalias() = (m1*cv1) * (rv1), 1 );
+  VERIFY_EVALUATION_COUNT( m3.noalias() += (m1*cv1) * (rv1), 1 );
+  VERIFY_EVALUATION_COUNT( rm3.noalias() = (cv1) * (rv1 * m1), 1 );
+  VERIFY_EVALUATION_COUNT( rm3.noalias() -= (cv1) * (rv1 * m1), 1 );
+  VERIFY_EVALUATION_COUNT( rm3.noalias() = (m1*cv1) * (rv1 * m1), 2 );
+  VERIFY_EVALUATION_COUNT( rm3.noalias() += (m1*cv1) * (rv1 * m1), 2 );
+
+  // Check nested products
+  VERIFY_EVALUATION_COUNT( cvres.noalias() = m1.adjoint() * m1 * cv1, 1 );
+  VERIFY_EVALUATION_COUNT( rvres.noalias() = rv1 * (m1 * m2.adjoint()), 1 );
 }
 
 void test_product_notemporary()
@@ -131,11 +148,12 @@ void test_product_notemporary()
   for(int i = 0; i < g_repeat; i++) {
     s = internal::random<int>(16,EIGEN_TEST_MAX_SIZE);
     CALL_SUBTEST_1( product_notemporary(MatrixXf(s, s)) );
-    s = internal::random<int>(16,EIGEN_TEST_MAX_SIZE);
     CALL_SUBTEST_2( product_notemporary(MatrixXd(s, s)) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
+    
     s = internal::random<int>(16,EIGEN_TEST_MAX_SIZE/2);
     CALL_SUBTEST_3( product_notemporary(MatrixXcf(s,s)) );
-    s = internal::random<int>(16,EIGEN_TEST_MAX_SIZE/2);
     CALL_SUBTEST_4( product_notemporary(MatrixXcd(s,s)) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
   }
 }
diff --git a/test/product_selfadjoint.cpp b/test/product_selfadjoint.cpp
index 374e2393b..3d768aa7e 100644
--- a/test/product_selfadjoint.cpp
+++ b/test/product_selfadjoint.cpp
@@ -67,14 +67,21 @@ void test_product_selfadjoint()
     CALL_SUBTEST_1( product_selfadjoint(Matrix<float, 1, 1>()) );
     CALL_SUBTEST_2( product_selfadjoint(Matrix<float, 2, 2>()) );
     CALL_SUBTEST_3( product_selfadjoint(Matrix3d()) );
+    
     s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2);
     CALL_SUBTEST_4( product_selfadjoint(MatrixXcf(s, s)) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
+    
     s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2);
     CALL_SUBTEST_5( product_selfadjoint(MatrixXcd(s,s)) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
+    
     s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE);
     CALL_SUBTEST_6( product_selfadjoint(MatrixXd(s,s)) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
+    
     s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE);
     CALL_SUBTEST_7( product_selfadjoint(Matrix<float,Dynamic,Dynamic,RowMajor>(s,s)) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
   }
-  TEST_SET_BUT_UNUSED_VARIABLE(s)
 }
diff --git a/test/product_small.cpp b/test/product_small.cpp
index 8b132abb6..fdfdd9f6c 100644
--- a/test/product_small.cpp
+++ b/test/product_small.cpp
@@ -9,8 +9,10 @@
 
 #define EIGEN_NO_STATIC_ASSERT
 #include "product.h"
+#include <Eigen/LU>
 
 // regression test for bug 447
+template<int>
 void product1x1()
 {
   Matrix<float,1,3> matAstatic;
@@ -28,16 +30,237 @@ void product1x1()
                     matAdynamic.cwiseProduct(matBdynamic.transpose()).sum() );
 }
 
+template<typename TC, typename TA, typename TB>
+const TC& ref_prod(TC &C, const TA &A, const TB &B)
+{
+  for(Index i=0;i<C.rows();++i)
+    for(Index j=0;j<C.cols();++j)
+      for(Index k=0;k<A.cols();++k)
+        C.coeffRef(i,j) += A.coeff(i,k) * B.coeff(k,j);
+  return C;
+}
+
+template<typename T, int Rows, int Cols, int Depth, int OC, int OA, int OB>
+typename internal::enable_if<! ( (Rows ==1&&Depth!=1&&OA==ColMajor)
+                              || (Depth==1&&Rows !=1&&OA==RowMajor)
+                              || (Cols ==1&&Depth!=1&&OB==RowMajor)
+                              || (Depth==1&&Cols !=1&&OB==ColMajor)
+                              || (Rows ==1&&Cols !=1&&OC==ColMajor)
+                              || (Cols ==1&&Rows !=1&&OC==RowMajor)),void>::type
+test_lazy_single(int rows, int cols, int depth)
+{
+  Matrix<T,Rows,Depth,OA> A(rows,depth); A.setRandom();
+  Matrix<T,Depth,Cols,OB> B(depth,cols); B.setRandom();
+  Matrix<T,Rows,Cols,OC>  C(rows,cols);  C.setRandom();
+  Matrix<T,Rows,Cols,OC>  D(C);
+  VERIFY_IS_APPROX(C+=A.lazyProduct(B), ref_prod(D,A,B));
+}
+
+template<typename T, int Rows, int Cols, int Depth, int OC, int OA, int OB>
+typename internal::enable_if<  ( (Rows ==1&&Depth!=1&&OA==ColMajor)
+                              || (Depth==1&&Rows !=1&&OA==RowMajor)
+                              || (Cols ==1&&Depth!=1&&OB==RowMajor)
+                              || (Depth==1&&Cols !=1&&OB==ColMajor)
+                              || (Rows ==1&&Cols !=1&&OC==ColMajor)
+                              || (Cols ==1&&Rows !=1&&OC==RowMajor)),void>::type
+test_lazy_single(int, int, int)
+{
+}
+
+template<typename T, int Rows, int Cols, int Depth>
+void test_lazy_all_layout(int rows=Rows, int cols=Cols, int depth=Depth)
+{
+  CALL_SUBTEST(( test_lazy_single<T,Rows,Cols,Depth,ColMajor,ColMajor,ColMajor>(rows,cols,depth) ));
+  CALL_SUBTEST(( test_lazy_single<T,Rows,Cols,Depth,RowMajor,ColMajor,ColMajor>(rows,cols,depth) ));
+  CALL_SUBTEST(( test_lazy_single<T,Rows,Cols,Depth,ColMajor,RowMajor,ColMajor>(rows,cols,depth) ));
+  CALL_SUBTEST(( test_lazy_single<T,Rows,Cols,Depth,RowMajor,RowMajor,ColMajor>(rows,cols,depth) ));
+  CALL_SUBTEST(( test_lazy_single<T,Rows,Cols,Depth,ColMajor,ColMajor,RowMajor>(rows,cols,depth) ));
+  CALL_SUBTEST(( test_lazy_single<T,Rows,Cols,Depth,RowMajor,ColMajor,RowMajor>(rows,cols,depth) ));
+  CALL_SUBTEST(( test_lazy_single<T,Rows,Cols,Depth,ColMajor,RowMajor,RowMajor>(rows,cols,depth) ));
+  CALL_SUBTEST(( test_lazy_single<T,Rows,Cols,Depth,RowMajor,RowMajor,RowMajor>(rows,cols,depth) ));
+}
+
+template<typename T>
+void test_lazy_l1()
+{
+  int rows = internal::random<int>(1,12);
+  int cols = internal::random<int>(1,12);
+  int depth = internal::random<int>(1,12);
+
+  // Inner
+  CALL_SUBTEST(( test_lazy_all_layout<T,1,1,1>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,1,1,2>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,1,1,3>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,1,1,8>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,1,1,9>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,1,1,-1>(1,1,depth) ));
+
+  // Outer
+  CALL_SUBTEST(( test_lazy_all_layout<T,2,1,1>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,1,2,1>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,2,2,1>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,3,3,1>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,4,4,1>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,4,8,1>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,4,-1,1>(4,cols) ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,7,-1,1>(7,cols) ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,-1,8,1>(rows) ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,-1,3,1>(rows) ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,-1,-1,1>(rows,cols) ));
+}
+
+template<typename T>
+void test_lazy_l2()
+{
+  int rows = internal::random<int>(1,12);
+  int cols = internal::random<int>(1,12);
+  int depth = internal::random<int>(1,12);
+
+  // mat-vec
+  CALL_SUBTEST(( test_lazy_all_layout<T,2,1,2>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,2,1,4>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,4,1,2>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,4,1,4>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,5,1,4>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,4,1,5>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,4,1,6>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,6,1,4>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,8,1,8>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,-1,1,4>(rows) ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,4,1,-1>(4,1,depth) ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,-1,1,-1>(rows,1,depth) ));
+
+  // vec-mat
+  CALL_SUBTEST(( test_lazy_all_layout<T,1,2,2>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,1,2,4>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,1,4,2>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,1,4,4>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,1,5,4>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,1,4,5>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,1,4,6>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,1,6,4>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,1,8,8>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,1,-1, 4>(1,cols) ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,1, 4,-1>(1,4,depth) ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,1,-1,-1>(1,cols,depth) ));
+}
+
+template<typename T>
+void test_lazy_l3()
+{
+  int rows = internal::random<int>(1,12);
+  int cols = internal::random<int>(1,12);
+  int depth = internal::random<int>(1,12);
+  // mat-mat
+  CALL_SUBTEST(( test_lazy_all_layout<T,2,4,2>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,2,6,4>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,4,3,2>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,4,8,4>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,5,6,4>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,4,2,5>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,4,7,6>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,6,8,4>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,8,3,8>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,-1,6,4>(rows) ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,4,3,-1>(4,3,depth) ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,-1,6,-1>(rows,6,depth) ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,8,2,2>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,5,2,4>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,4,4,2>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,8,4,4>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,6,5,4>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,4,4,5>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,3,4,6>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,2,6,4>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,7,8,8>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,8,-1, 4>(8,cols) ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,3, 4,-1>(3,4,depth) ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,4,-1,-1>(4,cols,depth) ));
+}
+
+template<typename T,int N,int M,int K>
+void test_linear_but_not_vectorizable()
+{
+  // Check tricky cases for which the result of the product is a vector and thus must exhibit the LinearBit flag,
+  // but is not vectorizable along the linear dimension.
+  Index n = N==Dynamic ? internal::random<Index>(1,32) : N;
+  Index m = M==Dynamic ? internal::random<Index>(1,32) : M;
+  Index k = K==Dynamic ? internal::random<Index>(1,32) : K;
+
+  {
+    Matrix<T,N,M+1> A; A.setRandom(n,m+1);
+    Matrix<T,M*2,K> B; B.setRandom(m*2,k);
+    Matrix<T,1,K> C;
+    Matrix<T,1,K> R;
+
+    C.noalias() = A.template topLeftCorner<1,M>() * (B.template topRows<M>()+B.template bottomRows<M>());
+    R.noalias() = A.template topLeftCorner<1,M>() * (B.template topRows<M>()+B.template bottomRows<M>()).eval();
+    VERIFY_IS_APPROX(C,R);
+  }
+
+  {
+    Matrix<T,M+1,N,RowMajor> A; A.setRandom(m+1,n);
+    Matrix<T,K,M*2,RowMajor> B; B.setRandom(k,m*2);
+    Matrix<T,K,1> C;
+    Matrix<T,K,1> R;
+
+    C.noalias() = (B.template leftCols<M>()+B.template rightCols<M>())        * A.template topLeftCorner<M,1>();
+    R.noalias() = (B.template leftCols<M>()+B.template rightCols<M>()).eval() * A.template topLeftCorner<M,1>();
+    VERIFY_IS_APPROX(C,R);
+  }
+}
+
+template<int Rows>
+void bug_1311()
+{
+  Matrix< double, Rows, 2 > A;  A.setRandom();
+  Vector2d b = Vector2d::Random() ;
+  Matrix<double,Rows,1> res;
+  res.noalias() = 1. * (A * b);
+  VERIFY_IS_APPROX(res, A*b);
+  res.noalias() = 1.*A * b;
+  VERIFY_IS_APPROX(res, A*b);
+  res.noalias() = (1.*A).lazyProduct(b);
+  VERIFY_IS_APPROX(res, A*b);
+  res.noalias() = (1.*A).lazyProduct(1.*b);
+  VERIFY_IS_APPROX(res, A*b);
+  res.noalias() = (A).lazyProduct(1.*b);
+  VERIFY_IS_APPROX(res, A*b);
+}
 
 void test_product_small()
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( product(Matrix<float, 3, 2>()) );
-    CALL_SUBTEST_2( product(Matrix<int, 3, 5>()) );
+    CALL_SUBTEST_2( product(Matrix<int, 3, 17>()) );
+    CALL_SUBTEST_8( product(Matrix<double, 3, 17>()) );
     CALL_SUBTEST_3( product(Matrix3d()) );
     CALL_SUBTEST_4( product(Matrix4d()) );
     CALL_SUBTEST_5( product(Matrix4f()) );
-    CALL_SUBTEST_6( product1x1() );
+    CALL_SUBTEST_6( product1x1<0>() );
+
+    CALL_SUBTEST_11( test_lazy_l1<float>() );
+    CALL_SUBTEST_12( test_lazy_l2<float>() );
+    CALL_SUBTEST_13( test_lazy_l3<float>() );
+
+    CALL_SUBTEST_21( test_lazy_l1<double>() );
+    CALL_SUBTEST_22( test_lazy_l2<double>() );
+    CALL_SUBTEST_23( test_lazy_l3<double>() );
+
+    CALL_SUBTEST_31( test_lazy_l1<std::complex<float> >() );
+    CALL_SUBTEST_32( test_lazy_l2<std::complex<float> >() );
+    CALL_SUBTEST_33( test_lazy_l3<std::complex<float> >() );
+
+    CALL_SUBTEST_41( test_lazy_l1<std::complex<double> >() );
+    CALL_SUBTEST_42( test_lazy_l2<std::complex<double> >() );
+    CALL_SUBTEST_43( test_lazy_l3<std::complex<double> >() );
+
+    CALL_SUBTEST_7(( test_linear_but_not_vectorizable<float,2,1,Dynamic>() ));
+    CALL_SUBTEST_7(( test_linear_but_not_vectorizable<float,3,1,Dynamic>() ));
+    CALL_SUBTEST_7(( test_linear_but_not_vectorizable<float,2,1,16>() ));
+
+    CALL_SUBTEST_6( bug_1311<3>() );
+    CALL_SUBTEST_6( bug_1311<5>() );
   }
 
 #ifdef EIGEN_TEST_PART_6
@@ -46,5 +269,25 @@ void test_product_small()
     Vector3f v = Vector3f::Random();
     VERIFY_IS_APPROX( (v * v.transpose()) * v, (v * v.transpose()).eval() * v);
   }
+  
+  {
+    // regression test for pull-request #93
+    Eigen::Matrix<double, 1, 1> A;  A.setRandom();
+    Eigen::Matrix<double, 18, 1> B; B.setRandom();
+    Eigen::Matrix<double, 1, 18> C; C.setRandom();
+    VERIFY_IS_APPROX(B * A.inverse(), B * A.inverse()[0]);
+    VERIFY_IS_APPROX(A.inverse() * C, A.inverse()[0] * C);
+  }
+
+  {
+    Eigen::Matrix<double, 10, 10> A, B, C;
+    A.setRandom();
+    C = A;
+    for(int k=0; k<79; ++k)
+      C = C * A;
+    B.noalias() = (((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)) * ((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)))
+                * (((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)) * ((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)));
+    VERIFY_IS_APPROX(B,C);
+  }
 #endif
 }
diff --git a/test/product_symm.cpp b/test/product_symm.cpp
index 74d7329b1..8c44383f9 100644
--- a/test/product_symm.cpp
+++ b/test/product_symm.cpp
@@ -39,6 +39,24 @@ template<typename Scalar, int Size, int OtherSize> void symm(int size = Size, in
   VERIFY_IS_APPROX(rhs12 = (s1*m2).template selfadjointView<Lower>() * (s2*rhs1),
                    rhs13 = (s1*m1) * (s2*rhs1));
 
+  VERIFY_IS_APPROX(rhs12 = (s1*m2).transpose().template selfadjointView<Upper>() * (s2*rhs1),
+                   rhs13 = (s1*m1.transpose()) * (s2*rhs1));
+
+  VERIFY_IS_APPROX(rhs12 = (s1*m2).template selfadjointView<Lower>().transpose() * (s2*rhs1),
+                   rhs13 = (s1*m1.transpose()) * (s2*rhs1));
+
+  VERIFY_IS_APPROX(rhs12 = (s1*m2).conjugate().template selfadjointView<Lower>() * (s2*rhs1),
+                   rhs13 = (s1*m1).conjugate() * (s2*rhs1));
+
+  VERIFY_IS_APPROX(rhs12 = (s1*m2).template selfadjointView<Lower>().conjugate() * (s2*rhs1),
+                   rhs13 = (s1*m1).conjugate() * (s2*rhs1));
+
+  VERIFY_IS_APPROX(rhs12 = (s1*m2).adjoint().template selfadjointView<Upper>() * (s2*rhs1),
+                   rhs13 = (s1*m1).adjoint() * (s2*rhs1));
+
+  VERIFY_IS_APPROX(rhs12 = (s1*m2).template selfadjointView<Lower>().adjoint() * (s2*rhs1),
+                   rhs13 = (s1*m1).adjoint() * (s2*rhs1));
+
   m2 = m1.template triangularView<Upper>(); rhs12.setRandom(); rhs13 = rhs12;
   m3 = m2.template selfadjointView<Upper>();
   VERIFY_IS_EQUAL(m1, m3);
diff --git a/test/product_syrk.cpp b/test/product_syrk.cpp
index 73c95000c..e10f0f2f2 100644
--- a/test/product_syrk.cpp
+++ b/test/product_syrk.cpp
@@ -125,11 +125,12 @@ void test_product_syrk()
     int s;
     s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE);
     CALL_SUBTEST_1( syrk(MatrixXf(s, s)) );
-    s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE);
     CALL_SUBTEST_2( syrk(MatrixXd(s, s)) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
+    
     s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2);
     CALL_SUBTEST_3( syrk(MatrixXcf(s, s)) );
-    s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2);
     CALL_SUBTEST_4( syrk(MatrixXcd(s, s)) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
   }
 }
diff --git a/test/product_trmm.cpp b/test/product_trmm.cpp
index d715b9a36..12e554410 100644
--- a/test/product_trmm.cpp
+++ b/test/product_trmm.cpp
@@ -9,10 +9,18 @@
 
 #include "main.h"
 
+template<typename T>
+int get_random_size()
+{
+  const int factor = NumTraits<T>::ReadCost;
+  const int max_test_size = EIGEN_TEST_MAX_SIZE>2*factor ? EIGEN_TEST_MAX_SIZE/factor : EIGEN_TEST_MAX_SIZE;
+  return internal::random<int>(1,max_test_size);
+}
+
 template<typename Scalar, int Mode, int TriOrder, int OtherOrder, int ResOrder, int OtherCols>
-void trmm(int rows=internal::random<int>(1,EIGEN_TEST_MAX_SIZE),
-          int cols=internal::random<int>(1,EIGEN_TEST_MAX_SIZE),
-          int otherCols = OtherCols==Dynamic?internal::random<int>(1,EIGEN_TEST_MAX_SIZE):OtherCols)
+void trmm(int rows=get_random_size<Scalar>(),
+          int cols=get_random_size<Scalar>(),
+          int otherCols = OtherCols==Dynamic?get_random_size<Scalar>():OtherCols)
 {
   typedef Matrix<Scalar,Dynamic,Dynamic,TriOrder> TriMatrix;
   typedef Matrix<Scalar,Dynamic,OtherCols,OtherCols==1?ColMajor:OtherOrder> OnTheRight;
@@ -42,13 +50,13 @@ void trmm(int rows=internal::random<int>(1,EIGEN_TEST_MAX_SIZE),
   
   VERIFY_IS_APPROX( ge_xs.noalias() = mat.template triangularView<Mode>() * ge_right, tri * ge_right);
   VERIFY_IS_APPROX( ge_sx.noalias() = ge_left * mat.template triangularView<Mode>(), ge_left * tri);
-  
+
   VERIFY_IS_APPROX( ge_xs.noalias() = (s1*mat.adjoint()).template triangularView<Mode>() * (s2*ge_left.transpose()), s1*triTr.conjugate() * (s2*ge_left.transpose()));
   VERIFY_IS_APPROX( ge_sx.noalias() = ge_right.transpose() * mat.adjoint().template triangularView<Mode>(), ge_right.transpose() * triTr.conjugate());
   
   VERIFY_IS_APPROX( ge_xs.noalias() = (s1*mat.adjoint()).template triangularView<Mode>() * (s2*ge_left.adjoint()), s1*triTr.conjugate() * (s2*ge_left.adjoint()));
   VERIFY_IS_APPROX( ge_sx.noalias() = ge_right.adjoint() * mat.adjoint().template triangularView<Mode>(), ge_right.adjoint() * triTr.conjugate());
-  
+
   ge_xs_save = ge_xs;
   VERIFY_IS_APPROX( (ge_xs_save + s1*triTr.conjugate() * (s2*ge_left.adjoint())).eval(), ge_xs.noalias() += (s1*mat.adjoint()).template triangularView<Mode>() * (s2*ge_left.adjoint()) );
   ge_sx.setRandom();
@@ -61,13 +69,13 @@ void trmm(int rows=internal::random<int>(1,EIGEN_TEST_MAX_SIZE),
 }
 
 template<typename Scalar, int Mode, int TriOrder>
-void trmv(int rows=internal::random<int>(1,EIGEN_TEST_MAX_SIZE), int cols=internal::random<int>(1,EIGEN_TEST_MAX_SIZE))
+void trmv(int rows=get_random_size<Scalar>(), int cols=get_random_size<Scalar>())
 {
   trmm<Scalar,Mode,TriOrder,ColMajor,ColMajor,1>(rows,cols,1);
 }
 
 template<typename Scalar, int Mode, int TriOrder, int OtherOrder, int ResOrder>
-void trmm(int rows=internal::random<int>(1,EIGEN_TEST_MAX_SIZE), int cols=internal::random<int>(1,EIGEN_TEST_MAX_SIZE), int otherCols = internal::random<int>(1,EIGEN_TEST_MAX_SIZE))
+void trmm(int rows=get_random_size<Scalar>(), int cols=get_random_size<Scalar>(), int otherCols = get_random_size<Scalar>())
 {
   trmm<Scalar,Mode,TriOrder,OtherOrder,ResOrder,Dynamic>(rows,cols,otherCols);
 }
diff --git a/test/product_trmv.cpp b/test/product_trmv.cpp
index 4c3c435c2..57a202afc 100644
--- a/test/product_trmv.cpp
+++ b/test/product_trmv.cpp
@@ -78,12 +78,14 @@ void test_product_trmv()
     CALL_SUBTEST_1( trmv(Matrix<float, 1, 1>()) );
     CALL_SUBTEST_2( trmv(Matrix<float, 2, 2>()) );
     CALL_SUBTEST_3( trmv(Matrix3d()) );
+    
     s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2);
     CALL_SUBTEST_4( trmv(MatrixXcf(s,s)) );
-    s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2);
     CALL_SUBTEST_5( trmv(MatrixXcd(s,s)) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
+    
     s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE);
     CALL_SUBTEST_6( trmv(Matrix<float,Dynamic,Dynamic,RowMajor>(s, s)) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
   }
-  TEST_SET_BUT_UNUSED_VARIABLE(s);
 }
diff --git a/test/product_trsolve.cpp b/test/product_trsolve.cpp
index 69892b3a8..4b97fa9d6 100644
--- a/test/product_trsolve.cpp
+++ b/test/product_trsolve.cpp
@@ -84,10 +84,18 @@ void test_product_trsolve()
     CALL_SUBTEST_4((trsolve<std::complex<double>,Dynamic,Dynamic>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2),internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2))));
 
     // vectors
-    CALL_SUBTEST_1((trsolve<float,Dynamic,1>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE))));
-    CALL_SUBTEST_5((trsolve<std::complex<double>,Dynamic,1>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE))));
-    CALL_SUBTEST_6((trsolve<float,1,1>()));
-    CALL_SUBTEST_7((trsolve<float,1,2>()));
-    CALL_SUBTEST_8((trsolve<std::complex<float>,4,1>()));
+    CALL_SUBTEST_5((trsolve<float,Dynamic,1>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE))));
+    CALL_SUBTEST_6((trsolve<double,Dynamic,1>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE))));
+    CALL_SUBTEST_7((trsolve<std::complex<float>,Dynamic,1>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE))));
+    CALL_SUBTEST_8((trsolve<std::complex<double>,Dynamic,1>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE))));
+    
+    // meta-unrollers
+    CALL_SUBTEST_9((trsolve<float,4,1>()));
+    CALL_SUBTEST_10((trsolve<double,4,1>()));
+    CALL_SUBTEST_11((trsolve<std::complex<float>,4,1>()));
+    CALL_SUBTEST_12((trsolve<float,1,1>()));
+    CALL_SUBTEST_13((trsolve<float,1,2>()));
+    CALL_SUBTEST_14((trsolve<float,3,1>()));
+    
   }
 }
diff --git a/test/qr.cpp b/test/qr.cpp
index a79e0dd34..dfcc1e8f9 100644
--- a/test/qr.cpp
+++ b/test/qr.cpp
@@ -54,6 +54,8 @@ template<typename MatrixType> void qr_invertible()
 {
   using std::log;
   using std::abs;
+  using std::pow;
+  using std::max;
   typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
   typedef typename MatrixType::Scalar Scalar;
 
@@ -65,7 +67,7 @@ template<typename MatrixType> void qr_invertible()
   if (internal::is_same<RealScalar,float>::value)
   {
     // let's build a matrix more stable to inverse
-    MatrixType a = MatrixType::Random(size,size*2);
+    MatrixType a = MatrixType::Random(size,size*4);
     m1 += a * a.adjoint();
   }
 
@@ -81,8 +83,11 @@ template<typename MatrixType> void qr_invertible()
   m3 = qr.householderQ(); // get a unitary
   m1 = m3 * m1 * m3;
   qr.compute(m1);
-  VERIFY_IS_APPROX(absdet, qr.absDeterminant());
   VERIFY_IS_APPROX(log(absdet), qr.logAbsDeterminant());
+  // This test is tricky if the determinant becomes too small.
+  // Since we generate random numbers with magnitude rrange [0,1], the average determinant is 0.5^size
+  VERIFY_IS_MUCH_SMALLER_THAN( abs(absdet-qr.absDeterminant()), numext::maxi(RealScalar(pow(0.5,size)),numext::maxi<RealScalar>(abs(absdet),abs(qr.absDeterminant()))) );
+  
 }
 
 template<typename MatrixType> void qr_verify_assert()
diff --git a/test/qr_colpivoting.cpp b/test/qr_colpivoting.cpp
index eb3feac01..26ed27f5c 100644
--- a/test/qr_colpivoting.cpp
+++ b/test/qr_colpivoting.cpp
@@ -10,21 +10,103 @@
 
 #include "main.h"
 #include <Eigen/QR>
+#include <Eigen/SVD>
+
+template <typename MatrixType>
+void cod() {
+  typedef typename MatrixType::Index Index;
+
+  Index rows = internal::random<Index>(2, EIGEN_TEST_MAX_SIZE);
+  Index cols = internal::random<Index>(2, EIGEN_TEST_MAX_SIZE);
+  Index cols2 = internal::random<Index>(2, EIGEN_TEST_MAX_SIZE);
+  Index rank = internal::random<Index>(1, (std::min)(rows, cols) - 1);
+
+  typedef typename MatrixType::Scalar Scalar;
+  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime,
+                 MatrixType::RowsAtCompileTime>
+      MatrixQType;
+  MatrixType matrix;
+  createRandomPIMatrixOfRank(rank, rows, cols, matrix);
+  CompleteOrthogonalDecomposition<MatrixType> cod(matrix);
+  VERIFY(rank == cod.rank());
+  VERIFY(cols - cod.rank() == cod.dimensionOfKernel());
+  VERIFY(!cod.isInjective());
+  VERIFY(!cod.isInvertible());
+  VERIFY(!cod.isSurjective());
+
+  MatrixQType q = cod.householderQ();
+  VERIFY_IS_UNITARY(q);
+
+  MatrixType z = cod.matrixZ();
+  VERIFY_IS_UNITARY(z);
+
+  MatrixType t;
+  t.setZero(rows, cols);
+  t.topLeftCorner(rank, rank) =
+      cod.matrixT().topLeftCorner(rank, rank).template triangularView<Upper>();
+
+  MatrixType c = q * t * z * cod.colsPermutation().inverse();
+  VERIFY_IS_APPROX(matrix, c);
+
+  MatrixType exact_solution = MatrixType::Random(cols, cols2);
+  MatrixType rhs = matrix * exact_solution;
+  MatrixType cod_solution = cod.solve(rhs);
+  VERIFY_IS_APPROX(rhs, matrix * cod_solution);
+
+  // Verify that we get the same minimum-norm solution as the SVD.
+  JacobiSVD<MatrixType> svd(matrix, ComputeThinU | ComputeThinV);
+  MatrixType svd_solution = svd.solve(rhs);
+  VERIFY_IS_APPROX(cod_solution, svd_solution);
+
+  MatrixType pinv = cod.pseudoInverse();
+  VERIFY_IS_APPROX(cod_solution, pinv * rhs);
+}
+
+template <typename MatrixType, int Cols2>
+void cod_fixedsize() {
+  enum {
+    Rows = MatrixType::RowsAtCompileTime,
+    Cols = MatrixType::ColsAtCompileTime
+  };
+  typedef typename MatrixType::Scalar Scalar;
+  int rank = internal::random<int>(1, (std::min)(int(Rows), int(Cols)) - 1);
+  Matrix<Scalar, Rows, Cols> matrix;
+  createRandomPIMatrixOfRank(rank, Rows, Cols, matrix);
+  CompleteOrthogonalDecomposition<Matrix<Scalar, Rows, Cols> > cod(matrix);
+  VERIFY(rank == cod.rank());
+  VERIFY(Cols - cod.rank() == cod.dimensionOfKernel());
+  VERIFY(cod.isInjective() == (rank == Rows));
+  VERIFY(cod.isSurjective() == (rank == Cols));
+  VERIFY(cod.isInvertible() == (cod.isInjective() && cod.isSurjective()));
+
+  Matrix<Scalar, Cols, Cols2> exact_solution;
+  exact_solution.setRandom(Cols, Cols2);
+  Matrix<Scalar, Rows, Cols2> rhs = matrix * exact_solution;
+  Matrix<Scalar, Cols, Cols2> cod_solution = cod.solve(rhs);
+  VERIFY_IS_APPROX(rhs, matrix * cod_solution);
+
+  // Verify that we get the same minimum-norm solution as the SVD.
+  JacobiSVD<MatrixType> svd(matrix, ComputeFullU | ComputeFullV);
+  Matrix<Scalar, Cols, Cols2> svd_solution = svd.solve(rhs);
+  VERIFY_IS_APPROX(cod_solution, svd_solution);
+}
 
 template<typename MatrixType> void qr()
 {
+  using std::sqrt;
   typedef typename MatrixType::Index Index;
 
   Index rows = internal::random<Index>(2,EIGEN_TEST_MAX_SIZE), cols = internal::random<Index>(2,EIGEN_TEST_MAX_SIZE), cols2 = internal::random<Index>(2,EIGEN_TEST_MAX_SIZE);
   Index rank = internal::random<Index>(1, (std::min)(rows, cols)-1);
 
   typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
   typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::RowsAtCompileTime> MatrixQType;
   MatrixType m1;
   createRandomPIMatrixOfRank(rank,rows,cols,m1);
   ColPivHouseholderQR<MatrixType> qr(m1);
-  VERIFY(rank == qr.rank());
-  VERIFY(cols - qr.rank() == qr.dimensionOfKernel());
+  VERIFY_IS_EQUAL(rank, qr.rank());
+  VERIFY_IS_EQUAL(cols - qr.rank(), qr.dimensionOfKernel());
   VERIFY(!qr.isInjective());
   VERIFY(!qr.isInvertible());
   VERIFY(!qr.isSurjective());
@@ -36,26 +118,59 @@ template<typename MatrixType> void qr()
   MatrixType c = q * r * qr.colsPermutation().inverse();
   VERIFY_IS_APPROX(m1, c);
 
+  // Verify that the absolute value of the diagonal elements in R are
+  // non-increasing until they reach the singularity threshold.
+  RealScalar threshold =
+      sqrt(RealScalar(rows)) * numext::abs(r(0, 0)) * NumTraits<Scalar>::epsilon();
+  for (Index i = 0; i < (std::min)(rows, cols) - 1; ++i) {
+    RealScalar x = numext::abs(r(i, i));
+    RealScalar y = numext::abs(r(i + 1, i + 1));
+    if (x < threshold && y < threshold) continue;
+    if (!test_isApproxOrLessThan(y, x)) {
+      for (Index j = 0; j < (std::min)(rows, cols); ++j) {
+        std::cout << "i = " << j << ", |r_ii| = " << numext::abs(r(j, j)) << std::endl;
+      }
+      std::cout << "Failure at i=" << i << ", rank=" << rank
+                << ", threshold=" << threshold << std::endl;
+    }
+    VERIFY_IS_APPROX_OR_LESS_THAN(y, x);
+  }
+
   MatrixType m2 = MatrixType::Random(cols,cols2);
   MatrixType m3 = m1*m2;
   m2 = MatrixType::Random(cols,cols2);
   m2 = qr.solve(m3);
   VERIFY_IS_APPROX(m3, m1*m2);
+
+  {
+    Index size = rows;
+    do {
+      m1 = MatrixType::Random(size,size);
+      qr.compute(m1);
+    } while(!qr.isInvertible());
+    MatrixType m1_inv = qr.inverse();
+    m3 = m1 * MatrixType::Random(size,cols2);
+    m2 = qr.solve(m3);
+    VERIFY_IS_APPROX(m2, m1_inv*m3);
+  }
 }
 
 template<typename MatrixType, int Cols2> void qr_fixedsize()
 {
+  using std::sqrt;
+  using std::abs;
   enum { Rows = MatrixType::RowsAtCompileTime, Cols = MatrixType::ColsAtCompileTime };
   typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
   int rank = internal::random<int>(1, (std::min)(int(Rows), int(Cols))-1);
   Matrix<Scalar,Rows,Cols> m1;
   createRandomPIMatrixOfRank(rank,Rows,Cols,m1);
   ColPivHouseholderQR<Matrix<Scalar,Rows,Cols> > qr(m1);
-  VERIFY(rank == qr.rank());
-  VERIFY(Cols - qr.rank() == qr.dimensionOfKernel());
-  VERIFY(qr.isInjective() == (rank == Rows));
-  VERIFY(qr.isSurjective() == (rank == Cols));
-  VERIFY(qr.isInvertible() == (qr.isInjective() && qr.isSurjective()));
+  VERIFY_IS_EQUAL(rank, qr.rank());
+  VERIFY_IS_EQUAL(Cols - qr.rank(), qr.dimensionOfKernel());
+  VERIFY_IS_EQUAL(qr.isInjective(), (rank == Rows));
+  VERIFY_IS_EQUAL(qr.isSurjective(), (rank == Cols));
+  VERIFY_IS_EQUAL(qr.isInvertible(), (qr.isInjective() && qr.isSurjective()));
 
   Matrix<Scalar,Rows,Cols> r = qr.matrixQR().template triangularView<Upper>();
   Matrix<Scalar,Rows,Cols> c = qr.householderQ() * r * qr.colsPermutation().inverse();
@@ -66,6 +181,71 @@ template<typename MatrixType, int Cols2> void qr_fixedsize()
   m2 = Matrix<Scalar,Cols,Cols2>::Random(Cols,Cols2);
   m2 = qr.solve(m3);
   VERIFY_IS_APPROX(m3, m1*m2);
+  // Verify that the absolute value of the diagonal elements in R are
+  // non-increasing until they reache the singularity threshold.
+  RealScalar threshold =
+      sqrt(RealScalar(Rows)) * (std::abs)(r(0, 0)) * NumTraits<Scalar>::epsilon();
+  for (Index i = 0; i < (std::min)(int(Rows), int(Cols)) - 1; ++i) {
+    RealScalar x = numext::abs(r(i, i));
+    RealScalar y = numext::abs(r(i + 1, i + 1));
+    if (x < threshold && y < threshold) continue;
+    if (!test_isApproxOrLessThan(y, x)) {
+      for (Index j = 0; j < (std::min)(int(Rows), int(Cols)); ++j) {
+        std::cout << "i = " << j << ", |r_ii| = " << numext::abs(r(j, j)) << std::endl;
+      }
+      std::cout << "Failure at i=" << i << ", rank=" << rank
+                << ", threshold=" << threshold << std::endl;
+    }
+    VERIFY_IS_APPROX_OR_LESS_THAN(y, x);
+  }
+}
+
+// This test is meant to verify that pivots are chosen such that
+// even for a graded matrix, the diagonal of R falls of roughly
+// monotonically until it reaches the threshold for singularity.
+// We use the so-called Kahan matrix, which is a famous counter-example
+// for rank-revealing QR. See
+// http://www.netlib.org/lapack/lawnspdf/lawn176.pdf
+// page 3 for more detail.
+template<typename MatrixType> void qr_kahan_matrix()
+{
+  using std::sqrt;
+  using std::abs;
+  typedef typename MatrixType::Index Index;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+
+  Index rows = 300, cols = rows;
+
+  MatrixType m1;
+  m1.setZero(rows,cols);
+  RealScalar s = std::pow(NumTraits<RealScalar>::epsilon(), 1.0 / rows);
+  RealScalar c = std::sqrt(1 - s*s);
+  RealScalar pow_s_i(1.0); // pow(s,i)
+  for (Index i = 0; i < rows; ++i) {
+    m1(i, i) = pow_s_i;
+    m1.row(i).tail(rows - i - 1) = -pow_s_i * c * MatrixType::Ones(1, rows - i - 1);
+    pow_s_i *= s;
+  }
+  m1 = (m1 + m1.transpose()).eval();
+  ColPivHouseholderQR<MatrixType> qr(m1);
+  MatrixType r = qr.matrixQR().template triangularView<Upper>();
+
+  RealScalar threshold =
+      std::sqrt(RealScalar(rows)) * numext::abs(r(0, 0)) * NumTraits<Scalar>::epsilon();
+  for (Index i = 0; i < (std::min)(rows, cols) - 1; ++i) {
+    RealScalar x = numext::abs(r(i, i));
+    RealScalar y = numext::abs(r(i + 1, i + 1));
+    if (x < threshold && y < threshold) continue;
+    if (!test_isApproxOrLessThan(y, x)) {
+      for (Index j = 0; j < (std::min)(rows, cols); ++j) {
+        std::cout << "i = " << j << ", |r_ii| = " << numext::abs(r(j, j)) << std::endl;
+      }
+      std::cout << "Failure at i=" << i << ", rank=" << qr.rank()
+                << ", threshold=" << threshold << std::endl;
+    }
+    VERIFY_IS_APPROX_OR_LESS_THAN(y, x);
+  }
 }
 
 template<typename MatrixType> void qr_invertible()
@@ -132,6 +312,15 @@ void test_qr_colpivoting()
   }
 
   for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( cod<MatrixXf>() );
+    CALL_SUBTEST_2( cod<MatrixXd>() );
+    CALL_SUBTEST_3( cod<MatrixXcd>() );
+    CALL_SUBTEST_4(( cod_fixedsize<Matrix<float,3,5>, 4 >() ));
+    CALL_SUBTEST_5(( cod_fixedsize<Matrix<double,6,2>, 3 >() ));
+    CALL_SUBTEST_5(( cod_fixedsize<Matrix<double,1,1>, 1 >() ));
+  }
+
+  for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( qr_invertible<MatrixXf>() );
     CALL_SUBTEST_2( qr_invertible<MatrixXd>() );
     CALL_SUBTEST_6( qr_invertible<MatrixXcf>() );
@@ -147,4 +336,7 @@ void test_qr_colpivoting()
 
   // Test problem size constructors
   CALL_SUBTEST_9(ColPivHouseholderQR<MatrixXf>(10, 20));
+
+  CALL_SUBTEST_1( qr_kahan_matrix<MatrixXf>() );
+  CALL_SUBTEST_2( qr_kahan_matrix<MatrixXd>() );
 }
diff --git a/test/qr_fullpivoting.cpp b/test/qr_fullpivoting.cpp
index 511f2473f..70e89c198 100644
--- a/test/qr_fullpivoting.cpp
+++ b/test/qr_fullpivoting.cpp
@@ -15,16 +15,20 @@ template<typename MatrixType> void qr()
 {
   typedef typename MatrixType::Index Index;
 
-  Index rows = internal::random<Index>(20,200), cols = internal::random<int>(20,200), cols2 = internal::random<int>(20,200);
-  Index rank = internal::random<Index>(1, (std::min)(rows, cols)-1);
+  Index max_size = EIGEN_TEST_MAX_SIZE;
+  Index min_size = numext::maxi(1,EIGEN_TEST_MAX_SIZE/10);
+  Index rows  = internal::random<Index>(min_size,max_size),
+        cols  = internal::random<Index>(min_size,max_size),
+        cols2 = internal::random<Index>(min_size,max_size),
+        rank  = internal::random<Index>(1, (std::min)(rows, cols)-1);
 
   typedef typename MatrixType::Scalar Scalar;
   typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::RowsAtCompileTime> MatrixQType;
   MatrixType m1;
   createRandomPIMatrixOfRank(rank,rows,cols,m1);
   FullPivHouseholderQR<MatrixType> qr(m1);
-  VERIFY(rank == qr.rank());
-  VERIFY(cols - qr.rank() == qr.dimensionOfKernel());
+  VERIFY_IS_EQUAL(rank, qr.rank());
+  VERIFY_IS_EQUAL(cols - qr.rank(), qr.dimensionOfKernel());
   VERIFY(!qr.isInjective());
   VERIFY(!qr.isInvertible());
   VERIFY(!qr.isSurjective());
@@ -40,12 +44,28 @@ template<typename MatrixType> void qr()
   MatrixType c = qr.matrixQ() * r * qr.colsPermutation().inverse();
 
   VERIFY_IS_APPROX(m1, c);
-
+  
+  // stress the ReturnByValue mechanism
+  MatrixType tmp;
+  VERIFY_IS_APPROX(tmp.noalias() = qr.matrixQ() * r, (qr.matrixQ() * r).eval());
+  
   MatrixType m2 = MatrixType::Random(cols,cols2);
   MatrixType m3 = m1*m2;
   m2 = MatrixType::Random(cols,cols2);
   m2 = qr.solve(m3);
   VERIFY_IS_APPROX(m3, m1*m2);
+
+  {
+    Index size = rows;
+    do {
+      m1 = MatrixType::Random(size,size);
+      qr.compute(m1);
+    } while(!qr.isInvertible());
+    MatrixType m1_inv = qr.inverse();
+    m3 = m1 * MatrixType::Random(size,cols2);
+    m2 = qr.solve(m3);
+    VERIFY_IS_APPROX(m2, m1_inv*m3);
+  }
 }
 
 template<typename MatrixType> void qr_invertible()
@@ -55,7 +75,9 @@ template<typename MatrixType> void qr_invertible()
   typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
   typedef typename MatrixType::Scalar Scalar;
 
-  int size = internal::random<int>(10,50);
+  Index max_size = numext::mini(50,EIGEN_TEST_MAX_SIZE);
+  Index min_size = numext::maxi(1,EIGEN_TEST_MAX_SIZE/10);
+  Index size = internal::random<Index>(min_size,max_size);
 
   MatrixType m1(size, size), m2(size, size), m3(size, size);
   m1 = MatrixType::Random(size,size);
diff --git a/test/rand.cpp b/test/rand.cpp
new file mode 100644
index 000000000..51cf01773
--- /dev/null
+++ b/test/rand.cpp
@@ -0,0 +1,118 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+typedef long long int64;
+
+template<typename Scalar> Scalar check_in_range(Scalar x, Scalar y)
+{
+  Scalar r = internal::random<Scalar>(x,y);
+  VERIFY(r>=x);
+  if(y>=x)
+  {
+    VERIFY(r<=y);
+  }
+  return r;
+}
+
+template<typename Scalar> void check_all_in_range(Scalar x, Scalar y)
+{
+  Array<int,1,Dynamic> mask(y-x+1);
+  mask.fill(0);
+  long n = (y-x+1)*32;
+  for(long k=0; k<n; ++k)
+  {
+    mask( check_in_range(x,y)-x )++;
+  }
+  for(Index i=0; i<mask.size(); ++i)
+    if(mask(i)==0)
+      std::cout << "WARNING: value " << x+i << " not reached." << std::endl;
+  VERIFY( (mask>0).all() );
+}
+
+template<typename Scalar> void check_histogram(Scalar x, Scalar y, int bins)
+{
+  Array<int,1,Dynamic> hist(bins);
+  hist.fill(0);
+  int f = 100000;
+  int n = bins*f;
+  int64 range = int64(y)-int64(x);
+  int divisor = int((range+1)/bins);
+  assert(((range+1)%bins)==0);
+  for(int k=0; k<n; ++k)
+  {
+    Scalar r = check_in_range(x,y);
+    hist( int((int64(r)-int64(x))/divisor) )++;
+  }
+  VERIFY( (((hist.cast<double>()/double(f))-1.0).abs()<0.02).all() );
+}
+
+void test_rand()
+{
+  long long_ref = NumTraits<long>::highest()/10;
+  signed char char_offset = (std::min)(g_repeat,64);
+  signed char short_offset = (std::min)(g_repeat,16000);
+
+  for(int i = 0; i < g_repeat*10000; i++) {
+    CALL_SUBTEST(check_in_range<float>(10,11));
+    CALL_SUBTEST(check_in_range<float>(1.24234523,1.24234523));
+    CALL_SUBTEST(check_in_range<float>(-1,1));
+    CALL_SUBTEST(check_in_range<float>(-1432.2352,-1432.2352));
+
+    CALL_SUBTEST(check_in_range<double>(10,11));
+    CALL_SUBTEST(check_in_range<double>(1.24234523,1.24234523));
+    CALL_SUBTEST(check_in_range<double>(-1,1));
+    CALL_SUBTEST(check_in_range<double>(-1432.2352,-1432.2352));
+
+    CALL_SUBTEST(check_in_range<int>(0,-1));
+    CALL_SUBTEST(check_in_range<short>(0,-1));
+    CALL_SUBTEST(check_in_range<long>(0,-1));
+    CALL_SUBTEST(check_in_range<int>(-673456,673456));
+    CALL_SUBTEST(check_in_range<int>(-RAND_MAX+10,RAND_MAX-10));
+    CALL_SUBTEST(check_in_range<short>(-24345,24345));
+    CALL_SUBTEST(check_in_range<long>(-long_ref,long_ref));
+  }
+
+  CALL_SUBTEST(check_all_in_range<signed char>(11,11));
+  CALL_SUBTEST(check_all_in_range<signed char>(11,11+char_offset));
+  CALL_SUBTEST(check_all_in_range<signed char>(-5,5));
+  CALL_SUBTEST(check_all_in_range<signed char>(-11-char_offset,-11));
+  CALL_SUBTEST(check_all_in_range<signed char>(-126,-126+char_offset));
+  CALL_SUBTEST(check_all_in_range<signed char>(126-char_offset,126));
+  CALL_SUBTEST(check_all_in_range<signed char>(-126,126));
+
+  CALL_SUBTEST(check_all_in_range<short>(11,11));
+  CALL_SUBTEST(check_all_in_range<short>(11,11+short_offset));
+  CALL_SUBTEST(check_all_in_range<short>(-5,5));
+  CALL_SUBTEST(check_all_in_range<short>(-11-short_offset,-11));
+  CALL_SUBTEST(check_all_in_range<short>(-24345,-24345+short_offset));
+  CALL_SUBTEST(check_all_in_range<short>(24345,24345+short_offset));
+
+  CALL_SUBTEST(check_all_in_range<int>(11,11));
+  CALL_SUBTEST(check_all_in_range<int>(11,11+g_repeat));
+  CALL_SUBTEST(check_all_in_range<int>(-5,5));
+  CALL_SUBTEST(check_all_in_range<int>(-11-g_repeat,-11));
+  CALL_SUBTEST(check_all_in_range<int>(-673456,-673456+g_repeat));
+  CALL_SUBTEST(check_all_in_range<int>(673456,673456+g_repeat));
+
+  CALL_SUBTEST(check_all_in_range<long>(11,11));
+  CALL_SUBTEST(check_all_in_range<long>(11,11+g_repeat));
+  CALL_SUBTEST(check_all_in_range<long>(-5,5));
+  CALL_SUBTEST(check_all_in_range<long>(-11-g_repeat,-11));
+  CALL_SUBTEST(check_all_in_range<long>(-long_ref,-long_ref+g_repeat));
+  CALL_SUBTEST(check_all_in_range<long>( long_ref, long_ref+g_repeat));
+
+  CALL_SUBTEST(check_histogram<int>(-5,5,11));
+  int bins = 100;
+  CALL_SUBTEST(check_histogram<int>(-3333,-3333+bins*(3333/bins)-1,bins));
+  bins = 1000;
+  CALL_SUBTEST(check_histogram<int>(-RAND_MAX+10,-RAND_MAX+10+bins*(RAND_MAX/bins)-1,bins));
+  CALL_SUBTEST(check_histogram<int>(-RAND_MAX+10,-int64(RAND_MAX)+10+bins*(2*int64(RAND_MAX)/bins)-1,bins));
+}
diff --git a/test/real_qz.cpp b/test/real_qz.cpp
index a1766c6d9..99ac31235 100644
--- a/test/real_qz.cpp
+++ b/test/real_qz.cpp
@@ -7,6 +7,7 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+#define EIGEN_RUNTIME_NO_MALLOC
 #include "main.h"
 #include <limits>
 #include <Eigen/Eigenvalues>
@@ -41,7 +42,11 @@ template<typename MatrixType> void real_qz(const MatrixType& m)
     break;
   }
 
-  RealQZ<MatrixType> qz(A,B);
+  RealQZ<MatrixType> qz(dim);
+  // TODO enable full-prealocation of required memory, this probably requires an in-place mode for HessenbergDecomposition
+  //Eigen::internal::set_is_malloc_allowed(false);
+  qz.compute(A,B);
+  //Eigen::internal::set_is_malloc_allowed(true);
   
   VERIFY_IS_EQUAL(qz.info(), Success);
   // check for zeros
@@ -49,11 +54,20 @@ template<typename MatrixType> void real_qz(const MatrixType& m)
   for (Index i=0; i<A.cols(); i++)
     for (Index j=0; j<i; j++) {
       if (abs(qz.matrixT()(i,j))!=Scalar(0.0))
+      {
+        std::cerr << "Error: T(" << i << "," << j << ") = " << qz.matrixT()(i,j) << std::endl;
         all_zeros = false;
+      }
       if (j<i-1 && abs(qz.matrixS()(i,j))!=Scalar(0.0))
+      {
+        std::cerr << "Error: S(" << i << "," << j << ") = " << qz.matrixS()(i,j) << std::endl;
         all_zeros = false;
+      }
       if (j==i-1 && j>0 && abs(qz.matrixS()(i,j))!=Scalar(0.0) && abs(qz.matrixS()(i-1,j-1))!=Scalar(0.0))
+      {
+        std::cerr << "Error: S(" << i << "," << j << ") = " << qz.matrixS()(i,j)  << " && S(" << i-1 << "," << j-1 << ") = " << qz.matrixS()(i-1,j-1) << std::endl;
         all_zeros = false;
+      }
     }
   VERIFY_IS_EQUAL(all_zeros, true);
   VERIFY_IS_APPROX(qz.matrixQ()*qz.matrixS()*qz.matrixZ(), A);
diff --git a/test/redux.cpp b/test/redux.cpp
index 0d176e500..989e1057b 100644
--- a/test/redux.cpp
+++ b/test/redux.cpp
@@ -2,11 +2,14 @@
 // for linear algebra.
 //
 // Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+#define TEST_ENABLE_TEMPORARY_TRACKING
+
 #include "main.h"
 
 template<typename MatrixType> void matrixRedux(const MatrixType& m)
@@ -21,7 +24,7 @@ template<typename MatrixType> void matrixRedux(const MatrixType& m)
   MatrixType m1 = MatrixType::Random(rows, cols);
 
   // The entries of m1 are uniformly distributed in [0,1], so m1.prod() is very small. This may lead to test
-  // failures if we underflow into denormals. Thus, we scale so that entires are close to 1.
+  // failures if we underflow into denormals. Thus, we scale so that entries are close to 1.
   MatrixType m1_for_prod = MatrixType::Ones(rows, cols) + RealScalar(0.2) * m1;
 
   VERIFY_IS_MUCH_SMALLER_THAN(MatrixType::Zero(rows, cols).sum(), Scalar(1));
@@ -53,10 +56,24 @@ template<typename MatrixType> void matrixRedux(const MatrixType& m)
   VERIFY_IS_APPROX(m1_for_prod.block(r0,c0,r1,c1).prod(), m1_for_prod.block(r0,c0,r1,c1).eval().prod());
   VERIFY_IS_APPROX(m1.block(r0,c0,r1,c1).real().minCoeff(), m1.block(r0,c0,r1,c1).real().eval().minCoeff());
   VERIFY_IS_APPROX(m1.block(r0,c0,r1,c1).real().maxCoeff(), m1.block(r0,c0,r1,c1).real().eval().maxCoeff());
+
+  // regression for bug 1090
+  const int R1 = MatrixType::RowsAtCompileTime>=2 ? MatrixType::RowsAtCompileTime/2 : 6;
+  const int C1 = MatrixType::ColsAtCompileTime>=2 ? MatrixType::ColsAtCompileTime/2 : 6;
+  if(R1<=rows-r0 && C1<=cols-c0)
+  {
+    VERIFY_IS_APPROX( (m1.template block<R1,C1>(r0,c0).sum()), m1.block(r0,c0,R1,C1).sum() );
+  }
   
   // test empty objects
   VERIFY_IS_APPROX(m1.block(r0,c0,0,0).sum(),   Scalar(0));
   VERIFY_IS_APPROX(m1.block(r0,c0,0,0).prod(),  Scalar(1));
+
+  // test nesting complex expression
+  VERIFY_EVALUATION_COUNT( (m1.matrix()*m1.matrix().transpose()).sum(), (MatrixType::IsVectorAtCompileTime && MatrixType::SizeAtCompileTime!=1 ? 0 : 1) );
+  Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::RowsAtCompileTime> m2(rows,rows);
+  m2.setRandom();
+  VERIFY_EVALUATION_COUNT( ((m1.matrix()*m1.matrix().transpose())+m2).sum(),(MatrixType::IsVectorAtCompileTime && MatrixType::SizeAtCompileTime!=1 ? 0 : 1));
 }
 
 template<typename VectorType> void vectorRedux(const VectorType& w)
@@ -139,8 +156,10 @@ void test_redux()
     CALL_SUBTEST_1( matrixRedux(Array<float, 1, 1>()) );
     CALL_SUBTEST_2( matrixRedux(Matrix2f()) );
     CALL_SUBTEST_2( matrixRedux(Array2f()) );
+    CALL_SUBTEST_2( matrixRedux(Array22f()) );
     CALL_SUBTEST_3( matrixRedux(Matrix4d()) );
     CALL_SUBTEST_3( matrixRedux(Array4d()) );
+    CALL_SUBTEST_3( matrixRedux(Array44d()) );
     CALL_SUBTEST_4( matrixRedux(MatrixXcf(internal::random<int>(1,maxsize), internal::random<int>(1,maxsize))) );
     CALL_SUBTEST_4( matrixRedux(ArrayXXcf(internal::random<int>(1,maxsize), internal::random<int>(1,maxsize))) );
     CALL_SUBTEST_5( matrixRedux(MatrixXd (internal::random<int>(1,maxsize), internal::random<int>(1,maxsize))) );
diff --git a/test/ref.cpp b/test/ref.cpp
index 44bbd3bf1..769db0414 100644
--- a/test/ref.cpp
+++ b/test/ref.cpp
@@ -12,27 +12,23 @@
 #undef EIGEN_DEFAULT_TO_ROW_MAJOR
 #endif
 
-static int nb_temporaries;
-
-inline void on_temporary_creation(int) {
-  // here's a great place to set a breakpoint when debugging failures in this test!
-  nb_temporaries++;
-}
-  
-
-#define EIGEN_DENSE_STORAGE_CTOR_PLUGIN { on_temporary_creation(size); }
+#define TEST_ENABLE_TEMPORARY_TRACKING
 
 #include "main.h"
 
-#define VERIFY_EVALUATION_COUNT(XPR,N) {\
-    nb_temporaries = 0; \
-    XPR; \
-    if(nb_temporaries!=N) std::cerr << "nb_temporaries == " << nb_temporaries << "\n"; \
-    VERIFY( (#XPR) && nb_temporaries==N ); \
-  }
+// test Ref.h
 
+// Deal with i387 extended precision
+#if EIGEN_ARCH_i386 && !(EIGEN_ARCH_x86_64)
 
-// test Ref.h
+#if EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_LEAST(4,4)
+#pragma GCC optimize ("-ffloat-store")
+#else
+#undef VERIFY_IS_EQUAL
+#define VERIFY_IS_EQUAL(X,Y) VERIFY_IS_APPROX(X,Y)
+#endif
+
+#endif
 
 template<typename MatrixType> void ref_matrix(const MatrixType& m)
 {
@@ -71,7 +67,6 @@ template<typename MatrixType> void ref_matrix(const MatrixType& m)
   rm2 = m2.block(i,j,brows,bcols);
   VERIFY_IS_EQUAL(m1, m2);
   
-  
   ConstRefDynMat rm3 = m1.block(i,j,brows,bcols);
   m1.block(i,j,brows,bcols) *= 2;
   m2.block(i,j,brows,bcols) *= 2;
@@ -237,6 +232,12 @@ int test_ref_overload_fun1(Ref<MatrixXf> )       { return 3; }
 int test_ref_overload_fun2(Ref<const MatrixXd> ) { return 4; }
 int test_ref_overload_fun2(Ref<const MatrixXf> ) { return 5; }
 
+void test_ref_ambiguous(const Ref<const ArrayXd> &A, Ref<ArrayXd> B)
+{
+  B = A;
+  B = A - A;
+}
+
 // See also bug 969
 void test_ref_overloads()
 {
@@ -249,6 +250,9 @@ void test_ref_overloads()
   VERIFY( test_ref_overload_fun2(Ad)==4 );
   VERIFY( test_ref_overload_fun2(Ad+Bd)==4 );
   VERIFY( test_ref_overload_fun2(Af+Bf)==5 );
+  
+  ArrayXd A, B;
+  test_ref_ambiguous(A, B);
 }
 
 void test_ref()
diff --git a/test/runtest.sh b/test/runtest.sh
deleted file mode 100755
index 2be250819..000000000
--- a/test/runtest.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/bash
-
-black='\E[30m'
-red='\E[31m'
-green='\E[32m'
-yellow='\E[33m'
-blue='\E[34m'
-magenta='\E[35m'
-cyan='\E[36m'
-white='\E[37m'
-
-if ! ./$1 > /dev/null 2> .runtest.log ; then
-  echo -e  $red Test $1 failed: $black
-  echo -e $blue
-  cat .runtest.log
-  echo -e $black
-  exit 1
-else
-echo -e $green Test $1 passed$black
-fi
diff --git a/test/rvalue_types.cpp b/test/rvalue_types.cpp
new file mode 100644
index 000000000..8887f1b1b
--- /dev/null
+++ b/test/rvalue_types.cpp
@@ -0,0 +1,64 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Hauke Heibel <hauke.heibel@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/Core>
+
+using internal::UIntPtr;
+
+#if EIGEN_HAS_RVALUE_REFERENCES
+template <typename MatrixType>
+void rvalue_copyassign(const MatrixType& m)
+{
+
+  typedef typename internal::traits<MatrixType>::Scalar Scalar;
+  
+  // create a temporary which we are about to destroy by moving
+  MatrixType tmp = m;
+  UIntPtr src_address = reinterpret_cast<UIntPtr>(tmp.data());
+  
+  // move the temporary to n
+  MatrixType n = std::move(tmp);
+  UIntPtr dst_address = reinterpret_cast<UIntPtr>(n.data());
+
+  if (MatrixType::RowsAtCompileTime==Dynamic|| MatrixType::ColsAtCompileTime==Dynamic)
+  {
+    // verify that we actually moved the guts
+    VERIFY_IS_EQUAL(src_address, dst_address);
+  }
+
+  // verify that the content did not change
+  Scalar abs_diff = (m-n).array().abs().sum();
+  VERIFY_IS_EQUAL(abs_diff, Scalar(0));
+}
+#else
+template <typename MatrixType>
+void rvalue_copyassign(const MatrixType&) {}
+#endif
+
+void test_rvalue_types()
+{
+  CALL_SUBTEST_1(rvalue_copyassign( MatrixXf::Random(50,50).eval() ));
+  CALL_SUBTEST_1(rvalue_copyassign( ArrayXXf::Random(50,50).eval() ));
+
+  CALL_SUBTEST_1(rvalue_copyassign( Matrix<float,1,Dynamic>::Random(50).eval() ));
+  CALL_SUBTEST_1(rvalue_copyassign( Array<float,1,Dynamic>::Random(50).eval() ));
+
+  CALL_SUBTEST_1(rvalue_copyassign( Matrix<float,Dynamic,1>::Random(50).eval() ));
+  CALL_SUBTEST_1(rvalue_copyassign( Array<float,Dynamic,1>::Random(50).eval() ));
+  
+  CALL_SUBTEST_2(rvalue_copyassign( Array<float,2,1>::Random().eval() ));
+  CALL_SUBTEST_2(rvalue_copyassign( Array<float,3,1>::Random().eval() ));
+  CALL_SUBTEST_2(rvalue_copyassign( Array<float,4,1>::Random().eval() ));
+
+  CALL_SUBTEST_2(rvalue_copyassign( Array<float,2,2>::Random().eval() ));
+  CALL_SUBTEST_2(rvalue_copyassign( Array<float,3,3>::Random().eval() ));
+  CALL_SUBTEST_2(rvalue_copyassign( Array<float,4,4>::Random().eval() ));
+}
diff --git a/test/schur_complex.cpp b/test/schur_complex.cpp
index 5e869790f..deb78e44e 100644
--- a/test/schur_complex.cpp
+++ b/test/schur_complex.cpp
@@ -25,7 +25,7 @@ template<typename MatrixType> void schur(int size = MatrixType::ColsAtCompileTim
     ComplexMatrixType T = schurOfA.matrixT();
     for(int row = 1; row < size; ++row) {
       for(int col = 0; col < row; ++col) {
-	VERIFY(T(row,col) == (typename MatrixType::Scalar)0);
+        VERIFY(T(row,col) == (typename MatrixType::Scalar)0);
       }
     }
     VERIFY_IS_APPROX(A.template cast<ComplexScalar>(), U * T * U.adjoint());
@@ -70,7 +70,7 @@ template<typename MatrixType> void schur(int size = MatrixType::ColsAtCompileTim
   VERIFY_IS_EQUAL(cs1.matrixT(), csOnlyT.matrixT());
   VERIFY_RAISES_ASSERT(csOnlyT.matrixU());
 
-  if (size > 1)
+  if (size > 1 && size < 20)
   {
     // Test matrix with NaN
     A(0,0) = std::numeric_limits<typename MatrixType::RealScalar>::quiet_NaN();
diff --git a/test/schur_real.cpp b/test/schur_real.cpp
index 36b9c24d1..4aede87df 100644
--- a/test/schur_real.cpp
+++ b/test/schur_real.cpp
@@ -82,7 +82,7 @@ template<typename MatrixType> void schur(int size = MatrixType::ColsAtCompileTim
   Atriangular.template triangularView<StrictlyLower>().setZero(); 
   rs3.setMaxIterations(1).compute(Atriangular); // triangular matrices do not need any iterations
   VERIFY_IS_EQUAL(rs3.info(), Success);
-  VERIFY_IS_EQUAL(rs3.matrixT(), Atriangular);
+  VERIFY_IS_APPROX(rs3.matrixT(), Atriangular); // approx because of scaling...
   VERIFY_IS_EQUAL(rs3.matrixU(), MatrixType::Identity(size, size));
 
   // Test computation of only T, not U
@@ -91,7 +91,7 @@ template<typename MatrixType> void schur(int size = MatrixType::ColsAtCompileTim
   VERIFY_IS_EQUAL(rs1.matrixT(), rsOnlyT.matrixT());
   VERIFY_RAISES_ASSERT(rsOnlyT.matrixU());
 
-  if (size > 2)
+  if (size > 2 && size < 20)
   {
     // Test matrix with NaN
     A(0,0) = std::numeric_limits<typename MatrixType::Scalar>::quiet_NaN();
diff --git a/test/selfadjoint.cpp b/test/selfadjoint.cpp
index 76dab6d64..92401e506 100644
--- a/test/selfadjoint.cpp
+++ b/test/selfadjoint.cpp
@@ -21,7 +21,9 @@ template<typename MatrixType> void selfadjoint(const MatrixType& m)
   Index cols = m.cols();
 
   MatrixType m1 = MatrixType::Random(rows, cols),
-             m3(rows, cols);
+             m2 = MatrixType::Random(rows, cols),
+             m3(rows, cols),
+             m4(rows, cols);
 
   m1.diagonal() = m1.diagonal().real().template cast<Scalar>();
 
@@ -30,10 +32,19 @@ template<typename MatrixType> void selfadjoint(const MatrixType& m)
   VERIFY_IS_APPROX(MatrixType(m3.template triangularView<Upper>()), MatrixType(m1.template triangularView<Upper>()));
   VERIFY_IS_APPROX(m3, m3.adjoint());
 
-
   m3 = m1.template selfadjointView<Lower>();
   VERIFY_IS_APPROX(MatrixType(m3.template triangularView<Lower>()), MatrixType(m1.template triangularView<Lower>()));
   VERIFY_IS_APPROX(m3, m3.adjoint());
+
+  m3 = m1.template selfadjointView<Upper>();
+  m4 = m2;
+  m4 += m1.template selfadjointView<Upper>();
+  VERIFY_IS_APPROX(m4, m2+m3);
+
+  m3 = m1.template selfadjointView<Lower>();
+  m4 = m2;
+  m4 -= m1.template selfadjointView<Lower>();
+  VERIFY_IS_APPROX(m4, m2-m3);
 }
 
 void bug_159()
diff --git a/test/simplicial_cholesky.cpp b/test/simplicial_cholesky.cpp
index 786468421..649c817b4 100644
--- a/test/simplicial_cholesky.cpp
+++ b/test/simplicial_cholesky.cpp
@@ -9,16 +9,17 @@
 
 #include "sparse_solver.h"
 
-template<typename T> void test_simplicial_cholesky_T()
+template<typename T, typename I> void test_simplicial_cholesky_T()
 {
-  SimplicialCholesky<SparseMatrix<T>, Lower> chol_colmajor_lower_amd;
-  SimplicialCholesky<SparseMatrix<T>, Upper> chol_colmajor_upper_amd;
-  SimplicialLLT<SparseMatrix<T>, Lower> llt_colmajor_lower_amd;
-  SimplicialLLT<SparseMatrix<T>, Upper> llt_colmajor_upper_amd;
-  SimplicialLDLT<SparseMatrix<T>, Lower> ldlt_colmajor_lower_amd;
-  SimplicialLDLT<SparseMatrix<T>, Upper> ldlt_colmajor_upper_amd;
-  SimplicialLDLT<SparseMatrix<T>, Lower, NaturalOrdering<int> > ldlt_colmajor_lower_nat;
-  SimplicialLDLT<SparseMatrix<T>, Upper, NaturalOrdering<int> > ldlt_colmajor_upper_nat;
+  typedef SparseMatrix<T,0,I> SparseMatrixType;
+  SimplicialCholesky<SparseMatrixType, Lower> chol_colmajor_lower_amd;
+  SimplicialCholesky<SparseMatrixType, Upper> chol_colmajor_upper_amd;
+  SimplicialLLT<     SparseMatrixType, Lower> llt_colmajor_lower_amd;
+  SimplicialLLT<     SparseMatrixType, Upper> llt_colmajor_upper_amd;
+  SimplicialLDLT<    SparseMatrixType, Lower> ldlt_colmajor_lower_amd;
+  SimplicialLDLT<    SparseMatrixType, Upper> ldlt_colmajor_upper_amd;
+  SimplicialLDLT<    SparseMatrixType, Lower, NaturalOrdering<I> > ldlt_colmajor_lower_nat;
+  SimplicialLDLT<    SparseMatrixType, Upper, NaturalOrdering<I> > ldlt_colmajor_upper_nat;
 
   check_sparse_spd_solving(chol_colmajor_lower_amd);
   check_sparse_spd_solving(chol_colmajor_upper_amd);
@@ -34,12 +35,13 @@ template<typename T> void test_simplicial_cholesky_T()
   check_sparse_spd_determinant(ldlt_colmajor_lower_amd);
   check_sparse_spd_determinant(ldlt_colmajor_upper_amd);
   
-  check_sparse_spd_solving(ldlt_colmajor_lower_nat);
-  check_sparse_spd_solving(ldlt_colmajor_upper_nat);
+  check_sparse_spd_solving(ldlt_colmajor_lower_nat, 300, 1000);
+  check_sparse_spd_solving(ldlt_colmajor_upper_nat, 300, 1000);
 }
 
 void test_simplicial_cholesky()
 {
-  CALL_SUBTEST_1(test_simplicial_cholesky_T<double>());
-  CALL_SUBTEST_2(test_simplicial_cholesky_T<std::complex<double> >());
+  CALL_SUBTEST_1(( test_simplicial_cholesky_T<double,int>() ));
+  CALL_SUBTEST_2(( test_simplicial_cholesky_T<std::complex<double>, int>() ));
+  CALL_SUBTEST_3(( test_simplicial_cholesky_T<double,long int>() ));
 }
diff --git a/test/sizeof.cpp b/test/sizeof.cpp
index d9ad35620..03ad20453 100644
--- a/test/sizeof.cpp
+++ b/test/sizeof.cpp
@@ -13,14 +13,27 @@ template<typename MatrixType> void verifySizeOf(const MatrixType&)
 {
   typedef typename MatrixType::Scalar Scalar;
   if (MatrixType::RowsAtCompileTime!=Dynamic && MatrixType::ColsAtCompileTime!=Dynamic)
-    VERIFY(std::ptrdiff_t(sizeof(MatrixType))==std::ptrdiff_t(sizeof(Scalar))*std::ptrdiff_t(MatrixType::SizeAtCompileTime));
+    VERIFY_IS_EQUAL(std::ptrdiff_t(sizeof(MatrixType)),std::ptrdiff_t(sizeof(Scalar))*std::ptrdiff_t(MatrixType::SizeAtCompileTime));
   else
-    VERIFY(sizeof(MatrixType)==sizeof(Scalar*) + 2 * sizeof(typename MatrixType::Index));
+    VERIFY_IS_EQUAL(sizeof(MatrixType),sizeof(Scalar*) + 2 * sizeof(typename MatrixType::Index));
 }
 
 void test_sizeof()
 {
   CALL_SUBTEST(verifySizeOf(Matrix<float, 1, 1>()) );
+  CALL_SUBTEST(verifySizeOf(Array<float, 2, 1>()) );
+  CALL_SUBTEST(verifySizeOf(Array<float, 3, 1>()) );
+  CALL_SUBTEST(verifySizeOf(Array<float, 4, 1>()) );
+  CALL_SUBTEST(verifySizeOf(Array<float, 5, 1>()) );
+  CALL_SUBTEST(verifySizeOf(Array<float, 6, 1>()) );
+  CALL_SUBTEST(verifySizeOf(Array<float, 7, 1>()) );
+  CALL_SUBTEST(verifySizeOf(Array<float, 8, 1>()) );
+  CALL_SUBTEST(verifySizeOf(Array<float, 9, 1>()) );
+  CALL_SUBTEST(verifySizeOf(Array<float, 10, 1>()) );
+  CALL_SUBTEST(verifySizeOf(Array<float, 11, 1>()) );
+  CALL_SUBTEST(verifySizeOf(Array<float, 12, 1>()) );
+  CALL_SUBTEST(verifySizeOf(Vector2d()) );
+  CALL_SUBTEST(verifySizeOf(Vector4f()) );
   CALL_SUBTEST(verifySizeOf(Matrix4d()) );
   CALL_SUBTEST(verifySizeOf(Matrix<double, 4, 2>()) );
   CALL_SUBTEST(verifySizeOf(Matrix<bool, 7, 5>()) );
diff --git a/test/sizeoverflow.cpp b/test/sizeoverflow.cpp
index 16d6f8d04..240d22294 100644
--- a/test/sizeoverflow.cpp
+++ b/test/sizeoverflow.cpp
@@ -18,8 +18,6 @@
     VERIFY(threw && "should have thrown bad_alloc: " #a);     \
   }
 
-typedef DenseIndex Index;
-
 template<typename MatrixType>
 void triggerMatrixBadAlloc(Index rows, Index cols)
 {
diff --git a/test/sparse.h b/test/sparse.h
index e19a76316..9912e1e24 100644
--- a/test/sparse.h
+++ b/test/sparse.h
@@ -53,15 +53,15 @@ enum {
  * \param zeroCoords and nonzeroCoords allows to get the coordinate lists of the non zero,
  *        and zero coefficients respectively.
  */
-template<typename Scalar,int Opt1,int Opt2,typename Index> void
+template<typename Scalar,int Opt1,int Opt2,typename StorageIndex> void
 initSparse(double density,
            Matrix<Scalar,Dynamic,Dynamic,Opt1>& refMat,
-           SparseMatrix<Scalar,Opt2,Index>& sparseMat,
+           SparseMatrix<Scalar,Opt2,StorageIndex>& sparseMat,
            int flags = 0,
-           std::vector<Matrix<Index,2,1> >* zeroCoords = 0,
-           std::vector<Matrix<Index,2,1> >* nonzeroCoords = 0)
+           std::vector<Matrix<StorageIndex,2,1> >* zeroCoords = 0,
+           std::vector<Matrix<StorageIndex,2,1> >* nonzeroCoords = 0)
 {
-  enum { IsRowMajor = SparseMatrix<Scalar,Opt2,Index>::IsRowMajor };
+  enum { IsRowMajor = SparseMatrix<Scalar,Opt2,StorageIndex>::IsRowMajor };
   sparseMat.setZero();
   //sparseMat.reserve(int(refMat.rows()*refMat.cols()*density));
   sparseMat.reserve(VectorXi::Constant(IsRowMajor ? refMat.rows() : refMat.cols(), int((1.5*density)*(IsRowMajor?refMat.cols():refMat.rows()))));
@@ -71,14 +71,17 @@ initSparse(double density,
     //sparseMat.startVec(j);
     for(Index i=0; i<sparseMat.innerSize(); i++)
     {
-      int ai(i), aj(j);
+      Index ai(i), aj(j);
       if(IsRowMajor)
         std::swap(ai,aj);
       Scalar v = (internal::random<double>(0,1) < density) ? internal::random<Scalar>() : Scalar(0);
       if ((flags&ForceNonZeroDiag) && (i==j))
       {
+        // FIXME: the following is too conservative
         v = internal::random<Scalar>()*Scalar(3.);
-        v = v*v + Scalar(5.);
+        v = v*v;
+        if(numext::real(v)>0) v += Scalar(5);
+        else                  v -= Scalar(5);
       }
       if ((flags & MakeLowerTriangular) && aj>ai)
         v = Scalar(0);
@@ -93,11 +96,11 @@ initSparse(double density,
         //sparseMat.insertBackByOuterInner(j,i) = v;
         sparseMat.insertByOuterInner(j,i) = v;
         if (nonzeroCoords)
-          nonzeroCoords->push_back(Matrix<Index,2,1> (ai,aj));
+          nonzeroCoords->push_back(Matrix<StorageIndex,2,1> (ai,aj));
       }
       else if (zeroCoords)
       {
-        zeroCoords->push_back(Matrix<Index,2,1> (ai,aj));
+        zeroCoords->push_back(Matrix<StorageIndex,2,1> (ai,aj));
       }
       refMat(ai,aj) = v;
     }
@@ -163,7 +166,7 @@ initSparse(double density,
 {
   sparseVec.reserve(int(refVec.size()*density));
   sparseVec.setZero();
-  for(Index i=0; i<refVec.size(); i++)
+  for(int i=0; i<refVec.size(); i++)
   {
     Scalar v = (internal::random<double>(0,1) < density) ? internal::random<Scalar>() : Scalar(0);
     if (v!=Scalar(0))
diff --git a/test/sparse_basic.cpp b/test/sparse_basic.cpp
index ce41d713c..384985028 100644
--- a/test/sparse_basic.cpp
+++ b/test/sparse_basic.cpp
@@ -9,22 +9,28 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+static long g_realloc_count = 0;
+#define EIGEN_SPARSE_COMPRESSED_STORAGE_REALLOCATE_PLUGIN g_realloc_count++;
+
 #include "sparse.h"
 
 template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& ref)
 {
-  typedef typename SparseMatrixType::Index Index;
-  typedef Matrix<Index,2,1> Vector2;
+  typedef typename SparseMatrixType::StorageIndex StorageIndex;
+  typedef Matrix<StorageIndex,2,1> Vector2;
   
   const Index rows = ref.rows();
   const Index cols = ref.cols();
+  //const Index inner = ref.innerSize();
+  //const Index outer = ref.outerSize();
+
   typedef typename SparseMatrixType::Scalar Scalar;
+  typedef typename SparseMatrixType::RealScalar RealScalar;
   enum { Flags = SparseMatrixType::Flags };
 
   double density = (std::max)(8./(rows*cols), 0.01);
   typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
   typedef Matrix<Scalar,Dynamic,1> DenseVector;
-  typedef Matrix<Scalar,1,Dynamic> RowDenseVector;
   Scalar eps = 1e-6;
 
   Scalar s1 = internal::random<Scalar>();
@@ -37,94 +43,22 @@ template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& re
     std::vector<Vector2> nonzeroCoords;
     initSparse<Scalar>(density, refMat, m, 0, &zeroCoords, &nonzeroCoords);
 
-    if (zeroCoords.size()==0 || nonzeroCoords.size()==0)
-      return;
-
     // test coeff and coeffRef
-    for (int i=0; i<(int)zeroCoords.size(); ++i)
+    for (std::size_t i=0; i<zeroCoords.size(); ++i)
     {
       VERIFY_IS_MUCH_SMALLER_THAN( m.coeff(zeroCoords[i].x(),zeroCoords[i].y()), eps );
       if(internal::is_same<SparseMatrixType,SparseMatrix<Scalar,Flags> >::value)
-        VERIFY_RAISES_ASSERT( m.coeffRef(zeroCoords[0].x(),zeroCoords[0].y()) = 5 );
+        VERIFY_RAISES_ASSERT( m.coeffRef(zeroCoords[i].x(),zeroCoords[i].y()) = 5 );
     }
     VERIFY_IS_APPROX(m, refMat);
 
-    m.coeffRef(nonzeroCoords[0].x(), nonzeroCoords[0].y()) = Scalar(5);
-    refMat.coeffRef(nonzeroCoords[0].x(), nonzeroCoords[0].y()) = Scalar(5);
+    if(!nonzeroCoords.empty()) {
+      m.coeffRef(nonzeroCoords[0].x(), nonzeroCoords[0].y()) = Scalar(5);
+      refMat.coeffRef(nonzeroCoords[0].x(), nonzeroCoords[0].y()) = Scalar(5);
+    }
 
     VERIFY_IS_APPROX(m, refMat);
-      
-      // test InnerIterators and Block expressions
-      for (int t=0; t<10; ++t)
-      {
-        int j = internal::random<int>(0,cols-1);
-        int i = internal::random<int>(0,rows-1);
-        int w = internal::random<int>(1,cols-j-1);
-        int h = internal::random<int>(1,rows-i-1);
 
-        VERIFY_IS_APPROX(m.block(i,j,h,w), refMat.block(i,j,h,w));
-        for(int c=0; c<w; c++)
-        {
-          VERIFY_IS_APPROX(m.block(i,j,h,w).col(c), refMat.block(i,j,h,w).col(c));
-          for(int r=0; r<h; r++)
-          {
-            VERIFY_IS_APPROX(m.block(i,j,h,w).col(c).coeff(r), refMat.block(i,j,h,w).col(c).coeff(r));
-            VERIFY_IS_APPROX(m.block(i,j,h,w).coeff(r,c), refMat.block(i,j,h,w).coeff(r,c));
-          }
-        }
-        for(int r=0; r<h; r++)
-        {
-          VERIFY_IS_APPROX(m.block(i,j,h,w).row(r), refMat.block(i,j,h,w).row(r));
-          for(int c=0; c<w; c++)
-          {
-            VERIFY_IS_APPROX(m.block(i,j,h,w).row(r).coeff(c), refMat.block(i,j,h,w).row(r).coeff(c));
-            VERIFY_IS_APPROX(m.block(i,j,h,w).coeff(r,c), refMat.block(i,j,h,w).coeff(r,c));
-          }
-        }
-        
-        VERIFY_IS_APPROX(m.middleCols(j,w), refMat.middleCols(j,w));
-        VERIFY_IS_APPROX(m.middleRows(i,h), refMat.middleRows(i,h));
-        for(int r=0; r<h; r++)
-        {
-          VERIFY_IS_APPROX(m.middleCols(j,w).row(r), refMat.middleCols(j,w).row(r));
-          VERIFY_IS_APPROX(m.middleRows(i,h).row(r), refMat.middleRows(i,h).row(r));
-          for(int c=0; c<w; c++)
-          {
-            VERIFY_IS_APPROX(m.col(c).coeff(r), refMat.col(c).coeff(r));
-            VERIFY_IS_APPROX(m.row(r).coeff(c), refMat.row(r).coeff(c));
-            
-            VERIFY_IS_APPROX(m.middleCols(j,w).coeff(r,c), refMat.middleCols(j,w).coeff(r,c));
-            VERIFY_IS_APPROX(m.middleRows(i,h).coeff(r,c), refMat.middleRows(i,h).coeff(r,c));
-            if(m.middleCols(j,w).coeff(r,c) != Scalar(0))
-            {
-              VERIFY_IS_APPROX(m.middleCols(j,w).coeffRef(r,c), refMat.middleCols(j,w).coeff(r,c));
-            }
-            if(m.middleRows(i,h).coeff(r,c) != Scalar(0))
-            {
-              VERIFY_IS_APPROX(m.middleRows(i,h).coeff(r,c), refMat.middleRows(i,h).coeff(r,c));
-            }
-          }
-        }
-        for(int c=0; c<w; c++)
-        {
-          VERIFY_IS_APPROX(m.middleCols(j,w).col(c), refMat.middleCols(j,w).col(c));
-          VERIFY_IS_APPROX(m.middleRows(i,h).col(c), refMat.middleRows(i,h).col(c));
-        }
-      }
-
-      for(int c=0; c<cols; c++)
-      {
-        VERIFY_IS_APPROX(m.col(c) + m.col(c), (m + m).col(c));
-        VERIFY_IS_APPROX(m.col(c) + m.col(c), refMat.col(c) + refMat.col(c));
-      }
-
-      for(int r=0; r<rows; r++)
-      {
-        VERIFY_IS_APPROX(m.row(r) + m.row(r), (m + m).row(r));
-        VERIFY_IS_APPROX(m.row(r) + m.row(r), refMat.row(r) + refMat.row(r));
-      }
-      
-      
       // test assertion
       VERIFY_RAISES_ASSERT( m.coeffRef(-1,1) = 0 );
       VERIFY_RAISES_ASSERT( m.coeffRef(0,m.cols()) = 0 );
@@ -135,17 +69,31 @@ template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& re
       DenseMatrix m1(rows,cols);
       m1.setZero();
       SparseMatrixType m2(rows,cols);
-      if(internal::random<int>()%2)
-        m2.reserve(VectorXi::Constant(m2.outerSize(), 2));
+      bool call_reserve = internal::random<int>()%2;
+      Index nnz = internal::random<int>(1,int(rows)/2);
+      if(call_reserve)
+      {
+        if(internal::random<int>()%2)
+          m2.reserve(VectorXi::Constant(m2.outerSize(), int(nnz)));
+        else
+          m2.reserve(m2.outerSize() * nnz);
+      }
+      g_realloc_count = 0;
       for (Index j=0; j<cols; ++j)
       {
-        for (Index k=0; k<rows/2; ++k)
+        for (Index k=0; k<nnz; ++k)
         {
           Index i = internal::random<Index>(0,rows-1);
           if (m1.coeff(i,j)==Scalar(0))
             m2.insert(i,j) = m1(i,j) = internal::random<Scalar>();
         }
       }
+      
+      if(call_reserve && !SparseMatrixType::IsRowMajor)
+      {
+        VERIFY(g_realloc_count==0);
+      }
+      
       m2.finalize();
       VERIFY_IS_APPROX(m2,m1);
     }
@@ -179,9 +127,9 @@ template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& re
       DenseMatrix m1(rows,cols);
       m1.setZero();
       SparseMatrixType m2(rows,cols);
-      VectorXi r(VectorXi::Constant(m2.outerSize(), ((mode%2)==0) ? m2.innerSize() : std::max<int>(1,m2.innerSize()/8)));
+      VectorXi r(VectorXi::Constant(m2.outerSize(), ((mode%2)==0) ? int(m2.innerSize()) : std::max<int>(1,int(m2.innerSize())/8)));
       m2.reserve(r);
-      for (int k=0; k<rows*cols; ++k)
+      for (Index k=0; k<rows*cols; ++k)
       {
         Index i = internal::random<Index>(0,rows-1);
         Index j = internal::random<Index>(0,cols-1);
@@ -195,110 +143,46 @@ template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& re
       VERIFY_IS_APPROX(m2,m1);
     }
 
-  // test innerVector()
-  {
-    DenseMatrix refMat2 = DenseMatrix::Zero(rows, rows);
-    SparseMatrixType m2(rows, rows);
-    initSparse<Scalar>(density, refMat2, m2);
-    Index j0 = internal::random<Index>(0,rows-1);
-    Index j1 = internal::random<Index>(0,rows-1);
-    if(SparseMatrixType::IsRowMajor)
-      VERIFY_IS_APPROX(m2.innerVector(j0), refMat2.row(j0));
-    else
-      VERIFY_IS_APPROX(m2.innerVector(j0), refMat2.col(j0));
-
-    if(SparseMatrixType::IsRowMajor)
-      VERIFY_IS_APPROX(m2.innerVector(j0)+m2.innerVector(j1), refMat2.row(j0)+refMat2.row(j1));
-    else
-      VERIFY_IS_APPROX(m2.innerVector(j0)+m2.innerVector(j1), refMat2.col(j0)+refMat2.col(j1));
-
-    SparseMatrixType m3(rows,rows);
-    m3.reserve(VectorXi::Constant(rows,rows/2));
-    for(Index j=0; j<rows; ++j)
-      for(Index k=0; k<j; ++k)
-        m3.insertByOuterInner(j,k) = k+1;
-    for(Index j=0; j<rows; ++j)
-    {
-      VERIFY(j==numext::real(m3.innerVector(j).nonZeros()));
-      if(j>0)
-        VERIFY(j==numext::real(m3.innerVector(j).lastCoeff()));
-    }
-    m3.makeCompressed();
-    for(Index j=0; j<rows; ++j)
-    {
-      VERIFY(j==numext::real(m3.innerVector(j).nonZeros()));
-      if(j>0)
-        VERIFY(j==numext::real(m3.innerVector(j).lastCoeff()));
-    }
-
-    //m2.innerVector(j0) = 2*m2.innerVector(j1);
-    //refMat2.col(j0) = 2*refMat2.col(j1);
-    //VERIFY_IS_APPROX(m2, refMat2);
-  }
-
-  // test innerVectors()
-  {
-    DenseMatrix refMat2 = DenseMatrix::Zero(rows, rows);
-    SparseMatrixType m2(rows, rows);
-    initSparse<Scalar>(density, refMat2, m2);
-    if(internal::random<float>(0,1)>0.5) m2.makeCompressed();
-    
-    Index j0 = internal::random<Index>(0,rows-2);
-    Index j1 = internal::random<Index>(0,rows-2);
-    Index n0 = internal::random<Index>(1,rows-(std::max)(j0,j1));
-    if(SparseMatrixType::IsRowMajor)
-      VERIFY_IS_APPROX(m2.innerVectors(j0,n0), refMat2.block(j0,0,n0,cols));
-    else
-      VERIFY_IS_APPROX(m2.innerVectors(j0,n0), refMat2.block(0,j0,rows,n0));
-    if(SparseMatrixType::IsRowMajor)
-      VERIFY_IS_APPROX(m2.innerVectors(j0,n0)+m2.innerVectors(j1,n0),
-                       refMat2.middleRows(j0,n0)+refMat2.middleRows(j1,n0));
-    else
-      VERIFY_IS_APPROX(m2.innerVectors(j0,n0)+m2.innerVectors(j1,n0),
-                      refMat2.block(0,j0,rows,n0)+refMat2.block(0,j1,rows,n0));
-    
-    VERIFY_IS_APPROX(m2, refMat2);
-    
-    m2.innerVectors(j0,n0) = m2.innerVectors(j0,n0) + m2.innerVectors(j1,n0);
-    if(SparseMatrixType::IsRowMajor)
-      refMat2.middleRows(j0,n0) = (refMat2.middleRows(j0,n0) + refMat2.middleRows(j1,n0)).eval();
-    else
-      refMat2.middleCols(j0,n0) = (refMat2.middleCols(j0,n0) + refMat2.middleCols(j1,n0)).eval();
-    
-    VERIFY_IS_APPROX(m2, refMat2);
-    
-  }
-  
   // test basic computations
   {
-    DenseMatrix refM1 = DenseMatrix::Zero(rows, rows);
-    DenseMatrix refM2 = DenseMatrix::Zero(rows, rows);
-    DenseMatrix refM3 = DenseMatrix::Zero(rows, rows);
-    DenseMatrix refM4 = DenseMatrix::Zero(rows, rows);
-    SparseMatrixType m1(rows, rows);
-    SparseMatrixType m2(rows, rows);
-    SparseMatrixType m3(rows, rows);
-    SparseMatrixType m4(rows, rows);
+    DenseMatrix refM1 = DenseMatrix::Zero(rows, cols);
+    DenseMatrix refM2 = DenseMatrix::Zero(rows, cols);
+    DenseMatrix refM3 = DenseMatrix::Zero(rows, cols);
+    DenseMatrix refM4 = DenseMatrix::Zero(rows, cols);
+    SparseMatrixType m1(rows, cols);
+    SparseMatrixType m2(rows, cols);
+    SparseMatrixType m3(rows, cols);
+    SparseMatrixType m4(rows, cols);
     initSparse<Scalar>(density, refM1, m1);
     initSparse<Scalar>(density, refM2, m2);
     initSparse<Scalar>(density, refM3, m3);
     initSparse<Scalar>(density, refM4, m4);
 
+    if(internal::random<bool>())
+      m1.makeCompressed();
+
+    Index m1_nnz = m1.nonZeros();
+
+    VERIFY_IS_APPROX(m1*s1, refM1*s1);
     VERIFY_IS_APPROX(m1+m2, refM1+refM2);
     VERIFY_IS_APPROX(m1+m2+m3, refM1+refM2+refM3);
     VERIFY_IS_APPROX(m3.cwiseProduct(m1+m2), refM3.cwiseProduct(refM1+refM2));
     VERIFY_IS_APPROX(m1*s1-m2, refM1*s1-refM2);
-
-    VERIFY_IS_APPROX(m1*=s1, refM1*=s1);
-    VERIFY_IS_APPROX(m1/=s1, refM1/=s1);
-
-    VERIFY_IS_APPROX(m1+=m2, refM1+=refM2);
-    VERIFY_IS_APPROX(m1-=m2, refM1-=refM2);
+    VERIFY_IS_APPROX(m4=m1/s1, refM1/s1);
+    VERIFY_IS_EQUAL(m4.nonZeros(), m1_nnz);
 
     if(SparseMatrixType::IsRowMajor)
       VERIFY_IS_APPROX(m1.innerVector(0).dot(refM2.row(0)), refM1.row(0).dot(refM2.row(0)));
     else
-      VERIFY_IS_APPROX(m1.innerVector(0).dot(refM2.row(0)), refM1.col(0).dot(refM2.row(0)));
+      VERIFY_IS_APPROX(m1.innerVector(0).dot(refM2.col(0)), refM1.col(0).dot(refM2.col(0)));
+
+    DenseVector rv = DenseVector::Random(m1.cols());
+    DenseVector cv = DenseVector::Random(m1.rows());
+    Index r = internal::random<Index>(0,m1.rows()-2);
+    Index c = internal::random<Index>(0,m1.cols()-1);
+    VERIFY_IS_APPROX(( m1.template block<1,Dynamic>(r,0,1,m1.cols()).dot(rv)) , refM1.row(r).dot(rv));
+    VERIFY_IS_APPROX(m1.row(r).dot(rv), refM1.row(r).dot(rv));
+    VERIFY_IS_APPROX(m1.col(c).dot(cv), refM1.col(c).dot(cv));
 
     VERIFY_IS_APPROX(m1.conjugate(), refM1.conjugate());
     VERIFY_IS_APPROX(m1.real(), refM1.real());
@@ -306,105 +190,167 @@ template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& re
     refM4.setRandom();
     // sparse cwise* dense
     VERIFY_IS_APPROX(m3.cwiseProduct(refM4), refM3.cwiseProduct(refM4));
+    // dense cwise* sparse
+    VERIFY_IS_APPROX(refM4.cwiseProduct(m3), refM4.cwiseProduct(refM3));
 //     VERIFY_IS_APPROX(m3.cwise()/refM4, refM3.cwise()/refM4);
 
+    VERIFY_IS_APPROX(refM4 + m3, refM4 + refM3);
+    VERIFY_IS_APPROX(m3 + refM4, refM3 + refM4);
+    VERIFY_IS_APPROX(refM4 - m3, refM4 - refM3);
+    VERIFY_IS_APPROX(m3 - refM4, refM3 - refM4);
+    VERIFY_IS_APPROX((RealScalar(0.5)*refM4 + RealScalar(0.5)*m3).eval(), RealScalar(0.5)*refM4 + RealScalar(0.5)*refM3);
+    VERIFY_IS_APPROX((RealScalar(0.5)*refM4 + m3*RealScalar(0.5)).eval(), RealScalar(0.5)*refM4 + RealScalar(0.5)*refM3);
+    VERIFY_IS_APPROX((RealScalar(0.5)*refM4 + m3.cwiseProduct(m3)).eval(), RealScalar(0.5)*refM4 + refM3.cwiseProduct(refM3));
+
+    VERIFY_IS_APPROX((RealScalar(0.5)*refM4 + RealScalar(0.5)*m3).eval(), RealScalar(0.5)*refM4 + RealScalar(0.5)*refM3);
+    VERIFY_IS_APPROX((RealScalar(0.5)*refM4 + m3*RealScalar(0.5)).eval(), RealScalar(0.5)*refM4 + RealScalar(0.5)*refM3);
+    VERIFY_IS_APPROX((RealScalar(0.5)*refM4 + (m3+m3)).eval(), RealScalar(0.5)*refM4 + (refM3+refM3));
+    VERIFY_IS_APPROX(((refM3+m3)+RealScalar(0.5)*m3).eval(), RealScalar(0.5)*refM3 + (refM3+refM3));
+    VERIFY_IS_APPROX((RealScalar(0.5)*refM4 + (refM3+m3)).eval(), RealScalar(0.5)*refM4 + (refM3+refM3));
+    VERIFY_IS_APPROX((RealScalar(0.5)*refM4 + (m3+refM3)).eval(), RealScalar(0.5)*refM4 + (refM3+refM3));
+
+
+    VERIFY_IS_APPROX(m1.sum(), refM1.sum());
+
+    m4 = m1; refM4 = m4;
+
+    VERIFY_IS_APPROX(m1*=s1, refM1*=s1);
+    VERIFY_IS_EQUAL(m1.nonZeros(), m1_nnz);
+    VERIFY_IS_APPROX(m1/=s1, refM1/=s1);
+    VERIFY_IS_EQUAL(m1.nonZeros(), m1_nnz);
+
+    VERIFY_IS_APPROX(m1+=m2, refM1+=refM2);
+    VERIFY_IS_APPROX(m1-=m2, refM1-=refM2);
+
+    if (rows>=2 && cols>=2)
+    {
+      VERIFY_RAISES_ASSERT( m1 += m1.innerVector(0) );
+      VERIFY_RAISES_ASSERT( m1 -= m1.innerVector(0) );
+      VERIFY_RAISES_ASSERT( refM1 -= m1.innerVector(0) );
+      VERIFY_RAISES_ASSERT( refM1 += m1.innerVector(0) );
+      m1 = m4; refM1 = refM4;
+    }
+
     // test aliasing
     VERIFY_IS_APPROX((m1 = -m1), (refM1 = -refM1));
+    VERIFY_IS_EQUAL(m1.nonZeros(), m1_nnz);
+    m1 = m4; refM1 = refM4;
     VERIFY_IS_APPROX((m1 = m1.transpose()), (refM1 = refM1.transpose().eval()));
+    VERIFY_IS_EQUAL(m1.nonZeros(), m1_nnz);
+    m1 = m4; refM1 = refM4;
     VERIFY_IS_APPROX((m1 = -m1.transpose()), (refM1 = -refM1.transpose().eval()));
+    VERIFY_IS_EQUAL(m1.nonZeros(), m1_nnz);
+    m1 = m4; refM1 = refM4;
     VERIFY_IS_APPROX((m1 += -m1), (refM1 += -refM1));
+    VERIFY_IS_EQUAL(m1.nonZeros(), m1_nnz);
+    m1 = m4; refM1 = refM4;
+
+    if(m1.isCompressed())
+    {
+      VERIFY_IS_APPROX(m1.coeffs().sum(), m1.sum());
+      m1.coeffs() += s1;
+      for(Index j = 0; j<m1.outerSize(); ++j)
+        for(typename SparseMatrixType::InnerIterator it(m1,j); it; ++it)
+          refM1(it.row(), it.col()) += s1;
+      VERIFY_IS_APPROX(m1, refM1);
+    }
+
+    // and/or
+    {
+      typedef SparseMatrix<bool, SparseMatrixType::Options, typename SparseMatrixType::StorageIndex> SpBool;
+      SpBool mb1 = m1.real().template cast<bool>();
+      SpBool mb2 = m2.real().template cast<bool>();
+      VERIFY_IS_EQUAL(mb1.template cast<int>().sum(), refM1.real().template cast<bool>().count());
+      VERIFY_IS_EQUAL((mb1 && mb2).template cast<int>().sum(), (refM1.real().template cast<bool>() && refM2.real().template cast<bool>()).count());
+      VERIFY_IS_EQUAL((mb1 || mb2).template cast<int>().sum(), (refM1.real().template cast<bool>() || refM2.real().template cast<bool>()).count());
+      SpBool mb3 = mb1 && mb2;
+      if(mb1.coeffs().all() && mb2.coeffs().all())
+      {
+        VERIFY_IS_EQUAL(mb3.nonZeros(), (refM1.real().template cast<bool>() && refM2.real().template cast<bool>()).count());
+      }
+    }
   }
 
-  // test transpose
+  // test reverse iterators
   {
-    DenseMatrix refMat2 = DenseMatrix::Zero(rows, rows);
-    SparseMatrixType m2(rows, rows);
+    DenseMatrix refMat2 = DenseMatrix::Zero(rows, cols);
+    SparseMatrixType m2(rows, cols);
     initSparse<Scalar>(density, refMat2, m2);
-    VERIFY_IS_APPROX(m2.transpose().eval(), refMat2.transpose().eval());
-    VERIFY_IS_APPROX(m2.transpose(), refMat2.transpose());
+    std::vector<Scalar> ref_value(m2.innerSize());
+    std::vector<Index> ref_index(m2.innerSize());
+    if(internal::random<bool>())
+      m2.makeCompressed();
+    for(Index j = 0; j<m2.outerSize(); ++j)
+    {
+      Index count_forward = 0;
 
-    VERIFY_IS_APPROX(SparseMatrixType(m2.adjoint()), refMat2.adjoint());
+      for(typename SparseMatrixType::InnerIterator it(m2,j); it; ++it)
+      {
+        ref_value[ref_value.size()-1-count_forward] = it.value();
+        ref_index[ref_index.size()-1-count_forward] = it.index();
+        count_forward++;
+      }
+      Index count_reverse = 0;
+      for(typename SparseMatrixType::ReverseInnerIterator it(m2,j); it; --it)
+      {
+        VERIFY_IS_APPROX( std::abs(ref_value[ref_value.size()-count_forward+count_reverse])+1, std::abs(it.value())+1);
+        VERIFY_IS_EQUAL( ref_index[ref_index.size()-count_forward+count_reverse] , it.index());
+        count_reverse++;
+      }
+      VERIFY_IS_EQUAL(count_forward, count_reverse);
+    }
   }
 
-  
-  
-  // test generic blocks
+  // test transpose
   {
-    DenseMatrix refMat2 = DenseMatrix::Zero(rows, rows);
-    SparseMatrixType m2(rows, rows);
+    DenseMatrix refMat2 = DenseMatrix::Zero(rows, cols);
+    SparseMatrixType m2(rows, cols);
     initSparse<Scalar>(density, refMat2, m2);
-    Index j0 = internal::random<Index>(0,rows-2);
-    Index j1 = internal::random<Index>(0,rows-2);
-    Index n0 = internal::random<Index>(1,rows-(std::max)(j0,j1));
-    if(SparseMatrixType::IsRowMajor)
-      VERIFY_IS_APPROX(m2.block(j0,0,n0,cols), refMat2.block(j0,0,n0,cols));
-    else
-      VERIFY_IS_APPROX(m2.block(0,j0,rows,n0), refMat2.block(0,j0,rows,n0));
-    
-    if(SparseMatrixType::IsRowMajor)
-      VERIFY_IS_APPROX(m2.block(j0,0,n0,cols)+m2.block(j1,0,n0,cols),
-                      refMat2.block(j0,0,n0,cols)+refMat2.block(j1,0,n0,cols));
-    else
-      VERIFY_IS_APPROX(m2.block(0,j0,rows,n0)+m2.block(0,j1,rows,n0),
-                      refMat2.block(0,j0,rows,n0)+refMat2.block(0,j1,rows,n0));
-      
-    Index i = internal::random<Index>(0,m2.outerSize()-1);
-    if(SparseMatrixType::IsRowMajor) {
-      m2.innerVector(i) = m2.innerVector(i) * s1;
-      refMat2.row(i) = refMat2.row(i) * s1;
-      VERIFY_IS_APPROX(m2,refMat2);
-    } else {
-      m2.innerVector(i) = m2.innerVector(i) * s1;
-      refMat2.col(i) = refMat2.col(i) * s1;
-      VERIFY_IS_APPROX(m2,refMat2);
-    }
-    
-    VERIFY_IS_APPROX(DenseVector(m2.col(j0)), refMat2.col(j0));
-    VERIFY_IS_APPROX(m2.col(j0), refMat2.col(j0));
-    
-    VERIFY_IS_APPROX(RowDenseVector(m2.row(j0)), refMat2.row(j0));
-    VERIFY_IS_APPROX(m2.row(j0), refMat2.row(j0));
+    VERIFY_IS_APPROX(m2.transpose().eval(), refMat2.transpose().eval());
+    VERIFY_IS_APPROX(m2.transpose(), refMat2.transpose());
+
+    VERIFY_IS_APPROX(SparseMatrixType(m2.adjoint()), refMat2.adjoint());
     
-    VERIFY_IS_APPROX(m2.block(j0,j1,n0,n0), refMat2.block(j0,j1,n0,n0));
-    VERIFY_IS_APPROX((2*m2).block(j0,j1,n0,n0), (2*refMat2).block(j0,j1,n0,n0));
+    // check isApprox handles opposite storage order
+    typename Transpose<SparseMatrixType>::PlainObject m3(m2);
+    VERIFY(m2.isApprox(m3));
   }
 
   // test prune
   {
-    SparseMatrixType m2(rows, rows);
-    DenseMatrix refM2(rows, rows);
+    SparseMatrixType m2(rows, cols);
+    DenseMatrix refM2(rows, cols);
     refM2.setZero();
     int countFalseNonZero = 0;
     int countTrueNonZero = 0;
-    for (Index j=0; j<m2.outerSize(); ++j)
+    m2.reserve(VectorXi::Constant(m2.outerSize(), int(m2.innerSize())));
+    for (Index j=0; j<m2.cols(); ++j)
     {
-      m2.startVec(j);
-      for (Index i=0; i<m2.innerSize(); ++i)
+      for (Index i=0; i<m2.rows(); ++i)
       {
         float x = internal::random<float>(0,1);
-        if (x<0.1)
+        if (x<0.1f)
         {
           // do nothing
         }
-        else if (x<0.5)
+        else if (x<0.5f)
         {
           countFalseNonZero++;
-          m2.insertBackByOuterInner(j,i) = Scalar(0);
+          m2.insert(i,j) = Scalar(0);
         }
         else
         {
           countTrueNonZero++;
-          m2.insertBackByOuterInner(j,i) = Scalar(1);
-          if(SparseMatrixType::IsRowMajor)
-            refM2(j,i) = Scalar(1);
-          else
-            refM2(i,j) = Scalar(1);
+          m2.insert(i,j) = Scalar(1);
+          refM2(i,j) = Scalar(1);
         }
       }
     }
-    m2.finalize();
+    if(internal::random<bool>())
+      m2.makeCompressed();
     VERIFY(countFalseNonZero+countTrueNonZero == m2.nonZeros());
-    VERIFY_IS_APPROX(m2, refM2);
+    if(countTrueNonZero>0)
+      VERIFY_IS_APPROX(m2, refM2);
     m2.prune(Scalar(1));
     VERIFY(countTrueNonZero==m2.nonZeros());
     VERIFY_IS_APPROX(m2, refM2);
@@ -412,29 +358,74 @@ template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& re
 
   // test setFromTriplets
   {
-    typedef Triplet<Scalar,Index> TripletType;
+    typedef Triplet<Scalar,StorageIndex> TripletType;
     std::vector<TripletType> triplets;
-    int ntriplets = rows*cols;
+    Index ntriplets = rows*cols;
     triplets.reserve(ntriplets);
-    DenseMatrix refMat(rows,cols);
-    refMat.setZero();
-    for(int i=0;i<ntriplets;++i)
+    DenseMatrix refMat_sum  = DenseMatrix::Zero(rows,cols);
+    DenseMatrix refMat_prod = DenseMatrix::Zero(rows,cols);
+    DenseMatrix refMat_last = DenseMatrix::Zero(rows,cols);
+
+    for(Index i=0;i<ntriplets;++i)
     {
-      Index r = internal::random<Index>(0,rows-1);
-      Index c = internal::random<Index>(0,cols-1);
+      StorageIndex r = internal::random<StorageIndex>(0,StorageIndex(rows-1));
+      StorageIndex c = internal::random<StorageIndex>(0,StorageIndex(cols-1));
       Scalar v = internal::random<Scalar>();
       triplets.push_back(TripletType(r,c,v));
-      refMat(r,c) += v;
+      refMat_sum(r,c) += v;
+      if(std::abs(refMat_prod(r,c))==0)
+        refMat_prod(r,c) = v;
+      else
+        refMat_prod(r,c) *= v;
+      refMat_last(r,c) = v;
     }
     SparseMatrixType m(rows,cols);
     m.setFromTriplets(triplets.begin(), triplets.end());
-    VERIFY_IS_APPROX(m, refMat);
+    VERIFY_IS_APPROX(m, refMat_sum);
+
+    m.setFromTriplets(triplets.begin(), triplets.end(), std::multiplies<Scalar>());
+    VERIFY_IS_APPROX(m, refMat_prod);
+#if (defined(__cplusplus) && __cplusplus >= 201103L)
+    m.setFromTriplets(triplets.begin(), triplets.end(), [] (Scalar,Scalar b) { return b; });
+    VERIFY_IS_APPROX(m, refMat_last);
+#endif
+  }
+  
+  // test Map
+  {
+    DenseMatrix refMat2(rows, cols), refMat3(rows, cols);
+    SparseMatrixType m2(rows, cols), m3(rows, cols);
+    initSparse<Scalar>(density, refMat2, m2);
+    initSparse<Scalar>(density, refMat3, m3);
+    {
+      Map<SparseMatrixType> mapMat2(m2.rows(), m2.cols(), m2.nonZeros(), m2.outerIndexPtr(), m2.innerIndexPtr(), m2.valuePtr(), m2.innerNonZeroPtr());
+      Map<SparseMatrixType> mapMat3(m3.rows(), m3.cols(), m3.nonZeros(), m3.outerIndexPtr(), m3.innerIndexPtr(), m3.valuePtr(), m3.innerNonZeroPtr());
+      VERIFY_IS_APPROX(mapMat2+mapMat3, refMat2+refMat3);
+      VERIFY_IS_APPROX(mapMat2+mapMat3, refMat2+refMat3);
+    }
+    {
+      MappedSparseMatrix<Scalar,SparseMatrixType::Options,StorageIndex> mapMat2(m2.rows(), m2.cols(), m2.nonZeros(), m2.outerIndexPtr(), m2.innerIndexPtr(), m2.valuePtr(), m2.innerNonZeroPtr());
+      MappedSparseMatrix<Scalar,SparseMatrixType::Options,StorageIndex> mapMat3(m3.rows(), m3.cols(), m3.nonZeros(), m3.outerIndexPtr(), m3.innerIndexPtr(), m3.valuePtr(), m3.innerNonZeroPtr());
+      VERIFY_IS_APPROX(mapMat2+mapMat3, refMat2+refMat3);
+      VERIFY_IS_APPROX(mapMat2+mapMat3, refMat2+refMat3);
+    }
+
+    Index i = internal::random<Index>(0,rows-1);
+    Index j = internal::random<Index>(0,cols-1);
+    m2.coeffRef(i,j) = 123;
+    if(internal::random<bool>())
+      m2.makeCompressed();
+    Map<SparseMatrixType> mapMat2(rows, cols, m2.nonZeros(), m2.outerIndexPtr(), m2.innerIndexPtr(), m2.valuePtr(),  m2.innerNonZeroPtr());
+    VERIFY_IS_EQUAL(m2.coeff(i,j),Scalar(123));
+    VERIFY_IS_EQUAL(mapMat2.coeff(i,j),Scalar(123));
+    mapMat2.coeffRef(i,j) = -123;
+    VERIFY_IS_EQUAL(m2.coeff(i,j),Scalar(-123));
   }
 
   // test triangularView
   {
-    DenseMatrix refMat2(rows, rows), refMat3(rows, rows);
-    SparseMatrixType m2(rows, rows), m3(rows, rows);
+    DenseMatrix refMat2(rows, cols), refMat3(rows, cols);
+    SparseMatrixType m2(rows, cols), m3(rows, cols);
     initSparse<Scalar>(density, refMat2, m2);
     refMat3 = refMat2.template triangularView<Lower>();
     m3 = m2.template triangularView<Lower>();
@@ -444,13 +435,15 @@ template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& re
     m3 = m2.template triangularView<Upper>();
     VERIFY_IS_APPROX(m3, refMat3);
 
-    refMat3 = refMat2.template triangularView<UnitUpper>();
-    m3 = m2.template triangularView<UnitUpper>();
-    VERIFY_IS_APPROX(m3, refMat3);
+    {
+      refMat3 = refMat2.template triangularView<UnitUpper>();
+      m3 = m2.template triangularView<UnitUpper>();
+      VERIFY_IS_APPROX(m3, refMat3);
 
-    refMat3 = refMat2.template triangularView<UnitLower>();
-    m3 = m2.template triangularView<UnitLower>();
-    VERIFY_IS_APPROX(m3, refMat3);
+      refMat3 = refMat2.template triangularView<UnitLower>();
+      m3 = m2.template triangularView<UnitLower>();
+      VERIFY_IS_APPROX(m3, refMat3);
+    }
 
     refMat3 = refMat2.template triangularView<StrictlyUpper>();
     m3 = m2.template triangularView<StrictlyUpper>();
@@ -459,6 +452,10 @@ template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& re
     refMat3 = refMat2.template triangularView<StrictlyLower>();
     m3 = m2.template triangularView<StrictlyLower>();
     VERIFY_IS_APPROX(m3, refMat3);
+
+    // check sparse-triangular to dense
+    refMat3 = m2.template triangularView<StrictlyUpper>();
+    VERIFY_IS_APPROX(refMat3, DenseMatrix(refMat2.template triangularView<StrictlyUpper>()));
   }
   
   // test selfadjointView
@@ -470,6 +467,19 @@ template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& re
     refMat3 = refMat2.template selfadjointView<Lower>();
     m3 = m2.template selfadjointView<Lower>();
     VERIFY_IS_APPROX(m3, refMat3);
+
+    refMat3 += refMat2.template selfadjointView<Lower>();
+    m3 += m2.template selfadjointView<Lower>();
+    VERIFY_IS_APPROX(m3, refMat3);
+
+    refMat3 -= refMat2.template selfadjointView<Lower>();
+    m3 -= m2.template selfadjointView<Lower>();
+    VERIFY_IS_APPROX(m3, refMat3);
+
+    // selfadjointView only works for square matrices:
+    SparseMatrixType m4(rows, rows+1);
+    VERIFY_RAISES_ASSERT(m4.template selfadjointView<Lower>());
+    VERIFY_RAISES_ASSERT(m4.template selfadjointView<Upper>());
   }
   
   // test sparseView
@@ -478,28 +488,59 @@ template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& re
     SparseMatrixType m2(rows, rows);
     initSparse<Scalar>(density, refMat2, m2);
     VERIFY_IS_APPROX(m2.eval(), refMat2.sparseView().eval());
+
+    // sparse view on expressions:
+    VERIFY_IS_APPROX((s1*m2).eval(), (s1*refMat2).sparseView().eval());
+    VERIFY_IS_APPROX((m2+m2).eval(), (refMat2+refMat2).sparseView().eval());
+    VERIFY_IS_APPROX((m2*m2).eval(), (refMat2.lazyProduct(refMat2)).sparseView().eval());
+    VERIFY_IS_APPROX((m2*m2).eval(), (refMat2*refMat2).sparseView().eval());
   }
 
   // test diagonal
   {
-    DenseMatrix refMat2 = DenseMatrix::Zero(rows, rows);
-    SparseMatrixType m2(rows, rows);
+    DenseMatrix refMat2 = DenseMatrix::Zero(rows, cols);
+    SparseMatrixType m2(rows, cols);
     initSparse<Scalar>(density, refMat2, m2);
     VERIFY_IS_APPROX(m2.diagonal(), refMat2.diagonal().eval());
+    DenseVector d = m2.diagonal();
+    VERIFY_IS_APPROX(d, refMat2.diagonal().eval());
+    d = m2.diagonal().array();
+    VERIFY_IS_APPROX(d, refMat2.diagonal().eval());
+    VERIFY_IS_APPROX(const_cast<const SparseMatrixType&>(m2).diagonal(), refMat2.diagonal().eval());
+    
+    initSparse<Scalar>(density, refMat2, m2, ForceNonZeroDiag);
+    m2.diagonal()      += refMat2.diagonal();
+    refMat2.diagonal() += refMat2.diagonal();
+    VERIFY_IS_APPROX(m2, refMat2);
+  }
+  
+  // test diagonal to sparse
+  {
+    DenseVector d = DenseVector::Random(rows);
+    DenseMatrix refMat2 = d.asDiagonal();
+    SparseMatrixType m2(rows, rows);
+    m2 = d.asDiagonal();
+    VERIFY_IS_APPROX(m2, refMat2);
+    SparseMatrixType m3(d.asDiagonal());
+    VERIFY_IS_APPROX(m3, refMat2);
+    refMat2 += d.asDiagonal();
+    m2 += d.asDiagonal();
+    VERIFY_IS_APPROX(m2, refMat2);
   }
   
   // test conservative resize
   {
-      std::vector< std::pair<Index,Index> > inc;
-      inc.push_back(std::pair<Index,Index>(-3,-2));
-      inc.push_back(std::pair<Index,Index>(0,0));
-      inc.push_back(std::pair<Index,Index>(3,2));
-      inc.push_back(std::pair<Index,Index>(3,0));
-      inc.push_back(std::pair<Index,Index>(0,3));
+      std::vector< std::pair<StorageIndex,StorageIndex> > inc;
+      if(rows > 3 && cols > 2)
+        inc.push_back(std::pair<StorageIndex,StorageIndex>(-3,-2));
+      inc.push_back(std::pair<StorageIndex,StorageIndex>(0,0));
+      inc.push_back(std::pair<StorageIndex,StorageIndex>(3,2));
+      inc.push_back(std::pair<StorageIndex,StorageIndex>(3,0));
+      inc.push_back(std::pair<StorageIndex,StorageIndex>(0,3));
       
       for(size_t i = 0; i< inc.size(); i++) {
-        Index incRows = inc[i].first;
-        Index incCols = inc[i].second;
+        StorageIndex incRows = inc[i].first;
+        StorageIndex incCols = inc[i].second;
         SparseMatrixType m1(rows, cols);
         DenseMatrix refMat1 = DenseMatrix::Zero(rows, cols);
         initSparse<Scalar>(density, refMat1, m1);
@@ -529,22 +570,119 @@ template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& re
     SparseMatrixType m1(rows, rows);
     m1.setIdentity();
     VERIFY_IS_APPROX(m1, refMat1);
+    for(int k=0; k<rows*rows/4; ++k)
+    {
+      Index i = internal::random<Index>(0,rows-1);
+      Index j = internal::random<Index>(0,rows-1);
+      Scalar v = internal::random<Scalar>();
+      m1.coeffRef(i,j) = v;
+      refMat1.coeffRef(i,j) = v;
+      VERIFY_IS_APPROX(m1, refMat1);
+      if(internal::random<Index>(0,10)<2)
+        m1.makeCompressed();
+    }
+    m1.setIdentity();
+    refMat1.setIdentity();
+    VERIFY_IS_APPROX(m1, refMat1);
   }
+
+  // test array/vector of InnerIterator
+  {
+    typedef typename SparseMatrixType::InnerIterator IteratorType;
+
+    DenseMatrix refMat2 = DenseMatrix::Zero(rows, cols);
+    SparseMatrixType m2(rows, cols);
+    initSparse<Scalar>(density, refMat2, m2);
+    IteratorType static_array[2];
+    static_array[0] = IteratorType(m2,0);
+    static_array[1] = IteratorType(m2,m2.outerSize()-1);
+    VERIFY( static_array[0] || m2.innerVector(static_array[0].outer()).nonZeros() == 0 );
+    VERIFY( static_array[1] || m2.innerVector(static_array[1].outer()).nonZeros() == 0 );
+    if(static_array[0] && static_array[1])
+    {
+      ++(static_array[1]);
+      static_array[1] = IteratorType(m2,0);
+      VERIFY( static_array[1] );
+      VERIFY( static_array[1].index() == static_array[0].index() );
+      VERIFY( static_array[1].outer() == static_array[0].outer() );
+      VERIFY( static_array[1].value() == static_array[0].value() );
+    }
+
+    std::vector<IteratorType> iters(2);
+    iters[0] = IteratorType(m2,0);
+    iters[1] = IteratorType(m2,m2.outerSize()-1);
+  }
+}
+
+
+template<typename SparseMatrixType>
+void big_sparse_triplet(Index rows, Index cols, double density) {
+  typedef typename SparseMatrixType::StorageIndex StorageIndex;
+  typedef typename SparseMatrixType::Scalar Scalar;
+  typedef Triplet<Scalar,Index> TripletType;
+  std::vector<TripletType> triplets;
+  double nelements = density * rows*cols;
+  VERIFY(nelements>=0 && nelements <  NumTraits<StorageIndex>::highest());
+  Index ntriplets = Index(nelements);
+  triplets.reserve(ntriplets);
+  Scalar sum = Scalar(0);
+  for(Index i=0;i<ntriplets;++i)
+  {
+    Index r = internal::random<Index>(0,rows-1);
+    Index c = internal::random<Index>(0,cols-1);
+    Scalar v = internal::random<Scalar>();
+    triplets.push_back(TripletType(r,c,v));
+    sum += v;
+  }
+  SparseMatrixType m(rows,cols);
+  m.setFromTriplets(triplets.begin(), triplets.end());
+  VERIFY(m.nonZeros() <= ntriplets);
+  VERIFY_IS_APPROX(sum, m.sum());
 }
 
+
 void test_sparse_basic()
 {
   for(int i = 0; i < g_repeat; i++) {
-    int s = Eigen::internal::random<int>(1,50);
-    EIGEN_UNUSED_VARIABLE(s);
+    int r = Eigen::internal::random<int>(1,200), c = Eigen::internal::random<int>(1,200);
+    if(Eigen::internal::random<int>(0,4) == 0) {
+      r = c; // check square matrices in 25% of tries
+    }
+    EIGEN_UNUSED_VARIABLE(r+c);
+    CALL_SUBTEST_1(( sparse_basic(SparseMatrix<double>(1, 1)) ));
     CALL_SUBTEST_1(( sparse_basic(SparseMatrix<double>(8, 8)) ));
-    CALL_SUBTEST_2(( sparse_basic(SparseMatrix<std::complex<double>, ColMajor>(s, s)) ));
-    CALL_SUBTEST_2(( sparse_basic(SparseMatrix<std::complex<double>, RowMajor>(s, s)) ));
-    CALL_SUBTEST_1(( sparse_basic(SparseMatrix<double>(s, s)) ));
-    CALL_SUBTEST_1(( sparse_basic(SparseMatrix<double,ColMajor,long int>(s, s)) ));
-    CALL_SUBTEST_1(( sparse_basic(SparseMatrix<double,RowMajor,long int>(s, s)) ));
+    CALL_SUBTEST_2(( sparse_basic(SparseMatrix<std::complex<double>, ColMajor>(r, c)) ));
+    CALL_SUBTEST_2(( sparse_basic(SparseMatrix<std::complex<double>, RowMajor>(r, c)) ));
+    CALL_SUBTEST_1(( sparse_basic(SparseMatrix<double>(r, c)) ));
+    CALL_SUBTEST_5(( sparse_basic(SparseMatrix<double,ColMajor,long int>(r, c)) ));
+    CALL_SUBTEST_5(( sparse_basic(SparseMatrix<double,RowMajor,long int>(r, c)) ));
     
-    CALL_SUBTEST_1(( sparse_basic(SparseMatrix<double,ColMajor,short int>(short(s), short(s))) ));
-    CALL_SUBTEST_1(( sparse_basic(SparseMatrix<double,RowMajor,short int>(short(s), short(s))) ));
+    r = Eigen::internal::random<int>(1,100);
+    c = Eigen::internal::random<int>(1,100);
+    if(Eigen::internal::random<int>(0,4) == 0) {
+      r = c; // check square matrices in 25% of tries
+    }
+    
+    CALL_SUBTEST_6(( sparse_basic(SparseMatrix<double,ColMajor,short int>(short(r), short(c))) ));
+    CALL_SUBTEST_6(( sparse_basic(SparseMatrix<double,RowMajor,short int>(short(r), short(c))) ));
+  }
+
+  // Regression test for bug 900: (manually insert higher values here, if you have enough RAM):
+  CALL_SUBTEST_3((big_sparse_triplet<SparseMatrix<float, RowMajor, int> >(10000, 10000, 0.125)));
+  CALL_SUBTEST_4((big_sparse_triplet<SparseMatrix<double, ColMajor, long int> >(10000, 10000, 0.125)));
+
+  // Regression test for bug 1105
+#ifdef EIGEN_TEST_PART_7
+  {
+    int n = Eigen::internal::random<int>(200,600);
+    SparseMatrix<std::complex<double>,0, long> mat(n, n);
+    std::complex<double> val;
+
+    for(int i=0; i<n; ++i)
+    {
+      mat.coeffRef(i, i%(n/10)) = val;
+      VERIFY(mat.data().allocatedSize()<20*n);
+    }
   }
+#endif
 }
diff --git a/test/sparse_block.cpp b/test/sparse_block.cpp
new file mode 100644
index 000000000..2a0b3b617
--- /dev/null
+++ b/test/sparse_block.cpp
@@ -0,0 +1,317 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "sparse.h"
+
+template<typename T>
+typename Eigen::internal::enable_if<(T::Flags&RowMajorBit)==RowMajorBit, typename T::RowXpr>::type
+innervec(T& A, Index i)
+{
+  return A.row(i);
+}
+
+template<typename T>
+typename Eigen::internal::enable_if<(T::Flags&RowMajorBit)==0, typename T::ColXpr>::type
+innervec(T& A, Index i)
+{
+  return A.col(i);
+}
+
+template<typename SparseMatrixType> void sparse_block(const SparseMatrixType& ref)
+{
+  const Index rows = ref.rows();
+  const Index cols = ref.cols();
+  const Index inner = ref.innerSize();
+  const Index outer = ref.outerSize();
+
+  typedef typename SparseMatrixType::Scalar Scalar;
+  typedef typename SparseMatrixType::StorageIndex StorageIndex;
+
+  double density = (std::max)(8./(rows*cols), 0.01);
+  typedef Matrix<Scalar,Dynamic,Dynamic,SparseMatrixType::IsRowMajor?RowMajor:ColMajor> DenseMatrix;
+  typedef Matrix<Scalar,Dynamic,1> DenseVector;
+  typedef Matrix<Scalar,1,Dynamic> RowDenseVector;
+  typedef SparseVector<Scalar> SparseVectorType;
+
+  Scalar s1 = internal::random<Scalar>();
+  {
+    SparseMatrixType m(rows, cols);
+    DenseMatrix refMat = DenseMatrix::Zero(rows, cols);
+    initSparse<Scalar>(density, refMat, m);
+
+    VERIFY_IS_APPROX(m, refMat);
+
+    // test InnerIterators and Block expressions
+    for (int t=0; t<10; ++t)
+    {
+      Index j = internal::random<Index>(0,cols-2);
+      Index i = internal::random<Index>(0,rows-2);
+      Index w = internal::random<Index>(1,cols-j);
+      Index h = internal::random<Index>(1,rows-i);
+
+      VERIFY_IS_APPROX(m.block(i,j,h,w), refMat.block(i,j,h,w));
+      for(Index c=0; c<w; c++)
+      {
+        VERIFY_IS_APPROX(m.block(i,j,h,w).col(c), refMat.block(i,j,h,w).col(c));
+        for(Index r=0; r<h; r++)
+        {
+          VERIFY_IS_APPROX(m.block(i,j,h,w).col(c).coeff(r), refMat.block(i,j,h,w).col(c).coeff(r));
+          VERIFY_IS_APPROX(m.block(i,j,h,w).coeff(r,c), refMat.block(i,j,h,w).coeff(r,c));
+        }
+      }
+      for(Index r=0; r<h; r++)
+      {
+        VERIFY_IS_APPROX(m.block(i,j,h,w).row(r), refMat.block(i,j,h,w).row(r));
+        for(Index c=0; c<w; c++)
+        {
+          VERIFY_IS_APPROX(m.block(i,j,h,w).row(r).coeff(c), refMat.block(i,j,h,w).row(r).coeff(c));
+          VERIFY_IS_APPROX(m.block(i,j,h,w).coeff(r,c), refMat.block(i,j,h,w).coeff(r,c));
+        }
+      }
+      
+      VERIFY_IS_APPROX(m.middleCols(j,w), refMat.middleCols(j,w));
+      VERIFY_IS_APPROX(m.middleRows(i,h), refMat.middleRows(i,h));
+      for(Index r=0; r<h; r++)
+      {
+        VERIFY_IS_APPROX(m.middleCols(j,w).row(r), refMat.middleCols(j,w).row(r));
+        VERIFY_IS_APPROX(m.middleRows(i,h).row(r), refMat.middleRows(i,h).row(r));
+        for(Index c=0; c<w; c++)
+        {
+          VERIFY_IS_APPROX(m.col(c).coeff(r), refMat.col(c).coeff(r));
+          VERIFY_IS_APPROX(m.row(r).coeff(c), refMat.row(r).coeff(c));
+          
+          VERIFY_IS_APPROX(m.middleCols(j,w).coeff(r,c), refMat.middleCols(j,w).coeff(r,c));
+          VERIFY_IS_APPROX(m.middleRows(i,h).coeff(r,c), refMat.middleRows(i,h).coeff(r,c));
+          if(m.middleCols(j,w).coeff(r,c) != Scalar(0))
+          {
+            VERIFY_IS_APPROX(m.middleCols(j,w).coeffRef(r,c), refMat.middleCols(j,w).coeff(r,c));
+          }
+          if(m.middleRows(i,h).coeff(r,c) != Scalar(0))
+          {
+            VERIFY_IS_APPROX(m.middleRows(i,h).coeff(r,c), refMat.middleRows(i,h).coeff(r,c));
+          }
+        }
+      }
+      for(Index c=0; c<w; c++)
+      {
+        VERIFY_IS_APPROX(m.middleCols(j,w).col(c), refMat.middleCols(j,w).col(c));
+        VERIFY_IS_APPROX(m.middleRows(i,h).col(c), refMat.middleRows(i,h).col(c));
+      }
+    }
+
+    for(Index c=0; c<cols; c++)
+    {
+      VERIFY_IS_APPROX(m.col(c) + m.col(c), (m + m).col(c));
+      VERIFY_IS_APPROX(m.col(c) + m.col(c), refMat.col(c) + refMat.col(c));
+    }
+
+    for(Index r=0; r<rows; r++)
+    {
+      VERIFY_IS_APPROX(m.row(r) + m.row(r), (m + m).row(r));
+      VERIFY_IS_APPROX(m.row(r) + m.row(r), refMat.row(r) + refMat.row(r));
+    }
+  }
+
+  // test innerVector()
+  {
+    DenseMatrix refMat2 = DenseMatrix::Zero(rows, cols);
+    SparseMatrixType m2(rows, cols);
+    initSparse<Scalar>(density, refMat2, m2);
+    Index j0 = internal::random<Index>(0,outer-1);
+    Index j1 = internal::random<Index>(0,outer-1);
+    Index r0 = internal::random<Index>(0,rows-1);
+    Index c0 = internal::random<Index>(0,cols-1);
+
+    VERIFY_IS_APPROX(m2.innerVector(j0), innervec(refMat2,j0));
+    VERIFY_IS_APPROX(m2.innerVector(j0)+m2.innerVector(j1), innervec(refMat2,j0)+innervec(refMat2,j1));
+
+    m2.innerVector(j0) *= Scalar(2);
+    innervec(refMat2,j0) *= Scalar(2);
+    VERIFY_IS_APPROX(m2, refMat2);
+
+    m2.row(r0) *= Scalar(3);
+    refMat2.row(r0) *= Scalar(3);
+    VERIFY_IS_APPROX(m2, refMat2);
+
+    m2.col(c0) *= Scalar(4);
+    refMat2.col(c0) *= Scalar(4);
+    VERIFY_IS_APPROX(m2, refMat2);
+
+    m2.row(r0) /= Scalar(3);
+    refMat2.row(r0) /= Scalar(3);
+    VERIFY_IS_APPROX(m2, refMat2);
+
+    m2.col(c0) /= Scalar(4);
+    refMat2.col(c0) /= Scalar(4);
+    VERIFY_IS_APPROX(m2, refMat2);
+
+    SparseVectorType v1;
+    VERIFY_IS_APPROX(v1 = m2.col(c0) * 4, refMat2.col(c0)*4);
+    VERIFY_IS_APPROX(v1 = m2.row(r0) * 4, refMat2.row(r0).transpose()*4);
+
+    SparseMatrixType m3(rows,cols);
+    m3.reserve(VectorXi::Constant(outer,int(inner/2)));
+    for(Index j=0; j<outer; ++j)
+      for(Index k=0; k<(std::min)(j,inner); ++k)
+        m3.insertByOuterInner(j,k) = internal::convert_index<StorageIndex>(k+1);
+    for(Index j=0; j<(std::min)(outer, inner); ++j)
+    {
+      VERIFY(j==numext::real(m3.innerVector(j).nonZeros()));
+      if(j>0)
+        VERIFY(j==numext::real(m3.innerVector(j).lastCoeff()));
+    }
+    m3.makeCompressed();
+    for(Index j=0; j<(std::min)(outer, inner); ++j)
+    {
+      VERIFY(j==numext::real(m3.innerVector(j).nonZeros()));
+      if(j>0)
+        VERIFY(j==numext::real(m3.innerVector(j).lastCoeff()));
+    }
+
+    VERIFY(m3.innerVector(j0).nonZeros() == m3.transpose().innerVector(j0).nonZeros());
+
+//     m2.innerVector(j0) = 2*m2.innerVector(j1);
+//     refMat2.col(j0) = 2*refMat2.col(j1);
+//     VERIFY_IS_APPROX(m2, refMat2);
+  }
+
+  // test innerVectors()
+  {
+    DenseMatrix refMat2 = DenseMatrix::Zero(rows, cols);
+    SparseMatrixType m2(rows, cols);
+    initSparse<Scalar>(density, refMat2, m2);
+    if(internal::random<float>(0,1)>0.5f) m2.makeCompressed();
+    Index j0 = internal::random<Index>(0,outer-2);
+    Index j1 = internal::random<Index>(0,outer-2);
+    Index n0 = internal::random<Index>(1,outer-(std::max)(j0,j1));
+    if(SparseMatrixType::IsRowMajor)
+      VERIFY_IS_APPROX(m2.innerVectors(j0,n0), refMat2.block(j0,0,n0,cols));
+    else
+      VERIFY_IS_APPROX(m2.innerVectors(j0,n0), refMat2.block(0,j0,rows,n0));
+    if(SparseMatrixType::IsRowMajor)
+      VERIFY_IS_APPROX(m2.innerVectors(j0,n0)+m2.innerVectors(j1,n0),
+                       refMat2.middleRows(j0,n0)+refMat2.middleRows(j1,n0));
+    else
+      VERIFY_IS_APPROX(m2.innerVectors(j0,n0)+m2.innerVectors(j1,n0),
+                      refMat2.block(0,j0,rows,n0)+refMat2.block(0,j1,rows,n0));
+    
+    VERIFY_IS_APPROX(m2, refMat2);
+    
+    VERIFY(m2.innerVectors(j0,n0).nonZeros() == m2.transpose().innerVectors(j0,n0).nonZeros());
+    
+    m2.innerVectors(j0,n0) = m2.innerVectors(j0,n0) + m2.innerVectors(j1,n0);
+    if(SparseMatrixType::IsRowMajor)
+      refMat2.middleRows(j0,n0) = (refMat2.middleRows(j0,n0) + refMat2.middleRows(j1,n0)).eval();
+    else
+      refMat2.middleCols(j0,n0) = (refMat2.middleCols(j0,n0) + refMat2.middleCols(j1,n0)).eval();
+    
+    VERIFY_IS_APPROX(m2, refMat2);
+  }
+
+  // test generic blocks
+  {
+    DenseMatrix refMat2 = DenseMatrix::Zero(rows, cols);
+    SparseMatrixType m2(rows, cols);
+    initSparse<Scalar>(density, refMat2, m2);
+    Index j0 = internal::random<Index>(0,outer-2);
+    Index j1 = internal::random<Index>(0,outer-2);
+    Index n0 = internal::random<Index>(1,outer-(std::max)(j0,j1));
+    if(SparseMatrixType::IsRowMajor)
+      VERIFY_IS_APPROX(m2.block(j0,0,n0,cols), refMat2.block(j0,0,n0,cols));
+    else
+      VERIFY_IS_APPROX(m2.block(0,j0,rows,n0), refMat2.block(0,j0,rows,n0));
+    
+    if(SparseMatrixType::IsRowMajor)
+      VERIFY_IS_APPROX(m2.block(j0,0,n0,cols)+m2.block(j1,0,n0,cols),
+                      refMat2.block(j0,0,n0,cols)+refMat2.block(j1,0,n0,cols));
+    else
+      VERIFY_IS_APPROX(m2.block(0,j0,rows,n0)+m2.block(0,j1,rows,n0),
+                      refMat2.block(0,j0,rows,n0)+refMat2.block(0,j1,rows,n0));
+      
+    Index i = internal::random<Index>(0,m2.outerSize()-1);
+    if(SparseMatrixType::IsRowMajor) {
+      m2.innerVector(i) = m2.innerVector(i) * s1;
+      refMat2.row(i) = refMat2.row(i) * s1;
+      VERIFY_IS_APPROX(m2,refMat2);
+    } else {
+      m2.innerVector(i) = m2.innerVector(i) * s1;
+      refMat2.col(i) = refMat2.col(i) * s1;
+      VERIFY_IS_APPROX(m2,refMat2);
+    }
+    
+    Index r0 = internal::random<Index>(0,rows-2);
+    Index c0 = internal::random<Index>(0,cols-2);
+    Index r1 = internal::random<Index>(1,rows-r0);
+    Index c1 = internal::random<Index>(1,cols-c0);
+    
+    VERIFY_IS_APPROX(DenseVector(m2.col(c0)), refMat2.col(c0));
+    VERIFY_IS_APPROX(m2.col(c0), refMat2.col(c0));
+    
+    VERIFY_IS_APPROX(RowDenseVector(m2.row(r0)), refMat2.row(r0));
+    VERIFY_IS_APPROX(m2.row(r0), refMat2.row(r0));
+
+    VERIFY_IS_APPROX(m2.block(r0,c0,r1,c1), refMat2.block(r0,c0,r1,c1));
+    VERIFY_IS_APPROX((2*m2).block(r0,c0,r1,c1), (2*refMat2).block(r0,c0,r1,c1));
+
+    if(m2.nonZeros()>0)
+    {
+      VERIFY_IS_APPROX(m2, refMat2);
+      SparseMatrixType m3(rows, cols);
+      DenseMatrix refMat3(rows, cols); refMat3.setZero();
+      Index n = internal::random<Index>(1,10);
+      for(Index k=0; k<n; ++k)
+      {
+        Index o1 = internal::random<Index>(0,outer-1);
+        Index o2 = internal::random<Index>(0,outer-1);
+        if(SparseMatrixType::IsRowMajor)
+        {
+          m3.innerVector(o1) = m2.row(o2);
+          refMat3.row(o1) = refMat2.row(o2);
+        }
+        else
+        {
+          m3.innerVector(o1) = m2.col(o2);
+          refMat3.col(o1) = refMat2.col(o2);
+        }
+        if(internal::random<bool>())
+          m3.makeCompressed();
+      }
+      if(m3.nonZeros()>0)
+      VERIFY_IS_APPROX(m3, refMat3);
+    }
+  }
+}
+
+void test_sparse_block()
+{
+  for(int i = 0; i < g_repeat; i++) {
+    int r = Eigen::internal::random<int>(1,200), c = Eigen::internal::random<int>(1,200);
+    if(Eigen::internal::random<int>(0,4) == 0) {
+      r = c; // check square matrices in 25% of tries
+    }
+    EIGEN_UNUSED_VARIABLE(r+c);
+    CALL_SUBTEST_1(( sparse_block(SparseMatrix<double>(1, 1)) ));
+    CALL_SUBTEST_1(( sparse_block(SparseMatrix<double>(8, 8)) ));
+    CALL_SUBTEST_1(( sparse_block(SparseMatrix<double>(r, c)) ));
+    CALL_SUBTEST_2(( sparse_block(SparseMatrix<std::complex<double>, ColMajor>(r, c)) ));
+    CALL_SUBTEST_2(( sparse_block(SparseMatrix<std::complex<double>, RowMajor>(r, c)) ));
+    
+    CALL_SUBTEST_3(( sparse_block(SparseMatrix<double,ColMajor,long int>(r, c)) ));
+    CALL_SUBTEST_3(( sparse_block(SparseMatrix<double,RowMajor,long int>(r, c)) ));
+    
+    r = Eigen::internal::random<int>(1,100);
+    c = Eigen::internal::random<int>(1,100);
+    if(Eigen::internal::random<int>(0,4) == 0) {
+      r = c; // check square matrices in 25% of tries
+    }
+    
+    CALL_SUBTEST_4(( sparse_block(SparseMatrix<double,ColMajor,short int>(short(r), short(c))) ));
+    CALL_SUBTEST_4(( sparse_block(SparseMatrix<double,RowMajor,short int>(short(r), short(c))) ));
+  }
+}
diff --git a/test/sparse_permutations.cpp b/test/sparse_permutations.cpp
index e4ce1d679..b82cceff8 100644
--- a/test/sparse_permutations.cpp
+++ b/test/sparse_permutations.cpp
@@ -1,25 +1,57 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2011-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+
+static long int nb_transposed_copies;
+#define EIGEN_SPARSE_TRANSPOSED_COPY_PLUGIN {nb_transposed_copies++;}
+#define VERIFY_TRANSPOSITION_COUNT(XPR,N) {\
+    nb_transposed_copies = 0; \
+    XPR; \
+    if(nb_transposed_copies!=N) std::cerr << "nb_transposed_copies == " << nb_transposed_copies << "\n"; \
+    VERIFY( (#XPR) && nb_transposed_copies==N ); \
+  }
+
 #include "sparse.h"
 
-template<int OtherStorage, typename SparseMatrixType> void sparse_permutations(const SparseMatrixType& ref)
+template<typename T>
+bool is_sorted(const T& mat) {
+  for(Index k = 0; k<mat.outerSize(); ++k)
+  {
+    Index prev = -1;
+    for(typename T::InnerIterator it(mat,k); it; ++it)
+    {
+      if(prev>=it.index())
+        return false;
+      prev = it.index();
+    }
+  }
+  return true;
+}
+
+template<typename T>
+typename internal::nested_eval<T,1>::type eval(const T &xpr)
 {
-  typedef typename SparseMatrixType::Index Index;
+  VERIFY( int(internal::nested_eval<T,1>::type::Flags&RowMajorBit) == int(internal::evaluator<T>::Flags&RowMajorBit) );
+  return xpr;
+}
 
+template<int OtherStorage, typename SparseMatrixType> void sparse_permutations(const SparseMatrixType& ref)
+{
   const Index rows = ref.rows();
   const Index cols = ref.cols();
   typedef typename SparseMatrixType::Scalar Scalar;
-  typedef typename SparseMatrixType::Index Index;
-  typedef SparseMatrix<Scalar, OtherStorage, Index> OtherSparseMatrixType;
+  typedef typename SparseMatrixType::StorageIndex StorageIndex;
+  typedef SparseMatrix<Scalar, OtherStorage, StorageIndex> OtherSparseMatrixType;
   typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
-  typedef Matrix<Index,Dynamic,1> VectorI;
+  typedef Matrix<StorageIndex,Dynamic,1> VectorI;
+//   bool IsRowMajor1 = SparseMatrixType::IsRowMajor;
+//   bool IsRowMajor2 = OtherSparseMatrixType::IsRowMajor;
   
   double density = (std::max)(8./(rows*cols), 0.01);
   
@@ -44,58 +76,69 @@ template<int OtherStorage, typename SparseMatrixType> void sparse_permutations(c
   randomPermutationVector(pi, cols);
   p.indices() = pi;
 
-  res = mat*p;
+  VERIFY( is_sorted( ::eval(mat*p) ));
+  VERIFY( is_sorted( res = mat*p ));
+  VERIFY_TRANSPOSITION_COUNT( ::eval(mat*p), 0);
+  //VERIFY_TRANSPOSITION_COUNT( res = mat*p, IsRowMajor ? 1 : 0 );
   res_d = mat_d*p;
   VERIFY(res.isApprox(res_d) && "mat*p");
 
-  res = p*mat;
+  VERIFY( is_sorted( ::eval(p*mat) ));
+  VERIFY( is_sorted( res = p*mat ));
+  VERIFY_TRANSPOSITION_COUNT( ::eval(p*mat), 0);
   res_d = p*mat_d;
   VERIFY(res.isApprox(res_d) && "p*mat");
 
-  res = mat*p.inverse();
+  VERIFY( is_sorted( (mat*p).eval() ));
+  VERIFY( is_sorted( res = mat*p.inverse() ));
+  VERIFY_TRANSPOSITION_COUNT( ::eval(mat*p.inverse()), 0);
   res_d = mat*p.inverse();
   VERIFY(res.isApprox(res_d) && "mat*inv(p)");
 
-  res = p.inverse()*mat;
+  VERIFY( is_sorted( (p*mat+p*mat).eval() ));
+  VERIFY( is_sorted( res = p.inverse()*mat ));
+  VERIFY_TRANSPOSITION_COUNT( ::eval(p.inverse()*mat), 0);
   res_d = p.inverse()*mat_d;
   VERIFY(res.isApprox(res_d) && "inv(p)*mat");
 
-  res = mat.twistedBy(p);
+  VERIFY( is_sorted( (p * mat * p.inverse()).eval() ));
+  VERIFY( is_sorted( res = mat.twistedBy(p) ));
+  VERIFY_TRANSPOSITION_COUNT( ::eval(p * mat * p.inverse()), 0);
   res_d = (p * mat_d) * p.inverse();
   VERIFY(res.isApprox(res_d) && "p*mat*inv(p)");
 
   
-  res = mat.template selfadjointView<Upper>().twistedBy(p_null);
+  VERIFY( is_sorted( res = mat.template selfadjointView<Upper>().twistedBy(p_null) ));
   res_d = up_sym_d;
   VERIFY(res.isApprox(res_d) && "full selfadjoint upper to full");
   
-  res = mat.template selfadjointView<Lower>().twistedBy(p_null);
+  VERIFY( is_sorted( res = mat.template selfadjointView<Lower>().twistedBy(p_null) ));
   res_d = lo_sym_d;
   VERIFY(res.isApprox(res_d) && "full selfadjoint lower to full");
   
   
-  res = up.template selfadjointView<Upper>().twistedBy(p_null);
+  VERIFY( is_sorted( res = up.template selfadjointView<Upper>().twistedBy(p_null) ));
   res_d = up_sym_d;
   VERIFY(res.isApprox(res_d) && "upper selfadjoint to full");
   
-  res = lo.template selfadjointView<Lower>().twistedBy(p_null);
+  VERIFY( is_sorted( res = lo.template selfadjointView<Lower>().twistedBy(p_null) ));
   res_d = lo_sym_d;
   VERIFY(res.isApprox(res_d) && "lower selfadjoint full");
 
 
-  res = mat.template selfadjointView<Upper>();
+  VERIFY( is_sorted( res = mat.template selfadjointView<Upper>() ));
   res_d = up_sym_d;
   VERIFY(res.isApprox(res_d) && "full selfadjoint upper to full");
 
-  res = mat.template selfadjointView<Lower>();
+  VERIFY( is_sorted( res = mat.template selfadjointView<Lower>() ));
   res_d = lo_sym_d;
   VERIFY(res.isApprox(res_d) && "full selfadjoint lower to full");
 
-  res = up.template selfadjointView<Upper>();
+  VERIFY( is_sorted( res = up.template selfadjointView<Upper>() ));
   res_d = up_sym_d;
   VERIFY(res.isApprox(res_d) && "upper selfadjoint to full");
 
-  res = lo.template selfadjointView<Lower>();
+  VERIFY( is_sorted( res = lo.template selfadjointView<Lower>() ));
   res_d = lo_sym_d;
   VERIFY(res.isApprox(res_d) && "lower selfadjoint full");
 
@@ -152,19 +195,19 @@ template<int OtherStorage, typename SparseMatrixType> void sparse_permutations(c
   VERIFY(res.isApprox(res_d) && "upper selfadjoint twisted to lower");
 
   
-  res = mat.template selfadjointView<Upper>().twistedBy(p);
+  VERIFY( is_sorted( res = mat.template selfadjointView<Upper>().twistedBy(p) ));
   res_d = (p * up_sym_d) * p.inverse();
   VERIFY(res.isApprox(res_d) && "full selfadjoint upper twisted to full");
   
-  res = mat.template selfadjointView<Lower>().twistedBy(p);
+  VERIFY( is_sorted( res = mat.template selfadjointView<Lower>().twistedBy(p) ));
   res_d = (p * lo_sym_d) * p.inverse();
   VERIFY(res.isApprox(res_d) && "full selfadjoint lower twisted to full");
   
-  res = up.template selfadjointView<Upper>().twistedBy(p);
+  VERIFY( is_sorted( res = up.template selfadjointView<Upper>().twistedBy(p) ));
   res_d = (p * up_sym_d) * p.inverse();
   VERIFY(res.isApprox(res_d) && "upper selfadjoint twisted to full");
   
-  res = lo.template selfadjointView<Lower>().twistedBy(p);
+  VERIFY( is_sorted( res = lo.template selfadjointView<Lower>().twistedBy(p) ));
   res_d = (p * lo_sym_d) * p.inverse();
   VERIFY(res.isApprox(res_d) && "lower selfadjoint twisted to full");
 }
@@ -184,4 +227,10 @@ void test_sparse_permutations()
     CALL_SUBTEST_1((  sparse_permutations_all<double>(s) ));
     CALL_SUBTEST_2((  sparse_permutations_all<std::complex<double> >(s) ));
   }
+
+  VERIFY((internal::is_same<internal::permutation_matrix_product<SparseMatrix<double>,OnTheRight,false,SparseShape>::ReturnType,
+                            internal::nested_eval<Product<SparseMatrix<double>,PermutationMatrix<Dynamic,Dynamic>,AliasFreeProduct>,1>::type>::value));
+
+  VERIFY((internal::is_same<internal::permutation_matrix_product<SparseMatrix<double>,OnTheLeft,false,SparseShape>::ReturnType,
+                            internal::nested_eval<Product<PermutationMatrix<Dynamic,Dynamic>,SparseMatrix<double>,AliasFreeProduct>,1>::type>::value));
 }
diff --git a/test/sparse_product.cpp b/test/sparse_product.cpp
index a2ea9d5b7..c1edd26e3 100644
--- a/test/sparse_product.cpp
+++ b/test/sparse_product.cpp
@@ -7,37 +7,29 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-#include "sparse.h"
+static long int nb_temporaries;
 
-template<typename SparseMatrixType, typename DenseMatrix, bool IsRowMajor=SparseMatrixType::IsRowMajor> struct test_outer;
+inline void on_temporary_creation() {
+  // here's a great place to set a breakpoint when debugging failures in this test!
+  nb_temporaries++;
+}
 
-template<typename SparseMatrixType, typename DenseMatrix> struct test_outer<SparseMatrixType,DenseMatrix,false> {
-  static void run(SparseMatrixType& m2, SparseMatrixType& m4, DenseMatrix& refMat2, DenseMatrix& refMat4) {
-    typedef typename SparseMatrixType::Index Index;
-    Index c  = internal::random<Index>(0,m2.cols()-1);
-    Index c1 = internal::random<Index>(0,m2.cols()-1);
-    VERIFY_IS_APPROX(m4=m2.col(c)*refMat2.col(c1).transpose(), refMat4=refMat2.col(c)*refMat2.col(c1).transpose());
-    VERIFY_IS_APPROX(m4=refMat2.col(c1)*m2.col(c).transpose(), refMat4=refMat2.col(c1)*refMat2.col(c).transpose());
-  }
-};
-
-template<typename SparseMatrixType, typename DenseMatrix> struct test_outer<SparseMatrixType,DenseMatrix,true> {
-  static void run(SparseMatrixType& m2, SparseMatrixType& m4, DenseMatrix& refMat2, DenseMatrix& refMat4) {
-    typedef typename SparseMatrixType::Index Index;
-    Index r  = internal::random<Index>(0,m2.rows()-1);
-    Index c1 = internal::random<Index>(0,m2.cols()-1);
-    VERIFY_IS_APPROX(m4=m2.row(r).transpose()*refMat2.col(c1).transpose(), refMat4=refMat2.row(r).transpose()*refMat2.col(c1).transpose());
-    VERIFY_IS_APPROX(m4=refMat2.col(c1)*m2.row(r), refMat4=refMat2.col(c1)*refMat2.row(r));
+#define EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN { on_temporary_creation(); }
+
+#include "sparse.h"
+
+#define VERIFY_EVALUATION_COUNT(XPR,N) {\
+    nb_temporaries = 0; \
+    CALL_SUBTEST( XPR ); \
+    if(nb_temporaries!=N) std::cerr << "nb_temporaries == " << nb_temporaries << "\n"; \
+    VERIFY( (#XPR) && nb_temporaries==N ); \
   }
-};
 
-// (m2,m4,refMat2,refMat4,dv1);
-//     VERIFY_IS_APPROX(m4=m2.innerVector(c)*dv1.transpose(), refMat4=refMat2.colVector(c)*dv1.transpose());
-//     VERIFY_IS_APPROX(m4=dv1*mcm.col(c).transpose(), refMat4=dv1*refMat2.col(c).transpose());
+
 
 template<typename SparseMatrixType> void sparse_product()
 {
-  typedef typename SparseMatrixType::Index Index;
+  typedef typename SparseMatrixType::StorageIndex StorageIndex;
   Index n = 100;
   const Index rows  = internal::random<Index>(1,n);
   const Index cols  = internal::random<Index>(1,n);
@@ -45,12 +37,12 @@ template<typename SparseMatrixType> void sparse_product()
   typedef typename SparseMatrixType::Scalar Scalar;
   enum { Flags = SparseMatrixType::Flags };
 
-  double density = (std::max)(8./(rows*cols), 0.1);
+  double density = (std::max)(8./(rows*cols), 0.2);
   typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
   typedef Matrix<Scalar,Dynamic,1> DenseVector;
   typedef Matrix<Scalar,1,Dynamic> RowDenseVector;
-  typedef SparseVector<Scalar,0,Index> ColSpVector;
-  typedef SparseVector<Scalar,RowMajor,Index> RowSpVector;
+  typedef SparseVector<Scalar,0,StorageIndex> ColSpVector;
+  typedef SparseVector<Scalar,RowMajor,StorageIndex> RowSpVector;
 
   Scalar s1 = internal::random<Scalar>();
   Scalar s2 = internal::random<Scalar>();
@@ -93,33 +85,124 @@ template<typename SparseMatrixType> void sparse_product()
     VERIFY_IS_APPROX(m4 = m2*m3/s1, refMat4 = refMat2*refMat3/s1);
     VERIFY_IS_APPROX(m4 = m2*m3*s1, refMat4 = refMat2*refMat3*s1);
     VERIFY_IS_APPROX(m4 = s2*m2*m3*s1, refMat4 = s2*refMat2*refMat3*s1);
+    VERIFY_IS_APPROX(m4 = (m2+m2)*m3, refMat4 = (refMat2+refMat2)*refMat3);
+    VERIFY_IS_APPROX(m4 = m2*m3.leftCols(cols/2), refMat4 = refMat2*refMat3.leftCols(cols/2));
+    VERIFY_IS_APPROX(m4 = m2*(m3+m3).leftCols(cols/2), refMat4 = refMat2*(refMat3+refMat3).leftCols(cols/2));
 
     VERIFY_IS_APPROX(m4=(m2*m3).pruned(0), refMat4=refMat2*refMat3);
     VERIFY_IS_APPROX(m4=(m2t.transpose()*m3).pruned(0), refMat4=refMat2t.transpose()*refMat3);
     VERIFY_IS_APPROX(m4=(m2t.transpose()*m3t.transpose()).pruned(0), refMat4=refMat2t.transpose()*refMat3t.transpose());
     VERIFY_IS_APPROX(m4=(m2*m3t.transpose()).pruned(0), refMat4=refMat2*refMat3t.transpose());
 
+    // make sure the right product implementation is called:
+    if((!SparseMatrixType::IsRowMajor) && m2.rows()<=m3.cols())
+    {
+      VERIFY_EVALUATION_COUNT(m4 = m2*m3, 3); // 1 temp for the result + 2 for transposing and get a sorted result.
+      VERIFY_EVALUATION_COUNT(m4 = (m2*m3).pruned(0), 1);
+      VERIFY_EVALUATION_COUNT(m4 = (m2*m3).eval().pruned(0), 4);
+    }
+
+    // and that pruning is effective:
+    {
+      DenseMatrix Ad(2,2);
+      Ad << -1, 1, 1, 1;
+      SparseMatrixType As(Ad.sparseView()), B(2,2);
+      VERIFY_IS_EQUAL( (As*As.transpose()).eval().nonZeros(), 4);
+      VERIFY_IS_EQUAL( (Ad*Ad.transpose()).eval().sparseView().eval().nonZeros(), 2);
+      VERIFY_IS_EQUAL( (As*As.transpose()).pruned(1e-6).eval().nonZeros(), 2);
+    }
+
+    // dense ?= sparse * sparse
+    VERIFY_IS_APPROX(dm4 =m2*m3, refMat4 =refMat2*refMat3);
+    VERIFY_IS_APPROX(dm4+=m2*m3, refMat4+=refMat2*refMat3);
+    VERIFY_IS_APPROX(dm4-=m2*m3, refMat4-=refMat2*refMat3);
+    VERIFY_IS_APPROX(dm4 =m2t.transpose()*m3, refMat4 =refMat2t.transpose()*refMat3);
+    VERIFY_IS_APPROX(dm4+=m2t.transpose()*m3, refMat4+=refMat2t.transpose()*refMat3);
+    VERIFY_IS_APPROX(dm4-=m2t.transpose()*m3, refMat4-=refMat2t.transpose()*refMat3);
+    VERIFY_IS_APPROX(dm4 =m2t.transpose()*m3t.transpose(), refMat4 =refMat2t.transpose()*refMat3t.transpose());
+    VERIFY_IS_APPROX(dm4+=m2t.transpose()*m3t.transpose(), refMat4+=refMat2t.transpose()*refMat3t.transpose());
+    VERIFY_IS_APPROX(dm4-=m2t.transpose()*m3t.transpose(), refMat4-=refMat2t.transpose()*refMat3t.transpose());
+    VERIFY_IS_APPROX(dm4 =m2*m3t.transpose(), refMat4 =refMat2*refMat3t.transpose());
+    VERIFY_IS_APPROX(dm4+=m2*m3t.transpose(), refMat4+=refMat2*refMat3t.transpose());
+    VERIFY_IS_APPROX(dm4-=m2*m3t.transpose(), refMat4-=refMat2*refMat3t.transpose());
+    VERIFY_IS_APPROX(dm4 = m2*m3*s1, refMat4 = refMat2*refMat3*s1);
+
     // test aliasing
     m4 = m2; refMat4 = refMat2;
     VERIFY_IS_APPROX(m4=m4*m3, refMat4=refMat4*refMat3);
 
-    // sparse * dense
+    // sparse * dense matrix
     VERIFY_IS_APPROX(dm4=m2*refMat3, refMat4=refMat2*refMat3);
     VERIFY_IS_APPROX(dm4=m2*refMat3t.transpose(), refMat4=refMat2*refMat3t.transpose());
     VERIFY_IS_APPROX(dm4=m2t.transpose()*refMat3, refMat4=refMat2t.transpose()*refMat3);
     VERIFY_IS_APPROX(dm4=m2t.transpose()*refMat3t.transpose(), refMat4=refMat2t.transpose()*refMat3t.transpose());
 
+    VERIFY_IS_APPROX(dm4=m2*refMat3, refMat4=refMat2*refMat3);
+    VERIFY_IS_APPROX(dm4=dm4+m2*refMat3, refMat4=refMat4+refMat2*refMat3);
+    VERIFY_IS_APPROX(dm4+=m2*refMat3, refMat4+=refMat2*refMat3);
+    VERIFY_IS_APPROX(dm4-=m2*refMat3, refMat4-=refMat2*refMat3);
+    VERIFY_IS_APPROX(dm4.noalias()+=m2*refMat3, refMat4+=refMat2*refMat3);
+    VERIFY_IS_APPROX(dm4.noalias()-=m2*refMat3, refMat4-=refMat2*refMat3);
     VERIFY_IS_APPROX(dm4=m2*(refMat3+refMat3), refMat4=refMat2*(refMat3+refMat3));
     VERIFY_IS_APPROX(dm4=m2t.transpose()*(refMat3+refMat5)*0.5, refMat4=refMat2t.transpose()*(refMat3+refMat5)*0.5);
+    
+    // sparse * dense vector
+    VERIFY_IS_APPROX(dm4.col(0)=m2*refMat3.col(0), refMat4.col(0)=refMat2*refMat3.col(0));
+    VERIFY_IS_APPROX(dm4.col(0)=m2*refMat3t.transpose().col(0), refMat4.col(0)=refMat2*refMat3t.transpose().col(0));
+    VERIFY_IS_APPROX(dm4.col(0)=m2t.transpose()*refMat3.col(0), refMat4.col(0)=refMat2t.transpose()*refMat3.col(0));
+    VERIFY_IS_APPROX(dm4.col(0)=m2t.transpose()*refMat3t.transpose().col(0), refMat4.col(0)=refMat2t.transpose()*refMat3t.transpose().col(0));
 
     // dense * sparse
     VERIFY_IS_APPROX(dm4=refMat2*m3, refMat4=refMat2*refMat3);
+    VERIFY_IS_APPROX(dm4=dm4+refMat2*m3, refMat4=refMat4+refMat2*refMat3);
+    VERIFY_IS_APPROX(dm4+=refMat2*m3, refMat4+=refMat2*refMat3);
+    VERIFY_IS_APPROX(dm4-=refMat2*m3, refMat4-=refMat2*refMat3);
+    VERIFY_IS_APPROX(dm4.noalias()+=refMat2*m3, refMat4+=refMat2*refMat3);
+    VERIFY_IS_APPROX(dm4.noalias()-=refMat2*m3, refMat4-=refMat2*refMat3);
     VERIFY_IS_APPROX(dm4=refMat2*m3t.transpose(), refMat4=refMat2*refMat3t.transpose());
     VERIFY_IS_APPROX(dm4=refMat2t.transpose()*m3, refMat4=refMat2t.transpose()*refMat3);
     VERIFY_IS_APPROX(dm4=refMat2t.transpose()*m3t.transpose(), refMat4=refMat2t.transpose()*refMat3t.transpose());
 
     // sparse * dense and dense * sparse outer product
-    test_outer<SparseMatrixType,DenseMatrix>::run(m2,m4,refMat2,refMat4);
+    {
+      Index c  = internal::random<Index>(0,depth-1);
+      Index r  = internal::random<Index>(0,rows-1);
+      Index c1 = internal::random<Index>(0,cols-1);
+      Index r1 = internal::random<Index>(0,depth-1);
+      DenseMatrix dm5  = DenseMatrix::Random(depth, cols);
+
+      VERIFY_IS_APPROX( m4=m2.col(c)*dm5.col(c1).transpose(), refMat4=refMat2.col(c)*dm5.col(c1).transpose());
+      VERIFY_IS_EQUAL(m4.nonZeros(), (refMat4.array()!=0).count());
+      VERIFY_IS_APPROX( m4=m2.middleCols(c,1)*dm5.col(c1).transpose(), refMat4=refMat2.col(c)*dm5.col(c1).transpose());
+      VERIFY_IS_EQUAL(m4.nonZeros(), (refMat4.array()!=0).count());
+      VERIFY_IS_APPROX(dm4=m2.col(c)*dm5.col(c1).transpose(), refMat4=refMat2.col(c)*dm5.col(c1).transpose());
+      
+      VERIFY_IS_APPROX(m4=dm5.col(c1)*m2.col(c).transpose(), refMat4=dm5.col(c1)*refMat2.col(c).transpose());
+      VERIFY_IS_EQUAL(m4.nonZeros(), (refMat4.array()!=0).count());
+      VERIFY_IS_APPROX(m4=dm5.col(c1)*m2.middleCols(c,1).transpose(), refMat4=dm5.col(c1)*refMat2.col(c).transpose());
+      VERIFY_IS_EQUAL(m4.nonZeros(), (refMat4.array()!=0).count());
+      VERIFY_IS_APPROX(dm4=dm5.col(c1)*m2.col(c).transpose(), refMat4=dm5.col(c1)*refMat2.col(c).transpose());
+
+      VERIFY_IS_APPROX( m4=dm5.row(r1).transpose()*m2.col(c).transpose(), refMat4=dm5.row(r1).transpose()*refMat2.col(c).transpose());
+      VERIFY_IS_EQUAL(m4.nonZeros(), (refMat4.array()!=0).count());
+      VERIFY_IS_APPROX(dm4=dm5.row(r1).transpose()*m2.col(c).transpose(), refMat4=dm5.row(r1).transpose()*refMat2.col(c).transpose());
+
+      VERIFY_IS_APPROX( m4=m2.row(r).transpose()*dm5.col(c1).transpose(), refMat4=refMat2.row(r).transpose()*dm5.col(c1).transpose());
+      VERIFY_IS_EQUAL(m4.nonZeros(), (refMat4.array()!=0).count());
+      VERIFY_IS_APPROX( m4=m2.middleRows(r,1).transpose()*dm5.col(c1).transpose(), refMat4=refMat2.row(r).transpose()*dm5.col(c1).transpose());
+      VERIFY_IS_EQUAL(m4.nonZeros(), (refMat4.array()!=0).count());
+      VERIFY_IS_APPROX(dm4=m2.row(r).transpose()*dm5.col(c1).transpose(), refMat4=refMat2.row(r).transpose()*dm5.col(c1).transpose());
+
+      VERIFY_IS_APPROX( m4=dm5.col(c1)*m2.row(r), refMat4=dm5.col(c1)*refMat2.row(r));
+      VERIFY_IS_EQUAL(m4.nonZeros(), (refMat4.array()!=0).count());
+      VERIFY_IS_APPROX( m4=dm5.col(c1)*m2.middleRows(r,1), refMat4=dm5.col(c1)*refMat2.row(r));
+      VERIFY_IS_EQUAL(m4.nonZeros(), (refMat4.array()!=0).count());
+      VERIFY_IS_APPROX(dm4=dm5.col(c1)*m2.row(r), refMat4=dm5.col(c1)*refMat2.row(r));
+
+      VERIFY_IS_APPROX( m4=dm5.row(r1).transpose()*m2.row(r), refMat4=dm5.row(r1).transpose()*refMat2.row(r));
+      VERIFY_IS_EQUAL(m4.nonZeros(), (refMat4.array()!=0).count());
+      VERIFY_IS_APPROX(dm4=dm5.row(r1).transpose()*m2.row(r), refMat4=dm5.row(r1).transpose()*refMat2.row(r));
+    }
 
     VERIFY_IS_APPROX(m6=m6*m6, refMat6=refMat6*refMat6);
     
@@ -131,11 +214,11 @@ template<typename SparseMatrixType> void sparse_product()
     RowSpVector rv0(depth), rv1;
     RowDenseVector drv0(depth), drv1(rv1);
     initSparse(2*density,drv0, rv0);
-    
-    VERIFY_IS_APPROX(cv1=rv0*m3, dcv1=drv0*refMat3);
+
+    VERIFY_IS_APPROX(cv1=m3*cv0, dcv1=refMat3*dcv0);    
     VERIFY_IS_APPROX(rv1=rv0*m3, drv1=drv0*refMat3);
-    VERIFY_IS_APPROX(cv1=m3*cv0, dcv1=refMat3*dcv0);
     VERIFY_IS_APPROX(cv1=m3t.adjoint()*cv0, dcv1=refMat3t.adjoint()*dcv0);
+    VERIFY_IS_APPROX(cv1=rv0*m3, dcv1=drv0*refMat3);
     VERIFY_IS_APPROX(rv1=m3*cv0, drv1=refMat3*dcv0);
   }
   
@@ -158,12 +241,16 @@ template<typename SparseMatrixType> void sparse_product()
     // also check with a SparseWrapper:
     DenseVector v1 = DenseVector::Random(cols);
     DenseVector v2 = DenseVector::Random(rows);
+    DenseVector v3 = DenseVector::Random(rows);
     VERIFY_IS_APPROX(m3=m2*v1.asDiagonal(), refM3=refM2*v1.asDiagonal());
     VERIFY_IS_APPROX(m3=m2.transpose()*v2.asDiagonal(), refM3=refM2.transpose()*v2.asDiagonal());
     VERIFY_IS_APPROX(m3=v2.asDiagonal()*m2, refM3=v2.asDiagonal()*refM2);
     VERIFY_IS_APPROX(m3=v1.asDiagonal()*m2.transpose(), refM3=v1.asDiagonal()*refM2.transpose());
     
     VERIFY_IS_APPROX(m3=v2.asDiagonal()*m2*v1.asDiagonal(), refM3=v2.asDiagonal()*refM2*v1.asDiagonal());
+
+    VERIFY_IS_APPROX(v2=m2*v1.asDiagonal()*v1, refM2*v1.asDiagonal()*v1);
+    VERIFY_IS_APPROX(v3=v2.asDiagonal()*m2*v1, v2.asDiagonal()*refM2*v1);
     
     // evaluate to a dense matrix to check the .row() and .col() iterator functions
     VERIFY_IS_APPROX(d3=m2*d1, refM3=refM2*d1);
@@ -172,7 +259,7 @@ template<typename SparseMatrixType> void sparse_product()
     VERIFY_IS_APPROX(d3=d1*m2.transpose(), refM3=d1*refM2.transpose());
   }
 
-  // test self adjoint products
+  // test self-adjoint and triangular-view products
   {
     DenseMatrix b = DenseMatrix::Random(rows, rows);
     DenseMatrix x = DenseMatrix::Random(rows, rows);
@@ -180,9 +267,12 @@ template<typename SparseMatrixType> void sparse_product()
     DenseMatrix refUp = DenseMatrix::Zero(rows, rows);
     DenseMatrix refLo = DenseMatrix::Zero(rows, rows);
     DenseMatrix refS = DenseMatrix::Zero(rows, rows);
+    DenseMatrix refA = DenseMatrix::Zero(rows, rows);
     SparseMatrixType mUp(rows, rows);
     SparseMatrixType mLo(rows, rows);
     SparseMatrixType mS(rows, rows);
+    SparseMatrixType mA(rows, rows);
+    initSparse<Scalar>(density, refA, mA);
     do {
       initSparse<Scalar>(density, refUp, mUp, ForceRealDiag|/*ForceNonZeroDiag|*/MakeUpperTriangular);
     } while (refUp.isZero());
@@ -195,26 +285,41 @@ template<typename SparseMatrixType> void sparse_product()
     for (int k=0; k<mS.outerSize(); ++k)
       for (typename SparseMatrixType::InnerIterator it(mS,k); it; ++it)
         if (it.index() == k)
-          it.valueRef() *= 0.5;
+          it.valueRef() *= Scalar(0.5);
 
     VERIFY_IS_APPROX(refS.adjoint(), refS);
     VERIFY_IS_APPROX(mS.adjoint(), mS);
     VERIFY_IS_APPROX(mS, refS);
     VERIFY_IS_APPROX(x=mS*b, refX=refS*b);
 
+    // sparse selfadjointView with dense matrices
     VERIFY_IS_APPROX(x=mUp.template selfadjointView<Upper>()*b, refX=refS*b);
     VERIFY_IS_APPROX(x=mLo.template selfadjointView<Lower>()*b, refX=refS*b);
     VERIFY_IS_APPROX(x=mS.template selfadjointView<Upper|Lower>()*b, refX=refS*b);
+
+    VERIFY_IS_APPROX(x.noalias()+=mUp.template selfadjointView<Upper>()*b, refX+=refS*b);
+    VERIFY_IS_APPROX(x.noalias()-=mLo.template selfadjointView<Lower>()*b, refX-=refS*b);
+    VERIFY_IS_APPROX(x.noalias()+=mS.template selfadjointView<Upper|Lower>()*b, refX+=refS*b);
     
-    // sparse selfadjointView * sparse 
+    // sparse selfadjointView with sparse matrices
     SparseMatrixType mSres(rows,rows);
     VERIFY_IS_APPROX(mSres = mLo.template selfadjointView<Lower>()*mS,
                      refX = refLo.template selfadjointView<Lower>()*refS);
-    // sparse * sparse selfadjointview
     VERIFY_IS_APPROX(mSres = mS * mLo.template selfadjointView<Lower>(),
                      refX = refS * refLo.template selfadjointView<Lower>());
+    
+    // sparse triangularView with dense matrices
+    VERIFY_IS_APPROX(x=mA.template triangularView<Upper>()*b, refX=refA.template triangularView<Upper>()*b);
+    VERIFY_IS_APPROX(x=mA.template triangularView<Lower>()*b, refX=refA.template triangularView<Lower>()*b);
+    VERIFY_IS_APPROX(x=b*mA.template triangularView<Upper>(), refX=b*refA.template triangularView<Upper>());
+    VERIFY_IS_APPROX(x=b*mA.template triangularView<Lower>(), refX=b*refA.template triangularView<Lower>());
+    
+    // sparse triangularView with sparse matrices
+    VERIFY_IS_APPROX(mSres = mA.template triangularView<Lower>()*mS,   refX = refA.template triangularView<Lower>()*refS);
+    VERIFY_IS_APPROX(mSres = mS * mA.template triangularView<Lower>(), refX = refS * refA.template triangularView<Lower>());
+    VERIFY_IS_APPROX(mSres = mA.template triangularView<Upper>()*mS,   refX = refA.template triangularView<Upper>()*refS);
+    VERIFY_IS_APPROX(mSres = mS * mA.template triangularView<Upper>(), refX = refS * refA.template triangularView<Upper>());
   }
-  
 }
 
 // New test for Bug in SparseTimeDenseProduct
@@ -239,11 +344,35 @@ template<typename SparseMatrixType, typename DenseMatrixType> void sparse_produc
   VERIFY_IS_APPROX( m4(0,0), 0.0 );
 }
 
+template<typename Scalar>
+void bug_942()
+{
+  typedef Matrix<Scalar, Dynamic, 1>     Vector;
+  typedef SparseMatrix<Scalar, ColMajor> ColSpMat;
+  typedef SparseMatrix<Scalar, RowMajor> RowSpMat;
+  ColSpMat cmA(1,1);
+  cmA.insert(0,0) = 1;
+
+  RowSpMat rmA(1,1);
+  rmA.insert(0,0) = 1;
+
+  Vector d(1);
+  d[0] = 2;
+  
+  double res = 2;
+  
+  VERIFY_IS_APPROX( ( cmA*d.asDiagonal() ).eval().coeff(0,0), res );
+  VERIFY_IS_APPROX( ( d.asDiagonal()*rmA ).eval().coeff(0,0), res );
+  VERIFY_IS_APPROX( ( rmA*d.asDiagonal() ).eval().coeff(0,0), res );
+  VERIFY_IS_APPROX( ( d.asDiagonal()*cmA ).eval().coeff(0,0), res );
+}
+
 void test_sparse_product()
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( (sparse_product<SparseMatrix<double,ColMajor> >()) );
     CALL_SUBTEST_1( (sparse_product<SparseMatrix<double,RowMajor> >()) );
+    CALL_SUBTEST_1( (bug_942<double>()) );
     CALL_SUBTEST_2( (sparse_product<SparseMatrix<std::complex<double>, ColMajor > >()) );
     CALL_SUBTEST_2( (sparse_product<SparseMatrix<std::complex<double>, RowMajor > >()) );
     CALL_SUBTEST_3( (sparse_product<SparseMatrix<float,ColMajor,long int> >()) );
diff --git a/test/sparse_ref.cpp b/test/sparse_ref.cpp
new file mode 100644
index 000000000..5e9607234
--- /dev/null
+++ b/test/sparse_ref.cpp
@@ -0,0 +1,139 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 20015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// This unit test cannot be easily written to work with EIGEN_DEFAULT_TO_ROW_MAJOR
+#ifdef EIGEN_DEFAULT_TO_ROW_MAJOR
+#undef EIGEN_DEFAULT_TO_ROW_MAJOR
+#endif
+
+static long int nb_temporaries;
+
+inline void on_temporary_creation() {
+  // here's a great place to set a breakpoint when debugging failures in this test!
+  nb_temporaries++;
+}
+
+#define EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN { on_temporary_creation(); }
+
+#include "main.h"
+#include <Eigen/SparseCore>
+
+#define VERIFY_EVALUATION_COUNT(XPR,N) {\
+    nb_temporaries = 0; \
+    CALL_SUBTEST( XPR ); \
+    if(nb_temporaries!=N) std::cerr << "nb_temporaries == " << nb_temporaries << "\n"; \
+    VERIFY( (#XPR) && nb_temporaries==N ); \
+  }
+
+template<typename PlainObjectType> void check_const_correctness(const PlainObjectType&)
+{
+  // verify that ref-to-const don't have LvalueBit
+  typedef typename internal::add_const<PlainObjectType>::type ConstPlainObjectType;
+  VERIFY( !(internal::traits<Ref<ConstPlainObjectType> >::Flags & LvalueBit) );
+  VERIFY( !(internal::traits<Ref<ConstPlainObjectType, Aligned> >::Flags & LvalueBit) );
+  VERIFY( !(Ref<ConstPlainObjectType>::Flags & LvalueBit) );
+  VERIFY( !(Ref<ConstPlainObjectType, Aligned>::Flags & LvalueBit) );
+}
+
+template<typename B>
+EIGEN_DONT_INLINE void call_ref_1(Ref<SparseMatrix<float> > a, const B &b) { VERIFY_IS_EQUAL(a.toDense(),b.toDense()); }
+
+template<typename B>
+EIGEN_DONT_INLINE void call_ref_2(const Ref<const SparseMatrix<float> >& a, const B &b) { VERIFY_IS_EQUAL(a.toDense(),b.toDense()); }
+
+template<typename B>
+EIGEN_DONT_INLINE void call_ref_3(const Ref<const SparseMatrix<float>, StandardCompressedFormat>& a, const B &b) {
+  VERIFY(a.isCompressed());
+  VERIFY_IS_EQUAL(a.toDense(),b.toDense());
+}
+
+template<typename B>
+EIGEN_DONT_INLINE void call_ref_4(Ref<SparseVector<float> > a, const B &b) { VERIFY_IS_EQUAL(a.toDense(),b.toDense()); }
+
+template<typename B>
+EIGEN_DONT_INLINE void call_ref_5(const Ref<const SparseVector<float> >& a, const B &b) { VERIFY_IS_EQUAL(a.toDense(),b.toDense()); }
+
+void call_ref()
+{
+  SparseMatrix<float>               A = MatrixXf::Random(10,10).sparseView(0.5,1);
+  SparseMatrix<float,RowMajor>      B = MatrixXf::Random(10,10).sparseView(0.5,1);
+  SparseMatrix<float>               C = MatrixXf::Random(10,10).sparseView(0.5,1);
+  C.reserve(VectorXi::Constant(C.outerSize(), 2));
+  const SparseMatrix<float>&        Ac(A);
+  Block<SparseMatrix<float> >       Ab(A,0,1, 3,3);
+  const Block<SparseMatrix<float> > Abc(A,0,1,3,3);
+  SparseVector<float>               vc =  VectorXf::Random(10).sparseView(0.5,1);
+  SparseVector<float,RowMajor>      vr =  VectorXf::Random(10).sparseView(0.5,1);
+  SparseMatrix<float> AA = A*A;
+  
+
+  VERIFY_EVALUATION_COUNT( call_ref_1(A, A),  0);
+//   VERIFY_EVALUATION_COUNT( call_ref_1(Ac, Ac),  0); // does not compile on purpose
+  VERIFY_EVALUATION_COUNT( call_ref_2(A, A),  0);
+  VERIFY_EVALUATION_COUNT( call_ref_3(A, A),  0);
+  VERIFY_EVALUATION_COUNT( call_ref_2(A.transpose(), A.transpose()),  1);
+  VERIFY_EVALUATION_COUNT( call_ref_3(A.transpose(), A.transpose()),  1);
+  VERIFY_EVALUATION_COUNT( call_ref_2(Ac,Ac), 0);
+  VERIFY_EVALUATION_COUNT( call_ref_3(Ac,Ac), 0);
+  VERIFY_EVALUATION_COUNT( call_ref_2(A+A,2*Ac), 1);
+  VERIFY_EVALUATION_COUNT( call_ref_3(A+A,2*Ac), 1);
+  VERIFY_EVALUATION_COUNT( call_ref_2(B, B),  1);
+  VERIFY_EVALUATION_COUNT( call_ref_3(B, B),  1);
+  VERIFY_EVALUATION_COUNT( call_ref_2(B.transpose(), B.transpose()),  0);
+  VERIFY_EVALUATION_COUNT( call_ref_3(B.transpose(), B.transpose()),  0);
+  VERIFY_EVALUATION_COUNT( call_ref_2(A*A, AA),  3);
+  VERIFY_EVALUATION_COUNT( call_ref_3(A*A, AA),  3);
+  
+  VERIFY(!C.isCompressed());
+  VERIFY_EVALUATION_COUNT( call_ref_3(C, C),  1);
+  
+  Ref<SparseMatrix<float> > Ar(A);
+  VERIFY_IS_APPROX(Ar+Ar, A+A);
+  VERIFY_EVALUATION_COUNT( call_ref_1(Ar, A),  0);
+  VERIFY_EVALUATION_COUNT( call_ref_2(Ar, A),  0);
+  
+  Ref<SparseMatrix<float,RowMajor> > Br(B);
+  VERIFY_EVALUATION_COUNT( call_ref_1(Br.transpose(), Br.transpose()),  0);
+  VERIFY_EVALUATION_COUNT( call_ref_2(Br, Br),  1);
+  VERIFY_EVALUATION_COUNT( call_ref_2(Br.transpose(), Br.transpose()),  0);
+  
+  Ref<const SparseMatrix<float> > Arc(A);
+//   VERIFY_EVALUATION_COUNT( call_ref_1(Arc, Arc),  0); // does not compile on purpose
+  VERIFY_EVALUATION_COUNT( call_ref_2(Arc, Arc),  0);
+  
+  VERIFY_EVALUATION_COUNT( call_ref_2(A.middleCols(1,3), A.middleCols(1,3)),  0);
+  
+  VERIFY_EVALUATION_COUNT( call_ref_2(A.col(2), A.col(2)),  0);
+  VERIFY_EVALUATION_COUNT( call_ref_2(vc, vc),  0);
+  VERIFY_EVALUATION_COUNT( call_ref_2(vr.transpose(), vr.transpose()),  0);
+  VERIFY_EVALUATION_COUNT( call_ref_2(vr, vr.transpose()),  0);
+  
+  VERIFY_EVALUATION_COUNT( call_ref_2(A.block(1,1,3,3), A.block(1,1,3,3)),  1); // should be 0 (allocate starts/nnz only)
+
+  VERIFY_EVALUATION_COUNT( call_ref_4(vc, vc),  0);
+  VERIFY_EVALUATION_COUNT( call_ref_4(vr, vr.transpose()),  0);
+  VERIFY_EVALUATION_COUNT( call_ref_5(vc, vc),  0);
+  VERIFY_EVALUATION_COUNT( call_ref_5(vr, vr.transpose()),  0);
+  VERIFY_EVALUATION_COUNT( call_ref_4(A.col(2), A.col(2)),  0);
+  VERIFY_EVALUATION_COUNT( call_ref_5(A.col(2), A.col(2)),  0);
+  // VERIFY_EVALUATION_COUNT( call_ref_4(A.row(2), A.row(2).transpose()),  1); // does not compile on purpose
+  VERIFY_EVALUATION_COUNT( call_ref_5(A.row(2), A.row(2).transpose()),  1);
+}
+
+void test_sparse_ref()
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( check_const_correctness(SparseMatrix<float>()) );
+    CALL_SUBTEST_1( check_const_correctness(SparseMatrix<double,RowMajor>()) );
+    CALL_SUBTEST_2( call_ref() );
+
+    CALL_SUBTEST_3( check_const_correctness(SparseVector<float>()) );
+    CALL_SUBTEST_3( check_const_correctness(SparseVector<double,RowMajor>()) );
+  }
+}
diff --git a/test/sparse_solver.h b/test/sparse_solver.h
index 59d77daa2..5145bc3eb 100644
--- a/test/sparse_solver.h
+++ b/test/sparse_solver.h
@@ -9,102 +9,167 @@
 
 #include "sparse.h"
 #include <Eigen/SparseCore>
+#include <sstream>
+
+template<typename Solver, typename Rhs, typename Guess,typename Result>
+void solve_with_guess(IterativeSolverBase<Solver>& solver, const MatrixBase<Rhs>& b, const Guess& g, Result &x) {
+  if(internal::random<bool>())
+  {
+    // With a temporary through evaluator<SolveWithGuess>
+    x = solver.derived().solveWithGuess(b,g) + Result::Zero(x.rows(), x.cols());
+  }
+  else
+  {
+    // direct evaluation within x through Assignment<Result,SolveWithGuess>
+    x = solver.derived().solveWithGuess(b.derived(),g);
+  }
+}
+
+template<typename Solver, typename Rhs, typename Guess,typename Result>
+void solve_with_guess(SparseSolverBase<Solver>& solver, const MatrixBase<Rhs>& b, const Guess& , Result& x) {
+  if(internal::random<bool>())
+    x = solver.derived().solve(b) + Result::Zero(x.rows(), x.cols());
+  else
+    x = solver.derived().solve(b);
+}
+
+template<typename Solver, typename Rhs, typename Guess,typename Result>
+void solve_with_guess(SparseSolverBase<Solver>& solver, const SparseMatrixBase<Rhs>& b, const Guess& , Result& x) {
+  x = solver.derived().solve(b);
+}
 
 template<typename Solver, typename Rhs, typename DenseMat, typename DenseRhs>
 void check_sparse_solving(Solver& solver, const typename Solver::MatrixType& A, const Rhs& b, const DenseMat& dA, const DenseRhs& db)
 {
   typedef typename Solver::MatrixType Mat;
   typedef typename Mat::Scalar Scalar;
+  typedef typename Mat::StorageIndex StorageIndex;
 
-  DenseRhs refX = dA.lu().solve(db);
+  DenseRhs refX = dA.householderQr().solve(db);
   {
-    Rhs x(b.rows(), b.cols());
+    Rhs x(A.cols(), b.cols());
     Rhs oldb = b;
 
     solver.compute(A);
     if (solver.info() != Success)
     {
-      std::cerr << "sparse solver testing: factorization failed (check_sparse_solving)\n";
-      exit(0);
-      return;
+      std::cerr << "ERROR | sparse solver testing, factorization failed (" << typeid(Solver).name() << ")\n";
+      VERIFY(solver.info() == Success);
     }
     x = solver.solve(b);
     if (solver.info() != Success)
     {
-      std::cerr << "sparse solver testing: solving failed\n";
+      std::cerr << "WARNING | sparse solver testing: solving failed (" << typeid(Solver).name() << ")\n";
       return;
     }
     VERIFY(oldb.isApprox(b) && "sparse solver testing: the rhs should not be modified!");
+    VERIFY(x.isApprox(refX,test_precision<Scalar>()));
 
+    x.setZero();
+    solve_with_guess(solver, b, x, x);
+    VERIFY(solver.info() == Success && "solving failed when using analyzePattern/factorize API");
+    VERIFY(oldb.isApprox(b) && "sparse solver testing: the rhs should not be modified!");
     VERIFY(x.isApprox(refX,test_precision<Scalar>()));
+    
     x.setZero();
     // test the analyze/factorize API
     solver.analyzePattern(A);
     solver.factorize(A);
-    if (solver.info() != Success)
-    {
-      std::cerr << "sparse solver testing: factorization failed (check_sparse_solving)\n";
-      exit(0);
-      return;
-    }
+    VERIFY(solver.info() == Success && "factorization failed when using analyzePattern/factorize API");
     x = solver.solve(b);
-    if (solver.info() != Success)
-    {
-      std::cerr << "sparse solver testing: solving failed\n";
-      return;
-    }
+    VERIFY(solver.info() == Success && "solving failed when using analyzePattern/factorize API");
     VERIFY(oldb.isApprox(b) && "sparse solver testing: the rhs should not be modified!");
-
     VERIFY(x.isApprox(refX,test_precision<Scalar>()));
+    
+    x.setZero();
+    // test with Map
+    MappedSparseMatrix<Scalar,Mat::Options,StorageIndex> Am(A.rows(), A.cols(), A.nonZeros(), const_cast<StorageIndex*>(A.outerIndexPtr()), const_cast<StorageIndex*>(A.innerIndexPtr()), const_cast<Scalar*>(A.valuePtr()));
+    solver.compute(Am);
+    VERIFY(solver.info() == Success && "factorization failed when using Map");
+    DenseRhs dx(refX);
+    dx.setZero();
+    Map<DenseRhs> xm(dx.data(), dx.rows(), dx.cols());
+    Map<const DenseRhs> bm(db.data(), db.rows(), db.cols());
+    xm = solver.solve(bm);
+    VERIFY(solver.info() == Success && "solving failed when using Map");
+    VERIFY(oldb.isApprox(bm) && "sparse solver testing: the rhs should not be modified!");
+    VERIFY(xm.isApprox(refX,test_precision<Scalar>()));
   }
   
-  // test dense Block as the result and rhs:
+  // if not too large, do some extra check:
+  if(A.rows()<2000)
   {
-    DenseRhs x(db.rows(), db.cols());
-    DenseRhs oldb(db);
-    x.setZero();
-    x.block(0,0,x.rows(),x.cols()) = solver.solve(db.block(0,0,db.rows(),db.cols()));
-    VERIFY(oldb.isApprox(db) && "sparse solver testing: the rhs should not be modified!");
-    VERIFY(x.isApprox(refX,test_precision<Scalar>()));
+    // test initialization ctor
+    {
+      Rhs x(b.rows(), b.cols());
+      Solver solver2(A);
+      VERIFY(solver2.info() == Success);
+      x = solver2.solve(b);
+      VERIFY(x.isApprox(refX,test_precision<Scalar>()));
+    }
+
+    // test dense Block as the result and rhs:
+    {
+      DenseRhs x(refX.rows(), refX.cols());
+      DenseRhs oldb(db);
+      x.setZero();
+      x.block(0,0,x.rows(),x.cols()) = solver.solve(db.block(0,0,db.rows(),db.cols()));
+      VERIFY(oldb.isApprox(db) && "sparse solver testing: the rhs should not be modified!");
+      VERIFY(x.isApprox(refX,test_precision<Scalar>()));
+    }
+
+    // test uncompressed inputs
+    {
+      Mat A2 = A;
+      A2.reserve((ArrayXf::Random(A.outerSize())+2).template cast<typename Mat::StorageIndex>().eval());
+      solver.compute(A2);
+      Rhs x = solver.solve(b);
+      VERIFY(x.isApprox(refX,test_precision<Scalar>()));
+    }
+
+    // test expression as input
+    {
+      solver.compute(0.5*(A+A));
+      Rhs x = solver.solve(b);
+      VERIFY(x.isApprox(refX,test_precision<Scalar>()));
+
+      Solver solver2(0.5*(A+A));
+      Rhs x2 = solver2.solve(b);
+      VERIFY(x2.isApprox(refX,test_precision<Scalar>()));
+    }
   }
 }
 
 template<typename Solver, typename Rhs>
-void check_sparse_solving_real_cases(Solver& solver, const typename Solver::MatrixType& A, const Rhs& b, const Rhs& refX)
+void check_sparse_solving_real_cases(Solver& solver, const typename Solver::MatrixType& A, const Rhs& b, const typename Solver::MatrixType& fullA, const Rhs& refX)
 {
   typedef typename Solver::MatrixType Mat;
   typedef typename Mat::Scalar Scalar;
   typedef typename Mat::RealScalar RealScalar;
   
-  Rhs x(b.rows(), b.cols());
-  
+  Rhs x(A.cols(), b.cols());
+
   solver.compute(A);
   if (solver.info() != Success)
   {
-    std::cerr << "sparse solver testing: factorization failed (check_sparse_solving_real_cases)\n";
-    exit(0);
-    return;
+    std::cerr << "ERROR | sparse solver testing, factorization failed (" << typeid(Solver).name() << ")\n";
+    VERIFY(solver.info() == Success);
   }
   x = solver.solve(b);
+  
   if (solver.info() != Success)
   {
-    std::cerr << "sparse solver testing: solving failed\n";
+    std::cerr << "WARNING | sparse solver testing, solving failed (" << typeid(Solver).name() << ")\n";
     return;
   }
   
-  RealScalar res_error;
-  // Compute the norm of the relative error
-  if(refX.size() != 0)
-    res_error = (refX - x).norm()/refX.norm();
-  else
-  { 
-    // Compute the relative residual norm
-    res_error = (b - A * x).norm()/b.norm();
-  }
-  if (res_error > test_precision<Scalar>() ){
-    std::cerr << "Test " << g_test_stack.back() << " failed in "EI_PP_MAKE_STRING(__FILE__) 
-    << " (" << EI_PP_MAKE_STRING(__LINE__) << ")" << std::endl << std::endl;
-    abort();
+  RealScalar res_error = (fullA*x-b).norm()/b.norm();  
+  VERIFY( (res_error <= test_precision<Scalar>() ) && "sparse solver failed without noticing it"); 
+
+  
+  if(refX.size() != 0 && (refX - x).norm()/refX.norm() > test_precision<Scalar>())
+  {
+    std::cerr << "WARNING | found solution is different from the provided reference one\n";
   }
   
 }
@@ -117,7 +182,7 @@ void check_sparse_determinant(Solver& solver, const typename Solver::MatrixType&
   solver.compute(A);
   if (solver.info() != Success)
   {
-    std::cerr << "sparse solver testing: factorization failed (check_sparse_determinant)\n";
+    std::cerr << "WARNING | sparse solver testing: factorization failed (check_sparse_determinant)\n";
     return;
   }
 
@@ -134,7 +199,7 @@ void check_sparse_abs_determinant(Solver& solver, const typename Solver::MatrixT
   solver.compute(A);
   if (solver.info() != Success)
   {
-    std::cerr << "sparse solver testing: factorization failed (check_sparse_abs_determinant)\n";
+    std::cerr << "WARNING | sparse solver testing: factorization failed (check_sparse_abs_determinant)\n";
     return;
   }
 
@@ -181,13 +246,33 @@ inline std::string get_matrixfolder()
     mat_folder = mat_folder + static_cast<std::string>("/real/");
   return mat_folder;
 }
+std::string sym_to_string(int sym)
+{
+  if(sym==Symmetric) return "Symmetric ";
+  if(sym==SPD)       return "SPD ";
+  return "";
+}
+template<typename Derived>
+std::string solver_stats(const IterativeSolverBase<Derived> &solver)
+{
+  std::stringstream ss;
+  ss << solver.iterations() << " iters, error: " << solver.error();
+  return ss.str();
+}
+template<typename Derived>
+std::string solver_stats(const SparseSolverBase<Derived> &/*solver*/)
+{
+  return "";
+}
 #endif
 
-template<typename Solver> void check_sparse_spd_solving(Solver& solver)
+template<typename Solver> void check_sparse_spd_solving(Solver& solver, int maxSize = 300, int maxRealWorldSize = 100000)
 {
   typedef typename Solver::MatrixType Mat;
   typedef typename Mat::Scalar Scalar;
-  typedef SparseMatrix<Scalar,ColMajor> SpMat;
+  typedef typename Mat::StorageIndex StorageIndex;
+  typedef SparseMatrix<Scalar,ColMajor, StorageIndex> SpMat;
+  typedef SparseVector<Scalar, 0, StorageIndex> SpVec;
   typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
   typedef Matrix<Scalar,Dynamic,1> DenseVector;
 
@@ -195,7 +280,7 @@ template<typename Solver> void check_sparse_spd_solving(Solver& solver)
   Mat A, halfA;
   DenseMatrix dA;
   for (int i = 0; i < g_repeat; i++) {
-    int size = generate_sparse_spd_problem(solver, A, halfA, dA);
+    int size = generate_sparse_spd_problem(solver, A, halfA, dA, maxSize);
 
     // generate the right hand sides
     int rhsCols = internal::random<int>(1,16);
@@ -204,13 +289,17 @@ template<typename Solver> void check_sparse_spd_solving(Solver& solver)
     DenseVector b = DenseVector::Random(size);
     DenseMatrix dB(size,rhsCols);
     initSparse<Scalar>(density, dB, B, ForceNonZeroDiag);
+    SpVec c = B.col(0);
+    DenseVector dc = dB.col(0);
   
-    check_sparse_solving(solver, A,     b,  dA, b);
-    check_sparse_solving(solver, halfA, b,  dA, b);
-    check_sparse_solving(solver, A,     dB, dA, dB);
-    check_sparse_solving(solver, halfA, dB, dA, dB);
-    check_sparse_solving(solver, A,     B,  dA, dB);
-    check_sparse_solving(solver, halfA, B,  dA, dB);
+    CALL_SUBTEST( check_sparse_solving(solver, A,     b,  dA, b)  );
+    CALL_SUBTEST( check_sparse_solving(solver, halfA, b,  dA, b)  );
+    CALL_SUBTEST( check_sparse_solving(solver, A,     dB, dA, dB) );
+    CALL_SUBTEST( check_sparse_solving(solver, halfA, dB, dA, dB) );
+    CALL_SUBTEST( check_sparse_solving(solver, A,     B,  dA, dB) );
+    CALL_SUBTEST( check_sparse_solving(solver, halfA, B,  dA, dB) );
+    CALL_SUBTEST( check_sparse_solving(solver, A,     c,  dA, dc) );
+    CALL_SUBTEST( check_sparse_solving(solver, halfA, c,  dA, dc) );
     
     // check only once
     if(i==0)
@@ -221,25 +310,44 @@ template<typename Solver> void check_sparse_spd_solving(Solver& solver)
   }
   
   // First, get the folder 
-#ifdef TEST_REAL_CASES  
-  if (internal::is_same<Scalar, float>::value 
-      || internal::is_same<Scalar, std::complex<float> >::value)
-    return ;
-  
-  std::string mat_folder = get_matrixfolder<Scalar>();
-  MatrixMarketIterator<Scalar> it(mat_folder);
-  for (; it; ++it)
+#ifdef TEST_REAL_CASES
+  // Test real problems with double precision only
+  if (internal::is_same<typename NumTraits<Scalar>::Real, double>::value)
   {
-    if (it.sym() == SPD){
-      Mat halfA;
-      PermutationMatrix<Dynamic, Dynamic, Index> pnull;
-      halfA.template selfadjointView<Solver::UpLo>() = it.matrix().template triangularView<Eigen::Lower>().twistedBy(pnull);
-      
-      std::cout<< " ==== SOLVING WITH MATRIX " << it.matname() << " ==== \n";
-      check_sparse_solving_real_cases(solver, it.matrix(), it.rhs(), it.refX());
-      check_sparse_solving_real_cases(solver, halfA, it.rhs(), it.refX());
+    std::string mat_folder = get_matrixfolder<Scalar>();
+    MatrixMarketIterator<Scalar> it(mat_folder);
+    for (; it; ++it)
+    {
+      if (it.sym() == SPD){
+        A = it.matrix();
+        if(A.diagonal().size() <= maxRealWorldSize)
+        {
+          DenseVector b = it.rhs();
+          DenseVector refX = it.refX();
+          PermutationMatrix<Dynamic, Dynamic, StorageIndex> pnull;
+          halfA.resize(A.rows(), A.cols());
+          if(Solver::UpLo == (Lower|Upper))
+            halfA = A;
+          else
+            halfA.template selfadjointView<Solver::UpLo>() = A.template triangularView<Eigen::Lower>().twistedBy(pnull);
+          
+          std::cout << "INFO | Testing " << sym_to_string(it.sym()) << "sparse problem " << it.matname()
+                  << " (" << A.rows() << "x" << A.cols() << ") using " << typeid(Solver).name() << "..." << std::endl;
+          CALL_SUBTEST( check_sparse_solving_real_cases(solver, A,     b, A, refX) );
+          std::string stats = solver_stats(solver);
+          if(stats.size()>0)
+            std::cout << "INFO |  " << stats << std::endl;
+          CALL_SUBTEST( check_sparse_solving_real_cases(solver, halfA, b, A, refX) );
+        }
+        else
+        {
+          std::cout << "INFO | Skip sparse problem \"" << it.matname() << "\" (too large)" << std::endl;
+        }
+      }
     }
   }
+#else
+  EIGEN_UNUSED_VARIABLE(maxRealWorldSize);
 #endif
 }
 
@@ -261,27 +369,39 @@ template<typename Solver> void check_sparse_spd_determinant(Solver& solver)
 }
 
 template<typename Solver, typename DenseMat>
-int generate_sparse_square_problem(Solver&, typename Solver::MatrixType& A, DenseMat& dA, int maxSize = 300)
+Index generate_sparse_square_problem(Solver&, typename Solver::MatrixType& A, DenseMat& dA, int maxSize = 300, int options = ForceNonZeroDiag)
 {
   typedef typename Solver::MatrixType Mat;
   typedef typename Mat::Scalar Scalar;
 
-  int size = internal::random<int>(1,maxSize);
+  Index size = internal::random<int>(1,maxSize);
   double density = (std::max)(8./(size*size), 0.01);
   
   A.resize(size,size);
   dA.resize(size,size);
 
-  initSparse<Scalar>(density, dA, A, ForceNonZeroDiag);
+  initSparse<Scalar>(density, dA, A, options);
   
   return size;
 }
 
-template<typename Solver> void check_sparse_square_solving(Solver& solver)
+
+struct prune_column {
+  Index m_col;
+  prune_column(Index col) : m_col(col) {}
+  template<class Scalar>
+  bool operator()(Index, Index col, const Scalar&) const {
+    return col != m_col;
+  }
+};
+
+
+template<typename Solver> void check_sparse_square_solving(Solver& solver, int maxSize = 300, int maxRealWorldSize = 100000, bool checkDeficient = false)
 {
   typedef typename Solver::MatrixType Mat;
   typedef typename Mat::Scalar Scalar;
-  typedef SparseMatrix<Scalar,ColMajor> SpMat;
+  typedef SparseMatrix<Scalar,ColMajor, typename Mat::StorageIndex> SpMat;
+  typedef SparseVector<Scalar, 0, typename Mat::StorageIndex> SpVec;
   typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
   typedef Matrix<Scalar,Dynamic,1> DenseVector;
 
@@ -290,7 +410,7 @@ template<typename Solver> void check_sparse_square_solving(Solver& solver)
   Mat A;
   DenseMatrix dA;
   for (int i = 0; i < g_repeat; i++) {
-    int size = generate_sparse_square_problem(solver, A, dA);
+    Index size = generate_sparse_square_problem(solver, A, dA, maxSize);
 
     A.makeCompressed();
     DenseVector b = DenseVector::Random(size);
@@ -299,9 +419,12 @@ template<typename Solver> void check_sparse_square_solving(Solver& solver)
     double density = (std::max)(8./(size*rhsCols), 0.1);
     initSparse<Scalar>(density, dB, B, ForceNonZeroDiag);
     B.makeCompressed();
-    check_sparse_solving(solver, A, b,  dA, b);
-    check_sparse_solving(solver, A, dB, dA, dB);
-    check_sparse_solving(solver, A, B,  dA, dB);
+    SpVec c = B.col(0);
+    DenseVector dc = dB.col(0);
+    CALL_SUBTEST(check_sparse_solving(solver, A, b,  dA, b));
+    CALL_SUBTEST(check_sparse_solving(solver, A, dB, dA, dB));
+    CALL_SUBTEST(check_sparse_solving(solver, A, B,  dA, dB));
+    CALL_SUBTEST(check_sparse_solving(solver, A, c,  dA, dc));
     
     // check only once
     if(i==0)
@@ -309,21 +432,44 @@ template<typename Solver> void check_sparse_square_solving(Solver& solver)
       b = DenseVector::Zero(size);
       check_sparse_solving(solver, A, b, dA, b);
     }
+    // regression test for Bug 792 (structurally rank deficient matrices):
+    if(checkDeficient && size>1) {
+      Index col = internal::random<int>(0,int(size-1));
+      A.prune(prune_column(col));
+      solver.compute(A);
+      VERIFY_IS_EQUAL(solver.info(), NumericalIssue);
+    }
   }
   
   // First, get the folder 
 #ifdef TEST_REAL_CASES
-  if (internal::is_same<Scalar, float>::value 
-      || internal::is_same<Scalar, std::complex<float> >::value)
-    return ;
-  
-  std::string mat_folder = get_matrixfolder<Scalar>();
-  MatrixMarketIterator<Scalar> it(mat_folder);
-  for (; it; ++it)
+  // Test real problems with double precision only
+  if (internal::is_same<typename NumTraits<Scalar>::Real, double>::value)
   {
-    std::cout<< " ==== SOLVING WITH MATRIX " << it.matname() << " ==== \n";
-    check_sparse_solving_real_cases(solver, it.matrix(), it.rhs(), it.refX());
+    std::string mat_folder = get_matrixfolder<Scalar>();
+    MatrixMarketIterator<Scalar> it(mat_folder);
+    for (; it; ++it)
+    {
+      A = it.matrix();
+      if(A.diagonal().size() <= maxRealWorldSize)
+      {
+        DenseVector b = it.rhs();
+        DenseVector refX = it.refX();
+        std::cout << "INFO | Testing " << sym_to_string(it.sym()) << "sparse problem " << it.matname()
+                  << " (" << A.rows() << "x" << A.cols() << ") using " << typeid(Solver).name() << "..." << std::endl;
+        CALL_SUBTEST(check_sparse_solving_real_cases(solver, A, b, A, refX));
+        std::string stats = solver_stats(solver);
+        if(stats.size()>0)
+          std::cout << "INFO |  " << stats << std::endl;
+      }
+      else
+      {
+        std::cout << "INFO | SKIP sparse problem \"" << it.matname() << "\" (too large)" << std::endl;
+      }
+    }
   }
+#else
+  EIGEN_UNUSED_VARIABLE(maxRealWorldSize);
 #endif
 
 }
@@ -333,13 +479,20 @@ template<typename Solver> void check_sparse_square_determinant(Solver& solver)
   typedef typename Solver::MatrixType Mat;
   typedef typename Mat::Scalar Scalar;
   typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
-
-  // generate the problem
-  Mat A;
-  DenseMatrix dA;
-  generate_sparse_square_problem(solver, A, dA, 30);
-  A.makeCompressed();
+  
   for (int i = 0; i < g_repeat; i++) {
+    // generate the problem
+    Mat A;
+    DenseMatrix dA;
+    
+    int size = internal::random<int>(1,30);
+    dA.setRandom(size,size);
+    
+    dA = (dA.array().abs()<0.3).select(0,dA);
+    dA.diagonal() = (dA.diagonal().array()==0).select(1,dA.diagonal());
+    A = dA.sparseView();
+    A.makeCompressed();
+  
     check_sparse_determinant(solver, A, dA);
   }
 }
@@ -350,13 +503,63 @@ template<typename Solver> void check_sparse_square_abs_determinant(Solver& solve
   typedef typename Mat::Scalar Scalar;
   typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
 
-  // generate the problem
-  Mat A;
-  DenseMatrix dA;
-  generate_sparse_square_problem(solver, A, dA, 30);
-  A.makeCompressed();
   for (int i = 0; i < g_repeat; i++) {
+    // generate the problem
+    Mat A;
+    DenseMatrix dA;
+    generate_sparse_square_problem(solver, A, dA, 30);
+    A.makeCompressed();
     check_sparse_abs_determinant(solver, A, dA);
   }
 }
 
+template<typename Solver, typename DenseMat>
+void generate_sparse_leastsquare_problem(Solver&, typename Solver::MatrixType& A, DenseMat& dA, int maxSize = 300, int options = ForceNonZeroDiag)
+{
+  typedef typename Solver::MatrixType Mat;
+  typedef typename Mat::Scalar Scalar;
+
+  int rows = internal::random<int>(1,maxSize);
+  int cols = internal::random<int>(1,rows);
+  double density = (std::max)(8./(rows*cols), 0.01);
+  
+  A.resize(rows,cols);
+  dA.resize(rows,cols);
+
+  initSparse<Scalar>(density, dA, A, options);
+}
+
+template<typename Solver> void check_sparse_leastsquare_solving(Solver& solver)
+{
+  typedef typename Solver::MatrixType Mat;
+  typedef typename Mat::Scalar Scalar;
+  typedef SparseMatrix<Scalar,ColMajor, typename Mat::StorageIndex> SpMat;
+  typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
+  typedef Matrix<Scalar,Dynamic,1> DenseVector;
+
+  int rhsCols = internal::random<int>(1,16);
+
+  Mat A;
+  DenseMatrix dA;
+  for (int i = 0; i < g_repeat; i++) {
+    generate_sparse_leastsquare_problem(solver, A, dA);
+
+    A.makeCompressed();
+    DenseVector b = DenseVector::Random(A.rows());
+    DenseMatrix dB(A.rows(),rhsCols);
+    SpMat B(A.rows(),rhsCols);
+    double density = (std::max)(8./(A.rows()*rhsCols), 0.1);
+    initSparse<Scalar>(density, dB, B, ForceNonZeroDiag);
+    B.makeCompressed();
+    check_sparse_solving(solver, A, b,  dA, b);
+    check_sparse_solving(solver, A, dB, dA, dB);
+    check_sparse_solving(solver, A, B,  dA, dB);
+    
+    // check only once
+    if(i==0)
+    {
+      b = DenseVector::Zero(A.rows());
+      check_sparse_solving(solver, A, b, dA, b);
+    }
+  }
+}
diff --git a/test/sparse_vector.cpp b/test/sparse_vector.cpp
index 0c9476803..b3e1dda25 100644
--- a/test/sparse_vector.cpp
+++ b/test/sparse_vector.cpp
@@ -9,22 +9,22 @@
 
 #include "sparse.h"
 
-template<typename Scalar,typename Index> void sparse_vector(int rows, int cols)
+template<typename Scalar,typename StorageIndex> void sparse_vector(int rows, int cols)
 {
   double densityMat = (std::max)(8./(rows*cols), 0.01);
-  double densityVec = (std::max)(8./float(rows), 0.1);
+  double densityVec = (std::max)(8./(rows), 0.1);
   typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
   typedef Matrix<Scalar,Dynamic,1> DenseVector;
-  typedef SparseVector<Scalar,0,Index> SparseVectorType;
-  typedef SparseMatrix<Scalar,0,Index> SparseMatrixType;
+  typedef SparseVector<Scalar,0,StorageIndex> SparseVectorType;
+  typedef SparseMatrix<Scalar,0,StorageIndex> SparseMatrixType;
   Scalar eps = 1e-6;
 
   SparseMatrixType m1(rows,rows);
   SparseVectorType v1(rows), v2(rows), v3(rows);
   DenseMatrix refM1 = DenseMatrix::Zero(rows, rows);
   DenseVector refV1 = DenseVector::Random(rows),
-    refV2 = DenseVector::Random(rows),
-    refV3 = DenseVector::Random(rows);
+              refV2 = DenseVector::Random(rows),
+              refV3 = DenseVector::Random(rows);
 
   std::vector<int> zerocoords, nonzerocoords;
   initSparse<Scalar>(densityVec, refV1, v1, &zerocoords, &nonzerocoords);
@@ -52,6 +52,20 @@ template<typename Scalar,typename Index> void sparse_vector(int rows, int cols)
     }
   }
   VERIFY_IS_APPROX(v1, refV1);
+  
+  // test coeffRef with reallocation
+  {
+    SparseVectorType v4(rows);
+    DenseVector v5 = DenseVector::Zero(rows);
+    for(int k=0; k<rows; ++k)
+    {
+      int i = internal::random<int>(0,rows-1);
+      Scalar v = internal::random<Scalar>();
+      v4.coeffRef(i) += v;
+      v5.coeffRef(i) += v;
+    }
+    VERIFY_IS_APPROX(v4,v5);
+  }
 
   v1.coeffRef(nonzerocoords[0]) = Scalar(5);
   refV1.coeffRef(nonzerocoords[0]) = Scalar(5);
@@ -71,9 +85,12 @@ template<typename Scalar,typename Index> void sparse_vector(int rows, int cols)
   VERIFY_IS_APPROX(v1.dot(v2), refV1.dot(refV2));
   VERIFY_IS_APPROX(v1.dot(refV2), refV1.dot(refV2));
 
+  VERIFY_IS_APPROX(m1*v2, refM1*refV2);
   VERIFY_IS_APPROX(v1.dot(m1*v2), refV1.dot(refM1*refV2));
-  int i = internal::random<int>(0,rows-1);
-  VERIFY_IS_APPROX(v1.dot(m1.col(i)), refV1.dot(refM1.col(i)));
+  {
+    int i = internal::random<int>(0,rows-1);
+    VERIFY_IS_APPROX(v1.dot(m1.col(i)), refV1.dot(refM1.col(i)));
+  }
 
 
   VERIFY_IS_APPROX(v1.squaredNorm(), refV1.squaredNorm());
@@ -96,15 +113,51 @@ template<typename Scalar,typename Index> void sparse_vector(int rows, int cols)
   VERIFY_IS_APPROX(refV3 = v1.transpose(),v1.toDense()); 
   VERIFY_IS_APPROX(DenseVector(v1),v1.toDense()); 
 
+  // test conservative resize
+  {
+    std::vector<StorageIndex> inc;
+    if(rows > 3)
+      inc.push_back(-3);
+    inc.push_back(0);
+    inc.push_back(3);
+    inc.push_back(1);
+    inc.push_back(10);
+
+    for(std::size_t i = 0; i< inc.size(); i++) {
+      StorageIndex incRows = inc[i];
+      SparseVectorType vec1(rows);
+      DenseVector refVec1 = DenseVector::Zero(rows);
+      initSparse<Scalar>(densityVec, refVec1, vec1);
+
+      vec1.conservativeResize(rows+incRows);
+      refVec1.conservativeResize(rows+incRows);
+      if (incRows > 0) refVec1.tail(incRows).setZero();
+
+      VERIFY_IS_APPROX(vec1, refVec1);
+
+      // Insert new values
+      if (incRows > 0)
+        vec1.insert(vec1.rows()-1) = refVec1(refVec1.rows()-1) = 1;
+
+      VERIFY_IS_APPROX(vec1, refVec1);
+    }
+  }
+
 }
 
 void test_sparse_vector()
 {
   for(int i = 0; i < g_repeat; i++) {
+    int r = Eigen::internal::random<int>(1,500), c = Eigen::internal::random<int>(1,500);
+    if(Eigen::internal::random<int>(0,4) == 0) {
+      r = c; // check square matrices in 25% of tries
+    }
+    EIGEN_UNUSED_VARIABLE(r+c);
+
     CALL_SUBTEST_1(( sparse_vector<double,int>(8, 8) ));
-    CALL_SUBTEST_2(( sparse_vector<std::complex<double>, int>(16, 16) ));
-    CALL_SUBTEST_1(( sparse_vector<double,long int>(299, 535) ));
-    CALL_SUBTEST_1(( sparse_vector<double,short>(299, 535) ));
+    CALL_SUBTEST_2(( sparse_vector<std::complex<double>, int>(r, c) ));
+    CALL_SUBTEST_1(( sparse_vector<double,long int>(r, c) ));
+    CALL_SUBTEST_1(( sparse_vector<double,short>(r, c) ));
   }
 }
 
diff --git a/test/sparselu.cpp b/test/sparselu.cpp
index 37eb069a9..bd000baf1 100644
--- a/test/sparselu.cpp
+++ b/test/sparselu.cpp
@@ -3,25 +3,9 @@
 //
 // Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam@inria.fr>
 //
-// Eigen is free software; you can redistribute it and/or
-// modify it under the terms of the GNU Lesser General Public
-// License as published by the Free Software Foundation; either
-// version 3 of the License, or (at your option) any later version.
-//
-// Alternatively, you can redistribute it and/or
-// modify it under the terms of the GNU General Public License as
-// published by the Free Software Foundation; either version 2 of
-// the License, or (at your option) any later version.
-//
-// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
-// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU Lesser General Public
-// License and a copy of the GNU General Public License along with
-// Eigen. If not, see <http://www.gnu.org/licenses/>.
-
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 // SparseLU solve does not accept column major matrices for the destination.
 // However, as expected, the generic check_sparse_square_solving routines produces row-major
@@ -41,9 +25,9 @@ template<typename T> void test_sparselu_T()
   SparseLU<SparseMatrix<T, ColMajor>, AMDOrdering<int> > sparselu_amd; 
   SparseLU<SparseMatrix<T, ColMajor, long int>, NaturalOrdering<long int> > sparselu_natural;
   
-  check_sparse_square_solving(sparselu_colamd); 
-  check_sparse_square_solving(sparselu_amd);
-  check_sparse_square_solving(sparselu_natural);
+  check_sparse_square_solving(sparselu_colamd,  300, 100000, true); 
+  check_sparse_square_solving(sparselu_amd,     300,  10000, true);
+  check_sparse_square_solving(sparselu_natural, 300,   2000, true);
   
   check_sparse_square_abs_determinant(sparselu_colamd);
   check_sparse_square_abs_determinant(sparselu_amd);
diff --git a/test/sparseqr.cpp b/test/sparseqr.cpp
index 451c0e7f8..e8605fd21 100644
--- a/test/sparseqr.cpp
+++ b/test/sparseqr.cpp
@@ -10,11 +10,12 @@
 #include <Eigen/SparseQR>
 
 template<typename MatrixType,typename DenseMat>
-int generate_sparse_rectangular_problem(MatrixType& A, DenseMat& dA, int maxRows = 300)
+int generate_sparse_rectangular_problem(MatrixType& A, DenseMat& dA, int maxRows = 300, int maxCols = 150)
 {
+  eigen_assert(maxRows >= maxCols);
   typedef typename MatrixType::Scalar Scalar;
   int rows = internal::random<int>(1,maxRows);
-  int cols = internal::random<int>(1,rows);
+  int cols = internal::random<int>(1,maxCols);
   double density = (std::max)(8./(rows*cols), 0.01);
   
   A.resize(rows,cols);
@@ -53,7 +54,7 @@ template<typename Scalar> void test_sparseqr_scalar()
   
   b = dA * DenseVector::Random(A.cols());
   solver.compute(A);
-  if(internal::random<float>(0,1)>0.5)
+  if(internal::random<float>(0,1)>0.5f)
     solver.factorize(A);  // this checks that calling analyzePattern is not needed if the pattern do not change.
   if (solver.info() != Success)
   {
@@ -88,6 +89,11 @@ template<typename Scalar> void test_sparseqr_scalar()
   QtQ = Q * Q.adjoint();
   idM.resize(Q.rows(), Q.rows()); idM.setIdentity();
   VERIFY(idM.isApprox(QtQ));
+  
+  // Q to dense
+  DenseMat dQ;
+  dQ = solver.matrixQ();
+  VERIFY_IS_APPROX(Q, dQ);
 }
 void test_sparseqr()
 {
diff --git a/test/spqr_support.cpp b/test/spqr_support.cpp
index b8980e081..81e63b6a5 100644
--- a/test/spqr_support.cpp
+++ b/test/spqr_support.cpp
@@ -5,6 +5,8 @@
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
+
+#define EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS
 #include "sparse.h"
 #include <Eigen/SPQRSupport>
 
@@ -18,8 +20,8 @@ int generate_sparse_rectangular_problem(MatrixType& A, DenseMat& dA, int maxRows
   int cols = internal::random<int>(1,rows);
   double density = (std::max)(8./(rows*cols), 0.01);
   
-  A.resize(rows,rows);
-  dA.resize(rows,rows);
+  A.resize(rows,cols);
+  dA.resize(rows,cols);
   initSparse<Scalar>(density, dA, A,ForceNonZeroDiag);
   A.makeCompressed();
   return rows;
@@ -35,7 +37,7 @@ template<typename Scalar> void test_spqr_scalar()
   SPQR<MatrixType> solver; 
   generate_sparse_rectangular_problem(A,dA);
   
-  int m = A.rows();
+  Index m = A.rows();
   b = DenseVector::Random(m);
   solver.compute(A);
   if (solver.info() != Success)
diff --git a/test/stable_norm.cpp b/test/stable_norm.cpp
index 231dd9189..c3eb5ff31 100644
--- a/test/stable_norm.cpp
+++ b/test/stable_norm.cpp
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2009-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -9,14 +9,6 @@
 
 #include "main.h"
 
-// workaround aggressive optimization in ICC
-template<typename T> EIGEN_DONT_INLINE  T sub(T a, T b) { return a - b; }
-
-template<typename T> bool isFinite(const T& x)
-{
-  return isNotNaN(sub(x,x));
-}
-
 template<typename T> EIGEN_DONT_INLINE T copy(const T& x)
 {
   return x;
@@ -32,6 +24,8 @@ template<typename MatrixType> void stable_norm(const MatrixType& m)
   typedef typename MatrixType::Index Index;
   typedef typename MatrixType::Scalar Scalar;
   typedef typename NumTraits<Scalar>::Real RealScalar;
+  
+  bool complex_real_product_ok = true;
 
   // Check the basic machine-dependent constants.
   {
@@ -44,6 +38,16 @@ template<typename MatrixType> void stable_norm(const MatrixType& m)
 
     VERIFY( (!(iemin > 1 - 2*it || 1+it>iemax || (it==2 && ibeta<5) || (it<=4 && ibeta <= 3 ) || it<2))
            && "the stable norm algorithm cannot be guaranteed on this computer");
+    
+    Scalar inf = std::numeric_limits<RealScalar>::infinity();
+    if(NumTraits<Scalar>::IsComplex && (numext::isnan)(inf*RealScalar(1)) )
+    {
+      complex_real_product_ok = false;
+      static bool first = true;
+      if(first)
+        std::cerr << "WARNING: compiler mess up complex*real product, " << inf << " * " << 1.0 << " = " << inf*RealScalar(1) << std::endl;
+      first = false;
+    }
   }
 
 
@@ -76,19 +80,19 @@ template<typename MatrixType> void stable_norm(const MatrixType& m)
 
   RealScalar size = static_cast<RealScalar>(m.size());
 
-  // test isFinite
-  VERIFY(!isFinite( std::numeric_limits<RealScalar>::infinity()));
-  VERIFY(!isFinite(sqrt(-abs(big))));
+  // test numext::isfinite
+  VERIFY(!(numext::isfinite)( std::numeric_limits<RealScalar>::infinity()));
+  VERIFY(!(numext::isfinite)(sqrt(-abs(big))));
 
   // test overflow
-  VERIFY(isFinite(sqrt(size)*abs(big)));
+  VERIFY((numext::isfinite)(sqrt(size)*abs(big)));
   VERIFY_IS_NOT_APPROX(sqrt(copy(vbig.squaredNorm())), abs(sqrt(size)*big)); // here the default norm must fail
   VERIFY_IS_APPROX(vbig.stableNorm(), sqrt(size)*abs(big));
   VERIFY_IS_APPROX(vbig.blueNorm(),   sqrt(size)*abs(big));
   VERIFY_IS_APPROX(vbig.hypotNorm(),  sqrt(size)*abs(big));
 
   // test underflow
-  VERIFY(isFinite(sqrt(size)*abs(small)));
+  VERIFY((numext::isfinite)(sqrt(size)*abs(small)));
   VERIFY_IS_NOT_APPROX(sqrt(copy(vsmall.squaredNorm())),   abs(sqrt(size)*small)); // here the default norm must fail
   VERIFY_IS_APPROX(vsmall.stableNorm(), sqrt(size)*abs(small));
   VERIFY_IS_APPROX(vsmall.blueNorm(),   sqrt(size)*abs(small));
@@ -101,6 +105,79 @@ template<typename MatrixType> void stable_norm(const MatrixType& m)
   VERIFY_IS_APPROX(vrand.rowwise().stableNorm(),      vrand.rowwise().norm());
   VERIFY_IS_APPROX(vrand.rowwise().blueNorm(),        vrand.rowwise().norm());
   VERIFY_IS_APPROX(vrand.rowwise().hypotNorm(),       vrand.rowwise().norm());
+  
+  // test NaN, +inf, -inf 
+  MatrixType v;
+  Index i = internal::random<Index>(0,rows-1);
+  Index j = internal::random<Index>(0,cols-1);
+
+  // NaN
+  {
+    v = vrand;
+    v(i,j) = std::numeric_limits<RealScalar>::quiet_NaN();
+    VERIFY(!(numext::isfinite)(v.squaredNorm()));   VERIFY((numext::isnan)(v.squaredNorm()));
+    VERIFY(!(numext::isfinite)(v.norm()));          VERIFY((numext::isnan)(v.norm()));
+    VERIFY(!(numext::isfinite)(v.stableNorm()));    VERIFY((numext::isnan)(v.stableNorm()));
+    VERIFY(!(numext::isfinite)(v.blueNorm()));      VERIFY((numext::isnan)(v.blueNorm()));
+    VERIFY(!(numext::isfinite)(v.hypotNorm()));     VERIFY((numext::isnan)(v.hypotNorm()));
+  }
+  
+  // +inf
+  {
+    v = vrand;
+    v(i,j) = std::numeric_limits<RealScalar>::infinity();
+    VERIFY(!(numext::isfinite)(v.squaredNorm()));   VERIFY(isPlusInf(v.squaredNorm()));
+    VERIFY(!(numext::isfinite)(v.norm()));          VERIFY(isPlusInf(v.norm()));
+    VERIFY(!(numext::isfinite)(v.stableNorm()));
+    if(complex_real_product_ok){
+      VERIFY(isPlusInf(v.stableNorm()));
+    }
+    VERIFY(!(numext::isfinite)(v.blueNorm()));      VERIFY(isPlusInf(v.blueNorm()));
+    VERIFY(!(numext::isfinite)(v.hypotNorm()));     VERIFY(isPlusInf(v.hypotNorm()));
+  }
+  
+  // -inf
+  {
+    v = vrand;
+    v(i,j) = -std::numeric_limits<RealScalar>::infinity();
+    VERIFY(!(numext::isfinite)(v.squaredNorm()));   VERIFY(isPlusInf(v.squaredNorm()));
+    VERIFY(!(numext::isfinite)(v.norm()));          VERIFY(isPlusInf(v.norm()));
+    VERIFY(!(numext::isfinite)(v.stableNorm()));
+    if(complex_real_product_ok) {
+      VERIFY(isPlusInf(v.stableNorm()));
+    }
+    VERIFY(!(numext::isfinite)(v.blueNorm()));      VERIFY(isPlusInf(v.blueNorm()));
+    VERIFY(!(numext::isfinite)(v.hypotNorm()));     VERIFY(isPlusInf(v.hypotNorm()));
+  }
+  
+  // mix
+  {
+    Index i2 = internal::random<Index>(0,rows-1);
+    Index j2 = internal::random<Index>(0,cols-1);
+    v = vrand;
+    v(i,j) = -std::numeric_limits<RealScalar>::infinity();
+    v(i2,j2) = std::numeric_limits<RealScalar>::quiet_NaN();
+    VERIFY(!(numext::isfinite)(v.squaredNorm()));   VERIFY((numext::isnan)(v.squaredNorm()));
+    VERIFY(!(numext::isfinite)(v.norm()));          VERIFY((numext::isnan)(v.norm()));
+    VERIFY(!(numext::isfinite)(v.stableNorm()));    VERIFY((numext::isnan)(v.stableNorm()));
+    VERIFY(!(numext::isfinite)(v.blueNorm()));      VERIFY((numext::isnan)(v.blueNorm()));
+    VERIFY(!(numext::isfinite)(v.hypotNorm()));     VERIFY((numext::isnan)(v.hypotNorm()));
+  }
+
+  // stableNormalize[d]
+  {
+    VERIFY_IS_APPROX(vrand.stableNormalized(), vrand.normalized());
+    MatrixType vcopy(vrand);
+    vcopy.stableNormalize();
+    VERIFY_IS_APPROX(vcopy, vrand.normalized());
+    VERIFY_IS_APPROX((vrand.stableNormalized()).norm(), RealScalar(1));
+    VERIFY_IS_APPROX(vcopy.norm(), RealScalar(1));
+    VERIFY_IS_APPROX((vbig.stableNormalized()).norm(), RealScalar(1));
+    VERIFY_IS_APPROX((vsmall.stableNormalized()).norm(), RealScalar(1));
+    RealScalar big_scaling = ((std::numeric_limits<RealScalar>::max)() * RealScalar(1e-4));
+    VERIFY_IS_APPROX(vbig/big_scaling, (vbig.stableNorm() * vbig.stableNormalized()).eval()/big_scaling);
+    VERIFY_IS_APPROX(vsmall, vsmall.stableNorm() * vsmall.stableNormalized());
+  }
 }
 
 void test_stable_norm()
diff --git a/test/eigen2/eigen2_newstdvector.cpp b/test/stddeque_overload.cpp
index 5f9009901..4da618bbf 100644
--- a/test/eigen2/eigen2_newstdvector.cpp
+++ b/test/stddeque_overload.cpp
@@ -2,23 +2,36 @@
 // for linear algebra.
 //
 // Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2010 Hauke Heibel <hauke.heibel@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-#define EIGEN_USE_NEW_STDVECTOR
 #include "main.h"
-#include <Eigen/StdVector>
+
+#include <Eigen/StdDeque>
 #include <Eigen/Geometry>
 
+EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(Vector4f)
+
+EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(Matrix2f)
+EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(Matrix4f)
+EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(Matrix4d)
+
+EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(Affine3f)
+EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(Affine3d)
+
+EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(Quaternionf)
+EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(Quaterniond)
+
 template<typename MatrixType>
-void check_stdvector_matrix(const MatrixType& m)
+void check_stddeque_matrix(const MatrixType& m)
 {
-  int rows = m.rows();
-  int cols = m.cols();
+  typename MatrixType::Index rows = m.rows();
+  typename MatrixType::Index cols = m.cols();
   MatrixType x = MatrixType::Random(rows,cols), y = MatrixType::Random(rows,cols);
-  std::vector<MatrixType,Eigen::aligned_allocator<MatrixType> > v(10, MatrixType(rows,cols)), w(20, y);
+  std::deque<MatrixType> v(10, MatrixType(rows,cols)), w(20, y);
   v[5] = x;
   w[6] = v[5];
   VERIFY_IS_APPROX(w[6], v[5]);
@@ -35,9 +48,8 @@ void check_stdvector_matrix(const MatrixType& m)
   VERIFY_IS_APPROX(v[21], y);
   v.push_back(x);
   VERIFY_IS_APPROX(v[22], x);
-  VERIFY((std::size_t)&(v[22]) == (std::size_t)&(v[21]) + sizeof(MatrixType));
 
-  // do a lot of push_back such that the vector gets internally resized
+  // do a lot of push_back such that the deque gets internally resized
   // (with memory reallocation)
   MatrixType* ref = &w[0];
   for(int i=0; i<30 || ((ref==&w[0]) && i<300); ++i)
@@ -49,11 +61,11 @@ void check_stdvector_matrix(const MatrixType& m)
 }
 
 template<typename TransformType>
-void check_stdvector_transform(const TransformType&)
+void check_stddeque_transform(const TransformType&)
 {
   typedef typename TransformType::MatrixType MatrixType;
   TransformType x(MatrixType::Random()), y(MatrixType::Random());
-  std::vector<TransformType,Eigen::aligned_allocator<TransformType> > v(10), w(20, y);
+  std::deque<TransformType> v(10), w(20, y);
   v[5] = x;
   w[6] = v[5];
   VERIFY_IS_APPROX(w[6], v[5]);
@@ -70,9 +82,8 @@ void check_stdvector_transform(const TransformType&)
   VERIFY_IS_APPROX(v[21], y);
   v.push_back(x);
   VERIFY_IS_APPROX(v[22], x);
-  VERIFY((std::size_t)&(v[22]) == (std::size_t)&(v[21]) + sizeof(TransformType));
 
-  // do a lot of push_back such that the vector gets internally resized
+  // do a lot of push_back such that the deque gets internally resized
   // (with memory reallocation)
   TransformType* ref = &w[0];
   for(int i=0; i<30 || ((ref==&w[0]) && i<300); ++i)
@@ -84,11 +95,11 @@ void check_stdvector_transform(const TransformType&)
 }
 
 template<typename QuaternionType>
-void check_stdvector_quaternion(const QuaternionType&)
+void check_stddeque_quaternion(const QuaternionType&)
 {
   typedef typename QuaternionType::Coefficients Coefficients;
   QuaternionType x(Coefficients::Random()), y(Coefficients::Random());
-  std::vector<QuaternionType,Eigen::aligned_allocator<QuaternionType> > v(10), w(20, y);
+  std::deque<QuaternionType> v(10), w(20, y);
   v[5] = x;
   w[6] = v[5];
   VERIFY_IS_APPROX(w[6], v[5]);
@@ -105,9 +116,8 @@ void check_stdvector_quaternion(const QuaternionType&)
   VERIFY_IS_APPROX(v[21], y);
   v.push_back(x);
   VERIFY_IS_APPROX(v[22], x);
-  VERIFY((std::size_t)&(v[22]) == (std::size_t)&(v[21]) + sizeof(QuaternionType));
 
-  // do a lot of push_back such that the vector gets internally resized
+  // do a lot of push_back such that the deque gets internally resized
   // (with memory reallocation)
   QuaternionType* ref = &w[0];
   for(int i=0; i<30 || ((ref==&w[0]) && i<300); ++i)
@@ -118,32 +128,31 @@ void check_stdvector_quaternion(const QuaternionType&)
   }
 }
 
-void test_eigen2_newstdvector()
+void test_stddeque_overload()
 {
   // some non vectorizable fixed sizes
-  CALL_SUBTEST_1(check_stdvector_matrix(Vector2f()));
-  CALL_SUBTEST_1(check_stdvector_matrix(Matrix3f()));
-  CALL_SUBTEST_1(check_stdvector_matrix(Matrix3d()));
+  CALL_SUBTEST_1(check_stddeque_matrix(Vector2f()));
+  CALL_SUBTEST_1(check_stddeque_matrix(Matrix3f()));
+  CALL_SUBTEST_2(check_stddeque_matrix(Matrix3d()));
 
   // some vectorizable fixed sizes
-  CALL_SUBTEST_2(check_stdvector_matrix(Matrix2f()));
-  CALL_SUBTEST_2(check_stdvector_matrix(Vector4f()));
-  CALL_SUBTEST_2(check_stdvector_matrix(Matrix4f()));
-  CALL_SUBTEST_2(check_stdvector_matrix(Matrix4d()));
+  CALL_SUBTEST_1(check_stddeque_matrix(Matrix2f()));
+  CALL_SUBTEST_1(check_stddeque_matrix(Vector4f()));
+  CALL_SUBTEST_1(check_stddeque_matrix(Matrix4f()));
+  CALL_SUBTEST_2(check_stddeque_matrix(Matrix4d()));
 
   // some dynamic sizes
-  CALL_SUBTEST_3(check_stdvector_matrix(MatrixXd(1,1)));
-  CALL_SUBTEST_3(check_stdvector_matrix(VectorXd(20)));
-  CALL_SUBTEST_3(check_stdvector_matrix(RowVectorXf(20)));
-  CALL_SUBTEST_3(check_stdvector_matrix(MatrixXcf(10,10)));
+  CALL_SUBTEST_3(check_stddeque_matrix(MatrixXd(1,1)));
+  CALL_SUBTEST_3(check_stddeque_matrix(VectorXd(20)));
+  CALL_SUBTEST_3(check_stddeque_matrix(RowVectorXf(20)));
+  CALL_SUBTEST_3(check_stddeque_matrix(MatrixXcf(10,10)));
 
   // some Transform
-  CALL_SUBTEST_4(check_stdvector_transform(Transform2f()));
-  CALL_SUBTEST_4(check_stdvector_transform(Transform3f()));
-  CALL_SUBTEST_4(check_stdvector_transform(Transform3d()));
-  //CALL_SUBTEST(check_stdvector_transform(Transform4d()));
+  CALL_SUBTEST_4(check_stddeque_transform(Affine2f())); // does not need the specialization (2+1)^2 = 9
+  CALL_SUBTEST_4(check_stddeque_transform(Affine3f()));
+  CALL_SUBTEST_4(check_stddeque_transform(Affine3d()));
 
   // some Quaternion
-  CALL_SUBTEST_5(check_stdvector_quaternion(Quaternionf()));
-  CALL_SUBTEST_5(check_stdvector_quaternion(Quaterniond()));
+  CALL_SUBTEST_5(check_stddeque_quaternion(Quaternionf()));
+  CALL_SUBTEST_5(check_stddeque_quaternion(Quaterniond()));
 }
diff --git a/test/stdlist_overload.cpp b/test/stdlist_overload.cpp
new file mode 100644
index 000000000..bb910bd43
--- /dev/null
+++ b/test/stdlist_overload.cpp
@@ -0,0 +1,192 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2010 Hauke Heibel <hauke.heibel@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/StdList>
+#include <Eigen/Geometry>
+
+EIGEN_DEFINE_STL_LIST_SPECIALIZATION(Vector4f)
+
+EIGEN_DEFINE_STL_LIST_SPECIALIZATION(Matrix2f)
+EIGEN_DEFINE_STL_LIST_SPECIALIZATION(Matrix4f)
+EIGEN_DEFINE_STL_LIST_SPECIALIZATION(Matrix4d)
+
+EIGEN_DEFINE_STL_LIST_SPECIALIZATION(Affine3f)
+EIGEN_DEFINE_STL_LIST_SPECIALIZATION(Affine3d)
+
+EIGEN_DEFINE_STL_LIST_SPECIALIZATION(Quaternionf)
+EIGEN_DEFINE_STL_LIST_SPECIALIZATION(Quaterniond)
+
+template <class Container, class Position>
+typename Container::iterator get(Container & c, Position position)
+{
+  typename Container::iterator it = c.begin();
+  std::advance(it, position);
+  return it;
+}
+
+template <class Container, class Position, class Value>
+void set(Container & c, Position position, const Value & value)
+{
+  typename Container::iterator it = c.begin();
+  std::advance(it, position);
+  *it = value;
+}
+
+template<typename MatrixType>
+void check_stdlist_matrix(const MatrixType& m)
+{
+  typename MatrixType::Index rows = m.rows();
+  typename MatrixType::Index cols = m.cols();
+  MatrixType x = MatrixType::Random(rows,cols), y = MatrixType::Random(rows,cols);
+  std::list<MatrixType> v(10, MatrixType(rows,cols)), w(20, y);
+  typename std::list<MatrixType>::iterator itv = get(v, 5);
+  typename std::list<MatrixType>::iterator itw = get(w, 6);
+  *itv = x;
+  *itw = *itv;
+  VERIFY_IS_APPROX(*itw, *itv);
+  v = w;
+  itv = v.begin();
+  itw = w.begin();
+  for(int i = 0; i < 20; i++)
+  {
+    VERIFY_IS_APPROX(*itw, *itv);
+    ++itv;
+    ++itw;
+  }
+
+  v.resize(21);
+  set(v, 20, x);
+  VERIFY_IS_APPROX(*get(v, 20), x);
+  v.resize(22,y);
+  VERIFY_IS_APPROX(*get(v, 21), y);
+  v.push_back(x);
+  VERIFY_IS_APPROX(*get(v, 22), x);
+
+  // do a lot of push_back such that the list gets internally resized
+  // (with memory reallocation)
+  MatrixType* ref = &(*get(w, 0));
+  for(int i=0; i<30 || ((ref==&(*get(w, 0))) && i<300); ++i)
+    v.push_back(*get(w, i%w.size()));
+  for(unsigned int i=23; i<v.size(); ++i)
+  {
+    VERIFY((*get(v, i))==(*get(w, (i-23)%w.size())));
+  }
+}
+
+template<typename TransformType>
+void check_stdlist_transform(const TransformType&)
+{
+  typedef typename TransformType::MatrixType MatrixType;
+  TransformType x(MatrixType::Random()), y(MatrixType::Random());
+  std::list<TransformType> v(10), w(20, y);
+  typename std::list<TransformType>::iterator itv = get(v, 5);
+  typename std::list<TransformType>::iterator itw = get(w, 6);
+  *itv = x;
+  *itw = *itv;
+  VERIFY_IS_APPROX(*itw, *itv);
+  v = w;
+  itv = v.begin();
+  itw = w.begin();
+  for(int i = 0; i < 20; i++)
+  {
+    VERIFY_IS_APPROX(*itw, *itv);
+    ++itv;
+    ++itw;
+  }
+
+  v.resize(21);
+  set(v, 20, x);
+  VERIFY_IS_APPROX(*get(v, 20), x);
+  v.resize(22,y);
+  VERIFY_IS_APPROX(*get(v, 21), y);
+  v.push_back(x);
+  VERIFY_IS_APPROX(*get(v, 22), x);
+
+  // do a lot of push_back such that the list gets internally resized
+  // (with memory reallocation)
+  TransformType* ref = &(*get(w, 0));
+  for(int i=0; i<30 || ((ref==&(*get(w, 0))) && i<300); ++i)
+    v.push_back(*get(w, i%w.size()));
+  for(unsigned int i=23; i<v.size(); ++i)
+  {
+    VERIFY(get(v, i)->matrix()==get(w, (i-23)%w.size())->matrix());
+  }
+}
+
+template<typename QuaternionType>
+void check_stdlist_quaternion(const QuaternionType&)
+{
+  typedef typename QuaternionType::Coefficients Coefficients;
+  QuaternionType x(Coefficients::Random()), y(Coefficients::Random());
+  std::list<QuaternionType> v(10), w(20, y);
+  typename std::list<QuaternionType>::iterator itv = get(v, 5);
+  typename std::list<QuaternionType>::iterator itw = get(w, 6);
+  *itv = x;
+  *itw = *itv;
+  VERIFY_IS_APPROX(*itw, *itv);
+  v = w;
+  itv = v.begin();
+  itw = w.begin();
+  for(int i = 0; i < 20; i++)
+  {
+    VERIFY_IS_APPROX(*itw, *itv);
+    ++itv;
+    ++itw;
+  }
+
+  v.resize(21);
+  set(v, 20, x);
+  VERIFY_IS_APPROX(*get(v, 20), x);
+  v.resize(22,y);
+  VERIFY_IS_APPROX(*get(v, 21), y);
+  v.push_back(x);
+  VERIFY_IS_APPROX(*get(v, 22), x);
+
+  // do a lot of push_back such that the list gets internally resized
+  // (with memory reallocation)
+  QuaternionType* ref = &(*get(w, 0));
+  for(int i=0; i<30 || ((ref==&(*get(w, 0))) && i<300); ++i)
+    v.push_back(*get(w, i%w.size()));
+  for(unsigned int i=23; i<v.size(); ++i)
+  {
+    VERIFY(get(v, i)->coeffs()==get(w, (i-23)%w.size())->coeffs());
+  }
+}
+
+void test_stdlist_overload()
+{
+  // some non vectorizable fixed sizes
+  CALL_SUBTEST_1(check_stdlist_matrix(Vector2f()));
+  CALL_SUBTEST_1(check_stdlist_matrix(Matrix3f()));
+  CALL_SUBTEST_2(check_stdlist_matrix(Matrix3d()));
+
+  // some vectorizable fixed sizes
+  CALL_SUBTEST_1(check_stdlist_matrix(Matrix2f()));
+  CALL_SUBTEST_1(check_stdlist_matrix(Vector4f()));
+  CALL_SUBTEST_1(check_stdlist_matrix(Matrix4f()));
+  CALL_SUBTEST_2(check_stdlist_matrix(Matrix4d()));
+
+  // some dynamic sizes
+  CALL_SUBTEST_3(check_stdlist_matrix(MatrixXd(1,1)));
+  CALL_SUBTEST_3(check_stdlist_matrix(VectorXd(20)));
+  CALL_SUBTEST_3(check_stdlist_matrix(RowVectorXf(20)));
+  CALL_SUBTEST_3(check_stdlist_matrix(MatrixXcf(10,10)));
+
+  // some Transform
+  CALL_SUBTEST_4(check_stdlist_transform(Affine2f())); // does not need the specialization (2+1)^2 = 9
+  CALL_SUBTEST_4(check_stdlist_transform(Affine3f()));
+  CALL_SUBTEST_4(check_stdlist_transform(Affine3d()));
+
+  // some Quaternion
+  CALL_SUBTEST_5(check_stdlist_quaternion(Quaternionf()));
+  CALL_SUBTEST_5(check_stdlist_quaternion(Quaterniond()));
+}
diff --git a/test/stdvector.cpp b/test/stdvector.cpp
index 6e173c678..50cb3341d 100644
--- a/test/stdvector.cpp
+++ b/test/stdvector.cpp
@@ -34,7 +34,7 @@ void check_stdvector_matrix(const MatrixType& m)
   VERIFY_IS_APPROX(v[21], y);
   v.push_back(x);
   VERIFY_IS_APPROX(v[22], x);
-  VERIFY((size_t)&(v[22]) == (size_t)&(v[21]) + sizeof(MatrixType));
+  VERIFY((internal::UIntPtr)&(v[22]) == (internal::UIntPtr)&(v[21]) + sizeof(MatrixType));
 
   // do a lot of push_back such that the vector gets internally resized
   // (with memory reallocation)
@@ -69,7 +69,7 @@ void check_stdvector_transform(const TransformType&)
   VERIFY_IS_APPROX(v[21], y);
   v.push_back(x);
   VERIFY_IS_APPROX(v[22], x);
-  VERIFY((size_t)&(v[22]) == (size_t)&(v[21]) + sizeof(TransformType));
+  VERIFY((internal::UIntPtr)&(v[22]) == (internal::UIntPtr)&(v[21]) + sizeof(TransformType));
 
   // do a lot of push_back such that the vector gets internally resized
   // (with memory reallocation)
@@ -104,7 +104,7 @@ void check_stdvector_quaternion(const QuaternionType&)
   VERIFY_IS_APPROX(v[21], y);
   v.push_back(x);
   VERIFY_IS_APPROX(v[22], x);
-  VERIFY((size_t)&(v[22]) == (size_t)&(v[21]) + sizeof(QuaternionType));
+  VERIFY((internal::UIntPtr)&(v[22]) == (internal::UIntPtr)&(v[21]) + sizeof(QuaternionType));
 
   // do a lot of push_back such that the vector gets internally resized
   // (with memory reallocation)
diff --git a/test/stdvector_overload.cpp b/test/stdvector_overload.cpp
index 736ff0ee7..959665954 100644
--- a/test/stdvector_overload.cpp
+++ b/test/stdvector_overload.cpp
@@ -48,7 +48,7 @@ void check_stdvector_matrix(const MatrixType& m)
   VERIFY_IS_APPROX(v[21], y);
   v.push_back(x);
   VERIFY_IS_APPROX(v[22], x);
-  VERIFY((size_t)&(v[22]) == (size_t)&(v[21]) + sizeof(MatrixType));
+  VERIFY((internal::UIntPtr)&(v[22]) == (internal::UIntPtr)&(v[21]) + sizeof(MatrixType));
 
   // do a lot of push_back such that the vector gets internally resized
   // (with memory reallocation)
@@ -83,7 +83,7 @@ void check_stdvector_transform(const TransformType&)
   VERIFY_IS_APPROX(v[21], y);
   v.push_back(x);
   VERIFY_IS_APPROX(v[22], x);
-  VERIFY((size_t)&(v[22]) == (size_t)&(v[21]) + sizeof(TransformType));
+  VERIFY((internal::UIntPtr)&(v[22]) == (internal::UIntPtr)&(v[21]) + sizeof(TransformType));
 
   // do a lot of push_back such that the vector gets internally resized
   // (with memory reallocation)
@@ -118,7 +118,7 @@ void check_stdvector_quaternion(const QuaternionType&)
   VERIFY_IS_APPROX(v[21], y);
   v.push_back(x);
   VERIFY_IS_APPROX(v[22], x);
-  VERIFY((size_t)&(v[22]) == (size_t)&(v[21]) + sizeof(QuaternionType));
+  VERIFY((internal::UIntPtr)&(v[22]) == (internal::UIntPtr)&(v[21]) + sizeof(QuaternionType));
 
   // do a lot of push_back such that the vector gets internally resized
   // (with memory reallocation)
diff --git a/test/superlu_support.cpp b/test/superlu_support.cpp
index 3b16135bc..98a7bc5c8 100644
--- a/test/superlu_support.cpp
+++ b/test/superlu_support.cpp
@@ -7,6 +7,7 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+#define EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS
 #include "sparse_solver.h"
 
 #include <Eigen/SuperLUSupport>
diff --git a/test/svd_common.h b/test/svd_common.h
new file mode 100644
index 000000000..605d5dfef
--- /dev/null
+++ b/test/svd_common.h
@@ -0,0 +1,483 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef SVD_DEFAULT
+#error a macro SVD_DEFAULT(MatrixType) must be defined prior to including svd_common.h
+#endif
+
+#ifndef SVD_FOR_MIN_NORM
+#error a macro SVD_FOR_MIN_NORM(MatrixType) must be defined prior to including svd_common.h
+#endif
+
+#include "svd_fill.h"
+
+// Check that the matrix m is properly reconstructed and that the U and V factors are unitary
+// The SVD must have already been computed.
+template<typename SvdType, typename MatrixType>
+void svd_check_full(const MatrixType& m, const SvdType& svd)
+{
+  typedef typename MatrixType::Index Index;
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  enum {
+    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+    ColsAtCompileTime = MatrixType::ColsAtCompileTime
+  };
+
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef Matrix<Scalar, RowsAtCompileTime, RowsAtCompileTime> MatrixUType;
+  typedef Matrix<Scalar, ColsAtCompileTime, ColsAtCompileTime> MatrixVType;
+
+  MatrixType sigma = MatrixType::Zero(rows,cols);
+  sigma.diagonal() = svd.singularValues().template cast<Scalar>();
+  MatrixUType u = svd.matrixU();
+  MatrixVType v = svd.matrixV();
+  RealScalar scaling = m.cwiseAbs().maxCoeff();
+  if(scaling<(std::numeric_limits<RealScalar>::min)())
+  {
+    VERIFY(sigma.cwiseAbs().maxCoeff() <= (std::numeric_limits<RealScalar>::min)());
+  }
+  else
+  {
+    VERIFY_IS_APPROX(m/scaling, u * (sigma/scaling) * v.adjoint());
+  }
+  VERIFY_IS_UNITARY(u);
+  VERIFY_IS_UNITARY(v);
+}
+
+// Compare partial SVD defined by computationOptions to a full SVD referenceSvd
+template<typename SvdType, typename MatrixType>
+void svd_compare_to_full(const MatrixType& m,
+                         unsigned int computationOptions,
+                         const SvdType& referenceSvd)
+{
+  typedef typename MatrixType::RealScalar RealScalar;
+  Index rows = m.rows();
+  Index cols = m.cols();
+  Index diagSize = (std::min)(rows, cols);
+  RealScalar prec = test_precision<RealScalar>();
+
+  SvdType svd(m, computationOptions);
+
+  VERIFY_IS_APPROX(svd.singularValues(), referenceSvd.singularValues());
+  
+  if(computationOptions & (ComputeFullV|ComputeThinV))
+  {
+    VERIFY( (svd.matrixV().adjoint()*svd.matrixV()).isIdentity(prec) );
+    VERIFY_IS_APPROX( svd.matrixV().leftCols(diagSize) * svd.singularValues().asDiagonal() * svd.matrixV().leftCols(diagSize).adjoint(),
+                      referenceSvd.matrixV().leftCols(diagSize) * referenceSvd.singularValues().asDiagonal() * referenceSvd.matrixV().leftCols(diagSize).adjoint());
+  }
+  
+  if(computationOptions & (ComputeFullU|ComputeThinU))
+  {
+    VERIFY( (svd.matrixU().adjoint()*svd.matrixU()).isIdentity(prec) );
+    VERIFY_IS_APPROX( svd.matrixU().leftCols(diagSize) * svd.singularValues().cwiseAbs2().asDiagonal() * svd.matrixU().leftCols(diagSize).adjoint(),
+                      referenceSvd.matrixU().leftCols(diagSize) * referenceSvd.singularValues().cwiseAbs2().asDiagonal() * referenceSvd.matrixU().leftCols(diagSize).adjoint());
+  }
+  
+  // The following checks are not critical.
+  // For instance, with Dived&Conquer SVD, if only the factor 'V' is computedt then different matrix-matrix product implementation will be used
+  // and the resulting 'V' factor might be significantly different when the SVD decomposition is not unique, especially with single precision float.
+  ++g_test_level;
+  if(computationOptions & ComputeFullU)  VERIFY_IS_APPROX(svd.matrixU(), referenceSvd.matrixU());
+  if(computationOptions & ComputeThinU)  VERIFY_IS_APPROX(svd.matrixU(), referenceSvd.matrixU().leftCols(diagSize));
+  if(computationOptions & ComputeFullV)  VERIFY_IS_APPROX(svd.matrixV().cwiseAbs(), referenceSvd.matrixV().cwiseAbs());
+  if(computationOptions & ComputeThinV)  VERIFY_IS_APPROX(svd.matrixV(), referenceSvd.matrixV().leftCols(diagSize));
+  --g_test_level;
+}
+
+//
+template<typename SvdType, typename MatrixType>
+void svd_least_square(const MatrixType& m, unsigned int computationOptions)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef typename MatrixType::Index Index;
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  enum {
+    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+    ColsAtCompileTime = MatrixType::ColsAtCompileTime
+  };
+
+  typedef Matrix<Scalar, RowsAtCompileTime, Dynamic> RhsType;
+  typedef Matrix<Scalar, ColsAtCompileTime, Dynamic> SolutionType;
+
+  RhsType rhs = RhsType::Random(rows, internal::random<Index>(1, cols));
+  SvdType svd(m, computationOptions);
+
+       if(internal::is_same<RealScalar,double>::value) svd.setThreshold(1e-8);
+  else if(internal::is_same<RealScalar,float>::value)  svd.setThreshold(2e-4);
+
+  SolutionType x = svd.solve(rhs);
+   
+  RealScalar residual = (m*x-rhs).norm();
+  RealScalar rhs_norm = rhs.norm();
+  if(!test_isMuchSmallerThan(residual,rhs.norm()))
+  {
+    // ^^^ If the residual is very small, then we have an exact solution, so we are already good.
+    
+    // evaluate normal equation which works also for least-squares solutions
+    if(internal::is_same<RealScalar,double>::value || svd.rank()==m.diagonal().size())
+    {
+      using std::sqrt;
+      // This test is not stable with single precision.
+      // This is probably because squaring m signicantly affects the precision.      
+      if(internal::is_same<RealScalar,float>::value) ++g_test_level;
+      
+      VERIFY_IS_APPROX(m.adjoint()*(m*x),m.adjoint()*rhs);
+      
+      if(internal::is_same<RealScalar,float>::value) --g_test_level;
+    }
+    
+    // Check that there is no significantly better solution in the neighborhood of x
+    for(Index k=0;k<x.rows();++k)
+    {
+      using std::abs;
+      
+      SolutionType y(x);
+      y.row(k) = (RealScalar(1)+2*NumTraits<RealScalar>::epsilon())*x.row(k);
+      RealScalar residual_y = (m*y-rhs).norm();
+      VERIFY( test_isMuchSmallerThan(abs(residual_y-residual), rhs_norm) || residual < residual_y );
+      if(internal::is_same<RealScalar,float>::value) ++g_test_level;
+      VERIFY( test_isApprox(residual_y,residual) || residual < residual_y );
+      if(internal::is_same<RealScalar,float>::value) --g_test_level;
+      
+      y.row(k) = (RealScalar(1)-2*NumTraits<RealScalar>::epsilon())*x.row(k);
+      residual_y = (m*y-rhs).norm();
+      VERIFY( test_isMuchSmallerThan(abs(residual_y-residual), rhs_norm) || residual < residual_y );
+      if(internal::is_same<RealScalar,float>::value) ++g_test_level;
+      VERIFY( test_isApprox(residual_y,residual) || residual < residual_y );
+      if(internal::is_same<RealScalar,float>::value) --g_test_level;
+    }
+  }
+}
+
+// check minimal norm solutions, the inoput matrix m is only used to recover problem size
+template<typename MatrixType>
+void svd_min_norm(const MatrixType& m, unsigned int computationOptions)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::Index Index;
+  Index cols = m.cols();
+
+  enum {
+    ColsAtCompileTime = MatrixType::ColsAtCompileTime
+  };
+
+  typedef Matrix<Scalar, ColsAtCompileTime, Dynamic> SolutionType;
+
+  // generate a full-rank m x n problem with m<n
+  enum {
+    RankAtCompileTime2 = ColsAtCompileTime==Dynamic ? Dynamic : (ColsAtCompileTime)/2+1,
+    RowsAtCompileTime3 = ColsAtCompileTime==Dynamic ? Dynamic : ColsAtCompileTime+1
+  };
+  typedef Matrix<Scalar, RankAtCompileTime2, ColsAtCompileTime> MatrixType2;
+  typedef Matrix<Scalar, RankAtCompileTime2, 1> RhsType2;
+  typedef Matrix<Scalar, ColsAtCompileTime, RankAtCompileTime2> MatrixType2T;
+  Index rank = RankAtCompileTime2==Dynamic ? internal::random<Index>(1,cols) : Index(RankAtCompileTime2);
+  MatrixType2 m2(rank,cols);
+  int guard = 0;
+  do {
+    m2.setRandom();
+  } while(SVD_FOR_MIN_NORM(MatrixType2)(m2).setThreshold(test_precision<Scalar>()).rank()!=rank && (++guard)<10);
+  VERIFY(guard<10);
+
+  RhsType2 rhs2 = RhsType2::Random(rank);
+  // use QR to find a reference minimal norm solution
+  HouseholderQR<MatrixType2T> qr(m2.adjoint());
+  Matrix<Scalar,Dynamic,1> tmp = qr.matrixQR().topLeftCorner(rank,rank).template triangularView<Upper>().adjoint().solve(rhs2);
+  tmp.conservativeResize(cols);
+  tmp.tail(cols-rank).setZero();
+  SolutionType x21 = qr.householderQ() * tmp;
+  // now check with SVD
+  SVD_FOR_MIN_NORM(MatrixType2) svd2(m2, computationOptions);
+  SolutionType x22 = svd2.solve(rhs2);
+  VERIFY_IS_APPROX(m2*x21, rhs2);
+  VERIFY_IS_APPROX(m2*x22, rhs2);
+  VERIFY_IS_APPROX(x21, x22);
+
+  // Now check with a rank deficient matrix
+  typedef Matrix<Scalar, RowsAtCompileTime3, ColsAtCompileTime> MatrixType3;
+  typedef Matrix<Scalar, RowsAtCompileTime3, 1> RhsType3;
+  Index rows3 = RowsAtCompileTime3==Dynamic ? internal::random<Index>(rank+1,2*cols) : Index(RowsAtCompileTime3);
+  Matrix<Scalar,RowsAtCompileTime3,Dynamic> C = Matrix<Scalar,RowsAtCompileTime3,Dynamic>::Random(rows3,rank);
+  MatrixType3 m3 = C * m2;
+  RhsType3 rhs3 = C * rhs2;
+  SVD_FOR_MIN_NORM(MatrixType3) svd3(m3, computationOptions);
+  SolutionType x3 = svd3.solve(rhs3);
+  VERIFY_IS_APPROX(m3*x3, rhs3);
+  VERIFY_IS_APPROX(m3*x21, rhs3);
+  VERIFY_IS_APPROX(m2*x3, rhs2);
+  VERIFY_IS_APPROX(x21, x3);
+}
+
+// Check full, compare_to_full, least_square, and min_norm for all possible compute-options
+template<typename SvdType, typename MatrixType>
+void svd_test_all_computation_options(const MatrixType& m, bool full_only)
+{
+//   if (QRPreconditioner == NoQRPreconditioner && m.rows() != m.cols())
+//     return;
+  SvdType fullSvd(m, ComputeFullU|ComputeFullV);
+  CALL_SUBTEST(( svd_check_full(m, fullSvd) ));
+  CALL_SUBTEST(( svd_least_square<SvdType>(m, ComputeFullU | ComputeFullV) ));
+  CALL_SUBTEST(( svd_min_norm(m, ComputeFullU | ComputeFullV) ));
+  
+  #if defined __INTEL_COMPILER
+  // remark #111: statement is unreachable
+  #pragma warning disable 111
+  #endif
+  if(full_only)
+    return;
+
+  CALL_SUBTEST(( svd_compare_to_full(m, ComputeFullU, fullSvd) ));
+  CALL_SUBTEST(( svd_compare_to_full(m, ComputeFullV, fullSvd) ));
+  CALL_SUBTEST(( svd_compare_to_full(m, 0, fullSvd) ));
+
+  if (MatrixType::ColsAtCompileTime == Dynamic) {
+    // thin U/V are only available with dynamic number of columns
+    CALL_SUBTEST(( svd_compare_to_full(m, ComputeFullU|ComputeThinV, fullSvd) ));
+    CALL_SUBTEST(( svd_compare_to_full(m,              ComputeThinV, fullSvd) ));
+    CALL_SUBTEST(( svd_compare_to_full(m, ComputeThinU|ComputeFullV, fullSvd) ));
+    CALL_SUBTEST(( svd_compare_to_full(m, ComputeThinU             , fullSvd) ));
+    CALL_SUBTEST(( svd_compare_to_full(m, ComputeThinU|ComputeThinV, fullSvd) ));
+    
+    CALL_SUBTEST(( svd_least_square<SvdType>(m, ComputeFullU | ComputeThinV) ));
+    CALL_SUBTEST(( svd_least_square<SvdType>(m, ComputeThinU | ComputeFullV) ));
+    CALL_SUBTEST(( svd_least_square<SvdType>(m, ComputeThinU | ComputeThinV) ));
+
+    CALL_SUBTEST(( svd_min_norm(m, ComputeFullU | ComputeThinV) ));
+    CALL_SUBTEST(( svd_min_norm(m, ComputeThinU | ComputeFullV) ));
+    CALL_SUBTEST(( svd_min_norm(m, ComputeThinU | ComputeThinV) ));
+
+    // test reconstruction
+    typedef typename MatrixType::Index Index;
+    Index diagSize = (std::min)(m.rows(), m.cols());
+    SvdType svd(m, ComputeThinU | ComputeThinV);
+    VERIFY_IS_APPROX(m, svd.matrixU().leftCols(diagSize) * svd.singularValues().asDiagonal() * svd.matrixV().leftCols(diagSize).adjoint());
+  }
+}
+
+
+// work around stupid msvc error when constructing at compile time an expression that involves
+// a division by zero, even if the numeric type has floating point
+template<typename Scalar>
+EIGEN_DONT_INLINE Scalar zero() { return Scalar(0); }
+
+// workaround aggressive optimization in ICC
+template<typename T> EIGEN_DONT_INLINE  T sub(T a, T b) { return a - b; }
+
+// all this function does is verify we don't iterate infinitely on nan/inf values
+template<typename SvdType, typename MatrixType>
+void svd_inf_nan()
+{
+  SvdType svd;
+  typedef typename MatrixType::Scalar Scalar;
+  Scalar some_inf = Scalar(1) / zero<Scalar>();
+  VERIFY(sub(some_inf, some_inf) != sub(some_inf, some_inf));
+  svd.compute(MatrixType::Constant(10,10,some_inf), ComputeFullU | ComputeFullV);
+
+  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
+  VERIFY(nan != nan);
+  svd.compute(MatrixType::Constant(10,10,nan), ComputeFullU | ComputeFullV);
+
+  MatrixType m = MatrixType::Zero(10,10);
+  m(internal::random<int>(0,9), internal::random<int>(0,9)) = some_inf;
+  svd.compute(m, ComputeFullU | ComputeFullV);
+
+  m = MatrixType::Zero(10,10);
+  m(internal::random<int>(0,9), internal::random<int>(0,9)) = nan;
+  svd.compute(m, ComputeFullU | ComputeFullV);
+  
+  // regression test for bug 791
+  m.resize(3,3);
+  m << 0,    2*NumTraits<Scalar>::epsilon(),  0.5,
+       0,   -0.5,                             0,
+       nan,  0,                               0;
+  svd.compute(m, ComputeFullU | ComputeFullV);
+  
+  m.resize(4,4);
+  m <<  1, 0, 0, 0,
+        0, 3, 1, 2e-308,
+        1, 0, 1, nan,
+        0, nan, nan, 0;
+  svd.compute(m, ComputeFullU | ComputeFullV);
+}
+
+// Regression test for bug 286: JacobiSVD loops indefinitely with some
+// matrices containing denormal numbers.
+template<typename>
+void svd_underoverflow()
+{
+#if defined __INTEL_COMPILER
+// shut up warning #239: floating point underflow
+#pragma warning push
+#pragma warning disable 239
+#endif
+  Matrix2d M;
+  M << -7.90884e-313, -4.94e-324,
+                 0, 5.60844e-313;
+  SVD_DEFAULT(Matrix2d) svd;
+  svd.compute(M,ComputeFullU|ComputeFullV);
+  CALL_SUBTEST( svd_check_full(M,svd) );
+  
+  // Check all 2x2 matrices made with the following coefficients:
+  VectorXd value_set(9);
+  value_set << 0, 1, -1, 5.60844e-313, -5.60844e-313, 4.94e-324, -4.94e-324, -4.94e-223, 4.94e-223;
+  Array4i id(0,0,0,0);
+  int k = 0;
+  do
+  {
+    M << value_set(id(0)), value_set(id(1)), value_set(id(2)), value_set(id(3));
+    svd.compute(M,ComputeFullU|ComputeFullV);
+    CALL_SUBTEST( svd_check_full(M,svd) );
+
+    id(k)++;
+    if(id(k)>=value_set.size())
+    {
+      while(k<3 && id(k)>=value_set.size()) id(++k)++;
+      id.head(k).setZero();
+      k=0;
+    }
+
+  } while((id<int(value_set.size())).all());
+  
+#if defined __INTEL_COMPILER
+#pragma warning pop
+#endif
+  
+  // Check for overflow:
+  Matrix3d M3;
+  M3 << 4.4331978442502944e+307, -5.8585363752028680e+307,  6.4527017443412964e+307,
+        3.7841695601406358e+307,  2.4331702789740617e+306, -3.5235707140272905e+307,
+       -8.7190887618028355e+307, -7.3453213709232193e+307, -2.4367363684472105e+307;
+
+  SVD_DEFAULT(Matrix3d) svd3;
+  svd3.compute(M3,ComputeFullU|ComputeFullV); // just check we don't loop indefinitely
+  CALL_SUBTEST( svd_check_full(M3,svd3) );
+}
+
+// void jacobisvd(const MatrixType& a = MatrixType(), bool pickrandom = true)
+
+template<typename MatrixType>
+void svd_all_trivial_2x2( void (*cb)(const MatrixType&,bool) )
+{
+  MatrixType M;
+  VectorXd value_set(3);
+  value_set << 0, 1, -1;
+  Array4i id(0,0,0,0);
+  int k = 0;
+  do
+  {
+    M << value_set(id(0)), value_set(id(1)), value_set(id(2)), value_set(id(3));
+    
+    cb(M,false);
+    
+    id(k)++;
+    if(id(k)>=value_set.size())
+    {
+      while(k<3 && id(k)>=value_set.size()) id(++k)++;
+      id.head(k).setZero();
+      k=0;
+    }
+    
+  } while((id<int(value_set.size())).all());
+}
+
+template<typename>
+void svd_preallocate()
+{
+  Vector3f v(3.f, 2.f, 1.f);
+  MatrixXf m = v.asDiagonal();
+
+  internal::set_is_malloc_allowed(false);
+  VERIFY_RAISES_ASSERT(VectorXf tmp(10);)
+  SVD_DEFAULT(MatrixXf) svd;
+  internal::set_is_malloc_allowed(true);
+  svd.compute(m);
+  VERIFY_IS_APPROX(svd.singularValues(), v);
+
+  SVD_DEFAULT(MatrixXf) svd2(3,3);
+  internal::set_is_malloc_allowed(false);
+  svd2.compute(m);
+  internal::set_is_malloc_allowed(true);
+  VERIFY_IS_APPROX(svd2.singularValues(), v);
+  VERIFY_RAISES_ASSERT(svd2.matrixU());
+  VERIFY_RAISES_ASSERT(svd2.matrixV());
+  svd2.compute(m, ComputeFullU | ComputeFullV);
+  VERIFY_IS_APPROX(svd2.matrixU(), Matrix3f::Identity());
+  VERIFY_IS_APPROX(svd2.matrixV(), Matrix3f::Identity());
+  internal::set_is_malloc_allowed(false);
+  svd2.compute(m);
+  internal::set_is_malloc_allowed(true);
+
+  SVD_DEFAULT(MatrixXf) svd3(3,3,ComputeFullU|ComputeFullV);
+  internal::set_is_malloc_allowed(false);
+  svd2.compute(m);
+  internal::set_is_malloc_allowed(true);
+  VERIFY_IS_APPROX(svd2.singularValues(), v);
+  VERIFY_IS_APPROX(svd2.matrixU(), Matrix3f::Identity());
+  VERIFY_IS_APPROX(svd2.matrixV(), Matrix3f::Identity());
+  internal::set_is_malloc_allowed(false);
+  svd2.compute(m, ComputeFullU|ComputeFullV);
+  internal::set_is_malloc_allowed(true);
+}
+
+template<typename SvdType,typename MatrixType> 
+void svd_verify_assert(const MatrixType& m)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::Index Index;
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  enum {
+    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+    ColsAtCompileTime = MatrixType::ColsAtCompileTime
+  };
+
+  typedef Matrix<Scalar, RowsAtCompileTime, 1> RhsType;
+  RhsType rhs(rows);
+  SvdType svd;
+  VERIFY_RAISES_ASSERT(svd.matrixU())
+  VERIFY_RAISES_ASSERT(svd.singularValues())
+  VERIFY_RAISES_ASSERT(svd.matrixV())
+  VERIFY_RAISES_ASSERT(svd.solve(rhs))
+  MatrixType a = MatrixType::Zero(rows, cols);
+  a.setZero();
+  svd.compute(a, 0);
+  VERIFY_RAISES_ASSERT(svd.matrixU())
+  VERIFY_RAISES_ASSERT(svd.matrixV())
+  svd.singularValues();
+  VERIFY_RAISES_ASSERT(svd.solve(rhs))
+    
+  if (ColsAtCompileTime == Dynamic)
+  {
+    svd.compute(a, ComputeThinU);
+    svd.matrixU();
+    VERIFY_RAISES_ASSERT(svd.matrixV())
+    VERIFY_RAISES_ASSERT(svd.solve(rhs))
+    svd.compute(a, ComputeThinV);
+    svd.matrixV();
+    VERIFY_RAISES_ASSERT(svd.matrixU())
+    VERIFY_RAISES_ASSERT(svd.solve(rhs))
+  }
+  else
+  {
+    VERIFY_RAISES_ASSERT(svd.compute(a, ComputeThinU))
+    VERIFY_RAISES_ASSERT(svd.compute(a, ComputeThinV))
+  }
+}
+
+#undef SVD_DEFAULT
+#undef SVD_FOR_MIN_NORM
diff --git a/test/svd_fill.h b/test/svd_fill.h
new file mode 100644
index 000000000..3877c0c7e
--- /dev/null
+++ b/test/svd_fill.h
@@ -0,0 +1,119 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+template<typename T>
+Array<T,4,1> four_denorms();
+
+template<>
+Array4f four_denorms() { return Array4f(5.60844e-39f, -5.60844e-39f, 4.94e-44f, -4.94e-44f); }
+template<>
+Array4d four_denorms() { return Array4d(5.60844e-313, -5.60844e-313, 4.94e-324, -4.94e-324); }
+template<typename T>
+Array<T,4,1> four_denorms() { return four_denorms<double>().cast<T>(); }
+
+template<typename MatrixType>
+void svd_fill_random(MatrixType &m, int Option = 0)
+{
+  using std::pow;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef typename MatrixType::Index Index;
+  Index diagSize = (std::min)(m.rows(), m.cols());
+  RealScalar s = std::numeric_limits<RealScalar>::max_exponent10/4;
+  s = internal::random<RealScalar>(1,s);
+  Matrix<RealScalar,Dynamic,1> d =  Matrix<RealScalar,Dynamic,1>::Random(diagSize);
+  for(Index k=0; k<diagSize; ++k)
+    d(k) = d(k)*pow(RealScalar(10),internal::random<RealScalar>(-s,s));
+
+  bool dup     = internal::random<int>(0,10) < 3;
+  bool unit_uv = internal::random<int>(0,10) < (dup?7:3); // if we duplicate some diagonal entries, then increase the chance to preserve them using unitary U and V factors
+  
+  // duplicate some singular values
+  if(dup)
+  {
+    Index n = internal::random<Index>(0,d.size()-1);
+    for(Index i=0; i<n; ++i)
+      d(internal::random<Index>(0,d.size()-1)) = d(internal::random<Index>(0,d.size()-1));
+  }
+  
+  Matrix<Scalar,Dynamic,Dynamic> U(m.rows(),diagSize);
+  Matrix<Scalar,Dynamic,Dynamic> VT(diagSize,m.cols());
+  if(unit_uv)
+  {
+    // in very rare cases let's try with a pure diagonal matrix
+    if(internal::random<int>(0,10) < 1)
+    {
+      U.setIdentity();
+      VT.setIdentity();
+    }
+    else
+    {
+      createRandomPIMatrixOfRank(diagSize,U.rows(), U.cols(), U);
+      createRandomPIMatrixOfRank(diagSize,VT.rows(), VT.cols(), VT);
+    }
+  }
+  else
+  {
+    U.setRandom();
+    VT.setRandom();
+  }
+  
+  Matrix<Scalar,Dynamic,1> samples(9);
+  samples << 0, four_denorms<RealScalar>(),
+            -RealScalar(1)/NumTraits<RealScalar>::highest(), RealScalar(1)/NumTraits<RealScalar>::highest(), (std::numeric_limits<RealScalar>::min)(), pow((std::numeric_limits<RealScalar>::min)(),0.8);
+  
+  if(Option==Symmetric)
+  {
+    m = U * d.asDiagonal() * U.transpose();
+    
+    // randomly nullify some rows/columns
+    {
+      Index count = internal::random<Index>(-diagSize,diagSize);
+      for(Index k=0; k<count; ++k)
+      {
+        Index i = internal::random<Index>(0,diagSize-1);
+        m.row(i).setZero();
+        m.col(i).setZero();
+      }
+      if(count<0)
+      // (partly) cancel some coeffs
+      if(!(dup && unit_uv))
+      {
+        
+        Index n = internal::random<Index>(0,m.size()-1);
+        for(Index k=0; k<n; ++k)
+        {
+          Index i = internal::random<Index>(0,m.rows()-1);
+          Index j = internal::random<Index>(0,m.cols()-1);
+          m(j,i) = m(i,j) = samples(internal::random<Index>(0,samples.size()-1));
+          if(NumTraits<Scalar>::IsComplex)
+            *(&numext::real_ref(m(j,i))+1) = *(&numext::real_ref(m(i,j))+1) = samples.real()(internal::random<Index>(0,samples.size()-1));
+        }
+      }
+    }
+  }
+  else
+  {
+    m = U * d.asDiagonal() * VT;
+    // (partly) cancel some coeffs
+    if(!(dup && unit_uv))
+    {
+      Index n = internal::random<Index>(0,m.size()-1);
+      for(Index k=0; k<n; ++k)
+      {
+        Index i = internal::random<Index>(0,m.rows()-1);
+        Index j = internal::random<Index>(0,m.cols()-1);
+        m(i,j) = samples(internal::random<Index>(0,samples.size()-1));
+        if(NumTraits<Scalar>::IsComplex)
+          *(&numext::real_ref(m(i,j))+1) = samples.real()(internal::random<Index>(0,samples.size()-1));
+      }
+    }
+  }
+}
+
diff --git a/test/swap.cpp b/test/swap.cpp
index 36b353148..f76e3624d 100644
--- a/test/swap.cpp
+++ b/test/swap.cpp
@@ -41,9 +41,15 @@ template<typename MatrixType> void swap(const MatrixType& m)
   OtherMatrixType m3_copy = m3;
   
   // test swapping 2 matrices of same type
+  Scalar *d1=m1.data(), *d2=m2.data();
   m1.swap(m2);
   VERIFY_IS_APPROX(m1,m2_copy);
   VERIFY_IS_APPROX(m2,m1_copy);
+  if(MatrixType::SizeAtCompileTime==Dynamic)
+  {
+    VERIFY(m1.data()==d2);
+    VERIFY(m2.data()==d1);
+  }
   m1 = m1_copy;
   m2 = m2_copy;
   
@@ -68,16 +74,21 @@ template<typename MatrixType> void swap(const MatrixType& m)
   m1 = m1_copy;
   m3 = m3_copy;
   
-  // test assertion on mismatching size -- matrix case
-  VERIFY_RAISES_ASSERT(m1.swap(m1.row(0)));
-  // test assertion on mismatching size -- xpr case
-  VERIFY_RAISES_ASSERT(m1.row(0).swap(m1));
+  if(m1.rows()>1)
+  {
+    // test assertion on mismatching size -- matrix case
+    VERIFY_RAISES_ASSERT(m1.swap(m1.row(0)));
+    // test assertion on mismatching size -- xpr case
+    VERIFY_RAISES_ASSERT(m1.row(0).swap(m1));
+  }
 }
 
 void test_swap()
 {
+  int s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE);
   CALL_SUBTEST_1( swap(Matrix3f()) ); // fixed size, no vectorization 
   CALL_SUBTEST_2( swap(Matrix4d()) ); // fixed size, possible vectorization 
-  CALL_SUBTEST_3( swap(MatrixXd(3,3)) ); // dyn size, no vectorization 
-  CALL_SUBTEST_4( swap(MatrixXf(30,30)) ); // dyn size, possible vectorization 
+  CALL_SUBTEST_3( swap(MatrixXd(s,s)) ); // dyn size, no vectorization 
+  CALL_SUBTEST_4( swap(MatrixXf(s,s)) ); // dyn size, possible vectorization 
+  TEST_SET_BUT_UNUSED_VARIABLE(s)
 }
diff --git a/test/testsuite.cmake b/test/testsuite.cmake
deleted file mode 100644
index 3bec56b3f..000000000
--- a/test/testsuite.cmake
+++ /dev/null
@@ -1,229 +0,0 @@
-
-####################################################################
-#
-# Usage:
-#  - create a new folder, let's call it cdash
-#  - in that folder, do:
-#    ctest -S path/to/eigen/test/testsuite.cmake[,option1=value1[,option2=value2]]
-#
-# Options:
-#  - EIGEN_CXX: compiler, eg.: g++-4.2
-#      default: default c++ compiler
-#  - EIGEN_SITE: eg, INRIA-Bdx_pc-gael, or the name of the contributor, etc.
-#      default: hostname
-#  - EIGEN_BUILD_STRING: a string which identify the system/compiler. It should be formed like that:
-#        <OS_name>-<OS_version>-<arch>-<compiler-version>
-#      with:
-#        <OS_name> = opensuse, debian, osx, windows, cygwin, freebsd, solaris, etc.
-#        <OS_version> = 11.1, XP, vista, leopard, etc.
-#        <arch> = i386, x86_64, ia64, powerpc, etc.
-#        <compiler-version> = gcc-4.3.2, icc-11.0, MSVC-2008, etc.
-#  - EIGEN_EXPLICIT_VECTORIZATION: novec, SSE2, Altivec
-#       default: SSE2 for x86_64 systems, novec otherwise
-#       Its value is automatically appended to EIGEN_BUILD_STRING
-#  - EIGEN_CMAKE_DIR: path to cmake executable
-#  - EIGEN_MODE: dashboard model, can be Experimental, Nightly, or Continuous
-#      default: Nightly
-#  - EIGEN_WORK_DIR: directory used to download the source files and make the builds
-#      default: folder which contains this script
-#  - EIGEN_CMAKE_ARGS: additional arguments passed to cmake
-#  - EIGEN_GENERATOR_TYPE: allows to overwrite the generator type
-#      default: nmake (windows
-#      See http://www.cmake.org/cmake/help/cmake2.6docs.html#section_Generators for a complete
-#      list of supported generators.
-#  - EIGEN_NO_UPDATE: allows to submit dash boards from local repositories
-#      This might be interesting in case you want to submit dashboards
-#      including local changes.
-#  - CTEST_SOURCE_DIRECTORY: path to eigen's src (use a new and empty folder, not the one you are working on)
-#      default: <EIGEN_WORK_DIR>/src
-#  - CTEST_BINARY_DIRECTORY: build directory
-#      default: <EIGEN_WORK_DIR>/nightly-<EIGEN_CXX>
-#
-# Here is an example running several compilers on a linux system:
-# #!/bin/bash
-# ARCH=`uname -m`
-# SITE=`hostname`
-# VERSION=opensuse-11.1
-# WORK_DIR=/home/gael/Coding/eigen/cdash
-# # get the last version of the script
-# wget http://bitbucket.org/eigen/eigen/raw/tip/test/testsuite.cmake -o $WORK_DIR/testsuite.cmake
-# COMMON="ctest -S $WORK_DIR/testsuite.cmake,EIGEN_WORK_DIR=$WORK_DIR,EIGEN_SITE=$SITE,EIGEN_MODE=$1,EIGEN_BUILD_STRING=$OS_VERSION-$ARCH"
-# $COMMON-gcc-3.4.6,EIGEN_CXX=g++-3.4
-# $COMMON-gcc-4.0.1,EIGEN_CXX=g++-4.0.1
-# $COMMON-gcc-4.3.2,EIGEN_CXX=g++-4.3,EIGEN_EXPLICIT_VECTORIZATION=novec
-# $COMMON-gcc-4.3.2,EIGEN_CXX=g++-4.3,EIGEN_EXPLICIT_VECTORIZATION=SSE2
-# $COMMON-icc-11.0,EIGEN_CXX=icpc
-#
-####################################################################
-
-# process the arguments
-
-set(ARGLIST ${CTEST_SCRIPT_ARG})
-while(${ARGLIST} MATCHES  ".+.*")
-
-  # pick first
-  string(REGEX MATCH "([^,]*)(,.*)?" DUMMY ${ARGLIST})
-  SET(TOP ${CMAKE_MATCH_1})
-
-  # remove first
-  string(REGEX MATCHALL "[^,]*,(.*)" DUMMY ${ARGLIST})
-  SET(ARGLIST ${CMAKE_MATCH_1})
-
-  # decompose as a pair key=value
-  string(REGEX MATCH "([^=]*)(=.*)?" DUMMY ${TOP})
-  SET(KEY ${CMAKE_MATCH_1})
-
-  string(REGEX MATCH "[^=]*=(.*)" DUMMY ${TOP})
-  SET(VALUE ${CMAKE_MATCH_1})
-
-  # set the variable to the specified value
-  if(VALUE)
-    SET(${KEY} ${VALUE})
-  else(VALUE)
-    SET(${KEY} ON)
-  endif(VALUE)
-
-endwhile(${ARGLIST} MATCHES ".+.*")
-
-####################################################################
-# Automatically set some user variables if they have not been defined manually
-####################################################################
-cmake_minimum_required(VERSION 2.6 FATAL_ERROR)
-
-if(NOT EIGEN_SITE)
-  site_name(EIGEN_SITE)
-endif(NOT EIGEN_SITE)
-
-if(NOT EIGEN_CMAKE_DIR)
-  SET(EIGEN_CMAKE_DIR "")
-endif(NOT EIGEN_CMAKE_DIR)
-
-if(NOT EIGEN_BUILD_STRING)
-
-  # let's try to find all information we need to make the build string ourself
-
-  # OS
-  build_name(EIGEN_OS_VERSION)
-
-  # arch
-  set(EIGEN_ARCH ${CMAKE_SYSTEM_PROCESSOR})
-  if(WIN32)
-    set(EIGEN_ARCH $ENV{PROCESSOR_ARCHITECTURE})
-  else(WIN32)
-    execute_process(COMMAND uname -m OUTPUT_VARIABLE EIGEN_ARCH OUTPUT_STRIP_TRAILING_WHITESPACE)
-  endif(WIN32)
-
-  set(EIGEN_BUILD_STRING ${EIGEN_OS_VERSION}${EIGEN_ARCH}-${EIGEN_CXX})
-
-endif(NOT EIGEN_BUILD_STRING)
-
-if(DEFINED EIGEN_EXPLICIT_VECTORIZATION)
-  set(EIGEN_BUILD_STRING ${EIGEN_BUILD_STRING}-${EIGEN_EXPLICIT_VECTORIZATION})
-endif(DEFINED EIGEN_EXPLICIT_VECTORIZATION)
-
-if(NOT EIGEN_WORK_DIR)
-  set(EIGEN_WORK_DIR ${CTEST_SCRIPT_DIRECTORY})
-endif(NOT EIGEN_WORK_DIR)
-
-if(NOT CTEST_SOURCE_DIRECTORY)
-  SET (CTEST_SOURCE_DIRECTORY "${EIGEN_WORK_DIR}/src")
-endif(NOT CTEST_SOURCE_DIRECTORY)
-
-if(NOT CTEST_BINARY_DIRECTORY)
-  SET (CTEST_BINARY_DIRECTORY "${EIGEN_WORK_DIR}/nightly_${EIGEN_CXX}")
-endif(NOT CTEST_BINARY_DIRECTORY)
-
-if(NOT EIGEN_MODE)
-  set(EIGEN_MODE Nightly)
-endif(NOT EIGEN_MODE)
-
-## mandatory variables (the default should be ok in most cases):
-
-if(NOT EIGEN_NO_UPDATE)
-  SET (CTEST_CVS_COMMAND "hg")
-  SET (CTEST_CVS_CHECKOUT "${CTEST_CVS_COMMAND} clone http://bitbucket.org/eigen/eigen \"${CTEST_SOURCE_DIRECTORY}\"")
-  SET(CTEST_BACKUP_AND_RESTORE TRUE) # the backup is CVS related ...
-endif(NOT EIGEN_NO_UPDATE)
-
-# which ctest command to use for running the dashboard
-SET (CTEST_COMMAND "${EIGEN_CMAKE_DIR}ctest -D ${EIGEN_MODE} --no-compress-output")
-if($ENV{EIGEN_CTEST_ARGS})
-SET (CTEST_COMMAND "${CTEST_COMMAND} $ENV{EIGEN_CTEST_ARGS}")
-endif($ENV{EIGEN_CTEST_ARGS})
-# what cmake command to use for configuring this dashboard
-SET (CTEST_CMAKE_COMMAND "${EIGEN_CMAKE_DIR}cmake -DEIGEN_LEAVE_TEST_IN_ALL_TARGET=ON")
-
-####################################################################
-# The values in this section are optional you can either
-# have them or leave them commented out
-####################################################################
-
-# this make sure we get consistent outputs
-SET($ENV{LC_MESSAGES} "en_EN")
-
-# should ctest wipe the binary tree before running
-SET(CTEST_START_WITH_EMPTY_BINARY_DIRECTORY TRUE)
-
-# raise the warning/error limit
-set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_WARNINGS "33331")
-set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_ERRORS "33331")
-
-# this is the initial cache to use for the binary tree, be careful to escape
-# any quotes inside of this string if you use it
-if(WIN32 AND NOT UNIX)
-  #message(SEND_ERROR "win32")
-  if(EIGEN_GENERATOR_TYPE)
-    set(CTEST_CMAKE_COMMAND "${CTEST_CMAKE_COMMAND} -G \"${EIGEN_GENERATOR_TYPE}\"")
-    SET (CTEST_INITIAL_CACHE "
-      CMAKE_BUILD_TYPE:STRING=Release
-      BUILDNAME:STRING=${EIGEN_BUILD_STRING}
-      SITE:STRING=${EIGEN_SITE}
-    ")
-  else(EIGEN_GENERATOR_TYPE)
-    set(CTEST_CMAKE_COMMAND "${CTEST_CMAKE_COMMAND} -G \"NMake Makefiles\" -DCMAKE_MAKE_PROGRAM=nmake")
-    SET (CTEST_INITIAL_CACHE "
-      MAKECOMMAND:STRING=nmake /i
-      CMAKE_MAKE_PROGRAM:FILEPATH=nmake
-      CMAKE_GENERATOR:INTERNAL=NMake Makefiles
-      CMAKE_BUILD_TYPE:STRING=Release
-      BUILDNAME:STRING=${EIGEN_BUILD_STRING}
-      SITE:STRING=${EIGEN_SITE}
-    ")
-  endif(EIGEN_GENERATOR_TYPE)
-else(WIN32 AND NOT UNIX)
-  SET (CTEST_INITIAL_CACHE "
-    BUILDNAME:STRING=${EIGEN_BUILD_STRING}
-    SITE:STRING=${EIGEN_SITE}
-  ")
-endif(WIN32 AND NOT UNIX)
-
-# set any extra environment variables to use during the execution of the script here:
-# setting this variable on windows machines causes trouble ...
-
-if(EIGEN_CXX AND NOT WIN32)
-  set(CTEST_ENVIRONMENT "CXX=${EIGEN_CXX}")
-endif(EIGEN_CXX AND NOT WIN32)
-
-if(DEFINED EIGEN_EXPLICIT_VECTORIZATION)
-  if(EIGEN_EXPLICIT_VECTORIZATION MATCHES SSE2)
-    set(CTEST_CMAKE_COMMAND "${CTEST_CMAKE_COMMAND} -DEIGEN_TEST_SSE2=ON")
-  elseif(EIGEN_EXPLICIT_VECTORIZATION MATCHES SSE3)
-    set(CTEST_CMAKE_COMMAND "${CTEST_CMAKE_COMMAND} -DEIGEN_TEST_SSE2=ON -DEIGEN_TEST_SSE3=ON")
-  elseif(EIGEN_EXPLICIT_VECTORIZATION MATCHES SSSE3)
-    set(CTEST_CMAKE_COMMAND "${CTEST_CMAKE_COMMAND} -DEIGEN_TEST_SSE2=ON -DEIGEN_TEST_SSE3=ON -DEIGEN_TEST_SSSE3=ON")  
-  elseif(EIGEN_EXPLICIT_VECTORIZATION MATCHES SSE4_1)
-    set(CTEST_CMAKE_COMMAND "${CTEST_CMAKE_COMMAND} -DEIGEN_TEST_SSE2=ON -DEIGEN_TEST_SSE3=ON -DEIGEN_TEST_SSSE3=ON -DEIGEN_TEST_SSE4_1=ON")
-  elseif(EIGEN_EXPLICIT_VECTORIZATION MATCHES SSE4_2)
-    set(CTEST_CMAKE_COMMAND "${CTEST_CMAKE_COMMAND} -DEIGEN_TEST_SSE2=ON -DEIGEN_TEST_SSE3=ON -DEIGEN_TEST_SSSE3=ON -DEIGEN_TEST_SSE4_1=ON -DEIGEN_TEST_SSE4_2=ON")
-  elseif(EIGEN_EXPLICIT_VECTORIZATION MATCHES Altivec)
-    set(CTEST_CMAKE_COMMAND "${CTEST_CMAKE_COMMAND} -DEIGEN_TEST_ALTIVEC=ON")
-  elseif(EIGEN_EXPLICIT_VECTORIZATION MATCHES novec)
-    set(CTEST_CMAKE_COMMAND "${CTEST_CMAKE_COMMAND} -DEIGEN_TEST_NO_EXPLICIT_VECTORIZATION=ON")
-  else(EIGEN_EXPLICIT_VECTORIZATION MATCHES SSE2)
-    message(FATAL_ERROR "Invalid value for EIGEN_EXPLICIT_VECTORIZATION (${EIGEN_EXPLICIT_VECTORIZATION}), must be: novec, SSE2, SSE3, Altivec")
-  endif(EIGEN_EXPLICIT_VECTORIZATION MATCHES SSE2)
-endif(DEFINED EIGEN_EXPLICIT_VECTORIZATION)
-
-if(DEFINED EIGEN_CMAKE_ARGS)
-  set(CTEST_CMAKE_COMMAND "${CTEST_CMAKE_COMMAND} ${EIGEN_CMAKE_ARGS}")
-endif(DEFINED EIGEN_CMAKE_ARGS)
diff --git a/test/triangular.cpp b/test/triangular.cpp
index 54320390b..b96856486 100644
--- a/test/triangular.cpp
+++ b/test/triangular.cpp
@@ -65,7 +65,7 @@ template<typename MatrixType> void triangular_square(const MatrixType& m)
 
   m1 = MatrixType::Random(rows, cols);
   for (int i=0; i<rows; ++i)
-    while (numext::abs2(m1(i,i))<1e-1) m1(i,i) = internal::random<Scalar>();
+    while (numext::abs2(m1(i,i))<RealScalar(1e-1)) m1(i,i) = internal::random<Scalar>();
 
   Transpose<MatrixType> trm4(m4);
   // test back and forward subsitution with a vector as the rhs
@@ -78,7 +78,7 @@ template<typename MatrixType> void triangular_square(const MatrixType& m)
   m3 = m1.template triangularView<Lower>();
   VERIFY(v2.isApprox(m3.conjugate() * (m1.conjugate().template triangularView<Lower>().solve(v2)), largerEps));
 
-  // test back and forward subsitution with a matrix as the rhs
+  // test back and forward substitution with a matrix as the rhs
   m3 = m1.template triangularView<Upper>();
   VERIFY(m2.isApprox(m3.adjoint() * (m1.adjoint().template triangularView<Lower>().solve(m2)), largerEps));
   m3 = m1.template triangularView<Lower>();
@@ -113,6 +113,21 @@ template<typename MatrixType> void triangular_square(const MatrixType& m)
   m3.setZero();
   m3.template triangularView<Upper>().setOnes();
   VERIFY_IS_APPROX(m2,m3);
+  
+  m1.setRandom();
+  m3 = m1.template triangularView<Upper>();
+  Matrix<Scalar, MatrixType::ColsAtCompileTime, Dynamic> m5(cols, internal::random<int>(1,20));  m5.setRandom();
+  Matrix<Scalar, Dynamic, MatrixType::RowsAtCompileTime> m6(internal::random<int>(1,20), rows);  m6.setRandom();
+  VERIFY_IS_APPROX(m1.template triangularView<Upper>() * m5, m3*m5);
+  VERIFY_IS_APPROX(m6*m1.template triangularView<Upper>(), m6*m3);
+
+  m1up = m1.template triangularView<Upper>();
+  VERIFY_IS_APPROX(m1.template selfadjointView<Upper>().template triangularView<Upper>().toDenseMatrix(), m1up);
+  VERIFY_IS_APPROX(m1up.template selfadjointView<Upper>().template triangularView<Upper>().toDenseMatrix(), m1up);
+  VERIFY_IS_APPROX(m1.template selfadjointView<Upper>().template triangularView<Lower>().toDenseMatrix(), m1up.adjoint());
+  VERIFY_IS_APPROX(m1up.template selfadjointView<Upper>().template triangularView<Lower>().toDenseMatrix(), m1up.adjoint());
+
+  VERIFY_IS_APPROX(m1.template selfadjointView<Upper>().diagonal(), m1.diagonal());
 
 }
 
diff --git a/test/umfpack_support.cpp b/test/umfpack_support.cpp
index 9eb84c14b..37ab11f0b 100644
--- a/test/umfpack_support.cpp
+++ b/test/umfpack_support.cpp
@@ -7,6 +7,7 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+#define EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS
 #include "sparse_solver.h"
 
 #include <Eigen/UmfPackSupport>
diff --git a/test/unalignedassert.cpp b/test/unalignedassert.cpp
index 601dbf214..731a08977 100644
--- a/test/unalignedassert.cpp
+++ b/test/unalignedassert.cpp
@@ -2,13 +2,39 @@
 // for linear algebra.
 //
 // Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+#if defined(EIGEN_TEST_PART_1)
+  // default
+#elif defined(EIGEN_TEST_PART_2)
+  #define EIGEN_MAX_STATIC_ALIGN_BYTES 16
+  #define EIGEN_MAX_ALIGN_BYTES 16
+#elif defined(EIGEN_TEST_PART_3)
+  #define EIGEN_MAX_STATIC_ALIGN_BYTES 32
+  #define EIGEN_MAX_ALIGN_BYTES 32
+#elif defined(EIGEN_TEST_PART_4)
+  #define EIGEN_MAX_STATIC_ALIGN_BYTES 64
+  #define EIGEN_MAX_ALIGN_BYTES 64
+#endif
+
 #include "main.h"
 
+typedef Matrix<float,  6,1> Vector6f;
+typedef Matrix<float,  8,1> Vector8f;
+typedef Matrix<float, 12,1> Vector12f;
+
+typedef Matrix<double, 5,1> Vector5d;
+typedef Matrix<double, 6,1> Vector6d;
+typedef Matrix<double, 7,1> Vector7d;
+typedef Matrix<double, 8,1> Vector8d;
+typedef Matrix<double, 9,1> Vector9d;
+typedef Matrix<double,10,1> Vector10d;
+typedef Matrix<double,12,1> Vector12d;
+
 struct TestNew1
 {
   MatrixXd m; // good: m will allocate its own array, taking care of alignment.
@@ -36,7 +62,7 @@ struct TestNew4
 struct TestNew5
 {
   EIGEN_MAKE_ALIGNED_OPERATOR_NEW
-  float f; // try the f at first -- the EIGEN_ALIGN16 attribute of m should make that still work
+  float f; // try the f at first -- the EIGEN_ALIGN_MAX attribute of m should make that still work
   Matrix4f m;
 };
 
@@ -63,13 +89,13 @@ void check_unalignedassert_good()
   delete[] y;
 }
 
-#if EIGEN_ALIGN_STATICALLY
+#if EIGEN_MAX_STATIC_ALIGN_BYTES>0
 template<typename T>
 void construct_at_boundary(int boundary)
 {
   char buf[sizeof(T)+256];
-  size_t _buf = reinterpret_cast<size_t>(buf);
-  _buf += (16 - (_buf % 16)); // make 16-byte aligned
+  size_t _buf = reinterpret_cast<internal::UIntPtr>(buf);
+  _buf += (EIGEN_MAX_ALIGN_BYTES - (_buf % EIGEN_MAX_ALIGN_BYTES)); // make 16/32/...-byte aligned
   _buf += boundary; // make exact boundary-aligned
   T *x = ::new(reinterpret_cast<void*>(_buf)) T;
   x[0].setZero(); // just in order to silence warnings
@@ -79,26 +105,36 @@ void construct_at_boundary(int boundary)
 
 void unalignedassert()
 {
-  #if EIGEN_ALIGN_STATICALLY
+#if EIGEN_MAX_STATIC_ALIGN_BYTES>0
   construct_at_boundary<Vector2f>(4);
   construct_at_boundary<Vector3f>(4);
   construct_at_boundary<Vector4f>(16);
+  construct_at_boundary<Vector6f>(4);
+  construct_at_boundary<Vector8f>(EIGEN_MAX_ALIGN_BYTES);
+  construct_at_boundary<Vector12f>(16);
   construct_at_boundary<Matrix2f>(16);
   construct_at_boundary<Matrix3f>(4);
-  construct_at_boundary<Matrix4f>(16);
+  construct_at_boundary<Matrix4f>(EIGEN_MAX_ALIGN_BYTES);
 
   construct_at_boundary<Vector2d>(16);
   construct_at_boundary<Vector3d>(4);
-  construct_at_boundary<Vector4d>(16);
-  construct_at_boundary<Matrix2d>(16);
+  construct_at_boundary<Vector4d>(EIGEN_MAX_ALIGN_BYTES);
+  construct_at_boundary<Vector5d>(4);
+  construct_at_boundary<Vector6d>(16);
+  construct_at_boundary<Vector7d>(4);
+  construct_at_boundary<Vector8d>(EIGEN_MAX_ALIGN_BYTES);
+  construct_at_boundary<Vector9d>(4);
+  construct_at_boundary<Vector10d>(16);
+  construct_at_boundary<Vector12d>(EIGEN_MAX_ALIGN_BYTES);
+  construct_at_boundary<Matrix2d>(EIGEN_MAX_ALIGN_BYTES);
   construct_at_boundary<Matrix3d>(4);
-  construct_at_boundary<Matrix4d>(16);
+  construct_at_boundary<Matrix4d>(EIGEN_MAX_ALIGN_BYTES);
 
   construct_at_boundary<Vector2cf>(16);
   construct_at_boundary<Vector3cf>(4);
-  construct_at_boundary<Vector2cd>(16);
+  construct_at_boundary<Vector2cd>(EIGEN_MAX_ALIGN_BYTES);
   construct_at_boundary<Vector3cd>(16);
-  #endif
+#endif
 
   check_unalignedassert_good<TestNew1>();
   check_unalignedassert_good<TestNew2>();
@@ -109,15 +145,32 @@ void unalignedassert()
   check_unalignedassert_good<TestNew6>();
   check_unalignedassert_good<Depends<true> >();
 
-#if EIGEN_ALIGN_STATICALLY
-  VERIFY_RAISES_ASSERT(construct_at_boundary<Vector4f>(8));
-  VERIFY_RAISES_ASSERT(construct_at_boundary<Matrix4f>(8));
-  VERIFY_RAISES_ASSERT(construct_at_boundary<Vector2d>(8));
-  VERIFY_RAISES_ASSERT(construct_at_boundary<Vector4d>(8));
-  VERIFY_RAISES_ASSERT(construct_at_boundary<Matrix2d>(8));
-  VERIFY_RAISES_ASSERT(construct_at_boundary<Matrix4d>(8));
-  VERIFY_RAISES_ASSERT(construct_at_boundary<Vector2cf>(8));
-  VERIFY_RAISES_ASSERT(construct_at_boundary<Vector2cd>(8));
+#if EIGEN_MAX_STATIC_ALIGN_BYTES>0
+  if(EIGEN_MAX_ALIGN_BYTES>=16)
+  {
+    VERIFY_RAISES_ASSERT(construct_at_boundary<Vector4f>(8));
+    VERIFY_RAISES_ASSERT(construct_at_boundary<Vector8f>(8));
+    VERIFY_RAISES_ASSERT(construct_at_boundary<Vector12f>(8));
+    VERIFY_RAISES_ASSERT(construct_at_boundary<Vector2d>(8));
+    VERIFY_RAISES_ASSERT(construct_at_boundary<Vector4d>(8));
+    VERIFY_RAISES_ASSERT(construct_at_boundary<Vector6d>(8));
+    VERIFY_RAISES_ASSERT(construct_at_boundary<Vector8d>(8));
+    VERIFY_RAISES_ASSERT(construct_at_boundary<Vector10d>(8));
+    VERIFY_RAISES_ASSERT(construct_at_boundary<Vector12d>(8));
+    // Complexes are disabled because the compiler might aggressively vectorize
+    // the initialization of complex coeffs to 0 before we can check for alignedness
+    //VERIFY_RAISES_ASSERT(construct_at_boundary<Vector2cf>(8));
+    VERIFY_RAISES_ASSERT(construct_at_boundary<Vector4i>(8));
+  }
+  for(int b=8; b<EIGEN_MAX_ALIGN_BYTES; b+=8)
+  {
+    if(b<32)  VERIFY_RAISES_ASSERT(construct_at_boundary<Vector8f>(b));
+    if(b<64)  VERIFY_RAISES_ASSERT(construct_at_boundary<Matrix4f>(b));
+    if(b<32)  VERIFY_RAISES_ASSERT(construct_at_boundary<Vector4d>(b));
+    if(b<32)  VERIFY_RAISES_ASSERT(construct_at_boundary<Matrix2d>(b));
+    if(b<128) VERIFY_RAISES_ASSERT(construct_at_boundary<Matrix4d>(b));
+    //if(b<32)  VERIFY_RAISES_ASSERT(construct_at_boundary<Vector2cd>(b));
+  }
 #endif
 }
 
diff --git a/test/unalignedcount.cpp b/test/unalignedcount.cpp
index ca7e159f3..d6ffeafdf 100644
--- a/test/unalignedcount.cpp
+++ b/test/unalignedcount.cpp
@@ -30,7 +30,14 @@ static int nb_storeu;
 
 void test_unalignedcount()
 {
-  #ifdef EIGEN_VECTORIZE_SSE
+  #if defined(EIGEN_VECTORIZE_AVX)
+  VectorXf a(40), b(40);
+  VERIFY_ALIGNED_UNALIGNED_COUNT(a += b, 10, 0, 5, 0);
+  VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,40) += b.segment(0,40), 5, 5, 5, 0);
+  VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,40) -= b.segment(0,40), 5, 5, 5, 0);
+  VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,40) *= 3.5, 5, 0, 5, 0);
+  VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,40) /= 3.5, 5, 0, 5, 0);
+  #elif defined(EIGEN_VECTORIZE_SSE)
   VectorXf a(40), b(40);
   VERIFY_ALIGNED_UNALIGNED_COUNT(a += b, 20, 0, 10, 0);
   VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,40) += b.segment(0,40), 10, 10, 10, 0);
diff --git a/test/upperbidiagonalization.cpp b/test/upperbidiagonalization.cpp
index d15bf588b..847b34b55 100644
--- a/test/upperbidiagonalization.cpp
+++ b/test/upperbidiagonalization.cpp
@@ -35,7 +35,7 @@ void test_upperbidiagonalization()
    CALL_SUBTEST_1( upperbidiag(MatrixXf(3,3)) );
    CALL_SUBTEST_2( upperbidiag(MatrixXd(17,12)) );
    CALL_SUBTEST_3( upperbidiag(MatrixXcf(20,20)) );
-   CALL_SUBTEST_4( upperbidiag(MatrixXcd(16,15)) );
+   CALL_SUBTEST_4( upperbidiag(Matrix<std::complex<double>,Dynamic,Dynamic,RowMajor>(16,15)) );
    CALL_SUBTEST_5( upperbidiag(Matrix<float,6,4>()) );
    CALL_SUBTEST_6( upperbidiag(Matrix<float,5,5>()) );
    CALL_SUBTEST_7( upperbidiag(Matrix<double,4,3>()) );
diff --git a/test/vectorization_logic.cpp b/test/vectorization_logic.cpp
index aee68a87f..83c1439ad 100644
--- a/test/vectorization_logic.cpp
+++ b/test/vectorization_logic.cpp
@@ -1,45 +1,51 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+#ifdef EIGEN_TEST_PART_1
+#define EIGEN_UNALIGNED_VECTORIZE 1
+#endif
+
+#ifdef EIGEN_TEST_PART_2
+#define EIGEN_UNALIGNED_VECTORIZE 0
+#endif
+
+#ifdef EIGEN_DEFAULT_TO_ROW_MAJOR
+#undef EIGEN_DEFAULT_TO_ROW_MAJOR
+#endif
 #define EIGEN_DEBUG_ASSIGN
 #include "main.h"
 #include <typeinfo>
 
-std::string demangle_traversal(int t)
-{
-  if(t==DefaultTraversal) return "DefaultTraversal";
-  if(t==LinearTraversal) return "LinearTraversal";
-  if(t==InnerVectorizedTraversal) return "InnerVectorizedTraversal";
-  if(t==LinearVectorizedTraversal) return "LinearVectorizedTraversal";
-  if(t==SliceVectorizedTraversal) return "SliceVectorizedTraversal";
-  return "?";
-}
-std::string demangle_unrolling(int t)
-{
-  if(t==NoUnrolling) return "NoUnrolling";
-  if(t==InnerUnrolling) return "InnerUnrolling";
-  if(t==CompleteUnrolling) return "CompleteUnrolling";
-  return "?";
-}
+using internal::demangle_flags;
+using internal::demangle_traversal;
+using internal::demangle_unrolling;
 
 template<typename Dst, typename Src>
 bool test_assign(const Dst&, const Src&, int traversal, int unrolling)
 {
-  internal::assign_traits<Dst,Src>::debug();
-  bool res = internal::assign_traits<Dst,Src>::Traversal==traversal
-          && internal::assign_traits<Dst,Src>::Unrolling==unrolling;
+  typedef internal::copy_using_evaluator_traits<internal::evaluator<Dst>,internal::evaluator<Src>, internal::assign_op<typename Dst::Scalar,typename Src::Scalar> > traits;
+  bool res = traits::Traversal==traversal;
+  if(unrolling==InnerUnrolling+CompleteUnrolling)
+    res = res && (int(traits::Unrolling)==InnerUnrolling || int(traits::Unrolling)==CompleteUnrolling);
+  else
+    res = res && int(traits::Unrolling)==unrolling;
   if(!res)
   {
+    std::cerr << "Src: " << demangle_flags(Src::Flags) << std::endl;
+    std::cerr << "     " << demangle_flags(internal::evaluator<Src>::Flags) << std::endl;
+    std::cerr << "Dst: " << demangle_flags(Dst::Flags) << std::endl;
+    std::cerr << "     " << demangle_flags(internal::evaluator<Dst>::Flags) << std::endl;
+    traits::debug();
     std::cerr << " Expected Traversal == " << demangle_traversal(traversal)
-              << " got " << demangle_traversal(internal::assign_traits<Dst,Src>::Traversal) << "\n";
+              << " got " << demangle_traversal(traits::Traversal) << "\n";
     std::cerr << " Expected Unrolling == " << demangle_unrolling(unrolling)
-              << " got " << demangle_unrolling(internal::assign_traits<Dst,Src>::Unrolling) << "\n";
+              << " got " << demangle_unrolling(traits::Unrolling) << "\n";
   }
   return res;
 }
@@ -47,15 +53,19 @@ bool test_assign(const Dst&, const Src&, int traversal, int unrolling)
 template<typename Dst, typename Src>
 bool test_assign(int traversal, int unrolling)
 {
-  internal::assign_traits<Dst,Src>::debug();
-  bool res = internal::assign_traits<Dst,Src>::Traversal==traversal
-          && internal::assign_traits<Dst,Src>::Unrolling==unrolling;
+  typedef internal::copy_using_evaluator_traits<internal::evaluator<Dst>,internal::evaluator<Src>, internal::assign_op<typename Dst::Scalar,typename Src::Scalar> > traits;
+  bool res = traits::Traversal==traversal && traits::Unrolling==unrolling;
   if(!res)
   {
+    std::cerr << "Src: " << demangle_flags(Src::Flags) << std::endl;
+    std::cerr << "     " << demangle_flags(internal::evaluator<Src>::Flags) << std::endl;
+    std::cerr << "Dst: " << demangle_flags(Dst::Flags) << std::endl;
+    std::cerr << "     " << demangle_flags(internal::evaluator<Dst>::Flags) << std::endl;
+    traits::debug();
     std::cerr << " Expected Traversal == " << demangle_traversal(traversal)
-              << " got " << demangle_traversal(internal::assign_traits<Dst,Src>::Traversal) << "\n";
+              << " got " << demangle_traversal(traits::Traversal) << "\n";
     std::cerr << " Expected Unrolling == " << demangle_unrolling(unrolling)
-              << " got " << demangle_unrolling(internal::assign_traits<Dst,Src>::Unrolling) << "\n";
+              << " got " << demangle_unrolling(traits::Unrolling) << "\n";
   }
   return res;
 }
@@ -63,10 +73,16 @@ bool test_assign(int traversal, int unrolling)
 template<typename Xpr>
 bool test_redux(const Xpr&, int traversal, int unrolling)
 {
-  typedef internal::redux_traits<internal::scalar_sum_op<typename Xpr::Scalar>,Xpr> traits;
+  typedef typename Xpr::Scalar Scalar;
+  typedef internal::redux_traits<internal::scalar_sum_op<Scalar,Scalar>,internal::redux_evaluator<Xpr> > traits;
+  
   bool res = traits::Traversal==traversal && traits::Unrolling==unrolling;
   if(!res)
   {
+    std::cerr << demangle_flags(Xpr::Flags) << std::endl;
+    std::cerr << demangle_flags(internal::evaluator<Xpr>::Flags) << std::endl;
+    traits::debug();
+    
     std::cerr << " Expected Traversal == " << demangle_traversal(traversal)
               << " got " << demangle_traversal(traits::Traversal) << "\n";
     std::cerr << " Expected Unrolling == " << demangle_unrolling(unrolling)
@@ -75,10 +91,16 @@ bool test_redux(const Xpr&, int traversal, int unrolling)
   return res;
 }
 
-template<typename Scalar, bool Enable = internal::packet_traits<Scalar>::Vectorizable> struct vectorization_logic
+template<typename Scalar, bool Enable = internal::packet_traits<Scalar>::Vectorizable>
+struct vectorization_logic
 {
+  typedef internal::packet_traits<Scalar> PacketTraits;
+  
+  typedef typename internal::packet_traits<Scalar>::type PacketType;
+  typedef typename internal::unpacket_traits<PacketType>::half HalfPacketType;
   enum {
-    PacketSize = internal::packet_traits<Scalar>::size
+    PacketSize = internal::unpacket_traits<PacketType>::size,
+    HalfPacketSize = internal::unpacket_traits<HalfPacketType>::size
   };
   static void run()
   {
@@ -90,8 +112,8 @@ template<typename Scalar, bool Enable = internal::packet_traits<Scalar>::Vectori
     typedef Matrix<Scalar,2*PacketSize,2*PacketSize> Matrix22;
     typedef Matrix<Scalar,(Matrix11::Flags&RowMajorBit)?16:4*PacketSize,(Matrix11::Flags&RowMajorBit)?4*PacketSize:16> Matrix44;
     typedef Matrix<Scalar,(Matrix11::Flags&RowMajorBit)?16:4*PacketSize,(Matrix11::Flags&RowMajorBit)?4*PacketSize:16,DontAlign|EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION> Matrix44u;
-    typedef Matrix<Scalar,4*PacketSize,16,ColMajor> Matrix44c;
-    typedef Matrix<Scalar,4*PacketSize,16,RowMajor> Matrix44r;
+    typedef Matrix<Scalar,4*PacketSize,4*PacketSize,ColMajor> Matrix44c;
+    typedef Matrix<Scalar,4*PacketSize,4*PacketSize,RowMajor> Matrix44r;
 
     typedef Matrix<Scalar,
         (PacketSize==8 ? 4 : PacketSize==4 ? 2 : PacketSize==2 ? 1 : /*PacketSize==1 ?*/ 1),
@@ -131,35 +153,63 @@ template<typename Scalar, bool Enable = internal::packet_traits<Scalar>::Vectori
       InnerVectorizedTraversal,InnerUnrolling));
 
     VERIFY(test_assign(Matrix44u(),Matrix44()+Matrix44(),
-      LinearTraversal,NoUnrolling));
+      EIGEN_UNALIGNED_VECTORIZE ? InnerVectorizedTraversal : LinearTraversal,
+      EIGEN_UNALIGNED_VECTORIZE ? InnerUnrolling : NoUnrolling));
+
+    VERIFY(test_assign(Matrix1(),Matrix1()+Matrix1(),
+      (Matrix1::InnerSizeAtCompileTime % PacketSize)==0 ? InnerVectorizedTraversal : LinearVectorizedTraversal,
+      CompleteUnrolling));
 
     VERIFY(test_assign(Matrix1u(),Matrix1()+Matrix1(),
-      LinearTraversal,CompleteUnrolling));
+      EIGEN_UNALIGNED_VECTORIZE ? ((Matrix1::InnerSizeAtCompileTime % PacketSize)==0 ? InnerVectorizedTraversal : LinearVectorizedTraversal)
+                                : LinearTraversal, CompleteUnrolling));
 
     VERIFY(test_assign(Matrix44c().col(1),Matrix44c().col(2)+Matrix44c().col(3),
       InnerVectorizedTraversal,CompleteUnrolling));
-    
+
     VERIFY(test_assign(Matrix44r().row(2),Matrix44r().row(1)+Matrix44r().row(1),
       InnerVectorizedTraversal,CompleteUnrolling));
-        
+
     if(PacketSize>1)
     {
       typedef Matrix<Scalar,3,3,ColMajor> Matrix33c;
+      typedef Matrix<Scalar,3,1,ColMajor> Vector3;
       VERIFY(test_assign(Matrix33c().row(2),Matrix33c().row(1)+Matrix33c().row(1),
         LinearTraversal,CompleteUnrolling));
+      VERIFY(test_assign(Vector3(),Vector3()+Vector3(),
+        EIGEN_UNALIGNED_VECTORIZE ? (HalfPacketSize==1 ? InnerVectorizedTraversal : LinearVectorizedTraversal) : (HalfPacketSize==1 ? InnerVectorizedTraversal : LinearTraversal), CompleteUnrolling));
       VERIFY(test_assign(Matrix33c().col(0),Matrix33c().col(1)+Matrix33c().col(1),
-        LinearTraversal,CompleteUnrolling));
-      
-      VERIFY(test_assign(Matrix3(),Matrix3().cwiseQuotient(Matrix3()),
+        EIGEN_UNALIGNED_VECTORIZE ? (HalfPacketSize==1 ? InnerVectorizedTraversal : LinearVectorizedTraversal) : (HalfPacketSize==1 ? SliceVectorizedTraversal : LinearTraversal),
+        ((!EIGEN_UNALIGNED_VECTORIZE) && HalfPacketSize==1) ? NoUnrolling : CompleteUnrolling));
+
+      VERIFY(test_assign(Matrix3(),Matrix3().cwiseProduct(Matrix3()),
         LinearVectorizedTraversal,CompleteUnrolling));
 
       VERIFY(test_assign(Matrix<Scalar,17,17>(),Matrix<Scalar,17,17>()+Matrix<Scalar,17,17>(),
-        LinearTraversal,NoUnrolling));
+        HalfPacketSize==1             ? InnerVectorizedTraversal  :
+        EIGEN_UNALIGNED_VECTORIZE ? LinearVectorizedTraversal :
+                                        LinearTraversal,
+        NoUnrolling));
 
-      VERIFY(test_assign(Matrix11(),Matrix<Scalar,17,17>().template block<PacketSize,PacketSize>(2,3)+Matrix<Scalar,17,17>().template block<PacketSize,PacketSize>(10,4),
-      DefaultTraversal,CompleteUnrolling));
+      VERIFY(test_assign(Matrix11(), Matrix11()+Matrix11(),InnerVectorizedTraversal,CompleteUnrolling));
+
+
+      VERIFY(test_assign(Matrix11(),Matrix<Scalar,17,17>().template block<PacketSize,PacketSize>(2,3)+Matrix<Scalar,17,17>().template block<PacketSize,PacketSize>(8,4),
+        (EIGEN_UNALIGNED_VECTORIZE) ? InnerVectorizedTraversal : DefaultTraversal, CompleteUnrolling|InnerUnrolling));
+
+      VERIFY(test_assign(Vector1(),Matrix11()*Vector1(),
+                         InnerVectorizedTraversal,CompleteUnrolling));
+
+      VERIFY(test_assign(Matrix11(),Matrix11().lazyProduct(Matrix11()),
+                         InnerVectorizedTraversal,InnerUnrolling+CompleteUnrolling));
     }
-    
+
+    VERIFY(test_redux(Vector1(),
+      LinearVectorizedTraversal,CompleteUnrolling));
+
+    VERIFY(test_redux(Matrix<Scalar,PacketSize,3>(),
+      LinearVectorizedTraversal,CompleteUnrolling));
+
     VERIFY(test_redux(Matrix3(),
       LinearVectorizedTraversal,CompleteUnrolling));
 
@@ -174,18 +224,19 @@ template<typename Scalar, bool Enable = internal::packet_traits<Scalar>::Vectori
 
     VERIFY(test_redux(Matrix44r().template block<1,2*PacketSize>(2,1),
       LinearVectorizedTraversal,CompleteUnrolling));
-    
+
     VERIFY((test_assign<
-            Map<Matrix22, Aligned, OuterStride<3*PacketSize> >,
+            Map<Matrix22, AlignedMax, OuterStride<3*PacketSize> >,
             Matrix22
             >(InnerVectorizedTraversal,CompleteUnrolling)));
 
     VERIFY((test_assign<
-            Map<Matrix22, Aligned, InnerStride<3*PacketSize> >,
-            Matrix22
-            >(DefaultTraversal,CompleteUnrolling)));
+            Map<Matrix<Scalar,EIGEN_PLAIN_ENUM_MAX(2,PacketSize),EIGEN_PLAIN_ENUM_MAX(2,PacketSize)>, AlignedMax, InnerStride<3*PacketSize> >,
+            Matrix<Scalar,EIGEN_PLAIN_ENUM_MAX(2,PacketSize),EIGEN_PLAIN_ENUM_MAX(2,PacketSize)>
+            >(DefaultTraversal,PacketSize>=8?InnerUnrolling:CompleteUnrolling)));
 
-    VERIFY((test_assign(Matrix11(), Matrix11()*Matrix11(), InnerVectorizedTraversal, CompleteUnrolling)));
+    VERIFY((test_assign(Matrix11(), Matrix<Scalar,PacketSize,EIGEN_PLAIN_ENUM_MIN(2,PacketSize)>()*Matrix<Scalar,EIGEN_PLAIN_ENUM_MIN(2,PacketSize),PacketSize>(),
+                        InnerVectorizedTraversal, CompleteUnrolling)));
     #endif
 
     VERIFY(test_assign(MatrixXX(10,10),MatrixXX(20,20).block(10,10,2,3),
@@ -193,12 +244,138 @@ template<typename Scalar, bool Enable = internal::packet_traits<Scalar>::Vectori
 
     VERIFY(test_redux(VectorX(10),
       LinearVectorizedTraversal,NoUnrolling));
+  }
+};
+
+template<typename Scalar> struct vectorization_logic<Scalar,false>
+{
+  static void run() {}
+};
+
+template<typename Scalar, bool Enable = !internal::is_same<typename internal::unpacket_traits<typename internal::packet_traits<Scalar>::type>::half,
+                                                           typename internal::packet_traits<Scalar>::type>::value >
+struct vectorization_logic_half
+{
+  typedef internal::packet_traits<Scalar> PacketTraits;
+  typedef typename internal::unpacket_traits<typename internal::packet_traits<Scalar>::type>::half PacketType;
+  enum {
+    PacketSize = internal::unpacket_traits<PacketType>::size
+  };
+  static void run()
+  {
+    
+    typedef Matrix<Scalar,PacketSize,1> Vector1;
+    typedef Matrix<Scalar,PacketSize,PacketSize> Matrix11;
+    typedef Matrix<Scalar,5*PacketSize,7,ColMajor> Matrix57;
+    typedef Matrix<Scalar,3*PacketSize,5,ColMajor> Matrix35;
+    typedef Matrix<Scalar,5*PacketSize,7,DontAlign|ColMajor> Matrix57u;
+//     typedef Matrix<Scalar,(Matrix11::Flags&RowMajorBit)?16:4*PacketSize,(Matrix11::Flags&RowMajorBit)?4*PacketSize:16> Matrix44;
+//     typedef Matrix<Scalar,(Matrix11::Flags&RowMajorBit)?16:4*PacketSize,(Matrix11::Flags&RowMajorBit)?4*PacketSize:16,DontAlign|EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION> Matrix44u;
+//     typedef Matrix<Scalar,4*PacketSize,4*PacketSize,ColMajor> Matrix44c;
+//     typedef Matrix<Scalar,4*PacketSize,4*PacketSize,RowMajor> Matrix44r;
+
+    typedef Matrix<Scalar,
+        (PacketSize==8 ? 4 : PacketSize==4 ? 2 : PacketSize==2 ? 1 : /*PacketSize==1 ?*/ 1),
+        (PacketSize==8 ? 2 : PacketSize==4 ? 2 : PacketSize==2 ? 2 : /*PacketSize==1 ?*/ 1)
+      > Matrix1;
+
+    typedef Matrix<Scalar,
+        (PacketSize==8 ? 4 : PacketSize==4 ? 2 : PacketSize==2 ? 1 : /*PacketSize==1 ?*/ 1),
+        (PacketSize==8 ? 2 : PacketSize==4 ? 2 : PacketSize==2 ? 2 : /*PacketSize==1 ?*/ 1),
+      DontAlign|((Matrix1::Flags&RowMajorBit)?RowMajor:ColMajor)> Matrix1u;
 
+    // this type is made such that it can only be vectorized when viewed as a linear 1D vector
+    typedef Matrix<Scalar,
+        (PacketSize==8 ? 4 : PacketSize==4 ? 6 : PacketSize==2 ? ((Matrix11::Flags&RowMajorBit)?2:3) : /*PacketSize==1 ?*/ 1),
+        (PacketSize==8 ? 6 : PacketSize==4 ? 2 : PacketSize==2 ? ((Matrix11::Flags&RowMajorBit)?3:2) : /*PacketSize==1 ?*/ 3)
+      > Matrix3;
     
+    #if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT
+    VERIFY(test_assign(Vector1(),Vector1(),
+      InnerVectorizedTraversal,CompleteUnrolling));
+    VERIFY(test_assign(Vector1(),Vector1()+Vector1(),
+      InnerVectorizedTraversal,CompleteUnrolling));
+    VERIFY(test_assign(Vector1(),Vector1().template segment<PacketSize>(0).derived(),
+      EIGEN_UNALIGNED_VECTORIZE ? InnerVectorizedTraversal : LinearVectorizedTraversal,CompleteUnrolling));
+    VERIFY(test_assign(Vector1(),Scalar(2.1)*Vector1()-Vector1(),
+      InnerVectorizedTraversal,CompleteUnrolling));
+    VERIFY(test_assign(Vector1(),(Scalar(2.1)*Vector1().template segment<PacketSize>(0)-Vector1().template segment<PacketSize>(0)).derived(),
+      EIGEN_UNALIGNED_VECTORIZE ? InnerVectorizedTraversal : LinearVectorizedTraversal,CompleteUnrolling));
+    VERIFY(test_assign(Vector1(),Vector1().cwiseProduct(Vector1()),
+      InnerVectorizedTraversal,CompleteUnrolling));
+    VERIFY(test_assign(Vector1(),Vector1().template cast<Scalar>(),
+      InnerVectorizedTraversal,CompleteUnrolling));
+
+
+    VERIFY(test_assign(Vector1(),Vector1(),
+      InnerVectorizedTraversal,CompleteUnrolling));
+    VERIFY(test_assign(Vector1(),Vector1()+Vector1(),
+      InnerVectorizedTraversal,CompleteUnrolling));
+    VERIFY(test_assign(Vector1(),Vector1().cwiseProduct(Vector1()),
+      InnerVectorizedTraversal,CompleteUnrolling));
+
+    VERIFY(test_assign(Matrix57(),Matrix57()+Matrix57(),
+      InnerVectorizedTraversal,InnerUnrolling));
+
+    VERIFY(test_assign(Matrix57u(),Matrix57()+Matrix57(),
+      EIGEN_UNALIGNED_VECTORIZE ? InnerVectorizedTraversal : LinearTraversal,
+      EIGEN_UNALIGNED_VECTORIZE ? InnerUnrolling : NoUnrolling));
+
+    VERIFY(test_assign(Matrix1u(),Matrix1()+Matrix1(),
+      EIGEN_UNALIGNED_VECTORIZE ? ((Matrix1::InnerSizeAtCompileTime % PacketSize)==0 ? InnerVectorizedTraversal : LinearVectorizedTraversal) : LinearTraversal,CompleteUnrolling));
+        
+    if(PacketSize>1)
+    {
+      typedef Matrix<Scalar,3,3,ColMajor> Matrix33c;
+      VERIFY(test_assign(Matrix33c().row(2),Matrix33c().row(1)+Matrix33c().row(1),
+        LinearTraversal,CompleteUnrolling));
+      VERIFY(test_assign(Matrix33c().col(0),Matrix33c().col(1)+Matrix33c().col(1),
+        EIGEN_UNALIGNED_VECTORIZE ? (PacketSize==1 ? InnerVectorizedTraversal : LinearVectorizedTraversal) : LinearTraversal,CompleteUnrolling));
+              
+      VERIFY(test_assign(Matrix3(),Matrix3().cwiseQuotient(Matrix3()),
+        PacketTraits::HasDiv ? LinearVectorizedTraversal : LinearTraversal,CompleteUnrolling));
+        
+      VERIFY(test_assign(Matrix<Scalar,17,17>(),Matrix<Scalar,17,17>()+Matrix<Scalar,17,17>(),
+        EIGEN_UNALIGNED_VECTORIZE ? (PacketSize==1 ? InnerVectorizedTraversal : LinearVectorizedTraversal) : LinearTraversal,
+        NoUnrolling));
+        
+      VERIFY(test_assign(Matrix11(),Matrix<Scalar,17,17>().template block<PacketSize,PacketSize>(2,3)+Matrix<Scalar,17,17>().template block<PacketSize,PacketSize>(8,4),
+        EIGEN_UNALIGNED_VECTORIZE ? InnerVectorizedTraversal : DefaultTraversal,PacketSize>4?InnerUnrolling:CompleteUnrolling));
+
+      VERIFY(test_assign(Vector1(),Matrix11()*Vector1(),
+                         InnerVectorizedTraversal,CompleteUnrolling));
+
+      VERIFY(test_assign(Matrix11(),Matrix11().lazyProduct(Matrix11()),
+                         InnerVectorizedTraversal,InnerUnrolling+CompleteUnrolling));
+    }
+    
+    VERIFY(test_redux(Vector1(),
+      LinearVectorizedTraversal,CompleteUnrolling));
+
+    VERIFY(test_redux(Matrix<Scalar,PacketSize,3>(),
+      LinearVectorizedTraversal,CompleteUnrolling));
+
+    VERIFY(test_redux(Matrix3(),
+      LinearVectorizedTraversal,CompleteUnrolling));
+
+    VERIFY(test_redux(Matrix35(),
+      LinearVectorizedTraversal,CompleteUnrolling));
+
+    VERIFY(test_redux(Matrix57().template block<PacketSize,3>(1,0),
+      DefaultTraversal,CompleteUnrolling));
+
+    VERIFY((test_assign<
+            Map<Matrix<Scalar,EIGEN_PLAIN_ENUM_MAX(2,PacketSize),EIGEN_PLAIN_ENUM_MAX(2,PacketSize)>, AlignedMax, InnerStride<3*PacketSize> >,
+            Matrix<Scalar,EIGEN_PLAIN_ENUM_MAX(2,PacketSize),EIGEN_PLAIN_ENUM_MAX(2,PacketSize)>
+            >(DefaultTraversal,CompleteUnrolling)));
+
+    VERIFY((test_assign(Matrix57(), Matrix<Scalar,5*PacketSize,3>()*Matrix<Scalar,3,7>(),
+                        InnerVectorizedTraversal, InnerUnrolling|CompleteUnrolling)));
+    #endif
   }
 };
 
-template<typename Scalar> struct vectorization_logic<Scalar,false>
+template<typename Scalar> struct vectorization_logic_half<Scalar,false>
 {
   static void run() {}
 };
@@ -208,27 +385,34 @@ void test_vectorization_logic()
 
 #ifdef EIGEN_VECTORIZE
 
+  CALL_SUBTEST( vectorization_logic<int>::run() );
   CALL_SUBTEST( vectorization_logic<float>::run() );
   CALL_SUBTEST( vectorization_logic<double>::run() );
   CALL_SUBTEST( vectorization_logic<std::complex<float> >::run() );
   CALL_SUBTEST( vectorization_logic<std::complex<double> >::run() );
   
+  CALL_SUBTEST( vectorization_logic_half<int>::run() );
+  CALL_SUBTEST( vectorization_logic_half<float>::run() );
+  CALL_SUBTEST( vectorization_logic_half<double>::run() );
+  CALL_SUBTEST( vectorization_logic_half<std::complex<float> >::run() );
+  CALL_SUBTEST( vectorization_logic_half<std::complex<double> >::run() );
+  
   if(internal::packet_traits<float>::Vectorizable)
   {
     VERIFY(test_assign(Matrix<float,3,3>(),Matrix<float,3,3>()+Matrix<float,3,3>(),
-      LinearTraversal,CompleteUnrolling));
+      EIGEN_UNALIGNED_VECTORIZE ? LinearVectorizedTraversal : LinearTraversal,CompleteUnrolling));
       
     VERIFY(test_redux(Matrix<float,5,2>(),
-      DefaultTraversal,CompleteUnrolling));
+      EIGEN_UNALIGNED_VECTORIZE ? LinearVectorizedTraversal : DefaultTraversal,CompleteUnrolling));
   }
   
   if(internal::packet_traits<double>::Vectorizable)
   {
     VERIFY(test_assign(Matrix<double,3,3>(),Matrix<double,3,3>()+Matrix<double,3,3>(),
-      LinearTraversal,CompleteUnrolling));
+      EIGEN_UNALIGNED_VECTORIZE ? LinearVectorizedTraversal : LinearTraversal,CompleteUnrolling));
     
     VERIFY(test_redux(Matrix<double,7,3>(),
-      DefaultTraversal,CompleteUnrolling));
+      EIGEN_UNALIGNED_VECTORIZE ? LinearVectorizedTraversal : DefaultTraversal,CompleteUnrolling));
   }
 #endif // EIGEN_VECTORIZE
 
diff --git a/test/vectorwiseop.cpp b/test/vectorwiseop.cpp
index 6cd1acdda..f3ab561ee 100644
--- a/test/vectorwiseop.cpp
+++ b/test/vectorwiseop.cpp
@@ -2,11 +2,13 @@
 // for linear algebra.
 //
 // Copyright (C) 2011 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+#define TEST_ENABLE_TEMPORARY_TRACKING
 #define EIGEN_NO_STATIC_ASSERT
 
 #include "main.h"
@@ -101,16 +103,28 @@ template<typename ArrayType> void vectorwiseop_array(const ArrayType& m)
 
   VERIFY_RAISES_ASSERT(m2.rowwise() /= rowvec.transpose());
   VERIFY_RAISES_ASSERT(m1.rowwise() / rowvec.transpose());
-  
+
   m2 = m1;
   // yes, there might be an aliasing issue there but ".rowwise() /="
-  // is suppposed to evaluate " m2.colwise().sum()" into to temporary to avoid
-  // evaluating the reducions multiple times
+  // is supposed to evaluate " m2.colwise().sum()" into a temporary to avoid
+  // evaluating the reduction multiple times
   if(ArrayType::RowsAtCompileTime>2 || ArrayType::RowsAtCompileTime==Dynamic)
   {
     m2.rowwise() /= m2.colwise().sum();
     VERIFY_IS_APPROX(m2, m1.rowwise() / m1.colwise().sum());
   }
+
+  // all/any
+  Array<bool,Dynamic,Dynamic> mb(rows,cols);
+  mb = (m1.real()<=0.7).colwise().all();
+  VERIFY( (mb.col(c) == (m1.real().col(c)<=0.7).all()).all() );
+  mb = (m1.real()<=0.7).rowwise().all();
+  VERIFY( (mb.row(r) == (m1.real().row(r)<=0.7).all()).all() );
+
+  mb = (m1.real()>=0.7).colwise().any();
+  VERIFY( (mb.col(c) == (m1.real().col(c)>=0.7).any()).all() );
+  mb = (m1.real()>=0.7).rowwise().any();
+  VERIFY( (mb.row(r) == (m1.real().row(r)>=0.7).any()).all() );
 }
 
 template<typename MatrixType> void vectorwiseop_matrix(const MatrixType& m)
@@ -144,16 +158,22 @@ template<typename MatrixType> void vectorwiseop_matrix(const MatrixType& m)
   VERIFY_IS_APPROX(m2, m1.colwise() + colvec);
   VERIFY_IS_APPROX(m2.col(c), m1.col(c) + colvec);
 
-  VERIFY_RAISES_ASSERT(m2.colwise() += colvec.transpose());
-  VERIFY_RAISES_ASSERT(m1.colwise() + colvec.transpose());
+  if(rows>1)
+  {
+    VERIFY_RAISES_ASSERT(m2.colwise() += colvec.transpose());
+    VERIFY_RAISES_ASSERT(m1.colwise() + colvec.transpose());
+  }
 
   m2 = m1;
   m2.rowwise() += rowvec;
   VERIFY_IS_APPROX(m2, m1.rowwise() + rowvec);
   VERIFY_IS_APPROX(m2.row(r), m1.row(r) + rowvec);
 
-  VERIFY_RAISES_ASSERT(m2.rowwise() += rowvec.transpose());
-  VERIFY_RAISES_ASSERT(m1.rowwise() + rowvec.transpose());
+  if(cols>1)
+  {
+    VERIFY_RAISES_ASSERT(m2.rowwise() += rowvec.transpose());
+    VERIFY_RAISES_ASSERT(m1.rowwise() + rowvec.transpose());
+  }
 
   // test substraction
 
@@ -162,29 +182,43 @@ template<typename MatrixType> void vectorwiseop_matrix(const MatrixType& m)
   VERIFY_IS_APPROX(m2, m1.colwise() - colvec);
   VERIFY_IS_APPROX(m2.col(c), m1.col(c) - colvec);
 
-  VERIFY_RAISES_ASSERT(m2.colwise() -= colvec.transpose());
-  VERIFY_RAISES_ASSERT(m1.colwise() - colvec.transpose());
+  if(rows>1)
+  {
+    VERIFY_RAISES_ASSERT(m2.colwise() -= colvec.transpose());
+    VERIFY_RAISES_ASSERT(m1.colwise() - colvec.transpose());
+  }
 
   m2 = m1;
   m2.rowwise() -= rowvec;
   VERIFY_IS_APPROX(m2, m1.rowwise() - rowvec);
   VERIFY_IS_APPROX(m2.row(r), m1.row(r) - rowvec);
 
-  VERIFY_RAISES_ASSERT(m2.rowwise() -= rowvec.transpose());
-  VERIFY_RAISES_ASSERT(m1.rowwise() - rowvec.transpose());
-  
+  if(cols>1)
+  {
+    VERIFY_RAISES_ASSERT(m2.rowwise() -= rowvec.transpose());
+    VERIFY_RAISES_ASSERT(m1.rowwise() - rowvec.transpose());
+  }
+
   // test norm
   rrres = m1.colwise().norm();
   VERIFY_IS_APPROX(rrres(c), m1.col(c).norm());
   rcres = m1.rowwise().norm();
   VERIFY_IS_APPROX(rcres(r), m1.row(r).norm());
-  
+
+  VERIFY_IS_APPROX(m1.cwiseAbs().colwise().sum(), m1.colwise().template lpNorm<1>());
+  VERIFY_IS_APPROX(m1.cwiseAbs().rowwise().sum(), m1.rowwise().template lpNorm<1>());
+  VERIFY_IS_APPROX(m1.cwiseAbs().colwise().maxCoeff(), m1.colwise().template lpNorm<Infinity>());
+  VERIFY_IS_APPROX(m1.cwiseAbs().rowwise().maxCoeff(), m1.rowwise().template lpNorm<Infinity>());
+
+  // regression for bug 1158
+  VERIFY_IS_APPROX(m1.cwiseAbs().colwise().sum().x(), m1.col(0).cwiseAbs().sum());
+
   // test normalized
   m2 = m1.colwise().normalized();
   VERIFY_IS_APPROX(m2.col(c), m1.col(c).normalized());
   m2 = m1.rowwise().normalized();
   VERIFY_IS_APPROX(m2.row(r), m1.row(r).normalized());
-  
+
   // test normalize
   m2 = m1;
   m2.colwise().normalize();
@@ -192,14 +226,27 @@ template<typename MatrixType> void vectorwiseop_matrix(const MatrixType& m)
   m2 = m1;
   m2.rowwise().normalize();
   VERIFY_IS_APPROX(m2.row(r), m1.row(r).normalized());
+
+  // test with partial reduction of products
+  Matrix<Scalar,MatrixType::RowsAtCompileTime,MatrixType::RowsAtCompileTime> m1m1 = m1 * m1.transpose();
+  VERIFY_IS_APPROX( (m1 * m1.transpose()).colwise().sum(), m1m1.colwise().sum());
+  Matrix<Scalar,1,MatrixType::RowsAtCompileTime> tmp(rows);
+  VERIFY_EVALUATION_COUNT( tmp = (m1 * m1.transpose()).colwise().sum(), 1);
+
+  m2 = m1.rowwise() - (m1.colwise().sum()/RealScalar(m1.rows())).eval();
+  m1 = m1.rowwise() - (m1.colwise().sum()/RealScalar(m1.rows()));
+  VERIFY_IS_APPROX( m1, m2 );
+  VERIFY_EVALUATION_COUNT( m2 = (m1.rowwise() - m1.colwise().sum()/RealScalar(m1.rows())), (MatrixType::RowsAtCompileTime!=1 ? 1 : 0) );
 }
 
 void test_vectorwiseop()
 {
-  CALL_SUBTEST_1(vectorwiseop_array(Array22cd()));
-  CALL_SUBTEST_2(vectorwiseop_array(Array<double, 3, 2>()));
-  CALL_SUBTEST_3(vectorwiseop_array(ArrayXXf(3, 4)));
-  CALL_SUBTEST_4(vectorwiseop_matrix(Matrix4cf()));
-  CALL_SUBTEST_5(vectorwiseop_matrix(Matrix<float,4,5>()));
-  CALL_SUBTEST_6(vectorwiseop_matrix(MatrixXd(7,2)));
+  CALL_SUBTEST_1( vectorwiseop_array(Array22cd()) );
+  CALL_SUBTEST_2( vectorwiseop_array(Array<double, 3, 2>()) );
+  CALL_SUBTEST_3( vectorwiseop_array(ArrayXXf(3, 4)) );
+  CALL_SUBTEST_4( vectorwiseop_matrix(Matrix4cf()) );
+  CALL_SUBTEST_5( vectorwiseop_matrix(Matrix<float,4,5>()) );
+  CALL_SUBTEST_6( vectorwiseop_matrix(MatrixXd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+  CALL_SUBTEST_7( vectorwiseop_matrix(VectorXd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+  CALL_SUBTEST_7( vectorwiseop_matrix(RowVectorXd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
 }
diff --git a/test/visitor.cpp b/test/visitor.cpp
index 39a5d6b5f..844170ec6 100644
--- a/test/visitor.cpp
+++ b/test/visitor.cpp
@@ -55,6 +55,11 @@ template<typename MatrixType> void matrixVisitor(const MatrixType& p)
   VERIFY_IS_APPROX(maxc, eigen_maxc);
   VERIFY_IS_APPROX(minc, m.minCoeff());
   VERIFY_IS_APPROX(maxc, m.maxCoeff());
+
+  eigen_maxc = (m.adjoint()*m).maxCoeff(&eigen_maxrow,&eigen_maxcol);
+  eigen_maxc = (m.adjoint()*m).eval().maxCoeff(&maxrow,&maxcol);
+  VERIFY(maxrow == eigen_maxrow);
+  VERIFY(maxcol == eigen_maxcol);
 }
 
 template<typename VectorType> void vectorVisitor(const VectorType& w)
diff --git a/test/zerosized.cpp b/test/zerosized.cpp
index da7dd0481..477ff0070 100644
--- a/test/zerosized.cpp
+++ b/test/zerosized.cpp
@@ -25,6 +25,7 @@ template<typename MatrixType> void zeroReduction(const MatrixType& m) {
 template<typename MatrixType> void zeroSizedMatrix()
 {
   MatrixType t1;
+  typedef typename MatrixType::Scalar Scalar;
 
   if (MatrixType::SizeAtCompileTime == Dynamic || MatrixType::SizeAtCompileTime == 0)
   {
@@ -37,7 +38,7 @@ template<typename MatrixType> void zeroSizedMatrix()
     if (MatrixType::RowsAtCompileTime == Dynamic && MatrixType::ColsAtCompileTime == Dynamic)
     {
 
-      MatrixType t2(0, 0);
+      MatrixType t2(0, 0), t3(t1);
       VERIFY(t2.rows() == 0);
       VERIFY(t2.cols() == 0);
 
@@ -45,6 +46,23 @@ template<typename MatrixType> void zeroSizedMatrix()
       VERIFY(t1==t2);
     }
   }
+
+  if(MatrixType::MaxColsAtCompileTime!=0 && MatrixType::MaxRowsAtCompileTime!=0)
+  {
+    Index rows = MatrixType::RowsAtCompileTime==Dynamic ? internal::random<Index>(1,10) : Index(MatrixType::RowsAtCompileTime);
+    Index cols = MatrixType::ColsAtCompileTime==Dynamic ? internal::random<Index>(1,10) : Index(MatrixType::ColsAtCompileTime);
+    MatrixType m(rows,cols);
+    zeroReduction(m.template block<0,MatrixType::ColsAtCompileTime>(0,0,0,cols));
+    zeroReduction(m.template block<MatrixType::RowsAtCompileTime,0>(0,0,rows,0));
+    zeroReduction(m.template block<0,1>(0,0));
+    zeroReduction(m.template block<1,0>(0,0));
+    Matrix<Scalar,Dynamic,Dynamic> prod = m.template block<MatrixType::RowsAtCompileTime,0>(0,0,rows,0) * m.template block<0,MatrixType::ColsAtCompileTime>(0,0,0,cols);
+    VERIFY(prod.rows()==rows && prod.cols()==cols);
+    VERIFY(prod.isZero());
+    prod = m.template block<1,0>(0,0) * m.template block<0,1>(0,0);
+    VERIFY(prod.size()==1);
+    VERIFY(prod.isZero());
+  }
 }
 
 template<typename VectorType> void zeroSizedVector()
diff --git a/unsupported/Eigen/AdolcForward b/unsupported/Eigen/AdolcForward
index 2627decd0..15f5f0731 100644
--- a/unsupported/Eigen/AdolcForward
+++ b/unsupported/Eigen/AdolcForward
@@ -25,7 +25,7 @@
 #ifndef NUMBER_DIRECTIONS
 # define NUMBER_DIRECTIONS 2
 #endif
-#include <adolc/adouble.h>
+#include <adolc/adtl.h>
 
 // adolc defines some very stupid macros:
 #if defined(malloc)
diff --git a/unsupported/Eigen/AlignedVector3 b/unsupported/Eigen/AlignedVector3
index 7b45e6cce..47a86d4c0 100644
--- a/unsupported/Eigen/AlignedVector3
+++ b/unsupported/Eigen/AlignedVector3
@@ -57,6 +57,11 @@ template<typename _Scalar> class AlignedVector3
 
     inline Index rows() const { return 3; }
     inline Index cols() const { return 1; }
+    
+    Scalar* data() { return m_coeffs.data(); }
+    const Scalar* data() const { return m_coeffs.data(); }
+    Index innerStride() const { return 1; }
+    Index outerStride() const { return 3; }
 
     inline const Scalar& coeff(Index row, Index col) const
     { return m_coeffs.coeff(row, col); }
@@ -100,7 +105,7 @@ template<typename _Scalar> class AlignedVector3
     };
 
     template<typename Derived>
-    inline explicit AlignedVector3(const MatrixBase<Derived>& other)
+    inline AlignedVector3(const MatrixBase<Derived>& other)
     {
       generic_assign_selector<Derived>::run(*this,other.derived());
     }
@@ -108,6 +113,12 @@ template<typename _Scalar> class AlignedVector3
     inline AlignedVector3& operator=(const AlignedVector3& other)
     { m_coeffs = other.m_coeffs; return *this; }
 
+    template <typename Derived>
+    inline AlignedVector3& operator=(const MatrixBase<Derived>& other)
+    {
+      generic_assign_selector<Derived>::run(*this,other.derived());
+      return *this;
+    }
 
     inline AlignedVector3 operator+(const AlignedVector3& other) const
     { return AlignedVector3(m_coeffs + other.m_coeffs); }
@@ -148,7 +159,7 @@ template<typename _Scalar> class AlignedVector3
       m_coeffs /= norm();
     }
 
-    inline AlignedVector3 normalized()
+    inline AlignedVector3 normalized() const
     {
       return AlignedVector3(m_coeffs / norm());
     }
@@ -177,12 +188,35 @@ template<typename _Scalar> class AlignedVector3
     }
 
     template<typename Derived>
-    inline bool isApprox(const MatrixBase<Derived>& other, RealScalar eps=NumTraits<Scalar>::dummy_precision()) const
+    inline bool isApprox(const MatrixBase<Derived>& other, const RealScalar& eps=NumTraits<Scalar>::dummy_precision()) const
     {
       return m_coeffs.template head<3>().isApprox(other,eps);
     }
+    
+    CoeffType& coeffs() { return m_coeffs; }
+    const CoeffType& coeffs() const { return m_coeffs; }
 };
 
+namespace internal {
+
+template<typename _Scalar>
+struct eval<AlignedVector3<_Scalar>, Dense>
+{
+ typedef const AlignedVector3<_Scalar>& type;
+};
+
+template<typename Scalar>
+struct evaluator<AlignedVector3<Scalar> >
+  : evaluator<Matrix<Scalar,4,1> >
+{
+  typedef AlignedVector3<Scalar> XprType;
+  typedef evaluator<Matrix<Scalar,4,1> > Base;
+  
+  evaluator(const XprType &m) : Base(m.coeffs()) {}  
+};
+
+}
+
 //@}
 
 }
diff --git a/unsupported/Eigen/CMakeLists.txt b/unsupported/Eigen/CMakeLists.txt
index e1fbf97e2..631a06014 100644
--- a/unsupported/Eigen/CMakeLists.txt
+++ b/unsupported/Eigen/CMakeLists.txt
@@ -1,11 +1,32 @@
-set(Eigen_HEADERS AdolcForward AlignedVector3 ArpackSupport AutoDiff BVH FFT IterativeSolvers KroneckerProduct LevenbergMarquardt
-                  MatrixFunctions MoreVectorization MPRealSupport NonLinearOptimization NumericalDiff OpenGLSupport Polynomials
-                  Skyline SparseExtra Splines
-   )
+set(Eigen_HEADERS 
+  AdolcForward
+  AlignedVector3
+  ArpackSupport
+  AutoDiff
+  BVH
+  EulerAngles
+  FFT
+  IterativeSolvers 
+  KroneckerProduct
+  LevenbergMarquardt
+  MatrixFunctions 
+  MoreVectorization
+  MPRealSupport
+  NonLinearOptimization
+  NumericalDiff
+  OpenGLSupport
+  Polynomials
+  Skyline 
+  SparseExtra
+  SpecialFunctions
+  Splines
+  )
 
 install(FILES
   ${Eigen_HEADERS}
   DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen COMPONENT Devel
   )
 
-add_subdirectory(src)
+install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen COMPONENT Devel FILES_MATCHING PATTERN "*.h")
+
+add_subdirectory(CXX11)
diff --git a/unsupported/Eigen/CXX11/CMakeLists.txt b/unsupported/Eigen/CXX11/CMakeLists.txt
new file mode 100644
index 000000000..385ed240c
--- /dev/null
+++ b/unsupported/Eigen/CXX11/CMakeLists.txt
@@ -0,0 +1,8 @@
+set(Eigen_CXX11_HEADERS Tensor TensorSymmetry ThreadPool)
+
+install(FILES
+  ${Eigen_CXX11_HEADERS}
+  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11 COMPONENT Devel
+  )
+
+install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11 COMPONENT Devel FILES_MATCHING PATTERN "*.h")
diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
new file mode 100644
index 000000000..7ecb4c74d
--- /dev/null
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -0,0 +1,152 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+//#ifndef EIGEN_CXX11_TENSOR_MODULE
+//#define EIGEN_CXX11_TENSOR_MODULE
+
+#include "../../../Eigen/Core"
+
+#ifdef EIGEN_USE_SYCL
+#undef min
+#undef max
+#undef isnan
+#undef isinf
+#undef isfinite
+#include <SYCL/sycl.hpp>
+#include <map>
+#include <memory>
+#include <utility>
+#endif
+
+#include <Eigen/src/Core/util/DisableStupidWarnings.h>
+
+#include "../SpecialFunctions"
+#include "src/util/CXX11Meta.h"
+#include "src/util/MaxSizeVector.h"
+
+/** \defgroup CXX11_Tensor_Module Tensor Module
+  *
+  * This module provides a Tensor class for storing arbitrarily indexed
+  * objects.
+  *
+  * \code
+  * #include <Eigen/CXX11/Tensor>
+  * \endcode
+  */
+
+#include <cmath>
+#include <cstddef>
+#include <cstring>
+
+#ifdef _WIN32
+typedef __int16 int16_t;
+typedef unsigned __int16 uint16_t;
+typedef __int32 int32_t;
+typedef unsigned __int32 uint32_t;
+typedef __int64 int64_t;
+typedef unsigned __int64 uint64_t;
+#else
+#include <stdint.h>
+#endif
+
+#if __cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900
+#include <random>
+#endif
+
+#ifdef _WIN32
+#include <windows.h>
+#elif defined(__APPLE__)
+#include <mach/mach_time.h>
+#else
+#include <time.h>
+#endif
+
+#ifdef EIGEN_USE_THREADS
+#include "ThreadPool"
+#endif
+
+#ifdef EIGEN_USE_GPU
+#include <iostream>
+#include <cuda_runtime.h>
+#if __cplusplus >= 201103L
+#include <atomic>
+#include <unistd.h>
+#endif
+#endif
+
+#include "src/Tensor/TensorMacros.h"
+#include "src/Tensor/TensorForwardDeclarations.h"
+#include "src/Tensor/TensorMeta.h"
+#include "src/Tensor/TensorFunctors.h"
+#include "src/Tensor/TensorCostModel.h"
+#include "src/Tensor/TensorDeviceDefault.h"
+#include "src/Tensor/TensorDeviceThreadPool.h"
+#include "src/Tensor/TensorDeviceCuda.h"
+#include "src/Tensor/TensorDeviceSycl.h"
+#include "src/Tensor/TensorIndexList.h"
+#include "src/Tensor/TensorDimensionList.h"
+#include "src/Tensor/TensorDimensions.h"
+#include "src/Tensor/TensorInitializer.h"
+#include "src/Tensor/TensorTraits.h"
+#include "src/Tensor/TensorRandom.h"
+#include "src/Tensor/TensorUInt128.h"
+#include "src/Tensor/TensorIntDiv.h"
+#include "src/Tensor/TensorGlobalFunctions.h"
+
+#include "src/Tensor/TensorBase.h"
+
+#include "src/Tensor/TensorEvaluator.h"
+#include "src/Tensor/TensorExpr.h"
+#include "src/Tensor/TensorReduction.h"
+#include "src/Tensor/TensorReductionCuda.h"
+#include "src/Tensor/TensorArgMax.h"
+#include "src/Tensor/TensorConcatenation.h"
+#include "src/Tensor/TensorContractionMapper.h"
+#include "src/Tensor/TensorContractionBlocking.h"
+#include "src/Tensor/TensorContraction.h"
+#include "src/Tensor/TensorContractionThreadPool.h"
+#include "src/Tensor/TensorContractionCuda.h"
+#include "src/Tensor/TensorConversion.h"
+#include "src/Tensor/TensorConvolution.h"
+#include "src/Tensor/TensorFFT.h"
+#include "src/Tensor/TensorPatch.h"
+#include "src/Tensor/TensorImagePatch.h"
+#include "src/Tensor/TensorVolumePatch.h"
+#include "src/Tensor/TensorBroadcasting.h"
+#include "src/Tensor/TensorChipping.h"
+#include "src/Tensor/TensorInflation.h"
+#include "src/Tensor/TensorLayoutSwap.h"
+#include "src/Tensor/TensorMorphing.h"
+#include "src/Tensor/TensorPadding.h"
+#include "src/Tensor/TensorReverse.h"
+#include "src/Tensor/TensorShuffling.h"
+#include "src/Tensor/TensorStriding.h"
+#include "src/Tensor/TensorCustomOp.h"
+#include "src/Tensor/TensorEvalTo.h"
+#include "src/Tensor/TensorForcedEval.h"
+#include "src/Tensor/TensorGenerator.h"
+#include "src/Tensor/TensorAssign.h"
+#include "src/Tensor/TensorScan.h"
+
+#include "src/Tensor/TensorSycl.h"
+#include "src/Tensor/TensorExecutor.h"
+#include "src/Tensor/TensorDevice.h"
+
+#include "src/Tensor/TensorStorage.h"
+#include "src/Tensor/Tensor.h"
+#include "src/Tensor/TensorFixedSize.h"
+#include "src/Tensor/TensorMap.h"
+#include "src/Tensor/TensorRef.h"
+
+#include "src/Tensor/TensorIO.h"
+
+#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
+
+//#endif // EIGEN_CXX11_TENSOR_MODULE
diff --git a/unsupported/Eigen/CXX11/TensorSymmetry b/unsupported/Eigen/CXX11/TensorSymmetry
new file mode 100644
index 000000000..fb1b0c0fb
--- /dev/null
+++ b/unsupported/Eigen/CXX11/TensorSymmetry
@@ -0,0 +1,42 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSORSYMMETRY_MODULE
+#define EIGEN_CXX11_TENSORSYMMETRY_MODULE
+
+#include <unsupported/Eigen/CXX11/Tensor>
+
+#include <Eigen/src/Core/util/DisableStupidWarnings.h>
+
+#include "src/util/CXX11Meta.h"
+
+/** \defgroup CXX11_TensorSymmetry_Module Tensor Symmetry Module
+  *
+  * This module provides a classes that allow for the definition of
+  * symmetries w.r.t. tensor indices.
+  *
+  * Including this module will implicitly include the Tensor module.
+  *
+  * \code
+  * #include <Eigen/TensorSymmetry>
+  * \endcode
+  */
+
+#include "src/TensorSymmetry/util/TemplateGroupTheory.h"
+#include "src/TensorSymmetry/Symmetry.h"
+#include "src/TensorSymmetry/StaticSymmetry.h"
+#include "src/TensorSymmetry/DynamicSymmetry.h"
+
+#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
+
+#endif // EIGEN_CXX11_TENSORSYMMETRY_MODULE
+
+/*
+ * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
+ */
diff --git a/unsupported/Eigen/CXX11/ThreadPool b/unsupported/Eigen/CXX11/ThreadPool
new file mode 100644
index 000000000..09d637e9a
--- /dev/null
+++ b/unsupported/Eigen/CXX11/ThreadPool
@@ -0,0 +1,65 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_MODULE
+#define EIGEN_CXX11_THREADPOOL_MODULE
+
+#include "../../../Eigen/Core"
+
+#include <Eigen/src/Core/util/DisableStupidWarnings.h>
+
+/** \defgroup CXX11_ThreadPool_Module C++11 ThreadPool Module
+  *
+  * This module provides 2 threadpool implementations
+  *  - a simple reference implementation
+  *  - a faster non blocking implementation
+  *
+  * This module requires C++11.
+  *
+  * \code
+  * #include <Eigen/CXX11/ThreadPool>
+  * \endcode
+  */
+
+
+// The code depends on CXX11, so only include the module if the
+// compiler supports it.
+#if __cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900
+#include <cstddef>
+#include <cstring>
+#include <stdint.h>
+#include <time.h>
+
+#include <vector>
+#include <atomic>
+#include <condition_variable>
+#include <deque>
+#include <mutex>
+#include <thread>
+#include <functional>
+#include <memory>
+
+#include "src/util/CXX11Meta.h"
+#include "src/util/MaxSizeVector.h"
+
+#include "src/ThreadPool/ThreadLocal.h"
+#include "src/ThreadPool/ThreadYield.h"
+#include "src/ThreadPool/EventCount.h"
+#include "src/ThreadPool/RunQueue.h"
+#include "src/ThreadPool/ThreadPoolInterface.h"
+#include "src/ThreadPool/ThreadEnvironment.h"
+#include "src/ThreadPool/SimpleThreadPool.h"
+#include "src/ThreadPool/NonBlockingThreadPool.h"
+
+#endif
+
+#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
+
+#endif // EIGEN_CXX11_THREADPOOL_MODULE
+
diff --git a/unsupported/Eigen/CXX11/src/Tensor/README.md b/unsupported/Eigen/CXX11/src/Tensor/README.md
new file mode 100644
index 000000000..02146527b
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/README.md
@@ -0,0 +1,1757 @@
+# Eigen Tensors
+
+Tensors are multidimensional arrays of elements. Elements are typically scalars,
+but more complex types such as strings are also supported.
+
+[TOC]
+
+## Tensor Classes
+
+You can manipulate a tensor with one of the following classes.  They all are in
+the namespace ```::Eigen.```
+
+
+### Class Tensor<data_type, rank>
+
+This is the class to use to create a tensor and allocate memory for it.  The
+class is templatized with the tensor datatype, such as float or int, and the
+tensor rank.  The rank is the number of dimensions, for example rank 2 is a
+matrix.
+
+Tensors of this class are resizable.  For example, if you assign a tensor of a
+different size to a Tensor, that tensor is resized to match its new value.
+
+#### Constructor Tensor<data_type, rank>(size0, size1, ...)
+
+Constructor for a Tensor.  The constructor must be passed ```rank``` integers
+indicating the sizes of the instance along each of the the ```rank```
+dimensions.
+
+    // Create a tensor of rank 3 of sizes 2, 3, 4.  This tensor owns
+    // memory to hold 24 floating point values (24 = 2 x 3 x 4).
+    Tensor<float, 3> t_3d(2, 3, 4);
+
+    // Resize t_3d by assigning a tensor of different sizes, but same rank.
+    t_3d = Tensor<float, 3>(3, 4, 3);
+
+#### Constructor Tensor<data_type, rank>(size_array)
+
+Constructor where the sizes for the constructor are specified as an array of
+values instead of an explicitly list of parameters.  The array type to use is
+```Eigen::array<Eigen::Index>```.  The array can be constructed automatically
+from an initializer list.
+
+    // Create a tensor of strings of rank 2 with sizes 5, 7.
+    Tensor<string, 2> t_2d({5, 7});
+
+
+### Class TensorFixedSize<data_type, Sizes<size0, size1, ...>>
+
+Class to use for tensors of fixed size, where the size is known at compile
+time.  Fixed sized tensors can provide very fast computations because all their
+dimensions are known by the compiler.  FixedSize tensors are not resizable.
+
+If the total number of elements in a fixed size tensor is small enough the
+tensor data is held onto the stack and does not cause heap allocation and free.
+
+    // Create a 4 x 3 tensor of floats.
+    TensorFixedSize<float, Sizes<4, 3>> t_4x3;
+
+### Class TensorMap<Tensor<data_type, rank>>
+
+This is the class to use to create a tensor on top of memory allocated and
+owned by another part of your code.  It allows to view any piece of allocated
+memory as a Tensor.  Instances of this class do not own the memory where the
+data are stored.
+
+A TensorMap is not resizable because it does not own the memory where its data
+are stored.
+
+#### Constructor TensorMap<Tensor<data_type, rank>>(data, size0, size1, ...)
+
+Constructor for a Tensor.  The constructor must be passed a pointer to the
+storage for the data, and "rank" size attributes.  The storage has to be
+large enough to hold all the data.
+
+    // Map a tensor of ints on top of stack-allocated storage.
+    int storage[128];  // 2 x 4 x 2 x 8 = 128
+    TensorMap<int, 4> t_4d(storage, 2, 4, 2, 8);
+
+    // The same storage can be viewed as a different tensor.
+    // You can also pass the sizes as an array.
+    TensorMap<int, 2> t_2d(storage, 16, 8);
+
+    // You can also map fixed-size tensors.  Here we get a 1d view of
+    // the 2d fixed-size tensor.
+    Tensor<float, Sizes<4, 5>> t_4x3;
+    TensorMap<float, 1> t_12(t_4x3, 12);
+
+
+#### Class TensorRef
+
+See Assigning to a TensorRef below.
+
+## Accessing Tensor Elements
+
+#### <data_type> tensor(index0, index1...)
+
+Return the element at position ```(index0, index1...)``` in tensor
+```tensor```.  You must pass as many parameters as the rank of ```tensor```.
+The expression can be used as an l-value to set the value of the element at the
+specified position.  The value returned is of the datatype of the tensor.
+
+    // Set the value of the element at position (0, 1, 0);
+    Tensor<float, 3> t_3d(2, 3, 4);
+    t_3d(0, 1, 0) = 12.0f;
+
+    // Initialize all elements to random values.
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 3; ++j) {
+        for (int k = 0; k < 4; ++k) {
+          t_3d(i, j, k) = ...some random value...;
+        }
+      }
+    }
+
+    // Print elements of a tensor.
+    for (int i = 0; i < 2; ++i) {
+      LOG(INFO) << t_3d(i, 0, 0);
+    }
+
+
+## TensorLayout
+
+The tensor library supports 2 layouts: ```ColMajor``` (the default) and
+```RowMajor```.  Only the default column major layout is currently fully
+supported, and it is therefore not recommended to attempt to use the row major
+layout at the moment.
+
+The layout of a tensor is optionally specified as part of its type. If not
+specified explicitly column major is assumed.
+
+    Tensor<float, 3, ColMajor> col_major;  // equivalent to Tensor<float, 3>
+    TensorMap<Tensor<float, 3, RowMajor> > row_major(data, ...);
+
+All the arguments to an expression must use the same layout. Attempting to mix
+different layouts will result in a compilation error.
+
+It is possible to change the layout of a tensor or an expression using the
+```swap_layout()``` method.  Note that this will also reverse the order of the
+dimensions.
+
+    Tensor<float, 2, ColMajor> col_major(2, 4);
+    Tensor<float, 2, RowMajor> row_major(2, 4);
+
+    Tensor<float, 2> col_major_result = col_major;  // ok, layouts match
+    Tensor<float, 2> col_major_result = row_major;  // will not compile
+
+    // Simple layout swap
+    col_major_result = row_major.swap_layout();
+    eigen_assert(col_major_result.dimension(0) == 4);
+    eigen_assert(col_major_result.dimension(1) == 2);
+
+    // Swap the layout and preserve the order of the dimensions
+    array<int, 2> shuffle(1, 0);
+    col_major_result = row_major.swap_layout().shuffle(shuffle);
+    eigen_assert(col_major_result.dimension(0) == 2);
+    eigen_assert(col_major_result.dimension(1) == 4);
+
+
+## Tensor Operations
+
+The Eigen Tensor library provides a vast library of operations on Tensors:
+numerical operations such as addition and multiplication, geometry operations
+such as slicing and shuffling, etc.  These operations are available as methods
+of the Tensor classes, and in some cases as operator overloads.  For example
+the following code computes the elementwise addition of two tensors:
+
+    Tensor<float, 3> t1(2, 3, 4);
+    ...set some values in t1...
+    Tensor<float, 3> t2(2, 3, 4);
+    ...set some values in t2...
+    // Set t3 to the element wise sum of t1 and t2
+    Tensor<float, 3> t3 = t1 + t2;
+
+While the code above looks easy enough, it is important to understand that the
+expression ```t1 + t2``` is not actually adding the values of the tensors.  The
+expression instead constructs a "tensor operator" object of the class
+TensorCwiseBinaryOp<scalar_sum>, which has references to the tensors
+```t1``` and ```t2```.  This is a small C++ object that knows how to add
+```t1``` and ```t2```.  It is only when the value of the expression is assigned
+to the tensor ```t3``` that the addition is actually performed.  Technically,
+this happens through the overloading of ```operator=()``` in the Tensor class.
+
+This mechanism for computing tensor expressions allows for lazy evaluation and
+optimizations which are what make the tensor library very fast.
+
+Of course, the tensor operators do nest, and the expression ```t1 + t2 *
+0.3f``` is actually represented with the (approximate) tree of operators:
+
+    TensorCwiseBinaryOp<scalar_sum>(t1, TensorCwiseUnaryOp<scalar_mul>(t2, 0.3f))
+
+
+### Tensor Operations and C++ "auto"
+
+Because Tensor operations create tensor operators, the C++ ```auto``` keyword
+does not have its intuitive meaning.  Consider these 2 lines of code:
+
+    Tensor<float, 3> t3 = t1 + t2;
+    auto t4 = t1 + t2;
+
+In the first line we allocate the tensor ```t3``` and it will contain the
+result of the addition of ```t1``` and ```t2```.  In the second line, ```t4```
+is actually the tree of tensor operators that will compute the addition of
+```t1``` and ```t2```.  In fact, ```t4``` is *not* a tensor and you cannot get
+the values of its elements:
+
+    Tensor<float, 3> t3 = t1 + t2;
+    cout << t3(0, 0, 0);  // OK prints the value of t1(0, 0, 0) + t2(0, 0, 0)
+
+    auto t4 = t1 + t2;
+    cout << t4(0, 0, 0);  // Compilation error!
+
+When you use ```auto``` you do not get a Tensor as a result but instead a
+non-evaluated expression.  So only use ```auto``` to delay evaluation.
+
+Unfortunately, there is no single underlying concrete type for holding
+non-evaluated expressions, hence you have to use auto in the case when you do
+want to hold non-evaluated expressions.
+
+When you need the results of set of tensor computations you have to assign the
+result to a Tensor that will be capable of holding onto them.  This can be
+either a normal Tensor, a fixed size Tensor, or a TensorMap on an existing
+piece of memory.  All the following will work:
+
+    auto t4 = t1 + t2;
+
+    Tensor<float, 3> result = t4;  // Could also be: result(t4);
+    cout << result(0, 0, 0);
+
+    TensorMap<float, 4> result(<a float* with enough space>, <size0>, ...) = t4;
+    cout << result(0, 0, 0);
+
+    TensorFixedSize<float, Sizes<size0, ...>> result = t4;
+    cout << result(0, 0, 0);
+
+Until you need the results, you can keep the operation around, and even reuse
+it for additional operations.  As long as you keep the expression as an
+operation, no computation is performed.
+
+    // One way to compute exp((t1 + t2) * 0.2f);
+    auto t3 = t1 + t2;
+    auto t4 = t3 * 0.2f;
+    auto t5 = t4.exp();
+    Tensor<float, 3> result = t5;
+
+    // Another way, exactly as efficient as the previous one:
+    Tensor<float, 3> result = ((t1 + t2) * 0.2f).exp();
+
+### Controlling When Expression are Evaluated
+
+There are several ways to control when expressions are evaluated:
+
+*   Assignment to a Tensor, TensorFixedSize, or TensorMap.
+*   Use of the eval() method.
+*   Assignment to a TensorRef.
+
+#### Assigning to a Tensor, TensorFixedSize, or TensorMap.
+
+The most common way to evaluate an expression is to assign it to a Tensor.  In
+the example below, the ```auto``` declarations make the intermediate values
+"Operations", not Tensors, and do not cause the expressions to be evaluated.
+The assignment to the Tensor ```result``` causes the evaluation of all the
+operations.
+
+    auto t3 = t1 + t2;             // t3 is an Operation.
+    auto t4 = t3 * 0.2f;           // t4 is an Operation.
+    auto t5 = t4.exp();            // t5 is an Operation.
+    Tensor<float, 3> result = t5;  // The operations are evaluated.
+
+If you know the ranks and sizes of the Operation value you can assign the
+Operation to a TensorFixedSize instead of a Tensor, which is a bit more
+efficient.
+
+    // We know that the result is a 4x4x2 tensor!
+    TensorFixedSize<float, 4, 4, 2> result = t5;
+
+Simiarly, assigning an expression to a TensorMap causes its evaluation.  Like
+tensors of type TensorFixedSize, TensorMaps cannot be resized so they have to
+have the rank and sizes of the expression that are assigned to them.
+
+#### Calling eval().
+
+When you compute large composite expressions, you sometimes want to tell Eigen
+that an intermediate value in the expression tree is worth evaluating ahead of
+time.  This is done by inserting a call to the ```eval()``` method of the
+expression Operation.
+
+    // The previous example could have been written:
+    Tensor<float, 3> result = ((t1 + t2) * 0.2f).exp();
+
+    // If you want to compute (t1 + t2) once ahead of time you can write:
+    Tensor<float, 3> result = ((t1 + t2).eval() * 0.2f).exp();
+
+Semantically, calling ```eval()``` is equivalent to materializing the value of
+the expression in a temporary Tensor of the right size.  The code above in
+effect does:
+
+    // .eval() knows the size!
+    TensorFixedSize<float, 4, 4, 2> tmp = t1 + t2;
+    Tensor<float, 3> result = (tmp * 0.2f).exp();
+
+Note that the return value of ```eval()``` is itself an Operation, so the
+following code does not do what you may think:
+
+    // Here t3 is an evaluation Operation.  t3 has not been evaluated yet.
+    auto t3 = (t1 + t2).eval();
+
+    // You can use t3 in another expression.  Still no evaluation.
+    auto t4 = (t3 * 0.2f).exp();
+
+    // The value is evaluated when you assign the Operation to a Tensor, using
+    // an intermediate tensor to represent t3.x
+    Tensor<float, 3> result = t4;
+
+While in the examples above calling ```eval()``` does not make a difference in
+performance, in other cases it can make a huge difference.  In the expression
+below the ```broadcast()``` expression causes the ```X.maximum()``` expression
+to be evaluated many times:
+
+    Tensor<...> X ...;
+    Tensor<...> Y = ((X - X.maximum(depth_dim).reshape(dims2d).broadcast(bcast))
+                     * beta).exp();
+
+Inserting a call to ```eval()``` between the ```maximum()``` and
+```reshape()``` calls guarantees that maximum() is only computed once and
+greatly speeds-up execution:
+
+    Tensor<...> Y =
+      ((X - X.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast))
+        * beta).exp();
+
+In the other example below, the tensor ```Y``` is both used in the expression
+and its assignment.  This is an aliasing problem and if the evaluation is not
+done in the right order Y will be updated incrementally during the evaluation
+resulting in bogus results:
+
+     Tensor<...> Y ...;
+     Y = Y / (Y.sum(depth_dim).reshape(dims2d).broadcast(bcast));
+
+Inserting a call to ```eval()``` between the ```sum()``` and ```reshape()```
+expressions ensures that the sum is computed before any updates to ```Y``` are
+done.
+
+     Y = Y / (Y.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast));
+
+Note that an eval around the full right hand side expression is not needed
+because the generated has to compute the i-th value of the right hand side
+before assigning it to the left hand side.
+
+However, if you were assigning the expression value to a shuffle of ```Y```
+then you would need to force an eval for correctness by adding an ```eval()```
+call for the right hand side:
+
+     Y.shuffle(...) =
+        (Y / (Y.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast))).eval();
+
+
+#### Assigning to a TensorRef.
+
+If you need to access only a few elements from the value of an expression you
+can avoid materializing the value in a full tensor by using a TensorRef.
+
+A TensorRef is a small wrapper class for any Eigen Operation.  It provides
+overloads for the ```()``` operator that let you access individual values in
+the expression.  TensorRef is convenient, because the Operation themselves do
+not provide a way to access individual elements.
+
+    // Create a TensorRef for the expression.  The expression is not
+    // evaluated yet.
+    TensorRef<Tensor<float, 3> > ref = ((t1 + t2) * 0.2f).exp();
+
+    // Use "ref" to access individual elements.  The expression is evaluated
+    // on the fly.
+    float at_0 = ref(0, 0, 0);
+    cout << ref(0, 1, 0);
+
+Only use TensorRef when you need a subset of the values of the expression.
+TensorRef only computes the values you access.  However note that if you are
+going to access all the values it will be much faster to materialize the
+results in a Tensor first.
+
+In some cases, if the full Tensor result would be very large, you may save
+memory by accessing it as a TensorRef.  But not always.  So don't count on it.
+
+
+### Controlling How Expressions Are Evaluated
+
+The tensor library provides several implementations of the various operations
+such as contractions and convolutions.  The implementations are optimized for
+different environments: single threaded on CPU, multi threaded on CPU, or on a
+GPU using cuda.  Additional implementations may be added later.
+
+You can choose which implementation to use with the ```device()``` call.  If
+you do not choose an implementation explicitly the default implementation that
+uses a single thread on the CPU is used.
+
+The default implementation has been optimized for recent Intel CPUs, taking
+advantage of SSE, AVX, and FMA instructions.  Work is ongoing to tune the
+library on ARM CPUs.  Note that you need to pass compiler-dependent flags
+to enable the use of SSE, AVX, and other instructions.
+
+For example, the following code adds two tensors using the default
+single-threaded CPU implementation:
+
+    Tensor<float, 2> a(30, 40);
+    Tensor<float, 2> b(30, 40);
+    Tensor<float, 2> c = a + b;
+
+To choose a different implementation you have to insert a ```device()``` call
+before the assignment of the result.  For technical C++ reasons this requires
+that the Tensor for the result be declared on its own.  This means that you
+have to know the size of the result.
+
+    Eigen::Tensor<float, 2> c(30, 40);
+    c.device(...) = a + b;
+
+The call to ```device()``` must be the last call on the left of the operator=.
+
+You must pass to the ```device()``` call an Eigen device object.  There are
+presently three devices you can use: DefaultDevice, ThreadPoolDevice and
+GpuDevice.
+
+
+#### Evaluating With the DefaultDevice
+
+This is exactly the same as not inserting a ```device()``` call.
+
+    DefaultDevice my_device;
+    c.device(my_device) = a + b;
+
+#### Evaluating with a Thread Pool
+
+    // Create the Eigen ThreadPoolDevice.
+    Eigen::ThreadPoolDevice my_device(4 /* number of threads to use */);
+
+    // Now just use the device when evaluating expressions.
+    Eigen::Tensor<float, 2> c(30, 50);
+    c.device(my_device) = a.contract(b, dot_product_dims);
+
+
+#### Evaluating On GPU
+
+This is presently a bit more complicated than just using a thread pool device.
+You need to create a GPU device but you also need to explicitly allocate the
+memory for tensors with cuda.
+
+
+## API Reference
+
+### Datatypes
+
+In the documentation of the tensor methods and Operation we mention datatypes
+that are tensor-type specific:
+
+#### <Tensor-Type>::Dimensions
+
+Acts like an array of ints.  Has an ```int size``` attribute, and can be
+indexed like an array to access individual values.  Used to represent the
+dimensions of a tensor.  See ```dimensions()```.
+
+#### <Tensor-Type>::Index
+
+Acts like an ```int```.  Used for indexing tensors along their dimensions.  See
+```operator()```, ```dimension()```, and ```size()```.
+
+#### <Tensor-Type>::Scalar
+
+Represents the datatype of individual tensor elements.  For example, for a
+```Tensor<float>```, ```Scalar``` is the type ```float```.  See
+```setConstant()```.
+
+#### <Operation>
+
+We use this pseudo type to indicate that a tensor Operation is returned by a
+method.  We indicate in the text the type and dimensions of the tensor that the
+Operation returns after evaluation.
+
+The Operation will have to be evaluated, for example by assigning it to a
+tensor, before you can access the values of the resulting tensor.  You can also
+access the values through a TensorRef.
+
+
+## Built-in Tensor Methods
+
+These are usual C++ methods that act on tensors immediately.  They are not
+Operations which provide delayed evaluation of their results.  Unless specified
+otherwise, all the methods listed below are available on all tensor classes:
+Tensor, TensorFixedSize, and TensorMap.
+
+## Metadata
+
+### int NumDimensions
+
+Constant value indicating the number of dimensions of a Tensor.  This is also
+known as the tensor "rank".
+
+      Eigen::Tensor<float, 2> a(3, 4);
+      cout << "Dims " << a.NumDimensions;
+      => Dims 2
+
+### Dimensions dimensions()
+
+Returns an array-like object representing the dimensions of the tensor.
+The actual type of the dimensions() result is <Tensor-Type>::Dimensions.
+
+    Eigen::Tensor<float, 2> a(3, 4);
+    const Eigen::Tensor<float, 2>::Dimensions& d = a.dimensions();
+    cout << "Dim size: " << d.size << ", dim 0: " << d[0]
+         << ", dim 1: " << d[1];
+    => Dim size: 2, dim 0: 3, dim 1: 4
+
+If you use a C++11 compiler, you can use ```auto``` to simplify the code:
+
+    const auto& d = a.dimensions();
+    cout << "Dim size: " << d.size << ", dim 0: " << d[0]
+         << ", dim 1: " << d[1];
+    => Dim size: 2, dim 0: 3, dim 1: 4
+
+### Index dimension(Index n)
+
+Returns the n-th dimension of the tensor.  The actual type of the
+```dimension()``` result is ```<Tensor-Type>::Index```, but you can
+always use it like an int.
+
+      Eigen::Tensor<float, 2> a(3, 4);
+      int dim1 = a.dimension(1);
+      cout << "Dim 1: " << dim1;
+      => Dim 1: 4
+
+### Index size()
+
+Returns the total number of elements in the tensor.  This is the product of all
+the tensor dimensions.  The actual type of the ```size()``` result is
+```<Tensor-Type>::Index```, but you can always use it like an int.
+
+    Eigen::Tensor<float, 2> a(3, 4);
+    cout << "Size: " << a.size();
+    => Size: 12
+
+
+### Getting Dimensions From An Operation
+
+A few operations provide ```dimensions()``` directly,
+e.g. ```TensorReslicingOp```.  Most operations defer calculating dimensions
+until the operation is being evaluated.  If you need access to the dimensions
+of a deferred operation, you can wrap it in a TensorRef (see Assigning to a
+TensorRef above), which provides ```dimensions()``` and ```dimension()``` as
+above.
+
+TensorRef can also wrap the plain Tensor types, so this is a useful idiom in
+templated contexts where the underlying object could be either a raw Tensor
+or some deferred operation (e.g. a slice of a Tensor).  In this case, the
+template code can wrap the object in a TensorRef and reason about its
+dimensionality while remaining agnostic to the underlying type.
+
+
+## Constructors
+
+### Tensor
+
+Creates a tensor of the specified size. The number of arguments must be equal
+to the rank of the tensor. The content of the tensor is not initialized.
+
+    Eigen::Tensor<float, 2> a(3, 4);
+    cout << "NumRows: " << a.dimension(0) << " NumCols: " << a.dimension(1) << endl;
+    => NumRows: 3 NumCols: 4
+
+### TensorFixedSize
+
+Creates a tensor of the specified size. The number of arguments in the Size<>
+template parameter determines the rank of the tensor. The content of the tensor
+is not initialized.
+
+    Eigen::TensorFixedSize<float, Size<3, 4>> a;
+    cout << "Rank: " << a.rank() << endl;
+    => Rank: 2
+    cout << "NumRows: " << a.dimension(0) << " NumCols: " << a.dimension(1) << endl;
+    => NumRows: 3 NumCols: 4
+
+### TensorMap
+
+Creates a tensor mapping an existing array of data. The data must not be freed
+until the TensorMap is discarded, and the size of the data must be large enough
+to accomodate of the coefficients of the tensor.
+
+    float data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
+    Eigen::TensorMap<float, 2> a(data, 3, 4);
+    cout << "NumRows: " << a.dimension(0) << " NumCols: " << a.dimension(1) << endl;
+    => NumRows: 3 NumCols: 4
+    cout << "a(1, 2): " << a(1, 2) << endl;
+    => a(1, 2): 9
+
+
+## Contents Initialization
+
+When a new Tensor or a new TensorFixedSize are created, memory is allocated to
+hold all the tensor elements, but the memory is not initialized.  Similarly,
+when a new TensorMap is created on top of non-initialized memory the memory its
+contents are not initialized.
+
+You can use one of the methods below to initialize the tensor memory.  These
+have an immediate effect on the tensor and return the tensor itself as a
+result.  These are not tensor Operations which delay evaluation.
+
+### <Tensor-Type> setConstant(const Scalar& val)
+
+Sets all elements of the tensor to the constant value ```val```.  ```Scalar```
+is the type of data stored in the tensor.  You can pass any value that is
+convertible to that type.
+
+Returns the tensor itself in case you want to chain another call.
+
+    a.setConstant(12.3f);
+    cout << "Constant: " << endl << a << endl << endl;
+    =>
+    Constant:
+    12.3 12.3 12.3 12.3
+    12.3 12.3 12.3 12.3
+    12.3 12.3 12.3 12.3
+
+Note that ```setConstant()``` can be used on any tensor where the element type
+has a copy constructor and an ```operator=()```:
+
+    Eigen::Tensor<string, 2> a(2, 3);
+    a.setConstant("yolo");
+    cout << "String tensor: " << endl << a << endl << endl;
+    =>
+    String tensor:
+    yolo yolo yolo
+    yolo yolo yolo
+
+
+### <Tensor-Type> setZero()
+
+Fills the tensor with zeros.  Equivalent to ```setConstant(Scalar(0))```.
+Returns the tensor itself in case you want to chain another call.
+
+    a.setZero();
+    cout << "Zeros: " << endl << a << endl << endl;
+    =>
+    Zeros:
+    0 0 0 0
+    0 0 0 0
+    0 0 0 0
+
+
+### <Tensor-Type> setValues({..initializer_list})
+
+Fills the tensor with explicit values specified in a std::initializer_list.
+The type of the initializer list depends on the type and rank of the tensor.
+
+If the tensor has rank N, the initializer list must be nested N times.  The
+most deeply nested lists must contains P scalars of the Tensor type where P is
+the size of the last dimension of the Tensor.
+
+For example, for a ```TensorFixedSize<float, 2, 3>``` the initializer list must
+contains 2 lists of 3 floats each.
+
+```setValues()``` returns the tensor itself in case you want to chain another
+call.
+
+    Eigen::Tensor<float, 2> a(2, 3);
+    a.setValues({{0.0f, 1.0f, 2.0f}, {3.0f, 4.0f, 5.0f}});
+    cout << "a" << endl << a << endl << endl;
+    =>
+    a
+    0 1 2
+    3 4 5
+
+If a list is too short, the corresponding elements of the tensor will not be
+changed.  This is valid at each level of nesting.  For example the following
+code only sets the values of the first row of the tensor.
+
+    Eigen::Tensor<int, 2> a(2, 3);
+    a.setConstant(1000);
+    a.setValues({{10, 20, 30}});
+    cout << "a" << endl << a << endl << endl;
+    =>
+    a
+    10   20   30
+    1000 1000 1000
+
+### <Tensor-Type> setRandom()
+
+Fills the tensor with random values.  Returns the tensor itself in case you
+want to chain another call.
+
+    a.setRandom();
+    cout << "Random: " << endl << a << endl << endl;
+    =>
+    Random:
+      0.680375    0.59688  -0.329554    0.10794
+     -0.211234   0.823295   0.536459 -0.0452059
+      0.566198  -0.604897  -0.444451   0.257742
+
+You can customize ```setRandom()``` by providing your own random number
+generator as a template argument:
+
+    a.setRandom<MyRandomGenerator>();
+
+Here, ```MyRandomGenerator``` must be a struct with the following member
+functions, where Scalar and Index are the same as ```<Tensor-Type>::Scalar```
+and ```<Tensor-Type>::Index```.
+
+See ```struct UniformRandomGenerator``` in TensorFunctors.h for an example.
+
+    // Custom number generator for use with setRandom().
+    struct MyRandomGenerator {
+      // Default and copy constructors. Both are needed
+      MyRandomGenerator() { }
+      MyRandomGenerator(const MyRandomGenerator& ) { }
+
+      // Return a random value to be used.  "element_location" is the
+      // location of the entry to set in the tensor, it can typically
+      // be ignored.
+      Scalar operator()(Eigen::DenseIndex element_location,
+                        Eigen::DenseIndex /*unused*/ = 0) const {
+        return <randomly generated value of type T>;
+      }
+
+      // Same as above but generates several numbers at a time.
+      typename internal::packet_traits<Scalar>::type packetOp(
+          Eigen::DenseIndex packet_location, Eigen::DenseIndex /*unused*/ = 0) const {
+        return <a packet of randomly generated values>;
+      }
+    };
+
+You can also use one of the 2 random number generators that are part of the
+tensor library:
+*   UniformRandomGenerator
+*   NormalRandomGenerator
+
+
+## Data Access
+
+The Tensor, TensorFixedSize, and TensorRef classes provide the following
+accessors to access the tensor coefficients:
+
+    const Scalar& operator()(const array<Index, NumIndices>& indices)
+    const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices)
+    Scalar& operator()(const array<Index, NumIndices>& indices)
+    Scalar& operator()(Index firstIndex, IndexTypes... otherIndices)
+
+The number of indices must be equal to the rank of the tensor. Moreover, these
+accessors are not available on tensor expressions. In order to access the
+values of a tensor expression, the expression must either be evaluated or
+wrapped in a TensorRef.
+
+
+### Scalar* data() and const Scalar* data() const
+
+Returns a pointer to the storage for the tensor.  The pointer is const if the
+tensor was const.  This allows direct access to the data.  The layout of the
+data depends on the tensor layout: RowMajor or ColMajor.
+
+This access is usually only needed for special cases, for example when mixing
+Eigen Tensor code with other libraries.
+
+Scalar is the type of data stored in the tensor.
+
+    Eigen::Tensor<float, 2> a(3, 4);
+    float* a_data = a.data();
+    a_data[0] = 123.45f;
+    cout << "a(0, 0): " << a(0, 0);
+    => a(0, 0): 123.45
+
+
+## Tensor Operations
+
+All the methods documented below return non evaluated tensor ```Operations```.
+These can be chained: you can apply another Tensor Operation to the value
+returned by the method.
+
+The chain of Operation is evaluated lazily, typically when it is assigned to a
+tensor.  See "Controlling when Expression are Evaluated" for more details about
+their evaluation.
+
+### <Operation> constant(const Scalar& val)
+
+Returns a tensor of the same type and dimensions as the original tensor but
+where all elements have the value ```val```.
+
+This is useful, for example, when you want to add or subtract a constant from a
+tensor, or multiply every element of a tensor by a scalar.
+
+    Eigen::Tensor<float, 2> a(2, 3);
+    a.setConstant(1.0f);
+    Eigen::Tensor<float, 2> b = a + a.constant(2.0f);
+    Eigen::Tensor<float, 2> c = b * b.constant(0.2f);
+    cout << "a" << endl << a << endl << endl;
+    cout << "b" << endl << b << endl << endl;
+    cout << "c" << endl << c << endl << endl;
+    =>
+    a
+    1 1 1
+    1 1 1
+
+    b
+    3 3 3
+    3 3 3
+
+    c
+    0.6 0.6 0.6
+    0.6 0.6 0.6
+
+### <Operation> random()
+
+Returns a tensor of the same type and dimensions as the current tensor
+but where all elements have random values.
+
+This is for example useful to add random values to an existing tensor.
+The generation of random values can be customized in the same manner
+as for ```setRandom()```.
+
+    Eigen::Tensor<float, 2> a(2, 3);
+    a.setConstant(1.0f);
+    Eigen::Tensor<float, 2> b = a + a.random();
+    cout << "a" << endl << a << endl << endl;
+    cout << "b" << endl << b << endl << endl;
+    =>
+    a
+    1 1 1
+    1 1 1
+
+    b
+    1.68038   1.5662  1.82329
+    0.788766  1.59688 0.395103
+
+
+## Unary Element Wise Operations
+
+All these operations take a single input tensor as argument and return a tensor
+of the same type and dimensions as the tensor to which they are applied.  The
+requested operations are applied to each element independently.
+
+### <Operation> operator-()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the opposite values of the original tensor.
+
+    Eigen::Tensor<float, 2> a(2, 3);
+    a.setConstant(1.0f);
+    Eigen::Tensor<float, 2> b = -a;
+    cout << "a" << endl << a << endl << endl;
+    cout << "b" << endl << b << endl << endl;
+    =>
+    a
+    1 1 1
+    1 1 1
+
+    b
+    -1 -1 -1
+    -1 -1 -1
+
+### <Operation> sqrt()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the square roots of the original tensor.
+
+### <Operation> rsqrt()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the inverse square roots of the original tensor.
+
+### <Operation> square()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the squares of the original tensor values.
+
+### <Operation> inverse()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the inverse of the original tensor values.
+
+### <Operation> exp()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the exponential of the original tensor.
+
+### <Operation> log()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the natural logarithms of the original tensor.
+
+### <Operation> abs()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the absolute values of the original tensor.
+
+### <Operation> pow(Scalar exponent)
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the coefficients of the original tensor to the power of the
+exponent.
+
+The type of the exponent, Scalar, is always the same as the type of the
+tensor coefficients.  For example, only integer exponents can be used in
+conjuntion with tensors of integer values.
+
+You can use cast() to lift this restriction.  For example this computes
+cubic roots of an int Tensor:
+
+    Eigen::Tensor<int, 2> a(2, 3);
+    a.setValues({{0, 1, 8}, {27, 64, 125}});
+    Eigen::Tensor<double, 2> b = a.cast<double>().pow(1.0 / 3.0);
+    cout << "a" << endl << a << endl << endl;
+    cout << "b" << endl << b << endl << endl;
+    =>
+    a
+    0   1   8
+    27  64 125
+
+    b
+    0 1 2
+    3 4 5
+
+### <Operation>  operator * (Scalar scale)
+
+Multiplies all the coefficients of the input tensor by the provided scale.
+
+### <Operation>  cwiseMax(Scalar threshold)
+TODO
+
+### <Operation>  cwiseMin(Scalar threshold)
+TODO
+
+### <Operation>  unaryExpr(const CustomUnaryOp& func)
+TODO
+
+
+## Binary Element Wise Operations
+
+These operations take two input tensors as arguments. The 2 input tensors should
+be of the same type and dimensions. The result is a tensor of the same
+dimensions as the tensors to which they are applied, and unless otherwise
+specified it is also of the same type. The requested operations are applied to
+each pair of elements independently.
+
+### <Operation> operator+(const OtherDerived& other)
+
+Returns a tensor of the same type and dimensions as the input tensors
+containing the coefficient wise sums of the inputs.
+
+### <Operation> operator-(const OtherDerived& other)
+
+Returns a tensor of the same type and dimensions as the input tensors
+containing the coefficient wise differences of the inputs.
+
+### <Operation> operator*(const OtherDerived& other)
+
+Returns a tensor of the same type and dimensions as the input tensors
+containing the coefficient wise products of the inputs.
+
+### <Operation> operator/(const OtherDerived& other)
+
+Returns a tensor of the same type and dimensions as the input tensors
+containing the coefficient wise quotients of the inputs.
+
+This operator is not supported for integer types.
+
+### <Operation> cwiseMax(const OtherDerived& other)
+
+Returns a tensor of the same type and dimensions as the input tensors
+containing the coefficient wise maximums of the inputs.
+
+### <Operation> cwiseMin(const OtherDerived& other)
+
+Returns a tensor of the same type and dimensions as the input tensors
+containing the coefficient wise mimimums of the inputs.
+
+### <Operation> Logical operators
+
+The following logical operators are supported as well:
+
+*   operator&&(const OtherDerived& other)
+*   operator||(const OtherDerived& other)
+*   operator<(const OtherDerived& other)
+*   operator<=(const OtherDerived& other)
+*   operator>(const OtherDerived& other)
+*   operator>=(const OtherDerived& other)
+*   operator==(const OtherDerived& other)
+*   operator!=(const OtherDerived& other)
+
+They all return a tensor of boolean values.
+
+
+## Selection (select(const ThenDerived& thenTensor, const ElseDerived& elseTensor)
+
+Selection is a coefficient-wise ternary operator that is the tensor equivalent
+to the if-then-else operation.
+
+    Tensor<bool, 3> if = ...;
+    Tensor<float, 3> then = ...;
+    Tensor<float, 3> else = ...;
+    Tensor<float, 3> result = if.select(then, else);
+
+The 3 arguments must be of the same dimensions, which will also be the dimension
+of the result.  The 'if' tensor must be of type boolean, the 'then' and the
+'else' tensor must be of the same type, which will also be the type of the
+result.
+
+Each coefficient in the result is equal to the corresponding coefficient in the
+'then' tensor if the corresponding value in the 'if' tensor is true. If not, the
+resulting coefficient will come from the 'else' tensor.
+
+
+## Contraction
+
+Tensor *contractions* are a generalization of the matrix product to the
+multidimensional case.
+
+    // Create 2 matrices using tensors of rank 2
+    Eigen::Tensor<int, 2> a(2, 3);
+    a.setValues({{1, 2, 3}, {6, 5, 4}});
+    Eigen::Tensor<int, 2> b(3, 2);
+    a.setValues({{1, 2}, {4, 5}, {5, 6}});
+
+    // Compute the traditional matrix product
+    array<IndexPair<int>, 1> product_dims = { IndexPair(1, 0) };
+    Eigen::Tensor<int, 2> AB = a.contract(b, product_dims);
+
+    // Compute the product of the transpose of the matrices
+    array<IndexPair<int>, 1> transpose_product_dims = { IndexPair(0, 1) };
+    Eigen::Tensor<int, 2> AtBt = a.contract(b, transposed_product_dims);
+
+
+## Reduction Operations
+
+A *Reduction* operation returns a tensor with fewer dimensions than the
+original tensor.  The values in the returned tensor are computed by applying a
+*reduction operator* to slices of values from the original tensor.  You specify
+the dimensions along which the slices are made.
+
+The Eigen Tensor library provides a set of predefined reduction operators such
+as ```maximum()``` and ```sum()``` and lets you define additional operators by
+implementing a few methods from a reductor template.
+
+### Reduction Dimensions
+
+All reduction operations take a single parameter of type
+```<TensorType>::Dimensions``` which can always be specified as an array of
+ints.  These are called the "reduction dimensions."  The values are the indices
+of the dimensions of the input tensor over which the reduction is done.  The
+parameter can have at most as many element as the rank of the input tensor;
+each element must be less than the tensor rank, as it indicates one of the
+dimensions to reduce.
+
+Each dimension of the input tensor should occur at most once in the reduction
+dimensions as the implementation does not remove duplicates.
+
+The order of the values in the reduction dimensions does not affect the
+results, but the code may execute faster if you list the dimensions in
+increasing order.
+
+Example: Reduction along one dimension.
+
+    // Create a tensor of 2 dimensions
+    Eigen::Tensor<int, 2> a(2, 3);
+    a.setValues({{1, 2, 3}, {6, 5, 4}});
+    // Reduce it along the second dimension (1)...
+    Eigen::array<int, 1> dims({1 /* dimension to reduce */});
+    // ...using the "maximum" operator.
+    // The result is a tensor with one dimension.  The size of
+    // that dimension is the same as the first (non-reduced) dimension of a.
+    Eigen::Tensor<int, 1> b = a.maximum(dims);
+    cout << "a" << endl << a << endl << endl;
+    cout << "b" << endl << b << endl << endl;
+    =>
+    a
+    1 2 3
+    6 5 4
+
+    b
+    3
+    6
+
+Example: Reduction along two dimensions.
+
+    Eigen::Tensor<float, 3, Eigen::ColMajor> a(2, 3, 4);
+    a.setValues({{{0.0f, 1.0f, 2.0f, 3.0f},
+                  {7.0f, 6.0f, 5.0f, 4.0f},
+                  {8.0f, 9.0f, 10.0f, 11.0f}},
+                 {{12.0f, 13.0f, 14.0f, 15.0f},
+                  {19.0f, 18.0f, 17.0f, 16.0f},
+                  {20.0f, 21.0f, 22.0f, 23.0f}}});
+    // The tensor a has 3 dimensions.  We reduce along the
+    // first 2, resulting in a tensor with a single dimension
+    // of size 4 (the last dimension of a.)
+    // Note that we pass the array of reduction dimensions
+    // directly to the maximum() call.
+    Eigen::Tensor<float, 1, Eigen::ColMajor> b =
+        a.maximum(Eigen::array<int, 2>({0, 1}));
+    cout << "b" << endl << b << endl << endl;
+    =>
+    b
+    20
+    21
+    22
+    23
+
+#### Reduction along all dimensions
+
+As a special case, if you pass no parameter to a reduction operation the
+original tensor is reduced along *all* its dimensions.  The result is a
+scalar, represented as a zero-dimension tensor.
+
+    Eigen::Tensor<float, 3> a(2, 3, 4);
+    a.setValues({{{0.0f, 1.0f, 2.0f, 3.0f},
+                  {7.0f, 6.0f, 5.0f, 4.0f},
+                  {8.0f, 9.0f, 10.0f, 11.0f}},
+                 {{12.0f, 13.0f, 14.0f, 15.0f},
+                  {19.0f, 18.0f, 17.0f, 16.0f},
+                  {20.0f, 21.0f, 22.0f, 23.0f}}});
+    // Reduce along all dimensions using the sum() operator.
+    Eigen::Tensor<float, 0> b = a.sum();
+    cout << "b" << endl << b << endl << endl;
+    =>
+    b
+    276
+
+
+### <Operation> sum(const Dimensions& new_dims)
+### <Operation> sum()
+
+Reduce a tensor using the sum() operator.  The resulting values
+are the sum of the reduced values.
+
+### <Operation> mean(const Dimensions& new_dims)
+### <Operation> mean()
+
+Reduce a tensor using the mean() operator.  The resulting values
+are the mean of the reduced values.
+
+### <Operation> maximum(const Dimensions& new_dims)
+### <Operation> maximum()
+
+Reduce a tensor using the maximum() operator.  The resulting values are the
+largest of the reduced values.
+
+### <Operation> minimum(const Dimensions& new_dims)
+### <Operation> minimum()
+
+Reduce a tensor using the minimum() operator.  The resulting values
+are the smallest of the reduced values.
+
+### <Operation> prod(const Dimensions& new_dims)
+### <Operation> prod()
+
+Reduce a tensor using the prod() operator.  The resulting values
+are the product of the reduced values.
+
+### <Operation> all(const Dimensions& new_dims)
+### <Operation> all()
+Reduce a tensor using the all() operator.  Casts tensor to bool and then checks
+whether all elements are true.  Runs through all elements rather than
+short-circuiting, so may be significantly inefficient.
+
+### <Operation> any(const Dimensions& new_dims)
+### <Operation> any()
+Reduce a tensor using the any() operator.  Casts tensor to bool and then checks
+whether any element is true.  Runs through all elements rather than
+short-circuiting, so may be significantly inefficient.
+
+
+### <Operation> reduce(const Dimensions& new_dims, const Reducer& reducer)
+
+Reduce a tensor using a user-defined reduction operator.  See ```SumReducer```
+in TensorFunctors.h for information on how to implement a reduction operator.
+
+
+## Scan Operations
+
+A *Scan* operation returns a tensor with the same dimensions as the original
+tensor. The operation performs an inclusive scan along the specified
+axis, which means it computes a running total along the axis for a given
+reduction operation.
+If the reduction operation corresponds to summation, then this computes the
+prefix sum of the tensor along the given axis.
+
+Example:
+dd a comment to this line
+
+    // Create a tensor of 2 dimensions
+    Eigen::Tensor<int, 2> a(2, 3);
+    a.setValues({{1, 2, 3}, {4, 5, 6}});
+    // Scan it along the second dimension (1) using summation
+    Eigen::Tensor<int, 2> b = a.cumsum(1);
+    // The result is a tensor with the same size as the input
+    cout << "a" << endl << a << endl << endl;
+    cout << "b" << endl << b << endl << endl;
+    =>
+    a
+    1 2 3
+    6 5 4
+
+    b
+    1  3  6
+    4  9 15
+
+### <Operation> cumsum(const Index& axis)
+
+Perform a scan by summing consecutive entries.
+
+### <Operation> cumprod(const Index& axis)
+
+Perform a scan by multiplying consecutive entries.
+
+
+## Convolutions
+
+### <Operation> convolve(const Kernel& kernel, const Dimensions& dims)
+
+Returns a tensor that is the output of the convolution of the input tensor with the kernel,
+along the specified dimensions of the input tensor. The dimension size for dimensions of the output tensor
+which were part of the convolution will be reduced by the formula:
+output_dim_size = input_dim_size - kernel_dim_size + 1 (requires: input_dim_size >= kernel_dim_size).
+The dimension sizes for dimensions that were not part of the convolution will remain the same.
+Performance of the convolution can depend on the length of the stride(s) of the input tensor dimension(s) along which the
+convolution is computed (the first dimension has the shortest stride for ColMajor, whereas RowMajor's shortest stride is
+for the last dimension).
+
+    // Compute convolution along the second and third dimension.
+    Tensor<float, 4, DataLayout> input(3, 3, 7, 11);
+    Tensor<float, 2, DataLayout> kernel(2, 2);
+    Tensor<float, 4, DataLayout> output(3, 2, 6, 11);
+    input.setRandom();
+    kernel.setRandom();
+
+    Eigen::array<ptrdiff_t, 2> dims({1, 2});  // Specify second and third dimension for convolution.
+    output = input.convolve(kernel, dims);
+
+    for (int i = 0; i < 3; ++i) {
+      for (int j = 0; j < 2; ++j) {
+        for (int k = 0; k < 6; ++k) {
+          for (int l = 0; l < 11; ++l) {
+            const float result = output(i,j,k,l);
+            const float expected = input(i,j+0,k+0,l) * kernel(0,0) +
+                                   input(i,j+1,k+0,l) * kernel(1,0) +
+                                   input(i,j+0,k+1,l) * kernel(0,1) +
+                                   input(i,j+1,k+1,l) * kernel(1,1);
+            VERIFY_IS_APPROX(result, expected);
+          }
+        }
+      }
+    }
+
+
+## Geometrical Operations
+
+These operations return a Tensor with different dimensions than the original
+Tensor.  They can be used to access slices of tensors, see them with different
+dimensions, or pad tensors with additional data.
+
+### <Operation> reshape(const Dimensions& new_dims)
+
+Returns a view of the input tensor that has been reshaped to the specified
+new dimensions.  The argument new_dims is an array of Index values.  The
+rank of the resulting tensor is equal to the number of elements in new_dims.
+
+The product of all the sizes in the new dimension array must be equal to
+the number of elements in the input tensor.
+
+    // Increase the rank of the input tensor by introducing a new dimension
+    // of size 1.
+    Tensor<float, 2> input(7, 11);
+    array<int, 3> three_dims{{7, 11, 1}};
+    Tensor<float, 3> result = input.reshape(three_dims);
+
+    // Decrease the rank of the input tensor by merging 2 dimensions;
+    array<int, 1> one_dim{{7 * 11}};
+    Tensor<float, 1> result = input.reshape(one_dim);
+
+This operation does not move any data in the input tensor, so the resulting
+contents of a reshaped Tensor depend on the data layout of the original Tensor.
+
+For example this is what happens when you ```reshape()``` a 2D ColMajor tensor
+to one dimension:
+
+    Eigen::Tensor<float, 2, Eigen::ColMajor> a(2, 3);
+    a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}});
+    Eigen::array<Eigen::DenseIndex, 1> one_dim({3 * 2});
+    Eigen::Tensor<float, 1, Eigen::ColMajor> b = a.reshape(one_dim);
+    cout << "b" << endl << b << endl;
+    =>
+    b
+      0
+    300
+    100
+    400
+    200
+    500
+
+This is what happens when the 2D Tensor is RowMajor:
+
+    Eigen::Tensor<float, 2, Eigen::RowMajor> a(2, 3);
+    a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}});
+    Eigen::array<Eigen::DenseIndex, 1> one_dim({3 * 2});
+    Eigen::Tensor<float, 1, Eigen::RowMajor> b = a.reshape(one_dim);
+    cout << "b" << endl << b << endl;
+    =>
+    b
+      0
+    100
+    200
+    300
+    400
+    500
+
+The reshape operation is a lvalue. In other words, it can be used on the left
+side of the assignment operator.
+
+The previous example can be rewritten as follow:
+
+    Eigen::Tensor<float, 2, Eigen::ColMajor> a(2, 3);
+    a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}});
+    Eigen::array<Eigen::DenseIndex, 2> two_dim({2, 3});
+    Eigen::Tensor<float, 1, Eigen::ColMajor> b;
+    b.reshape(two_dim) = a;
+    cout << "b" << endl << b << endl;
+    =>
+    b
+      0
+    300
+    100
+    400
+    200
+    500
+
+Note that "b" itself was not reshaped but that instead the assignment is done to
+the reshape view of b.
+
+
+### <Operation> shuffle(const Shuffle& shuffle)
+
+Returns a copy of the input tensor whose dimensions have been
+reordered according to the specified permutation. The argument shuffle
+is an array of Index values. Its size is the rank of the input
+tensor. It must contain a permutation of 0, 1, ..., rank - 1. The i-th
+dimension of the output tensor equals to the size of the shuffle[i]-th
+dimension of the input tensor. For example:
+
+    // Shuffle all dimensions to the left by 1.
+    Tensor<float, 3> input(20, 30, 50);
+    // ... set some values in input.
+    Tensor<float, 3> output = input.shuffle({1, 2, 0})
+
+    eigen_assert(output.dimension(0) == 30);
+    eigen_assert(output.dimension(1) == 50);
+    eigen_assert(output.dimension(2) == 20);
+
+Indices into the output tensor are shuffled accordingly to formulate
+indices into the input tensor. For example, one can assert in the above
+code snippet that:
+
+    eigen_assert(output(3, 7, 11) == input(11, 3, 7));
+
+In general, one can assert that
+
+    eigen_assert(output(..., indices[shuffle[i]], ...) ==
+                 input(..., indices[i], ...))
+
+The shuffle operation results in a lvalue, which means that it can be assigned
+to. In other words, it can be used on the left side of the assignment operator.
+
+Let's rewrite the previous example to take advantage of this feature:
+
+    // Shuffle all dimensions to the left by 1.
+    Tensor<float, 3> input(20, 30, 50);
+    // ... set some values in input.
+    Tensor<float, 3> output(30, 50, 20);
+    output.shuffle({2, 0, 1}) = input;
+
+
+### <Operation> stride(const Strides& strides)
+
+Returns a view of the input tensor that strides (skips stride-1
+elements) along each of the dimensions.  The argument strides is an
+array of Index values.  The dimensions of the resulting tensor are
+ceil(input_dimensions[i] / strides[i]).
+
+For example this is what happens when you ```stride()``` a 2D tensor:
+
+    Eigen::Tensor<int, 2> a(4, 3);
+    a.setValues({{0, 100, 200}, {300, 400, 500}, {600, 700, 800}, {900, 1000, 1100}});
+    Eigen::array<Eigen::DenseIndex, 2> strides({3, 2});
+    Eigen::Tensor<int, 2> b = a.stride(strides);
+    cout << "b" << endl << b << endl;
+    =>
+    b
+       0   200
+     900  1100
+
+It is possible to assign a tensor to a stride:
+    Tensor<float, 3> input(20, 30, 50);
+    // ... set some values in input.
+    Tensor<float, 3> output(40, 90, 200);
+    output.stride({2, 3, 4}) = input;
+
+
+### <Operation> slice(const StartIndices& offsets, const Sizes& extents)
+
+Returns a sub-tensor of the given tensor. For each dimension i, the slice is
+made of the coefficients stored between offset[i] and offset[i] + extents[i] in
+the input tensor.
+
+    Eigen::Tensor<int, 2> a(4, 3);
+    a.setValues({{0, 100, 200}, {300, 400, 500},
+                 {600, 700, 800}, {900, 1000, 1100}});
+    Eigen::array<int, 2> offsets = {1, 0};
+    Eigen::array<int, 2> extents = {2, 2};
+    Eigen::Tensor<int, 1> slice = a.slice(offsets, extents);
+    cout << "a" << endl << a << endl;
+    =>
+    a
+       0   100   200
+     300   400   500
+     600   700   800
+     900  1000  1100
+    cout << "slice" << endl << slice << endl;
+    =>
+    slice
+     300   400
+     600   700
+
+
+### <Operation> chip(const Index offset, const Index dim)
+
+A chip is a special kind of slice. It is the subtensor at the given offset in
+the dimension dim. The returned tensor has one fewer dimension than the input
+tensor: the dimension dim is removed.
+
+For example, a matrix chip would be either a row or a column of the input
+matrix.
+
+    Eigen::Tensor<int, 2> a(4, 3);
+    a.setValues({{0, 100, 200}, {300, 400, 500},
+                 {600, 700, 800}, {900, 1000, 1100}});
+    Eigen::Tensor<int, 1> row_3 = a.chip(2, 0);
+    Eigen::Tensor<int, 1> col_2 = a.chip(1, 1);
+    cout << "a" << endl << a << endl;
+    =>
+    a
+       0   100   200
+     300   400   500
+     600   700   800
+     900  1000  1100
+    cout << "row_3" << endl << row_3 << endl;
+    =>
+    row_3
+       600   700   800
+    cout << "col_2" << endl << col_2 << endl;
+    =>
+    col_2
+       100   400   700    1000
+
+It is possible to assign values to a tensor chip since the chip operation is a
+lvalue. For example:
+
+    Eigen::Tensor<int, 1> a(3);
+    a.setValues({{100, 200, 300}});
+    Eigen::Tensor<int, 2> b(2, 3);
+    b.setZero();
+    b.chip(0, 0) = a;
+    cout << "a" << endl << a << endl;
+    =>
+    a
+     100
+     200
+     300
+    cout << "b" << endl << b << endl;
+    =>
+    b
+       100   200   300
+         0     0     0
+
+
+### <Operation> reverse(const ReverseDimensions& reverse)
+
+Returns a view of the input tensor that reverses the order of the coefficients
+along a subset of the dimensions.  The argument reverse is an array of boolean
+values that indicates whether or not the order of the coefficients should be
+reversed along each of the dimensions.  This operation preserves the dimensions
+of the input tensor.
+
+For example this is what happens when you ```reverse()``` the first dimension
+of a 2D tensor:
+
+    Eigen::Tensor<int, 2> a(4, 3);
+    a.setValues({{0, 100, 200}, {300, 400, 500},
+                {600, 700, 800}, {900, 1000, 1100}});
+    Eigen::array<bool, 2> reverse({true, false});
+    Eigen::Tensor<int, 2> b = a.reverse(reverse);
+    cout << "a" << endl << a << endl << "b" << endl << b << endl;
+    =>
+    a
+       0   100   200
+     300   400   500
+     600   700   800
+     900  1000  1100
+    b
+     900  1000  1100
+     600   700   800
+     300   400   500
+       0   100   200
+
+
+### <Operation> broadcast(const Broadcast& broadcast)
+
+Returns a view of the input tensor in which the input is replicated one to many
+times.
+The broadcast argument specifies how many copies of the input tensor need to be
+made in each of the dimensions.
+
+    Eigen::Tensor<int, 2> a(2, 3);
+    a.setValues({{0, 100, 200}, {300, 400, 500}});
+    Eigen::array<int, 2> bcast({3, 2});
+    Eigen::Tensor<int, 2> b = a.broadcast(bcast);
+    cout << "a" << endl << a << endl << "b" << endl << b << endl;
+    =>
+    a
+       0   100   200
+     300   400   500
+    b
+       0   100   200    0   100   200
+     300   400   500  300   400   500
+       0   100   200    0   100   200
+     300   400   500  300   400   500
+       0   100   200    0   100   200
+     300   400   500  300   400   500
+
+### <Operation> concatenate(const OtherDerived& other, Axis axis)
+
+TODO
+
+### <Operation>  pad(const PaddingDimensions& padding)
+
+Returns a view of the input tensor in which the input is padded with zeros.
+
+    Eigen::Tensor<int, 2> a(2, 3);
+    a.setValues({{0, 100, 200}, {300, 400, 500}});
+    Eigen::array<pair<int, int>, 2> paddings;
+    paddings[0] = make_pair(0, 1);
+    paddings[1] = make_pair(2, 3);
+    Eigen::Tensor<int, 2> b = a.pad(paddings);
+    cout << "a" << endl << a << endl << "b" << endl << b << endl;
+    =>
+    a
+       0   100   200
+     300   400   500
+    b
+       0     0     0    0
+       0     0     0    0
+       0   100   200    0
+     300   400   500    0
+       0     0     0    0
+       0     0     0    0
+       0     0     0    0
+
+
+### <Operation>  extract_patches(const PatchDims& patch_dims)
+
+Returns a tensor of coefficient patches extracted from the input tensor, where
+each patch is of dimension specified by 'patch_dims'. The returned tensor has
+one greater dimension than the input tensor, which is used to index each patch.
+The patch index in the output tensor depends on the data layout of the input
+tensor: the patch index is the last dimension ColMajor layout, and the first
+dimension in RowMajor layout.
+
+For example, given the following input tensor:
+
+  Eigen::Tensor<float, 2, DataLayout> tensor(3,4);
+  tensor.setValues({{0.0f, 1.0f, 2.0f, 3.0f},
+                    {4.0f, 5.0f, 6.0f, 7.0f},
+                    {8.0f, 9.0f, 10.0f, 11.0f}});
+
+  cout << "tensor: " << endl << tensor << endl;
+=>
+tensor:
+ 0   1   2   3
+ 4   5   6   7
+ 8   9  10  11
+
+Six 2x2 patches can be extracted and indexed using the following code:
+
+  Eigen::Tensor<float, 3, DataLayout> patch;
+  Eigen::array<ptrdiff_t, 2> patch_dims;
+  patch_dims[0] = 2;
+  patch_dims[1] = 2;
+  patch = tensor.extract_patches(patch_dims);
+  for (int k = 0; k < 6; ++k) {
+    cout << "patch index: " << k << endl;
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 2; ++j) {
+        if (DataLayout == ColMajor) {
+          cout << patch(i, j, k) << " ";
+        } else {
+          cout << patch(k, i, j) << " ";
+        }
+      }
+      cout << endl;
+    }
+  }
+
+This code results in the following output when the data layout is ColMajor:
+
+patch index: 0
+0 1
+4 5
+patch index: 1
+4 5
+8 9
+patch index: 2
+1 2
+5 6
+patch index: 3
+5 6
+9 10
+patch index: 4
+2 3
+6 7
+patch index: 5
+6 7
+10 11
+
+This code results in the following output when the data layout is RowMajor:
+(NOTE: the set of patches is the same as in ColMajor, but are indexed differently).
+
+patch index: 0
+0 1
+4 5
+patch index: 1
+1 2
+5 6
+patch index: 2
+2 3
+6 7
+patch index: 3
+4 5
+8 9
+patch index: 4
+5 6
+9 10
+patch index: 5
+6 7
+10 11
+
+### <Operation>  extract_image_patches(const Index patch_rows, const Index patch_cols,
+                          const Index row_stride, const Index col_stride,
+                          const PaddingType padding_type)
+
+Returns a tensor of coefficient image patches extracted from the input tensor,
+which is expected to have dimensions ordered as follows (depending on the data
+layout of the input tensor, and the number of additional dimensions 'N'):
+
+*) ColMajor
+1st dimension: channels (of size d)
+2nd dimension: rows (of size r)
+3rd dimension: columns (of size c)
+4th-Nth dimension: time (for video) or batch (for bulk processing).
+
+*) RowMajor (reverse order of ColMajor)
+1st-Nth dimension: time (for video) or batch (for bulk processing).
+N+1'th dimension: columns (of size c)
+N+2'th dimension: rows (of size r)
+N+3'th dimension: channels (of size d)
+
+The returned tensor has one greater dimension than the input tensor, which is
+used to index each patch. The patch index in the output tensor depends on the
+data layout of the input tensor: the patch index is the 4'th dimension in
+ColMajor layout, and the 4'th from the last dimension in RowMajor layout.
+
+For example, given the following input tensor with the following dimension
+sizes:
+ *) depth:   2
+ *) rows:    3
+ *) columns: 5
+ *) batch:   7
+
+  Tensor<float, 4> tensor(2,3,5,7);
+  Tensor<float, 4, RowMajor> tensor_row_major = tensor.swap_layout();
+
+2x2 image patches can be extracted and indexed using the following code:
+
+*) 2D patch: ColMajor (patch indexed by second-to-last dimension)
+  Tensor<float, 5> twod_patch;
+  twod_patch = tensor.extract_image_patches<2, 2>();
+  // twod_patch.dimension(0) == 2
+  // twod_patch.dimension(1) == 2
+  // twod_patch.dimension(2) == 2
+  // twod_patch.dimension(3) == 3*5
+  // twod_patch.dimension(4) == 7
+
+*) 2D patch: RowMajor (patch indexed by the second dimension)
+  Tensor<float, 5, RowMajor> twod_patch_row_major;
+  twod_patch_row_major = tensor_row_major.extract_image_patches<2, 2>();
+  // twod_patch_row_major.dimension(0) == 7
+  // twod_patch_row_major.dimension(1) == 3*5
+  // twod_patch_row_major.dimension(2) == 2
+  // twod_patch_row_major.dimension(3) == 2
+  // twod_patch_row_major.dimension(4) == 2
+
+## Special Operations
+
+### <Operation> cast<T>()
+
+Returns a tensor of type T with the same dimensions as the original tensor.
+The returned tensor contains the values of the original tensor converted to
+type T.
+
+    Eigen::Tensor<float, 2> a(2, 3);
+    Eigen::Tensor<int, 2> b = a.cast<int>();
+
+This can be useful for example if you need to do element-wise division of
+Tensors of integers.  This is not currently supported by the Tensor library
+but you can easily cast the tensors to floats to do the division:
+
+    Eigen::Tensor<int, 2> a(2, 3);
+    a.setValues({{0, 1, 2}, {3, 4, 5}});
+    Eigen::Tensor<int, 2> b =
+        (a.cast<float>() / a.constant(2).cast<float>()).cast<int>();
+    cout << "a" << endl << a << endl << endl;
+    cout << "b" << endl << b << endl << endl;
+    =>
+    a
+    0 1 2
+    3 4 5
+
+    b
+    0 0 1
+    1 2 2
+
+
+### <Operation>     eval()
+
+TODO
+
+
+## Representation of scalar values
+
+Scalar values are often represented by tensors of size 1 and rank 1. It would be
+more logical and user friendly to use tensors of rank 0 instead. For example
+Tensor<T, N>::maximum() currently returns a Tensor<T, 1>. Similarly, the inner
+product of 2 1d tensors (through contractions) returns a 1d tensor. In the
+future these operations might be updated to return 0d tensors instead.
+
+## Limitations
+
+*   The number of tensor dimensions is currently limited to 250 when using a
+    compiler that supports cxx11. It is limited to only 5 for older compilers.
+*   The IndexList class requires a cxx11 compliant compiler. You can use an
+    array of indices instead if you don't have access to a modern compiler.
+*   On GPUs only floating point values are properly tested and optimized for.
+*   Complex and integer values are known to be broken on GPUs. If you try to use
+    them you'll most likely end up triggering a static assertion failure such as
+    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+
+
diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
new file mode 100644
index 000000000..1940a9692
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
@@ -0,0 +1,527 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_H
+#define EIGEN_CXX11_TENSOR_TENSOR_H
+
+namespace Eigen {
+
+/** \class Tensor
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief The tensor class.
+  *
+  * The %Tensor class is the work-horse for all \em dense tensors within Eigen.
+  *
+  * The %Tensor class encompasses only dynamic-size objects so far.
+  *
+  * The first two template parameters are required:
+  * \tparam Scalar_ \anchor tensor_tparam_scalar Numeric type, e.g. float, double, int or std::complex<float>.
+  *                 User defined scalar types are supported as well (see \ref user_defined_scalars "here").
+  * \tparam NumIndices_ Number of indices (i.e. rank of the tensor)
+  *
+  * The remaining template parameters are optional -- in most cases you don't have to worry about them.
+  * \tparam Options_ \anchor tensor_tparam_options A combination of either \b #RowMajor or \b #ColMajor, and of either
+  *                 \b #AutoAlign or \b #DontAlign.
+  *                 The former controls \ref TopicStorageOrders "storage order", and defaults to column-major. The latter controls alignment, which is required
+  *                 for vectorization. It defaults to aligning tensors. Note that tensors currently do not support any operations that profit from vectorization.
+  *                 Support for such operations (i.e. adding two tensors etc.) is planned.
+  *
+  * You can access elements of tensors using normal subscripting:
+  *
+  * \code
+  * Eigen::Tensor<double, 4> t(10, 10, 10, 10);
+  * t(0, 1, 2, 3) = 42.0;
+  * \endcode
+  *
+  * This class can be extended with the help of the plugin mechanism described on the page
+  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_TENSOR_PLUGIN.
+  *
+  * <i><b>Some notes:</b></i>
+  *
+  * <dl>
+  * <dt><b>Relation to other parts of Eigen:</b></dt>
+  * <dd>The midterm developement goal for this class is to have a similar hierarchy as Eigen uses for matrices, so that
+  * taking blocks or using tensors in expressions is easily possible, including an interface with the vector/matrix code
+  * by providing .asMatrix() and .asVector() (or similar) methods for rank 2 and 1 tensors. However, currently, the %Tensor
+  * class does not provide any of these features and is only available as a stand-alone class that just allows for
+  * coefficient access. Also, when fixed-size tensors are implemented, the number of template arguments is likely to
+  * change dramatically.</dd>
+  * </dl>
+  *
+  * \ref TopicStorageOrders
+  */
+
+template<typename Scalar_, int NumIndices_, int Options_, typename IndexType_>
+class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
+{
+  public:
+    typedef Tensor<Scalar_, NumIndices_, Options_, IndexType_> Self;
+    typedef TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexType_> > Base;
+    typedef typename Eigen::internal::nested<Self>::type Nested;
+    typedef typename internal::traits<Self>::StorageKind StorageKind;
+    typedef typename internal::traits<Self>::Index Index;
+    typedef Scalar_ Scalar;
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+    typedef typename Base::CoeffReturnType CoeffReturnType;
+
+    enum {
+      IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0) & !(Options_&DontAlign),
+      Layout = Options_ & RowMajor ? RowMajor : ColMajor,
+      CoordAccess = true,
+      RawAccess = true
+    };
+
+    static const int Options = Options_;
+    static const int NumIndices = NumIndices_;
+    typedef DSizes<Index, NumIndices_> Dimensions;
+
+  protected:
+    TensorStorage<Scalar, Dimensions, Options> m_storage;
+
+#ifdef EIGEN_HAS_SFINAE
+    template<typename CustomIndices>
+    struct isOfNormalIndex{
+      static const bool is_array = internal::is_base_of<array<Index, NumIndices>, CustomIndices>::value;
+      static const bool is_int = NumTraits<CustomIndices>::IsInteger;
+      static const bool value = is_array | is_int;
+    };
+#endif
+
+  public:
+    // Metadata
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index                         rank()                   const { return NumIndices; }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index                         dimension(std::size_t n) const { return m_storage.dimensions()[n]; }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions&             dimensions()             const { return m_storage.dimensions(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index                         size()                   const { return m_storage.size(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar                        *data()                        { return m_storage.data(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar                  *data()                  const { return m_storage.data(); }
+
+    // This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
+    // work, because that uses base().coeffRef() - and we don't yet
+    // implement a similar class hierarchy
+    inline Self& base()             { return *this; }
+    inline const Self& base() const { return *this; }
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+    template<typename... IndexTypes>
+    EIGEN_DEVICE_FUNC inline const Scalar& coeff(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const
+    {
+      // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
+      EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+      return coeff(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
+    }
+#endif
+
+    // normal indices
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(const array<Index, NumIndices>& indices) const
+    {
+      eigen_internal_assert(checkIndexRange(indices));
+      return m_storage.data()[linearizedIndex(indices)];
+    }
+
+    // custom indices
+#ifdef EIGEN_HAS_SFINAE
+    template<typename CustomIndices,
+             EIGEN_SFINAE_ENABLE_IF( !(isOfNormalIndex<CustomIndices>::value) )
+    >
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(CustomIndices& indices) const
+    {
+        return coeff(internal::customIndices2Array<Index,NumIndices>(indices));
+    }
+#endif
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff() const
+    {
+      EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+      return m_storage.data()[0];
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const
+    {
+      eigen_internal_assert(index >= 0 && index < size());
+      return m_storage.data()[index];
+    }
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+    template<typename... IndexTypes>
+    inline Scalar& coeffRef(Index firstIndex, Index secondIndex, IndexTypes... otherIndices)
+    {
+      // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
+      EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+      return coeffRef(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
+    }
+#endif
+
+    // normal indices
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array<Index, NumIndices>& indices)
+    {
+      eigen_internal_assert(checkIndexRange(indices));
+      return m_storage.data()[linearizedIndex(indices)];
+    }
+
+    // custom indices
+#ifdef EIGEN_HAS_SFINAE
+    template<typename CustomIndices,
+             EIGEN_SFINAE_ENABLE_IF( !(isOfNormalIndex<CustomIndices>::value) )
+             >
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(CustomIndices& indices)
+    {
+        return coeffRef(internal::customIndices2Array<Index,NumIndices>(indices));
+    }
+#endif
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef()
+    {
+      EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+      return m_storage.data()[0];
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
+    {
+      eigen_internal_assert(index >= 0 && index < size());
+      return m_storage.data()[index];
+    }
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+    template<typename... IndexTypes>
+    inline const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const
+    {
+      // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
+      EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+      return this->operator()(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
+    }
+#else
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const
+    {
+      return coeff(array<Index, 2>(i0, i1));
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const
+    {
+      return coeff(array<Index, 3>(i0, i1, i2));
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const
+    {
+      return coeff(array<Index, 4>(i0, i1, i2, i3));
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const
+    {
+      return coeff(array<Index, 5>(i0, i1, i2, i3, i4));
+    }
+#endif
+
+    // custom indices
+#ifdef EIGEN_HAS_SFINAE
+    template<typename CustomIndices,
+             EIGEN_SFINAE_ENABLE_IF( !(isOfNormalIndex<CustomIndices>::value) )
+    >
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(CustomIndices& indices) const
+    {
+        return coeff(internal::customIndices2Array<Index,NumIndices>(indices));
+    }
+#endif
+
+    // normal indices
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(const array<Index, NumIndices>& indices) const
+    {
+      return coeff(indices);
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const
+    {
+      eigen_internal_assert(index >= 0 && index < size());
+      return coeff(index);
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()() const
+    {
+      EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+      return coeff();
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator[](Index index) const
+    {
+      // The bracket operator is only for vectors, use the parenthesis operator instead.
+      EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE);
+      return coeff(index);
+    }
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+    template<typename... IndexTypes>
+    inline Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices)
+    {
+      // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
+      EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+      return operator()(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
+    }
+#else
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1)
+    {
+      return coeffRef(array<Index, 2>(i0, i1));
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2)
+    {
+      return coeffRef(array<Index, 3>(i0, i1, i2));
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3)
+    {
+      return coeffRef(array<Index, 4>(i0, i1, i2, i3));
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4)
+    {
+      return coeffRef(array<Index, 5>(i0, i1, i2, i3, i4));
+    }
+#endif
+
+    // normal indices
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(const array<Index, NumIndices>& indices)
+    {
+      return coeffRef(indices);
+    }
+
+    // custom indices
+#ifdef EIGEN_HAS_SFINAE
+    template<typename CustomIndices,
+             EIGEN_SFINAE_ENABLE_IF( !(isOfNormalIndex<CustomIndices>::value) )
+    >
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(CustomIndices& indices)
+    {
+      return coeffRef(internal::customIndices2Array<Index,NumIndices>(indices));
+    }
+#endif
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index index)
+    {
+      eigen_assert(index >= 0 && index < size());
+      return coeffRef(index);
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()()
+    {
+      EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+      return coeffRef();
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator[](Index index)
+    {
+      // The bracket operator is only for vectors, use the parenthesis operator instead
+      EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+      return coeffRef(index);
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Tensor()
+      : m_storage()
+    {
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Tensor(const Self& other)
+      : m_storage(other.m_storage)
+    {
+    }
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+    template<typename... IndexTypes>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index firstDimension, IndexTypes... otherDimensions)
+        : m_storage(firstDimension, otherDimensions...)
+    {
+      // The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
+      EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    }
+#else
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(Index dim1)
+      : m_storage(dim1, array<Index, 1>(dim1))
+    {
+      EIGEN_STATIC_ASSERT(1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2)
+      : m_storage(dim1*dim2, array<Index, 2>(dim1, dim2))
+    {
+      EIGEN_STATIC_ASSERT(2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3)
+      : m_storage(dim1*dim2*dim3, array<Index, 3>(dim1, dim2, dim3))
+    {
+      EIGEN_STATIC_ASSERT(3 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3, Index dim4)
+      : m_storage(dim1*dim2*dim3*dim4, array<Index, 4>(dim1, dim2, dim3, dim4))
+    {
+      EIGEN_STATIC_ASSERT(4 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3, Index dim4, Index dim5)
+      : m_storage(dim1*dim2*dim3*dim4*dim5, array<Index, 5>(dim1, dim2, dim3, dim4, dim5))
+    {
+      EIGEN_STATIC_ASSERT(5 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    }
+#endif
+
+    /** Normal Dimension */
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(const array<Index, NumIndices>& dimensions)
+        : m_storage(internal::array_prod(dimensions), dimensions)
+    {
+      EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
+    }
+
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Tensor(const TensorBase<OtherDerived, ReadOnlyAccessors>& other)
+    {
+      typedef TensorAssignOp<Tensor, const OtherDerived> Assign;
+      Assign assign(*this, other.derived());
+      resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+    }
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Tensor(const TensorBase<OtherDerived, WriteAccessors>& other)
+    {
+      typedef TensorAssignOp<Tensor, const OtherDerived> Assign;
+      Assign assign(*this, other.derived());
+      resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Tensor& operator=(const Tensor& other)
+    {
+      typedef TensorAssignOp<Tensor, const Tensor> Assign;
+      Assign assign(*this, other);
+      resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+      return *this;
+    }
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Tensor& operator=(const OtherDerived& other)
+    {
+      typedef TensorAssignOp<Tensor, const OtherDerived> Assign;
+      Assign assign(*this, other);
+      resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+      return *this;
+    }
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+    template<typename... IndexTypes> EIGEN_DEVICE_FUNC
+    void resize(Index firstDimension, IndexTypes... otherDimensions)
+    {
+      // The number of dimensions used to resize a tensor must be equal to the rank of the tensor.
+      EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+      resize(array<Index, NumIndices>{{firstDimension, otherDimensions...}});
+    }
+#endif
+
+    /** Normal Dimension */
+    EIGEN_DEVICE_FUNC void resize(const array<Index, NumIndices>& dimensions)
+    {
+      int i;
+      Index size = Index(1);
+      for (i = 0; i < NumIndices; i++) {
+        internal::check_rows_cols_for_overflow<Dynamic>::run(size, dimensions[i]);
+        size *= dimensions[i];
+      }
+      #ifdef EIGEN_INITIALIZE_COEFFS
+        bool size_changed = size != this->size();
+        m_storage.resize(size, dimensions);
+        if(size_changed) EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
+      #else
+        m_storage.resize(size, dimensions);
+      #endif
+    }
+
+    // Why this overload, DSizes is derived from array ??? //
+    EIGEN_DEVICE_FUNC void resize(const DSizes<Index, NumIndices>& dimensions) {
+      array<Index, NumIndices> dims;
+      for (int i = 0; i < NumIndices; ++i) {
+        dims[i] = dimensions[i];
+      }
+      resize(dims);
+    }
+
+    EIGEN_DEVICE_FUNC
+    void resize()
+    {
+      EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+      // Nothing to do: rank 0 tensors have fixed size
+    }
+
+    /** Custom Dimension */
+#ifdef EIGEN_HAS_SFINAE
+    template<typename CustomDimension,
+             EIGEN_SFINAE_ENABLE_IF( !(isOfNormalIndex<CustomDimension>::value) )
+    >
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resize(CustomDimension& dimensions)
+    {
+      resize(internal::customIndices2Array<Index,NumIndices>(dimensions));
+    }
+#endif
+
+#ifndef EIGEN_EMULATE_CXX11_META_H
+    template <typename std::ptrdiff_t... Indices>
+    EIGEN_DEVICE_FUNC
+    void resize(const Sizes<Indices...>& dimensions) {
+      array<Index, NumIndices> dims;
+      for (int i = 0; i < NumIndices; ++i) {
+        dims[i] = static_cast<Index>(dimensions[i]);
+      }
+      resize(dims);
+    }
+#else
+    template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5>
+    EIGEN_DEVICE_FUNC
+    void resize(const Sizes<V1, V2, V3, V4, V5>& dimensions) {
+      array<Index, NumIndices> dims;
+      for (int i = 0; i < NumIndices; ++i) {
+        dims[i] = static_cast<Index>(dimensions[i]);
+      }
+      resize(dims);
+    }
+#endif
+
+  protected:
+
+    bool checkIndexRange(const array<Index, NumIndices>& indices) const
+    {
+      using internal::array_apply_and_reduce;
+      using internal::array_zip_and_reduce;
+      using internal::greater_equal_zero_op;
+      using internal::logical_and_op;
+      using internal::lesser_op;
+
+      return
+        // check whether the indices are all >= 0
+        array_apply_and_reduce<logical_and_op, greater_equal_zero_op>(indices) &&
+        // check whether the indices fit in the dimensions
+        array_zip_and_reduce<logical_and_op, lesser_op>(indices, m_storage.dimensions());
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index linearizedIndex(const array<Index, NumIndices>& indices) const
+    {
+      if (Options&RowMajor) {
+        return m_storage.dimensions().IndexOfRowMajor(indices);
+      } else {
+        return m_storage.dimensions().IndexOfColMajor(indices);
+      }
+    }
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
new file mode 100644
index 000000000..d06f40cd8
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
@@ -0,0 +1,299 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Eugene Brevdo <ebrevdo@gmail.com>
+//                    Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H
+#define EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H
+
+namespace Eigen {
+namespace internal {
+
+/** \class TensorIndexTuple
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief Tensor + Index Tuple class.
+  *
+  *
+  */
+template<typename XprType>
+struct traits<TensorIndexTupleOp<XprType> > : public traits<XprType>
+{
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef Tuple<Index, typename XprTraits::Scalar> Scalar;
+  typedef typename XprType::Nested Nested;
+  typedef typename remove_reference<Nested>::type _Nested;
+  static const int NumDimensions = XprTraits::NumDimensions;
+  static const int Layout = XprTraits::Layout;
+};
+
+template<typename XprType>
+struct eval<TensorIndexTupleOp<XprType>, Eigen::Dense>
+{
+  typedef const TensorIndexTupleOp<XprType>& type;
+};
+
+template<typename XprType>
+struct nested<TensorIndexTupleOp<XprType>, 1,
+              typename eval<TensorIndexTupleOp<XprType> >::type>
+{
+  typedef TensorIndexTupleOp<XprType> type;
+};
+
+}  // end namespace internal
+
+template<typename XprType>
+class TensorIndexTupleOp : public TensorBase<TensorIndexTupleOp<XprType>, ReadOnlyAccessors>
+{
+  public:
+  typedef typename Eigen::internal::traits<TensorIndexTupleOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename Eigen::internal::nested<TensorIndexTupleOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorIndexTupleOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorIndexTupleOp>::Index Index;
+  typedef Tuple<Index, typename XprType::CoeffReturnType> CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIndexTupleOp(const XprType& expr)
+      : m_xpr(expr) {}
+
+  EIGEN_DEVICE_FUNC
+  const typename internal::remove_all<typename XprType::Nested>::type&
+  expression() const { return m_xpr; }
+
+  protected:
+    typename XprType::Nested m_xpr;
+};
+
+// Eval as rvalue
+template<typename ArgType, typename Device>
+struct TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device>
+{
+  typedef TensorIndexTupleOp<ArgType> XprType;
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
+  static const int NumDims = internal::array_size<Dimensions>::value;
+
+  enum {
+    IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
+    PacketAccess = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false,
+    BlockAccess = false,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device) { }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
+    return m_impl.dimensions();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+    m_impl.cleanup();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+  {
+    return CoeffReturnType(index, m_impl.coeff(index));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+  costPerCoeff(bool vectorized) const {
+    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, 1);
+  }
+
+  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+
+ protected:
+  TensorEvaluator<ArgType, Device> m_impl;
+};
+
+namespace internal {
+
+/** \class TensorTupleIndex
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief Converts to Tensor<Tuple<Index, Scalar> > and reduces to Tensor<Index>.
+  *
+  */
+template<typename ReduceOp, typename Dims, typename XprType>
+struct traits<TensorTupleReducerOp<ReduceOp, Dims, XprType> > : public traits<XprType>
+{
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef Index Scalar;
+  typedef typename XprType::Nested Nested;
+  typedef typename remove_reference<Nested>::type _Nested;
+  static const int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value;
+  static const int Layout = XprTraits::Layout;
+};
+
+template<typename ReduceOp, typename Dims, typename XprType>
+struct eval<TensorTupleReducerOp<ReduceOp, Dims, XprType>, Eigen::Dense>
+{
+  typedef const TensorTupleReducerOp<ReduceOp, Dims, XprType>& type;
+};
+
+template<typename ReduceOp, typename Dims, typename XprType>
+struct nested<TensorTupleReducerOp<ReduceOp, Dims, XprType>, 1,
+              typename eval<TensorTupleReducerOp<ReduceOp, Dims, XprType> >::type>
+{
+  typedef TensorTupleReducerOp<ReduceOp, Dims, XprType> type;
+};
+
+}  // end namespace internal
+
+template<typename ReduceOp, typename Dims, typename XprType>
+class TensorTupleReducerOp : public TensorBase<TensorTupleReducerOp<ReduceOp, Dims, XprType>, ReadOnlyAccessors>
+{
+  public:
+  typedef typename Eigen::internal::traits<TensorTupleReducerOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename Eigen::internal::nested<TensorTupleReducerOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorTupleReducerOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorTupleReducerOp>::Index Index;
+  typedef Index CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTupleReducerOp(const XprType& expr,
+                                                          const ReduceOp& reduce_op,
+                                                          const int return_dim,
+                                                          const Dims& reduce_dims)
+      : m_xpr(expr), m_reduce_op(reduce_op), m_return_dim(return_dim), m_reduce_dims(reduce_dims) {}
+
+  EIGEN_DEVICE_FUNC
+  const typename internal::remove_all<typename XprType::Nested>::type&
+  expression() const { return m_xpr; }
+
+  EIGEN_DEVICE_FUNC
+  const ReduceOp& reduce_op() const { return m_reduce_op; }
+
+  EIGEN_DEVICE_FUNC
+  const Dims& reduce_dims() const { return m_reduce_dims; }
+
+  EIGEN_DEVICE_FUNC
+  int return_dim() const { return m_return_dim; }
+
+  protected:
+    typename XprType::Nested m_xpr;
+    const ReduceOp m_reduce_op;
+    const int m_return_dim;
+    const Dims m_reduce_dims;
+};
+
+// Eval as rvalue
+template<typename ReduceOp, typename Dims, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Device>
+{
+  typedef TensorTupleReducerOp<ReduceOp, Dims, ArgType> XprType;
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename TensorIndexTupleOp<ArgType>::CoeffReturnType TupleType;
+  typedef typename TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexTupleOp<ArgType> >, Device>::Dimensions Dimensions;
+  typedef typename TensorEvaluator<const TensorIndexTupleOp<ArgType> , Device>::Dimensions InputDimensions;
+  static const int NumDims = internal::array_size<InputDimensions>::value;
+  typedef array<Index, NumDims> StrideDims;
+
+  enum {
+    IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
+    PacketAccess = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false,
+    BlockAccess = false,
+    Layout = TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexTupleOp<ArgType> >, Device>::Layout,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_orig_impl(op.expression(), device),
+        m_impl(op.expression().index_tuples().reduce(op.reduce_dims(), op.reduce_op()), device),
+        m_return_dim(op.return_dim()) {
+
+    gen_strides(m_orig_impl.dimensions(), m_strides);
+    if (Layout == static_cast<int>(ColMajor)) {
+      const Index total_size = internal::array_prod(m_orig_impl.dimensions());
+      m_stride_mod = (m_return_dim < NumDims - 1) ? m_strides[m_return_dim + 1] : total_size;
+    } else {
+      const Index total_size = internal::array_prod(m_orig_impl.dimensions());
+      m_stride_mod = (m_return_dim > 0) ? m_strides[m_return_dim - 1] : total_size;
+    }
+    m_stride_div = m_strides[m_return_dim];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
+    return m_impl.dimensions();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+    m_impl.cleanup();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    const TupleType v = m_impl.coeff(index);
+    return (m_return_dim < 0) ? v.first : (v.first % m_stride_mod) / m_stride_div;
+  }
+
+  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+  costPerCoeff(bool vectorized) const {
+    const double compute_cost = 1.0 +
+        (m_return_dim < 0 ? 0.0 : (TensorOpCost::ModCost<Index>() + TensorOpCost::DivCost<Index>()));
+    return m_orig_impl.costPerCoeff(vectorized) +
+           m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost);
+  }
+
+ private:
+  EIGEN_DEVICE_FUNC void gen_strides(const InputDimensions& dims, StrideDims& strides) {
+    if (m_return_dim < 0) {
+      return;  // Won't be using the strides.
+    }
+    eigen_assert(m_return_dim < NumDims &&
+                 "Asking to convert index to a dimension outside of the rank");
+
+    // Calculate m_stride_div and m_stride_mod, which are used to
+    // calculate the value of an index w.r.t. the m_return_dim.
+    if (Layout == static_cast<int>(ColMajor)) {
+      strides[0] = 1;
+      for (int i = 1; i < NumDims; ++i) {
+        strides[i] = strides[i-1] * dims[i-1];
+      }
+    } else {
+      strides[NumDims-1] = 1;
+      for (int i = NumDims - 2; i >= 0; --i) {
+        strides[i] = strides[i+1] * dims[i+1];
+      }
+    }
+  }
+
+ protected:
+  TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device> m_orig_impl;
+  TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexTupleOp<ArgType> >, Device> m_impl;
+  const int m_return_dim;
+  StrideDims m_strides;
+  Index m_stride_mod;
+  Index m_stride_div;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
new file mode 100644
index 000000000..166be200c
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
@@ -0,0 +1,181 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H
+#define EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H
+
+namespace Eigen {
+
+/** \class TensorAssign
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief The tensor assignment class.
+  *
+  * This class is represents the assignment of the values resulting from the evaluation of
+  * the rhs expression to the memory locations denoted by the lhs expression.
+  */
+namespace internal {
+template<typename LhsXprType, typename RhsXprType>
+struct traits<TensorAssignOp<LhsXprType, RhsXprType> >
+{
+  typedef typename LhsXprType::Scalar Scalar;
+  typedef typename traits<LhsXprType>::StorageKind StorageKind;
+  typedef typename promote_index_type<typename traits<LhsXprType>::Index,
+                                      typename traits<RhsXprType>::Index>::type Index;
+  typedef typename LhsXprType::Nested LhsNested;
+  typedef typename RhsXprType::Nested RhsNested;
+  typedef typename remove_reference<LhsNested>::type _LhsNested;
+  typedef typename remove_reference<RhsNested>::type _RhsNested;
+  static const std::size_t NumDimensions = internal::traits<LhsXprType>::NumDimensions;
+  static const int Layout = internal::traits<LhsXprType>::Layout;
+
+  enum {
+    Flags = 0
+  };
+};
+
+template<typename LhsXprType, typename RhsXprType>
+struct eval<TensorAssignOp<LhsXprType, RhsXprType>, Eigen::Dense>
+{
+  typedef const TensorAssignOp<LhsXprType, RhsXprType>& type;
+};
+
+template<typename LhsXprType, typename RhsXprType>
+struct nested<TensorAssignOp<LhsXprType, RhsXprType>, 1, typename eval<TensorAssignOp<LhsXprType, RhsXprType> >::type>
+{
+  typedef TensorAssignOp<LhsXprType, RhsXprType> type;
+};
+
+}  // end namespace internal
+
+
+
+template<typename LhsXprType, typename RhsXprType>
+class TensorAssignOp : public TensorBase<TensorAssignOp<LhsXprType, RhsXprType> >
+{
+  public:
+  typedef typename Eigen::internal::traits<TensorAssignOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename LhsXprType::CoeffReturnType CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorAssignOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorAssignOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorAssignOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorAssignOp(LhsXprType& lhs, const RhsXprType& rhs)
+      : m_lhs_xpr(lhs), m_rhs_xpr(rhs) {}
+
+    /** \returns the nested expressions */
+    EIGEN_DEVICE_FUNC
+    typename internal::remove_all<typename LhsXprType::Nested>::type&
+    lhsExpression() const { return *((typename internal::remove_all<typename LhsXprType::Nested>::type*)&m_lhs_xpr); }
+
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename RhsXprType::Nested>::type&
+    rhsExpression() const { return m_rhs_xpr; }
+
+  protected:
+    typename internal::remove_all<typename LhsXprType::Nested>::type& m_lhs_xpr;
+    const typename internal::remove_all<typename RhsXprType::Nested>::type& m_rhs_xpr;
+};
+
+
+template<typename LeftArgType, typename RightArgType, typename Device>
+struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
+{
+  typedef TensorAssignOp<LeftArgType, RightArgType> XprType;
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef typename TensorEvaluator<RightArgType, Device>::Dimensions Dimensions;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+
+  enum {
+    IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned & TensorEvaluator<RightArgType, Device>::IsAligned,
+    PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
+    Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+    RawAccess = TensorEvaluator<LeftArgType, Device>::RawAccess
+  };
+
+  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
+      m_leftImpl(op.lhsExpression(), device),
+      m_rightImpl(op.rhsExpression(), device)
+  {
+    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  }
+
+  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
+  {
+    // The dimensions of the lhs and the rhs tensors should be equal to prevent
+    // overflows and ensure the result is fully initialized.
+    // TODO: use left impl instead if right impl dimensions are known at compile time.
+    return m_rightImpl.dimensions();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+    eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions()));
+    m_leftImpl.evalSubExprsIfNeeded(NULL);
+    // If the lhs provides raw access to its storage area (i.e. if m_leftImpl.data() returns a non
+    // null value), attempt to evaluate the rhs expression in place. Returns true iff in place
+    // evaluation isn't supported and the caller still needs to manually assign the values generated
+    // by the rhs to the lhs.
+    return m_rightImpl.evalSubExprsIfNeeded(m_leftImpl.data());
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+    m_leftImpl.cleanup();
+    m_rightImpl.cleanup();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) {
+    m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) {
+    const int LhsStoreMode = TensorEvaluator<LeftArgType, Device>::IsAligned ? Aligned : Unaligned;
+    const int RhsLoadMode = TensorEvaluator<RightArgType, Device>::IsAligned ? Aligned : Unaligned;
+    m_leftImpl.template writePacket<LhsStoreMode>(i, m_rightImpl.template packet<RhsLoadMode>(i));
+  }
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+  {
+    return m_leftImpl.coeff(index);
+  }
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const
+  {
+    return m_leftImpl.template packet<LoadMode>(index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+  costPerCoeff(bool vectorized) const {
+    // We assume that evalPacket or evalScalar is called to perform the
+    // assignment and account for the cost of the write here, but reduce left
+    // cost by one load because we are using m_leftImpl.coeffRef.
+    TensorOpCost left = m_leftImpl.costPerCoeff(vectorized);
+    return m_rightImpl.costPerCoeff(vectorized) +
+           TensorOpCost(
+               numext::maxi(0.0, left.bytes_loaded() - sizeof(CoeffReturnType)),
+               left.bytes_stored(), left.compute_cycles()) +
+           TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize);
+  }
+
+  /// required by sycl in order to extract the accessor
+  const TensorEvaluator<LeftArgType, Device>& left_impl() const { return m_leftImpl; }
+  /// required by sycl in order to extract the accessor
+  const TensorEvaluator<RightArgType, Device>& right_impl() const { return m_rightImpl; }
+
+  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_leftImpl.data(); }
+
+ private:
+  TensorEvaluator<LeftArgType, Device> m_leftImpl;
+  TensorEvaluator<RightArgType, Device> m_rightImpl;
+};
+
+}
+
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
new file mode 100644
index 000000000..7a45a5cf4
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -0,0 +1,1010 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_BASE_H
+#define EIGEN_CXX11_TENSOR_TENSOR_BASE_H
+
+// clang-format off
+
+namespace Eigen {
+
+/** \class TensorBase
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief The tensor base class.
+  *
+  * This class is the common parent of the Tensor and TensorMap class, thus
+  * making it possible to use either class interchangably in expressions.
+  */
+
+template<typename Derived>
+class TensorBase<Derived, ReadOnlyAccessors>
+{
+  public:
+    typedef internal::traits<Derived> DerivedTraits;
+    typedef typename DerivedTraits::Scalar Scalar;
+    typedef typename DerivedTraits::Index Index;
+    typedef typename internal::remove_const<Scalar>::type CoeffReturnType;
+    static const int NumDimensions = DerivedTraits::NumDimensions;
+
+    // Generic nullary operation support.
+    template <typename CustomNullaryOp> EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseNullaryOp<CustomNullaryOp, const Derived>
+    nullaryExpr(const CustomNullaryOp& func) const {
+      return TensorCwiseNullaryOp<CustomNullaryOp, const Derived>(derived(), func);
+    }
+
+    // Coefficient-wise nullary operators
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived>
+    constant(const Scalar& value) const {
+      return nullaryExpr(internal::scalar_constant_op<Scalar>(value));
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseNullaryOp<internal::UniformRandomGenerator<Scalar>, const Derived>
+    random() const {
+      return nullaryExpr(internal::UniformRandomGenerator<Scalar>());
+    }
+    template <typename RandomGenerator> EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseNullaryOp<RandomGenerator, const Derived>
+    random(const RandomGenerator& gen = RandomGenerator()) const {
+      return nullaryExpr(gen);
+    }
+
+    // Tensor generation
+    template <typename Generator> EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorGeneratorOp<Generator, const Derived>
+    generate(const Generator& generator) const {
+      return TensorGeneratorOp<Generator, const Derived>(derived(), generator);
+    }
+
+    // Generic unary operation support.
+    template <typename CustomUnaryOp> EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<CustomUnaryOp, const Derived>
+    unaryExpr(const CustomUnaryOp& func) const {
+      return TensorCwiseUnaryOp<CustomUnaryOp, const Derived>(derived(), func);
+    }
+
+    // Coefficient-wise unary operators
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_opposite_op<Scalar>, const Derived>
+    operator-() const {
+      return unaryExpr(internal::scalar_opposite_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sqrt_op<Scalar>, const Derived>
+    sqrt() const {
+      return unaryExpr(internal::scalar_sqrt_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sign_op<Scalar>, const Derived>
+    sign() const {
+      return unaryExpr(internal::scalar_sign_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_rsqrt_op<Scalar>, const Derived>
+    rsqrt() const {
+      return unaryExpr(internal::scalar_rsqrt_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_square_op<Scalar>, const Derived>
+    square() const {
+      return unaryExpr(internal::scalar_square_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_cube_op<Scalar>, const Derived>
+    cube() const {
+      return unaryExpr(internal::scalar_cube_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const Derived>
+    inverse() const {
+      return unaryExpr(internal::scalar_inverse_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_tanh_op<Scalar>, const Derived>
+    tanh() const {
+      return unaryExpr(internal::scalar_tanh_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_lgamma_op<Scalar>, const Derived>
+    lgamma() const {
+      return unaryExpr(internal::scalar_lgamma_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_digamma_op<Scalar>, const Derived>
+    digamma() const {
+      return unaryExpr(internal::scalar_digamma_op<Scalar>());
+    }
+
+    // igamma(a = this, x = other)
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_igamma_op<Scalar>, const Derived, const OtherDerived>
+    igamma(const OtherDerived& other) const {
+      return binaryExpr(other.derived(), internal::scalar_igamma_op<Scalar>());
+    }
+
+    // igammac(a = this, x = other)
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_igammac_op<Scalar>, const Derived, const OtherDerived>
+    igammac(const OtherDerived& other) const {
+      return binaryExpr(other.derived(), internal::scalar_igammac_op<Scalar>());
+    }
+
+    // zeta(x = this, q = other)
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_zeta_op<Scalar>, const Derived, const OtherDerived>
+    zeta(const OtherDerived& other) const {
+      return binaryExpr(other.derived(), internal::scalar_zeta_op<Scalar>());
+    }
+
+    // polygamma(n = this, x = other)
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_polygamma_op<Scalar>, const Derived, const OtherDerived>
+    polygamma(const OtherDerived& other) const {
+      return binaryExpr(other.derived(), internal::scalar_polygamma_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_erf_op<Scalar>, const Derived>
+    erf() const {
+      return unaryExpr(internal::scalar_erf_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_erfc_op<Scalar>, const Derived>
+    erfc() const {
+      return unaryExpr(internal::scalar_erfc_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sigmoid_op<Scalar>, const Derived>
+    sigmoid() const {
+      return unaryExpr(internal::scalar_sigmoid_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_exp_op<Scalar>, const Derived>
+    exp() const {
+      return unaryExpr(internal::scalar_exp_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_log_op<Scalar>, const Derived>
+    log() const {
+      return unaryExpr(internal::scalar_log_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_log1p_op<Scalar>, const Derived>
+    log1p() const {
+      return unaryExpr(internal::scalar_log1p_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived>
+    abs() const {
+      return unaryExpr(internal::scalar_abs_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, const Derived>
+    conjugate() const {
+      return unaryExpr(internal::scalar_conjugate_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_pow_op<Scalar,Scalar> >, const Derived>
+    pow(Scalar exponent) const {
+      return unaryExpr(internal::bind2nd_op<internal::scalar_pow_op<Scalar,Scalar> >(exponent));
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_real_op<Scalar>, const Derived>
+    real() const {
+      return unaryExpr(internal::scalar_real_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_imag_op<Scalar>, const Derived>
+    imag() const {
+      return unaryExpr(internal::scalar_imag_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_sum_op<Scalar,Scalar> >, const Derived>
+    operator+ (Scalar rhs) const {
+      return unaryExpr(internal::bind2nd_op<internal::scalar_sum_op<Scalar,Scalar> >(rhs));
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE friend
+    const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_sum_op<Scalar> >, const Derived>
+    operator+ (Scalar lhs, const Derived& rhs) {
+      return rhs.unaryExpr(internal::bind1st_op<internal::scalar_sum_op<Scalar> >(lhs));
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_difference_op<Scalar,Scalar> >, const Derived>
+    operator- (Scalar rhs) const {
+      EIGEN_STATIC_ASSERT((NumTraits<Scalar>::IsSigned || internal::is_same<Scalar, const std::complex<float> >::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
+      return unaryExpr(internal::bind2nd_op<internal::scalar_difference_op<Scalar,Scalar> >(rhs));
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE friend
+    const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_difference_op<Scalar> >, const Derived>
+    operator- (Scalar lhs, const Derived& rhs) {
+      return rhs.unaryExpr(internal::bind1st_op<internal::scalar_difference_op<Scalar> >(lhs));
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_product_op<Scalar,Scalar> >, const Derived>
+    operator* (Scalar rhs) const {
+      return unaryExpr(internal::bind2nd_op<internal::scalar_product_op<Scalar,Scalar> >(rhs));
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE friend
+    const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_product_op<Scalar> >, const Derived>
+    operator* (Scalar lhs, const Derived& rhs) {
+      return rhs.unaryExpr(internal::bind1st_op<internal::scalar_product_op<Scalar> >(lhs));
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_quotient_op<Scalar,Scalar> >, const Derived>
+    operator/ (Scalar rhs) const {
+      return unaryExpr(internal::bind2nd_op<internal::scalar_quotient_op<Scalar,Scalar> >(rhs));
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE friend
+    const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_quotient_op<Scalar> >, const Derived>
+    operator/ (Scalar lhs, const Derived& rhs) {
+      return rhs.unaryExpr(internal::bind1st_op<internal::scalar_quotient_op<Scalar> >(lhs));
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_mod_op<Scalar>, const Derived>
+    operator% (Scalar rhs) const {
+      EIGEN_STATIC_ASSERT(NumTraits<Scalar>::IsInteger, YOU_MADE_A_PROGRAMMING_MISTAKE_TRY_MOD);
+      return unaryExpr(internal::scalar_mod_op<Scalar>(rhs));
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_max_op<Scalar>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+    cwiseMax(Scalar threshold) const {
+      return cwiseMax(constant(threshold));
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_min_op<Scalar>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+    cwiseMin(Scalar threshold) const {
+      return cwiseMin(constant(threshold));
+    }
+
+    template <typename NewType> EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorConversionOp<NewType, const Derived>
+    cast() const {
+      return TensorConversionOp<NewType, const Derived>(derived());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_round_op<Scalar>, const Derived>
+    round() const {
+      return unaryExpr(internal::scalar_round_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_ceil_op<Scalar>, const Derived>
+    ceil() const {
+      return unaryExpr(internal::scalar_ceil_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_floor_op<Scalar>, const Derived>
+    floor() const {
+      return unaryExpr(internal::scalar_floor_op<Scalar>());
+    }
+
+    // Generic binary operation support.
+    template <typename CustomBinaryOp, typename OtherDerived> EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<CustomBinaryOp, const Derived, const OtherDerived>
+    binaryExpr(const OtherDerived& other, const CustomBinaryOp& func) const {
+      return TensorCwiseBinaryOp<CustomBinaryOp, const Derived, const OtherDerived>(derived(), other, func);
+    }
+
+    // Coefficient-wise binary operators.
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_sum_op<Scalar>, const Derived, const OtherDerived>
+    operator+(const OtherDerived& other) const {
+      return binaryExpr(other.derived(), internal::scalar_sum_op<Scalar>());
+    }
+
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_difference_op<Scalar>, const Derived, const OtherDerived>
+    operator-(const OtherDerived& other) const {
+      return binaryExpr(other.derived(), internal::scalar_difference_op<Scalar>());
+    }
+
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_product_op<Scalar>, const Derived, const OtherDerived>
+    operator*(const OtherDerived& other) const {
+      return binaryExpr(other.derived(), internal::scalar_product_op<Scalar>());
+    }
+
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_quotient_op<Scalar>, const Derived, const OtherDerived>
+    operator/(const OtherDerived& other) const {
+      return binaryExpr(other.derived(), internal::scalar_quotient_op<Scalar>());
+    }
+
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_max_op<Scalar>, const Derived, const OtherDerived>
+    cwiseMax(const OtherDerived& other) const {
+      return binaryExpr(other.derived(), internal::scalar_max_op<Scalar>());
+    }
+
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_min_op<Scalar>, const Derived, const OtherDerived>
+    cwiseMin(const OtherDerived& other) const {
+      return binaryExpr(other.derived(), internal::scalar_min_op<Scalar>());
+    }
+
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_boolean_and_op, const Derived, const OtherDerived>
+    operator&&(const OtherDerived& other) const {
+      return binaryExpr(other.derived(), internal::scalar_boolean_and_op());
+    }
+
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_boolean_or_op, const Derived, const OtherDerived>
+    operator||(const OtherDerived& other) const {
+      return binaryExpr(other.derived(), internal::scalar_boolean_or_op());
+    }
+
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_boolean_xor_op, const Derived, const OtherDerived>
+    operator^(const OtherDerived& other) const {
+      return binaryExpr(other.derived(), internal::scalar_boolean_xor_op());
+    }
+
+    // Comparisons and tests.
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LT>, const Derived, const OtherDerived>
+    operator<(const OtherDerived& other) const {
+      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LT>());
+    }
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LE>, const Derived, const OtherDerived>
+    operator<=(const OtherDerived& other) const {
+      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LE>());
+    }
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GT>, const Derived, const OtherDerived>
+    operator>(const OtherDerived& other) const {
+      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GT>());
+    }
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE>, const Derived, const OtherDerived>
+    operator>=(const OtherDerived& other) const {
+      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE>());
+    }
+
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_EQ>, const Derived, const OtherDerived>
+    operator==(const OtherDerived& other) const {
+      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_EQ>());
+    }
+
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_NEQ>, const Derived, const OtherDerived>
+    operator!=(const OtherDerived& other) const {
+      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_NEQ>());
+    }
+
+    // comparisons and tests for Scalars
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LT>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+    operator<(Scalar threshold) const {
+      return operator<(constant(threshold));
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LE>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+    operator<=(Scalar threshold) const {
+      return operator<=(constant(threshold));
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GT>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+    operator>(Scalar threshold) const {
+      return operator>(constant(threshold));
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+    operator>=(Scalar threshold) const {
+      return operator>=(constant(threshold));
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_EQ>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+    operator==(Scalar threshold) const {
+      return operator==(constant(threshold));
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_NEQ>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+    operator!=(Scalar threshold) const {
+      return operator!=(constant(threshold));
+    }
+
+    // Checks
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_isnan_op<Scalar>, const Derived>
+    (isnan)() const {
+      return unaryExpr(internal::scalar_isnan_op<Scalar>());
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_isinf_op<Scalar>, const Derived>
+    (isinf)() const {
+      return unaryExpr(internal::scalar_isinf_op<Scalar>());
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_isfinite_op<Scalar>, const Derived>
+    (isfinite)() const {
+      return unaryExpr(internal::scalar_isfinite_op<Scalar>());
+    }
+
+    // Coefficient-wise ternary operators.
+    template<typename ThenDerived, typename ElseDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorSelectOp<const Derived, const ThenDerived, const ElseDerived>
+    select(const ThenDerived& thenTensor, const ElseDerived& elseTensor) const {
+      return TensorSelectOp<const Derived, const ThenDerived, const ElseDerived>(derived(), thenTensor.derived(), elseTensor.derived());
+    }
+
+    // Contractions.
+    typedef Eigen::IndexPair<Index> DimensionPair;
+
+    template<typename OtherDerived, typename Dimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorContractionOp<const Dimensions, const Derived, const OtherDerived>
+    contract(const OtherDerived& other, const Dimensions& dims) const {
+      return TensorContractionOp<const Dimensions, const Derived, const OtherDerived>(derived(), other.derived(), dims);
+    }
+
+    // Convolutions.
+    template<typename KernelDerived, typename Dimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorConvolutionOp<const Dimensions, const Derived, const KernelDerived>
+    convolve(const KernelDerived& kernel, const Dimensions& dims) const {
+      return TensorConvolutionOp<const Dimensions, const Derived, const KernelDerived>(derived(), kernel.derived(), dims);
+    }
+
+    // Fourier transforms
+    template <int FFTDataType, int FFTDirection, typename FFT> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorFFTOp<const FFT, const Derived, FFTDataType, FFTDirection>
+    fft(const FFT& fft) const {
+      return TensorFFTOp<const FFT, const Derived, FFTDataType, FFTDirection>(derived(), fft);
+    }
+
+    // Scan.
+    typedef TensorScanOp<internal::SumReducer<CoeffReturnType>, const Derived> TensorScanSumOp;
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorScanSumOp
+    cumsum(const Index& axis, bool exclusive = false) const {
+      return TensorScanSumOp(derived(), axis, exclusive);
+    }
+
+    typedef TensorScanOp<internal::ProdReducer<CoeffReturnType>, const Derived> TensorScanProdOp;
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorScanProdOp
+    cumprod(const Index& axis, bool exclusive = false) const {
+      return TensorScanProdOp(derived(), axis, exclusive);
+    }
+
+    template <typename Reducer>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorScanOp<Reducer, const Derived>
+    scan(const Index& axis, const Reducer& reducer, bool exclusive = false) const {
+      return TensorScanOp<Reducer, const Derived>(derived(), axis, exclusive, reducer);
+    }
+
+    // Reductions.
+    template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorReductionOp<internal::SumReducer<CoeffReturnType>, const Dims, const Derived>
+    sum(const Dims& dims) const {
+      return TensorReductionOp<internal::SumReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::SumReducer<CoeffReturnType>());
+    }
+
+    const TensorReductionOp<internal::SumReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>
+    sum() const {
+      DimensionList<Index, NumDimensions> in_dims;
+      return TensorReductionOp<internal::SumReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::SumReducer<CoeffReturnType>());
+    }
+
+    template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorReductionOp<internal::MeanReducer<CoeffReturnType>, const Dims, const Derived>
+    mean(const Dims& dims) const {
+      return TensorReductionOp<internal::MeanReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::MeanReducer<CoeffReturnType>());
+    }
+
+    const TensorReductionOp<internal::MeanReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>
+    mean() const {
+      DimensionList<Index, NumDimensions> in_dims;
+      return TensorReductionOp<internal::MeanReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MeanReducer<CoeffReturnType>());
+    }
+
+    template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const Dims, const Derived>
+    prod(const Dims& dims) const {
+      return TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::ProdReducer<CoeffReturnType>());
+    }
+
+    const TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>
+    prod() const {
+      DimensionList<Index, NumDimensions> in_dims;
+      return TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::ProdReducer<CoeffReturnType>());
+    }
+
+    template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorReductionOp<internal::MaxReducer<CoeffReturnType>, const Dims, const Derived>
+    maximum(const Dims& dims) const {
+      return TensorReductionOp<internal::MaxReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::MaxReducer<CoeffReturnType>());
+    }
+
+    const TensorReductionOp<internal::MaxReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>
+    maximum() const {
+      DimensionList<Index, NumDimensions> in_dims;
+      return TensorReductionOp<internal::MaxReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MaxReducer<CoeffReturnType>());
+    }
+
+    template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorReductionOp<internal::MinReducer<CoeffReturnType>, const Dims, const Derived>
+    minimum(const Dims& dims) const {
+      return TensorReductionOp<internal::MinReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::MinReducer<CoeffReturnType>());
+    }
+
+    const TensorReductionOp<internal::MinReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>
+    minimum() const {
+      DimensionList<Index, NumDimensions> in_dims;
+      return TensorReductionOp<internal::MinReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MinReducer<CoeffReturnType>());
+    }
+
+    template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorReductionOp<internal::AndReducer, const Dims, const TensorConversionOp<bool, const Derived> >
+    all(const Dims& dims) const {
+      return cast<bool>().reduce(dims, internal::AndReducer());
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorReductionOp<internal::AndReducer, const DimensionList<Index, NumDimensions>, const TensorConversionOp<bool, const Derived> >
+    all() const {
+      DimensionList<Index, NumDimensions> in_dims;
+      return cast<bool>().reduce(in_dims, internal::AndReducer());
+    }
+
+    template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorReductionOp<internal::OrReducer, const Dims, const TensorConversionOp<bool, const Derived> >
+    any(const Dims& dims) const {
+      return cast<bool>().reduce(dims, internal::OrReducer());
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorReductionOp<internal::OrReducer, const DimensionList<Index, NumDimensions>, const TensorConversionOp<bool, const Derived> >
+    any() const {
+      DimensionList<Index, NumDimensions> in_dims;
+      return cast<bool>().reduce(in_dims, internal::OrReducer());
+    }
+
+   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorTupleReducerOp<
+      internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >,
+      const array<Index, NumDimensions>, const Derived>
+    argmax() const {
+      array<Index, NumDimensions> in_dims;
+      for (int d = 0; d < NumDimensions; ++d) in_dims[d] = d;
+      return TensorTupleReducerOp<
+        internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >,
+        const array<Index, NumDimensions>,
+        const Derived>(derived(), internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >(), -1, in_dims);
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorTupleReducerOp<
+      internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >,
+      const array<Index, NumDimensions>, const Derived>
+    argmin() const {
+      array<Index, NumDimensions> in_dims;
+      for (int d = 0; d < NumDimensions; ++d) in_dims[d] = d;
+      return TensorTupleReducerOp<
+        internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >,
+        const array<Index, NumDimensions>,
+        const Derived>(derived(), internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >(), -1, in_dims);
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorTupleReducerOp<
+      internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >,
+      const array<Index, 1>, const Derived>
+    argmax(const int return_dim) const {
+      array<Index, 1> in_dims;
+      in_dims[0] = return_dim;
+      return TensorTupleReducerOp<
+        internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >,
+        const array<Index, 1>,
+        const Derived>(derived(), internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >(), return_dim, in_dims);
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorTupleReducerOp<
+      internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >,
+      const array<Index, 1>, const Derived>
+    argmin(const int return_dim) const {
+      array<Index, 1> in_dims;
+      in_dims[0] = return_dim;
+      return TensorTupleReducerOp<
+        internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >,
+        const array<Index, 1>,
+        const Derived>(derived(), internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >(), return_dim, in_dims);
+    }
+
+    template <typename Reducer, typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorReductionOp<Reducer, const Dims, const Derived>
+    reduce(const Dims& dims, const Reducer& reducer) const {
+      return TensorReductionOp<Reducer, const Dims, const Derived>(derived(), dims, reducer);
+    }
+
+    template <typename Broadcast> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorBroadcastingOp<const Broadcast, const Derived>
+    broadcast(const Broadcast& broadcast) const {
+      return TensorBroadcastingOp<const Broadcast, const Derived>(derived(), broadcast);
+    }
+
+    template <typename Axis, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorConcatenationOp<Axis, const Derived, const OtherDerived>
+    concatenate(const OtherDerived& other, Axis axis) const {
+      return TensorConcatenationOp<Axis, const Derived, const OtherDerived>(derived(), other.derived(), axis);
+    }
+
+    template <typename PatchDims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorPatchOp<const PatchDims, const Derived>
+    extract_patches(const PatchDims& patch_dims) const {
+      return TensorPatchOp<const PatchDims, const Derived>(derived(), patch_dims);
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorImagePatchOp<Dynamic, Dynamic, const Derived>
+    extract_image_patches(const Index patch_rows = 1, const Index patch_cols = 1,
+                          const Index row_stride = 1, const Index col_stride = 1,
+                          const Index in_row_stride = 1, const Index in_col_stride = 1,
+                          const PaddingType padding_type = PADDING_SAME, const Scalar padding_value = Scalar(0)) const {
+      return TensorImagePatchOp<Dynamic, Dynamic, const Derived>(derived(), patch_rows, patch_cols, row_stride, col_stride,
+                                                                 in_row_stride, in_col_stride, 1, 1, padding_type, padding_value);
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorImagePatchOp<Dynamic, Dynamic, const Derived>
+    extract_image_patches(const Index patch_rows, const Index patch_cols,
+                          const Index row_stride, const Index col_stride,
+                          const Index in_row_stride, const Index in_col_stride,
+                          const Index row_inflate_stride, const Index col_inflate_stride,
+                          const Index padding_top, const Index padding_bottom,
+                          const Index padding_left,const Index padding_right,
+                          const Scalar padding_value) const {
+      return TensorImagePatchOp<Dynamic, Dynamic, const Derived>(derived(), patch_rows, patch_cols, row_stride, col_stride,
+                                                                 in_row_stride, in_col_stride, row_inflate_stride, col_inflate_stride,
+                                                                 padding_top, padding_bottom, padding_left, padding_right, padding_value);
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Derived>
+    extract_volume_patches(const Index patch_planes, const Index patch_rows, const Index patch_cols,
+                           const Index plane_stride = 1, const Index row_stride = 1, const Index col_stride = 1,
+                           const PaddingType padding_type = PADDING_SAME, const Scalar padding_value = Scalar(0)) const {
+      return TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Derived>(derived(), patch_planes, patch_rows, patch_cols, plane_stride, row_stride, col_stride, 1, 1, 1, 1, 1, 1, padding_type, padding_value);
+    }
+
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Derived>
+    extract_volume_patches(const Index patch_planes, const Index patch_rows, const Index patch_cols,
+                           const Index plane_stride, const Index row_stride, const Index col_stride,
+                           const Index plane_inflate_stride, const Index row_inflate_stride, const Index col_inflate_stride,
+                           const Index padding_top_z, const Index padding_bottom_z,
+                           const Index padding_top, const Index padding_bottom,
+                           const Index padding_left, const Index padding_right, const Scalar padding_value = Scalar(0)) const {
+      return TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Derived>(derived(), patch_planes, patch_rows, patch_cols, plane_stride, row_stride, col_stride, 1, 1, 1, plane_inflate_stride, row_inflate_stride, col_inflate_stride, padding_top_z, padding_bottom_z, padding_top, padding_bottom, padding_left, padding_right, padding_value);
+    }
+
+    // Morphing operators.
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorLayoutSwapOp<const Derived>
+    swap_layout() const {
+      return TensorLayoutSwapOp<const Derived>(derived());
+    }
+    template <typename NewDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorReshapingOp<const NewDimensions, const Derived>
+    reshape(const NewDimensions& newDimensions) const {
+      return TensorReshapingOp<const NewDimensions, const Derived>(derived(), newDimensions);
+    }
+    template <typename StartIndices, typename Sizes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorSlicingOp<const StartIndices, const Sizes, const Derived>
+    slice(const StartIndices& startIndices, const Sizes& sizes) const {
+      return TensorSlicingOp<const StartIndices, const Sizes, const Derived>(derived(), startIndices, sizes);
+    }
+    template <typename StartIndices, typename StopIndices, typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides, const Derived>
+    stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) const {
+      return TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides,
+                                const Derived>(derived(), startIndices, stopIndices, strides);
+    }
+    template <Index DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorChippingOp<DimId, const Derived>
+    chip(const Index offset) const {
+      return TensorChippingOp<DimId, const Derived>(derived(), offset, DimId);
+    }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorChippingOp<Dynamic, const Derived>
+    chip(const Index offset, const Index dim) const {
+      return TensorChippingOp<Dynamic, const Derived>(derived(), offset, dim);
+    }
+    template <typename ReverseDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorReverseOp<const ReverseDimensions, const Derived>
+    reverse(const ReverseDimensions& rev) const {
+      return TensorReverseOp<const ReverseDimensions, const Derived>(derived(), rev);
+    }
+    template <typename PaddingDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorPaddingOp<const PaddingDimensions, const Derived>
+    pad(const PaddingDimensions& padding) const {
+      return TensorPaddingOp<const PaddingDimensions, const Derived>(derived(), padding, internal::scalar_cast_op<int, Scalar>()(0));
+    }
+    template <typename PaddingDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorPaddingOp<const PaddingDimensions, const Derived>
+    pad(const PaddingDimensions& padding, const Scalar padding_value) const {
+      return TensorPaddingOp<const PaddingDimensions, const Derived>(derived(), padding, padding_value);
+    }
+    template <typename Shuffle> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorShufflingOp<const Shuffle, const Derived>
+    shuffle(const Shuffle& shuffle) const {
+      return TensorShufflingOp<const Shuffle, const Derived>(derived(), shuffle);
+    }
+    template <typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorStridingOp<const Strides, const Derived>
+    stride(const Strides& strides) const {
+      return TensorStridingOp<const Strides, const Derived>(derived(), strides);
+    }
+    template <typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorInflationOp<const Strides, const Derived>
+    inflate(const Strides& strides) const {
+      return TensorInflationOp<const Strides, const Derived>(derived(), strides);
+    }
+
+    // Returns a tensor containing index/value tuples
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorIndexTupleOp<const Derived>
+    index_tuples() const {
+      return TensorIndexTupleOp<const Derived>(derived());
+    }
+
+    // Support for custom unary and binary operations
+    template <typename CustomUnaryFunc>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCustomUnaryOp<const CustomUnaryFunc, const Derived> customOp(const CustomUnaryFunc& op) const {
+      return TensorCustomUnaryOp<const CustomUnaryFunc, const Derived>(derived(), op);
+    }
+    template <typename OtherDerived, typename CustomBinaryFunc>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCustomBinaryOp<const CustomBinaryFunc, const Derived, const OtherDerived> customOp(const OtherDerived& other, const CustomBinaryFunc& op) const {
+      return TensorCustomBinaryOp<const CustomBinaryFunc, const Derived, const OtherDerived>(derived(), other, op);
+    }
+
+    // Force the evaluation of the expression.
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorForcedEvalOp<const Derived> eval() const {
+      return TensorForcedEvalOp<const Derived>(derived());
+    }
+
+  protected:
+    template <typename Scalar, int NumIndices, int Options, typename IndexType> friend class Tensor;
+    template <typename Scalar, typename Dimensions, int Option, typename IndexTypes> friend class TensorFixedSize;
+    template <typename OtherDerived, int AccessLevel> friend class TensorBase;
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast<const Derived*>(this); }
+};
+
+template<typename Derived, int AccessLevel = internal::accessors_level<Derived>::value>
+class TensorBase : public TensorBase<Derived, ReadOnlyAccessors> {
+ public:
+    typedef internal::traits<Derived> DerivedTraits;
+    typedef typename DerivedTraits::Scalar Scalar;
+    typedef typename DerivedTraits::Index Index;
+    typedef Scalar CoeffReturnType;
+    static const int NumDimensions = DerivedTraits::NumDimensions;
+
+    template <typename Scalar, int NumIndices, int Options, typename IndexType> friend class Tensor;
+    template <typename Scalar, typename Dimensions, int Option, typename IndexTypes> friend class TensorFixedSize;
+    template <typename OtherDerived, int OtherAccessLevel> friend class TensorBase;
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Derived& setZero() {
+      return setConstant(Scalar(0));
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Derived& setConstant(const Scalar& val) {
+      return derived() = this->constant(val);
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Derived& setRandom() {
+      return derived() = this->random();
+    }
+    template <typename RandomGenerator> EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Derived& setRandom() {
+      return derived() = this->template random<RandomGenerator>();
+    }
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Derived& setValues(
+        const typename internal::Initializer<Derived, NumDimensions>::InitList& vals) {
+      TensorEvaluator<Derived, DefaultDevice> eval(derived(), DefaultDevice());
+      internal::initialize_tensor<Derived, NumDimensions>(eval, vals);
+      return derived();
+    }
+#endif  // EIGEN_HAS_VARIADIC_TEMPLATES
+
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Derived& operator+=(const OtherDerived& other) {
+      return derived() = derived() + other.derived();
+    }
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Derived& operator-=(const OtherDerived& other) {
+      return derived() = derived() - other.derived();
+    }
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Derived& operator*=(const OtherDerived& other) {
+      return derived() = derived() * other.derived();
+    }
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Derived& operator/=(const OtherDerived& other) {
+      return derived() = derived() / other.derived();
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorLayoutSwapOp<const Derived>
+    swap_layout() const {
+      return TensorLayoutSwapOp<const Derived>(derived());
+    }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    TensorLayoutSwapOp<Derived>
+    swap_layout() {
+      return TensorLayoutSwapOp<Derived>(derived());
+    }
+
+    template <typename Axis, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorConcatenationOp<const Axis, const Derived, const OtherDerived>
+    concatenate(const OtherDerived& other, const Axis& axis) const {
+      return TensorConcatenationOp<const Axis, const Derived, const OtherDerived>(derived(), other, axis);
+    }
+    template <typename Axis, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    TensorConcatenationOp<const Axis, Derived, OtherDerived>
+    concatenate(const OtherDerived& other, const Axis& axis) {
+      return TensorConcatenationOp<const Axis, Derived, OtherDerived>(derived(), other, axis);
+    }
+
+    template <typename NewDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorReshapingOp<const NewDimensions, const Derived>
+    reshape(const NewDimensions& newDimensions) const {
+      return TensorReshapingOp<const NewDimensions, const Derived>(derived(), newDimensions);
+    }
+    template <typename NewDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    TensorReshapingOp<const NewDimensions, Derived>
+    reshape(const NewDimensions& newDimensions) {
+      return TensorReshapingOp<const NewDimensions, Derived>(derived(), newDimensions);
+    }
+
+    template <typename StartIndices, typename Sizes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorSlicingOp<const StartIndices, const Sizes, const Derived>
+    slice(const StartIndices& startIndices, const Sizes& sizes) const {
+      return TensorSlicingOp<const StartIndices, const Sizes, const Derived>(derived(), startIndices, sizes);
+    }
+    template <typename StartIndices, typename Sizes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    TensorSlicingOp<const StartIndices, const Sizes, Derived>
+    slice(const StartIndices& startIndices, const Sizes& sizes) {
+      return TensorSlicingOp<const StartIndices, const Sizes, Derived>(derived(), startIndices, sizes);
+    }
+
+    template <typename StartIndices, typename StopIndices, typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides, const Derived>
+    stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) const {
+      return TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides,
+                                const Derived>(derived(), startIndices, stopIndices, strides);
+    }
+    template <typename StartIndices, typename StopIndices, typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides, Derived>
+    stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) {
+      return TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides,
+                                Derived>(derived(), startIndices, stopIndices, strides);
+    }
+
+    template <DenseIndex DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorChippingOp<DimId, const Derived>
+    chip(const Index offset) const {
+      return TensorChippingOp<DimId, const Derived>(derived(), offset, DimId);
+    }
+    template <Index DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    TensorChippingOp<DimId, Derived>
+    chip(const Index offset) {
+      return TensorChippingOp<DimId, Derived>(derived(), offset, DimId);
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorChippingOp<Dynamic, const Derived>
+    chip(const Index offset, const Index dim) const {
+      return TensorChippingOp<Dynamic, const Derived>(derived(), offset, dim);
+    }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    TensorChippingOp<Dynamic, Derived>
+    chip(const Index offset, const Index dim) {
+      return TensorChippingOp<Dynamic, Derived>(derived(), offset, dim);
+    }
+
+    template <typename ReverseDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorReverseOp<const ReverseDimensions, const Derived>
+    reverse(const ReverseDimensions& rev) const {
+      return TensorReverseOp<const ReverseDimensions, const Derived>(derived(), rev);
+    }
+    template <typename ReverseDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    TensorReverseOp<const ReverseDimensions, Derived>
+    reverse(const ReverseDimensions& rev) {
+      return TensorReverseOp<const ReverseDimensions, Derived>(derived(), rev);
+    }
+
+    template <typename Shuffle> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorShufflingOp<const Shuffle, const Derived>
+    shuffle(const Shuffle& shuffle) const {
+      return TensorShufflingOp<const Shuffle, const Derived>(derived(), shuffle);
+    }
+    template <typename Shuffle> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    TensorShufflingOp<const Shuffle, Derived>
+    shuffle(const Shuffle& shuffle) {
+      return TensorShufflingOp<const Shuffle, Derived>(derived(), shuffle);
+    }
+
+    template <typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorStridingOp<const Strides, const Derived>
+    stride(const Strides& strides) const {
+      return TensorStridingOp<const Strides, const Derived>(derived(), strides);
+    }
+    template <typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    TensorStridingOp<const Strides, Derived>
+    stride(const Strides& strides) {
+      return TensorStridingOp<const Strides, Derived>(derived(), strides);
+    }
+
+    // Select the device on which to evaluate the expression.
+    template <typename DeviceType>
+    TensorDevice<Derived, DeviceType> device(const DeviceType& device) {
+      return TensorDevice<Derived, DeviceType>(device, derived());
+    }
+
+ protected:
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Derived& derived() { return *static_cast<Derived*>(this); }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast<const Derived*>(this); }
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_BASE_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
new file mode 100644
index 000000000..4cfe300eb
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -0,0 +1,392 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H
+#define EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H
+
+namespace Eigen {
+
+/** \class TensorBroadcasting
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief Tensor broadcasting class.
+  *
+  *
+  */
+namespace internal {
+template<typename Broadcast, typename XprType>
+struct traits<TensorBroadcastingOp<Broadcast, XprType> > : public traits<XprType>
+{
+  typedef typename XprType::Scalar Scalar;
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef typename XprType::Nested Nested;
+  typedef typename remove_reference<Nested>::type _Nested;
+  static const int NumDimensions = XprTraits::NumDimensions;
+  static const int Layout = XprTraits::Layout;
+};
+
+template<typename Broadcast, typename XprType>
+struct eval<TensorBroadcastingOp<Broadcast, XprType>, Eigen::Dense>
+{
+  typedef const TensorBroadcastingOp<Broadcast, XprType>& type;
+};
+
+template<typename Broadcast, typename XprType>
+struct nested<TensorBroadcastingOp<Broadcast, XprType>, 1, typename eval<TensorBroadcastingOp<Broadcast, XprType> >::type>
+{
+  typedef TensorBroadcastingOp<Broadcast, XprType> type;
+};
+
+template <typename Dims>
+struct is_input_scalar {
+  static const bool value = false;
+};
+template <>
+struct is_input_scalar<Sizes<> > {
+  static const bool value = true;
+};
+#ifndef EIGEN_EMULATE_CXX11_META_H
+template <typename std::size_t... Indices>
+struct is_input_scalar<Sizes<Indices...> > {
+  static const bool value = (Sizes<Indices...>::total_size == 1);
+};
+#endif
+
+}  // end namespace internal
+
+
+
+template<typename Broadcast, typename XprType>
+class TensorBroadcastingOp : public TensorBase<TensorBroadcastingOp<Broadcast, XprType>, ReadOnlyAccessors>
+{
+  public:
+  typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorBroadcastingOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorBroadcastingOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBroadcastingOp(const XprType& expr, const Broadcast& broadcast)
+      : m_xpr(expr), m_broadcast(broadcast) {}
+
+    EIGEN_DEVICE_FUNC
+    const Broadcast& broadcast() const { return m_broadcast; }
+
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename XprType::Nested>::type&
+    expression() const { return m_xpr; }
+
+  protected:
+    typename XprType::Nested m_xpr;
+    const Broadcast m_broadcast;
+};
+
+
+// Eval as rvalue
+template<typename Broadcast, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
+{
+  typedef TensorBroadcastingOp<Broadcast, ArgType> XprType;
+  typedef typename XprType::Index Index;
+  static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+  typedef DSizes<Index, NumDims> Dimensions;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+
+  enum {
+    IsAligned = true,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    RawAccess = false
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+    : m_broadcast(op.broadcast()),m_impl(op.expression(), device)
+  {
+    // The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar
+    // and store the result in a scalar. Instead one should reshape the scalar into a a N-D
+    // tensor with N >= 1 of 1 element first and then broadcast.
+    EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    const InputDimensions& input_dims = m_impl.dimensions();
+    const Broadcast& broadcast = op.broadcast();
+    for (int i = 0; i < NumDims; ++i) {
+      eigen_assert(input_dims[i] > 0);
+      m_dimensions[i] = input_dims[i] * broadcast[i];
+    }
+
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      m_inputStrides[0] = 1;
+      m_outputStrides[0] = 1;
+      for (int i = 1; i < NumDims; ++i) {
+        m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
+        m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
+      }
+    } else {
+      m_inputStrides[NumDims-1] = 1;
+      m_outputStrides[NumDims-1] = 1;
+      for (int i = NumDims-2; i >= 0; --i) {
+        m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
+        m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1];
+      }
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+    m_impl.cleanup();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const
+  {
+    if (internal::is_input_scalar<typename internal::remove_all<InputDimensions>::type>::value) {
+      return m_impl.coeff(0);
+    }
+
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      return coeffColMajor(index);
+    } else {
+      return coeffRowMajor(index);
+    }
+  }
+
+  // TODO: attempt to speed this up. The integer divisions and modulo are slow
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const
+  {
+    Index inputIndex = 0;
+    for (int i = NumDims - 1; i > 0; --i) {
+      const Index idx = index / m_outputStrides[i];
+      if (internal::index_statically_eq<Broadcast>(i, 1)) {
+        eigen_assert(idx < m_impl.dimensions()[i]);
+        inputIndex += idx * m_inputStrides[i];
+      } else {
+        if (internal::index_statically_eq<InputDimensions>(i, 1)) {
+          eigen_assert(idx % m_impl.dimensions()[i] == 0);
+        } else {
+          inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
+        }
+      }
+      index -= idx * m_outputStrides[i];
+    }
+    if (internal::index_statically_eq<Broadcast>(0, 1)) {
+      eigen_assert(index < m_impl.dimensions()[0]);
+      inputIndex += index;
+    } else {
+      if (internal::index_statically_eq<InputDimensions>(0, 1)) {
+        eigen_assert(index % m_impl.dimensions()[0] == 0);
+      } else {
+        inputIndex += (index % m_impl.dimensions()[0]);
+      }
+    }
+    return m_impl.coeff(inputIndex);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const
+  {
+    Index inputIndex = 0;
+    for (int i = 0; i < NumDims - 1; ++i) {
+      const Index idx = index / m_outputStrides[i];
+      if (internal::index_statically_eq<Broadcast>(i, 1)) {
+        eigen_assert(idx < m_impl.dimensions()[i]);
+        inputIndex += idx * m_inputStrides[i];
+      } else {
+        if (internal::index_statically_eq<InputDimensions>(i, 1)) {
+          eigen_assert(idx % m_impl.dimensions()[i] == 0);
+        } else {
+          inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
+        }
+      }
+      index -= idx * m_outputStrides[i];
+    }
+    if (internal::index_statically_eq<Broadcast>(NumDims-1, 1)) {
+      eigen_assert(index < m_impl.dimensions()[NumDims-1]);
+      inputIndex += index;
+    } else {
+      if (internal::index_statically_eq<InputDimensions>(NumDims-1, 1)) {
+        eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0);
+      } else {
+        inputIndex += (index % m_impl.dimensions()[NumDims-1]);
+      }
+    }
+    return m_impl.coeff(inputIndex);
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const
+  {
+    if (internal::is_input_scalar<typename internal::remove_all<InputDimensions>::type>::value) {
+      return internal::pset1<PacketReturnType>(m_impl.coeff(0));
+    }
+
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      return packetColMajor<LoadMode>(index);
+    } else {
+      return packetRowMajor<LoadMode>(index);
+    }
+  }
+
+  // Ignore the LoadMode and always use unaligned loads since we can't guarantee
+  // the alignment at compile time.
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const
+  {
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
+
+    const Index originalIndex = index;
+
+    Index inputIndex = 0;
+    for (int i = NumDims - 1; i > 0; --i) {
+      const Index idx = index / m_outputStrides[i];
+      if (internal::index_statically_eq<Broadcast>(i, 1)) {
+        eigen_assert(idx < m_impl.dimensions()[i]);
+        inputIndex += idx * m_inputStrides[i];
+      } else {
+        if (internal::index_statically_eq<InputDimensions>(i, 1)) {
+          eigen_assert(idx % m_impl.dimensions()[i] == 0);
+        } else {
+          inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
+        }
+      }
+      index -= idx * m_outputStrides[i];
+    }
+    Index innermostLoc;
+    if (internal::index_statically_eq<Broadcast>(0, 1)) {
+      eigen_assert(index < m_impl.dimensions()[0]);
+      innermostLoc = index;
+    } else {
+      if (internal::index_statically_eq<InputDimensions>(0, 1)) {
+        eigen_assert(index % m_impl.dimensions()[0] == 0);
+        innermostLoc = 0;
+      } else {
+        innermostLoc = index % m_impl.dimensions()[0];
+      }
+    }
+    inputIndex += innermostLoc;
+
+    // Todo: this could be extended to the second dimension if we're not
+    // broadcasting alongside the first dimension, and so on.
+    if (innermostLoc + PacketSize <= m_impl.dimensions()[0]) {
+      return m_impl.template packet<Unaligned>(inputIndex);
+    } else {
+      EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+      values[0] = m_impl.coeff(inputIndex);
+      for (int i = 1; i < PacketSize; ++i) {
+        values[i] = coeffColMajor(originalIndex+i);
+      }
+      PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+      return rslt;
+    }
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const
+  {
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
+
+    const Index originalIndex = index;
+
+    Index inputIndex = 0;
+    for (int i = 0; i < NumDims - 1; ++i) {
+      const Index idx = index / m_outputStrides[i];
+      if (internal::index_statically_eq<Broadcast>(i, 1)) {
+        eigen_assert(idx < m_impl.dimensions()[i]);
+        inputIndex += idx * m_inputStrides[i];
+      } else {
+        if (internal::index_statically_eq<InputDimensions>(i, 1)) {
+          eigen_assert(idx % m_impl.dimensions()[i] == 0);
+        } else {
+          inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
+        }
+      }
+      index -= idx * m_outputStrides[i];
+    }
+    Index innermostLoc;
+    if (internal::index_statically_eq<Broadcast>(NumDims-1, 1)) {
+      eigen_assert(index < m_impl.dimensions()[NumDims-1]);
+      innermostLoc = index;
+    } else {
+      if (internal::index_statically_eq<InputDimensions>(NumDims-1, 1)) {
+        eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0);
+        innermostLoc = 0;
+      } else {
+        innermostLoc = index % m_impl.dimensions()[NumDims-1];
+      }
+    }
+    inputIndex += innermostLoc;
+
+    // Todo: this could be extended to the second dimension if we're not
+    // broadcasting alongside the first dimension, and so on.
+    if (innermostLoc + PacketSize <= m_impl.dimensions()[NumDims-1]) {
+      return m_impl.template packet<Unaligned>(inputIndex);
+    } else {
+      EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+      values[0] = m_impl.coeff(inputIndex);
+      for (int i = 1; i < PacketSize; ++i) {
+        values[i] = coeffRowMajor(originalIndex+i);
+      }
+      PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+      return rslt;
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+  costPerCoeff(bool vectorized) const {
+    double compute_cost = TensorOpCost::AddCost<Index>();
+    if (NumDims > 0) {
+      for (int i = NumDims - 1; i > 0; --i) {
+        compute_cost += TensorOpCost::DivCost<Index>();
+        if (internal::index_statically_eq<Broadcast>(i, 1)) {
+          compute_cost +=
+              TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>();
+        } else {
+          if (!internal::index_statically_eq<InputDimensions>(i, 1)) {
+            compute_cost += TensorOpCost::MulCost<Index>() +
+                            TensorOpCost::ModCost<Index>() +
+                            TensorOpCost::AddCost<Index>();
+          }
+        }
+        compute_cost +=
+            TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>();
+      }
+    }
+    return m_impl.costPerCoeff(vectorized) +
+           TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
+  }
+
+  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+
+  const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+
+  Broadcast functor() const { return m_broadcast; }
+
+ protected:
+  const Broadcast m_broadcast;
+  Dimensions m_dimensions;
+  array<Index, NumDims> m_outputStrides;
+  array<Index, NumDims> m_inputStrides;
+  TensorEvaluator<ArgType, Device> m_impl;
+};
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
new file mode 100644
index 000000000..1ba7ef170
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
@@ -0,0 +1,384 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H
+
+namespace Eigen {
+
+/** \class TensorKChippingReshaping
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief A chip is a thin slice, corresponding to a column or a row in a 2-d tensor.
+  *
+  *
+  */
+
+namespace internal {
+template<DenseIndex DimId, typename XprType>
+struct traits<TensorChippingOp<DimId, XprType> > : public traits<XprType>
+{
+  typedef typename XprType::Scalar Scalar;
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef typename XprType::Nested Nested;
+  typedef typename remove_reference<Nested>::type _Nested;
+  static const int NumDimensions = XprTraits::NumDimensions - 1;
+  static const int Layout = XprTraits::Layout;
+};
+
+template<DenseIndex DimId, typename XprType>
+struct eval<TensorChippingOp<DimId, XprType>, Eigen::Dense>
+{
+  typedef const TensorChippingOp<DimId, XprType>& type;
+};
+
+template<DenseIndex DimId, typename XprType>
+struct nested<TensorChippingOp<DimId, XprType>, 1, typename eval<TensorChippingOp<DimId, XprType> >::type>
+{
+  typedef TensorChippingOp<DimId, XprType> type;
+};
+
+template <DenseIndex DimId>
+struct DimensionId
+{
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) {
+    eigen_assert(dim == DimId);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const {
+    return DimId;
+  }
+};
+template <>
+struct DimensionId<Dynamic>
+{
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) : actual_dim(dim) {
+    eigen_assert(dim >= 0);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const {
+    return actual_dim;
+  }
+ private:
+  const DenseIndex actual_dim;
+};
+
+
+}  // end namespace internal
+
+
+
+template<DenseIndex DimId, typename XprType>
+class TensorChippingOp : public TensorBase<TensorChippingOp<DimId, XprType> >
+{
+  public:
+  typedef typename Eigen::internal::traits<TensorChippingOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorChippingOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorChippingOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorChippingOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorChippingOp(const XprType& expr, const Index offset, const Index dim)
+      : m_xpr(expr), m_offset(offset), m_dim(dim) {
+  }
+
+  EIGEN_DEVICE_FUNC
+  const Index offset() const { return m_offset; }
+  EIGEN_DEVICE_FUNC
+  const Index dim() const { return m_dim.actualDim(); }
+
+  EIGEN_DEVICE_FUNC
+  const typename internal::remove_all<typename XprType::Nested>::type&
+  expression() const { return m_xpr; }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE TensorChippingOp& operator = (const TensorChippingOp& other)
+  {
+    typedef TensorAssignOp<TensorChippingOp, const TensorChippingOp> Assign;
+    Assign assign(*this, other);
+    internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+    return *this;
+  }
+
+  template<typename OtherDerived>
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE TensorChippingOp& operator = (const OtherDerived& other)
+  {
+    typedef TensorAssignOp<TensorChippingOp, const OtherDerived> Assign;
+    Assign assign(*this, other);
+    internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+    return *this;
+  }
+
+  protected:
+    typename XprType::Nested m_xpr;
+    const Index m_offset;
+    const internal::DimensionId<DimId> m_dim;
+};
+
+
+// Eval as rvalue
+template<DenseIndex DimId, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
+{
+  typedef TensorChippingOp<DimId, ArgType> XprType;
+  static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+  static const int NumDims = NumInputDims-1;
+  typedef typename XprType::Index Index;
+  typedef DSizes<Index, NumDims> Dimensions;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+
+
+  enum {
+    // Alignment can't be guaranteed at compile time since it depends on the
+    // slice offsets.
+    IsAligned = false,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device)
+  {
+    EIGEN_STATIC_ASSERT((NumInputDims >= 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    eigen_assert(NumInputDims > m_dim.actualDim());
+
+    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+    eigen_assert(op.offset() < input_dims[m_dim.actualDim()]);
+
+    int j = 0;
+    for (int i = 0; i < NumInputDims; ++i) {
+      if (i != m_dim.actualDim()) {
+        m_dimensions[j] = input_dims[i];
+        ++j;
+      }
+    }
+
+    m_stride = 1;
+    m_inputStride = 1;
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = 0; i < m_dim.actualDim(); ++i) {
+        m_stride *= input_dims[i];
+        m_inputStride *= input_dims[i];
+      }
+    } else {
+      for (int i = NumInputDims-1; i > m_dim.actualDim(); --i) {
+        m_stride *= input_dims[i];
+        m_inputStride *= input_dims[i];
+      }
+    }
+    m_inputStride *= input_dims[m_dim.actualDim()];
+    m_inputOffset = m_stride * op.offset();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+    m_impl.cleanup();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+  {
+    return m_impl.coeff(srcCoeff(index));
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+  {
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
+
+    if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == 0) ||
+	(static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) {
+      // m_stride is equal to 1, so let's avoid the integer division.
+      eigen_assert(m_stride == 1);
+      Index inputIndex = index * m_inputStride + m_inputOffset;
+      EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+      for (int i = 0; i < PacketSize; ++i) {
+        values[i] = m_impl.coeff(inputIndex);
+        inputIndex += m_inputStride;
+      }
+      PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+      return rslt;
+    } else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumInputDims - 1) ||
+	       (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) {
+      // m_stride is aways greater than index, so let's avoid the integer division.
+      eigen_assert(m_stride > index);
+      return m_impl.template packet<LoadMode>(index + m_inputOffset);
+    } else {
+      const Index idx = index / m_stride;
+      const Index rem = index - idx * m_stride;
+      if (rem + PacketSize <= m_stride) {
+        Index inputIndex = idx * m_inputStride + m_inputOffset + rem;
+        return m_impl.template packet<LoadMode>(inputIndex);
+      } else {
+        // Cross the stride boundary. Fallback to slow path.
+        EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+        for (int i = 0; i < PacketSize; ++i) {
+          values[i] = coeff(index);
+          ++index;
+        }
+        PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+        return rslt;
+      }
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+  costPerCoeff(bool vectorized) const {
+    double cost = 0;
+    if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) &&
+         m_dim.actualDim() == 0) ||
+        (static_cast<int>(Layout) == static_cast<int>(RowMajor) &&
+         m_dim.actualDim() == NumInputDims - 1)) {
+      cost += TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>();
+    } else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) &&
+                m_dim.actualDim() == NumInputDims - 1) ||
+               (static_cast<int>(Layout) == static_cast<int>(RowMajor) &&
+                m_dim.actualDim() == 0)) {
+      cost += TensorOpCost::AddCost<Index>();
+    } else {
+      cost += 3 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>() +
+              3 * TensorOpCost::AddCost<Index>();
+    }
+
+    return m_impl.costPerCoeff(vectorized) +
+           TensorOpCost(0, 0, cost, vectorized, PacketSize);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const {
+    CoeffReturnType* result = const_cast<CoeffReturnType*>(m_impl.data());
+    if (((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumDims) ||
+         (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) &&
+        result) {
+      return result + m_inputOffset;
+    } else {
+      return NULL;
+    }
+  }
+
+ protected:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
+  {
+    Index inputIndex;
+    if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == 0) ||
+	(static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) {
+      // m_stride is equal to 1, so let's avoid the integer division.
+      eigen_assert(m_stride == 1);
+      inputIndex = index * m_inputStride + m_inputOffset;
+    } else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumInputDims-1) ||
+	       (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) {
+      // m_stride is aways greater than index, so let's avoid the integer division.
+      eigen_assert(m_stride > index);
+      inputIndex = index + m_inputOffset;
+    } else {
+      const Index idx = index / m_stride;
+      inputIndex = idx * m_inputStride + m_inputOffset;
+      index -= idx * m_stride;
+      inputIndex += index;
+    }
+    return inputIndex;
+  }
+
+  Dimensions m_dimensions;
+  Index m_stride;
+  Index m_inputOffset;
+  Index m_inputStride;
+  TensorEvaluator<ArgType, Device> m_impl;
+  const internal::DimensionId<DimId> m_dim;
+  const Device& m_device;
+};
+
+
+// Eval as lvalue
+template<DenseIndex DimId, typename ArgType, typename Device>
+struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
+  : public TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
+{
+  typedef TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> Base;
+  typedef TensorChippingOp<DimId, ArgType> XprType;
+  static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+  static const int NumDims = NumInputDims-1;
+  typedef typename XprType::Index Index;
+  typedef DSizes<Index, NumDims> Dimensions;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+
+  enum {
+    IsAligned = false,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    RawAccess = false
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+    : Base(op, device)
+    { }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
+  {
+    return this->m_impl.coeffRef(this->srcCoeff(index));
+  }
+
+  template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void writePacket(Index index, const PacketReturnType& x)
+  {
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+
+    if ((static_cast<int>(this->Layout) == static_cast<int>(ColMajor) && this->m_dim.actualDim() == 0) ||
+	(static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) {
+      // m_stride is equal to 1, so let's avoid the integer division.
+      eigen_assert(this->m_stride == 1);
+      EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+      internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
+      Index inputIndex = index * this->m_inputStride + this->m_inputOffset;
+      for (int i = 0; i < PacketSize; ++i) {
+        this->m_impl.coeffRef(inputIndex) = values[i];
+        inputIndex += this->m_inputStride;
+      }
+    } else if ((static_cast<int>(this->Layout) == static_cast<int>(ColMajor) && this->m_dim.actualDim() == NumInputDims-1) ||
+	       (static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == 0)) {
+      // m_stride is aways greater than index, so let's avoid the integer division.
+      eigen_assert(this->m_stride > index);
+      this->m_impl.template writePacket<StoreMode>(index + this->m_inputOffset, x);
+    } else {
+      const Index idx = index / this->m_stride;
+      const Index rem = index - idx * this->m_stride;
+      if (rem + PacketSize <= this->m_stride) {
+        const Index inputIndex = idx * this->m_inputStride + this->m_inputOffset + rem;
+        this->m_impl.template writePacket<StoreMode>(inputIndex, x);
+      } else {
+        // Cross stride boundary. Fallback to slow path.
+        EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+        internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
+        for (int i = 0; i < PacketSize; ++i) {
+          this->coeffRef(index) = values[i];
+          ++index;
+        }
+      }
+    }
+  }
+};
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
new file mode 100644
index 000000000..59bf90d93
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
@@ -0,0 +1,361 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H
+
+namespace Eigen {
+
+/** \class TensorConcatenationOp
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief Tensor concatenation class.
+  *
+  *
+  */
+namespace internal {
+template<typename Axis, typename LhsXprType, typename RhsXprType>
+struct traits<TensorConcatenationOp<Axis, LhsXprType, RhsXprType> >
+{
+  // Type promotion to handle the case where the types of the lhs and the rhs are different.
+  typedef typename promote_storage_type<typename LhsXprType::Scalar,
+                                        typename RhsXprType::Scalar>::ret Scalar;
+  typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind,
+                                        typename traits<RhsXprType>::StorageKind>::ret StorageKind;
+  typedef typename promote_index_type<typename traits<LhsXprType>::Index,
+                                      typename traits<RhsXprType>::Index>::type Index;
+  typedef typename LhsXprType::Nested LhsNested;
+  typedef typename RhsXprType::Nested RhsNested;
+  typedef typename remove_reference<LhsNested>::type _LhsNested;
+  typedef typename remove_reference<RhsNested>::type _RhsNested;
+  static const int NumDimensions = traits<LhsXprType>::NumDimensions;
+  static const int Layout = traits<LhsXprType>::Layout;
+  enum { Flags = 0 };
+};
+
+template<typename Axis, typename LhsXprType, typename RhsXprType>
+struct eval<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, Eigen::Dense>
+{
+  typedef const TensorConcatenationOp<Axis, LhsXprType, RhsXprType>& type;
+};
+
+template<typename Axis, typename LhsXprType, typename RhsXprType>
+struct nested<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, 1, typename eval<TensorConcatenationOp<Axis, LhsXprType, RhsXprType> >::type>
+{
+  typedef TensorConcatenationOp<Axis, LhsXprType, RhsXprType> type;
+};
+
+}  // end namespace internal
+
+
+template<typename Axis, typename LhsXprType, typename RhsXprType>
+class TensorConcatenationOp : public TensorBase<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, WriteAccessors>
+{
+  public:
+    typedef typename internal::traits<TensorConcatenationOp>::Scalar Scalar;
+    typedef typename internal::traits<TensorConcatenationOp>::StorageKind StorageKind;
+    typedef typename internal::traits<TensorConcatenationOp>::Index Index;
+    typedef typename internal::nested<TensorConcatenationOp>::type Nested;
+    typedef typename internal::promote_storage_type<typename LhsXprType::CoeffReturnType,
+                                                    typename RhsXprType::CoeffReturnType>::ret CoeffReturnType;
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConcatenationOp(const LhsXprType& lhs, const RhsXprType& rhs, Axis axis)
+        : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_axis(axis) {}
+
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename LhsXprType::Nested>::type&
+    lhsExpression() const { return m_lhs_xpr; }
+
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename RhsXprType::Nested>::type&
+    rhsExpression() const { return m_rhs_xpr; }
+
+    EIGEN_DEVICE_FUNC const Axis& axis() const { return m_axis; }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE TensorConcatenationOp& operator = (const TensorConcatenationOp& other)
+    {
+      typedef TensorAssignOp<TensorConcatenationOp, const TensorConcatenationOp> Assign;
+      Assign assign(*this, other);
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+      return *this;
+    }
+
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE TensorConcatenationOp& operator = (const OtherDerived& other)
+    {
+      typedef TensorAssignOp<TensorConcatenationOp, const OtherDerived> Assign;
+      Assign assign(*this, other);
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+      return *this;
+    }
+
+  protected:
+    typename LhsXprType::Nested m_lhs_xpr;
+    typename RhsXprType::Nested m_rhs_xpr;
+    const Axis m_axis;
+};
+
+
+// Eval as rvalue
+template<typename Axis, typename LeftArgType, typename RightArgType, typename Device>
+struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgType>, Device>
+{
+  typedef TensorConcatenationOp<Axis, LeftArgType, RightArgType> XprType;
+  typedef typename XprType::Index Index;
+  static const int NumDims = internal::array_size<typename TensorEvaluator<LeftArgType, Device>::Dimensions>::value;
+  static const int RightNumDims = internal::array_size<typename TensorEvaluator<RightArgType, Device>::Dimensions>::value;
+  typedef DSizes<Index, NumDims> Dimensions;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  enum {
+    IsAligned = false,
+    PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
+    Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+    RawAccess = false
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+    : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_axis(op.axis())
+  {
+    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout) || NumDims == 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    EIGEN_STATIC_ASSERT((NumDims == RightNumDims), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+    eigen_assert(0 <= m_axis && m_axis < NumDims);
+    const Dimensions& lhs_dims = m_leftImpl.dimensions();
+    const Dimensions& rhs_dims = m_rightImpl.dimensions();
+    {
+      int i = 0;
+      for (; i < m_axis; ++i) {
+        eigen_assert(lhs_dims[i] > 0);
+        eigen_assert(lhs_dims[i] == rhs_dims[i]);
+        m_dimensions[i] = lhs_dims[i];
+      }
+      eigen_assert(lhs_dims[i] > 0);  // Now i == m_axis.
+      eigen_assert(rhs_dims[i] > 0);
+      m_dimensions[i] = lhs_dims[i] + rhs_dims[i];
+      for (++i; i < NumDims; ++i) {
+        eigen_assert(lhs_dims[i] > 0);
+        eigen_assert(lhs_dims[i] == rhs_dims[i]);
+        m_dimensions[i] = lhs_dims[i];
+      }
+    }
+
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      m_leftStrides[0] = 1;
+      m_rightStrides[0] = 1;
+      m_outputStrides[0] = 1;
+
+      for (int j = 1; j < NumDims; ++j) {
+        m_leftStrides[j] = m_leftStrides[j-1] * lhs_dims[j-1];
+        m_rightStrides[j] = m_rightStrides[j-1] * rhs_dims[j-1];
+        m_outputStrides[j] = m_outputStrides[j-1] * m_dimensions[j-1];
+      }
+    } else {
+      m_leftStrides[NumDims - 1] = 1;
+      m_rightStrides[NumDims - 1] = 1;
+      m_outputStrides[NumDims - 1] = 1;
+
+      for (int j = NumDims - 2; j >= 0; --j) {
+        m_leftStrides[j] = m_leftStrides[j+1] * lhs_dims[j+1];
+        m_rightStrides[j] = m_rightStrides[j+1] * rhs_dims[j+1];
+        m_outputStrides[j] = m_outputStrides[j+1] * m_dimensions[j+1];
+      }
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+  // TODO(phli): Add short-circuit memcpy evaluation if underlying data are linear?
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/)
+  {
+    m_leftImpl.evalSubExprsIfNeeded(NULL);
+    m_rightImpl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup()
+  {
+    m_leftImpl.cleanup();
+    m_rightImpl.cleanup();
+  }
+
+  // TODO(phli): attempt to speed this up. The integer divisions and modulo are slow.
+  // See CL/76180724 comments for more ideas.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+  {
+    // Collect dimension-wise indices (subs).
+    array<Index, NumDims> subs;
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = NumDims - 1; i > 0; --i) {
+        subs[i] = index / m_outputStrides[i];
+        index -= subs[i] * m_outputStrides[i];
+      }
+      subs[0] = index;
+    } else {
+      for (int i = 0; i < NumDims - 1; ++i) {
+        subs[i] = index / m_outputStrides[i];
+        index -= subs[i] * m_outputStrides[i];
+      }
+      subs[NumDims - 1] = index;
+    }
+
+    const Dimensions& left_dims = m_leftImpl.dimensions();
+    if (subs[m_axis] < left_dims[m_axis]) {
+      Index left_index;
+      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+        left_index = subs[0];
+        for (int i = 1; i < NumDims; ++i) {
+          left_index += (subs[i] % left_dims[i]) * m_leftStrides[i];
+        }
+      } else {
+        left_index = subs[NumDims - 1];
+        for (int i = NumDims - 2; i >= 0; --i) {
+          left_index += (subs[i] % left_dims[i]) * m_leftStrides[i];
+        }
+      }
+      return m_leftImpl.coeff(left_index);
+    } else {
+      subs[m_axis] -= left_dims[m_axis];
+      const Dimensions& right_dims = m_rightImpl.dimensions();
+      Index right_index;
+      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+        right_index = subs[0];
+        for (int i = 1; i < NumDims; ++i) {
+          right_index += (subs[i] % right_dims[i]) * m_rightStrides[i];
+        }
+      } else {
+        right_index = subs[NumDims - 1];
+        for (int i = NumDims - 2; i >= 0; --i) {
+          right_index += (subs[i] % right_dims[i]) * m_rightStrides[i];
+        }
+      }
+      return m_rightImpl.coeff(right_index);
+    }
+  }
+
+  // TODO(phli): Add a real vectorization.
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+  {
+    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+    EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index + packetSize - 1 < dimensions().TotalSize());
+
+    EIGEN_ALIGN_MAX CoeffReturnType values[packetSize];
+    for (int i = 0; i < packetSize; ++i) {
+      values[i] = coeff(index+i);
+    }
+    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+    return rslt;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+  costPerCoeff(bool vectorized) const {
+    const double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() +
+                                           2 * TensorOpCost::MulCost<Index>() +
+                                           TensorOpCost::DivCost<Index>() +
+                                           TensorOpCost::ModCost<Index>());
+    const double lhs_size = m_leftImpl.dimensions().TotalSize();
+    const double rhs_size = m_rightImpl.dimensions().TotalSize();
+    return (lhs_size / (lhs_size + rhs_size)) *
+               m_leftImpl.costPerCoeff(vectorized) +
+           (rhs_size / (lhs_size + rhs_size)) *
+               m_rightImpl.costPerCoeff(vectorized) +
+           TensorOpCost(0, 0, compute_cost);
+  }
+
+  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+
+  protected:
+    Dimensions m_dimensions;
+    array<Index, NumDims> m_outputStrides;
+    array<Index, NumDims> m_leftStrides;
+    array<Index, NumDims> m_rightStrides;
+    TensorEvaluator<LeftArgType, Device> m_leftImpl;
+    TensorEvaluator<RightArgType, Device> m_rightImpl;
+    const Axis m_axis;
+};
+
+// Eval as lvalue
+template<typename Axis, typename LeftArgType, typename RightArgType, typename Device>
+  struct TensorEvaluator<TensorConcatenationOp<Axis, LeftArgType, RightArgType>, Device>
+  : public TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgType>, Device>
+{
+  typedef TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgType>, Device> Base;
+  typedef TensorConcatenationOp<Axis, LeftArgType, RightArgType> XprType;
+  typedef typename Base::Dimensions Dimensions;
+  enum {
+    IsAligned = false,
+    PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
+    Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+    RawAccess = false
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(XprType& op, const Device& device)
+    : Base(op, device)
+  {
+    EIGEN_STATIC_ASSERT((static_cast<int>(Layout) == static_cast<int>(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  }
+
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
+  {
+    // Collect dimension-wise indices (subs).
+    array<Index, Base::NumDims> subs;
+    for (int i = Base::NumDims - 1; i > 0; --i) {
+      subs[i] = index / this->m_outputStrides[i];
+      index -= subs[i] * this->m_outputStrides[i];
+    }
+    subs[0] = index;
+
+    const Dimensions& left_dims = this->m_leftImpl.dimensions();
+    if (subs[this->m_axis] < left_dims[this->m_axis]) {
+      Index left_index = subs[0];
+      for (int i = 1; i < Base::NumDims; ++i) {
+        left_index += (subs[i] % left_dims[i]) * this->m_leftStrides[i];
+      }
+      return this->m_leftImpl.coeffRef(left_index);
+    } else {
+      subs[this->m_axis] -= left_dims[this->m_axis];
+      const Dimensions& right_dims = this->m_rightImpl.dimensions();
+      Index right_index = subs[0];
+      for (int i = 1; i < Base::NumDims; ++i) {
+        right_index += (subs[i] % right_dims[i]) * this->m_rightStrides[i];
+      }
+      return this->m_rightImpl.coeffRef(right_index);
+    }
+  }
+
+  template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void writePacket(Index index, const PacketReturnType& x)
+  {
+    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+    EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index + packetSize - 1 < this->dimensions().TotalSize());
+
+    EIGEN_ALIGN_MAX CoeffReturnType values[packetSize];
+    internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
+    for (int i = 0; i < packetSize; ++i) {
+      coeffRef(index+i) = values[i];
+    }
+  }
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
new file mode 100644
index 000000000..20b29e5fd
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -0,0 +1,628 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H
+
+namespace Eigen {
+
+/** \class TensorContraction
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief Tensor contraction class.
+  *
+  *
+  */
+namespace internal {
+
+template<typename Dimensions, typename LhsXprType, typename RhsXprType>
+struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >
+{
+  // Type promotion to handle the case where the types of the lhs and the rhs are different.
+  typedef typename gebp_traits<typename remove_const<typename LhsXprType::Scalar>::type,
+                               typename remove_const<typename RhsXprType::Scalar>::type>::ResScalar Scalar;
+
+  typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind,
+                                        typename traits<RhsXprType>::StorageKind>::ret StorageKind;
+  typedef typename promote_index_type<typename traits<LhsXprType>::Index,
+                                      typename traits<RhsXprType>::Index>::type Index;
+  typedef typename LhsXprType::Nested LhsNested;
+  typedef typename RhsXprType::Nested RhsNested;
+  typedef typename remove_reference<LhsNested>::type _LhsNested;
+  typedef typename remove_reference<RhsNested>::type _RhsNested;
+
+  // From NumDims below.
+  static const int NumDimensions = traits<RhsXprType>::NumDimensions + traits<RhsXprType>::NumDimensions - 2 * array_size<Dimensions>::value;
+  static const int Layout = traits<LhsXprType>::Layout;
+
+  enum {
+    Flags = 0
+  };
+};
+
+template<typename Dimensions, typename LhsXprType, typename RhsXprType>
+struct eval<TensorContractionOp<Dimensions, LhsXprType, RhsXprType>, Eigen::Dense>
+{
+  typedef const TensorContractionOp<Dimensions, LhsXprType, RhsXprType>& type;
+};
+
+template<typename Dimensions, typename LhsXprType, typename RhsXprType>
+struct nested<TensorContractionOp<Dimensions, LhsXprType, RhsXprType>, 1, typename eval<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >::type>
+{
+  typedef TensorContractionOp<Dimensions, LhsXprType, RhsXprType> type;
+};
+
+template<typename Indices_, typename LeftArgType_, typename RightArgType_, typename Device_>
+struct traits<TensorEvaluator<const TensorContractionOp<Indices_, LeftArgType_, RightArgType_>, Device_> > {
+  typedef Indices_ Indices;
+  typedef LeftArgType_ LeftArgType;
+  typedef RightArgType_ RightArgType;
+  typedef Device_ Device;
+
+  // From NumDims below.
+  static const int NumDimensions = traits<LeftArgType_>::NumDimensions + traits<RightArgType_>::NumDimensions - 2 * array_size<Indices_>::value;
+};
+
+}  // end namespace internal
+
+template<typename Indices, typename LhsXprType, typename RhsXprType>
+class TensorContractionOp : public TensorBase<TensorContractionOp<Indices, LhsXprType, RhsXprType>, ReadOnlyAccessors>
+{
+  public:
+  typedef typename Eigen::internal::traits<TensorContractionOp>::Scalar Scalar;
+  typedef typename internal::gebp_traits<typename LhsXprType::CoeffReturnType,
+                                                   typename RhsXprType::CoeffReturnType>::ResScalar CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorContractionOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorContractionOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorContractionOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionOp(
+      const LhsXprType& lhs, const RhsXprType& rhs, const Indices& dims)
+      : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_indices(dims) {}
+
+  EIGEN_DEVICE_FUNC
+  const Indices& indices() const { return m_indices; }
+
+  /** \returns the nested expressions */
+  EIGEN_DEVICE_FUNC
+  const typename internal::remove_all<typename LhsXprType::Nested>::type&
+  lhsExpression() const { return m_lhs_xpr; }
+
+  EIGEN_DEVICE_FUNC
+  const typename internal::remove_all<typename RhsXprType::Nested>::type&
+  rhsExpression() const { return m_rhs_xpr; }
+
+  protected:
+    typename LhsXprType::Nested m_lhs_xpr;
+    typename RhsXprType::Nested m_rhs_xpr;
+    const Indices m_indices;
+};
+
+
+template<typename Derived>
+struct TensorContractionEvaluatorBase
+{
+  typedef typename internal::traits<Derived>::Indices Indices;
+  typedef typename internal::traits<Derived>::LeftArgType LeftArgType;
+  typedef typename internal::traits<Derived>::RightArgType RightArgType;
+  typedef typename internal::traits<Derived>::Device Device;
+
+  typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
+  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
+  typedef typename XprType::Index Index;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+
+  enum {
+    IsAligned = true,
+    PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
+    Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+    CoordAccess = false,  // to be implemented
+    RawAccess = true
+  };
+
+  // Most of the code is assuming that both input tensors are ColMajor. If the
+  // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
+  // If we want to compute A * B = C, where A is LHS and B is RHS, the code
+  // will pretend B is LHS and A is RHS.
+  typedef typename internal::conditional<
+    static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
+  typedef typename internal::conditional<
+    static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
+
+  static const int LDims =
+      internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
+  static const int RDims =
+      internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
+  static const int ContractDims = internal::array_size<Indices>::value;
+  static const int NumDims = LDims + RDims - 2 * ContractDims;
+
+  typedef array<Index, ContractDims> contract_t;
+  typedef array<Index, LDims - ContractDims> left_nocontract_t;
+  typedef array<Index, RDims - ContractDims> right_nocontract_t;
+
+  typedef DSizes<Index, NumDims> Dimensions;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  TensorContractionEvaluatorBase(const XprType& op, const Device& device)
+    : m_leftImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(),
+                          op.lhsExpression(), op.rhsExpression()), device),
+    m_rightImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(),
+                          op.rhsExpression(), op.lhsExpression()), device),
+        m_device(device),
+        m_result(NULL) {
+    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) ==
+			   static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)),
+                        YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+
+    DSizes<Index, LDims> eval_left_dims;
+    DSizes<Index, RDims> eval_right_dims;
+    array<IndexPair<Index>, ContractDims> eval_op_indices;
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      // For ColMajor, we keep using the existing dimensions
+      for (int i = 0; i < LDims; i++) {
+        eval_left_dims[i] = m_leftImpl.dimensions()[i];
+      }
+      for (int i = 0; i < RDims; i++) {
+        eval_right_dims[i] = m_rightImpl.dimensions()[i];
+      }
+      // We keep the pairs of contracting indices.
+      for (int i = 0; i < ContractDims; i++) {
+        eval_op_indices[i].first = op.indices()[i].first;
+        eval_op_indices[i].second = op.indices()[i].second;
+      }
+    } else {
+      // For RowMajor, we need to reverse the existing dimensions
+      for (int i = 0; i < LDims; i++) {
+        eval_left_dims[i] = m_leftImpl.dimensions()[LDims - i - 1];
+      }
+      for (int i = 0; i < RDims; i++) {
+        eval_right_dims[i] = m_rightImpl.dimensions()[RDims - i - 1];
+      }
+      // We need to flip all the pairs of contracting indices as well as
+      // reversing the dimensions.
+      for (int i = 0; i < ContractDims; i++) {
+        eval_op_indices[i].first = LDims - 1 - op.indices()[ContractDims - 1 - i].second;
+        eval_op_indices[i].second = RDims - 1 - op.indices()[ContractDims - 1 - i].first;
+      }
+    }
+
+    // Check for duplicate axes and make sure the first index in eval_op_indices
+    // is increasing. Using O(n^2) sorting is OK since ContractDims is small
+    for (int i = 0; i < ContractDims; i++) {
+      for (int j = i + 1; j < ContractDims; j++) {
+        eigen_assert(eval_op_indices[j].first != eval_op_indices[i].first &&
+                     eval_op_indices[j].second != eval_op_indices[i].second &&
+                     "contraction axes should be unique");
+        if (eval_op_indices[j].first < eval_op_indices[i].first) {
+          numext::swap(eval_op_indices[j], eval_op_indices[i]);
+        }
+      }
+    }
+
+    array<Index, LDims> lhs_strides;
+    lhs_strides[0] = 1;
+    for (int i = 0; i < LDims-1; ++i) {
+      lhs_strides[i+1] = lhs_strides[i] * eval_left_dims[i];
+    }
+
+    array<Index, RDims> rhs_strides;
+    rhs_strides[0] = 1;
+    for (int i = 0; i < RDims-1; ++i) {
+      rhs_strides[i+1] = rhs_strides[i] * eval_right_dims[i];
+    }
+
+    if (m_i_strides.size() > 0) m_i_strides[0] = 1;
+    if (m_j_strides.size() > 0) m_j_strides[0] = 1;
+    if (m_k_strides.size() > 0) m_k_strides[0] = 1;
+
+    m_i_size = 1;
+    m_j_size = 1;
+    m_k_size = 1;
+
+    // To compute the dimension, we simply concatenate the non-contracting
+    // dimensions of the left and then the right tensor. Additionally, we also
+    // compute the strides corresponding to the left non-contracting
+    // dimensions and right non-contracting dimensions.
+    m_lhs_inner_dim_contiguous = true;
+    int dim_idx = 0;
+    unsigned int nocontract_idx = 0;
+
+    for (int i = 0; i < LDims; i++) {
+      // find if we are contracting on index i of left tensor
+      bool contracting = false;
+      for (int j = 0; j < ContractDims; j++) {
+        if (eval_op_indices[j].first == i) {
+          contracting = true;
+          break;
+        }
+      }
+      if (!contracting) {
+        // add dimension size to output dimensions
+        m_dimensions[dim_idx] = eval_left_dims[i];
+        m_left_nocontract_strides[nocontract_idx] = lhs_strides[i];
+        if (dim_idx != i) {
+          m_lhs_inner_dim_contiguous = false;
+        }
+        if (nocontract_idx+1 < internal::array_size<left_nocontract_t>::value) {
+          m_i_strides[nocontract_idx+1] =
+              m_i_strides[nocontract_idx] * eval_left_dims[i];
+        } else {
+          m_i_size = m_i_strides[nocontract_idx] * eval_left_dims[i];
+        }
+        dim_idx++;
+        nocontract_idx++;
+      }
+    }
+
+    nocontract_idx = 0;
+    for (int i = 0; i < RDims; i++) {
+      bool contracting = false;
+      // find if we are contracting on index i of right tensor
+      for (int j = 0; j < ContractDims; j++) {
+        if (eval_op_indices[j].second == i) {
+          contracting = true;
+          break;
+        }
+      }
+      if (!contracting) {
+        m_dimensions[dim_idx] = eval_right_dims[i];
+        if (nocontract_idx+1 < internal::array_size<right_nocontract_t>::value) {
+          m_j_strides[nocontract_idx+1] =
+              m_j_strides[nocontract_idx] * eval_right_dims[i];
+        } else {
+          m_j_size = m_j_strides[nocontract_idx] * eval_right_dims[i];
+        }
+        m_right_nocontract_strides[nocontract_idx] = rhs_strides[i];
+        dim_idx++;
+        nocontract_idx++;
+      }
+    }
+
+    // Now compute the strides corresponding to the contracting dimensions. We
+    // assumed above that non-contracting axes are represented in the same order
+    // in the matrix as they are in the tensor. This is not the case for
+    // contracting axes. As the contracting axes must be of the same size in
+    // each tensor, we'll only look at the first tensor here.
+    m_rhs_inner_dim_contiguous = true;
+    m_rhs_inner_dim_reordered = false;
+    for (int i = 0; i < ContractDims; i++) {
+      Index left = eval_op_indices[i].first;
+      Index right = eval_op_indices[i].second;
+
+      Index size = eval_left_dims[left];
+      eigen_assert(size == eval_right_dims[right] &&
+                   "Contraction axes must be same size");
+
+      if (i+1 < static_cast<int>(internal::array_size<contract_t>::value)) {
+        m_k_strides[i+1] = m_k_strides[i] * size;
+      } else {
+        m_k_size = m_k_strides[i] * size;
+      }
+      m_left_contracting_strides[i] = lhs_strides[left];
+      m_right_contracting_strides[i] = rhs_strides[right];
+
+      if (i > 0 && right < eval_op_indices[i-1].second) {
+        m_rhs_inner_dim_reordered = true;
+      }
+      if (right != i) {
+        m_rhs_inner_dim_contiguous = false;
+      }
+    }
+
+    // If the layout is RowMajor, we need to reverse the m_dimensions
+    if (static_cast<int>(Layout) == static_cast<int>(RowMajor)) {
+      for (int i = 0, j = NumDims - 1; i < j; i++, j--) {
+        numext::swap(m_dimensions[i], m_dimensions[j]);
+      }
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+    m_leftImpl.evalSubExprsIfNeeded(NULL);
+    m_rightImpl.evalSubExprsIfNeeded(NULL);
+    if (data) {
+      evalTo(data);
+      return false;
+    } else {
+      m_result = static_cast<Scalar *>(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)));
+      evalTo(m_result);
+      return true;
+    }
+  }
+
+  EIGEN_DEVICE_FUNC void evalTo(Scalar* buffer) const {
+    if (this->m_lhs_inner_dim_contiguous) {
+      if (this->m_rhs_inner_dim_contiguous) {
+        if (this->m_rhs_inner_dim_reordered) {
+          static_cast<const Derived*>(this)->template evalProduct<true, true, true, Unaligned>(buffer);
+        }
+        else {
+          static_cast<const Derived*>(this)->template evalProduct<true, true, false, Unaligned>(buffer);
+        }
+      }
+      else {
+       if (this->m_rhs_inner_dim_reordered) {
+          static_cast<const Derived*>(this)->template evalProduct<true, false, true, Unaligned>(buffer);
+        }
+        else {
+          static_cast<const Derived*>(this)->template evalProduct<true, false, false, Unaligned>(buffer);
+        }
+      }
+    }
+    else {
+      if (this->m_rhs_inner_dim_contiguous) {
+        if (this->m_rhs_inner_dim_reordered) {
+          static_cast<const Derived*>(this)->template evalProduct<false, true, true, Unaligned>(buffer);
+        }
+        else {
+          static_cast<const Derived*>(this)->template evalProduct<false, true, false, Unaligned>(buffer);
+        }
+      }
+      else {
+       if (this->m_rhs_inner_dim_reordered) {
+          static_cast<const Derived*>(this)->template evalProduct<false, false, true, Unaligned>(buffer);
+        }
+        else {
+          static_cast<const Derived*>(this)->template evalProduct<false, false, false, Unaligned>(buffer);
+        }
+      }
+    }
+  }
+
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+  EIGEN_DEVICE_FUNC void evalGemv(Scalar* buffer) const {
+    const Index rows = m_i_size;
+    const Index cols = m_k_size;
+
+    typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
+    typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
+    typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
+    typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
+    const Index lhs_packet_size = internal::unpacket_traits<typename LeftEvaluator::PacketReturnType>::size;
+    const Index rhs_packet_size = internal::unpacket_traits<typename RightEvaluator::PacketReturnType>::size;
+    const int lhs_alignment = LeftEvaluator::IsAligned ? Aligned : Unaligned;
+    const int rhs_alignment = RightEvaluator::IsAligned ? Aligned : Unaligned;
+    typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
+                                                   LeftEvaluator, left_nocontract_t,
+                                                   contract_t, lhs_packet_size,
+                                                   lhs_inner_dim_contiguous,
+                                                   false, lhs_alignment> LhsMapper;
+
+    typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
+                                                   RightEvaluator, right_nocontract_t,
+                                                   contract_t, rhs_packet_size,
+                                                   rhs_inner_dim_contiguous,
+                                                   rhs_inner_dim_reordered, rhs_alignment> RhsMapper;
+
+    LhsMapper lhs(m_leftImpl, m_left_nocontract_strides, m_i_strides,
+                  m_left_contracting_strides, m_k_strides);
+    RhsMapper rhs(m_rightImpl, m_right_nocontract_strides, m_j_strides,
+                  m_right_contracting_strides, m_k_strides);
+
+    const Scalar alpha(1);
+    const Index resIncr(1);
+
+    // zero out the result buffer (which must be of size at least rows * sizeof(Scalar)
+    m_device.memset(buffer, 0, rows * sizeof(Scalar));
+
+    internal::general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,false,RhsScalar,RhsMapper,false>::run(
+        rows, cols, lhs, rhs,
+        buffer, resIncr, alpha);
+  }
+
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+  EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const {
+    // columns in left side, rows in right side
+    const Index k = this->m_k_size;
+
+    // rows in left side
+    const Index m = this->m_i_size;
+
+    // columns in right side
+    const Index n = this->m_j_size;
+
+    // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)
+    this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
+
+    // define mr, nr, and all of my data mapper types
+    typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
+    typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
+    typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
+
+    const Index nr = Traits::nr;
+    const Index mr = Traits::mr;
+
+    typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
+    typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
+
+    const Index lhs_packet_size = internal::unpacket_traits<typename LeftEvaluator::PacketReturnType>::size;
+    const Index rhs_packet_size = internal::unpacket_traits<typename RightEvaluator::PacketReturnType>::size;
+
+    typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
+                                                   LeftEvaluator, left_nocontract_t,
+                                                   contract_t, lhs_packet_size,
+                                                   lhs_inner_dim_contiguous,
+                                                   false, Unaligned> LhsMapper;
+
+    typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
+                                                   RightEvaluator, right_nocontract_t,
+                                                   contract_t, rhs_packet_size,
+                                                   rhs_inner_dim_contiguous,
+                                                   rhs_inner_dim_reordered, Unaligned> RhsMapper;
+
+    typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+
+    // Declare GEBP packing and kernel structs
+    internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, mr, Traits::LhsProgress, ColMajor> pack_lhs;
+    internal::gemm_pack_rhs<RhsScalar, Index, typename RhsMapper::SubMapper, nr, ColMajor> pack_rhs;
+
+    internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper, mr, nr, false, false> gebp;
+
+    // initialize data mappers
+    LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
+                  this->m_left_contracting_strides, this->m_k_strides);
+
+    RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
+                  this->m_right_contracting_strides, this->m_k_strides);
+
+    OutputMapper output(buffer, m);
+
+    // Sizes of the blocks to load in cache. See the Goto paper for details.
+    internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index, internal::ShardByCol> blocking(k, m, n, 1);
+    const Index kc = blocking.kc();
+    const Index mc = numext::mini(m, blocking.mc());
+    const Index nc = numext::mini(n, blocking.nc());
+    const Index sizeA = mc * kc;
+    const Index sizeB = kc * nc;
+
+    LhsScalar* blockA = static_cast<LhsScalar *>(this->m_device.allocate(sizeA * sizeof(LhsScalar)));
+    RhsScalar* blockB = static_cast<RhsScalar *>(this->m_device.allocate(sizeB * sizeof(RhsScalar)));
+
+    for(Index i2=0; i2<m; i2+=mc)
+    {
+      const Index actual_mc = numext::mini(i2+mc,m)-i2;
+      for (Index k2 = 0; k2 < k; k2 += kc) {
+        // make sure we don't overshoot right edge of left matrix, then pack vertical panel
+        const Index actual_kc = numext::mini(k2 + kc, k) - k2;
+        pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc, 0, 0);
+
+        // series of horizontal blocks
+        for (Index j2 = 0; j2 < n; j2 += nc) {
+          // make sure we don't overshoot right edge of right matrix, then pack block
+          const Index actual_nc = numext::mini(j2 + nc, n) - j2;
+          pack_rhs(blockB, rhs.getSubMapper(k2, j2), actual_kc, actual_nc, 0, 0);
+
+          // call gebp (matrix kernel)
+          // The parameters here are copied from Eigen's GEMM implementation
+          gebp(output.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, Scalar(1), -1, -1, 0, 0);
+        }
+      }
+    }
+
+    this->m_device.deallocate(blockA);
+    this->m_device.deallocate(blockB);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+    m_leftImpl.cleanup();
+    m_rightImpl.cleanup();
+
+    if (m_result != NULL) {
+      m_device.deallocate(m_result);
+      m_result = NULL;
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    return m_result[index];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const {
+    return TensorOpCost(sizeof(CoeffReturnType), 0, 0);
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
+    return internal::ploadt<PacketReturnType, LoadMode>(m_result + index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { return m_result; }
+
+  protected:
+  // Prevent assignment
+  TensorContractionEvaluatorBase& operator = (const TensorContractionEvaluatorBase&);
+  Dimensions m_dimensions;
+
+  contract_t m_k_strides;
+  contract_t m_left_contracting_strides;
+  contract_t m_right_contracting_strides;
+
+  bool m_lhs_inner_dim_contiguous;
+  bool m_rhs_inner_dim_contiguous;
+  bool m_rhs_inner_dim_reordered;
+
+  left_nocontract_t m_i_strides;
+  right_nocontract_t m_j_strides;
+  left_nocontract_t m_left_nocontract_strides;
+  right_nocontract_t m_right_nocontract_strides;
+
+  Index m_i_size;
+  Index m_j_size;
+  Index m_k_size;
+
+  TensorEvaluator<EvalLeftArgType, Device> m_leftImpl;
+  TensorEvaluator<EvalRightArgType, Device> m_rightImpl;
+  const Device& m_device;
+  Scalar* m_result;
+};
+
+
+// evaluator for default device
+template<typename Indices, typename LeftArgType, typename RightArgType, typename Device>
+struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> :
+    public TensorContractionEvaluatorBase<
+      TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> > {
+  typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self;
+  typedef TensorContractionEvaluatorBase<Self> Base;
+
+  typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
+  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
+  typedef typename XprType::Index Index;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+
+  enum {
+    Layout = TensorEvaluator<LeftArgType, Device>::Layout
+  };
+
+  // Most of the code is assuming that both input tensors are ColMajor. If the
+  // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
+  // If we want to compute A * B = C, where A is LHS and B is RHS, the code
+  // will pretend B is LHS and A is RHS.
+  typedef typename internal::conditional<
+    static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
+  typedef typename internal::conditional<
+    static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
+
+  static const int LDims =
+      internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
+  static const int RDims =
+      internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
+  static const int ContractDims = internal::array_size<Indices>::value;
+
+  typedef array<Index, ContractDims> contract_t;
+  typedef array<Index, LDims - ContractDims> left_nocontract_t;
+  typedef array<Index, RDims - ContractDims> right_nocontract_t;
+
+  static const int NumDims = LDims + RDims - 2 * ContractDims;
+
+  // Could we use NumDimensions here?
+  typedef DSizes<Index, NumDims> Dimensions;
+
+  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
+      Base(op, device) { }
+
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+  EIGEN_DEVICE_FUNC void evalProduct(Scalar* buffer) const {
+    if (this->m_j_size == 1) {
+      this->template evalGemv<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
+      return;
+    }
+
+    this->template evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
+  }
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
new file mode 100644
index 000000000..5cf7b4f71
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
@@ -0,0 +1,56 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H
+
+
+namespace Eigen {
+namespace internal {
+
+enum {
+  ShardByRow = 0,
+  ShardByCol = 1
+};
+
+
+// Default Blocking Strategy
+template <typename LhsMapper, typename RhsMapper, typename Index, int ShardingType=ShardByCol>
+class TensorContractionBlocking {
+ public:
+
+  typedef typename LhsMapper::Scalar LhsScalar;
+  typedef typename RhsMapper::Scalar RhsScalar;
+
+  EIGEN_DEVICE_FUNC TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) :
+      kc_(k), mc_(m), nc_(n)
+  {
+    if (ShardingType == ShardByCol) {
+      computeProductBlockingSizes<LhsScalar, RhsScalar, 1>(kc_, mc_, nc_, num_threads);
+    }
+    else {
+      computeProductBlockingSizes<LhsScalar, RhsScalar, 1>(kc_, nc_, mc_, num_threads);
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index kc() const { return kc_; }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index mc() const { return mc_; }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index nc() const { return nc_; }
+
+ private:
+  Index kc_;
+  Index mc_;
+  Index nc_;
+};
+
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
new file mode 100644
index 000000000..d65dbb40f
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
@@ -0,0 +1,1391 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014-2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2015 Navdeep Jaitly <ndjaitly@google.com>
+// Copyright (C) 2014 Eric Martin <eric@ericmart.in>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H
+
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+
+namespace Eigen {
+
+template<typename Scalar, typename Index, typename LhsMapper,
+         typename RhsMapper, typename OutputMapper, bool needs_edge_check>
+__device__ EIGEN_STRONG_INLINE void
+EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
+                               const OutputMapper output, Scalar* lhs_shmem, Scalar* rhs_shmem,
+                       const Index m_size, const Index n_size, const Index k_size) {
+
+  const Index m_block_idx = blockIdx.x;
+  const Index n_block_idx = blockIdx.y;
+
+  const Index base_m = 64 * m_block_idx;
+  const Index base_n = 64 * n_block_idx;
+
+  // declare and initialize 64 registers for output 8x8 block
+
+  // prefetch registers
+  Scalar lhs_pf0;
+  Scalar lhs_pf1;
+  Scalar lhs_pf2;
+  Scalar lhs_pf3;
+  Scalar lhs_pf4;
+  Scalar lhs_pf5;
+  Scalar lhs_pf6;
+  Scalar lhs_pf7;
+
+  Scalar rhs_pf0;
+  Scalar rhs_pf1;
+  Scalar rhs_pf2;
+  Scalar rhs_pf3;
+  Scalar rhs_pf4;
+  Scalar rhs_pf5;
+  Scalar rhs_pf6;
+  Scalar rhs_pf7;
+
+  // shared memory is formatted
+  // (contract idx in block, nocontract idx in block, block idx)
+  // where block idx is column major. This transposition limits the number of
+  // bank conflicts when reading the LHS. The core idea is that since the contracting
+  // index is shared by both sides, then the contracting index should be in threadIdx.x.
+
+  // On the LHS, we pad each row inside of each block with an extra element. This makes
+  // each block 8 rows of 9 elements, which is 72 elements. This gives no bank conflicts
+  // on writes and very few 2-way conflicts on reads. There is an 8x8 grid of these blocks.
+
+  // On the RHS we just add 8 padding elements to the end of each block. This gives no bank
+  // conflicts on writes and also none on reads.
+
+  // storage indices
+  const Index lhs_store_idx_base = threadIdx.y * 72 + threadIdx.x * 9 + threadIdx.z;
+  const Index rhs_store_idx_base = threadIdx.y * 72 + threadIdx.z * 8 + threadIdx.x;
+
+  const Index lhs_store_idx_0 = lhs_store_idx_base + 576 * 0;
+  const Index lhs_store_idx_1 = lhs_store_idx_base + 576 * 1;
+  const Index lhs_store_idx_2 = lhs_store_idx_base + 576 * 2;
+  const Index lhs_store_idx_3 = lhs_store_idx_base + 576 * 3;
+  const Index lhs_store_idx_4 = lhs_store_idx_base + 576 * 4;
+  const Index lhs_store_idx_5 = lhs_store_idx_base + 576 * 5;
+  const Index lhs_store_idx_6 = lhs_store_idx_base + 576 * 6;
+  const Index lhs_store_idx_7 = lhs_store_idx_base + 576 * 7;
+
+  const Index rhs_store_idx_0 = rhs_store_idx_base + 576 * 0;
+  const Index rhs_store_idx_1 = rhs_store_idx_base + 576 * 1;
+  const Index rhs_store_idx_2 = rhs_store_idx_base + 576 * 2;
+  const Index rhs_store_idx_3 = rhs_store_idx_base + 576 * 3;
+  const Index rhs_store_idx_4 = rhs_store_idx_base + 576 * 4;
+  const Index rhs_store_idx_5 = rhs_store_idx_base + 576 * 5;
+  const Index rhs_store_idx_6 = rhs_store_idx_base + 576 * 6;
+  const Index rhs_store_idx_7 = rhs_store_idx_base + 576 * 7;
+
+  // in the loading code, the following variables are important:
+  // threadIdx.x: the vertical position in an 8x8 block
+  // threadIdx.y: the vertical index of the 8x8 block in the grid
+  // threadIdx.z: the horizontal position in an 8x8 block
+  // k: the horizontal index of the 8x8 block in the grid
+  //
+  // The k parameter is implicit (it was the loop counter for a loop that went
+  // from 0 to <8, but now that loop is unrolled in the below code.
+
+  const Index load_idx_vert = threadIdx.x + 8 * threadIdx.y;
+  const Index lhs_vert = base_m + load_idx_vert;
+
+#define prefetchIntoRegisters(base_k)                           \
+  {                                                             \
+    lhs_pf0 = conv(0);                                          \
+    lhs_pf1 = conv(0);                                          \
+    lhs_pf2 = conv(0);                                          \
+    lhs_pf3 = conv(0);                                          \
+    lhs_pf4 = conv(0);                                          \
+    lhs_pf5 = conv(0);                                          \
+    lhs_pf6 = conv(0);                                          \
+    lhs_pf7 = conv(0);                                          \
+                                                                \
+    rhs_pf0 = conv(0);                                          \
+    rhs_pf1 = conv(0);                                          \
+    rhs_pf2 = conv(0);                                          \
+    rhs_pf3 = conv(0);                                          \
+    rhs_pf4 = conv(0);                                          \
+    rhs_pf5 = conv(0);                                          \
+    rhs_pf6 = conv(0);                                          \
+    rhs_pf7 = conv(0);                                          \
+                                                                \
+    if (!needs_edge_check || lhs_vert < m_size) {               \
+      const Index lhs_horiz_0 = base_k + threadIdx.z + 0 * 8;   \
+      const Index lhs_horiz_1 = base_k + threadIdx.z + 1 * 8;   \
+      const Index lhs_horiz_2 = base_k + threadIdx.z + 2 * 8;   \
+      const Index lhs_horiz_3 = base_k + threadIdx.z + 3 * 8;   \
+      const Index lhs_horiz_4 = base_k + threadIdx.z + 4 * 8;   \
+      const Index lhs_horiz_5 = base_k + threadIdx.z + 5 * 8;   \
+      const Index lhs_horiz_6 = base_k + threadIdx.z + 6 * 8;   \
+      const Index lhs_horiz_7 = base_k + threadIdx.z + 7 * 8;   \
+                                                                \
+      if (!needs_edge_check || lhs_horiz_7 < k_size) {          \
+        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
+        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
+        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
+        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                   \
+        lhs_pf4 = lhs(lhs_vert, lhs_horiz_4);                   \
+        lhs_pf5 = lhs(lhs_vert, lhs_horiz_5);                   \
+        lhs_pf6 = lhs(lhs_vert, lhs_horiz_6);                   \
+        lhs_pf7 = lhs(lhs_vert, lhs_horiz_7);                   \
+      } else if (lhs_horiz_6 < k_size) {                        \
+        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
+        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
+        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
+        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                   \
+        lhs_pf4 = lhs(lhs_vert, lhs_horiz_4);                   \
+        lhs_pf5 = lhs(lhs_vert, lhs_horiz_5);                   \
+        lhs_pf6 = lhs(lhs_vert, lhs_horiz_6);                   \
+      } else if (lhs_horiz_5 < k_size) {                        \
+        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
+        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
+        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
+        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                   \
+        lhs_pf4 = lhs(lhs_vert, lhs_horiz_4);                   \
+        lhs_pf5 = lhs(lhs_vert, lhs_horiz_5);                   \
+      } else if (lhs_horiz_4 < k_size) {                        \
+        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
+        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
+        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
+        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                   \
+        lhs_pf4 = lhs(lhs_vert, lhs_horiz_4);                   \
+      } else if (lhs_horiz_3 < k_size) {                        \
+        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
+        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
+        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
+        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                   \
+      } else if (lhs_horiz_2 < k_size) {                        \
+        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
+        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
+        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
+      } else if (lhs_horiz_1 < k_size) {                        \
+        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
+        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
+      } else if (lhs_horiz_0 < k_size) {                        \
+        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
+      }                                                         \
+    }                                                           \
+                                                                \
+    const Index rhs_vert = base_k + load_idx_vert;              \
+    if (!needs_edge_check || rhs_vert < k_size) {               \
+      const Index rhs_horiz_0 = base_n + threadIdx.z + 0 * 8;   \
+      const Index rhs_horiz_1 = base_n + threadIdx.z + 1 * 8;   \
+      const Index rhs_horiz_2 = base_n + threadIdx.z + 2 * 8;   \
+      const Index rhs_horiz_3 = base_n + threadIdx.z + 3 * 8;   \
+      const Index rhs_horiz_4 = base_n + threadIdx.z + 4 * 8;   \
+      const Index rhs_horiz_5 = base_n + threadIdx.z + 5 * 8;   \
+      const Index rhs_horiz_6 = base_n + threadIdx.z + 6 * 8;   \
+      const Index rhs_horiz_7 = base_n + threadIdx.z + 7 * 8;   \
+                                                                \
+      if (rhs_horiz_7 < n_size) {                               \
+        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
+        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
+        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
+        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                   \
+        rhs_pf4 = rhs(rhs_vert, rhs_horiz_4);                   \
+        rhs_pf5 = rhs(rhs_vert, rhs_horiz_5);                   \
+        rhs_pf6 = rhs(rhs_vert, rhs_horiz_6);                   \
+        rhs_pf7 = rhs(rhs_vert, rhs_horiz_7);                   \
+      } else if (rhs_horiz_6 < n_size) {                        \
+        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
+        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
+        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
+        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                   \
+        rhs_pf4 = rhs(rhs_vert, rhs_horiz_4);                   \
+        rhs_pf5 = rhs(rhs_vert, rhs_horiz_5);                   \
+        rhs_pf6 = rhs(rhs_vert, rhs_horiz_6);                   \
+      } else if (rhs_horiz_5 < n_size) {                        \
+        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
+        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
+        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
+        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                   \
+        rhs_pf4 = rhs(rhs_vert, rhs_horiz_4);                   \
+        rhs_pf5 = rhs(rhs_vert, rhs_horiz_5);                   \
+      } else if (rhs_horiz_4 < n_size) {                        \
+        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
+        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
+        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
+        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                   \
+        rhs_pf4 = rhs(rhs_vert, rhs_horiz_4);                   \
+      } else if (rhs_horiz_3 < n_size) {                        \
+        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
+        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
+        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
+        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                   \
+      } else if (rhs_horiz_2 < n_size) {                        \
+        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
+        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
+        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
+      } else if (rhs_horiz_1 < n_size) {                        \
+        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
+        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
+      } else if (rhs_horiz_0 < n_size) {                        \
+        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
+      }                                                         \
+    }                                                           \
+  }                                                             \
+
+#define writeRegToShmem(_)                      \
+  lhs_shmem[lhs_store_idx_0] = lhs_pf0;         \
+  rhs_shmem[rhs_store_idx_0] = rhs_pf0;         \
+                                                \
+  lhs_shmem[lhs_store_idx_1] = lhs_pf1;         \
+  rhs_shmem[rhs_store_idx_1] = rhs_pf1;         \
+                                                \
+  lhs_shmem[lhs_store_idx_2] = lhs_pf2;         \
+  rhs_shmem[rhs_store_idx_2] = rhs_pf2;         \
+                                                \
+  lhs_shmem[lhs_store_idx_3] = lhs_pf3;         \
+  rhs_shmem[rhs_store_idx_3] = rhs_pf3;         \
+                                                \
+  lhs_shmem[lhs_store_idx_4] = lhs_pf4;         \
+  rhs_shmem[rhs_store_idx_4] = rhs_pf4;         \
+                                                \
+  lhs_shmem[lhs_store_idx_5] = lhs_pf5;         \
+  rhs_shmem[rhs_store_idx_5] = rhs_pf5;         \
+                                                \
+  lhs_shmem[lhs_store_idx_6] = lhs_pf6;         \
+  rhs_shmem[rhs_store_idx_6] = rhs_pf6;         \
+                                                \
+  lhs_shmem[lhs_store_idx_7] = lhs_pf7;         \
+  rhs_shmem[rhs_store_idx_7] = rhs_pf7;         \
+
+  // declare and initialize result array
+#define res(i, j) _res_##i##j
+#define initResultRow(i)                        \
+  Scalar res(i, 0) = conv(0);                   \
+  Scalar res(i, 1) = conv(0);                   \
+  Scalar res(i, 2) = conv(0);                   \
+  Scalar res(i, 3) = conv(0);                   \
+  Scalar res(i, 4) = conv(0);                   \
+  Scalar res(i, 5) = conv(0);                   \
+  Scalar res(i, 6) = conv(0);                   \
+  Scalar res(i, 7) = conv(0);                   \
+
+  internal::scalar_cast_op<int, Scalar> conv;
+  initResultRow(0);
+  initResultRow(1);
+  initResultRow(2);
+  initResultRow(3);
+  initResultRow(4);
+  initResultRow(5);
+  initResultRow(6);
+  initResultRow(7);
+#undef initResultRow
+
+  for (Index base_k = 0; base_k < k_size; base_k += 64) {
+    // wait for previous iteration to finish with shmem. Despite common sense,
+    // the code is a bit faster with this here then at bottom of loop
+    __syncthreads();
+
+    prefetchIntoRegisters(base_k);
+    writeRegToShmem();
+
+    #undef prefetchIntoRegisters
+    #undef writeRegToShmem
+
+    // wait for shared mem packing to be done before starting computation
+    __syncthreads();
+
+    // compute 8x8 matrix product by outer product. This involves packing one column
+    // of LHS and one row of RHS into registers (takes 16 registers).
+
+#define lcol(i) _lcol##i
+    Scalar lcol(0);
+    Scalar lcol(1);
+    Scalar lcol(2);
+    Scalar lcol(3);
+    Scalar lcol(4);
+    Scalar lcol(5);
+    Scalar lcol(6);
+    Scalar lcol(7);
+
+#define rrow(j) _rrow##j
+    Scalar rrow(0);
+    Scalar rrow(1);
+    Scalar rrow(2);
+    Scalar rrow(3);
+    Scalar rrow(4);
+    Scalar rrow(5);
+    Scalar rrow(6);
+    Scalar rrow(7);
+
+    // Now x corresponds to k, y to m, and z to n
+    const Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y];
+    const Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z];
+
+#define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))]
+#define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))]
+
+#define loadData(i, j)                          \
+    lcol(0) = lhs_element(0, j);               \
+    rrow(0) = rhs_element(i, 0);               \
+    lcol(1) = lhs_element(1, j);               \
+    rrow(1) = rhs_element(i, 1);               \
+    lcol(2) = lhs_element(2, j);               \
+    rrow(2) = rhs_element(i, 2);               \
+    lcol(3) = lhs_element(3, j);               \
+    rrow(3) = rhs_element(i, 3);               \
+    lcol(4) = lhs_element(4, j);               \
+    rrow(4) = rhs_element(i, 4);               \
+    lcol(5) = lhs_element(5, j);               \
+    rrow(5) = rhs_element(i, 5);               \
+    lcol(6) = lhs_element(6, j);               \
+    rrow(6) = rhs_element(i, 6);               \
+    lcol(7) = lhs_element(7, j);               \
+    rrow(7) = rhs_element(i, 7);               \
+
+#define computeCol(j)                           \
+    res(0, j) += lcol(0) * rrow(j);             \
+    res(1, j) += lcol(1) * rrow(j);             \
+    res(2, j) += lcol(2) * rrow(j);             \
+    res(3, j) += lcol(3) * rrow(j);             \
+    res(4, j) += lcol(4) * rrow(j);             \
+    res(5, j) += lcol(5) * rrow(j);             \
+    res(6, j) += lcol(6) * rrow(j);             \
+    res(7, j) += lcol(7) * rrow(j);             \
+
+#define computePass(i)                          \
+    loadData(i, i);                             \
+                                                \
+    computeCol(0);                              \
+    computeCol(1);                              \
+    computeCol(2);                              \
+    computeCol(3);                              \
+    computeCol(4);                              \
+    computeCol(5);                              \
+    computeCol(6);                              \
+    computeCol(7);                              \
+
+    computePass(0);
+    computePass(1);
+    computePass(2);
+    computePass(3);
+    computePass(4);
+    computePass(5);
+    computePass(6);
+    computePass(7);
+
+#undef lcol
+#undef rrow
+#undef lhs_element
+#undef rhs_element
+#undef loadData
+#undef computeCol
+#undef computePass
+  } // end loop over k
+
+  // we've now iterated over all of the large (ie width 64) k blocks and
+  // accumulated results in registers. At this point thread (x, y, z) contains
+  // the sum across all big k blocks of the product of little k block of index (x, y)
+  // with block of index (y, z). To compute the final output, we need to reduce
+  // the 8 threads over y by summation.
+#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask)
+
+#define reduceRow(i, mask)                      \
+  shuffleInc(i, 0, mask);                       \
+  shuffleInc(i, 1, mask);                       \
+  shuffleInc(i, 2, mask);                       \
+  shuffleInc(i, 3, mask);                       \
+  shuffleInc(i, 4, mask);                       \
+  shuffleInc(i, 5, mask);                       \
+  shuffleInc(i, 6, mask);                       \
+  shuffleInc(i, 7, mask);                       \
+
+#define reduceMatrix(mask)                      \
+  reduceRow(0, mask);                           \
+  reduceRow(1, mask);                           \
+  reduceRow(2, mask);                           \
+  reduceRow(3, mask);                           \
+  reduceRow(4, mask);                           \
+  reduceRow(5, mask);                           \
+  reduceRow(6, mask);                           \
+  reduceRow(7, mask);                           \
+
+  // actually perform the reduction, now each thread of index (_, y, z)
+  // contains the correct values in its registers that belong in the output
+  // block
+  reduceMatrix(1);
+  reduceMatrix(2);
+  reduceMatrix(4);
+
+#undef shuffleInc
+#undef reduceRow
+#undef reduceMatrix
+
+  // now we need to copy the 64 values into main memory. We can't split work
+  // among threads because all variables are in registers. There's 2 ways
+  // to do this:
+  // (1) have 1 thread do 64 writes from registers into global memory
+  // (2) have 1 thread do 64 writes into shared memory, and then 8 threads
+  //     each do 8 writes into global memory. We can just overwrite the shared
+  //     memory from the problem we just solved.
+  // (2) is slightly faster than (1) due to less branching and more ILP
+
+  // TODO: won't yield much gain, but could just use currently unused shared mem
+  //       and then we won't have to sync
+  // wait for shared mem to be out of use
+  __syncthreads();
+
+#define writeResultShmem(i, j)                                          \
+  lhs_shmem[i + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j] = res(i, j); \
+
+#define writeRow(i)                             \
+  writeResultShmem(i, 0);                       \
+  writeResultShmem(i, 1);                       \
+  writeResultShmem(i, 2);                       \
+  writeResultShmem(i, 3);                       \
+  writeResultShmem(i, 4);                       \
+  writeResultShmem(i, 5);                       \
+  writeResultShmem(i, 6);                       \
+  writeResultShmem(i, 7);                       \
+
+  if (threadIdx.x == 0) {
+    writeRow(0);
+    writeRow(1);
+    writeRow(2);
+    writeRow(3);
+    writeRow(4);
+    writeRow(5);
+    writeRow(6);
+    writeRow(7);
+  }
+#undef writeResultShmem
+#undef writeRow
+
+  const int max_i_write = numext::mini((int)((m_size - base_m - threadIdx.y + 7) / 8), 8);
+  const int max_j_write = numext::mini((int)((n_size - base_n - threadIdx.z + 7) / 8), 8);
+
+  if (threadIdx.x < max_i_write) {
+    if (max_j_write == 8) {
+      // TODO: can i trade bank conflicts for coalesced writes?
+      Scalar val0 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 0];
+      Scalar val1 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 1];
+      Scalar val2 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 2];
+      Scalar val3 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 3];
+      Scalar val4 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 4];
+      Scalar val5 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 5];
+      Scalar val6 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 6];
+      Scalar val7 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 7];
+
+      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 0) = val0;
+      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 1) = val1;
+      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 2) = val2;
+      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 3) = val3;
+      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 4) = val4;
+      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 5) = val5;
+      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 6) = val6;
+      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 7) = val7;
+    } else {
+#pragma unroll 7
+      for (int j = 0; j < max_j_write; j++) {
+        Scalar val = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j];
+        output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * j) = val;
+      }
+    }
+  }
+#undef res
+}
+
+
+template<typename Scalar, typename Index, typename LhsMapper,
+         typename RhsMapper, typename OutputMapper>
+__global__ void
+__launch_bounds__(512)
+EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
+                       const OutputMapper output,
+                       const Index m_size, const Index n_size, const Index k_size) {
+  __shared__ Scalar lhs_shmem[72 * 64];
+  __shared__ Scalar rhs_shmem[72 * 64];
+
+  const Index m_block_idx = blockIdx.x;
+  const Index n_block_idx = blockIdx.y;
+
+  const Index base_m = 64 * m_block_idx;
+  const Index base_n = 64 * n_block_idx;
+
+  if (base_m + 63 < m_size && base_n + 63 < n_size) {
+    EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size);
+  } else {
+    EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size);
+  }
+}
+
+
+template<typename Index, typename LhsMapper,
+         typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY,
+         bool CHECK_RHS_BOUNDARY>
+__device__ EIGEN_STRONG_INLINE void
+EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rhs,
+                       const OutputMapper output, float2 lhs_shmem2[][16],
+                       float2 rhs_shmem2[][8], const Index m_size,
+                       const Index n_size, const Index k_size,
+                       const Index base_m, const Index base_n) {
+  typedef float Scalar;
+
+  // prefetch registers
+  float4 lhs_pf0, rhs_pf0;
+
+  float4 results[4];
+  for (int i=0; i < 4; i++) {
+    results[i].x = results[i].y = results[i].z = results[i].w = 0;
+  }
+
+
+#define prefetch_lhs(reg, row, col)                   \
+    if (!CHECK_LHS_BOUNDARY) {                        \
+      if (col < k_size) {                             \
+        reg =lhs.loadPacket<Unaligned>(row, col);     \
+      }                                               \
+    } else {                                          \
+      if (col < k_size) {                             \
+        if (row + 3 < m_size) {                       \
+          reg =lhs.loadPacket<Unaligned>(row, col);   \
+        } else if (row + 2 < m_size) {                \
+          reg.x =lhs(row + 0, col);                   \
+          reg.y =lhs(row + 1, col);                   \
+          reg.z =lhs(row + 2, col);                   \
+        } else if (row + 1 < m_size) {                \
+          reg.x =lhs(row + 0, col);                   \
+          reg.y =lhs(row + 1, col);                   \
+        } else if (row  < m_size) {                   \
+          reg.x =lhs(row + 0, col);                   \
+        }                                             \
+      }                                               \
+    }                                                 \
+
+
+  Index lhs_vert = base_m+threadIdx.x*4;
+
+  for (Index k = 0; k < k_size; k += 16) {
+    lhs_pf0 = internal::pset1<float4>(0);
+    rhs_pf0 = internal::pset1<float4>(0);
+
+    Index lhs_horiz = threadIdx.y+k;
+    prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz)
+
+    Index rhs_vert = k+(threadIdx.x%4)*4;
+    Index rhs_horiz0 = (threadIdx.x>>2)+threadIdx.y*4+base_n;
+
+    if (!CHECK_RHS_BOUNDARY) {
+      if ((rhs_vert + 3) < k_size) {
+        // just CHECK_RHS_BOUNDARY
+        rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+      } else if (rhs_vert + 2 < k_size) {
+        // just CHECK_RHS_BOUNDARY
+        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+        rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+        rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+      } else if (rhs_vert + 1 < k_size) {
+        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+        rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+      } else if (rhs_vert  < k_size) {
+        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+      }
+    } else {
+      if (rhs_horiz0 < n_size) {
+        if ((rhs_vert + 3) < k_size) {
+          rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+        } else if ((rhs_vert + 2) < k_size) {
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+          rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+        } else if ((rhs_vert + 1) < k_size) {
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+        } else if (rhs_vert  < k_size) {
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+        }
+      }
+    }
+    float x1, x2 ;
+    // the following can be a bitwise operation..... some day.
+    if((threadIdx.x%8) < 4) {
+      x1 = rhs_pf0.y;
+      x2 = rhs_pf0.w;
+    } else {
+      x1 = rhs_pf0.x;
+      x2 = rhs_pf0.z;
+    }
+    x1 = __shfl_xor(x1, 4);
+    x2 = __shfl_xor(x2, 4);
+    if((threadIdx.x%8) < 4) {
+      rhs_pf0.y = x1;
+      rhs_pf0.w = x2;
+    } else {
+      rhs_pf0.x = x1;
+      rhs_pf0.z = x2;
+    }
+
+    // We have 64 features.
+    // Row 0 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 0, 1.
+    // Row 1 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 2, 3.
+    // ...
+    // Row 31 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 62, 63
+    // Row 32 -> times (2, 6, 10, 14, 3, 7, 11, 15) for features 0, 1
+    // ...
+    rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2][threadIdx.x%8] = make_float2(rhs_pf0.x, rhs_pf0.y);
+    rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2+32][threadIdx.x%8] = make_float2(rhs_pf0.z, rhs_pf0.w);
+
+    // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61)
+    // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61)
+    // ...
+    // Row 15 (time 15) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61)
+    // Row 16 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), ..  (62, 63)
+    // ...
+
+    lhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(lhs_pf0.x, lhs_pf0.y);
+    lhs_shmem2[threadIdx.y+16][threadIdx.x] = make_float2(lhs_pf0.z, lhs_pf0.w);
+
+
+#define add_vals(fl1, fl2, fr1, fr2)\
+    results[0].x += fl1.x * fr1.x;\
+    results[0].y += fl1.y * fr1.x;\
+    results[0].z += fl2.x * fr1.x;\
+    results[0].w += fl2.y * fr1.x;\
+\
+    results[1].x += fl1.x * fr1.y;\
+    results[1].y += fl1.y * fr1.y;\
+    results[1].z += fl2.x * fr1.y;\
+    results[1].w += fl2.y * fr1.y;\
+\
+    results[2].x += fl1.x * fr2.x;\
+    results[2].y += fl1.y * fr2.x;\
+    results[2].z += fl2.x * fr2.x;\
+    results[2].w += fl2.y * fr2.x;\
+\
+    results[3].x += fl1.x * fr2.y;\
+    results[3].y += fl1.y * fr2.y;\
+    results[3].z += fl2.x * fr2.y;\
+    results[3].w += fl2.y * fr2.y;\
+
+    __syncthreads();
+
+    // Do the multiplies.
+    #pragma unroll
+    for (int koff = 0; koff < 16; koff ++) {
+      // 32 x threads.
+      float2 fl1 = lhs_shmem2[koff][threadIdx.x];
+      float2 fl2 = lhs_shmem2[koff + 16][threadIdx.x];
+
+      int start_feature = threadIdx.y * 4;
+      float2 fr1 = rhs_shmem2[(start_feature>>1) + 32*((koff%4)/2)][koff/4 + (koff%2)*4];
+      float2 fr2 = rhs_shmem2[(start_feature>>1) + 1 + 32*((koff%4)/2)][koff/4 + (koff%2)*4];
+
+      add_vals(fl1, fl2, fr1, fr2)
+    }
+    __syncthreads();
+  }
+
+#undef prefetch_lhs
+#undef add_vals
+
+  Index horiz_base = threadIdx.y*4+base_n;
+  if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) {
+    for (int i = 0; i < 4; i++) {
+      output(lhs_vert, horiz_base + i) = results[i].x;
+      output(lhs_vert + 1, horiz_base + i) = results[i].y;
+      output(lhs_vert + 2, horiz_base + i) = results[i].z;
+      output(lhs_vert + 3, horiz_base + i) = results[i].w;
+    }
+  } else if (!CHECK_RHS_BOUNDARY) {
+    // CHECK LHS
+    if (lhs_vert + 3 < m_size) {
+      for (int i = 0; i < 4; i++) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+        output(lhs_vert + 1, horiz_base + i) = results[i].y;
+        output(lhs_vert + 2, horiz_base + i) = results[i].z;
+        output(lhs_vert + 3, horiz_base + i) = results[i].w;
+      }
+    } else if (lhs_vert + 2 < m_size) {
+      for (int i = 0; i < 4; i++) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+        output(lhs_vert + 1, horiz_base + i) = results[i].y;
+        output(lhs_vert + 2, horiz_base + i) = results[i].z;
+      }
+    } else if (lhs_vert + 1 < m_size) {
+      for (int i = 0; i < 4; i++) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+        output(lhs_vert + 1, horiz_base + i) = results[i].y;
+      }
+    } else if (lhs_vert  < m_size) {
+      for (int i = 0; i < 4; i++) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+      }
+    }
+  } else if (!CHECK_LHS_BOUNDARY) {
+    // CHECK RHS
+    /*
+    int ncols_rem = fminf(n_size- horiz_base, 4);
+    for (int i = 0; i < ncols_rem; i++) {
+      output(lhs_vert, horiz_base + i) = results[i].x;
+      output(lhs_vert + 1, horiz_base + i) = results[i].y;
+      output(lhs_vert + 2, horiz_base + i) = results[i].z;
+      output(lhs_vert + 3, horiz_base + i) = results[i].w;
+    }*/
+    for (int i = 0; i < 4; i++) {
+      if (horiz_base+i < n_size) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+        output(lhs_vert + 1, horiz_base + i) = results[i].y;
+        output(lhs_vert + 2, horiz_base + i) = results[i].z;
+        output(lhs_vert + 3, horiz_base + i) = results[i].w;
+       }
+    }
+  } else {
+    // CHECK both boundaries.
+    for (int i = 0; i < 4; i++) {
+      if (horiz_base+i < n_size) {
+        if (lhs_vert < m_size)
+          output(lhs_vert, horiz_base + i) = results[i].x;
+        if (lhs_vert + 1 < m_size)
+          output(lhs_vert + 1, horiz_base + i) = results[i].y;
+        if (lhs_vert + 2 < m_size)
+          output(lhs_vert + 2, horiz_base + i) = results[i].z;
+        if (lhs_vert + 3 < m_size)
+          output(lhs_vert + 3, horiz_base + i) = results[i].w;
+      }
+    }
+  }
+}
+
+
+template<typename Index, typename LhsMapper,
+         typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY,
+         bool CHECK_RHS_BOUNDARY>
+__device__ EIGEN_STRONG_INLINE void
+EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
+                       const OutputMapper output, float2 lhs_shmem2[][32],
+                       float2 rhs_shmem2[][8], const Index m_size,
+                       const Index n_size, const Index k_size,
+                       const Index base_m, const Index base_n) {
+  typedef float Scalar;
+
+  // prefetch registers
+  float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3;
+  float4 rhs_pf0, rhs_pf1;
+
+  float4 results[8];
+  for (int i=0; i < 8; i++) {
+    results[i].x = results[i].y = results[i].z = results[i].w = 0;
+  }
+
+
+  Index lhs_vert = base_m+threadIdx.x*4+(threadIdx.y%4)*32;
+  for (Index k = 0; k < k_size; k += 32) {
+    lhs_pf0 = internal::pset1<float4>(0);
+    lhs_pf1 = internal::pset1<float4>(0);
+    lhs_pf2 = internal::pset1<float4>(0);
+    lhs_pf3 = internal::pset1<float4>(0);
+
+    rhs_pf0 = internal::pset1<float4>(0);
+    rhs_pf1 = internal::pset1<float4>(0);
+
+     if (!CHECK_LHS_BOUNDARY) {
+      if ((threadIdx.y/4+k+24) < k_size) {
+        lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+        lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+        lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+        lhs_pf3 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
+      } else if ((threadIdx.y/4+k+16) < k_size) {
+        lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+        lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+        lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+      } else if ((threadIdx.y/4+k+8) < k_size) {
+        lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+        lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+      } else if ((threadIdx.y/4+k) < k_size) {
+        lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+      }
+    } else {
+      // just CHECK_LHS_BOUNDARY
+      if (lhs_vert + 3 < m_size) {
+        if ((threadIdx.y/4+k+24) < k_size) {
+          lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+          lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+          lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+          lhs_pf3 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
+        } else if ((threadIdx.y/4+k+16) < k_size) {
+          lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+          lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+          lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+        } else if ((threadIdx.y/4+k+8) < k_size) {
+          lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+          lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+        } else if ((threadIdx.y/4+k) < k_size) {
+          lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+        }
+      } else if (lhs_vert + 2 < m_size) {
+        if ((threadIdx.y/4+k+24) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+          lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
+          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
+          lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
+          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
+          lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
+          lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16));
+          lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
+          lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24));
+          lhs_pf3.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+24));
+        } else if ((threadIdx.y/4+k+16) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+          lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
+          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
+          lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
+          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
+          lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
+          lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16));
+        } else if ((threadIdx.y/4+k+8) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+          lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
+          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
+          lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
+        } else if ((threadIdx.y/4+k) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+          lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
+        }
+      } else if (lhs_vert + 1 < m_size) {
+        if ((threadIdx.y/4+k+24) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
+          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
+          lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
+          lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
+          lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24));
+        } else if ((threadIdx.y/4+k+16) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
+          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
+          lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
+        } else if ((threadIdx.y/4+k+8) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
+        } else if ((threadIdx.y/4+k) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+        }
+      } else if (lhs_vert < m_size) {
+        if ((threadIdx.y/4+k+24) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
+          lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
+        } else if ((threadIdx.y/4+k+16) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
+        } else if ((threadIdx.y/4+k+8) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+        } else if ((threadIdx.y/4+k) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+        }
+      }
+    }
+    __syncthreads();
+    Index rhs_vert = k+threadIdx.x*4;
+    Index rhs_horiz0 = threadIdx.y*2+base_n;
+    Index rhs_horiz1 = threadIdx.y*2+1+base_n;
+    if (!CHECK_RHS_BOUNDARY) {
+      if ((rhs_vert + 3) < k_size) {
+        // just CHECK_RHS_BOUNDARY
+        rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+        rhs_pf1 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
+      } else if (rhs_vert + 2 < k_size) {
+        // just CHECK_RHS_BOUNDARY
+        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+        rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+        rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+        rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+        rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
+        rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
+      } else if (rhs_vert + 1 < k_size) {
+        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+        rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+        rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+        rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
+      } else if (rhs_vert  < k_size) {
+        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+        rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+      }
+    } else {
+      if (rhs_horiz1 < n_size) {
+        if ((rhs_vert + 3) < k_size) {
+          // just CHECK_RHS_BOUNDARY
+          rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+          rhs_pf1 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
+        } else if (rhs_vert + 2 < k_size) {
+          // just CHECK_RHS_BOUNDARY
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+          rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+          rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+          rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
+          rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
+        } else if (k+threadIdx.x*4 + 1 < k_size) {
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+          rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+          rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
+        } else if (k+threadIdx.x*4  < k_size) {
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+          rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+        }
+      } else if (rhs_horiz0 < n_size) {
+        if ((rhs_vert + 3) < k_size) {
+          // just CHECK_RHS_BOUNDARY
+          rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+        } else if ((rhs_vert + 2) < k_size) {
+          // just CHECK_RHS_BOUNDARY
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+          rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+        } else if ((rhs_vert + 1) < k_size) {
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+        } else if (rhs_vert  < k_size) {
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+        }
+      }
+    }
+    __syncthreads();
+    // Loaded. Do computation
+    // Row 0 -> times (0, 4, 8, .. 28) for features 0, 1.
+    // Row 1 -> times (0, 4, 8, .. 28) for features 2, 3.
+    // ..
+    // Row 31 -> times (0, 4, 8, .. 28) for features 62, 63
+    rhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(rhs_pf0.x, rhs_pf1.x);
+    // Row 32 -> times (1, 5, 9, .. 29) for features 0, 1.
+    // Row 33 -> times (1, 5, 9, .. 29) for features 2, 3.
+    // ..
+    rhs_shmem2[threadIdx.y+32][threadIdx.x] = make_float2(rhs_pf0.y, rhs_pf1.y);
+    // Row 64 -> times (2, 6, 10, .. 30) for features 0, 1.
+    // Row 65 -> times (2, 6, 10, .. 30) for features 2, 3.
+    rhs_shmem2[threadIdx.y+64][threadIdx.x] = make_float2(rhs_pf0.z, rhs_pf1.z);
+    // Row 96 -> times (3, 7, 11, .. 31) for features 0, 1.
+    // Row 97 -> times (3, 7, 11, .. 31) for features 2, 3.
+    rhs_shmem2[threadIdx.y+96][threadIdx.x] = make_float2(rhs_pf0.w, rhs_pf1.w);
+
+    // LHS.
+    // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61) .. (124, 125)
+    // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61) .. (124, 125)
+    // ...
+    // Row 8 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), ..  (62, 63) .. (126, 127)
+    // Row 15 (time 7) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), ..  (62, 63) .. (126, 127)
+
+
+#define add_vals(a_feat1, a_feat2, f1, f2, f3, f4)\
+      results[0].x += a_feat1.x * f1.x;\
+      results[1].x += a_feat1.x * f1.y;\
+      results[2].x += a_feat1.x * f2.x;\
+      results[3].x += a_feat1.x * f2.y;\
+      results[4].x += a_feat1.x * f3.x;\
+      results[5].x += a_feat1.x * f3.y;\
+      results[6].x += a_feat1.x * f4.x;\
+      results[7].x += a_feat1.x * f4.y;\
+\
+      results[0].y += a_feat1.y * f1.x;\
+      results[1].y += a_feat1.y * f1.y;\
+      results[2].y += a_feat1.y * f2.x;\
+      results[3].y += a_feat1.y * f2.y;\
+      results[4].y += a_feat1.y * f3.x;\
+      results[5].y += a_feat1.y * f3.y;\
+      results[6].y += a_feat1.y * f4.x;\
+      results[7].y += a_feat1.y * f4.y;\
+\
+      results[0].z += a_feat2.x * f1.x;\
+      results[1].z += a_feat2.x * f1.y;\
+      results[2].z += a_feat2.x * f2.x;\
+      results[3].z += a_feat2.x * f2.y;\
+      results[4].z += a_feat2.x * f3.x;\
+      results[5].z += a_feat2.x * f3.y;\
+      results[6].z += a_feat2.x * f4.x;\
+      results[7].z += a_feat2.x * f4.y;\
+\
+      results[0].w += a_feat2.y * f1.x;\
+      results[1].w += a_feat2.y * f1.y;\
+      results[2].w += a_feat2.y * f2.x;\
+      results[3].w += a_feat2.y * f2.y;\
+      results[4].w += a_feat2.y * f3.x;\
+      results[5].w += a_feat2.y * f3.y;\
+      results[6].w += a_feat2.y * f4.x;\
+      results[7].w += a_feat2.y * f4.y;\
+
+    lhs_shmem2[threadIdx.y/4][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.x, lhs_pf0.y);
+    lhs_shmem2[threadIdx.y/4+8][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.x, lhs_pf1.y);
+    lhs_shmem2[threadIdx.y/4+16][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.x, lhs_pf2.y);
+    lhs_shmem2[threadIdx.y/4+24][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.x, lhs_pf3.y);
+
+    lhs_shmem2[threadIdx.y/4 + 32][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.z, lhs_pf0.w);
+    lhs_shmem2[threadIdx.y/4 + 40][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.z, lhs_pf1.w);
+    lhs_shmem2[threadIdx.y/4 + 48][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.z, lhs_pf2.w);
+    lhs_shmem2[threadIdx.y/4 + 56][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.z, lhs_pf3.w);
+
+    __syncthreads();
+
+    // Do the multiplies.
+    #pragma unroll
+    for (int koff = 0; koff < 32; koff ++) {
+      float2 a3 = lhs_shmem2[koff][threadIdx.x + (threadIdx.y % 4) * 8];
+      float2 a4 = lhs_shmem2[koff + 32][threadIdx.x + (threadIdx.y % 4) * 8];
+
+      // first feature is at (threadIdx.y/4) * 8 last is at start + 8.
+      int start_feature = (threadIdx.y / 4) * 8;
+
+      float2 br1 = rhs_shmem2[start_feature/2 +     (koff % 4) * 32][koff/4];
+      float2 br2 = rhs_shmem2[start_feature/2 + 1 + (koff % 4) * 32][koff/4];
+      float2 br3 = rhs_shmem2[start_feature/2 + 2 + (koff % 4) * 32][koff/4];
+      float2 br4 = rhs_shmem2[start_feature/2 + 3 + (koff % 4) * 32][koff/4];
+
+      add_vals(a3, a4, br1, br2, br3, br4)
+    }
+    __syncthreads();
+  } // end loop over k
+
+
+  __syncthreads();
+  Index horiz_base = (threadIdx.y/4)*8+base_n;
+  if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) {
+    for (int i = 0; i < 8; i++) {
+      output(lhs_vert, horiz_base + i) = results[i].x;
+      output(lhs_vert + 1, horiz_base + i) = results[i].y;
+      output(lhs_vert + 2, horiz_base + i) = results[i].z;
+      output(lhs_vert + 3, horiz_base + i) = results[i].w;
+    }
+  } else if (!CHECK_RHS_BOUNDARY) {
+    if (lhs_vert + 3 < m_size) {
+      for (int i = 0; i < 8; i++) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+        output(lhs_vert + 1, horiz_base + i) = results[i].y;
+        output(lhs_vert + 2, horiz_base + i) = results[i].z;
+        output(lhs_vert + 3, horiz_base + i) = results[i].w;
+      }
+    } else if (lhs_vert + 2 < m_size) {
+      for (int i = 0; i < 8; i++) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+        output(lhs_vert + 1, horiz_base + i) = results[i].y;
+        output(lhs_vert + 2, horiz_base + i) = results[i].z;
+      }
+    } else if (lhs_vert + 1 < m_size) {
+      for (int i = 0; i < 8; i++) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+        output(lhs_vert + 1, horiz_base + i) = results[i].y;
+      }
+    } else if (lhs_vert  < m_size) {
+      for (int i = 0; i < 8; i++) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+      }
+    }
+  } else if (!CHECK_LHS_BOUNDARY) {
+    // CHECK BOUNDARY_B
+    for (int i = 0; i < 8; i++) {
+      if (horiz_base + i < n_size) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+        output(lhs_vert + 1, horiz_base + i) = results[i].y;
+        output(lhs_vert + 2, horiz_base + i) = results[i].z;
+        output(lhs_vert + 3, horiz_base + i) = results[i].w;
+      }
+    }
+  } else {
+    // CHECK both boundaries.
+    for (int i = 0; i < 8; i++) {
+      if (horiz_base + i < n_size) {
+        if (lhs_vert < m_size)
+          output(lhs_vert, horiz_base + i) = results[i].x;
+        if (lhs_vert + 1 < m_size)
+          output(lhs_vert + 1, horiz_base + i) = results[i].y;
+        if (lhs_vert + 2 < m_size)
+          output(lhs_vert + 2, horiz_base + i) = results[i].z;
+        if (lhs_vert + 3 < m_size)
+          output(lhs_vert + 3, horiz_base + i) = results[i].w;
+      }
+    }
+  }
+}
+
+
+template<typename Index, typename LhsMapper,
+         typename RhsMapper, typename OutputMapper>
+__global__ void
+__launch_bounds__(256)
+EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
+                       const OutputMapper output,
+                       const Index m_size, const Index n_size, const Index k_size) {
+  __shared__ float2 lhs_shmem[64*32];
+  __shared__ float2 rhs_shmem[128*8];
+
+  typedef float2 LHS_MEM[64][32];
+  typedef float2 RHS_MEM[128][8];
+
+  typedef float2 LHS_MEM16x16[32][16];
+  typedef float2 RHS_MEM16x16[64][8];
+
+  const Index m_block_idx = blockIdx.x;
+  const Index n_block_idx = blockIdx.y;
+
+  const Index base_m = 128 * m_block_idx;
+  const Index base_n = 64 * n_block_idx;
+
+  bool check_rhs = (base_n + 63) >= n_size;
+  bool check_lhs128 = (base_m + 127) >= m_size;
+
+  if (!check_rhs) {
+    if (!check_lhs128) {
+      // >= 128 rows left
+      EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(
+                     lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
+    } else {
+      EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(
+                     lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
+    }
+  } else {
+    if (!check_lhs128) {
+      // >= 128 rows left
+      EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(
+                     lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
+    } else {
+      EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(
+                     lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
+    }
+  }
+}
+
+template<typename Index, typename LhsMapper,
+         typename RhsMapper, typename OutputMapper>
+__global__ void
+__launch_bounds__(256)
+EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs,
+                       const OutputMapper output,
+                       const Index m_size, const Index n_size, const Index k_size) {
+  __shared__ float2 lhs_shmem[32][16];
+  __shared__ float2 rhs_shmem[64][8];
+
+  const Index m_block_idx = blockIdx.x;
+  const Index n_block_idx = blockIdx.y;
+
+  const Index base_m = 64 * m_block_idx;
+  const Index base_n = 64 * n_block_idx;
+
+  if (base_m + 63 < m_size) {
+    if (base_n + 63 < n_size) {
+      EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
+    } else {
+      EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
+    }
+  } else {
+    if (base_n + 63 < n_size) {
+      EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
+    } else {
+      EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
+    }
+  }
+}
+
+
+template<typename Indices, typename LeftArgType, typename RightArgType>
+struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, GpuDevice> :
+    public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, GpuDevice> > {
+
+  typedef GpuDevice Device;
+
+  typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self;
+  typedef TensorContractionEvaluatorBase<Self> Base;
+
+  typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
+  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
+  typedef typename XprType::Index Index;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
+
+  enum {
+    Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+  };
+
+  // Most of the code is assuming that both input tensors are ColMajor. If the
+  // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
+  // If we want to compute A * B = C, where A is LHS and B is RHS, the code
+  // will pretend B is LHS and A is RHS.
+  typedef typename internal::conditional<
+    static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
+  typedef typename internal::conditional<
+    static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
+
+  static const int LDims =
+      internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
+  static const int RDims =
+      internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
+  static const int ContractDims = internal::array_size<Indices>::value;
+
+  typedef array<Index, LDims> left_dim_mapper_t;
+  typedef array<Index, RDims> right_dim_mapper_t;
+
+  typedef array<Index, ContractDims> contract_t;
+  typedef array<Index, LDims - ContractDims> left_nocontract_t;
+  typedef array<Index, RDims - ContractDims> right_nocontract_t;
+
+  static const int NumDims = LDims + RDims - 2 * ContractDims;
+
+  typedef DSizes<Index, NumDims> Dimensions;
+
+  // typedefs needed in evalTo
+  typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
+  typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
+
+  typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
+  typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
+
+  typedef typename LeftEvaluator::Dimensions LeftDimensions;
+  typedef typename RightEvaluator::Dimensions RightDimensions;
+
+  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
+      Base(op, device) {}
+
+  // We need to redefine this method to make nvcc happy
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+    this->m_leftImpl.evalSubExprsIfNeeded(NULL);
+    this->m_rightImpl.evalSubExprsIfNeeded(NULL);
+    if (data) {
+      evalTo(data);
+      return false;
+    } else {
+      this->m_result = static_cast<Scalar *>(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar)));
+      evalTo(this->m_result);
+      return true;
+    }
+  }
+
+  void evalTo(Scalar* buffer) const {
+    if (this->m_lhs_inner_dim_contiguous) {
+      if (this->m_rhs_inner_dim_contiguous) {
+        if (this->m_rhs_inner_dim_reordered) {
+          evalTyped<true, true, true, Unaligned>(buffer);
+        }
+        else {
+          evalTyped<true, true, false, Unaligned>(buffer);
+        }
+      }
+      else {
+       if (this->m_rhs_inner_dim_reordered) {
+          evalTyped<true, false, true, Unaligned>(buffer);
+        }
+        else {
+          evalTyped<true, false, false, Unaligned>(buffer);
+        }
+      }
+    }
+    else {
+      if (this->m_rhs_inner_dim_contiguous) {
+        if (this->m_rhs_inner_dim_reordered) {
+          evalTyped<false, true, true, Unaligned>(buffer);
+        }
+        else {
+          evalTyped<false, true, false, Unaligned>(buffer);
+        }
+      }
+      else {
+       if (this->m_rhs_inner_dim_reordered) {
+          evalTyped<false, false, true, Unaligned>(buffer);
+        }
+        else {
+          evalTyped<false, false, false, Unaligned>(buffer);
+        }
+      }
+    }
+  }
+
+  template <typename LhsScalar, typename RhsScalar, typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper> struct LaunchKernels {
+    static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) {
+    const Index m_blocks = (m + 63) / 64;
+    const Index n_blocks = (n + 63) / 64;
+    const dim3 num_blocks(m_blocks, n_blocks, 1);
+    const dim3 block_size(8, 8, 8);
+    LAUNCH_CUDA_KERNEL((EigenContractionKernel<Scalar, Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
+    }
+  };
+
+  template <typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper> struct LaunchKernels<float, float, Index, LhsMapper, RhsMapper, OutputMapper> {
+    static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) {
+      if (m < 768 || n < 768) {
+        const Index m_blocks = (m + 63) / 64;
+        const Index n_blocks = (n + 63) / 64;
+        const dim3 num_blocks(m_blocks, n_blocks, 1);
+        const dim3 block_size(16, 16, 1);
+        LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel16x16<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
+      } else {
+        const Index m_blocks = (m + 127) / 128;
+        const Index n_blocks = (n + 63) / 64;
+        const dim3 num_blocks(m_blocks, n_blocks, 1);
+        const dim3 block_size(8, 32, 1);
+        LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
+      }
+    }
+  };
+
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+  void evalTyped(Scalar* buffer) const {
+    // columns in left side, rows in right side
+    const Index k = this->m_k_size;
+    EIGEN_UNUSED_VARIABLE(k)
+
+    // rows in left side
+    const Index m = this->m_i_size;
+
+    // columns in right side
+    const Index n = this->m_j_size;
+
+    // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)
+    this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
+
+    typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
+                                                   LeftEvaluator, left_nocontract_t,
+                                                   contract_t, 4,
+                                                   lhs_inner_dim_contiguous,
+                                                   false, Unaligned> LhsMapper;
+
+    typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
+                                                   RightEvaluator, right_nocontract_t,
+                                                   contract_t, 4,
+                                                   rhs_inner_dim_contiguous,
+                                                   rhs_inner_dim_reordered, Unaligned> RhsMapper;
+
+    typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+
+
+    // initialize data mappers
+    LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
+                  this->m_left_contracting_strides, this->m_k_strides);
+
+    RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
+                  this->m_right_contracting_strides, this->m_k_strides);
+
+    OutputMapper output(buffer, m);
+
+    setCudaSharedMemConfig(cudaSharedMemBankSizeEightByte);
+    LaunchKernels<LhsScalar, RhsScalar, Index, LhsMapper, RhsMapper, OutputMapper>::Run(lhs, rhs, output,  m, n, k, this->m_device);
+  }
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_USE_GPU and __CUDACC__
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
new file mode 100644
index 000000000..9b2cb3ff6
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
@@ -0,0 +1,467 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H
+
+namespace Eigen {
+
+namespace internal {
+
+enum {
+  Rhs = 0,
+  Lhs = 1
+};
+
+/*
+ * Implementation of the Eigen blas_data_mapper class for tensors.
+ */
+
+template <typename Tensor, bool HasRawAccess> struct CoeffLoader {
+  enum {
+    DirectOffsets = false
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffLoader(const Tensor& tensor) : m_tensor(tensor) { }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index) {
+    eigen_assert(false && "unsupported");
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { return m_tensor.coeff(index); }
+
+ template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ typename Tensor::PacketReturnType packet(typename Tensor::Index index) const
+  {
+    return m_tensor.template packet<LoadMode>(index);
+  }
+
+
+ private:
+  const Tensor m_tensor;
+};
+
+template <typename Tensor> struct CoeffLoader<Tensor, true> {
+  enum {
+    DirectOffsets = true
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffLoader(const Tensor& tensor) : m_data(tensor.data()) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) {
+    m_data += offset;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { return loadConstant(m_data+index); }
+
+ template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ typename Tensor::PacketReturnType packet(typename Tensor::Index index) const
+  {
+    return internal::ploadt_ro<typename Tensor::PacketReturnType, LoadMode>(m_data + index);
+  }
+ private:
+  typedef typename Tensor::Scalar Scalar;
+  const Scalar* m_data;
+};
+
+template<typename Scalar, typename Index, int side,
+         typename Tensor,
+         typename nocontract_t, typename contract_t,
+         int packet_size, bool inner_dim_contiguous, int Alignment>
+class SimpleTensorContractionMapper {
+  public:
+  EIGEN_DEVICE_FUNC
+  SimpleTensorContractionMapper(const Tensor& tensor,
+                                const nocontract_t& nocontract_strides,
+                                const nocontract_t& ij_strides,
+                                const contract_t& contract_strides,
+                                const contract_t& k_strides) :
+      m_tensor(tensor),
+      m_nocontract_strides(nocontract_strides),
+      m_ij_strides(ij_strides),
+      m_contract_strides(contract_strides),
+      m_k_strides(k_strides) { }
+
+  enum {
+    DirectOffsets = CoeffLoader<Tensor, Tensor::RawAccess>::DirectOffsets
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) {
+    m_tensor.offsetBuffer(offset);
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE void prefetch(Index /*i*/) { }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE Scalar operator()(Index row) const {
+    // column major assumption
+    return operator()(row, 0);
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE Scalar operator()(Index row, Index col) const {
+    return m_tensor.coeff(computeIndex(row, col));
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE Index computeIndex(Index row, Index col) const {
+    const bool left = (side == Lhs);
+    Index nocontract_val = left ? row : col;
+    Index linidx = 0;
+    for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) {
+      const Index idx = nocontract_val / m_ij_strides[i];
+      linidx += idx * m_nocontract_strides[i];
+      nocontract_val -= idx * m_ij_strides[i];
+    }
+    if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) {
+      if (side == Lhs && inner_dim_contiguous) {
+        eigen_assert(m_nocontract_strides[0] == 1);
+        linidx += nocontract_val;
+      } else {
+        linidx += nocontract_val * m_nocontract_strides[0];
+      }
+    }
+
+    Index contract_val = left ? col : row;
+    if(array_size<contract_t>::value > 0) {
+      for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
+        const Index idx = contract_val / m_k_strides[i];
+        linidx += idx * m_contract_strides[i];
+        contract_val -= idx * m_k_strides[i];
+      }
+
+      if (side == Rhs && inner_dim_contiguous) {
+        eigen_assert(m_contract_strides[0] == 1);
+        linidx += contract_val;
+      } else {
+        linidx += contract_val * m_contract_strides[0];
+      }
+    }
+
+    return linidx;
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE IndexPair<Index> computeIndexPair(Index row, Index col, const Index distance) const {
+    const bool left = (side == Lhs);
+    Index nocontract_val[2] = {left ? row : col, left ? row + distance : col};
+    Index linidx[2] = {0, 0};
+    if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) {
+      for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) {
+        const Index idx0 = nocontract_val[0] / m_ij_strides[i];
+        const Index idx1 = nocontract_val[1] / m_ij_strides[i];
+        linidx[0] += idx0 * m_nocontract_strides[i];
+        linidx[1] += idx1 * m_nocontract_strides[i];
+        nocontract_val[0] -= idx0 * m_ij_strides[i];
+        nocontract_val[1] -= idx1 * m_ij_strides[i];
+      }
+      if (side == Lhs && inner_dim_contiguous) {
+        eigen_assert(m_nocontract_strides[0] == 1);
+        linidx[0] += nocontract_val[0];
+        linidx[1] += nocontract_val[1];
+      } else {
+        linidx[0] += nocontract_val[0] * m_nocontract_strides[0];
+        linidx[1] += nocontract_val[1] * m_nocontract_strides[0];
+      }
+    }
+
+    Index contract_val[2] = {left ? col : row, left ? col : row + distance};
+    if (array_size<contract_t>::value> 0) {
+      for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
+        const Index idx0 = contract_val[0] / m_k_strides[i];
+        const Index idx1 = contract_val[1] / m_k_strides[i];
+        linidx[0] += idx0 * m_contract_strides[i];
+        linidx[1] += idx1 * m_contract_strides[i];
+        contract_val[0] -= idx0 * m_k_strides[i];
+        contract_val[1] -= idx1 * m_k_strides[i];
+      }
+
+      if (side == Rhs && inner_dim_contiguous) {
+        eigen_assert(m_contract_strides[0] == 1);
+        linidx[0] += contract_val[0];
+        linidx[1] += contract_val[1];
+      } else {
+        linidx[0] += contract_val[0] * m_contract_strides[0];
+        linidx[1] += contract_val[1] * m_contract_strides[0];
+      }
+    }
+    return IndexPair<Index>(linidx[0], linidx[1]);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index firstAligned(Index size) const {
+    // Only claim alignment when we can compute the actual stride (ie when we're
+    // dealing with the lhs with inner_dim_contiguous. This is because the
+    // matrix-vector product relies on the stride when dealing with aligned inputs.
+    return (Alignment == Aligned) && (side == Lhs) && inner_dim_contiguous ? 0 : size;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index stride() const {
+    return ((side == Lhs) && inner_dim_contiguous && array_size<contract_t>::value > 0) ? m_contract_strides[0] : 1;
+  }
+
+ protected:
+  CoeffLoader<Tensor, Tensor::RawAccess> m_tensor;
+  const nocontract_t m_nocontract_strides;
+  const nocontract_t m_ij_strides;
+  const contract_t m_contract_strides;
+  const contract_t m_k_strides;
+};
+
+
+template<typename Scalar, typename Index, int side,
+         typename Tensor,
+         typename nocontract_t, typename contract_t,
+         int packet_size, bool inner_dim_contiguous,
+         bool inner_dim_reordered, int Alignment>
+class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment>
+{
+ public:
+  typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment> ParentMapper;
+
+  EIGEN_DEVICE_FUNC
+  BaseTensorContractionMapper(const Tensor& tensor,
+                              const nocontract_t& nocontract_strides,
+                              const nocontract_t& ij_strides,
+                              const contract_t& contract_strides,
+                              const contract_t& k_strides) :
+  ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
+
+  typedef typename Tensor::PacketReturnType Packet;
+  typedef typename unpacket_traits<Packet>::half HalfPacket;
+
+  template <int AlignmentType>
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const {
+    // whole method makes column major assumption
+
+    // don't need to add offsets for now (because operator handles that)
+    // current code assumes packet size must be a multiple of 2
+    EIGEN_STATIC_ASSERT(packet_size % 2 == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+    if (Tensor::PacketAccess && inner_dim_contiguous && !inner_dim_reordered) {
+      const Index index = this->computeIndex(i, j);
+      eigen_assert(this->computeIndex(i+packet_size-1, j) == index + packet_size-1);
+      return this->m_tensor.template packet<AlignmentType>(index);
+    }
+
+    const IndexPair<Index> indexPair = this->computeIndexPair(i, j, packet_size - 1);
+    const Index first = indexPair.first;
+    const Index last = indexPair.second;
+
+    // We can always do optimized packet reads from left hand side right now, because
+    // the vertical matrix dimension on the left hand side is never contracting.
+    // On the right hand side we need to check if the contracting dimensions may have
+    // been shuffled first.
+    if (Tensor::PacketAccess &&
+        (side == Lhs || internal::array_size<contract_t>::value <= 1 || !inner_dim_reordered) &&
+        (last - first) == (packet_size - 1)) {
+
+      return this->m_tensor.template packet<AlignmentType>(first);
+    }
+
+    EIGEN_ALIGN_MAX Scalar data[packet_size];
+
+    data[0] = this->m_tensor.coeff(first);
+    for (Index k = 1; k < packet_size - 1; k += 2) {
+      const IndexPair<Index> internal_pair = this->computeIndexPair(i + k, j, 1);
+      data[k] = this->m_tensor.coeff(internal_pair.first);
+      data[k + 1] = this->m_tensor.coeff(internal_pair.second);
+    }
+    data[packet_size - 1] = this->m_tensor.coeff(last);
+
+    return pload<Packet>(data);
+  }
+
+  template <int AlignmentType>
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE HalfPacket loadHalfPacket(Index i, Index j) const {
+    // whole method makes column major assumption
+
+    // don't need to add offsets for now (because operator handles that)
+    const Index half_packet_size = unpacket_traits<HalfPacket>::size;
+    if (half_packet_size == packet_size) {
+      return loadPacket<AlignmentType>(i, j);
+    }
+    EIGEN_ALIGN_MAX Scalar data[half_packet_size];
+    for (Index k = 0; k < half_packet_size; k++) {
+      data[k] = operator()(i + k, j);
+    }
+    return pload<HalfPacket>(data);
+  }
+};
+
+
+template<typename Scalar, typename Index, int side,
+         typename Tensor,
+         typename nocontract_t, typename contract_t,
+         bool inner_dim_contiguous,
+         bool inner_dim_reordered, int Alignment>
+class BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment> : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment>
+{
+ public:
+  typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment> ParentMapper;
+
+  EIGEN_DEVICE_FUNC
+  BaseTensorContractionMapper(const Tensor& tensor,
+                              const nocontract_t& nocontract_strides,
+                              const nocontract_t& ij_strides,
+                              const contract_t& contract_strides,
+                              const contract_t& k_strides) :
+  ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
+
+  typedef typename Tensor::PacketReturnType Packet;
+  template <int> EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const {
+    EIGEN_ALIGN_MAX Scalar data[1];
+    data[0] = this->m_tensor.coeff(this->computeIndex(i, j));
+    return pload<typename Tensor::PacketReturnType>(data);
+  }
+  template <int> EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE Packet loadHalfPacket(Index i, Index j) const {
+    return loadPacket(i, j);
+  }
+};
+
+
+template<typename Scalar, typename Index, int side,
+         typename Tensor,
+         typename nocontract_t, typename contract_t,
+         int packet_size,
+         bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
+class TensorContractionSubMapper {
+ public:
+  typedef typename Tensor::PacketReturnType Packet;
+  typedef typename unpacket_traits<Packet>::half HalfPacket;
+
+  typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> ParentMapper;
+  typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Self;
+  typedef Self LinearMapper;
+
+  enum {
+    // We can use direct offsets iff the parent mapper supports then and we can compute the strides.
+    // TODO: we should also enable direct offsets for the Rhs case.
+    UseDirectOffsets = ParentMapper::DirectOffsets && (side == Lhs) && inner_dim_contiguous && (array_size<contract_t>::value > 0)
+  };
+
+  EIGEN_DEVICE_FUNC TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset)
+      : m_base_mapper(base_mapper), m_vert_offset(vert_offset), m_horiz_offset(horiz_offset) {
+    // Bake the offsets into the buffer used by the base mapper whenever possible. This avoids the need to recompute
+    // this offset every time we attempt to access a coefficient.
+    if (UseDirectOffsets) {
+      Index stride = m_base_mapper.stride();
+      m_base_mapper.offsetBuffer(vert_offset + horiz_offset * stride);
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const {
+    if (UseDirectOffsets) {
+      return m_base_mapper(i, 0);
+    }
+    return m_base_mapper(i + m_vert_offset, m_horiz_offset);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, Index j) const {
+    if (UseDirectOffsets) {
+      return m_base_mapper(i, j);
+    }
+    return m_base_mapper(i + m_vert_offset, j + m_horiz_offset);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const {
+    if (UseDirectOffsets) {
+      return m_base_mapper.template loadPacket<Alignment>(i, 0);
+    }
+    return m_base_mapper.template loadPacket<Alignment>(i + m_vert_offset, m_horiz_offset);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const {
+    if (UseDirectOffsets) {
+      return m_base_mapper.template loadPacket<Alignment>(i, j);
+    }
+    return m_base_mapper.template loadPacket<Alignment>(i + m_vert_offset, j + m_horiz_offset);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const {
+    if (UseDirectOffsets) {
+      return m_base_mapper.template loadHalfPacket<Alignment>(i, 0);
+    }
+    return m_base_mapper.template loadHalfPacket<Alignment>(i + m_vert_offset, m_horiz_offset);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const {
+    if (UseDirectOffsets) {
+      m_base_mapper.storePacket(i, 0, p);
+    }
+    m_base_mapper.storePacket(i + m_vert_offset, m_horiz_offset, p);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
+    if (UseDirectOffsets) {
+      return LinearMapper(m_base_mapper, i, j);
+    }
+    return LinearMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset);
+  }
+
+  template <typename PacketT, int AlignmentType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i) const {
+    EIGEN_STATIC_ASSERT((internal::is_same<PacketT, Packet>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    const int ActualAlignment = (AlignmentType == Aligned) && (Alignment == Aligned) ? Aligned : Unaligned;
+    if (UseDirectOffsets) {
+     return m_base_mapper.template loadPacket<ActualAlignment>(i, 0);
+    }
+    return m_base_mapper.template loadPacket<ActualAlignment>(i + m_vert_offset, m_horiz_offset);
+  }
+
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool aligned(Index) const {
+    return false;
+  }
+
+ private:
+  ParentMapper m_base_mapper;
+  const Index m_vert_offset;
+  const Index m_horiz_offset;
+};
+
+
+template<typename Scalar_, typename Index, int side,
+         typename Tensor,
+         typename nocontract_t, typename contract_t,
+         int packet_size,
+         bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
+class TensorContractionInputMapper
+  : public BaseTensorContractionMapper<Scalar_, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> {
+
+ public:
+  typedef Scalar_ Scalar;
+  typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Base;
+  typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> SubMapper;
+  typedef SubMapper VectorMapper;
+
+  EIGEN_DEVICE_FUNC TensorContractionInputMapper(const Tensor& tensor,
+                               const nocontract_t& nocontract_strides,
+                               const nocontract_t& ij_strides,
+                               const contract_t& contract_strides,
+                               const contract_t& k_strides)
+      : Base(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const {
+    return SubMapper(*this, i, j);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const {
+    return VectorMapper(*this, i, j);
+  }
+};
+
+
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
new file mode 100644
index 000000000..ee16cde9b
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -0,0 +1,1052 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H
+
+// evaluator for thread pool device
+#ifdef EIGEN_USE_THREADS
+
+namespace Eigen {
+
+#ifdef EIGEN_USE_SIMPLE_THREAD_POOL
+namespace internal {
+
+template<typename LhsScalar, typename LhsMapper, typename Index>
+struct packLhsArg {
+  LhsScalar* blockA;
+  const LhsMapper& lhs;
+  const Index m_start;
+  const Index k_start;
+  const Index mc;
+  const Index kc;
+};
+
+template<typename LhsScalar, typename RhsScalar, typename RhsMapper, typename OutputMapper, typename Index>
+struct packRhsAndKernelArg {
+  const MaxSizeVector<LhsScalar*>* blockAs;
+  RhsScalar* blockB;
+  const RhsMapper& rhs;
+  OutputMapper& output;
+  const Index m;
+  const Index k;
+  const Index n;
+  const Index mc;
+  const Index kc;
+  const Index nc;
+  const Index num_threads;
+  const Index num_blockAs;
+  const Index max_m;
+  const Index k_block_idx;
+  const Index m_block_idx;
+  const Index n_block_idx;
+  const Index m_blocks;
+  const Index n_blocks;
+  MaxSizeVector<Notification*>* kernel_notifications;
+  const MaxSizeVector<Notification*>* lhs_notifications;
+  const bool need_to_pack;
+};
+
+}  // end namespace internal
+#endif  // EIGEN_USE_SIMPLE_THREAD_POOL
+
+template<typename Indices, typename LeftArgType, typename RightArgType>
+struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice> :
+    public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice> > {
+
+  typedef ThreadPoolDevice Device;
+
+  typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self;
+  typedef TensorContractionEvaluatorBase<Self> Base;
+
+  typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
+  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
+  typedef typename XprType::Index Index;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+
+  enum {
+    Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+  };
+
+  // Most of the code is assuming that both input tensors are ColMajor. If the
+  // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
+  // If we want to compute A * B = C, where A is LHS and B is RHS, the code
+  // will pretend B is LHS and A is RHS.
+  typedef typename internal::conditional<
+    static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
+  typedef typename internal::conditional<
+    static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
+
+  static const int LDims =
+      internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
+  static const int RDims =
+      internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
+  static const int ContractDims = internal::array_size<Indices>::value;
+
+  typedef array<Index, LDims> left_dim_mapper_t;
+  typedef array<Index, RDims> right_dim_mapper_t;
+
+  typedef array<Index, ContractDims> contract_t;
+  typedef array<Index, LDims - ContractDims> left_nocontract_t;
+  typedef array<Index, RDims - ContractDims> right_nocontract_t;
+
+  static const int NumDims = LDims + RDims - 2 * ContractDims;
+
+  typedef DSizes<Index, NumDims> Dimensions;
+
+  // typedefs needed in evalTo
+  typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
+  typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
+  typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
+
+  typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
+  typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
+
+  TensorEvaluator(const XprType& op, const Device& device) :
+      Base(op, device) {}
+
+#ifndef EIGEN_USE_SIMPLE_THREAD_POOL
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
+            bool rhs_inner_dim_reordered, int Alignment>
+  void evalProduct(Scalar* buffer) const {
+    typedef
+        typename internal::remove_const<typename EvalLeftArgType::Scalar>::type
+            LhsScalar;
+    typedef
+        typename internal::remove_const<typename EvalRightArgType::Scalar>::type
+            RhsScalar;
+    typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
+    typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
+    typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
+    typedef internal::TensorContractionInputMapper<
+        LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t,
+        contract_t, internal::packet_traits<LhsScalar>::size,
+        lhs_inner_dim_contiguous, false, Unaligned>
+        LhsMapper;
+    typedef internal::TensorContractionInputMapper<
+        RhsScalar, Index, internal::Rhs, RightEvaluator, right_nocontract_t,
+        contract_t, internal::packet_traits<RhsScalar>::size,
+        rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned>
+        RhsMapper;
+    typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+    typedef internal::gemm_pack_lhs<LhsScalar, Index,
+                                    typename LhsMapper::SubMapper, Traits::mr,
+                                    Traits::LhsProgress, ColMajor>
+        LhsPacker;
+    typedef internal::gemm_pack_rhs<
+        RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor>
+        RhsPacker;
+    typedef internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper,
+                                  Traits::mr, Traits::nr, false, false>
+        GebpKernel;
+
+    const Index m = this->m_i_size;
+    const Index n = this->m_j_size;
+    const Index k = this->m_k_size;
+    if (m == 0 || n == 0 || k == 0) return;
+
+    // Compute a set of algorithm parameters:
+    // - kernel block sizes (bm, bn, bk)
+    // - task grain sizes (number of kernels executed per task: gm, gn)
+    // - number of threads
+    // - sharding by row/column
+    // - parallel packing or first lhs then rhs
+    // and some derived parameters:
+    // - number of tasks (nm, nn, nk)
+    // - number of kernels (nm0, nn0)
+    // Unfortunately, all these parameters are tightly interdependent.
+    // So in some cases we first compute approximate values, then compute other
+    // values based on these approximations and then refine the approximations.
+
+    // There are lots of heuristics here. There is some reasoning behind them,
+    // but ultimately they are just tuned on contraction benchmarks for
+    // different input configurations, thread counts and instruction sets.
+    // So feel free to question any of them.
+
+    // Compute whether we want to shard by row or by column.
+    // This is a first approximation, it will be refined later. Since we don't
+    // know number of threads yet we use 2, because what's we are most
+    // interested in at this point is whether it makes sense to use
+    // parallelization at all or not.
+    bool shard_by_col = shardByCol(m, n, 2);
+
+    // First approximation of kernel blocking sizes.
+    // Again, we don't know number of threads yet, so we use 2.
+    Index bm, bn, bk;
+    if (shard_by_col) {
+      internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
+                                          internal::ShardByCol>
+          blocking(k, m, n, 2);
+      bm = blocking.mc();
+      bn = blocking.nc();
+      bk = blocking.kc();
+    } else {
+      internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
+                                          internal::ShardByRow>
+          blocking(k, m, n, 2);
+      bm = blocking.mc();
+      bn = blocking.nc();
+      bk = blocking.kc();
+    }
+
+    // Compute optimal number of threads.
+    // Note: we use bk instead of k here because we are interested in amount of
+    // _parallelizable_ computations, and computations are not parallelizable
+    // across k dimension.
+    const TensorOpCost cost =
+        contractionCost(m, n, bm, bn, bk, shard_by_col, false);
+    int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
+        static_cast<double>(n) * m, cost, this->m_device.numThreads());
+
+    // TODO(dvyukov): this is a stop-gap to prevent regressions while the cost
+    // model is not tuned. Remove this when the cost model is tuned.
+    if (n == 1) num_threads = 1;
+
+    if (num_threads == 1) {
+      // The single-threaded algorithm should be faster in this case.
+      if (n == 1)
+        this->template evalGemv<lhs_inner_dim_contiguous,
+                                rhs_inner_dim_contiguous,
+                                rhs_inner_dim_reordered, Alignment>(buffer);
+      else
+        this->template evalGemm<lhs_inner_dim_contiguous,
+                                rhs_inner_dim_contiguous,
+                                rhs_inner_dim_reordered, Alignment>(buffer);
+      return;
+    }
+
+    // Now that we know number of threads, recalculate sharding and blocking.
+    shard_by_col = shardByCol(m, n, num_threads);
+    if (shard_by_col) {
+      internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
+                                          internal::ShardByCol>
+          blocking(k, m, n, num_threads);
+      bm = blocking.mc();
+      bn = blocking.nc();
+      bk = blocking.kc();
+    } else {
+      internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
+                                          internal::ShardByRow>
+          blocking(k, m, n, num_threads);
+      bm = blocking.mc();
+      bn = blocking.nc();
+      bk = blocking.kc();
+    }
+
+    // Number of kernels for each dimension.
+    Index nm0 = divup(m, bm);
+    Index nn0 = divup(n, bn);
+    Index nk = divup(k, bk);
+
+    // Calculate task grain size (number of kernels executed per task).
+    // This task size coarsening serves two purposes:
+    // 1. It reduces per-task overheads including synchronization overheads.
+    // 2. It allows to use caches better (reuse the same packed rhs in several
+    // consecutive kernels).
+    Index gm = 1;
+    Index gn = 1;
+    // If we are sharding by column, then we prefer to reduce rows first.
+    if (shard_by_col) {
+      gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col);
+      gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col);
+    } else {
+      gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col);
+      gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col);
+    }
+    // Number of tasks in each dimension.
+    Index nm = divup(nm0, gm);
+    Index nn = divup(nn0, gn);
+
+    // Last by not least, decide whether we want to issue both lhs and rhs
+    // packing in parallel; or issue lhs packing first, and then issue rhs
+    // packing when lhs packing completes (for !shard_by_col lhs and rhs are
+    // swapped). Parallel packing allows more parallelism (for both packing and
+    // kernels), while sequential packing provides better locality (once
+    // a thread finishes rhs packing it proceed to kernels with that rhs).
+    // First, we are interested in parallel packing if there are few tasks.
+    bool parallel_pack = num_threads >= nm * nn;
+    // Also do parallel packing if all data fits into L2$.
+    if (m * bk * Index(sizeof(LhsScalar)) + n * bk * Index(sizeof(RhsScalar)) <=
+        l2CacheSize() * num_threads)
+      parallel_pack = true;
+    // But don't do it if we will use each rhs only once. Locality seems to be
+    // more important in this case.
+    if ((shard_by_col ? nm : nn) == 1) parallel_pack = false;
+
+    LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides,
+                  this->m_i_strides, this->m_left_contracting_strides,
+                  this->m_k_strides);
+
+    RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides,
+                  this->m_j_strides, this->m_right_contracting_strides,
+                  this->m_k_strides);
+
+    Context<LhsPacker, RhsPacker, GebpKernel, LhsMapper, RhsMapper,
+            OutputMapper>(this->m_device, num_threads, lhs, rhs, buffer, m, n,
+                          k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, nn0,
+                          shard_by_col, parallel_pack)
+        .run();
+  }
+
+  // Context coordinates a single parallel gemm operation.
+  template <typename LhsPacker, typename RhsPacker, typename GebpKernel,
+            typename LhsMapper, typename RhsMapper, typename OutputMapper>
+  class Context {
+   public:
+    Context(const Device& device, int num_threads, LhsMapper& lhs,
+            RhsMapper& rhs, Scalar* buffer, Index tm, Index tn, Index tk, Index bm,
+            Index bn, Index bk, Index nm, Index nn, Index nk, Index gm,
+            Index gn, Index nm0, Index nn0, bool shard_by_col,
+            bool parallel_pack)
+        : device_(device),
+          lhs_(lhs),
+          rhs_(rhs),
+          buffer_(buffer),
+          output_(buffer, tm),
+          num_threads_(num_threads),
+          shard_by_col_(shard_by_col),
+          parallel_pack_(parallel_pack),
+          m_(tm),
+          n_(tn),
+          k_(tk),
+          bm_(bm),
+          bn_(bn),
+          bk_(bk),
+          nm_(nm),
+          nn_(nn),
+          nk_(nk),
+          gm_(gm),
+          gn_(gn),
+          nm0_(nm0),
+          nn0_(nn0)
+  {
+      for (Index x = 0; x < P; x++) {
+        // Normal number of notifications for k slice switch is
+        // nm_ + nn_ + nm_ * nn_. However, first P - 1 slices will receive only
+        // nm_ + nn_ notifications, because they will not receive notifications
+        // from preceeding kernels.
+        state_switch_[x] =
+            x == 0
+                ? 1
+                : (parallel_pack_ ? nn_ + nm_ : (shard_by_col_ ? nn_ : nm_)) +
+                      (x == P - 1 ? nm_ * nn_ : 0);
+        state_packing_ready_[x] =
+            parallel_pack_ ? 0 : (shard_by_col_ ? nm_ : nn_);
+        state_kernel_[x] = new std::atomic<uint8_t>*[nm_];
+        for (Index m = 0; m < nm_; m++) {
+          state_kernel_[x][m] = new std::atomic<uint8_t>[nn_];
+          // Kernels generally receive 3 notifications (previous kernel + 2
+          // packing), but the first slice won't get notifications from previous
+          // kernels.
+          for (Index n = 0; n < nn_; n++)
+            state_kernel_[x][m][n].store(
+                (x == 0 ? 0 : 1) + (parallel_pack_ ? 2 : 1),
+                std::memory_order_relaxed);
+        }
+      }
+
+      // Allocate memory for packed rhs/lhs matrices.
+      size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
+      size_t lhs_size =
+          divup<size_t>(bm_ * bk_ * sizeof(LhsScalar), align) * align;
+      size_t rhs_size =
+          divup<size_t>(bn_ * bk_ * sizeof(RhsScalar), align) * align;
+      packed_mem_ = static_cast<char*>(internal::aligned_malloc(
+          (nm0_ * lhs_size + nn0_ * rhs_size) * std::min<size_t>(nk_, P - 1)));
+      char* mem = static_cast<char*>(packed_mem_);
+      for (Index x = 0; x < numext::mini<Index>(nk_, P - 1); x++) {
+        packed_lhs_[x].resize(nm0_);
+        for (Index m = 0; m < nm0_; m++) {
+          packed_lhs_[x][m] = reinterpret_cast<LhsScalar*>(mem);
+          mem += lhs_size;
+        }
+        packed_rhs_[x].resize(nn0_);
+        for (Index n = 0; n < nn0_; n++) {
+          packed_rhs_[x][n] = reinterpret_cast<RhsScalar*>(mem);
+          mem += rhs_size;
+        }
+      }
+    }
+
+    ~Context() {
+      for (Index x = 0; x < P; x++) {
+        for (Index m = 0; m < nm_; m++) delete[] state_kernel_[x][m];
+        delete[] state_kernel_[x];
+      }
+      internal::aligned_free(packed_mem_);
+    }
+
+    void run() {
+      // Kick off packing of the first slice.
+      signal_switch(0, 1);
+      // Wait for overall completion.
+      // TODO(dvyukov): this wait can lead to deadlock.
+      // If nthreads contractions are concurrently submitted from worker
+      // threads, this wait will block all worker threads and the system will
+      // deadlock.
+      done_.Wait();
+    }
+
+   private:
+    Notification done_;
+    const Device& device_;
+    LhsMapper& lhs_;
+    RhsMapper& rhs_;
+    Scalar* const buffer_;
+    OutputMapper output_;
+    const int num_threads_;
+    const bool shard_by_col_;
+    const bool parallel_pack_;
+    // Matrix sizes.
+    const Index m_;
+    const Index n_;
+    const Index k_;
+    // Block sizes.
+    const Index bm_;
+    const Index bn_;
+    const Index bk_;
+    // Number of tasks.
+    const Index nm_;
+    const Index nn_;
+    const Index nk_;
+    // Task grain sizes (number of kernels executed per task).
+    const Index gm_;
+    const Index gn_;
+    // Number of blocks (this is different from ni_/nn_ because of task size
+    // coarsening).
+    const Index nm0_;
+    const Index nn0_;
+
+    // Parallelization strategy.
+    //
+    // Blocks related to the same k block can run in parallel because they write
+    // to different output blocks. So we parallelize within k slices, this
+    // gives us parallelism level of m x n. Before we can start any kernels
+    // related to k-th slice, we need to issue m lhs packing tasks and n rhs
+    // packing tasks.
+    //
+    // However, there is a bottleneck when we are finishing kernels for k-th
+    // slice (at the very end there is only 1 runnable kernel). To mitigate this
+    // bottleneck we allow kernels from k-th and k+1-th slices to run in
+    // parallel. Note that (m, n, k) and (m, n, k+1) kernels write to the same
+    // output block, so they must not run in parallel.
+    //
+    // This gives us the following dependency graph.
+    // On each k slice we have m x n kernel tasks, m lhs paking tasks and n rhs
+    // packing tasks.
+    // Kernel (m, n, k) can start when:
+    //  - kernel (m, n, k-1) has finished
+    //  - lhs packing (m, k) has finished
+    //  - rhs packing (n, k) has finished
+    // Lhs/rhs packing can start when:
+    //  - all k-1 packing has finished (artificially imposed to limit amount of
+    //  parallel packing)
+    //
+    // On top of that we limit runnable tasks to two consecutive k slices.
+    // This is done to limit amount of memory we need for packed lhs/rhs
+    // (for each k slice we need m*bk + n*bk memory in packed_lhs_/packed_rhs_).
+    //
+    // state_switch_ tracks when we are ready to switch to the next k slice.
+    // state_kernel_[m][n] tracks when we are ready to kick off kernel (m, n).
+    // These variable are rolling over 3 consecutive k slices: first two we are
+    // actively executing + one to track completion of kernels in the second
+    // slice.
+    static const Index P = 3;
+    void* packed_mem_;
+    std::vector<LhsScalar*> packed_lhs_[P - 1];
+    std::vector<RhsScalar*> packed_rhs_[P - 1];
+    std::atomic<uint8_t>** state_kernel_[P];
+    // state_switch_ is frequently modified by worker threads, while other
+    // fields are read-only after constructor. Let's move it to a separate cache
+    // line to reduce cache-coherency traffic.
+    char pad_[128];
+    std::atomic<Index> state_packing_ready_[P];
+    std::atomic<Index> state_switch_[P];
+
+    void pack_lhs(Index m, Index k) {
+      const Index mend = m * gm_ + gm(m);
+      for (Index m1 = m * gm_; m1 < mend; m1++)
+        LhsPacker()(packed_lhs_[k % (P - 1)][m1],
+                    lhs_.getSubMapper(m1 * bm_, k * bk_), bk(k), bm(m1));
+
+      if (!parallel_pack_ && shard_by_col_) {
+        signal_packing(k);
+      } else {
+        signal_switch(k + 1);
+        for (Index n = nn_ - 1; n >= 0; n--) signal_kernel(m, n, k, n == 0);
+      }
+    }
+
+    void pack_rhs(Index n, Index k) {
+      const Index nend = n * gn_ + gn(n);
+      for (Index n1 = n * gn_; n1 < nend; n1++) {
+        if (k == 0) {
+          // Zero the output memory in parallel.
+          // On 10000x2x10000 mm zeroing can easily take half of time.
+          // Zero (bn x m) row. Safe to do here because all kernels that will
+          // write to this memory depend on completion of this task.
+          // Note: don't call device_.memset() here. device_.memset() blocks on
+          // thread pool worker thread, which can lead to underutilization and
+          // deadlocks.
+          memset(buffer_ + n1 * bn_ * m_, 0, bn(n1) * m_ * sizeof(Scalar));
+        }
+        RhsPacker()(packed_rhs_[k % (P - 1)][n1],
+                    rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1));
+      }
+
+      if (parallel_pack_ || shard_by_col_) {
+        signal_switch(k + 1);
+        for (Index m = nm_ - 1; m >= 0; m--) signal_kernel(m, n, k, m == 0);
+      } else {
+        signal_packing(k);
+      }
+    }
+
+    void kernel(Index m, Index n, Index k) {
+      // Note: order of iteration matters here. Iteration over m is innermost
+      // because we want to reuse the same packed rhs in consequetive tasks
+      // (rhs fits into L2$ while lhs only into L3$).
+      const Index nend = n * gn_ + gn(n);
+      const Index mend = m * gm_ + gm(m);
+      if (shard_by_col_) {
+        for (Index n1 = n * gn_; n1 < nend; n1++) {
+          for (Index m1 = m * gm_; m1 < mend; m1++)
+            GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_),
+                         packed_lhs_[k % (P - 1)][m1],
+                         packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1),
+                         Scalar(1), -1, -1, 0, 0);
+        }
+      } else {
+        for (Index m1 = m * gm_; m1 < mend; m1++)
+          for (Index n1 = n * gn_; n1 < nend; n1++) {
+            GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_),
+                         packed_lhs_[k % (P - 1)][m1],
+                         packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1),
+                         Scalar(1), -1, -1, 0, 0);
+          }
+      }
+      signal_kernel(m, n, k + 1, false);
+      signal_switch(k + 2);
+    }
+
+    void signal_packing(Index k) {
+      eigen_assert(!parallel_pack_);
+      Index s = state_packing_ready_[k % P].fetch_sub(1);
+      eigen_assert(s > 0);
+      if (s != 1) return;
+      state_packing_ready_[k % P] = shard_by_col_ ? nm_ : nn_;
+      enqueue_packing(k, shard_by_col_);
+    }
+
+    void signal_kernel(Index m, Index n, Index k, bool sync) {
+      std::atomic<uint8_t>* state = &state_kernel_[k % P][m][n];
+      Index s = state->load();
+      eigen_assert(s > 0);
+      if (s != 1 && state->fetch_sub(1) != 1) return;
+      state->store(parallel_pack_ ? 3 : 2, std::memory_order_relaxed);
+      if (sync)
+        kernel(m, n, k);
+      else
+        device_.enqueueNoNotification([=]() { kernel(m, n, k); });
+    }
+
+    void signal_switch(Index k, Index v = 1) {
+      Index s = state_switch_[k % P].fetch_sub(v);
+      eigen_assert(s >= v);
+      if (s != v) return;
+
+      // Ready to switch to the next k slice.
+      // Reset counter for the next iteration.
+      state_switch_[k % P] =
+          (parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_)) +
+          nm_ * nn_;
+      if (k < nk_) {
+        // Issue lhs/rhs packing. Their completion will in turn kick off
+        // kernels.
+        if (parallel_pack_) {
+          enqueue_packing(k, !shard_by_col_);
+          enqueue_packing(k, shard_by_col_);
+        } else if (shard_by_col_) {
+          enqueue_packing(k, false);
+        } else {
+          enqueue_packing(k, true);
+        }
+
+        // Termination handling.
+        // Because kernel completion signals k + 2 switch, we need to finish nk
+        // + 2 slices without issuing any tasks on nk + 1 slice. So here we
+        // pretend that all nk + 1 packing tasks just finish instantly; so that
+        // nk + 2 switch only waits for completion of nk kernels.
+      } else if (k == nk_) {
+        signal_switch(k + 1,
+                      parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_));
+      } else {
+        done_.Notify();
+      }
+    }
+
+    // Enqueue all rhs/lhs packing for k-th slice.
+    void enqueue_packing(Index k, bool rhs) {
+      enqueue_packing_helper(0, rhs ? nn_ : nm_, k, rhs);
+    }
+
+    void enqueue_packing_helper(Index start, Index end, Index k, bool rhs) {
+      if (end - start == 1) {
+        if (rhs)
+          pack_rhs(start, k);
+        else
+          pack_lhs(start, k);
+      } else {
+        Index mid = (start + end) / 2;
+        device_.enqueueNoNotification(
+            [=]() { enqueue_packing_helper(mid, end, k, rhs); });
+        device_.enqueueNoNotification(
+            [=]() { enqueue_packing_helper(start, mid, k, rhs); });
+      }
+    }
+
+    // Block sizes with accounting for potentially incomplete last block.
+    Index bm(Index m) const { return m + 1 < nm0_ ? bm_ : m_ + bm_ - bm_ * nm0_; }
+    Index bn(Index n) const { return n + 1 < nn0_ ? bn_ : n_ + bn_ - bn_ * nn0_; }
+    Index bk(Index k) const { return k + 1 < nk_ ? bk_ : k_ + bk_ - bk_ * nk_; }
+    // Task grain sizes accounting for potentially incomplete last task.
+    Index gm(Index m) const { return m + 1 < nm_ ? gm_ : nm0_ + gm_ - gm_ * nm_; }
+    Index gn(Index n) const { return n + 1 < nn_ ? gn_ : nn0_ + gn_ - gn_ * nn_; }
+
+    Context(const Context&) = delete;
+    void operator=(const Context&) = delete;
+  };
+
+  // Decide whether we want to shard m x n contraction by columns or by rows.
+  static bool shardByCol(Index m, Index n, Index num_threads) {
+    // Note: we are comparing both n and m against Traits::nr, it is not
+    // a mistake. We are trying to figure out how both n and m will fit into
+    // the main sharding dimension.
+
+    // Sharding by column is the default
+    // ... unless there is enough data for vectorization over rows
+    if (m / num_threads >= Traits::nr &&
+        // and not enough data for vectorization over columns
+        (n / num_threads < Traits::nr ||
+         // ... or barely enough data for vectorization over columns,
+         // but it is not evenly dividable across threads
+         (n / num_threads < 4 * Traits::nr &&
+          (n % (num_threads * Traits::nr)) != 0 &&
+          // ... and it is evenly dividable across threads for rows
+          ((m % (num_threads * Traits::nr)) == 0 ||
+           // .. or it is not evenly dividable for both dimensions but
+           // there is much more data over rows so that corner effects are
+           // mitigated.
+           (m / n >= 6)))))
+      return false;
+    // Wait, or if matrices are just substantially prolonged over the other
+    // dimension.
+    if (n / num_threads < 16 * Traits::nr && m > n * 32) return false;
+    return true;
+  }
+
+  Index coarsenM(Index m, Index n, Index bm, Index bn, Index bk, Index gn,
+                 int num_threads, bool shard_by_col) const {
+    Index gm = 1;
+    Index gm1 = 1;
+    Index nm0 = divup(m, bm);
+    Index nm1 = nm0;
+    for (;;) {
+      // Find the next candidate for m grain size. It needs to result in
+      // different number of blocks. E.g. if we have 10 kernels, we want to try
+      // 5 and 10, but not 6, 7, 8 and 9.
+      while (gm1 <= nm0 && nm1 == divup(nm0, gm1)) gm1++;
+      if (gm1 > nm0) break;
+      // Check the candidate.
+      int res = checkGrain(m, n, bm, bn, bk, gm1, gn, gm, gn, num_threads,
+                           shard_by_col);
+      if (res < 0) break;
+      nm1 = divup(nm0, gm1);
+      if (res == 0) continue;
+      // Commit new grain size.
+      gm = gm1;
+    }
+    return gm;
+  }
+
+  Index coarsenN(Index m, Index n, Index bm, Index bn, Index bk, Index gm,
+                 int num_threads, bool shard_by_col) const {
+    Index gn = 1;
+    Index gn1 = 1;
+    Index nn0 = divup(n, bn);
+    Index nn1 = nn0;
+    for (;;) {
+      while (gn1 <= nn0 && nn1 == divup(nn0, gn1)) gn1++;
+      if (gn1 > nn0) break;
+      int res = checkGrain(m, n, bm, bn, bk, gm, gn1, gm, gn, num_threads,
+                           shard_by_col);
+      if (res < 0) break;
+      nn1 = divup(nn0, gn1);
+      if (res == 0) continue;
+      gn = gn1;
+    }
+    return gn;
+  }
+
+  // checkGrain checks whether grain (gm, gn) is suitable and is better than
+  // (oldgm, oldgn).
+  int checkGrain(Index m, Index n, Index bm, Index bn, Index bk, Index gm,
+                 Index gn, Index oldgm, Index oldgn, int num_threads,
+                 bool shard_by_col) const {
+    const TensorOpCost cost =
+        contractionCost(bm * gm, bn * gn, bm, bn, bk, shard_by_col, true);
+    double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(
+        static_cast<double>(bm) * gm * bn * gn, cost);
+    // If the task is too small, then we agree on it regardless of anything
+    // else. Otherwise synchronization overheads will dominate.
+    if (taskSize < 1) return 1;
+    // If it is too large, then we reject it and all larger tasks.
+    if (taskSize > 2) return -1;
+    // Now we are in presumably good task size range.
+    // The main deciding factor here is parallelism. Consider that we have 12
+    // kernels and 4 threads. Grains of 2, 3 and 4 all yield good task sizes.
+    // But 2/4 yield 6/3 tasks, which gives us parallelism of 0.75 (at most 3/4
+    // of cores will be busy). While grain size 3 gives us 4 tasks, which gives
+    // us parallelism of 1 (we can load all cores).
+    Index nm0 = divup(m, bm);
+    Index nn0 = divup(n, bn);
+    Index new_tasks = divup(nm0, gm) * divup(nn0, gn);
+    double new_parallelism = static_cast<double>(new_tasks) /
+                             (divup<int>(new_tasks, num_threads) * num_threads);
+    Index old_tasks = divup(nm0, oldgm) * divup(nn0, oldgn);
+    double old_parallelism = static_cast<double>(old_tasks) /
+                             (divup<int>(old_tasks, num_threads) * num_threads);
+    if (new_parallelism > old_parallelism || new_parallelism == 1) return 1;
+    return 0;
+  }
+
+#else  // EIGEN_USE_SIMPLE_THREAD_POOL
+
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+  void evalProduct(Scalar* buffer) const {
+    if (this->m_j_size == 1) {
+      this->template evalGemv<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
+      return;
+    }
+
+    evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
+  }
+
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+  void evalGemm(Scalar* buffer) const {
+    // columns in left side, rows in right side
+    const Index k = this->m_k_size;
+
+    // rows in left side
+    const Index m = this->m_i_size;
+
+    // columns in right side
+    const Index n = this->m_j_size;
+
+    // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)
+    this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
+
+
+    const int lhs_packet_size = internal::unpacket_traits<typename LeftEvaluator::PacketReturnType>::size;
+    const int rhs_packet_size = internal::unpacket_traits<typename RightEvaluator::PacketReturnType>::size;
+
+    typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
+                                                   LeftEvaluator, left_nocontract_t,
+                                                   contract_t, lhs_packet_size,
+                                                   lhs_inner_dim_contiguous,
+                                                   false, Unaligned> LhsMapper;
+
+    typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
+                                                   RightEvaluator, right_nocontract_t,
+                                                   contract_t, rhs_packet_size,
+                                                   rhs_inner_dim_contiguous,
+                                                   rhs_inner_dim_reordered, Unaligned> RhsMapper;
+
+    typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+
+    // TODO: packing could be faster sometimes if we supported row major tensor mappers
+    typedef internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, Traits::mr,
+                                    Traits::LhsProgress, ColMajor> LhsPacker;
+    typedef internal::gemm_pack_rhs<RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor> RhsPacker;
+
+    // TODO: replace false, false with conjugate values?
+    typedef internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper,
+                                  Traits::mr, Traits::nr, false, false> GebpKernel;
+
+    typedef internal::packLhsArg<LhsScalar, LhsMapper, Index> packLArg;
+    typedef internal::packRhsAndKernelArg<LhsScalar, RhsScalar, RhsMapper, OutputMapper, Index> packRKArg;
+
+    // initialize data mappers
+    LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
+                  this->m_left_contracting_strides, this->m_k_strides);
+
+    RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
+                  this->m_right_contracting_strides, this->m_k_strides);
+
+    OutputMapper output(buffer, m);
+
+    // compute block sizes (which depend on number of threads)
+    const Index num_threads = this->m_device.numThreads();
+    internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index, internal::ShardByCol> blocking(k, m, n, num_threads);
+    Index mc = blocking.mc();
+    Index nc = blocking.nc();
+    Index kc = blocking.kc();
+    eigen_assert(mc <= m);
+    eigen_assert(nc <= n);
+    eigen_assert(kc <= k);
+
+#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
+    const Index k_blocks = CEIL_DIV(k, kc);
+    const Index n_blocks = CEIL_DIV(n, nc);
+    const Index m_blocks = CEIL_DIV(m, mc);
+    const Index sizeA = mc * kc;
+    const Index sizeB = kc * nc;
+
+    /*    cout << "m: " << m << " n: " << n << " k: " << k << endl;
+    cout << "mc: " << mc << " nc: " << nc << " kc: " << kc << endl;
+    cout << "m_blocks: " << m_blocks << " n_blocks: " << n_blocks << " k_blocks: " << k_blocks << endl;
+    cout << "num threads: " << num_threads << endl;
+    */
+
+    // note: m_device.allocate should return 16 byte aligned pointers, but if blockA and blockB
+    //       aren't 16 byte aligned segfaults will happen due to SIMD instructions
+    // note: You can get away with allocating just a single blockA and offsets and meet the
+    //       the alignment requirements with the assumption that
+    //       (Traits::mr * sizeof(ResScalar)) % 16 == 0
+    const Index numBlockAs = numext::mini(num_threads, m_blocks);
+    MaxSizeVector<LhsScalar *> blockAs(num_threads);
+    for (int i = 0; i < num_threads; i++) {
+      blockAs.push_back(static_cast<LhsScalar *>(this->m_device.allocate(sizeA * sizeof(LhsScalar))));
+    }
+
+    // To circumvent alignment issues, I'm just going to separately allocate the memory for each thread
+    // TODO: is this too much memory to allocate? This simplifies coding a lot, but is wasteful.
+    //       Other options: (1) reuse memory when a thread finishes. con: tricky
+    //                      (2) allocate block B memory in each thread. con: overhead
+    MaxSizeVector<RhsScalar *> blockBs(n_blocks);
+    for (int i = 0; i < n_blocks; i++) {
+      blockBs.push_back(static_cast<RhsScalar *>(this->m_device.allocate(sizeB * sizeof(RhsScalar))));
+    }
+
+    // lhs_notifications starts with all null Notifications
+    MaxSizeVector<Notification*> lhs_notifications(num_threads, nullptr);
+
+    // this should really be numBlockAs * n_blocks;
+    const Index num_kernel_notifications = num_threads * n_blocks;
+    MaxSizeVector<Notification*> kernel_notifications(num_kernel_notifications,
+                                                    nullptr);
+
+    for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) {
+      const Index k_start = k_block_idx * kc;
+      // make sure we don't overshoot right edge of left matrix
+      const Index actual_kc = numext::mini(k_start + kc, k) - k_start;
+
+      for (Index m_block_idx = 0; m_block_idx < m_blocks; m_block_idx += numBlockAs) {
+        const Index num_blocks = numext::mini(m_blocks-m_block_idx, numBlockAs);
+
+        for (Index mt_block_idx = m_block_idx; mt_block_idx < m_block_idx+num_blocks; mt_block_idx++) {
+          const Index m_start = mt_block_idx * mc;
+          const Index actual_mc = numext::mini(m_start + mc, m) - m_start;
+          eigen_assert(actual_mc > 0);
+
+          Index blockAId = (k_block_idx * m_blocks + mt_block_idx) % num_threads;
+
+          for (int i = 0; i < n_blocks; ++i) {
+            Index notification_id = (blockAId * n_blocks + i);
+            // Wait for any current kernels using this slot to complete
+            // before using it.
+            if (kernel_notifications[notification_id]) {
+              wait_until_ready(kernel_notifications[notification_id]);
+              delete kernel_notifications[notification_id];
+            }
+            kernel_notifications[notification_id] = new Notification();
+          }
+          const packLArg arg = {
+            blockAs[blockAId], // blockA
+            lhs,        // lhs
+            m_start,    // m
+            k_start,    // k
+            actual_mc,  // mc
+            actual_kc,  // kc
+          };
+
+          // Delete any existing notification since we may be
+          // replacing it.  The algorithm should ensure that there are
+          // no existing waiters on this notification.
+          delete lhs_notifications[blockAId];
+          lhs_notifications[blockAId] =
+          this->m_device.enqueue(&Self::packLhs<packLArg, LhsPacker>, arg);
+        }
+
+        // now start kernels.
+        const Index m_base_start = m_block_idx * mc;
+        const bool need_to_pack = m_block_idx == 0;
+
+        for (Index n_block_idx = 0; n_block_idx < n_blocks; n_block_idx++) {
+          const Index n_start = n_block_idx * nc;
+          const Index actual_nc = numext::mini(n_start + nc, n) - n_start;
+
+          // first make sure the previous kernels are all done before overwriting rhs. Also wait if
+          // we're going to start new k. In both cases need_to_pack is true.
+          if (need_to_pack) {
+            for (Index i = num_blocks; i < num_threads; ++i) {
+              Index blockAId = (k_block_idx * m_blocks + i + m_block_idx) % num_threads;
+              Index future_id = (blockAId * n_blocks + n_block_idx);
+              wait_until_ready(kernel_notifications[future_id]);
+            }
+          }
+
+          packRKArg arg = {
+            &blockAs, // blockA
+            blockBs[n_block_idx], // blockB
+            rhs,          // rhs
+            output,       // output
+            m_base_start, // m
+            k_start,      // k
+            n_start,      // n
+            mc,           // mc
+            actual_kc,    // kc
+            actual_nc,    // nc
+            num_threads,
+            numBlockAs,
+            m,
+            k_block_idx,
+            m_block_idx,
+            n_block_idx, // n_block_idx
+            m_blocks, // m_blocks
+            n_blocks, // n_blocks
+            &kernel_notifications, // kernel notifications
+            &lhs_notifications,    // lhs notifications
+            need_to_pack, // need_to_pack
+          };
+
+          // We asynchronously kick off this function, which ends up
+          // notifying the appropriate kernel_notifications objects,
+          // which this thread waits on before exiting.
+          this->m_device.enqueueNoNotification(&Self::packRhsAndKernel<packRKArg, RhsPacker, GebpKernel>, arg);
+        }
+      }
+    }
+
+    // Make sure all the kernels are done.
+    for (size_t i = 0; i < kernel_notifications.size(); ++i) {
+      wait_until_ready(kernel_notifications[i]);
+      delete kernel_notifications[i];
+    }
+
+    // No need to wait for lhs notifications since they should have
+    // already been waited on.  Just clean them up.
+    for (size_t i = 0; i < lhs_notifications.size(); ++i) {
+      delete lhs_notifications[i];
+    }
+
+    // deallocate all of the memory for both A and B's
+    for (size_t i = 0; i < blockAs.size(); i++) {
+      this->m_device.deallocate(blockAs[i]);
+    }
+    for (size_t i = 0; i < blockBs.size(); i++) {
+      this->m_device.deallocate(blockBs[i]);
+    }
+
+#undef CEIL_DIV
+  }
+
+  /*
+   * Packs a LHS block of size (mt, kc) starting at lhs(m, k). Before packing
+   * the LHS block, check that all of the kernels that worked on the same
+   * mt_block_idx in the previous m_block are done.
+   */
+  template <typename packLArg, typename LhsPacker>
+  static void packLhs(const packLArg arg) {
+    // perform actual packing
+    LhsPacker pack_lhs;
+    pack_lhs(arg.blockA, arg.lhs.getSubMapper(arg.m_start, arg.k_start), arg.kc, arg.mc);
+  }
+
+  /*
+   * Packs a RHS block of size (kc, nc) starting at (k, n) after checking that
+   * all kernels in the previous block are done.
+   * Then for each LHS future, we wait on the future and then call GEBP
+   * on the area packed by the future (which starts at
+   * blockA + future_idx * mt * kc) on the LHS and with the full packed
+   * RHS block.
+   * The output of this GEBP is written to output(m + i * mt, n).
+   */
+  template <typename packRKArg, typename RhsPacker, typename GebpKernel>
+  static void packRhsAndKernel(packRKArg arg) {
+    if (arg.need_to_pack) {
+      RhsPacker pack_rhs;
+      pack_rhs(arg.blockB, arg.rhs.getSubMapper(arg.k, arg.n), arg.kc, arg.nc);
+    }
+
+    GebpKernel gebp;
+    for (Index mt_block_idx = 0; mt_block_idx < arg.num_blockAs; mt_block_idx++) {
+      const Index m_base_start = arg.m + arg.mc*mt_block_idx;
+      if (m_base_start < arg.max_m) {
+        Index blockAId = (arg.k_block_idx * arg.m_blocks + mt_block_idx + arg.m_block_idx) % arg.num_threads;
+        wait_until_ready((*arg.lhs_notifications)[blockAId]);
+        const Index actual_mc = numext::mini(m_base_start + arg.mc, arg.max_m) - m_base_start;
+        gebp(arg.output.getSubMapper(m_base_start, arg.n),
+             (*arg.blockAs)[blockAId], arg.blockB,
+             actual_mc, arg.kc, arg.nc, Scalar(1), -1, -1, 0, 0);
+
+        // Notify that the kernel is done.
+        const Index set_idx = blockAId * arg.n_blocks + arg.n_block_idx;
+        (*arg.kernel_notifications)[set_idx]->Notify();
+      }
+    }
+  }
+#endif  // EIGEN_USE_SIMPLE_THREAD_POOL
+
+  TensorOpCost contractionCost(Index m, Index n, Index bm, Index bn, Index bk,
+                               bool shard_by_col, bool prepacked) const {
+    const int packed_size = std::min<int>(PacketType<LhsScalar, Device>::size,
+                                          PacketType<RhsScalar, Device>::size);
+    const int output_packet_size = internal::unpacket_traits<PacketReturnType>::size;
+    const double kd = static_cast<double>(bk);
+    // Peak VFMA bandwidth is 0.5. However if we have not enough data for
+    // vectorization bandwidth drops. The 4.0 and 2.0 bandwidth is determined
+    // experimentally.
+    double computeBandwidth = bk == 1 ? 4.0 :
+          (shard_by_col ? bn : bm) < Traits::nr ||
+          (shard_by_col ? bm : bn) < Traits::mr ? 2.0 : 0.5;
+#ifndef EIGEN_VECTORIZE_FMA
+    // Bandwidth of all of VFMA/MULPS/ADDPS is 0.5 on latest Intel processors.
+    // However for MULPS/ADDPS we have dependent sequence of 2 such instructions,
+    // so overall bandwidth is 1.0.
+    if (computeBandwidth == 0.5) computeBandwidth = 1.0;
+#endif
+    // Computations.
+    TensorOpCost cost = TensorOpCost(0, 0, kd * computeBandwidth, true, packed_size);
+    // Output stores.
+    cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size);
+    if (prepacked) {
+      // Packing and kernels are executed in different tasks. When we calculate
+      // task grain size we look only at kernel cost assuming that kernel
+      // is more expensive than packing.
+      return cost;
+    }
+    // Lhs/rhs loads + computations.
+    TensorOpCost lhsCost = this->m_leftImpl.costPerCoeff(true) * (kd / n);
+    TensorOpCost rhsCost = this->m_rightImpl.costPerCoeff(true) * (kd / m);
+    // Lhs packing memory cost does not contribute considerably to overall
+    // execution time because lhs is prefetched early and accessed sequentially.
+    if (shard_by_col)
+      lhsCost.dropMemoryCost();
+    else
+      rhsCost.dropMemoryCost();
+    return cost + lhsCost + rhsCost;
+  }
+};
+
+} // end namespace Eigen
+
+#endif  // EIGEN_USE_THREADS
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
new file mode 100644
index 000000000..860a6949a
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
@@ -0,0 +1,279 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H
+
+namespace Eigen {
+
+/** \class TensorConversionOp
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief Tensor conversion class. This class makes it possible to vectorize
+  * type casting operations when the number of scalars per packet in the source
+  * and the destination type differ
+  */
+namespace internal {
+template<typename TargetType, typename XprType>
+struct traits<TensorConversionOp<TargetType, XprType> >
+{
+  // Type promotion to handle the case where the types of the lhs and the rhs are different.
+  typedef TargetType Scalar;
+  typedef typename traits<XprType>::StorageKind StorageKind;
+  typedef typename traits<XprType>::Index Index;
+  typedef typename XprType::Nested Nested;
+  typedef typename remove_reference<Nested>::type _Nested;
+  static const int NumDimensions = traits<XprType>::NumDimensions;
+  static const int Layout = traits<XprType>::Layout;
+  enum { Flags = 0 };
+};
+
+template<typename TargetType, typename XprType>
+struct eval<TensorConversionOp<TargetType, XprType>, Eigen::Dense>
+{
+  typedef const TensorConversionOp<TargetType, XprType>& type;
+};
+
+template<typename TargetType, typename XprType>
+struct nested<TensorConversionOp<TargetType, XprType>, 1, typename eval<TensorConversionOp<TargetType, XprType> >::type>
+{
+  typedef TensorConversionOp<TargetType, XprType> type;
+};
+
+}  // end namespace internal
+
+
+template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket, int SrcCoeffRatio, int TgtCoeffRatio>
+struct PacketConverter {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  PacketConverter(const TensorEvaluator& impl)
+      : m_impl(impl) {}
+
+  template<int LoadMode, typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const {
+    return internal::pcast<SrcPacket, TgtPacket>(m_impl.template packet<LoadMode>(index));
+  }
+
+ private:
+  const TensorEvaluator& m_impl;
+};
+
+
+template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket>
+struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 2, 1> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  PacketConverter(const TensorEvaluator& impl)
+      : m_impl(impl) {}
+
+  template<int LoadMode, typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const {
+    const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size;
+
+    SrcPacket src1 = m_impl.template packet<LoadMode>(index);
+    SrcPacket src2 = m_impl.template packet<LoadMode>(index + SrcPacketSize);
+    TgtPacket result = internal::pcast<SrcPacket, TgtPacket>(src1, src2);
+    return result;
+  }
+
+ private:
+  const TensorEvaluator& m_impl;
+};
+
+template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket>
+struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 4, 1> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  PacketConverter(const TensorEvaluator& impl)
+      : m_impl(impl) {}
+
+  template<int LoadMode, typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const {
+    const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size;
+
+    SrcPacket src1 = m_impl.template packet<LoadMode>(index);
+    SrcPacket src2 = m_impl.template packet<LoadMode>(index + SrcPacketSize);
+    SrcPacket src3 = m_impl.template packet<LoadMode>(index + 2 * SrcPacketSize);
+    SrcPacket src4 = m_impl.template packet<LoadMode>(index + 3 * SrcPacketSize);
+    TgtPacket result = internal::pcast<SrcPacket, TgtPacket>(src1, src2, src3, src4);
+    return result;
+  }
+
+ private:
+  const TensorEvaluator& m_impl;
+};
+
+template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket>
+struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 1, 2> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  PacketConverter(const TensorEvaluator& impl)
+      : m_impl(impl), m_maxIndex(impl.dimensions().TotalSize()) {}
+
+  template<int LoadMode, typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const {
+    const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size;
+    // Only call m_impl.packet() when we have direct access to the underlying data. This
+    // ensures that we don't compute the subexpression twice. We may however load some
+    // coefficients twice, but in practice this doesn't negatively impact performance.
+    if (m_impl.data() && (index + SrcPacketSize < m_maxIndex)) {
+      // Force unaligned memory loads since we can't ensure alignment anymore
+      return internal::pcast<SrcPacket, TgtPacket>(m_impl.template packet<Unaligned>(index));
+    } else {
+      const int TgtPacketSize = internal::unpacket_traits<TgtPacket>::size;
+      typedef typename internal::unpacket_traits<SrcPacket>::type SrcType;
+      typedef typename internal::unpacket_traits<TgtPacket>::type TgtType;
+      internal::scalar_cast_op<SrcType, TgtType> converter;
+      EIGEN_ALIGN_MAX typename internal::unpacket_traits<TgtPacket>::type values[TgtPacketSize];
+      for (int i = 0; i < TgtPacketSize; ++i) {
+        values[i] = converter(m_impl.coeff(index+i));
+      }
+      TgtPacket rslt = internal::pload<TgtPacket>(values);
+      return rslt;
+    }
+  }
+
+ private:
+  const TensorEvaluator& m_impl;
+  const typename TensorEvaluator::Index m_maxIndex;
+};
+
+template<typename TargetType, typename XprType>
+class TensorConversionOp : public TensorBase<TensorConversionOp<TargetType, XprType>, ReadOnlyAccessors>
+{
+  public:
+    typedef typename internal::traits<TensorConversionOp>::Scalar Scalar;
+    typedef typename internal::traits<TensorConversionOp>::StorageKind StorageKind;
+    typedef typename internal::traits<TensorConversionOp>::Index Index;
+    typedef typename internal::nested<TensorConversionOp>::type Nested;
+    typedef Scalar CoeffReturnType;
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConversionOp(const XprType& xpr)
+        : m_xpr(xpr) {}
+
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename XprType::Nested>::type&
+    expression() const { return m_xpr; }
+
+  protected:
+    typename XprType::Nested m_xpr;
+};
+
+template <bool SameType, typename Eval, typename Scalar> struct ConversionSubExprEval {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, Scalar*) {
+    impl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+};
+
+template <typename Eval, typename Scalar> struct ConversionSubExprEval<true, Eval, Scalar> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, Scalar* data) {
+    return impl.evalSubExprsIfNeeded(data);
+  }
+};
+
+
+// Eval as rvalue
+template<typename TargetType, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
+{
+  typedef TensorConversionOp<TargetType, ArgType> XprType;
+  typedef typename XprType::Index Index;
+  typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
+  typedef TargetType Scalar;
+  typedef TargetType CoeffReturnType;
+  typedef typename internal::remove_all<typename internal::traits<ArgType>::Scalar>::type SrcType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef typename PacketType<SrcType, Device>::type PacketSourceType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+
+  enum {
+    IsAligned = false,
+    PacketAccess = true,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    RawAccess = false
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+    : m_impl(op.expression(), device)
+  {
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data)
+  {
+    return ConversionSubExprEval<internal::is_same<TargetType, SrcType>::value, TensorEvaluator<ArgType, Device>, Scalar>::run(m_impl, data);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup()
+  {
+    m_impl.cleanup();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+  {
+    internal::scalar_cast_op<SrcType, TargetType> converter;
+    return converter(m_impl.coeff(index));
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+  {
+    const bool Vectorizable = TensorEvaluator<ArgType, Device>::PacketAccess &
+        internal::type_casting_traits<SrcType, TargetType>::VectorizedCast;
+    return PacketConv<LoadMode, Vectorizable>::run(m_impl, index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+  costPerCoeff(bool vectorized) const {
+    const double cast_cost = TensorOpCost::CastCost<SrcType, TargetType>();
+    if (vectorized) {
+      const double SrcCoeffRatio =
+          internal::type_casting_traits<SrcType, TargetType>::SrcCoeffRatio;
+      const double TgtCoeffRatio =
+          internal::type_casting_traits<SrcType, TargetType>::TgtCoeffRatio;
+      return m_impl.costPerCoeff(vectorized) * (SrcCoeffRatio / PacketSize) +
+          TensorOpCost(0, 0, TgtCoeffRatio * (cast_cost / PacketSize));
+    } else {
+      return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, cast_cost);
+    }
+  }
+
+  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+
+  protected:
+  template <int LoadMode, bool ActuallyVectorize>
+  struct PacketConv {
+    static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
+      internal::scalar_cast_op<SrcType, TargetType> converter;
+      EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+      for (int i = 0; i < PacketSize; ++i) {
+        values[i] = converter(impl.coeff(index+i));
+      }
+      PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+      return rslt;
+    }
+  };
+
+  template <int LoadMode>
+  struct PacketConv<LoadMode, true> {
+    static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
+      const int SrcCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::SrcCoeffRatio;
+      const int TgtCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::TgtCoeffRatio;
+      PacketConverter<TensorEvaluator<ArgType, Device>, PacketSourceType, PacketReturnType,
+                      SrcCoeffRatio, TgtCoeffRatio> converter(impl);
+      return converter.template packet<LoadMode>(index);
+    }
+  };
+
+  TensorEvaluator<ArgType, Device> m_impl;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
new file mode 100644
index 000000000..abdf742c6
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -0,0 +1,1104 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
+
+namespace Eigen {
+
+/** \class TensorConvolution
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief Tensor convolution class.
+  *
+  *
+  */
+namespace internal {
+
+template <typename Index, typename InputDims, int NumKernelDims, int Layout>
+class IndexMapper {
+ public:
+  IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims,
+              const array<Index, NumKernelDims>& indices) {
+
+    array<Index, NumDims> dimensions = input_dims;
+    for (int i = 0; i < NumKernelDims; ++i) {
+      const Index index = indices[i];
+      const Index input_dim = input_dims[index];
+      const Index kernel_dim = kernel_dims[i];
+      const Index result_dim = input_dim - kernel_dim + 1;
+      dimensions[index] = result_dim;
+    }
+
+    array<Index, NumDims> inputStrides;
+    array<Index, NumDims> outputStrides;
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      inputStrides[0] = 1;
+      outputStrides[0] = 1;
+      for (int i = 1; i < NumDims; ++i) {
+        inputStrides[i] = inputStrides[i-1] * input_dims[i-1];
+        outputStrides[i] = outputStrides[i-1] * dimensions[i-1];
+      }
+    } else {
+      inputStrides[NumDims - 1] = 1;
+      outputStrides[NumDims - 1] = 1;
+      for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) {
+        inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
+        outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1];
+      }
+    }
+
+    array<Index, NumDims> cudaInputDimensions;
+    array<Index, NumDims> cudaOutputDimensions;
+    array<Index, NumDims> tmp = dimensions;
+    array<Index, NumDims> ordering;
+    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+                              ? 0
+                              : NumDims - NumKernelDims;
+    for (int i = 0; i < NumKernelDims; ++i) {
+      const Index index = i + offset;
+      ordering[index] = indices[i];
+      tmp[indices[i]] = -1;
+      cudaInputDimensions[index] = input_dims[indices[i]];
+      cudaOutputDimensions[index] = dimensions[indices[i]];
+    }
+
+    int written = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+                      ? NumKernelDims
+                      : 0;
+    for (int i = 0; i < NumDims; ++i) {
+      if (tmp[i] >= 0) {
+        ordering[written] = i;
+        cudaInputDimensions[written] = input_dims[i];
+        cudaOutputDimensions[written] = dimensions[i];
+        ++written;
+      }
+    }
+
+    for (int i = 0; i < NumDims; ++i) {
+      m_inputStrides[i] = inputStrides[ordering[i]];
+      m_outputStrides[i] = outputStrides[ordering[i]];
+    }
+
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = 0; i < NumDims; ++i) {
+        if (i > NumKernelDims) {
+          m_cudaInputStrides[i] =
+              m_cudaInputStrides[i - 1] * cudaInputDimensions[i - 1];
+          m_cudaOutputStrides[i] =
+              m_cudaOutputStrides[i - 1] * cudaOutputDimensions[i - 1];
+        } else {
+          m_cudaInputStrides[i] = 1;
+          m_cudaOutputStrides[i] = 1;
+        }
+      }
+    } else {
+      for (int i = NumDims - 1; i >= 0; --i) {
+        if (i + 1 < offset) {
+          m_cudaInputStrides[i] =
+              m_cudaInputStrides[i + 1] * cudaInputDimensions[i + 1];
+          m_cudaOutputStrides[i] =
+              m_cudaOutputStrides[i + 1] * cudaOutputDimensions[i + 1];
+        } else {
+          m_cudaInputStrides[i] = 1;
+          m_cudaOutputStrides[i] = 1;
+        }
+      }
+    }
+  }
+
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputPlaneToTensorInputOffset(Index p) const {
+    Index inputIndex = 0;
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int d = NumDims - 1; d > NumKernelDims; --d) {
+        const Index idx = p / m_cudaInputStrides[d];
+        inputIndex += idx * m_inputStrides[d];
+        p -= idx * m_cudaInputStrides[d];
+      }
+      inputIndex += p * m_inputStrides[NumKernelDims];
+    } else {
+      std::ptrdiff_t limit = 0;
+      if (NumKernelDims < NumDims) {
+        limit = NumDims - NumKernelDims - 1;
+      }
+      for (int d = 0; d < limit; ++d) {
+        const Index idx = p / m_cudaInputStrides[d];
+        inputIndex += idx * m_inputStrides[d];
+        p -= idx * m_cudaInputStrides[d];
+      }
+      inputIndex += p * m_inputStrides[limit];
+    }
+    return inputIndex;
+  }
+
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputPlaneToTensorOutputOffset(Index p) const {
+    Index outputIndex = 0;
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int d = NumDims - 1; d > NumKernelDims; --d) {
+        const Index idx = p / m_cudaOutputStrides[d];
+        outputIndex += idx * m_outputStrides[d];
+        p -= idx * m_cudaOutputStrides[d];
+      }
+      outputIndex += p * m_outputStrides[NumKernelDims];
+    } else {
+      std::ptrdiff_t limit = 0;
+      if (NumKernelDims < NumDims) {
+        limit = NumDims - NumKernelDims - 1;
+      }
+      for (int d = 0; d < limit; ++d) {
+        const Index idx = p / m_cudaOutputStrides[d];
+        outputIndex += idx * m_outputStrides[d];
+        p -= idx * m_cudaOutputStrides[d];
+      }
+      outputIndex += p * m_outputStrides[limit];
+    }
+    return outputIndex;
+  }
+
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i) const {
+    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+                              ? 0
+                              : NumDims - NumKernelDims;
+    return i * m_inputStrides[offset];
+  }
+
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i) const {
+    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+                              ? 0
+                              : NumDims - NumKernelDims;
+    return i * m_outputStrides[offset];
+  }
+
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j) const {
+    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+                              ? 0
+                              : NumDims - NumKernelDims;
+    return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1];
+  }
+
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j) const {
+    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+                              ? 0
+                              : NumDims - NumKernelDims;
+    return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1];
+  }
+
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
+    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+                              ? 0
+                              : NumDims - NumKernelDims;
+    return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] +
+           k * m_inputStrides[offset + 2];
+  }
+
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
+    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+                              ? 0
+                              : NumDims - NumKernelDims;
+    return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] +
+           k * m_outputStrides[offset + 2];
+  }
+
+ private:
+  static const int NumDims = internal::array_size<InputDims>::value;
+  array<Index, NumDims> m_inputStrides;
+  array<Index, NumDims> m_outputStrides;
+  array<Index, NumDims> m_cudaInputStrides;
+  array<Index, NumDims> m_cudaOutputStrides;
+};
+
+
+
+template<typename Dimensions, typename InputXprType, typename KernelXprType>
+struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >
+{
+  // Type promotion to handle the case where the types of the lhs and the rhs are different.
+  typedef typename promote_storage_type<typename InputXprType::Scalar,
+                                        typename KernelXprType::Scalar>::ret Scalar;
+  typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind,
+                                        typename traits<KernelXprType>::StorageKind>::ret StorageKind;
+  typedef typename promote_index_type<typename traits<InputXprType>::Index,
+                                      typename traits<KernelXprType>::Index>::type Index;
+  typedef typename InputXprType::Nested LhsNested;
+  typedef typename KernelXprType::Nested RhsNested;
+  typedef typename remove_reference<LhsNested>::type _LhsNested;
+  typedef typename remove_reference<RhsNested>::type _RhsNested;
+  static const int NumDimensions = traits<InputXprType>::NumDimensions;
+  static const int Layout = traits<InputXprType>::Layout;
+
+  enum {
+    Flags = 0
+  };
+};
+
+template<typename Dimensions, typename InputXprType, typename KernelXprType>
+struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense>
+{
+  typedef const TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>& type;
+};
+
+template<typename Dimensions, typename InputXprType, typename KernelXprType>
+struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1, typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >::type>
+{
+  typedef TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> type;
+};
+
+}  // end namespace internal
+
+
+
+template<typename Indices, typename InputXprType, typename KernelXprType>
+class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors>
+{
+  public:
+  typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType,
+                                                  typename KernelXprType::CoeffReturnType>::ret CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorConvolutionOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorConvolutionOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorConvolutionOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims)
+      : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {}
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const Indices& indices() const { return m_indices; }
+
+    /** \returns the nested expressions */
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const typename internal::remove_all<typename InputXprType::Nested>::type&
+    inputExpression() const { return m_input_xpr; }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const typename internal::remove_all<typename KernelXprType::Nested>::type&
+    kernelExpression() const { return m_kernel_xpr; }
+
+  protected:
+    typename InputXprType::Nested m_input_xpr;
+    typename KernelXprType::Nested m_kernel_xpr;
+    const Indices m_indices;
+};
+
+
+template<typename Indices, typename InputArgType, typename KernelArgType, typename Device>
+struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
+{
+  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
+
+  static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value;
+  static const int NumKernelDims = internal::array_size<Indices>::value;
+  typedef typename XprType::Index Index;
+  typedef DSizes<Index, NumDims> Dimensions;
+
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+
+  enum {
+    IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
+    PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess,
+    Layout = TensorEvaluator<InputArgType, Device>::Layout,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device)
+  {
+    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+    const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
+    const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
+
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      m_inputStride[0] = 1;
+      for (int i = 1; i < NumDims; ++i) {
+        m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1];
+      }
+    } else {
+      m_inputStride[NumDims - 1] = 1;
+      for (int i = NumDims - 2; i >= 0; --i) {
+        m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1];
+      }
+    }
+
+    m_dimensions = m_inputImpl.dimensions();
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = 0; i < NumKernelDims; ++i) {
+        const Index index = op.indices()[i];
+        const Index input_dim = input_dims[index];
+        const Index kernel_dim = kernel_dims[i];
+        const Index result_dim = input_dim - kernel_dim + 1;
+        m_dimensions[index] = result_dim;
+        if (i > 0) {
+          m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1];
+        } else {
+          m_kernelStride[0] = 1;
+        }
+        m_indexStride[i] = m_inputStride[index];
+      }
+
+      m_outputStride[0] = 1;
+      for (int i = 1; i < NumDims; ++i) {
+        m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1];
+      }
+    } else {
+      for (int i = NumKernelDims - 1; i >= 0; --i) {
+        const Index index = op.indices()[i];
+        const Index input_dim = input_dims[index];
+        const Index kernel_dim = kernel_dims[i];
+        const Index result_dim = input_dim - kernel_dim + 1;
+        m_dimensions[index] = result_dim;
+        if (i < NumKernelDims - 1) {
+          m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1];
+        } else {
+          m_kernelStride[NumKernelDims - 1] = 1;
+        }
+        m_indexStride[i] = m_inputStride[index];
+      }
+
+      m_outputStride[NumDims - 1] = 1;
+      for (int i = NumDims - 2; i >= 0; --i) {
+        m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1];
+      }
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+    m_inputImpl.evalSubExprsIfNeeded(NULL);
+    preloadKernel();
+    return true;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+    m_inputImpl.cleanup();
+    if (m_local_kernel) {
+      m_device.deallocate((void*)m_kernel);
+      m_local_kernel = false;
+    }
+    m_kernel = NULL;
+  }
+
+  void evalTo(typename XprType::Scalar* buffer) {
+    evalSubExprsIfNeeded(NULL);
+    for (int i = 0; i < dimensions().TotalSize(); ++i) {
+      buffer[i] += coeff(i);
+    }
+    cleanup();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+  {
+    CoeffReturnType result = CoeffReturnType(0);
+    convolve(firstInput(index), 0, NumKernelDims-1, result);
+    return result;
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const
+  {
+    Index indices[2] = {index, index+PacketSize-1};
+    Index startInputs[2] = {0, 0};
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = NumDims - 1; i > 0; --i) {
+        const Index idx0 = indices[0] / m_outputStride[i];
+        const Index idx1 = indices[1] / m_outputStride[i];
+        startInputs[0] += idx0 * m_inputStride[i];
+        startInputs[1] += idx1 * m_inputStride[i];
+        indices[0] -= idx0 * m_outputStride[i];
+        indices[1] -= idx1 * m_outputStride[i];
+      }
+    } else {
+      for (int i = 0; i < NumDims - 1; ++i) {
+        const Index idx0 = indices[0] / m_outputStride[i];
+        const Index idx1 = indices[1] / m_outputStride[i];
+        startInputs[0] += idx0 * m_inputStride[i];
+        startInputs[1] += idx1 * m_inputStride[i];
+        indices[0] -= idx0 * m_outputStride[i];
+        indices[1] -= idx1 * m_outputStride[i];
+      }
+    }
+    startInputs[0] += indices[0];
+    startInputs[1] += indices[1];
+
+    if (startInputs[1]-startInputs[0] == PacketSize-1) {
+      PacketReturnType result = internal::pset1<PacketReturnType>(0);
+      convolvePacket(startInputs[0], 0, NumKernelDims-1, result);
+      return result;
+    } else {
+      EIGEN_ALIGN_MAX Scalar data[PacketSize];
+      data[0] = Scalar(0);
+      convolve(startInputs[0], 0, NumKernelDims-1, data[0]);
+      for (int i = 1; i < PacketSize-1; ++i) {
+        data[i] = Scalar(0);
+        convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]);
+      }
+      data[PacketSize-1] = Scalar(0);
+      convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]);
+      return internal::pload<PacketReturnType>(data);
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+  costPerCoeff(bool vectorized) const {
+    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
+    // We ignore the use of fused multiply-add.
+    const double convolve_compute_cost =
+        TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
+    const double firstIndex_compute_cost =
+        NumDims *
+        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
+         TensorOpCost::DivCost<Index>());
+    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
+           kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
+                          m_kernelImpl.costPerCoeff(vectorized) +
+                          TensorOpCost(0, 0, convolve_compute_cost, vectorized,
+                                       PacketSize));
+  }
+
+  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+
+ private:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
+    Index startInput = 0;
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = NumDims - 1; i > 0; --i) {
+        const Index idx = index / m_outputStride[i];
+        startInput += idx * m_inputStride[i];
+        index -= idx * m_outputStride[i];
+      }
+    } else {
+      for (int i = 0; i < NumDims - 1; ++i) {
+        const Index idx = index / m_outputStride[i];
+        startInput += idx * m_inputStride[i];
+        index -= idx * m_outputStride[i];
+      }
+    }
+    startInput += index;
+    return startInput;
+  }
+
+  EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const {
+    for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
+      const Index input = firstIndex + j * m_indexStride[DimIndex];
+      const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
+      if (DimIndex > 0) {
+        convolve(input, kernel, DimIndex-1, accum);
+      } else {
+        accum += m_inputImpl.coeff(input) * m_kernel[kernel];
+      }
+    }
+  }
+
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const {
+    for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
+      const Index input = firstIndex + j * m_indexStride[DimIndex];
+      const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
+      if (DimIndex > 0) {
+        convolvePacket(input, kernel, DimIndex-1, accum);
+      } else {
+        accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input), internal::pset1<Packet>(m_kernel[kernel]), accum);
+      }
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
+    // Don't make a local copy of the kernel unless we have to (i.e. it's an
+    // expression that needs to be evaluated)
+    const Scalar* in_place = m_kernelImpl.data();
+    if (in_place) {
+      m_kernel = in_place;
+      m_local_kernel = false;
+    } else {
+      size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
+      Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
+      typedef TensorEvalToOp<const KernelArgType> EvalTo;
+      EvalTo evalToTmp(local, m_kernelArg);
+      const bool PacketAccess = internal::IsVectorizable<Device, KernelArgType>::value;
+      internal::TensorExecutor<const EvalTo, Device, PacketAccess>::run(evalToTmp, m_device);
+
+      m_kernel = local;
+      m_local_kernel = true;
+    }
+  }
+
+  array<Index, NumDims> m_inputStride;
+  array<Index, NumDims> m_outputStride;
+
+  array<Index, NumKernelDims> m_indexStride;
+  array<Index, NumKernelDims> m_kernelStride;
+  TensorEvaluator<InputArgType, Device> m_inputImpl;
+  TensorEvaluator<KernelArgType, Device> m_kernelImpl;
+  Dimensions m_dimensions;
+
+  KernelArgType m_kernelArg;
+  const Scalar* m_kernel;
+  bool m_local_kernel;
+  const Device& m_device;
+};
+
+
+
+
+// Use an optimized implementation of the evaluation code for GPUs whenever possible.
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+
+template <int StaticKernelSize>
+struct GetKernelSize {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const {
+    return StaticKernelSize;
+  }
+};
+template <>
+struct GetKernelSize<Dynamic> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const {
+    return kernelSize;
+  }
+};
+
+template <typename InputEvaluator, typename Index, typename InputDims,
+          int StaticKernelSize>
+__global__ void EigenConvolutionKernel1D(
+    InputEvaluator eval,
+    const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout>
+        indexMapper,
+    const float* __restrict kernel, const int numPlanes, const int numX,
+    const int maxX, const int kernelSize, float* buffer) {
+  extern __shared__ float s[];
+
+  const int first_x = blockIdx.x * maxX;
+  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
+  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
+  const int num_x_output = last_x - first_x + 1;
+
+  const int first_plane = blockIdx.y * blockDim.y;
+  const int plane_stride = blockDim.y * gridDim.y;
+
+  for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
+    // Load inputs to shared memory
+    const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
+    const int plane_kernel_offset = threadIdx.y * num_x_input;
+    #pragma unroll
+    for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
+      const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x);
+      s[i + plane_kernel_offset] = eval.coeff(tensor_index);
+    }
+
+    __syncthreads();
+
+    // Compute the convolution
+    const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
+
+    #pragma unroll
+    for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
+      const int kernel_offset = plane_kernel_offset + i;
+      float result = 0.0f;
+      #pragma unroll
+      for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
+        result += s[k + kernel_offset] * kernel[k];
+      }
+      const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x);
+      buffer[tensor_index] = result;
+    }
+    __syncthreads();
+  }
+};
+
+template <typename InputEvaluator, typename Index, typename InputDims,
+          int StaticKernelSizeX, int StaticKernelSizeY>
+__global__ void EigenConvolutionKernel2D(
+    InputEvaluator eval,
+    const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout>
+        indexMapper,
+    const float* __restrict kernel, const int numPlanes, const int numX,
+    const int maxX, const int numY, const int maxY, const int kernelSizeX,
+    const int kernelSizeY, float* buffer) {
+  extern __shared__ float s[];
+
+  const int first_x = blockIdx.x * maxX;
+  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
+  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX);
+  const int num_x_output = last_x - first_x + 1;
+
+  const int first_y = blockIdx.y * maxY;
+  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
+  const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY);
+  const int num_y_output = last_y - first_y + 1;
+
+  const int first_plane = blockIdx.z * blockDim.z;
+  const int plane_stride = blockDim.z * gridDim.z;
+
+  for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {
+
+    const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
+    const int plane_kernel_offset = threadIdx.z * num_y_input;
+
+    // Load inputs to shared memory
+    #pragma unroll
+    for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
+      const int input_offset = num_x_input * (j + plane_kernel_offset);
+      #pragma unroll
+      for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
+        const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y);
+        s[i + input_offset] = eval.coeff(tensor_index);
+      }
+    }
+
+    __syncthreads();
+
+    // Convolution
+    const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
+
+    #pragma unroll
+    for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
+      #pragma unroll
+      for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
+        float result = 0.0f;
+        #pragma unroll
+        for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) {
+          const int kernel_offset = kernelSizeX * l;
+          const int input_offset = i + num_x_input * (j + l + plane_kernel_offset);
+          #pragma unroll
+          for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) {
+            result += s[k + input_offset] * kernel[k + kernel_offset];
+          }
+        }
+        const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y);
+        buffer[tensor_index] = result;
+      }
+    }
+
+    __syncthreads();
+  }
+};
+
+template <typename InputEvaluator, typename Index, typename InputDims>
+__global__ void EigenConvolutionKernel3D(
+    InputEvaluator eval,
+    const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout>
+        indexMapper,
+    const float* __restrict kernel, const size_t numPlanes, const size_t numX,
+    const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ,
+    const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
+    const size_t kernelSizeZ, float* buffer) {
+  extern __shared__ float s[];
+
+  // Load inputs to shared memory
+  const int first_x = blockIdx.x * maxX;
+  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
+  const int num_x_input = last_x - first_x + kernelSizeX;
+
+  const int first_y = blockIdx.y * maxY;
+  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
+  const int num_y_input = last_y - first_y + kernelSizeY;
+
+  const int first_z = blockIdx.z * maxZ;
+  const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
+  const int num_z_input = last_z - first_z + kernelSizeZ;
+
+  for (int p = 0; p < numPlanes; ++p) {
+
+    const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
+    const int plane_kernel_offset = 0;
+
+    for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
+      for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
+        for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
+          const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
+          s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
+        }
+      }
+    }
+
+    __syncthreads();
+
+    // Convolution
+    const int num_z_output = last_z - first_z + 1;
+    const int num_y_output = last_y - first_y + 1;
+    const int num_x_output = last_x - first_x + 1;
+    const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
+
+    for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
+      for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
+        for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
+          float result = 0.0f;
+          for (int n = 0; n < kernelSizeZ; ++n) {
+            for (int m = 0; m < kernelSizeY; ++m) {
+              for (int l = 0; l < kernelSizeX; ++l) {
+                result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)];
+              }
+            }
+          }
+          const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z);
+          buffer[tensor_index] = result;
+        }
+      }
+    }
+    __syncthreads();
+  }
+};
+
+
+
+template<typename Indices, typename InputArgType, typename KernelArgType>
+struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice>
+{
+  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
+
+  static const int NumDims =  internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value;
+  static const int NumKernelDims = internal::array_size<Indices>::value;
+  typedef typename XprType::Index Index;
+  typedef DSizes<Index, NumDims> Dimensions;
+  typedef typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions KernelDimensions;
+
+  enum {
+    IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
+    PacketAccess = false,
+    Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device)
+      : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
+  {
+    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+    const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
+    const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
+
+    m_dimensions = m_inputImpl.dimensions();
+    for (int i = 0; i < NumKernelDims; ++i) {
+      const Index index = op.indices()[i];
+      const Index input_dim = input_dims[index];
+      const Index kernel_dim = kernel_dims[i];
+      const Index result_dim = input_dim - kernel_dim + 1;
+      m_dimensions[index] = result_dim;
+    }
+  }
+
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
+  typedef typename InputArgType::Scalar Scalar;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+
+  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+    preloadKernel();
+    m_inputImpl.evalSubExprsIfNeeded(NULL);
+    if (data) {
+      executeEval(data);
+      return false;
+    } else {
+      m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
+      executeEval(m_buf);
+      return true;
+    }
+  }
+
+  EIGEN_STRONG_INLINE void cleanup() {
+    m_inputImpl.cleanup();
+    if (m_buf) {
+      m_device.deallocate(m_buf);
+      m_buf = NULL;
+    }
+    if (m_local_kernel) {
+      m_device.deallocate((void*)m_kernel);
+      m_local_kernel = false;
+    }
+    m_kernel = NULL;
+  }
+
+  EIGEN_STRONG_INLINE void preloadKernel() {
+    // Don't make a local copy of the kernel unless we have to (i.e. it's an
+    // expression that needs to be evaluated)
+    const Scalar* in_place = m_kernelImpl.data();
+    if (in_place) {
+      m_kernel = in_place;
+      m_local_kernel = false;
+    } else {
+      size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
+      Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
+      typedef TensorEvalToOp<const KernelArgType> EvalTo;
+      EvalTo evalToTmp(local, m_kernelArg);
+      const bool PacketAccess = internal::IsVectorizable<GpuDevice, KernelArgType>::value;
+      internal::TensorExecutor<const EvalTo, GpuDevice, PacketAccess>::run(evalToTmp, m_device);
+
+      m_kernel = local;
+      m_local_kernel = true;
+    }
+  }
+
+  static unsigned int ceil(unsigned int num, unsigned int denom) {
+    const unsigned int rounded_toward_zero = num / denom;
+    if (num > rounded_toward_zero * denom) {
+      return rounded_toward_zero + 1;
+    }
+    return rounded_toward_zero;
+  }
+
+  void executeEval(Scalar* data) const {
+    typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;
+
+    const int maxSharedMem = m_device.sharedMemPerBlock();
+    const int maxThreadsPerBlock = m_device.maxCudaThreadsPerBlock();
+    const int maxBlocksPerProcessor = m_device.maxCudaThreadsPerMultiProcessor() / maxThreadsPerBlock;
+    const int numMultiProcessors = m_device.getNumCudaMultiProcessors();
+    const int warpSize = 32;
+
+    switch (NumKernelDims) {
+      case 1: {
+        const int kernel_size = m_kernelImpl.dimensions().TotalSize();
+
+        const int numX = dimensions()[m_indices[0]];
+        const int numP = dimensions().TotalSize() / numX;
+        int maxX;
+        dim3 block_size;
+
+        const int single_stride_dim =
+            static_cast<int>(Layout) == static_cast<int>(ColMajor)
+                ? 0
+                : m_inputImpl.dimensions().rank() - 1;
+        if (m_indices[0] == single_stride_dim) {
+          // Maximum the reuse
+          const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
+          maxX = numext::mini<int>(inner_dim, numX);
+          const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
+          block_size.x = numext::mini(maxThreadsPerBlock, maxX);
+          block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
+        }
+        else {
+          // Read as much as possible alongside the inner most dimension, that is the plane
+          const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar));
+          const int maxP = numext::mini<int>(inner_dim, numP);
+          maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);
+
+          block_size.x = numext::mini(warpSize, maxX);
+          block_size.y = numext::mini<int>(maxThreadsPerBlock/block_size.x, maxP);
+        }
+
+        const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
+        assert(shared_mem <= maxSharedMem);
+
+        const int num_x_blocks = ceil(numX, maxX);
+        const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
+        const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);
+
+        dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y)));
+
+
+        //cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
+
+        const array<Index, 1> indices(m_indices[0]);
+        const array<Index, 1> kernel_dims(m_kernelImpl.dimensions()[0]);
+        internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(
+            m_inputImpl.dimensions(), kernel_dims, indices);
+        switch(kernel_size) {
+          case 4: {
+            LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
+            break;
+          }
+          case 7: {
+            LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
+            break;
+          }
+          default: {
+            LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
+          }
+        }
+        break;
+      }
+
+      case 2: {
+        const int idxX =
+            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
+        const int idxY =
+            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
+        const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
+        const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
+
+        const int numX = dimensions()[m_indices[idxX]];
+        const int numY = dimensions()[m_indices[idxY]];
+        const int numP = dimensions().TotalSize() / (numX*numY);
+
+        const float scaling_factor = sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x));
+
+        // Snap maxX to warp size
+        int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32;
+        const int maxX = numext::mini<int>(inner_dim, numX);
+        const int maxY = numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
+        const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);
+
+        dim3 block_size;
+        block_size.x = numext::mini(1024, maxX);
+        block_size.y = numext::mini<int>(1024/block_size.x, maxY);
+        block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxP);
+
+        const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
+        assert(shared_mem <= maxSharedMem);
+
+        const int num_x_blocks = ceil(numX, maxX);
+        const int num_y_blocks = ceil(numY, maxY);
+        const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
+        const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);
+
+        dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z)));
+
+
+        //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y  << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
+
+        const array<Index, 2> indices(m_indices[idxX], m_indices[idxY]);
+        const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[idxX],
+                                          m_kernelImpl.dimensions()[idxY]);
+        internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(
+            m_inputImpl.dimensions(), kernel_dims, indices);
+        switch (kernel_size_x) {
+          case 4: {
+            switch (kernel_size_y) {
+              case 7: {
+                LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
+                break;
+              }
+              default: {
+                LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
+                break;
+              }
+            }
+            break;
+          }
+          case 7: {
+            switch (kernel_size_y) {
+              case 4: {
+                LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
+                break;
+              }
+              default: {
+                LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
+                break;
+              }
+            }
+            break;
+          }
+          default: {
+            LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
+            break;
+          }
+        }
+        break;
+      }
+
+      case 3: {
+        const int idxX =
+            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
+        const int idxY =
+            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;
+        const int idxZ =
+            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;
+
+        const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
+        const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
+        const int kernel_size_z = m_kernelImpl.dimensions()[idxZ];
+
+        const int numX = dimensions()[m_indices[idxX]];
+        const int numY = dimensions()[m_indices[idxY]];
+        const int numZ = dimensions()[m_indices[idxZ]];
+        const int numP = dimensions().TotalSize() / (numX*numY*numZ);
+
+        const int maxX = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX));
+        const int maxY = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY));
+        const int maxZ = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ));
+
+        dim3 block_size;
+        block_size.x = numext::mini(32, maxX);
+        block_size.y = numext::mini(32, maxY);
+        block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxZ);
+        dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));
+
+        const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
+        assert(shared_mem <= maxSharedMem);
+
+        //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y  << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z  << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
+        const array<Index, 3> indices(m_indices[idxX], m_indices[idxY],
+                                      m_indices[idxZ]);
+        const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[idxX],
+                                          m_kernelImpl.dimensions()[idxY],
+                                          m_kernelImpl.dimensions()[idxZ]);
+        internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(
+            m_inputImpl.dimensions(), kernel_dims, indices);
+
+        LAUNCH_CUDA_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
+        break;
+      }
+
+      default: {
+        EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
+      }
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+  {
+    eigen_assert(m_buf);
+    eigen_assert(index < m_dimensions.TotalSize());
+    return m_buf[index];
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const
+  {
+    eigen_assert(m_buf);
+    eigen_assert(index < m_dimensions.TotalSize());
+    return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+  costPerCoeff(bool vectorized) const {
+    // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
+    // model.
+    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
+    // We ignore the use of fused multiply-add.
+    const double convolve_compute_cost =
+        TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
+    const double firstIndex_compute_cost =
+        NumDims *
+        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
+         TensorOpCost::DivCost<Index>());
+    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
+           kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
+                          m_kernelImpl.costPerCoeff(vectorized) +
+                          TensorOpCost(0, 0, convolve_compute_cost, vectorized,
+                                       PacketSize));
+  }
+
+ private:
+  // No assignment (copies are needed by the kernels)
+  TensorEvaluator& operator = (const TensorEvaluator&);
+
+  TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;
+  TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;
+  KernelArgType m_kernelArg;
+  Indices m_indices;
+  Dimensions m_dimensions;
+  Scalar* m_buf;
+  const Scalar* m_kernel;
+  bool m_local_kernel;
+
+  const GpuDevice& m_device;
+};
+#endif
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
new file mode 100644
index 000000000..83c449cf1
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
@@ -0,0 +1,212 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Rasmus Munk Larsen <rmlarsen@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
+#define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
+
+namespace Eigen {
+
+/** \class TensorEvaluator
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief A cost model used to limit the number of threads used for evaluating
+  * tensor expression.
+  *
+  */
+
+// Class storing the cost of evaluating a tensor expression in terms of the
+// estimated number of operand bytes loads, bytes stored, and compute cycles.
+class TensorOpCost {
+ public:
+  // TODO(rmlarsen): Fix the scalar op costs in Eigen proper. Even a simple
+  // model based on minimal reciprocal throughput numbers from Intel or
+  // Agner Fog's tables would be better than what is there now.
+  template <typename ArgType>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int MulCost() {
+    return internal::functor_traits<
+        internal::scalar_product_op<ArgType, ArgType> >::Cost;
+  }
+  template <typename ArgType>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int AddCost() {
+    return internal::functor_traits<internal::scalar_sum_op<ArgType> >::Cost;
+  }
+  template <typename ArgType>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int DivCost() {
+    return internal::functor_traits<
+        internal::scalar_quotient_op<ArgType, ArgType> >::Cost;
+  }
+  template <typename ArgType>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int ModCost() {
+    return internal::functor_traits<internal::scalar_mod_op<ArgType> >::Cost;
+  }
+  template <typename SrcType, typename TargetType>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int CastCost() {
+    return internal::functor_traits<
+        internal::scalar_cast_op<SrcType, TargetType> >::Cost;
+  }
+
+  EIGEN_DEVICE_FUNC
+  TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {}
+  EIGEN_DEVICE_FUNC
+  TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles)
+      : bytes_loaded_(bytes_loaded),
+        bytes_stored_(bytes_stored),
+        compute_cycles_(compute_cycles) {}
+
+  EIGEN_DEVICE_FUNC
+  TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles,
+               bool vectorized, double packet_size)
+      : bytes_loaded_(bytes_loaded),
+        bytes_stored_(bytes_stored),
+        compute_cycles_(vectorized ? compute_cycles / packet_size
+                                   : compute_cycles) {
+    eigen_assert(bytes_loaded >= 0 && (numext::isfinite)(bytes_loaded));
+    eigen_assert(bytes_stored >= 0 && (numext::isfinite)(bytes_stored));
+    eigen_assert(compute_cycles >= 0 && (numext::isfinite)(compute_cycles));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const {
+    return bytes_loaded_;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_stored() const {
+    return bytes_stored_;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double compute_cycles() const {
+    return compute_cycles_;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double total_cost(
+      double load_cost, double store_cost, double compute_cost) const {
+    return load_cost * bytes_loaded_ + store_cost * bytes_stored_ +
+           compute_cost * compute_cycles_;
+  }
+
+  // Drop memory access component. Intended for cases when memory accesses are
+  // sequential or are completely masked by computations.
+  EIGEN_DEVICE_FUNC void dropMemoryCost() {
+    bytes_loaded_ = 0;
+    bytes_stored_ = 0;
+  }
+
+  // TODO(rmlarsen): Define min in terms of total cost, not elementwise.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMin(
+      const TensorOpCost& rhs) const {
+    double bytes_loaded = numext::mini(bytes_loaded_, rhs.bytes_loaded());
+    double bytes_stored = numext::mini(bytes_stored_, rhs.bytes_stored());
+    double compute_cycles = numext::mini(compute_cycles_, rhs.compute_cycles());
+    return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
+  }
+
+  // TODO(rmlarsen): Define max in terms of total cost, not elementwise.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMax(
+      const TensorOpCost& rhs) const {
+    double bytes_loaded = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
+    double bytes_stored = numext::maxi(bytes_stored_, rhs.bytes_stored());
+    double compute_cycles = numext::maxi(compute_cycles_, rhs.compute_cycles());
+    return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=(
+      const TensorOpCost& rhs) {
+    bytes_loaded_ += rhs.bytes_loaded();
+    bytes_stored_ += rhs.bytes_stored();
+    compute_cycles_ += rhs.compute_cycles();
+    return *this;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator*=(double rhs) {
+    bytes_loaded_ *= rhs;
+    bytes_stored_ *= rhs;
+    compute_cycles_ *= rhs;
+    return *this;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator+(
+      TensorOpCost lhs, const TensorOpCost& rhs) {
+    lhs += rhs;
+    return lhs;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(
+      TensorOpCost lhs, double rhs) {
+    lhs *= rhs;
+    return lhs;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(
+      double lhs, TensorOpCost rhs) {
+    rhs *= lhs;
+    return rhs;
+  }
+
+  friend std::ostream& operator<<(std::ostream& os, const TensorOpCost& tc) {
+    return os << "[bytes_loaded = " << tc.bytes_loaded()
+              << ", bytes_stored = " << tc.bytes_stored()
+              << ", compute_cycles = " << tc.compute_cycles() << "]";
+  }
+
+ private:
+  double bytes_loaded_;
+  double bytes_stored_;
+  double compute_cycles_;
+};
+
+// TODO(rmlarsen): Implement a policy that chooses an "optimal" number of theads
+// in [1:max_threads] instead of just switching multi-threading off for small
+// work units.
+template <typename Device>
+class TensorCostModel {
+ public:
+  // Scaling from Eigen compute cost to device cycles.
+  static const int kDeviceCyclesPerComputeCycle = 1;
+
+ // Costs in device cycles.
+  static const int kStartupCycles = 100000;
+  static const int kPerThreadCycles = 100000;
+  static const int kTaskSize = 40000;
+
+  // Returns the number of threads in [1:max_threads] to use for
+  // evaluating an expression with the given output size and cost per
+  // coefficient.
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads(
+      double output_size, const TensorOpCost& cost_per_coeff, int max_threads) {
+    double cost = totalCost(output_size, cost_per_coeff);
+    int threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
+    return numext::mini(max_threads, numext::maxi(1, threads));
+  }
+
+  // taskSize assesses parallel task size.
+  // Value of 1.0 means ideal parallel task size. Values < 1.0 mean that task
+  // granularity needs to be increased to mitigate parallelization overheads.
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double taskSize(
+      double output_size, const TensorOpCost& cost_per_coeff) {
+    return totalCost(output_size, cost_per_coeff) / kTaskSize;
+  }
+
+ private:
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost(
+      double output_size, const TensorOpCost& cost_per_coeff) {
+    // Cost of memory fetches from L2 cache. 64 is typical cache line size.
+    // 11 is L2 cache latency on Haswell.
+    // We don't know whether data is in L1, L2 or L3. But we are most interested
+    // in single-threaded computational time around 100us-10ms (smaller time
+    // is too small for parallelization, larger time is not intersting
+    // either because we are probably using all available threads already).
+    // And for the target time range, L2 seems to be what matters. Data set
+    // fitting into L1 is too small to take noticeable time. Data set fitting
+    // only into L3 presumably will take more than 10ms to load and process.
+    const double kLoadCycles = 1.0 / 64 * 11;
+    const double kStoreCycles = 1.0 / 64 * 11;
+    // Scaling from Eigen compute cost to device cycles.
+    return output_size *
+        cost_per_coeff.total_cost(kLoadCycles, kStoreCycles,
+                                  kDeviceCyclesPerComputeCycle);
+  }
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
new file mode 100644
index 000000000..e020d076f
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
@@ -0,0 +1,313 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H
+
+namespace Eigen {
+
+/** \class TensorCustomUnaryOp
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief Tensor custom class.
+  *
+  *
+  */
+namespace internal {
+template<typename CustomUnaryFunc, typename XprType>
+struct traits<TensorCustomUnaryOp<CustomUnaryFunc, XprType> >
+{
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::StorageKind StorageKind;
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Nested Nested;
+  typedef typename remove_reference<Nested>::type _Nested;
+  static const int NumDimensions = traits<XprType>::NumDimensions;
+  static const int Layout = traits<XprType>::Layout;
+};
+
+template<typename CustomUnaryFunc, typename XprType>
+struct eval<TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Eigen::Dense>
+{
+  typedef const TensorCustomUnaryOp<CustomUnaryFunc, XprType>& type;
+};
+
+template<typename CustomUnaryFunc, typename XprType>
+struct nested<TensorCustomUnaryOp<CustomUnaryFunc, XprType> >
+{
+  typedef TensorCustomUnaryOp<CustomUnaryFunc, XprType> type;
+};
+
+}  // end namespace internal
+
+
+
+template<typename CustomUnaryFunc, typename XprType>
+class TensorCustomUnaryOp : public TensorBase<TensorCustomUnaryOp<CustomUnaryFunc, XprType>, ReadOnlyAccessors>
+{
+  public:
+  typedef typename internal::traits<TensorCustomUnaryOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename internal::nested<TensorCustomUnaryOp>::type Nested;
+  typedef typename internal::traits<TensorCustomUnaryOp>::StorageKind StorageKind;
+  typedef typename internal::traits<TensorCustomUnaryOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCustomUnaryOp(const XprType& expr, const CustomUnaryFunc& func)
+      : m_expr(expr), m_func(func) {}
+
+  EIGEN_DEVICE_FUNC
+  const CustomUnaryFunc& func() const { return m_func; }
+
+  EIGEN_DEVICE_FUNC
+  const typename internal::remove_all<typename XprType::Nested>::type&
+  expression() const { return m_expr; }
+
+  protected:
+    typename XprType::Nested m_expr;
+    const CustomUnaryFunc m_func;
+};
+
+
+// Eval as rvalue
+template<typename CustomUnaryFunc, typename XprType, typename Device>
+struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Device>
+{
+  typedef TensorCustomUnaryOp<CustomUnaryFunc, XprType> ArgType;
+  typedef typename internal::traits<ArgType>::Index Index;
+  static const int NumDims = internal::traits<ArgType>::NumDimensions;
+  typedef DSizes<Index, NumDims> Dimensions;
+  typedef typename internal::remove_const<typename ArgType::Scalar>::type Scalar;
+  typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+
+  enum {
+    IsAligned = false,
+    PacketAccess = (internal::packet_traits<Scalar>::size > 1),
+    BlockAccess = false,
+    Layout = TensorEvaluator<XprType, Device>::Layout,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device)
+      : m_op(op), m_device(device), m_result(NULL)
+  {
+    m_dimensions = op.func().dimensions(op.expression());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
+    if (data) {
+      evalTo(data);
+      return false;
+    } else {
+      m_result = static_cast<CoeffReturnType*>(
+          m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)));
+      evalTo(m_result);
+      return true;
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+    if (m_result != NULL) {
+      m_device.deallocate(m_result);
+      m_result = NULL;
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    return m_result[index];
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
+    return internal::ploadt<PacketReturnType, LoadMode>(m_result + index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    // TODO(rmlarsen): Extend CustomOp API to return its cost estimate.
+    return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
+  }
+
+  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; }
+
+ protected:
+  EIGEN_DEVICE_FUNC void evalTo(Scalar* data) {
+    TensorMap<Tensor<CoeffReturnType, NumDims, Layout, Index> > result(
+        data, m_dimensions);
+    m_op.func().eval(m_op.expression(), result, m_device);
+  }
+
+  Dimensions m_dimensions;
+  const ArgType m_op;
+  const Device& m_device;
+  CoeffReturnType* m_result;
+};
+
+
+
+/** \class TensorCustomBinaryOp
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief Tensor custom class.
+  *
+  *
+  */
+namespace internal {
+template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType>
+struct traits<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> >
+{
+  typedef typename internal::promote_storage_type<typename LhsXprType::Scalar,
+                                                  typename RhsXprType::Scalar>::ret Scalar;
+  typedef typename internal::promote_storage_type<typename LhsXprType::CoeffReturnType,
+                                                  typename RhsXprType::CoeffReturnType>::ret CoeffReturnType;
+  typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind,
+                                        typename traits<RhsXprType>::StorageKind>::ret StorageKind;
+  typedef typename promote_index_type<typename traits<LhsXprType>::Index,
+                                      typename traits<RhsXprType>::Index>::type Index;
+  typedef typename LhsXprType::Nested LhsNested;
+  typedef typename RhsXprType::Nested RhsNested;
+  typedef typename remove_reference<LhsNested>::type _LhsNested;
+  typedef typename remove_reference<RhsNested>::type _RhsNested;
+  static const int NumDimensions = traits<LhsXprType>::NumDimensions;
+  static const int Layout = traits<LhsXprType>::Layout;
+};
+
+template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType>
+struct eval<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, Eigen::Dense>
+{
+  typedef const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>& type;
+};
+
+template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType>
+struct nested<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> >
+{
+  typedef TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> type;
+};
+
+}  // end namespace internal
+
+
+
+template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType>
+class TensorCustomBinaryOp : public TensorBase<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, ReadOnlyAccessors>
+{
+  public:
+  typedef typename internal::traits<TensorCustomBinaryOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename internal::traits<TensorCustomBinaryOp>::CoeffReturnType CoeffReturnType;
+  typedef typename internal::nested<TensorCustomBinaryOp>::type Nested;
+  typedef typename internal::traits<TensorCustomBinaryOp>::StorageKind StorageKind;
+  typedef typename internal::traits<TensorCustomBinaryOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCustomBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, const CustomBinaryFunc& func)
+
+      : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_func(func) {}
+
+  EIGEN_DEVICE_FUNC
+  const CustomBinaryFunc& func() const { return m_func; }
+
+  EIGEN_DEVICE_FUNC
+  const typename internal::remove_all<typename LhsXprType::Nested>::type&
+  lhsExpression() const { return m_lhs_xpr; }
+
+  EIGEN_DEVICE_FUNC
+  const typename internal::remove_all<typename RhsXprType::Nested>::type&
+  rhsExpression() const { return m_rhs_xpr; }
+
+  protected:
+    typename LhsXprType::Nested m_lhs_xpr;
+    typename RhsXprType::Nested m_rhs_xpr;
+    const CustomBinaryFunc m_func;
+};
+
+
+// Eval as rvalue
+template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType, typename Device>
+struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, Device>
+{
+  typedef TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> XprType;
+  typedef typename internal::traits<XprType>::Index Index;
+  static const int NumDims = internal::traits<XprType>::NumDimensions;
+  typedef DSizes<Index, NumDims> Dimensions;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+
+  enum {
+    IsAligned = false,
+    PacketAccess = (internal::packet_traits<Scalar>::size > 1),
+    BlockAccess = false,
+    Layout = TensorEvaluator<LhsXprType, Device>::Layout,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_op(op), m_device(device), m_result(NULL)
+  {
+    m_dimensions = op.func().dimensions(op.lhsExpression(), op.rhsExpression());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
+    if (data) {
+      evalTo(data);
+      return false;
+    } else {
+      m_result = static_cast<Scalar *>(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)));
+      evalTo(m_result);
+      return true;
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+    if (m_result != NULL) {
+      m_device.deallocate(m_result);
+      m_result = NULL;
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    return m_result[index];
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
+    return internal::ploadt<PacketReturnType, LoadMode>(m_result + index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    // TODO(rmlarsen): Extend CustomOp API to return its cost estimate.
+    return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
+  }
+
+  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; }
+
+ protected:
+  EIGEN_DEVICE_FUNC void evalTo(Scalar* data) {
+    TensorMap<Tensor<Scalar, NumDims, Layout> > result(data, m_dimensions);
+    m_op.func().eval(m_op.lhsExpression(), m_op.rhsExpression(), result, m_device);
+  }
+
+  Dimensions m_dimensions;
+  const XprType m_op;
+  const Device& m_device;
+  CoeffReturnType* m_result;
+};
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
new file mode 100644
index 000000000..29e50a3b2
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
@@ -0,0 +1,68 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H
+#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H
+
+namespace Eigen {
+
+/** \class TensorDevice
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief Pseudo expression providing an operator = that will evaluate its argument
+  * on the specified computing 'device' (GPU, thread pool, ...)
+  *
+  * Example:
+  *    C.device(EIGEN_GPU) = A + B;
+  *
+  * Todo: operator *= and /=.
+  */
+
+template <typename ExpressionType, typename DeviceType> class TensorDevice {
+  public:
+    TensorDevice(const DeviceType& device, ExpressionType& expression) : m_device(device), m_expression(expression) {}
+
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) {
+      typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign;
+      Assign assign(m_expression, other);
+      internal::TensorExecutor<const Assign, DeviceType>::run(assign, m_device);
+      return *this;
+    }
+
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) {
+      typedef typename OtherDerived::Scalar Scalar;
+      typedef TensorCwiseBinaryOp<internal::scalar_sum_op<Scalar>, const ExpressionType, const OtherDerived> Sum;
+      Sum sum(m_expression, other);
+      typedef TensorAssignOp<ExpressionType, const Sum> Assign;
+      Assign assign(m_expression, sum);
+      internal::TensorExecutor<const Assign, DeviceType>::run(assign, m_device);
+      return *this;
+    }
+
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE TensorDevice& operator-=(const OtherDerived& other) {
+      typedef typename OtherDerived::Scalar Scalar;
+      typedef TensorCwiseBinaryOp<internal::scalar_difference_op<Scalar>, const ExpressionType, const OtherDerived> Difference;
+      Difference difference(m_expression, other);
+      typedef TensorAssignOp<ExpressionType, const Difference> Assign;
+      Assign assign(m_expression, difference);
+      internal::TensorExecutor<const Assign, DeviceType>::run(assign, m_device);
+      return *this;
+    }
+
+  protected:
+    const DeviceType& m_device;
+    ExpressionType& m_expression;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
new file mode 100644
index 000000000..4f5767bc7
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -0,0 +1,337 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H)
+#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H
+
+namespace Eigen {
+
+static const int kCudaScratchSize = 1024;
+
+// This defines an interface that GPUDevice can take to use
+// CUDA streams underneath.
+class StreamInterface {
+ public:
+  virtual ~StreamInterface() {}
+
+  virtual const cudaStream_t& stream() const = 0;
+  virtual const cudaDeviceProp& deviceProperties() const = 0;
+
+  // Allocate memory on the actual device where the computation will run
+  virtual void* allocate(size_t num_bytes) const = 0;
+  virtual void deallocate(void* buffer) const = 0;
+
+  // Return a scratchpad buffer of size 1k
+  virtual void* scratchpad() const = 0;
+
+  // Return a semaphore. The semaphore is initially initialized to 0, and
+  // each kernel using it is responsible for resetting to 0 upon completion
+  // to maintain the invariant that the semaphore is always equal to 0 upon
+  // each kernel start.
+  virtual unsigned int* semaphore() const = 0;
+};
+
+static cudaDeviceProp* m_deviceProperties;
+static bool m_devicePropInitialized = false;
+
+static void initializeDeviceProp() {
+  if (!m_devicePropInitialized) {
+    // Attempts to ensure proper behavior in the case of multiple threads
+    // calling this function simultaneously. This would be trivial to
+    // implement if we could use std::mutex, but unfortunately mutex don't
+    // compile with nvcc, so we resort to atomics and thread fences instead.
+    // Note that if the caller uses a compiler that doesn't support c++11 we
+    // can't ensure that the initialization is thread safe.
+#if __cplusplus >= 201103L
+    static std::atomic<bool> first(true);
+    if (first.exchange(false)) {
+#else
+    static bool first = true;
+    if (first) {
+      first = false;
+#endif
+      // We're the first thread to reach this point.
+      int num_devices;
+      cudaError_t status = cudaGetDeviceCount(&num_devices);
+      if (status != cudaSuccess) {
+        std::cerr << "Failed to get the number of CUDA devices: "
+                  << cudaGetErrorString(status)
+                  << std::endl;
+        assert(status == cudaSuccess);
+      }
+      m_deviceProperties = new cudaDeviceProp[num_devices];
+      for (int i = 0; i < num_devices; ++i) {
+        status = cudaGetDeviceProperties(&m_deviceProperties[i], i);
+        if (status != cudaSuccess) {
+          std::cerr << "Failed to initialize CUDA device #"
+                    << i
+                    << ": "
+                    << cudaGetErrorString(status)
+                    << std::endl;
+          assert(status == cudaSuccess);
+        }
+      }
+
+#if __cplusplus >= 201103L
+      std::atomic_thread_fence(std::memory_order_release);
+#endif
+      m_devicePropInitialized = true;
+    } else {
+      // Wait for the other thread to inititialize the properties.
+      while (!m_devicePropInitialized) {
+#if __cplusplus >= 201103L
+        std::atomic_thread_fence(std::memory_order_acquire);
+#endif
+        sleep(1);
+      }
+    }
+  }
+}
+
+static const cudaStream_t default_stream = cudaStreamDefault;
+
+class CudaStreamDevice : public StreamInterface {
+ public:
+  // Use the default stream on the current device
+  CudaStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
+    cudaGetDevice(&device_);
+    initializeDeviceProp();
+  }
+  // Use the default stream on the specified device
+  CudaStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {
+    initializeDeviceProp();
+  }
+  // Use the specified stream. Note that it's the
+  // caller responsibility to ensure that the stream can run on
+  // the specified device. If no device is specified the code
+  // assumes that the stream is associated to the current gpu device.
+  CudaStreamDevice(const cudaStream_t* stream, int device = -1)
+      : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
+    if (device < 0) {
+      cudaGetDevice(&device_);
+    } else {
+      int num_devices;
+      cudaError_t err = cudaGetDeviceCount(&num_devices);
+      EIGEN_UNUSED_VARIABLE(err)
+      assert(err == cudaSuccess);
+      assert(device < num_devices);
+      device_ = device;
+    }
+    initializeDeviceProp();
+  }
+
+  virtual ~CudaStreamDevice() {
+    if (scratch_) {
+      deallocate(scratch_);
+    }
+  }
+
+  const cudaStream_t& stream() const { return *stream_; }
+  const cudaDeviceProp& deviceProperties() const {
+    return m_deviceProperties[device_];
+  }
+  virtual void* allocate(size_t num_bytes) const {
+    cudaError_t err = cudaSetDevice(device_);
+    EIGEN_UNUSED_VARIABLE(err)
+    assert(err == cudaSuccess);
+    void* result;
+    err = cudaMalloc(&result, num_bytes);
+    assert(err == cudaSuccess);
+    assert(result != NULL);
+    return result;
+  }
+  virtual void deallocate(void* buffer) const {
+    cudaError_t err = cudaSetDevice(device_);
+    EIGEN_UNUSED_VARIABLE(err)
+    assert(err == cudaSuccess);
+    assert(buffer != NULL);
+    err = cudaFree(buffer);
+    assert(err == cudaSuccess);
+  }
+
+  virtual void* scratchpad() const {
+    if (scratch_ == NULL) {
+      scratch_ = allocate(kCudaScratchSize + sizeof(unsigned int));
+    }
+    return scratch_;
+  }
+
+  virtual unsigned int* semaphore() const {
+    if (semaphore_ == NULL) {
+      char* scratch = static_cast<char*>(scratchpad()) + kCudaScratchSize;
+      semaphore_ = reinterpret_cast<unsigned int*>(scratch);
+      cudaError_t err = cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
+      EIGEN_UNUSED_VARIABLE(err)
+      assert(err == cudaSuccess);
+    }
+    return semaphore_;
+  }
+
+ private:
+  const cudaStream_t* stream_;
+  int device_;
+  mutable void* scratch_;
+  mutable unsigned int* semaphore_;
+};
+
+struct GpuDevice {
+  // The StreamInterface is not owned: the caller is
+  // responsible for its initialization and eventual destruction.
+  explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) {
+    eigen_assert(stream);
+  }
+  explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) {
+    eigen_assert(stream);
+  }
+  // TODO(bsteiner): This is an internal API, we should not expose it.
+  EIGEN_STRONG_INLINE const cudaStream_t& stream() const {
+    return stream_->stream();
+  }
+
+  EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
+    return stream_->allocate(num_bytes);
+  }
+
+  EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
+    stream_->deallocate(buffer);
+  }
+
+  EIGEN_STRONG_INLINE void* scratchpad() const {
+    return stream_->scratchpad();
+  }
+
+  EIGEN_STRONG_INLINE unsigned int* semaphore() const {
+    return stream_->semaphore();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
+#ifndef __CUDA_ARCH__
+    cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice,
+                                      stream_->stream());
+    EIGEN_UNUSED_VARIABLE(err)
+    assert(err == cudaSuccess);
+#else
+  eigen_assert(false && "The default device should be used instead to generate kernel code");
+#endif
+  }
+
+  EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
+    cudaError_t err =
+        cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream());
+    EIGEN_UNUSED_VARIABLE(err)
+    assert(err == cudaSuccess);
+  }
+
+  EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
+    cudaError_t err =
+        cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream());
+    EIGEN_UNUSED_VARIABLE(err)
+    assert(err == cudaSuccess);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
+#ifndef __CUDA_ARCH__
+    cudaError_t err = cudaMemsetAsync(buffer, c, n, stream_->stream());
+    EIGEN_UNUSED_VARIABLE(err)
+    assert(err == cudaSuccess);
+#else
+  eigen_assert(false && "The default device should be used instead to generate kernel code");
+#endif
+  }
+
+  EIGEN_STRONG_INLINE size_t numThreads() const {
+    // FIXME
+    return 32;
+  }
+
+  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
+    // FIXME
+    return 48*1024;
+  }
+
+  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
+    // We won't try to take advantage of the l2 cache for the time being, and
+    // there is no l3 cache on cuda devices.
+    return firstLevelCacheSize();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
+#if defined(__CUDACC__) && !defined(__CUDA_ARCH__)
+    cudaError_t err = cudaStreamSynchronize(stream_->stream());
+    if (err != cudaSuccess) {
+      std::cerr << "Error detected in CUDA stream: "
+                << cudaGetErrorString(err)
+                << std::endl;
+      assert(err == cudaSuccess);
+    }
+#else
+    assert(false && "The default device should be used instead to generate kernel code");
+#endif
+  }
+
+  EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const {
+    return stream_->deviceProperties().multiProcessorCount;
+  }
+  EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const {
+    return stream_->deviceProperties().maxThreadsPerBlock;
+  }
+  EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const {
+    return stream_->deviceProperties().maxThreadsPerMultiProcessor;
+  }
+  EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
+    return stream_->deviceProperties().sharedMemPerBlock;
+  }
+  EIGEN_STRONG_INLINE int majorDeviceVersion() const {
+    return stream_->deviceProperties().major;
+  }
+  EIGEN_STRONG_INLINE int minorDeviceVersion() const {
+    return stream_->deviceProperties().minor;
+  }
+
+  EIGEN_STRONG_INLINE int maxBlocks() const {
+    return max_blocks_;
+  }
+
+  // This function checks if the CUDA runtime recorded an error for the
+  // underlying stream device.
+  inline bool ok() const {
+#ifdef __CUDACC__
+    cudaError_t error = cudaStreamQuery(stream_->stream());
+    return (error == cudaSuccess) || (error == cudaErrorNotReady);
+#else
+    return false;
+#endif
+  }
+
+ private:
+  const StreamInterface* stream_;
+  int max_blocks_;
+};
+
+#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)             \
+  (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__);   \
+  assert(cudaGetLastError() == cudaSuccess);
+
+
+// FIXME: Should be device and kernel specific.
+#ifdef __CUDACC__
+static EIGEN_DEVICE_FUNC inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
+#ifndef __CUDA_ARCH__
+  cudaError_t status = cudaDeviceSetSharedMemConfig(config);
+  EIGEN_UNUSED_VARIABLE(status)
+  assert(status == cudaSuccess);
+#else
+  EIGEN_UNUSED_VARIABLE(config)
+#endif
+}
+#endif
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
new file mode 100644
index 000000000..9d141395b
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
@@ -0,0 +1,81 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H
+#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H
+
+
+namespace Eigen {
+
+// Default device for the machine (typically a single cpu core)
+struct DefaultDevice {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
+    return internal::aligned_malloc(num_bytes);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
+    internal::aligned_free(buffer);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
+    ::memcpy(dst, src, n);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
+    memcpy(dst, src, n);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
+    memcpy(dst, src, n);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
+    ::memset(buffer, c, n);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
+#ifndef __CUDA_ARCH__
+    // Running on the host CPU
+    return 1;
+#else
+    // Running on a CUDA device
+    return 32;
+#endif
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
+#ifndef __CUDA_ARCH__
+    // Running on the host CPU
+    return l1CacheSize();
+#else
+    // Running on a CUDA device, return the amount of shared memory available.
+    return 48*1024;
+#endif
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
+#ifndef __CUDA_ARCH__
+    // Running single threaded on the host CPU
+    return l3CacheSize();
+#else
+    // Running on a CUDA device
+    return firstLevelCacheSize();
+#endif
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
+#ifndef __CUDA_ARCH__
+    // Running single threaded on the host CPU
+    // Should return an enum that encodes the ISA supported by the CPU
+    return 1;
+#else
+    // Running on a CUDA device
+    return __CUDA_ARCH__ / 100;
+#endif
+  }
+};
+
+}  // namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
new file mode 100644
index 000000000..7c039890e
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
@@ -0,0 +1,122 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H)
+#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
+
+namespace Eigen {
+struct SyclDevice {
+  /// class members
+  /// sycl queue
+  mutable cl::sycl::queue m_queue;
+  /// std::map is the container used to make sure that we create only one buffer
+  /// per pointer. The lifespan of the buffer now depends on the lifespan of SyclDevice.
+  /// If a non-read-only pointer is needed to be accessed on the host we should manually deallocate it.
+  mutable std::map<const void *, std::shared_ptr<void>> buffer_map;
+  /// creating device by using selector
+  template<typename dev_Selector> SyclDevice(dev_Selector s)
+  :
+#ifdef EIGEN_EXCEPTIONS
+  m_queue(cl::sycl::queue(s, [=](cl::sycl::exception_list l) {
+    for (const auto& e : l) {
+      try {
+        std::rethrow_exception(e);
+      } catch (cl::sycl::exception e) {
+          std::cout << e.what() << std::endl;
+        }
+    }
+  }))
+#else
+  m_queue(cl::sycl::queue(s))
+#endif
+  {}
+  // destructor
+  ~SyclDevice() { deallocate_all(); }
+
+  template <typename T> void deallocate(T *p) const {
+    auto it = buffer_map.find(p);
+    if (it != buffer_map.end()) {
+      buffer_map.erase(it);
+      internal::aligned_free(p);
+    }
+  }
+  void deallocate_all() const {
+    std::map<const void *, std::shared_ptr<void>>::iterator it=buffer_map.begin();
+    while (it!=buffer_map.end()) {
+      auto p=it->first;
+      buffer_map.erase(it);
+      internal::aligned_free(const_cast<void*>(p));
+      it=buffer_map.begin();
+    }
+    buffer_map.clear();
+  }
+
+  /// creation of sycl accessor for a buffer. This function first tries to find
+  /// the buffer in the buffer_map. If found it gets the accessor from it, if not,
+  ///the function then adds an entry by creating a sycl buffer for that particular pointer.
+  template <cl::sycl::access::mode AcMd, typename T> inline cl::sycl::accessor<T, 1, AcMd, cl::sycl::access::target::global_buffer>
+  get_sycl_accessor(size_t num_bytes, cl::sycl::handler &cgh, const T * ptr) const {
+    return (get_sycl_buffer<T>(num_bytes, ptr)->template get_access<AcMd, cl::sycl::access::target::global_buffer>(cgh));
+  }
+
+  template<typename T> inline  std::pair<std::map<const void *, std::shared_ptr<void>>::iterator,bool> add_sycl_buffer(const T *ptr, size_t num_bytes) const {
+    using Type = cl::sycl::buffer<T, 1>;
+    std::pair<std::map<const void *, std::shared_ptr<void>>::iterator,bool> ret = buffer_map.insert(std::pair<const void *, std::shared_ptr<void>>(ptr, std::shared_ptr<void>(new Type(cl::sycl::range<1>(num_bytes)),
+      [](void *dataMem) { delete static_cast<Type*>(dataMem); })));
+    (static_cast<Type*>(buffer_map.at(ptr).get()))->set_final_data(nullptr);
+    return ret;
+  }
+
+  template <typename T> inline cl::sycl::buffer<T, 1>* get_sycl_buffer(size_t num_bytes,const T * ptr) const {
+    return static_cast<cl::sycl::buffer<T, 1>*>(add_sycl_buffer(ptr, num_bytes).first->second.get());
+  }
+
+  /// allocating memory on the cpu
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void *allocate(size_t) const {
+    return internal::aligned_malloc(8);
+  }
+
+  // some runtime conditions that can be applied here
+  bool isDeviceSuitable() const { return true; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const {
+    ::memcpy(dst, src, n);
+  }
+
+  template<typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(T *dst, const T *src, size_t n) const {
+    auto host_acc= (static_cast<cl::sycl::buffer<T, 1>*>(add_sycl_buffer(dst, n).first->second.get()))-> template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::host_buffer>();
+    memcpy(host_acc.get_pointer(), src, n);
+  }
+ /// whith the current implementation of sycl, the data is copied twice from device to host. This will be fixed soon.
+  template<typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(T *dst, const T *src, size_t n) const {
+    auto it = buffer_map.find(src);
+    if (it != buffer_map.end()) {
+      auto host_acc= (static_cast<cl::sycl::buffer<T, 1>*>(it->second.get()))-> template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::host_buffer>();
+      memcpy(dst,host_acc.get_pointer(),  n);
+    } else{
+      eigen_assert("no device memory found. The memory might be destroyed before creation");
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void *buffer, int c, size_t n) const {
+    ::memset(buffer, c, n);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
+  return 1;
+  }
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
new file mode 100644
index 000000000..069680a11
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -0,0 +1,279 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#if defined(EIGEN_USE_THREADS) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H)
+#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H
+
+namespace Eigen {
+
+// Use the SimpleThreadPool by default. We'll switch to the new non blocking
+// thread pool later.
+#ifndef EIGEN_USE_SIMPLE_THREAD_POOL
+template <typename Env> using ThreadPoolTempl = NonBlockingThreadPoolTempl<Env>;
+typedef NonBlockingThreadPool ThreadPool;
+#else
+template <typename Env> using ThreadPoolTempl = SimpleThreadPoolTempl<Env>;
+typedef SimpleThreadPool ThreadPool;
+#endif
+
+
+// Barrier is an object that allows one or more threads to wait until
+// Notify has been called a specified number of times.
+class Barrier {
+ public:
+  Barrier(unsigned int count) : state_(count << 1), notified_(false) {
+    eigen_assert(((count << 1) >> 1) == count);
+  }
+  ~Barrier() {
+    eigen_assert((state_>>1) == 0);
+  }
+
+  void Notify() {
+    unsigned int v = state_.fetch_sub(2, std::memory_order_acq_rel) - 2;
+    if (v != 1) {
+      eigen_assert(((v + 2) & ~1) != 0);
+      return;  // either count has not dropped to 0, or waiter is not waiting
+    }
+    std::unique_lock<std::mutex> l(mu_);
+    eigen_assert(!notified_);
+    notified_ = true;
+    cv_.notify_all();
+  }
+
+  void Wait() {
+    unsigned int v = state_.fetch_or(1, std::memory_order_acq_rel);
+    if ((v >> 1) == 0) return;
+    std::unique_lock<std::mutex> l(mu_);
+    while (!notified_) {
+      cv_.wait(l);
+    }
+  }
+
+ private:
+  std::mutex mu_;
+  std::condition_variable cv_;
+  std::atomic<unsigned int> state_;  // low bit is waiter flag
+  bool notified_;
+};
+
+
+// Notification is an object that allows a user to to wait for another
+// thread to signal a notification that an event has occurred.
+//
+// Multiple threads can wait on the same Notification object,
+// but only one caller must call Notify() on the object.
+struct Notification : Barrier {
+  Notification() : Barrier(1) {};
+};
+
+
+// Runs an arbitrary function and then calls Notify() on the passed in
+// Notification.
+template <typename Function, typename... Args> struct FunctionWrapperWithNotification
+{
+  static void run(Notification* n, Function f, Args... args) {
+    f(args...);
+    if (n) {
+      n->Notify();
+    }
+  }
+};
+
+template <typename Function, typename... Args> struct FunctionWrapperWithBarrier
+{
+  static void run(Barrier* b, Function f, Args... args) {
+    f(args...);
+    if (b) {
+      b->Notify();
+    }
+  }
+};
+
+template <typename SyncType>
+static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) {
+  if (n) {
+    n->Wait();
+  }
+}
+
+
+// Build a thread pool device on top the an existing pool of threads.
+struct ThreadPoolDevice {
+  // The ownership of the thread pool remains with the caller.
+  ThreadPoolDevice(ThreadPoolInterface* pool, int num_cores) : pool_(pool), num_threads_(num_cores) { }
+
+  EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
+    return internal::aligned_malloc(num_bytes);
+  }
+
+  EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
+    internal::aligned_free(buffer);
+  }
+
+  EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
+    ::memcpy(dst, src, n);
+  }
+  EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
+    memcpy(dst, src, n);
+  }
+  EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
+    memcpy(dst, src, n);
+  }
+
+  EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
+    ::memset(buffer, c, n);
+  }
+
+  EIGEN_STRONG_INLINE int numThreads() const {
+    return num_threads_;
+  }
+
+  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
+    return l1CacheSize();
+  }
+
+  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
+    // The l3 cache size is shared between all the cores.
+    return l3CacheSize() / num_threads_;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
+    // Should return an enum that encodes the ISA supported by the CPU
+    return 1;
+  }
+
+  template <class Function, class... Args>
+  EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... args) const {
+    Notification* n = new Notification();
+    pool_->Schedule(std::bind(&FunctionWrapperWithNotification<Function, Args...>::run, n, f, args...));
+    return n;
+  }
+
+  template <class Function, class... Args>
+  EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b,
+                                                Function&& f,
+                                                Args&&... args) const {
+    pool_->Schedule(std::bind(
+        &FunctionWrapperWithBarrier<Function, Args...>::run, b, f, args...));
+  }
+
+  template <class Function, class... Args>
+  EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, Args&&... args) const {
+    pool_->Schedule(std::bind(f, args...));
+  }
+
+  // Returns a logical thread index between 0 and pool_->NumThreads() - 1 if
+  // called from one of the threads in pool_. Returns -1 otherwise.
+  EIGEN_STRONG_INLINE int currentThreadId() const {
+    return pool_->CurrentThreadId();
+  }
+
+  // parallelFor executes f with [0, n) arguments in parallel and waits for
+  // completion. F accepts a half-open interval [first, last).
+  // Block size is choosen based on the iteration cost and resulting parallel
+  // efficiency. If block_align is not nullptr, it is called to round up the
+  // block size.
+  void parallelFor(Index n, const TensorOpCost& cost,
+                   std::function<Index(Index)> block_align,
+                   std::function<void(Index, Index)> f) const {
+    typedef TensorCostModel<ThreadPoolDevice> CostModel;
+    if (n <= 1 || numThreads() == 1 ||
+        CostModel::numThreads(n, cost, static_cast<int>(numThreads())) == 1) {
+      f(0, n);
+      return;
+    }
+
+    // Calculate block size based on (1) the iteration cost and (2) parallel
+    // efficiency. We want blocks to be not too small to mitigate
+    // parallelization overheads; not too large to mitigate tail
+    // effect and potential load imbalance and we also want number
+    // of blocks to be evenly dividable across threads.
+
+    double block_size_f = 1.0 / CostModel::taskSize(1, cost);
+    Index block_size = numext::mini(n, numext::maxi<Index>(1, block_size_f));
+    const Index max_block_size =
+        numext::mini(n, numext::maxi<Index>(1, 2 * block_size_f));
+    if (block_align) {
+      Index new_block_size = block_align(block_size);
+      eigen_assert(new_block_size >= block_size);
+      block_size = numext::mini(n, new_block_size);
+    }
+    Index block_count = divup(n, block_size);
+    // Calculate parallel efficiency as fraction of total CPU time used for
+    // computations:
+    double max_efficiency =
+        static_cast<double>(block_count) /
+        (divup<int>(block_count, numThreads()) * numThreads());
+    // Now try to increase block size up to max_block_size as long as it
+    // doesn't decrease parallel efficiency.
+    for (Index prev_block_count = block_count; prev_block_count > 1;) {
+      // This is the next block size that divides size into a smaller number
+      // of blocks than the current block_size.
+      Index coarser_block_size = divup(n, prev_block_count - 1);
+      if (block_align) {
+        Index new_block_size = block_align(coarser_block_size);
+        eigen_assert(new_block_size >= coarser_block_size);
+        coarser_block_size = numext::mini(n, new_block_size);
+      }
+      if (coarser_block_size > max_block_size) {
+        break;  // Reached max block size. Stop.
+      }
+      // Recalculate parallel efficiency.
+      const Index coarser_block_count = divup(n, coarser_block_size);
+      eigen_assert(coarser_block_count < prev_block_count);
+      prev_block_count = coarser_block_count;
+      const double coarser_efficiency =
+          static_cast<double>(coarser_block_count) /
+          (divup<int>(coarser_block_count, numThreads()) * numThreads());
+      if (coarser_efficiency + 0.01 >= max_efficiency) {
+        // Taking it.
+        block_size = coarser_block_size;
+        block_count = coarser_block_count;
+        if (max_efficiency < coarser_efficiency) {
+          max_efficiency = coarser_efficiency;
+        }
+      }
+    }
+
+    // Recursively divide size into halves until we reach block_size.
+    // Division code rounds mid to block_size, so we are guaranteed to get
+    // block_count leaves that do actual computations.
+    Barrier barrier(static_cast<unsigned int>(block_count));
+    std::function<void(Index, Index)> handleRange;
+    handleRange = [=, &handleRange, &barrier, &f](Index first, Index last) {
+      if (last - first <= block_size) {
+        // Single block or less, execute directly.
+        f(first, last);
+        barrier.Notify();
+        return;
+      }
+      // Split into halves and submit to the pool.
+      Index mid = first + divup((last - first) / 2, block_size) * block_size;
+      pool_->Schedule([=, &handleRange]() { handleRange(mid, last); });
+      pool_->Schedule([=, &handleRange]() { handleRange(first, mid); });
+    };
+    handleRange(0, n);
+    barrier.Wait();
+  }
+
+  // Convenience wrapper for parallelFor that does not align blocks.
+  void parallelFor(Index n, const TensorOpCost& cost,
+                   std::function<void(Index, Index)> f) const {
+    parallelFor(n, cost, nullptr, std::move(f));
+  }
+
+ private:
+  ThreadPoolInterface* pool_;
+  int num_threads_;
+};
+
+
+}  // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h
new file mode 100644
index 000000000..1a30e45fb
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h
@@ -0,0 +1,236 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H
+#define EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H
+
+namespace Eigen {
+
+/** \internal
+  *
+  * \class TensorDimensionList
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief Special case of tensor index list used to list all the dimensions of a tensor of rank n.
+  *
+  * \sa Tensor
+  */
+
+template <typename Index, std::size_t Rank> struct DimensionList {
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+  const Index operator[] (const Index i) const { return i; }
+};
+
+namespace internal {
+
+template<typename Index, std::size_t Rank> struct array_size<DimensionList<Index, Rank> > {
+  static const size_t value = Rank;
+};
+template<typename Index, std::size_t Rank> struct array_size<const DimensionList<Index, Rank> > {
+  static const size_t value = Rank;
+};
+
+template<DenseIndex n, typename Index, std::size_t Rank> const Index array_get(DimensionList<Index, Rank>&) {
+  return n;
+}
+template<DenseIndex n, typename Index, std::size_t Rank> const Index array_get(const DimensionList<Index, Rank>&) {
+  return n;
+}
+
+
+#if EIGEN_HAS_CONSTEXPR
+template <typename Index, std::size_t Rank>
+struct index_known_statically_impl<DimensionList<Index, Rank> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) {
+    return true;
+  }
+};
+template <typename Index, std::size_t Rank>
+struct index_known_statically_impl<const DimensionList<Index, Rank> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) {
+    return true;
+  }
+};
+
+template <typename Index, std::size_t Rank>
+struct all_indices_known_statically_impl<DimensionList<Index, Rank> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run() {
+    return true;
+  }
+};
+template <typename Index, std::size_t Rank>
+struct all_indices_known_statically_impl<const DimensionList<Index, Rank> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run() {
+    return true;
+  }
+};
+
+template <typename Index, std::size_t Rank>
+struct indices_statically_known_to_increase_impl<DimensionList<Index, Rank> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run() {
+    return true;
+  }
+};
+template <typename Index, std::size_t Rank>
+struct indices_statically_known_to_increase_impl<const DimensionList<Index, Rank> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run() {
+    return true;
+  }
+};
+
+template <typename Index, std::size_t Rank>
+struct index_statically_eq_impl<DimensionList<Index, Rank> > {
+  static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+    return i == value;
+  }
+};
+template <typename Index, std::size_t Rank>
+struct index_statically_eq_impl<const DimensionList<Index, Rank> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+    return i == value;
+  }
+};
+
+template <typename Index, std::size_t Rank>
+struct index_statically_ne_impl<DimensionList<Index, Rank> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+    return i != value;
+  }
+};
+template <typename Index, std::size_t Rank>
+struct index_statically_ne_impl<const DimensionList<Index, Rank> > {
+  static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+    return i != value;
+  }
+};
+
+template <typename Index, std::size_t Rank>
+struct index_statically_gt_impl<DimensionList<Index, Rank> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+    return i > value;
+  }
+};
+template <typename Index, std::size_t Rank>
+struct index_statically_gt_impl<const DimensionList<Index, Rank> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+    return i > value;
+  }
+};
+
+template <typename Index, std::size_t Rank>
+struct index_statically_lt_impl<DimensionList<Index, Rank> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+    return i < value;
+  }
+};
+template <typename Index, std::size_t Rank>
+struct index_statically_lt_impl<const DimensionList<Index, Rank> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+    return i < value;
+  }
+};
+
+#else
+template <typename Index, std::size_t Rank>
+struct index_known_statically_impl<DimensionList<Index, Rank> > {
+  EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run(const DenseIndex) {
+    return true;
+  }
+};
+template <typename Index, std::size_t Rank>
+struct index_known_statically_impl<const DimensionList<Index, Rank> > {
+  EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run(const DenseIndex) {
+    return true;
+  }
+};
+
+template <typename Index, std::size_t Rank>
+struct all_indices_known_statically_impl<DimensionList<Index, Rank> > {
+  EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run() {
+    return true;
+  }
+};
+template <typename Index, std::size_t Rank>
+struct all_indices_known_statically_impl<const DimensionList<Index, Rank> > {
+  EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run() {
+    return true;
+  }
+};
+
+template <typename Index, std::size_t Rank>
+struct indices_statically_known_to_increase_impl<DimensionList<Index, Rank> > {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() {
+    return true;
+  }
+};
+template <typename Index, std::size_t Rank>
+struct indices_statically_known_to_increase_impl<const DimensionList<Index, Rank> > {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() {
+    return true;
+  }
+};
+
+template <typename Index, std::size_t Rank>
+struct index_statically_eq_impl<DimensionList<Index, Rank> > {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) {
+    return false;
+  }
+};
+template <typename Index, std::size_t Rank>
+struct index_statically_eq_impl<const DimensionList<Index, Rank> > {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) {
+    return false;
+  }
+};
+
+template <typename Index, std::size_t Rank>
+struct index_statically_ne_impl<DimensionList<Index, Rank> > {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex){
+    return false;
+  }
+};
+template <typename Index, std::size_t Rank>
+struct index_statically_ne_impl<const DimensionList<Index, Rank> > {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) {
+    return false;
+  }
+};
+
+template <typename Index, std::size_t Rank>
+struct index_statically_gt_impl<DimensionList<Index, Rank> > {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) {
+    return false;
+  }
+};
+template <typename Index, std::size_t Rank>
+struct index_statically_gt_impl<const DimensionList<Index, Rank> > {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) {
+    return false;
+  }
+};
+
+template <typename Index, std::size_t Rank>
+struct index_statically_lt_impl<DimensionList<Index, Rank> > {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) {
+    return false;
+  }
+};
+template <typename Index, std::size_t Rank>
+struct index_statically_lt_impl<const DimensionList<Index, Rank> > {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) {
+    return false;
+  }
+};
+#endif
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
new file mode 100644
index 000000000..b24cdebf1
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
@@ -0,0 +1,428 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H
+#define EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H
+
+
+namespace Eigen {
+
+/** \internal
+  *
+  * \class TensorDimensions
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief Set of classes used to encode and store the dimensions of a Tensor.
+  *
+  * The Sizes class encodes as part of the type the number of dimensions and the
+  * sizes corresponding to each dimension. It uses no storage space since it is
+  * entirely known at compile time.
+  * The DSizes class is its dynamic sibling: the number of dimensions is known
+  * at compile time but the sizes are set during execution.
+  *
+  * \sa Tensor
+  */
+
+// Boilerplate code
+namespace internal {
+
+template<std::size_t n, typename Dimension> struct dget {
+  static const std::size_t value = get<n, Dimension>::value;
+};
+
+
+template<typename Index, std::size_t NumIndices, std::size_t n, bool RowMajor>
+struct fixed_size_tensor_index_linearization_helper
+{
+  template <typename Dimensions> EIGEN_DEVICE_FUNC
+  static inline Index run(array<Index, NumIndices> const& indices,
+                          const Dimensions& dimensions)
+  {
+    return array_get<RowMajor ? n - 1 : (NumIndices - n)>(indices) +
+        dget<RowMajor ? n - 1 : (NumIndices - n), Dimensions>::value *
+        fixed_size_tensor_index_linearization_helper<Index, NumIndices, n - 1, RowMajor>::run(indices, dimensions);
+  }
+};
+
+template<typename Index, std::size_t NumIndices, bool RowMajor>
+struct fixed_size_tensor_index_linearization_helper<Index, NumIndices, 0, RowMajor>
+{
+  template <typename Dimensions> EIGEN_DEVICE_FUNC
+  static inline Index run(array<Index, NumIndices> const&, const Dimensions&)
+  {
+    return 0;
+  }
+};
+
+template<typename Index, std::size_t n>
+struct fixed_size_tensor_index_extraction_helper
+{
+  template <typename Dimensions> EIGEN_DEVICE_FUNC
+  static inline Index run(const Index index,
+                          const Dimensions& dimensions)
+  {
+    const Index mult = (index == n-1) ? 1 : 0;
+    return array_get<n-1>(dimensions) * mult +
+        fixed_size_tensor_index_extraction_helper<Index, n - 1>::run(index, dimensions);
+  }
+};
+
+template<typename Index>
+struct fixed_size_tensor_index_extraction_helper<Index, 0>
+{
+  template <typename Dimensions> EIGEN_DEVICE_FUNC
+  static inline Index run(const Index,
+                          const Dimensions&)
+  {
+    return 0;
+  }
+  };
+
+}  // end namespace internal
+
+
+// Fixed size
+#ifndef EIGEN_EMULATE_CXX11_META_H
+template <typename std::ptrdiff_t... Indices>
+struct Sizes : internal::numeric_list<std::ptrdiff_t, Indices...> {
+  typedef internal::numeric_list<std::ptrdiff_t, Indices...> Base;
+  static const std::ptrdiff_t total_size = internal::arg_prod(Indices...);
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t rank() const {
+    return Base::count;
+  }
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t TotalSize() {
+    return internal::arg_prod(Indices...);
+  }
+
+  EIGEN_DEVICE_FUNC Sizes() { }
+  template <typename DenseIndex>
+  explicit EIGEN_DEVICE_FUNC Sizes(const array<DenseIndex, Base::count>& /*indices*/) {
+    // todo: add assertion
+  }
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+  template <typename... DenseIndex> EIGEN_DEVICE_FUNC Sizes(DenseIndex...) { }
+  explicit EIGEN_DEVICE_FUNC Sizes(std::initializer_list<std::ptrdiff_t> /*l*/) {
+    // todo: add assertion
+  }
+#endif
+
+  template <typename T> Sizes& operator = (const T& /*other*/) {
+    // add assertion failure if the size of other is different
+    return *this;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t operator[] (const std::size_t index) const {
+    return internal::fixed_size_tensor_index_extraction_helper<std::ptrdiff_t, Base::count>::run(index, *this);
+  }
+
+  template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  size_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const {
+    return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, false>::run(indices, *static_cast<const Base*>(this));
+  }
+  template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  size_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const {
+    return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, true>::run(indices, *static_cast<const Base*>(this));
+  }
+};
+
+namespace internal {
+template <typename std::ptrdiff_t... Indices>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes<Indices...>&) {
+  return Sizes<Indices...>::total_size;
+}
+}
+
+#else
+
+template <std::size_t n>
+struct non_zero_size {
+  typedef internal::type2val<std::size_t, n> type;
+};
+template <>
+struct non_zero_size<0> {
+  typedef internal::null_type type;
+};
+
+template <std::size_t V1=0, std::size_t V2=0, std::size_t V3=0, std::size_t V4=0, std::size_t V5=0> struct Sizes {
+  typedef typename internal::make_type_list<typename non_zero_size<V1>::type, typename non_zero_size<V2>::type, typename non_zero_size<V3>::type, typename non_zero_size<V4>::type, typename non_zero_size<V5>::type >::type Base;
+  static const size_t count = Base::count;
+  static const std::size_t total_size = internal::arg_prod<Base>::value;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const {
+    return count;
+  }
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t TotalSize() {
+    return internal::arg_prod<Base>::value;
+  }
+
+  Sizes() { }
+  template <typename DenseIndex>
+  explicit Sizes(const array<DenseIndex, Base::count>& /*indices*/) {
+    // todo: add assertion
+  }
+  template <typename T> Sizes& operator = (const T& /*other*/) {
+    // add assertion failure if the size of other is different
+    return *this;
+  }
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+  template <typename... DenseIndex> Sizes(DenseIndex... /*indices*/) { }
+  explicit Sizes(std::initializer_list<std::size_t>) {
+    // todo: add assertion
+  }
+#else
+  EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex) {
+  }
+  EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex) {
+  }
+  EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex) {
+  }
+  EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) {
+  }
+  EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) {
+  }
+#endif
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex operator[] (const int index) const {
+    switch (index) {
+      case 0:
+        return internal::get<0, Base>::value;
+      case 1:
+        return internal::get<1, Base>::value;
+      case 2:
+        return internal::get<2, Base>::value;
+      case 3:
+        return internal::get<3, Base>::value;
+      case 4:
+        return internal::get<4, Base>::value;
+      default:
+        eigen_assert(false && "index overflow");
+        return static_cast<DenseIndex>(-1);
+    }
+  }
+
+  template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  size_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const {
+    return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, false>::run(indices, *reinterpret_cast<const Base*>(this));
+  }
+  template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  size_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const {
+    return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, true>::run(indices, *reinterpret_cast<const Base*>(this));
+  }
+};
+
+namespace internal {
+template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_prod(const Sizes<V1, V2, V3, V4, V5>&) {
+  return Sizes<V1, V2, V3, V4, V5>::total_size;
+}
+}
+
+#endif
+
+// Boilerplate
+namespace internal {
+template<typename Index, std::size_t NumIndices, std::size_t n, bool RowMajor>
+struct tensor_index_linearization_helper
+{
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Index run(array<Index, NumIndices> const& indices, array<Index, NumIndices> const& dimensions)
+  {
+    return array_get<RowMajor ? n : (NumIndices - n - 1)>(indices) +
+      array_get<RowMajor ? n : (NumIndices - n - 1)>(dimensions) *
+        tensor_index_linearization_helper<Index, NumIndices, n - 1, RowMajor>::run(indices, dimensions);
+  }
+};
+
+template<typename Index, std::size_t NumIndices, bool RowMajor>
+struct tensor_index_linearization_helper<Index, NumIndices, 0, RowMajor>
+{
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Index run(array<Index, NumIndices> const& indices, array<Index, NumIndices> const&)
+  {
+    return array_get<RowMajor ? 0 : NumIndices - 1>(indices);
+  }
+};
+}  // end namespace internal
+
+
+
+// Dynamic size
+template <typename DenseIndex, int NumDims>
+struct DSizes : array<DenseIndex, NumDims> {
+  typedef array<DenseIndex, NumDims> Base;
+  static const int count = NumDims;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const {
+    return NumDims;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex TotalSize() const {
+    return (NumDims == 0) ? 1 : internal::array_prod(*static_cast<const Base*>(this));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DSizes() {
+    for (int i = 0 ; i < NumDims; ++i) {
+      (*this)[i] = 0;
+    }
+  }
+  EIGEN_DEVICE_FUNC explicit DSizes(const array<DenseIndex, NumDims>& a) : Base(a) { }
+
+  EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0) {
+    eigen_assert(NumDims == 1);
+    (*this)[0] = i0;
+  }
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+  template<typename... IndexTypes> EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, DenseIndex secondDimension, IndexTypes... otherDimensions) : Base({{firstDimension, secondDimension, otherDimensions...}}) {
+    EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 2 == NumDims, YOU_MADE_A_PROGRAMMING_MISTAKE)
+  }
+#else
+  EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1) {
+    eigen_assert(NumDims == 2);
+    (*this)[0] = i0;
+    (*this)[1] = i1;
+  }
+  EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) {
+    eigen_assert(NumDims == 3);
+    (*this)[0] = i0;
+    (*this)[1] = i1;
+    (*this)[2] = i2;
+  }
+  EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) {
+    eigen_assert(NumDims == 4);
+    (*this)[0] = i0;
+    (*this)[1] = i1;
+    (*this)[2] = i2;
+    (*this)[3] = i3;
+  }
+  EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) {
+    eigen_assert(NumDims == 5);
+    (*this)[0] = i0;
+    (*this)[1] = i1;
+    (*this)[2] = i2;
+    (*this)[3] = i3;
+    (*this)[4] = i4;
+  }
+#endif
+
+  EIGEN_DEVICE_FUNC DSizes& operator = (const array<DenseIndex, NumDims>& other) {
+    *static_cast<Base*>(this) = other;
+    return *this;
+  }
+
+  // A constexpr would be so much better here
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex IndexOfColMajor(const array<DenseIndex, NumDims>& indices) const {
+    return internal::tensor_index_linearization_helper<DenseIndex, NumDims, NumDims - 1, false>::run(indices, *static_cast<const Base*>(this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex IndexOfRowMajor(const array<DenseIndex, NumDims>& indices) const {
+    return internal::tensor_index_linearization_helper<DenseIndex, NumDims, NumDims - 1, true>::run(indices, *static_cast<const Base*>(this));
+  }
+};
+
+
+
+
+// Boilerplate
+namespace internal {
+template<typename Index, std::size_t NumIndices, std::size_t n, bool RowMajor>
+struct tensor_vsize_index_linearization_helper
+{
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Index run(array<Index, NumIndices> const& indices, std::vector<DenseIndex> const& dimensions)
+  {
+    return array_get<RowMajor ? n : (NumIndices - n - 1)>(indices) +
+      array_get<RowMajor ? n : (NumIndices - n - 1)>(dimensions) *
+        tensor_vsize_index_linearization_helper<Index, NumIndices, n - 1, RowMajor>::run(indices, dimensions);
+  }
+};
+
+template<typename Index, std::size_t NumIndices, bool RowMajor>
+struct tensor_vsize_index_linearization_helper<Index, NumIndices, 0, RowMajor>
+{
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Index run(array<Index, NumIndices> const& indices, std::vector<DenseIndex> const&)
+  {
+    return array_get<RowMajor ? 0 : NumIndices - 1>(indices);
+  }
+};
+}  // end namespace internal
+
+
+namespace internal {
+
+template <typename DenseIndex, int NumDims> struct array_size<const DSizes<DenseIndex, NumDims> > {
+  static const size_t value = NumDims;
+};
+template <typename DenseIndex, int NumDims> struct array_size<DSizes<DenseIndex, NumDims> > {
+  static const size_t value = NumDims;
+};
+#ifndef EIGEN_EMULATE_CXX11_META_H
+template <typename std::ptrdiff_t... Indices> struct array_size<const Sizes<Indices...> > {
+static const std::ptrdiff_t value = Sizes<Indices...>::count;
+};
+template <typename std::ptrdiff_t... Indices> struct array_size<Sizes<Indices...> > {
+static const std::ptrdiff_t value = Sizes<Indices...>::count;
+};
+template <std::ptrdiff_t n, typename std::ptrdiff_t... Indices> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<Indices...>&) {
+  return get<n, internal::numeric_list<std::size_t, Indices...> >::value;
+}
+template <std::ptrdiff_t n> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<>&) {
+  eigen_assert(false && "should never be called");
+  return -1;
+}
+#else
+template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> struct array_size<const Sizes<V1,V2,V3,V4,V5> > {
+  static const size_t value = Sizes<V1,V2,V3,V4,V5>::count;
+};
+template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> struct array_size<Sizes<V1,V2,V3,V4,V5> > {
+  static const size_t value = Sizes<V1,V2,V3,V4,V5>::count;
+};
+template <std::size_t n, std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_get(const Sizes<V1,V2,V3,V4,V5>&) {
+  return get<n, typename Sizes<V1,V2,V3,V4,V5>::Base>::value;
+}
+
+#endif
+
+
+template <typename Dims1, typename Dims2, size_t n, size_t m>
+struct sizes_match_below_dim {
+  static EIGEN_DEVICE_FUNC  inline bool run(Dims1&, Dims2&) {
+    return false;
+  }
+};
+template <typename Dims1, typename Dims2, size_t n>
+struct sizes_match_below_dim<Dims1, Dims2, n, n> {
+  static EIGEN_DEVICE_FUNC  inline bool run(Dims1& dims1, Dims2& dims2) {
+    return (array_get<n-1>(dims1) == array_get<n-1>(dims2)) &
+        sizes_match_below_dim<Dims1, Dims2, n-1, n-1>::run(dims1, dims2);
+  }
+};
+template <typename Dims1, typename Dims2>
+struct sizes_match_below_dim<Dims1, Dims2, 0, 0> {
+  static EIGEN_DEVICE_FUNC  inline bool run(Dims1&, Dims2&) {
+    return true;
+  }
+};
+
+} // end namespace internal
+
+
+template <typename Dims1, typename Dims2>
+EIGEN_DEVICE_FUNC bool dimensions_match(Dims1& dims1, Dims2& dims2) {
+  return internal::sizes_match_below_dim<Dims1, Dims2, internal::array_size<Dims1>::value, internal::array_size<Dims2>::value>::run(dims1, dims2);
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
new file mode 100644
index 000000000..06987132b
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
@@ -0,0 +1,181 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H
+#define EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H
+
+namespace Eigen {
+
+/** \class TensorForcedEval
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief Tensor reshaping class.
+  *
+  *
+  */
+namespace internal {
+template<typename XprType, template <class> class MakePointer_>
+struct traits<TensorEvalToOp<XprType, MakePointer_> >
+{
+  // Type promotion to handle the case where the types of the lhs and the rhs are different.
+  typedef typename XprType::Scalar Scalar;
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef typename XprType::Nested Nested;
+  typedef typename remove_reference<Nested>::type _Nested;
+  static const int NumDimensions = XprTraits::NumDimensions;
+  static const int Layout = XprTraits::Layout;
+
+  enum {
+    Flags = 0
+  };
+  template <class T>
+  struct MakePointer {
+    // Intermediate typedef to workaround MSVC issue.
+    typedef MakePointer_<T> MakePointerT;
+    typedef typename MakePointerT::Type Type;
+  };
+};
+
+template<typename XprType, template <class> class MakePointer_>
+struct eval<TensorEvalToOp<XprType, MakePointer_>, Eigen::Dense>
+{
+  typedef const TensorEvalToOp<XprType, MakePointer_>& type;
+};
+
+template<typename XprType, template <class> class MakePointer_>
+struct nested<TensorEvalToOp<XprType, MakePointer_>, 1, typename eval<TensorEvalToOp<XprType, MakePointer_> >::type>
+{
+  typedef TensorEvalToOp<XprType, MakePointer_> type;
+};
+
+}  // end namespace internal
+
+
+
+
+template<typename XprType, template <class> class MakePointer_>
+class TensorEvalToOp : public TensorBase<TensorEvalToOp<XprType, MakePointer_>, ReadOnlyAccessors>
+{
+  public:
+  typedef typename Eigen::internal::traits<TensorEvalToOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
+  typedef typename MakePointer_<CoeffReturnType>::Type PointerType;
+  typedef typename Eigen::internal::nested<TensorEvalToOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorEvalToOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorEvalToOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvalToOp(PointerType buffer, const XprType& expr)
+      : m_xpr(expr), m_buffer(buffer) {}
+
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename XprType::Nested>::type&
+    expression() const { return m_xpr; }
+
+    EIGEN_DEVICE_FUNC PointerType buffer() const { return m_buffer; }
+
+  protected:
+    typename XprType::Nested m_xpr;
+    PointerType m_buffer;
+};
+
+
+
+template<typename ArgType, typename Device, template <class> class MakePointer_>
+struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
+{
+  typedef TensorEvalToOp<ArgType, MakePointer_> XprType;
+  typedef typename ArgType::Scalar Scalar;
+  typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
+  typedef typename XprType::Index Index;
+  typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+
+  enum {
+    IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,  // to be implemented
+    RawAccess = true
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device), m_device(device),
+          m_buffer(op.buffer()), m_op(op), m_expression(op.expression())
+  { }
+
+  // Used for accessor extraction in SYCL Managed TensorMap:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& op() const {
+    return m_op;
+  }
+  
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() {
+  }
+
+  typedef typename internal::traits<const TensorEvalToOp<ArgType, MakePointer_> >::template MakePointer<CoeffReturnType>::Type DevicePointer;
+  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(DevicePointer scalar) {
+    EIGEN_UNUSED_VARIABLE(scalar);
+    eigen_assert(scalar == NULL);
+    return m_impl.evalSubExprsIfNeeded(m_buffer);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) {
+    m_buffer[i] = m_impl.coeff(i);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) {
+    internal::pstoret<CoeffReturnType, PacketReturnType, Aligned>(m_buffer + i, m_impl.template packet<TensorEvaluator<ArgType, Device>::IsAligned ? Aligned : Unaligned>(i));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+    m_impl.cleanup();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+  {
+    return m_buffer[index];
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+  {
+    return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    // We assume that evalPacket or evalScalar is called to perform the
+    // assignment and account for the cost of the write here.
+    return m_impl.costPerCoeff(vectorized) +
+        TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize);
+  }
+
+  EIGEN_DEVICE_FUNC DevicePointer data() const { return m_buffer; }
+  ArgType expression() const { return m_expression; }
+
+  /// required by sycl in order to extract the accessor
+  const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+  /// added for sycl in order to construct the buffer from the sycl device
+  const Device& device() const{return m_device;}
+
+ private:
+  TensorEvaluator<ArgType, Device> m_impl;
+  const Device& m_device;
+  DevicePointer m_buffer;
+  const XprType& m_op;
+  const ArgType m_expression;
+};
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
new file mode 100644
index 000000000..834ce07df
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -0,0 +1,633 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H
+#define EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H
+
+namespace Eigen {
+
+/** \class TensorEvaluator
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief The tensor evaluator classes.
+  *
+  * These classes are responsible for the evaluation of the tensor expression.
+  *
+  * TODO: add support for more types of expressions, in particular expressions
+  * leading to lvalues (slicing, reshaping, etc...)
+  */
+
+// Generic evaluator
+template<typename Derived, typename Device>
+struct TensorEvaluator
+{
+  typedef typename Derived::Index Index;
+  typedef typename Derived::Scalar Scalar;
+  typedef typename Derived::Scalar CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef typename Derived::Dimensions Dimensions;
+
+  // NumDimensions is -1 for variable dim tensors
+  static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ?
+                               internal::traits<Derived>::NumDimensions : 0;
+
+  enum {
+    IsAligned = Derived::IsAligned,
+    PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
+    Layout = Derived::Layout,
+    CoordAccess = NumCoords > 0,
+    RawAccess = true
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
+      : m_data(const_cast<typename internal::traits<Derived>::template MakePointer<Scalar>::Type>(m.data())), m_dims(m.dimensions()), m_device(device), m_impl(m)
+  { }
+
+  // Used for accessor extraction in SYCL Managed TensorMap:
+  const Derived& derived() const { return m_impl; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* dest) {
+    if (dest) {
+      m_device.memcpy((void*)dest, m_data, sizeof(Scalar) * m_dims.TotalSize());
+      return false;
+    }
+    return true;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    eigen_assert(m_data);
+    return m_data[index];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
+    eigen_assert(m_data);
+    return m_data[index];
+  }
+
+  template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  PacketReturnType packet(Index index) const
+  {
+    return internal::ploadt<PacketReturnType, LoadMode>(m_data + index);
+  }
+
+  template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void writePacket(Index index, const PacketReturnType& x)
+  {
+    return internal::pstoret<Scalar, PacketReturnType, StoreMode>(m_data + index, x);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<DenseIndex, NumCoords>& coords) const {
+    eigen_assert(m_data);
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      return m_data[m_dims.IndexOfColMajor(coords)];
+    } else {
+      return m_data[m_dims.IndexOfRowMajor(coords)];
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array<DenseIndex, NumCoords>& coords) {
+    eigen_assert(m_data);
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      return m_data[m_dims.IndexOfColMajor(coords)];
+    } else {
+      return m_data[m_dims.IndexOfRowMajor(coords)];
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized,
+                        internal::unpacket_traits<PacketReturnType>::size);
+  }
+
+  EIGEN_DEVICE_FUNC typename internal::traits<Derived>::template MakePointer<Scalar>::Type data() const { return m_data; }
+
+  /// required by sycl in order to construct sycl buffer from raw pointer
+  const Device& device() const{return m_device;}
+
+ protected:
+  typename internal::traits<Derived>::template MakePointer<Scalar>::Type m_data;
+  Dimensions m_dims;
+  const Device& m_device;
+  const Derived& m_impl;
+};
+
+namespace {
+template <typename T> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T loadConstant(const T* address) {
+  return *address;
+}
+// Use the texture cache on CUDA devices whenever possible
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float loadConstant(const float* address) {
+  return __ldg(address);
+}
+template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double loadConstant(const double* address) {
+  return __ldg(address);
+}
+template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+Eigen::half loadConstant(const Eigen::half* address) {
+  return Eigen::half(half_impl::raw_uint16_to_half(__ldg(&address->x)));
+}
+#endif
+}
+
+
+// Default evaluator for rvalues
+template<typename Derived, typename Device>
+struct TensorEvaluator<const Derived, Device>
+{
+  typedef typename Derived::Index Index;
+  typedef typename Derived::Scalar Scalar;
+  typedef typename Derived::Scalar CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef typename Derived::Dimensions Dimensions;
+
+  // NumDimensions is -1 for variable dim tensors
+  static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ?
+                               internal::traits<Derived>::NumDimensions : 0;
+
+  enum {
+    IsAligned = Derived::IsAligned,
+    PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
+    Layout = Derived::Layout,
+    CoordAccess = NumCoords > 0,
+    RawAccess = true
+  };
+
+  // Used for accessor extraction in SYCL Managed TensorMap:
+  const Derived& derived() const { return m_impl; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
+      : m_data(m.data()), m_dims(m.dimensions()), m_device(device), m_impl(m)
+  { }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
+    if (!NumTraits<typename internal::remove_const<Scalar>::type>::RequireInitialization && data) {
+      m_device.memcpy((void*)data, m_data, m_dims.TotalSize() * sizeof(Scalar));
+      return false;
+    }
+    return true;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    eigen_assert(m_data);
+    return loadConstant(m_data+index);
+  }
+
+  template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  PacketReturnType packet(Index index) const
+  {
+    return internal::ploadt_ro<PacketReturnType, LoadMode>(m_data + index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<DenseIndex, NumCoords>& coords) const {
+    eigen_assert(m_data);
+    const Index index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_dims.IndexOfColMajor(coords)
+                        : m_dims.IndexOfRowMajor(coords);
+    return loadConstant(m_data+index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized,
+                        internal::unpacket_traits<PacketReturnType>::size);
+  }
+
+  EIGEN_DEVICE_FUNC typename internal::traits<Derived>::template MakePointer<const Scalar>::Type data() const { return m_data; }
+
+  /// added for sycl in order to construct the buffer from the sycl device
+  const Device& device() const{return m_device;}
+
+ protected:
+  typename internal::traits<Derived>::template MakePointer<const Scalar>::Type m_data;
+  Dimensions m_dims;
+  const Device& m_device;
+  const Derived& m_impl;
+};
+
+
+
+
+// -------------------- CwiseNullaryOp --------------------
+
+template<typename NullaryOp, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
+{
+  typedef TensorCwiseNullaryOp<NullaryOp, ArgType> XprType;
+
+  enum {
+    IsAligned = true,
+    PacketAccess = internal::functor_traits<NullaryOp>::PacketAccess,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  EIGEN_DEVICE_FUNC
+  TensorEvaluator(const XprType& op, const Device& device)
+      : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device), m_wrapper()
+  { }
+
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
+
+  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { return true; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { }
+
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+  {
+    return m_wrapper(m_functor, index);
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+  {
+    return m_wrapper.template packetOp<PacketReturnType, Index>(m_functor, index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+  costPerCoeff(bool vectorized) const {
+    return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized,
+                        internal::unpacket_traits<PacketReturnType>::size);
+  }
+
+  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
+
+  /// required by sycl in order to extract the accessor
+  const TensorEvaluator<ArgType, Device>& impl() const { return m_argImpl; }
+  /// required by sycl in order to extract the accessor
+  NullaryOp functor() const { return m_functor; }
+
+
+ private:
+  const NullaryOp m_functor;
+  TensorEvaluator<ArgType, Device> m_argImpl;
+  const internal::nullary_wrapper<CoeffReturnType,NullaryOp> m_wrapper;
+};
+
+
+
+// -------------------- CwiseUnaryOp --------------------
+
+template<typename UnaryOp, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
+{
+  typedef TensorCwiseUnaryOp<UnaryOp, ArgType> XprType;
+
+  enum {
+    IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess & internal::functor_traits<UnaryOp>::PacketAccess,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
+    : m_functor(op.functor()),
+      m_argImpl(op.nestedExpression(), device)
+  { }
+
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
+
+  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+    m_argImpl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+    m_argImpl.cleanup();
+  }
+
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+  {
+    return m_functor(m_argImpl.coeff(index));
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+  {
+    return m_functor.packetOp(m_argImpl.template packet<LoadMode>(index));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    const double functor_cost = internal::functor_traits<UnaryOp>::Cost;
+    return m_argImpl.costPerCoeff(vectorized) +
+        TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
+  }
+
+  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
+
+  /// required by sycl in order to extract the accessor
+  const TensorEvaluator<ArgType, Device> & impl() const { return m_argImpl; }
+  /// added for sycl in order to construct the buffer from sycl device
+  UnaryOp functor() const { return m_functor; }
+
+
+ private:
+  const UnaryOp m_functor;
+  TensorEvaluator<ArgType, Device> m_argImpl;
+};
+
+
+// -------------------- CwiseBinaryOp --------------------
+
+template<typename BinaryOp, typename LeftArgType, typename RightArgType, typename Device>
+struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType>, Device>
+{
+  typedef TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType> XprType;
+
+  enum {
+    IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned & TensorEvaluator<RightArgType, Device>::IsAligned,
+    PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess &
+                   internal::functor_traits<BinaryOp>::PacketAccess,
+    Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
+    : m_functor(op.functor()),
+      m_leftImpl(op.lhsExpression(), device),
+      m_rightImpl(op.rhsExpression(), device)
+  {
+    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout) || internal::traits<XprType>::NumDimensions <= 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions()));
+  }
+
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  typedef typename TensorEvaluator<LeftArgType, Device>::Dimensions Dimensions;
+
+  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
+  {
+    // TODO: use right impl instead if right impl dimensions are known at compile time.
+    return m_leftImpl.dimensions();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
+    m_leftImpl.evalSubExprsIfNeeded(NULL);
+    m_rightImpl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+    m_leftImpl.cleanup();
+    m_rightImpl.cleanup();
+  }
+
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+  {
+    return m_functor(m_leftImpl.coeff(index), m_rightImpl.coeff(index));
+  }
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+  {
+    return m_functor.packetOp(m_leftImpl.template packet<LoadMode>(index), m_rightImpl.template packet<LoadMode>(index));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+  costPerCoeff(bool vectorized) const {
+    const double functor_cost = internal::functor_traits<BinaryOp>::Cost;
+    return m_leftImpl.costPerCoeff(vectorized) +
+           m_rightImpl.costPerCoeff(vectorized) +
+           TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
+  }
+
+  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
+  /// required by sycl in order to extract the accessor
+  const TensorEvaluator<LeftArgType, Device>& left_impl() const { return m_leftImpl; }
+  /// required by sycl in order to extract the accessor
+  const TensorEvaluator<RightArgType, Device>& right_impl() const { return m_rightImpl; }
+  /// required by sycl in order to extract the accessor
+  BinaryOp functor() const { return m_functor; }
+
+ private:
+  const BinaryOp m_functor;
+  TensorEvaluator<LeftArgType, Device> m_leftImpl;
+  TensorEvaluator<RightArgType, Device> m_rightImpl;
+};
+
+// -------------------- CwiseTernaryOp --------------------
+
+template<typename TernaryOp, typename Arg1Type, typename Arg2Type, typename Arg3Type, typename Device>
+struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type, Arg3Type>, Device>
+{
+  typedef TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type, Arg3Type> XprType;
+
+  enum {
+    IsAligned = TensorEvaluator<Arg1Type, Device>::IsAligned & TensorEvaluator<Arg2Type, Device>::IsAligned & TensorEvaluator<Arg3Type, Device>::IsAligned,
+    PacketAccess = TensorEvaluator<Arg1Type, Device>::PacketAccess & TensorEvaluator<Arg2Type, Device>::PacketAccess & TensorEvaluator<Arg3Type, Device>::PacketAccess &
+                   internal::functor_traits<TernaryOp>::PacketAccess,
+    Layout = TensorEvaluator<Arg1Type, Device>::Layout,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
+    : m_functor(op.functor()),
+      m_arg1Impl(op.arg1Expression(), device),
+      m_arg2Impl(op.arg2Expression(), device),
+      m_arg3Impl(op.arg3Expression(), device)
+  {
+    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<Arg1Type, Device>::Layout) == static_cast<int>(TensorEvaluator<Arg3Type, Device>::Layout) || internal::traits<XprType>::NumDimensions <= 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+    EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::StorageKind,
+                         typename internal::traits<Arg2Type>::StorageKind>::value),
+                        STORAGE_KIND_MUST_MATCH)
+    EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::StorageKind,
+                         typename internal::traits<Arg3Type>::StorageKind>::value),
+                        STORAGE_KIND_MUST_MATCH)
+    EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::Index,
+                         typename internal::traits<Arg2Type>::Index>::value),
+                        STORAGE_INDEX_MUST_MATCH)
+    EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::Index,
+                         typename internal::traits<Arg3Type>::Index>::value),
+                        STORAGE_INDEX_MUST_MATCH)
+
+    eigen_assert(dimensions_match(m_arg1Impl.dimensions(), m_arg2Impl.dimensions()) && dimensions_match(m_arg1Impl.dimensions(), m_arg3Impl.dimensions()));
+  }
+
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  typedef typename TensorEvaluator<Arg1Type, Device>::Dimensions Dimensions;
+
+  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
+  {
+    // TODO: use arg2 or arg3 dimensions if they are known at compile time.
+    return m_arg1Impl.dimensions();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
+    m_arg1Impl.evalSubExprsIfNeeded(NULL);
+    m_arg2Impl.evalSubExprsIfNeeded(NULL);
+    m_arg3Impl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+    m_arg1Impl.cleanup();
+    m_arg2Impl.cleanup();
+    m_arg3Impl.cleanup();
+  }
+
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+  {
+    return m_functor(m_arg1Impl.coeff(index), m_arg2Impl.coeff(index), m_arg3Impl.coeff(index));
+  }
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+  {
+    return m_functor.packetOp(m_arg1Impl.template packet<LoadMode>(index),
+                              m_arg2Impl.template packet<LoadMode>(index),
+                              m_arg3Impl.template packet<LoadMode>(index));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+  costPerCoeff(bool vectorized) const {
+    const double functor_cost = internal::functor_traits<TernaryOp>::Cost;
+    return m_arg1Impl.costPerCoeff(vectorized) +
+           m_arg2Impl.costPerCoeff(vectorized) +
+           m_arg3Impl.costPerCoeff(vectorized) +
+           TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
+  }
+
+  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
+
+  /// required by sycl in order to extract the accessor
+  const TensorEvaluator<Arg1Type, Device> & arg1Impl() const { return m_arg1Impl; }
+  /// required by sycl in order to extract the accessor
+  const TensorEvaluator<Arg2Type, Device>& arg2Impl() const { return m_arg2Impl; }
+  /// required by sycl in order to extract the accessor
+  const TensorEvaluator<Arg3Type, Device>& arg3Impl() const { return m_arg3Impl; }
+
+ private:
+  const TernaryOp m_functor;
+  TensorEvaluator<Arg1Type, Device> m_arg1Impl;
+  TensorEvaluator<Arg2Type, Device> m_arg2Impl;
+  TensorEvaluator<Arg3Type, Device> m_arg3Impl;
+};
+
+
+// -------------------- SelectOp --------------------
+
+template<typename IfArgType, typename ThenArgType, typename ElseArgType, typename Device>
+struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>, Device>
+{
+  typedef TensorSelectOp<IfArgType, ThenArgType, ElseArgType> XprType;
+  typedef typename XprType::Scalar Scalar;
+
+  enum {
+    IsAligned = TensorEvaluator<ThenArgType, Device>::IsAligned & TensorEvaluator<ElseArgType, Device>::IsAligned,
+    PacketAccess = TensorEvaluator<ThenArgType, Device>::PacketAccess & TensorEvaluator<ElseArgType, Device>::PacketAccess &
+                   internal::packet_traits<Scalar>::HasBlend,
+    Layout = TensorEvaluator<IfArgType, Device>::Layout,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
+    : m_condImpl(op.ifExpression(), device),
+      m_thenImpl(op.thenExpression(), device),
+      m_elseImpl(op.elseExpression(), device)
+  {
+    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<IfArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<ThenArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<IfArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<ElseArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    eigen_assert(dimensions_match(m_condImpl.dimensions(), m_thenImpl.dimensions()));
+    eigen_assert(dimensions_match(m_thenImpl.dimensions(), m_elseImpl.dimensions()));
+  }
+
+  typedef typename XprType::Index Index;
+  typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  typedef typename TensorEvaluator<IfArgType, Device>::Dimensions Dimensions;
+
+  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
+  {
+    // TODO: use then or else impl instead if they happen to be known at compile time.
+    return m_condImpl.dimensions();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
+    m_condImpl.evalSubExprsIfNeeded(NULL);
+    m_thenImpl.evalSubExprsIfNeeded(NULL);
+    m_elseImpl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+    m_condImpl.cleanup();
+    m_thenImpl.cleanup();
+    m_elseImpl.cleanup();
+  }
+
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+  {
+    return m_condImpl.coeff(index) ? m_thenImpl.coeff(index) : m_elseImpl.coeff(index);
+  }
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const
+  {
+    internal::Selector<PacketSize> select;
+    for (Index i = 0; i < PacketSize; ++i) {
+      select.select[i] = m_condImpl.coeff(index+i);
+    }
+    return internal::pblend(select,
+                            m_thenImpl.template packet<LoadMode>(index),
+                            m_elseImpl.template packet<LoadMode>(index));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+  costPerCoeff(bool vectorized) const {
+    return m_condImpl.costPerCoeff(vectorized) +
+           m_thenImpl.costPerCoeff(vectorized)
+        .cwiseMax(m_elseImpl.costPerCoeff(vectorized));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return NULL; }
+  /// required by sycl in order to extract the accessor
+  const TensorEvaluator<IfArgType, Device> & cond_impl() const { return m_condImpl; }
+  /// required by sycl in order to extract the accessor
+  const TensorEvaluator<ThenArgType, Device>& then_impl() const { return m_thenImpl; }
+  /// required by sycl in order to extract the accessor
+  const TensorEvaluator<ElseArgType, Device>& else_impl() const { return m_elseImpl; }
+
+ private:
+  TensorEvaluator<IfArgType, Device> m_condImpl;
+  TensorEvaluator<ThenArgType, Device> m_thenImpl;
+  TensorEvaluator<ElseArgType, Device> m_elseImpl;
+};
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
new file mode 100644
index 000000000..f01d77c0a
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -0,0 +1,288 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
+#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
+
+namespace Eigen {
+
+/** \class TensorExecutor
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief The tensor executor class.
+  *
+  * This class is responsible for launch the evaluation of the expression on
+  * the specified computing device.
+  */
+namespace internal {
+
+// Default strategy: the expression is evaluated with a single cpu thread.
+template<typename Expression, typename Device, bool Vectorizable>
+class TensorExecutor
+{
+ public:
+  typedef typename Expression::Index Index;
+  EIGEN_DEVICE_FUNC
+  static inline void run(const Expression& expr, const Device& device = Device())
+  {
+    TensorEvaluator<Expression, Device> evaluator(expr, device);
+    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+    if (needs_assign)
+    {
+      const Index size = array_prod(evaluator.dimensions());
+      for (Index i = 0; i < size; ++i) {
+        evaluator.evalScalar(i);
+      }
+    }
+    evaluator.cleanup();
+  }
+};
+
+
+template<typename Expression>
+class TensorExecutor<Expression, DefaultDevice, true>
+{
+ public:
+  typedef typename Expression::Index Index;
+  EIGEN_DEVICE_FUNC
+  static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice())
+  {
+    TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
+    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+    if (needs_assign)
+    {
+      const Index size = array_prod(evaluator.dimensions());
+      const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
+      // Give the compiler a strong hint to unroll the loop. But don't insist
+      // on unrolling, because if the function is expensive the compiler should not
+      // unroll the loop at the expense of inlining.
+      const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;
+      for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) {
+        for (Index j = 0; j < 4; j++) {
+          evaluator.evalPacket(i + j * PacketSize);
+        }
+      }
+      const Index VectorizedSize = (size / PacketSize) * PacketSize;
+      for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
+        evaluator.evalPacket(i);
+      }
+      for (Index i = VectorizedSize; i < size; ++i) {
+        evaluator.evalScalar(i);
+      }
+    }
+    evaluator.cleanup();
+  }
+};
+
+
+
+// Multicore strategy: the index space is partitioned and each partition is executed on a single core
+#ifdef EIGEN_USE_THREADS
+template <typename Evaluator, typename Index, bool Vectorizable>
+struct EvalRange {
+  static void run(Evaluator* evaluator_in, const Index first, const Index last) {
+    Evaluator evaluator = *evaluator_in;
+    eigen_assert(last >= first);
+    for (Index i = first; i < last; ++i) {
+      evaluator.evalScalar(i);
+    }
+  }
+
+  static Index alignBlockSize(Index size) {
+    return size;
+  }
+};
+
+template <typename Evaluator, typename Index>
+struct EvalRange<Evaluator, Index, true> {
+  static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+
+  static void run(Evaluator* evaluator_in, const Index first, const Index last) {
+    Evaluator evaluator = *evaluator_in;
+    eigen_assert(last >= first);
+    Index i = first;
+    if (last - first >= PacketSize) {
+      eigen_assert(first % PacketSize == 0);
+      Index last_chunk_offset = last - 4 * PacketSize;
+      // Give the compiler a strong hint to unroll the loop. But don't insist
+      // on unrolling, because if the function is expensive the compiler should not
+      // unroll the loop at the expense of inlining.
+      for (; i <= last_chunk_offset; i += 4*PacketSize) {
+        for (Index j = 0; j < 4; j++) {
+          evaluator.evalPacket(i + j * PacketSize);
+        }
+      }
+      last_chunk_offset = last - PacketSize;
+      for (; i <= last_chunk_offset; i += PacketSize) {
+        evaluator.evalPacket(i);
+      }
+    }
+    for (; i < last; ++i) {
+      evaluator.evalScalar(i);
+    }
+  }
+
+  static Index alignBlockSize(Index size) {
+    // Align block size to packet size and account for unrolling in run above.
+    if (size >= 16 * PacketSize) {
+      return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
+    }
+    // Aligning to 4 * PacketSize would increase block size by more than 25%.
+    return (size + PacketSize - 1) & ~(PacketSize - 1);
+  }
+};
+
+template <typename Expression, bool Vectorizable>
+class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
+ public:
+  typedef typename Expression::Index Index;
+  static inline void run(const Expression& expr, const ThreadPoolDevice& device)
+  {
+    typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
+    Evaluator evaluator(expr, device);
+    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+    if (needs_assign)
+    {
+      const Index size = array_prod(evaluator.dimensions());
+#if !defined(EIGEN_USE_SIMPLE_THREAD_POOL)
+      device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
+                         EvalRange<Evaluator, Index, Vectorizable>::alignBlockSize,
+                         [&evaluator](Index first, Index last) {
+                           EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, first, last);
+                         });
+#else
+      size_t num_threads = device.numThreads();
+      if (num_threads > 1) {
+        num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
+            size, evaluator.costPerCoeff(Vectorizable), num_threads);
+      }
+      if (num_threads == 1) {
+        EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, 0, size);
+      } else {
+        const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
+        Index blocksz = std::ceil<Index>(static_cast<float>(size)/num_threads) + PacketSize - 1;
+        const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
+        const Index numblocks = size / blocksize;
+
+        Barrier barrier(numblocks);
+        for (int i = 0; i < numblocks; ++i) {
+          device.enqueue_with_barrier(
+              &barrier, &EvalRange<Evaluator, Index, Vectorizable>::run,
+              &evaluator, i * blocksize, (i + 1) * blocksize);
+        }
+        if (numblocks * blocksize < size) {
+          EvalRange<Evaluator, Index, Vectorizable>::run(
+              &evaluator, numblocks * blocksize, size);
+        }
+        barrier.Wait();
+      }
+#endif  // defined(!EIGEN_USE_SIMPLE_THREAD_POOL)
+    }
+    evaluator.cleanup();
+  }
+};
+#endif  // EIGEN_USE_THREADS
+
+
+// GPU: the evaluation of the expression is offloaded to a GPU.
+#if defined(EIGEN_USE_GPU)
+
+template <typename Expression, bool Vectorizable>
+class TensorExecutor<Expression, GpuDevice, Vectorizable> {
+ public:
+  typedef typename Expression::Index Index;
+  static void run(const Expression& expr, const GpuDevice& device);
+};
+
+
+#if defined(__CUDACC__)
+template <typename Evaluator, typename Index, bool Vectorizable>
+struct EigenMetaKernelEval {
+  static __device__ EIGEN_ALWAYS_INLINE
+  void run(Evaluator& eval, Index first, Index last, Index step_size) {
+    for (Index i = first; i < last; i += step_size) {
+      eval.evalScalar(i);
+    }
+  }
+};
+
+template <typename Evaluator, typename Index>
+struct EigenMetaKernelEval<Evaluator, Index, true> {
+  static __device__ EIGEN_ALWAYS_INLINE
+  void run(Evaluator& eval, Index first, Index last, Index step_size) {
+    const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+    const Index vectorized_size = (last / PacketSize) * PacketSize;
+    const Index vectorized_step_size = step_size * PacketSize;
+
+    // Use the vector path
+    for (Index i = first * PacketSize; i < vectorized_size;
+         i += vectorized_step_size) {
+      eval.evalPacket(i);
+    }
+    for (Index i = vectorized_size + first; i < last; i += step_size) {
+      eval.evalScalar(i);
+    }
+  }
+};
+
+template <typename Evaluator, typename Index>
+__global__ void
+__launch_bounds__(1024)
+EigenMetaKernel(Evaluator eval, Index size) {
+
+  const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
+  const Index step_size = blockDim.x * gridDim.x;
+
+  const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
+  EigenMetaKernelEval<Evaluator, Index, vectorizable>::run(eval, first_index, size, step_size);
+}
+
+/*static*/
+template <typename Expression, bool Vectorizable>
+inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
+    const Expression& expr, const GpuDevice& device) {
+  TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
+  const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+  if (needs_assign) {
+    const int block_size = device.maxCudaThreadsPerBlock();
+    const int max_blocks = device.getNumCudaMultiProcessors() *
+                           device.maxCudaThreadsPerMultiProcessor() / block_size;
+    const Index size = array_prod(evaluator.dimensions());
+    // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
+    const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);
+
+    LAUNCH_CUDA_KERNEL(
+        (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>),
+        num_blocks, block_size, 0, device, evaluator, size);
+  }
+  evaluator.cleanup();
+}
+
+#endif  // __CUDACC__
+#endif  // EIGEN_USE_GPU
+
+// SYCL Executor policy
+#ifdef EIGEN_USE_SYCL
+
+template <typename Expression, bool Vectorizable>
+class TensorExecutor<Expression, SyclDevice, Vectorizable> {
+public:
+  static inline void run(const Expression &expr, const SyclDevice &device) {
+    // call TensorSYCL module
+    TensorSycl::run(expr, device);
+  }
+};
+
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
new file mode 100644
index 000000000..85dfc7a69
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
@@ -0,0 +1,371 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXPR_H
+#define EIGEN_CXX11_TENSOR_TENSOR_EXPR_H
+
+namespace Eigen {
+
+/** \class TensorExpr
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief Tensor expression classes.
+  *
+  * The TensorCwiseNullaryOp class applies a nullary operators to an expression.
+  * This is typically used to generate constants.
+  *
+  * The TensorCwiseUnaryOp class represents an expression where a unary operator
+  * (e.g. cwiseSqrt) is applied to an expression.
+  *
+  * The TensorCwiseBinaryOp class represents an expression where a binary
+  * operator (e.g. addition) is applied to a lhs and a rhs expression.
+  *
+  */
+namespace internal {
+template<typename NullaryOp, typename XprType>
+struct traits<TensorCwiseNullaryOp<NullaryOp, XprType> >
+    : traits<XprType>
+{
+  typedef traits<XprType> XprTraits;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::Nested XprTypeNested;
+  typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
+  static const int NumDimensions = XprTraits::NumDimensions;
+  static const int Layout = XprTraits::Layout;
+
+  enum {
+    Flags = 0
+  };
+};
+
+}  // end namespace internal
+
+
+
+template<typename NullaryOp, typename XprType>
+class TensorCwiseNullaryOp : public TensorBase<TensorCwiseNullaryOp<NullaryOp, XprType>, ReadOnlyAccessors>
+{
+  public:
+    typedef typename Eigen::internal::traits<TensorCwiseNullaryOp>::Scalar Scalar;
+    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+    typedef typename XprType::CoeffReturnType CoeffReturnType;
+    typedef TensorCwiseNullaryOp<NullaryOp, XprType> Nested;
+    typedef typename Eigen::internal::traits<TensorCwiseNullaryOp>::StorageKind StorageKind;
+    typedef typename Eigen::internal::traits<TensorCwiseNullaryOp>::Index Index;
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseNullaryOp(const XprType& xpr, const NullaryOp& func = NullaryOp())
+        : m_xpr(xpr), m_functor(func) {}
+
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename XprType::Nested>::type&
+    nestedExpression() const { return m_xpr; }
+
+    EIGEN_DEVICE_FUNC
+    const NullaryOp& functor() const { return m_functor; }
+
+  protected:
+    typename XprType::Nested m_xpr;
+    const NullaryOp m_functor;
+};
+
+
+
+namespace internal {
+template<typename UnaryOp, typename XprType>
+struct traits<TensorCwiseUnaryOp<UnaryOp, XprType> >
+    : traits<XprType>
+{
+  // TODO(phli): Add InputScalar, InputPacket.  Check references to
+  // current Scalar/Packet to see if the intent is Input or Output.
+  typedef typename result_of<UnaryOp(typename XprType::Scalar)>::type Scalar;
+  typedef traits<XprType> XprTraits;
+  typedef typename XprType::Nested XprTypeNested;
+  typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
+  static const int NumDimensions = XprTraits::NumDimensions;
+  static const int Layout = XprTraits::Layout;
+};
+
+template<typename UnaryOp, typename XprType>
+struct eval<TensorCwiseUnaryOp<UnaryOp, XprType>, Eigen::Dense>
+{
+  typedef const TensorCwiseUnaryOp<UnaryOp, XprType>& type;
+};
+
+template<typename UnaryOp, typename XprType>
+struct nested<TensorCwiseUnaryOp<UnaryOp, XprType>, 1, typename eval<TensorCwiseUnaryOp<UnaryOp, XprType> >::type>
+{
+  typedef TensorCwiseUnaryOp<UnaryOp, XprType> type;
+};
+
+}  // end namespace internal
+
+
+
+template<typename UnaryOp, typename XprType>
+class TensorCwiseUnaryOp : public TensorBase<TensorCwiseUnaryOp<UnaryOp, XprType>, ReadOnlyAccessors>
+{
+  public:
+    // TODO(phli): Add InputScalar, InputPacket.  Check references to
+    // current Scalar/Packet to see if the intent is Input or Output.
+    typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::Scalar Scalar;
+    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+    typedef Scalar CoeffReturnType;
+    typedef typename Eigen::internal::nested<TensorCwiseUnaryOp>::type Nested;
+    typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::StorageKind StorageKind;
+    typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::Index Index;
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp())
+      : m_xpr(xpr), m_functor(func) {}
+
+    EIGEN_DEVICE_FUNC
+    const UnaryOp& functor() const { return m_functor; }
+
+    /** \returns the nested expression */
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename XprType::Nested>::type&
+    nestedExpression() const { return m_xpr; }
+
+  protected:
+    typename XprType::Nested m_xpr;
+    const UnaryOp m_functor;
+};
+
+
+namespace internal {
+template<typename BinaryOp, typename LhsXprType, typename RhsXprType>
+struct traits<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> >
+{
+  // Type promotion to handle the case where the types of the lhs and the rhs
+  // are different.
+  // TODO(phli): Add Lhs/RhsScalar, Lhs/RhsPacket.  Check references to
+  // current Scalar/Packet to see if the intent is Inputs or Output.
+  typedef typename result_of<
+      BinaryOp(typename LhsXprType::Scalar,
+               typename RhsXprType::Scalar)>::type Scalar;
+  typedef traits<LhsXprType> XprTraits;
+  typedef typename promote_storage_type<
+      typename traits<LhsXprType>::StorageKind,
+      typename traits<RhsXprType>::StorageKind>::ret StorageKind;
+  typedef typename promote_index_type<
+      typename traits<LhsXprType>::Index,
+      typename traits<RhsXprType>::Index>::type Index;
+  typedef typename LhsXprType::Nested LhsNested;
+  typedef typename RhsXprType::Nested RhsNested;
+  typedef typename remove_reference<LhsNested>::type _LhsNested;
+  typedef typename remove_reference<RhsNested>::type _RhsNested;
+  static const int NumDimensions = XprTraits::NumDimensions;
+  static const int Layout = XprTraits::Layout;
+
+  enum {
+    Flags = 0
+  };
+};
+
+template<typename BinaryOp, typename LhsXprType, typename RhsXprType>
+struct eval<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>, Eigen::Dense>
+{
+  typedef const TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>& type;
+};
+
+template<typename BinaryOp, typename LhsXprType, typename RhsXprType>
+struct nested<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>, 1, typename eval<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> >::type>
+{
+  typedef TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> type;
+};
+
+}  // end namespace internal
+
+
+
+template<typename BinaryOp, typename LhsXprType, typename RhsXprType>
+class TensorCwiseBinaryOp : public TensorBase<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>, ReadOnlyAccessors>
+{
+  public:
+    // TODO(phli): Add Lhs/RhsScalar, Lhs/RhsPacket.  Check references to
+    // current Scalar/Packet to see if the intent is Inputs or Output.
+    typedef typename Eigen::internal::traits<TensorCwiseBinaryOp>::Scalar Scalar;
+    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+    typedef Scalar CoeffReturnType;
+    typedef typename Eigen::internal::nested<TensorCwiseBinaryOp>::type Nested;
+    typedef typename Eigen::internal::traits<TensorCwiseBinaryOp>::StorageKind StorageKind;
+    typedef typename Eigen::internal::traits<TensorCwiseBinaryOp>::Index Index;
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, const BinaryOp& func = BinaryOp())
+        : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_functor(func) {}
+
+    EIGEN_DEVICE_FUNC
+    const BinaryOp& functor() const { return m_functor; }
+
+    /** \returns the nested expressions */
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename LhsXprType::Nested>::type&
+    lhsExpression() const { return m_lhs_xpr; }
+
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename RhsXprType::Nested>::type&
+    rhsExpression() const { return m_rhs_xpr; }
+
+  protected:
+    typename LhsXprType::Nested m_lhs_xpr;
+    typename RhsXprType::Nested m_rhs_xpr;
+    const BinaryOp m_functor;
+};
+
+
+namespace internal {
+template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
+struct traits<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType> >
+{
+  // Type promotion to handle the case where the types of the args are different.
+  typedef typename result_of<
+      TernaryOp(typename Arg1XprType::Scalar,
+                typename Arg2XprType::Scalar,
+                typename Arg3XprType::Scalar)>::type Scalar;
+  typedef traits<Arg1XprType> XprTraits;
+  typedef typename traits<Arg1XprType>::StorageKind StorageKind;
+  typedef typename traits<Arg1XprType>::Index Index;
+  typedef typename Arg1XprType::Nested Arg1Nested;
+  typedef typename Arg2XprType::Nested Arg2Nested;
+  typedef typename Arg3XprType::Nested Arg3Nested;
+  typedef typename remove_reference<Arg1Nested>::type _Arg1Nested;
+  typedef typename remove_reference<Arg2Nested>::type _Arg2Nested;
+  typedef typename remove_reference<Arg3Nested>::type _Arg3Nested;
+  static const int NumDimensions = XprTraits::NumDimensions;
+  static const int Layout = XprTraits::Layout;
+
+  enum {
+    Flags = 0
+  };
+};
+
+template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
+struct eval<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>, Eigen::Dense>
+{
+  typedef const TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>& type;
+};
+
+template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
+struct nested<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>, 1, typename eval<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType> >::type>
+{
+  typedef TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType> type;
+};
+
+}  // end namespace internal
+
+
+
+template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
+class TensorCwiseTernaryOp : public TensorBase<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>, ReadOnlyAccessors>
+{
+  public:
+    typedef typename Eigen::internal::traits<TensorCwiseTernaryOp>::Scalar Scalar;
+    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+    typedef Scalar CoeffReturnType;
+    typedef typename Eigen::internal::nested<TensorCwiseTernaryOp>::type Nested;
+    typedef typename Eigen::internal::traits<TensorCwiseTernaryOp>::StorageKind StorageKind;
+    typedef typename Eigen::internal::traits<TensorCwiseTernaryOp>::Index Index;
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseTernaryOp(const Arg1XprType& arg1, const Arg2XprType& arg2, const Arg3XprType& arg3, const TernaryOp& func = TernaryOp())
+        : m_arg1_xpr(arg1), m_arg2_xpr(arg2), m_arg3_xpr(arg3), m_functor(func) {}
+
+    EIGEN_DEVICE_FUNC
+    const TernaryOp& functor() const { return m_functor; }
+
+    /** \returns the nested expressions */
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename Arg1XprType::Nested>::type&
+    arg1Expression() const { return m_arg1_xpr; }
+
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename Arg2XprType::Nested>::type&
+    arg2Expression() const { return m_arg2_xpr; }
+
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename Arg3XprType::Nested>::type&
+    arg3Expression() const { return m_arg3_xpr; }
+
+  protected:
+    typename Arg1XprType::Nested m_arg1_xpr;
+    typename Arg2XprType::Nested m_arg2_xpr;
+    typename Arg3XprType::Nested m_arg3_xpr;
+    const TernaryOp m_functor;
+};
+
+
+namespace internal {
+template<typename IfXprType, typename ThenXprType, typename ElseXprType>
+struct traits<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> >
+    : traits<ThenXprType>
+{
+  typedef typename traits<ThenXprType>::Scalar Scalar;
+  typedef traits<ThenXprType> XprTraits;
+  typedef typename promote_storage_type<typename traits<ThenXprType>::StorageKind,
+                                        typename traits<ElseXprType>::StorageKind>::ret StorageKind;
+  typedef typename promote_index_type<typename traits<ElseXprType>::Index,
+                                      typename traits<ThenXprType>::Index>::type Index;
+  typedef typename IfXprType::Nested IfNested;
+  typedef typename ThenXprType::Nested ThenNested;
+  typedef typename ElseXprType::Nested ElseNested;
+  static const int NumDimensions = XprTraits::NumDimensions;
+  static const int Layout = XprTraits::Layout;
+};
+
+template<typename IfXprType, typename ThenXprType, typename ElseXprType>
+struct eval<TensorSelectOp<IfXprType, ThenXprType, ElseXprType>, Eigen::Dense>
+{
+  typedef const TensorSelectOp<IfXprType, ThenXprType, ElseXprType>& type;
+};
+
+template<typename IfXprType, typename ThenXprType, typename ElseXprType>
+struct nested<TensorSelectOp<IfXprType, ThenXprType, ElseXprType>, 1, typename eval<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> >::type>
+{
+  typedef TensorSelectOp<IfXprType, ThenXprType, ElseXprType> type;
+};
+
+}  // end namespace internal
+
+
+template<typename IfXprType, typename ThenXprType, typename ElseXprType>
+class TensorSelectOp : public TensorBase<TensorSelectOp<IfXprType, ThenXprType, ElseXprType>, ReadOnlyAccessors>
+{
+  public:
+    typedef typename Eigen::internal::traits<TensorSelectOp>::Scalar Scalar;
+    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+    typedef typename internal::promote_storage_type<typename ThenXprType::CoeffReturnType,
+                                                    typename ElseXprType::CoeffReturnType>::ret CoeffReturnType;
+    typedef typename Eigen::internal::nested<TensorSelectOp>::type Nested;
+    typedef typename Eigen::internal::traits<TensorSelectOp>::StorageKind StorageKind;
+    typedef typename Eigen::internal::traits<TensorSelectOp>::Index Index;
+
+    EIGEN_DEVICE_FUNC
+    TensorSelectOp(const IfXprType& a_condition,
+                   const ThenXprType& a_then,
+                   const ElseXprType& a_else)
+      : m_condition(a_condition), m_then(a_then), m_else(a_else)
+    { }
+
+    EIGEN_DEVICE_FUNC
+    const IfXprType& ifExpression() const { return m_condition; }
+
+    EIGEN_DEVICE_FUNC
+    const ThenXprType& thenExpression() const { return m_then; }
+
+    EIGEN_DEVICE_FUNC
+    const ElseXprType& elseExpression() const { return m_else; }
+
+  protected:
+    typename IfXprType::Nested m_condition;
+    typename ThenXprType::Nested m_then;
+    typename ElseXprType::Nested m_else;
+};
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_EXPR_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
new file mode 100644
index 000000000..08eb5595a
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
@@ -0,0 +1,651 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Jianwei Cui <thucjw@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_FFT_H
+#define EIGEN_CXX11_TENSOR_TENSOR_FFT_H
+
+// This code requires the ability to initialize arrays of constant
+// values directly inside a class.
+#if __cplusplus >= 201103L || EIGEN_COMP_MSVC >= 1900
+
+namespace Eigen {
+
+/** \class TensorFFT
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief Tensor FFT class.
+  *
+  * TODO:
+  * Vectorize the Cooley Tukey and the Bluestein algorithm
+  * Add support for multithreaded evaluation
+  * Improve the performance on GPU
+  */
+
+template <bool NeedUprade> struct MakeComplex {
+  template <typename T>
+  EIGEN_DEVICE_FUNC
+  T operator() (const T& val) const { return val; }
+};
+
+template <> struct MakeComplex<true> {
+  template <typename T>
+  EIGEN_DEVICE_FUNC
+  std::complex<T> operator() (const T& val) const { return std::complex<T>(val, 0); }
+};
+
+template <> struct MakeComplex<false> {
+  template <typename T>
+  EIGEN_DEVICE_FUNC
+  std::complex<T> operator() (const std::complex<T>& val) const { return val; }
+};
+
+template <int ResultType> struct PartOf {
+  template <typename T> T operator() (const T& val) const { return val; }
+};
+
+template <> struct PartOf<RealPart> {
+  template <typename T> T operator() (const std::complex<T>& val) const { return val.real(); }
+};
+
+template <> struct PartOf<ImagPart> {
+  template <typename T> T operator() (const std::complex<T>& val) const { return val.imag(); }
+};
+
+namespace internal {
+template <typename FFT, typename XprType, int FFTResultType, int FFTDir>
+struct traits<TensorFFTOp<FFT, XprType, FFTResultType, FFTDir> > : public traits<XprType> {
+  typedef traits<XprType> XprTraits;
+  typedef typename NumTraits<typename XprTraits::Scalar>::Real RealScalar;
+  typedef typename std::complex<RealScalar> ComplexScalar;
+  typedef typename XprTraits::Scalar InputScalar;
+  typedef typename conditional<FFTResultType == RealPart || FFTResultType == ImagPart, RealScalar, ComplexScalar>::type OutputScalar;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef typename XprType::Nested Nested;
+  typedef typename remove_reference<Nested>::type _Nested;
+  static const int NumDimensions = XprTraits::NumDimensions;
+  static const int Layout = XprTraits::Layout;
+};
+
+template <typename FFT, typename XprType, int FFTResultType, int FFTDirection>
+struct eval<TensorFFTOp<FFT, XprType, FFTResultType, FFTDirection>, Eigen::Dense> {
+  typedef const TensorFFTOp<FFT, XprType, FFTResultType, FFTDirection>& type;
+};
+
+template <typename FFT, typename XprType, int FFTResultType, int FFTDirection>
+struct nested<TensorFFTOp<FFT, XprType, FFTResultType, FFTDirection>, 1, typename eval<TensorFFTOp<FFT, XprType, FFTResultType, FFTDirection> >::type> {
+  typedef TensorFFTOp<FFT, XprType, FFTResultType, FFTDirection> type;
+};
+
+}  // end namespace internal
+
+template <typename FFT, typename XprType, int FFTResultType, int FFTDir>
+class TensorFFTOp : public TensorBase<TensorFFTOp<FFT, XprType, FFTResultType, FFTDir>, ReadOnlyAccessors> {
+ public:
+  typedef typename Eigen::internal::traits<TensorFFTOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename std::complex<RealScalar> ComplexScalar;
+  typedef typename internal::conditional<FFTResultType == RealPart || FFTResultType == ImagPart, RealScalar, ComplexScalar>::type OutputScalar;
+  typedef OutputScalar CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorFFTOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorFFTOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorFFTOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFFTOp(const XprType& expr, const FFT& fft)
+      : m_xpr(expr), m_fft(fft) {}
+
+  EIGEN_DEVICE_FUNC
+  const FFT& fft() const { return m_fft; }
+
+  EIGEN_DEVICE_FUNC
+  const typename internal::remove_all<typename XprType::Nested>::type& expression() const {
+    return m_xpr;
+  }
+
+ protected:
+  typename XprType::Nested m_xpr;
+  const FFT m_fft;
+};
+
+// Eval as rvalue
+template <typename FFT, typename ArgType, typename Device, int FFTResultType, int FFTDir>
+struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, Device> {
+  typedef TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir> XprType;
+  typedef typename XprType::Index Index;
+  static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+  typedef DSizes<Index, NumDims> Dimensions;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename std::complex<RealScalar> ComplexScalar;
+  typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
+  typedef internal::traits<XprType> XprTraits;
+  typedef typename XprTraits::Scalar InputScalar;
+  typedef typename internal::conditional<FFTResultType == RealPart || FFTResultType == ImagPart, RealScalar, ComplexScalar>::type OutputScalar;
+  typedef OutputScalar CoeffReturnType;
+  typedef typename PacketType<OutputScalar, Device>::type PacketReturnType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+
+  enum {
+    IsAligned = false,
+    PacketAccess = true,
+    BlockAccess = false,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,
+    RawAccess = false
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_fft(op.fft()), m_impl(op.expression(), device), m_data(NULL), m_device(device) {
+    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+    for (int i = 0; i < NumDims; ++i) {
+      eigen_assert(input_dims[i] > 0);
+      m_dimensions[i] = input_dims[i];
+    }
+
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      m_strides[0] = 1;
+      for (int i = 1; i < NumDims; ++i) {
+        m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1];
+      }
+    } else {
+      m_strides[NumDims - 1] = 1;
+      for (int i = NumDims - 2; i >= 0; --i) {
+        m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1];
+      }
+    }
+    m_size = m_dimensions.TotalSize();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
+    return m_dimensions;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(OutputScalar* data) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    if (data) {
+      evalToBuf(data);
+      return false;
+    } else {
+      m_data = (CoeffReturnType*)m_device.allocate(sizeof(CoeffReturnType) * m_size);
+      evalToBuf(m_data);
+      return true;
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+    if (m_data) {
+      m_device.deallocate(m_data);
+      m_data = NULL;
+    }
+    m_impl.cleanup();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const {
+    return m_data[index];
+  }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType
+  packet(Index index) const {
+    return internal::ploadt<PacketReturnType, LoadMode>(m_data + index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+  costPerCoeff(bool vectorized) const {
+    return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
+  }
+
+  EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; }
+
+
+ private:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalToBuf(OutputScalar* data) {
+    const bool write_to_out = internal::is_same<OutputScalar, ComplexScalar>::value;
+    ComplexScalar* buf = write_to_out ? (ComplexScalar*)data : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * m_size);
+
+    for (Index i = 0; i < m_size; ++i) {
+      buf[i] = MakeComplex<internal::is_same<InputScalar, RealScalar>::value>()(m_impl.coeff(i));
+    }
+
+    for (size_t i = 0; i < m_fft.size(); ++i) {
+      Index dim = m_fft[i];
+      eigen_assert(dim >= 0 && dim < NumDims);
+      Index line_len = m_dimensions[dim];
+      eigen_assert(line_len >= 1);
+      ComplexScalar* line_buf = (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * line_len);
+      const bool is_power_of_two = isPowerOfTwo(line_len);
+      const Index good_composite = is_power_of_two ? 0 : findGoodComposite(line_len);
+      const Index log_len = is_power_of_two ? getLog2(line_len) : getLog2(good_composite);
+
+      ComplexScalar* a = is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * good_composite);
+      ComplexScalar* b = is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * good_composite);
+      ComplexScalar* pos_j_base_powered = is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * (line_len + 1));
+      if (!is_power_of_two) {
+        // Compute twiddle factors
+        //   t_n = exp(sqrt(-1) * pi * n^2 / line_len)
+        // for n = 0, 1,..., line_len-1.
+        // For n > 2 we use the recurrence t_n = t_{n-1}^2 / t_{n-2} * t_1^2
+        pos_j_base_powered[0] = ComplexScalar(1, 0);
+        if (line_len > 1) {
+          const RealScalar pi_over_len(EIGEN_PI / line_len);
+          const ComplexScalar pos_j_base = ComplexScalar(
+	       std::cos(pi_over_len), std::sin(pi_over_len));
+          pos_j_base_powered[1] = pos_j_base;
+          if (line_len > 2) {
+            const ComplexScalar pos_j_base_sq = pos_j_base * pos_j_base;
+            for (int j = 2; j < line_len + 1; ++j) {
+              pos_j_base_powered[j] = pos_j_base_powered[j - 1] *
+                                      pos_j_base_powered[j - 1] /
+                                      pos_j_base_powered[j - 2] * pos_j_base_sq;
+            }
+          }
+        }
+      }
+
+      for (Index partial_index = 0; partial_index < m_size / line_len; ++partial_index) {
+        const Index base_offset = getBaseOffsetFromIndex(partial_index, dim);
+
+        // get data into line_buf
+        const Index stride = m_strides[dim];
+        if (stride == 1) {
+          memcpy(line_buf, &buf[base_offset], line_len*sizeof(ComplexScalar));
+        } else {
+          Index offset = base_offset;
+          for (int j = 0; j < line_len; ++j, offset += stride) {
+            line_buf[j] = buf[offset];
+          }
+        }
+
+        // processs the line
+        if (is_power_of_two) {
+          processDataLineCooleyTukey(line_buf, line_len, log_len);
+        }
+        else {
+          processDataLineBluestein(line_buf, line_len, good_composite, log_len, a, b, pos_j_base_powered);
+        }
+
+        // write back
+        if (FFTDir == FFT_FORWARD && stride == 1) {
+          memcpy(&buf[base_offset], line_buf, line_len*sizeof(ComplexScalar));
+        } else {
+          Index offset = base_offset;
+          const ComplexScalar div_factor =  ComplexScalar(1.0 / line_len, 0);
+          for (int j = 0; j < line_len; ++j, offset += stride) {
+             buf[offset] = (FFTDir == FFT_FORWARD) ? line_buf[j] : line_buf[j] * div_factor;
+          }
+        }
+      }
+      m_device.deallocate(line_buf);
+      if (!is_power_of_two) {
+        m_device.deallocate(a);
+        m_device.deallocate(b);
+        m_device.deallocate(pos_j_base_powered);
+      }
+    }
+
+    if(!write_to_out) {
+      for (Index i = 0; i < m_size; ++i) {
+        data[i] = PartOf<FFTResultType>()(buf[i]);
+      }
+      m_device.deallocate(buf);
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static bool isPowerOfTwo(Index x) {
+    eigen_assert(x > 0);
+    return !(x & (x - 1));
+  }
+
+  // The composite number for padding, used in Bluestein's FFT algorithm
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Index findGoodComposite(Index n) {
+    Index i = 2;
+    while (i < 2 * n - 1) i *= 2;
+    return i;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Index getLog2(Index m) {
+    Index log2m = 0;
+    while (m >>= 1) log2m++;
+    return log2m;
+  }
+
+  // Call Cooley Tukey algorithm directly, data length must be power of 2
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void processDataLineCooleyTukey(ComplexScalar* line_buf, Index line_len, Index log_len) {
+    eigen_assert(isPowerOfTwo(line_len));
+    scramble_FFT(line_buf, line_len);
+    compute_1D_Butterfly<FFTDir>(line_buf, line_len, log_len);
+  }
+
+  // Call Bluestein's FFT algorithm, m is a good composite number greater than (2 * n - 1), used as the padding length
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void processDataLineBluestein(ComplexScalar* line_buf, Index line_len, Index good_composite, Index log_len, ComplexScalar* a, ComplexScalar* b, const ComplexScalar* pos_j_base_powered) {
+    Index n = line_len;
+    Index m = good_composite;
+    ComplexScalar* data = line_buf;
+
+    for (Index i = 0; i < n; ++i) {
+      if(FFTDir == FFT_FORWARD) {
+        a[i] = data[i] * numext::conj(pos_j_base_powered[i]);
+      }
+      else {
+        a[i] = data[i] * pos_j_base_powered[i];
+      }
+    }
+    for (Index i = n; i < m; ++i) {
+      a[i] = ComplexScalar(0, 0);
+    }
+
+    for (Index i = 0; i < n; ++i) {
+      if(FFTDir == FFT_FORWARD) {
+        b[i] = pos_j_base_powered[i];
+      }
+      else {
+        b[i] = numext::conj(pos_j_base_powered[i]);
+      }
+    }
+    for (Index i = n; i < m - n; ++i) {
+      b[i] = ComplexScalar(0, 0);
+    }
+    for (Index i = m - n; i < m; ++i) {
+      if(FFTDir == FFT_FORWARD) {
+        b[i] = pos_j_base_powered[m-i];
+      }
+      else {
+        b[i] = numext::conj(pos_j_base_powered[m-i]);
+      }
+    }
+
+    scramble_FFT(a, m);
+    compute_1D_Butterfly<FFT_FORWARD>(a, m, log_len);
+
+    scramble_FFT(b, m);
+    compute_1D_Butterfly<FFT_FORWARD>(b, m, log_len);
+
+    for (Index i = 0; i < m; ++i) {
+      a[i] *= b[i];
+    }
+
+    scramble_FFT(a, m);
+    compute_1D_Butterfly<FFT_REVERSE>(a, m, log_len);
+
+    //Do the scaling after ifft
+    for (Index i = 0; i < m; ++i) {
+      a[i] /= m;
+    }
+
+    for (Index i = 0; i < n; ++i) {
+      if(FFTDir == FFT_FORWARD) {
+        data[i] = a[i] * numext::conj(pos_j_base_powered[i]);
+      }
+      else {
+        data[i] = a[i] * pos_j_base_powered[i];
+      }
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void scramble_FFT(ComplexScalar* data, Index n) {
+    eigen_assert(isPowerOfTwo(n));
+    Index j = 1;
+    for (Index i = 1; i < n; ++i){
+      if (j > i) {
+        std::swap(data[j-1], data[i-1]);
+      }
+      Index m = n >> 1;
+      while (m >= 2 && j > m) {
+        j -= m;
+        m >>= 1;
+      }
+      j += m;
+    }
+  }
+
+  template <int Dir>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_2(ComplexScalar* data) {
+    ComplexScalar tmp = data[1];
+    data[1] = data[0] - data[1];
+    data[0] += tmp;
+  }
+
+  template <int Dir>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_4(ComplexScalar* data) {
+    ComplexScalar tmp[4];
+    tmp[0] = data[0] + data[1];
+    tmp[1] = data[0] - data[1];
+    tmp[2] = data[2] + data[3];
+    if (Dir == FFT_FORWARD) {
+      tmp[3] = ComplexScalar(0.0, -1.0) * (data[2] - data[3]);
+    } else {
+      tmp[3] = ComplexScalar(0.0, 1.0) * (data[2] - data[3]);
+    }
+    data[0] = tmp[0] + tmp[2];
+    data[1] = tmp[1] + tmp[3];
+    data[2] = tmp[0] - tmp[2];
+    data[3] = tmp[1] - tmp[3];
+  }
+
+  template <int Dir>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_8(ComplexScalar* data) {
+    ComplexScalar tmp_1[8];
+    ComplexScalar tmp_2[8];
+
+    tmp_1[0] = data[0] + data[1];
+    tmp_1[1] = data[0] - data[1];
+    tmp_1[2] = data[2] + data[3];
+    if (Dir == FFT_FORWARD) {
+      tmp_1[3] = (data[2] - data[3]) * ComplexScalar(0, -1);
+    } else {
+      tmp_1[3] = (data[2] - data[3]) * ComplexScalar(0, 1);
+    }
+    tmp_1[4] = data[4] + data[5];
+    tmp_1[5] = data[4] - data[5];
+    tmp_1[6] = data[6] + data[7];
+    if (Dir == FFT_FORWARD) {
+      tmp_1[7] = (data[6] - data[7]) * ComplexScalar(0, -1);
+    } else {
+      tmp_1[7] = (data[6] - data[7]) * ComplexScalar(0, 1);
+    }
+    tmp_2[0] = tmp_1[0] + tmp_1[2];
+    tmp_2[1] = tmp_1[1] + tmp_1[3];
+    tmp_2[2] = tmp_1[0] - tmp_1[2];
+    tmp_2[3] = tmp_1[1] - tmp_1[3];
+    tmp_2[4] = tmp_1[4] + tmp_1[6];
+// SQRT2DIV2 = sqrt(2)/2
+#define SQRT2DIV2 0.7071067811865476
+    if (Dir == FFT_FORWARD) {
+      tmp_2[5] = (tmp_1[5] + tmp_1[7]) * ComplexScalar(SQRT2DIV2, -SQRT2DIV2);
+      tmp_2[6] = (tmp_1[4] - tmp_1[6]) * ComplexScalar(0, -1);
+      tmp_2[7] = (tmp_1[5] - tmp_1[7]) * ComplexScalar(-SQRT2DIV2, -SQRT2DIV2);
+    } else {
+      tmp_2[5] = (tmp_1[5] + tmp_1[7]) * ComplexScalar(SQRT2DIV2, SQRT2DIV2);
+      tmp_2[6] = (tmp_1[4] - tmp_1[6]) * ComplexScalar(0, 1);
+      tmp_2[7] = (tmp_1[5] - tmp_1[7]) * ComplexScalar(-SQRT2DIV2, SQRT2DIV2);
+    }
+    data[0] = tmp_2[0] + tmp_2[4];
+    data[1] = tmp_2[1] + tmp_2[5];
+    data[2] = tmp_2[2] + tmp_2[6];
+    data[3] = tmp_2[3] + tmp_2[7];
+    data[4] = tmp_2[0] - tmp_2[4];
+    data[5] = tmp_2[1] - tmp_2[5];
+    data[6] = tmp_2[2] - tmp_2[6];
+    data[7] = tmp_2[3] - tmp_2[7];
+  }
+
+  template <int Dir>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_1D_merge(
+      ComplexScalar* data, Index n, Index n_power_of_2) {
+    // Original code:
+    // RealScalar wtemp = std::sin(M_PI/n);
+    // RealScalar wpi =  -std::sin(2 * M_PI/n);
+    const RealScalar wtemp = m_sin_PI_div_n_LUT[n_power_of_2];
+    const RealScalar wpi = (Dir == FFT_FORWARD)
+                               ? m_minus_sin_2_PI_div_n_LUT[n_power_of_2]
+                               : -m_minus_sin_2_PI_div_n_LUT[n_power_of_2];
+
+    const ComplexScalar wp(wtemp, wpi);
+    const ComplexScalar wp_one = wp + ComplexScalar(1, 0);
+    const ComplexScalar wp_one_2 = wp_one * wp_one;
+    const ComplexScalar wp_one_3 = wp_one_2 * wp_one;
+    const ComplexScalar wp_one_4 = wp_one_3 * wp_one;
+    const Index n2 = n / 2;
+    ComplexScalar w(1.0, 0.0);
+    for (Index i = 0; i < n2; i += 4) {
+       ComplexScalar temp0(data[i + n2] * w);
+       ComplexScalar temp1(data[i + 1 + n2] * w * wp_one);
+       ComplexScalar temp2(data[i + 2 + n2] * w * wp_one_2);
+       ComplexScalar temp3(data[i + 3 + n2] * w * wp_one_3);
+       w = w * wp_one_4;
+
+       data[i + n2] = data[i] - temp0;
+       data[i] += temp0;
+
+       data[i + 1 + n2] = data[i + 1] - temp1;
+       data[i + 1] += temp1;
+
+       data[i + 2 + n2] = data[i + 2] - temp2;
+       data[i + 2] += temp2;
+
+       data[i + 3 + n2] = data[i + 3] - temp3;
+       data[i + 3] += temp3;
+    }
+  }
+
+ template <int Dir>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_1D_Butterfly(
+      ComplexScalar* data, Index n, Index n_power_of_2) {
+    eigen_assert(isPowerOfTwo(n));
+    if (n > 8) {
+      compute_1D_Butterfly<Dir>(data, n / 2, n_power_of_2 - 1);
+      compute_1D_Butterfly<Dir>(data + n / 2, n / 2, n_power_of_2 - 1);
+      butterfly_1D_merge<Dir>(data, n, n_power_of_2);
+    } else if (n == 8) {
+      butterfly_8<Dir>(data);
+    } else if (n == 4) {
+      butterfly_4<Dir>(data);
+    } else if (n == 2) {
+      butterfly_2<Dir>(data);
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index getBaseOffsetFromIndex(Index index, Index omitted_dim) const {
+    Index result = 0;
+
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = NumDims - 1; i > omitted_dim; --i) {
+        const Index partial_m_stride = m_strides[i] / m_dimensions[omitted_dim];
+        const Index idx = index / partial_m_stride;
+        index -= idx * partial_m_stride;
+        result += idx * m_strides[i];
+      }
+      result += index;
+    }
+    else {
+      for (Index i = 0; i < omitted_dim; ++i) {
+        const Index partial_m_stride = m_strides[i] / m_dimensions[omitted_dim];
+        const Index idx = index / partial_m_stride;
+        index -= idx * partial_m_stride;
+        result += idx * m_strides[i];
+      }
+      result += index;
+    }
+    // Value of index_coords[omitted_dim] is not determined to this step
+    return result;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index getIndexFromOffset(Index base, Index omitted_dim, Index offset) const {
+    Index result = base + offset * m_strides[omitted_dim] ;
+    return result;
+  }
+
+ protected:
+  Index m_size;
+  const FFT& m_fft;
+  Dimensions m_dimensions;
+  array<Index, NumDims> m_strides;
+  TensorEvaluator<ArgType, Device> m_impl;
+  CoeffReturnType* m_data;
+  const Device& m_device;
+
+  // This will support a maximum FFT size of 2^32 for each dimension
+  // m_sin_PI_div_n_LUT[i] = (-2) * std::sin(M_PI / std::pow(2,i)) ^ 2;
+  const RealScalar m_sin_PI_div_n_LUT[32] = {
+    RealScalar(0.0),
+    RealScalar(-2),
+    RealScalar(-0.999999999999999),
+    RealScalar(-0.292893218813453),
+    RealScalar(-0.0761204674887130),
+    RealScalar(-0.0192147195967696),
+    RealScalar(-0.00481527332780311),
+    RealScalar(-0.00120454379482761),
+    RealScalar(-3.01181303795779e-04),
+    RealScalar(-7.52981608554592e-05),
+    RealScalar(-1.88247173988574e-05),
+    RealScalar(-4.70619042382852e-06),
+    RealScalar(-1.17654829809007e-06),
+    RealScalar(-2.94137117780840e-07),
+    RealScalar(-7.35342821488550e-08),
+    RealScalar(-1.83835707061916e-08),
+    RealScalar(-4.59589268710903e-09),
+    RealScalar(-1.14897317243732e-09),
+    RealScalar(-2.87243293150586e-10),
+    RealScalar( -7.18108232902250e-11),
+    RealScalar(-1.79527058227174e-11),
+    RealScalar(-4.48817645568941e-12),
+    RealScalar(-1.12204411392298e-12),
+    RealScalar(-2.80511028480785e-13),
+    RealScalar(-7.01277571201985e-14),
+    RealScalar(-1.75319392800498e-14),
+    RealScalar(-4.38298482001247e-15),
+    RealScalar(-1.09574620500312e-15),
+    RealScalar(-2.73936551250781e-16),
+    RealScalar(-6.84841378126949e-17),
+    RealScalar(-1.71210344531737e-17),
+    RealScalar(-4.28025861329343e-18)
+  };
+
+  // m_minus_sin_2_PI_div_n_LUT[i] = -std::sin(2 * M_PI / std::pow(2,i));
+  const RealScalar m_minus_sin_2_PI_div_n_LUT[32] = {
+    RealScalar(0.0),
+    RealScalar(0.0),
+    RealScalar(-1.00000000000000e+00),
+    RealScalar(-7.07106781186547e-01),
+    RealScalar(-3.82683432365090e-01),
+    RealScalar(-1.95090322016128e-01),
+    RealScalar(-9.80171403295606e-02),
+    RealScalar(-4.90676743274180e-02),
+    RealScalar(-2.45412285229123e-02),
+    RealScalar(-1.22715382857199e-02),
+    RealScalar(-6.13588464915448e-03),
+    RealScalar(-3.06795676296598e-03),
+    RealScalar(-1.53398018628477e-03),
+    RealScalar(-7.66990318742704e-04),
+    RealScalar(-3.83495187571396e-04),
+    RealScalar(-1.91747597310703e-04),
+    RealScalar(-9.58737990959773e-05),
+    RealScalar(-4.79368996030669e-05),
+    RealScalar(-2.39684498084182e-05),
+    RealScalar(-1.19842249050697e-05),
+    RealScalar(-5.99211245264243e-06),
+    RealScalar(-2.99605622633466e-06),
+    RealScalar(-1.49802811316901e-06),
+    RealScalar(-7.49014056584716e-07),
+    RealScalar(-3.74507028292384e-07),
+    RealScalar(-1.87253514146195e-07),
+    RealScalar(-9.36267570730981e-08),
+    RealScalar(-4.68133785365491e-08),
+    RealScalar(-2.34066892682746e-08),
+    RealScalar(-1.17033446341373e-08),
+    RealScalar(-5.85167231706864e-09),
+    RealScalar(-2.92583615853432e-09)
+  };
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_HAS_CONSTEXPR
+
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_FFT_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
new file mode 100644
index 000000000..fcee5f60d
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
@@ -0,0 +1,389 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H
+#define EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H
+
+namespace Eigen {
+
+/** \class TensorFixedSize
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief The fixed sized version of the tensor class.
+  *
+  * The fixed sized equivalent of
+  * Eigen::Tensor<float, 3> t(3, 5, 7);
+  * is
+  * Eigen::TensorFixedSize<float, Size<3,5,7>> t;
+  */
+
+template<typename Scalar_, typename Dimensions_, int Options_, typename IndexType>
+class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_, Options_, IndexType> >
+{
+  public:
+    typedef TensorFixedSize<Scalar_, Dimensions_, Options_, IndexType> Self;
+    typedef TensorBase<TensorFixedSize<Scalar_, Dimensions_, Options_, IndexType> > Base;
+    typedef typename Eigen::internal::nested<Self>::type Nested;
+    typedef typename internal::traits<Self>::StorageKind StorageKind;
+    typedef typename internal::traits<Self>::Index Index;
+    typedef Scalar_ Scalar;
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+    typedef typename Base::CoeffReturnType CoeffReturnType;
+
+    static const int Options = Options_;
+
+    enum {
+      IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0),
+      Layout = Options_ & RowMajor ? RowMajor : ColMajor,
+      CoordAccess = true,
+      RawAccess = true
+    };
+
+  typedef Dimensions_ Dimensions;
+  static const std::size_t NumIndices = Dimensions::count;
+
+  protected:
+  TensorStorage<Scalar, Dimensions, Options> m_storage;
+
+  public:
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index                    rank()                   const { return NumIndices; }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index                    dimension(std::size_t n) const { return m_storage.dimensions()[n]; }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions&        dimensions()             const { return m_storage.dimensions(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index                    size()                   const { return m_storage.size(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar                   *data()                        { return m_storage.data(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar             *data()                  const { return m_storage.data(); }
+
+    // This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
+    // work, because that uses base().coeffRef() - and we don't yet
+    // implement a similar class hierarchy
+    inline Self& base()             { return *this; }
+    inline const Self& base() const { return *this; }
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+    template<typename... IndexTypes>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index firstIndex, IndexTypes... otherIndices) const
+    {
+      // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
+      EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+      return coeff(array<Index, NumIndices>{{firstIndex, otherIndices...}});
+    }
+#endif
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar& coeff(const array<Index, NumIndices>& indices) const
+    {
+      eigen_internal_assert(checkIndexRange(indices));
+      return m_storage.data()[linearizedIndex(indices)];
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const
+    {
+      eigen_internal_assert(index >= 0 && index < size());
+      return m_storage.data()[index];
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar& coeff() const
+    {
+      EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+      return m_storage.data()[0];
+    }
+
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+    template<typename... IndexTypes>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices)
+    {
+      // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
+      EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+      return coeffRef(array<Index, NumIndices>{{firstIndex, otherIndices...}});
+    }
+#endif
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar& coeffRef(const array<Index, NumIndices>& indices)
+    {
+      eigen_internal_assert(checkIndexRange(indices));
+      return m_storage.data()[linearizedIndex(indices)];
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
+    {
+      eigen_internal_assert(index >= 0 && index < size());
+      return m_storage.data()[index];
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar& coeffRef()
+    {
+      EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+      return m_storage.data()[0];
+    }
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+    template<typename... IndexTypes>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) const
+    {
+      // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
+      EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+      return this->operator()(array<Index, NumIndices>{{firstIndex, otherIndices...}});
+    }
+#else
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const
+    {
+      if (Options&RowMajor) {
+        const Index index = i1 + i0 * m_storage.dimensions()[1];
+        return m_storage.data()[index];
+      } else {
+        const Index index = i0 + i1 * m_storage.dimensions()[0];
+        return m_storage.data()[index];
+      }
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const
+    {
+      if (Options&RowMajor) {
+         const Index index = i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0);
+         return m_storage.data()[index];
+      } else {
+         const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * i2);
+        return m_storage.data()[index];
+      }
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const
+    {
+      if (Options&RowMajor) {
+        const Index index = i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0));
+        return m_storage.data()[index];
+      } else {
+        const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * i3));
+        return m_storage.data()[index];
+      }
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const
+    {
+      if (Options&RowMajor) {
+        const Index index = i4 + m_storage.dimensions()[4] * (i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0)));
+        return m_storage.data()[index];
+      } else {
+        const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * (i3 + m_storage.dimensions()[3] * i4)));
+        return m_storage.data()[index];
+      }
+    }
+#endif
+
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar& operator()(const array<Index, NumIndices>& indices) const
+    {
+      eigen_assert(checkIndexRange(indices));
+      return coeff(indices);
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const
+    {
+      eigen_internal_assert(index >= 0 && index < size());
+      return coeff(index);
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar& operator()() const
+    {
+      EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+      return coeff();
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar& operator[](Index index) const
+    {
+      // The bracket operator is only for vectors, use the parenthesis operator instead.
+      EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE);
+      return coeff(index);
+    }
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+    template<typename... IndexTypes>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, IndexTypes... otherIndices)
+    {
+      // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
+      EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+      return operator()(array<Index, NumIndices>{{firstIndex, otherIndices...}});
+    }
+#else
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1)
+    {
+       if (Options&RowMajor) {
+         const Index index = i1 + i0 * m_storage.dimensions()[1];
+        return m_storage.data()[index];
+      } else {
+        const Index index = i0 + i1 * m_storage.dimensions()[0];
+        return m_storage.data()[index];
+      }
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2)
+    {
+       if (Options&RowMajor) {
+         const Index index = i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0);
+        return m_storage.data()[index];
+      } else {
+         const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * i2);
+        return m_storage.data()[index];
+      }
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3)
+    {
+      if (Options&RowMajor) {
+        const Index index = i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0));
+        return m_storage.data()[index];
+      } else {
+        const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * i3));
+        return m_storage.data()[index];
+      }
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4)
+    {
+      if (Options&RowMajor) {
+        const Index index = i4 + m_storage.dimensions()[4] * (i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0)));
+        return m_storage.data()[index];
+      } else {
+        const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * (i3 + m_storage.dimensions()[3] * i4)));
+        return m_storage.data()[index];
+      }
+    }
+#endif
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar& operator()(const array<Index, NumIndices>& indices)
+    {
+      eigen_assert(checkIndexRange(indices));
+      return coeffRef(indices);
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar& operator()(Index index)
+    {
+      eigen_assert(index >= 0 && index < size());
+      return coeffRef(index);
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar& operator()()
+    {
+      EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+      return coeffRef();
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar& operator[](Index index)
+    {
+      // The bracket operator is only for vectors, use the parenthesis operator instead
+      EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+      return coeffRef(index);
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE TensorFixedSize()
+      : m_storage()
+    {
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE TensorFixedSize(const Self& other)
+      : m_storage(other.m_storage)
+    {
+    }
+
+#if EIGEN_HAS_RVALUE_REFERENCES
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFixedSize(Self&& other)
+      : m_storage(other.m_storage)
+    {
+    }
+#endif
+
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE TensorFixedSize(const TensorBase<OtherDerived, ReadOnlyAccessors>& other)
+    {
+      typedef TensorAssignOp<TensorFixedSize, const OtherDerived> Assign;
+      Assign assign(*this, other.derived());
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+    }
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE TensorFixedSize(const TensorBase<OtherDerived, WriteAccessors>& other)
+    {
+      typedef TensorAssignOp<TensorFixedSize, const OtherDerived> Assign;
+      Assign assign(*this, other.derived());
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE TensorFixedSize& operator=(const TensorFixedSize& other)
+    {
+      // FIXME: check that the dimensions of other match the dimensions of *this.
+      // Unfortunately this isn't possible yet when the rhs is an expression.
+      typedef TensorAssignOp<Self, const TensorFixedSize> Assign;
+      Assign assign(*this, other);
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+      return *this;
+    }
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE TensorFixedSize& operator=(const OtherDerived& other)
+    {
+      // FIXME: check that the dimensions of other match the dimensions of *this.
+      // Unfortunately this isn't possible yet when the rhs is an expression.
+      typedef TensorAssignOp<Self, const OtherDerived> Assign;
+      Assign assign(*this, other);
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+      return *this;
+    }
+
+  protected:
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE bool checkIndexRange(const array<Index, NumIndices>& /*indices*/) const
+    {
+      using internal::array_apply_and_reduce;
+      using internal::array_zip_and_reduce;
+      using internal::greater_equal_zero_op;
+      using internal::logical_and_op;
+      using internal::lesser_op;
+
+      return true;
+        // check whether the indices are all >= 0
+          /*       array_apply_and_reduce<logical_and_op, greater_equal_zero_op>(indices) &&
+        // check whether the indices fit in the dimensions
+        array_zip_and_reduce<logical_and_op, lesser_op>(indices, m_storage.dimensions());*/
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Index linearizedIndex(const array<Index, NumIndices>& indices) const
+    {
+      if (Options&RowMajor) {
+        return m_storage.dimensions().IndexOfRowMajor(indices);
+      } else {
+        return m_storage.dimensions().IndexOfColMajor(indices);
+      }
+    }
+};
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
new file mode 100644
index 000000000..bbd5eb374
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
@@ -0,0 +1,167 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H
+#define EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H
+
+namespace Eigen {
+
+/** \class TensorForcedEval
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief Tensor reshaping class.
+  *
+  *
+  */
+/// template <class> class MakePointer_ is added to convert the host pointer to the device pointer.
+/// It is added due to the fact that for our device compiler T* is not allowed.
+/// If we wanted to use the same Evaluator functions we have to convert that type to our pointer T.
+/// This is done through our MakePointer_ class. By default the Type in the MakePointer_<T> is T* .
+/// Therefore, by adding the default value, we managed to convert the type and it does not break any
+/// existing code as its default value is T*.
+namespace internal {
+template<typename XprType, template <class> class MakePointer_>
+struct traits<TensorForcedEvalOp<XprType, MakePointer_> >
+{
+  // Type promotion to handle the case where the types of the lhs and the rhs are different.
+  typedef typename XprType::Scalar Scalar;
+  typedef traits<XprType> XprTraits;
+  typedef typename traits<XprType>::StorageKind StorageKind;
+  typedef typename traits<XprType>::Index Index;
+  typedef typename XprType::Nested Nested;
+  typedef typename remove_reference<Nested>::type _Nested;
+  static const int NumDimensions = XprTraits::NumDimensions;
+  static const int Layout = XprTraits::Layout;
+
+  enum {
+    Flags = 0
+  };
+  template <class T> struct MakePointer {
+    // Intermediate typedef to workaround MSVC issue.
+    typedef MakePointer_<T> MakePointerT;
+    typedef typename MakePointerT::Type Type;
+  };
+};
+
+template<typename XprType, template <class> class MakePointer_>
+struct eval<TensorForcedEvalOp<XprType, MakePointer_>, Eigen::Dense>
+{
+  typedef const TensorForcedEvalOp<XprType, MakePointer_>& type;
+};
+
+template<typename XprType, template <class> class MakePointer_>
+struct nested<TensorForcedEvalOp<XprType, MakePointer_>, 1, typename eval<TensorForcedEvalOp<XprType, MakePointer_> >::type>
+{
+  typedef TensorForcedEvalOp<XprType, MakePointer_> type;
+};
+
+}  // end namespace internal
+
+
+
+template<typename XprType, template <class> class MakePointer_>
+class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType, MakePointer_>, ReadOnlyAccessors>
+{
+  public:
+  typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorForcedEvalOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorForcedEvalOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorForcedEvalOp(const XprType& expr)
+      : m_xpr(expr) {}
+
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename XprType::Nested>::type&
+    expression() const { return m_xpr; }
+
+  protected:
+    typename XprType::Nested m_xpr;
+};
+
+
+template<typename ArgType, typename Device, template <class> class MakePointer_>
+struct TensorEvaluator<const TensorForcedEvalOp<ArgType, MakePointer_>, Device>
+{
+  typedef TensorForcedEvalOp<ArgType, MakePointer_> XprType;
+  typedef typename ArgType::Scalar Scalar;
+  typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
+  typedef typename XprType::Index Index;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+
+  enum {
+    IsAligned = true,
+    PacketAccess = (PacketSize > 1),
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    RawAccess = true
+  };
+
+  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
+	/// op_ is used for sycl
+      : m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL)
+  { }
+
+  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
+    const Index numValues =  internal::array_prod(m_impl.dimensions());
+    m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType));
+    // Should initialize the memory in case we're dealing with non POD types.
+    if (NumTraits<CoeffReturnType>::RequireInitialization) {
+      for (Index i = 0; i < numValues; ++i) {
+        new(m_buffer+i) CoeffReturnType();
+      }
+    }
+    typedef TensorEvalToOp< const typename internal::remove_const<ArgType>::type > EvalTo;
+    EvalTo evalToTmp(m_buffer, m_op);
+    const bool PacketAccess = internal::IsVectorizable<Device, const ArgType>::value;
+    internal::TensorExecutor<const EvalTo, typename internal::remove_const<Device>::type, PacketAccess>::run(evalToTmp, m_device);
+    return true;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+    m_device.deallocate(m_buffer);
+    m_buffer = NULL;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+  {
+    return m_buffer[index];
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+  {
+    return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
+  }
+
+  EIGEN_DEVICE_FUNC typename MakePointer<Scalar>::Type data() const { return m_buffer; }
+
+  /// required by sycl in order to extract the sycl accessor
+  const TensorEvaluator<ArgType, Device>& impl() { return m_impl; }
+  /// used by sycl in order to build the sycl buffer
+  const Device& device() const{return m_device;}
+ private:
+  TensorEvaluator<ArgType, Device> m_impl;
+  const ArgType m_op;
+  const Device& m_device;
+  typename MakePointer<CoeffReturnType>::Type m_buffer;
+};
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
new file mode 100644
index 000000000..52b803d7f
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
@@ -0,0 +1,109 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H
+#define EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H
+
+namespace Eigen {
+
+// MakePointer class is used as a container of the adress space of the pointer
+// on the host and on the device. From the host side it generates the T* pointer
+// and when EIGEN_USE_SYCL is used it construct a buffer with a map_allocator to
+// T* m_data on the host. It is always called on the device.
+// Specialisation of MakePointer class for creating the sycl buffer with
+// map_allocator.
+template<typename T> struct MakePointer {
+  typedef T* Type;
+};
+
+template<typename PlainObjectType, int Options_ = Unaligned, template <class> class MakePointer_ = MakePointer> class TensorMap;
+template<typename Scalar_, int NumIndices_, int Options_ = 0, typename IndexType = DenseIndex> class Tensor;
+template<typename Scalar_, typename Dimensions, int Options_ = 0, typename IndexType = DenseIndex> class TensorFixedSize;
+template<typename PlainObjectType> class TensorRef;
+template<typename Derived, int AccessLevel> class TensorBase;
+
+template<typename NullaryOp, typename PlainObjectType> class TensorCwiseNullaryOp;
+template<typename UnaryOp, typename XprType> class TensorCwiseUnaryOp;
+template<typename BinaryOp, typename LeftXprType, typename RightXprType> class TensorCwiseBinaryOp;
+template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType> class TensorCwiseTernaryOp;
+template<typename IfXprType, typename ThenXprType, typename ElseXprType> class TensorSelectOp;
+template<typename Op, typename Dims, typename XprType, template <class> class MakePointer_ = MakePointer > class TensorReductionOp;
+template<typename XprType> class TensorIndexTupleOp;
+template<typename ReduceOp, typename Dims, typename XprType> class TensorTupleReducerOp;
+template<typename Axis, typename LeftXprType, typename RightXprType> class TensorConcatenationOp;
+template<typename Dimensions, typename LeftXprType, typename RightXprType> class TensorContractionOp;
+template<typename TargetType, typename XprType> class TensorConversionOp;
+template<typename Dimensions, typename InputXprType, typename KernelXprType> class TensorConvolutionOp;
+template<typename FFT, typename XprType, int FFTDataType, int FFTDirection> class TensorFFTOp;
+template<typename PatchDim, typename XprType> class TensorPatchOp;
+template<DenseIndex Rows, DenseIndex Cols, typename XprType> class TensorImagePatchOp;
+template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType> class TensorVolumePatchOp;
+template<typename Broadcast, typename XprType> class TensorBroadcastingOp;
+template<DenseIndex DimId, typename XprType> class TensorChippingOp;
+template<typename NewDimensions, typename XprType> class TensorReshapingOp;
+template<typename XprType> class TensorLayoutSwapOp;
+template<typename StartIndices, typename Sizes, typename XprType> class TensorSlicingOp;
+template<typename ReverseDimensions, typename XprType> class TensorReverseOp;
+template<typename PaddingDimensions, typename XprType> class TensorPaddingOp;
+template<typename Shuffle, typename XprType> class TensorShufflingOp;
+template<typename Strides, typename XprType> class TensorStridingOp;
+template<typename StartIndices, typename StopIndices, typename Strides, typename XprType> class TensorStridingSlicingOp;
+template<typename Strides, typename XprType> class TensorInflationOp;
+template<typename Generator, typename XprType> class TensorGeneratorOp;
+template<typename LeftXprType, typename RightXprType> class TensorAssignOp;
+template<typename Op, typename XprType> class TensorScanOp;
+
+template<typename CustomUnaryFunc, typename XprType> class TensorCustomUnaryOp;
+template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType> class TensorCustomBinaryOp;
+
+template<typename XprType, template <class> class MakePointer_ = MakePointer> class TensorEvalToOp;
+template<typename XprType, template <class> class MakePointer_ = MakePointer> class TensorForcedEvalOp;
+
+template<typename ExpressionType, typename DeviceType> class TensorDevice;
+template<typename Derived, typename Device> struct TensorEvaluator;
+
+struct DefaultDevice;
+struct ThreadPoolDevice;
+struct GpuDevice;
+struct SyclDevice;
+
+enum FFTResultType {
+  RealPart = 0,
+  ImagPart = 1,
+  BothParts = 2
+};
+
+enum FFTDirection {
+    FFT_FORWARD = 0,
+    FFT_REVERSE = 1
+};
+
+
+namespace internal {
+
+template <typename Device, typename Expression>
+struct IsVectorizable {
+  static const bool value = TensorEvaluator<Expression, Device>::PacketAccess;
+};
+
+template <typename Expression>
+struct IsVectorizable<GpuDevice, Expression> {
+  static const bool value = TensorEvaluator<Expression, GpuDevice>::PacketAccess &&
+                            TensorEvaluator<Expression, GpuDevice>::IsAligned;
+};
+
+template <typename Expression, typename Device,
+          bool Vectorizable = IsVectorizable<Device, Expression>::value>
+class TensorExecutor;
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
new file mode 100644
index 000000000..d73f6dc68
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
@@ -0,0 +1,489 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H
+#define EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H
+
+namespace Eigen {
+namespace internal {
+
+
+/** \internal
+ * \brief Template functor to compute the modulo between an array and a scalar.
+ */
+template <typename Scalar>
+struct scalar_mod_op {
+  EIGEN_DEVICE_FUNC scalar_mod_op(const Scalar& divisor) : m_divisor(divisor) {}
+  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a % m_divisor; }
+  const Scalar m_divisor;
+};
+template <typename Scalar>
+struct functor_traits<scalar_mod_op<Scalar> >
+{ enum { Cost = scalar_div_cost<Scalar,false>::value, PacketAccess = false }; };
+
+
+/** \internal
+ * \brief Template functor to compute the modulo between 2 arrays.
+ */
+template <typename Scalar>
+struct scalar_mod2_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op);
+  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a, const Scalar& b) const { return a % b; }
+};
+template <typename Scalar>
+struct functor_traits<scalar_mod2_op<Scalar> >
+{ enum { Cost = scalar_div_cost<Scalar,false>::value, PacketAccess = false }; };
+
+template <typename Scalar>
+struct scalar_fmod_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_fmod_op);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar
+  operator()(const Scalar& a, const Scalar& b) const {
+    return numext::fmod(a, b);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_fmod_op<Scalar> > {
+  enum { Cost = 13,  // Reciprocal throughput of FPREM on Haswell.
+         PacketAccess = false };
+};
+
+
+/** \internal
+  * \brief Template functor to compute the sigmoid of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::sigmoid()
+  */
+template <typename T>
+struct scalar_sigmoid_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sigmoid_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const {
+    const T one = T(1);
+    return one / (one + numext::exp(-x));
+  }
+
+  template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Packet packetOp(const Packet& x) const {
+    const Packet one = pset1<Packet>(T(1));
+    return pdiv(one, padd(one, pexp(pnegate(x))));
+  }
+};
+
+template <typename T>
+struct functor_traits<scalar_sigmoid_op<T> > {
+  enum {
+    Cost = NumTraits<T>::AddCost * 2 + NumTraits<T>::MulCost * 6,
+    PacketAccess = packet_traits<T>::HasAdd && packet_traits<T>::HasDiv &&
+                   packet_traits<T>::HasNegate && packet_traits<T>::HasExp
+  };
+};
+
+
+template<typename Reducer, typename Device>
+struct reducer_traits {
+  enum {
+    Cost = 1,
+    PacketAccess = false
+  };
+};
+
+// Standard reduction functors
+template <typename T> struct SumReducer
+{
+  static const bool PacketAccess = packet_traits<T>::HasAdd;
+  static const bool IsStateful = false;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
+    internal::scalar_sum_op<T> sum_op;
+    *accum = sum_op(*accum, t);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
+    (*accum) = padd<Packet>(*accum, p);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
+    internal::scalar_cast_op<int, T> conv;
+    return conv(0);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
+    return pset1<Packet>(initialize());
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const {
+    return accum;
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const {
+    return vaccum;
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
+    internal::scalar_sum_op<T> sum_op;
+    return sum_op(saccum, predux(vaccum));
+  }
+};
+
+template <typename T, typename Device>
+struct reducer_traits<SumReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::AddCost,
+    PacketAccess = PacketType<T, Device>::HasAdd
+  };
+};
+
+
+template <typename T> struct MeanReducer
+{
+  static const bool PacketAccess = packet_traits<T>::HasAdd && !NumTraits<T>::IsInteger;
+  static const bool IsStateful = true;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  MeanReducer() : scalarCount_(0), packetCount_(0) { }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) {
+    internal::scalar_sum_op<T> sum_op;
+    *accum = sum_op(*accum, t);
+    scalarCount_++;
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) {
+    (*accum) = padd<Packet>(*accum, p);
+    packetCount_++;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
+    internal::scalar_cast_op<int, T> conv;
+    return conv(0);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
+    return pset1<Packet>(initialize());
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const {
+    return accum / scalarCount_;
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const {
+    return pdiv(vaccum, pset1<Packet>(packetCount_));
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
+    internal::scalar_sum_op<T> sum_op;
+    return sum_op(saccum, predux(vaccum)) / (scalarCount_ + packetCount_ * unpacket_traits<Packet>::size);
+  }
+
+  protected:
+    DenseIndex scalarCount_;
+    DenseIndex packetCount_;
+};
+
+template <typename T, typename Device>
+struct reducer_traits<MeanReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::AddCost,
+    PacketAccess = PacketType<T, Device>::HasAdd
+  };
+};
+
+
+template <typename T, bool IsMax = true, bool IsInteger = true>
+struct MinMaxBottomValue {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
+    return Eigen::NumTraits<T>::lowest();
+  }
+};
+template <typename T>
+struct MinMaxBottomValue<T, true, false> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
+    return -Eigen::NumTraits<T>::infinity();
+  }
+};
+template <typename T>
+struct MinMaxBottomValue<T, false, true> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
+    return Eigen::NumTraits<T>::highest();
+  }
+};
+template <typename T>
+struct MinMaxBottomValue<T, false, false> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
+    return Eigen::NumTraits<T>::infinity();
+  }
+};
+
+
+template <typename T> struct MaxReducer
+{
+  static const bool PacketAccess = packet_traits<T>::HasMax;
+  static const bool IsStateful = false;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
+    if (t > *accum) { *accum = t; }
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
+    (*accum) = pmax<Packet>(*accum, p);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
+    return MinMaxBottomValue<T, true, Eigen::NumTraits<T>::IsInteger>::bottom_value();
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
+    return pset1<Packet>(initialize());
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const {
+    return accum;
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const {
+    return vaccum;
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
+    return numext::maxi(saccum, predux_max(vaccum));
+  }
+};
+
+template <typename T, typename Device>
+struct reducer_traits<MaxReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::AddCost,
+    PacketAccess = PacketType<T, Device>::HasMax
+  };
+};
+
+
+template <typename T> struct MinReducer
+{
+  static const bool PacketAccess = packet_traits<T>::HasMin;
+  static const bool IsStateful = false;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
+    if (t < *accum) { *accum = t; }
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
+    (*accum) = pmin<Packet>(*accum, p);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
+    return MinMaxBottomValue<T, false, Eigen::NumTraits<T>::IsInteger>::bottom_value();
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
+    return pset1<Packet>(initialize());
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const {
+    return accum;
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const {
+    return vaccum;
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
+    return numext::mini(saccum, predux_min(vaccum));
+  }
+};
+
+template <typename T, typename Device>
+struct reducer_traits<MinReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::AddCost,
+    PacketAccess = PacketType<T, Device>::HasMin
+  };
+};
+
+
+template <typename T> struct ProdReducer
+{
+  static const bool PacketAccess = packet_traits<T>::HasMul;
+  static const bool IsStateful = false;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
+    internal::scalar_product_op<T> prod_op;
+    (*accum) = prod_op(*accum, t);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
+    (*accum) = pmul<Packet>(*accum, p);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
+    internal::scalar_cast_op<int, T> conv;
+    return conv(1);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
+    return pset1<Packet>(initialize());
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const {
+    return accum;
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const {
+    return vaccum;
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
+    internal::scalar_product_op<T> prod_op;
+    return prod_op(saccum, predux_mul(vaccum));
+  }
+};
+
+template <typename T, typename Device>
+struct reducer_traits<ProdReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::MulCost,
+    PacketAccess = PacketType<T, Device>::HasMul
+  };
+};
+
+
+struct AndReducer
+{
+  static const bool PacketAccess = false;
+  static const bool IsStateful = false;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const {
+    *accum = *accum && t;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool initialize() const {
+    return true;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool finalize(bool accum) const {
+    return accum;
+  }
+};
+
+template <typename Device>
+struct reducer_traits<AndReducer, Device> {
+  enum {
+    Cost = 1,
+    PacketAccess = false
+  };
+};
+
+
+struct OrReducer {
+  static const bool PacketAccess = false;
+  static const bool IsStateful = false;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const {
+    *accum = *accum || t;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool initialize() const {
+    return false;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool finalize(bool accum) const {
+    return accum;
+  }
+};
+
+template <typename Device>
+struct reducer_traits<OrReducer, Device> {
+  enum {
+    Cost = 1,
+    PacketAccess = false
+  };
+};
+
+
+// Argmin/Argmax reducers
+template <typename T> struct ArgMaxTupleReducer
+{
+  static const bool PacketAccess = false;
+  static const bool IsStateful = false;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
+    if (t.second > accum->second) { *accum = t; }
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
+    return T(0, NumTraits<typename T::second_type>::lowest());
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T& accum) const {
+    return accum;
+  }
+};
+
+template <typename T, typename Device>
+struct reducer_traits<ArgMaxTupleReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::AddCost,
+    PacketAccess = false
+  };
+};
+
+
+template <typename T> struct ArgMinTupleReducer
+{
+  static const bool PacketAccess = false;
+  static const bool IsStateful = false;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T& t, T* accum) const {
+    if (t.second < accum->second) { *accum = t; }
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
+    return T(0, NumTraits<typename T::second_type>::highest());
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T& accum) const {
+    return accum;
+  }
+};
+
+template <typename T, typename Device>
+struct reducer_traits<ArgMinTupleReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::AddCost,
+    PacketAccess = false
+  };
+};
+
+
+template <typename T, typename Index, size_t NumDims>
+class GaussianGenerator {
+ public:
+  static const bool PacketAccess = false;
+
+  EIGEN_DEVICE_FUNC GaussianGenerator(const array<T, NumDims>& means,
+                                      const array<T, NumDims>& std_devs)
+      : m_means(means)
+  {
+    for (size_t i = 0; i < NumDims; ++i) {
+      m_two_sigmas[i] = std_devs[i] * std_devs[i] * 2;
+    }
+  }
+
+  EIGEN_DEVICE_FUNC T operator()(const array<Index, NumDims>& coordinates) const {
+    T tmp = T(0);
+    for (size_t i = 0; i < NumDims; ++i) {
+      T offset = coordinates[i] - m_means[i];
+      tmp += offset * offset / m_two_sigmas[i];
+    }
+    return numext::exp(-tmp);
+  }
+
+ private:
+  array<T, NumDims> m_means;
+  array<T, NumDims> m_two_sigmas;
+};
+
+template <typename T, typename Index, size_t NumDims>
+struct functor_traits<GaussianGenerator<T, Index, NumDims> > {
+  enum {
+    Cost = NumDims * (2 * NumTraits<T>::AddCost + NumTraits<T>::MulCost +
+                      functor_traits<scalar_quotient_op<T, T> >::Cost) +
+           functor_traits<scalar_exp_op<T> >::Cost,
+    PacketAccess = GaussianGenerator<T, Index, NumDims>::PacketAccess
+  };
+};
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
new file mode 100644
index 000000000..eb1d4934e
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
@@ -0,0 +1,185 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H
+#define EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H
+
+namespace Eigen {
+
+/** \class TensorGenerator
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief Tensor generator class.
+  *
+  *
+  */
+namespace internal {
+template<typename Generator, typename XprType>
+struct traits<TensorGeneratorOp<Generator, XprType> > : public traits<XprType>
+{
+  typedef typename XprType::Scalar Scalar;
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef typename XprType::Nested Nested;
+  typedef typename remove_reference<Nested>::type _Nested;
+  static const int NumDimensions = XprTraits::NumDimensions;
+  static const int Layout = XprTraits::Layout;
+};
+
+template<typename Generator, typename XprType>
+struct eval<TensorGeneratorOp<Generator, XprType>, Eigen::Dense>
+{
+  typedef const TensorGeneratorOp<Generator, XprType>& type;
+};
+
+template<typename Generator, typename XprType>
+struct nested<TensorGeneratorOp<Generator, XprType>, 1, typename eval<TensorGeneratorOp<Generator, XprType> >::type>
+{
+  typedef TensorGeneratorOp<Generator, XprType> type;
+};
+
+}  // end namespace internal
+
+
+
+template<typename Generator, typename XprType>
+class TensorGeneratorOp : public TensorBase<TensorGeneratorOp<Generator, XprType>, ReadOnlyAccessors>
+{
+  public:
+  typedef typename Eigen::internal::traits<TensorGeneratorOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorGeneratorOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorGeneratorOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorGeneratorOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorGeneratorOp(const XprType& expr, const Generator& generator)
+      : m_xpr(expr), m_generator(generator) {}
+
+    EIGEN_DEVICE_FUNC
+    const Generator& generator() const { return m_generator; }
+
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename XprType::Nested>::type&
+    expression() const { return m_xpr; }
+
+  protected:
+    typename XprType::Nested m_xpr;
+    const Generator m_generator;
+};
+
+
+// Eval as rvalue
+template<typename Generator, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
+{
+  typedef TensorGeneratorOp<Generator, ArgType> XprType;
+  typedef typename XprType::Index Index;
+  typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
+  static const int NumDims = internal::array_size<Dimensions>::value;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  enum {
+    IsAligned = false,
+    PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
+    BlockAccess = false,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_generator(op.generator())
+  {
+    TensorEvaluator<ArgType, Device> impl(op.expression(), device);
+    m_dimensions = impl.dimensions();
+
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      m_strides[0] = 1;
+      for (int i = 1; i < NumDims; ++i) {
+        m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1];
+      }
+    } else {
+      m_strides[NumDims - 1] = 1;
+      for (int i = NumDims - 2; i >= 0; --i) {
+        m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1];
+      }
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+    return true;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+  {
+    array<Index, NumDims> coords;
+    extract_coordinates(index, coords);
+    return m_generator(coords);
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+  {
+    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+    EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+
+    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
+    for (int i = 0; i < packetSize; ++i) {
+      values[i] = coeff(index+i);
+    }
+    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+    return rslt;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+  costPerCoeff(bool) const {
+    // TODO(rmlarsen): This is just a placeholder. Define interface to make
+    // generators return their cost.
+    return TensorOpCost(0, 0, TensorOpCost::AddCost<Scalar>() +
+                                  TensorOpCost::MulCost<Scalar>());
+  }
+
+  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+
+ protected:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void extract_coordinates(Index index, array<Index, NumDims>& coords) const {
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = NumDims - 1; i > 0; --i) {
+        const Index idx = index / m_strides[i];
+        index -= idx * m_strides[i];
+        coords[i] = idx;
+      }
+      coords[0] = index;
+    } else {
+      for (int i = 0; i < NumDims - 1; ++i) {
+        const Index idx = index / m_strides[i];
+        index -= idx * m_strides[i];
+        coords[i] = idx;
+      }
+      coords[NumDims-1] = index;
+    }
+  }
+
+  Dimensions m_dimensions;
+  array<Index, NumDims> m_strides;
+  Generator m_generator;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h
new file mode 100644
index 000000000..665b861cf
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h
@@ -0,0 +1,33 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Eugene Brevdo <ebrevdo@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H
+#define EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H
+
+namespace Eigen {
+
+/** \cpp11 \returns an expression of the coefficient-wise betainc(\a x, \a a, \a b) to the given tensors.
+ *
+ * This function computes the regularized incomplete beta function (integral).
+ *
+ */
+template <typename ADerived, typename BDerived, typename XDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const
+    TensorCwiseTernaryOp<internal::scalar_betainc_op<typename XDerived::Scalar>,
+                         const ADerived, const BDerived, const XDerived>
+    betainc(const ADerived& a, const BDerived& b, const XDerived& x) {
+  return TensorCwiseTernaryOp<
+      internal::scalar_betainc_op<typename XDerived::Scalar>, const ADerived,
+      const BDerived, const XDerived>(
+      a, b, x, internal::scalar_betainc_op<typename XDerived::Scalar>());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h
new file mode 100644
index 000000000..a901c5dd4
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h
@@ -0,0 +1,79 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_IO_H
+#define EIGEN_CXX11_TENSOR_TENSOR_IO_H
+
+namespace Eigen {
+
+namespace internal {
+
+// Print the tensor as a 2d matrix
+template <typename Tensor, int Rank>
+struct TensorPrinter {
+  static void run (std::ostream& os, const Tensor& tensor) {
+    typedef typename internal::remove_const<typename Tensor::Scalar>::type Scalar;
+    typedef typename Tensor::Index Index;
+    const Index total_size = internal::array_prod(tensor.dimensions());
+    if (total_size > 0) {
+      const Index first_dim = Eigen::internal::array_get<0>(tensor.dimensions());
+      static const int layout = Tensor::Layout;
+      Map<const Array<Scalar, Dynamic, Dynamic, layout> > matrix(const_cast<Scalar*>(tensor.data()), first_dim, total_size/first_dim);
+      os << matrix;
+    }
+  }
+};
+
+
+// Print the tensor as a vector
+template <typename Tensor>
+struct TensorPrinter<Tensor, 1> {
+  static void run (std::ostream& os, const Tensor& tensor) {
+    typedef typename internal::remove_const<typename Tensor::Scalar>::type Scalar;
+    typedef typename Tensor::Index Index;
+    const Index total_size = internal::array_prod(tensor.dimensions());
+    if (total_size > 0) {
+      Map<const Array<Scalar, Dynamic, 1> > array(const_cast<Scalar*>(tensor.data()), total_size);
+      os << array;
+    }
+  }
+};
+
+
+// Print the tensor as a scalar
+template <typename Tensor>
+struct TensorPrinter<Tensor, 0> {
+  static void run (std::ostream& os, const Tensor& tensor) {
+    os << tensor.coeff(0);
+  }
+};
+}
+
+template <typename T>
+std::ostream& operator << (std::ostream& os, const TensorBase<T, ReadOnlyAccessors>& expr) {
+  typedef TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice> Evaluator;
+  typedef typename Evaluator::Dimensions Dimensions;
+
+  // Evaluate the expression if needed
+  TensorForcedEvalOp<const T> eval = expr.eval();
+  Evaluator tensor(eval, DefaultDevice());
+  tensor.evalSubExprsIfNeeded(NULL);
+
+  // Print the result
+  static const int rank = internal::array_size<Dimensions>::value;
+  internal::TensorPrinter<Evaluator, rank>::run(os, tensor);
+
+  // Cleanup.
+  tensor.cleanup();
+  return os;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_IO_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
new file mode 100644
index 000000000..566856ed2
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
@@ -0,0 +1,509 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H
+#define EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H
+
+namespace Eigen {
+
+/** \class TensorImagePatch
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief Patch extraction specialized for image processing.
+  * This assumes that the input has a least 3 dimensions ordered as follow:
+  *  1st dimension: channels (of size d)
+  *  2nd dimension: rows (of size r)
+  *  3rd dimension: columns (of size c)
+  *  There can be additional dimensions such as time (for video) or batch (for
+  * bulk processing after the first 3.
+  * Calling the image patch code with patch_rows and patch_cols is equivalent
+  * to calling the regular patch extraction code with parameters d, patch_rows,
+  * patch_cols, and 1 for all the additional dimensions.
+  */
+namespace internal {
+template<DenseIndex Rows, DenseIndex Cols, typename XprType>
+struct traits<TensorImagePatchOp<Rows, Cols, XprType> > : public traits<XprType>
+{
+  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef typename XprType::Nested Nested;
+  typedef typename remove_reference<Nested>::type _Nested;
+  static const int NumDimensions = XprTraits::NumDimensions + 1;
+  static const int Layout = XprTraits::Layout;
+};
+
+template<DenseIndex Rows, DenseIndex Cols, typename XprType>
+struct eval<TensorImagePatchOp<Rows, Cols, XprType>, Eigen::Dense>
+{
+  typedef const TensorImagePatchOp<Rows, Cols, XprType>& type;
+};
+
+template<DenseIndex Rows, DenseIndex Cols, typename XprType>
+struct nested<TensorImagePatchOp<Rows, Cols, XprType>, 1, typename eval<TensorImagePatchOp<Rows, Cols, XprType> >::type>
+{
+  typedef TensorImagePatchOp<Rows, Cols, XprType> type;
+};
+
+}  // end namespace internal
+
+template<DenseIndex Rows, DenseIndex Cols, typename XprType>
+class TensorImagePatchOp : public TensorBase<TensorImagePatchOp<Rows, Cols, XprType>, ReadOnlyAccessors>
+{
+  public:
+  typedef typename Eigen::internal::traits<TensorImagePatchOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorImagePatchOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorImagePatchOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorImagePatchOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, DenseIndex patch_cols,
+                                                           DenseIndex row_strides, DenseIndex col_strides,
+                                                           DenseIndex in_row_strides, DenseIndex in_col_strides,
+                                                           DenseIndex row_inflate_strides, DenseIndex col_inflate_strides,
+                                                           PaddingType padding_type, Scalar padding_value)
+      : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
+        m_row_strides(row_strides), m_col_strides(col_strides),
+        m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
+        m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
+        m_padding_explicit(false), m_padding_top(0), m_padding_bottom(0), m_padding_left(0), m_padding_right(0),
+        m_padding_type(padding_type), m_padding_value(padding_value) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, DenseIndex patch_cols,
+                                                           DenseIndex row_strides, DenseIndex col_strides,
+                                                           DenseIndex in_row_strides, DenseIndex in_col_strides,
+                                                           DenseIndex row_inflate_strides, DenseIndex col_inflate_strides,
+                                                           DenseIndex padding_top, DenseIndex padding_bottom,
+                                                           DenseIndex padding_left, DenseIndex padding_right,
+                                                           Scalar padding_value)
+      : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
+        m_row_strides(row_strides), m_col_strides(col_strides),
+        m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
+        m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
+        m_padding_explicit(true), m_padding_top(padding_top), m_padding_bottom(padding_bottom),
+        m_padding_left(padding_left), m_padding_right(padding_right),
+        m_padding_type(PADDING_VALID), m_padding_value(padding_value) {}
+
+    EIGEN_DEVICE_FUNC
+    DenseIndex patch_rows() const { return m_patch_rows; }
+    EIGEN_DEVICE_FUNC
+    DenseIndex patch_cols() const { return m_patch_cols; }
+    EIGEN_DEVICE_FUNC
+    DenseIndex row_strides() const { return m_row_strides; }
+    EIGEN_DEVICE_FUNC
+    DenseIndex col_strides() const { return m_col_strides; }
+    EIGEN_DEVICE_FUNC
+    DenseIndex in_row_strides() const { return m_in_row_strides; }
+    EIGEN_DEVICE_FUNC
+    DenseIndex in_col_strides() const { return m_in_col_strides; }
+    EIGEN_DEVICE_FUNC
+    DenseIndex row_inflate_strides() const { return m_row_inflate_strides; }
+    EIGEN_DEVICE_FUNC
+    DenseIndex col_inflate_strides() const { return m_col_inflate_strides; }
+    EIGEN_DEVICE_FUNC
+    bool padding_explicit() const { return m_padding_explicit; }
+    EIGEN_DEVICE_FUNC
+    DenseIndex padding_top() const { return m_padding_top; }
+    EIGEN_DEVICE_FUNC
+    DenseIndex padding_bottom() const { return m_padding_bottom; }
+    EIGEN_DEVICE_FUNC
+    DenseIndex padding_left() const { return m_padding_left; }
+    EIGEN_DEVICE_FUNC
+    DenseIndex padding_right() const { return m_padding_right; }
+    EIGEN_DEVICE_FUNC
+    PaddingType padding_type() const { return m_padding_type; }
+    EIGEN_DEVICE_FUNC
+    Scalar padding_value() const { return m_padding_value; }
+
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename XprType::Nested>::type&
+    expression() const { return m_xpr; }
+
+  protected:
+    typename XprType::Nested m_xpr;
+    const DenseIndex m_patch_rows;
+    const DenseIndex m_patch_cols;
+    const DenseIndex m_row_strides;
+    const DenseIndex m_col_strides;
+    const DenseIndex m_in_row_strides;
+    const DenseIndex m_in_col_strides;
+    const DenseIndex m_row_inflate_strides;
+    const DenseIndex m_col_inflate_strides;
+    const bool m_padding_explicit;
+    const DenseIndex m_padding_top;
+    const DenseIndex m_padding_bottom;
+    const DenseIndex m_padding_left;
+    const DenseIndex m_padding_right;
+    const PaddingType m_padding_type;
+    const Scalar m_padding_value;
+};
+
+// Eval as rvalue
+template<DenseIndex Rows, DenseIndex Cols, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
+{
+  typedef TensorImagePatchOp<Rows, Cols, ArgType> XprType;
+  typedef typename XprType::Index Index;
+  static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+  static const int NumDims = NumInputDims + 1;
+  typedef DSizes<Index, NumDims> Dimensions;
+  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
+  typedef TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>,
+                          Device> Self;
+  typedef TensorEvaluator<ArgType, Device> Impl;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+
+  enum {
+    IsAligned = false,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,
+    RawAccess = false
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device)
+  {
+    EIGEN_STATIC_ASSERT((NumDims >= 4), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+    m_paddingValue = op.padding_value();
+
+    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+
+    // Caches a few variables.
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      m_inputDepth = input_dims[0];
+      m_inputRows = input_dims[1];
+      m_inputCols = input_dims[2];
+    } else {
+      m_inputDepth = input_dims[NumInputDims-1];
+      m_inputRows = input_dims[NumInputDims-2];
+      m_inputCols = input_dims[NumInputDims-3];
+    }
+
+    m_row_strides = op.row_strides();
+    m_col_strides = op.col_strides();
+
+    // Input strides and effective input/patch size
+    m_in_row_strides = op.in_row_strides();
+    m_in_col_strides = op.in_col_strides();
+    m_row_inflate_strides = op.row_inflate_strides();
+    m_col_inflate_strides = op.col_inflate_strides();
+    // The "effective" input rows and input cols are the input rows and cols
+    // after inflating them with zeros.
+    // For examples, a 2x3 matrix with row_inflate_strides and
+    // col_inflate_strides of 2 comes from:
+    //   A B C
+    //   D E F
+    //
+    // to a matrix is 3 x 5:
+    //
+    //   A . B . C
+    //   . . . . .
+    //   D . E . F
+
+    m_input_rows_eff = (m_inputRows - 1) * m_row_inflate_strides + 1;
+    m_input_cols_eff = (m_inputCols - 1) * m_col_inflate_strides + 1;
+    m_patch_rows_eff = op.patch_rows() + (op.patch_rows() - 1) * (m_in_row_strides - 1);
+    m_patch_cols_eff = op.patch_cols() + (op.patch_cols() - 1) * (m_in_col_strides - 1);
+
+    if (op.padding_explicit()) {
+      m_outputRows = numext::ceil((m_input_rows_eff + op.padding_top() + op.padding_bottom() - m_patch_rows_eff + 1.f) / static_cast<float>(m_row_strides));
+      m_outputCols = numext::ceil((m_input_cols_eff + op.padding_left() + op.padding_right() - m_patch_cols_eff + 1.f) / static_cast<float>(m_col_strides));
+      m_rowPaddingTop = op.padding_top();
+      m_colPaddingLeft = op.padding_left();
+    } else {
+      // Computing padding from the type
+      switch (op.padding_type()) {
+        case PADDING_VALID:
+          m_outputRows = numext::ceil((m_input_rows_eff - m_patch_rows_eff + 1.f) / static_cast<float>(m_row_strides));
+          m_outputCols = numext::ceil((m_input_cols_eff - m_patch_cols_eff + 1.f) / static_cast<float>(m_col_strides));
+          // Calculate the padding
+          m_rowPaddingTop = numext::maxi<Index>(0, ((m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff) / 2);
+          m_colPaddingLeft = numext::maxi<Index>(0, ((m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff) / 2);
+          break;
+        case PADDING_SAME:
+          m_outputRows = numext::ceil(m_input_rows_eff / static_cast<float>(m_row_strides));
+          m_outputCols = numext::ceil(m_input_cols_eff / static_cast<float>(m_col_strides));
+          // Calculate the padding
+          m_rowPaddingTop = ((m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff) / 2;
+          m_colPaddingLeft = ((m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff) / 2;
+          break;
+        default:
+          eigen_assert(false && "unexpected padding");
+      }
+    }
+    eigen_assert(m_outputRows > 0);
+    eigen_assert(m_outputCols > 0);
+
+    // Dimensions for result of extraction.
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      // ColMajor
+      // 0: depth
+      // 1: patch_rows
+      // 2: patch_cols
+      // 3: number of patches
+      // 4 and beyond: anything else (such as batch).
+      m_dimensions[0] = input_dims[0];
+      m_dimensions[1] = op.patch_rows();
+      m_dimensions[2] = op.patch_cols();
+      m_dimensions[3] = m_outputRows * m_outputCols;
+      for (int i = 4; i < NumDims; ++i) {
+        m_dimensions[i] = input_dims[i-1];
+      }
+    } else {
+      // RowMajor
+      // NumDims-1: depth
+      // NumDims-2: patch_rows
+      // NumDims-3: patch_cols
+      // NumDims-4: number of patches
+      // NumDims-5 and beyond: anything else (such as batch).
+      m_dimensions[NumDims-1] = input_dims[NumInputDims-1];
+      m_dimensions[NumDims-2] = op.patch_rows();
+      m_dimensions[NumDims-3] = op.patch_cols();
+      m_dimensions[NumDims-4] = m_outputRows * m_outputCols;
+      for (int i = NumDims-5; i >= 0; --i) {
+        m_dimensions[i] = input_dims[i];
+      }
+    }
+
+    // Strides for moving the patch in various dimensions.
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      m_colStride = m_dimensions[1];
+      m_patchStride = m_colStride * m_dimensions[2] * m_dimensions[0];
+      m_otherStride = m_patchStride * m_dimensions[3];
+    } else {
+      m_colStride = m_dimensions[NumDims-2];
+      m_patchStride = m_colStride * m_dimensions[NumDims-3] * m_dimensions[NumDims-1];
+      m_otherStride = m_patchStride * m_dimensions[NumDims-4];
+    }
+
+    // Strides for navigating through the input tensor.
+    m_rowInputStride = m_inputDepth;
+    m_colInputStride = m_inputDepth * m_inputRows;
+    m_patchInputStride = m_inputDepth * m_inputRows * m_inputCols;
+
+    // Fast representations of different variables.
+    m_fastOtherStride = internal::TensorIntDivisor<Index>(m_otherStride);
+    m_fastPatchStride = internal::TensorIntDivisor<Index>(m_patchStride);
+    m_fastColStride = internal::TensorIntDivisor<Index>(m_colStride);
+    m_fastInflateRowStride = internal::TensorIntDivisor<Index>(m_row_inflate_strides);
+    m_fastInflateColStride = internal::TensorIntDivisor<Index>(m_col_inflate_strides);
+    m_fastInputColsEff = internal::TensorIntDivisor<Index>(m_input_cols_eff);
+
+    // Number of patches in the width dimension.
+    m_fastOutputRows = internal::TensorIntDivisor<Index>(m_outputRows);
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      m_fastOutputDepth = internal::TensorIntDivisor<Index>(m_dimensions[0]);
+    } else {
+      m_fastOutputDepth = internal::TensorIntDivisor<Index>(m_dimensions[NumDims-1]);
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+    m_impl.cleanup();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+  {
+    // Patch index corresponding to the passed in index.
+    const Index patchIndex = index / m_fastPatchStride;
+    // Find the offset of the element wrt the location of the first element.
+    const Index patchOffset = (index - patchIndex * m_patchStride) / m_fastOutputDepth;
+
+    // Other ways to index this element.
+    const Index otherIndex = (NumDims == 4) ? 0 : index / m_fastOtherStride;
+    const Index patch2DIndex = (NumDims == 4) ? patchIndex : (index - otherIndex * m_otherStride) / m_fastPatchStride;
+
+    // Calculate col index in the input original tensor.
+    const Index colIndex = patch2DIndex / m_fastOutputRows;
+    const Index colOffset = patchOffset / m_fastColStride;
+    const Index inputCol = colIndex * m_col_strides + colOffset * m_in_col_strides - m_colPaddingLeft;
+    const Index origInputCol = (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInflateColStride) : 0);
+    if (inputCol < 0 || inputCol >= m_input_cols_eff ||
+        ((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides))) {
+      return Scalar(m_paddingValue);
+    }
+
+    // Calculate row index in the original input tensor.
+    const Index rowIndex = patch2DIndex - colIndex * m_outputRows;
+    const Index rowOffset = patchOffset - colOffset * m_colStride;
+    const Index inputRow = rowIndex * m_row_strides + rowOffset * m_in_row_strides - m_rowPaddingTop;
+    const Index origInputRow = (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInflateRowStride) : 0);
+    if (inputRow < 0 || inputRow >= m_input_rows_eff ||
+        ((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) {
+      return Scalar(m_paddingValue);
+    }
+
+    const int depth_index = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - 1;
+    const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index];
+
+    const Index inputIndex = depth + origInputRow * m_rowInputStride + origInputCol * m_colInputStride + otherIndex * m_patchInputStride;
+    return m_impl.coeff(inputIndex);
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+  {
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
+
+    if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1) {
+      return packetWithPossibleZero(index);
+    }
+
+    const Index indices[2] = {index, index + PacketSize - 1};
+    const Index patchIndex = indices[0] / m_fastPatchStride;
+    if (patchIndex != indices[1] / m_fastPatchStride) {
+      return packetWithPossibleZero(index);
+    }
+    const Index otherIndex = (NumDims == 4) ? 0 : indices[0] / m_fastOtherStride;
+    eigen_assert(otherIndex == indices[1] / m_fastOtherStride);
+
+    // Find the offset of the element wrt the location of the first element.
+    const Index patchOffsets[2] = {(indices[0] - patchIndex * m_patchStride) / m_fastOutputDepth,
+                                   (indices[1] - patchIndex * m_patchStride) / m_fastOutputDepth};
+
+    const Index patch2DIndex = (NumDims == 4) ? patchIndex : (indices[0] - otherIndex * m_otherStride) / m_fastPatchStride;
+    eigen_assert(patch2DIndex == (indices[1] - otherIndex * m_otherStride) / m_fastPatchStride);
+
+    const Index colIndex = patch2DIndex / m_fastOutputRows;
+    const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride, patchOffsets[1] / m_fastColStride};
+
+    // Calculate col indices in the original input tensor.
+    const Index inputCols[2] = {colIndex * m_col_strides + colOffsets[0] -
+      m_colPaddingLeft, colIndex * m_col_strides + colOffsets[1] - m_colPaddingLeft};
+    if (inputCols[1] < 0 || inputCols[0] >= m_inputCols) {
+      return internal::pset1<PacketReturnType>(Scalar(m_paddingValue));
+    }
+
+    if (inputCols[0] == inputCols[1]) {
+      const Index rowIndex = patch2DIndex - colIndex * m_outputRows;
+      const Index rowOffsets[2] = {patchOffsets[0] - colOffsets[0]*m_colStride, patchOffsets[1] - colOffsets[1]*m_colStride};
+      eigen_assert(rowOffsets[0] <= rowOffsets[1]);
+      // Calculate col indices in the original input tensor.
+      const Index inputRows[2] = {rowIndex * m_row_strides + rowOffsets[0] -
+        m_rowPaddingTop, rowIndex * m_row_strides + rowOffsets[1] - m_rowPaddingTop};
+
+      if (inputRows[1] < 0 || inputRows[0] >= m_inputRows) {
+        return internal::pset1<PacketReturnType>(Scalar(m_paddingValue));
+      }
+
+      if (inputRows[0] >= 0 && inputRows[1] < m_inputRows) {
+        // no padding
+        const int depth_index = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - 1;
+        const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index];
+        const Index inputIndex = depth + inputRows[0] * m_rowInputStride + inputCols[0] * m_colInputStride + otherIndex * m_patchInputStride;
+        return m_impl.template packet<Unaligned>(inputIndex);
+      }
+    }
+
+    return packetWithPossibleZero(index);
+  }
+
+  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+
+  const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+
+  Index rowPaddingTop() const { return m_rowPaddingTop; }
+  Index colPaddingLeft() const { return m_colPaddingLeft; }
+  Index outputRows() const { return m_outputRows; }
+  Index outputCols() const { return m_outputCols; }
+  Index userRowStride() const { return m_row_strides; }
+  Index userColStride() const { return m_col_strides; }
+  Index userInRowStride() const { return m_in_row_strides; }
+  Index userInColStride() const { return m_in_col_strides; }
+  Index rowInflateStride() const { return m_row_inflate_strides; }
+  Index colInflateStride() const { return m_col_inflate_strides; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+  costPerCoeff(bool vectorized) const {
+    // We conservatively estimate the cost for the code path where the computed
+    // index is inside the original image and
+    // TensorEvaluator<ArgType, Device>::CoordAccess is false.
+    const double compute_cost = 3 * TensorOpCost::DivCost<Index>() +
+                                6 * TensorOpCost::MulCost<Index>() +
+                                8 * TensorOpCost::MulCost<Index>();
+    return m_impl.costPerCoeff(vectorized) +
+           TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
+  }
+
+ protected:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
+  {
+    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+    for (int i = 0; i < PacketSize; ++i) {
+      values[i] = coeff(index+i);
+    }
+    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+    return rslt;
+  }
+
+  Dimensions m_dimensions;
+
+  Index m_otherStride;
+  Index m_patchStride;
+  Index m_colStride;
+  Index m_row_strides;
+  Index m_col_strides;
+
+  Index m_in_row_strides;
+  Index m_in_col_strides;
+  Index m_row_inflate_strides;
+  Index m_col_inflate_strides;
+
+  Index m_input_rows_eff;
+  Index m_input_cols_eff;
+  Index m_patch_rows_eff;
+  Index m_patch_cols_eff;
+
+  internal::TensorIntDivisor<Index> m_fastOtherStride;
+  internal::TensorIntDivisor<Index> m_fastPatchStride;
+  internal::TensorIntDivisor<Index> m_fastColStride;
+  internal::TensorIntDivisor<Index> m_fastInflateRowStride;
+  internal::TensorIntDivisor<Index> m_fastInflateColStride;
+  internal::TensorIntDivisor<Index> m_fastInputColsEff;
+
+  Index m_rowInputStride;
+  Index m_colInputStride;
+  Index m_patchInputStride;
+
+  Index m_inputDepth;
+  Index m_inputRows;
+  Index m_inputCols;
+
+  Index m_outputRows;
+  Index m_outputCols;
+
+  Index m_rowPaddingTop;
+  Index m_colPaddingLeft;
+
+  internal::TensorIntDivisor<Index> m_fastOutputRows;
+  internal::TensorIntDivisor<Index> m_fastOutputDepth;
+
+  Scalar m_paddingValue;
+
+  TensorEvaluator<ArgType, Device> m_impl;
+};
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
new file mode 100644
index 000000000..3209fecd3
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
@@ -0,0 +1,725 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H
+#define EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H
+
+
+#if EIGEN_HAS_CONSTEXPR && EIGEN_HAS_VARIADIC_TEMPLATES
+
+#define EIGEN_HAS_INDEX_LIST
+
+namespace Eigen {
+
+/** \internal
+  *
+  * \class TensorIndexList
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief Set of classes used to encode a set of Tensor dimensions/indices.
+  *
+  * The indices in the list can be known at compile time or at runtime. A mix
+  * of static and dynamic indices can also be provided if needed. The tensor
+  * code will attempt to take advantage of the indices that are known at
+  * compile time to optimize the code it generates.
+  *
+  * This functionality requires a c++11 compliant compiler. If your compiler
+  * is older you need to use arrays of indices instead.
+  *
+  * Several examples are provided in the cxx11_tensor_index_list.cpp file.
+  *
+  * \sa Tensor
+  */
+
+template <DenseIndex n>
+struct type2index {
+  static const DenseIndex value = n;
+  EIGEN_DEVICE_FUNC constexpr operator DenseIndex() const { return n; }
+  EIGEN_DEVICE_FUNC void set(DenseIndex val) {
+    eigen_assert(val == n);
+  }
+};
+
+// This can be used with IndexPairList to get compile-time constant pairs,
+// such as IndexPairList<type2indexpair<1,2>, type2indexpair<3,4>>().
+template <DenseIndex f, DenseIndex s>
+struct type2indexpair {
+  static const DenseIndex first = f;
+  static const DenseIndex second = s;
+
+  constexpr EIGEN_DEVICE_FUNC operator IndexPair<DenseIndex>() const {
+    return IndexPair<DenseIndex>(f, s);
+  }
+
+  EIGEN_DEVICE_FUNC void set(const IndexPair<DenseIndex>& val) {
+    eigen_assert(val.first == f);
+    eigen_assert(val.second == s);
+  }
+};
+
+
+template<DenseIndex n> struct NumTraits<type2index<n> >
+{
+  typedef DenseIndex Real;
+  enum {
+    IsComplex = 0,
+    RequireInitialization = false,
+    ReadCost = 1,
+    AddCost = 1,
+    MulCost = 1
+  };
+
+  EIGEN_DEVICE_FUNC static inline Real epsilon() { return 0; }
+  EIGEN_DEVICE_FUNC static inline Real dummy_precision() { return 0; }
+  EIGEN_DEVICE_FUNC static inline Real highest() { return n; }
+  EIGEN_DEVICE_FUNC static inline Real lowest() { return n; }
+};
+
+namespace internal {
+template <typename T>
+EIGEN_DEVICE_FUNC void update_value(T& val, DenseIndex new_val) {
+  val = new_val;
+}
+template <DenseIndex n>
+EIGEN_DEVICE_FUNC void update_value(type2index<n>& val, DenseIndex new_val) {
+  val.set(new_val);
+}
+
+template <typename T>
+EIGEN_DEVICE_FUNC void update_value(T& val, IndexPair<DenseIndex> new_val) {
+  val = new_val;
+}
+template <DenseIndex f, DenseIndex s>
+EIGEN_DEVICE_FUNC void update_value(type2indexpair<f, s>& val, IndexPair<DenseIndex> new_val) {
+  val.set(new_val);
+}
+
+
+template <typename T>
+struct is_compile_time_constant {
+  static constexpr bool value = false;
+};
+
+template <DenseIndex idx>
+struct is_compile_time_constant<type2index<idx> > {
+  static constexpr bool value = true;
+};
+template <DenseIndex idx>
+struct is_compile_time_constant<const type2index<idx> > {
+  static constexpr bool value = true;
+};
+template <DenseIndex idx>
+struct is_compile_time_constant<type2index<idx>& > {
+  static constexpr bool value = true;
+};
+template <DenseIndex idx>
+struct is_compile_time_constant<const type2index<idx>& > {
+  static constexpr bool value = true;
+};
+
+template <DenseIndex f, DenseIndex s>
+struct is_compile_time_constant<type2indexpair<f, s> > {
+  static constexpr bool value = true;
+};
+template <DenseIndex f, DenseIndex s>
+struct is_compile_time_constant<const type2indexpair<f, s> > {
+  static constexpr bool value = true;
+};
+template <DenseIndex f, DenseIndex s>
+struct is_compile_time_constant<type2indexpair<f, s>& > {
+  static constexpr bool value = true;
+};
+template <DenseIndex f, DenseIndex s>
+struct is_compile_time_constant<const type2indexpair<f, s>& > {
+  static constexpr bool value = true;
+};
+
+
+template<typename... T>
+struct IndexTuple;
+
+template<typename T, typename... O>
+struct IndexTuple<T, O...> {
+  EIGEN_DEVICE_FUNC constexpr IndexTuple() : head(), others() { }
+  EIGEN_DEVICE_FUNC constexpr IndexTuple(const T& v, const O... o) : head(v), others(o...) { }
+
+  constexpr static int count = 1 + sizeof...(O);
+  T head;
+  IndexTuple<O...> others;
+  typedef T Head;
+  typedef IndexTuple<O...> Other;
+};
+
+template<typename T>
+  struct IndexTuple<T> {
+  EIGEN_DEVICE_FUNC constexpr IndexTuple() : head() { }
+  EIGEN_DEVICE_FUNC constexpr IndexTuple(const T& v) : head(v) { }
+
+  constexpr static int count = 1;
+  T head;
+  typedef T Head;
+};
+
+
+template<int N, typename... T>
+struct IndexTupleExtractor;
+
+template<int N, typename T, typename... O>
+struct IndexTupleExtractor<N, T, O...> {
+
+  typedef typename IndexTupleExtractor<N-1, O...>::ValType ValType;
+
+  EIGEN_DEVICE_FUNC static constexpr ValType& get_val(IndexTuple<T, O...>& val) {
+    return IndexTupleExtractor<N-1, O...>::get_val(val.others);
+  }
+
+  EIGEN_DEVICE_FUNC static constexpr const ValType& get_val(const IndexTuple<T, O...>& val) {
+    return IndexTupleExtractor<N-1, O...>::get_val(val.others);
+  }
+  template <typename V>
+  EIGEN_DEVICE_FUNC static void set_val(IndexTuple<T, O...>& val, V& new_val) {
+    IndexTupleExtractor<N-1, O...>::set_val(val.others, new_val);
+  }
+
+};
+
+template<typename T, typename... O>
+  struct IndexTupleExtractor<0, T, O...> {
+
+  typedef T ValType;
+
+  EIGEN_DEVICE_FUNC static constexpr ValType& get_val(IndexTuple<T, O...>& val) {
+    return val.head;
+  }
+  EIGEN_DEVICE_FUNC static constexpr const ValType& get_val(const IndexTuple<T, O...>& val) {
+    return val.head;
+  }
+  template <typename V>
+  EIGEN_DEVICE_FUNC static void set_val(IndexTuple<T, O...>& val, V& new_val) {
+    val.head = new_val;
+  }
+};
+
+
+
+template <int N, typename T, typename... O>
+EIGEN_DEVICE_FUNC constexpr typename IndexTupleExtractor<N, T, O...>::ValType& array_get(IndexTuple<T, O...>& tuple) {
+  return IndexTupleExtractor<N, T, O...>::get_val(tuple);
+}
+template <int N, typename T, typename... O>
+EIGEN_DEVICE_FUNC constexpr const typename IndexTupleExtractor<N, T, O...>::ValType& array_get(const IndexTuple<T, O...>& tuple) {
+  return IndexTupleExtractor<N, T, O...>::get_val(tuple);
+}
+template <typename T, typename... O>
+  struct array_size<IndexTuple<T, O...> > {
+  static const size_t value = IndexTuple<T, O...>::count;
+};
+template <typename T, typename... O>
+  struct array_size<const IndexTuple<T, O...> > {
+  static const size_t value = IndexTuple<T, O...>::count;
+};
+
+
+
+
+template <DenseIndex Idx, typename ValueT>
+struct tuple_coeff {
+  template <typename... T>
+  EIGEN_DEVICE_FUNC static constexpr ValueT get(const DenseIndex i, const IndexTuple<T...>& t) {
+    //    return array_get<Idx>(t) * (i == Idx) + tuple_coeff<Idx-1>::get(i, t) * (i != Idx);
+    return (i == Idx ? array_get<Idx>(t) : tuple_coeff<Idx-1, ValueT>::get(i, t));
+  }
+  template <typename... T>
+  EIGEN_DEVICE_FUNC static void set(const DenseIndex i, IndexTuple<T...>& t, const ValueT& value) {
+    if (i == Idx) {
+      update_value(array_get<Idx>(t), value);
+    } else {
+      tuple_coeff<Idx-1, ValueT>::set(i, t, value);
+    }
+  }
+
+  template <typename... T>
+  EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const DenseIndex i, const IndexTuple<T...>& t) {
+    return ((i == Idx) & is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value) ||
+        tuple_coeff<Idx-1, ValueT>::value_known_statically(i, t);
+  }
+
+  template <typename... T>
+  EIGEN_DEVICE_FUNC static constexpr bool values_up_to_known_statically(const IndexTuple<T...>& t) {
+    return is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value &&
+        tuple_coeff<Idx-1, ValueT>::values_up_to_known_statically(t);
+  }
+
+  template <typename... T>
+  EIGEN_DEVICE_FUNC static constexpr bool values_up_to_statically_known_to_increase(const IndexTuple<T...>& t) {
+    return is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value &&
+           is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value &&
+           array_get<Idx>(t) > array_get<Idx-1>(t) &&
+           tuple_coeff<Idx-1, ValueT>::values_up_to_statically_known_to_increase(t);
+  }
+};
+
+template <typename ValueT>
+struct tuple_coeff<0, ValueT> {
+  template <typename... T>
+  EIGEN_DEVICE_FUNC static constexpr ValueT get(const DenseIndex /*i*/, const IndexTuple<T...>& t) {
+    //  eigen_assert (i == 0);  // gcc fails to compile assertions in constexpr
+    return array_get<0>(t)/* * (i == 0)*/;
+  }
+  template <typename... T>
+  EIGEN_DEVICE_FUNC static void set(const DenseIndex i, IndexTuple<T...>& t, const ValueT value) {
+    eigen_assert (i == 0);
+    update_value(array_get<0>(t), value);
+  }
+  template <typename... T>
+  EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const DenseIndex i, const IndexTuple<T...>&) {
+    return is_compile_time_constant<typename IndexTupleExtractor<0, T...>::ValType>::value & (i == 0);
+  }
+
+  template <typename... T>
+  EIGEN_DEVICE_FUNC static constexpr bool values_up_to_known_statically(const IndexTuple<T...>&) {
+    return is_compile_time_constant<typename IndexTupleExtractor<0, T...>::ValType>::value;
+  }
+
+  template <typename... T>
+  EIGEN_DEVICE_FUNC static constexpr bool values_up_to_statically_known_to_increase(const IndexTuple<T...>&) {
+    return true;
+  }
+};
+}  // namespace internal
+
+
+
+template<typename FirstType, typename... OtherTypes>
+struct IndexList : internal::IndexTuple<FirstType, OtherTypes...> {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr DenseIndex operator[] (const DenseIndex i) const {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::get(i, *this);
+  }
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr DenseIndex get(const DenseIndex i) const {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::get(i, *this);
+  }
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const DenseIndex i, const DenseIndex value) {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::set(i, *this, value);
+  }
+
+  EIGEN_DEVICE_FUNC constexpr IndexList(const internal::IndexTuple<FirstType, OtherTypes...>& other) : internal::IndexTuple<FirstType, OtherTypes...>(other) { }
+  EIGEN_DEVICE_FUNC constexpr IndexList(FirstType& first, OtherTypes... other) : internal::IndexTuple<FirstType, OtherTypes...>(first, other...) { }
+  EIGEN_DEVICE_FUNC constexpr IndexList() : internal::IndexTuple<FirstType, OtherTypes...>() { }
+
+  EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const DenseIndex i) const {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::value_known_statically(i, *this);
+  }
+  EIGEN_DEVICE_FUNC constexpr bool all_values_known_statically() const {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::values_up_to_known_statically(*this);
+  }
+
+  EIGEN_DEVICE_FUNC constexpr bool values_statically_known_to_increase() const {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::values_up_to_statically_known_to_increase(*this);
+  }
+};
+
+
+template<typename FirstType, typename... OtherTypes>
+constexpr IndexList<FirstType, OtherTypes...> make_index_list(FirstType val1, OtherTypes... other_vals) {
+  return IndexList<FirstType, OtherTypes...>(val1, other_vals...);
+}
+
+
+template<typename FirstType, typename... OtherTypes>
+struct IndexPairList : internal::IndexTuple<FirstType, OtherTypes...> {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr IndexPair<DenseIndex> operator[] (const DenseIndex i) const {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, IndexPair<DenseIndex>>::get(i, *this);
+  }
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const DenseIndex i, const IndexPair<DenseIndex> value) {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...>>::value-1, IndexPair<DenseIndex> >::set(i, *this, value);
+  }
+
+  EIGEN_DEVICE_FUNC  constexpr IndexPairList(const internal::IndexTuple<FirstType, OtherTypes...>& other) : internal::IndexTuple<FirstType, OtherTypes...>(other) { }
+  EIGEN_DEVICE_FUNC  constexpr IndexPairList() : internal::IndexTuple<FirstType, OtherTypes...>() { }
+
+  EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const DenseIndex i) const {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::value_known_statically(i, *this);
+  }
+};
+
+namespace internal {
+
+template<typename FirstType, typename... OtherTypes> size_t array_prod(const IndexList<FirstType, OtherTypes...>& sizes) {
+  size_t result = 1;
+  for (int i = 0; i < array_size<IndexList<FirstType, OtherTypes...> >::value; ++i) {
+    result *= sizes[i];
+  }
+  return result;
+}
+
+template<typename FirstType, typename... OtherTypes> struct array_size<IndexList<FirstType, OtherTypes...> > {
+  static const size_t value = array_size<IndexTuple<FirstType, OtherTypes...> >::value;
+};
+template<typename FirstType, typename... OtherTypes> struct array_size<const IndexList<FirstType, OtherTypes...> > {
+  static const size_t value = array_size<IndexTuple<FirstType, OtherTypes...> >::value;
+};
+
+template<typename FirstType, typename... OtherTypes> struct array_size<IndexPairList<FirstType, OtherTypes...> > {
+  static const size_t value = std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value;
+};
+template<typename FirstType, typename... OtherTypes> struct array_size<const IndexPairList<FirstType, OtherTypes...> > {
+  static const size_t value = std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value;
+};
+
+template<DenseIndex N, typename FirstType, typename... OtherTypes> EIGEN_DEVICE_FUNC constexpr DenseIndex array_get(IndexList<FirstType, OtherTypes...>& a) {
+  return IndexTupleExtractor<N, FirstType, OtherTypes...>::get_val(a);
+}
+template<DenseIndex N, typename FirstType, typename... OtherTypes> EIGEN_DEVICE_FUNC constexpr DenseIndex array_get(const IndexList<FirstType, OtherTypes...>& a) {
+  return IndexTupleExtractor<N, FirstType, OtherTypes...>::get_val(a);
+}
+
+template <typename T>
+struct index_known_statically_impl {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) {
+    return false;
+  }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_known_statically_impl<IndexList<FirstType, OtherTypes...> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i) {
+    return IndexList<FirstType, OtherTypes...>().value_known_statically(i);
+  }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_known_statically_impl<const IndexList<FirstType, OtherTypes...> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i) {
+    return IndexList<FirstType, OtherTypes...>().value_known_statically(i);
+  }
+};
+
+
+template <typename T>
+struct all_indices_known_statically_impl {
+  static constexpr bool run() {
+    return false;
+  }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct all_indices_known_statically_impl<IndexList<FirstType, OtherTypes...> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run() {
+    return IndexList<FirstType, OtherTypes...>().all_values_known_statically();
+  }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct all_indices_known_statically_impl<const IndexList<FirstType, OtherTypes...> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run() {
+    return IndexList<FirstType, OtherTypes...>().all_values_known_statically();
+  }
+};
+
+
+template <typename T>
+struct indices_statically_known_to_increase_impl {
+  EIGEN_DEVICE_FUNC static constexpr bool run() {
+    return false;
+  }
+};
+
+template <typename FirstType, typename... OtherTypes>
+  struct indices_statically_known_to_increase_impl<IndexList<FirstType, OtherTypes...> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run() {
+    return Eigen::IndexList<FirstType, OtherTypes...>().values_statically_known_to_increase();
+  }
+};
+
+template <typename FirstType, typename... OtherTypes>
+  struct indices_statically_known_to_increase_impl<const IndexList<FirstType, OtherTypes...> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run() {
+    return Eigen::IndexList<FirstType, OtherTypes...>().values_statically_known_to_increase();
+  }
+};
+
+
+template <typename Tx>
+struct index_statically_eq_impl {
+  EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) {
+    return false;
+  }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_statically_eq_impl<IndexList<FirstType, OtherTypes...> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+    return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
+        (IndexList<FirstType, OtherTypes...>().get(i) == value);
+  }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_statically_eq_impl<const IndexList<FirstType, OtherTypes...> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+    return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
+        (IndexList<FirstType, OtherTypes...>().get(i) == value);
+  }
+};
+
+
+template <typename T>
+struct index_statically_ne_impl {
+  EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) {
+    return false;
+  }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_statically_ne_impl<IndexList<FirstType, OtherTypes...> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+    return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
+        (IndexList<FirstType, OtherTypes...>().get(i) != value);
+  }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_statically_ne_impl<const IndexList<FirstType, OtherTypes...> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+    return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
+        (IndexList<FirstType, OtherTypes...>().get(i) != value);
+  }
+};
+
+
+template <typename T>
+struct index_statically_gt_impl {
+  EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) {
+    return false;
+  }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_statically_gt_impl<IndexList<FirstType, OtherTypes...> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+    return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
+        (IndexList<FirstType, OtherTypes...>().get(i) > value);
+  }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_statically_gt_impl<const IndexList<FirstType, OtherTypes...> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+    return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
+        (IndexList<FirstType, OtherTypes...>().get(i) > value);
+  }
+};
+
+
+
+template <typename T>
+struct index_statically_lt_impl {
+  EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) {
+    return false;
+  }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_statically_lt_impl<IndexList<FirstType, OtherTypes...> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+    return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
+        (IndexList<FirstType, OtherTypes...>().get(i) < value);
+  }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_statically_lt_impl<const IndexList<FirstType, OtherTypes...> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+    return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
+        (IndexList<FirstType, OtherTypes...>().get(i) < value);
+  }
+};
+
+
+
+template <typename Tx>
+struct index_pair_first_statically_eq_impl {
+  EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) {
+    return false;
+  }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_pair_first_statically_eq_impl<IndexPairList<FirstType, OtherTypes...> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+    return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &
+        (IndexPairList<FirstType, OtherTypes...>().operator[](i).first == value);
+  }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_pair_first_statically_eq_impl<const IndexPairList<FirstType, OtherTypes...> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+    return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &
+        (IndexPairList<FirstType, OtherTypes...>().operator[](i).first == value);
+  }
+};
+
+
+
+template <typename Tx>
+struct index_pair_second_statically_eq_impl {
+  EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) {
+    return false;
+  }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_pair_second_statically_eq_impl<IndexPairList<FirstType, OtherTypes...> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+    return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &
+        (IndexPairList<FirstType, OtherTypes...>().operator[](i).second == value);
+  }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_pair_second_statically_eq_impl<const IndexPairList<FirstType, OtherTypes...> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+    return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &
+        (IndexPairList<FirstType, OtherTypes...>().operator[](i).second == value);
+  }
+};
+
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#else
+
+namespace Eigen {
+namespace internal {
+
+template <typename T>
+struct index_known_statically_impl {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex) {
+    return false;
+  }
+};
+
+template <typename T>
+struct all_indices_known_statically_impl {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() {
+    return false;
+  }
+};
+
+template <typename T>
+struct indices_statically_known_to_increase_impl {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() {
+    return false;
+  }
+};
+
+template <typename T>
+struct index_statically_eq_impl {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) {
+    return false;
+  }
+};
+
+template <typename T>
+struct index_statically_ne_impl {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) {
+    return false;
+  }
+};
+
+template <typename T>
+struct index_statically_gt_impl {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) {
+    return false;
+  }
+};
+
+template <typename T>
+struct index_statically_lt_impl {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) {
+    return false;
+  }
+};
+
+template <typename Tx>
+struct index_pair_first_statically_eq_impl {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) {
+    return false;
+  }
+};
+
+template <typename Tx>
+struct index_pair_second_statically_eq_impl {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) {
+    return false;
+  }
+};
+
+
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif
+
+
+namespace Eigen {
+namespace internal {
+template <typename T>
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_known_statically(DenseIndex i) {
+  return index_known_statically_impl<T>::run(i);
+}
+
+template <typename T>
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool all_indices_known_statically() {
+  return all_indices_known_statically_impl<T>::run();
+}
+
+template <typename T>
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool indices_statically_known_to_increase() {
+  return indices_statically_known_to_increase_impl<T>::run();
+}
+
+template <typename T>
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_eq(DenseIndex i, DenseIndex value) {
+  return index_statically_eq_impl<T>::run(i, value);
+}
+
+template <typename T>
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_ne(DenseIndex i, DenseIndex value) {
+  return index_statically_ne_impl<T>::run(i, value);
+}
+
+template <typename T>
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_gt(DenseIndex i, DenseIndex value) {
+  return index_statically_gt_impl<T>::run(i, value);
+}
+
+template <typename T>
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_lt(DenseIndex i, DenseIndex value) {
+  return index_statically_lt_impl<T>::run(i, value);
+}
+
+template <typename T>
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_first_statically_eq(DenseIndex i, DenseIndex value) {
+  return index_pair_first_statically_eq_impl<T>::run(i, value);
+}
+
+template <typename T>
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_second_statically_eq(DenseIndex i, DenseIndex value) {
+  return index_pair_second_statically_eq_impl<T>::run(i, value);
+}
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
new file mode 100644
index 000000000..f391fb9ee
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
@@ -0,0 +1,229 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Ke Yang <yangke@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H
+#define EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H
+
+namespace Eigen {
+
+/** \class TensorInflation
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief Tensor inflation class.
+  *
+  *
+  */
+namespace internal {
+template<typename Strides, typename XprType>
+struct traits<TensorInflationOp<Strides, XprType> > : public traits<XprType>
+{
+  typedef typename XprType::Scalar Scalar;
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef typename XprType::Nested Nested;
+  typedef typename remove_reference<Nested>::type _Nested;
+  static const int NumDimensions = XprTraits::NumDimensions;
+  static const int Layout = XprTraits::Layout;
+};
+
+template<typename Strides, typename XprType>
+struct eval<TensorInflationOp<Strides, XprType>, Eigen::Dense>
+{
+  typedef const TensorInflationOp<Strides, XprType>& type;
+};
+
+template<typename Strides, typename XprType>
+struct nested<TensorInflationOp<Strides, XprType>, 1, typename eval<TensorInflationOp<Strides, XprType> >::type>
+{
+  typedef TensorInflationOp<Strides, XprType> type;
+};
+
+}  // end namespace internal
+
+template<typename Strides, typename XprType>
+class TensorInflationOp : public TensorBase<TensorInflationOp<Strides, XprType>, ReadOnlyAccessors>
+{
+  public:
+  typedef typename Eigen::internal::traits<TensorInflationOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorInflationOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorInflationOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorInflationOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorInflationOp(const XprType& expr, const Strides& strides)
+      : m_xpr(expr), m_strides(strides) {}
+
+    EIGEN_DEVICE_FUNC
+    const Strides& strides() const { return m_strides; }
+
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename XprType::Nested>::type&
+    expression() const { return m_xpr; }
+
+  protected:
+    typename XprType::Nested m_xpr;
+    const Strides m_strides;
+};
+
+// Eval as rvalue
+template<typename Strides, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
+{
+  typedef TensorInflationOp<Strides, ArgType> XprType;
+  typedef typename XprType::Index Index;
+  static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+  typedef DSizes<Index, NumDims> Dimensions;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+
+  enum {
+    IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = false,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device), m_strides(op.strides())
+  {
+    m_dimensions = m_impl.dimensions();
+    // Expand each dimension to the inflated dimension.
+    for (int i = 0; i < NumDims; ++i) {
+      m_dimensions[i] = (m_dimensions[i] - 1) * op.strides()[i] + 1;
+    }
+
+    // Remember the strides for fast division.
+    for (int i = 0; i < NumDims; ++i) {
+      m_fastStrides[i] = internal::TensorIntDivisor<Index>(m_strides[i]);
+    }
+
+    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      m_outputStrides[0] = 1;
+      m_inputStrides[0] = 1;
+      for (int i = 1; i < NumDims; ++i) {
+        m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
+        m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
+      }
+    } else {  // RowMajor
+      m_outputStrides[NumDims-1] = 1;
+      m_inputStrides[NumDims-1] = 1;
+      for (int i = NumDims - 2; i >= 0; --i) {
+        m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1];
+        m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
+      }
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+    m_impl.cleanup();
+  }
+
+  // Computes the input index given the output index. Returns true if the output
+  // index doesn't fall into a hole.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool getInputIndex(Index index, Index* inputIndex) const
+  {
+    eigen_assert(index < dimensions().TotalSize());
+    *inputIndex = 0;
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = NumDims - 1; i > 0; --i) {
+        const Index idx = index / m_outputStrides[i];
+        if (idx != idx / m_fastStrides[i] * m_strides[i]) {
+          return false;
+        }
+        *inputIndex += idx / m_strides[i] * m_inputStrides[i];
+        index -= idx * m_outputStrides[i];
+      }
+      if (index != index / m_fastStrides[0] * m_strides[0]) {
+        return false;
+      }
+      *inputIndex += index / m_strides[0];
+      return true;
+    } else {
+      for (int i = 0; i < NumDims - 1; ++i) {
+        const Index idx = index / m_outputStrides[i];
+        if (idx != idx / m_fastStrides[i] * m_strides[i]) {
+          return false;
+        }
+        *inputIndex += idx / m_strides[i] * m_inputStrides[i];
+        index -= idx * m_outputStrides[i];
+      }
+      if (index != index / m_fastStrides[NumDims-1] * m_strides[NumDims-1]) {
+        return false;
+      }
+      *inputIndex += index / m_strides[NumDims - 1];
+    }
+    return true;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+  {
+    Index inputIndex = 0;
+    if (getInputIndex(index, &inputIndex)) {
+     return m_impl.coeff(inputIndex);
+    } else {
+     return Scalar(0);
+    }
+  }
+
+  // TODO(yangke): optimize this function so that we can detect and produce
+  // all-zero packets
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+  {
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
+
+    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+    for (int i = 0; i < PacketSize; ++i) {
+      values[i] = coeff(index+i);
+    }
+    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+    return rslt;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    const double compute_cost = NumDims * (3 * TensorOpCost::DivCost<Index>() +
+                                           3 * TensorOpCost::MulCost<Index>() +
+                                           2 * TensorOpCost::AddCost<Index>());
+    const double input_size = m_impl.dimensions().TotalSize();
+    const double output_size = m_dimensions.TotalSize();
+    if (output_size == 0)
+      return TensorOpCost();
+    return m_impl.costPerCoeff(vectorized) +
+           TensorOpCost(sizeof(CoeffReturnType) * input_size / output_size, 0,
+                        compute_cost, vectorized, PacketSize);
+  }
+
+  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+
+ protected:
+  Dimensions m_dimensions;
+  array<Index, NumDims> m_outputStrides;
+  array<Index, NumDims> m_inputStrides;
+  TensorEvaluator<ArgType, Device> m_impl;
+  const Strides m_strides;
+  array<internal::TensorIntDivisor<Index>, NumDims> m_fastStrides;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h
new file mode 100644
index 000000000..33edc49e3
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h
@@ -0,0 +1,82 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H
+#define EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+
+#include <initializer_list>
+
+namespace Eigen {
+
+/** \class TensorInitializer
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief Helper template to initialize Tensors from std::initializer_lists.
+  */
+namespace internal {
+
+template <typename Derived, int N>
+struct Initializer {
+  typedef std::initializer_list<
+    typename Initializer<Derived, N - 1>::InitList> InitList;
+
+  static void run(TensorEvaluator<Derived, DefaultDevice>& tensor,
+                  Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions>* indices,
+                  const InitList& vals) {
+    int i = 0;
+    for (auto v : vals) {
+      (*indices)[traits<Derived>::NumDimensions - N] = i++;
+      Initializer<Derived, N - 1>::run(tensor, indices, v);
+    }
+  }
+};
+
+template <typename Derived>
+struct Initializer<Derived, 1> {
+  typedef std::initializer_list<typename traits<Derived>::Scalar> InitList;
+
+  static void run(TensorEvaluator<Derived, DefaultDevice>& tensor,
+                  Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions>* indices,
+                  const InitList& vals) {
+    int i = 0;
+    // There is likely a faster way to do that than iterating.
+    for (auto v : vals) {
+      (*indices)[traits<Derived>::NumDimensions - 1] = i++;
+      tensor.coeffRef(*indices) = v;
+    }
+  }
+};
+
+template <typename Derived>
+struct Initializer<Derived, 0> {
+  typedef typename traits<Derived>::Scalar InitList;
+
+  static void run(TensorEvaluator<Derived, DefaultDevice>& tensor,
+                  Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions>*,
+                  const InitList& v) {
+    tensor.coeffRef(0) = v;
+  }
+};
+
+
+template <typename Derived, int N>
+void initialize_tensor(TensorEvaluator<Derived, DefaultDevice>& tensor,
+                       const typename Initializer<Derived, traits<Derived>::NumDimensions>::InitList& vals) {
+  Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions> indices;
+  Initializer<Derived, traits<Derived>::NumDimensions>::run(tensor, &indices, vals);
+}
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_HAS_VARIADIC_TEMPLATES
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
new file mode 100644
index 000000000..ede3939c2
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
@@ -0,0 +1,253 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H
+#define EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H
+
+
+namespace Eigen {
+
+/** \internal
+  *
+  * \class TensorIntDiv
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief Fast integer division by a constant.
+  *
+  * See the paper from Granlund and Montgomery for explanation.
+  *   (at http://dx.doi.org/10.1145/773473.178249)
+  *
+  * \sa Tensor
+  */
+
+namespace internal {
+
+namespace {
+
+  // Note: result is undefined if val == 0
+  template <typename T>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+  typename internal::enable_if<sizeof(T)==4,int>::type count_leading_zeros(const T val)
+  {
+#ifdef __CUDA_ARCH__
+    return __clz(val);
+#elif EIGEN_COMP_MSVC
+    unsigned long index;
+    _BitScanReverse(&index, val);
+    return 31 - index;
+#else
+    EIGEN_STATIC_ASSERT(sizeof(unsigned long long) == 8, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    return __builtin_clz(static_cast<uint32_t>(val));
+#endif
+  }
+
+  template <typename T>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+  typename internal::enable_if<sizeof(T)==8,int>::type count_leading_zeros(const T val)
+  {
+#ifdef __CUDA_ARCH__
+    return __clzll(val);
+#elif EIGEN_COMP_MSVC && EIGEN_ARCH_x86_64
+    unsigned long index;
+    _BitScanReverse64(&index, val);
+    return 63 - index;
+#elif EIGEN_COMP_MSVC
+    // MSVC's _BitScanReverse64 is not available for 32bits builds.
+    unsigned int lo = (unsigned int)(val&0xffffffff);
+    unsigned int hi = (unsigned int)((val>>32)&0xffffffff);
+    int n;
+    if(hi==0)
+      n = 32 + count_leading_zeros<unsigned int>(lo);
+    else
+      n = count_leading_zeros<unsigned int>(hi);
+    return n;
+#else
+    EIGEN_STATIC_ASSERT(sizeof(unsigned long long) == 8, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    return __builtin_clzll(static_cast<uint64_t>(val));
+#endif
+  }
+
+  template <typename T>
+  struct UnsignedTraits {
+    typedef typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type type;
+  };
+
+  template <typename T>
+  struct DividerTraits {
+    typedef typename UnsignedTraits<T>::type type;
+    static const int N = sizeof(T) * 8;
+  };
+
+  template <typename T>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) {
+#if defined(__CUDA_ARCH__)
+    return __umulhi(a, b);
+#else
+    return (static_cast<uint64_t>(a) * b) >> 32;
+#endif
+  }
+
+  template <typename T>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) {
+#if defined(__CUDA_ARCH__)
+    return __umul64hi(a, b);
+#elif defined(__SIZEOF_INT128__)
+    __uint128_t v = static_cast<__uint128_t>(a) * static_cast<__uint128_t>(b);
+    return static_cast<uint64_t>(v >> 64);
+#else
+    return (TensorUInt128<static_val<0>, uint64_t>(a) * TensorUInt128<static_val<0>, uint64_t>(b)).upper();
+#endif
+  }
+
+  template <int N, typename T>
+  struct DividerHelper {
+    static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t computeMultiplier(const int log_div, const T divider) {
+      EIGEN_STATIC_ASSERT(N == 32, YOU_MADE_A_PROGRAMMING_MISTAKE);
+      return static_cast<uint32_t>((static_cast<uint64_t>(1) << (N+log_div)) / divider - (static_cast<uint64_t>(1) << N) + 1);
+    }
+  };
+
+  template <typename T>
+  struct DividerHelper<64, T> {
+    static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) {
+#if defined(__SIZEOF_INT128__) && !defined(__CUDA_ARCH__)
+      return static_cast<uint64_t>((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1);
+#else
+      const uint64_t shift = 1ULL << log_div;
+      TensorUInt128<uint64_t, uint64_t> result = TensorUInt128<uint64_t, static_val<0> >(shift, 0) / TensorUInt128<static_val<0>, uint64_t>(divider)
+                                               - TensorUInt128<static_val<1>, static_val<0> >(1, 0)
+                                               + TensorUInt128<static_val<0>, static_val<1> >(1);
+      return static_cast<uint64_t>(result);
+#endif
+    }
+  };
+}
+
+
+template <typename T, bool div_gt_one = false>
+struct TensorIntDivisor {
+ public:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() {
+    multiplier = 0;
+    shift1 = 0;
+    shift2 = 0;
+  }
+
+  // Must have 0 < divider < 2^31. This is relaxed to
+  // 0 < divider < 2^63 when using 64-bit indices on platforms that support
+  // the __uint128_t type.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor(const T divider) {
+    const int N = DividerTraits<T>::N;
+    eigen_assert(static_cast<typename UnsignedTraits<T>::type>(divider) < NumTraits<UnsignedType>::highest()/2);
+    eigen_assert(divider > 0);
+
+    // fast ln2
+    const int leading_zeros = count_leading_zeros(static_cast<UnsignedType>(divider));
+    int log_div = N - leading_zeros;
+    // if divider is a power of two then log_div is 1 more than it should be.
+    if ((static_cast<typename UnsignedTraits<T>::type>(1) << (log_div-1)) == static_cast<typename UnsignedTraits<T>::type>(divider))
+      log_div--;
+
+    multiplier = DividerHelper<N, T>::computeMultiplier(log_div, divider);
+    shift1 = log_div > 1 ? 1 : log_div;
+    shift2 = log_div > 1 ? log_div-1 : 0;
+  }
+
+  // Must have 0 <= numerator. On platforms that dont support the __uint128_t
+  // type numerator should also be less than 2^32-1.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T divide(const T numerator) const {
+    eigen_assert(static_cast<typename UnsignedTraits<T>::type>(numerator) < NumTraits<UnsignedType>::highest()/2);
+    //eigen_assert(numerator >= 0); // this is implicitly asserted by the line above
+
+    UnsignedType t1 = muluh(multiplier, numerator);
+    UnsignedType t = (static_cast<UnsignedType>(numerator) - t1) >> shift1;
+    return (t1 + t) >> shift2;
+  }
+
+ private:
+  typedef typename DividerTraits<T>::type UnsignedType;
+  UnsignedType multiplier;
+  int32_t shift1;
+  int32_t shift2;
+};
+
+
+// Optimized version for signed 32 bit integers.
+// Derived from Hacker's Delight.
+// Only works for divisors strictly greater than one
+template <>
+class TensorIntDivisor<int32_t, true> {
+ public:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() {
+    magic = 0;
+    shift = 0;
+  }
+  // Must have 2 <= divider
+  EIGEN_DEVICE_FUNC TensorIntDivisor(int32_t divider)  {
+    eigen_assert(divider >= 2);
+    calcMagic(divider);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const {
+#ifdef __CUDA_ARCH__
+    return (__umulhi(magic, n) >> shift);
+#else
+    uint64_t v = static_cast<uint64_t>(magic) * static_cast<uint64_t>(n);
+    return (static_cast<uint32_t>(v >> 32) >> shift);
+#endif
+  }
+
+private:
+  // Compute the magic numbers. See Hacker's Delight section 10 for an in
+  // depth explanation.
+  EIGEN_DEVICE_FUNC void calcMagic(int32_t d) {
+   const unsigned two31 = 0x80000000;     // 2**31.
+   unsigned ad = d;
+   unsigned t = two31 + (ad >> 31);
+   unsigned anc = t - 1 - t%ad;     // Absolute value of nc.
+   int p = 31;                      // Init. p.
+   unsigned q1 = two31/anc;         // Init. q1 = 2**p/|nc|.
+   unsigned r1 = two31 - q1*anc;    // Init. r1 = rem(2**p, |nc|).
+   unsigned q2 = two31/ad;          // Init. q2 = 2**p/|d|.
+   unsigned r2 = two31 - q2*ad;     // Init. r2 = rem(2**p, |d|).
+   unsigned delta = 0;
+   do {
+      p = p + 1;
+      q1 = 2*q1;           // Update q1 = 2**p/|nc|.
+      r1 = 2*r1;           // Update r1 = rem(2**p, |nc|).
+      if (r1 >= anc) {     // (Must be an unsigned
+         q1 = q1 + 1;      // comparison here).
+         r1 = r1 - anc;}
+      q2 = 2*q2;           // Update q2 = 2**p/|d|.
+      r2 = 2*r2;           // Update r2 = rem(2**p, |d|).
+      if (r2 >= ad) {      // (Must be an unsigned
+         q2 = q2 + 1;      // comparison here).
+         r2 = r2 - ad;}
+      delta = ad - r2;
+   } while (q1 < delta || (q1 == delta && r1 == 0));
+
+   magic = (unsigned)(q2 + 1);
+   shift = p - 32;
+  }
+
+  uint32_t magic;
+  int32_t shift;
+};
+
+
+template <typename T, bool div_gt_one>
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator / (const T& numerator, const TensorIntDivisor<T, div_gt_one>& divisor) {
+  return divisor.divide(numerator);
+}
+
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
new file mode 100644
index 000000000..cd0109ef4
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
@@ -0,0 +1,209 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H
+#define EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H
+
+namespace Eigen {
+
+/** \class TensorLayoutSwap
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief Swap the layout from col-major to row-major, or row-major
+  * to col-major, and invert the order of the dimensions.
+  *
+  * Beware: the dimensions are reversed by this operation. If you want to
+  * preserve the ordering of the dimensions, you need to combine this
+  * operation with a shuffle.
+  *
+  * \example:
+  * Tensor<float, 2, ColMajor> input(2, 4);
+  * Tensor<float, 2, RowMajor> output = input.swap_layout();
+  * eigen_assert(output.dimension(0) == 4);
+  * eigen_assert(output.dimension(1) == 2);
+  *
+  * array<int, 2> shuffle(1, 0);
+  * output = input.swap_layout().shuffle(shuffle);
+  * eigen_assert(output.dimension(0) == 2);
+  * eigen_assert(output.dimension(1) == 4);
+  *
+  */
+namespace internal {
+template<typename XprType>
+struct traits<TensorLayoutSwapOp<XprType> > : public traits<XprType>
+{
+  typedef typename XprType::Scalar Scalar;
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef typename XprType::Nested Nested;
+  typedef typename remove_reference<Nested>::type _Nested;
+  static const int NumDimensions = traits<XprType>::NumDimensions;
+  static const int Layout = (traits<XprType>::Layout == ColMajor) ? RowMajor : ColMajor;
+};
+
+template<typename XprType>
+struct eval<TensorLayoutSwapOp<XprType>, Eigen::Dense>
+{
+  typedef const TensorLayoutSwapOp<XprType>& type;
+};
+
+template<typename XprType>
+struct nested<TensorLayoutSwapOp<XprType>, 1, typename eval<TensorLayoutSwapOp<XprType> >::type>
+{
+  typedef TensorLayoutSwapOp<XprType> type;
+};
+
+}  // end namespace internal
+
+
+
+template<typename XprType>
+class TensorLayoutSwapOp : public TensorBase<TensorLayoutSwapOp<XprType>, WriteAccessors>
+{
+  public:
+  typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorLayoutSwapOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorLayoutSwapOp(const XprType& expr)
+      : m_xpr(expr) {}
+
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename XprType::Nested>::type&
+    expression() const { return m_xpr; }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE TensorLayoutSwapOp& operator = (const TensorLayoutSwapOp& other)
+    {
+      typedef TensorAssignOp<TensorLayoutSwapOp, const TensorLayoutSwapOp> Assign;
+      Assign assign(*this, other);
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+      return *this;
+    }
+
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE TensorLayoutSwapOp& operator = (const OtherDerived& other)
+    {
+      typedef TensorAssignOp<TensorLayoutSwapOp, const OtherDerived> Assign;
+      Assign assign(*this, other);
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+      return *this;
+    }
+
+  protected:
+    typename XprType::Nested m_xpr;
+};
+
+
+// Eval as rvalue
+template<typename ArgType, typename Device>
+struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
+{
+  typedef TensorLayoutSwapOp<ArgType> XprType;
+  typedef typename XprType::Index Index;
+  static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+  typedef DSizes<Index, NumDims> Dimensions;
+
+  enum {
+    IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
+    CoordAccess = false,  // to be implemented
+    RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device)
+  {
+    for(int i = 0; i < NumDims; ++i) {
+      m_dimensions[i] = m_impl.dimensions()[NumDims-1-i];
+    }
+  }
+
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
+    return m_impl.evalSubExprsIfNeeded(data);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+    m_impl.cleanup();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+  {
+    return m_impl.coeff(index);
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+  {
+    return m_impl.template packet<LoadMode>(index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    return m_impl.costPerCoeff(vectorized);
+  }
+
+  EIGEN_DEVICE_FUNC Scalar* data() const { return m_impl.data(); }
+
+  const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+
+ protected:
+  TensorEvaluator<ArgType, Device> m_impl;
+  Dimensions m_dimensions;
+};
+
+
+// Eval as lvalue
+template<typename ArgType, typename Device>
+  struct TensorEvaluator<TensorLayoutSwapOp<ArgType>, Device>
+  : public TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
+{
+  typedef TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device> Base;
+  typedef TensorLayoutSwapOp<ArgType> XprType;
+
+  enum {
+    IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
+    CoordAccess = false  // to be implemented
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+    : Base(op, device)
+  { }
+
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
+  {
+    return this->m_impl.coeffRef(index);
+  }
+  template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void writePacket(Index index, const PacketReturnType& x)
+  {
+    this->m_impl.template writePacket<StoreMode>(index, x);
+  }
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
new file mode 100644
index 000000000..ee0078bbc
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
@@ -0,0 +1,54 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_META_MACROS_H
+#define EIGEN_CXX11_TENSOR_TENSOR_META_MACROS_H
+
+
+/** use this macro in sfinae selection in templated functions
+ *
+ *   template<typename T,
+ *            typename std::enable_if< isBanana<T>::value , int >::type = 0
+ *   >
+ *   void foo(){}
+ *
+ *   becomes =>
+ *
+ *   template<typename TopoType,
+ *           SFINAE_ENABLE_IF( isBanana<T>::value )
+ *   >
+ *   void foo(){}
+ */
+
+// SFINAE requires variadic templates
+#ifndef __CUDACC__
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+  // SFINAE doesn't work for gcc <= 4.7
+  #ifdef EIGEN_COMP_GNUC
+    #if EIGEN_GNUC_AT_LEAST(4,8)
+      #define EIGEN_HAS_SFINAE
+    #endif
+  #else
+    #define EIGEN_HAS_SFINAE
+  #endif
+#endif
+#endif
+
+#define EIGEN_SFINAE_ENABLE_IF( __condition__ ) \
+    typename internal::enable_if< ( __condition__ ) , int >::type = 0
+
+
+#if EIGEN_HAS_CONSTEXPR
+#define EIGEN_CONSTEXPR constexpr
+#else
+#define EIGEN_CONSTEXPR
+#endif
+
+
+#endif
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
new file mode 100644
index 000000000..a8e55757e
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
@@ -0,0 +1,321 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_MAP_H
+#define EIGEN_CXX11_TENSOR_TENSOR_MAP_H
+
+namespace Eigen {
+
+/** \class TensorMap
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief A tensor expression mapping an existing array of data.
+  *
+  */
+/// template <class> class MakePointer_ is added to convert the host pointer to the device pointer.
+/// It is added due to the fact that for our device compiler T* is not allowed.
+/// If we wanted to use the same Evaluator functions we have to convert that type to our pointer T.
+/// This is done through our MakePointer_ class. By default the Type in the MakePointer_<T> is T* .
+/// Therefore, by adding the default value, we managed to convert the type and it does not break any
+/// existing code as its default value is T*.
+template<typename PlainObjectType, int Options_, template <class> class MakePointer_> class TensorMap : public TensorBase<TensorMap<PlainObjectType, Options_, MakePointer_> >
+{
+  public:
+    typedef TensorMap<PlainObjectType, Options_, MakePointer_> Self;
+    typedef typename PlainObjectType::Base Base;
+    typedef typename Eigen::internal::nested<Self>::type Nested;
+    typedef typename internal::traits<PlainObjectType>::StorageKind StorageKind;
+    typedef typename internal::traits<PlainObjectType>::Index Index;
+    typedef typename internal::traits<PlainObjectType>::Scalar Scalar;
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+    typedef typename Base::CoeffReturnType CoeffReturnType;
+
+  /*    typedef typename internal::conditional<
+                         bool(internal::is_lvalue<PlainObjectType>::value),
+                         Scalar *,
+                         const Scalar *>::type
+                     PointerType;*/
+    typedef typename MakePointer_<Scalar>::Type PointerType;
+    typedef PointerType PointerArgType;
+
+    static const int Options = Options_;
+
+    static const Index NumIndices = PlainObjectType::NumIndices;
+    typedef typename PlainObjectType::Dimensions Dimensions;
+
+    enum {
+      IsAligned = ((int(Options_)&Aligned)==Aligned),
+      Layout = PlainObjectType::Layout,
+      CoordAccess = true,
+      RawAccess = true
+    };
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr) : m_data(dataPtr), m_dimensions() {
+      // The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
+      EIGEN_STATIC_ASSERT((0 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE)
+    }
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+    template<typename... IndexTypes> EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) {
+      // The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
+      EIGEN_STATIC_ASSERT((sizeof...(otherDimensions) + 1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE)
+    }
+#else
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(firstDimension) {
+      // The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
+      EIGEN_STATIC_ASSERT((1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE)
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2) : m_data(dataPtr), m_dimensions(dim1, dim2) {
+      EIGEN_STATIC_ASSERT(2 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3) {
+      EIGEN_STATIC_ASSERT(3 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4) {
+      EIGEN_STATIC_ASSERT(4 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4, dim5) {
+      EIGEN_STATIC_ASSERT(5 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    }
+#endif
+
+   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, const array<Index, NumIndices>& dimensions)
+      : m_data(dataPtr), m_dimensions(dimensions)
+    { }
+
+    template <typename Dimensions>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, const Dimensions& dimensions)
+      : m_data(dataPtr), m_dimensions(dimensions)
+    { }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PlainObjectType& tensor)
+      : m_data(tensor.data()), m_dimensions(tensor.dimensions())
+    { }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Index rank() const { return m_dimensions.rank(); }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_dimensions[n]; }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Index size() const { return m_dimensions.TotalSize(); }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE PointerType data() { return m_data; }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const PointerType data() const { return m_data; }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar& operator()(const array<Index, NumIndices>& indices) const
+    {
+      //      eigen_assert(checkIndexRange(indices));
+      if (PlainObjectType::Options&RowMajor) {
+        const Index index = m_dimensions.IndexOfRowMajor(indices);
+        return m_data[index];
+      } else {
+        const Index index = m_dimensions.IndexOfColMajor(indices);
+        return m_data[index];
+      }
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar& operator()() const
+    {
+      EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE)
+      return m_data[0];
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const
+    {
+      eigen_internal_assert(index >= 0 && index < size());
+      return m_data[index];
+    }
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+    template<typename... IndexTypes> EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const
+    {
+      EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+      if (PlainObjectType::Options&RowMajor) {
+        const Index index = m_dimensions.IndexOfRowMajor(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
+        return m_data[index];
+      } else {
+        const Index index = m_dimensions.IndexOfColMajor(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
+        return m_data[index];
+      }
+    }
+#else
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const
+    {
+      if (PlainObjectType::Options&RowMajor) {
+        const Index index = i1 + i0 * m_dimensions[1];
+        return m_data[index];
+      } else {
+        const Index index = i0 + i1 * m_dimensions[0];
+        return m_data[index];
+      }
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const
+    {
+      if (PlainObjectType::Options&RowMajor) {
+         const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0);
+         return m_data[index];
+      } else {
+         const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * i2);
+        return m_data[index];
+      }
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const
+    {
+      if (PlainObjectType::Options&RowMajor) {
+        const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0));
+        return m_data[index];
+      } else {
+        const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * i3));
+        return m_data[index];
+      }
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const
+    {
+      if (PlainObjectType::Options&RowMajor) {
+        const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)));
+        return m_data[index];
+      } else {
+        const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * (i3 + m_dimensions[3] * i4)));
+        return m_data[index];
+      }
+    }
+#endif
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar& operator()(const array<Index, NumIndices>& indices)
+    {
+      //      eigen_assert(checkIndexRange(indices));
+      if (PlainObjectType::Options&RowMajor) {
+        const Index index = m_dimensions.IndexOfRowMajor(indices);
+        return m_data[index];
+      } else {
+        const Index index = m_dimensions.IndexOfColMajor(indices);
+        return m_data[index];
+      }
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar& operator()()
+    {
+      EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE)
+      return m_data[0];
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar& operator()(Index index)
+    {
+      eigen_internal_assert(index >= 0 && index < size());
+      return m_data[index];
+    }
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+    template<typename... IndexTypes> EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices)
+    {
+      static_assert(sizeof...(otherIndices) + 2 == NumIndices || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
+      const std::size_t NumDims = sizeof...(otherIndices) + 2;
+      if (PlainObjectType::Options&RowMajor) {
+        const Index index = m_dimensions.IndexOfRowMajor(array<Index, NumDims>{{firstIndex, secondIndex, otherIndices...}});
+        return m_data[index];
+      } else {
+        const Index index = m_dimensions.IndexOfColMajor(array<Index, NumDims>{{firstIndex, secondIndex, otherIndices...}});
+        return m_data[index];
+      }
+    }
+#else
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1)
+    {
+       if (PlainObjectType::Options&RowMajor) {
+         const Index index = i1 + i0 * m_dimensions[1];
+        return m_data[index];
+      } else {
+        const Index index = i0 + i1 * m_dimensions[0];
+        return m_data[index];
+      }
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2)
+    {
+       if (PlainObjectType::Options&RowMajor) {
+         const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0);
+        return m_data[index];
+      } else {
+         const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * i2);
+        return m_data[index];
+      }
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3)
+    {
+      if (PlainObjectType::Options&RowMajor) {
+        const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0));
+        return m_data[index];
+      } else {
+        const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * i3));
+        return m_data[index];
+      }
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4)
+    {
+      if (PlainObjectType::Options&RowMajor) {
+        const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)));
+        return m_data[index];
+      } else {
+        const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * (i3 + m_dimensions[3] * i4)));
+        return m_data[index];
+      }
+    }
+#endif
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Self& operator=(const Self& other)
+    {
+      typedef TensorAssignOp<Self, const Self> Assign;
+      Assign assign(*this, other);
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+      return *this;
+    }
+
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Self& operator=(const OtherDerived& other)
+    {
+      typedef TensorAssignOp<Self, const OtherDerived> Assign;
+      Assign assign(*this, other);
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+      return *this;
+    }
+
+  private:
+    typename MakePointer_<Scalar>::Type m_data;
+    Dimensions m_dimensions;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_MAP_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
new file mode 100644
index 000000000..615559d44
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -0,0 +1,218 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_META_H
+#define EIGEN_CXX11_TENSOR_TENSOR_META_H
+
+namespace Eigen {
+
+template<bool cond> struct Cond {};
+
+template<typename T1, typename T2> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+const T1& choose(Cond<true>, const T1& first, const T2&) {
+  return first;
+}
+
+template<typename T1, typename T2> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+const T2& choose(Cond<false>, const T1&, const T2& second) {
+  return second;
+}
+
+
+template <typename T, typename X, typename Y>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T divup(const X x, const Y y) {
+  return static_cast<T>((x + y - 1) / y);
+}
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T divup(const T x, const T y) {
+  return static_cast<T>((x + y - 1) / y);
+}
+
+template <size_t n> struct max_n_1 {
+  static const size_t size = n;
+};
+template <> struct max_n_1<0> {
+  static const size_t size = 1;
+};
+
+
+// Default packet types
+template <typename Scalar, typename Device>
+struct PacketType : internal::packet_traits<Scalar> {
+  typedef typename internal::packet_traits<Scalar>::type type;
+};
+
+// For CUDA packet types when using a GpuDevice
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) && defined(EIGEN_HAS_CUDA_FP16)
+template <>
+struct PacketType<half, GpuDevice> {
+  typedef half2 type;
+  static const int size = 2;
+  enum {
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasNegate = 1,
+    HasAbs    = 1,
+    HasArg    = 0,
+    HasAbs2   = 0,
+    HasMin    = 1,
+    HasMax    = 1,
+    HasConj   = 0,
+    HasSetLinear = 0,
+    HasBlend  = 0,
+
+    HasDiv    = 1,
+    HasSqrt   = 1,
+    HasRsqrt  = 1,
+    HasExp    = 1,
+    HasLog    = 1,
+    HasLog1p  = 0,
+    HasLog10  = 0,
+    HasPow    = 1,
+  };
+};
+#endif
+
+#if defined(EIGEN_USE_SYCL)
+template <typename T>
+  struct PacketType<T, SyclDevice> {
+  typedef T type;
+  static const int size = 1;
+  enum {
+    HasAdd    = 0,
+    HasSub    = 0,
+    HasMul    = 0,
+    HasNegate = 0,
+    HasAbs    = 0,
+    HasArg    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasConj   = 0,
+    HasSetLinear = 0,
+    HasBlend  = 0
+  };
+};
+#endif
+
+
+// Tuple mimics std::pair but works on e.g. nvcc.
+template <typename U, typename V> struct Tuple {
+ public:
+  U first;
+  V second;
+
+  typedef U first_type;
+  typedef V second_type;
+
+  EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Tuple() : first(), second() {}
+
+  EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Tuple(const U& f, const V& s) : first(f), second(s) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Tuple& operator= (const Tuple& rhs) {
+    if (&rhs == this) return *this;
+    first = rhs.first;
+    second = rhs.second;
+    return *this;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void swap(Tuple& rhs) {
+    using numext::swap;
+    swap(first, rhs.first);
+    swap(second, rhs.second);
+  }
+};
+
+template <typename U, typename V>
+EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+bool operator==(const Tuple<U, V>& x, const Tuple<U, V>& y) {
+  return (x.first == y.first && x.second == y.second);
+}
+
+template <typename U, typename V>
+EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+bool operator!=(const Tuple<U, V>& x, const Tuple<U, V>& y) {
+  return !(x == y);
+}
+
+
+// Can't use std::pairs on cuda devices
+template <typename Idx> struct IndexPair {
+  EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) {}
+  EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Idx f, Idx s) : first(f), second(s) {}
+
+  EIGEN_DEVICE_FUNC void set(IndexPair<Idx> val) {
+    first = val.first;
+    second = val.second;
+  }
+
+  Idx first;
+  Idx second;
+};
+
+
+#ifdef EIGEN_HAS_SFINAE
+namespace internal {
+
+  template<typename IndexType, Index... Is>
+  EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  array<Index, sizeof...(Is)> customIndices2Array(IndexType& idx, numeric_list<Index, Is...>) {
+    return { idx[Is]... };
+  }
+  template<typename IndexType>
+  EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  array<Index, 0> customIndices2Array(IndexType&, numeric_list<Index>) {
+    return array<Index, 0>();
+  }
+
+  /** Make an array (for index/dimensions) out of a custom index */
+  template<typename Index, std::size_t NumIndices, typename IndexType>
+  EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  array<Index, NumIndices> customIndices2Array(IndexType& idx) {
+    return customIndices2Array(idx, typename gen_numeric_list<Index, NumIndices>::type{});
+  }
+
+
+  template <typename B, typename D>
+  struct is_base_of
+  {
+
+    typedef char (&yes)[1];
+    typedef char (&no)[2];
+
+    template <typename BB, typename DD>
+    struct Host
+    {
+      operator BB*() const;
+      operator DD*();
+    };
+
+    template<typename T>
+    static yes check(D*, T);
+    static no check(B*, int);
+
+    static const bool value = sizeof(check(Host<B,D>(), int())) == sizeof(yes);
+  };
+
+}
+#endif
+
+
+
+}  // namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_META_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
new file mode 100644
index 000000000..d34f1e328
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -0,0 +1,888 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H
+#define EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H
+
+namespace Eigen {
+
+/** \class TensorReshaping
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief Tensor reshaping class.
+  *
+  *
+  */
+namespace internal {
+template<typename NewDimensions, typename XprType>
+struct traits<TensorReshapingOp<NewDimensions, XprType> > : public traits<XprType>
+{
+  typedef typename XprType::Scalar Scalar;
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef typename XprType::Nested Nested;
+  typedef typename remove_reference<Nested>::type _Nested;
+  static const int NumDimensions = array_size<NewDimensions>::value;
+  static const int Layout = XprTraits::Layout;
+};
+
+template<typename NewDimensions, typename XprType>
+struct eval<TensorReshapingOp<NewDimensions, XprType>, Eigen::Dense>
+{
+  typedef const TensorReshapingOp<NewDimensions, XprType>& type;
+};
+
+template<typename NewDimensions, typename XprType>
+struct nested<TensorReshapingOp<NewDimensions, XprType>, 1, typename eval<TensorReshapingOp<NewDimensions, XprType> >::type>
+{
+  typedef TensorReshapingOp<NewDimensions, XprType> type;
+};
+
+}  // end namespace internal
+
+
+
+template<typename NewDimensions, typename XprType>
+class TensorReshapingOp : public TensorBase<TensorReshapingOp<NewDimensions, XprType>, WriteAccessors>
+{
+  public:
+  typedef typename Eigen::internal::traits<TensorReshapingOp>::Scalar Scalar;
+  typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorReshapingOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorReshapingOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorReshapingOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReshapingOp(const XprType& expr, const NewDimensions& dims)
+      : m_xpr(expr), m_dims(dims) {}
+
+    EIGEN_DEVICE_FUNC
+    const NewDimensions& dimensions() const { return m_dims; }
+
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename XprType::Nested>::type&
+    expression() const { return m_xpr; }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE TensorReshapingOp& operator = (const TensorReshapingOp& other)
+    {
+      typedef TensorAssignOp<TensorReshapingOp, const TensorReshapingOp> Assign;
+      Assign assign(*this, other);
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+      return *this;
+    }
+
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE TensorReshapingOp& operator = (const OtherDerived& other)
+    {
+      typedef TensorAssignOp<TensorReshapingOp, const OtherDerived> Assign;
+      Assign assign(*this, other);
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+      return *this;
+    }
+
+  protected:
+    typename XprType::Nested m_xpr;
+    const NewDimensions m_dims;
+};
+
+
+// Eval as rvalue
+template<typename NewDimensions, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
+{
+  typedef TensorReshapingOp<NewDimensions, ArgType> XprType;
+  typedef NewDimensions Dimensions;
+
+  enum {
+    IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,  // to be implemented
+    RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device), m_dimensions(op.dimensions())
+  {
+    // The total size of the reshaped tensor must be equal to the total size
+    // of the input tensor.
+    eigen_assert(internal::array_prod(m_impl.dimensions()) == internal::array_prod(op.dimensions()));
+  }
+
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
+    return m_impl.evalSubExprsIfNeeded(data);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+    m_impl.cleanup();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+  {
+    return m_impl.coeff(index);
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+  {
+    return m_impl.template packet<LoadMode>(index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    return m_impl.costPerCoeff(vectorized);
+  }
+
+  EIGEN_DEVICE_FUNC Scalar* data() const { return const_cast<Scalar*>(m_impl.data()); }
+
+  EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+
+ protected:
+  TensorEvaluator<ArgType, Device> m_impl;
+  NewDimensions m_dimensions;
+};
+
+
+// Eval as lvalue
+template<typename NewDimensions, typename ArgType, typename Device>
+  struct TensorEvaluator<TensorReshapingOp<NewDimensions, ArgType>, Device>
+  : public TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
+
+{
+  typedef TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device> Base;
+  typedef TensorReshapingOp<NewDimensions, ArgType> XprType;
+  typedef NewDimensions Dimensions;
+
+  enum {
+    IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,  // to be implemented
+    RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+    : Base(op, device)
+  { }
+
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
+  {
+    return this->m_impl.coeffRef(index);
+  }
+  template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void writePacket(Index index, const PacketReturnType& x)
+  {
+    this->m_impl.template writePacket<StoreMode>(index, x);
+  }
+};
+
+
+/** \class TensorSlicing
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief Tensor slicing class.
+  *
+  *
+  */
+namespace internal {
+template<typename StartIndices, typename Sizes, typename XprType>
+struct traits<TensorSlicingOp<StartIndices, Sizes, XprType> > : public traits<XprType>
+{
+  typedef typename XprType::Scalar Scalar;
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef typename XprType::Nested Nested;
+  typedef typename remove_reference<Nested>::type _Nested;
+  static const int NumDimensions = array_size<StartIndices>::value;
+  static const int Layout = XprTraits::Layout;
+};
+
+template<typename StartIndices, typename Sizes, typename XprType>
+struct eval<TensorSlicingOp<StartIndices, Sizes, XprType>, Eigen::Dense>
+{
+  typedef const TensorSlicingOp<StartIndices, Sizes, XprType>& type;
+};
+
+template<typename StartIndices, typename Sizes, typename XprType>
+struct nested<TensorSlicingOp<StartIndices, Sizes, XprType>, 1, typename eval<TensorSlicingOp<StartIndices, Sizes, XprType> >::type>
+{
+  typedef TensorSlicingOp<StartIndices, Sizes, XprType> type;
+};
+
+}  // end namespace internal
+
+
+
+template<typename StartIndices, typename Sizes, typename XprType>
+class TensorSlicingOp : public TensorBase<TensorSlicingOp<StartIndices, Sizes, XprType> >
+{
+  public:
+  typedef typename Eigen::internal::traits<TensorSlicingOp>::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorSlicingOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorSlicingOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorSlicingOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSlicingOp(const XprType& expr, const StartIndices& indices, const Sizes& sizes)
+      : m_xpr(expr), m_indices(indices), m_sizes(sizes) {}
+
+    EIGEN_DEVICE_FUNC
+    const StartIndices& startIndices() const { return m_indices; }
+    EIGEN_DEVICE_FUNC
+    const Sizes& sizes() const { return m_sizes; }
+
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename XprType::Nested>::type&
+    expression() const { return m_xpr; }
+
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE TensorSlicingOp& operator = (const OtherDerived& other)
+    {
+      typedef TensorAssignOp<TensorSlicingOp, const OtherDerived> Assign;
+      Assign assign(*this, other);
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+      return *this;
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE TensorSlicingOp& operator = (const TensorSlicingOp& other)
+    {
+      typedef TensorAssignOp<TensorSlicingOp, const TensorSlicingOp> Assign;
+      Assign assign(*this, other);
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+      return *this;
+    }
+
+
+  protected:
+    typename XprType::Nested m_xpr;
+    const StartIndices m_indices;
+    const Sizes m_sizes;
+};
+
+
+// Fixme: figure out the exact threshold
+namespace {
+template <typename Index, typename Device> struct MemcpyTriggerForSlicing {
+  EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const Device& device) : threshold_(2 * device.numThreads()) { }
+  EIGEN_DEVICE_FUNC bool operator ()(Index val) const { return val > threshold_; }
+
+ private:
+  Index threshold_;
+};
+
+// It is very expensive to start the memcpy kernel on GPU: we therefore only
+// use it for large copies.
+#ifdef EIGEN_USE_GPU
+template <typename Index> struct MemcpyTriggerForSlicing<Index, GpuDevice>  {
+  EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const GpuDevice&) { }
+  EIGEN_DEVICE_FUNC bool operator ()(Index val) const { return val > 4*1024*1024; }
+};
+#endif
+}
+
+// Eval as rvalue
+template<typename StartIndices, typename Sizes, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
+{
+  typedef TensorSlicingOp<StartIndices, Sizes, ArgType> XprType;
+  static const int NumDims = internal::array_size<Sizes>::value;
+
+  enum {
+    // Alignment can't be guaranteed at compile time since it depends on the
+    // slice offsets and sizes.
+    IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,
+    RawAccess = false
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices())
+  {
+    for (std::size_t i = 0; i < internal::array_size<Dimensions>::value; ++i) {
+      eigen_assert(m_impl.dimensions()[i] >= op.sizes()[i] + op.startIndices()[i]);
+    }
+
+    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+    const Sizes& output_dims = op.sizes();
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      m_inputStrides[0] = 1;
+      for (int i = 1; i < NumDims; ++i) {
+        m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
+      }
+
+     // Don't initialize m_fastOutputStrides[0] since it won't ever be accessed.
+      m_outputStrides[0] = 1;
+      for (int i = 1; i < NumDims; ++i) {
+        m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1];
+        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
+      }
+    } else {
+      m_inputStrides[NumDims-1] = 1;
+      for (int i = NumDims - 2; i >= 0; --i) {
+        m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
+      }
+
+     // Don't initialize m_fastOutputStrides[NumDims-1] since it won't ever be accessed.
+      m_outputStrides[NumDims-1] = 1;
+      for (int i = NumDims - 2; i >= 0; --i) {
+        m_outputStrides[i] = m_outputStrides[i+1] * output_dims[i+1];
+        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
+      }
+    }
+  }
+
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef Sizes Dimensions;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    if (!NumTraits<typename internal::remove_const<Scalar>::type>::RequireInitialization && data && m_impl.data()) {
+      Index contiguous_values = 1;
+      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+        for (int i = 0; i < NumDims; ++i) {
+          contiguous_values *= dimensions()[i];
+          if (dimensions()[i] != m_impl.dimensions()[i]) {
+            break;
+          }
+        }
+      } else {
+        for (int i = NumDims-1; i >= 0; --i) {
+          contiguous_values *= dimensions()[i];
+          if (dimensions()[i] != m_impl.dimensions()[i]) {
+            break;
+          }
+        }
+      }
+      // Use memcpy if it's going to be faster than using the regular evaluation.
+      const MemcpyTriggerForSlicing<Index, Device> trigger(m_device);
+      if (trigger(contiguous_values)) {
+        Scalar* src = (Scalar*)m_impl.data();
+        for (int i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) {
+          Index offset = srcCoeff(i);
+          m_device.memcpy((void*)(data+i), src+offset, contiguous_values * sizeof(Scalar));
+        }
+        return false;
+      }
+    }
+    return true;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+    m_impl.cleanup();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+  {
+    return m_impl.coeff(srcCoeff(index));
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+  {
+    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+    EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index+packetSize-1 < internal::array_prod(dimensions()));
+
+    Index inputIndices[] = {0, 0};
+    Index indices[] = {index, index + packetSize - 1};
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = NumDims - 1; i > 0; --i) {
+        const Index idx0 = indices[0] / m_fastOutputStrides[i];
+        const Index idx1 = indices[1] / m_fastOutputStrides[i];
+        inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i];
+        inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i];
+        indices[0] -= idx0 * m_outputStrides[i];
+        indices[1] -= idx1 * m_outputStrides[i];
+      }
+      inputIndices[0] += (indices[0] + m_offsets[0]);
+      inputIndices[1] += (indices[1] + m_offsets[0]);
+    } else {
+      for (int i = 0; i < NumDims - 1; ++i) {
+        const Index idx0 = indices[0] / m_fastOutputStrides[i];
+        const Index idx1 = indices[1] / m_fastOutputStrides[i];
+        inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i];
+        inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i];
+        indices[0] -= idx0 * m_outputStrides[i];
+        indices[1] -= idx1 * m_outputStrides[i];
+      }
+      inputIndices[0] += (indices[0] + m_offsets[NumDims-1]);
+      inputIndices[1] += (indices[1] + m_offsets[NumDims-1]);
+    }
+    if (inputIndices[1] - inputIndices[0] == packetSize - 1) {
+      PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]);
+      return rslt;
+    }
+    else {
+      EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
+      values[0] = m_impl.coeff(inputIndices[0]);
+      values[packetSize-1] = m_impl.coeff(inputIndices[1]);
+      for (int i = 1; i < packetSize-1; ++i) {
+        values[i] = coeff(index+i);
+      }
+      PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+      return rslt;
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, NumDims);
+  }
+
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const {
+    Scalar* result = m_impl.data();
+    if (result) {
+      Index offset = 0;
+      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+        for (int i = 0; i < NumDims; ++i) {
+          if (m_dimensions[i] != m_impl.dimensions()[i]) {
+            offset += m_offsets[i] * m_inputStrides[i];
+            for (int j = i+1; j < NumDims; ++j) {
+              if (m_dimensions[j] > 1) {
+                return NULL;
+              }
+              offset += m_offsets[j] * m_inputStrides[j];
+            }
+            break;
+          }
+        }
+      } else {
+        for (int i = NumDims - 1; i >= 0; --i) {
+          if (m_dimensions[i] != m_impl.dimensions()[i]) {
+            offset += m_offsets[i] * m_inputStrides[i];
+            for (int j = i-1; j >= 0; --j) {
+              if (m_dimensions[j] > 1) {
+                return NULL;
+              }
+              offset += m_offsets[j] * m_inputStrides[j];
+            }
+            break;
+          }
+        }
+      }
+      return result + offset;
+    }
+    return NULL;
+  }
+
+ protected:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
+  {
+    Index inputIndex = 0;
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = NumDims - 1; i > 0; --i) {
+        const Index idx = index / m_fastOutputStrides[i];
+        inputIndex += (idx + m_offsets[i]) * m_inputStrides[i];
+        index -= idx * m_outputStrides[i];
+      }
+      inputIndex += (index + m_offsets[0]);
+    } else {
+      for (int i = 0; i < NumDims - 1; ++i) {
+        const Index idx = index / m_fastOutputStrides[i];
+        inputIndex += (idx + m_offsets[i]) * m_inputStrides[i];
+        index -= idx * m_outputStrides[i];
+      }
+      inputIndex += (index + m_offsets[NumDims-1]);
+    }
+    return inputIndex;
+  }
+
+  array<Index, NumDims> m_outputStrides;
+  array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
+  array<Index, NumDims> m_inputStrides;
+  TensorEvaluator<ArgType, Device> m_impl;
+  const Device& m_device;
+  Dimensions m_dimensions;
+  const StartIndices m_offsets;
+};
+
+
+// Eval as lvalue
+template<typename StartIndices, typename Sizes, typename ArgType, typename Device>
+struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
+  : public TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
+{
+  typedef TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device> Base;
+  typedef TensorSlicingOp<StartIndices, Sizes, ArgType> XprType;
+  static const int NumDims = internal::array_size<Sizes>::value;
+
+  enum {
+    IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,
+    RawAccess = false
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+    : Base(op, device)
+    { }
+
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef Sizes Dimensions;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
+  {
+    return this->m_impl.coeffRef(this->srcCoeff(index));
+  }
+
+  template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void writePacket(Index index, const PacketReturnType& x)
+  {
+    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+    Index inputIndices[] = {0, 0};
+    Index indices[] = {index, index + packetSize - 1};
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = NumDims - 1; i > 0; --i) {
+        const Index idx0 = indices[0] / this->m_fastOutputStrides[i];
+        const Index idx1 = indices[1] / this->m_fastOutputStrides[i];
+        inputIndices[0] += (idx0 + this->m_offsets[i]) * this->m_inputStrides[i];
+        inputIndices[1] += (idx1 + this->m_offsets[i]) * this->m_inputStrides[i];
+        indices[0] -= idx0 * this->m_outputStrides[i];
+        indices[1] -= idx1 * this->m_outputStrides[i];
+      }
+      inputIndices[0] += (indices[0] + this->m_offsets[0]);
+      inputIndices[1] += (indices[1] + this->m_offsets[0]);
+    } else {
+      for (int i = 0; i < NumDims - 1; ++i) {
+        const Index idx0 = indices[0] / this->m_fastOutputStrides[i];
+        const Index idx1 = indices[1] / this->m_fastOutputStrides[i];
+        inputIndices[0] += (idx0 + this->m_offsets[i]) * this->m_inputStrides[i];
+        inputIndices[1] += (idx1 + this->m_offsets[i]) * this->m_inputStrides[i];
+        indices[0] -= idx0 * this->m_outputStrides[i];
+        indices[1] -= idx1 * this->m_outputStrides[i];
+      }
+      inputIndices[0] += (indices[0] + this->m_offsets[NumDims-1]);
+      inputIndices[1] += (indices[1] + this->m_offsets[NumDims-1]);
+    }
+    if (inputIndices[1] - inputIndices[0] == packetSize - 1) {
+      this->m_impl.template writePacket<StoreMode>(inputIndices[0], x);
+    }
+    else {
+      EIGEN_ALIGN_MAX CoeffReturnType values[packetSize];
+      internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
+      this->m_impl.coeffRef(inputIndices[0]) = values[0];
+      this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1];
+      for (int i = 1; i < packetSize-1; ++i) {
+        this->coeffRef(index+i) = values[i];
+      }
+    }
+  }
+};
+
+
+
+namespace internal {
+template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>
+struct traits<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> > : public traits<XprType>
+{
+  typedef typename XprType::Scalar Scalar;
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef typename XprType::Nested Nested;
+  typedef typename remove_reference<Nested>::type _Nested;
+  static const int NumDimensions = array_size<StartIndices>::value;
+  static const int Layout = XprTraits::Layout;
+};
+
+template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>
+struct eval<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Eigen::Dense>
+{
+  typedef const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>& type;
+};
+
+template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>
+struct nested<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, 1, typename eval<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> >::type>
+{
+  typedef TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> type;
+};
+
+}  // end namespace internal
+
+
+template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>
+class TensorStridingSlicingOp : public TensorBase<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> >
+{
+  public:
+  typedef typename internal::traits<TensorStridingSlicingOp>::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename internal::nested<TensorStridingSlicingOp>::type Nested;
+  typedef typename internal::traits<TensorStridingSlicingOp>::StorageKind StorageKind;
+  typedef typename internal::traits<TensorStridingSlicingOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingSlicingOp(
+    const XprType& expr, const StartIndices& startIndices,
+    const StopIndices& stopIndices, const Strides& strides)
+      : m_xpr(expr), m_startIndices(startIndices), m_stopIndices(stopIndices),
+        m_strides(strides) {}
+
+    EIGEN_DEVICE_FUNC
+    const StartIndices& startIndices() const { return m_startIndices; }
+    EIGEN_DEVICE_FUNC
+    const StartIndices& stopIndices() const { return m_stopIndices; }
+    EIGEN_DEVICE_FUNC
+    const StartIndices& strides() const { return m_strides; }
+
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename XprType::Nested>::type&
+    expression() const { return m_xpr; }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE TensorStridingSlicingOp& operator = (const TensorStridingSlicingOp& other)
+    {
+      typedef TensorAssignOp<TensorStridingSlicingOp, const TensorStridingSlicingOp> Assign;
+      Assign assign(*this, other);
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(
+          assign, DefaultDevice());
+      return *this;
+    }
+
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE TensorStridingSlicingOp& operator = (const OtherDerived& other)
+    {
+      typedef TensorAssignOp<TensorStridingSlicingOp, const OtherDerived> Assign;
+      Assign assign(*this, other);
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(
+          assign, DefaultDevice());
+      return *this;
+    }
+
+  protected:
+    typename XprType::Nested m_xpr;
+    const StartIndices m_startIndices;
+    const StopIndices m_stopIndices;
+    const Strides m_strides;
+};
+
+// Eval as rvalue
+template<typename StartIndices, typename StopIndices, typename Strides, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device>
+{
+  typedef TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType> XprType;
+  static const int NumDims = internal::array_size<Strides>::value;
+
+  enum {
+    // Alignment can't be guaranteed at compile time since it depends on the
+    // slice offsets and sizes.
+    IsAligned = false,
+    PacketAccess = false,
+    BlockAccess = false,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    RawAccess = false
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device), m_device(device), m_strides(op.strides())
+  {
+    // Handle degenerate intervals by gracefully clamping and allowing m_dimensions to be zero
+    DSizes<Index,NumDims> startIndicesClamped, stopIndicesClamped;
+    for (size_t i = 0; i < internal::array_size<Dimensions>::value; ++i) {
+      eigen_assert(m_strides[i] != 0 && "0 stride is invalid");
+      if(m_strides[i]>0){
+        startIndicesClamped[i] = clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]);
+        stopIndicesClamped[i] = clamp(op.stopIndices()[i], 0, m_impl.dimensions()[i]);
+      }else{
+        /* implies m_strides[i]<0 by assert */
+        startIndicesClamped[i] = clamp(op.startIndices()[i], -1, m_impl.dimensions()[i] - 1);
+        stopIndicesClamped[i] = clamp(op.stopIndices()[i], -1, m_impl.dimensions()[i] - 1);
+      }
+      m_startIndices[i] = startIndicesClamped[i];
+    }
+
+    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+
+    // check for degenerate intervals and compute output tensor shape
+    bool degenerate = false;;
+    for(int i = 0; i < NumDims; i++){
+      Index interval = stopIndicesClamped[i] - startIndicesClamped[i];
+      if(interval == 0 || ((interval<0) != (m_strides[i]<0))){
+        m_dimensions[i] = 0;
+        degenerate = true;
+      }else{
+        m_dimensions[i] = interval / m_strides[i]
+                          + (interval % m_strides[i] != 0 ? 1 : 0);
+        eigen_assert(m_dimensions[i] >= 0);
+      }
+    }
+    Strides output_dims = m_dimensions;
+
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      m_inputStrides[0] = m_strides[0];
+      m_offsets[0] = startIndicesClamped[0];
+      Index previousDimProduct = 1;
+      for (int i = 1; i < NumDims; ++i) {
+        previousDimProduct *= input_dims[i-1];
+        m_inputStrides[i] = previousDimProduct * m_strides[i];
+        m_offsets[i] = startIndicesClamped[i] * previousDimProduct;
+      }
+
+      // Don't initialize m_fastOutputStrides[0] since it won't ever be accessed.
+      m_outputStrides[0] = 1;
+      for (int i = 1; i < NumDims; ++i) {
+        m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1];
+        // NOTE: if tensor is degenerate, we send 1 to prevent TensorIntDivisor constructor crash
+        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(degenerate ? 1 : m_outputStrides[i]);
+      }
+    } else {
+      m_inputStrides[NumDims-1] = m_strides[NumDims-1];
+      m_offsets[NumDims-1] = startIndicesClamped[NumDims-1];
+      Index previousDimProduct = 1;
+      for (int i = NumDims - 2; i >= 0; --i) {
+        previousDimProduct *= input_dims[i+1];
+        m_inputStrides[i] = previousDimProduct * m_strides[i];
+        m_offsets[i] = startIndicesClamped[i] * previousDimProduct;
+      }
+
+      m_outputStrides[NumDims-1] = 1;
+      for (int i = NumDims - 2; i >= 0; --i) {
+        m_outputStrides[i] = m_outputStrides[i+1] * output_dims[i+1];
+        // NOTE: if tensor is degenerate, we send 1 to prevent TensorIntDivisor constructor crash
+        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(degenerate ? 1 : m_outputStrides[i]);
+      }
+    }
+    m_block_total_size_max = numext::maxi(static_cast<std::size_t>(1),
+                                          device.lastLevelCacheSize() /
+                                          sizeof(Scalar));
+  }
+
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef Strides Dimensions;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+    m_impl.cleanup();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+  {
+    return m_impl.coeff(srcCoeff(index));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, NumDims);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const {
+    return NULL;
+  }
+
+ protected:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
+  {
+    Index inputIndex = 0;
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = NumDims - 1; i >= 0; --i) {
+        const Index idx = index / m_fastOutputStrides[i];
+        inputIndex += idx * m_inputStrides[i] + m_offsets[i];
+        index -= idx * m_outputStrides[i];
+      }
+    } else {
+      for (int i = 0; i < NumDims; ++i) {
+        const Index idx = index / m_fastOutputStrides[i];
+        inputIndex += idx * m_inputStrides[i] + m_offsets[i];
+        index -= idx * m_outputStrides[i];
+      }
+    }
+    return inputIndex;
+  }
+
+  static EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) {
+    return numext::maxi(min, numext::mini(max,value));
+  }
+
+  array<Index, NumDims> m_outputStrides;
+  array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
+  array<Index, NumDims> m_inputStrides;
+  TensorEvaluator<ArgType, Device> m_impl;
+  const Device& m_device;
+  DSizes<Index, NumDims> m_startIndices; // clamped startIndices
+  DSizes<Index, NumDims> m_dimensions;
+  DSizes<Index, NumDims> m_offsets; // offset in a flattened shape
+  const Strides m_strides;
+  std::size_t m_block_total_size_max;
+};
+
+// Eval as lvalue
+template<typename StartIndices, typename StopIndices, typename Strides, typename ArgType, typename Device>
+struct TensorEvaluator<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device>
+  : public TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device>
+{
+  typedef TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device> Base;
+  typedef TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType> XprType;
+  static const int NumDims = internal::array_size<Strides>::value;
+
+  enum {
+    IsAligned = false,
+    PacketAccess = false,
+    BlockAccess = false,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = TensorEvaluator<ArgType, Device>::CoordAccess,
+    RawAccess = false
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+    : Base(op, device)
+    { }
+
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef Strides Dimensions;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
+  {
+    return this->m_impl.coeffRef(this->srcCoeff(index));
+  }
+};
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
new file mode 100644
index 000000000..647bcf108
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
@@ -0,0 +1,397 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_PADDING_H
+#define EIGEN_CXX11_TENSOR_TENSOR_PADDING_H
+
+namespace Eigen {
+
+/** \class TensorPadding
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief Tensor padding class.
+  * At the moment only padding with a constant value is supported.
+  *
+  */
+namespace internal {
+template<typename PaddingDimensions, typename XprType>
+struct traits<TensorPaddingOp<PaddingDimensions, XprType> > : public traits<XprType>
+{
+  typedef typename XprType::Scalar Scalar;
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef typename XprType::Nested Nested;
+  typedef typename remove_reference<Nested>::type _Nested;
+  static const int NumDimensions = XprTraits::NumDimensions;
+  static const int Layout = XprTraits::Layout;
+};
+
+template<typename PaddingDimensions, typename XprType>
+struct eval<TensorPaddingOp<PaddingDimensions, XprType>, Eigen::Dense>
+{
+  typedef const TensorPaddingOp<PaddingDimensions, XprType>& type;
+};
+
+template<typename PaddingDimensions, typename XprType>
+struct nested<TensorPaddingOp<PaddingDimensions, XprType>, 1, typename eval<TensorPaddingOp<PaddingDimensions, XprType> >::type>
+{
+  typedef TensorPaddingOp<PaddingDimensions, XprType> type;
+};
+
+}  // end namespace internal
+
+
+
+template<typename PaddingDimensions, typename XprType>
+class TensorPaddingOp : public TensorBase<TensorPaddingOp<PaddingDimensions, XprType>, ReadOnlyAccessors>
+{
+  public:
+  typedef typename Eigen::internal::traits<TensorPaddingOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorPaddingOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorPaddingOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorPaddingOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPaddingOp(const XprType& expr, const PaddingDimensions& padding_dims, const Scalar padding_value)
+      : m_xpr(expr), m_padding_dims(padding_dims), m_padding_value(padding_value) {}
+
+    EIGEN_DEVICE_FUNC
+    const PaddingDimensions& padding() const { return m_padding_dims; }
+    EIGEN_DEVICE_FUNC
+    Scalar padding_value() const { return m_padding_value; }
+
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename XprType::Nested>::type&
+    expression() const { return m_xpr; }
+
+  protected:
+    typename XprType::Nested m_xpr;
+    const PaddingDimensions m_padding_dims;
+    const Scalar m_padding_value;
+};
+
+
+// Eval as rvalue
+template<typename PaddingDimensions, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device>
+{
+  typedef TensorPaddingOp<PaddingDimensions, ArgType> XprType;
+  typedef typename XprType::Index Index;
+  static const int NumDims = internal::array_size<PaddingDimensions>::value;
+  typedef DSizes<Index, NumDims> Dimensions;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+
+  enum {
+    IsAligned = true,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = true,
+    RawAccess = false
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value())
+  {
+    // The padding op doesn't change the rank of the tensor. Directly padding a scalar would lead
+    // to a vector, which doesn't make sense. Instead one should reshape the scalar into a vector
+    // of 1 element first and then pad.
+    EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+    // Compute dimensions
+    m_dimensions = m_impl.dimensions();
+    for (int i = 0; i < NumDims; ++i) {
+      m_dimensions[i] += m_padding[i].first + m_padding[i].second;
+    }
+    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      m_inputStrides[0] = 1;
+      m_outputStrides[0] = 1;
+      for (int i = 1; i < NumDims; ++i) {
+        m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
+        m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
+      }
+      m_outputStrides[NumDims] = m_outputStrides[NumDims-1] * m_dimensions[NumDims-1];
+    } else {
+      m_inputStrides[NumDims - 1] = 1;
+      m_outputStrides[NumDims] = 1;
+      for (int i = NumDims - 2; i >= 0; --i) {
+        m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
+        m_outputStrides[i+1] = m_outputStrides[i+2] * m_dimensions[i+1];
+      }
+      m_outputStrides[0] = m_outputStrides[1] * m_dimensions[0];
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+    m_impl.cleanup();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+  {
+    eigen_assert(index < dimensions().TotalSize());
+    Index inputIndex = 0;
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = NumDims - 1; i > 0; --i) {
+        const Index idx = index / m_outputStrides[i];
+        if (isPaddingAtIndexForDim(idx, i)) {
+          return m_paddingValue;
+        }
+        inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
+        index -= idx * m_outputStrides[i];
+      }
+      if (isPaddingAtIndexForDim(index, 0)) {
+        return m_paddingValue;
+      }
+      inputIndex += (index - m_padding[0].first);
+    } else {
+      for (int i = 0; i < NumDims - 1; ++i) {
+        const Index idx = index / m_outputStrides[i+1];
+        if (isPaddingAtIndexForDim(idx, i)) {
+          return m_paddingValue;
+        }
+        inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
+        index -= idx * m_outputStrides[i+1];
+      }
+      if (isPaddingAtIndexForDim(index, NumDims-1)) {
+        return m_paddingValue;
+      }
+      inputIndex += (index - m_padding[NumDims-1].first);
+    }
+    return m_impl.coeff(inputIndex);
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+  {
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      return packetColMajor(index);
+    }
+    return packetRowMajor(index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    TensorOpCost cost = m_impl.costPerCoeff(vectorized);
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = 0; i < NumDims; ++i)
+        updateCostPerDimension(cost, i, i == 0);
+    } else {
+      for (int i = NumDims - 1; i >= 0; --i)
+        updateCostPerDimension(cost, i, i == NumDims - 1);
+    }
+    return cost;
+  }
+
+  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+
+ private:
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isPaddingAtIndexForDim(
+      Index index, int dim_index) const {
+#if defined(EIGEN_HAS_INDEX_LIST)
+    return (!internal::index_pair_first_statically_eq<PaddingDimensions>(dim_index, 0) &&
+            index < m_padding[dim_index].first) ||
+        (!internal::index_pair_second_statically_eq<PaddingDimensions>(dim_index, 0) &&
+         index >= m_dimensions[dim_index] - m_padding[dim_index].second);
+#else
+    return (index < m_padding[dim_index].first) ||
+           (index >= m_dimensions[dim_index] - m_padding[dim_index].second);
+#endif
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isLeftPaddingCompileTimeZero(
+      int dim_index) const {
+#if defined(EIGEN_HAS_INDEX_LIST)
+    return internal::index_pair_first_statically_eq<PaddingDimensions>(dim_index, 0);
+#else
+    EIGEN_UNUSED_VARIABLE(dim_index);
+    return false;
+#endif
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isRightPaddingCompileTimeZero(
+      int dim_index) const {
+#if defined(EIGEN_HAS_INDEX_LIST)
+    return internal::index_pair_second_statically_eq<PaddingDimensions>(dim_index, 0);
+#else
+    EIGEN_UNUSED_VARIABLE(dim_index);
+    return false;
+#endif
+  }
+
+
+  void updateCostPerDimension(TensorOpCost& cost, int i, bool first) const {
+    const double in = static_cast<double>(m_impl.dimensions()[i]);
+    const double out = in + m_padding[i].first + m_padding[i].second;
+    if (out == 0)
+      return;
+    const double reduction = in / out;
+    cost *= reduction;
+    if (first) {
+      cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost<Index>() +
+                    reduction * (1 * TensorOpCost::AddCost<Index>()));
+    } else {
+      cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost<Index>() +
+                                 2 * TensorOpCost::MulCost<Index>() +
+                    reduction * (2 * TensorOpCost::MulCost<Index>() +
+                                 1 * TensorOpCost::DivCost<Index>()));
+    }
+  }
+
+ protected:
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const
+  {
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
+
+    const Index initialIndex = index;
+    Index inputIndex = 0;
+    for (int i = NumDims - 1; i > 0; --i) {
+      const Index first = index;
+      const Index last = index + PacketSize - 1;
+      const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i];
+      const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i];
+      const Index lastPaddedRight = m_outputStrides[i+1];
+
+      if (!isLeftPaddingCompileTimeZero(i) && last < lastPaddedLeft) {
+        // all the coefficient are in the padding zone.
+        return internal::pset1<PacketReturnType>(m_paddingValue);
+      }
+      else if (!isRightPaddingCompileTimeZero(i) && first >= firstPaddedRight && last < lastPaddedRight) {
+        // all the coefficient are in the padding zone.
+        return internal::pset1<PacketReturnType>(m_paddingValue);
+      }
+      else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (first >= lastPaddedLeft && last < firstPaddedRight)) {
+        // all the coefficient are between the 2 padding zones.
+        const Index idx = index / m_outputStrides[i];
+        inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
+        index -= idx * m_outputStrides[i];
+      }
+      else {
+        // Every other case
+        return packetWithPossibleZero(initialIndex);
+      }
+    }
+
+    const Index last = index + PacketSize - 1;
+    const Index first = index;
+    const Index lastPaddedLeft = m_padding[0].first;
+    const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second);
+    const Index lastPaddedRight = m_outputStrides[1];
+
+    if (!isLeftPaddingCompileTimeZero(0) && last < lastPaddedLeft) {
+      // all the coefficient are in the padding zone.
+      return internal::pset1<PacketReturnType>(m_paddingValue);
+    }
+    else if (!isRightPaddingCompileTimeZero(0) && first >= firstPaddedRight && last < lastPaddedRight) {
+      // all the coefficient are in the padding zone.
+      return internal::pset1<PacketReturnType>(m_paddingValue);
+    }
+    else if ((isLeftPaddingCompileTimeZero(0) && isRightPaddingCompileTimeZero(0)) || (first >= lastPaddedLeft && last < firstPaddedRight)) {
+      // all the coefficient are between the 2 padding zones.
+      inputIndex += (index - m_padding[0].first);
+      return m_impl.template packet<Unaligned>(inputIndex);
+    }
+    // Every other case
+    return packetWithPossibleZero(initialIndex);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const
+  {
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
+
+    const Index initialIndex = index;
+    Index inputIndex = 0;
+
+    for (int i = 0; i < NumDims - 1; ++i) {
+      const Index first = index;
+      const Index last = index + PacketSize - 1;
+      const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i+1];
+      const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i+1];
+      const Index lastPaddedRight = m_outputStrides[i];
+
+      if (!isLeftPaddingCompileTimeZero(i) && last < lastPaddedLeft) {
+        // all the coefficient are in the padding zone.
+        return internal::pset1<PacketReturnType>(m_paddingValue);
+      }
+      else if (!isRightPaddingCompileTimeZero(i) && first >= firstPaddedRight && last < lastPaddedRight) {
+        // all the coefficient are in the padding zone.
+        return internal::pset1<PacketReturnType>(m_paddingValue);
+      }
+      else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (first >= lastPaddedLeft && last < firstPaddedRight)) {
+        // all the coefficient are between the 2 padding zones.
+        const Index idx = index / m_outputStrides[i+1];
+        inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
+        index -= idx * m_outputStrides[i+1];
+      }
+      else {
+        // Every other case
+        return packetWithPossibleZero(initialIndex);
+      }
+    }
+
+    const Index last = index + PacketSize - 1;
+    const Index first = index;
+    const Index lastPaddedLeft = m_padding[NumDims-1].first;
+    const Index firstPaddedRight = (m_dimensions[NumDims-1] - m_padding[NumDims-1].second);
+    const Index lastPaddedRight = m_outputStrides[NumDims-1];
+
+    if (!isLeftPaddingCompileTimeZero(NumDims-1) && last < lastPaddedLeft) {
+      // all the coefficient are in the padding zone.
+      return internal::pset1<PacketReturnType>(m_paddingValue);
+    }
+    else if (!isRightPaddingCompileTimeZero(NumDims-1) && first >= firstPaddedRight && last < lastPaddedRight) {
+      // all the coefficient are in the padding zone.
+      return internal::pset1<PacketReturnType>(m_paddingValue);
+    }
+    else if ((isLeftPaddingCompileTimeZero(NumDims-1) && isRightPaddingCompileTimeZero(NumDims-1)) || (first >= lastPaddedLeft && last < firstPaddedRight)) {
+      // all the coefficient are between the 2 padding zones.
+      inputIndex += (index - m_padding[NumDims-1].first);
+      return m_impl.template packet<Unaligned>(inputIndex);
+    }
+    // Every other case
+    return packetWithPossibleZero(initialIndex);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
+  {
+    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+    for (int i = 0; i < PacketSize; ++i) {
+      values[i] = coeff(index+i);
+    }
+    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+    return rslt;
+  }
+
+  Dimensions m_dimensions;
+  array<Index, NumDims+1> m_outputStrides;
+  array<Index, NumDims> m_inputStrides;
+  TensorEvaluator<ArgType, Device> m_impl;
+  PaddingDimensions m_padding;
+
+  Scalar m_paddingValue;
+};
+
+
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_PADDING_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
new file mode 100644
index 000000000..886a254f6
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
@@ -0,0 +1,269 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_PATCH_H
+#define EIGEN_CXX11_TENSOR_TENSOR_PATCH_H
+
+namespace Eigen {
+
+/** \class TensorPatch
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief Tensor patch class.
+  *
+  *
+  */
+namespace internal {
+template<typename PatchDim, typename XprType>
+struct traits<TensorPatchOp<PatchDim, XprType> > : public traits<XprType>
+{
+  typedef typename XprType::Scalar Scalar;
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef typename XprType::Nested Nested;
+  typedef typename remove_reference<Nested>::type _Nested;
+  static const int NumDimensions = XprTraits::NumDimensions + 1;
+  static const int Layout = XprTraits::Layout;
+};
+
+template<typename PatchDim, typename XprType>
+struct eval<TensorPatchOp<PatchDim, XprType>, Eigen::Dense>
+{
+  typedef const TensorPatchOp<PatchDim, XprType>& type;
+};
+
+template<typename PatchDim, typename XprType>
+struct nested<TensorPatchOp<PatchDim, XprType>, 1, typename eval<TensorPatchOp<PatchDim, XprType> >::type>
+{
+  typedef TensorPatchOp<PatchDim, XprType> type;
+};
+
+}  // end namespace internal
+
+
+
+template<typename PatchDim, typename XprType>
+class TensorPatchOp : public TensorBase<TensorPatchOp<PatchDim, XprType>, ReadOnlyAccessors>
+{
+  public:
+  typedef typename Eigen::internal::traits<TensorPatchOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorPatchOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorPatchOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorPatchOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPatchOp(const XprType& expr, const PatchDim& patch_dims)
+      : m_xpr(expr), m_patch_dims(patch_dims) {}
+
+    EIGEN_DEVICE_FUNC
+    const PatchDim& patch_dims() const { return m_patch_dims; }
+
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename XprType::Nested>::type&
+    expression() const { return m_xpr; }
+
+  protected:
+    typename XprType::Nested m_xpr;
+    const PatchDim m_patch_dims;
+};
+
+
+// Eval as rvalue
+template<typename PatchDim, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
+{
+  typedef TensorPatchOp<PatchDim, ArgType> XprType;
+  typedef typename XprType::Index Index;
+  static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value + 1;
+  typedef DSizes<Index, NumDims> Dimensions;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+
+
+  enum {
+    IsAligned = false,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,
+    RawAccess = false
+ };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device)
+  {
+    Index num_patches = 1;
+    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+    const PatchDim& patch_dims = op.patch_dims();
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = 0; i < NumDims-1; ++i) {
+        m_dimensions[i] = patch_dims[i];
+        num_patches *= (input_dims[i] - patch_dims[i] + 1);
+      }
+      m_dimensions[NumDims-1] = num_patches;
+
+      m_inputStrides[0] = 1;
+      m_patchStrides[0] = 1;
+      for (int i = 1; i < NumDims-1; ++i) {
+        m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
+        m_patchStrides[i] = m_patchStrides[i-1] * (input_dims[i-1] - patch_dims[i-1] + 1);
+      }
+      m_outputStrides[0] = 1;
+      for (int i = 1; i < NumDims; ++i) {
+        m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
+      }
+    } else {
+      for (int i = 0; i < NumDims-1; ++i) {
+        m_dimensions[i+1] = patch_dims[i];
+        num_patches *= (input_dims[i] - patch_dims[i] + 1);
+      }
+      m_dimensions[0] = num_patches;
+
+      m_inputStrides[NumDims-2] = 1;
+      m_patchStrides[NumDims-2] = 1;
+      for (int i = NumDims-3; i >= 0; --i) {
+        m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
+        m_patchStrides[i] = m_patchStrides[i+1] * (input_dims[i+1] - patch_dims[i+1] + 1);
+      }
+      m_outputStrides[NumDims-1] = 1;
+      for (int i = NumDims-2; i >= 0; --i) {
+        m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1];
+      }
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+    m_impl.cleanup();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+  {
+    Index output_stride_index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? NumDims - 1 : 0;
+    // Find the location of the first element of the patch.
+    Index patchIndex = index / m_outputStrides[output_stride_index];
+    // Find the offset of the element wrt the location of the first element.
+    Index patchOffset = index - patchIndex * m_outputStrides[output_stride_index];
+    Index inputIndex = 0;
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = NumDims - 2; i > 0; --i) {
+        const Index patchIdx = patchIndex / m_patchStrides[i];
+        patchIndex -= patchIdx * m_patchStrides[i];
+        const Index offsetIdx = patchOffset / m_outputStrides[i];
+        patchOffset -= offsetIdx * m_outputStrides[i];
+        inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i];
+      }
+    } else {
+      for (int i = 0; i < NumDims - 2; ++i) {
+        const Index patchIdx = patchIndex / m_patchStrides[i];
+        patchIndex -= patchIdx * m_patchStrides[i];
+        const Index offsetIdx = patchOffset / m_outputStrides[i+1];
+        patchOffset -= offsetIdx * m_outputStrides[i+1];
+        inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i];
+      }
+    }
+    inputIndex += (patchIndex + patchOffset);
+    return m_impl.coeff(inputIndex);
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+  {
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
+
+    Index output_stride_index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? NumDims - 1 : 0;
+    Index indices[2] = {index, index + PacketSize - 1};
+    Index patchIndices[2] = {indices[0] / m_outputStrides[output_stride_index],
+                             indices[1] / m_outputStrides[output_stride_index]};
+    Index patchOffsets[2] = {indices[0] - patchIndices[0] * m_outputStrides[output_stride_index],
+                             indices[1] - patchIndices[1] * m_outputStrides[output_stride_index]};
+
+    Index inputIndices[2] = {0, 0};
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = NumDims - 2; i > 0; --i) {
+        const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i],
+                                   patchIndices[1] / m_patchStrides[i]};
+        patchIndices[0] -= patchIdx[0] * m_patchStrides[i];
+        patchIndices[1] -= patchIdx[1] * m_patchStrides[i];
+
+        const Index offsetIdx[2] = {patchOffsets[0] / m_outputStrides[i],
+                                    patchOffsets[1] / m_outputStrides[i]};
+        patchOffsets[0] -= offsetIdx[0] * m_outputStrides[i];
+        patchOffsets[1] -= offsetIdx[1] * m_outputStrides[i];
+
+        inputIndices[0] += (patchIdx[0] + offsetIdx[0]) * m_inputStrides[i];
+        inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i];
+      }
+    } else {
+      for (int i = 0; i < NumDims - 2; ++i) {
+        const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i],
+                                   patchIndices[1] / m_patchStrides[i]};
+        patchIndices[0] -= patchIdx[0] * m_patchStrides[i];
+        patchIndices[1] -= patchIdx[1] * m_patchStrides[i];
+
+        const Index offsetIdx[2] = {patchOffsets[0] / m_outputStrides[i+1],
+                                    patchOffsets[1] / m_outputStrides[i+1]};
+        patchOffsets[0] -= offsetIdx[0] * m_outputStrides[i+1];
+        patchOffsets[1] -= offsetIdx[1] * m_outputStrides[i+1];
+
+        inputIndices[0] += (patchIdx[0] + offsetIdx[0]) * m_inputStrides[i];
+        inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i];
+      }
+    }
+    inputIndices[0] += (patchIndices[0] + patchOffsets[0]);
+    inputIndices[1] += (patchIndices[1] + patchOffsets[1]);
+
+    if (inputIndices[1] - inputIndices[0] == PacketSize - 1) {
+      PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]);
+      return rslt;
+    }
+    else {
+      EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize];
+      values[0] = m_impl.coeff(inputIndices[0]);
+      values[PacketSize-1] = m_impl.coeff(inputIndices[1]);
+      for (int i = 1; i < PacketSize-1; ++i) {
+        values[i] = coeff(index+i);
+      }
+      PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+      return rslt;
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    const double compute_cost = NumDims * (TensorOpCost::DivCost<Index>() +
+                                           TensorOpCost::MulCost<Index>() +
+                                           2 * TensorOpCost::AddCost<Index>());
+    return m_impl.costPerCoeff(vectorized) +
+           TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
+  }
+
+  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+
+ protected:
+  Dimensions m_dimensions;
+  array<Index, NumDims> m_outputStrides;
+  array<Index, NumDims-1> m_inputStrides;
+  array<Index, NumDims-1> m_patchStrides;
+
+  TensorEvaluator<ArgType, Device> m_impl;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_PATCH_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
new file mode 100644
index 000000000..1655a813e
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
@@ -0,0 +1,276 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
+#define EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
+
+namespace Eigen {
+namespace internal {
+
+namespace {
+
+EIGEN_DEVICE_FUNC uint64_t get_random_seed() {
+#ifdef __CUDA_ARCH__
+  // We don't support 3d kernels since we currently only use 1 and
+  // 2d kernels.
+  assert(threadIdx.z == 0);
+  return clock64() +
+      blockIdx.x * blockDim.x + threadIdx.x +
+      gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y);
+
+#elif defined _WIN32
+  // Use the current time as a baseline.
+  SYSTEMTIME st;
+  GetSystemTime(&st);
+  int time = st.wSecond + 1000 * st.wMilliseconds;
+  // Mix in a random number to make sure that we get different seeds if
+  // we try to generate seeds faster than the clock resolution.
+  // We need 2 random values since the generator only generate 16 bits at
+  // a time (https://msdn.microsoft.com/en-us/library/398ax69y.aspx)
+  int rnd1 = ::rand();
+  int rnd2 = ::rand();
+  uint64_t rnd = (rnd1 | rnd2 << 16) ^ time;
+  return rnd;
+
+#elif defined __APPLE__
+  // Same approach as for win32, except that the random number generator
+  // is better (// https://developer.apple.com/legacy/library/documentation/Darwin/Reference/ManPages/man3/random.3.html#//apple_ref/doc/man/3/random).
+  uint64_t rnd = ::random() ^ mach_absolute_time();
+  return rnd;
+
+#else
+  // Augment the current time with pseudo random number generation
+  // to ensure that we get different seeds if we try to generate seeds
+  // faster than the clock resolution.
+  timespec ts;
+  clock_gettime(CLOCK_REALTIME, &ts);
+  uint64_t rnd = ::random() ^ ts.tv_nsec;
+  return rnd;
+#endif
+}
+
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state) {
+  // TODO: Unify with the implementation in the non blocking thread pool.
+  uint64_t current = *state;
+  // Update the internal state
+  *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
+  // Generate the random output (using the PCG-XSH-RS scheme)
+  return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
+}
+
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t seed) {
+  seed = seed ? seed : get_random_seed();
+  return seed * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
+}
+
+}  // namespace
+
+
+template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+T RandomToTypeUniform(uint64_t* state) {
+  unsigned rnd = PCG_XSH_RS_generator(state);
+  return static_cast<T>(rnd);
+}
+
+
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state) {
+  Eigen::half result;
+  // Generate 10 random bits for the mantissa
+  unsigned rnd = PCG_XSH_RS_generator(state);
+  result.x = static_cast<uint16_t>(rnd & 0x3ffu);
+  // Set the exponent
+  result.x |= (static_cast<uint16_t>(15) << 10);
+  // Return the final result
+  return result - Eigen::half(1.0f);
+}
+
+
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float RandomToTypeUniform<float>(uint64_t* state) {
+  typedef union {
+    uint32_t raw;
+    float fp;
+  } internal;
+  internal result;
+  // Generate 23 random bits for the mantissa mantissa
+  const unsigned rnd = PCG_XSH_RS_generator(state);
+  result.raw = rnd & 0x7fffffu;
+  // Set the exponent
+  result.raw |= (static_cast<uint32_t>(127) << 23);
+  // Return the final result
+  return result.fp - 1.0f;
+}
+
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double RandomToTypeUniform<double>(uint64_t* state) {
+  typedef union {
+    uint64_t raw;
+    double dp;
+  } internal;
+  internal result;
+  result.raw = 0;
+  // Generate 52 random bits for the mantissa
+  // First generate the upper 20 bits
+  unsigned rnd1 = PCG_XSH_RS_generator(state) & 0xfffffu;
+  // The generate the lower 32 bits
+  unsigned rnd2 = PCG_XSH_RS_generator(state);
+  result.raw = (static_cast<uint64_t>(rnd1) << 32) | rnd2;
+  // Set the exponent
+  result.raw |= (static_cast<uint64_t>(1023) << 52);
+  // Return the final result
+  return result.dp - 1.0;
+}
+
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state) {
+  return std::complex<float>(RandomToTypeUniform<float>(state),
+                             RandomToTypeUniform<float>(state));
+}
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state) {
+  return std::complex<double>(RandomToTypeUniform<double>(state),
+                              RandomToTypeUniform<double>(state));
+}
+
+template <typename T> class UniformRandomGenerator {
+ public:
+  static const bool PacketAccess = true;
+
+  // Uses the given "seed" if non-zero, otherwise uses a random seed.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
+      uint64_t seed = 0) {
+    m_state = PCG_XSH_RS_state(seed);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
+      const UniformRandomGenerator& other) {
+    m_state = other.m_state;
+  }
+
+  template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  T operator()(Index i) const {
+    uint64_t local_state = m_state + i;
+    T result = RandomToTypeUniform<T>(&local_state);
+    m_state = local_state;
+    return result;
+  }
+
+  template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Packet packetOp(Index i) const {
+    const int packetSize = internal::unpacket_traits<Packet>::size;
+    EIGEN_ALIGN_MAX T values[packetSize];
+    uint64_t local_state = m_state + i;
+    for (int j = 0; j < packetSize; ++j) {
+      values[j] = RandomToTypeUniform<T>(&local_state);
+    }
+    m_state = local_state;
+    return internal::pload<Packet>(values);
+  }
+
+ private:
+  mutable uint64_t m_state;
+};
+
+template <typename Scalar>
+struct functor_traits<UniformRandomGenerator<Scalar> > {
+  enum {
+    // Rough estimate for floating point, multiplied by ceil(sizeof(T) / sizeof(float)).
+    Cost = 12 * NumTraits<Scalar>::AddCost *
+           ((sizeof(Scalar) + sizeof(float) - 1) / sizeof(float)),
+    PacketAccess = UniformRandomGenerator<Scalar>::PacketAccess
+  };
+};
+
+
+
+template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+T RandomToTypeNormal(uint64_t* state) {
+  // Use the ratio of uniform method to generate numbers following a normal
+  // distribution. See for example Numerical Recipes chapter 7.3.9 for the
+  // details.
+  T u, v, q;
+  do {
+    u = RandomToTypeUniform<T>(state);
+    v = T(1.7156) * (RandomToTypeUniform<T>(state) - T(0.5));
+    const T x = u - T(0.449871);
+    const T y = numext::abs(v) + T(0.386595);
+    q = x*x + y * (T(0.196)*y - T(0.25472)*x);
+  } while (q > T(0.27597) &&
+           (q > T(0.27846) || v*v > T(-4) * numext::log(u) * u*u));
+
+  return v/u;
+}
+
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state) {
+  return std::complex<float>(RandomToTypeNormal<float>(state),
+                             RandomToTypeNormal<float>(state));
+}
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state) {
+  return std::complex<double>(RandomToTypeNormal<double>(state),
+                              RandomToTypeNormal<double>(state));
+}
+
+
+template <typename T> class NormalRandomGenerator {
+ public:
+  static const bool PacketAccess = true;
+
+  // Uses the given "seed" if non-zero, otherwise uses a random seed.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) {
+    m_state = PCG_XSH_RS_state(seed);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(
+      const NormalRandomGenerator& other) {
+    m_state = other.m_state;
+  }
+
+ template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  T operator()(Index i) const {
+    uint64_t local_state = m_state + i;
+    T result = RandomToTypeNormal<T>(&local_state);
+    m_state = local_state;
+    return result;
+  }
+
+  template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Packet packetOp(Index i) const {
+    const int packetSize = internal::unpacket_traits<Packet>::size;
+    EIGEN_ALIGN_MAX T values[packetSize];
+    uint64_t local_state = m_state + i;
+    for (int j = 0; j < packetSize; ++j) {
+      values[j] = RandomToTypeNormal<T>(&local_state);
+    }
+    m_state = local_state;
+    return internal::pload<Packet>(values);
+  }
+
+ private:
+  mutable uint64_t m_state;
+};
+
+
+template <typename Scalar>
+struct functor_traits<NormalRandomGenerator<Scalar> > {
+  enum {
+    // On average, we need to generate about 3 random numbers
+    // 15 mul, 8 add, 1.5 logs
+    Cost = 3 * functor_traits<UniformRandomGenerator<Scalar> >::Cost +
+           15 * NumTraits<Scalar>::AddCost + 8 * NumTraits<Scalar>::AddCost +
+           3 * functor_traits<scalar_log_op<Scalar> >::Cost / 2,
+    PacketAccess = NormalRandomGenerator<Scalar>::PacketAccess
+  };
+};
+
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
new file mode 100644
index 000000000..41d0d0022
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -0,0 +1,781 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2016 Mehdi Goli, Codeplay Software Ltd <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
+#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
+
+namespace Eigen {
+
+/** \class TensorReduction
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief Tensor reduction class.
+  *
+  */
+
+namespace internal {
+  template<typename Op, typename Dims, typename XprType,template <class> class MakePointer_ >
+  struct traits<TensorReductionOp<Op, Dims, XprType, MakePointer_> >
+ : traits<XprType>
+{
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::Scalar Scalar;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef typename XprType::Nested Nested;
+  static const int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value;
+  static const int Layout = XprTraits::Layout;
+
+  template <class T> struct MakePointer {
+    // Intermediate typedef to workaround MSVC issue.
+    typedef MakePointer_<T> MakePointerT;
+    typedef typename MakePointerT::Type Type;
+  };
+};
+
+template<typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
+struct eval<TensorReductionOp<Op, Dims, XprType, MakePointer_>, Eigen::Dense>
+{
+  typedef const TensorReductionOp<Op, Dims, XprType, MakePointer_>& type;
+};
+
+template<typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
+struct nested<TensorReductionOp<Op, Dims, XprType, MakePointer_>, 1, typename eval<TensorReductionOp<Op, Dims, XprType, MakePointer_> >::type>
+{
+  typedef TensorReductionOp<Op, Dims, XprType, MakePointer_> type;
+};
+
+
+template <typename OutputDims> struct DimInitializer {
+  template <typename InputDims, typename ReducedDims> EIGEN_DEVICE_FUNC
+  static void run(const InputDims& input_dims,
+                  const array<bool, internal::array_size<InputDims>::value>& reduced,
+                  OutputDims* output_dims, ReducedDims* reduced_dims) {
+    const int NumInputDims = internal::array_size<InputDims>::value;
+    int outputIndex = 0;
+    int reduceIndex = 0;
+    for (int i = 0; i < NumInputDims; ++i) {
+      if (reduced[i]) {
+        (*reduced_dims)[reduceIndex] = input_dims[i];
+        ++reduceIndex;
+      } else {
+        (*output_dims)[outputIndex] = input_dims[i];
+        ++outputIndex;
+      }
+    }
+  }
+};
+
+template <> struct DimInitializer<Sizes<> > {
+  template <typename InputDims, typename Index, size_t Rank> EIGEN_DEVICE_FUNC
+  static void run(const InputDims& input_dims, const array<bool, Rank>&,
+                  Sizes<>*, array<Index, Rank>* reduced_dims) {
+    const int NumInputDims = internal::array_size<InputDims>::value;
+    for (int i = 0; i < NumInputDims; ++i) {
+      (*reduced_dims)[i] = input_dims[i];
+    }
+  }
+};
+
+
+template <typename ReducedDims, int NumTensorDims, int Layout>
+struct are_inner_most_dims {
+  static const bool value = false;
+};
+template <typename ReducedDims, int NumTensorDims, int Layout>
+struct preserve_inner_most_dims {
+  static const bool value = false;
+};
+
+#if EIGEN_HAS_CONSTEXPR && EIGEN_HAS_VARIADIC_TEMPLATES
+template <typename ReducedDims, int NumTensorDims>
+struct are_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{
+  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
+  static const bool tmp2 = index_statically_eq<ReducedDims>(0, 0);
+  static const bool tmp3 = index_statically_eq<ReducedDims>(array_size<ReducedDims>::value-1, array_size<ReducedDims>::value-1);
+  static const bool value = tmp1 & tmp2 & tmp3;
+};
+template <typename ReducedDims, int NumTensorDims>
+struct are_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{
+  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
+  static const bool tmp2 = index_statically_eq<ReducedDims>(0, NumTensorDims - array_size<ReducedDims>::value);
+  static const bool tmp3 = index_statically_eq<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
+  static const bool value = tmp1 & tmp2 & tmp3;
+
+};
+template <typename ReducedDims, int NumTensorDims>
+struct preserve_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{
+  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
+  static const bool tmp2 = index_statically_gt<ReducedDims>(0, 0);
+  static const bool value = tmp1 & tmp2;
+
+};
+template <typename ReducedDims, int NumTensorDims>
+struct preserve_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{
+  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
+  static const bool tmp2 = index_statically_lt<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
+  static const bool value = tmp1 & tmp2;
+};
+#endif
+
+
+template <int DimIndex, typename Self, typename Op>
+struct GenericDimReducer {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) {
+    EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
+      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
+      GenericDimReducer<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum);
+    }
+  }
+};
+template <typename Self, typename Op>
+struct GenericDimReducer<0, Self, Op> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) {
+    for (int j = 0; j < self.m_reducedDims[0]; ++j) {
+      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
+      reducer.reduce(self.m_impl.coeff(input), accum);
+    }
+  }
+};
+template <typename Self, typename Op>
+struct GenericDimReducer<-1, Self, Op> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index index, Op& reducer, typename Self::CoeffReturnType* accum) {
+    reducer.reduce(self.m_impl.coeff(index), accum);
+  }
+};
+
+template <typename Self, typename Op, bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
+struct InnerMostDimReducer {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
+    typename Self::CoeffReturnType accum = reducer.initialize();
+    for (typename Self::Index j = 0; j < numValuesToReduce; ++j) {
+      reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
+    }
+    return reducer.finalize(accum);
+  }
+};
+
+template <typename Self, typename Op>
+struct InnerMostDimReducer<Self, Op, true> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
+    const int packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size;
+    const typename Self::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize;
+    typename Self::PacketReturnType p = reducer.template initializePacket<typename Self::PacketReturnType>();
+    for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) {
+      reducer.reducePacket(self.m_impl.template packet<Unaligned>(firstIndex + j), &p);
+    }
+    typename Self::CoeffReturnType accum = reducer.initialize();
+    for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; ++j) {
+      reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
+    }
+    return reducer.finalizeBoth(accum, p);
+  }
+};
+
+template <int DimIndex, typename Self, typename Op, bool vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
+struct InnerMostDimPreserver {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) {
+    eigen_assert(false && "should never be called");
+  }
+};
+
+template <int DimIndex, typename Self, typename Op>
+struct InnerMostDimPreserver<DimIndex, Self, Op, true> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
+    EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    for (typename Self::Index j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
+      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
+      InnerMostDimPreserver<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum);
+    }
+  }
+};
+
+template <typename Self, typename Op>
+struct InnerMostDimPreserver<0, Self, Op, true> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
+    for (typename Self::Index j = 0; j < self.m_reducedDims[0]; ++j) {
+      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
+      reducer.reducePacket(self.m_impl.template packet<Unaligned>(input), accum);
+    }
+  }
+};
+template <typename Self, typename Op>
+struct InnerMostDimPreserver<-1, Self, Op, true> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) {
+    eigen_assert(false && "should never be called");
+  }
+};
+
+// Default full reducer
+template <typename Self, typename Op, typename Device, bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
+struct FullReducer {
+  static const bool HasOptimizedImplementation = false;
+
+  static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::CoeffReturnType* output) {
+    const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions());
+    *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
+  }
+};
+
+
+#ifdef EIGEN_USE_THREADS
+// Multithreaded full reducers
+template <typename Self, typename Op,
+          bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
+struct FullReducerShard {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex,
+                  typename Self::Index numValuesToReduce, Op& reducer,
+                  typename Self::CoeffReturnType* output) {
+    *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(
+        self, firstIndex, numValuesToReduce, reducer);
+  }
+};
+
+// Multithreaded full reducer
+template <typename Self, typename Op, bool Vectorizable>
+struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> {
+  static const bool HasOptimizedImplementation = !Op::IsStateful;
+  static const int PacketSize =
+      unpacket_traits<typename Self::PacketReturnType>::size;
+
+  // launch one reducer per thread and accumulate the result.
+  static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device,
+                  typename Self::CoeffReturnType* output) {
+    typedef typename Self::Index Index;
+    const Index num_coeffs = array_prod(self.m_impl.dimensions());
+    if (num_coeffs == 0) {
+      *output = reducer.finalize(reducer.initialize());
+      return;
+    }
+    const TensorOpCost cost =
+        self.m_impl.costPerCoeff(Vectorizable) +
+        TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable,
+                     PacketSize);
+    const int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
+        num_coeffs, cost, device.numThreads());
+    if (num_threads == 1) {
+      *output =
+          InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
+      return;
+    }
+    const Index blocksize =
+        std::floor<Index>(static_cast<float>(num_coeffs) / num_threads);
+    const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
+    eigen_assert(num_coeffs >= numblocks * blocksize);
+
+    Barrier barrier(internal::convert_index<unsigned int>(numblocks));
+    MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
+    for (Index i = 0; i < numblocks; ++i) {
+      device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, Vectorizable>::run,
+                                  self, i * blocksize, blocksize, reducer,
+                                  &shards[i]);
+    }
+    typename Self::CoeffReturnType finalShard;
+    if (numblocks * blocksize < num_coeffs) {
+      finalShard = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(
+          self, numblocks * blocksize, num_coeffs - numblocks * blocksize,
+          reducer);
+    } else {
+      finalShard = reducer.initialize();
+    }
+    barrier.Wait();
+
+    for (Index i = 0; i < numblocks; ++i) {
+      reducer.reduce(shards[i], &finalShard);
+    }
+    *output = reducer.finalize(finalShard);
+  }
+};
+
+#endif
+
+
+// Default inner reducer
+template <typename Self, typename Op, typename Device>
+struct InnerReducer {
+  static const bool HasOptimizedImplementation = false;
+
+  EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
+    eigen_assert(false && "Not implemented");
+    return true;
+  }
+};
+
+// Default outer reducer
+template <typename Self, typename Op, typename Device>
+struct OuterReducer {
+  static const bool HasOptimizedImplementation = false;
+
+  EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
+    eigen_assert(false && "Not implemented");
+    return true;
+  }
+};
+
+
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+template <int B, int N, typename S, typename R, typename I>
+__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
+
+
+#ifdef EIGEN_HAS_CUDA_FP16
+template <typename S, typename R, typename I>
+__global__ void ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
+template <int B, int N, typename S, typename R, typename I>
+__global__ void FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
+template <int NPT, typename S, typename R, typename I>
+__global__ void InnerReductionKernelHalfFloat(R, const S, I, I, half*);
+
+#endif
+
+template <int NPT, typename S, typename R, typename I>
+__global__ void InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
+
+template <int NPT, typename S, typename R, typename I>
+__global__ void OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
+#endif
+
+}  // end namespace internal
+
+
+template <typename Op, typename Dims, typename XprType,  template <class> class MakePointer_>
+class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType, MakePointer_>, ReadOnlyAccessors> {
+  public:
+    typedef typename Eigen::internal::traits<TensorReductionOp>::Scalar Scalar;
+    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+    typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
+    typedef typename Eigen::internal::nested<TensorReductionOp>::type Nested;
+    typedef typename Eigen::internal::traits<TensorReductionOp>::StorageKind StorageKind;
+    typedef typename Eigen::internal::traits<TensorReductionOp>::Index Index;
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    TensorReductionOp(const XprType& expr, const Dims& dims) : m_expr(expr), m_dims(dims)
+    { }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    TensorReductionOp(const XprType& expr, const Dims& dims, const Op& reducer) : m_expr(expr), m_dims(dims), m_reducer(reducer)
+    { }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const XprType& expression() const { return m_expr; }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const Dims& dims() const { return m_dims; }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const Op& reducer() const { return m_reducer; }
+
+  protected:
+    typename XprType::Nested m_expr;
+    const Dims m_dims;
+    const Op m_reducer;
+};
+
+
+// Eval as rvalue
+template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
+struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
+{
+  typedef TensorReductionOp<Op, Dims, ArgType, MakePointer_> XprType;
+  typedef typename XprType::Index Index;
+  typedef ArgType ChildType;
+  typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
+  static const int NumInputDims = internal::array_size<InputDimensions>::value;
+  static const int NumReducedDims = internal::array_size<Dims>::value;
+  static const int NumOutputDims = NumInputDims - NumReducedDims;
+  typedef typename internal::conditional<NumOutputDims==0, Sizes<>, DSizes<Index, NumOutputDims> >::type Dimensions;
+  typedef typename XprType::Scalar Scalar;
+  typedef TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Self;
+  static const bool InputPacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess;
+  typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+
+  enum {
+    IsAligned = false,
+    PacketAccess = Self::InputPacketAccess && Op::PacketAccess,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  static const bool ReducingInnerMostDims = internal::are_inner_most_dims<Dims, NumInputDims, Layout>::value;
+  static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims<Dims, NumInputDims, Layout>::value;
+  static const bool RunningFullReduction = (NumOutputDims==0);
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device), m_xpr_dims(op.dims())
+  {
+    EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)),
+                        YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+    // Build the bitmap indicating if an input dimension is reduced or not.
+    for (int i = 0; i < NumInputDims; ++i) {
+      m_reduced[i] = false;
+    }
+    for (int i = 0; i < NumReducedDims; ++i) {
+      eigen_assert(op.dims()[i] >= 0);
+      eigen_assert(op.dims()[i] < NumInputDims);
+      m_reduced[op.dims()[i]] = true;
+    }
+
+    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+    internal::DimInitializer<Dimensions>::run(input_dims, m_reduced, &m_dimensions, &m_reducedDims);
+
+    // Precompute output strides.
+    if (NumOutputDims > 0) {
+      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+        m_outputStrides[0] = 1;
+        for (int i = 1; i < NumOutputDims; ++i) {
+          m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
+        }
+      } else {
+        m_outputStrides.back() = 1;
+        for (int i = NumOutputDims - 2; i >= 0; --i) {
+          m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
+        }
+      }
+    }
+
+    // Precompute input strides.
+    if (NumInputDims > 0) {
+      array<Index, NumInputDims> input_strides;
+      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+        input_strides[0] = 1;
+        for (int i = 1; i < NumInputDims; ++i) {
+          input_strides[i] = input_strides[i-1] * input_dims[i-1];
+        }
+      } else {
+        input_strides.back() = 1;
+        for (int i = NumInputDims - 2; i >= 0; --i) {
+          input_strides[i] = input_strides[i + 1] * input_dims[i + 1];
+        }
+      }
+
+      int outputIndex = 0;
+      int reduceIndex = 0;
+      for (int i = 0; i < NumInputDims; ++i) {
+        if (m_reduced[i]) {
+          m_reducedStrides[reduceIndex] = input_strides[i];
+          ++reduceIndex;
+        } else {
+          m_preservedStrides[outputIndex] = input_strides[i];
+          ++outputIndex;
+        }
+      }
+    }
+
+    // Special case for full reductions
+    if (NumOutputDims == 0) {
+      m_preservedStrides[0] = internal::array_prod(input_dims);
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(typename MakePointer_<CoeffReturnType>::Type data) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+
+    // Use the FullReducer if possible.
+    if ((RunningFullReduction && RunningOnSycl) ||(RunningFullReduction &&
+        internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
+        ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) ||
+         !RunningOnGPU))) {
+      bool need_assign = false;
+      if (!data) {
+        m_result = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType)));
+        data = m_result;
+        need_assign = true;
+      }
+      Op reducer(m_reducer);
+      internal::FullReducer<Self, Op, Device>::run(*this, reducer, m_device, data);
+      return need_assign;
+    }
+    else if(RunningOnSycl){
+      const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
+      const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
+      if (!data) {
+        data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
+        m_result = data;
+      }
+      Op reducer(m_reducer);
+      internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve);
+      return (m_result != NULL);
+    }
+
+    // Attempt to use an optimized reduction.
+    else if (RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) {
+      bool reducing_inner_dims = true;
+      for (int i = 0; i < NumReducedDims; ++i) {
+        if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+          reducing_inner_dims &= m_reduced[i];
+        } else {
+          reducing_inner_dims &= m_reduced[NumInputDims - 1 - i];
+        }
+      }
+      if (internal::InnerReducer<Self, Op, Device>::HasOptimizedImplementation &&
+          (reducing_inner_dims || ReducingInnerMostDims)) {
+        const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
+        const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
+        if (!data) {
+          if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) {
+            data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
+            m_result = data;
+          }
+          else {
+            return true;
+          }
+        }
+        Op reducer(m_reducer);
+        if (internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
+          if (m_result) {
+            m_device.deallocate(m_result);
+            m_result = NULL;
+          }
+          return true;
+        } else {
+          return (m_result != NULL);
+        }
+      }
+
+      bool preserving_inner_dims = true;
+      for (int i = 0; i < NumReducedDims; ++i) {
+        if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+          preserving_inner_dims &= m_reduced[NumInputDims - 1 - i];
+        } else {
+          preserving_inner_dims &= m_reduced[i];
+        }
+      }
+      if (internal::OuterReducer<Self, Op, Device>::HasOptimizedImplementation &&
+          preserving_inner_dims) {
+        const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
+        const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
+        if (!data) {
+          if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) {
+            data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
+            m_result = data;
+          }
+          else {
+            return true;
+          }
+        }
+        Op reducer(m_reducer);
+        if (internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
+          if (m_result) {
+            m_device.deallocate(m_result);
+            m_result = NULL;
+          }
+          return true;
+        } else {
+          return (m_result != NULL);
+        }
+      }
+    }
+    return true;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+    m_impl.cleanup();
+    if (m_result) {
+      m_device.deallocate(m_result);
+      m_result = NULL;
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+  {
+    if ((RunningOnSycl || RunningFullReduction || RunningOnGPU) && m_result) {
+      return *(m_result + index);
+    }
+    Op reducer(m_reducer);
+    if (ReducingInnerMostDims || RunningFullReduction) {
+      const Index num_values_to_reduce =
+        (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1];
+      return internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstInput(index),
+                                                             num_values_to_reduce, reducer);
+    } else {
+      typename Self::CoeffReturnType accum = reducer.initialize();
+      internal::GenericDimReducer<NumReducedDims-1, Self, Op>::reduce(*this, firstInput(index), reducer, &accum);
+      return reducer.finalize(accum);
+    }
+  }
+
+  // TODO(bsteiner): provide a more efficient implementation.
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+  {
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index + PacketSize - 1 < Index(internal::array_prod(dimensions())));
+
+    if (RunningOnGPU && m_result) {
+      return internal::pload<PacketReturnType>(m_result + index);
+    }
+
+    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+    if (ReducingInnerMostDims) {
+      const Index num_values_to_reduce =
+        (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1];
+      const Index firstIndex = firstInput(index);
+      for (Index i = 0; i < PacketSize; ++i) {
+        Op reducer(m_reducer);
+        values[i] = internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstIndex + i * num_values_to_reduce,
+                                                                    num_values_to_reduce, reducer);
+      }
+    } else if (PreservingInnerMostDims) {
+      const Index firstIndex = firstInput(index);
+      const int innermost_dim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : NumOutputDims - 1;
+      // TBD: extend this the the n innermost dimensions that we preserve.
+      if (((firstIndex % m_dimensions[innermost_dim]) + PacketSize - 1) < m_dimensions[innermost_dim]) {
+        Op reducer(m_reducer);
+        typename Self::PacketReturnType accum = reducer.template initializePacket<typename Self::PacketReturnType>();
+        internal::InnerMostDimPreserver<NumReducedDims-1, Self, Op>::reduce(*this, firstIndex, reducer, &accum);
+        return reducer.finalizePacket(accum);
+      } else {
+        for (int i = 0; i < PacketSize; ++i) {
+          values[i] = coeff(index + i);
+        }
+      }
+    } else {
+      for (int i = 0; i < PacketSize; ++i) {
+        values[i] = coeff(index + i);
+      }
+    }
+    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+    return rslt;
+  }
+
+  // Must be called after evalSubExprsIfNeeded().
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    if (RunningFullReduction && m_result) {
+      return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
+    } else {
+      const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
+      const double compute_cost = num_values_to_reduce * internal::functor_traits<Op>::Cost;
+      return m_impl.costPerCoeff(vectorized) * num_values_to_reduce +
+          TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
+    }
+  }
+
+  EIGEN_DEVICE_FUNC typename MakePointer_<Scalar>::Type data() const { return m_result; }
+  /// required by sycl in order to extract the accessor
+  const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+  /// added for sycl in order to construct the buffer from the sycl device
+  const Device& device() const{return m_device;}
+  /// added for sycl in order to re-construct the reduction eval on the device for the sub-kernel
+  const Dims& xprDims() const {return m_xpr_dims;}
+
+
+  private:
+  template <int, typename, typename> friend struct internal::GenericDimReducer;
+  template <typename, typename, bool> friend struct internal::InnerMostDimReducer;
+  template <int, typename, typename, bool> friend struct internal::InnerMostDimPreserver;
+  template <typename S, typename O, typename D, bool V> friend struct internal::FullReducer;
+#ifdef EIGEN_USE_THREADS
+  template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
+#endif
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+  template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
+#ifdef EIGEN_HAS_CUDA_FP16
+  template <typename S, typename R, typename I> friend void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
+  template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
+  template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*);
+#endif
+  template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
+
+  template <int NPT, typename S, typename R, typename I> friend void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
+#endif
+
+  template <typename S, typename O, typename D> friend struct internal::InnerReducer;
+
+  // Returns the Index in the input tensor of the first value that needs to be
+  // used to compute the reduction at output index "index".
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
+    if (ReducingInnerMostDims) {
+      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+        return index * m_preservedStrides[0];
+      } else {
+        return index * m_preservedStrides[NumPreservedStrides - 1];
+      }
+    }
+    // TBD: optimize the case where we preserve the innermost dimensions.
+    Index startInput = 0;
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = NumOutputDims - 1; i > 0; --i) {
+        // This is index_i in the output tensor.
+        const Index idx = index / m_outputStrides[i];
+        startInput += idx * m_preservedStrides[i];
+        index -= idx * m_outputStrides[i];
+      }
+      if (PreservingInnerMostDims) {
+        eigen_assert(m_preservedStrides[0] == 1);
+        startInput += index;
+      } else {
+        startInput += index * m_preservedStrides[0];
+      }
+    } else {
+      for (int i = 0; i < NumOutputDims - 1; ++i) {
+        // This is index_i in the output tensor.
+        const Index idx = index / m_outputStrides[i];
+        startInput += idx * m_preservedStrides[i];
+        index -= idx * m_outputStrides[i];
+      }
+      if (PreservingInnerMostDims) {
+        eigen_assert(m_preservedStrides[NumPreservedStrides - 1] == 1);
+        startInput += index;
+      } else {
+        startInput += index * m_preservedStrides[NumPreservedStrides - 1];
+      }
+    }
+    return startInput;
+  }
+
+  // Bitmap indicating if an input dimension is reduced or not.
+  array<bool, NumInputDims> m_reduced;
+  // Dimensions of the output of the operation.
+  Dimensions m_dimensions;
+  // Precomputed strides for the output tensor.
+  array<Index, NumOutputDims> m_outputStrides;
+  // Subset of strides of the input tensor for the non-reduced dimensions.
+  // Indexed by output dimensions.
+  static const int NumPreservedStrides = max_n_1<NumOutputDims>::size;
+  array<Index, NumPreservedStrides> m_preservedStrides;
+
+  // Subset of strides of the input tensor for the reduced dimensions.
+  // Indexed by reduced dimensions.
+  array<Index, NumReducedDims> m_reducedStrides;
+  // Size of the input dimensions that are reduced.
+  // Indexed by reduced dimensions.
+  array<Index, NumReducedDims> m_reducedDims;
+
+  // Evaluator for the input expression.
+  TensorEvaluator<ArgType, Device> m_impl;
+
+  // Operation to apply for computing the reduction.
+  Op m_reducer;
+
+  // For full reductions
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+  static const bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value;
+  static const bool RunningOnSycl = false;
+#elif defined(EIGEN_USE_SYCL)
+static const bool RunningOnSycl = internal::is_same<typename internal::remove_all<Device>::type, Eigen::SyclDevice>::value;
+static const bool RunningOnGPU = false;
+#else
+  static const bool RunningOnGPU = false;
+  static const bool RunningOnSycl = false;
+#endif
+  typename MakePointer_<CoeffReturnType>::Type m_result;
+
+  const Device& m_device;
+  const Dims& m_xpr_dims;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
new file mode 100644
index 000000000..65638b6a8
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -0,0 +1,750 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H
+#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H
+
+namespace Eigen {
+namespace internal {
+
+
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+// Full reducers for GPU, don't vectorize for now
+
+// Reducer function that enables multiple cuda thread to safely accumulate at the same
+// output address. It basically reads the current value of the output variable, and
+// attempts to update it with the new value. If in the meantime another cuda thread
+// updated the content of the output address it will try again.
+template <typename T, typename R>
+__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
+#if __CUDA_ARCH__ >= 300
+  if (sizeof(T) == 4)
+  {
+    unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
+    unsigned int newval = oldval;
+    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+    if (newval == oldval) {
+      return;
+    }
+    unsigned int readback;
+    while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
+      oldval = readback;
+      newval = oldval;
+      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+      if (newval == oldval) {
+        return;
+      }
+    }
+  }
+  else if (sizeof(T) == 8) {
+    unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
+    unsigned long long newval = oldval;
+    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+    if (newval == oldval) {
+      return;
+    }
+    unsigned long long readback;
+    while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
+      oldval = readback;
+      newval = oldval;
+      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+      if (newval == oldval) {
+        return;
+      }
+    }
+  }
+  else {
+    assert(0 && "Wordsize not supported");
+  }
+#else
+  assert(0 && "Shouldn't be called on unsupported device");
+#endif
+}
+
+// We extend atomicExch to support extra data types
+template <typename Type>
+__device__ inline Type atomicExchCustom(Type* address, Type val) {
+  return atomicExch(address, val);
+}
+
+template <>
+__device__ inline double atomicExchCustom(double* address, double val) {
+  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
+  return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
+}
+
+#ifdef EIGEN_HAS_CUDA_FP16
+template <template <typename T> class R>
+__device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer) {
+  unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
+  unsigned int newval = oldval;
+  reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
+  if (newval == oldval) {
+    return;
+  }
+  unsigned int readback;
+  while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
+    oldval = readback;
+    newval = oldval;
+    reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
+    if (newval == oldval) {
+      return;
+    }
+  }
+}
+#endif
+
+template <>
+__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
+#if __CUDA_ARCH__ >= 300
+  atomicAdd(output, accum);
+#else
+  assert(0 && "Shouldn't be called on unsupported device");
+#endif
+}
+
+
+template <typename CoeffType, typename Index>
+__global__ void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
+  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+  const Index num_threads = blockDim.x * gridDim.x;
+  for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
+    output[i] = val;
+  }
+}
+
+
+template <int BlockSize, int NumPerThread, typename Self,
+          typename Reducer, typename Index>
+__global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
+                                    typename Self::CoeffReturnType* output, unsigned int* semaphore) {
+#if __CUDA_ARCH__ >= 300
+  // Initialize the output value
+  const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
+  if (gridDim.x == 1) {
+    if (first_index == 0) {
+      *output = reducer.initialize();
+    }
+  }
+  else {
+    if (threadIdx.x == 0) {
+      unsigned int block = atomicCAS(semaphore, 0u, 1u);
+      if (block == 0) {
+        // We're the first block to run, initialize the output value
+        atomicExchCustom(output, reducer.initialize());
+        __threadfence();
+        atomicExch(semaphore, 2u);
+      }
+      else {
+        // Wait for the first block to initialize the output value.
+        // Use atomicCAS here to ensure that the reads aren't cached
+        unsigned int val;
+        do {
+          val = atomicCAS(semaphore, 2u, 2u);
+        }
+        while (val < 2u);
+      }
+    }
+  }
+
+  __syncthreads();
+
+  eigen_assert(gridDim.x == 1 || *semaphore >= 2u);
+
+  typename Self::CoeffReturnType accum = reducer.initialize();
+  Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
+  for (Index i = 0; i < max_iter; i+=BlockSize) {
+    const Index index = first_index + i;
+    eigen_assert(index < num_coeffs);
+    typename Self::CoeffReturnType val = input.m_impl.coeff(index);
+    reducer.reduce(val, &accum);
+  }
+
+#pragma unroll
+  for (int offset = warpSize/2; offset > 0; offset /= 2) {
+    reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
+  }
+
+  if ((threadIdx.x & (warpSize - 1)) == 0) {
+    atomicReduce(output, accum, reducer);
+  }
+
+  if (gridDim.x > 1 && threadIdx.x == 0) {
+    // Let the last block reset the semaphore
+    atomicInc(semaphore, gridDim.x + 1);
+  }
+#else
+  assert(0 && "Shouldn't be called on unsupported device");
+#endif
+}
+
+
+#ifdef EIGEN_HAS_CUDA_FP16
+template <typename Self,
+          typename Reducer, typename Index>
+__global__ void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half2* scratch) {
+  eigen_assert(blockDim.x == 1);
+  eigen_assert(gridDim.x == 1);
+  if (num_coeffs % 2 != 0) {
+    half last = input.m_impl.coeff(num_coeffs-1);
+    *scratch = __halves2half2(last, reducer.initialize());
+  } else {
+    *scratch = reducer.template initializePacket<half2>();
+  }
+}
+
+template <typename Self,
+          typename Reducer, typename Index>
+__global__ void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
+  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+  const Index num_threads = blockDim.x * gridDim.x;
+  const Index num_packets = num_coeffs / 2;
+  for (Index i = thread_id; i < num_packets; i += num_threads) {
+    ((half2*)output)[i] = reducer.template initializePacket<half2>();
+  }
+
+  if (thread_id == 0 && num_coeffs % 2 != 0) {
+    output[num_coeffs-1] = reducer.initialize();
+  }
+}
+
+template <int BlockSize, int NumPerThread, typename Self,
+          typename Reducer, typename Index>
+__global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
+                                    half* output, half2* scratch) {
+  eigen_assert(NumPerThread % 2 == 0);
+
+  const Index first_index = blockIdx.x * BlockSize * NumPerThread + 2*threadIdx.x;
+
+  // Initialize the output value if it wasn't initialized by the ReductionInitKernel
+  if (gridDim.x == 1 && first_index == 0) {
+    if (num_coeffs % 2 != 0) {
+      half last = input.m_impl.coeff(num_coeffs-1);
+      *scratch = __halves2half2(last, reducer.initialize());
+    } else {
+      *scratch = reducer.template initializePacket<half2>();
+    }
+    __syncthreads();
+  }
+
+  half2 accum = reducer.template initializePacket<half2>();
+  const Index max_iter = numext::mini<Index>((num_coeffs - first_index) / 2, NumPerThread*BlockSize / 2);
+  for (Index i = 0; i < max_iter; i += BlockSize) {
+    const Index index = first_index + 2*i;
+    eigen_assert(index + 1 < num_coeffs);
+    half2 val = input.m_impl.template packet<Unaligned>(index);
+    reducer.reducePacket(val, &accum);
+  }
+
+#pragma unroll
+  for (int offset = warpSize/2; offset > 0; offset /= 2) {
+    reducer.reducePacket(__shfl_down(accum, offset, warpSize), &accum);
+  }
+
+  if ((threadIdx.x & (warpSize - 1)) == 0) {
+    atomicReduce(scratch, accum, reducer);
+  }
+
+  __syncthreads();
+
+  if (gridDim.x == 1 && first_index == 0) {
+    half tmp = __low2half(*scratch);
+    reducer.reduce(__high2half(*scratch), &tmp);
+    *output = tmp;
+  }
+}
+
+template <typename Op>
+__global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2* scratch) {
+  eigen_assert(threadIdx.x == 1);
+  half tmp = __low2half(*scratch);
+  reducer.reduce(__high2half(*scratch), &tmp);
+  *output = tmp;
+}
+
+#endif
+
+template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
+struct FullReductionLauncher {
+  static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
+    assert(false && "Should only be called on doubles, floats and half floats");
+  }
+};
+
+// Specialization for float and double
+template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+struct FullReductionLauncher<
+    Self, Op, OutputType, PacketAccess,
+    typename internal::enable_if<
+      internal::is_same<float, OutputType>::value ||
+      internal::is_same<double, OutputType>::value,
+    void>::type> {
+  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
+    typedef typename Self::Index Index;
+    typedef typename Self::CoeffReturnType Scalar;
+    const int block_size = 256;
+    const int num_per_thread = 128;
+    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+
+    unsigned int* semaphore = NULL;
+    if (num_blocks > 1) {
+      semaphore = device.semaphore();
+    }
+
+    LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
+                       num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore);
+  }
+};
+
+#ifdef EIGEN_HAS_CUDA_FP16
+template <typename Self, typename Op>
+struct FullReductionLauncher<Self, Op, Eigen::half, false> {
+  static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
+    assert(false && "Should not be called since there is no packet accessor");
+  }
+};
+
+template <typename Self, typename Op>
+struct FullReductionLauncher<Self, Op, Eigen::half, true> {
+  static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
+    typedef typename Self::Index Index;
+
+    const int block_size = 256;
+    const int num_per_thread = 128;
+    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+    half2* scratch = static_cast<half2*>(device.scratchpad());
+
+    if (num_blocks > 1) {
+      // We initialize the output and the scrathpad outside the reduction kernel when we can't be sure that there
+      // won't be a race conditions between multiple thread blocks.
+      LAUNCH_CUDA_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
+                         1, 1, 0, device, reducer, self, num_coeffs, scratch);
+    }
+
+    LAUNCH_CUDA_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
+                       num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch);
+
+    if (num_blocks > 1) {
+      LAUNCH_CUDA_KERNEL((ReductionCleanupKernelHalfFloat<Op>),
+                         1, 1, 0, device, reducer, output, scratch);
+    }
+  }
+};
+#endif
+
+
+template <typename Self, typename Op, bool Vectorizable>
+struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
+  // Unfortunately nvidia doesn't support well exotic types such as complex,
+  // so reduce the scope of the optimized version of the code to the simple cases
+  // of doubles, floats and half floats
+#ifdef EIGEN_HAS_CUDA_FP16
+  static const bool HasOptimizedImplementation = !Op::IsStateful &&
+      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+       internal::is_same<typename Self::CoeffReturnType, double>::value ||
+       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
+#else
+  static const bool HasOptimizedImplementation = !Op::IsStateful &&
+                                                (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+                                                 internal::is_same<typename Self::CoeffReturnType, double>::value);
+#endif
+
+  template <typename OutputType>
+  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
+    assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
+    const Index num_coeffs = array_prod(self.m_impl.dimensions());
+    // Don't crash when we're called with an input tensor of size 0.
+    if (num_coeffs == 0) {
+      return;
+    }
+
+    FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
+  }
+};
+
+
+template <int NumPerThread, typename Self,
+          typename Reducer, typename Index>
+__global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+                                         typename Self::CoeffReturnType* output) {
+#if __CUDA_ARCH__ >= 300
+  typedef typename Self::CoeffReturnType Type;
+  eigen_assert(blockDim.y == 1);
+  eigen_assert(blockDim.z == 1);
+  eigen_assert(gridDim.y == 1);
+  eigen_assert(gridDim.z == 1);
+
+  const int unroll_times = 16;
+  eigen_assert(NumPerThread % unroll_times == 0);
+
+  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
+  const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;
+
+  const Index num_threads = blockDim.x * gridDim.x;
+  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // Initialize the output values if they weren't initialized by the ReductionInitKernel
+  if (gridDim.x == 1) {
+    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
+      output[i] = reducer.initialize();
+    }
+    __syncthreads();
+  }
+
+  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
+    const Index row = i / input_col_blocks;
+
+    if (row < num_preserved_coeffs) {
+      const Index col_block = i % input_col_blocks;
+      const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;
+
+      Type reduced_val = reducer.initialize();
+
+      for (Index j = 0; j < NumPerThread; j += unroll_times) {
+        const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
+        if (last_col >= num_coeffs_to_reduce) {
+          for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
+            const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
+            reducer.reduce(val, &reduced_val);
+          }
+          break;
+        } else {
+          // Faster version of the loop with no branches after unrolling.
+#pragma unroll
+          for (int k = 0; k < unroll_times; ++k) {
+            const Index col = col_begin + blockDim.x * (j + k);
+            reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
+          }
+        }
+      }
+
+#pragma unroll
+      for (int offset = warpSize/2; offset > 0; offset /= 2) {
+        reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
+      }
+
+      if ((threadIdx.x & (warpSize - 1)) == 0) {
+        atomicReduce(&(output[row]), reduced_val, reducer);
+      }
+    }
+  }
+#else
+  assert(0 && "Shouldn't be called on unsupported device");
+#endif
+}
+
+#ifdef EIGEN_HAS_CUDA_FP16
+
+template <int NumPerThread, typename Self,
+          typename Reducer, typename Index>
+__global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+                                              half* output) {
+  eigen_assert(blockDim.y == 1);
+  eigen_assert(blockDim.z == 1);
+  eigen_assert(gridDim.y == 1);
+  eigen_assert(gridDim.z == 1);
+
+  const int unroll_times = 16;
+  eigen_assert(NumPerThread % unroll_times == 0);
+  eigen_assert(unroll_times % 2 == 0);
+
+  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
+  const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2);
+
+  const Index num_threads = blockDim.x * gridDim.x;
+  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // Initialize the output values if they weren't initialized by the ReductionInitKernel
+  if (gridDim.x == 1) {
+    Index i = 2*thread_id;
+    for (; i + 1 < num_preserved_coeffs; i += 2*num_threads) {
+      half* loc = output + i;
+      *((half2*)loc) = reducer.template initializePacket<half2>();
+    }
+    if (i < num_preserved_coeffs) {
+      output[i] = reducer.initialize();
+    }
+    __syncthreads();
+  }
+
+  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
+    const Index row = 2 * (i / input_col_blocks);
+
+    if (row + 1 < num_preserved_coeffs) {
+      const Index col_block = i % input_col_blocks;
+      const Index col_begin = 2 * (col_block * blockDim.x * NumPerThread + threadIdx.x);
+
+      half2 reduced_val1 = reducer.template initializePacket<half2>();
+      half2 reduced_val2 = reducer.template initializePacket<half2>();
+
+      for (Index j = 0; j < NumPerThread; j += unroll_times) {
+        const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1) * 2;
+        if (last_col >= num_coeffs_to_reduce) {
+          Index col = col_begin + blockDim.x * j;
+          for (; col + 1 < num_coeffs_to_reduce; col += blockDim.x) {
+            const half2 val1 = input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col);
+            reducer.reducePacket(val1, &reduced_val1);
+            const half2 val2 = input.m_impl.template packet<Unaligned>((row+1) * num_coeffs_to_reduce + col);
+            reducer.reducePacket(val2, &reduced_val2);
+          }
+          if (col < num_coeffs_to_reduce) {
+            // Peel;
+            const half last1 = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
+            const half2 val1 = __halves2half2(last1, reducer.initialize());
+            reducer.reducePacket(val1, &reduced_val1);
+            const half last2 = input.m_impl.coeff((row+1) * num_coeffs_to_reduce + col);
+            const half2 val2 = __halves2half2(last2, reducer.initialize());
+            reducer.reducePacket(val2, &reduced_val2);
+          }
+          break;
+        } else {
+          // Faster version of the loop with no branches after unrolling.
+#pragma unroll
+          for (int k = 0; k < unroll_times; ++k) {
+            const Index col = col_begin + blockDim.x * (j + k) * 2;
+            reducer.reducePacket(input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col), &reduced_val1);
+            reducer.reducePacket(input.m_impl.template packet<Unaligned>((row + 1)* num_coeffs_to_reduce + col), &reduced_val2);
+          }
+        }
+      }
+
+#pragma unroll
+      for (int offset = warpSize/2; offset > 0; offset /= 2) {
+        reducer.reducePacket(__shfl_down(reduced_val1, offset, warpSize), &reduced_val1);
+        reducer.reducePacket(__shfl_down(reduced_val2, offset, warpSize), &reduced_val2);
+      }
+
+      half val1 =  __low2half(reduced_val1);
+      reducer.reduce(__high2half(reduced_val1), &val1);
+      half val2 =  __low2half(reduced_val2);
+      reducer.reduce(__high2half(reduced_val2), &val2);
+      half2 val = __halves2half2(val1, val2);
+
+      if ((threadIdx.x & (warpSize - 1)) == 0) {
+        half* loc = output + row;
+        atomicReduce((half2*)loc, val, reducer);
+      }
+    }
+  }
+}
+
+#endif
+
+template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
+struct InnerReductionLauncher {
+  static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
+    assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
+    return true;
+  }
+};
+
+// Specialization for float and double
+template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+struct InnerReductionLauncher<
+  Self, Op, OutputType, PacketAccess,
+  typename internal::enable_if<
+    internal::is_same<float, OutputType>::value ||
+    internal::is_same<double, OutputType>::value,
+  void>::type> {
+  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+    typedef typename Self::Index Index;
+
+    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
+    const int block_size = 256;
+    const int num_per_thread = 128;
+    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+    const int max_blocks = device.getNumCudaMultiProcessors() *
+                           device.maxCudaThreadsPerMultiProcessor() / block_size;
+    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+
+    if (num_blocks > 1) {
+      // We initialize the outputs outside the reduction kernel when we can't be sure that there
+      // won't be a race conditions between multiple thread blocks.
+      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
+      const int max_blocks = device.getNumCudaMultiProcessors() *
+                           device.maxCudaThreadsPerMultiProcessor() / 1024;
+      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+      LAUNCH_CUDA_KERNEL((ReductionInitKernel<OutputType, Index>),
+                         num_blocks, 1024, 0, device, reducer.initialize(),
+                         num_preserved_vals, output);
+    }
+
+    LAUNCH_CUDA_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>),
+                       num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
+
+    return false;
+  }
+};
+
+#ifdef EIGEN_HAS_CUDA_FP16
+template <typename Self, typename Op>
+struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
+  static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
+    assert(false && "Should not be called since there is no packet accessor");
+    return true;
+  }
+};
+
+template <typename Self, typename Op>
+struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
+  static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+    typedef typename Self::Index Index;
+
+    if (num_preserved_vals % 2 != 0) {
+      // Not supported yet, revert to the slower code path
+      return true;
+    }
+
+    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
+    const int block_size = /*256*/128;
+    const int num_per_thread = /*128*/64;
+    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+    const int max_blocks = device.getNumCudaMultiProcessors() *
+                           device.maxCudaThreadsPerMultiProcessor() / block_size;
+    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+
+    if (num_blocks > 1) {
+      // We initialize the outputs outside the reduction kernel when we can't be sure that there
+      // won't be a race conditions between multiple thread blocks.
+      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
+      const int max_blocks = device.getNumCudaMultiProcessors() *
+                           device.maxCudaThreadsPerMultiProcessor() / 1024;
+      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+      LAUNCH_CUDA_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>),
+                         1, 1, 0, device, reducer, self, num_preserved_vals, output);
+    }
+
+    LAUNCH_CUDA_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>),
+                       num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
+
+    return false;
+  }
+};
+#endif
+
+
+template <typename Self, typename Op>
+struct InnerReducer<Self, Op, GpuDevice> {
+  // Unfortunately nvidia doesn't support well exotic types such as complex,
+  // so reduce the scope of the optimized version of the code to the simple case
+  // of floats and half floats.
+#ifdef EIGEN_HAS_CUDA_FP16
+  static const bool HasOptimizedImplementation = !Op::IsStateful &&
+      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+       internal::is_same<typename Self::CoeffReturnType, double>::value ||
+       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
+#else
+  static const bool HasOptimizedImplementation = !Op::IsStateful &&
+                                                 (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+                                                  internal::is_same<typename Self::CoeffReturnType, double>::value);
+#endif
+
+  template <typename OutputType>
+  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+    assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
+    const Index num_coeffs = array_prod(self.m_impl.dimensions());
+    // Don't crash when we're called with an input tensor of size 0.
+    if (num_coeffs == 0) {
+      return true;
+    }
+    // It's faster to use the usual code.
+    if (num_coeffs_to_reduce <= 128) {
+      return true;
+    }
+
+    return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
+  }
+};
+
+template <int NumPerThread, typename Self,
+          typename Reducer, typename Index>
+__global__ void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+                                     typename Self::CoeffReturnType* output) {
+  const Index num_threads = blockDim.x * gridDim.x;
+  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+  // Initialize the output values if they weren't initialized by the ReductionInitKernel
+  if (gridDim.x == 1) {
+    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
+      output[i] = reducer.initialize();
+    }
+    __syncthreads();
+  }
+
+  // Do the reduction.
+  const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread);
+  for (Index i = thread_id; i < max_iter; i += num_threads) {
+    const Index input_col = i % num_preserved_coeffs;
+    const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
+    typename Self::CoeffReturnType reduced_val = reducer.initialize();
+    const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce);
+    for (Index j = input_row; j < max_row; j++) {
+      typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col);
+      reducer.reduce(val, &reduced_val);
+    }
+    atomicReduce(&(output[input_col]), reduced_val, reducer);
+  }
+}
+
+
+template <typename Self, typename Op>
+struct OuterReducer<Self, Op, GpuDevice> {
+  // Unfortunately nvidia doesn't support well exotic types such as complex,
+  // so reduce the scope of the optimized version of the code to the simple case
+  // of floats.
+  static const bool HasOptimizedImplementation = !Op::IsStateful &&
+                                                 (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+                                                  internal::is_same<typename Self::CoeffReturnType, double>::value);
+  template <typename Device, typename OutputType>
+  static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
+    assert(false && "Should only be called to reduce doubles or floats on a gpu device");
+    return true;
+  }
+
+  static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+    typedef typename Self::Index Index;
+
+    // It's faster to use the usual code.
+    if (num_coeffs_to_reduce <= 32) {
+      return true;
+    }
+
+    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
+    const int block_size = 256;
+    const int num_per_thread = 16;
+    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+    const int max_blocks = device.getNumCudaMultiProcessors() *
+                           device.maxCudaThreadsPerMultiProcessor() / block_size;
+    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+
+    if (num_blocks > 1) {
+      // We initialize the outputs in the reduction kernel itself when we don't have to worry
+      // about race conditions between multiple thread blocks.
+      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
+      const int max_blocks = device.getNumCudaMultiProcessors() *
+                             device.maxCudaThreadsPerMultiProcessor() / 1024;
+      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+      LAUNCH_CUDA_KERNEL((ReductionInitKernel<float, Index>),
+                         num_blocks, 1024, 0, device, reducer.initialize(),
+                         num_preserved_vals, output);
+    }
+
+    LAUNCH_CUDA_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>),
+                       num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
+
+    return false;
+  }
+};
+
+#endif
+
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
new file mode 100644
index 000000000..3daecb045
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
@@ -0,0 +1,242 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * TensorSyclPlaceHolderExpr.h
+ *
+ * \brief:
+ *  This is the specialisation of the placeholder expression based on the
+ * operation type
+ *
+*****************************************************************/
+
+#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
+#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
+
+namespace Eigen {
+namespace internal {
+
+template<typename CoeffReturnType, typename KernelName> struct syclGenericBufferReducer{
+template<typename BufferTOut, typename BufferTIn>
+static void run(BufferTOut* bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
+  do {
+          auto f = [length, local, bufOut, &bufI](cl::sycl::handler& h) mutable {
+            cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)},
+                                    cl::sycl::range<1>{std::min(length, local)}};
+            /* Two accessors are used: one to the buffer that is being reduced,
+             * and a second to local memory, used to store intermediate data. */
+            auto aI =
+                bufI.template get_access<cl::sycl::access::mode::read_write>(h);
+            auto aOut =
+                bufOut->template get_access<cl::sycl::access::mode::discard_write>(h);
+            cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write,
+                               cl::sycl::access::target::local>
+                scratch(cl::sycl::range<1>(local), h);
+
+            /* The parallel_for invocation chosen is the variant with an nd_item
+             * parameter, since the code requires barriers for correctness. */
+            h.parallel_for<KernelName>(
+                r, [aOut, aI, scratch, local, length](cl::sycl::nd_item<1> id) {
+                  size_t globalid = id.get_global(0);
+                  size_t localid = id.get_local(0);
+                  /* All threads collectively read from global memory into local.
+                   * The barrier ensures all threads' IO is resolved before
+                   * execution continues (strictly speaking, all threads within
+                   * a single work-group - there is no co-ordination between
+                   * work-groups, only work-items). */
+                  if (globalid < length) {
+                    scratch[localid] = aI[globalid];
+                  }
+                  id.barrier(cl::sycl::access::fence_space::local_space);
+
+                  /* Apply the reduction operation between the current local
+                   * id and the one on the other half of the vector. */
+                  if (globalid < length) {
+                    int min = (length < local) ? length : local;
+                    for (size_t offset = min / 2; offset > 0; offset /= 2) {
+                      if (localid < offset) {
+                        scratch[localid] += scratch[localid + offset];
+                      }
+                      id.barrier(cl::sycl::access::fence_space::local_space);
+                    }
+                    /* The final result will be stored in local id 0. */
+                    if (localid == 0) {
+                      aI[id.get_group(0)] = scratch[localid];
+                      if((length<=local) && globalid ==0){
+                        aOut[globalid]=scratch[localid];
+                      }
+                    }
+                  }
+                });
+          };
+            dev.m_queue.submit(f);
+            dev.m_queue.throw_asynchronous();
+
+          /* At this point, you could queue::wait_and_throw() to ensure that
+           * errors are caught quickly. However, this would likely impact
+           * performance negatively. */
+          length = length / local;
+
+        } while (length > 1);
+
+
+
+}
+
+};
+
+/// For now let's start with a full reducer
+/// Self is useless here because in expression construction we are going to treat reduction as a leafnode.
+/// we want to take reduction child and then build a construction and apply the full reducer function on it. Fullreducre applies the
+/// reduction operation on the child of the reduction. once it is done the reduction is an empty shell and can be thrown away and treated as
+// a leafNode.
+template <typename Self, typename Op, bool Vectorizable>
+struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
+
+  typedef typename Self::CoeffReturnType CoeffReturnType;
+  static const bool HasOptimizedImplementation = false;
+
+  static void run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output) {
+    typedef const typename Self::ChildType HostExpr; /// this is the child of reduction
+    typedef  typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
+    auto functors = TensorSycl::internal::extractFunctors(self.impl());
+    int red_factor =256; /// initial reduction. If the size is less than red_factor we only creates one thread.
+    size_t inputSize =self.impl().dimensions().TotalSize();
+    size_t rng = inputSize/red_factor; // the total number of thread initially is half the size of the input
+    size_t remaining = inputSize% red_factor;
+    if(rng ==0) {
+      red_factor=1;
+    };
+    size_t tileSize =dev.m_queue.get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()/2;
+    size_t GRange=std::max((size_t )1, rng);
+
+    // convert global range to power of 2 for redecution
+    GRange--;
+    GRange |= GRange >> 1;
+    GRange |= GRange >> 2;
+    GRange |= GRange >> 4;
+    GRange |= GRange >> 8;
+    GRange |= GRange >> 16;
+#if __x86_64__ || __ppc64__ || _WIN64
+    GRange |= GRange >> 32;
+#endif
+    GRange++;
+    size_t  outTileSize = tileSize;
+    /// if the shared memory is less than the GRange, we set shared_mem size to the TotalSize and in this case one kernel would be created for recursion to reduce all to one.
+    if (GRange < outTileSize) outTileSize=GRange;
+    // getting final out buffer at the moment the created buffer is true because there is no need for assign
+    auto out_buffer =dev.template get_sycl_buffer<typename Eigen::internal::remove_all<CoeffReturnType>::type>(self.dimensions().TotalSize(), output);
+    /// creating the shared memory for calculating reduction.
+    /// This one is used to collect all the reduced value of shared memory as we dont have global barrier on GPU. Once it is saved we can
+    /// recursively apply reduction on it in order to reduce the whole.
+    auto temp_global_buffer =cl::sycl::buffer<CoeffReturnType, 1>(cl::sycl::range<1>(GRange));
+    typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
+    Dims dims= self.xprDims();
+    Op functor = reducer;
+    dev.m_queue.submit([&](cl::sycl::handler &cgh) {
+      // create a tuple of accessors from Evaluator
+      auto tuple_of_accessors =  TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
+      auto tmp_global_accessor = temp_global_buffer. template get_access<cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer>(cgh);
+
+      cgh.parallel_for<PlaceHolderExpr>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)), [=](cl::sycl::nd_item<1> itemID) {
+        typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
+        auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
+        /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour
+        /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the
+        /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here.
+        const auto device_self_expr= TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
+        /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
+        /// the device_evaluator is detectable and recognisable on the device.
+        auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
+        /// const cast added as a naive solution to solve the qualifier drop error
+        auto globalid=itemID.get_global_linear_id();
+
+        if(globalid<rng)
+          tmp_global_accessor.get_pointer()[globalid]=InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor*globalid, red_factor, const_cast<Op&>(functor));
+        else
+          tmp_global_accessor.get_pointer()[globalid]=static_cast<CoeffReturnType>(0);
+
+        if(remaining!=0 && globalid==0 )
+          // this will add the rest of input buffer when the input size is not devidable to red_factor.
+          tmp_global_accessor.get_pointer()[globalid]+=InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor*(rng), remaining, const_cast<Op&>(functor));
+      });
+    });
+  dev.m_queue.throw_asynchronous();
+
+/// This is used to recursively reduce the tmp value to an element of 1;
+  syclGenericBufferReducer<CoeffReturnType,HostExpr>::run(out_buffer, temp_global_buffer,dev, GRange,  outTileSize);
+  }
+
+};
+
+template <typename Self, typename Op>
+struct InnerReducer<Self, Op, const Eigen::SyclDevice> {
+
+  typedef typename Self::CoeffReturnType CoeffReturnType;
+  static const bool HasOptimizedImplementation = false;
+
+  static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index , typename Self::Index num_coeffs_to_preserve) {
+    typedef const typename Self::ChildType HostExpr; /// this is the child of reduction
+    typedef  typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
+    auto functors = TensorSycl::internal::extractFunctors(self.impl());
+
+    size_t tileSize =dev.m_queue.get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()/2;
+
+    size_t GRange=num_coeffs_to_preserve;
+    if (tileSize>GRange) tileSize=GRange;
+    else if(GRange>tileSize){
+      size_t xMode = GRange % tileSize;
+      if (xMode != 0) GRange += (tileSize - xMode);
+    }
+    // getting final out buffer at the moment the created buffer is true because there is no need for assign
+    /// creating the shared memory for calculating reduction.
+    /// This one is used to collect all the reduced value of shared memory as we dont have global barrier on GPU. Once it is saved we can
+    /// recursively apply reduction on it in order to reduce the whole.
+    typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
+    Dims dims= self.xprDims();
+    Op functor = reducer;
+
+    dev.m_queue.submit([&](cl::sycl::handler &cgh) {
+      // create a tuple of accessors from Evaluator
+      auto tuple_of_accessors =  TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
+      auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(num_coeffs_to_preserve,cgh, output);
+
+      cgh.parallel_for<Self>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) {
+        typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
+        auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
+        /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour
+        /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the
+        /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here.
+        const auto device_self_expr= TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
+        /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
+        /// the device_evaluator is detectable and recognisable on the device.
+        typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeiceSelf;
+        auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
+        /// const cast added as a naive solution to solve the qualifier drop error
+        auto globalid=itemID.get_global_linear_id();
+        if (globalid< static_cast<size_t>(num_coeffs_to_preserve)) {
+          typename DeiceSelf::CoeffReturnType accum = functor.initialize();
+          GenericDimReducer<DeiceSelf::NumReducedDims-1, DeiceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(globalid),const_cast<Op&>(functor), &accum);
+          functor.finalize(accum);
+          output_accessor.get_pointer()[globalid]= accum;
+        }
+      });
+    });
+  dev.m_queue.throw_asynchronous();
+    return false;
+  }
+};
+
+}  // end namespace internal
+}  // namespace Eigen
+
+#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
new file mode 100644
index 000000000..99245f778
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
@@ -0,0 +1,429 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_REF_H
+#define EIGEN_CXX11_TENSOR_TENSOR_REF_H
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename Dimensions, typename Scalar>
+class TensorLazyBaseEvaluator {
+ public:
+  TensorLazyBaseEvaluator() : m_refcount(0) { }
+  virtual ~TensorLazyBaseEvaluator() { }
+
+  EIGEN_DEVICE_FUNC virtual const Dimensions& dimensions() const = 0;
+  EIGEN_DEVICE_FUNC virtual const Scalar* data() const = 0;
+
+  EIGEN_DEVICE_FUNC virtual const Scalar coeff(DenseIndex index) const = 0;
+  EIGEN_DEVICE_FUNC virtual Scalar& coeffRef(DenseIndex index) = 0;
+
+  void incrRefCount() { ++m_refcount; }
+  void decrRefCount() { --m_refcount; }
+  int refCount() const { return m_refcount; }
+
+ private:
+  // No copy, no assigment;
+  TensorLazyBaseEvaluator(const TensorLazyBaseEvaluator& other);
+  TensorLazyBaseEvaluator& operator = (const TensorLazyBaseEvaluator& other);
+
+  int m_refcount;
+};
+
+
+template <typename Dimensions, typename Expr, typename Device>
+class TensorLazyEvaluatorReadOnly : public TensorLazyBaseEvaluator<Dimensions, typename TensorEvaluator<Expr, Device>::Scalar> {
+ public:
+  //  typedef typename TensorEvaluator<Expr, Device>::Dimensions Dimensions;
+  typedef typename TensorEvaluator<Expr, Device>::Scalar Scalar;
+
+  TensorLazyEvaluatorReadOnly(const Expr& expr, const Device& device) : m_impl(expr, device), m_dummy(Scalar(0)) {
+    m_dims = m_impl.dimensions();
+    m_impl.evalSubExprsIfNeeded(NULL);
+  }
+  virtual ~TensorLazyEvaluatorReadOnly() {
+    m_impl.cleanup();
+  }
+
+  EIGEN_DEVICE_FUNC virtual const Dimensions& dimensions() const {
+    return m_dims;
+  }
+  EIGEN_DEVICE_FUNC virtual const Scalar* data() const {
+    return m_impl.data();
+  }
+
+  EIGEN_DEVICE_FUNC virtual const Scalar coeff(DenseIndex index) const {
+    return m_impl.coeff(index);
+  }
+  EIGEN_DEVICE_FUNC virtual Scalar& coeffRef(DenseIndex /*index*/) {
+    eigen_assert(false && "can't reference the coefficient of a rvalue");
+    return m_dummy;
+  };
+
+ protected:
+  TensorEvaluator<Expr, Device> m_impl;
+  Dimensions m_dims;
+  Scalar m_dummy;
+};
+
+template <typename Dimensions, typename Expr, typename Device>
+class TensorLazyEvaluatorWritable : public TensorLazyEvaluatorReadOnly<Dimensions, Expr, Device> {
+ public:
+  typedef TensorLazyEvaluatorReadOnly<Dimensions, Expr, Device> Base;
+  typedef typename Base::Scalar Scalar;
+
+  TensorLazyEvaluatorWritable(const Expr& expr, const Device& device) : Base(expr, device) {
+  }
+  virtual ~TensorLazyEvaluatorWritable() {
+  }
+
+  EIGEN_DEVICE_FUNC virtual Scalar& coeffRef(DenseIndex index) {
+    return this->m_impl.coeffRef(index);
+  }
+};
+
+template <typename Dimensions, typename Expr, typename Device>
+class TensorLazyEvaluator : public internal::conditional<bool(internal::is_lvalue<Expr>::value),
+                            TensorLazyEvaluatorWritable<Dimensions, Expr, Device>,
+                            TensorLazyEvaluatorReadOnly<Dimensions, const Expr, Device> >::type {
+ public:
+  typedef typename internal::conditional<bool(internal::is_lvalue<Expr>::value),
+                                         TensorLazyEvaluatorWritable<Dimensions, Expr, Device>,
+                                         TensorLazyEvaluatorReadOnly<Dimensions, const Expr, Device> >::type Base;
+  typedef typename Base::Scalar Scalar;
+
+  TensorLazyEvaluator(const Expr& expr, const Device& device) : Base(expr, device) {
+  }
+  virtual ~TensorLazyEvaluator() {
+  }
+};
+
+}  // namespace internal
+
+
+/** \class TensorRef
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief A reference to a tensor expression
+  * The expression will be evaluated lazily (as much as possible).
+  *
+  */
+template<typename PlainObjectType> class TensorRef : public TensorBase<TensorRef<PlainObjectType> >
+{
+  public:
+    typedef TensorRef<PlainObjectType> Self;
+    typedef typename PlainObjectType::Base Base;
+    typedef typename Eigen::internal::nested<Self>::type Nested;
+    typedef typename internal::traits<PlainObjectType>::StorageKind StorageKind;
+    typedef typename internal::traits<PlainObjectType>::Index Index;
+    typedef typename internal::traits<PlainObjectType>::Scalar Scalar;
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+    typedef typename Base::CoeffReturnType CoeffReturnType;
+    typedef Scalar* PointerType;
+    typedef PointerType PointerArgType;
+
+    static const Index NumIndices = PlainObjectType::NumIndices;
+    typedef typename PlainObjectType::Dimensions Dimensions;
+
+    enum {
+      IsAligned = false,
+      PacketAccess = false,
+      Layout = PlainObjectType::Layout,
+      CoordAccess = false,  // to be implemented
+      RawAccess = false
+    };
+
+    EIGEN_STRONG_INLINE TensorRef() : m_evaluator(NULL) {
+    }
+
+    template <typename Expression>
+    EIGEN_STRONG_INLINE TensorRef(const Expression& expr) : m_evaluator(new internal::TensorLazyEvaluator<Dimensions, Expression, DefaultDevice>(expr, DefaultDevice())) {
+      m_evaluator->incrRefCount();
+    }
+
+    template <typename Expression>
+    EIGEN_STRONG_INLINE TensorRef& operator = (const Expression& expr) {
+      unrefEvaluator();
+      m_evaluator = new internal::TensorLazyEvaluator<Dimensions, Expression, DefaultDevice>(expr, DefaultDevice());
+      m_evaluator->incrRefCount();
+      return *this;
+    }
+
+    ~TensorRef() {
+      unrefEvaluator();
+    }
+
+    TensorRef(const TensorRef& other) : m_evaluator(other.m_evaluator) {
+      eigen_assert(m_evaluator->refCount() > 0);
+      m_evaluator->incrRefCount();
+    }
+
+    TensorRef& operator = (const TensorRef& other) {
+      if (this != &other) {
+        unrefEvaluator();
+        m_evaluator = other.m_evaluator;
+        eigen_assert(m_evaluator->refCount() > 0);
+        m_evaluator->incrRefCount();
+      }
+      return *this;
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Index rank() const { return m_evaluator->dimensions().size(); }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_evaluator->dimensions()[n]; }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_evaluator->dimensions(); }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Index size() const { return m_evaluator->dimensions().TotalSize(); }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar* data() const { return m_evaluator->data(); }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar operator()(Index index) const
+    {
+      return m_evaluator->coeff(index);
+    }
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+    template<typename... IndexTypes> EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar operator()(Index firstIndex, IndexTypes... otherIndices) const
+    {
+      const std::size_t num_indices = (sizeof...(otherIndices) + 1);
+      const array<Index, num_indices> indices{{firstIndex, otherIndices...}};
+      return coeff(indices);
+    }
+    template<typename... IndexTypes> EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices)
+    {
+      const std::size_t num_indices = (sizeof...(otherIndices) + 1);
+      const array<Index, num_indices> indices{{firstIndex, otherIndices...}};
+      return coeffRef(indices);
+    }
+#else
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1) const
+    {
+      array<Index, 2> indices;
+      indices[0] = i0;
+      indices[1] = i1;
+      return coeff(indices);
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1, Index i2) const
+    {
+      array<Index, 3> indices;
+      indices[0] = i0;
+      indices[1] = i1;
+      indices[2] = i2;
+      return coeff(indices);
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1, Index i2, Index i3) const
+    {
+      array<Index, 4> indices;
+      indices[0] = i0;
+      indices[1] = i1;
+      indices[2] = i2;
+      indices[3] = i3;
+      return coeff(indices);
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const
+    {
+      array<Index, 5> indices;
+      indices[0] = i0;
+      indices[1] = i1;
+      indices[2] = i2;
+      indices[3] = i3;
+      indices[4] = i4;
+      return coeff(indices);
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar& coeffRef(Index i0, Index i1)
+    {
+      array<Index, 2> indices;
+      indices[0] = i0;
+      indices[1] = i1;
+      return coeffRef(indices);
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar& coeffRef(Index i0, Index i1, Index i2)
+    {
+      array<Index, 3> indices;
+      indices[0] = i0;
+      indices[1] = i1;
+      indices[2] = i2;
+      return coeffRef(indices);
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3)
+    {
+      array<Index, 4> indices;
+      indices[0] = i0;
+      indices[1] = i1;
+      indices[2] = i2;
+      indices[3] = i3;
+      return coeffRef(indices);
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar& coeffRef(Index i0, Index i1, Index i2, Index i3, Index i4)
+    {
+      array<Index, 5> indices;
+      indices[0] = i0;
+      indices[1] = i1;
+      indices[2] = i2;
+      indices[3] = i3;
+      indices[4] = i4;
+      return coeffRef(indices);
+    }
+#endif
+
+    template <std::size_t NumIndices> EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar coeff(const array<Index, NumIndices>& indices) const
+    {
+      const Dimensions& dims = this->dimensions();
+      Index index = 0;
+      if (PlainObjectType::Options & RowMajor) {
+        index += indices[0];
+        for (size_t i = 1; i < NumIndices; ++i) {
+          index = index * dims[i] + indices[i];
+        }
+      } else {
+        index += indices[NumIndices-1];
+        for (int i = NumIndices-2; i >= 0; --i) {
+          index = index * dims[i] + indices[i];
+        }
+      }
+      return m_evaluator->coeff(index);
+    }
+    template <std::size_t NumIndices> EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar& coeffRef(const array<Index, NumIndices>& indices)
+    {
+      const Dimensions& dims = this->dimensions();
+      Index index = 0;
+      if (PlainObjectType::Options & RowMajor) {
+        index += indices[0];
+        for (size_t i = 1; i < NumIndices; ++i) {
+          index = index * dims[i] + indices[i];
+        }
+      } else {
+        index += indices[NumIndices-1];
+        for (int i = NumIndices-2; i >= 0; --i) {
+          index = index * dims[i] + indices[i];
+        }
+      }
+      return m_evaluator->coeffRef(index);
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
+    {
+      return m_evaluator->coeff(index);
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
+    {
+      return m_evaluator->coeffRef(index);
+    }
+
+  private:
+    EIGEN_STRONG_INLINE void unrefEvaluator() {
+      if (m_evaluator) {
+        m_evaluator->decrRefCount();
+        if (m_evaluator->refCount() == 0) {
+          delete m_evaluator;
+        }
+      }
+    }
+
+  internal::TensorLazyBaseEvaluator<Dimensions, Scalar>* m_evaluator;
+};
+
+
+// evaluator for rvalues
+template<typename Derived, typename Device>
+struct TensorEvaluator<const TensorRef<Derived>, Device>
+{
+  typedef typename Derived::Index Index;
+  typedef typename Derived::Scalar Scalar;
+  typedef typename Derived::Scalar CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef typename Derived::Dimensions Dimensions;
+
+  enum {
+    IsAligned = false,
+    PacketAccess = false,
+    Layout = TensorRef<Derived>::Layout,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const TensorRef<Derived>& m, const Device&)
+      : m_ref(m)
+  { }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_ref.dimensions(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+    return true;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    return m_ref.coeff(index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
+    return m_ref.coeffRef(index);
+  }
+
+  EIGEN_DEVICE_FUNC Scalar* data() const { return m_ref.data(); }
+
+ protected:
+  TensorRef<Derived> m_ref;
+};
+
+
+// evaluator for lvalues
+template<typename Derived, typename Device>
+struct TensorEvaluator<TensorRef<Derived>, Device> : public TensorEvaluator<const TensorRef<Derived>, Device>
+{
+  typedef typename Derived::Index Index;
+  typedef typename Derived::Scalar Scalar;
+  typedef typename Derived::Scalar CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef typename Derived::Dimensions Dimensions;
+
+  typedef TensorEvaluator<const TensorRef<Derived>, Device> Base;
+
+  enum {
+    IsAligned = false,
+    PacketAccess = false,
+    RawAccess = false
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(TensorRef<Derived>& m, const Device& d) : Base(m, d)
+  { }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
+    return this->m_ref.coeffRef(index);
+  }
+};
+
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_REF_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
new file mode 100644
index 000000000..14e392e36
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
@@ -0,0 +1,288 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Navdeep Jaitly <ndjaitly@google.com>
+//                    Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H
+#define EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H
+namespace Eigen {
+
+/** \class TensorReverse
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief Tensor reverse elements class.
+  *
+  */
+namespace internal {
+template<typename ReverseDimensions, typename XprType>
+struct traits<TensorReverseOp<ReverseDimensions,
+                              XprType> > : public traits<XprType>
+{
+  typedef typename XprType::Scalar Scalar;
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef typename XprType::Nested Nested;
+  typedef typename remove_reference<Nested>::type _Nested;
+  static const int NumDimensions = XprTraits::NumDimensions;
+  static const int Layout = XprTraits::Layout;
+};
+
+template<typename ReverseDimensions, typename XprType>
+struct eval<TensorReverseOp<ReverseDimensions, XprType>, Eigen::Dense>
+{
+  typedef const TensorReverseOp<ReverseDimensions, XprType>& type;
+};
+
+template<typename ReverseDimensions, typename XprType>
+struct nested<TensorReverseOp<ReverseDimensions, XprType>, 1,
+            typename eval<TensorReverseOp<ReverseDimensions, XprType> >::type>
+{
+  typedef TensorReverseOp<ReverseDimensions, XprType> type;
+};
+
+}  // end namespace internal
+
+template<typename ReverseDimensions, typename XprType>
+class TensorReverseOp : public TensorBase<TensorReverseOp<ReverseDimensions,
+                                          XprType>, WriteAccessors>
+{
+  public:
+  typedef typename Eigen::internal::traits<TensorReverseOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorReverseOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorReverseOp>::StorageKind
+                                                                    StorageKind;
+  typedef typename Eigen::internal::traits<TensorReverseOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReverseOp(
+      const XprType& expr, const ReverseDimensions& reverse_dims)
+      : m_xpr(expr), m_reverse_dims(reverse_dims) { }
+
+    EIGEN_DEVICE_FUNC
+    const ReverseDimensions& reverse() const { return m_reverse_dims; }
+
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename XprType::Nested>::type&
+    expression() const { return m_xpr; }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE TensorReverseOp& operator = (const TensorReverseOp& other)
+    {
+      typedef TensorAssignOp<TensorReverseOp, const TensorReverseOp> Assign;
+      Assign assign(*this, other);
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+      return *this;
+    }
+
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE TensorReverseOp& operator = (const OtherDerived& other)
+    {
+      typedef TensorAssignOp<TensorReverseOp, const OtherDerived> Assign;
+      Assign assign(*this, other);
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+      return *this;
+    }
+
+  protected:
+    typename XprType::Nested m_xpr;
+    const ReverseDimensions m_reverse_dims;
+};
+
+// Eval as rvalue
+template<typename ReverseDimensions, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device>
+{
+  typedef TensorReverseOp<ReverseDimensions, ArgType> XprType;
+  typedef typename XprType::Index Index;
+  static const int NumDims = internal::array_size<ReverseDimensions>::value;
+  typedef DSizes<Index, NumDims> Dimensions;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+
+  enum {
+    IsAligned = false,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
+                                                        const Device& device)
+      : m_impl(op.expression(), device), m_reverse(op.reverse())
+  {
+    // Reversing a scalar isn't supported yet. It would be a no-op anyway.
+    EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+    // Compute strides
+    m_dimensions = m_impl.dimensions();
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      m_strides[0] = 1;
+      for (int i = 1; i < NumDims; ++i) {
+        m_strides[i] = m_strides[i-1] * m_dimensions[i-1];
+      }
+    } else {
+      m_strides[NumDims-1] = 1;
+      for (int i = NumDims - 2; i >= 0; --i) {
+        m_strides[i] = m_strides[i+1] * m_dimensions[i+1];
+      }
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const Dimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+    m_impl.cleanup();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index reverseIndex(
+      Index index) const {
+    eigen_assert(index < dimensions().TotalSize());
+    Index inputIndex = 0;
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = NumDims - 1; i > 0; --i) {
+        Index idx = index / m_strides[i];
+        index -= idx * m_strides[i];
+        if (m_reverse[i]) {
+          idx = m_dimensions[i] - idx - 1;
+        }
+        inputIndex += idx * m_strides[i] ;
+      }
+      if (m_reverse[0]) {
+        inputIndex += (m_dimensions[0] - index - 1);
+      } else {
+        inputIndex += index;
+      }
+    } else {
+      for (int i = 0; i < NumDims - 1; ++i) {
+        Index idx = index / m_strides[i];
+        index -= idx * m_strides[i];
+        if (m_reverse[i]) {
+          idx = m_dimensions[i] - idx - 1;
+        }
+        inputIndex += idx * m_strides[i] ;
+      }
+      if (m_reverse[NumDims-1]) {
+        inputIndex += (m_dimensions[NumDims-1] - index - 1);
+      } else {
+        inputIndex += index;
+      }
+    }
+    return inputIndex;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(
+      Index index) const  {
+    return m_impl.coeff(reverseIndex(index));
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  PacketReturnType packet(Index index) const
+  {
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
+
+    // TODO(ndjaitly): write a better packing routine that uses
+    // local structure.
+    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type
+                                                            values[PacketSize];
+    for (int i = 0; i < PacketSize; ++i) {
+      values[i] = coeff(index+i);
+    }
+    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+    return rslt;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() +
+                                     2 * TensorOpCost::MulCost<Index>() +
+                                     TensorOpCost::DivCost<Index>());
+    for (int i = 0; i < NumDims; ++i) {
+      if (m_reverse[i]) {
+        compute_cost += 2 * TensorOpCost::AddCost<Index>();
+      }
+    }
+    return m_impl.costPerCoeff(vectorized) +
+           TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize);
+  }
+
+  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+
+ protected:
+  Dimensions m_dimensions;
+  array<Index, NumDims> m_strides;
+  TensorEvaluator<ArgType, Device> m_impl;
+  ReverseDimensions m_reverse;
+};
+
+// Eval as lvalue
+
+template <typename ReverseDimensions, typename ArgType, typename Device>
+struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
+    : public TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>,
+                             Device> {
+  typedef TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>,
+                          Device> Base;
+  typedef TensorReverseOp<ReverseDimensions, ArgType> XprType;
+  typedef typename XprType::Index Index;
+  static const int NumDims = internal::array_size<ReverseDimensions>::value;
+  typedef DSizes<Index, NumDims> Dimensions;
+
+  enum {
+    IsAligned = false,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
+                                                        const Device& device)
+      : Base(op, device) {}
+
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const Dimensions& dimensions() const { return this->m_dimensions; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
+    return this->m_impl.coeffRef(this->reverseIndex(index));
+  }
+
+  template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void writePacket(Index index, const PacketReturnType& x) {
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
+
+    // This code is pilfered from TensorMorphing.h
+    EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize];
+    internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
+    for (int i = 0; i < PacketSize; ++i) {
+      this->coeffRef(index+i) = values[i];
+    }
+  }
+
+};
+
+
+}  // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
new file mode 100644
index 000000000..8501466ce
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
@@ -0,0 +1,287 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Igor Babuschkin <igor@babuschk.in>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
+#define EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename Op, typename XprType>
+struct traits<TensorScanOp<Op, XprType> >
+    : public traits<XprType> {
+  typedef typename XprType::Scalar Scalar;
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprType::Nested Nested;
+  typedef typename remove_reference<Nested>::type _Nested;
+  static const int NumDimensions = XprTraits::NumDimensions;
+  static const int Layout = XprTraits::Layout;
+};
+
+template<typename Op, typename XprType>
+struct eval<TensorScanOp<Op, XprType>, Eigen::Dense>
+{
+  typedef const TensorScanOp<Op, XprType>& type;
+};
+
+template<typename Op, typename XprType>
+struct nested<TensorScanOp<Op, XprType>, 1,
+            typename eval<TensorScanOp<Op, XprType> >::type>
+{
+  typedef TensorScanOp<Op, XprType> type;
+};
+} // end namespace internal
+
+/** \class TensorScan
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief Tensor scan class.
+  */
+template <typename Op, typename XprType>
+class TensorScanOp
+    : public TensorBase<TensorScanOp<Op, XprType>, ReadOnlyAccessors> {
+public:
+  typedef typename Eigen::internal::traits<TensorScanOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorScanOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorScanOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorScanOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorScanOp(
+      const XprType& expr, const Index& axis, bool exclusive = false, const Op& op = Op())
+      : m_expr(expr), m_axis(axis), m_accumulator(op), m_exclusive(exclusive) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const Index axis() const { return m_axis; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const XprType& expression() const { return m_expr; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const Op accumulator() const { return m_accumulator; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  bool exclusive() const { return m_exclusive; }
+
+protected:
+  typename XprType::Nested m_expr;
+  const Index m_axis;
+  const Op m_accumulator;
+  const bool m_exclusive;
+};
+
+template <typename Self, typename Reducer, typename Device>
+struct ScanLauncher;
+
+// Eval as rvalue
+template <typename Op, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
+
+  typedef TensorScanOp<Op, ArgType> XprType;
+  typedef typename XprType::Index Index;
+  static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+  typedef DSizes<Index, NumDims> Dimensions;
+  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> Self;
+
+  enum {
+    IsAligned = false,
+    PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
+    BlockAccess = false,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,
+    RawAccess = true
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
+                                                        const Device& device)
+      : m_impl(op.expression(), device),
+        m_device(device),
+        m_exclusive(op.exclusive()),
+        m_accumulator(op.accumulator()),
+        m_size(m_impl.dimensions()[op.axis()]),
+        m_stride(1),
+        m_output(NULL) {
+
+    // Accumulating a scalar isn't supported.
+    EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    eigen_assert(op.axis() >= 0 && op.axis() < NumDims);
+
+    // Compute stride of scan axis
+    const Dimensions& dims = m_impl.dimensions();
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = 0; i < op.axis(); ++i) {
+        m_stride = m_stride * dims[i];
+      }
+    } else {
+      for (int i = NumDims - 1; i > op.axis(); --i) {
+        m_stride = m_stride * dims[i];
+      }
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
+    return m_impl.dimensions();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride() const {
+    return m_stride;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& size() const {
+    return m_size;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& accumulator() const {
+    return m_accumulator;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool exclusive() const {
+    return m_exclusive;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& inner() const {
+    return m_impl;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const {
+    return m_device;
+  }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    ScanLauncher<Self, Op, Device> launcher;
+    if (data) {
+      launcher(*this, data);
+      return false;
+    }
+
+    const Index total_size = internal::array_prod(dimensions());
+    m_output = static_cast<CoeffReturnType*>(m_device.allocate(total_size * sizeof(Scalar)));
+    launcher(*this, m_output);
+    return true;
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
+    return internal::ploadt<PacketReturnType, LoadMode>(m_output + index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const
+  {
+    return m_output;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+  {
+    return m_output[index];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const {
+    return TensorOpCost(sizeof(CoeffReturnType), 0, 0);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+    if (m_output != NULL) {
+      m_device.deallocate(m_output);
+      m_output = NULL;
+    }
+    m_impl.cleanup();
+  }
+
+protected:
+  TensorEvaluator<ArgType, Device> m_impl;
+  const Device& m_device;
+  const bool m_exclusive;
+  Op m_accumulator;
+  const Index m_size;
+  Index m_stride;
+  CoeffReturnType* m_output;
+};
+
+// CPU implementation of scan
+// TODO(ibab) This single-threaded implementation should be parallelized,
+// at least by running multiple scans at the same time.
+template <typename Self, typename Reducer, typename Device>
+struct ScanLauncher {
+  void operator()(Self& self, typename Self::CoeffReturnType *data) {
+    Index total_size = internal::array_prod(self.dimensions());
+
+    // We fix the index along the scan axis to 0 and perform a
+    // scan per remaining entry. The iteration is split into two nested
+    // loops to avoid an integer division by keeping track of each idx1 and idx2.
+    for (Index idx1 = 0; idx1 < total_size; idx1 += self.stride() * self.size()) {
+      for (Index idx2 = 0; idx2 < self.stride(); idx2++) {
+        // Calculate the starting offset for the scan
+        Index offset = idx1 + idx2;
+
+        // Compute the scan along the axis, starting at the calculated offset
+        typename Self::CoeffReturnType accum = self.accumulator().initialize();
+        for (Index idx3 = 0; idx3 < self.size(); idx3++) {
+          Index curr = offset + idx3 * self.stride();
+
+          if (self.exclusive()) {
+            data[curr] = self.accumulator().finalize(accum);
+            self.accumulator().reduce(self.inner().coeff(curr), &accum);
+          } else {
+            self.accumulator().reduce(self.inner().coeff(curr), &accum);
+            data[curr] = self.accumulator().finalize(accum);
+          }
+        }
+      }
+    }
+  }
+};
+
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+
+// GPU implementation of scan
+// TODO(ibab) This placeholder implementation performs multiple scans in
+// parallel, but it would be better to use a parallel scan algorithm and
+// optimize memory access.
+template <typename Self, typename Reducer>
+__global__ void ScanKernel(Self self, Index total_size, typename Self::CoeffReturnType* data) {
+  // Compute offset as in the CPU version
+  Index val = threadIdx.x + blockIdx.x * blockDim.x;
+  Index offset = (val / self.stride()) * self.stride() * self.size() + val % self.stride();
+
+  if (offset + (self.size() - 1) * self.stride() < total_size) {
+    // Compute the scan along the axis, starting at the calculated offset
+    typename Self::CoeffReturnType accum = self.accumulator().initialize();
+    for (Index idx = 0; idx < self.size(); idx++) {
+      Index curr = offset + idx * self.stride();
+      if (self.exclusive()) {
+        data[curr] = self.accumulator().finalize(accum);
+        self.accumulator().reduce(self.inner().coeff(curr), &accum);
+      } else {
+        self.accumulator().reduce(self.inner().coeff(curr), &accum);
+        data[curr] = self.accumulator().finalize(accum);
+      }
+    }
+  }
+  __syncthreads();
+
+}
+
+template <typename Self, typename Reducer>
+struct ScanLauncher<Self, Reducer, GpuDevice> {
+  void operator()(const Self& self, typename Self::CoeffReturnType* data) {
+     Index total_size = internal::array_prod(self.dimensions());
+     Index num_blocks = (total_size / self.size() + 63) / 64;
+     Index block_size = 64;
+     LAUNCH_CUDA_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
+  }
+};
+#endif  // EIGEN_USE_GPU && __CUDACC__
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
new file mode 100644
index 000000000..113c060e3
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
@@ -0,0 +1,264 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H
+#define EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H
+
+namespace Eigen {
+
+/** \class TensorShuffling
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief Tensor shuffling class.
+  *
+  *
+  */
+namespace internal {
+template<typename Shuffle, typename XprType>
+struct traits<TensorShufflingOp<Shuffle, XprType> > : public traits<XprType>
+{
+  typedef typename XprType::Scalar Scalar;
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef typename XprType::Nested Nested;
+  typedef typename remove_reference<Nested>::type _Nested;
+  static const int NumDimensions = XprTraits::NumDimensions;
+  static const int Layout = XprTraits::Layout;
+};
+
+template<typename Shuffle, typename XprType>
+struct eval<TensorShufflingOp<Shuffle, XprType>, Eigen::Dense>
+{
+  typedef const TensorShufflingOp<Shuffle, XprType>& type;
+};
+
+template<typename Shuffle, typename XprType>
+struct nested<TensorShufflingOp<Shuffle, XprType>, 1, typename eval<TensorShufflingOp<Shuffle, XprType> >::type>
+{
+  typedef TensorShufflingOp<Shuffle, XprType> type;
+};
+
+}  // end namespace internal
+
+
+
+template<typename Shuffle, typename XprType>
+class TensorShufflingOp : public TensorBase<TensorShufflingOp<Shuffle, XprType> >
+{
+  public:
+  typedef typename Eigen::internal::traits<TensorShufflingOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorShufflingOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorShufflingOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorShufflingOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp(const XprType& expr, const Shuffle& shuffle)
+      : m_xpr(expr), m_shuffle(shuffle) {}
+
+    EIGEN_DEVICE_FUNC
+    const Shuffle& shufflePermutation() const { return m_shuffle; }
+
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename XprType::Nested>::type&
+    expression() const { return m_xpr; }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE TensorShufflingOp& operator = (const TensorShufflingOp& other)
+    {
+      typedef TensorAssignOp<TensorShufflingOp, const TensorShufflingOp> Assign;
+      Assign assign(*this, other);
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+      return *this;
+    }
+
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE TensorShufflingOp& operator = (const OtherDerived& other)
+    {
+      typedef TensorAssignOp<TensorShufflingOp, const OtherDerived> Assign;
+      Assign assign(*this, other);
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+      return *this;
+    }
+
+  protected:
+    typename XprType::Nested m_xpr;
+    const Shuffle m_shuffle;
+};
+
+
+// Eval as rvalue
+template<typename Shuffle, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
+{
+  typedef TensorShufflingOp<Shuffle, ArgType> XprType;
+  typedef typename XprType::Index Index;
+  static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+  typedef DSizes<Index, NumDims> Dimensions;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+
+  enum {
+    IsAligned = false,
+    PacketAccess = (internal::packet_traits<Scalar>::size > 1),
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device)
+  {
+    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+    const Shuffle& shuffle = op.shufflePermutation();
+    for (int i = 0; i < NumDims; ++i) {
+      m_dimensions[i] = input_dims[shuffle[i]];
+    }
+
+    array<Index, NumDims> inputStrides;
+
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      inputStrides[0] = 1;
+      m_outputStrides[0] = 1;
+      for (int i = 1; i < NumDims; ++i) {
+        inputStrides[i] = inputStrides[i - 1] * input_dims[i - 1];
+        m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
+      }
+    } else {
+      inputStrides[NumDims - 1] = 1;
+      m_outputStrides[NumDims - 1] = 1;
+      for (int i = NumDims - 2; i >= 0; --i) {
+        inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
+        m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
+      }
+    }
+
+    for (int i = 0; i < NumDims; ++i) {
+      m_inputStrides[i] = inputStrides[shuffle[i]];
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+    m_impl.cleanup();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+  {
+    return m_impl.coeff(srcCoeff(index));
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+  {
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
+
+    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+    for (int i = 0; i < PacketSize; ++i) {
+      values[i] = coeff(index+i);
+    }
+    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+    return rslt;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    const double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() +
+                                           2 * TensorOpCost::MulCost<Index>() +
+                                           TensorOpCost::DivCost<Index>());
+    return m_impl.costPerCoeff(vectorized) +
+           TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize);
+  }
+
+  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+
+ protected:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const {
+    Index inputIndex = 0;
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = NumDims - 1; i > 0; --i) {
+        const Index idx = index / m_outputStrides[i];
+        inputIndex += idx * m_inputStrides[i];
+        index -= idx * m_outputStrides[i];
+      }
+      return inputIndex + index * m_inputStrides[0];
+    } else {
+      for (int i = 0; i < NumDims - 1; ++i) {
+        const Index idx = index / m_outputStrides[i];
+        inputIndex += idx * m_inputStrides[i];
+        index -= idx * m_outputStrides[i];
+      }
+      return inputIndex + index * m_inputStrides[NumDims - 1];
+    }
+  }
+
+  Dimensions m_dimensions;
+  array<Index, NumDims> m_outputStrides;
+  array<Index, NumDims> m_inputStrides;
+  TensorEvaluator<ArgType, Device> m_impl;
+};
+
+
+// Eval as lvalue
+template<typename Shuffle, typename ArgType, typename Device>
+struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
+    : public TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
+{
+  typedef TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> Base;
+
+  typedef TensorShufflingOp<Shuffle, ArgType> XprType;
+  typedef typename XprType::Index Index;
+  static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+  typedef DSizes<Index, NumDims> Dimensions;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+
+  enum {
+    IsAligned = false,
+    PacketAccess = (internal::packet_traits<Scalar>::size > 1),
+    RawAccess = false
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : Base(op, device)
+  { }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
+  {
+    return this->m_impl.coeffRef(this->srcCoeff(index));
+  }
+
+  template <int StoreMode> EIGEN_STRONG_INLINE
+  void writePacket(Index index, const PacketReturnType& x)
+  {
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+
+    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+    internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
+    for (int i = 0; i < PacketSize; ++i) {
+      this->coeffRef(index+i) = values[i];
+    }
+  }
+};
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
new file mode 100644
index 000000000..2854a4a17
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
@@ -0,0 +1,146 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
+// Copyright (C) 2014-2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSORSTORAGE_H
+#define EIGEN_CXX11_TENSOR_TENSORSTORAGE_H
+
+#ifdef EIGEN_TENSOR_STORAGE_CTOR_PLUGIN
+  #define EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN EIGEN_TENSOR_STORAGE_CTOR_PLUGIN;
+#else
+  #define EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN
+#endif
+
+namespace Eigen {
+
+/** \internal
+  *
+  * \class TensorStorage
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief Stores the data of a tensor
+  *
+  * This class stores the data of fixed-size, dynamic-size or mixed tensors
+  * in a way as compact as possible.
+  *
+  * \sa Tensor
+  */
+template<typename T, typename Dimensions, int Options_> class TensorStorage;
+
+
+// Pure fixed-size storage
+template<typename T, int Options_, typename FixedDimensions>
+class TensorStorage<T, FixedDimensions, Options_>
+{
+ private:
+  static const std::size_t Size = FixedDimensions::total_size;
+
+  // Allocate an array of size at least one to prevent compiler warnings.
+  static const std::size_t MinSize = max_n_1<Size>::size;
+  EIGEN_ALIGN_MAX T m_data[MinSize];
+
+  FixedDimensions m_dimensions;
+
+ public:
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE TensorStorage() {
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE T *data() { return m_data; }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE const T *data() const { return m_data; }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE const FixedDimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE DenseIndex size() const { return m_dimensions.TotalSize(); }
+};
+
+
+// pure dynamic
+template<typename T, int Options_, typename IndexType, int NumIndices_>
+class TensorStorage<T, DSizes<IndexType, NumIndices_>, Options_>
+{
+  public:
+    typedef IndexType Index;
+    typedef DSizes<IndexType, NumIndices_> Dimensions;
+    typedef TensorStorage<T, DSizes<IndexType, NumIndices_>, Options_> Self;
+
+    EIGEN_DEVICE_FUNC TensorStorage() : m_data(0), m_dimensions() {
+      if (NumIndices_ == 0) {
+	m_data = internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(1);
+      }
+    }
+    EIGEN_DEVICE_FUNC TensorStorage(internal::constructor_without_unaligned_array_assert)
+      : m_data(0), m_dimensions(internal::template repeat<NumIndices_, Index>(0)) {}
+    EIGEN_DEVICE_FUNC TensorStorage(Index size, const array<Index, NumIndices_>& dimensions)
+        : m_data(internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(size)), m_dimensions(dimensions)
+      { EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN }
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+    template <typename... DenseIndex>
+    EIGEN_DEVICE_FUNC TensorStorage(DenseIndex... indices) : m_dimensions(indices...) {
+      m_data = internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(internal::array_prod(m_dimensions));
+    }
+#endif
+
+    EIGEN_DEVICE_FUNC TensorStorage(const Self& other)
+      : m_data(internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(internal::array_prod(other.m_dimensions)))
+      , m_dimensions(other.m_dimensions)
+    {
+      internal::smart_copy(other.m_data, other.m_data+internal::array_prod(other.m_dimensions), m_data);
+    }
+    EIGEN_DEVICE_FUNC Self& operator=(const Self& other)
+    {
+      if (this != &other) {
+        Self tmp(other);
+        this->swap(tmp);
+      }
+      return *this;
+    }
+
+    EIGEN_DEVICE_FUNC  ~TensorStorage() { internal::conditional_aligned_delete_auto<T,(Options_&DontAlign)==0>(m_data, internal::array_prod(m_dimensions)); }
+    EIGEN_DEVICE_FUNC  void swap(Self& other)
+    { numext::swap(m_data,other.m_data); numext::swap(m_dimensions,other.m_dimensions); }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {return m_dimensions;}
+
+    EIGEN_DEVICE_FUNC void resize(Index size, const array<Index, NumIndices_>& nbDimensions)
+    {
+      const Index currentSz = internal::array_prod(m_dimensions);
+      if(size != currentSz)
+      {
+        internal::conditional_aligned_delete_auto<T,(Options_&DontAlign)==0>(m_data, currentSz);
+        if (size)
+          m_data = internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(size);
+        else if (NumIndices_ == 0) {
+	  m_data = internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(1);
+	}
+	else 
+          m_data = 0;
+        EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
+      }
+      m_dimensions = nbDimensions;
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T *data() { return m_data; }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T *data() const { return m_data; }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_dimensions.TotalSize(); }
+
+ private:
+  T *m_data;
+  Dimensions m_dimensions;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSORSTORAGE_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
new file mode 100644
index 000000000..6c35bfdb6
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
@@ -0,0 +1,338 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H
+#define EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H
+
+namespace Eigen {
+
+/** \class TensorStriding
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief Tensor striding class.
+  *
+  *
+  */
+namespace internal {
+template<typename Strides, typename XprType>
+struct traits<TensorStridingOp<Strides, XprType> > : public traits<XprType>
+{
+  typedef typename XprType::Scalar Scalar;
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef typename XprType::Nested Nested;
+  typedef typename remove_reference<Nested>::type _Nested;
+  static const int NumDimensions = XprTraits::NumDimensions;
+  static const int Layout = XprTraits::Layout;
+};
+
+template<typename Strides, typename XprType>
+struct eval<TensorStridingOp<Strides, XprType>, Eigen::Dense>
+{
+  typedef const TensorStridingOp<Strides, XprType>& type;
+};
+
+template<typename Strides, typename XprType>
+struct nested<TensorStridingOp<Strides, XprType>, 1, typename eval<TensorStridingOp<Strides, XprType> >::type>
+{
+  typedef TensorStridingOp<Strides, XprType> type;
+};
+
+}  // end namespace internal
+
+
+
+template<typename Strides, typename XprType>
+class TensorStridingOp : public TensorBase<TensorStridingOp<Strides, XprType> >
+{
+  public:
+  typedef typename Eigen::internal::traits<TensorStridingOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorStridingOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorStridingOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorStridingOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingOp(const XprType& expr, const Strides& dims)
+      : m_xpr(expr), m_dims(dims) {}
+
+    EIGEN_DEVICE_FUNC
+    const Strides& strides() const { return m_dims; }
+
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename XprType::Nested>::type&
+    expression() const { return m_xpr; }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE TensorStridingOp& operator = (const TensorStridingOp& other)
+    {
+      typedef TensorAssignOp<TensorStridingOp, const TensorStridingOp> Assign;
+      Assign assign(*this, other);
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+      return *this;
+    }
+
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE TensorStridingOp& operator = (const OtherDerived& other)
+    {
+      typedef TensorAssignOp<TensorStridingOp, const OtherDerived> Assign;
+      Assign assign(*this, other);
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+      return *this;
+    }
+
+  protected:
+    typename XprType::Nested m_xpr;
+    const Strides m_dims;
+};
+
+
+// Eval as rvalue
+template<typename Strides, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
+{
+  typedef TensorStridingOp<Strides, ArgType> XprType;
+  typedef typename XprType::Index Index;
+  static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+  typedef DSizes<Index, NumDims> Dimensions;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+
+  enum {
+    IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device)
+  {
+    m_dimensions = m_impl.dimensions();
+    for (int i = 0; i < NumDims; ++i) {
+      m_dimensions[i] = ceilf(static_cast<float>(m_dimensions[i]) / op.strides()[i]);
+    }
+
+    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      m_outputStrides[0] = 1;
+      m_inputStrides[0] = 1;
+      for (int i = 1; i < NumDims; ++i) {
+        m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
+        m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
+        m_inputStrides[i-1] *= op.strides()[i-1];
+      }
+      m_inputStrides[NumDims-1] *= op.strides()[NumDims-1];
+    } else {  // RowMajor
+      m_outputStrides[NumDims-1] = 1;
+      m_inputStrides[NumDims-1] = 1;
+      for (int i = NumDims - 2; i >= 0; --i) {
+        m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1];
+        m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
+        m_inputStrides[i+1] *= op.strides()[i+1];
+      }
+      m_inputStrides[0] *= op.strides()[0];
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+    m_impl.cleanup();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+  {
+    return m_impl.coeff(srcCoeff(index));
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+  {
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
+
+    Index inputIndices[] = {0, 0};
+    Index indices[] = {index, index + PacketSize - 1};
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = NumDims - 1; i > 0; --i) {
+        const Index idx0 = indices[0] / m_outputStrides[i];
+        const Index idx1 = indices[1] / m_outputStrides[i];
+        inputIndices[0] += idx0 * m_inputStrides[i];
+        inputIndices[1] += idx1 * m_inputStrides[i];
+        indices[0] -= idx0 * m_outputStrides[i];
+        indices[1] -= idx1 * m_outputStrides[i];
+      }
+      inputIndices[0] += indices[0] * m_inputStrides[0];
+      inputIndices[1] += indices[1] * m_inputStrides[0];
+    } else {  // RowMajor
+      for (int i = 0; i < NumDims - 1; ++i) {
+        const Index idx0 = indices[0] / m_outputStrides[i];
+        const Index idx1 = indices[1] / m_outputStrides[i];
+        inputIndices[0] += idx0 * m_inputStrides[i];
+        inputIndices[1] += idx1 * m_inputStrides[i];
+        indices[0] -= idx0 * m_outputStrides[i];
+        indices[1] -= idx1 * m_outputStrides[i];
+      }
+      inputIndices[0] += indices[0] * m_inputStrides[NumDims-1];
+      inputIndices[1] += indices[1] * m_inputStrides[NumDims-1];
+    }
+    if (inputIndices[1] - inputIndices[0] == PacketSize - 1) {
+      PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]);
+      return rslt;
+    }
+    else {
+      EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+      values[0] = m_impl.coeff(inputIndices[0]);
+      values[PacketSize-1] = m_impl.coeff(inputIndices[1]);
+      for (int i = 1; i < PacketSize-1; ++i) {
+        values[i] = coeff(index+i);
+      }
+      PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+      return rslt;
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    double compute_cost = (NumDims - 1) * (TensorOpCost::AddCost<Index>() +
+                                           TensorOpCost::MulCost<Index>() +
+                                           TensorOpCost::DivCost<Index>()) +
+        TensorOpCost::MulCost<Index>();
+    if (vectorized) {
+      compute_cost *= 2;  // packet() computes two indices
+    }
+    const int innerDim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : (NumDims - 1);
+    return m_impl.costPerCoeff(vectorized && m_inputStrides[innerDim] == 1) +
+        // Computation is not vectorized per se, but it is done once per packet.
+        TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
+  }
+
+  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+
+ protected:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
+  {
+    Index inputIndex = 0;
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = NumDims - 1; i > 0; --i) {
+        const Index idx = index / m_outputStrides[i];
+        inputIndex += idx * m_inputStrides[i];
+        index -= idx * m_outputStrides[i];
+      }
+      inputIndex += index * m_inputStrides[0];
+    } else {  // RowMajor
+      for (int i = 0; i < NumDims - 1; ++i) {
+        const Index idx = index / m_outputStrides[i];
+        inputIndex += idx * m_inputStrides[i];
+        index -= idx * m_outputStrides[i];
+      }
+      inputIndex += index * m_inputStrides[NumDims-1];
+    }
+    return inputIndex;
+  }
+
+  Dimensions m_dimensions;
+  array<Index, NumDims> m_outputStrides;
+  array<Index, NumDims> m_inputStrides;
+  TensorEvaluator<ArgType, Device> m_impl;
+};
+
+
+// Eval as lvalue
+template<typename Strides, typename ArgType, typename Device>
+struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
+    : public TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
+{
+  typedef TensorStridingOp<Strides, ArgType> XprType;
+  typedef TensorEvaluator<const XprType, Device> Base;
+  //  typedef typename XprType::Index Index;
+  static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+  //  typedef DSizes<Index, NumDims> Dimensions;
+
+  enum {
+    IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : Base(op, device) { }
+
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
+  {
+    return this->m_impl.coeffRef(this->srcCoeff(index));
+  }
+
+  template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void writePacket(Index index, const PacketReturnType& x)
+  {
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index+PacketSize-1 < this->dimensions().TotalSize());
+
+    Index inputIndices[] = {0, 0};
+    Index indices[] = {index, index + PacketSize - 1};
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = NumDims - 1; i > 0; --i) {
+        const Index idx0 = indices[0] / this->m_outputStrides[i];
+        const Index idx1 = indices[1] / this->m_outputStrides[i];
+        inputIndices[0] += idx0 * this->m_inputStrides[i];
+        inputIndices[1] += idx1 * this->m_inputStrides[i];
+        indices[0] -= idx0 * this->m_outputStrides[i];
+        indices[1] -= idx1 * this->m_outputStrides[i];
+      }
+      inputIndices[0] += indices[0] * this->m_inputStrides[0];
+      inputIndices[1] += indices[1] * this->m_inputStrides[0];
+    } else {  // RowMajor
+      for (int i = 0; i < NumDims - 1; ++i) {
+        const Index idx0 = indices[0] / this->m_outputStrides[i];
+        const Index idx1 = indices[1] / this->m_outputStrides[i];
+        inputIndices[0] += idx0 * this->m_inputStrides[i];
+        inputIndices[1] += idx1 * this->m_inputStrides[i];
+        indices[0] -= idx0 * this->m_outputStrides[i];
+        indices[1] -= idx1 * this->m_outputStrides[i];
+      }
+      inputIndices[0] += indices[0] * this->m_inputStrides[NumDims-1];
+      inputIndices[1] += indices[1] * this->m_inputStrides[NumDims-1];
+    }
+    if (inputIndices[1] - inputIndices[0] == PacketSize - 1) {
+      this->m_impl.template writePacket<Unaligned>(inputIndices[0], x);
+    }
+    else {
+      EIGEN_ALIGN_MAX Scalar values[PacketSize];
+      internal::pstore<Scalar, PacketReturnType>(values, x);
+      this->m_impl.coeffRef(inputIndices[0]) = values[0];
+      this->m_impl.coeffRef(inputIndices[1]) = values[PacketSize-1];
+      for (int i = 1; i < PacketSize-1; ++i) {
+        this->coeffRef(index+i) = values[i];
+      }
+    }
+  }
+};
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h
new file mode 100644
index 000000000..bb8800d45
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h
@@ -0,0 +1,82 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: eigen@codeplay.com
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// General include header of SYCL target for Tensor Module
+#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_H
+#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_H
+
+#ifdef EIGEN_USE_SYCL
+
+// global pointer to set different attribute state for a class
+template <class T>
+struct MakeGlobalPointer {
+  typedef typename cl::sycl::global_ptr<T>::pointer_t Type;
+};
+
+// global pointer to set different attribute state for a class
+template <class T>
+struct MakeLocalPointer {
+  typedef typename cl::sycl::local_ptr<T>::pointer_t Type;
+};
+
+
+namespace Eigen {
+namespace TensorSycl {
+namespace internal {
+
+/// This struct is used for special expression nodes with no operations (for example assign and selectOP).
+  struct NoOP;
+
+template<bool IsConst, typename T> struct GetType{
+  typedef const T Type;
+};
+template<typename T> struct GetType<false, T>{
+  typedef T Type;
+};
+
+}
+}
+}
+
+// tuple construction
+#include "TensorSyclTuple.h"
+
+// counting number of leaf at compile time
+#include "TensorSyclLeafCount.h"
+
+// The index PlaceHolder takes the actual expression and replaces the actual
+// data on it with the place holder. It uses the same pre-order expression tree
+// traverse as the leaf count in order to give the right access number to each
+// node in the expression
+#include "TensorSyclPlaceHolderExpr.h"
+
+// creation of an accessor tuple from a tuple of SYCL buffers
+#include "TensorSyclExtractAccessor.h"
+
+// this is used to change the address space type in tensor map for GPU
+#include "TensorSyclConvertToDeviceExpression.h"
+
+// this is used to extract the functors
+#include "TensorSyclExtractFunctors.h"
+
+// this is used to create tensormap on the device
+// this is used to construct the expression on the device
+#include "TensorSyclExprConstructor.h"
+
+/// this is used for extracting tensor reduction
+#include "TensorReductionSycl.h"
+
+// kernel execution using fusion
+#include "TensorSyclRun.h"
+
+#endif  // end of EIGEN_USE_SYCL
+#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h
new file mode 100644
index 000000000..8729c86ee
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h
@@ -0,0 +1,121 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * TensorSyclConvertToDeviceExpression.h
+ *
+ * \brief:
+ *  Conversion from host pointer to device pointer
+ *  inside leaf nodes of the expression.
+ *
+*****************************************************************/
+
+#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_CONVERT_TO_DEVICE_EXPRESSION_HPP
+#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_CONVERT_TO_DEVICE_EXPRESSION_HPP
+
+namespace Eigen {
+namespace TensorSycl {
+namespace internal {
+
+/// \struct ConvertToDeviceExpression
+/// \brief This struct is used to convert the MakePointer in the host expression
+/// to the MakeGlobalPointer for the device expression. For the leafNodes
+/// containing the pointer. This is due to the fact that the address space of
+/// the pointer T* is different on the host and the device.
+template <typename Expr>
+struct ConvertToDeviceExpression;
+
+template<template<class...> class NonOpCategory, bool IsConst, typename... Args>
+struct NonOpConversion{
+  typedef typename GetType<IsConst, NonOpCategory<typename ConvertToDeviceExpression<Args>::Type...> >::Type Type;
+};
+
+
+template<template<class, template <class> class > class NonOpCategory, bool IsConst, typename Args>
+struct DeviceConvertor{
+  typedef typename GetType<IsConst, NonOpCategory<typename ConvertToDeviceExpression<Args>::Type, MakeGlobalPointer> >::Type Type;
+};
+
+/// specialisation of the \ref ConvertToDeviceExpression struct when the node
+/// type is TensorMap
+#define TENSORMAPCONVERT(CVQual)\
+template <typename Scalar_, int Options_, int Options2_, int NumIndices_, typename IndexType_, template <class> class MakePointer_>\
+struct ConvertToDeviceExpression<CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakePointer_> > {\
+  typedef CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakeGlobalPointer> Type;\
+};
+
+TENSORMAPCONVERT(const)
+TENSORMAPCONVERT()
+#undef TENSORMAPCONVERT
+
+/// specialisation of the \ref ConvertToDeviceExpression struct when the node
+/// type is TensorCwiseNullaryOp, TensorCwiseUnaryOp, TensorCwiseBinaryOp, TensorCwiseTernaryOp, TensorBroadcastingOp
+#define CATEGORYCONVERT(CVQual)\
+template <template<class, class...> class Category, typename OP, typename... subExprs>\
+struct ConvertToDeviceExpression<CVQual Category<OP, subExprs...> > {\
+  typedef CVQual Category<OP, typename ConvertToDeviceExpression<subExprs>::Type... > Type;\
+};
+CATEGORYCONVERT(const)
+CATEGORYCONVERT()
+#undef CATEGORYCONVERT
+
+
+/// specialisation of the \ref ConvertToDeviceExpression struct when the node
+/// type is  TensorCwiseSelectOp
+#define SELECTOPCONVERT(CVQual, Res)\
+template <typename IfExpr, typename ThenExpr, typename ElseExpr>\
+struct ConvertToDeviceExpression<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr> >\
+: NonOpConversion<TensorSelectOp, Res, IfExpr, ThenExpr, ElseExpr> {};
+SELECTOPCONVERT(const, true)
+SELECTOPCONVERT(, false)
+#undef SELECTOPCONVERT
+
+/// specialisation of the \ref ConvertToDeviceExpression struct when the node
+/// type is const AssingOP
+#define ASSIGNCONVERT(CVQual, Res)\
+template <typename LHSExpr, typename RHSExpr>\
+struct ConvertToDeviceExpression<CVQual TensorAssignOp<LHSExpr, RHSExpr> >\
+: NonOpConversion<TensorAssignOp, Res, LHSExpr, RHSExpr>{};
+
+ASSIGNCONVERT(const, true)
+ASSIGNCONVERT(, false)
+#undef ASSIGNCONVERT
+
+/// specialisation of the \ref ConvertToDeviceExpression struct when the node
+/// type is either TensorForcedEvalOp or TensorEvalToOp
+#define KERNELBROKERCONVERT(CVQual, Res, ExprNode)\
+template <typename Expr>\
+struct ConvertToDeviceExpression<CVQual ExprNode<Expr> > \
+: DeviceConvertor<ExprNode, Res, Expr>{};
+
+KERNELBROKERCONVERT(const, true, TensorForcedEvalOp)
+KERNELBROKERCONVERT(, false, TensorForcedEvalOp)
+KERNELBROKERCONVERT(const, true, TensorEvalToOp)
+KERNELBROKERCONVERT(, false, TensorEvalToOp)
+#undef KERNELBROKERCONVERT
+
+/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorReductionOp
+#define KERNELBROKERCONVERTREDUCTION(CVQual)\
+template <typename OP, typename Dim, typename subExpr, template <class> class MakePointer_>\
+struct ConvertToDeviceExpression<CVQual TensorReductionOp<OP, Dim, subExpr, MakePointer_> > {\
+  typedef CVQual TensorReductionOp<OP, Dim, typename ConvertToDeviceExpression<subExpr>::Type, MakeGlobalPointer> Type;\
+};
+
+KERNELBROKERCONVERTREDUCTION(const)
+KERNELBROKERCONVERTREDUCTION()
+#undef KERNELBROKERCONVERTREDUCTION
+
+}  // namespace internal
+}  // namespace TensorSycl
+}  // namespace Eigen
+
+#endif  // UNSUPPORTED_EIGEN_CXX1
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h
new file mode 100644
index 000000000..7ed3a3a56
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h
@@ -0,0 +1,239 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * TensorSyclExprConstructor.h
+ *
+ * \brief:
+ *  This file re-create an expression on the SYCL device in order
+ *  to use the original tensor evaluator.
+ *
+*****************************************************************/
+
+#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXPR_CONSTRUCTOR_HPP
+#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXPR_CONSTRUCTOR_HPP
+
+namespace Eigen {
+namespace TensorSycl {
+namespace internal {
+/// this class is used by EvalToOp in order to create an lhs expression which is
+/// a pointer from an accessor on device-only buffer
+template <typename PtrType, size_t N, typename... Params>
+struct EvalToLHSConstructor {
+  PtrType expr;
+  EvalToLHSConstructor(const utility::tuple::Tuple<Params...> &t): expr((&(*(utility::tuple::get<N>(t).get_pointer())))) {}
+};
+
+/// \struct ExprConstructor is used to reconstruct the expression on the device and
+/// recreate the expression with MakeGlobalPointer containing the device address
+/// space for the TensorMap pointers used in eval function.
+/// It receives the original expression type, the functor of the node, the tuple
+/// of accessors, and the device expression type to re-instantiate the
+/// expression tree for the device
+template <typename OrigExpr, typename IndexExpr, typename... Params>
+struct ExprConstructor;
+
+/// specialisation of the \ref ExprConstructor struct when the node type is
+/// TensorMap
+#define TENSORMAP(CVQual)\
+template <typename Scalar_, int Options_, int Options2_, int Options3_, int NumIndices_, typename IndexType_,\
+template <class> class MakePointer_, size_t N, typename... Params>\
+struct ExprConstructor< CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakeGlobalPointer>,\
+CVQual PlaceHolder<CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options3_, MakePointer_>, N>, Params...>{\
+  typedef  CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakeGlobalPointer>  Type;\
+  Type expr;\
+  template <typename FuncDetector>\
+  ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
+  : expr(Type((&(*(utility::tuple::get<N>(t).get_pointer()))), fd.dimensions())) {}\
+};
+
+TENSORMAP(const)
+TENSORMAP()
+#undef TENSORMAP
+
+#define UNARYCATEGORY(CVQual)\
+template <template<class, class> class UnaryCategory, typename OP, typename OrigRHSExpr, typename RHSExpr, typename... Params>\
+struct ExprConstructor<CVQual UnaryCategory<OP, OrigRHSExpr>, CVQual UnaryCategory<OP, RHSExpr>, Params...> {\
+  typedef  ExprConstructor<OrigRHSExpr, RHSExpr, Params...> my_type;\
+  my_type rhsExpr;\
+  typedef CVQual UnaryCategory<OP, typename my_type::Type> Type;\
+  Type expr;\
+  template <typename FuncDetector>\
+  ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
+  : rhsExpr(funcD.rhsExpr, t), expr(rhsExpr.expr, funcD.func) {}\
+};
+
+UNARYCATEGORY(const)
+UNARYCATEGORY()
+#undef UNARYCATEGORY
+
+/// specialisation of the \ref ExprConstructor struct when the node type is
+/// TensorBinaryOp
+#define BINARYCATEGORY(CVQual)\
+template <template<class, class, class> class BinaryCategory, typename OP, typename OrigLHSExpr, typename OrigRHSExpr, typename LHSExpr,\
+typename RHSExpr, typename... Params>\
+struct ExprConstructor<CVQual BinaryCategory<OP, OrigLHSExpr, OrigRHSExpr>,  CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Params...> {\
+  typedef  ExprConstructor<OrigLHSExpr, LHSExpr, Params...> my_left_type;\
+  typedef  ExprConstructor<OrigRHSExpr, RHSExpr, Params...> my_right_type;\
+  typedef  CVQual BinaryCategory<OP, typename my_left_type::Type, typename my_right_type::Type> Type;\
+  my_left_type lhsExpr;\
+  my_right_type rhsExpr;\
+  Type expr;\
+  template <typename FuncDetector>\
+  ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
+  : lhsExpr(funcD.lhsExpr, t),rhsExpr(funcD.rhsExpr, t), expr(lhsExpr.expr, rhsExpr.expr, funcD.func) {}\
+};
+
+BINARYCATEGORY(const)
+BINARYCATEGORY()
+#undef BINARYCATEGORY
+
+/// specialisation of the \ref ExprConstructor struct when the node type is
+/// TensorCwiseTernaryOp
+#define TERNARYCATEGORY(CVQual)\
+template <template <class, class, class, class> class TernaryCategory, typename OP, typename OrigArg1Expr, typename OrigArg2Expr,typename OrigArg3Expr,\
+typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename... Params>\
+struct ExprConstructor<CVQual TernaryCategory<OP, OrigArg1Expr, OrigArg2Expr, OrigArg3Expr>, CVQual TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Params...> {\
+  typedef ExprConstructor<OrigArg1Expr, Arg1Expr, Params...> my_arg1_type;\
+  typedef ExprConstructor<OrigArg2Expr, Arg2Expr, Params...> my_arg2_type;\
+  typedef ExprConstructor<OrigArg3Expr, Arg3Expr, Params...> my_arg3_type;\
+  typedef  CVQual TernaryCategory<OP, typename my_arg1_type::Type, typename my_arg2_type::Type, typename my_arg3_type::Type> Type;\
+  my_arg1_type arg1Expr;\
+  my_arg2_type arg2Expr;\
+  my_arg3_type arg3Expr;\
+  Type expr;\
+  template <typename FuncDetector>\
+  ExprConstructor(FuncDetector &funcD,const utility::tuple::Tuple<Params...> &t)\
+  : arg1Expr(funcD.arg1Expr, t), arg2Expr(funcD.arg2Expr, t), arg3Expr(funcD.arg3Expr, t), expr(arg1Expr.expr, arg2Expr.expr, arg3Expr.expr, funcD.func) {}\
+};
+
+TERNARYCATEGORY(const)
+TERNARYCATEGORY()
+#undef TERNARYCATEGORY
+
+/// specialisation of the \ref ExprConstructor struct when the node type is
+/// TensorCwiseSelectOp
+#define SELECTOP(CVQual)\
+template <typename OrigIfExpr, typename OrigThenExpr, typename OrigElseExpr, typename IfExpr, typename ThenExpr, typename ElseExpr, typename... Params>\
+struct ExprConstructor< CVQual TensorSelectOp<OrigIfExpr, OrigThenExpr, OrigElseExpr>, CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Params...> {\
+  typedef  ExprConstructor<OrigIfExpr, IfExpr, Params...> my_if_type;\
+  typedef  ExprConstructor<OrigThenExpr, ThenExpr, Params...> my_then_type;\
+  typedef  ExprConstructor<OrigElseExpr, ElseExpr, Params...> my_else_type;\
+  typedef CVQual TensorSelectOp<typename my_if_type::Type, typename my_then_type::Type, typename my_else_type::Type> Type;\
+  my_if_type ifExpr;\
+  my_then_type thenExpr;\
+  my_else_type elseExpr;\
+  Type expr;\
+  template <typename FuncDetector>\
+  ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
+  : ifExpr(funcD.ifExpr, t), thenExpr(funcD.thenExpr, t), elseExpr(funcD.elseExpr, t), expr(ifExpr.expr, thenExpr.expr, elseExpr.expr) {}\
+};
+
+SELECTOP(const)
+SELECTOP()
+#undef SELECTOP
+
+/// specialisation of the \ref ExprConstructor struct when the node type is
+/// const TensorAssignOp
+#define ASSIGN(CVQual)\
+template <typename OrigLHSExpr, typename OrigRHSExpr, typename LHSExpr, typename RHSExpr, typename... Params>\
+struct ExprConstructor<CVQual TensorAssignOp<OrigLHSExpr, OrigRHSExpr>,  CVQual TensorAssignOp<LHSExpr, RHSExpr>, Params...> {\
+  typedef ExprConstructor<OrigLHSExpr, LHSExpr, Params...> my_left_type;\
+  typedef ExprConstructor<OrigRHSExpr, RHSExpr, Params...> my_right_type;\
+  typedef CVQual TensorAssignOp<typename my_left_type::Type, typename my_right_type::Type>  Type;\
+  my_left_type lhsExpr;\
+  my_right_type rhsExpr;\
+  Type expr;\
+  template <typename FuncDetector>\
+  ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
+  : lhsExpr(funcD.lhsExpr, t), rhsExpr(funcD.rhsExpr, t), expr(lhsExpr.expr, rhsExpr.expr) {}\
+ };
+
+ ASSIGN(const)
+ ASSIGN()
+ #undef ASSIGN
+/// specialisation of the \ref ExprConstructor struct when the node type is
+///  TensorEvalToOp
+#define EVALTO(CVQual)\
+template <typename OrigExpr, typename Expr, typename... Params>\
+struct ExprConstructor<CVQual TensorEvalToOp<OrigExpr, MakeGlobalPointer>, CVQual TensorEvalToOp<Expr>, Params...> {\
+  typedef ExprConstructor<OrigExpr, Expr, Params...> my_expr_type;\
+  typedef typename TensorEvalToOp<OrigExpr, MakeGlobalPointer>::PointerType my_buffer_type;\
+  typedef CVQual TensorEvalToOp<typename my_expr_type::Type, MakeGlobalPointer> Type;\
+  my_expr_type nestedExpression;\
+  EvalToLHSConstructor<my_buffer_type, 0, Params...> buffer;\
+  Type expr;\
+  template <typename FuncDetector>\
+  ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
+  : nestedExpression(funcD.rhsExpr, t), buffer(t), expr(buffer.expr, nestedExpression.expr) {}\
+};
+
+EVALTO(const)
+EVALTO()
+#undef EVALTO
+
+/// specialisation of the \ref ExprConstructor struct when the node type is
+/// TensorForcedEvalOp
+#define FORCEDEVAL(CVQual)\
+template <typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
+struct ExprConstructor<CVQual TensorForcedEvalOp<OrigExpr, MakeGlobalPointer>,\
+CVQual PlaceHolder<CVQual TensorForcedEvalOp<DevExpr>, N>, Params...> {\
+  typedef CVQual TensorMap<Tensor<typename TensorForcedEvalOp<DevExpr, MakeGlobalPointer>::Scalar,\
+  TensorForcedEvalOp<DevExpr, MakeGlobalPointer>::NumDimensions, 0, typename TensorForcedEvalOp<DevExpr>::Index>, 0, MakeGlobalPointer> Type;\
+  Type expr;\
+  template <typename FuncDetector>\
+  ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
+  : expr(Type((&(*(utility::tuple::get<N>(t).get_pointer()))), fd.dimensions())) {}\
+};
+
+FORCEDEVAL(const)
+FORCEDEVAL()
+#undef FORCEDEVAL
+
+template <bool Conds,  size_t X , size_t Y > struct ValueCondition {
+  static const size_t Res =X;
+};
+template<size_t X, size_t Y> struct ValueCondition<false, X , Y> {
+  static const size_t Res =Y;
+};
+
+/// specialisation of the \ref ExprConstructor struct when the node type is TensorReductionOp
+#define SYCLREDUCTIONEXPR(CVQual)\
+template <typename OP, typename Dim, typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
+struct ExprConstructor<CVQual TensorReductionOp<OP, Dim, OrigExpr, MakeGlobalPointer>,\
+CVQual PlaceHolder<CVQual TensorReductionOp<OP, Dim, DevExpr>, N>, Params...> {\
+  static const size_t NumIndices= ValueCondition< TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions==0,  1, TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions >::Res;\
+  typedef CVQual TensorMap<Tensor<typename TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::Scalar,\
+  NumIndices, 0, typename TensorReductionOp<OP, Dim, DevExpr>::Index>, 0, MakeGlobalPointer> Type;\
+  Type expr;\
+  template <typename FuncDetector>\
+  ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
+  : expr(Type((&(*(utility::tuple::get<N>(t).get_pointer()))), fd.dimensions())) {}\
+};
+
+SYCLREDUCTIONEXPR(const)
+SYCLREDUCTIONEXPR()
+#undef SYCLREDUCTIONEXPR
+
+/// template deduction for \ref ExprConstructor struct
+template <typename OrigExpr, typename IndexExpr, typename FuncD, typename... Params>
+auto createDeviceExpression(FuncD &funcD, const utility::tuple::Tuple<Params...> &t)
+    -> decltype(ExprConstructor<OrigExpr, IndexExpr, Params...>(funcD, t)) {
+  return ExprConstructor<OrigExpr, IndexExpr, Params...>(funcD, t);
+}
+
+} /// namespace TensorSycl
+} /// namespace internal
+} /// namespace Eigen
+
+
+#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXPR_CONSTRUCTOR_HPP
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h
new file mode 100644
index 000000000..b1da6858e
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h
@@ -0,0 +1,204 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * TensorSyclExtractAccessor.h
+ *
+ * \brief:
+ * ExtractAccessor takes Expression placeHolder expression and the tuple of sycl
+ * buffers as an input. Using pre-order tree traversal, ExtractAccessor
+ * recursively calls itself for its children in the expression tree. The
+ * leaf node in the PlaceHolder expression is nothing but a container preserving
+ * the order of the actual data in the tuple of sycl buffer. By invoking the
+ * extract accessor for the PlaceHolder<N>, an accessor is created for the Nth
+ * buffer in the tuple of buffers. This accessor is then added as an Nth
+ * element in the tuple of accessors. In this case we preserve the order of data
+ * in the expression tree.
+ *
+ * This is the specialisation of extract accessor method for different operation
+ * type in the PlaceHolder expression.
+ *
+*****************************************************************/
+
+#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXTRACT_ACCESSOR_HPP
+#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXTRACT_ACCESSOR_HPP
+
+namespace Eigen {
+namespace TensorSycl {
+namespace internal {
+/// \struct ExtractAccessor: Extract Accessor Class is used to extract the
+/// accessor from a buffer.
+/// Depending on the type of the leaf node we can get a read accessor or a
+/// read_write accessor
+template <typename Evaluator>
+struct ExtractAccessor;
+
+struct AccessorConstructor{
+  template<typename Arg> static inline auto getTuple(cl::sycl::handler& cgh, Arg eval)
+  -> decltype(ExtractAccessor<Arg>::getTuple(cgh, eval)) {
+  return ExtractAccessor<Arg>::getTuple(cgh, eval);
+  }
+
+  template<typename Arg1, typename Arg2> static inline auto getTuple(cl::sycl::handler& cgh, Arg1 eval1, Arg2 eval2)
+  -> decltype(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1), ExtractAccessor<Arg2>::getTuple(cgh, eval2))) {
+    return utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1), ExtractAccessor<Arg2>::getTuple(cgh, eval2));
+  }
+  template<typename Arg1, typename Arg2, typename Arg3>	static inline auto getTuple(cl::sycl::handler& cgh, Arg1 eval1 , Arg2 eval2 , Arg3 eval3)
+  -> decltype(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor<Arg2>::getTuple(cgh, eval2), ExtractAccessor<Arg3>::getTuple(cgh, eval3)))) {
+    return utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor<Arg2>::getTuple(cgh, eval2), ExtractAccessor<Arg3>::getTuple(cgh, eval3)));
+  }
+  template< cl::sycl::access::mode AcM, typename Arg> static inline auto getAccessor(cl::sycl::handler& cgh, Arg eval)
+  -> decltype(utility::tuple::make_tuple( eval.device().template get_sycl_accessor<AcM,
+  typename Eigen::internal::remove_all<typename Arg::CoeffReturnType>::type>(eval.dimensions().TotalSize(), cgh,eval.data()))){
+    return utility::tuple::make_tuple(eval.device().template get_sycl_accessor<AcM, typename Eigen::internal::remove_all<typename Arg::CoeffReturnType>::type>(eval.dimensions().TotalSize(), cgh,eval.data()));
+  }
+};
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is
+/// const TensorCwiseNullaryOp, const TensorCwiseUnaryOp and const TensorBroadcastingOp
+template <template<class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev> > {
+  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev> eval)
+  -> decltype(AccessorConstructor::getTuple(cgh, eval.impl())){
+    return AccessorConstructor::getTuple(cgh, eval.impl());
+  }
+};
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is TensorCwiseNullaryOp,  TensorCwiseUnaryOp and  TensorBroadcastingOp
+template <template<class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<UnaryCategory<OP, RHSExpr>, Dev> >
+: ExtractAccessor<TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev> > {};
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorCwiseBinaryOp
+template <template<class, class, class> class BinaryCategory, typename OP,  typename LHSExpr, typename RHSExpr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> > {
+  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> eval)
+  -> decltype(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl())){
+    return AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl());
+  }
+};
+/// specialisation of the \ref ExtractAccessor struct when the node type is TensorCwiseBinaryOp
+template <template<class, class, class> class BinaryCategory, typename OP,  typename LHSExpr, typename RHSExpr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> >
+: ExtractAccessor<TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> >{};
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is
+/// const TensorCwiseTernaryOp
+template <template<class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> > {
+  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> eval)
+  -> decltype(AccessorConstructor::getTuple(cgh, eval.arg1Impl(), eval.arg2Impl(), eval.arg3Impl())){
+    return AccessorConstructor::getTuple(cgh, eval.arg1Impl(), eval.arg2Impl(), eval.arg3Impl());
+  }
+};
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is TensorCwiseTernaryOp
+template <template<class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> >
+: ExtractAccessor<TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> >{};
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is
+/// const TensorCwiseSelectOp. This is a special case where there is no OP
+template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> > {
+  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> eval)
+  -> decltype(AccessorConstructor::getTuple(cgh, eval.cond_impl(), eval.then_impl(), eval.else_impl())){
+    return AccessorConstructor::getTuple(cgh, eval.cond_impl(), eval.then_impl(), eval.else_impl());
+  }
+};
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is
+/// TensorCwiseSelectOp. This is a special case where there is no OP
+template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> >
+: ExtractAccessor<TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> >{};
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorAssignOp
+template <typename LHSExpr, typename RHSExpr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev> > {
+  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev> eval)
+  -> decltype(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl())){
+    return AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl());
+ }
+};
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is TensorAssignOp
+template <typename LHSExpr, typename RHSExpr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<TensorAssignOp<LHSExpr, RHSExpr>, Dev> >
+: ExtractAccessor<TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev> >{};
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorMap
+#define TENSORMAPEXPR(CVQual, ACCType)\
+template <typename PlainObjectType, int Options_, typename Dev>\
+struct ExtractAccessor<TensorEvaluator<CVQual TensorMap<PlainObjectType, Options_>, Dev> > {\
+  static inline auto getTuple(cl::sycl::handler& cgh,const TensorEvaluator<CVQual TensorMap<PlainObjectType, Options_>, Dev> eval)\
+  -> decltype(AccessorConstructor::template getAccessor<ACCType>(cgh, eval)){\
+    return AccessorConstructor::template getAccessor<ACCType>(cgh, eval);\
+  }\
+};
+TENSORMAPEXPR(const, cl::sycl::access::mode::read)
+TENSORMAPEXPR(, cl::sycl::access::mode::read_write)
+#undef TENSORMAPEXPR
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorForcedEvalOp
+template <typename Expr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<const TensorForcedEvalOp<Expr>, Dev> > {
+  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const TensorForcedEvalOp<Expr>, Dev> eval)
+  -> decltype(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval)){
+    return AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval);
+  }
+};
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is TensorForcedEvalOp
+template <typename Expr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<TensorForcedEvalOp<Expr>, Dev> >
+: ExtractAccessor<TensorEvaluator<const TensorForcedEvalOp<Expr>, Dev> >{};
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorEvalToOp
+template <typename Expr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<const TensorEvalToOp<Expr>, Dev> > {
+  static inline auto getTuple(cl::sycl::handler& cgh,const TensorEvaluator<const TensorEvalToOp<Expr>, Dev> eval)
+  -> decltype(utility::tuple::append(AccessorConstructor::template getAccessor<cl::sycl::access::mode::write>(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl()))){
+    return utility::tuple::append(AccessorConstructor::template getAccessor<cl::sycl::access::mode::write>(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl()));
+  }
+};
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is TensorEvalToOp
+template <typename Expr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<TensorEvalToOp<Expr>, Dev> >
+: ExtractAccessor<TensorEvaluator<const TensorEvalToOp<Expr>, Dev> >{};
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorReductionOp
+template <typename OP, typename Dim, typename Expr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<const TensorReductionOp<OP, Dim, Expr>, Dev> > {
+  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const TensorReductionOp<OP, Dim, Expr>, Dev> eval)
+  -> decltype(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval)){
+    return AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval);
+  }
+};
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is TensorReductionOp
+template <typename OP, typename Dim, typename Expr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<TensorReductionOp<OP, Dim, Expr>, Dev> >
+: ExtractAccessor<TensorEvaluator<const TensorReductionOp<OP, Dim, Expr>, Dev> >{};
+
+/// template deduction for \ref ExtractAccessor
+template <typename Evaluator>
+auto createTupleOfAccessors(cl::sycl::handler& cgh, const Evaluator& expr)
+-> decltype(ExtractAccessor<Evaluator>::getTuple(cgh, expr)) {
+  return ExtractAccessor<Evaluator>::getTuple(cgh, expr);
+}
+
+} /// namespace TensorSycl
+} /// namespace internal
+} /// namespace Eigen
+#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXTRACT_ACCESSOR_HPP
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
new file mode 100644
index 000000000..427125343
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
@@ -0,0 +1,177 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * TensorSyclextractFunctors.h
+ *
+ * \brief:
+ *  Used to extract all the functors allocated to each node of the expression
+*tree.
+ *
+*****************************************************************/
+
+#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXTRACT_FUNCTORS_HPP
+#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXTRACT_FUNCTORS_HPP
+
+namespace Eigen {
+namespace TensorSycl {
+namespace internal {
+/// \struct FunctorExtractor:  This struct is used to extract the functors
+/// constructed on
+/// the host-side, to pack them and reuse them in reconstruction of the
+/// expression on the device.
+/// We have to do that as in Eigen the functors are not stateless so we cannot
+/// re-instantiate them on the device.
+/// We have to pass instantiated functors to the device.
+// This struct is used for leafNode (TensorMap) and nodes behaving like leafNode (TensorForcedEval).
+template <typename Evaluator> struct FunctorExtractor{
+  typedef typename Evaluator::Dimensions Dimensions;
+  const Dimensions m_dimensions;
+  const Dimensions& dimensions() const { return m_dimensions; }
+  FunctorExtractor(const Evaluator& expr)
+  : m_dimensions(expr.dimensions()) {}
+
+};
+
+/// specialisation of the \ref FunctorExtractor struct when the node type is
+/// const TensorCwiseNullaryOp, const TensorCwiseUnaryOp, and const TensorBroadcastingOp
+template <template <class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>
+struct FunctorExtractor<TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev> > {
+  FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;
+  OP func;
+  FunctorExtractor(const TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev>& expr)
+  : rhsExpr(expr.impl()), func(expr.functor()) {}
+};
+/// specialisation of the \ref FunctorExtractor struct when the node type is
+/// TensorCwiseNullaryOp, TensorCwiseUnaryOp, and TensorBroadcastingOp
+template <template <class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>
+struct FunctorExtractor<TensorEvaluator<UnaryCategory<OP, RHSExpr>, Dev> >
+: FunctorExtractor<TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev> >{};
+
+/// specialisation of the \ref FunctorExtractor struct when the node type is
+/// const TensorCwiseBinaryOp
+template <template<class, class, class> class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev>
+struct FunctorExtractor<TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> > {
+  FunctorExtractor<TensorEvaluator<LHSExpr, Dev> > lhsExpr;
+  FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;
+  OP func;
+  FunctorExtractor(const TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev>& expr)
+  : lhsExpr(expr.left_impl()),rhsExpr(expr.right_impl()),func(expr.functor()) {}
+};
+
+/// specialisation of the \ref FunctorExtractor struct when the node type is
+/// const TensorCwiseBinaryOp
+template <template <class, class, class> class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev>
+struct FunctorExtractor<TensorEvaluator<BinaryCategory<OP,  LHSExpr, RHSExpr>, Dev> >
+: FunctorExtractor<TensorEvaluator<const BinaryCategory<OP,  LHSExpr, RHSExpr>, Dev> >{};
+
+/// specialisation of the \ref FunctorExtractor struct when the node type is
+/// const TensorCwiseTernaryOp
+template <template <class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr,typename Dev>
+struct FunctorExtractor<TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> > {
+  FunctorExtractor<TensorEvaluator<Arg1Expr, Dev> > arg1Expr;
+  FunctorExtractor<TensorEvaluator<Arg2Expr, Dev> > arg2Expr;
+  FunctorExtractor<TensorEvaluator<Arg3Expr, Dev> > arg3Expr;
+  OP func;
+  FunctorExtractor(const TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev>& expr)
+  : arg1Expr(expr.arg1Impl()), arg2Expr(expr.arg2Impl()), arg3Expr(expr.arg3Impl()), func(expr.functor()) {}
+};
+
+/// specialisation of the \ref FunctorExtractor struct when the node type is
+/// TensorCwiseTernaryOp
+template <template <class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename Dev>
+struct FunctorExtractor<TensorEvaluator< TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> >
+:FunctorExtractor<TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> >{};
+
+/// specialisation of the \ref FunctorExtractor struct when the node type is
+/// const TensorCwiseSelectOp. This is an specialisation without OP so it has to be separated.
+template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>
+struct FunctorExtractor< TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> > {
+  FunctorExtractor<TensorEvaluator<IfExpr, Dev> > ifExpr;
+  FunctorExtractor<TensorEvaluator<ThenExpr, Dev> > thenExpr;
+  FunctorExtractor<TensorEvaluator<ElseExpr, Dev> > elseExpr;
+  FunctorExtractor(const TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev>& expr)
+  : ifExpr(expr.cond_impl()), thenExpr(expr.then_impl()), elseExpr(expr.else_impl()) {}
+};
+
+/// specialisation of the \ref FunctorExtractor struct when the node type is
+/// TensorCwiseSelectOp. This is an specialisation without OP so it has to be separated
+template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>
+struct FunctorExtractor<TensorEvaluator<TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> >
+:FunctorExtractor< TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> > {};
+
+/// specialisation of the \ref FunctorExtractor struct when the node type is
+/// const TensorAssignOp. This is an specialisation without OP so it has to be separated.
+template <typename LHSExpr, typename RHSExpr, typename Dev>
+struct FunctorExtractor<TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev> > {
+  FunctorExtractor<TensorEvaluator<LHSExpr, Dev> > lhsExpr;
+  FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;
+  FunctorExtractor(const TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev>& expr)
+  : lhsExpr(expr.left_impl()), rhsExpr(expr.right_impl()) {}
+};
+
+/// specialisation of the \ref FunctorExtractor struct when the node type is
+/// TensorAssignOp. This is an specialisation without OP so it has to be separated.
+template <typename LHSExpr, typename RHSExpr, typename Dev>
+struct FunctorExtractor<TensorEvaluator<TensorAssignOp<LHSExpr, RHSExpr>, Dev> >
+:FunctorExtractor<TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev> >{};
+
+
+/// specialisation of the \ref FunctorExtractor struct when the node type is
+/// const TensorEvalToOp, This is an specialisation without OP so it has to be separated.
+template <typename RHSExpr, typename Dev>
+struct FunctorExtractor<TensorEvaluator<const TensorEvalToOp<RHSExpr>, Dev> > {
+  FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;
+  FunctorExtractor(const TensorEvaluator<const TensorEvalToOp<RHSExpr>, Dev>& expr)
+  : rhsExpr(expr.impl()) {}
+};
+
+/// specialisation of the \ref FunctorExtractor struct when the node type is
+/// TensorEvalToOp. This is a specialisation without OP so it has to be separated.
+template <typename RHSExpr, typename Dev>
+struct FunctorExtractor<TensorEvaluator<TensorEvalToOp<RHSExpr>, Dev> >
+: FunctorExtractor<TensorEvaluator<const TensorEvalToOp<RHSExpr>, Dev> > {};
+
+template<typename Dim, size_t NumOutputDim> struct DimConstr {
+template<typename InDim>
+  static inline Dim getDim(InDim dims ) {return dims;}
+};
+
+template<typename Dim> struct DimConstr<Dim, 0> {
+  template<typename InDim>
+    static inline Dim getDim(InDim dims ) {return Dim(dims.TotalSize());}
+};
+
+template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
+struct FunctorExtractor<TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>>{
+  typedef TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Evaluator;
+  typedef typename Eigen::internal::conditional<Evaluator::NumOutputDims==0, DSizes<typename Evaluator::Index, 1>, typename Evaluator::Dimensions >::type Dimensions;
+  const Dimensions m_dimensions;
+  const Dimensions& dimensions() const { return m_dimensions; }
+  FunctorExtractor(const TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>& expr)
+  : m_dimensions(DimConstr<Dimensions, Evaluator::NumOutputDims>::getDim(expr.dimensions())) {}
+};
+
+
+template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
+struct FunctorExtractor<TensorEvaluator<TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>>
+: FunctorExtractor<TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>>{};
+/// template deduction function for FunctorExtractor
+template <typename Evaluator>
+auto inline extractFunctors(const Evaluator& evaluator)-> FunctorExtractor<Evaluator> {
+  return FunctorExtractor<Evaluator>(evaluator);
+}
+}  // namespace internal
+}  // namespace TensorSycl
+}  // namespace Eigen
+
+#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXTRACT_FUNCTORS_HPP
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h
new file mode 100644
index 000000000..25d1fac9b
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h
@@ -0,0 +1,114 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * TensorSyclLeafCount.h
+ *
+ * \brief:
+ *  The leaf count used the pre-order expression tree traverse in order to name
+ *  count the number of leaf nodes in the expression
+ *
+*****************************************************************/
+
+#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_LEAF_COUNT_HPP
+#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_LEAF_COUNT_HPP
+
+namespace Eigen {
+namespace TensorSycl {
+namespace internal {
+/// \brief LeafCount used to counting terminal nodes. The total number of
+/// leaf nodes is used by MakePlaceHolderExprHelper to find the order
+/// of the leaf node in a expression tree at compile time.
+template <typename Expr>
+struct LeafCount;
+
+template<typename... Args> struct CategoryCount;
+
+template<> struct CategoryCount<>
+{
+  static const size_t Count =0;
+};
+
+template<typename Arg, typename... Args>
+struct CategoryCount<Arg,Args...>{
+  static const size_t Count = LeafCount<Arg>::Count + CategoryCount<Args...>::Count;
+};
+
+/// specialisation of the \ref LeafCount struct when the node type is const TensorMap
+template <typename PlainObjectType, int Options_, template <class> class MakePointer_>
+struct LeafCount<const TensorMap<PlainObjectType, Options_, MakePointer_> > {
+  static const size_t Count =1;
+};
+
+/// specialisation of the \ref LeafCount struct when the node type is TensorMap
+template <typename PlainObjectType, int Options_, template <class> class MakePointer_>
+struct LeafCount<TensorMap<PlainObjectType, Options_, MakePointer_> > :LeafCount<const TensorMap<PlainObjectType, Options_, MakePointer_> >{};
+
+// const TensorCwiseUnaryOp, const TensorCwiseNullaryOp, const TensorCwiseBinaryOp, const TensorCwiseTernaryOp, and Const TensorBroadcastingOp
+template <template <class, class...> class CategoryExpr, typename OP, typename... RHSExpr>
+struct LeafCount<const CategoryExpr<OP, RHSExpr...> >: CategoryCount<RHSExpr...> {};
+// TensorCwiseUnaryOp,  TensorCwiseNullaryOp,  TensorCwiseBinaryOp,  TensorCwiseTernaryOp, and  TensorBroadcastingOp
+template <template <class, class...> class CategoryExpr, typename OP, typename... RHSExpr>
+struct LeafCount<CategoryExpr<OP, RHSExpr...> > :LeafCount<const CategoryExpr<OP, RHSExpr...> >{};
+
+/// specialisation of the \ref LeafCount struct when the node type is const TensorSelectOp is an exception
+template <typename IfExpr, typename ThenExpr, typename ElseExpr>
+struct LeafCount<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr> > : CategoryCount<IfExpr, ThenExpr, ElseExpr> {};
+/// specialisation of the \ref LeafCount struct when the node type is TensorSelectOp
+template <typename IfExpr, typename ThenExpr, typename ElseExpr>
+struct LeafCount<TensorSelectOp<IfExpr, ThenExpr, ElseExpr> >: LeafCount<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr> > {};
+
+
+/// specialisation of the \ref LeafCount struct when the node type is const TensorAssignOp
+template <typename LHSExpr, typename RHSExpr>
+struct LeafCount<const TensorAssignOp<LHSExpr, RHSExpr> >: CategoryCount<LHSExpr,RHSExpr> {};
+
+/// specialisation of the \ref LeafCount struct when the node type is
+/// TensorAssignOp is an exception. It is not the same as Unary
+template <typename LHSExpr, typename RHSExpr>
+struct LeafCount<TensorAssignOp<LHSExpr, RHSExpr> > :LeafCount<const TensorAssignOp<LHSExpr, RHSExpr> >{};
+
+/// specialisation of the \ref LeafCount struct when the node type is const TensorForcedEvalOp
+template <typename Expr>
+struct LeafCount<const TensorForcedEvalOp<Expr> > {
+    static const size_t Count =1;
+};
+
+/// specialisation of the \ref LeafCount struct when the node type is TensorForcedEvalOp
+template <typename Expr>
+struct LeafCount<TensorForcedEvalOp<Expr> >: LeafCount<const TensorForcedEvalOp<Expr> > {};
+
+/// specialisation of the \ref LeafCount struct when the node type is const TensorEvalToOp
+template <typename Expr>
+struct LeafCount<const TensorEvalToOp<Expr> > {
+  static const size_t Count = 1 + CategoryCount<Expr>::Count;
+};
+
+/// specialisation of the \ref LeafCount struct when the node type is const TensorReductionOp
+template <typename OP, typename Dim, typename Expr>
+struct LeafCount<const TensorReductionOp<OP, Dim, Expr> > {
+    static const size_t Count =1;
+};
+
+/// specialisation of the \ref LeafCount struct when the node type is TensorReductionOp
+template <typename OP, typename Dim, typename Expr>
+struct LeafCount<TensorReductionOp<OP, Dim, Expr> >: LeafCount<const TensorReductionOp<OP, Dim, Expr> >{};
+
+/// specialisation of the \ref LeafCount struct when the node type is TensorEvalToOp
+template <typename Expr>
+struct LeafCount<TensorEvalToOp<Expr> >: LeafCount<const TensorEvalToOp<Expr> >{};
+
+} /// namespace TensorSycl
+} /// namespace internal
+} /// namespace Eigen
+
+#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_LEAF_COUNT_HPP
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h
new file mode 100644
index 000000000..d4c250c6d
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h
@@ -0,0 +1,181 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * TensorSyclPlaceHolderExpr.h
+ *
+ * \brief:
+ *  This is the specialisation of the placeholder expression based on the
+ * operation type
+ *
+*****************************************************************/
+
+#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_PLACEHOLDER_EXPR_HPP
+#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_PLACEHOLDER_EXPR_HPP
+
+namespace Eigen {
+namespace TensorSycl {
+namespace internal {
+
+/// \struct PlaceHolder
+/// \brief PlaceHolder is used to replace the \ref TensorMap in the expression
+/// tree.
+/// PlaceHolder contains the order of the leaf node in the expression tree.
+template <typename Scalar, size_t N>
+struct PlaceHolder {
+  static constexpr size_t I = N;
+  typedef Scalar Type;
+};
+
+/// \sttruct PlaceHolderExpression
+/// \brief it is used to create the PlaceHolder expression. The PlaceHolder
+/// expression is a copy of expression type in which the TensorMap of the has
+/// been replaced with PlaceHolder.
+template <typename Expr, size_t N>
+struct PlaceHolderExpression;
+
+template<size_t N, typename... Args>
+struct CalculateIndex;
+
+template<size_t N, typename Arg>
+struct CalculateIndex<N, Arg>{
+  typedef typename PlaceHolderExpression<Arg, N>::Type ArgType;
+  typedef utility::tuple::Tuple<ArgType> ArgsTuple;
+};
+
+template<size_t N, typename Arg1, typename Arg2>
+struct CalculateIndex<N, Arg1, Arg2>{
+  static const size_t Arg2LeafCount = LeafCount<Arg2>::Count;
+  typedef typename PlaceHolderExpression<Arg1, N - Arg2LeafCount>::Type Arg1Type;
+  typedef typename PlaceHolderExpression<Arg2, N>::Type Arg2Type;
+  typedef utility::tuple::Tuple<Arg1Type, Arg2Type> ArgsTuple;
+};
+
+template<size_t N, typename Arg1, typename Arg2, typename Arg3>
+struct CalculateIndex<N, Arg1, Arg2, Arg3> {
+  static const size_t Arg3LeafCount = LeafCount<Arg3>::Count;
+  static const size_t Arg2LeafCount = LeafCount<Arg2>::Count;
+  typedef typename PlaceHolderExpression<Arg1, N - Arg3LeafCount - Arg2LeafCount>::Type Arg1Type;
+  typedef typename PlaceHolderExpression<Arg2, N - Arg3LeafCount>::Type Arg2Type;
+  typedef typename PlaceHolderExpression<Arg3, N>::Type Arg3Type;
+  typedef utility::tuple::Tuple<Arg1Type, Arg2Type, Arg3Type> ArgsTuple;
+};
+
+template<template<class...> class Category , class OP, class TPL>
+struct CategoryHelper;
+
+template<template<class...> class Category , class OP, class ...T >
+struct CategoryHelper<Category, OP, utility::tuple::Tuple<T...> > {
+  typedef Category<OP, T... > Type;
+};
+
+template<template<class...> class Category , class ...T >
+struct CategoryHelper<Category, NoOP, utility::tuple::Tuple<T...> > {
+  typedef Category<T... > Type;
+};
+
+/// specialisation of the \ref PlaceHolderExpression when the node is
+/// TensorCwiseNullaryOp, TensorCwiseUnaryOp, TensorBroadcastingOp, TensorCwiseBinaryOp,  TensorCwiseTernaryOp
+#define OPEXPRCATEGORY(CVQual)\
+template <template <class, class... > class Category, typename OP, typename... SubExpr, size_t N>\
+struct PlaceHolderExpression<CVQual Category<OP, SubExpr...>, N>{\
+  typedef CVQual typename CategoryHelper<Category, OP, typename CalculateIndex<N, SubExpr...>::ArgsTuple>::Type Type;\
+};
+
+OPEXPRCATEGORY(const)
+OPEXPRCATEGORY()
+#undef OPEXPRCATEGORY
+
+/// specialisation of the \ref PlaceHolderExpression when the node is
+/// TensorCwiseSelectOp
+#define SELECTEXPR(CVQual)\
+template <typename IfExpr, typename ThenExpr, typename ElseExpr, size_t N>\
+struct PlaceHolderExpression<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, N> {\
+  typedef CVQual typename CategoryHelper<TensorSelectOp, NoOP, typename CalculateIndex<N, IfExpr, ThenExpr, ElseExpr>::ArgsTuple>::Type Type;\
+};
+
+SELECTEXPR(const)
+SELECTEXPR()
+#undef SELECTEXPR
+
+/// specialisation of the \ref PlaceHolderExpression when the node is
+/// TensorAssignOp
+#define ASSIGNEXPR(CVQual)\
+template <typename LHSExpr, typename RHSExpr, size_t N>\
+struct PlaceHolderExpression<CVQual TensorAssignOp<LHSExpr, RHSExpr>, N> {\
+  typedef CVQual typename CategoryHelper<TensorAssignOp, NoOP, typename CalculateIndex<N, LHSExpr, RHSExpr>::ArgsTuple>::Type Type;\
+};
+
+ASSIGNEXPR(const)
+ASSIGNEXPR()
+#undef ASSIGNEXPR
+
+/// specialisation of the \ref PlaceHolderExpression when the node is
+/// TensorMap
+#define TENSORMAPEXPR(CVQual)\
+template <typename Scalar_, int Options_, int Options2_, int NumIndices_, typename IndexType_, template <class> class MakePointer_, size_t N>\
+struct PlaceHolderExpression< CVQual TensorMap< Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakePointer_>, N> {\
+  typedef CVQual PlaceHolder<CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakePointer_>, N> Type;\
+};
+
+TENSORMAPEXPR(const)
+TENSORMAPEXPR()
+#undef TENSORMAPEXPR
+
+/// specialisation of the \ref PlaceHolderExpression when the node is
+/// TensorForcedEvalOp
+#define FORCEDEVAL(CVQual)\
+template <typename Expr, size_t N>\
+struct PlaceHolderExpression<CVQual TensorForcedEvalOp<Expr>, N> {\
+  typedef CVQual PlaceHolder<CVQual TensorForcedEvalOp<Expr>, N> Type;\
+};
+
+FORCEDEVAL(const)
+FORCEDEVAL()
+#undef FORCEDEVAL
+
+/// specialisation of the \ref PlaceHolderExpression when the node is
+/// TensorEvalToOp
+#define EVALTO(CVQual)\
+template <typename Expr, size_t N>\
+struct PlaceHolderExpression<CVQual TensorEvalToOp<Expr>, N> {\
+  typedef CVQual TensorEvalToOp<typename CalculateIndex <N, Expr>::ArgType> Type;\
+};
+
+EVALTO(const)
+EVALTO()
+#undef EVALTO
+
+
+/// specialisation of the \ref PlaceHolderExpression when the node is
+/// TensorReductionOp
+#define SYCLREDUCTION(CVQual)\
+template <typename OP, typename Dims, typename Expr, size_t N>\
+struct PlaceHolderExpression<CVQual TensorReductionOp<OP, Dims, Expr>, N>{\
+  typedef CVQual PlaceHolder<CVQual TensorReductionOp<OP, Dims,Expr>, N> Type;\
+};
+SYCLREDUCTION(const)
+SYCLREDUCTION()
+#undef SYCLREDUCTION
+
+/// template deduction for \ref PlaceHolderExpression struct
+template <typename Expr>
+struct createPlaceHolderExpression {
+  static const size_t TotalLeaves = LeafCount<Expr>::Count;
+  typedef typename PlaceHolderExpression<Expr, TotalLeaves - 1>::Type Type;
+};
+
+}  // internal
+}  // TensorSycl
+}  // namespace Eigen
+
+#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_PLACEHOLDER_EXPR_HPP
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h
new file mode 100644
index 000000000..7914b6fad
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h
@@ -0,0 +1,70 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Cummins Chris PhD student at The University of Edinburgh.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * TensorSyclRun.h
+ *
+ * \brief:
+ * Schedule_kernel invoke an specialised version of kernel struct. The
+ * specialisation is based on the data dimension in sycl buffer
+ *
+*****************************************************************/
+
+#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_SYCLRUN_HPP
+#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_SYCLRUN_HPP
+
+namespace Eigen {
+namespace TensorSycl {
+/// The run function in tensor sycl convert the expression tree to a buffer
+/// based expression tree;
+/// creates the expression tree for the device with accessor to buffers;
+/// construct the kernel and submit it to the sycl queue.
+template <typename Expr, typename Dev>
+void run(Expr &expr, Dev &dev) {
+  Eigen::TensorEvaluator<Expr, Dev> evaluator(expr, dev);
+  const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+  if (needs_assign) {
+    typedef  typename internal::createPlaceHolderExpression<Expr>::Type PlaceHolderExpr;
+    auto functors = internal::extractFunctors(evaluator);
+
+    size_t tileSize =dev.m_queue.get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()/2;
+    dev.m_queue.submit([&](cl::sycl::handler &cgh) {
+
+      // create a tuple of accessors from Evaluator
+      auto tuple_of_accessors = internal::createTupleOfAccessors<decltype(evaluator)>(cgh, evaluator);
+      const auto range = utility::tuple::get<0>(tuple_of_accessors).get_range()[0];
+      size_t GRange=range;
+      if (tileSize>GRange) tileSize=GRange;
+      else if(GRange>tileSize){
+        size_t xMode = GRange % tileSize;
+        if (xMode != 0) GRange += (tileSize - xMode);
+      }
+      // run the kernel
+      cgh.parallel_for<PlaceHolderExpr>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) {
+        typedef  typename internal::ConvertToDeviceExpression<Expr>::Type DevExpr;
+        auto device_expr =internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
+        auto device_evaluator = Eigen::TensorEvaluator<decltype(device_expr.expr), Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice());
+        if (itemID.get_global_linear_id() < range) {
+          device_evaluator.evalScalar(static_cast<int>(itemID.get_global_linear_id()));
+        }
+      });
+    });
+    dev.m_queue.throw_asynchronous();
+  }
+
+  evaluator.cleanup();
+}
+}  // namespace TensorSycl
+}  // namespace Eigen
+
+#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_SYCLRUN_HPP
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h
new file mode 100644
index 000000000..063b027e8
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h
@@ -0,0 +1,234 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * TensroSyclTuple.h
+ *
+ * \brief:
+ *  Minimal implementation of std::tuple that can be used inside a SYCL kernel.
+ *
+*****************************************************************/
+
+#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_TUPLE_HPP
+#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_TUPLE_HPP
+namespace utility {
+namespace tuple {
+/// \struct StaticIf
+/// \brief The StaticIf struct is used to statically choose the type based on the
+/// condition.
+template <bool, typename T = void> struct StaticIf;
+/// \brief specialisation of the \ref StaticIf when the condition is true
+template <typename T>
+struct StaticIf<true, T> {
+  typedef T type;
+};
+
+/// \struct Tuple
+/// \brief is a fixed-size collection of heterogeneous values
+/// \ztparam Ts...	-	the types of the elements that the tuple stores.
+/// Empty list is supported.
+template <class... Ts>
+struct Tuple {};
+
+/// \brief specialisation of the \ref Tuple class when the tuple has at least
+/// one element.
+/// \tparam T : the type of the first element in the tuple.
+/// \tparam Ts... the rest of the elements in the tuple. Ts... can be empty.
+template <class T, class... Ts>
+struct Tuple<T, Ts...> {
+  Tuple(T t, Ts... ts) : head(t), tail(ts...) {}
+  T head;
+  Tuple<Ts...> tail;
+};
+
+///\ struct ElemTypeHolder
+/// \brief ElemTypeHolder class is used to specify the types of the
+/// elements inside the tuple
+/// \tparam size_t the number of elements inside the tuple
+/// \tparam class the tuple class
+template <size_t, class>
+struct ElemTypeHolder;
+
+/// \brief specialisation of the \ref ElemTypeHolder class when the number of
+/// elements inside the tuple is 1
+template <class T, class... Ts>
+struct ElemTypeHolder<0, Tuple<T, Ts...> > {
+  typedef T type;
+};
+
+/// \brief specialisation of the \ref ElemTypeHolder class when the number of
+/// elements inside the tuple is bigger than 1. It recursively calls itself to
+/// detect the type of each element in the tuple
+/// \tparam T : the type of the first element in the tuple.
+/// \tparam Ts... the rest of the elements in the tuple. Ts... can be empty.
+/// \tparam K is the Kth element in the tuple
+template <size_t k, class T, class... Ts>
+struct ElemTypeHolder<k, Tuple<T, Ts...> > {
+  typedef typename ElemTypeHolder<k - 1, Tuple<Ts...> >::type type;
+};
+
+/// get
+/// \brief Extracts the first element from the tuple.
+/// K=0 represents the first element of the tuple. The tuple cannot be empty.
+/// \tparam Ts... are the type of the elements in the tuple.
+/// \param t is the tuple whose contents to extract
+/// \return  typename ElemTypeHolder<0, Tuple<Ts...> >::type &>::type
+
+#define TERMINATE_CONDS_TUPLE_GET(CVQual) \
+template <size_t k, class... Ts> \
+typename StaticIf<k == 0, CVQual typename ElemTypeHolder<0, Tuple<Ts...> >::type &>::type \
+get(CVQual Tuple<Ts...> &t) { \
+  static_assert(sizeof...(Ts)!=0, "The requseted value is bigger than the size of the tuple"); \
+  return t.head; \
+}
+
+TERMINATE_CONDS_TUPLE_GET(const)
+TERMINATE_CONDS_TUPLE_GET()
+#undef TERMINATE_CONDS_TUPLE_GET
+/// get
+/// \brief Extracts the Kth element from the tuple.
+///\tparam K is an integer value in [0,sizeof...(Types)).
+/// \tparam T is the (sizeof...(Types) -(K+1)) element in the tuple
+/// \tparam Ts... are the type of the elements  in the tuple.
+/// \param t is the tuple whose contents to extract
+/// \return  typename ElemTypeHolder<K, Tuple<Ts...> >::type &>::type
+#define RECURSIVE_TUPLE_GET(CVQual) \
+template <size_t k, class T, class... Ts> \
+typename StaticIf<k != 0, CVQual typename ElemTypeHolder<k, Tuple<T, Ts...> >::type &>::type \
+get(CVQual Tuple<T, Ts...> &t) { \
+  return utility::tuple::get<k - 1>(t.tail); \
+}
+RECURSIVE_TUPLE_GET(const)
+RECURSIVE_TUPLE_GET()
+#undef RECURSIVE_TUPLE_GET
+
+/// make_tuple
+/// \brief Creates a tuple object, deducing the target type from the types of
+/// arguments.
+/// \tparam Args the type of the arguments to construct the tuple from
+/// \param args zero or more arguments to construct the tuple from
+/// \return Tuple<Args...>
+template <typename... Args>
+Tuple<Args...> make_tuple(Args... args) {
+  return Tuple<Args...>(args...);
+}
+
+/// size
+/// \brief Provides access to the number of elements in a tuple as a
+/// compile-time constant expression.
+/// \tparam Args the type of the arguments to construct the tuple from
+/// \return size_t
+template <typename... Args>
+static constexpr size_t size(Tuple<Args...> &) {
+  return sizeof...(Args);
+}
+
+/// \struct IndexList
+/// \brief Creates a list of index from the elements in the tuple
+/// \tparam Is... a list of index from [0 to sizeof...(tuple elements))
+template <size_t... Is>
+struct IndexList {};
+
+/// \struct RangeBuilder
+/// \brief Collects internal details for generating index ranges [MIN, MAX)
+/// Declare primary template for index range builder
+/// \tparam MIN is the starting index in the tuple
+/// \tparam N represents sizeof..(elemens)- sizeof...(Is)
+/// \tparam Is... are the list of generated index so far
+template <size_t MIN, size_t N, size_t... Is>
+struct RangeBuilder;
+
+/// \brief base Step: Specialisation of the \ref RangeBuilder when the
+/// MIN==MAX. In this case the Is... is [0 to sizeof...(tuple elements))
+/// \tparam MIN is the starting index of the tuple
+/// \tparam Is is [0 to sizeof...(tuple elements))
+template <size_t MIN, size_t... Is>
+struct RangeBuilder<MIN, MIN, Is...> {
+  typedef IndexList<Is...> type;
+};
+
+/// Induction step: Specialisation of the RangeBuilder class when N!=MIN
+/// in this case we are recursively subtracting N by one and adding one
+/// index to Is... list until MIN==N
+/// \tparam MIN is the starting index in the tuple
+/// \tparam N represents sizeof..(elemens)- sizeof...(Is)
+/// \tparam Is... are the list of generated index so far
+template <size_t MIN, size_t N, size_t... Is>
+struct RangeBuilder : public RangeBuilder<MIN, N - 1, N - 1, Is...> {};
+
+/// \brief IndexRange that returns a [MIN, MAX) index range
+/// \tparam MIN is the starting index in the tuple
+/// \tparam MAX is the size of the tuple
+template <size_t MIN, size_t MAX>
+struct IndexRange: RangeBuilder<MIN, MAX>::type {};
+
+/// append_base
+/// \brief unpacking the elements of the input tuple t and creating a new tuple
+/// by adding element a at the end of it.
+///\tparam Args... the type of the elements inside the tuple t
+/// \tparam T the type of the new element going to be added at the end of tuple
+/// \tparam I... is the list of index from [0 to sizeof...(t))
+/// \param t the tuple on which we want to append a.
+/// \param a the new elements going to be added to the tuple
+/// \return Tuple<Args..., T>
+template <typename... Args, typename T, size_t... I>
+Tuple<Args..., T> append_base(Tuple<Args...> t, T a,IndexList<I...>) {
+  return utility::tuple::make_tuple(get<I>(t)..., a);
+}
+
+/// append
+/// \brief the deduction function for \ref append_base that automatically
+/// generate the \ref IndexRange
+///\tparam Args... the type of the elements inside the tuple t
+/// \tparam T the type of the new element going to be added at the end of tuple
+/// \param t the tuple on which we want to append a.
+/// \param a the new elements going to be added to the tuple
+/// \return Tuple<Args..., T>
+template <typename... Args, typename T>
+Tuple<Args..., T> append(Tuple<Args...> t, T a) {
+  return utility::tuple::append_base(t, a,  IndexRange<0, sizeof...(Args)>());
+}
+
+/// append_base
+/// \brief This is a specialisation of \ref append_base when we want to
+/// concatenate
+/// tuple t2 at the end of the tuple t1. Here we unpack both tuples, generate the
+/// IndexRange for each of them and create an output tuple T that contains both
+/// elements of t1 and t2.
+///\tparam Args1... the type of the elements inside the tuple t1
+///\tparam Args2... the type of the elements inside the tuple t2
+/// \tparam I1... is the list of index from [0 to sizeof...(t1))
+/// \tparam I2... is the list of index from [0 to sizeof...(t2))
+/// \param t1 is the tuple on which we want to append t2.
+/// \param t2 is the tuple that is going to be added on t1.
+/// \return Tuple<Args1..., Args2...>
+template <typename... Args1, typename... Args2, size_t... I1, size_t... I2>
+Tuple<Args1..., Args2...> append_base(Tuple<Args1...> t1, Tuple<Args2...> t2, IndexList<I1...>, IndexList<I2...>) {
+  return utility::tuple::make_tuple(get<I1>(t1)...,get<I2>(t2)...);
+}
+
+/// append
+/// \brief deduction function for \ref append_base when we are appending tuple
+/// t1 by tuple t2. In this case the \ref IndexRange for both tuple are
+/// automatically generated.
+///\tparam Args1... the type of the elements inside the tuple t1
+///\tparam Args2... the type of the elements inside the tuple t2
+/// \param t1 is the tuple on which we want to append t2.
+/// \param t2 is the tuple that is going to be added on t1.
+/// \return Tuple<Args1..., Args2...>
+template <typename... Args1, typename... Args2>
+Tuple<Args1..., Args2...> append(Tuple<Args1...> t1,Tuple<Args2...> t2) {
+  return utility::tuple::append_base(t1, t2, IndexRange<0, sizeof...(Args1)>(), IndexRange<0, sizeof...(Args2)>());
+}
+}  // tuple
+}  // utility
+#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_TUPLE_HPP
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
new file mode 100644
index 000000000..ffcf8b00f
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
@@ -0,0 +1,272 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H
+#define EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H
+
+namespace Eigen {
+namespace internal {
+
+
+template<typename Scalar, int Options>
+class compute_tensor_flags
+{
+  enum {
+    is_dynamic_size_storage = 1,
+
+    is_aligned =
+    (
+        ((Options&DontAlign)==0) && (
+#if EIGEN_MAX_STATIC_ALIGN_BYTES>0
+            (!is_dynamic_size_storage)
+#else
+            0
+#endif
+            |
+#if EIGEN_MAX_ALIGN_BYTES>0
+            is_dynamic_size_storage
+#else
+            0
+#endif
+      )
+     ),
+    packet_access_bit = packet_traits<Scalar>::Vectorizable && is_aligned ? PacketAccessBit : 0
+  };
+
+  public:
+    enum { ret = packet_access_bit };
+};
+
+
+template<typename Scalar_, int NumIndices_, int Options_, typename IndexType_>
+struct traits<Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
+{
+  typedef Scalar_ Scalar;
+  typedef Dense StorageKind;
+  typedef IndexType_ Index;
+  static const int NumDimensions = NumIndices_;
+  static const int Layout = Options_ & RowMajor ? RowMajor : ColMajor;
+  enum {
+    Options = Options_,
+    Flags = compute_tensor_flags<Scalar_, Options_>::ret | (is_const<Scalar_>::value ? 0 : LvalueBit)
+  };
+  template <typename T> struct MakePointer {
+    typedef T* Type;
+  };
+};
+
+
+template<typename Scalar_, typename Dimensions, int Options_, typename IndexType_>
+struct traits<TensorFixedSize<Scalar_, Dimensions, Options_, IndexType_> >
+{
+  typedef Scalar_ Scalar;
+  typedef Dense StorageKind;
+  typedef IndexType_ Index;
+  static const int NumDimensions = array_size<Dimensions>::value;
+  static const int Layout = Options_ & RowMajor ? RowMajor : ColMajor;
+  enum {
+    Options = Options_,
+    Flags = compute_tensor_flags<Scalar_, Options_>::ret | (is_const<Scalar_>::value ? 0: LvalueBit)
+  };
+  template <typename T> struct MakePointer {
+    typedef T* Type;
+  };
+};
+
+
+template<typename PlainObjectType, int Options_, template <class> class MakePointer_>
+struct traits<TensorMap<PlainObjectType, Options_, MakePointer_> >
+  : public traits<PlainObjectType>
+{
+  typedef traits<PlainObjectType> BaseTraits;
+  typedef typename BaseTraits::Scalar Scalar;
+  typedef typename BaseTraits::StorageKind StorageKind;
+  typedef typename BaseTraits::Index Index;
+  static const int NumDimensions = BaseTraits::NumDimensions;
+  static const int Layout = BaseTraits::Layout;
+  enum {
+    Options = Options_,
+    Flags = BaseTraits::Flags
+  };
+  template <class T> struct MakePointer {
+    // Intermediate typedef to workaround MSVC issue.
+    typedef MakePointer_<T> MakePointerT;
+    typedef typename MakePointerT::Type Type;
+  };
+};
+
+template<typename PlainObjectType>
+struct traits<TensorRef<PlainObjectType> >
+  : public traits<PlainObjectType>
+{
+  typedef traits<PlainObjectType> BaseTraits;
+  typedef typename BaseTraits::Scalar Scalar;
+  typedef typename BaseTraits::StorageKind StorageKind;
+  typedef typename BaseTraits::Index Index;
+  static const int NumDimensions = BaseTraits::NumDimensions;
+  static const int Layout = BaseTraits::Layout;
+  enum {
+    Options = BaseTraits::Options,
+    Flags = BaseTraits::Flags
+  };
+};
+
+
+template<typename _Scalar, int NumIndices_, int Options, typename IndexType_>
+struct eval<Tensor<_Scalar, NumIndices_, Options, IndexType_>, Eigen::Dense>
+{
+  typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>& type;
+};
+
+template<typename _Scalar, int NumIndices_, int Options, typename IndexType_>
+struct eval<const Tensor<_Scalar, NumIndices_, Options, IndexType_>, Eigen::Dense>
+{
+  typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>& type;
+};
+
+template<typename Scalar_, typename Dimensions, int Options, typename IndexType_>
+struct eval<TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>, Eigen::Dense>
+{
+  typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>& type;
+};
+
+template<typename Scalar_, typename Dimensions, int Options, typename IndexType_>
+struct eval<const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>, Eigen::Dense>
+{
+  typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>& type;
+};
+
+template<typename PlainObjectType, int Options, template <class> class MakePointer>
+struct eval<TensorMap<PlainObjectType, Options, MakePointer>, Eigen::Dense>
+{
+  typedef const TensorMap<PlainObjectType, Options, MakePointer>& type;
+};
+
+template<typename PlainObjectType, int Options, template <class> class MakePointer>
+struct eval<const TensorMap<PlainObjectType, Options, MakePointer>, Eigen::Dense>
+{
+  typedef const TensorMap<PlainObjectType, Options, MakePointer>& type;
+};
+
+template<typename PlainObjectType>
+struct eval<TensorRef<PlainObjectType>, Eigen::Dense>
+{
+  typedef const TensorRef<PlainObjectType>& type;
+};
+
+template<typename PlainObjectType>
+struct eval<const TensorRef<PlainObjectType>, Eigen::Dense>
+{
+  typedef const TensorRef<PlainObjectType>& type;
+};
+
+// TODO nested<> does not exist anymore in Eigen/Core, and it thus has to be removed in favor of ref_selector.
+template<typename T, int n=1, typename PlainObject = void> struct nested
+{
+  typedef typename ref_selector<T>::type type;
+};
+
+template <typename Scalar_, int NumIndices_, int Options_, typename IndexType_>
+struct nested<Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
+{
+  typedef const Tensor<Scalar_, NumIndices_, Options_, IndexType_>& type;
+};
+
+template <typename Scalar_, int NumIndices_, int Options_, typename IndexType_>
+struct nested<const Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
+{
+  typedef const Tensor<Scalar_, NumIndices_, Options_, IndexType_>& type;
+};
+
+template <typename Scalar_, typename Dimensions, int Options, typename IndexType_>
+struct nested<TensorFixedSize<Scalar_, Dimensions, Options, IndexType_> >
+{
+  typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>& type;
+};
+
+template <typename Scalar_, typename Dimensions, int Options, typename IndexType_>
+struct nested<const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_> >
+{
+  typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>& type;
+};
+
+
+template <typename PlainObjectType, int Options, template <class> class MakePointer>
+struct nested<TensorMap<PlainObjectType, Options, MakePointer> >
+{
+  typedef const TensorMap<PlainObjectType, Options, MakePointer>& type;
+};
+
+template <typename PlainObjectType, int Options, template <class> class MakePointer>
+struct nested<const TensorMap<PlainObjectType, Options, MakePointer> >
+{
+  typedef const TensorMap<PlainObjectType, Options, MakePointer>& type;
+};
+
+template <typename PlainObjectType>
+struct nested<TensorRef<PlainObjectType> >
+{
+  typedef const TensorRef<PlainObjectType>& type;
+};
+
+template <typename PlainObjectType>
+struct nested<const TensorRef<PlainObjectType> >
+{
+  typedef const TensorRef<PlainObjectType>& type;
+};
+
+}  // end namespace internal
+
+// Convolutional layers take in an input tensor of shape (D, R, C, B), or (D, C,
+// R, B), and convolve it with a set of filters, which can also be presented as
+// a tensor (D, K, K, M), where M is the number of filters, K is the filter
+// size, and each 3-dimensional tensor of size (D, K, K) is a filter. For
+// simplicity we assume that we always use square filters (which is usually the
+// case in images), hence the two Ks in the tensor dimension.  It also takes in
+// a few additional parameters:
+// Stride (S): The convolution stride is the offset between locations where we
+//             apply the filters.  A larger stride means that the output will be
+//             spatially smaller.
+// Padding (P): The padding we apply to the input tensor along the R and C
+//              dimensions.  This is usually used to make sure that the spatial
+//              dimensions of the output matches our intention.
+//
+// Two types of padding are often used:
+//   SAME: The pad value is computed so that the output will have size
+//         R/S and C/S.
+//   VALID: no padding is carried out.
+// When we do padding, the padded values at the padded locations are usually
+// zero.
+//
+// The output dimensions for convolution, when given all the parameters above,
+// are as follows:
+// When Padding = SAME: the output size is (B, R', C', M), where
+//   R' = ceil(float(R) / float(S))
+//   C' = ceil(float(C) / float(S))
+// where ceil is the ceiling function.  The input tensor is padded with 0 as
+// needed.  The number of padded rows and columns are computed as:
+//   Pr = ((R' - 1) * S + K - R) / 2
+//   Pc = ((C' - 1) * S + K - C) / 2
+// when the stride is 1, we have the simplified case R'=R, C'=C, Pr=Pc=(K-1)/2.
+// This is where SAME comes from - the output has the same size as the input has.
+// When Padding = VALID: the output size is computed as
+//   R' = ceil(float(R - K + 1) / float(S))
+//   C' = ceil(float(C - K + 1) / float(S))
+// and the number of padded rows and columns are computed in the same way as in
+// the SAME case.
+// When the stride is 1, we have the simplified case R'=R-K+1, C'=C-K+1, Pr=0,
+// Pc=0.
+typedef enum {
+  PADDING_VALID = 1,
+  PADDING_SAME = 2
+} PaddingType;
+
+}  // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
new file mode 100644
index 000000000..3523e7c94
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
@@ -0,0 +1,248 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_UINT128_H
+#define EIGEN_CXX11_TENSOR_TENSOR_UINT128_H
+
+namespace Eigen {
+namespace internal {
+
+
+template <uint64_t n>
+struct static_val {
+  static const uint64_t value = n;
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE operator uint64_t() const { return n; }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val() { }
+
+  template <typename T>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val(const T& v) {
+    eigen_assert(v == n);
+  }
+};
+
+
+template <typename HIGH = uint64_t, typename LOW = uint64_t>
+struct TensorUInt128
+{
+  HIGH high;
+  LOW low;
+
+  template<typename OTHER_HIGH, typename OTHER_LOW>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+  TensorUInt128(const TensorUInt128<OTHER_HIGH, OTHER_LOW>& other) : high(other.high), low(other.low) {
+    EIGEN_STATIC_ASSERT(sizeof(OTHER_HIGH) <= sizeof(HIGH), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    EIGEN_STATIC_ASSERT(sizeof(OTHER_LOW) <= sizeof(LOW), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  }
+
+  template<typename OTHER_HIGH, typename OTHER_LOW>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+  TensorUInt128& operator = (const TensorUInt128<OTHER_HIGH, OTHER_LOW>& other) {
+    EIGEN_STATIC_ASSERT(sizeof(OTHER_HIGH) <= sizeof(HIGH), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    EIGEN_STATIC_ASSERT(sizeof(OTHER_LOW) <= sizeof(LOW), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    high = other.high;
+    low = other.low;
+    return *this;
+  }
+
+  template<typename T>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+  explicit TensorUInt128(const T& x) : high(0), low(x) {
+    eigen_assert((static_cast<typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type>(x) <= NumTraits<uint64_t>::highest()));
+    eigen_assert(x >= 0);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+  TensorUInt128(HIGH y, LOW x) : high(y), low(x) { }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE operator LOW() const {
+    return low;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LOW lower() const {
+    return low;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HIGH upper() const {
+    return high;
+  }
+};
+
+
+template <typename HL, typename LL, typename HR, typename LR>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+bool operator == (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+{
+  return (lhs.high == rhs.high) & (lhs.low == rhs.low);
+}
+
+template <typename HL, typename LL, typename HR, typename LR>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+bool operator != (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+{
+  return (lhs.high != rhs.high) | (lhs.low != rhs.low);
+}
+
+template <typename HL, typename LL, typename HR, typename LR>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+bool operator >= (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+{
+  if (lhs.high != rhs.high) {
+    return lhs.high > rhs.high;
+  }
+  return lhs.low >= rhs.low;
+}
+
+template <typename HL, typename LL, typename HR, typename LR>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+bool operator < (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+{
+  if (lhs.high != rhs.high) {
+    return lhs.high < rhs.high;
+  }
+  return lhs.low < rhs.low;
+}
+
+template <typename HL, typename LL, typename HR, typename LR>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+TensorUInt128<uint64_t, uint64_t> operator + (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+{
+  TensorUInt128<uint64_t, uint64_t> result(lhs.high + rhs.high, lhs.low + rhs.low);
+  if (result.low < rhs.low) {
+    result.high += 1;
+  }
+  return result;
+}
+
+template <typename HL, typename LL, typename HR, typename LR>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+TensorUInt128<uint64_t, uint64_t> operator - (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+{
+  TensorUInt128<uint64_t, uint64_t> result(lhs.high - rhs.high, lhs.low - rhs.low);
+  if (result.low > lhs.low) {
+    result.high -= 1;
+  }
+  return result;
+}
+
+
+template <typename HL, typename LL, typename HR, typename LR>
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+TensorUInt128<uint64_t, uint64_t> operator * (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+{
+  // Split each 128-bit integer into 4 32-bit integers, and then do the
+  // multiplications by hand as follow:
+  //   lhs      a  b  c  d
+  //   rhs      e  f  g  h
+  //           -----------
+  //           ah bh ch dh
+  //           bg cg dg
+  //           cf df
+  //           de
+  // The result is stored in 2 64bit integers, high and low.
+
+  const uint64_t LOW = 0x00000000FFFFFFFFLL;
+  const uint64_t HIGH = 0xFFFFFFFF00000000LL;
+
+  uint64_t d = lhs.low & LOW;
+  uint64_t c = (lhs.low & HIGH) >> 32LL;
+  uint64_t b = lhs.high & LOW;
+  uint64_t a = (lhs.high & HIGH) >> 32LL;
+
+  uint64_t h = rhs.low & LOW;
+  uint64_t g = (rhs.low & HIGH) >> 32LL;
+  uint64_t f = rhs.high & LOW;
+  uint64_t e = (rhs.high & HIGH) >> 32LL;
+
+  // Compute the low 32 bits of low
+  uint64_t acc = d * h;
+  uint64_t low = acc & LOW;
+  //  Compute the high 32 bits of low. Add a carry every time we wrap around
+  acc >>= 32LL;
+  uint64_t carry = 0;
+  uint64_t acc2 = acc + c * h;
+  if (acc2 < acc) {
+    carry++;
+  }
+  acc = acc2 + d * g;
+  if (acc < acc2) {
+    carry++;
+  }
+  low |= (acc << 32LL);
+
+  // Carry forward the high bits of acc to initiate the computation of the
+  // low 32 bits of high
+  acc2 = (acc >> 32LL) | (carry << 32LL);
+  carry = 0;
+
+  acc = acc2 + b * h;
+  if (acc < acc2) {
+    carry++;
+  }
+  acc2 = acc + c * g;
+  if (acc2 < acc) {
+    carry++;
+  }
+  acc = acc2 + d * f;
+  if (acc < acc2) {
+    carry++;
+  }
+  uint64_t high = acc & LOW;
+
+  // Start to compute the high 32 bits of high.
+  acc2 = (acc >> 32LL) | (carry << 32LL);
+
+  acc = acc2 + a * h;
+  acc2 = acc + b * g;
+  acc = acc2 + c * f;
+  acc2 = acc + d * e;
+  high |= (acc2 << 32LL);
+
+  return TensorUInt128<uint64_t, uint64_t>(high, low);
+}
+
+template <typename HL, typename LL, typename HR, typename LR>
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+TensorUInt128<uint64_t, uint64_t> operator / (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+{
+  if (rhs == TensorUInt128<static_val<0>, static_val<1> >(1)) {
+    return TensorUInt128<uint64_t, uint64_t>(lhs.high, lhs.low);
+  } else if (lhs < rhs) {
+    return TensorUInt128<uint64_t, uint64_t>(0);
+  } else {
+    // calculate the biggest power of 2 times rhs that's less than or equal to lhs
+    TensorUInt128<uint64_t, uint64_t> power2(1);
+    TensorUInt128<uint64_t, uint64_t> d(rhs);
+    TensorUInt128<uint64_t, uint64_t> tmp(lhs - d);
+    while (lhs >= d) {
+      tmp = tmp - d;
+      d = d + d;
+      power2 = power2 + power2;
+    }
+
+    tmp = TensorUInt128<uint64_t, uint64_t>(lhs.high, lhs.low);
+    TensorUInt128<uint64_t, uint64_t> result(0);
+    while (power2 != TensorUInt128<static_val<0>, static_val<0> >(0)) {
+      if (tmp >= d) {
+        tmp = tmp - d;
+        result = result + power2;
+      }
+      // Shift right
+      power2 = TensorUInt128<uint64_t, uint64_t>(power2.high >> 1, (power2.low >> 1) | (power2.high << 63));
+      d = TensorUInt128<uint64_t, uint64_t>(d.high >> 1, (d.low >> 1) | (d.high << 63));
+    }
+
+    return result;
+  }
+}
+
+
+}  // namespace internal
+}  // namespace Eigen
+
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_UINT128_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
new file mode 100644
index 000000000..0ca2cac84
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
@@ -0,0 +1,608 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_VOLUME_PATCH_H
+#define EIGEN_CXX11_TENSOR_TENSOR_VOLUME_PATCH_H
+
+namespace Eigen {
+
+/** \class TensorVolumePatch
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief Patch extraction specialized for processing of volumetric data.
+  * This assumes that the input has a least 4 dimensions ordered as follows:
+  *  - channels
+  *  - planes
+  *  - rows
+  *  - columns
+  *  - (optional) additional dimensions such as time or batch size.
+  * Calling the volume patch code with patch_planes, patch_rows, and patch_cols
+  * is equivalent to calling the regular patch extraction code with parameters
+  * d, patch_planes, patch_rows, patch_cols, and 1 for all the additional
+  * dimensions.
+  */
+namespace internal {
+template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType>
+struct traits<TensorVolumePatchOp<Planes, Rows, Cols, XprType> > : public traits<XprType>
+{
+  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef typename XprType::Nested Nested;
+  typedef typename remove_reference<Nested>::type _Nested;
+  static const int NumDimensions = XprTraits::NumDimensions + 1;
+  static const int Layout = XprTraits::Layout;
+};
+
+template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType>
+struct eval<TensorVolumePatchOp<Planes, Rows, Cols, XprType>, Eigen::Dense>
+{
+  typedef const TensorVolumePatchOp<Planes, Rows, Cols, XprType>& type;
+};
+
+template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType>
+struct nested<TensorVolumePatchOp<Planes, Rows, Cols, XprType>, 1, typename eval<TensorVolumePatchOp<Planes, Rows, Cols, XprType> >::type>
+{
+  typedef TensorVolumePatchOp<Planes, Rows, Cols, XprType> type;
+};
+
+}  // end namespace internal
+
+template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType>
+class TensorVolumePatchOp : public TensorBase<TensorVolumePatchOp<Planes, Rows, Cols, XprType>, ReadOnlyAccessors>
+{
+  public:
+  typedef typename Eigen::internal::traits<TensorVolumePatchOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorVolumePatchOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorVolumePatchOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorVolumePatchOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorVolumePatchOp(const XprType& expr, DenseIndex patch_planes, DenseIndex patch_rows, DenseIndex patch_cols,
+                                                            DenseIndex plane_strides, DenseIndex row_strides, DenseIndex col_strides,
+                                                            DenseIndex in_plane_strides, DenseIndex in_row_strides, DenseIndex in_col_strides,
+                                                            DenseIndex plane_inflate_strides, DenseIndex row_inflate_strides, DenseIndex col_inflate_strides,
+                                                            PaddingType padding_type, Scalar padding_value)
+      : m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
+        m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides),
+        m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
+        m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
+        m_padding_explicit(false), m_padding_top_z(0), m_padding_bottom_z(0), m_padding_top(0), m_padding_bottom(0), m_padding_left(0), m_padding_right(0),
+        m_padding_type(padding_type), m_padding_value(padding_value) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorVolumePatchOp(const XprType& expr, DenseIndex patch_planes, DenseIndex patch_rows, DenseIndex patch_cols,
+                                                           DenseIndex plane_strides, DenseIndex row_strides, DenseIndex col_strides,
+                                                           DenseIndex in_plane_strides, DenseIndex in_row_strides, DenseIndex in_col_strides,
+                                                           DenseIndex plane_inflate_strides, DenseIndex row_inflate_strides, DenseIndex col_inflate_strides,
+                                                           DenseIndex padding_top_z, DenseIndex padding_bottom_z,
+                                                           DenseIndex padding_top, DenseIndex padding_bottom,
+                                                           DenseIndex padding_left, DenseIndex padding_right,
+                                                           Scalar padding_value)
+      : m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
+        m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides),
+        m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
+        m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
+        m_padding_explicit(true), m_padding_top_z(padding_top_z), m_padding_bottom_z(padding_bottom_z), m_padding_top(padding_top), m_padding_bottom(padding_bottom),
+        m_padding_left(padding_left), m_padding_right(padding_right),
+        m_padding_type(PADDING_VALID), m_padding_value(padding_value) {}
+
+    EIGEN_DEVICE_FUNC
+    DenseIndex patch_planes() const { return m_patch_planes; }
+    EIGEN_DEVICE_FUNC
+    DenseIndex patch_rows() const { return m_patch_rows; }
+    EIGEN_DEVICE_FUNC
+    DenseIndex patch_cols() const { return m_patch_cols; }
+    EIGEN_DEVICE_FUNC
+    DenseIndex plane_strides() const { return m_plane_strides; }
+    EIGEN_DEVICE_FUNC
+    DenseIndex row_strides() const { return m_row_strides; }
+    EIGEN_DEVICE_FUNC
+    DenseIndex col_strides() const { return m_col_strides; }
+    EIGEN_DEVICE_FUNC
+    DenseIndex in_plane_strides() const { return m_in_plane_strides; }
+    EIGEN_DEVICE_FUNC
+    DenseIndex in_row_strides() const { return m_in_row_strides; }
+    EIGEN_DEVICE_FUNC
+    DenseIndex in_col_strides() const { return m_in_col_strides; }
+    EIGEN_DEVICE_FUNC
+    DenseIndex plane_inflate_strides() const { return m_plane_inflate_strides; }
+    EIGEN_DEVICE_FUNC
+    DenseIndex row_inflate_strides() const { return m_row_inflate_strides; }
+    EIGEN_DEVICE_FUNC
+    DenseIndex col_inflate_strides() const { return m_col_inflate_strides; }
+    EIGEN_DEVICE_FUNC
+    bool padding_explicit() const { return m_padding_explicit; }
+    EIGEN_DEVICE_FUNC
+    DenseIndex padding_top_z() const { return m_padding_top_z; }
+    EIGEN_DEVICE_FUNC
+    DenseIndex padding_bottom_z() const { return m_padding_bottom_z; }
+    EIGEN_DEVICE_FUNC
+    DenseIndex padding_top() const { return m_padding_top; }
+    EIGEN_DEVICE_FUNC
+    DenseIndex padding_bottom() const { return m_padding_bottom; }
+    EIGEN_DEVICE_FUNC
+    DenseIndex padding_left() const { return m_padding_left; }
+    EIGEN_DEVICE_FUNC
+    DenseIndex padding_right() const { return m_padding_right; }
+    EIGEN_DEVICE_FUNC
+    PaddingType padding_type() const { return m_padding_type; }
+    EIGEN_DEVICE_FUNC
+    Scalar padding_value() const { return m_padding_value; }
+
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename XprType::Nested>::type&
+    expression() const { return m_xpr; }
+
+  protected:
+    typename XprType::Nested m_xpr;
+    const DenseIndex m_patch_planes;
+    const DenseIndex m_patch_rows;
+    const DenseIndex m_patch_cols;
+    const DenseIndex m_plane_strides;
+    const DenseIndex m_row_strides;
+    const DenseIndex m_col_strides;
+    const DenseIndex m_in_plane_strides;
+    const DenseIndex m_in_row_strides;
+    const DenseIndex m_in_col_strides;
+    const DenseIndex m_plane_inflate_strides;
+    const DenseIndex m_row_inflate_strides;
+    const DenseIndex m_col_inflate_strides;
+    const bool m_padding_explicit;
+    const DenseIndex m_padding_top_z;
+    const DenseIndex m_padding_bottom_z;
+    const DenseIndex m_padding_top;
+    const DenseIndex m_padding_bottom;
+    const DenseIndex m_padding_left;
+    const DenseIndex m_padding_right;
+    const PaddingType m_padding_type;
+    const Scalar m_padding_value;
+};
+
+
+// Eval as rvalue
+template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, Device>
+{
+  typedef TensorVolumePatchOp<Planes, Rows, Cols, ArgType> XprType;
+  typedef typename XprType::Index Index;
+  static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+  static const int NumDims = NumInputDims + 1;
+  typedef DSizes<Index, NumDims> Dimensions;
+  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+
+  enum {
+    IsAligned = false,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = false,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,
+    RawAccess = false
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device)
+  {
+    EIGEN_STATIC_ASSERT((NumDims >= 5), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+    m_paddingValue = op.padding_value();
+
+    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+
+    // Cache a few variables.
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      m_inputDepth = input_dims[0];
+      m_inputPlanes = input_dims[1];
+      m_inputRows = input_dims[2];
+      m_inputCols = input_dims[3];
+    } else {
+      m_inputDepth = input_dims[NumInputDims-1];
+      m_inputPlanes = input_dims[NumInputDims-2];
+      m_inputRows = input_dims[NumInputDims-3];
+      m_inputCols = input_dims[NumInputDims-4];
+    }
+
+    m_plane_strides = op.plane_strides();
+    m_row_strides = op.row_strides();
+    m_col_strides = op.col_strides();
+
+    // Input strides and effective input/patch size
+    m_in_plane_strides = op.in_plane_strides();
+    m_in_row_strides = op.in_row_strides();
+    m_in_col_strides = op.in_col_strides();
+    m_plane_inflate_strides = op.plane_inflate_strides();
+    m_row_inflate_strides = op.row_inflate_strides();
+    m_col_inflate_strides = op.col_inflate_strides();
+
+    // The "effective" spatial size after inflating data with zeros.
+    m_input_planes_eff = (m_inputPlanes - 1) * m_plane_inflate_strides + 1;
+    m_input_rows_eff = (m_inputRows - 1) * m_row_inflate_strides + 1;
+    m_input_cols_eff = (m_inputCols - 1) * m_col_inflate_strides + 1;
+    m_patch_planes_eff = op.patch_planes() + (op.patch_planes() - 1) * (m_in_plane_strides - 1);
+    m_patch_rows_eff = op.patch_rows() + (op.patch_rows() - 1) * (m_in_row_strides - 1);
+    m_patch_cols_eff = op.patch_cols() + (op.patch_cols() - 1) * (m_in_col_strides - 1);
+
+    if (op.padding_explicit()) {
+      m_outputPlanes = numext::ceil((m_input_planes_eff + op.padding_top_z() + op.padding_bottom_z() - m_patch_planes_eff + 1.f) / static_cast<float>(m_plane_strides));
+      m_outputRows = numext::ceil((m_input_rows_eff + op.padding_top() + op.padding_bottom() - m_patch_rows_eff + 1.f) / static_cast<float>(m_row_strides));
+      m_outputCols = numext::ceil((m_input_cols_eff + op.padding_left() + op.padding_right() - m_patch_cols_eff + 1.f) / static_cast<float>(m_col_strides));
+      m_planePaddingTop = op.padding_top_z();
+      m_rowPaddingTop = op.padding_top();
+      m_colPaddingLeft = op.padding_left();
+    } else {
+      // Computing padding from the type
+      switch (op.padding_type()) {
+        case PADDING_VALID:
+          m_outputPlanes = numext::ceil((m_input_planes_eff - m_patch_planes_eff + 1.f) / static_cast<float>(m_plane_strides));
+          m_outputRows = numext::ceil((m_input_rows_eff - m_patch_rows_eff + 1.f) / static_cast<float>(m_row_strides));
+          m_outputCols = numext::ceil((m_input_cols_eff - m_patch_cols_eff + 1.f) / static_cast<float>(m_col_strides));
+          m_planePaddingTop = 0;
+          m_rowPaddingTop = 0;
+          m_colPaddingLeft = 0;
+          break;
+        case PADDING_SAME: {
+          m_outputPlanes = numext::ceil(m_input_planes_eff / static_cast<float>(m_plane_strides));
+          m_outputRows = numext::ceil(m_input_rows_eff / static_cast<float>(m_row_strides));
+          m_outputCols = numext::ceil(m_input_cols_eff / static_cast<float>(m_col_strides));
+          const Index dz = m_outputPlanes * m_plane_strides + m_patch_planes_eff - 1 - m_input_planes_eff;
+          const Index dy = m_outputRows * m_row_strides + m_patch_rows_eff - 1 - m_input_rows_eff;
+          const Index dx = m_outputCols * m_col_strides + m_patch_cols_eff - 1 - m_input_cols_eff;
+          m_planePaddingTop = dz - dz / 2;
+          m_rowPaddingTop = dy - dy / 2;
+          m_colPaddingLeft = dx - dx / 2;
+          break;
+        }
+        default:
+          eigen_assert(false && "unexpected padding");
+      }
+    }
+    eigen_assert(m_outputRows > 0);
+    eigen_assert(m_outputCols > 0);
+    eigen_assert(m_outputPlanes > 0);
+
+    // Dimensions for result of extraction.
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      // ColMajor
+      // 0: depth
+      // 1: patch_planes
+      // 2: patch_rows
+      // 3: patch_cols
+      // 4: number of patches
+      // 5 and beyond: anything else (such as batch).
+      m_dimensions[0] = input_dims[0];
+      m_dimensions[1] = op.patch_planes();
+      m_dimensions[2] = op.patch_rows();
+      m_dimensions[3] = op.patch_cols();
+      m_dimensions[4] = m_outputPlanes * m_outputRows * m_outputCols;
+      for (int i = 5; i < NumDims; ++i) {
+        m_dimensions[i] = input_dims[i-1];
+      }
+    } else {
+      // RowMajor
+      // NumDims-1: depth
+      // NumDims-2: patch_planes
+      // NumDims-3: patch_rows
+      // NumDims-4: patch_cols
+      // NumDims-5: number of patches
+      // NumDims-6 and beyond: anything else (such as batch).
+      m_dimensions[NumDims-1] = input_dims[NumInputDims-1];
+      m_dimensions[NumDims-2] = op.patch_planes();
+      m_dimensions[NumDims-3] = op.patch_rows();
+      m_dimensions[NumDims-4] = op.patch_cols();
+      m_dimensions[NumDims-5] = m_outputPlanes * m_outputRows * m_outputCols;
+      for (int i = NumDims-6; i >= 0; --i) {
+        m_dimensions[i] = input_dims[i];
+      }
+    }
+
+    // Strides for the output tensor.
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      m_rowStride = m_dimensions[1];
+      m_colStride = m_dimensions[2] * m_rowStride;
+      m_patchStride = m_colStride * m_dimensions[3] * m_dimensions[0];
+      m_otherStride = m_patchStride * m_dimensions[4];
+    } else {
+      m_rowStride = m_dimensions[NumDims-2];
+      m_colStride = m_dimensions[NumDims-3] * m_rowStride;
+      m_patchStride = m_colStride * m_dimensions[NumDims-4] * m_dimensions[NumDims-1];
+      m_otherStride = m_patchStride * m_dimensions[NumDims-5];
+    }
+
+    // Strides for navigating through the input tensor.
+    m_planeInputStride = m_inputDepth;
+    m_rowInputStride = m_inputDepth * m_inputPlanes;
+    m_colInputStride = m_inputDepth * m_inputRows * m_inputPlanes;
+    m_otherInputStride = m_inputDepth * m_inputRows * m_inputCols * m_inputPlanes;
+
+    m_outputPlanesRows = m_outputPlanes * m_outputRows;
+
+    // Fast representations of different variables.
+    m_fastOtherStride = internal::TensorIntDivisor<Index>(m_otherStride);
+    m_fastPatchStride = internal::TensorIntDivisor<Index>(m_patchStride);
+    m_fastColStride = internal::TensorIntDivisor<Index>(m_colStride);
+    m_fastRowStride = internal::TensorIntDivisor<Index>(m_rowStride);
+    m_fastInputRowStride = internal::TensorIntDivisor<Index>(m_row_inflate_strides);
+    m_fastInputColStride = internal::TensorIntDivisor<Index>(m_col_inflate_strides);
+    m_fastInputPlaneStride = internal::TensorIntDivisor<Index>(m_plane_inflate_strides);
+    m_fastInputColsEff = internal::TensorIntDivisor<Index>(m_input_cols_eff);
+    m_fastOutputPlanes = internal::TensorIntDivisor<Index>(m_outputPlanes);
+    m_fastOutputPlanesRows = internal::TensorIntDivisor<Index>(m_outputPlanesRows);
+
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      m_fastOutputDepth = internal::TensorIntDivisor<Index>(m_dimensions[0]);
+    } else {
+      m_fastOutputDepth = internal::TensorIntDivisor<Index>(m_dimensions[NumDims-1]);
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+    m_impl.cleanup();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+  {
+    // Patch index corresponding to the passed in index.
+    const Index patchIndex = index / m_fastPatchStride;
+
+    // Spatial offset within the patch. This has to be translated into 3D
+    // coordinates within the patch.
+    const Index patchOffset = (index - patchIndex * m_patchStride) / m_fastOutputDepth;
+
+    // Batch, etc.
+    const Index otherIndex = (NumDims == 5) ? 0 : index / m_fastOtherStride;
+    const Index patch3DIndex = (NumDims == 5) ? patchIndex : (index - otherIndex * m_otherStride) / m_fastPatchStride;
+
+    // Calculate column index in the input original tensor.
+    const Index colIndex = patch3DIndex / m_fastOutputPlanesRows;
+    const Index colOffset = patchOffset / m_fastColStride;
+    const Index inputCol = colIndex * m_col_strides + colOffset * m_in_col_strides - m_colPaddingLeft;
+    const Index origInputCol = (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0);
+    if (inputCol < 0 || inputCol >= m_input_cols_eff ||
+        ((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides))) {
+      return Scalar(m_paddingValue);
+    }
+
+    // Calculate row index in the original input tensor.
+    const Index rowIndex = (patch3DIndex - colIndex * m_outputPlanesRows) / m_fastOutputPlanes;
+    const Index rowOffset = (patchOffset - colOffset * m_colStride) / m_fastRowStride;
+    const Index inputRow = rowIndex * m_row_strides + rowOffset * m_in_row_strides - m_rowPaddingTop;
+    const Index origInputRow = (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0);
+    if (inputRow < 0 || inputRow >= m_input_rows_eff ||
+        ((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) {
+      return Scalar(m_paddingValue);
+    }
+
+    // Calculate plane index in the original input tensor.
+    const Index planeIndex = (patch3DIndex - m_outputPlanes * (colIndex * m_outputRows + rowIndex));
+    const Index planeOffset = patchOffset - colOffset * m_colStride - rowOffset * m_rowStride;
+    const Index inputPlane = planeIndex * m_plane_strides + planeOffset * m_in_plane_strides - m_planePaddingTop;
+    const Index origInputPlane = (m_plane_inflate_strides == 1) ? inputPlane : ((inputPlane >= 0) ? (inputPlane / m_fastInputPlaneStride) : 0);
+    if (inputPlane < 0 || inputPlane >= m_input_planes_eff ||
+        ((m_plane_inflate_strides != 1) && (inputPlane != origInputPlane * m_plane_inflate_strides))) {
+      return Scalar(m_paddingValue);
+    }
+
+    const int depth_index = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - 1;
+    const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index];
+
+    const Index inputIndex = depth +
+        origInputRow * m_rowInputStride +
+        origInputCol * m_colInputStride +
+        origInputPlane * m_planeInputStride +
+        otherIndex * m_otherInputStride;
+
+    return m_impl.coeff(inputIndex);
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+  {
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
+
+    if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1 ||
+        m_in_plane_strides != 1 || m_plane_inflate_strides != 1) {
+      return packetWithPossibleZero(index);
+    }
+
+    const Index indices[2] = {index, index + PacketSize - 1};
+    const Index patchIndex = indices[0] / m_fastPatchStride;
+    if (patchIndex != indices[1] / m_fastPatchStride) {
+      return packetWithPossibleZero(index);
+    }
+    const Index otherIndex = (NumDims == 5) ? 0 : indices[0] / m_fastOtherStride;
+    eigen_assert(otherIndex == indices[1] / m_fastOtherStride);
+
+    // Find the offset of the element wrt the location of the first element.
+    const Index patchOffsets[2] = {(indices[0] - patchIndex * m_patchStride) / m_fastOutputDepth,
+                                   (indices[1] - patchIndex * m_patchStride) / m_fastOutputDepth};
+
+    const Index patch3DIndex = (NumDims == 5) ? patchIndex : (indices[0] - otherIndex * m_otherStride) / m_fastPatchStride;
+    eigen_assert(patch3DIndex == (indices[1] - otherIndex * m_otherStride) / m_fastPatchStride);
+
+    const Index colIndex = patch3DIndex / m_fastOutputPlanesRows;
+    const Index colOffsets[2] = {
+      patchOffsets[0] / m_fastColStride,
+      patchOffsets[1] / m_fastColStride};
+
+    // Calculate col indices in the original input tensor.
+    const Index inputCols[2] = {
+      colIndex * m_col_strides + colOffsets[0] - m_colPaddingLeft,
+      colIndex * m_col_strides + colOffsets[1] - m_colPaddingLeft};
+    if (inputCols[1] < 0 || inputCols[0] >= m_inputCols) {
+      return internal::pset1<PacketReturnType>(Scalar(m_paddingValue));
+    }
+
+    if (inputCols[0] != inputCols[1]) {
+      return packetWithPossibleZero(index);
+    }
+
+    const Index rowIndex = (patch3DIndex - colIndex * m_outputPlanesRows) / m_fastOutputPlanes;
+    const Index rowOffsets[2] = {
+      (patchOffsets[0] - colOffsets[0] * m_colStride) / m_fastRowStride,
+      (patchOffsets[1] - colOffsets[1] * m_colStride) / m_fastRowStride};
+    eigen_assert(rowOffsets[0] <= rowOffsets[1]);
+    // Calculate col indices in the original input tensor.
+    const Index inputRows[2] = {
+      rowIndex * m_row_strides + rowOffsets[0] - m_rowPaddingTop,
+      rowIndex * m_row_strides + rowOffsets[1] - m_rowPaddingTop};
+
+    if (inputRows[1] < 0 || inputRows[0] >= m_inputRows) {
+      return internal::pset1<PacketReturnType>(Scalar(m_paddingValue));
+    }
+
+    if (inputRows[0] != inputRows[1]) {
+      return packetWithPossibleZero(index);
+    }
+
+    const Index planeIndex = (patch3DIndex - m_outputPlanes * (colIndex * m_outputRows + rowIndex));
+    const Index planeOffsets[2] = {
+      patchOffsets[0] - colOffsets[0] * m_colStride - rowOffsets[0] * m_rowStride,
+      patchOffsets[1] - colOffsets[1] * m_colStride - rowOffsets[1] * m_rowStride};
+    eigen_assert(planeOffsets[0] <= planeOffsets[1]);
+    const Index inputPlanes[2] = {
+      planeIndex * m_plane_strides + planeOffsets[0] - m_planePaddingTop,
+      planeIndex * m_plane_strides + planeOffsets[1] - m_planePaddingTop};
+
+    if (inputPlanes[1] < 0 || inputPlanes[0] >= m_inputPlanes) {
+      return internal::pset1<PacketReturnType>(Scalar(m_paddingValue));
+    }
+
+    if (inputPlanes[0] >= 0 && inputPlanes[1] < m_inputPlanes) {
+      // no padding
+      const int depth_index = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - 1;
+      const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index];
+      const Index inputIndex = depth +
+          inputRows[0] * m_rowInputStride +
+          inputCols[0] * m_colInputStride +
+          m_planeInputStride * inputPlanes[0] +
+          otherIndex * m_otherInputStride;
+      return m_impl.template packet<Unaligned>(inputIndex);
+    }
+
+    return packetWithPossibleZero(index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+  costPerCoeff(bool vectorized) const {
+    const double compute_cost =
+        10 * TensorOpCost::DivCost<Index>() + 21 * TensorOpCost::MulCost<Index>() +
+        8 * TensorOpCost::AddCost<Index>();
+    return TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
+  }
+
+  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+
+  const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+
+  Index planePaddingTop() const { return m_planePaddingTop; }
+  Index rowPaddingTop() const { return m_rowPaddingTop; }
+  Index colPaddingLeft() const { return m_colPaddingLeft; }
+  Index outputPlanes() const { return m_outputPlanes; }
+  Index outputRows() const { return m_outputRows; }
+  Index outputCols() const { return m_outputCols; }
+  Index userPlaneStride() const { return m_plane_strides; }
+  Index userRowStride() const { return m_row_strides; }
+  Index userColStride() const { return m_col_strides; }
+  Index userInPlaneStride() const { return m_in_plane_strides; }
+  Index userInRowStride() const { return m_in_row_strides; }
+  Index userInColStride() const { return m_in_col_strides; }
+  Index planeInflateStride() const { return m_plane_inflate_strides; }
+  Index rowInflateStride() const { return m_row_inflate_strides; }
+  Index colInflateStride() const { return m_col_inflate_strides; }
+
+ protected:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
+  {
+    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+    for (int i = 0; i < PacketSize; ++i) {
+      values[i] = coeff(index+i);
+    }
+    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+    return rslt;
+  }
+
+  Dimensions m_dimensions;
+
+  // Parameters passed to the costructor.
+  Index m_plane_strides;
+  Index m_row_strides;
+  Index m_col_strides;
+
+  Index m_outputPlanes;
+  Index m_outputRows;
+  Index m_outputCols;
+
+  Index m_planePaddingTop;
+  Index m_rowPaddingTop;
+  Index m_colPaddingLeft;
+
+  Index m_in_plane_strides;
+  Index m_in_row_strides;
+  Index m_in_col_strides;
+
+  Index m_plane_inflate_strides;
+  Index m_row_inflate_strides;
+  Index m_col_inflate_strides;
+
+  // Cached input size.
+  Index m_inputDepth;
+  Index m_inputPlanes;
+  Index m_inputRows;
+  Index m_inputCols;
+
+  // Other cached variables.
+  Index m_outputPlanesRows;
+
+  // Effective input/patch post-inflation size.
+  Index m_input_planes_eff;
+  Index m_input_rows_eff;
+  Index m_input_cols_eff;
+  Index m_patch_planes_eff;
+  Index m_patch_rows_eff;
+  Index m_patch_cols_eff;
+
+  // Strides for the output tensor.
+  Index m_otherStride;
+  Index m_patchStride;
+  Index m_rowStride;
+  Index m_colStride;
+
+  // Strides for the input tensor.
+  Index m_planeInputStride;
+  Index m_rowInputStride;
+  Index m_colInputStride;
+  Index m_otherInputStride;
+
+  internal::TensorIntDivisor<Index> m_fastOtherStride;
+  internal::TensorIntDivisor<Index> m_fastPatchStride;
+  internal::TensorIntDivisor<Index> m_fastColStride;
+  internal::TensorIntDivisor<Index> m_fastRowStride;
+  internal::TensorIntDivisor<Index> m_fastInputPlaneStride;
+  internal::TensorIntDivisor<Index> m_fastInputRowStride;
+  internal::TensorIntDivisor<Index> m_fastInputColStride;
+  internal::TensorIntDivisor<Index> m_fastInputColsEff;
+  internal::TensorIntDivisor<Index> m_fastOutputPlanesRows;
+  internal::TensorIntDivisor<Index> m_fastOutputPlanes;
+  internal::TensorIntDivisor<Index> m_fastOutputDepth;
+
+  Scalar m_paddingValue;
+
+  TensorEvaluator<ArgType, Device> m_impl;
+};
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_VOLUME_PATCH_H
diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h b/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h
new file mode 100644
index 000000000..bc4f2025f
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h
@@ -0,0 +1,293 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSORSYMMETRY_DYNAMICSYMMETRY_H
+#define EIGEN_CXX11_TENSORSYMMETRY_DYNAMICSYMMETRY_H
+
+namespace Eigen {
+
+class DynamicSGroup
+{
+  public:
+    inline explicit DynamicSGroup() : m_numIndices(1), m_elements(), m_generators(), m_globalFlags(0) { m_elements.push_back(ge(Generator(0, 0, 0))); }
+    inline DynamicSGroup(const DynamicSGroup& o) : m_numIndices(o.m_numIndices), m_elements(o.m_elements), m_generators(o.m_generators), m_globalFlags(o.m_globalFlags) { }
+    inline DynamicSGroup(DynamicSGroup&& o) : m_numIndices(o.m_numIndices), m_elements(), m_generators(o.m_generators), m_globalFlags(o.m_globalFlags) { std::swap(m_elements, o.m_elements); }
+    inline DynamicSGroup& operator=(const DynamicSGroup& o) { m_numIndices = o.m_numIndices; m_elements = o.m_elements; m_generators = o.m_generators; m_globalFlags = o.m_globalFlags; return *this; }
+    inline DynamicSGroup& operator=(DynamicSGroup&& o) { m_numIndices = o.m_numIndices; std::swap(m_elements, o.m_elements); m_generators = o.m_generators; m_globalFlags = o.m_globalFlags; return *this; }
+
+    void add(int one, int two, int flags = 0);
+
+    template<typename Gen_>
+    inline void add(Gen_) { add(Gen_::One, Gen_::Two, Gen_::Flags); }
+    inline void addSymmetry(int one, int two) { add(one, two, 0); }
+    inline void addAntiSymmetry(int one, int two) { add(one, two, NegationFlag); }
+    inline void addHermiticity(int one, int two) { add(one, two, ConjugationFlag); }
+    inline void addAntiHermiticity(int one, int two) { add(one, two, NegationFlag | ConjugationFlag); }
+
+    template<typename Op, typename RV, typename Index, std::size_t N, typename... Args>
+    inline RV apply(const std::array<Index, N>& idx, RV initial, Args&&... args) const
+    {
+      eigen_assert(N >= m_numIndices && "Can only apply symmetry group to objects that have at least the required amount of indices.");
+      for (std::size_t i = 0; i < size(); i++)
+        initial = Op::run(h_permute(i, idx, typename internal::gen_numeric_list<int, N>::type()), m_elements[i].flags, initial, std::forward<Args>(args)...);
+      return initial;
+    }
+
+    template<typename Op, typename RV, typename Index, typename... Args>
+    inline RV apply(const std::vector<Index>& idx, RV initial, Args&&... args) const
+    {
+      eigen_assert(idx.size() >= m_numIndices && "Can only apply symmetry group to objects that have at least the required amount of indices.");
+      for (std::size_t i = 0; i < size(); i++)
+        initial = Op::run(h_permute(i, idx), m_elements[i].flags, initial, std::forward<Args>(args)...);
+      return initial;
+    }
+
+    inline int globalFlags() const { return m_globalFlags; }
+    inline std::size_t size() const { return m_elements.size(); }
+
+    template<typename Tensor_, typename... IndexTypes>
+    inline internal::tensor_symmetry_value_setter<Tensor_, DynamicSGroup> operator()(Tensor_& tensor, typename Tensor_::Index firstIndex, IndexTypes... otherIndices) const
+    {
+      static_assert(sizeof...(otherIndices) + 1 == Tensor_::NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
+      return operator()(tensor, std::array<typename Tensor_::Index, Tensor_::NumIndices>{{firstIndex, otherIndices...}});
+    }
+
+    template<typename Tensor_>
+    inline internal::tensor_symmetry_value_setter<Tensor_, DynamicSGroup> operator()(Tensor_& tensor, std::array<typename Tensor_::Index, Tensor_::NumIndices> const& indices) const
+    {
+      return internal::tensor_symmetry_value_setter<Tensor_, DynamicSGroup>(tensor, *this, indices);
+    }
+  private:
+    struct GroupElement {
+      std::vector<int> representation;
+      int flags;
+      bool isId() const
+      {
+        for (std::size_t i = 0; i < representation.size(); i++)
+          if (i != (size_t)representation[i])
+            return false;
+        return true;
+      }
+    };
+    struct Generator {
+      int one;
+      int two;
+      int flags;
+      constexpr inline Generator(int one_, int two_, int flags_) : one(one_), two(two_), flags(flags_) {}
+    };
+
+    std::size_t m_numIndices;
+    std::vector<GroupElement> m_elements;
+    std::vector<Generator> m_generators;
+    int m_globalFlags;
+
+    template<typename Index, std::size_t N, int... n>
+    inline std::array<Index, N> h_permute(std::size_t which, const std::array<Index, N>& idx, internal::numeric_list<int, n...>) const
+    {
+      return std::array<Index, N>{{ idx[n >= m_numIndices ? n : m_elements[which].representation[n]]... }};
+    }
+
+    template<typename Index>
+    inline std::vector<Index> h_permute(std::size_t which, std::vector<Index> idx) const
+    {
+      std::vector<Index> result;
+      result.reserve(idx.size());
+      for (auto k : m_elements[which].representation)
+        result.push_back(idx[k]);
+      for (std::size_t i = m_numIndices; i < idx.size(); i++)
+        result.push_back(idx[i]);
+      return result;
+    }
+
+    inline GroupElement ge(Generator const& g) const
+    {
+      GroupElement result;
+      result.representation.reserve(m_numIndices);
+      result.flags = g.flags;
+      for (std::size_t k = 0; k < m_numIndices; k++) {
+        if (k == (std::size_t)g.one)
+          result.representation.push_back(g.two);
+        else if (k == (std::size_t)g.two)
+          result.representation.push_back(g.one);
+        else
+          result.representation.push_back(int(k));
+      }
+      return result;
+    }
+
+    GroupElement mul(GroupElement, GroupElement) const;
+    inline GroupElement mul(Generator g1, GroupElement g2) const
+    {
+      return mul(ge(g1), g2);
+    }
+
+    inline GroupElement mul(GroupElement g1, Generator g2) const
+    {
+      return mul(g1, ge(g2));
+    }
+
+    inline GroupElement mul(Generator g1, Generator g2) const
+    {
+      return mul(ge(g1), ge(g2));
+    }
+
+    inline int findElement(GroupElement e) const
+    {
+      for (auto ee : m_elements) {
+        if (ee.representation == e.representation)
+          return ee.flags ^ e.flags;
+      }
+      return -1;
+    }
+
+    void updateGlobalFlags(int flagDiffOfSameGenerator);
+};
+
+// dynamic symmetry group that auto-adds the template parameters in the constructor
+template<typename... Gen>
+class DynamicSGroupFromTemplateArgs : public DynamicSGroup
+{
+  public:
+    inline DynamicSGroupFromTemplateArgs() : DynamicSGroup()
+    {
+      add_all(internal::type_list<Gen...>());
+    }
+    inline DynamicSGroupFromTemplateArgs(DynamicSGroupFromTemplateArgs const& other) : DynamicSGroup(other) { }
+    inline DynamicSGroupFromTemplateArgs(DynamicSGroupFromTemplateArgs&& other) : DynamicSGroup(other) { }
+    inline DynamicSGroupFromTemplateArgs<Gen...>& operator=(const DynamicSGroupFromTemplateArgs<Gen...>& o) { DynamicSGroup::operator=(o); return *this; }
+    inline DynamicSGroupFromTemplateArgs<Gen...>& operator=(DynamicSGroupFromTemplateArgs<Gen...>&& o) { DynamicSGroup::operator=(o); return *this; }
+  
+  private:
+    template<typename Gen1, typename... GenNext>
+    inline void add_all(internal::type_list<Gen1, GenNext...>)
+    {
+      add(Gen1());
+      add_all(internal::type_list<GenNext...>());
+    }
+
+    inline void add_all(internal::type_list<>)
+    {
+    }
+};
+
+inline DynamicSGroup::GroupElement DynamicSGroup::mul(GroupElement g1, GroupElement g2) const
+{
+  eigen_internal_assert(g1.representation.size() == m_numIndices);
+  eigen_internal_assert(g2.representation.size() == m_numIndices);
+
+  GroupElement result;
+  result.representation.reserve(m_numIndices);
+  for (std::size_t i = 0; i < m_numIndices; i++) {
+    int v = g2.representation[g1.representation[i]];
+    eigen_assert(v >= 0);
+    result.representation.push_back(v);
+  }
+  result.flags = g1.flags ^ g2.flags;
+  return result;
+}
+
+inline void DynamicSGroup::add(int one, int two, int flags)
+{
+  eigen_assert(one >= 0);
+  eigen_assert(two >= 0);
+  eigen_assert(one != two);
+
+  if ((std::size_t)one >= m_numIndices || (std::size_t)two >= m_numIndices) {
+    std::size_t newNumIndices = (one > two) ? one : two + 1;
+    for (auto& gelem : m_elements) {
+      gelem.representation.reserve(newNumIndices);
+      for (std::size_t i = m_numIndices; i < newNumIndices; i++)
+        gelem.representation.push_back(i);
+    }
+    m_numIndices = newNumIndices;
+  }
+
+  Generator g{one, two, flags};
+  GroupElement e = ge(g);
+
+  /* special case for first generator */
+  if (m_elements.size() == 1) {
+    while (!e.isId()) {
+      m_elements.push_back(e);
+      e = mul(e, g);
+    }
+
+    if (e.flags > 0)
+      updateGlobalFlags(e.flags);
+
+    // only add in case we didn't have identity
+    if (m_elements.size() > 1)
+      m_generators.push_back(g);
+    return;
+  }
+
+  int p = findElement(e);
+  if (p >= 0) {
+    updateGlobalFlags(p);
+    return;
+  }
+
+  std::size_t coset_order = m_elements.size();
+  m_elements.push_back(e);
+  for (std::size_t i = 1; i < coset_order; i++)
+    m_elements.push_back(mul(m_elements[i], e));
+  m_generators.push_back(g);
+
+  std::size_t coset_rep = coset_order;
+  do {
+    for (auto g : m_generators) {
+      e = mul(m_elements[coset_rep], g);
+      p = findElement(e);
+      if (p < 0) {
+        // element not yet in group
+        m_elements.push_back(e);
+        for (std::size_t i = 1; i < coset_order; i++)
+          m_elements.push_back(mul(m_elements[i], e));
+      } else if (p > 0) {
+        updateGlobalFlags(p);
+      }
+    }
+    coset_rep += coset_order;
+  } while (coset_rep < m_elements.size());
+}
+
+inline void DynamicSGroup::updateGlobalFlags(int flagDiffOfSameGenerator)
+{
+    switch (flagDiffOfSameGenerator) {
+      case 0:
+      default:
+        // nothing happened
+        break;
+      case NegationFlag:
+        // every element is it's own negative => whole tensor is zero
+        m_globalFlags |= GlobalZeroFlag;
+        break;
+      case ConjugationFlag:
+        // every element is it's own conjugate => whole tensor is real
+        m_globalFlags |= GlobalRealFlag;
+        break;
+      case (NegationFlag | ConjugationFlag):
+        // every element is it's own negative conjugate => whole tensor is imaginary
+        m_globalFlags |= GlobalImagFlag;
+        break;
+      /* NOTE:
+       *   since GlobalZeroFlag == GlobalRealFlag | GlobalImagFlag, if one generator
+       *   causes the tensor to be real and the next one to be imaginary, this will
+       *   trivially give the correct result
+       */
+    }
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSORSYMMETRY_DYNAMICSYMMETRY_H
+
+/*
+ * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
+ */
diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h b/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h
new file mode 100644
index 000000000..942293bd7
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h
@@ -0,0 +1,236 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSORSYMMETRY_STATICSYMMETRY_H
+#define EIGEN_CXX11_TENSORSYMMETRY_STATICSYMMETRY_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<typename list> struct tensor_static_symgroup_permutate;
+
+template<int... nn>
+struct tensor_static_symgroup_permutate<numeric_list<int, nn...>>
+{
+  constexpr static std::size_t N = sizeof...(nn);
+
+  template<typename T>
+  constexpr static inline std::array<T, N> run(const std::array<T, N>& indices)
+  {
+    return {{indices[nn]...}};
+  }
+};
+
+template<typename indices_, int flags_>
+struct tensor_static_symgroup_element
+{
+  typedef indices_ indices;
+  constexpr static int flags = flags_;
+};
+
+template<typename Gen, int N>
+struct tensor_static_symgroup_element_ctor
+{
+  typedef tensor_static_symgroup_element<
+    typename gen_numeric_list_swapped_pair<int, N, Gen::One, Gen::Two>::type,
+    Gen::Flags
+  > type;
+};
+
+template<int N>
+struct tensor_static_symgroup_identity_ctor
+{
+  typedef tensor_static_symgroup_element<
+    typename gen_numeric_list<int, N>::type,
+    0
+  > type;
+};
+
+template<typename iib>
+struct tensor_static_symgroup_multiply_helper
+{
+  template<int... iia>
+  constexpr static inline numeric_list<int, get<iia, iib>::value...> helper(numeric_list<int, iia...>) {
+    return numeric_list<int, get<iia, iib>::value...>();
+  }
+};
+
+template<typename A, typename B>
+struct tensor_static_symgroup_multiply
+{
+  private:
+    typedef typename A::indices iia;
+    typedef typename B::indices iib;
+    constexpr static int ffa = A::flags;
+    constexpr static int ffb = B::flags;
+  
+  public:
+    static_assert(iia::count == iib::count, "Cannot multiply symmetry elements with different number of indices.");
+
+    typedef tensor_static_symgroup_element<
+      decltype(tensor_static_symgroup_multiply_helper<iib>::helper(iia())),
+      ffa ^ ffb
+    > type;
+};
+
+template<typename A, typename B>
+struct tensor_static_symgroup_equality
+{
+    typedef typename A::indices iia;
+    typedef typename B::indices iib;
+    constexpr static int ffa = A::flags;
+    constexpr static int ffb = B::flags;
+    static_assert(iia::count == iib::count, "Cannot compare symmetry elements with different number of indices.");
+
+    constexpr static bool value = is_same<iia, iib>::value;
+
+  private:
+    /* this should be zero if they are identical, or else the tensor
+     * will be forced to be pure real, pure imaginary or even pure zero
+     */
+    constexpr static int flags_cmp_ = ffa ^ ffb;
+
+    /* either they are not equal, then we don't care whether the flags
+     * match, or they are equal, and then we have to check
+     */
+    constexpr static bool is_zero      = value && flags_cmp_ == NegationFlag;
+    constexpr static bool is_real      = value && flags_cmp_ == ConjugationFlag;
+    constexpr static bool is_imag      = value && flags_cmp_ == (NegationFlag | ConjugationFlag);
+
+  public:
+    constexpr static int global_flags = 
+      (is_real ? GlobalRealFlag : 0) |
+      (is_imag ? GlobalImagFlag : 0) |
+      (is_zero ? GlobalZeroFlag : 0);
+};
+
+template<std::size_t NumIndices, typename... Gen>
+struct tensor_static_symgroup
+{
+  typedef StaticSGroup<Gen...> type;
+  constexpr static std::size_t size = type::static_size;
+};
+
+template<typename Index, std::size_t N, int... ii, int... jj>
+constexpr static inline std::array<Index, N> tensor_static_symgroup_index_permute(std::array<Index, N> idx, internal::numeric_list<int, ii...>, internal::numeric_list<int, jj...>)
+{
+  return {{ idx[ii]..., idx[jj]... }};
+}
+
+template<typename Index, int... ii>
+static inline std::vector<Index> tensor_static_symgroup_index_permute(std::vector<Index> idx, internal::numeric_list<int, ii...>)
+{
+  std::vector<Index> result{{ idx[ii]... }};
+  std::size_t target_size = idx.size();
+  for (std::size_t i = result.size(); i < target_size; i++)
+    result.push_back(idx[i]);
+  return result;
+}
+
+template<typename T> struct tensor_static_symgroup_do_apply;
+
+template<typename first, typename... next>
+struct tensor_static_symgroup_do_apply<internal::type_list<first, next...>>
+{
+  template<typename Op, typename RV, std::size_t SGNumIndices, typename Index, std::size_t NumIndices, typename... Args>
+  static inline RV run(const std::array<Index, NumIndices>& idx, RV initial, Args&&... args)
+  {
+    static_assert(NumIndices >= SGNumIndices, "Can only apply symmetry group to objects that have at least the required amount of indices.");
+    typedef typename internal::gen_numeric_list<int, NumIndices - SGNumIndices, SGNumIndices>::type remaining_indices;
+    initial = Op::run(tensor_static_symgroup_index_permute(idx, typename first::indices(), remaining_indices()), first::flags, initial, std::forward<Args>(args)...);
+    return tensor_static_symgroup_do_apply<internal::type_list<next...>>::template run<Op, RV, SGNumIndices>(idx, initial, args...);
+  }
+
+  template<typename Op, typename RV, std::size_t SGNumIndices, typename Index, typename... Args>
+  static inline RV run(const std::vector<Index>& idx, RV initial, Args&&... args)
+  {
+    eigen_assert(idx.size() >= SGNumIndices && "Can only apply symmetry group to objects that have at least the required amount of indices.");
+    initial = Op::run(tensor_static_symgroup_index_permute(idx, typename first::indices()), first::flags, initial, std::forward<Args>(args)...);
+    return tensor_static_symgroup_do_apply<internal::type_list<next...>>::template run<Op, RV, SGNumIndices>(idx, initial, args...);
+  }
+};
+
+template<EIGEN_TPL_PP_SPEC_HACK_DEF(typename, empty)>
+struct tensor_static_symgroup_do_apply<internal::type_list<EIGEN_TPL_PP_SPEC_HACK_USE(empty)>>
+{
+  template<typename Op, typename RV, std::size_t SGNumIndices, typename Index, std::size_t NumIndices, typename... Args>
+  static inline RV run(const std::array<Index, NumIndices>&, RV initial, Args&&...)
+  {
+    // do nothing
+    return initial;
+  }
+
+  template<typename Op, typename RV, std::size_t SGNumIndices, typename Index, typename... Args>
+  static inline RV run(const std::vector<Index>&, RV initial, Args&&...)
+  {
+    // do nothing
+    return initial;
+  }
+};
+
+} // end namespace internal
+
+template<typename... Gen>
+class StaticSGroup
+{
+    constexpr static std::size_t NumIndices = internal::tensor_symmetry_num_indices<Gen...>::value;
+    typedef internal::group_theory::enumerate_group_elements<
+      internal::tensor_static_symgroup_multiply,
+      internal::tensor_static_symgroup_equality,
+      typename internal::tensor_static_symgroup_identity_ctor<NumIndices>::type,
+      internal::type_list<typename internal::tensor_static_symgroup_element_ctor<Gen, NumIndices>::type...>
+    > group_elements;
+    typedef typename group_elements::type ge;
+  public:
+    constexpr inline StaticSGroup() {}
+    constexpr inline StaticSGroup(const StaticSGroup<Gen...>&) {}
+    constexpr inline StaticSGroup(StaticSGroup<Gen...>&&) {}
+
+    template<typename Op, typename RV, typename Index, std::size_t N, typename... Args>
+    static inline RV apply(const std::array<Index, N>& idx, RV initial, Args&&... args)
+    {
+      return internal::tensor_static_symgroup_do_apply<ge>::template run<Op, RV, NumIndices>(idx, initial, args...);
+    }
+
+    template<typename Op, typename RV, typename Index, typename... Args>
+    static inline RV apply(const std::vector<Index>& idx, RV initial, Args&&... args)
+    {
+      eigen_assert(idx.size() == NumIndices);
+      return internal::tensor_static_symgroup_do_apply<ge>::template run<Op, RV, NumIndices>(idx, initial, args...);
+    }
+
+    constexpr static std::size_t static_size = ge::count;
+
+    constexpr static inline std::size_t size() {
+      return ge::count;
+    }
+    constexpr static inline int globalFlags() { return group_elements::global_flags; }
+
+    template<typename Tensor_, typename... IndexTypes>
+    inline internal::tensor_symmetry_value_setter<Tensor_, StaticSGroup<Gen...>> operator()(Tensor_& tensor, typename Tensor_::Index firstIndex, IndexTypes... otherIndices) const
+    {
+      static_assert(sizeof...(otherIndices) + 1 == Tensor_::NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
+      return operator()(tensor, std::array<typename Tensor_::Index, Tensor_::NumIndices>{{firstIndex, otherIndices...}});
+    }
+
+    template<typename Tensor_>
+    inline internal::tensor_symmetry_value_setter<Tensor_, StaticSGroup<Gen...>> operator()(Tensor_& tensor, std::array<typename Tensor_::Index, Tensor_::NumIndices> const& indices) const
+    {
+      return internal::tensor_symmetry_value_setter<Tensor_, StaticSGroup<Gen...>>(tensor, *this, indices);
+    }
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSORSYMMETRY_STATICSYMMETRY_H
+
+/*
+ * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
+ */
diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/Symmetry.h b/unsupported/Eigen/CXX11/src/TensorSymmetry/Symmetry.h
new file mode 100644
index 000000000..879d6cd77
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/TensorSymmetry/Symmetry.h
@@ -0,0 +1,338 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSORSYMMETRY_SYMMETRY_H
+#define EIGEN_CXX11_TENSORSYMMETRY_SYMMETRY_H
+
+namespace Eigen {
+
+enum {
+  NegationFlag           = 0x01,
+  ConjugationFlag        = 0x02
+};
+
+enum {
+  GlobalRealFlag         = 0x01,
+  GlobalImagFlag         = 0x02,
+  GlobalZeroFlag         = 0x03
+};
+
+namespace internal {
+
+template<std::size_t NumIndices, typename... Sym>                   struct tensor_symmetry_pre_analysis;
+template<std::size_t NumIndices, typename... Sym>                   struct tensor_static_symgroup;
+template<bool instantiate, std::size_t NumIndices, typename... Sym> struct tensor_static_symgroup_if;
+template<typename Tensor_> struct tensor_symmetry_calculate_flags;
+template<typename Tensor_> struct tensor_symmetry_assign_value;
+template<typename... Sym> struct tensor_symmetry_num_indices;
+
+} // end namespace internal
+
+template<int One_, int Two_>
+struct Symmetry
+{
+  static_assert(One_ != Two_, "Symmetries must cover distinct indices.");
+  constexpr static int One = One_;
+  constexpr static int Two = Two_;
+  constexpr static int Flags = 0;
+};
+
+template<int One_, int Two_>
+struct AntiSymmetry
+{
+  static_assert(One_ != Two_, "Symmetries must cover distinct indices.");
+  constexpr static int One = One_;
+  constexpr static int Two = Two_;
+  constexpr static int Flags = NegationFlag;
+};
+
+template<int One_, int Two_>
+struct Hermiticity
+{
+  static_assert(One_ != Two_, "Symmetries must cover distinct indices.");
+  constexpr static int One = One_;
+  constexpr static int Two = Two_;
+  constexpr static int Flags = ConjugationFlag;
+};
+
+template<int One_, int Two_>
+struct AntiHermiticity
+{
+  static_assert(One_ != Two_, "Symmetries must cover distinct indices.");
+  constexpr static int One = One_;
+  constexpr static int Two = Two_;
+  constexpr static int Flags = ConjugationFlag | NegationFlag;
+};
+
+/** \class DynamicSGroup
+  * \ingroup TensorSymmetry_Module
+  *
+  * \brief Dynamic symmetry group
+  *
+  * The %DynamicSGroup class represents a symmetry group that need not be known at
+  * compile time. It is useful if one wants to support arbitrary run-time defineable
+  * symmetries for tensors, but it is also instantiated if a symmetry group is defined
+  * at compile time that would be either too large for the compiler to reasonably
+  * generate (using templates to calculate this at compile time is very inefficient)
+  * or that the compiler could generate the group but that it wouldn't make sense to
+  * unroll the loop for setting coefficients anymore.
+  */
+class DynamicSGroup;
+
+/** \internal
+  *
+  * \class DynamicSGroupFromTemplateArgs
+  * \ingroup TensorSymmetry_Module
+  *
+  * \brief Dynamic symmetry group, initialized from template arguments
+  *
+  * This class is a child class of DynamicSGroup. It uses the template arguments
+  * specified to initialize itself.
+  */
+template<typename... Gen>
+class DynamicSGroupFromTemplateArgs;
+
+/** \class StaticSGroup
+  * \ingroup TensorSymmetry_Module
+  *
+  * \brief Static symmetry group
+  *
+  * This class represents a symmetry group that is known and resolved completely
+  * at compile time. Ideally, no run-time penalty is incurred compared to the
+  * manual unrolling of the symmetry.
+  *
+  * <b><i>CAUTION:</i></b>
+  *
+  * Do not use this class directly for large symmetry groups. The compiler
+  * may run into a limit, or segfault or in the very least will take a very,
+  * very, very long time to compile the code. Use the SGroup class instead
+  * if you want a static group. That class contains logic that will
+  * automatically select the DynamicSGroup class instead if the symmetry
+  * group becomes too large. (In that case, unrolling may not even be
+  * beneficial.)
+  */
+template<typename... Gen>
+class StaticSGroup;
+
+/** \class SGroup
+  * \ingroup TensorSymmetry_Module
+  *
+  * \brief Symmetry group, initialized from template arguments
+  *
+  * This class represents a symmetry group whose generators are already
+  * known at compile time. It may or may not be resolved at compile time,
+  * depending on the estimated size of the group.
+  *
+  * \sa StaticSGroup
+  * \sa DynamicSGroup
+  */
+template<typename... Gen>
+class SGroup : public internal::tensor_symmetry_pre_analysis<internal::tensor_symmetry_num_indices<Gen...>::value, Gen...>::root_type
+{
+  public:
+    constexpr static std::size_t NumIndices = internal::tensor_symmetry_num_indices<Gen...>::value;
+    typedef typename internal::tensor_symmetry_pre_analysis<NumIndices, Gen...>::root_type Base;
+
+    // make standard constructors + assignment operators public
+    inline SGroup() : Base() { }
+    inline SGroup(const SGroup<Gen...>& other) : Base(other) { }
+    inline SGroup(SGroup<Gen...>&& other) : Base(other) { }
+    inline SGroup<Gen...>& operator=(const SGroup<Gen...>& other) { Base::operator=(other); return *this; }
+    inline SGroup<Gen...>& operator=(SGroup<Gen...>&& other) { Base::operator=(other); return *this; }
+
+    // all else is defined in the base class
+};
+
+namespace internal {
+
+template<typename... Sym> struct tensor_symmetry_num_indices
+{
+  constexpr static std::size_t value = 1;
+};
+
+template<int One_, int Two_, typename... Sym> struct tensor_symmetry_num_indices<Symmetry<One_, Two_>, Sym...>
+{
+private:
+  constexpr static std::size_t One = static_cast<std::size_t>(One_);
+  constexpr static std::size_t Two = static_cast<std::size_t>(Two_);
+  constexpr static std::size_t Three = tensor_symmetry_num_indices<Sym...>::value;
+
+  // don't use std::max, since it's not constexpr until C++14...
+  constexpr static std::size_t maxOneTwoPlusOne = ((One > Two) ? One : Two) + 1;
+public:
+  constexpr static std::size_t value = (maxOneTwoPlusOne > Three) ? maxOneTwoPlusOne : Three;
+};
+
+template<int One_, int Two_, typename... Sym> struct tensor_symmetry_num_indices<AntiSymmetry<One_, Two_>, Sym...>
+  : public tensor_symmetry_num_indices<Symmetry<One_, Two_>, Sym...> {};
+template<int One_, int Two_, typename... Sym> struct tensor_symmetry_num_indices<Hermiticity<One_, Two_>, Sym...>
+  : public tensor_symmetry_num_indices<Symmetry<One_, Two_>, Sym...> {};
+template<int One_, int Two_, typename... Sym> struct tensor_symmetry_num_indices<AntiHermiticity<One_, Two_>, Sym...>
+  : public tensor_symmetry_num_indices<Symmetry<One_, Two_>, Sym...> {};
+
+/** \internal
+  *
+  * \class tensor_symmetry_pre_analysis
+  * \ingroup TensorSymmetry_Module
+  *
+  * \brief Pre-select whether to use a static or dynamic symmetry group
+  *
+  * When a symmetry group could in principle be determined at compile time,
+  * this template implements the logic whether to actually do that or whether
+  * to rather defer that to runtime.
+  *
+  * The logic is as follows:
+  * <dl>
+  * <dt><b>No generators (trivial symmetry):</b></dt>
+  * <dd>Use a trivial static group. Ideally, this has no performance impact
+  *     compared to not using symmetry at all. In practice, this might not
+  *     be the case.</dd>
+  * <dt><b>More than 4 generators:</b></dt>
+  * <dd>Calculate the group at run time, it is likely far too large for the
+  *     compiler to be able to properly generate it in a realistic time.</dd>
+  * <dt><b>Up to and including 4 generators:</b></dt>
+  * <dd>Actually enumerate all group elements, but then check how many there
+  *     are. If there are more than 16, it is unlikely that unrolling the
+  *     loop (as is done in the static compile-time case) is sensible, so
+  *     use a dynamic group instead. If there are at most 16 elements, actually
+  *     use that static group. Note that the largest group with 4 generators
+  *     still compiles with reasonable resources.</dd>
+  * </dl>
+  *
+  * Note: Example compile time performance with g++-4.6 on an Intenl Core i5-3470
+  *       with 16 GiB RAM (all generators non-redundant and the subgroups don't
+  *       factorize):
+  *
+  *          # Generators          -O0 -ggdb               -O2
+  *          -------------------------------------------------------------------
+  *          1                 0.5 s  /   250 MiB     0.45s /   230 MiB
+  *          2                 0.5 s  /   260 MiB     0.5 s /   250 MiB
+  *          3                 0.65s  /   310 MiB     0.62s /   310 MiB
+  *          4                 2.2 s  /   860 MiB     1.7 s /   770 MiB
+  *          5               130   s  / 13000 MiB   120   s / 11000 MiB
+  *
+  * It is clear that everything is still very efficient up to 4 generators, then
+  * the memory and CPU requirements become unreasonable. Thus we only instantiate
+  * the template group theory logic if the number of generators supplied is 4 or
+  * lower, otherwise this will be forced to be done during runtime, where the
+  * algorithm is reasonably fast.
+  */
+template<std::size_t NumIndices>
+struct tensor_symmetry_pre_analysis<NumIndices>
+{
+  typedef StaticSGroup<> root_type;
+};
+
+template<std::size_t NumIndices, typename Gen_, typename... Gens_>
+struct tensor_symmetry_pre_analysis<NumIndices, Gen_, Gens_...>
+{
+  constexpr static std::size_t max_static_generators = 4;
+  constexpr static std::size_t max_static_elements = 16;
+  typedef tensor_static_symgroup_if<(sizeof...(Gens_) + 1 <= max_static_generators), NumIndices, Gen_, Gens_...> helper;
+  constexpr static std::size_t possible_size = helper::size;
+
+  typedef typename conditional<
+    possible_size == 0 || possible_size >= max_static_elements,
+    DynamicSGroupFromTemplateArgs<Gen_, Gens_...>,
+    typename helper::type
+  >::type root_type;
+};
+
+template<bool instantiate, std::size_t NumIndices, typename... Gens>
+struct tensor_static_symgroup_if
+{
+  constexpr static std::size_t size = 0;
+  typedef void type;
+};
+
+template<std::size_t NumIndices, typename... Gens>
+struct tensor_static_symgroup_if<true, NumIndices, Gens...> : tensor_static_symgroup<NumIndices, Gens...> {};
+
+template<typename Tensor_>
+struct tensor_symmetry_assign_value
+{
+  typedef typename Tensor_::Index Index;
+  typedef typename Tensor_::Scalar Scalar;
+  constexpr static std::size_t NumIndices = Tensor_::NumIndices;
+
+  static inline int run(const std::array<Index, NumIndices>& transformed_indices, int transformation_flags, int dummy, Tensor_& tensor, const Scalar& value_)
+  {
+    Scalar value(value_);
+    if (transformation_flags & ConjugationFlag)
+      value = numext::conj(value);
+    if (transformation_flags & NegationFlag)
+      value = -value;
+    tensor.coeffRef(transformed_indices) = value;
+    return dummy;
+  }
+};
+
+template<typename Tensor_>
+struct tensor_symmetry_calculate_flags
+{
+  typedef typename Tensor_::Index Index;
+  constexpr static std::size_t NumIndices = Tensor_::NumIndices;
+
+  static inline int run(const std::array<Index, NumIndices>& transformed_indices, int transform_flags, int current_flags, const std::array<Index, NumIndices>& orig_indices)
+  {
+    if (transformed_indices == orig_indices) {
+      if (transform_flags & (ConjugationFlag | NegationFlag))
+        return current_flags | GlobalImagFlag; // anti-hermitian diagonal
+      else if (transform_flags & ConjugationFlag)
+        return current_flags | GlobalRealFlag; // hermitian diagonal
+      else if (transform_flags & NegationFlag)
+        return current_flags | GlobalZeroFlag; // anti-symmetric diagonal
+    }
+    return current_flags;
+  }
+};
+
+template<typename Tensor_, typename Symmetry_, int Flags = 0>
+class tensor_symmetry_value_setter
+{
+  public:
+    typedef typename Tensor_::Index Index;
+    typedef typename Tensor_::Scalar Scalar;
+    constexpr static std::size_t NumIndices = Tensor_::NumIndices;
+
+    inline tensor_symmetry_value_setter(Tensor_& tensor, Symmetry_ const& symmetry, std::array<Index, NumIndices> const& indices)
+      : m_tensor(tensor), m_symmetry(symmetry), m_indices(indices) { }
+
+    inline tensor_symmetry_value_setter<Tensor_, Symmetry_, Flags>& operator=(Scalar const& value)
+    {
+      doAssign(value);
+      return *this;
+    }
+  private:
+    Tensor_& m_tensor;
+    Symmetry_ m_symmetry;
+    std::array<Index, NumIndices> m_indices;
+
+    inline void doAssign(Scalar const& value)
+    {
+      #ifdef EIGEN_TENSOR_SYMMETRY_CHECK_VALUES
+        int value_flags = m_symmetry.template apply<internal::tensor_symmetry_calculate_flags<Tensor_>, int>(m_indices, m_symmetry.globalFlags(), m_indices);
+        if (value_flags & GlobalRealFlag)
+          eigen_assert(numext::imag(value) == 0);
+        if (value_flags & GlobalImagFlag)
+          eigen_assert(numext::real(value) == 0);
+      #endif
+      m_symmetry.template apply<internal::tensor_symmetry_assign_value<Tensor_>, int>(m_indices, 0, m_tensor, value);
+    }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSORSYMMETRY_SYMMETRY_H
+
+/*
+ * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
+ */
diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h b/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h
new file mode 100644
index 000000000..0fe0b7c46
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h
@@ -0,0 +1,666 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSORSYMMETRY_TEMPLATEGROUPTHEORY_H
+#define EIGEN_CXX11_TENSORSYMMETRY_TEMPLATEGROUPTHEORY_H
+
+namespace Eigen {
+
+namespace internal {
+
+namespace group_theory {
+
+/** \internal
+  * \file CXX11/Tensor/util/TemplateGroupTheory.h
+  * This file contains C++ templates that implement group theory algorithms.
+  *
+  * The algorithms allow for a compile-time analysis of finite groups.
+  *
+  * Currently only Dimino's algorithm is implemented, which returns a list
+  * of all elements in a group given a set of (possibly redundant) generators.
+  * (One could also do that with the so-called orbital algorithm, but that
+  * is much more expensive and usually has no advantages.)
+  */
+
+/**********************************************************************
+ *                "Ok kid, here is where it gets complicated."
+ *                         - Amelia Pond in the "Doctor Who" episode
+ *                           "The Big Bang"
+ *
+ * Dimino's algorithm
+ * ==================
+ *
+ * The following is Dimino's algorithm in sequential form:
+ *
+ * Input: identity element, list of generators, equality check,
+ *        multiplication operation
+ * Output: list of group elements
+ *
+ * 1. add identity element
+ * 2. remove identities from list of generators
+ * 3. add all powers of first generator that aren't the
+ *    identity element
+ * 4. go through all remaining generators:
+ *        a. if generator is already in the list of elements
+ *                -> do nothing
+ *        b. otherwise
+ *                i.   remember current # of elements
+ *                     (i.e. the size of the current subgroup)
+ *                ii.  add all current elements (which includes
+ *                     the identity) each multiplied from right
+ *                     with the current generator to the group
+ *                iii. add all remaining cosets that are generated
+ *                     by products of the new generator with itself
+ *                     and all other generators seen so far
+ *
+ * In functional form, this is implemented as a long set of recursive
+ * templates that have a complicated relationship.
+ *
+ * The main interface for Dimino's algorithm is the template
+ * enumerate_group_elements. All lists are implemented as variadic
+ * type_list<typename...> and numeric_list<typename = int, int...>
+ * templates.
+ *
+ * 'Calling' templates is usually done via typedefs.
+ *
+ * This algorithm is an extended version of the basic version. The
+ * extension consists in the fact that each group element has a set
+ * of flags associated with it. Multiplication of two group elements
+ * with each other results in a group element whose flags are the
+ * XOR of the flags of the previous elements. Each time the algorithm
+ * notices that a group element it just calculated is already in the
+ * list of current elements, the flags of both will be compared and
+ * added to the so-called 'global flags' of the group.
+ *
+ * The rationale behind this extension is that this allows not only
+ * for the description of symmetries between tensor indices, but
+ * also allows for the description of hermiticity, antisymmetry and
+ * antihermiticity. Negation and conjugation each are specific bit
+ * in the flags value and if two different ways to reach a group
+ * element lead to two different flags, this poses a constraint on
+ * the allowed values of the resulting tensor. For example, if a
+ * group element is reach both with and without the conjugation
+ * flags, it is clear that the resulting tensor has to be real.
+ *
+ * Note that this flag mechanism is quite generic and may have other
+ * uses beyond tensor properties.
+ *
+ * IMPORTANT: 
+ *     This algorithm assumes the group to be finite. If you try to
+ *     run it with a group that's infinite, the algorithm will only
+ *     terminate once you hit a compiler limit (max template depth).
+ *     Also note that trying to use this implementation to create a
+ *     very large group will probably either make you hit the same
+ *     limit, cause the compiler to segfault or at the very least
+ *     take a *really* long time (hours, days, weeks - sic!) to
+ *     compile. It is not recommended to plug in more than 4
+ *     generators, unless they are independent of each other.
+ */
+
+/** \internal
+  *
+  * \class strip_identities
+  * \ingroup CXX11_TensorSymmetry_Module
+  *
+  * \brief Cleanse a list of group elements of the identity element
+  *
+  * This template is used to make a first pass through all initial
+  * generators of Dimino's algorithm and remove the identity
+  * elements.
+  *
+  * \sa enumerate_group_elements
+  */
+template<template<typename, typename> class Equality, typename id, typename L> struct strip_identities;
+
+template<
+  template<typename, typename> class Equality,
+  typename id,
+  typename t,
+  typename... ts
+>
+struct strip_identities<Equality, id, type_list<t, ts...>>
+{
+  typedef typename conditional<
+    Equality<id, t>::value,
+    typename strip_identities<Equality, id, type_list<ts...>>::type,
+    typename concat<type_list<t>, typename strip_identities<Equality, id, type_list<ts...>>::type>::type
+  >::type type;
+  constexpr static int global_flags = Equality<id, t>::global_flags | strip_identities<Equality, id, type_list<ts...>>::global_flags;
+};
+
+template<
+  template<typename, typename> class Equality,
+  typename id
+  EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, ts)
+>
+struct strip_identities<Equality, id, type_list<EIGEN_TPL_PP_SPEC_HACK_USE(ts)>>
+{
+  typedef type_list<> type;
+  constexpr static int global_flags = 0;
+};
+
+/** \internal
+  *
+  * \class dimino_first_step_elements_helper 
+  * \ingroup CXX11_TensorSymmetry_Module
+  *
+  * \brief Recursive template that adds powers of the first generator to the list of group elements
+  *
+  * This template calls itself recursively to add powers of the first
+  * generator to the list of group elements. It stops if it reaches
+  * the identity element again.
+  *
+  * \sa enumerate_group_elements, dimino_first_step_elements
+  */
+template<
+  template<typename, typename> class Multiply,
+  template<typename, typename> class Equality,
+  typename id,
+  typename g,
+  typename current_element,
+  typename elements,
+  bool dont_add_current_element   // = false
+>
+struct dimino_first_step_elements_helper :
+  public dimino_first_step_elements_helper<
+    Multiply,
+    Equality,
+    id,
+    g,
+    typename Multiply<current_element, g>::type,
+    typename concat<elements, type_list<current_element>>::type,
+    Equality<typename Multiply<current_element, g>::type, id>::value
+  > {};
+
+template<
+  template<typename, typename> class Multiply,
+  template<typename, typename> class Equality,
+  typename id,
+  typename g,
+  typename current_element,
+  typename elements
+>
+struct dimino_first_step_elements_helper<Multiply, Equality, id, g, current_element, elements, true>
+{
+  typedef elements type;
+  constexpr static int global_flags = Equality<current_element, id>::global_flags;
+};
+
+/** \internal
+  *
+  * \class dimino_first_step_elements
+  * \ingroup CXX11_TensorSymmetry_Module
+  *
+  * \brief Add all powers of the first generator to the list of group elements
+  *
+  * This template takes the first non-identity generator and generates the initial
+  * list of elements which consists of all powers of that generator. For a group
+  * with just one generated, it would be enumerated after this.
+  *
+  * \sa enumerate_group_elements
+  */
+template<
+  template<typename, typename> class Multiply,
+  template<typename, typename> class Equality,
+  typename id,
+  typename generators
+>
+struct dimino_first_step_elements
+{
+  typedef typename get<0, generators>::type first_generator;
+  typedef typename skip<1, generators>::type next_generators;
+  typedef type_list<first_generator> generators_done;
+
+  typedef dimino_first_step_elements_helper<
+    Multiply,
+    Equality,
+    id,
+    first_generator,
+    first_generator,
+    type_list<id>,
+    false
+  > helper;
+  typedef typename helper::type type;
+  constexpr static int global_flags = helper::global_flags;
+};
+
+/** \internal
+  *
+  * \class dimino_get_coset_elements
+  * \ingroup CXX11_TensorSymmetry_Module
+  *
+  * \brief Generate all elements of a specific coset
+  *
+  * This template generates all the elements of a specific coset by
+  * multiplying all elements in the given subgroup with the new
+  * coset representative. Note that the first element of the
+  * subgroup is always the identity element, so the first element of
+  * ther result of this template is going to be the coset
+  * representative itself.
+  *
+  * Note that this template accepts an additional boolean parameter
+  * that specifies whether to actually generate the coset (true) or
+  * just return an empty list (false).
+  *
+  * \sa enumerate_group_elements, dimino_add_cosets_for_rep
+  */
+template<
+  template<typename, typename> class Multiply,
+  typename sub_group_elements,
+  typename new_coset_rep,
+  bool generate_coset      // = true
+>
+struct dimino_get_coset_elements
+{
+  typedef typename apply_op_from_right<Multiply, new_coset_rep, sub_group_elements>::type type;
+};
+
+template<
+  template<typename, typename> class Multiply,
+  typename sub_group_elements,
+  typename new_coset_rep
+>
+struct dimino_get_coset_elements<Multiply, sub_group_elements, new_coset_rep, false>
+{
+  typedef type_list<> type;
+};
+
+/** \internal
+  *
+  * \class dimino_add_cosets_for_rep
+  * \ingroup CXX11_TensorSymmetry_Module
+  *
+  * \brief Recursive template for adding coset spaces
+  *
+  * This template multiplies the coset representative with a generator
+  * from the list of previous generators. If the new element is not in
+  * the group already, it adds the corresponding coset. Finally it
+  * proceeds to call itself with the next generator from the list.
+  *
+  * \sa enumerate_group_elements, dimino_add_all_coset_spaces
+  */
+template<
+  template<typename, typename> class Multiply,
+  template<typename, typename> class Equality,
+  typename id,
+  typename sub_group_elements,
+  typename elements,
+  typename generators,
+  typename rep_element,
+  int sub_group_size
+>
+struct dimino_add_cosets_for_rep;
+
+template<
+  template<typename, typename> class Multiply,
+  template<typename, typename> class Equality,
+  typename id,
+  typename sub_group_elements,
+  typename elements,
+  typename g,
+  typename... gs,
+  typename rep_element,
+  int sub_group_size
+>
+struct dimino_add_cosets_for_rep<Multiply, Equality, id, sub_group_elements, elements, type_list<g, gs...>, rep_element, sub_group_size>
+{
+  typedef typename Multiply<rep_element, g>::type new_coset_rep;
+  typedef contained_in_list_gf<Equality, new_coset_rep, elements> _cil;
+  constexpr static bool add_coset = !_cil::value;
+
+  typedef typename dimino_get_coset_elements<
+    Multiply,
+    sub_group_elements,
+    new_coset_rep,
+    add_coset
+  >::type coset_elements;
+
+  typedef dimino_add_cosets_for_rep<
+    Multiply,
+    Equality,
+    id,
+    sub_group_elements,
+    typename concat<elements, coset_elements>::type,
+    type_list<gs...>,
+    rep_element,
+    sub_group_size
+  > _helper;
+
+  typedef typename _helper::type type;
+  constexpr static int global_flags = _cil::global_flags | _helper::global_flags;
+
+  /* Note that we don't have to update global flags here, since
+   * we will only add these elements if they are not part of
+   * the group already. But that only happens if the coset rep
+   * is not already in the group, so the check for the coset rep
+   * will catch this.
+   */
+};
+
+template<
+  template<typename, typename> class Multiply,
+  template<typename, typename> class Equality,
+  typename id,
+  typename sub_group_elements,
+  typename elements
+  EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, empty),
+  typename rep_element,
+  int sub_group_size
+>
+struct dimino_add_cosets_for_rep<Multiply, Equality, id, sub_group_elements, elements, type_list<EIGEN_TPL_PP_SPEC_HACK_USE(empty)>, rep_element, sub_group_size>
+{
+  typedef elements type;
+  constexpr static int global_flags = 0;
+};
+
+/** \internal
+  *
+  * \class dimino_add_all_coset_spaces
+  * \ingroup CXX11_TensorSymmetry_Module
+  *
+  * \brief Recursive template for adding all coset spaces for a new generator
+  *
+  * This template tries to go through the list of generators (with
+  * the help of the dimino_add_cosets_for_rep template) as long as
+  * it still finds elements that are not part of the group and add
+  * the corresponding cosets.
+  *
+  * \sa enumerate_group_elements, dimino_add_cosets_for_rep
+  */
+template<
+  template<typename, typename> class Multiply,
+  template<typename, typename> class Equality,
+  typename id,
+  typename sub_group_elements,
+  typename elements,
+  typename generators,
+  int sub_group_size,
+  int rep_pos,
+  bool stop_condition        // = false
+>
+struct dimino_add_all_coset_spaces
+{
+  typedef typename get<rep_pos, elements>::type rep_element;
+  typedef dimino_add_cosets_for_rep<
+    Multiply,
+    Equality,
+    id,
+    sub_group_elements,
+    elements,
+    generators,
+    rep_element,
+    sub_group_elements::count
+  > _ac4r;
+  typedef typename _ac4r::type new_elements;
+  
+  constexpr static int new_rep_pos = rep_pos + sub_group_elements::count;
+  constexpr static bool new_stop_condition = new_rep_pos >= new_elements::count;
+
+  typedef dimino_add_all_coset_spaces<
+    Multiply,
+    Equality,
+    id,
+    sub_group_elements,
+    new_elements,
+    generators,
+    sub_group_size,
+    new_rep_pos,
+    new_stop_condition
+  > _helper;
+
+  typedef typename _helper::type type;
+  constexpr static int global_flags = _helper::global_flags | _ac4r::global_flags;
+};
+
+template<
+  template<typename, typename> class Multiply,
+  template<typename, typename> class Equality,
+  typename id,
+  typename sub_group_elements,
+  typename elements,
+  typename generators,
+  int sub_group_size,
+  int rep_pos
+>
+struct dimino_add_all_coset_spaces<Multiply, Equality, id, sub_group_elements, elements, generators, sub_group_size, rep_pos, true>
+{
+  typedef elements type;
+  constexpr static int global_flags = 0;
+};
+
+/** \internal
+  *
+  * \class dimino_add_generator
+  * \ingroup CXX11_TensorSymmetry_Module
+  *
+  * \brief Enlarge the group by adding a new generator.
+  *
+  * It accepts a boolean parameter that determines if the generator is redundant,
+  * i.e. was already seen in the group. In that case, it reduces to a no-op.
+  *
+  * \sa enumerate_group_elements, dimino_add_all_coset_spaces
+  */
+template<
+  template<typename, typename> class Multiply,
+  template<typename, typename> class Equality,
+  typename id,
+  typename elements,
+  typename generators_done,
+  typename current_generator,
+  bool redundant          // = false
+>
+struct dimino_add_generator
+{
+  /* this template is only called if the generator is not redundant
+   * => all elements of the group multiplied with the new generator
+   *    are going to be new elements of the most trivial coset space
+   */
+  typedef typename apply_op_from_right<Multiply, current_generator, elements>::type multiplied_elements;
+  typedef typename concat<elements, multiplied_elements>::type new_elements;
+
+  constexpr static int rep_pos = elements::count;
+
+  typedef dimino_add_all_coset_spaces<
+    Multiply,
+    Equality,
+    id,
+    elements, // elements of previous subgroup
+    new_elements,
+    typename concat<generators_done, type_list<current_generator>>::type,
+    elements::count, // size of previous subgroup
+    rep_pos,
+    false // don't stop (because rep_pos >= new_elements::count is always false at this point)
+  > _helper;
+  typedef typename _helper::type type;
+  constexpr static int global_flags = _helper::global_flags;
+};
+
+template<
+  template<typename, typename> class Multiply,
+  template<typename, typename> class Equality,
+  typename id,
+  typename elements,
+  typename generators_done,
+  typename current_generator
+>
+struct dimino_add_generator<Multiply, Equality, id, elements, generators_done, current_generator, true>
+{
+  // redundant case
+  typedef elements type;
+  constexpr static int global_flags = 0;
+};
+
+/** \internal
+  *
+  * \class dimino_add_remaining_generators
+  * \ingroup CXX11_TensorSymmetry_Module
+  *
+  * \brief Recursive template that adds all remaining generators to a group
+  *
+  * Loop through the list of generators that remain and successively
+  * add them to the group.
+  *
+  * \sa enumerate_group_elements, dimino_add_generator
+  */
+template<
+  template<typename, typename> class Multiply,
+  template<typename, typename> class Equality,
+  typename id,
+  typename generators_done,
+  typename remaining_generators,
+  typename elements
+>
+struct dimino_add_remaining_generators
+{
+  typedef typename get<0, remaining_generators>::type first_generator;
+  typedef typename skip<1, remaining_generators>::type next_generators;
+
+  typedef contained_in_list_gf<Equality, first_generator, elements> _cil;
+
+  typedef dimino_add_generator<
+    Multiply,
+    Equality,
+    id,
+    elements,
+    generators_done,
+    first_generator,
+    _cil::value
+  > _helper;
+
+  typedef typename _helper::type new_elements;
+
+  typedef dimino_add_remaining_generators<
+    Multiply,
+    Equality,
+    id,
+    typename concat<generators_done, type_list<first_generator>>::type,
+    next_generators,
+    new_elements
+  > _next_iter;
+
+  typedef typename _next_iter::type type;
+  constexpr static int global_flags =
+    _cil::global_flags |
+    _helper::global_flags |
+    _next_iter::global_flags;
+};
+
+template<
+  template<typename, typename> class Multiply,
+  template<typename, typename> class Equality,
+  typename id,
+  typename generators_done,
+  typename elements
+>
+struct dimino_add_remaining_generators<Multiply, Equality, id, generators_done, type_list<>, elements>
+{
+  typedef elements type;
+  constexpr static int global_flags = 0;
+};
+
+/** \internal
+  *
+  * \class enumerate_group_elements_noid
+  * \ingroup CXX11_TensorSymmetry_Module
+  *
+  * \brief Helper template that implements group element enumeration
+  *
+  * This is a helper template that implements the actual enumeration
+  * of group elements. This has been split so that the list of
+  * generators can be cleansed of the identity element before
+  * performing the actual operation.
+  *
+  * \sa enumerate_group_elements
+  */
+template<
+  template<typename, typename> class Multiply,
+  template<typename, typename> class Equality,
+  typename id,
+  typename generators,
+  int initial_global_flags = 0
+>
+struct enumerate_group_elements_noid
+{
+  typedef dimino_first_step_elements<Multiply, Equality, id, generators> first_step;
+  typedef typename first_step::type first_step_elements;
+
+  typedef dimino_add_remaining_generators<
+    Multiply,
+    Equality,
+    id,
+    typename first_step::generators_done,
+    typename first_step::next_generators, // remaining_generators
+    typename first_step::type // first_step elements
+  > _helper;
+
+  typedef typename _helper::type type;
+  constexpr static int global_flags =
+    initial_global_flags |
+    first_step::global_flags |
+    _helper::global_flags;
+};
+
+// in case when no generators are specified
+template<
+  template<typename, typename> class Multiply,
+  template<typename, typename> class Equality,
+  typename id,
+  int initial_global_flags
+>
+struct enumerate_group_elements_noid<Multiply, Equality, id, type_list<>, initial_global_flags>
+{
+  typedef type_list<id> type;
+  constexpr static int global_flags = initial_global_flags;
+};
+
+/** \internal
+  *
+  * \class enumerate_group_elements
+  * \ingroup CXX11_TensorSymmetry_Module
+  *
+  * \brief Enumerate all elements in a finite group
+  *
+  * This template enumerates all elements in a finite group. It accepts
+  * the following template parameters:
+  *
+  * \tparam Multiply      The multiplication operation that multiplies two group elements
+  *                       with each other.
+  * \tparam Equality      The equality check operation that checks if two group elements
+  *                       are equal to another.
+  * \tparam id            The identity element
+  * \tparam _generators   A list of (possibly redundant) generators of the group
+  */
+template<
+  template<typename, typename> class Multiply,
+  template<typename, typename> class Equality,
+  typename id,
+  typename _generators
+>
+struct enumerate_group_elements
+  : public enumerate_group_elements_noid<
+      Multiply,
+      Equality,
+      id,
+      typename strip_identities<Equality, id, _generators>::type,
+      strip_identities<Equality, id, _generators>::global_flags
+    >
+{
+};
+
+} // end namespace group_theory
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSORSYMMETRY_TEMPLATEGROUPTHEORY_H
+
+/*
+ * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
+ */
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h
new file mode 100644
index 000000000..71d55552d
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h
@@ -0,0 +1,233 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_EVENTCOUNT_H_
+#define EIGEN_CXX11_THREADPOOL_EVENTCOUNT_H_
+
+namespace Eigen {
+
+// EventCount allows to wait for arbitrary predicates in non-blocking
+// algorithms. Think of condition variable, but wait predicate does not need to
+// be protected by a mutex. Usage:
+// Waiting thread does:
+//
+//   if (predicate)
+//     return act();
+//   EventCount::Waiter& w = waiters[my_index];
+//   ec.Prewait(&w);
+//   if (predicate) {
+//     ec.CancelWait(&w);
+//     return act();
+//   }
+//   ec.CommitWait(&w);
+//
+// Notifying thread does:
+//
+//   predicate = true;
+//   ec.Notify(true);
+//
+// Notify is cheap if there are no waiting threads. Prewait/CommitWait are not
+// cheap, but they are executed only if the preceeding predicate check has
+// failed.
+//
+// Algorihtm outline:
+// There are two main variables: predicate (managed by user) and state_.
+// Operation closely resembles Dekker mutual algorithm:
+// https://en.wikipedia.org/wiki/Dekker%27s_algorithm
+// Waiting thread sets state_ then checks predicate, Notifying thread sets
+// predicate then checks state_. Due to seq_cst fences in between these
+// operations it is guaranteed than either waiter will see predicate change
+// and won't block, or notifying thread will see state_ change and will unblock
+// the waiter, or both. But it can't happen that both threads don't see each
+// other changes, which would lead to deadlock.
+class EventCount {
+ public:
+  class Waiter;
+
+  EventCount(MaxSizeVector<Waiter>& waiters) : waiters_(waiters) {
+    eigen_assert(waiters.size() < (1 << kWaiterBits) - 1);
+    // Initialize epoch to something close to overflow to test overflow.
+    state_ = kStackMask | (kEpochMask - kEpochInc * waiters.size() * 2);
+  }
+
+  ~EventCount() {
+    // Ensure there are no waiters.
+    eigen_assert((state_.load() & (kStackMask | kWaiterMask)) == kStackMask);
+  }
+
+  // Prewait prepares for waiting.
+  // After calling this function the thread must re-check the wait predicate
+  // and call either CancelWait or CommitWait passing the same Waiter object.
+  void Prewait(Waiter* w) {
+    w->epoch = state_.fetch_add(kWaiterInc, std::memory_order_relaxed);
+    std::atomic_thread_fence(std::memory_order_seq_cst);
+  }
+
+  // CommitWait commits waiting.
+  void CommitWait(Waiter* w) {
+    w->state = Waiter::kNotSignaled;
+    // Modification epoch of this waiter.
+    uint64_t epoch =
+        (w->epoch & kEpochMask) +
+        (((w->epoch & kWaiterMask) >> kWaiterShift) << kEpochShift);
+    uint64_t state = state_.load(std::memory_order_seq_cst);
+    for (;;) {
+      if (int64_t((state & kEpochMask) - epoch) < 0) {
+        // The preceeding waiter has not decided on its fate. Wait until it
+        // calls either CancelWait or CommitWait, or is notified.
+        EIGEN_THREAD_YIELD();
+        state = state_.load(std::memory_order_seq_cst);
+        continue;
+      }
+      // We've already been notified.
+      if (int64_t((state & kEpochMask) - epoch) > 0) return;
+      // Remove this thread from prewait counter and add it to the waiter list.
+      eigen_assert((state & kWaiterMask) != 0);
+      uint64_t newstate = state - kWaiterInc + kEpochInc;
+      newstate = (newstate & ~kStackMask) | (w - &waiters_[0]);
+      if ((state & kStackMask) == kStackMask)
+        w->next.store(nullptr, std::memory_order_relaxed);
+      else
+        w->next.store(&waiters_[state & kStackMask], std::memory_order_relaxed);
+      if (state_.compare_exchange_weak(state, newstate,
+                                       std::memory_order_release))
+        break;
+    }
+    Park(w);
+  }
+
+  // CancelWait cancels effects of the previous Prewait call.
+  void CancelWait(Waiter* w) {
+    uint64_t epoch =
+        (w->epoch & kEpochMask) +
+        (((w->epoch & kWaiterMask) >> kWaiterShift) << kEpochShift);
+    uint64_t state = state_.load(std::memory_order_relaxed);
+    for (;;) {
+      if (int64_t((state & kEpochMask) - epoch) < 0) {
+        // The preceeding waiter has not decided on its fate. Wait until it
+        // calls either CancelWait or CommitWait, or is notified.
+        EIGEN_THREAD_YIELD();
+        state = state_.load(std::memory_order_relaxed);
+        continue;
+      }
+      // We've already been notified.
+      if (int64_t((state & kEpochMask) - epoch) > 0) return;
+      // Remove this thread from prewait counter.
+      eigen_assert((state & kWaiterMask) != 0);
+      if (state_.compare_exchange_weak(state, state - kWaiterInc + kEpochInc,
+                                       std::memory_order_relaxed))
+        return;
+    }
+  }
+
+  // Notify wakes one or all waiting threads.
+  // Must be called after changing the associated wait predicate.
+  void Notify(bool all) {
+    std::atomic_thread_fence(std::memory_order_seq_cst);
+    uint64_t state = state_.load(std::memory_order_acquire);
+    for (;;) {
+      // Easy case: no waiters.
+      if ((state & kStackMask) == kStackMask && (state & kWaiterMask) == 0)
+        return;
+      uint64_t waiters = (state & kWaiterMask) >> kWaiterShift;
+      uint64_t newstate;
+      if (all) {
+        // Reset prewait counter and empty wait list.
+        newstate = (state & kEpochMask) + (kEpochInc * waiters) + kStackMask;
+      } else if (waiters) {
+        // There is a thread in pre-wait state, unblock it.
+        newstate = state + kEpochInc - kWaiterInc;
+      } else {
+        // Pop a waiter from list and unpark it.
+        Waiter* w = &waiters_[state & kStackMask];
+        Waiter* wnext = w->next.load(std::memory_order_relaxed);
+        uint64_t next = kStackMask;
+        if (wnext != nullptr) next = wnext - &waiters_[0];
+        // Note: we don't add kEpochInc here. ABA problem on the lock-free stack
+        // can't happen because a waiter is re-pushed onto the stack only after
+        // it was in the pre-wait state which inevitably leads to epoch
+        // increment.
+        newstate = (state & kEpochMask) + next;
+      }
+      if (state_.compare_exchange_weak(state, newstate,
+                                       std::memory_order_acquire)) {
+        if (!all && waiters) return;  // unblocked pre-wait thread
+        if ((state & kStackMask) == kStackMask) return;
+        Waiter* w = &waiters_[state & kStackMask];
+        if (!all) w->next.store(nullptr, std::memory_order_relaxed);
+        Unpark(w);
+        return;
+      }
+    }
+  }
+
+  class Waiter {
+    friend class EventCount;
+    // Align to 128 byte boundary to prevent false sharing with other Waiter objects in the same vector.
+    EIGEN_ALIGN_TO_BOUNDARY(128) std::atomic<Waiter*> next;
+    std::mutex mu;
+    std::condition_variable cv;
+    uint64_t epoch;
+    unsigned state;
+    enum {
+      kNotSignaled,
+      kWaiting,
+      kSignaled,
+    };
+  };
+
+ private:
+  // State_ layout:
+  // - low kStackBits is a stack of waiters committed wait.
+  // - next kWaiterBits is count of waiters in prewait state.
+  // - next kEpochBits is modification counter.
+  static const uint64_t kStackBits = 16;
+  static const uint64_t kStackMask = (1ull << kStackBits) - 1;
+  static const uint64_t kWaiterBits = 16;
+  static const uint64_t kWaiterShift = 16;
+  static const uint64_t kWaiterMask = ((1ull << kWaiterBits) - 1)
+                                      << kWaiterShift;
+  static const uint64_t kWaiterInc = 1ull << kWaiterBits;
+  static const uint64_t kEpochBits = 32;
+  static const uint64_t kEpochShift = 32;
+  static const uint64_t kEpochMask = ((1ull << kEpochBits) - 1) << kEpochShift;
+  static const uint64_t kEpochInc = 1ull << kEpochShift;
+  std::atomic<uint64_t> state_;
+  MaxSizeVector<Waiter>& waiters_;
+
+  void Park(Waiter* w) {
+    std::unique_lock<std::mutex> lock(w->mu);
+    while (w->state != Waiter::kSignaled) {
+      w->state = Waiter::kWaiting;
+      w->cv.wait(lock);
+    }
+  }
+
+  void Unpark(Waiter* waiters) {
+    Waiter* next = nullptr;
+    for (Waiter* w = waiters; w; w = next) {
+      next = w->next.load(std::memory_order_relaxed);
+      unsigned state;
+      {
+        std::unique_lock<std::mutex> lock(w->mu);
+        state = w->state;
+        w->state = Waiter::kSignaled;
+      }
+      // Avoid notifying if it wasn't waiting.
+      if (state == Waiter::kWaiting) w->cv.notify_one();
+    }
+  }
+
+  EventCount(const EventCount&) = delete;
+  void operator=(const EventCount&) = delete;
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_CXX11_THREADPOOL_EVENTCOUNT_H_
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
new file mode 100644
index 000000000..354bce52a
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
@@ -0,0 +1,274 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H
+#define EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H
+
+
+namespace Eigen {
+
+template <typename Environment>
+class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
+ public:
+  typedef typename Environment::Task Task;
+  typedef RunQueue<Task, 1024> Queue;
+
+  NonBlockingThreadPoolTempl(int num_threads, Environment env = Environment())
+      : env_(env),
+        threads_(num_threads),
+        queues_(num_threads),
+        coprimes_(num_threads),
+        waiters_(num_threads),
+        blocked_(0),
+        spinning_(0),
+        done_(false),
+        ec_(waiters_) {
+    waiters_.resize(num_threads);
+
+    // Calculate coprimes of num_threads.
+    // Coprimes are used for a random walk over all threads in Steal
+    // and NonEmptyQueueIndex. Iteration is based on the fact that if we take
+    // a walk starting thread index t and calculate num_threads - 1 subsequent
+    // indices as (t + coprime) % num_threads, we will cover all threads without
+    // repetitions (effectively getting a presudo-random permutation of thread
+    // indices).
+    for (int i = 1; i <= num_threads; i++) {
+      unsigned a = i;
+      unsigned b = num_threads;
+      // If GCD(a, b) == 1, then a and b are coprimes.
+      while (b != 0) {
+        unsigned tmp = a;
+        a = b;
+        b = tmp % b;
+      }
+      if (a == 1) {
+        coprimes_.push_back(i);
+      }
+    }
+    for (int i = 0; i < num_threads; i++) {
+      queues_.push_back(new Queue());
+    }
+    for (int i = 0; i < num_threads; i++) {
+      threads_.push_back(env_.CreateThread([this, i]() { WorkerLoop(i); }));
+    }
+  }
+
+  ~NonBlockingThreadPoolTempl() {
+    done_ = true;
+    // Now if all threads block without work, they will start exiting.
+    // But note that threads can continue to work arbitrary long,
+    // block, submit new work, unblock and otherwise live full life.
+    ec_.Notify(true);
+
+    // Join threads explicitly to avoid destruction order issues.
+    for (size_t i = 0; i < threads_.size(); i++) delete threads_[i];
+    for (size_t i = 0; i < threads_.size(); i++) delete queues_[i];
+  }
+
+  void Schedule(std::function<void()> fn) {
+    Task t = env_.CreateTask(std::move(fn));
+    PerThread* pt = GetPerThread();
+    if (pt->pool == this) {
+      // Worker thread of this pool, push onto the thread's queue.
+      Queue* q = queues_[pt->thread_id];
+      t = q->PushFront(std::move(t));
+    } else {
+      // A free-standing thread (or worker of another pool), push onto a random
+      // queue.
+      Queue* q = queues_[Rand(&pt->rand) % queues_.size()];
+      t = q->PushBack(std::move(t));
+    }
+    // Note: below we touch this after making w available to worker threads.
+    // Strictly speaking, this can lead to a racy-use-after-free. Consider that
+    // Schedule is called from a thread that is neither main thread nor a worker
+    // thread of this pool. Then, execution of w directly or indirectly
+    // completes overall computations, which in turn leads to destruction of
+    // this. We expect that such scenario is prevented by program, that is,
+    // this is kept alive while any threads can potentially be in Schedule.
+    if (!t.f)
+      ec_.Notify(false);
+    else
+      env_.ExecuteTask(t);  // Push failed, execute directly.
+  }
+
+  int NumThreads() const final {
+    return static_cast<int>(threads_.size());
+  }
+
+  int CurrentThreadId() const final {
+    const PerThread* pt =
+        const_cast<NonBlockingThreadPoolTempl*>(this)->GetPerThread();
+    if (pt->pool == this) {
+      return pt->thread_id;
+    } else {
+      return -1;
+    }
+  }
+
+ private:
+  typedef typename Environment::EnvThread Thread;
+
+  struct PerThread {
+    constexpr PerThread() : pool(NULL), rand(0), thread_id(-1) { }
+    NonBlockingThreadPoolTempl* pool;  // Parent pool, or null for normal threads.
+    uint64_t rand;  // Random generator state.
+    int thread_id;  // Worker thread index in pool.
+  };
+
+  Environment env_;
+  MaxSizeVector<Thread*> threads_;
+  MaxSizeVector<Queue*> queues_;
+  MaxSizeVector<unsigned> coprimes_;
+  MaxSizeVector<EventCount::Waiter> waiters_;
+  std::atomic<unsigned> blocked_;
+  std::atomic<bool> spinning_;
+  std::atomic<bool> done_;
+  EventCount ec_;
+
+  // Main worker thread loop.
+  void WorkerLoop(int thread_id) {
+    PerThread* pt = GetPerThread();
+    pt->pool = this;
+    pt->rand = std::hash<std::thread::id>()(std::this_thread::get_id());
+    pt->thread_id = thread_id;
+    Queue* q = queues_[thread_id];
+    EventCount::Waiter* waiter = &waiters_[thread_id];
+    for (;;) {
+      Task t = q->PopFront();
+      if (!t.f) {
+        t = Steal();
+        if (!t.f) {
+          // Leave one thread spinning. This reduces latency.
+          // TODO(dvyukov): 1000 iterations is based on fair dice roll, tune it.
+          // Also, the time it takes to attempt to steal work 1000 times depends
+          // on the size of the thread pool. However the speed at which the user
+          // of the thread pool submit tasks is independent of the size of the
+          // pool. Consider a time based limit instead.
+          if (!spinning_ && !spinning_.exchange(true)) {
+            for (int i = 0; i < 1000 && !t.f; i++) {
+              t = Steal();
+            }
+            spinning_ = false;
+          }
+          if (!t.f) {
+            if (!WaitForWork(waiter, &t)) {
+              return;
+            }
+          }
+        }
+      }
+      if (t.f) {
+        env_.ExecuteTask(t);
+      }
+    }
+  }
+
+  // Steal tries to steal work from other worker threads in best-effort manner.
+  Task Steal() {
+    PerThread* pt = GetPerThread();
+    const size_t size = queues_.size();
+    unsigned r = Rand(&pt->rand);
+    unsigned inc = coprimes_[r % coprimes_.size()];
+    unsigned victim = r % size;
+    for (unsigned i = 0; i < size; i++) {
+      Task t = queues_[victim]->PopBack();
+      if (t.f) {
+        return t;
+      }
+      victim += inc;
+      if (victim >= size) {
+        victim -= size;
+      }
+    }
+    return Task();
+  }
+
+  // WaitForWork blocks until new work is available (returns true), or if it is
+  // time to exit (returns false). Can optionally return a task to execute in t
+  // (in such case t.f != nullptr on return).
+  bool WaitForWork(EventCount::Waiter* waiter, Task* t) {
+    eigen_assert(!t->f);
+    // We already did best-effort emptiness check in Steal, so prepare for
+    // blocking.
+    ec_.Prewait(waiter);
+    // Now do a reliable emptiness check.
+    int victim = NonEmptyQueueIndex();
+    if (victim != -1) {
+      ec_.CancelWait(waiter);
+      *t = queues_[victim]->PopBack();
+      return true;
+    }
+    // Number of blocked threads is used as termination condition.
+    // If we are shutting down and all worker threads blocked without work,
+    // that's we are done.
+    blocked_++;
+    if (done_ && blocked_ == threads_.size()) {
+      ec_.CancelWait(waiter);
+      // Almost done, but need to re-check queues.
+      // Consider that all queues are empty and all worker threads are preempted
+      // right after incrementing blocked_ above. Now a free-standing thread
+      // submits work and calls destructor (which sets done_). If we don't
+      // re-check queues, we will exit leaving the work unexecuted.
+      if (NonEmptyQueueIndex() != -1) {
+        // Note: we must not pop from queues before we decrement blocked_,
+        // otherwise the following scenario is possible. Consider that instead
+        // of checking for emptiness we popped the only element from queues.
+        // Now other worker threads can start exiting, which is bad if the
+        // work item submits other work. So we just check emptiness here,
+        // which ensures that all worker threads exit at the same time.
+        blocked_--;
+        return true;
+      }
+      // Reached stable termination state.
+      ec_.Notify(true);
+      return false;
+    }
+    ec_.CommitWait(waiter);
+    blocked_--;
+    return true;
+  }
+
+  int NonEmptyQueueIndex() {
+    PerThread* pt = GetPerThread();
+    const size_t size = queues_.size();
+    unsigned r = Rand(&pt->rand);
+    unsigned inc = coprimes_[r % coprimes_.size()];
+    unsigned victim = r % size;
+    for (unsigned i = 0; i < size; i++) {
+      if (!queues_[victim]->Empty()) {
+        return victim;
+      }
+      victim += inc;
+      if (victim >= size) {
+        victim -= size;
+      }
+    }
+    return -1;
+  }
+
+  static EIGEN_STRONG_INLINE PerThread* GetPerThread() {
+    EIGEN_THREAD_LOCAL PerThread per_thread_;
+    PerThread* pt = &per_thread_;
+    return pt;
+  }
+
+  static EIGEN_STRONG_INLINE unsigned Rand(uint64_t* state) {
+    uint64_t current = *state;
+    // Update the internal state
+    *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
+    // Generate the random output (using the PCG-XSH-RS scheme)
+    return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
+  }
+};
+
+typedef NonBlockingThreadPoolTempl<StlThreadEnvironment> NonBlockingThreadPool;
+
+}  // namespace Eigen
+
+#endif  // EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h b/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h
new file mode 100644
index 000000000..05ed76cbe
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h
@@ -0,0 +1,210 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_RUNQUEUE_H_
+#define EIGEN_CXX11_THREADPOOL_RUNQUEUE_H_
+
+
+namespace Eigen {
+
+// RunQueue is a fixed-size, partially non-blocking deque or Work items.
+// Operations on front of the queue must be done by a single thread (owner),
+// operations on back of the queue can be done by multiple threads concurrently.
+//
+// Algorithm outline:
+// All remote threads operating on the queue back are serialized by a mutex.
+// This ensures that at most two threads access state: owner and one remote
+// thread (Size aside). The algorithm ensures that the occupied region of the
+// underlying array is logically continuous (can wraparound, but no stray
+// occupied elements). Owner operates on one end of this region, remote thread
+// operates on the other end. Synchronization between these threads
+// (potential consumption of the last element and take up of the last empty
+// element) happens by means of state variable in each element. States are:
+// empty, busy (in process of insertion of removal) and ready. Threads claim
+// elements (empty->busy and ready->busy transitions) by means of a CAS
+// operation. The finishing transition (busy->empty and busy->ready) are done
+// with plain store as the element is exclusively owned by the current thread.
+//
+// Note: we could permit only pointers as elements, then we would not need
+// separate state variable as null/non-null pointer value would serve as state,
+// but that would require malloc/free per operation for large, complex values
+// (and this is designed to store std::function<()>).
+template <typename Work, unsigned kSize>
+class RunQueue {
+ public:
+  RunQueue() : front_(0), back_(0) {
+    // require power-of-two for fast masking
+    eigen_assert((kSize & (kSize - 1)) == 0);
+    eigen_assert(kSize > 2);            // why would you do this?
+    eigen_assert(kSize <= (64 << 10));  // leave enough space for counter
+    for (unsigned i = 0; i < kSize; i++)
+      array_[i].state.store(kEmpty, std::memory_order_relaxed);
+  }
+
+  ~RunQueue() { eigen_assert(Size() == 0); }
+
+  // PushFront inserts w at the beginning of the queue.
+  // If queue is full returns w, otherwise returns default-constructed Work.
+  Work PushFront(Work w) {
+    unsigned front = front_.load(std::memory_order_relaxed);
+    Elem* e = &array_[front & kMask];
+    uint8_t s = e->state.load(std::memory_order_relaxed);
+    if (s != kEmpty ||
+        !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire))
+      return w;
+    front_.store(front + 1 + (kSize << 1), std::memory_order_relaxed);
+    e->w = std::move(w);
+    e->state.store(kReady, std::memory_order_release);
+    return Work();
+  }
+
+  // PopFront removes and returns the first element in the queue.
+  // If the queue was empty returns default-constructed Work.
+  Work PopFront() {
+    unsigned front = front_.load(std::memory_order_relaxed);
+    Elem* e = &array_[(front - 1) & kMask];
+    uint8_t s = e->state.load(std::memory_order_relaxed);
+    if (s != kReady ||
+        !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire))
+      return Work();
+    Work w = std::move(e->w);
+    e->state.store(kEmpty, std::memory_order_release);
+    front = ((front - 1) & kMask2) | (front & ~kMask2);
+    front_.store(front, std::memory_order_relaxed);
+    return w;
+  }
+
+  // PushBack adds w at the end of the queue.
+  // If queue is full returns w, otherwise returns default-constructed Work.
+  Work PushBack(Work w) {
+    std::unique_lock<std::mutex> lock(mutex_);
+    unsigned back = back_.load(std::memory_order_relaxed);
+    Elem* e = &array_[(back - 1) & kMask];
+    uint8_t s = e->state.load(std::memory_order_relaxed);
+    if (s != kEmpty ||
+        !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire))
+      return w;
+    back = ((back - 1) & kMask2) | (back & ~kMask2);
+    back_.store(back, std::memory_order_relaxed);
+    e->w = std::move(w);
+    e->state.store(kReady, std::memory_order_release);
+    return Work();
+  }
+
+  // PopBack removes and returns the last elements in the queue.
+  // Can fail spuriously.
+  Work PopBack() {
+    if (Empty()) return Work();
+    std::unique_lock<std::mutex> lock(mutex_, std::try_to_lock);
+    if (!lock) return Work();
+    unsigned back = back_.load(std::memory_order_relaxed);
+    Elem* e = &array_[back & kMask];
+    uint8_t s = e->state.load(std::memory_order_relaxed);
+    if (s != kReady ||
+        !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire))
+      return Work();
+    Work w = std::move(e->w);
+    e->state.store(kEmpty, std::memory_order_release);
+    back_.store(back + 1 + (kSize << 1), std::memory_order_relaxed);
+    return w;
+  }
+
+  // PopBackHalf removes and returns half last elements in the queue.
+  // Returns number of elements removed. But can also fail spuriously.
+  unsigned PopBackHalf(std::vector<Work>* result) {
+    if (Empty()) return 0;
+    std::unique_lock<std::mutex> lock(mutex_, std::try_to_lock);
+    if (!lock) return 0;
+    unsigned back = back_.load(std::memory_order_relaxed);
+    unsigned size = Size();
+    unsigned mid = back;
+    if (size > 1) mid = back + (size - 1) / 2;
+    unsigned n = 0;
+    unsigned start = 0;
+    for (; static_cast<int>(mid - back) >= 0; mid--) {
+      Elem* e = &array_[mid & kMask];
+      uint8_t s = e->state.load(std::memory_order_relaxed);
+      if (n == 0) {
+        if (s != kReady ||
+            !e->state.compare_exchange_strong(s, kBusy,
+                                              std::memory_order_acquire))
+          continue;
+        start = mid;
+      } else {
+        // Note: no need to store temporal kBusy, we exclusively own these
+        // elements.
+        eigen_assert(s == kReady);
+      }
+      result->push_back(std::move(e->w));
+      e->state.store(kEmpty, std::memory_order_release);
+      n++;
+    }
+    if (n != 0)
+      back_.store(start + 1 + (kSize << 1), std::memory_order_relaxed);
+    return n;
+  }
+
+  // Size returns current queue size.
+  // Can be called by any thread at any time.
+  unsigned Size() const {
+    // Emptiness plays critical role in thread pool blocking. So we go to great
+    // effort to not produce false positives (claim non-empty queue as empty).
+    for (;;) {
+      // Capture a consistent snapshot of front/tail.
+      unsigned front = front_.load(std::memory_order_acquire);
+      unsigned back = back_.load(std::memory_order_acquire);
+      unsigned front1 = front_.load(std::memory_order_relaxed);
+      if (front != front1) continue;
+      int size = (front & kMask2) - (back & kMask2);
+      // Fix overflow.
+      if (size < 0) size += 2 * kSize;
+      // Order of modification in push/pop is crafted to make the queue look
+      // larger than it is during concurrent modifications. E.g. pop can
+      // decrement size before the corresponding push has incremented it.
+      // So the computed size can be up to kSize + 1, fix it.
+      if (size > static_cast<int>(kSize)) size = kSize;
+      return size;
+    }
+  }
+
+  // Empty tests whether container is empty.
+  // Can be called by any thread at any time.
+  bool Empty() const { return Size() == 0; }
+
+ private:
+  static const unsigned kMask = kSize - 1;
+  static const unsigned kMask2 = (kSize << 1) - 1;
+  struct Elem {
+    std::atomic<uint8_t> state;
+    Work w;
+  };
+  enum {
+    kEmpty,
+    kBusy,
+    kReady,
+  };
+  std::mutex mutex_;
+  // Low log(kSize) + 1 bits in front_ and back_ contain rolling index of
+  // front/back, repsectively. The remaining bits contain modification counters
+  // that are incremented on Push operations. This allows us to (1) distinguish
+  // between empty and full conditions (if we would use log(kSize) bits for
+  // position, these conditions would be indistinguishable); (2) obtain
+  // consistent snapshot of front_/back_ for Size operation using the
+  // modification counters.
+  std::atomic<unsigned> front_;
+  std::atomic<unsigned> back_;
+  Elem array_[kSize];
+
+  RunQueue(const RunQueue&) = delete;
+  void operator=(const RunQueue&) = delete;
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_CXX11_THREADPOOL_RUNQUEUE_H_
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h
new file mode 100644
index 000000000..e75d0f467
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h
@@ -0,0 +1,154 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_SIMPLE_THREAD_POOL_H
+#define EIGEN_CXX11_THREADPOOL_SIMPLE_THREAD_POOL_H
+
+namespace Eigen {
+
+// The implementation of the ThreadPool type ensures that the Schedule method
+// runs the functions it is provided in FIFO order when the scheduling is done
+// by a single thread.
+// Environment provides a way to create threads and also allows to intercept
+// task submission and execution.
+template <typename Environment>
+class SimpleThreadPoolTempl : public ThreadPoolInterface {
+ public:
+  // Construct a pool that contains "num_threads" threads.
+  explicit SimpleThreadPoolTempl(int num_threads, Environment env = Environment())
+      : env_(env), threads_(num_threads), waiters_(num_threads) {
+    for (int i = 0; i < num_threads; i++) {
+      threads_.push_back(env.CreateThread([this, i]() { WorkerLoop(i); }));
+    }
+  }
+
+  // Wait until all scheduled work has finished and then destroy the
+  // set of threads.
+  ~SimpleThreadPoolTempl() {
+    {
+      // Wait for all work to get done.
+      std::unique_lock<std::mutex> l(mu_);
+      while (!pending_.empty()) {
+        empty_.wait(l);
+      }
+      exiting_ = true;
+
+      // Wakeup all waiters.
+      for (auto w : waiters_) {
+        w->ready = true;
+        w->task.f = nullptr;
+        w->cv.notify_one();
+      }
+    }
+
+    // Wait for threads to finish.
+    for (auto t : threads_) {
+      delete t;
+    }
+  }
+
+  // Schedule fn() for execution in the pool of threads. The functions are
+  // executed in the order in which they are scheduled.
+  void Schedule(std::function<void()> fn) final {
+    Task t = env_.CreateTask(std::move(fn));
+    std::unique_lock<std::mutex> l(mu_);
+    if (waiters_.empty()) {
+      pending_.push_back(std::move(t));
+    } else {
+      Waiter* w = waiters_.back();
+      waiters_.pop_back();
+      w->ready = true;
+      w->task = std::move(t);
+      w->cv.notify_one();
+    }
+  }
+
+  int NumThreads() const final {
+    return static_cast<int>(threads_.size());
+  }
+
+  int CurrentThreadId() const final {
+    const PerThread* pt = this->GetPerThread();
+    if (pt->pool == this) {
+      return pt->thread_id;
+    } else {
+      return -1;
+    }
+  }
+
+ protected:
+  void WorkerLoop(int thread_id) {
+    std::unique_lock<std::mutex> l(mu_);
+    PerThread* pt = GetPerThread();
+    pt->pool = this;
+    pt->thread_id = thread_id;
+    Waiter w;
+    Task t;
+    while (!exiting_) {
+      if (pending_.empty()) {
+        // Wait for work to be assigned to me
+        w.ready = false;
+        waiters_.push_back(&w);
+        while (!w.ready) {
+          w.cv.wait(l);
+        }
+        t = w.task;
+        w.task.f = nullptr;
+      } else {
+        // Pick up pending work
+        t = std::move(pending_.front());
+        pending_.pop_front();
+        if (pending_.empty()) {
+          empty_.notify_all();
+        }
+      }
+      if (t.f) {
+        mu_.unlock();
+        env_.ExecuteTask(t);
+        t.f = nullptr;
+        mu_.lock();
+      }
+    }
+  }
+
+ private:
+  typedef typename Environment::Task Task;
+  typedef typename Environment::EnvThread Thread;
+
+  struct Waiter {
+    std::condition_variable cv;
+    Task task;
+    bool ready;
+  };
+
+  struct PerThread {
+    constexpr PerThread() : pool(NULL), thread_id(-1) { }
+    SimpleThreadPoolTempl* pool;  // Parent pool, or null for normal threads.
+    int thread_id;                // Worker thread index in pool.
+  };
+
+  Environment env_;
+  std::mutex mu_;
+  MaxSizeVector<Thread*> threads_;  // All threads
+  MaxSizeVector<Waiter*> waiters_;  // Stack of waiting threads.
+  std::deque<Task> pending_;        // Queue of pending work
+  std::condition_variable empty_;   // Signaled on pending_.empty()
+  bool exiting_ = false;
+
+  PerThread* GetPerThread() const {
+    EIGEN_THREAD_LOCAL PerThread per_thread;
+    return &per_thread;
+  }
+};
+
+typedef SimpleThreadPoolTempl<StlThreadEnvironment> SimpleThreadPool;
+
+}  // namespace Eigen
+
+#endif  // EIGEN_CXX11_THREADPOOL_SIMPLE_THREAD_POOL_H
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h
new file mode 100644
index 000000000..399f95cc1
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h
@@ -0,0 +1,38 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_THREAD_ENVIRONMENT_H
+#define EIGEN_CXX11_THREADPOOL_THREAD_ENVIRONMENT_H
+
+namespace Eigen {
+
+struct StlThreadEnvironment {
+  struct Task {
+    std::function<void()> f;
+  };
+
+  // EnvThread constructor must start the thread,
+  // destructor must join the thread.
+  class EnvThread {
+   public:
+    EnvThread(std::function<void()> f) : thr_(std::move(f)) {}
+    ~EnvThread() { thr_.join(); }
+
+   private:
+    std::thread thr_;
+  };
+
+  EnvThread* CreateThread(std::function<void()> f) { return new EnvThread(std::move(f)); }
+  Task CreateTask(std::function<void()> f) { return Task{std::move(f)}; }
+  void ExecuteTask(const Task& t) { t.f(); }
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_CXX11_THREADPOOL_THREAD_ENVIRONMENT_H
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h
new file mode 100644
index 000000000..cfa221732
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h
@@ -0,0 +1,22 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
+#define EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
+
+// Try to come up with a portable implementation of thread local variables
+#if EIGEN_COMP_GNUC && EIGEN_GNUC_AT_MOST(4, 7)
+#define EIGEN_THREAD_LOCAL static __thread
+#elif EIGEN_COMP_CLANG
+#define EIGEN_THREAD_LOCAL static __thread
+#else
+#define EIGEN_THREAD_LOCAL static thread_local
+#endif
+
+#endif  // EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h
new file mode 100644
index 000000000..a65ee97c9
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h
@@ -0,0 +1,33 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_THREAD_POOL_INTERFACE_H
+#define EIGEN_CXX11_THREADPOOL_THREAD_POOL_INTERFACE_H
+
+namespace Eigen {
+
+// This defines an interface that ThreadPoolDevice can take to use
+// custom thread pools underneath.
+class ThreadPoolInterface {
+ public:
+  virtual void Schedule(std::function<void()> fn) = 0;
+
+  // Returns the number of threads in the pool.
+  virtual int NumThreads() const = 0;
+
+  // Returns a logical thread index between 0 and NumThreads() - 1 if called
+  // from one of the threads in the pool. Returns -1 otherwise.
+  virtual int CurrentThreadId() const = 0;
+
+  virtual ~ThreadPoolInterface() {}
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_CXX11_THREADPOOL_THREAD_POOL_INTERFACE_H
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadYield.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadYield.h
new file mode 100644
index 000000000..a859c7ba3
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadYield.h
@@ -0,0 +1,20 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H
+#define EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H
+
+// Try to come up with a portable way to yield
+#if EIGEN_COMP_GNUC && EIGEN_GNUC_AT_MOST(4, 7)
+#define EIGEN_THREAD_YIELD() sched_yield()
+#else
+#define EIGEN_THREAD_YIELD() std::this_thread::yield()
+#endif
+
+#endif  // EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H
diff --git a/unsupported/Eigen/CXX11/src/util/CXX11Meta.h b/unsupported/Eigen/CXX11/src/util/CXX11Meta.h
new file mode 100644
index 000000000..ec27eddb8
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/util/CXX11Meta.h
@@ -0,0 +1,542 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11META_H
+#define EIGEN_CXX11META_H
+
+#include <vector>
+#include "EmulateArray.h"
+
+// Emulate the cxx11 functionality that we need if the compiler doesn't support it.
+// Visual studio 2015 doesn't advertise itself as cxx11 compliant, although it
+// supports enough of the standard for our needs
+#if __cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900
+
+#include "CXX11Workarounds.h"
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal
+  * \file CXX11/util/CXX11Meta.h
+  * This file contains generic metaprogramming classes which are not specifically related to Eigen.
+  * This file expands upon Core/util/Meta.h and adds support for C++11 specific features.
+  */
+
+template<typename... tt>
+struct type_list { constexpr static int count = sizeof...(tt); };
+
+template<typename t, typename... tt>
+struct type_list<t, tt...> { constexpr static int count = sizeof...(tt) + 1; typedef t first_type; };
+
+template<typename T, T... nn>
+struct numeric_list { constexpr static std::size_t count = sizeof...(nn); };
+
+template<typename T, T n, T... nn>
+struct numeric_list<T, n, nn...> { constexpr static std::size_t count = sizeof...(nn) + 1; constexpr static T first_value = n; };
+
+/* numeric list constructors
+ *
+ * equivalencies:
+ *     constructor                                              result
+ *     typename gen_numeric_list<int, 5>::type                  numeric_list<int, 0,1,2,3,4>
+ *     typename gen_numeric_list_reversed<int, 5>::type         numeric_list<int, 4,3,2,1,0>
+ *     typename gen_numeric_list_swapped_pair<int, 5,1,2>::type numeric_list<int, 0,2,1,3,4>
+ *     typename gen_numeric_list_repeated<int, 0, 5>::type      numeric_list<int, 0,0,0,0,0>
+ */
+
+template<typename T, std::size_t n, T start = 0, T... ii> struct gen_numeric_list                     : gen_numeric_list<T, n-1, start, start + n-1, ii...> {};
+template<typename T, T start, T... ii>                    struct gen_numeric_list<T, 0, start, ii...> { typedef numeric_list<T, ii...> type; };
+
+template<typename T, std::size_t n, T start = 0, T... ii> struct gen_numeric_list_reversed                     : gen_numeric_list_reversed<T, n-1, start, ii..., start + n-1> {};
+template<typename T, T start, T... ii>                    struct gen_numeric_list_reversed<T, 0, start, ii...> { typedef numeric_list<T, ii...> type; };
+
+template<typename T, std::size_t n, T a, T b, T start = 0, T... ii> struct gen_numeric_list_swapped_pair                           : gen_numeric_list_swapped_pair<T, n-1, a, b, start, (start + n-1) == a ? b : ((start + n-1) == b ? a : (start + n-1)), ii...> {};
+template<typename T, T a, T b, T start, T... ii>                    struct gen_numeric_list_swapped_pair<T, 0, a, b, start, ii...> { typedef numeric_list<T, ii...> type; };
+
+template<typename T, std::size_t n, T V, T... nn> struct gen_numeric_list_repeated                 : gen_numeric_list_repeated<T, n-1, V, V, nn...> {};
+template<typename T, T V, T... nn>                struct gen_numeric_list_repeated<T, 0, V, nn...> { typedef numeric_list<T, nn...> type; };
+
+/* list manipulation: concatenate */
+
+template<class a, class b> struct concat;
+
+template<typename... as, typename... bs> struct concat<type_list<as...>,       type_list<bs...>>        { typedef type_list<as..., bs...> type; };
+template<typename T, T... as, T... bs>   struct concat<numeric_list<T, as...>, numeric_list<T, bs...> > { typedef numeric_list<T, as..., bs...> type; };
+
+template<typename... p> struct mconcat;
+template<typename a>                             struct mconcat<a>           { typedef a type; };
+template<typename a, typename b>                 struct mconcat<a, b>        : concat<a, b> {};
+template<typename a, typename b, typename... cs> struct mconcat<a, b, cs...> : concat<a, typename mconcat<b, cs...>::type> {};
+
+/* list manipulation: extract slices */
+
+template<int n, typename x> struct take;
+template<int n, typename a, typename... as> struct take<n, type_list<a, as...>> : concat<type_list<a>, typename take<n-1, type_list<as...>>::type> {};
+template<int n>                             struct take<n, type_list<>>         { typedef type_list<> type; };
+template<typename a, typename... as>        struct take<0, type_list<a, as...>> { typedef type_list<> type; };
+template<>                                  struct take<0, type_list<>>         { typedef type_list<> type; };
+
+template<typename T, int n, T a, T... as> struct take<n, numeric_list<T, a, as...>> : concat<numeric_list<T, a>, typename take<n-1, numeric_list<T, as...>>::type> {};
+template<typename T, int n>               struct take<n, numeric_list<T>>           { typedef numeric_list<T> type; };
+template<typename T, T a, T... as>        struct take<0, numeric_list<T, a, as...>> { typedef numeric_list<T> type; };
+template<typename T>                      struct take<0, numeric_list<T>>           { typedef numeric_list<T> type; };
+
+template<typename T, int n, T... ii>      struct h_skip_helper_numeric;
+template<typename T, int n, T i, T... ii> struct h_skip_helper_numeric<T, n, i, ii...> : h_skip_helper_numeric<T, n-1, ii...> {};
+template<typename T, T i, T... ii>        struct h_skip_helper_numeric<T, 0, i, ii...> { typedef numeric_list<T, i, ii...> type; };
+template<typename T, int n>               struct h_skip_helper_numeric<T, n>           { typedef numeric_list<T> type; };
+template<typename T>                      struct h_skip_helper_numeric<T, 0>           { typedef numeric_list<T> type; };
+
+template<int n, typename... tt>             struct h_skip_helper_type;
+template<int n, typename t, typename... tt> struct h_skip_helper_type<n, t, tt...> : h_skip_helper_type<n-1, tt...> {};
+template<typename t, typename... tt>        struct h_skip_helper_type<0, t, tt...> { typedef type_list<t, tt...> type; };
+template<int n>                             struct h_skip_helper_type<n>           { typedef type_list<> type; };
+template<>                                  struct h_skip_helper_type<0>           { typedef type_list<> type; };
+
+template<int n>
+struct h_skip {
+  template<typename T, T... ii>
+  constexpr static inline typename h_skip_helper_numeric<T, n, ii...>::type helper(numeric_list<T, ii...>) { return typename h_skip_helper_numeric<T, n, ii...>::type(); }
+  template<typename... tt>
+  constexpr static inline typename h_skip_helper_type<n, tt...>::type helper(type_list<tt...>) { return typename h_skip_helper_type<n, tt...>::type(); }
+};
+
+template<int n, typename a> struct skip { typedef decltype(h_skip<n>::helper(a())) type; };
+
+template<int start, int count, typename a> struct slice : take<count, typename skip<start, a>::type> {};
+
+/* list manipulation: retrieve single element from list */
+
+template<int n, typename x> struct get;
+
+template<int n, typename a, typename... as>               struct get<n, type_list<a, as...>>   : get<n-1, type_list<as...>> {};
+template<typename a, typename... as>                      struct get<0, type_list<a, as...>>   { typedef a type; };
+
+template<typename T, int n, T a, T... as>                        struct get<n, numeric_list<T, a, as...>>   : get<n-1, numeric_list<T, as...>> {};
+template<typename T, T a, T... as>                               struct get<0, numeric_list<T, a, as...>>   { constexpr static T value = a; };
+
+/* always get type, regardless of dummy; good for parameter pack expansion */
+
+template<typename T, T dummy, typename t> struct id_numeric  { typedef t type; };
+template<typename dummy, typename t>      struct id_type     { typedef t type; };
+
+/* equality checking, flagged version */
+
+template<typename a, typename b> struct is_same_gf : is_same<a, b> { constexpr static int global_flags = 0; };
+
+/* apply_op to list */
+
+template<
+  bool from_left, // false
+  template<typename, typename> class op,
+  typename additional_param,
+  typename... values
+>
+struct h_apply_op_helper                                        { typedef type_list<typename op<values, additional_param>::type...> type; };
+template<
+  template<typename, typename> class op,
+  typename additional_param,
+  typename... values
+>
+struct h_apply_op_helper<true, op, additional_param, values...> { typedef type_list<typename op<additional_param, values>::type...> type; };
+
+template<
+  bool from_left,
+  template<typename, typename> class op,
+  typename additional_param
+>
+struct h_apply_op
+{
+  template<typename... values>
+  constexpr static typename h_apply_op_helper<from_left, op, additional_param, values...>::type helper(type_list<values...>)
+  { return typename h_apply_op_helper<from_left, op, additional_param, values...>::type(); }
+};
+
+template<
+  template<typename, typename> class op,
+  typename additional_param,
+  typename a
+>
+struct apply_op_from_left { typedef decltype(h_apply_op<true, op, additional_param>::helper(a())) type; };
+
+template<
+  template<typename, typename> class op,
+  typename additional_param,
+  typename a
+>
+struct apply_op_from_right { typedef decltype(h_apply_op<false, op, additional_param>::helper(a())) type; };
+
+/* see if an element is in a list */
+
+template<
+  template<typename, typename> class test,
+  typename check_against,
+  typename h_list,
+  bool last_check_positive = false
+>
+struct contained_in_list;
+
+template<
+  template<typename, typename> class test,
+  typename check_against,
+  typename h_list
+>
+struct contained_in_list<test, check_against, h_list, true>
+{
+  constexpr static bool value = true;
+};
+
+template<
+  template<typename, typename> class test,
+  typename check_against,
+  typename a,
+  typename... as
+>
+struct contained_in_list<test, check_against, type_list<a, as...>, false> : contained_in_list<test, check_against, type_list<as...>, test<check_against, a>::value> {};
+
+template<
+  template<typename, typename> class test,
+  typename check_against
+  EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, empty)
+>
+struct contained_in_list<test, check_against, type_list<EIGEN_TPL_PP_SPEC_HACK_USE(empty)>, false> { constexpr static bool value = false; };
+
+/* see if an element is in a list and check for global flags */
+
+template<
+  template<typename, typename> class test,
+  typename check_against,
+  typename h_list,
+  int default_flags = 0,
+  bool last_check_positive = false,
+  int last_check_flags = default_flags
+>
+struct contained_in_list_gf;
+
+template<
+  template<typename, typename> class test,
+  typename check_against,
+  typename h_list,
+  int default_flags,
+  int last_check_flags
+>
+struct contained_in_list_gf<test, check_against, h_list, default_flags, true, last_check_flags>
+{
+  constexpr static bool value = true;
+  constexpr static int global_flags = last_check_flags;
+};
+
+template<
+  template<typename, typename> class test,
+  typename check_against,
+  typename a,
+  typename... as,
+  int default_flags,
+  int last_check_flags
+>
+struct contained_in_list_gf<test, check_against, type_list<a, as...>, default_flags, false, last_check_flags> : contained_in_list_gf<test, check_against, type_list<as...>, default_flags, test<check_against, a>::value, test<check_against, a>::global_flags> {};
+
+template<
+  template<typename, typename> class test,
+  typename check_against
+  EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, empty),
+  int default_flags,
+  int last_check_flags
+>
+struct contained_in_list_gf<test, check_against, type_list<EIGEN_TPL_PP_SPEC_HACK_USE(empty)>, default_flags, false, last_check_flags> { constexpr static bool value = false; constexpr static int global_flags = default_flags; };
+
+/* generic reductions */
+
+template<
+  typename Reducer,
+  typename... Ts
+> struct reduce;
+
+template<
+  typename Reducer
+> struct reduce<Reducer>
+{
+  constexpr static inline int run() { return Reducer::Identity; }
+};
+
+template<
+  typename Reducer,
+  typename A
+> struct reduce<Reducer, A>
+{
+  constexpr static inline A run(A a) { return a; }
+};
+
+template<
+  typename Reducer,
+  typename A,
+  typename... Ts
+> struct reduce<Reducer, A, Ts...>
+{
+  constexpr static inline auto run(A a, Ts... ts) -> decltype(Reducer::run(a, reduce<Reducer, Ts...>::run(ts...))) {
+    return Reducer::run(a, reduce<Reducer, Ts...>::run(ts...));
+  }
+};
+
+/* generic binary operations */
+
+struct sum_op           {
+  template<typename A, typename B> EIGEN_DEVICE_FUNC constexpr static inline auto run(A a, B b) -> decltype(a + b)   { return a + b;   }
+  static constexpr int Identity = 0;
+};
+struct product_op       {
+  template<typename A, typename B> EIGEN_DEVICE_FUNC constexpr static inline auto run(A a, B b) -> decltype(a * b)   { return a * b;   }
+  static constexpr int Identity = 1;
+};
+
+struct logical_and_op   { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a && b)  { return a && b;  } };
+struct logical_or_op    { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a || b)  { return a || b;  } };
+
+struct equal_op         { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a == b)  { return a == b;  } };
+struct not_equal_op     { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a != b)  { return a != b;  } };
+struct lesser_op        { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a < b)   { return a < b;   } };
+struct lesser_equal_op  { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a <= b)  { return a <= b;  } };
+struct greater_op       { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a > b)   { return a > b;   } };
+struct greater_equal_op { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a >= b)  { return a >= b;  } };
+
+/* generic unary operations */
+
+struct not_op                { template<typename A> constexpr static inline auto run(A a) -> decltype(!a)      { return !a;      } };
+struct negation_op           { template<typename A> constexpr static inline auto run(A a) -> decltype(-a)      { return -a;      } };
+struct greater_equal_zero_op { template<typename A> constexpr static inline auto run(A a) -> decltype(a >= 0)  { return a >= 0;  } };
+
+
+/* reductions for lists */
+
+// using auto -> return value spec makes ICC 13.0 and 13.1 crash here, so we have to hack it
+// together in front... (13.0 doesn't work with array_prod/array_reduce/... anyway, but 13.1
+// does...
+template<typename... Ts>
+constexpr inline decltype(reduce<product_op, Ts...>::run((*((Ts*)0))...)) arg_prod(Ts... ts)
+{
+  return reduce<product_op, Ts...>::run(ts...);
+}
+
+template<typename... Ts>
+constexpr inline decltype(reduce<sum_op, Ts...>::run((*((Ts*)0))...)) arg_sum(Ts... ts)
+{
+  return reduce<sum_op, Ts...>::run(ts...);
+}
+
+/* reverse arrays */
+
+template<typename Array, int... n>
+constexpr inline Array h_array_reverse(Array arr, numeric_list<int, n...>)
+{
+  return {{array_get<sizeof...(n) - n - 1>(arr)...}};
+}
+
+template<typename T, std::size_t N>
+constexpr inline array<T, N> array_reverse(array<T, N> arr)
+{
+  return h_array_reverse(arr, typename gen_numeric_list<int, N>::type());
+}
+
+
+/* generic array reductions */
+
+// can't reuse standard reduce() interface above because Intel's Compiler
+// *really* doesn't like it, so we just reimplement the stuff
+// (start from N - 1 and work down to 0 because specialization for
+// n == N - 1 also doesn't work in Intel's compiler, so it goes into
+// an infinite loop)
+template<typename Reducer, typename T, std::size_t N, std::size_t n = N - 1>
+struct h_array_reduce {
+  EIGEN_DEVICE_FUNC constexpr static inline auto run(array<T, N> arr, T identity) -> decltype(Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr, identity), array_get<n>(arr)))
+  {
+    return Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr, identity), array_get<n>(arr));
+  }
+};
+
+template<typename Reducer, typename T, std::size_t N>
+struct h_array_reduce<Reducer, T, N, 0>
+{
+  EIGEN_DEVICE_FUNC constexpr static inline T run(const array<T, N>& arr, T)
+  {
+    return array_get<0>(arr);
+  }
+};
+
+template<typename Reducer, typename T>
+struct h_array_reduce<Reducer, T, 0>
+{
+  EIGEN_DEVICE_FUNC constexpr static inline T run(const array<T, 0>&, T identity)
+  {
+    return identity;
+  }
+};
+
+template<typename Reducer, typename T, std::size_t N>
+EIGEN_DEVICE_FUNC constexpr inline auto array_reduce(const array<T, N>& arr, T identity) -> decltype(h_array_reduce<Reducer, T, N>::run(arr, identity))
+{
+  return h_array_reduce<Reducer, T, N>::run(arr, identity);
+}
+
+/* standard array reductions */
+
+template<typename T, std::size_t N>
+EIGEN_DEVICE_FUNC constexpr inline auto array_sum(const array<T, N>& arr) -> decltype(array_reduce<sum_op, T, N>(arr, static_cast<T>(0)))
+{
+  return array_reduce<sum_op, T, N>(arr, static_cast<T>(0));
+}
+
+template<typename T, std::size_t N>
+EIGEN_DEVICE_FUNC constexpr inline auto array_prod(const array<T, N>& arr) -> decltype(array_reduce<product_op, T, N>(arr, static_cast<T>(1)))
+{
+  return array_reduce<product_op, T, N>(arr, static_cast<T>(1));
+}
+
+template<typename t>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector<t>& a) {
+  eigen_assert(a.size() > 0);
+  t prod = 1;
+  for (size_t i = 0; i < a.size(); ++i) { prod *= a[i]; }
+  return prod;
+}
+
+/* zip an array */
+
+template<typename Op, typename A, typename B, std::size_t N, int... n>
+constexpr inline array<decltype(Op::run(A(), B())),N> h_array_zip(array<A, N> a, array<B, N> b, numeric_list<int, n...>)
+{
+  return array<decltype(Op::run(A(), B())),N>{{ Op::run(array_get<n>(a), array_get<n>(b))... }};
+}
+
+template<typename Op, typename A, typename B, std::size_t N>
+constexpr inline array<decltype(Op::run(A(), B())),N> array_zip(array<A, N> a, array<B, N> b)
+{
+  return h_array_zip<Op>(a, b, typename gen_numeric_list<int, N>::type());
+}
+
+/* zip an array and reduce the result */
+
+template<typename Reducer, typename Op, typename A, typename B, std::size_t N, int... n>
+constexpr inline auto h_array_zip_and_reduce(array<A, N> a, array<B, N> b, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A(), B()))>::type...>::run(Op::run(array_get<n>(a), array_get<n>(b))...))
+{
+  return reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A(), B()))>::type...>::run(Op::run(array_get<n>(a), array_get<n>(b))...);
+}
+
+template<typename Reducer, typename Op, typename A, typename B, std::size_t N>
+constexpr inline auto array_zip_and_reduce(array<A, N> a, array<B, N> b) -> decltype(h_array_zip_and_reduce<Reducer, Op, A, B, N>(a, b, typename gen_numeric_list<int, N>::type()))
+{
+  return h_array_zip_and_reduce<Reducer, Op, A, B, N>(a, b, typename gen_numeric_list<int, N>::type());
+}
+
+/* apply stuff to an array */
+
+template<typename Op, typename A, std::size_t N, int... n>
+constexpr inline array<decltype(Op::run(A())),N> h_array_apply(array<A, N> a, numeric_list<int, n...>)
+{
+  return array<decltype(Op::run(A())),N>{{ Op::run(array_get<n>(a))... }};
+}
+
+template<typename Op, typename A, std::size_t N>
+constexpr inline array<decltype(Op::run(A())),N> array_apply(array<A, N> a)
+{
+  return h_array_apply<Op>(a, typename gen_numeric_list<int, N>::type());
+}
+
+/* apply stuff to an array and reduce */
+
+template<typename Reducer, typename Op, typename A, std::size_t N, int... n>
+constexpr inline auto h_array_apply_and_reduce(array<A, N> arr, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A()))>::type...>::run(Op::run(array_get<n>(arr))...))
+{
+  return reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A()))>::type...>::run(Op::run(array_get<n>(arr))...);
+}
+
+template<typename Reducer, typename Op, typename A, std::size_t N>
+constexpr inline auto array_apply_and_reduce(array<A, N> a) -> decltype(h_array_apply_and_reduce<Reducer, Op, A, N>(a, typename gen_numeric_list<int, N>::type()))
+{
+  return h_array_apply_and_reduce<Reducer, Op, A, N>(a, typename gen_numeric_list<int, N>::type());
+}
+
+/* repeat a value n times (and make an array out of it
+ * usage:
+ *   array<int, 16> = repeat<16>(42);
+ */
+
+template<int n>
+struct h_repeat
+{
+  template<typename t, int... ii>
+  constexpr static inline array<t, n> run(t v, numeric_list<int, ii...>)
+  {
+    return {{ typename id_numeric<int, ii, t>::type(v)... }};
+  }
+};
+
+template<int n, typename t>
+constexpr array<t, n> repeat(t v) { return h_repeat<n>::run(v, typename gen_numeric_list<int, n>::type()); }
+
+/* instantiate a class by a C-style array */
+template<class InstType, typename ArrType, std::size_t N, bool Reverse, typename... Ps>
+struct h_instantiate_by_c_array;
+
+template<class InstType, typename ArrType, std::size_t N, typename... Ps>
+struct h_instantiate_by_c_array<InstType, ArrType, N, false, Ps...>
+{
+  static InstType run(ArrType* arr, Ps... args)
+  {
+    return h_instantiate_by_c_array<InstType, ArrType, N - 1, false, Ps..., ArrType>::run(arr + 1, args..., arr[0]);
+  }
+};
+
+template<class InstType, typename ArrType, std::size_t N, typename... Ps>
+struct h_instantiate_by_c_array<InstType, ArrType, N, true, Ps...>
+{
+  static InstType run(ArrType* arr, Ps... args)
+  {
+    return h_instantiate_by_c_array<InstType, ArrType, N - 1, false, ArrType, Ps...>::run(arr + 1, arr[0], args...);
+  }
+};
+
+template<class InstType, typename ArrType, typename... Ps>
+struct h_instantiate_by_c_array<InstType, ArrType, 0, false, Ps...>
+{
+  static InstType run(ArrType* arr, Ps... args)
+  {
+    (void)arr;
+    return InstType(args...);
+  }
+};
+
+template<class InstType, typename ArrType, typename... Ps>
+struct h_instantiate_by_c_array<InstType, ArrType, 0, true, Ps...>
+{
+  static InstType run(ArrType* arr, Ps... args)
+  {
+    (void)arr;
+    return InstType(args...);
+  }
+};
+
+template<class InstType, typename ArrType, std::size_t N, bool Reverse = false>
+InstType instantiate_by_c_array(ArrType* arr)
+{
+  return h_instantiate_by_c_array<InstType, ArrType, N, Reverse>::run(arr);
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#else // Non C++11, fallback to emulation mode
+
+#include "EmulateCXX11Meta.h"
+
+#endif
+
+#endif // EIGEN_CXX11META_H
diff --git a/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h b/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h
new file mode 100644
index 000000000..fe4d22803
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h
@@ -0,0 +1,88 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11WORKAROUNDS_H
+#define EIGEN_CXX11WORKAROUNDS_H
+
+/* COMPATIBILITY CHECKS
+ * (so users of compilers that are too old get some realistic error messages)
+ */
+#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 1310)
+#error Intel Compiler only supports required C++ features since version 13.1.
+// note that most stuff in principle works with 13.0 but when combining
+// some features, at some point 13.0 will just fail with an internal assertion
+#elif defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6))
+// G++ < 4.6 by default will continue processing the source files - even if we use #error to make
+// it error out. For this reason, we use the pragma to make sure G++ aborts at the first error
+// it sees. Unfortunately, that is still not our #error directive, but at least the output is
+// short enough the user has a chance to see that the compiler version is not sufficient for
+// the funky template mojo we use.
+#pragma GCC diagnostic error "-Wfatal-errors"
+#error GNU C++ Compiler (g++) only supports required C++ features since version 4.6.
+#endif
+
+/* Check that the compiler at least claims to support C++11. It might not be sufficient
+ * because the compiler may not implement it correctly, but at least we'll know.
+ * On the other hand, visual studio still doesn't claim to support C++11 although it's
+ * compliant enugh for our purpose.
+ */
+#if (__cplusplus <= 199711L) && (EIGEN_COMP_MSVC < 1900)
+#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER)
+#pragma GCC diagnostic error "-Wfatal-errors"
+#endif
+#error This library needs at least a C++11 compliant compiler. If you use g++/clang, please enable the -std=c++11 compiler flag. (-std=c++0x on older versions.)
+#endif
+
+namespace Eigen {
+
+namespace internal {
+
+/* std::get is only constexpr in C++14, not yet in C++11
+ */
+
+
+template<std::size_t I, class T> constexpr inline T&       array_get(std::vector<T>&       a) { return a[I]; }
+template<std::size_t I, class T> constexpr inline T&&      array_get(std::vector<T>&&      a) { return a[I]; }
+template<std::size_t I, class T> constexpr inline T const& array_get(std::vector<T> const& a) { return a[I]; }
+
+/* Suppose you have a template of the form
+ * template<typename T> struct X;
+ * And you want to specialize it in such a way:
+ *    template<typename S1, typename... SN> struct X<Foo<S1, SN...>> { ::: };
+ *    template<>                            struct X<Foo<>>          { ::: };
+ * This will work in Intel's compiler 13.0, but only to some extent in g++ 4.6, since
+ * g++ can only match templates called with parameter packs if the number of template
+ * arguments is not a fixed size (so inside the first specialization, referencing
+ * X<Foo<Sn...>> will fail in g++). On the other hand, g++ will accept the following:
+ *    template<typename S...> struct X<Foo<S...>> { ::: }:
+ * as an additional (!) specialization, which will then only match the empty case.
+ * But Intel's compiler 13.0 won't accept that, it will only accept the empty syntax,
+ * so we have to create a workaround for this.
+ */
+#if defined(__GNUC__) && !defined(__INTEL_COMPILER)
+#define EIGEN_TPL_PP_SPEC_HACK_DEF(mt, n)    mt... n
+#define EIGEN_TPL_PP_SPEC_HACK_DEFC(mt, n)   , EIGEN_TPL_PP_SPEC_HACK_DEF(mt, n)
+#define EIGEN_TPL_PP_SPEC_HACK_USE(n)        n...
+#define EIGEN_TPL_PP_SPEC_HACK_USEC(n)       , n...
+#else
+#define EIGEN_TPL_PP_SPEC_HACK_DEF(mt, n)
+#define EIGEN_TPL_PP_SPEC_HACK_DEFC(mt, n)
+#define EIGEN_TPL_PP_SPEC_HACK_USE(n)
+#define EIGEN_TPL_PP_SPEC_HACK_USEC(n)
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11WORKAROUNDS_H
+
+/*
+ * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
+ */
diff --git a/unsupported/Eigen/CXX11/src/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/util/EmulateArray.h
new file mode 100644
index 000000000..30d3ebcff
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/util/EmulateArray.h
@@ -0,0 +1,267 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_EMULATE_ARRAY_H
+#define EIGEN_EMULATE_ARRAY_H
+
+
+
+// The array class is only available starting with cxx11. Emulate our own here
+// if needed. Beware, msvc still doesn't advertise itself as a c++11 compiler!
+// Moreover, CUDA doesn't support the STL containers, so we use our own instead.
+#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(__CUDACC__) || defined(EIGEN_AVOID_STL_ARRAY)
+
+namespace Eigen {
+template <typename T, size_t n> class array {
+ public:
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE T& operator[] (size_t index) { return values[index]; }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE const T& operator[] (size_t index) const { return values[index]; }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE T& front() { return values[0]; }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE const T& front() const { return values[0]; }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE T& back() { return values[n-1]; }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE const T& back() const { return values[n-1]; }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+  static std::size_t size() { return n; }
+
+  T values[n];
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE array() { }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE array(const T& v) {
+    EIGEN_STATIC_ASSERT(n==1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    values[0] = v;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE array(const T& v1, const T& v2) {
+    EIGEN_STATIC_ASSERT(n==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    values[0] = v1;
+    values[1] = v2;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3) {
+    EIGEN_STATIC_ASSERT(n==3, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    values[0] = v1;
+    values[1] = v2;
+    values[2] = v3;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3,
+                            const T& v4) {
+    EIGEN_STATIC_ASSERT(n==4, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    values[0] = v1;
+    values[1] = v2;
+    values[2] = v3;
+    values[3] = v4;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4,
+                            const T& v5) {
+    EIGEN_STATIC_ASSERT(n==5, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    values[0] = v1;
+    values[1] = v2;
+    values[2] = v3;
+    values[3] = v4;
+    values[4] = v5;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4,
+                            const T& v5, const T& v6) {
+    EIGEN_STATIC_ASSERT(n==6, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    values[0] = v1;
+    values[1] = v2;
+    values[2] = v3;
+    values[3] = v4;
+    values[4] = v5;
+    values[5] = v6;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4,
+                            const T& v5, const T& v6, const T& v7) {
+    EIGEN_STATIC_ASSERT(n==7, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    values[0] = v1;
+    values[1] = v2;
+    values[2] = v3;
+    values[3] = v4;
+    values[4] = v5;
+    values[5] = v6;
+    values[6] = v7;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE array(
+      const T& v1, const T& v2, const T& v3, const T& v4,
+      const T& v5, const T& v6, const T& v7, const T& v8) {
+    EIGEN_STATIC_ASSERT(n==8, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    values[0] = v1;
+    values[1] = v2;
+    values[2] = v3;
+    values[3] = v4;
+    values[4] = v5;
+    values[5] = v6;
+    values[6] = v7;
+    values[7] = v8;
+  }
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE array(std::initializer_list<T> l) {
+    eigen_assert(l.size() == n);
+    internal::smart_copy(l.begin(), l.end(), values);
+  }
+#endif
+};
+
+
+// Specialize array for zero size
+template <typename T> class array<T, 0> {
+ public:
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE T& operator[] (size_t) {
+    eigen_assert(false && "Can't index a zero size array");
+    return dummy;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE const T& operator[] (size_t) const {
+    eigen_assert(false && "Can't index a zero size array");
+    return dummy;
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE T& front() {
+    eigen_assert(false && "Can't index a zero size array");
+    return dummy;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE const T& front() const {
+    eigen_assert(false && "Can't index a zero size array");
+    return dummy;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE T& back() {
+    eigen_assert(false && "Can't index a zero size array");
+    return dummy;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE const T& back() const {
+    eigen_assert(false && "Can't index a zero size array");
+    return dummy;
+  }
+
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::size_t size() { return 0; }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE array() : dummy() { }
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+  EIGEN_DEVICE_FUNC array(std::initializer_list<T> l) : dummy() {
+    eigen_assert(l.size() == 0);
+  }
+#endif
+
+ private:
+  T dummy;
+};
+
+// Comparison operator
+// Todo: implement !=, <, <=, >,  and >=
+template<class T, std::size_t N>
+EIGEN_DEVICE_FUNC bool operator==(const array<T,N>& lhs, const array<T,N>& rhs) {
+  for (std::size_t i = 0; i < N; ++i) {
+    if (lhs[i] != rhs[i]) {
+      return false;
+    }
+  }
+  return true;
+}
+
+
+namespace internal {
+template<std::size_t I, class T, std::size_t N>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(array<T,N>& a) {
+  return a[I];
+}
+template<std::size_t I, class T, std::size_t N>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const array<T,N>& a) {
+  return a[I];
+}
+
+template <typename T> struct array_size;
+template<class T, std::size_t N> struct array_size<array<T,N> > {
+  static const size_t value = N;
+};
+template <typename T> struct array_size;
+template<class T, std::size_t N> struct array_size<array<T,N>& > {
+  static const size_t value = N;
+};
+template <typename T> struct array_size;
+template<class T, std::size_t N> struct array_size<const array<T,N> > {
+  static const size_t value = N;
+};
+template <typename T> struct array_size;
+template<class T, std::size_t N> struct array_size<const array<T,N>& > {
+  static const size_t value = N;
+};
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#else
+
+// The compiler supports c++11, and we're not targetting cuda: use std::array as Eigen::array
+#include <array>
+namespace Eigen {
+
+template <typename T, std::size_t N> using array = std::array<T, N>;
+
+namespace internal {
+/* std::get is only constexpr in C++14, not yet in C++11
+ *     - libstdc++ from version 4.7 onwards has it nevertheless,
+ *                                          so use that
+ *     - libstdc++ older versions: use _M_instance directly
+ *     - libc++ all versions so far: use __elems_ directly
+ *     - all other libs: use std::get to be portable, but
+ *                       this may not be constexpr
+ */
+#if defined(__GLIBCXX__) && __GLIBCXX__ < 20120322
+#define STD_GET_ARR_HACK             a._M_instance[I]
+#elif defined(_LIBCPP_VERSION)
+#define STD_GET_ARR_HACK             a.__elems_[I]
+#else
+#define STD_GET_ARR_HACK             std::template get<I, T, N>(a)
+#endif
+
+template<std::size_t I, class T, std::size_t N> constexpr inline T&       array_get(std::array<T,N>&       a) { return (T&)       STD_GET_ARR_HACK; }
+template<std::size_t I, class T, std::size_t N> constexpr inline T&&      array_get(std::array<T,N>&&      a) { return (T&&)      STD_GET_ARR_HACK; }
+template<std::size_t I, class T, std::size_t N> constexpr inline T const& array_get(std::array<T,N> const& a) { return (T const&) STD_GET_ARR_HACK; }
+
+#undef STD_GET_ARR_HACK
+
+template <typename T> struct array_size;
+template<class T, std::size_t N> struct array_size<const std::array<T,N> > {
+  static const size_t value = N;
+};
+template <typename T> struct array_size;
+template<class T, std::size_t N> struct array_size<std::array<T,N> > {
+  static const size_t value = N;
+};
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif
+
+#endif  // EIGEN_EMULATE_ARRAY_H
diff --git a/unsupported/Eigen/CXX11/src/util/EmulateCXX11Meta.h b/unsupported/Eigen/CXX11/src/util/EmulateCXX11Meta.h
new file mode 100644
index 000000000..f3aa1b144
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/util/EmulateCXX11Meta.h
@@ -0,0 +1,311 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_EMULATE_CXX11_META_H
+#define EIGEN_EMULATE_CXX11_META_H
+
+
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal
+  * \file CXX11/util/EmulateCXX11Meta.h
+  * This file emulates a subset of the functionality provided by CXXMeta.h for
+  * compilers that don't yet support cxx11 such as nvcc.
+  */
+
+struct empty_list { static const std::size_t count = 0; };
+
+template<typename T, typename Tail=empty_list> struct type_list {
+  typedef T HeadType;
+  typedef Tail TailType;
+  static const T head;
+  static const Tail tail;
+  static const std::size_t count = 1 + Tail::count;
+};
+
+struct null_type { };
+
+template<typename T1 = null_type, typename T2 = null_type, typename T3 = null_type,
+         typename T4 = null_type, typename T5 = null_type, typename T6 = null_type,
+         typename T7 = null_type, typename T8 = null_type>
+struct make_type_list {
+  typedef typename make_type_list<T2, T3, T4, T5, T6, T7, T8>::type tailresult;
+
+  typedef type_list<T1, tailresult> type;
+};
+
+template<> struct make_type_list<> {
+  typedef empty_list type;
+};
+
+
+template <std::size_t index, class TList> struct get_type;
+
+template <class Head, class Tail>
+struct get_type<0, type_list<Head, Tail> >
+{
+  typedef Head type;
+};
+
+template <std::size_t i, class Head, class Tail>
+struct get_type<i, type_list<Head, Tail> >
+{
+  typedef typename get_type<i-1, Tail>::type type;
+};
+
+
+/* numeric list */
+template <typename T, T n>
+struct type2val {
+  typedef T type;
+  static const T value = n;
+};
+
+
+template<typename T, size_t n, T V> struct gen_numeric_list_repeated;
+
+template<typename T, T V> struct gen_numeric_list_repeated<T, 1, V> {
+  typedef typename make_type_list<type2val<T, V> >::type type;
+};
+
+template<typename T, T V> struct gen_numeric_list_repeated<T, 2, V> {
+  typedef typename make_type_list<type2val<T, V>, type2val<T, V> >::type type;
+};
+
+template<typename T, T V> struct gen_numeric_list_repeated<T, 3, V> {
+  typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type;
+};
+
+template<typename T, T V> struct gen_numeric_list_repeated<T, 4, V> {
+  typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type;
+};
+
+template<typename T, T V> struct gen_numeric_list_repeated<T, 5, V> {
+  typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type;
+};
+
+template<typename T, T V> struct gen_numeric_list_repeated<T, 6, V> {
+  typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>,
+                                  type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type;
+};
+
+template<typename T, T V> struct gen_numeric_list_repeated<T, 7, V> {
+  typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>,
+                                  type2val<T, V>, type2val<T, V>, type2val<T, V>,
+                                  type2val<T, V> >::type type;
+};
+
+template<typename T, T V> struct gen_numeric_list_repeated<T, 8, V> {
+  typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>,
+                                  type2val<T, V>, type2val<T, V>, type2val<T, V>,
+                                  type2val<T, V>, type2val<T, V> >::type type;
+};
+
+
+template <std::size_t index, class NList> struct get;
+
+template <std::size_t i>
+struct get<i, empty_list>
+{
+  get() { eigen_assert(false && "index overflow"); }
+  typedef void type;
+  static const char value = '\0';
+};
+
+template <std::size_t i, class Head>
+struct get<i, type_list<Head, empty_list> >
+{
+  get() { eigen_assert(false && "index overflow"); }
+  typedef void type;
+  static const char value = '\0';
+};
+
+template <class Head>
+struct get<0, type_list<Head, empty_list> >
+{
+  typedef typename Head::type type;
+  static const type value = Head::value;
+};
+
+template <class Head, class Tail>
+struct get<0, type_list<Head, Tail> >
+{
+  typedef typename Head::type type;
+  static const type value = Head::value;
+};
+
+template <std::size_t i, class Head, class Tail>
+struct get<i, type_list<Head, Tail> >
+{
+  typedef typename Tail::HeadType::type type;
+  static const type value = get<i-1, Tail>::value;
+};
+
+
+template <class NList> struct arg_prod {
+  static const typename NList::HeadType::type value = get<0, NList>::value * arg_prod<typename NList::TailType>::value;
+};
+template <> struct arg_prod<empty_list> {
+  static const int value = 1;
+};
+
+
+template<int n, typename t>
+array<t, n> repeat(t v) {
+  array<t, n> array;
+  array.fill(v);
+  return array;
+}
+
+template<std::size_t I, class Head, class Tail>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(type_list<Head, Tail>&) {
+  return get<I, type_list<Head, Tail> >::value;
+}
+template<std::size_t I, class Head, class Tail>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(const type_list<Head, Tail>&) {
+  return get<I, type_list<Head, Tail> >::value;
+}
+
+template <class NList>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NList::HeadType::type array_prod(const NList&) {
+  return arg_prod<NList>::value;
+}
+
+template<typename t, std::size_t n>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array<t, n>& a) {
+  t prod = 1;
+  for (size_t i = 0; i < n; ++i) { prod *= a[i]; }
+  return prod;
+}
+template<typename t>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array<t, 0>& /*a*/) {
+  return 0;
+}
+
+template<typename t>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector<t>& a) {
+  eigen_assert(a.size() > 0);
+  t prod = 1;
+  for (size_t i = 0; i < a.size(); ++i) { prod *= a[i]; }
+  return prod;
+}
+
+
+template<std::size_t I, class T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(std::vector<T>& a) {
+  return a[I];
+}
+template<std::size_t I, class T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const std::vector<T>& a) {
+  return a[I];
+}
+
+struct sum_op {
+  template<typename A, typename B> static inline bool run(A a, B b) { return a + b; }
+};
+struct product_op {
+  template<typename A, typename B> static inline bool run(A a, B b) { return a * b; }
+};
+
+struct logical_and_op {
+  template<typename A, typename B> static inline bool run(A a, B b) { return a && b; }
+};
+struct logical_or_op {
+  template<typename A, typename B> static inline bool run(A a, B b) { return a || b; }
+};
+
+struct equal_op {
+  template<typename A, typename B> static inline bool run(A a, B b) { return a == b; }
+};
+struct not_equal_op {
+  template<typename A, typename B> static inline bool run(A a, B b) { return a != b; }
+};
+struct lesser_op {
+  template<typename A, typename B> static inline bool run(A a, B b) { return a < b; }
+};
+struct lesser_equal_op {
+  template<typename A, typename B> static inline bool run(A a, B b) { return a <= b; }
+};
+
+struct greater_op {
+  template<typename A, typename B> static inline bool run(A a, B b) { return a > b; }
+};
+struct greater_equal_op {
+  template<typename A, typename B> static inline bool run(A a, B b) { return a >= b; }
+};
+
+struct not_op {
+  template<typename A> static inline bool run(A a) { return !a; }
+};
+struct negation_op {
+  template<typename A> static inline bool run(A a) { return -a; }
+};
+struct greater_equal_zero_op {
+  template<typename A> static inline bool run(A a) { return a >= 0; }
+};
+
+
+template<typename Reducer, typename Op, typename A, std::size_t N>
+struct ArrayApplyAndReduce {
+  static inline bool run(const array<A, N>& a) {
+    EIGEN_STATIC_ASSERT(N >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    bool result = Reducer::run(Op::run(a[0]), Op::run(a[1]));
+    for (size_t i = 2; i < N; ++i) {
+      result = Reducer::run(result, Op::run(a[i]));
+    }
+    return result;
+  }
+};
+
+template<typename Reducer, typename Op, typename A>
+struct ArrayApplyAndReduce<Reducer, Op, A, 1>  {
+  static inline bool run(const array<A, 1>& a) {
+    return Op::run(a[0]);
+  }
+};
+
+template<typename Reducer, typename Op, typename A, std::size_t N>
+inline bool array_apply_and_reduce(const array<A, N>& a) {
+  return ArrayApplyAndReduce<Reducer, Op, A, N>::run(a);
+}
+
+template<typename Reducer, typename Op, typename A, typename B, std::size_t N>
+struct ArrayZipAndReduce {
+  static inline bool run(const array<A, N>& a, const array<B, N>& b) {
+    EIGEN_STATIC_ASSERT(N >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    bool result = Reducer::run(Op::run(a[0], b[0]), Op::run(a[1], b[1]));
+    for (size_t i = 2; i < N; ++i) {
+      result = Reducer::run(result, Op::run(a[i], b[i]));
+    }
+    return result;
+  }
+};
+
+template<typename Reducer, typename Op, typename A, typename B>
+struct ArrayZipAndReduce<Reducer, Op, A, B, 1> {
+  static inline bool run(const array<A, 1>& a, const array<B, 1>& b) {
+    return Op::run(a[0], b[0]);
+  }
+};
+
+template<typename Reducer, typename Op, typename A, typename B, std::size_t N>
+inline bool array_zip_and_reduce(const array<A, N>& a, const array<B, N>& b) {
+  return ArrayZipAndReduce<Reducer, Op, A, B, N>::run(a, b);
+}
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+
+
+#endif  // EIGEN_EMULATE_CXX11_META_H
diff --git a/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h b/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h
new file mode 100644
index 000000000..4bc3dd1ba
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h
@@ -0,0 +1,141 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_FIXEDSIZEVECTOR_H
+#define EIGEN_FIXEDSIZEVECTOR_H
+
+namespace Eigen {
+
+/** \class MaxSizeVector
+  * \ingroup Core
+  *
+  * \brief The MaxSizeVector class.
+  *
+  * The %MaxSizeVector provides a subset of std::vector functionality.
+  *
+  * The goal is to provide basic std::vector operations when using
+  * std::vector is not an option (e.g. on GPU or when compiling using
+  * FMA/AVX, as this can cause either compilation failures or illegal
+  * instruction failures).
+  *
+  * Beware: The constructors are not API compatible with these of
+  * std::vector.
+  */
+template <typename T>
+class MaxSizeVector {
+ public:
+  // Construct a new MaxSizeVector, reserve n elements.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit MaxSizeVector(size_t n)
+      : reserve_(n), size_(0),
+        data_(static_cast<T*>(internal::aligned_malloc(n * sizeof(T)))) {
+    for (size_t i = 0; i < n; ++i) { new (&data_[i]) T; }
+  }
+
+  // Construct a new MaxSizeVector, reserve and resize to n.
+  // Copy the init value to all elements.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  MaxSizeVector(size_t n, const T& init)
+      : reserve_(n), size_(n),
+        data_(static_cast<T*>(internal::aligned_malloc(n * sizeof(T)))) {
+    for (size_t i = 0; i < n; ++i) { new (&data_[i]) T(init); }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  ~MaxSizeVector() {
+    for (size_t i = 0; i < size_; ++i) {
+      data_[i].~T();
+    }
+    internal::aligned_free(data_);
+  }
+
+  void resize(size_t n) {
+    eigen_assert(n <= reserve_);
+    for (size_t i = size_; i < n; ++i) {
+      new (&data_[i]) T;
+    }
+    for (size_t i = n; i < size_; ++i) {
+      data_[i].~T();
+    }
+    size_ = n;
+  }
+
+  // Append new elements (up to reserved size).
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void push_back(const T& t) {
+    eigen_assert(size_ < reserve_);
+    data_[size_++] = t;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const T& operator[] (size_t i) const {
+    eigen_assert(i < size_);
+    return data_[i];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  T& operator[] (size_t i) {
+    eigen_assert(i < size_);
+    return data_[i];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  T& back() {
+    eigen_assert(size_ > 0);
+    return data_[size_ - 1];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const T& back() const {
+    eigen_assert(size_ > 0);
+    return data_[size_ - 1];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void pop_back() {
+    // NOTE: This does not destroy the value at the end the way
+    // std::vector's version of pop_back() does.  That happens when
+    // the Vector is destroyed.
+    eigen_assert(size_ > 0);
+    size_--;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  size_t size() const { return size_; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  bool empty() const { return size_ == 0; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  T* data() { return data_; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const T* data() const { return data_; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  T* begin() { return data_; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  T* end() { return data_ + size_; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const T* begin() const { return data_; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const T* end() const { return data_ + size_; }
+
+ private:
+  size_t reserve_;
+  size_t size_;
+  T* data_;
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_FIXEDSIZEVECTOR_H
diff --git a/unsupported/Eigen/EulerAngles b/unsupported/Eigen/EulerAngles
new file mode 100644
index 000000000..521fa3f76
--- /dev/null
+++ b/unsupported/Eigen/EulerAngles
@@ -0,0 +1,43 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Tal Hadad <tal_hd@hotmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_EULERANGLES_MODULE_H
+#define EIGEN_EULERANGLES_MODULE_H
+
+
+#include "Eigen/Core"
+#include "Eigen/Geometry"
+
+#include "Eigen/src/Core/util/DisableStupidWarnings.h"
+
+namespace Eigen {
+
+/**
+  * \defgroup EulerAngles_Module EulerAngles module
+  * \brief This module provides generic euler angles rotation.
+  *
+  * Euler angles are a way to represent 3D rotation.
+  *
+  * In order to use this module in your code, include this header:
+  * \code
+  * #include <unsupported/Eigen/EulerAngles>
+  * \endcode
+  *
+  * See \ref EulerAngles for more information.
+  *
+  */
+
+}
+
+#include "src/EulerAngles/EulerSystem.h"
+#include "src/EulerAngles/EulerAngles.h"
+
+#include "Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_EULERANGLES_MODULE_H
diff --git a/unsupported/Eigen/IterativeSolvers b/unsupported/Eigen/IterativeSolvers
index aa15403db..31e880bdc 100644
--- a/unsupported/Eigen/IterativeSolvers
+++ b/unsupported/Eigen/IterativeSolvers
@@ -24,9 +24,6 @@
   */
 //@{
 
-#include "../../Eigen/src/misc/Solve.h"
-#include "../../Eigen/src/misc/SparseSolve.h"
-
 #ifndef EIGEN_MPL2_ONLY
 #include "src/IterativeSolvers/IterationController.h"
 #include "src/IterativeSolvers/ConstrainedConjGrad.h"
@@ -36,7 +33,7 @@
 #include "../../Eigen/Jacobi"
 #include "../../Eigen/Householder"
 #include "src/IterativeSolvers/GMRES.h"
-#include "src/IterativeSolvers/IncompleteCholesky.h"
+#include "src/IterativeSolvers/DGMRES.h"
 //#include "src/IterativeSolvers/SSORPreconditioner.h"
 #include "src/IterativeSolvers/MINRES.h"
 
diff --git a/unsupported/Eigen/KroneckerProduct b/unsupported/Eigen/KroneckerProduct
index c932c06a6..5f5afb8cf 100644
--- a/unsupported/Eigen/KroneckerProduct
+++ b/unsupported/Eigen/KroneckerProduct
@@ -13,6 +13,8 @@
 
 #include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
 
+#include "../../Eigen/src/SparseCore/SparseUtil.h"
+
 namespace Eigen {
 
 /**
diff --git a/unsupported/Eigen/MPRealSupport b/unsupported/Eigen/MPRealSupport
index d4b03647d..7f0b70c63 100644
--- a/unsupported/Eigen/MPRealSupport
+++ b/unsupported/Eigen/MPRealSupport
@@ -27,6 +27,8 @@ namespace Eigen {
   * via the <a href="http://www.holoborodko.com/pavel/mpfr">MPFR C++</a>
   * library which itself is built upon <a href="http://www.mpfr.org/">MPFR</a>/<a href="http://gmplib.org/">GMP</a>.
   *
+  * \warning MPFR C++ is licensed under the GPL.
+  *
   * You can find a copy of MPFR C++ that is known to be compatible in the unsupported/test/mpreal folder.
   *
   * Here is an example:
@@ -65,30 +67,35 @@ int main()
       IsSigned = 1,
       IsComplex = 0,
       RequireInitialization = 1,
-      ReadCost = 10,
-      AddCost = 10,
-      MulCost = 40
+      ReadCost = HugeCost,
+      AddCost  = HugeCost,
+      MulCost  = HugeCost
     };
 
     typedef mpfr::mpreal Real;
     typedef mpfr::mpreal NonInteger;
     
-    inline static Real highest   (long Precision = mpfr::mpreal::get_default_prec())  { return  mpfr::maxval(Precision); }
-    inline static Real lowest    (long Precision = mpfr::mpreal::get_default_prec())  { return -mpfr::maxval(Precision); }
+    static inline Real highest  (long Precision = mpfr::mpreal::get_default_prec()) { return  mpfr::maxval(Precision); }
+    static inline Real lowest   (long Precision = mpfr::mpreal::get_default_prec()) { return -mpfr::maxval(Precision); }
 
     // Constants
-    inline static Real Pi       (long Precision = mpfr::mpreal::get_default_prec())     {    return mpfr::const_pi(Precision);        }
-    inline static Real Euler    (long Precision = mpfr::mpreal::get_default_prec())     {    return mpfr::const_euler(Precision);     }
-    inline static Real Log2     (long Precision = mpfr::mpreal::get_default_prec())     {    return mpfr::const_log2(Precision);      }
-    inline static Real Catalan  (long Precision = mpfr::mpreal::get_default_prec())     {    return mpfr::const_catalan(Precision);   }
-
-    inline static Real epsilon  (long Precision = mpfr::mpreal::get_default_prec())     {    return mpfr::machine_epsilon(Precision); }
-    inline static Real epsilon  (const Real& x)                                         {    return mpfr::machine_epsilon(x); }
-
-    inline static Real dummy_precision()   
-    { 
-        unsigned int weak_prec = ((mpfr::mpreal::get_default_prec()-1) * 90) / 100;
-        return mpfr::machine_epsilon(weak_prec);
+    static inline Real Pi      (long Precision = mpfr::mpreal::get_default_prec())  { return mpfr::const_pi(Precision);        }
+    static inline Real Euler   (long Precision = mpfr::mpreal::get_default_prec())  { return mpfr::const_euler(Precision);     }
+    static inline Real Log2    (long Precision = mpfr::mpreal::get_default_prec())  { return mpfr::const_log2(Precision);      }
+    static inline Real Catalan (long Precision = mpfr::mpreal::get_default_prec())  { return mpfr::const_catalan(Precision);   }
+
+    static inline Real epsilon (long Precision = mpfr::mpreal::get_default_prec())  { return mpfr::machine_epsilon(Precision); }
+    static inline Real epsilon (const Real& x)                                      { return mpfr::machine_epsilon(x); }
+
+#ifdef MPREAL_HAVE_DYNAMIC_STD_NUMERIC_LIMITS
+    static inline int digits10 (long Precision = mpfr::mpreal::get_default_prec())  { return std::numeric_limits<Real>::digits10(Precision); }
+    static inline int digits10 (const Real& x)                                      { return std::numeric_limits<Real>::digits10(x); }
+#endif
+
+    static inline Real dummy_precision()
+    {
+      mpfr_prec_t weak_prec = ((mpfr::mpreal::get_default_prec()-1) * 90) / 100;
+      return mpfr::machine_epsilon(weak_prec);
     }
   };
 
@@ -139,64 +146,63 @@ int main()
     public:
       typedef mpfr::mpreal ResScalar;
       enum {
-        nr = 2, // must be 2 for proper packing...
+        Vectorizable = false,
+        LhsPacketSize = 1,
+        RhsPacketSize = 1,
+        ResPacketSize = 1,
+        NumberOfRegisters = 1,
+        nr = 1,
         mr = 1,
-        WorkSpaceFactor = nr,
         LhsProgress = 1,
         RhsProgress = 1
       };
+      typedef ResScalar LhsPacket;
+      typedef ResScalar RhsPacket;
+      typedef ResScalar ResPacket;
+      
     };
 
-    template<typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-    struct gebp_kernel<mpfr::mpreal,mpfr::mpreal,Index,mr,nr,ConjugateLhs,ConjugateRhs>
+
+
+    template<typename Index, typename DataMapper, bool ConjugateLhs, bool ConjugateRhs>
+    struct gebp_kernel<mpfr::mpreal,mpfr::mpreal,Index,DataMapper,1,1,ConjugateLhs,ConjugateRhs>
     {
       typedef mpfr::mpreal mpreal;
 
       EIGEN_DONT_INLINE
-      void operator()(mpreal* res, Index resStride, const mpreal* blockA, const mpreal* blockB, Index rows, Index depth, Index cols, mpreal alpha,
-                      Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, mpreal* /*unpackedB*/ = 0)
+      void operator()(const DataMapper& res, const mpreal* blockA, const mpreal* blockB, 
+                      Index rows, Index depth, Index cols, const mpreal& alpha,
+                      Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0)
       {
-        mpreal acc1, acc2, tmp;
-        
+        if(rows==0 || cols==0 || depth==0)
+          return;
+
+        mpreal  acc1(0,mpfr_get_prec(blockA[0].mpfr_srcptr())),
+                tmp (0,mpfr_get_prec(blockA[0].mpfr_srcptr()));
+
         if(strideA==-1) strideA = depth;
         if(strideB==-1) strideB = depth;
 
-        for(Index j=0; j<cols; j+=nr)
+        for(Index i=0; i<rows; ++i)
         {
-          Index actual_nr = (std::min<Index>)(nr,cols-j);
-          mpreal *C1 = res + j*resStride;
-          mpreal *C2 = res + (j+1)*resStride;
-          for(Index i=0; i<rows; i++)
+          for(Index j=0; j<cols; ++j)
           {
-            mpreal *B = const_cast<mpreal*>(blockB) + j*strideB + offsetB*actual_nr;
-            mpreal *A = const_cast<mpreal*>(blockA) + i*strideA + offsetA;
+            const mpreal *A = blockA + i*strideA + offsetA;
+            const mpreal *B = blockB + j*strideB + offsetB;
+            
             acc1 = 0;
-            acc2 = 0;
             for(Index k=0; k<depth; k++)
             {
-              mpfr_mul(tmp.mpfr_ptr(), A[k].mpfr_ptr(), B[0].mpfr_ptr(), mpreal::get_default_rnd());
+              mpfr_mul(tmp.mpfr_ptr(), A[k].mpfr_srcptr(), B[k].mpfr_srcptr(), mpreal::get_default_rnd());
               mpfr_add(acc1.mpfr_ptr(), acc1.mpfr_ptr(), tmp.mpfr_ptr(),  mpreal::get_default_rnd());
-              
-              if(actual_nr==2) {
-                mpfr_mul(tmp.mpfr_ptr(), A[k].mpfr_ptr(), B[1].mpfr_ptr(), mpreal::get_default_rnd());
-                mpfr_add(acc2.mpfr_ptr(), acc2.mpfr_ptr(), tmp.mpfr_ptr(),  mpreal::get_default_rnd());
-              }
-              
-              B+=actual_nr;
             }
             
-            mpfr_mul(acc1.mpfr_ptr(), acc1.mpfr_ptr(), alpha.mpfr_ptr(), mpreal::get_default_rnd());
-            mpfr_add(C1[i].mpfr_ptr(), C1[i].mpfr_ptr(), acc1.mpfr_ptr(),  mpreal::get_default_rnd());
-            
-            if(actual_nr==2) {
-              mpfr_mul(acc2.mpfr_ptr(), acc2.mpfr_ptr(), alpha.mpfr_ptr(), mpreal::get_default_rnd());
-              mpfr_add(C2[i].mpfr_ptr(), C2[i].mpfr_ptr(), acc2.mpfr_ptr(),  mpreal::get_default_rnd());
-            }
+            mpfr_mul(acc1.mpfr_ptr(), acc1.mpfr_srcptr(), alpha.mpfr_srcptr(), mpreal::get_default_rnd());
+            mpfr_add(res(i,j).mpfr_ptr(), res(i,j).mpfr_srcptr(), acc1.mpfr_srcptr(),  mpreal::get_default_rnd());
           }
         }
       }
     };
-
   } // end namespace internal
 }
 
diff --git a/unsupported/Eigen/MatrixFunctions b/unsupported/Eigen/MatrixFunctions
index 0991817d5..0320606c1 100644
--- a/unsupported/Eigen/MatrixFunctions
+++ b/unsupported/Eigen/MatrixFunctions
@@ -13,8 +13,6 @@
 
 #include <cfloat>
 #include <list>
-#include <functional>
-#include <iterator>
 
 #include <Eigen/Core>
 #include <Eigen/LU>
@@ -84,7 +82,9 @@ const MatrixFunctionReturnValue<Derived> MatrixBase<Derived>::cos() const
 \param[in]  M  a square matrix.
 \returns  expression representing \f$ \cos(M) \f$.
 
-This function calls \ref matrixbase_matrixfunction "matrixFunction()" with StdStemFunctions::cos().
+This function computes the matrix cosine. Use ArrayBase::cos() for computing the entry-wise cosine.
+
+The implementation calls \ref matrixbase_matrixfunction "matrixFunction()" with StdStemFunctions::cos().
 
 \sa \ref matrixbase_sin "sin()" for an example.
 
@@ -125,6 +125,9 @@ differential equations: the solution of \f$ y' = My \f$ with the
 initial condition \f$ y(0) = y_0 \f$ is given by
 \f$ y(t) = \exp(M) y_0 \f$.
 
+The matrix exponential is different from applying the exp function to all the entries in the matrix.
+Use ArrayBase::exp() if you want to do the latter.
+
 The cost of the computation is approximately \f$ 20 n^3 \f$ for
 matrices of size \f$ n \f$. The number 20 depends weakly on the
 norm of the matrix.
@@ -179,6 +182,9 @@ the scalar logarithm, the equation \f$ \exp(X) = M \f$ may have
 multiple solutions; this function returns a matrix whose eigenvalues
 have imaginary part in the interval \f$ (-\pi,\pi] \f$.
 
+The matrix logarithm is different from applying the log function to all the entries in the matrix.
+Use ArrayBase::log() if you want to do the latter.
+
 In the real case, the matrix \f$ M \f$ should be invertible and
 it should have no eigenvalues which are real and negative (pairs of
 complex conjugate eigenvalues are allowed). In the complex case, it
@@ -230,22 +236,66 @@ const MatrixPowerReturnValue<Derived> MatrixBase<Derived>::pow(RealScalar p) con
 \endcode
 
 \param[in]  M  base of the matrix power, should be a square matrix.
-\param[in]  p  exponent of the matrix power, should be real.
+\param[in]  p  exponent of the matrix power.
 
 The matrix power \f$ M^p \f$ is defined as \f$ \exp(p \log(M)) \f$,
 where exp denotes the matrix exponential, and log denotes the matrix
-logarithm.
+logarithm. This is different from raising all the entries in the matrix
+to the p-th power. Use ArrayBase::pow() if you want to do the latter.
 
-The matrix \f$ M \f$ should meet the conditions to be an argument of
-matrix logarithm. If \p p is not of the real scalar type of \p M, it
-is casted into the real scalar type of \p M.
+If \p p is complex, the scalar type of \p M should be the type of \p
+p . \f$ M^p \f$ simply evaluates into \f$ \exp(p \log(M)) \f$.
+Therefore, the matrix \f$ M \f$ should meet the conditions to be an
+argument of matrix logarithm.
 
-This function computes the matrix power using the Schur-Pad&eacute;
+If \p p is real, it is casted into the real scalar type of \p M. Then
+this function computes the matrix power using the Schur-Pad&eacute;
 algorithm as implemented by class MatrixPower. The exponent is split
 into integral part and fractional part, where the fractional part is
 in the interval \f$ (-1, 1) \f$. The main diagonal and the first
 super-diagonal is directly computed.
 
+If \p M is singular with a semisimple zero eigenvalue and \p p is
+positive, the Schur factor \f$ T \f$ is reordered with Givens
+rotations, i.e.
+
+\f[ T = \left[ \begin{array}{cc}
+      T_1 & T_2 \\
+      0   & 0
+    \end{array} \right] \f]
+
+where \f$ T_1 \f$ is invertible. Then \f$ T^p \f$ is given by
+
+\f[ T^p = \left[ \begin{array}{cc}
+      T_1^p & T_1^{-1} T_1^p T_2 \\
+      0     & 0
+    \end{array}. \right] \f]
+
+\warning Fractional power of a matrix with a non-semisimple zero
+eigenvalue is not well-defined. We introduce an assertion failure
+against inaccurate result, e.g. \code
+#include <unsupported/Eigen/MatrixFunctions>
+#include <iostream>
+
+int main()
+{
+  Eigen::Matrix4d A;
+  A << 0, 0, 2, 3,
+       0, 0, 4, 5,
+       0, 0, 6, 7,
+       0, 0, 8, 9;
+  std::cout << A.pow(0.37) << std::endl;
+  
+  // The 1 makes eigenvalue 0 non-semisimple.
+  A.coeffRef(0, 1) = 1;
+
+  // This fails if EIGEN_NO_DEBUG is undefined.
+  std::cout << A.pow(0.37) << std::endl;
+
+  return 0;
+}
+\endcode
+
 Details of the algorithm can be found in: Nicholas J. Higham and
 Lijing Lin, "A Schur-Pad&eacute; algorithm for fractional powers of a
 matrix," <em>SIAM J. %Matrix Anal. Applic.</em>,
@@ -350,7 +400,9 @@ const MatrixFunctionReturnValue<Derived> MatrixBase<Derived>::sin() const
 \param[in]  M  a square matrix.
 \returns  expression representing \f$ \sin(M) \f$.
 
-This function calls \ref matrixbase_matrixfunction "matrixFunction()" with StdStemFunctions::sin().
+This function computes the matrix sine. Use ArrayBase::sin() for computing the entry-wise sine.
+
+The implementation calls \ref matrixbase_matrixfunction "matrixFunction()" with StdStemFunctions::sin().
 
 Example: \include MatrixSine.cpp
 Output: \verbinclude MatrixSine.out
@@ -387,7 +439,9 @@ const MatrixSquareRootReturnValue<Derived> MatrixBase<Derived>::sqrt() const
 
 The matrix square root of \f$ M \f$ is the matrix \f$ M^{1/2} \f$
 whose square is the original matrix; so if \f$ S = M^{1/2} \f$ then
-\f$ S^2 = M \f$. 
+\f$ S^2 = M \f$. This is different from taking the square root of all
+the entries in the matrix; use ArrayBase::sqrt() if you want to do the
+latter.
 
 In the <b>real case</b>, the matrix \f$ M \f$ should be invertible and
 it should have no eigenvalues which are real and negative (pairs of
diff --git a/unsupported/Eigen/OpenGLSupport b/unsupported/Eigen/OpenGLSupport
index e2769449c..87f50947d 100644
--- a/unsupported/Eigen/OpenGLSupport
+++ b/unsupported/Eigen/OpenGLSupport
@@ -51,7 +51,7 @@ namespace internal {
             typename Scalar = typename XprType::Scalar,                                                             \
             int Rows = XprType::RowsAtCompileTime,                                                                  \
             int Cols = XprType::ColsAtCompileTime,                                                                  \
-            bool IsGLCompatible = bool(XprType::Flags&LinearAccessBit)                                              \
+            bool IsGLCompatible = bool(internal::evaluator<XprType>::Flags&LinearAccessBit)                         \
                               && bool(XprType::Flags&DirectAccessBit)                                               \
                               && (XprType::IsVectorAtCompileTime || (XprType::Flags&RowMajorBit)==0)>               \
   struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl);                                                                      \
@@ -180,11 +180,11 @@ template<typename Scalar> void glLoadMatrix(const Transform<Scalar,3,AffineCompa
 
 inline void glRotate(const Rotation2D<float>& rot)
 {
-  glRotatef(rot.angle()*180.f/float(M_PI), 0.f, 0.f, 1.f);
+  glRotatef(rot.angle()*180.f/float(EIGEN_PI), 0.f, 0.f, 1.f);
 }
 inline void glRotate(const Rotation2D<double>& rot)
 {
-  glRotated(rot.angle()*180.0/M_PI, 0.0, 0.0, 1.0);
+  glRotated(rot.angle()*180.0/EIGEN_PI, 0.0, 0.0, 1.0);
 }
 
 template<typename Derived> void glRotate(const RotationBase<Derived,3>& rot)
@@ -203,7 +203,7 @@ namespace internal {
             typename Scalar = typename XprType::Scalar,                                                                         \
             int Rows = XprType::RowsAtCompileTime,                                                                              \
             int Cols = XprType::ColsAtCompileTime,                                                                              \
-            bool IsGLCompatible = bool(XprType::Flags&LinearAccessBit)                                                          \
+            bool IsGLCompatible = bool(internal::evaluator<XprType>::Flags&LinearAccessBit)                                     \
                               && bool(XprType::Flags&DirectAccessBit)                                                           \
                               && (XprType::IsVectorAtCompileTime || (XprType::Flags&RowMajorBit)==0)>                           \
   struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl);                                                                                  \
@@ -276,12 +276,12 @@ EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float,        4,4,Matrix
 
 #ifdef GL_VERSION_2_1
 
-static void glUniformMatrix2x3fv_ei(GLint loc, const float* v)         { glUniformMatrix2x3fv(loc,1,false,v); }
-static void glUniformMatrix3x2fv_ei(GLint loc, const float* v)         { glUniformMatrix3x2fv(loc,1,false,v); }
-static void glUniformMatrix2x4fv_ei(GLint loc, const float* v)         { glUniformMatrix2x4fv(loc,1,false,v); }
-static void glUniformMatrix4x2fv_ei(GLint loc, const float* v)         { glUniformMatrix4x2fv(loc,1,false,v); }
-static void glUniformMatrix3x4fv_ei(GLint loc, const float* v)         { glUniformMatrix3x4fv(loc,1,false,v); }
-static void glUniformMatrix4x3fv_ei(GLint loc, const float* v)         { glUniformMatrix4x3fv(loc,1,false,v); }
+inline void glUniformMatrix2x3fv_ei(GLint loc, const float* v)         { glUniformMatrix2x3fv(loc,1,false,v); }
+inline void glUniformMatrix3x2fv_ei(GLint loc, const float* v)         { glUniformMatrix3x2fv(loc,1,false,v); }
+inline void glUniformMatrix2x4fv_ei(GLint loc, const float* v)         { glUniformMatrix2x4fv(loc,1,false,v); }
+inline void glUniformMatrix4x2fv_ei(GLint loc, const float* v)         { glUniformMatrix4x2fv(loc,1,false,v); }
+inline void glUniformMatrix3x4fv_ei(GLint loc, const float* v)         { glUniformMatrix3x4fv(loc,1,false,v); }
+inline void glUniformMatrix4x3fv_ei(GLint loc, const float* v)         { glUniformMatrix4x3fv(loc,1,false,v); }
 
 EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float,        2,3,Matrix2x3fv_ei)
 EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float,        3,2,Matrix3x2fv_ei)
diff --git a/unsupported/Eigen/SVD b/unsupported/Eigen/SVD
deleted file mode 100644
index 7cc059280..000000000
--- a/unsupported/Eigen/SVD
+++ /dev/null
@@ -1,39 +0,0 @@
-#ifndef EIGEN_SVD_MODULE_H
-#define EIGEN_SVD_MODULE_H
-
-#include <Eigen/QR>
-#include <Eigen/Householder>
-#include <Eigen/Jacobi>
-
-#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
-
-/** \defgroup SVD_Module SVD module
-  *
-  *
-  *
-  * This module provides SVD decomposition for matrices (both real and complex).
-  * This decomposition is accessible via the following MatrixBase method:
-  *  - MatrixBase::jacobiSvd()
-  *
-  * \code
-  * #include <Eigen/SVD>
-  * \endcode
-  */
-
-#include "../../Eigen/src/misc/Solve.h"
-#include "../../Eigen/src/SVD/UpperBidiagonalization.h"
-#include "src/SVD/SVDBase.h"
-#include "src/SVD/JacobiSVD.h"
-#include "src/SVD/BDCSVD.h"
-#if defined(EIGEN_USE_LAPACKE) && !defined(EIGEN_USE_LAPACKE_STRICT)
-#include "../../Eigen/src/SVD/JacobiSVD_MKL.h"
-#endif
-
-#ifdef EIGEN2_SUPPORT
-#include "../../Eigen/src/Eigen2Support/SVD.h"
-#endif
-
-#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
-
-#endif // EIGEN_SVD_MODULE_H
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/unsupported/Eigen/SparseExtra b/unsupported/Eigen/SparseExtra
index b5597902a..819cffa27 100644
--- a/unsupported/Eigen/SparseExtra
+++ b/unsupported/Eigen/SparseExtra
@@ -37,9 +37,6 @@
   */
 
 
-#include "../../Eigen/src/misc/Solve.h"
-#include "../../Eigen/src/misc/SparseSolve.h"
-
 #include "src/SparseExtra/DynamicSparseMatrix.h"
 #include "src/SparseExtra/BlockOfDynamicSparseMatrix.h"
 #include "src/SparseExtra/RandomSetter.h"
diff --git a/unsupported/Eigen/SpecialFunctions b/unsupported/Eigen/SpecialFunctions
new file mode 100644
index 000000000..a2ad4925e
--- /dev/null
+++ b/unsupported/Eigen/SpecialFunctions
@@ -0,0 +1,63 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Gael Guennebaud <g.gael@free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPECIALFUNCTIONS_MODULE
+#define EIGEN_SPECIALFUNCTIONS_MODULE
+
+#include <math.h>
+
+#include "../../Eigen/Core"
+
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
+namespace Eigen {
+
+/**
+  * \defgroup SpecialFunctions_Module Special math functions module
+  *
+  * This module features additional coefficient-wise math functions available
+  * within the numext:: namespace for the scalar version, and as method and/or free
+  * functions of Array. Those include:
+  *
+  * - erf
+  * - erfc
+  * - lgamma
+  * - igamma
+  * - igammac
+  * - digamma
+  * - polygamma
+  * - zeta
+  * - betainc
+  *
+  * \code
+  * #include <unsupported/Eigen/SpecialFunctions>
+  * \endcode
+  */
+//@{
+
+}
+
+#include "src/SpecialFunctions/SpecialFunctionsImpl.h"
+#include "src/SpecialFunctions/SpecialFunctionsPacketMath.h"
+#include "src/SpecialFunctions/SpecialFunctionsHalf.h"
+#include "src/SpecialFunctions/SpecialFunctionsFunctors.h"
+#include "src/SpecialFunctions/SpecialFunctionsArrayAPI.h"
+
+#if defined EIGEN_VECTORIZE_CUDA
+  #include "src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h"
+#endif
+
+namespace Eigen {
+//@}
+}
+
+
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_SPECIALFUNCTIONS_MODULE
diff --git a/unsupported/Eigen/src/AutoDiff/AutoDiffJacobian.h b/unsupported/Eigen/src/AutoDiff/AutoDiffJacobian.h
index 1a61e3367..33b6c393f 100644
--- a/unsupported/Eigen/src/AutoDiff/AutoDiffJacobian.h
+++ b/unsupported/Eigen/src/AutoDiff/AutoDiffJacobian.h
@@ -20,37 +20,60 @@ public:
   AutoDiffJacobian(const Functor& f) : Functor(f) {}
 
   // forward constructors
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+  template<typename... T>
+  AutoDiffJacobian(const T& ...Values) : Functor(Values...) {}
+#else
   template<typename T0>
   AutoDiffJacobian(const T0& a0) : Functor(a0) {}
   template<typename T0, typename T1>
   AutoDiffJacobian(const T0& a0, const T1& a1) : Functor(a0, a1) {}
   template<typename T0, typename T1, typename T2>
   AutoDiffJacobian(const T0& a0, const T1& a1, const T2& a2) : Functor(a0, a1, a2) {}
+#endif
+
+  typedef typename Functor::InputType InputType;
+  typedef typename Functor::ValueType ValueType;
+  typedef typename ValueType::Scalar Scalar;
 
   enum {
-    InputsAtCompileTime = Functor::InputsAtCompileTime,
-    ValuesAtCompileTime = Functor::ValuesAtCompileTime
+    InputsAtCompileTime = InputType::RowsAtCompileTime,
+    ValuesAtCompileTime = ValueType::RowsAtCompileTime
   };
 
-  typedef typename Functor::InputType InputType;
-  typedef typename Functor::ValueType ValueType;
-  typedef typename Functor::JacobianType JacobianType;
-  typedef typename JacobianType::Scalar Scalar;
+  typedef Matrix<Scalar, ValuesAtCompileTime, InputsAtCompileTime> JacobianType;
   typedef typename JacobianType::Index Index;
 
-  typedef Matrix<Scalar,InputsAtCompileTime,1> DerivativeType;
+  typedef Matrix<Scalar, InputsAtCompileTime, 1> DerivativeType;
   typedef AutoDiffScalar<DerivativeType> ActiveScalar;
 
-
   typedef Matrix<ActiveScalar, InputsAtCompileTime, 1> ActiveInput;
   typedef Matrix<ActiveScalar, ValuesAtCompileTime, 1> ActiveValue;
 
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+  // Some compilers don't accept variadic parameters after a default parameter,
+  // i.e., we can't just write _jac=0 but we need to overload operator():
+  EIGEN_STRONG_INLINE
+  void operator() (const InputType& x, ValueType* v) const
+  {
+      this->operator()(x, v, 0);
+  }
+  template<typename... ParamsType>
+  void operator() (const InputType& x, ValueType* v, JacobianType* _jac,
+                   const ParamsType&... Params) const
+#else
   void operator() (const InputType& x, ValueType* v, JacobianType* _jac=0) const
+#endif
   {
     eigen_assert(v!=0);
+
     if (!_jac)
     {
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+      Functor::operator()(x, v, Params...);
+#else
       Functor::operator()(x, v);
+#endif
       return;
     }
 
@@ -61,12 +84,16 @@ public:
 
     if(InputsAtCompileTime==Dynamic)
       for (Index j=0; j<jac.rows(); j++)
-        av[j].derivatives().resize(this->inputs());
+        av[j].derivatives().resize(x.rows());
 
     for (Index i=0; i<jac.cols(); i++)
-      ax[i].derivatives() = DerivativeType::Unit(this->inputs(),i);
+      ax[i].derivatives() = DerivativeType::Unit(x.rows(),i);
 
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+    Functor::operator()(ax, &av, Params...);
+#else
     Functor::operator()(ax, &av);
+#endif
 
     for (Index i=0; i<jac.rows(); i++)
     {
@@ -74,8 +101,6 @@ public:
       jac.row(i) = av[i].derivatives();
     }
   }
-protected:
-
 };
 
 }
diff --git a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
index 8d42e69b9..50fedf6ac 100644..100755
--- a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
+++ b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
@@ -30,6 +30,13 @@ template<typename _DerType, bool Enable> struct auto_diff_special_op;
 
 } // end namespace internal
 
+template<typename _DerType> class AutoDiffScalar;
+
+template<typename NewDerType>
+inline AutoDiffScalar<NewDerType> MakeAutoDiffScalar(const typename NewDerType::Scalar& value, const NewDerType &der) {
+  return AutoDiffScalar<NewDerType>(value,der);
+}
+
 /** \class AutoDiffScalar
   * \brief A scalar type replacement with automatic differentation capability
   *
@@ -60,7 +67,7 @@ template<typename _DerType>
 class AutoDiffScalar
   : public internal::auto_diff_special_op
             <_DerType, !internal::is_same<typename internal::traits<typename internal::remove_all<_DerType>::type>::Scalar,
-                                        typename NumTraits<typename internal::traits<typename internal::remove_all<_DerType>::type>::Scalar>::Real>::value>
+                                          typename NumTraits<typename internal::traits<typename internal::remove_all<_DerType>::type>::Scalar>::Real>::value>
 {
   public:
     typedef internal::auto_diff_special_op
@@ -99,7 +106,11 @@ class AutoDiffScalar
     {}
 
     template<typename OtherDerType>
-    AutoDiffScalar(const AutoDiffScalar<OtherDerType>& other)
+    AutoDiffScalar(const AutoDiffScalar<OtherDerType>& other
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    , typename internal::enable_if<internal::is_same<Scalar, typename internal::traits<typename internal::remove_all<OtherDerType>::type>::Scalar>::value,void*>::type = 0
+#endif
+    )
       : m_value(other.value()), m_derivatives(other.derivatives())
     {}
 
@@ -127,6 +138,14 @@ class AutoDiffScalar
       return *this;
     }
 
+    inline AutoDiffScalar& operator=(const Scalar& other)
+    {
+      m_value = other;
+      if(m_derivatives.size()>0)
+        m_derivatives.setZero();
+      return *this;
+    }
+
 //     inline operator const Scalar& () const { return m_value; }
 //     inline operator Scalar& () { return m_value; }
 
@@ -245,20 +264,16 @@ class AutoDiffScalar
         -m_derivatives);
     }
 
-    inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >
+    inline const AutoDiffScalar<EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product) >
     operator*(const Scalar& other) const
     {
-      return AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >(
-        m_value * other,
-        (m_derivatives * other));
+      return MakeAutoDiffScalar(m_value * other, m_derivatives * other);
     }
 
-    friend inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >
+    friend inline const AutoDiffScalar<EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product) >
     operator*(const Scalar& other, const AutoDiffScalar& a)
     {
-      return AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >(
-        a.value() * other,
-        a.derivatives() * other);
+      return MakeAutoDiffScalar(a.value() * other, a.derivatives() * other);
     }
 
 //     inline const AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type >
@@ -277,20 +292,16 @@ class AutoDiffScalar
 //         a.derivatives() * other);
 //     }
 
-    inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >
+    inline const AutoDiffScalar<EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product) >
     operator/(const Scalar& other) const
     {
-      return AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >(
-        m_value / other,
-        (m_derivatives * (Scalar(1)/other)));
+      return MakeAutoDiffScalar(m_value / other, (m_derivatives * (Scalar(1)/other)));
     }
 
-    friend inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >
+    friend inline const AutoDiffScalar<EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product) >
     operator/(const Scalar& other, const AutoDiffScalar& a)
     {
-      return AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >(
-        other / a.value(),
-        a.derivatives() * (Scalar(-other) / (a.value()*a.value())));
+      return MakeAutoDiffScalar(other / a.value(), a.derivatives() * (Scalar(-other) / (a.value()*a.value())));
     }
 
 //     inline const AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type >
@@ -310,34 +321,29 @@ class AutoDiffScalar
 //     }
 
     template<typename OtherDerType>
-    inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>,
-        const CwiseBinaryOp<internal::scalar_difference_op<Scalar>,
-          const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType>,
-          const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const typename internal::remove_all<OtherDerType>::type > > > >
+    inline const AutoDiffScalar<EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(
+        CwiseBinaryOp<internal::scalar_difference_op<Scalar> EIGEN_COMMA
+          const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product) EIGEN_COMMA
+          const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(typename internal::remove_all<OtherDerType>::type,Scalar,product) >,Scalar,product) >
     operator/(const AutoDiffScalar<OtherDerType>& other) const
     {
       internal::make_coherent(m_derivatives, other.derivatives());
-      return AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>,
-        const CwiseBinaryOp<internal::scalar_difference_op<Scalar>,
-          const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType>,
-          const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const typename internal::remove_all<OtherDerType>::type > > > >(
+      return MakeAutoDiffScalar(
         m_value / other.value(),
-          ((m_derivatives * other.value()) - (m_value * other.derivatives()))
+          ((m_derivatives * other.value()) - (other.derivatives() * m_value))
         * (Scalar(1)/(other.value()*other.value())));
     }
 
     template<typename OtherDerType>
     inline const AutoDiffScalar<CwiseBinaryOp<internal::scalar_sum_op<Scalar>,
-        const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType>,
-        const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const typename internal::remove_all<OtherDerType>::type> > >
+        const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product),
+        const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(typename internal::remove_all<OtherDerType>::type,Scalar,product) > >
     operator*(const AutoDiffScalar<OtherDerType>& other) const
     {
       internal::make_coherent(m_derivatives, other.derivatives());
-      return AutoDiffScalar<const CwiseBinaryOp<internal::scalar_sum_op<Scalar>,
-        const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType>,
-        const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const typename internal::remove_all<OtherDerType>::type > > >(
+      return MakeAutoDiffScalar(
         m_value * other.value(),
-        (m_derivatives * other.value()) + (m_value * other.derivatives()));
+        (m_derivatives * other.value()) + (other.derivatives() * m_value));
     }
 
     inline AutoDiffScalar& operator*=(const Scalar& other)
@@ -414,18 +420,18 @@ struct auto_diff_special_op<_DerType, true>
   }
 
 
-  inline const AutoDiffScalar<typename CwiseUnaryOp<scalar_multiple2_op<Scalar,Real>, DerType>::Type >
+  inline const AutoDiffScalar<typename CwiseUnaryOp<bind2nd_op<scalar_product_op<Scalar,Real> >, DerType>::Type >
   operator*(const Real& other) const
   {
-    return AutoDiffScalar<typename CwiseUnaryOp<scalar_multiple2_op<Scalar,Real>, DerType>::Type >(
+    return AutoDiffScalar<typename CwiseUnaryOp<bind2nd_op<scalar_product_op<Scalar,Real> >, DerType>::Type >(
       derived().value() * other,
       derived().derivatives() * other);
   }
 
-  friend inline const AutoDiffScalar<typename CwiseUnaryOp<scalar_multiple2_op<Scalar,Real>, DerType>::Type >
+  friend inline const AutoDiffScalar<typename CwiseUnaryOp<bind1st_op<scalar_product_op<Real,Scalar> >, DerType>::Type >
   operator*(const Real& other, const AutoDiffScalar<_DerType>& a)
   {
-    return AutoDiffScalar<typename CwiseUnaryOp<scalar_multiple2_op<Scalar,Real>, DerType>::Type >(
+    return AutoDiffScalar<typename CwiseUnaryOp<bind1st_op<scalar_product_op<Real,Scalar> >, DerType>::Type >(
       a.value() * other,
       a.derivatives() * other);
   }
@@ -489,43 +495,44 @@ struct make_coherent_impl<Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows,
   }
 };
 
-template<typename A_Scalar, int A_Rows, int A_Cols, int A_Options, int A_MaxRows, int A_MaxCols>
-struct scalar_product_traits<Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols>,A_Scalar>
-{
-  enum { Defined = 1 };
-  typedef Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols> ReturnType;
-};
-
-template<typename A_Scalar, int A_Rows, int A_Cols, int A_Options, int A_MaxRows, int A_MaxCols>
-struct scalar_product_traits<A_Scalar, Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols> >
-{
-  enum { Defined = 1 };
-  typedef Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols> ReturnType;
-};
+} // end namespace internal
 
-template<typename DerType>
-struct scalar_product_traits<AutoDiffScalar<DerType>,typename DerType::Scalar>
+template<typename DerType, typename BinOp>
+struct ScalarBinaryOpTraits<AutoDiffScalar<DerType>,typename DerType::Scalar,BinOp>
 {
-  enum { Defined = 1 };
   typedef AutoDiffScalar<DerType> ReturnType;
 };
 
-template<typename DerType>
-struct scalar_product_traits<typename DerType::Scalar,AutoDiffScalar<DerType> >
+template<typename DerType, typename BinOp>
+struct ScalarBinaryOpTraits<typename DerType::Scalar,AutoDiffScalar<DerType>, BinOp>
 {
-  enum { Defined = 1 };
   typedef AutoDiffScalar<DerType> ReturnType;
 };
 
-} // end namespace internal
+
+// The following is an attempt to let Eigen's known about expression template, but that's more tricky!
+
+// template<typename DerType, typename BinOp>
+// struct ScalarBinaryOpTraits<AutoDiffScalar<DerType>,AutoDiffScalar<DerType>, BinOp>
+// {
+//   enum { Defined = 1 };
+//   typedef AutoDiffScalar<typename DerType::PlainObject> ReturnType;
+// };
+//
+// template<typename DerType1,typename DerType2, typename BinOp>
+// struct ScalarBinaryOpTraits<AutoDiffScalar<DerType1>,AutoDiffScalar<DerType2>, BinOp>
+// {
+//   enum { Defined = 1 };//internal::is_same<typename DerType1::Scalar,typename DerType2::Scalar>::value };
+//   typedef AutoDiffScalar<typename DerType1::PlainObject> ReturnType;
+// };
 
 #define EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(FUNC,CODE) \
   template<typename DerType> \
-  inline const Eigen::AutoDiffScalar<Eigen::CwiseUnaryOp<Eigen::internal::scalar_multiple_op<typename Eigen::internal::traits<typename Eigen::internal::remove_all<DerType>::type>::Scalar>, const typename Eigen::internal::remove_all<DerType>::type> > \
+  inline const Eigen::AutoDiffScalar< \
+  EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(typename Eigen::internal::remove_all<DerType>::type, typename Eigen::internal::traits<typename Eigen::internal::remove_all<DerType>::type>::Scalar, product) > \
   FUNC(const Eigen::AutoDiffScalar<DerType>& x) { \
     using namespace Eigen; \
-    typedef typename Eigen::internal::traits<typename Eigen::internal::remove_all<DerType>::type>::Scalar Scalar; \
-    typedef AutoDiffScalar<CwiseUnaryOp<Eigen::internal::scalar_multiple_op<Scalar>, const typename Eigen::internal::remove_all<DerType>::type> > ReturnType; \
+    EIGEN_UNUSED typedef typename Eigen::internal::traits<typename Eigen::internal::remove_all<DerType>::type>::Scalar Scalar; \
     CODE; \
   }
 
@@ -536,75 +543,92 @@ inline const AutoDiffScalar<DerType>& real(const AutoDiffScalar<DerType>& x)  {
 template<typename DerType>
 inline typename DerType::Scalar imag(const AutoDiffScalar<DerType>&)    { return 0.; }
 template<typename DerType, typename T>
-inline AutoDiffScalar<DerType> (min)(const AutoDiffScalar<DerType>& x, const T& y)    { return (x <= y ? x : y); }
+inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> (min)(const AutoDiffScalar<DerType>& x, const T& y) {
+  typedef AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> ADS;
+  return (x <= y ? ADS(x) : ADS(y));
+}
 template<typename DerType, typename T>
-inline AutoDiffScalar<DerType> (max)(const AutoDiffScalar<DerType>& x, const T& y)    { return (x >= y ? x : y); }
+inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> (max)(const AutoDiffScalar<DerType>& x, const T& y) {
+  typedef AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> ADS;
+  return (x >= y ? ADS(x) : ADS(y));
+}
 template<typename DerType, typename T>
-inline AutoDiffScalar<DerType> (min)(const T& x, const AutoDiffScalar<DerType>& y)    { return (x < y ? x : y); }
+inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> (min)(const T& x, const AutoDiffScalar<DerType>& y) {
+  typedef AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> ADS;
+  return (x < y ? ADS(x) : ADS(y));
+}
 template<typename DerType, typename T>
-inline AutoDiffScalar<DerType> (max)(const T& x, const AutoDiffScalar<DerType>& y)    { return (x > y ? x : y); }
+inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> (max)(const T& x, const AutoDiffScalar<DerType>& y) {
+  typedef AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> ADS;
+  return (x > y ? ADS(x) : ADS(y));
+}
+template<typename DerType>
+inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> (min)(const AutoDiffScalar<DerType>& x, const AutoDiffScalar<DerType>& y) {
+  return (x.value() < y.value() ? x : y);
+}
+template<typename DerType>
+inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> (max)(const AutoDiffScalar<DerType>& x, const AutoDiffScalar<DerType>& y) {
+  return (x.value() >= y.value() ? x : y);
+}
+
 
 EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(abs,
   using std::abs;
-  return ReturnType(abs(x.value()), x.derivatives() * (x.value()<0 ? -1 : 1) );)
+  return Eigen::MakeAutoDiffScalar(abs(x.value()), x.derivatives() * (x.value()<0 ? -1 : 1) );)
 
 EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(abs2,
   using numext::abs2;
-  return ReturnType(abs2(x.value()), x.derivatives() * (Scalar(2)*x.value()));)
+  return Eigen::MakeAutoDiffScalar(abs2(x.value()), x.derivatives() * (Scalar(2)*x.value()));)
 
 EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(sqrt,
   using std::sqrt;
   Scalar sqrtx = sqrt(x.value());
-  return ReturnType(sqrtx,x.derivatives() * (Scalar(0.5) / sqrtx));)
+  return Eigen::MakeAutoDiffScalar(sqrtx,x.derivatives() * (Scalar(0.5) / sqrtx));)
 
 EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(cos,
   using std::cos;
   using std::sin;
-  return ReturnType(cos(x.value()), x.derivatives() * (-sin(x.value())));)
+  return Eigen::MakeAutoDiffScalar(cos(x.value()), x.derivatives() * (-sin(x.value())));)
 
 EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(sin,
   using std::sin;
   using std::cos;
-  return ReturnType(sin(x.value()),x.derivatives() * cos(x.value()));)
+  return Eigen::MakeAutoDiffScalar(sin(x.value()),x.derivatives() * cos(x.value()));)
 
 EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(exp,
   using std::exp;
   Scalar expx = exp(x.value());
-  return ReturnType(expx,x.derivatives() * expx);)
+  return Eigen::MakeAutoDiffScalar(expx,x.derivatives() * expx);)
 
 EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(log,
   using std::log;
-  return ReturnType(log(x.value()),x.derivatives() * (Scalar(1)/x.value()));)
+  return Eigen::MakeAutoDiffScalar(log(x.value()),x.derivatives() * (Scalar(1)/x.value()));)
 
 template<typename DerType>
-inline const Eigen::AutoDiffScalar<Eigen::CwiseUnaryOp<Eigen::internal::scalar_multiple_op<typename Eigen::internal::traits<DerType>::Scalar>, const DerType> >
-pow(const Eigen::AutoDiffScalar<DerType>& x, typename Eigen::internal::traits<DerType>::Scalar y)
+inline const Eigen::AutoDiffScalar<
+EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(typename internal::remove_all<DerType>::type,typename internal::traits<typename internal::remove_all<DerType>::type>::Scalar,product) >
+pow(const Eigen::AutoDiffScalar<DerType> &x, const typename internal::traits<typename internal::remove_all<DerType>::type>::Scalar &y)
 {
   using namespace Eigen;
-  typedef typename Eigen::internal::traits<DerType>::Scalar Scalar;
-  return AutoDiffScalar<CwiseUnaryOp<Eigen::internal::scalar_multiple_op<Scalar>, const DerType> >(
-    std::pow(x.value(),y),
-    x.derivatives() * (y * std::pow(x.value(),y-1)));
+  using std::pow;
+  return Eigen::MakeAutoDiffScalar(pow(x.value(),y), x.derivatives() * (y * pow(x.value(),y-1)));
 }
 
 
 template<typename DerTypeA,typename DerTypeB>
-inline const AutoDiffScalar<Matrix<typename internal::traits<DerTypeA>::Scalar,Dynamic,1> >
+inline const AutoDiffScalar<Matrix<typename internal::traits<typename internal::remove_all<DerTypeA>::type>::Scalar,Dynamic,1> >
 atan2(const AutoDiffScalar<DerTypeA>& a, const AutoDiffScalar<DerTypeB>& b)
 {
   using std::atan2;
-  using std::max;
-  typedef typename internal::traits<DerTypeA>::Scalar Scalar;
+  typedef typename internal::traits<typename internal::remove_all<DerTypeA>::type>::Scalar Scalar;
   typedef AutoDiffScalar<Matrix<Scalar,Dynamic,1> > PlainADS;
   PlainADS ret;
   ret.value() = atan2(a.value(), b.value());
   
-  Scalar tmp2 = a.value() * a.value();
-  Scalar tmp3 = b.value() * b.value();
-  Scalar tmp4 = tmp3/(tmp2+tmp3);
+  Scalar squared_hypot = a.value() * a.value() + b.value() * b.value();
   
-  if (tmp4!=0)
-    ret.derivatives() = (a.derivatives() * b.value() - a.value() * b.derivatives()) * (tmp2+tmp3);
+  // if (squared_hypot==0) the derivation is undefined and the following results in a NaN:
+  ret.derivatives() = (a.derivatives() * b.value() - a.value() * b.derivatives()) / squared_hypot;
 
   return ret;
 }
@@ -612,26 +636,44 @@ atan2(const AutoDiffScalar<DerTypeA>& a, const AutoDiffScalar<DerTypeB>& b)
 EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(tan,
   using std::tan;
   using std::cos;
-  return ReturnType(tan(x.value()),x.derivatives() * (Scalar(1)/numext::abs2(cos(x.value()))));)
+  return Eigen::MakeAutoDiffScalar(tan(x.value()),x.derivatives() * (Scalar(1)/numext::abs2(cos(x.value()))));)
 
 EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(asin,
   using std::sqrt;
   using std::asin;
-  return ReturnType(asin(x.value()),x.derivatives() * (Scalar(1)/sqrt(1-numext::abs2(x.value()))));)
+  return Eigen::MakeAutoDiffScalar(asin(x.value()),x.derivatives() * (Scalar(1)/sqrt(1-numext::abs2(x.value()))));)
   
 EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(acos,
   using std::sqrt;
   using std::acos;
-  return ReturnType(acos(x.value()),x.derivatives() * (Scalar(-1)/sqrt(1-numext::abs2(x.value()))));)
+  return Eigen::MakeAutoDiffScalar(acos(x.value()),x.derivatives() * (Scalar(-1)/sqrt(1-numext::abs2(x.value()))));)
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(tanh,
+  using std::cosh;
+  using std::tanh;
+  return Eigen::MakeAutoDiffScalar(tanh(x.value()),x.derivatives() * (Scalar(1)/numext::abs2(cosh(x.value()))));)
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(sinh,
+  using std::sinh;
+  using std::cosh;
+  return Eigen::MakeAutoDiffScalar(sinh(x.value()),x.derivatives() * cosh(x.value()));)
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(cosh,
+  using std::sinh;
+  using std::cosh;
+  return Eigen::MakeAutoDiffScalar(cosh(x.value()),x.derivatives() * sinh(x.value()));)
 
 #undef EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY
 
 template<typename DerType> struct NumTraits<AutoDiffScalar<DerType> >
-  : NumTraits< typename NumTraits<typename DerType::Scalar>::Real >
+  : NumTraits< typename NumTraits<typename internal::remove_all<DerType>::type::Scalar>::Real >
 {
-  typedef AutoDiffScalar<Matrix<typename NumTraits<typename DerType::Scalar>::Real,DerType::RowsAtCompileTime,DerType::ColsAtCompileTime> > Real;
+  typedef typename internal::remove_all<DerType>::type DerTypeCleaned;
+  typedef AutoDiffScalar<Matrix<typename NumTraits<typename DerTypeCleaned::Scalar>::Real,DerTypeCleaned::RowsAtCompileTime,DerTypeCleaned::ColsAtCompileTime,
+                                0, DerTypeCleaned::MaxRowsAtCompileTime, DerTypeCleaned::MaxColsAtCompileTime> > Real;
   typedef AutoDiffScalar<DerType> NonInteger;
-  typedef AutoDiffScalar<DerType>& Nested;
+  typedef AutoDiffScalar<DerType> Nested;
+  typedef typename NumTraits<typename DerTypeCleaned::Scalar>::Literal Literal;
   enum{
     RequireInitialization = 1
   };
diff --git a/unsupported/Eigen/src/AutoDiff/CMakeLists.txt b/unsupported/Eigen/src/AutoDiff/CMakeLists.txt
deleted file mode 100644
index ad91fd9c4..000000000
--- a/unsupported/Eigen/src/AutoDiff/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_AutoDiff_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_AutoDiff_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/AutoDiff COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/src/BVH/CMakeLists.txt b/unsupported/Eigen/src/BVH/CMakeLists.txt
deleted file mode 100644
index b377d865c..000000000
--- a/unsupported/Eigen/src/BVH/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_BVH_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_BVH_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/BVH COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/src/CMakeLists.txt b/unsupported/Eigen/src/CMakeLists.txt
deleted file mode 100644
index 125c43fdf..000000000
--- a/unsupported/Eigen/src/CMakeLists.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-ADD_SUBDIRECTORY(AutoDiff)
-ADD_SUBDIRECTORY(BVH)
-ADD_SUBDIRECTORY(FFT)
-ADD_SUBDIRECTORY(IterativeSolvers)
-ADD_SUBDIRECTORY(KroneckerProduct)
-ADD_SUBDIRECTORY(LevenbergMarquardt)
-ADD_SUBDIRECTORY(MatrixFunctions)
-ADD_SUBDIRECTORY(MoreVectorization)
-ADD_SUBDIRECTORY(NonLinearOptimization)
-ADD_SUBDIRECTORY(NumericalDiff)
-ADD_SUBDIRECTORY(Polynomials)
-ADD_SUBDIRECTORY(Skyline)
-ADD_SUBDIRECTORY(SparseExtra)
-ADD_SUBDIRECTORY(Splines)
diff --git a/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h b/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h
index 3b6a69aff..866a8a460 100644
--- a/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h
+++ b/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h
@@ -628,15 +628,15 @@ ArpackGeneralizedSelfAdjointEigenSolver<MatrixType, MatrixSolver, BisSPD>&
       m_info = Success;
     }
 
-    delete select;
+    delete[] select;
   }
 
-  delete v;
-  delete iparam;
-  delete ipntr;
-  delete workd;
-  delete workl;
-  delete resid;
+  delete[] v;
+  delete[] iparam;
+  delete[] ipntr;
+  delete[] workd;
+  delete[] workl;
+  delete[] resid;
 
   m_isInitialized = true;
 
diff --git a/unsupported/Eigen/src/EulerAngles/CMakeLists.txt b/unsupported/Eigen/src/EulerAngles/CMakeLists.txt
new file mode 100644
index 000000000..40af550e8
--- /dev/null
+++ b/unsupported/Eigen/src/EulerAngles/CMakeLists.txt
@@ -0,0 +1,6 @@
+FILE(GLOB Eigen_EulerAngles_SRCS "*.h")
+
+INSTALL(FILES
+  ${Eigen_EulerAngles_SRCS}
+  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/EulerAngles COMPONENT Devel
+  )
diff --git a/unsupported/Eigen/src/EulerAngles/EulerAngles.h b/unsupported/Eigen/src/EulerAngles/EulerAngles.h
new file mode 100644
index 000000000..13a0da1ab
--- /dev/null
+++ b/unsupported/Eigen/src/EulerAngles/EulerAngles.h
@@ -0,0 +1,386 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Tal Hadad <tal_hd@hotmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_EULERANGLESCLASS_H// TODO: Fix previous "EIGEN_EULERANGLES_H" definition?
+#define EIGEN_EULERANGLESCLASS_H
+
+namespace Eigen
+{
+  /*template<typename Other,
+         int OtherRows=Other::RowsAtCompileTime,
+         int OtherCols=Other::ColsAtCompileTime>
+  struct ei_eulerangles_assign_impl;*/
+
+  /** \class EulerAngles
+    *
+    * \ingroup EulerAngles_Module
+    *
+    * \brief Represents a rotation in a 3 dimensional space as three Euler angles.
+    *
+    * Euler rotation is a set of three rotation of three angles over three fixed axes, defined by the EulerSystem given as a template parameter.
+    * 
+    * Here is how intrinsic Euler angles works:
+    *  - first, rotate the axes system over the alpha axis in angle alpha
+    *  - then, rotate the axes system over the beta axis(which was rotated in the first stage) in angle beta
+    *  - then, rotate the axes system over the gamma axis(which was rotated in the two stages above) in angle gamma
+    *
+    * \note This class support only intrinsic Euler angles for simplicity,
+    *  see EulerSystem how to easily overcome this for extrinsic systems.
+    *
+    * ### Rotation representation and conversions ###
+    *
+    * It has been proved(see Wikipedia link below) that every rotation can be represented
+    *  by Euler angles, but there is no singular representation (e.g. unlike rotation matrices).
+    * Therefore, you can convert from Eigen rotation and to them
+    *  (including rotation matrices, which is not called "rotations" by Eigen design).
+    *
+    * Euler angles usually used for:
+    *  - convenient human representation of rotation, especially in interactive GUI.
+    *  - gimbal systems and robotics
+    *  - efficient encoding(i.e. 3 floats only) of rotation for network protocols.
+    *
+    * However, Euler angles are slow comparing to quaternion or matrices,
+    *  because their unnatural math definition, although it's simple for human.
+    * To overcome this, this class provide easy movement from the math friendly representation
+    *  to the human friendly representation, and vise-versa.
+    *
+    * All the user need to do is a safe simple C++ type conversion,
+    *  and this class take care for the math.
+    * Additionally, some axes related computation is done in compile time.
+    *
+    * #### Euler angles ranges in conversions ####
+    *
+    * When converting some rotation to Euler angles, there are some ways you can guarantee
+    *  the Euler angles ranges.
+    *
+    * #### implicit ranges ####
+    * When using implicit ranges, all angles are guarantee to be in the range [-PI, +PI],
+    *  unless you convert from some other Euler angles.
+    * In this case, the range is __undefined__ (might be even less than -PI or greater than +2*PI).
+    * \sa EulerAngles(const MatrixBase<Derived>&)
+    * \sa EulerAngles(const RotationBase<Derived, 3>&)
+    *
+    * #### explicit ranges ####
+    * When using explicit ranges, all angles are guarantee to be in the range you choose.
+    * In the range Boolean parameter, you're been ask whether you prefer the positive range or not:
+    * - _true_ - force the range between [0, +2*PI]
+    * - _false_ - force the range between [-PI, +PI]
+    *
+    * ##### compile time ranges #####
+    * This is when you have compile time ranges and you prefer to
+    *  use template parameter. (e.g. for performance)
+    * \sa FromRotation()
+    *
+    * ##### run-time time ranges #####
+    * Run-time ranges are also supported.
+    * \sa EulerAngles(const MatrixBase<Derived>&, bool, bool, bool)
+    * \sa EulerAngles(const RotationBase<Derived, 3>&, bool, bool, bool)
+    *
+    * ### Convenient user typedefs ###
+    *
+    * Convenient typedefs for EulerAngles exist for float and double scalar,
+    *  in a form of EulerAngles{A}{B}{C}{scalar},
+    *  e.g. \ref EulerAnglesXYZd, \ref EulerAnglesZYZf.
+    *
+    * Only for positive axes{+x,+y,+z} Euler systems are have convenient typedef.
+    * If you need negative axes{-x,-y,-z}, it is recommended to create you own typedef with
+    *  a word that represent what you need.
+    *
+    * ### Example ###
+    *
+    * \include EulerAngles.cpp
+    * Output: \verbinclude EulerAngles.out
+    *
+    * ### Additional reading ###
+    *
+    * If you're want to get more idea about how Euler system work in Eigen see EulerSystem.
+    *
+    * More information about Euler angles: https://en.wikipedia.org/wiki/Euler_angles
+    *
+    * \tparam _Scalar the scalar type, i.e., the type of the angles.
+    *
+    * \tparam _System the EulerSystem to use, which represents the axes of rotation.
+    */
+  template <typename _Scalar, class _System>
+  class EulerAngles : public RotationBase<EulerAngles<_Scalar, _System>, 3>
+  {
+    public:
+      /** the scalar type of the angles */
+      typedef _Scalar Scalar;
+      
+      /** the EulerSystem to use, which represents the axes of rotation. */
+      typedef _System System;
+    
+      typedef Matrix<Scalar,3,3> Matrix3; /*!< the equivalent rotation matrix type */
+      typedef Matrix<Scalar,3,1> Vector3; /*!< the equivalent 3 dimension vector type */
+      typedef Quaternion<Scalar> QuaternionType; /*!< the equivalent quaternion type */
+      typedef AngleAxis<Scalar> AngleAxisType; /*!< the equivalent angle-axis type */
+      
+      /** \returns the axis vector of the first (alpha) rotation */
+      static Vector3 AlphaAxisVector() {
+        const Vector3& u = Vector3::Unit(System::AlphaAxisAbs - 1);
+        return System::IsAlphaOpposite ? -u : u;
+      }
+      
+      /** \returns the axis vector of the second (beta) rotation */
+      static Vector3 BetaAxisVector() {
+        const Vector3& u = Vector3::Unit(System::BetaAxisAbs - 1);
+        return System::IsBetaOpposite ? -u : u;
+      }
+      
+      /** \returns the axis vector of the third (gamma) rotation */
+      static Vector3 GammaAxisVector() {
+        const Vector3& u = Vector3::Unit(System::GammaAxisAbs - 1);
+        return System::IsGammaOpposite ? -u : u;
+      }
+
+    private:
+      Vector3 m_angles;
+
+    public:
+      /** Default constructor without initialization. */
+      EulerAngles() {}
+      /** Constructs and initialize Euler angles(\p alpha, \p beta, \p gamma). */
+      EulerAngles(const Scalar& alpha, const Scalar& beta, const Scalar& gamma) :
+        m_angles(alpha, beta, gamma) {}
+      
+      /** Constructs and initialize Euler angles from a 3x3 rotation matrix \p m.
+        *
+        * \note All angles will be in the range [-PI, PI].
+      */
+      template<typename Derived>
+      EulerAngles(const MatrixBase<Derived>& m) { *this = m; }
+      
+      /** Constructs and initialize Euler angles from a 3x3 rotation matrix \p m,
+        *  with options to choose for each angle the requested range.
+        *
+        * If positive range is true, then the specified angle will be in the range [0, +2*PI].
+        * Otherwise, the specified angle will be in the range [-PI, +PI].
+        *
+        * \param m The 3x3 rotation matrix to convert
+        * \param positiveRangeAlpha If true, alpha will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+        * \param positiveRangeBeta If true, beta will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+        * \param positiveRangeGamma If true, gamma will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+      */
+      template<typename Derived>
+      EulerAngles(
+        const MatrixBase<Derived>& m,
+        bool positiveRangeAlpha,
+        bool positiveRangeBeta,
+        bool positiveRangeGamma) {
+        
+        System::CalcEulerAngles(*this, m, positiveRangeAlpha, positiveRangeBeta, positiveRangeGamma);
+      }
+      
+      /** Constructs and initialize Euler angles from a rotation \p rot.
+        *
+        * \note All angles will be in the range [-PI, PI], unless \p rot is an EulerAngles.
+        *  If rot is an EulerAngles, expected EulerAngles range is __undefined__.
+        *  (Use other functions here for enforcing range if this effect is desired)
+      */
+      template<typename Derived>
+      EulerAngles(const RotationBase<Derived, 3>& rot) { *this = rot; }
+      
+      /** Constructs and initialize Euler angles from a rotation \p rot,
+        *  with options to choose for each angle the requested range.
+        *
+        * If positive range is true, then the specified angle will be in the range [0, +2*PI].
+        * Otherwise, the specified angle will be in the range [-PI, +PI].
+        *
+        * \param rot The 3x3 rotation matrix to convert
+        * \param positiveRangeAlpha If true, alpha will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+        * \param positiveRangeBeta If true, beta will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+        * \param positiveRangeGamma If true, gamma will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+      */
+      template<typename Derived>
+      EulerAngles(
+        const RotationBase<Derived, 3>& rot,
+        bool positiveRangeAlpha,
+        bool positiveRangeBeta,
+        bool positiveRangeGamma) {
+        
+        System::CalcEulerAngles(*this, rot.toRotationMatrix(), positiveRangeAlpha, positiveRangeBeta, positiveRangeGamma);
+      }
+
+      /** \returns The angle values stored in a vector (alpha, beta, gamma). */
+      const Vector3& angles() const { return m_angles; }
+      /** \returns A read-write reference to the angle values stored in a vector (alpha, beta, gamma). */
+      Vector3& angles() { return m_angles; }
+
+      /** \returns The value of the first angle. */
+      Scalar alpha() const { return m_angles[0]; }
+      /** \returns A read-write reference to the angle of the first angle. */
+      Scalar& alpha() { return m_angles[0]; }
+
+      /** \returns The value of the second angle. */
+      Scalar beta() const { return m_angles[1]; }
+      /** \returns A read-write reference to the angle of the second angle. */
+      Scalar& beta() { return m_angles[1]; }
+
+      /** \returns The value of the third angle. */
+      Scalar gamma() const { return m_angles[2]; }
+      /** \returns A read-write reference to the angle of the third angle. */
+      Scalar& gamma() { return m_angles[2]; }
+
+      /** \returns The Euler angles rotation inverse (which is as same as the negative),
+        *  (-alpha, -beta, -gamma).
+      */
+      EulerAngles inverse() const
+      {
+        EulerAngles res;
+        res.m_angles = -m_angles;
+        return res;
+      }
+
+      /** \returns The Euler angles rotation negative (which is as same as the inverse),
+        *  (-alpha, -beta, -gamma).
+      */
+      EulerAngles operator -() const
+      {
+        return inverse();
+      }
+      
+      /** Constructs and initialize Euler angles from a 3x3 rotation matrix \p m,
+        *  with options to choose for each angle the requested range (__only in compile time__).
+        *
+        * If positive range is true, then the specified angle will be in the range [0, +2*PI].
+        * Otherwise, the specified angle will be in the range [-PI, +PI].
+        *
+        * \param m The 3x3 rotation matrix to convert
+        * \tparam positiveRangeAlpha If true, alpha will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+        * \tparam positiveRangeBeta If true, beta will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+        * \tparam positiveRangeGamma If true, gamma will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+        */
+      template<
+        bool PositiveRangeAlpha,
+        bool PositiveRangeBeta,
+        bool PositiveRangeGamma,
+        typename Derived>
+      static EulerAngles FromRotation(const MatrixBase<Derived>& m)
+      {
+        EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(Derived, 3, 3)
+        
+        EulerAngles e;
+        System::template CalcEulerAngles<
+          PositiveRangeAlpha, PositiveRangeBeta, PositiveRangeGamma, _Scalar>(e, m);
+        return e;
+      }
+      
+      /** Constructs and initialize Euler angles from a rotation \p rot,
+        *  with options to choose for each angle the requested range (__only in compile time__).
+        *
+        * If positive range is true, then the specified angle will be in the range [0, +2*PI].
+        * Otherwise, the specified angle will be in the range [-PI, +PI].
+        *
+        * \param rot The 3x3 rotation matrix to convert
+        * \tparam positiveRangeAlpha If true, alpha will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+        * \tparam positiveRangeBeta If true, beta will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+        * \tparam positiveRangeGamma If true, gamma will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+      */
+      template<
+        bool PositiveRangeAlpha,
+        bool PositiveRangeBeta,
+        bool PositiveRangeGamma,
+        typename Derived>
+      static EulerAngles FromRotation(const RotationBase<Derived, 3>& rot)
+      {
+        return FromRotation<PositiveRangeAlpha, PositiveRangeBeta, PositiveRangeGamma>(rot.toRotationMatrix());
+      }
+      
+      /*EulerAngles& fromQuaternion(const QuaternionType& q)
+      {
+        // TODO: Implement it in a faster way for quaternions
+        // According to http://www.euclideanspace.com/maths/geometry/rotations/conversions/quaternionToEuler/
+        //  we can compute only the needed matrix cells and then convert to euler angles. (see ZYX example below)
+        // Currently we compute all matrix cells from quaternion.
+
+        // Special case only for ZYX
+        //Scalar y2 = q.y() * q.y();
+        //m_angles[0] = std::atan2(2*(q.w()*q.z() + q.x()*q.y()), (1 - 2*(y2 + q.z()*q.z())));
+        //m_angles[1] = std::asin( 2*(q.w()*q.y() - q.z()*q.x()));
+        //m_angles[2] = std::atan2(2*(q.w()*q.x() + q.y()*q.z()), (1 - 2*(q.x()*q.x() + y2)));
+      }*/
+      
+      /** Set \c *this from a rotation matrix(i.e. pure orthogonal matrix with determinant of +1). */
+      template<typename Derived>
+      EulerAngles& operator=(const MatrixBase<Derived>& m) {
+        EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(Derived, 3, 3)
+        
+        System::CalcEulerAngles(*this, m);
+        return *this;
+      }
+
+      // TODO: Assign and construct from another EulerAngles (with different system)
+      
+      /** Set \c *this from a rotation. */
+      template<typename Derived>
+      EulerAngles& operator=(const RotationBase<Derived, 3>& rot) {
+        System::CalcEulerAngles(*this, rot.toRotationMatrix());
+        return *this;
+      }
+      
+      // TODO: Support isApprox function
+
+      /** \returns an equivalent 3x3 rotation matrix. */
+      Matrix3 toRotationMatrix() const
+      {
+        return static_cast<QuaternionType>(*this).toRotationMatrix();
+      }
+
+      /** Convert the Euler angles to quaternion. */
+      operator QuaternionType() const
+      {
+        return
+          AngleAxisType(alpha(), AlphaAxisVector()) *
+          AngleAxisType(beta(), BetaAxisVector())   *
+          AngleAxisType(gamma(), GammaAxisVector());
+      }
+      
+      friend std::ostream& operator<<(std::ostream& s, const EulerAngles<Scalar, System>& eulerAngles)
+      {
+        s << eulerAngles.angles().transpose();
+        return s;
+      }
+  };
+
+#define EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(AXES, SCALAR_TYPE, SCALAR_POSTFIX) \
+  /** \ingroup EulerAngles_Module */ \
+  typedef EulerAngles<SCALAR_TYPE, EulerSystem##AXES> EulerAngles##AXES##SCALAR_POSTFIX;
+
+#define EIGEN_EULER_ANGLES_TYPEDEFS(SCALAR_TYPE, SCALAR_POSTFIX) \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(XYZ, SCALAR_TYPE, SCALAR_POSTFIX) \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(XYX, SCALAR_TYPE, SCALAR_POSTFIX) \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(XZY, SCALAR_TYPE, SCALAR_POSTFIX) \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(XZX, SCALAR_TYPE, SCALAR_POSTFIX) \
+ \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(YZX, SCALAR_TYPE, SCALAR_POSTFIX) \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(YZY, SCALAR_TYPE, SCALAR_POSTFIX) \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(YXZ, SCALAR_TYPE, SCALAR_POSTFIX) \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(YXY, SCALAR_TYPE, SCALAR_POSTFIX) \
+ \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(ZXY, SCALAR_TYPE, SCALAR_POSTFIX) \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(ZXZ, SCALAR_TYPE, SCALAR_POSTFIX) \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(ZYX, SCALAR_TYPE, SCALAR_POSTFIX) \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(ZYZ, SCALAR_TYPE, SCALAR_POSTFIX)
+
+EIGEN_EULER_ANGLES_TYPEDEFS(float, f)
+EIGEN_EULER_ANGLES_TYPEDEFS(double, d)
+
+  namespace internal
+  {
+    template<typename _Scalar, class _System>
+    struct traits<EulerAngles<_Scalar, _System> >
+    {
+      typedef _Scalar Scalar;
+    };
+  }
+  
+}
+
+#endif // EIGEN_EULERANGLESCLASS_H
diff --git a/unsupported/Eigen/src/EulerAngles/EulerSystem.h b/unsupported/Eigen/src/EulerAngles/EulerSystem.h
new file mode 100644
index 000000000..98f9f647d
--- /dev/null
+++ b/unsupported/Eigen/src/EulerAngles/EulerSystem.h
@@ -0,0 +1,326 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Tal Hadad <tal_hd@hotmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_EULERSYSTEM_H
+#define EIGEN_EULERSYSTEM_H
+
+namespace Eigen
+{
+  // Forward declerations
+  template <typename _Scalar, class _System>
+  class EulerAngles;
+  
+  namespace internal
+  {
+    // TODO: Check if already exists on the rest API
+    template <int Num, bool IsPositive = (Num > 0)>
+    struct Abs
+    {
+      enum { value = Num };
+    };
+  
+    template <int Num>
+    struct Abs<Num, false>
+    {
+      enum { value = -Num };
+    };
+
+    template <int Axis>
+    struct IsValidAxis
+    {
+      enum { value = Axis != 0 && Abs<Axis>::value <= 3 };
+    };
+  }
+  
+  #define EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT(COND,MSG) typedef char static_assertion_##MSG[(COND)?1:-1]
+  
+  /** \brief Representation of a fixed signed rotation axis for EulerSystem.
+    *
+    * \ingroup EulerAngles_Module
+    *
+    * Values here represent:
+    *  - The axis of the rotation: X, Y or Z.
+    *  - The sign (i.e. direction of the rotation along the axis): positive(+) or negative(-)
+    *
+    * Therefore, this could express all the axes {+X,+Y,+Z,-X,-Y,-Z}
+    *
+    * For positive axis, use +EULER_{axis}, and for negative axis use -EULER_{axis}.
+    */
+  enum EulerAxis
+  {
+    EULER_X = 1, /*!< the X axis */
+    EULER_Y = 2, /*!< the Y axis */
+    EULER_Z = 3  /*!< the Z axis */
+  };
+  
+  /** \class EulerSystem
+    *
+    * \ingroup EulerAngles_Module
+    *
+    * \brief Represents a fixed Euler rotation system.
+    *
+    * This meta-class goal is to represent the Euler system in compilation time, for EulerAngles.
+    *
+    * You can use this class to get two things:
+    *  - Build an Euler system, and then pass it as a template parameter to EulerAngles.
+    *  - Query some compile time data about an Euler system. (e.g. Whether it's tait bryan)
+    *
+    * Euler rotation is a set of three rotation on fixed axes. (see \ref EulerAngles)
+    * This meta-class store constantly those signed axes. (see \ref EulerAxis)
+    *
+    * ### Types of Euler systems ###
+    *
+    * All and only valid 3 dimension Euler rotation over standard
+    *  signed axes{+X,+Y,+Z,-X,-Y,-Z} are supported:
+    *  - all axes X, Y, Z in each valid order (see below what order is valid)
+    *  - rotation over the axis is supported both over the positive and negative directions.
+    *  - both tait bryan and proper/classic Euler angles (i.e. the opposite).
+    *
+    * Since EulerSystem support both positive and negative directions,
+    *  you may call this rotation distinction in other names:
+    *  - _right handed_ or _left handed_
+    *  - _counterclockwise_ or _clockwise_
+    *
+    * Notice all axed combination are valid, and would trigger a static assertion.
+    * Same unsigned axes can't be neighbors, e.g. {X,X,Y} is invalid.
+    * This yield two and only two classes:
+    *  - _tait bryan_ - all unsigned axes are distinct, e.g. {X,Y,Z}
+    *  - _proper/classic Euler angles_ - The first and the third unsigned axes is equal,
+    *     and the second is different, e.g. {X,Y,X}
+    *
+    * ### Intrinsic vs extrinsic Euler systems ###
+    *
+    * Only intrinsic Euler systems are supported for simplicity.
+    *  If you want to use extrinsic Euler systems,
+    *   just use the equal intrinsic opposite order for axes and angles.
+    *  I.e axes (A,B,C) becomes (C,B,A), and angles (a,b,c) becomes (c,b,a).
+    *
+    * ### Convenient user typedefs ###
+    *
+    * Convenient typedefs for EulerSystem exist (only for positive axes Euler systems),
+    *  in a form of EulerSystem{A}{B}{C}, e.g. \ref EulerSystemXYZ.
+    *
+    * ### Additional reading ###
+    *
+    * More information about Euler angles: https://en.wikipedia.org/wiki/Euler_angles
+    *
+    * \tparam _AlphaAxis the first fixed EulerAxis
+    *
+    * \tparam _AlphaAxis the second fixed EulerAxis
+    *
+    * \tparam _AlphaAxis the third fixed EulerAxis
+    */
+  template <int _AlphaAxis, int _BetaAxis, int _GammaAxis>
+  class EulerSystem
+  {
+    public:
+    // It's defined this way and not as enum, because I think
+    //  that enum is not guerantee to support negative numbers
+    
+    /** The first rotation axis */
+    static const int AlphaAxis = _AlphaAxis;
+    
+    /** The second rotation axis */
+    static const int BetaAxis = _BetaAxis;
+    
+    /** The third rotation axis */
+    static const int GammaAxis = _GammaAxis;
+
+    enum
+    {
+      AlphaAxisAbs = internal::Abs<AlphaAxis>::value, /*!< the first rotation axis unsigned */
+      BetaAxisAbs = internal::Abs<BetaAxis>::value, /*!< the second rotation axis unsigned */
+      GammaAxisAbs = internal::Abs<GammaAxis>::value, /*!< the third rotation axis unsigned */
+      
+      IsAlphaOpposite = (AlphaAxis < 0) ? 1 : 0, /*!< weather alpha axis is negative */
+      IsBetaOpposite = (BetaAxis < 0) ? 1 : 0, /*!< weather beta axis is negative */
+      IsGammaOpposite = (GammaAxis < 0) ? 1 : 0, /*!< weather gamma axis is negative */
+      
+      IsOdd = ((AlphaAxisAbs)%3 == (BetaAxisAbs - 1)%3) ? 0 : 1, /*!< weather the Euler system is odd */
+      IsEven = IsOdd ? 0 : 1, /*!< weather the Euler system is even */
+
+      IsTaitBryan = ((unsigned)AlphaAxisAbs != (unsigned)GammaAxisAbs) ? 1 : 0 /*!< weather the Euler system is tait bryan */
+    };
+    
+    private:
+    
+    EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT(internal::IsValidAxis<AlphaAxis>::value,
+      ALPHA_AXIS_IS_INVALID);
+      
+    EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT(internal::IsValidAxis<BetaAxis>::value,
+      BETA_AXIS_IS_INVALID);
+      
+    EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT(internal::IsValidAxis<GammaAxis>::value,
+      GAMMA_AXIS_IS_INVALID);
+      
+    EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT((unsigned)AlphaAxisAbs != (unsigned)BetaAxisAbs,
+      ALPHA_AXIS_CANT_BE_EQUAL_TO_BETA_AXIS);
+      
+    EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT((unsigned)BetaAxisAbs != (unsigned)GammaAxisAbs,
+      BETA_AXIS_CANT_BE_EQUAL_TO_GAMMA_AXIS);
+
+    enum
+    {
+      // I, J, K are the pivot indexes permutation for the rotation matrix, that match this Euler system. 
+      // They are used in this class converters.
+      // They are always different from each other, and their possible values are: 0, 1, or 2.
+      I = AlphaAxisAbs - 1,
+      J = (AlphaAxisAbs - 1 + 1 + IsOdd)%3,
+      K = (AlphaAxisAbs - 1 + 2 - IsOdd)%3
+    };
+    
+    // TODO: Get @mat parameter in form that avoids double evaluation.
+    template <typename Derived>
+    static void CalcEulerAngles_imp(Matrix<typename MatrixBase<Derived>::Scalar, 3, 1>& res, const MatrixBase<Derived>& mat, internal::true_type /*isTaitBryan*/)
+    {
+      using std::atan2;
+      using std::sin;
+      using std::cos;
+      
+      typedef typename Derived::Scalar Scalar;
+      typedef Matrix<Scalar,2,1> Vector2;
+      
+      res[0] = atan2(mat(J,K), mat(K,K));
+      Scalar c2 = Vector2(mat(I,I), mat(I,J)).norm();
+      if((IsOdd && res[0]<Scalar(0)) || ((!IsOdd) && res[0]>Scalar(0))) {
+        if(res[0] > Scalar(0)) {
+          res[0] -= Scalar(EIGEN_PI);
+        }
+        else {
+          res[0] += Scalar(EIGEN_PI);
+        }
+        res[1] = atan2(-mat(I,K), -c2);
+      }
+      else
+        res[1] = atan2(-mat(I,K), c2);
+      Scalar s1 = sin(res[0]);
+      Scalar c1 = cos(res[0]);
+      res[2] = atan2(s1*mat(K,I)-c1*mat(J,I), c1*mat(J,J) - s1 * mat(K,J));
+    }
+
+    template <typename Derived>
+    static void CalcEulerAngles_imp(Matrix<typename MatrixBase<Derived>::Scalar,3,1>& res, const MatrixBase<Derived>& mat, internal::false_type /*isTaitBryan*/)
+    {
+      using std::atan2;
+      using std::sin;
+      using std::cos;
+
+      typedef typename Derived::Scalar Scalar;
+      typedef Matrix<Scalar,2,1> Vector2;
+      
+      res[0] = atan2(mat(J,I), mat(K,I));
+      if((IsOdd && res[0]<Scalar(0)) || ((!IsOdd) && res[0]>Scalar(0)))
+      {
+        if(res[0] > Scalar(0)) {
+          res[0] -= Scalar(EIGEN_PI);
+        }
+        else {
+          res[0] += Scalar(EIGEN_PI);
+        }
+        Scalar s2 = Vector2(mat(J,I), mat(K,I)).norm();
+        res[1] = -atan2(s2, mat(I,I));
+      }
+      else
+      {
+        Scalar s2 = Vector2(mat(J,I), mat(K,I)).norm();
+        res[1] = atan2(s2, mat(I,I));
+      }
+
+      // With a=(0,1,0), we have i=0; j=1; k=2, and after computing the first two angles,
+      // we can compute their respective rotation, and apply its inverse to M. Since the result must
+      // be a rotation around x, we have:
+      //
+      //  c2  s1.s2 c1.s2                   1  0   0 
+      //  0   c1    -s1       *    M    =   0  c3  s3
+      //  -s2 s1.c2 c1.c2                   0 -s3  c3
+      //
+      //  Thus:  m11.c1 - m21.s1 = c3  &   m12.c1 - m22.s1 = s3
+
+      Scalar s1 = sin(res[0]);
+      Scalar c1 = cos(res[0]);
+      res[2] = atan2(c1*mat(J,K)-s1*mat(K,K), c1*mat(J,J) - s1 * mat(K,J));
+    }
+    
+    template<typename Scalar>
+    static void CalcEulerAngles(
+      EulerAngles<Scalar, EulerSystem>& res,
+      const typename EulerAngles<Scalar, EulerSystem>::Matrix3& mat)
+    {
+      CalcEulerAngles(res, mat, false, false, false);
+    }
+    
+    template<
+      bool PositiveRangeAlpha,
+      bool PositiveRangeBeta,
+      bool PositiveRangeGamma,
+      typename Scalar>
+    static void CalcEulerAngles(
+      EulerAngles<Scalar, EulerSystem>& res,
+      const typename EulerAngles<Scalar, EulerSystem>::Matrix3& mat)
+    {
+      CalcEulerAngles(res, mat, PositiveRangeAlpha, PositiveRangeBeta, PositiveRangeGamma);
+    }
+    
+    template<typename Scalar>
+    static void CalcEulerAngles(
+      EulerAngles<Scalar, EulerSystem>& res,
+      const typename EulerAngles<Scalar, EulerSystem>::Matrix3& mat,
+      bool PositiveRangeAlpha,
+      bool PositiveRangeBeta,
+      bool PositiveRangeGamma)
+    {
+      CalcEulerAngles_imp(
+        res.angles(), mat,
+        typename internal::conditional<IsTaitBryan, internal::true_type, internal::false_type>::type());
+
+      if (IsAlphaOpposite == IsOdd)
+        res.alpha() = -res.alpha();
+        
+      if (IsBetaOpposite == IsOdd)
+        res.beta() = -res.beta();
+        
+      if (IsGammaOpposite == IsOdd)
+        res.gamma() = -res.gamma();
+      
+      // Saturate results to the requested range
+      if (PositiveRangeAlpha && (res.alpha() < 0))
+        res.alpha() += Scalar(2 * EIGEN_PI);
+      
+      if (PositiveRangeBeta && (res.beta() < 0))
+        res.beta() += Scalar(2 * EIGEN_PI);
+      
+      if (PositiveRangeGamma && (res.gamma() < 0))
+        res.gamma() += Scalar(2 * EIGEN_PI);
+    }
+    
+    template <typename _Scalar, class _System>
+    friend class Eigen::EulerAngles;
+  };
+
+#define EIGEN_EULER_SYSTEM_TYPEDEF(A, B, C) \
+  /** \ingroup EulerAngles_Module */ \
+  typedef EulerSystem<EULER_##A, EULER_##B, EULER_##C> EulerSystem##A##B##C;
+  
+  EIGEN_EULER_SYSTEM_TYPEDEF(X,Y,Z)
+  EIGEN_EULER_SYSTEM_TYPEDEF(X,Y,X)
+  EIGEN_EULER_SYSTEM_TYPEDEF(X,Z,Y)
+  EIGEN_EULER_SYSTEM_TYPEDEF(X,Z,X)
+  
+  EIGEN_EULER_SYSTEM_TYPEDEF(Y,Z,X)
+  EIGEN_EULER_SYSTEM_TYPEDEF(Y,Z,Y)
+  EIGEN_EULER_SYSTEM_TYPEDEF(Y,X,Z)
+  EIGEN_EULER_SYSTEM_TYPEDEF(Y,X,Y)
+  
+  EIGEN_EULER_SYSTEM_TYPEDEF(Z,X,Y)
+  EIGEN_EULER_SYSTEM_TYPEDEF(Z,X,Z)
+  EIGEN_EULER_SYSTEM_TYPEDEF(Z,Y,X)
+  EIGEN_EULER_SYSTEM_TYPEDEF(Z,Y,Z)
+}
+
+#endif // EIGEN_EULERSYSTEM_H
diff --git a/unsupported/Eigen/src/FFT/CMakeLists.txt b/unsupported/Eigen/src/FFT/CMakeLists.txt
deleted file mode 100644
index edcffcb18..000000000
--- a/unsupported/Eigen/src/FFT/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_FFT_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_FFT_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/FFT COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/src/IterativeSolvers/CMakeLists.txt b/unsupported/Eigen/src/IterativeSolvers/CMakeLists.txt
deleted file mode 100644
index 7986afc5e..000000000
--- a/unsupported/Eigen/src/IterativeSolvers/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_IterativeSolvers_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_IterativeSolvers_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/IterativeSolvers COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/src/IterativeSolvers/DGMRES.h b/unsupported/Eigen/src/IterativeSolvers/DGMRES.h
index 9fcc8a8d9..bae04fc30 100644
--- a/unsupported/Eigen/src/IterativeSolvers/DGMRES.h
+++ b/unsupported/Eigen/src/IterativeSolvers/DGMRES.h
@@ -40,7 +40,6 @@ void sortWithPermutation (VectorType& vec, IndexType& perm, typename IndexType::
 {
   eigen_assert(vec.size() == perm.size());
   typedef typename IndexType::Scalar Index; 
-  typedef typename VectorType::Scalar Scalar; 
   bool flag; 
   for (Index k  = 0; k < ncut; k++)
   {
@@ -84,6 +83,8 @@ void sortWithPermutation (VectorType& vec, IndexType& perm, typename IndexType::
  * x = solver.solve(b);
  * \endcode
  * 
+ * DGMRES can also be used in a matrix-free context, see the following \link MatrixfreeSolverExample example \endlink.
+ *
  * References :
  * [1] D. NUENTSA WAKAM and F. PACULL, Memory Efficient Hybrid
  *  Algebraic Solvers for Linear Systems Arising from Compressible
@@ -101,16 +102,18 @@ template< typename _MatrixType, typename _Preconditioner>
 class DGMRES : public IterativeSolverBase<DGMRES<_MatrixType,_Preconditioner> >
 {
     typedef IterativeSolverBase<DGMRES> Base;
-    using Base::mp_matrix;
+    using Base::matrix;
     using Base::m_error;
     using Base::m_iterations;
     using Base::m_info;
     using Base::m_isInitialized;
     using Base::m_tolerance; 
   public:
+    using Base::_solve_impl;
     typedef _MatrixType MatrixType;
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::Index Index;
+    typedef typename MatrixType::StorageIndex StorageIndex;
     typedef typename MatrixType::RealScalar RealScalar;
     typedef _Preconditioner Preconditioner;
     typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix; 
@@ -133,30 +136,14 @@ class DGMRES : public IterativeSolverBase<DGMRES<_MatrixType,_Preconditioner> >
     * this class becomes invalid. Call compute() to update it with the new
     * matrix A, or modify a copy of A.
     */
-  DGMRES(const MatrixType& A) : Base(A),m_restart(30),m_neig(0),m_r(0),m_maxNeig(5),m_isDeflAllocated(false),m_isDeflInitialized(false)
-  {}
+  template<typename MatrixDerived>
+  explicit DGMRES(const EigenBase<MatrixDerived>& A) : Base(A.derived()), m_restart(30),m_neig(0),m_r(0),m_maxNeig(5),m_isDeflAllocated(false),m_isDeflInitialized(false) {}
 
   ~DGMRES() {}
   
-  /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A
-    * \a x0 as an initial solution.
-    *
-    * \sa compute()
-    */
-  template<typename Rhs,typename Guess>
-  inline const internal::solve_retval_with_guess<DGMRES, Rhs, Guess>
-  solveWithGuess(const MatrixBase<Rhs>& b, const Guess& x0) const
-  {
-    eigen_assert(m_isInitialized && "DGMRES is not initialized.");
-    eigen_assert(Base::rows()==b.rows()
-              && "DGMRES::solve(): invalid number of rows of the right hand side matrix b");
-    return internal::solve_retval_with_guess
-            <DGMRES, Rhs, Guess>(*this, b.derived(), x0);
-  }
-  
   /** \internal */
   template<typename Rhs,typename Dest>
-  void _solveWithGuess(const Rhs& b, Dest& x) const
+  void _solve_with_guess_impl(const Rhs& b, Dest& x) const
   {    
     bool failed = false;
     for(int j=0; j<b.cols(); ++j)
@@ -165,7 +152,7 @@ class DGMRES : public IterativeSolverBase<DGMRES<_MatrixType,_Preconditioner> >
       m_error = Base::m_tolerance;
       
       typename Dest::ColXpr xj(x,j);
-      dgmres(*mp_matrix, b.col(j), xj, Base::m_preconditioner);
+      dgmres(matrix(), b.col(j), xj, Base::m_preconditioner);
     }
     m_info = failed ? NumericalIssue
            : m_error <= Base::m_tolerance ? Success
@@ -175,10 +162,10 @@ class DGMRES : public IterativeSolverBase<DGMRES<_MatrixType,_Preconditioner> >
 
   /** \internal */
   template<typename Rhs,typename Dest>
-  void _solve(const Rhs& b, Dest& x) const
+  void _solve_impl(const Rhs& b, MatrixBase<Dest>& x) const
   {
     x = b;
-    _solveWithGuess(b,x);
+    _solve_with_guess_impl(b,x.derived());
   }
   /** 
    * Get the restart value
@@ -217,7 +204,7 @@ class DGMRES : public IterativeSolverBase<DGMRES<_MatrixType,_Preconditioner> >
     template<typename Dest>
     int dgmresCycle(const MatrixType& mat, const Preconditioner& precond, Dest& x, DenseVector& r0, RealScalar& beta, const RealScalar& normRhs, int& nbIts) const; 
     // Compute data to use for deflation 
-    int dgmresComputeDeflationData(const MatrixType& mat, const Preconditioner& precond, const Index& it, Index& neig) const;
+    int dgmresComputeDeflationData(const MatrixType& mat, const Preconditioner& precond, const Index& it, StorageIndex& neig) const;
     // Apply deflation to a vector
     template<typename RhsType, typename DestType>
     int dgmresApplyDeflation(const RhsType& In, DestType& Out) const; 
@@ -233,7 +220,7 @@ class DGMRES : public IterativeSolverBase<DGMRES<_MatrixType,_Preconditioner> >
     mutable DenseMatrix m_MU; // matrix operator applied to m_U (for next cycles)
     mutable DenseMatrix m_T; /* T=U^T*M^{-1}*A*U */
     mutable PartialPivLU<DenseMatrix> m_luT; // LU factorization of m_T
-    mutable int m_neig; //Number of eigenvalues to extract at each restart
+    mutable StorageIndex m_neig; //Number of eigenvalues to extract at each restart
     mutable int m_r; // Current number of deflated eigenvalues, size of m_U
     mutable int m_maxNeig; // Maximum number of eigenvalues to deflate
     mutable RealScalar m_lambdaN; //Modulus of the largest eigenvalue of A
@@ -353,7 +340,7 @@ int DGMRES<_MatrixType, _Preconditioner>::dgmresCycle(const MatrixType& mat, con
     
     beta = std::abs(g(it+1));
     m_error = beta/normRhs; 
-    std::cerr << nbIts << " Relative Residual Norm " << m_error << std::endl;
+    // std::cerr << nbIts << " Relative Residual Norm " << m_error << std::endl;
     it++; nbIts++; 
     
     if (m_error < m_tolerance)
@@ -431,7 +418,7 @@ inline typename DGMRES<_MatrixType, _Preconditioner>::ComplexVector DGMRES<_Matr
 }
 
 template< typename _MatrixType, typename _Preconditioner>
-int DGMRES<_MatrixType, _Preconditioner>::dgmresComputeDeflationData(const MatrixType& mat, const Preconditioner& precond, const Index& it, Index& neig) const
+int DGMRES<_MatrixType, _Preconditioner>::dgmresComputeDeflationData(const MatrixType& mat, const Preconditioner& precond, const Index& it, StorageIndex& neig) const
 {
   // First, find the Schur form of the Hessenberg matrix H
   typename internal::conditional<NumTraits<Scalar>::IsComplex, ComplexSchur<DenseMatrix>, RealSchur<DenseMatrix> >::type schurofH; 
@@ -441,7 +428,7 @@ int DGMRES<_MatrixType, _Preconditioner>::dgmresComputeDeflationData(const Matri
   schurofH.computeFromHessenberg(m_Hes.topLeftCorner(it,it), matrixQ, computeU); 
   
   ComplexVector eig(it);
-  Matrix<Index,Dynamic,1>perm(it); 
+  Matrix<StorageIndex,Dynamic,1>perm(it);
   eig = this->schurValues(schurofH);
   
   // Reorder the absolute values of Schur values
@@ -522,21 +509,5 @@ int DGMRES<_MatrixType, _Preconditioner>::dgmresApplyDeflation(const RhsType &x,
   return 0; 
 }
 
-namespace internal {
-
-  template<typename _MatrixType, typename _Preconditioner, typename Rhs>
-struct solve_retval<DGMRES<_MatrixType, _Preconditioner>, Rhs>
-  : solve_retval_base<DGMRES<_MatrixType, _Preconditioner>, Rhs>
-{
-  typedef DGMRES<_MatrixType, _Preconditioner> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
-  }
-};
-} // end namespace internal
-
 } // end namespace Eigen
 #endif 
diff --git a/unsupported/Eigen/src/IterativeSolvers/GMRES.h b/unsupported/Eigen/src/IterativeSolvers/GMRES.h
index 7ba13afd2..5a82b0df6 100644
--- a/unsupported/Eigen/src/IterativeSolvers/GMRES.h
+++ b/unsupported/Eigen/src/IterativeSolvers/GMRES.h
@@ -11,193 +11,197 @@
 #ifndef EIGEN_GMRES_H
 #define EIGEN_GMRES_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
 
 /**
- * Generalized Minimal Residual Algorithm based on the
- * Arnoldi algorithm implemented with Householder reflections.
- *
- * Parameters:
- *  \param mat       matrix of linear system of equations
- *  \param Rhs       right hand side vector of linear system of equations
- *  \param x         on input: initial guess, on output: solution
- *  \param precond   preconditioner used
- *  \param iters     on input: maximum number of iterations to perform
- *                   on output: number of iterations performed
- *  \param restart   number of iterations for a restart
- *  \param tol_error on input: residual tolerance
- *                   on output: residuum achieved
- *
- * \sa IterativeMethods::bicgstab() 
- *  
- *
- * For references, please see:
- *
- * Saad, Y. and Schultz, M. H.
- * GMRES: A Generalized Minimal Residual Algorithm for Solving Nonsymmetric Linear Systems.
- * SIAM J.Sci.Stat.Comp. 7, 1986, pp. 856 - 869.
- *
- * Saad, Y.
- * Iterative Methods for Sparse Linear Systems.
- * Society for Industrial and Applied Mathematics, Philadelphia, 2003.
- *
- * Walker, H. F.
- * Implementations of the GMRES method.
- * Comput.Phys.Comm. 53, 1989, pp. 311 - 320.
- *
- * Walker, H. F.
- * Implementation of the GMRES Method using Householder Transformations.
- * SIAM J.Sci.Stat.Comp. 9, 1988, pp. 152 - 163.
- *
- */
+* Generalized Minimal Residual Algorithm based on the
+* Arnoldi algorithm implemented with Householder reflections.
+*
+* Parameters:
+*  \param mat       matrix of linear system of equations
+*  \param Rhs       right hand side vector of linear system of equations
+*  \param x         on input: initial guess, on output: solution
+*  \param precond   preconditioner used
+*  \param iters     on input: maximum number of iterations to perform
+*                   on output: number of iterations performed
+*  \param restart   number of iterations for a restart
+*  \param tol_error on input: relative residual tolerance
+*                   on output: residuum achieved
+*
+* \sa IterativeMethods::bicgstab()
+*
+*
+* For references, please see:
+*
+* Saad, Y. and Schultz, M. H.
+* GMRES: A Generalized Minimal Residual Algorithm for Solving Nonsymmetric Linear Systems.
+* SIAM J.Sci.Stat.Comp. 7, 1986, pp. 856 - 869.
+*
+* Saad, Y.
+* Iterative Methods for Sparse Linear Systems.
+* Society for Industrial and Applied Mathematics, Philadelphia, 2003.
+*
+* Walker, H. F.
+* Implementations of the GMRES method.
+* Comput.Phys.Comm. 53, 1989, pp. 311 - 320.
+*
+* Walker, H. F.
+* Implementation of the GMRES Method using Householder Transformations.
+* SIAM J.Sci.Stat.Comp. 9, 1988, pp. 152 - 163.
+*
+*/
 template<typename MatrixType, typename Rhs, typename Dest, typename Preconditioner>
 bool gmres(const MatrixType & mat, const Rhs & rhs, Dest & x, const Preconditioner & precond,
-		int &iters, const int &restart, typename Dest::RealScalar & tol_error) {
+    Index &iters, const Index &restart, typename Dest::RealScalar & tol_error) {
 
-	using std::sqrt;
-	using std::abs;
+  using std::sqrt;
+  using std::abs;
 
-	typedef typename Dest::RealScalar RealScalar;
-	typedef typename Dest::Scalar Scalar;
-	typedef Matrix < Scalar, Dynamic, 1 > VectorType;
-	typedef Matrix < Scalar, Dynamic, Dynamic > FMatrixType;
+  typedef typename Dest::RealScalar RealScalar;
+  typedef typename Dest::Scalar Scalar;
+  typedef Matrix < Scalar, Dynamic, 1 > VectorType;
+  typedef Matrix < Scalar, Dynamic, Dynamic, ColMajor> FMatrixType;
 
-	RealScalar tol = tol_error;
-	const int maxIters = iters;
-	iters = 0;
+  RealScalar tol = tol_error;
+  const Index maxIters = iters;
+  iters = 0;
 
-	const int m = mat.rows();
+  const Index m = mat.rows();
 
-	VectorType p0 = rhs - mat*x;
-	VectorType r0 = precond.solve(p0);
- 
-	// is initial guess already good enough?
-	if(abs(r0.norm()) < tol) {
-		return true; 
-	}
+  // residual and preconditioned residual
+  VectorType p0 = rhs - mat*x;
+  VectorType r0 = precond.solve(p0);
 
-	VectorType w = VectorType::Zero(restart + 1);
+  const RealScalar r0Norm = r0.norm();
 
-	FMatrixType H = FMatrixType::Zero(m, restart + 1); // Hessenberg matrix
-	VectorType tau = VectorType::Zero(restart + 1);
-	std::vector < JacobiRotation < Scalar > > G(restart);
-
-	// generate first Householder vector
-	VectorType e(m-1);
-	RealScalar beta;
-	r0.makeHouseholder(e, tau.coeffRef(0), beta);
-	w(0)=(Scalar) beta;
-	H.bottomLeftCorner(m - 1, 1) = e;
-
-	for (int k = 1; k <= restart; ++k) {
-
-		++iters;
-
-		VectorType v = VectorType::Unit(m, k - 1), workspace(m);
-
-		// apply Householder reflections H_{1} ... H_{k-1} to v
-		for (int i = k - 1; i >= 0; --i) {
-			v.tail(m - i).applyHouseholderOnTheLeft(H.col(i).tail(m - i - 1), tau.coeffRef(i), workspace.data());
-		}
-
-		// apply matrix M to v:  v = mat * v;
-		VectorType t=mat*v;
-		v=precond.solve(t);
-
-		// apply Householder reflections H_{k-1} ... H_{1} to v
-		for (int i = 0; i < k; ++i) {
-			v.tail(m - i).applyHouseholderOnTheLeft(H.col(i).tail(m - i - 1), tau.coeffRef(i), workspace.data());
-		}
-
-		if (v.tail(m - k).norm() != 0.0) {
-
-			if (k <= restart) {
-
-				// generate new Householder vector
-                                  VectorType e(m - k - 1);
-				RealScalar beta;
-				v.tail(m - k).makeHouseholder(e, tau.coeffRef(k), beta);
-				H.col(k).tail(m - k - 1) = e;
-
-				// apply Householder reflection H_{k} to v
-				v.tail(m - k).applyHouseholderOnTheLeft(H.col(k).tail(m - k - 1), tau.coeffRef(k), workspace.data());
-
-			}
-                }
-
-                if (k > 1) {
-                        for (int i = 0; i < k - 1; ++i) {
-                                // apply old Givens rotations to v
-                                v.applyOnTheLeft(i, i + 1, G[i].adjoint());
-                        }
-                }
-
-                if (k<m && v(k) != (Scalar) 0) {
-                        // determine next Givens rotation
-                        G[k - 1].makeGivens(v(k - 1), v(k));
-
-                        // apply Givens rotation to v and w
-                        v.applyOnTheLeft(k - 1, k, G[k - 1].adjoint());
-                        w.applyOnTheLeft(k - 1, k, G[k - 1].adjoint());
-
-                }
-
-                // insert coefficients into upper matrix triangle
-                H.col(k - 1).head(k) = v.head(k);
-
-                bool stop=(k==m || abs(w(k)) < tol || iters == maxIters);
+  // is initial guess already good enough?
+  if(r0Norm == 0)
+  {
+    tol_error = 0;
+    return true;
+  }
 
-                if (stop || k == restart) {
+  // storage for Hessenberg matrix and Householder data
+  FMatrixType H   = FMatrixType::Zero(m, restart + 1);
+  VectorType w    = VectorType::Zero(restart + 1);
+  VectorType tau  = VectorType::Zero(restart + 1);
 
-                        // solve upper triangular system
-                        VectorType y = w.head(k);
-                        H.topLeftCorner(k, k).template triangularView < Eigen::Upper > ().solveInPlace(y);
+  // storage for Jacobi rotations
+  std::vector < JacobiRotation < Scalar > > G(restart);
+  
+  // storage for temporaries
+  VectorType t(m), v(m), workspace(m), x_new(m);
+
+  // generate first Householder vector
+  Ref<VectorType> H0_tail = H.col(0).tail(m - 1);
+  RealScalar beta;
+  r0.makeHouseholder(H0_tail, tau.coeffRef(0), beta);
+  w(0) = Scalar(beta);
+  
+  for (Index k = 1; k <= restart; ++k)
+  {
+    ++iters;
 
-                        // use Horner-like scheme to calculate solution vector
-                        VectorType x_new = y(k - 1) * VectorType::Unit(m, k - 1);
+    v = VectorType::Unit(m, k - 1);
 
-                        // apply Householder reflection H_{k} to x_new
-                        x_new.tail(m - k + 1).applyHouseholderOnTheLeft(H.col(k - 1).tail(m - k), tau.coeffRef(k - 1), workspace.data());
+    // apply Householder reflections H_{1} ... H_{k-1} to v
+    // TODO: use a HouseholderSequence
+    for (Index i = k - 1; i >= 0; --i) {
+      v.tail(m - i).applyHouseholderOnTheLeft(H.col(i).tail(m - i - 1), tau.coeffRef(i), workspace.data());
+    }
 
-                        for (int i = k - 2; i >= 0; --i) {
-                                x_new += y(i) * VectorType::Unit(m, i);
-                                // apply Householder reflection H_{i} to x_new
-                                x_new.tail(m - i).applyHouseholderOnTheLeft(H.col(i).tail(m - i - 1), tau.coeffRef(i), workspace.data());
-                        }
+    // apply matrix M to v:  v = mat * v;
+    t.noalias() = mat * v;
+    v = precond.solve(t);
 
-                        x += x_new;
+    // apply Householder reflections H_{k-1} ... H_{1} to v
+    // TODO: use a HouseholderSequence
+    for (Index i = 0; i < k; ++i) {
+      v.tail(m - i).applyHouseholderOnTheLeft(H.col(i).tail(m - i - 1), tau.coeffRef(i), workspace.data());
+    }
 
-                        if (stop) {
-                                return true;
-                        } else {
-                                k=0;
+    if (v.tail(m - k).norm() != 0.0)
+    {
+      if (k <= restart)
+      {
+        // generate new Householder vector
+        Ref<VectorType> Hk_tail = H.col(k).tail(m - k - 1);
+        v.tail(m - k).makeHouseholder(Hk_tail, tau.coeffRef(k), beta);
+
+        // apply Householder reflection H_{k} to v
+        v.tail(m - k).applyHouseholderOnTheLeft(Hk_tail, tau.coeffRef(k), workspace.data());
+      }
+    }
 
-                                // reset data for a restart  r0 = rhs - mat * x;
-                                VectorType p0=mat*x;
-                                VectorType p1=precond.solve(p0);
-                                r0 = rhs - p1;
-//                                 r0_sqnorm = r0.squaredNorm();
-                                w = VectorType::Zero(restart + 1);
-                                H = FMatrixType::Zero(m, restart + 1);
-                                tau = VectorType::Zero(restart + 1);
+    if (k > 1)
+    {
+      for (Index i = 0; i < k - 1; ++i)
+      {
+        // apply old Givens rotations to v
+        v.applyOnTheLeft(i, i + 1, G[i].adjoint());
+      }
+    }
 
-                                // generate first Householder vector
-                                RealScalar beta;
-                                r0.makeHouseholder(e, tau.coeffRef(0), beta);
-                                w(0)=(Scalar) beta;
-                                H.bottomLeftCorner(m - 1, 1) = e;
+    if (k<m && v(k) != (Scalar) 0)
+    {
+      // determine next Givens rotation
+      G[k - 1].makeGivens(v(k - 1), v(k));
 
-                        }
+      // apply Givens rotation to v and w
+      v.applyOnTheLeft(k - 1, k, G[k - 1].adjoint());
+      w.applyOnTheLeft(k - 1, k, G[k - 1].adjoint());
+    }
 
-                }
+    // insert coefficients into upper matrix triangle
+    H.col(k-1).head(k) = v.head(k);
 
+    tol_error = abs(w(k)) / r0Norm;
+    bool stop = (k==m || tol_error < tol || iters == maxIters);
 
+    if (stop || k == restart)
+    {
+      // solve upper triangular system
+      Ref<VectorType> y = w.head(k);
+      H.topLeftCorner(k, k).template triangularView <Upper>().solveInPlace(y);
+
+      // use Horner-like scheme to calculate solution vector
+      x_new.setZero();
+      for (Index i = k - 1; i >= 0; --i)
+      {
+        x_new(i) += y(i);
+        // apply Householder reflection H_{i} to x_new
+        x_new.tail(m - i).applyHouseholderOnTheLeft(H.col(i).tail(m - i - 1), tau.coeffRef(i), workspace.data());
+      }
+
+      x += x_new;
+
+      if(stop)
+      {
+        return true;
+      }
+      else
+      {
+        k=0;
+
+        // reset data for restart
+        p0.noalias() = rhs - mat*x;
+        r0 = precond.solve(p0);
+
+        // clear Hessenberg matrix and Householder data
+        H.setZero();
+        w.setZero();
+        tau.setZero();
+
+        // generate first Householder vector
+        r0.makeHouseholder(H0_tail, tau.coeffRef(0), beta);
+        w(0) = Scalar(beta);
+      }
+    }
+  }
 
-	}
-	
-	return false;
+  return false;
 
 }
 
@@ -230,7 +234,7 @@ struct traits<GMRES<_MatrixType,_Preconditioner> >
   * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations()
   * and setTolerance() methods. The defaults are the size of the problem for the maximal number of iterations
   * and NumTraits<Scalar>::epsilon() for the tolerance.
-  * 
+  *
   * This class can be used as the direct solver classes. Here is a typical usage example:
   * \code
   * int n = 10000;
@@ -244,29 +248,31 @@ struct traits<GMRES<_MatrixType,_Preconditioner> >
   * // update b, and solve again
   * x = solver.solve(b);
   * \endcode
-  * 
+  *
   * By default the iterations start with x=0 as an initial guess of the solution.
   * One can control the start using the solveWithGuess() method.
   * 
+  * GMRES can also be used in a matrix-free context, see the following \link MatrixfreeSolverExample example \endlink.
+  *
   * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner
   */
 template< typename _MatrixType, typename _Preconditioner>
 class GMRES : public IterativeSolverBase<GMRES<_MatrixType,_Preconditioner> >
 {
   typedef IterativeSolverBase<GMRES> Base;
-  using Base::mp_matrix;
+  using Base::matrix;
   using Base::m_error;
   using Base::m_iterations;
   using Base::m_info;
   using Base::m_isInitialized;
- 
+
 private:
-  int m_restart;
-  
+  Index m_restart;
+
 public:
+  using Base::_solve_impl;
   typedef _MatrixType MatrixType;
   typedef typename MatrixType::Scalar Scalar;
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::RealScalar RealScalar;
   typedef _Preconditioner Preconditioner;
 
@@ -276,95 +282,62 @@ public:
   GMRES() : Base(), m_restart(30) {}
 
   /** Initialize the solver with matrix \a A for further \c Ax=b solving.
-    * 
+    *
     * This constructor is a shortcut for the default constructor followed
     * by a call to compute().
-    * 
+    *
     * \warning this class stores a reference to the matrix A as well as some
     * precomputed values that depend on it. Therefore, if \a A is changed
     * this class becomes invalid. Call compute() to update it with the new
     * matrix A, or modify a copy of A.
     */
-  GMRES(const MatrixType& A) : Base(A), m_restart(30) {}
+  template<typename MatrixDerived>
+  explicit GMRES(const EigenBase<MatrixDerived>& A) : Base(A.derived()), m_restart(30) {}
 
   ~GMRES() {}
-  
+
   /** Get the number of iterations after that a restart is performed.
     */
-  int get_restart() { return m_restart; }
-  
+  Index get_restart() { return m_restart; }
+
   /** Set the number of iterations after that a restart is performed.
     *  \param restart   number of iterations for a restarti, default is 30.
     */
-  void set_restart(const int restart) { m_restart=restart; }
-  
-  /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A
-    * \a x0 as an initial solution.
-    *
-    * \sa compute()
-    */
-  template<typename Rhs,typename Guess>
-  inline const internal::solve_retval_with_guess<GMRES, Rhs, Guess>
-  solveWithGuess(const MatrixBase<Rhs>& b, const Guess& x0) const
-  {
-    eigen_assert(m_isInitialized && "GMRES is not initialized.");
-    eigen_assert(Base::rows()==b.rows()
-              && "GMRES::solve(): invalid number of rows of the right hand side matrix b");
-    return internal::solve_retval_with_guess
-            <GMRES, Rhs, Guess>(*this, b.derived(), x0);
-  }
-  
+  void set_restart(const Index restart) { m_restart=restart; }
+
   /** \internal */
   template<typename Rhs,typename Dest>
-  void _solveWithGuess(const Rhs& b, Dest& x) const
-  {    
+  void _solve_with_guess_impl(const Rhs& b, Dest& x) const
+  {
     bool failed = false;
-    for(int j=0; j<b.cols(); ++j)
+    for(Index j=0; j<b.cols(); ++j)
     {
       m_iterations = Base::maxIterations();
       m_error = Base::m_tolerance;
-      
+
       typename Dest::ColXpr xj(x,j);
-      if(!internal::gmres(*mp_matrix, b.col(j), xj, Base::m_preconditioner, m_iterations, m_restart, m_error))
+      if(!internal::gmres(matrix(), b.col(j), xj, Base::m_preconditioner, m_iterations, m_restart, m_error))
         failed = true;
     }
     m_info = failed ? NumericalIssue
-           : m_error <= Base::m_tolerance ? Success
-           : NoConvergence;
+          : m_error <= Base::m_tolerance ? Success
+          : NoConvergence;
     m_isInitialized = true;
   }
 
   /** \internal */
   template<typename Rhs,typename Dest>
-  void _solve(const Rhs& b, Dest& x) const
+  void _solve_impl(const Rhs& b, MatrixBase<Dest> &x) const
   {
     x = b;
     if(x.squaredNorm() == 0) return; // Check Zero right hand side
-    _solveWithGuess(b,x);
+    _solve_with_guess_impl(b,x.derived());
   }
 
 protected:
 
 };
 
-
-namespace internal {
-
-  template<typename _MatrixType, typename _Preconditioner, typename Rhs>
-struct solve_retval<GMRES<_MatrixType, _Preconditioner>, Rhs>
-  : solve_retval_base<GMRES<_MatrixType, _Preconditioner>, Rhs>
-{
-  typedef GMRES<_MatrixType, _Preconditioner> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
-  }
-};
-
-} // end namespace internal
-
 } // end namespace Eigen
 
 #endif // EIGEN_GMRES_H
diff --git a/unsupported/Eigen/src/IterativeSolvers/IncompleteCholesky.h b/unsupported/Eigen/src/IterativeSolvers/IncompleteCholesky.h
deleted file mode 100644
index 661c1f2e0..000000000
--- a/unsupported/Eigen/src/IterativeSolvers/IncompleteCholesky.h
+++ /dev/null
@@ -1,278 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_INCOMPLETE_CHOlESKY_H
-#define EIGEN_INCOMPLETE_CHOlESKY_H
-#include "Eigen/src/IterativeLinearSolvers/IncompleteLUT.h" 
-#include <Eigen/OrderingMethods>
-#include <list>
-
-namespace Eigen {  
-/** 
- * \brief Modified Incomplete Cholesky with dual threshold
- * 
- * References : C-J. Lin and J. J. Moré, Incomplete Cholesky Factorizations with
- *              Limited memory, SIAM J. Sci. Comput.  21(1), pp. 24-45, 1999
- * 
- * \tparam _MatrixType The type of the sparse matrix. It should be a symmetric 
- *                     matrix. It is advised to give  a row-oriented sparse matrix 
- * \tparam _UpLo The triangular part of the matrix to reference. 
- * \tparam _OrderingType 
- */
-
-template <typename Scalar, int _UpLo = Lower, typename _OrderingType = NaturalOrdering<int> >
-class IncompleteCholesky : internal::noncopyable
-{
-  public:
-    typedef SparseMatrix<Scalar,ColMajor> MatrixType;
-    typedef _OrderingType OrderingType;
-    typedef typename MatrixType::RealScalar RealScalar; 
-    typedef typename MatrixType::Index Index; 
-    typedef PermutationMatrix<Dynamic, Dynamic, Index> PermutationType;
-    typedef Matrix<Scalar,Dynamic,1> ScalarType; 
-    typedef Matrix<Index,Dynamic, 1> IndexType;
-    typedef std::vector<std::list<Index> > VectorList; 
-    enum { UpLo = _UpLo };
-  public:
-    IncompleteCholesky() : m_shift(1),m_factorizationIsOk(false) {}
-    IncompleteCholesky(const MatrixType& matrix) : m_shift(1),m_factorizationIsOk(false)
-    {
-      compute(matrix);
-    }
-    
-    Index rows() const { return m_L.rows(); }
-    
-    Index cols() const { return m_L.cols(); }
-    
-
-    /** \brief Reports whether previous computation was successful.
-      *
-      * \returns \c Success if computation was succesful,
-      *          \c NumericalIssue if the matrix appears to be negative.
-      */
-    ComputationInfo info() const
-    {
-      eigen_assert(m_isInitialized && "IncompleteLLT is not initialized.");
-      return m_info;
-    }
-    
-    /** 
-     * \brief Set the initial shift parameter
-     */
-    void setShift( Scalar shift) { m_shift = shift; }
-    
-    /**
-    * \brief Computes the fill reducing permutation vector. 
-    */
-    template<typename MatrixType>
-    void analyzePattern(const MatrixType& mat)
-    {
-      OrderingType ord; 
-      ord(mat.template selfadjointView<UpLo>(), m_perm); 
-      m_analysisIsOk = true; 
-    }
-    
-    template<typename MatrixType>
-    void factorize(const MatrixType& amat);
-    
-    template<typename MatrixType>
-    void compute (const MatrixType& matrix)
-    {
-      analyzePattern(matrix); 
-      factorize(matrix);
-    }
-    
-    template<typename Rhs, typename Dest>
-    void _solve(const Rhs& b, Dest& x) const
-    {
-      eigen_assert(m_factorizationIsOk && "factorize() should be called first");
-      if (m_perm.rows() == b.rows())
-        x = m_perm.inverse() * b; 
-      else 
-        x = b; 
-      x = m_scal.asDiagonal() * x;
-      x = m_L.template triangularView<UnitLower>().solve(x); 
-      x = m_L.adjoint().template triangularView<Upper>().solve(x); 
-      if (m_perm.rows() == b.rows())
-        x = m_perm * x;
-      x = m_scal.asDiagonal() * x;
-    }
-    template<typename Rhs> inline const internal::solve_retval<IncompleteCholesky, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_factorizationIsOk && "IncompleteLLT did not succeed");
-      eigen_assert(m_isInitialized && "IncompleteLLT is not initialized.");
-      eigen_assert(cols()==b.rows()
-                && "IncompleteLLT::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::solve_retval<IncompleteCholesky, Rhs>(*this, b.derived());
-    }
-  protected:
-    SparseMatrix<Scalar,ColMajor> m_L;  // The lower part stored in CSC
-    ScalarType m_scal; // The vector for scaling the matrix 
-    Scalar m_shift; //The initial shift parameter
-    bool m_analysisIsOk; 
-    bool m_factorizationIsOk; 
-    bool m_isInitialized;
-    ComputationInfo m_info;
-    PermutationType m_perm; 
-    
-  private:
-    template <typename IdxType, typename SclType>
-    inline void updateList(const IdxType& colPtr, IdxType& rowIdx, SclType& vals, const Index& col, const Index& jk, IndexType& firstElt, VectorList& listCol); 
-}; 
-
-template<typename Scalar, int _UpLo, typename OrderingType>
-template<typename _MatrixType>
-void IncompleteCholesky<Scalar,_UpLo, OrderingType>::factorize(const _MatrixType& mat)
-{
-  using std::sqrt;
-  using std::min;
-  eigen_assert(m_analysisIsOk && "analyzePattern() should be called first"); 
-    
-  // Dropping strategies : Keep only the p largest elements per column, where p is the number of elements in the column of the original matrix. Other strategies will be added
-  
-  // Apply the fill-reducing permutation computed in analyzePattern()
-  if (m_perm.rows() == mat.rows() ) // To detect the null permutation
-    m_L.template selfadjointView<Lower>() = mat.template selfadjointView<_UpLo>().twistedBy(m_perm);
-  else
-    m_L.template selfadjointView<Lower>() = mat.template selfadjointView<_UpLo>();
-  
-  Index n = m_L.cols(); 
-  Index nnz = m_L.nonZeros();
-  Map<ScalarType> vals(m_L.valuePtr(), nnz); //values
-  Map<IndexType> rowIdx(m_L.innerIndexPtr(), nnz);  //Row indices
-  Map<IndexType> colPtr( m_L.outerIndexPtr(), n+1); // Pointer to the beginning of each row
-  IndexType firstElt(n-1); // for each j, points to the next entry in vals that will be used in the factorization
-  VectorList listCol(n); // listCol(j) is a linked list of columns to update column j
-  ScalarType curCol(n); // Store a  nonzero values in each column
-  IndexType irow(n); // Row indices of nonzero elements in each column
-  
-  
-  // Computes the scaling factors 
-  m_scal.resize(n);
-  for (int j = 0; j < n; j++)
-  {
-    m_scal(j) = m_L.col(j).norm();
-    m_scal(j) = sqrt(m_scal(j));
-  }
-  // Scale and compute the shift for the matrix 
-  Scalar mindiag = vals[0];
-  for (int j = 0; j < n; j++){
-    for (int k = colPtr[j]; k < colPtr[j+1]; k++)
-     vals[k] /= (m_scal(j) * m_scal(rowIdx[k]));
-    mindiag = (min)(vals[colPtr[j]], mindiag);
-  }
-  
-  if(mindiag < Scalar(0.)) m_shift = m_shift - mindiag;
-  // Apply the shift to the diagonal elements of the matrix
-  for (int j = 0; j < n; j++)
-    vals[colPtr[j]] += m_shift;
-  // jki version of the Cholesky factorization 
-  for (int j=0; j < n; ++j)
-  {  
-    //Left-looking factorize the column j 
-    // First, load the jth column into curCol 
-    Scalar diag = vals[colPtr[j]];  // It is assumed that only the lower part is stored
-    curCol.setZero();
-    irow.setLinSpaced(n,0,n-1); 
-    for (int i = colPtr[j] + 1; i < colPtr[j+1]; i++)
-    {
-      curCol(rowIdx[i]) = vals[i]; 
-      irow(rowIdx[i]) = rowIdx[i]; 
-    }
-    std::list<int>::iterator k; 
-    // Browse all previous columns that will update column j
-    for(k = listCol[j].begin(); k != listCol[j].end(); k++) 
-    {
-      int jk = firstElt(*k); // First element to use in the column 
-      jk += 1; 
-      for (int i = jk; i < colPtr[*k+1]; i++)
-      {
-        curCol(rowIdx[i]) -= vals[i] * vals[jk] ;
-      }
-      updateList(colPtr,rowIdx,vals, *k, jk, firstElt, listCol);
-    }
-    
-    // Scale the current column
-    if(RealScalar(diag) <= 0) 
-    {
-      std::cerr << "\nNegative diagonal during Incomplete factorization... "<< j << "\n";
-      m_info = NumericalIssue; 
-      return; 
-    }
-    RealScalar rdiag = sqrt(RealScalar(diag));
-    vals[colPtr[j]] = rdiag;
-    for (int i = j+1; i < n; i++)
-    {
-      //Scale 
-      curCol(i) /= rdiag;
-      //Update the remaining diagonals with curCol
-      vals[colPtr[i]] -= curCol(i) * curCol(i);
-    }
-    // Select the largest p elements
-    //  p is the original number of elements in the column (without the diagonal)
-    int p = colPtr[j+1] - colPtr[j] - 1 ; 
-    internal::QuickSplit(curCol, irow, p); 
-    // Insert the largest p elements in the matrix
-    int cpt = 0; 
-    for (int i = colPtr[j]+1; i < colPtr[j+1]; i++)
-    {
-      vals[i] = curCol(cpt); 
-      rowIdx[i] = irow(cpt); 
-      cpt ++; 
-    }
-    // Get the first smallest row index and put it after the diagonal element
-    Index jk = colPtr(j)+1;
-    updateList(colPtr,rowIdx,vals,j,jk,firstElt,listCol); 
-  }
-  m_factorizationIsOk = true; 
-  m_isInitialized = true;
-  m_info = Success; 
-}
-
-template<typename Scalar, int _UpLo, typename OrderingType>
-template <typename IdxType, typename SclType>
-inline void IncompleteCholesky<Scalar,_UpLo, OrderingType>::updateList(const IdxType& colPtr, IdxType& rowIdx, SclType& vals, const Index& col, const Index& jk, IndexType& firstElt, VectorList& listCol)
-{
-  if (jk < colPtr(col+1) )
-  {
-    Index p = colPtr(col+1) - jk;
-    Index minpos; 
-    rowIdx.segment(jk,p).minCoeff(&minpos);
-    minpos += jk;
-    if (rowIdx(minpos) != rowIdx(jk))
-    {
-      //Swap
-      std::swap(rowIdx(jk),rowIdx(minpos));
-      std::swap(vals(jk),vals(minpos));
-    }
-    firstElt(col) = jk;
-    listCol[rowIdx(jk)].push_back(col);
-  }
-}
-namespace internal {
-
-template<typename _Scalar, int _UpLo, typename OrderingType, typename Rhs>
-struct solve_retval<IncompleteCholesky<_Scalar,  _UpLo, OrderingType>, Rhs>
-  : solve_retval_base<IncompleteCholesky<_Scalar, _UpLo, OrderingType>, Rhs>
-{
-  typedef IncompleteCholesky<_Scalar, _UpLo, OrderingType> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
-  }
-};
-
-} // end namespace internal
-
-} // end namespace Eigen 
-
-#endif
diff --git a/unsupported/Eigen/src/IterativeSolvers/IncompleteLU.h b/unsupported/Eigen/src/IterativeSolvers/IncompleteLU.h
index 67e780181..7d08c3515 100644
--- a/unsupported/Eigen/src/IterativeSolvers/IncompleteLU.h
+++ b/unsupported/Eigen/src/IterativeSolvers/IncompleteLU.h
@@ -13,8 +13,12 @@
 namespace Eigen { 
 
 template <typename _Scalar>
-class IncompleteLU
+class IncompleteLU : public SparseSolverBase<IncompleteLU<_Scalar> >
 {
+  protected:
+    typedef SparseSolverBase<IncompleteLU<_Scalar> > Base;
+    using Base::m_isInitialized;
+    
     typedef _Scalar Scalar;
     typedef Matrix<Scalar,Dynamic,1> Vector;
     typedef typename Vector::Index Index;
@@ -23,10 +27,10 @@ class IncompleteLU
   public:
     typedef Matrix<Scalar,Dynamic,Dynamic> MatrixType;
 
-    IncompleteLU() : m_isInitialized(false) {}
+    IncompleteLU() {}
 
     template<typename MatrixType>
-    IncompleteLU(const MatrixType& mat) : m_isInitialized(false)
+    IncompleteLU(const MatrixType& mat)
     {
       compute(mat);
     }
@@ -71,43 +75,16 @@ class IncompleteLU
     }
 
     template<typename Rhs, typename Dest>
-    void _solve(const Rhs& b, Dest& x) const
+    void _solve_impl(const Rhs& b, Dest& x) const
     {
       x = m_lu.template triangularView<UnitLower>().solve(b);
       x = m_lu.template triangularView<Upper>().solve(x);
     }
 
-    template<typename Rhs> inline const internal::solve_retval<IncompleteLU, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "IncompleteLU is not initialized.");
-      eigen_assert(cols()==b.rows()
-                && "IncompleteLU::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::solve_retval<IncompleteLU, Rhs>(*this, b.derived());
-    }
-
   protected:
     FactorType m_lu;
-    bool m_isInitialized;
-};
-
-namespace internal {
-
-template<typename _MatrixType, typename Rhs>
-struct solve_retval<IncompleteLU<_MatrixType>, Rhs>
-  : solve_retval_base<IncompleteLU<_MatrixType>, Rhs>
-{
-  typedef IncompleteLU<_MatrixType> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
-  }
 };
 
-} // end namespace internal
-
 } // end namespace Eigen
 
 #endif // EIGEN_INCOMPLETE_LU_H
diff --git a/unsupported/Eigen/src/IterativeSolvers/MINRES.h b/unsupported/Eigen/src/IterativeSolvers/MINRES.h
index 30f26aa50..256990c1a 100644
--- a/unsupported/Eigen/src/IterativeSolvers/MINRES.h
+++ b/unsupported/Eigen/src/IterativeSolvers/MINRES.h
@@ -2,7 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2012 Giacomo Po <gpo@ucla.edu>
-// Copyright (C) 2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2011-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -29,7 +29,7 @@ namespace Eigen {
         template<typename MatrixType, typename Rhs, typename Dest, typename Preconditioner>
         EIGEN_DONT_INLINE
         void minres(const MatrixType& mat, const Rhs& rhs, Dest& x,
-                    const Preconditioner& precond, int& iters,
+                    const Preconditioner& precond, Index& iters,
                     typename Dest::RealScalar& tol_error)
         {
             using std::sqrt;
@@ -48,8 +48,8 @@ namespace Eigen {
             }
             
             // initialize
-            const int maxIters(iters);  // initialize maxIters to iters
-            const int N(mat.cols());    // the size of the matrix
+            const Index maxIters(iters);  // initialize maxIters to iters
+            const Index N(mat.cols());    // the size of the matrix
             const RealScalar threshold2(tol_error*tol_error*rhsNorm2); // convergence threshold (compared to residualNorm2)
             
             // Initialize preconditioned Lanczos
@@ -144,7 +144,6 @@ namespace Eigen {
     
     template< typename _MatrixType, int _UpLo=Lower,
     typename _Preconditioner = IdentityPreconditioner>
-//    typename _Preconditioner = IdentityPreconditioner<typename _MatrixType::Scalar> > // preconditioner must be positive definite
     class MINRES;
     
     namespace internal {
@@ -166,8 +165,8 @@ namespace Eigen {
      * The vectors x and b can be either dense or sparse.
      *
      * \tparam _MatrixType the type of the sparse matrix A, can be a dense or a sparse matrix.
-     * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower
-     *               or Upper. Default is Lower.
+     * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower,
+     *               Upper, or Lower|Upper in which the full matrix entries will be considered. Default is Lower.
      * \tparam _Preconditioner the type of the preconditioner. Default is DiagonalPreconditioner
      *
      * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations()
@@ -192,6 +191,8 @@ namespace Eigen {
      * By default the iterations start with x=0 as an initial guess of the solution.
      * One can control the start using the solveWithGuess() method.
      *
+     * MINRES can also be used in a matrix-free context, see the following \link MatrixfreeSolverExample example \endlink.
+     *
      * \sa class ConjugateGradient, BiCGSTAB, SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner
      */
     template< typename _MatrixType, int _UpLo, typename _Preconditioner>
@@ -199,15 +200,15 @@ namespace Eigen {
     {
         
         typedef IterativeSolverBase<MINRES> Base;
-        using Base::mp_matrix;
+        using Base::matrix;
         using Base::m_error;
         using Base::m_iterations;
         using Base::m_info;
         using Base::m_isInitialized;
     public:
+        using Base::_solve_impl;
         typedef _MatrixType MatrixType;
         typedef typename MatrixType::Scalar Scalar;
-        typedef typename MatrixType::Index Index;
         typedef typename MatrixType::RealScalar RealScalar;
         typedef _Preconditioner Preconditioner;
         
@@ -228,46 +229,41 @@ namespace Eigen {
          * this class becomes invalid. Call compute() to update it with the new
          * matrix A, or modify a copy of A.
          */
-        MINRES(const MatrixType& A) : Base(A) {}
+        template<typename MatrixDerived>
+        explicit MINRES(const EigenBase<MatrixDerived>& A) : Base(A.derived()) {}
         
         /** Destructor. */
         ~MINRES(){}
-		
-        /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A
-         * \a x0 as an initial solution.
-         *
-         * \sa compute()
-         */
-        template<typename Rhs,typename Guess>
-        inline const internal::solve_retval_with_guess<MINRES, Rhs, Guess>
-        solveWithGuess(const MatrixBase<Rhs>& b, const Guess& x0) const
-        {
-            eigen_assert(m_isInitialized && "MINRES is not initialized.");
-            eigen_assert(Base::rows()==b.rows()
-                         && "MINRES::solve(): invalid number of rows of the right hand side matrix b");
-            return internal::solve_retval_with_guess
-            <MINRES, Rhs, Guess>(*this, b.derived(), x0);
-        }
-        
+
         /** \internal */
         template<typename Rhs,typename Dest>
-        void _solveWithGuess(const Rhs& b, Dest& x) const
+        void _solve_with_guess_impl(const Rhs& b, Dest& x) const
         {
+            typedef typename Base::MatrixWrapper MatrixWrapper;
+            typedef typename Base::ActualMatrixType ActualMatrixType;
+            enum {
+              TransposeInput  =   (!MatrixWrapper::MatrixFree)
+                              &&  (UpLo==(Lower|Upper))
+                              &&  (!MatrixType::IsRowMajor)
+                              &&  (!NumTraits<Scalar>::IsComplex)
+            };
+            typedef typename internal::conditional<TransposeInput,Transpose<const ActualMatrixType>, ActualMatrixType const&>::type RowMajorWrapper;
+            EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(MatrixWrapper::MatrixFree,UpLo==(Lower|Upper)),MATRIX_FREE_CONJUGATE_GRADIENT_IS_COMPATIBLE_WITH_UPPER_UNION_LOWER_MODE_ONLY);
             typedef typename internal::conditional<UpLo==(Lower|Upper),
-                                                   const MatrixType&,
-                                                   SparseSelfAdjointView<const MatrixType, UpLo>
-                                                  >::type MatrixWrapperType;
-                                          
+                                                  RowMajorWrapper,
+                                                  typename MatrixWrapper::template ConstSelfAdjointViewReturnType<UpLo>::Type
+                                            >::type SelfAdjointWrapper;
+
             m_iterations = Base::maxIterations();
             m_error = Base::m_tolerance;
-            
+            RowMajorWrapper row_mat(matrix());
             for(int j=0; j<b.cols(); ++j)
             {
                 m_iterations = Base::maxIterations();
                 m_error = Base::m_tolerance;
                 
                 typename Dest::ColXpr xj(x,j);
-                internal::minres(MatrixWrapperType(*mp_matrix), b.col(j), xj,
+                internal::minres(SelfAdjointWrapper(row_mat), b.col(j), xj,
                                  Base::m_preconditioner, m_iterations, m_error);
             }
             
@@ -277,33 +273,16 @@ namespace Eigen {
         
         /** \internal */
         template<typename Rhs,typename Dest>
-        void _solve(const Rhs& b, Dest& x) const
+        void _solve_impl(const Rhs& b, MatrixBase<Dest> &x) const
         {
             x.setZero();
-            _solveWithGuess(b,x);
+            _solve_with_guess_impl(b,x.derived());
         }
         
     protected:
         
     };
-    
-    namespace internal {
-        
-        template<typename _MatrixType, int _UpLo, typename _Preconditioner, typename Rhs>
-        struct solve_retval<MINRES<_MatrixType,_UpLo,_Preconditioner>, Rhs>
-        : solve_retval_base<MINRES<_MatrixType,_UpLo,_Preconditioner>, Rhs>
-        {
-            typedef MINRES<_MatrixType,_UpLo,_Preconditioner> Dec;
-            EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-            
-            template<typename Dest> void evalTo(Dest& dst) const
-            {
-                dec()._solve(rhs(),dst);
-            }
-        };
-        
-    } // end namespace internal
-    
+
 } // end namespace Eigen
 
 #endif // EIGEN_MINRES_H
diff --git a/unsupported/Eigen/src/IterativeSolvers/Scaling.h b/unsupported/Eigen/src/IterativeSolvers/Scaling.h
index 4fd439202..d113e6e90 100644
--- a/unsupported/Eigen/src/IterativeSolvers/Scaling.h
+++ b/unsupported/Eigen/src/IterativeSolvers/Scaling.h
@@ -9,6 +9,9 @@
 
 #ifndef EIGEN_ITERSCALING_H
 #define EIGEN_ITERSCALING_H
+
+namespace Eigen {
+
 /**
   * \ingroup IterativeSolvers_Module
   * \brief iterative scaling algorithm to equilibrate rows and column norms in matrices
@@ -41,8 +44,6 @@
   * 
   * \sa \ref IncompleteLUT 
   */
-namespace Eigen {
-using std::abs; 
 template<typename _MatrixType>
 class IterScaling
 {
@@ -71,6 +72,7 @@ class IterScaling
      */
     void compute (const MatrixType& mat)
     {
+      using std::abs;
       int m = mat.rows(); 
       int n = mat.cols();
       eigen_assert((m>0 && m == n) && "Please give a non - empty matrix");
diff --git a/unsupported/Eigen/src/KroneckerProduct/CMakeLists.txt b/unsupported/Eigen/src/KroneckerProduct/CMakeLists.txt
deleted file mode 100644
index 4daefebee..000000000
--- a/unsupported/Eigen/src/KroneckerProduct/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_KroneckerProduct_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_KroneckerProduct_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/KroneckerProduct COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h b/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h
index 532896c3b..582fa8512 100644
--- a/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h
+++ b/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h
@@ -12,58 +12,93 @@
 #ifndef KRONECKER_TENSOR_PRODUCT_H
 #define KRONECKER_TENSOR_PRODUCT_H
 
-namespace Eigen { 
-
-template<typename Scalar, int Options, typename Index> class SparseMatrix;
+namespace Eigen {
 
 /*!
- * \brief Kronecker tensor product helper class for dense matrices
+ * \ingroup KroneckerProduct_Module
  *
- * This class is the return value of kroneckerProduct(MatrixBase,
- * MatrixBase). Use the function rather than construct this class
- * directly to avoid specifying template prarameters.
+ * \brief The base class of dense and sparse Kronecker product.
  *
- * \tparam Lhs  Type of the left-hand side, a matrix expression.
- * \tparam Rhs  Type of the rignt-hand side, a matrix expression.
+ * \tparam Derived is the derived type.
  */
-template<typename Lhs, typename Rhs>
-class KroneckerProduct : public ReturnByValue<KroneckerProduct<Lhs,Rhs> >
+template<typename Derived>
+class KroneckerProductBase : public ReturnByValue<Derived>
 {
   private:
-    typedef ReturnByValue<KroneckerProduct> Base;
-    typedef typename Base::Scalar Scalar;
-    typedef typename Base::Index Index;
+    typedef typename internal::traits<Derived> Traits;
+    typedef typename Traits::Scalar Scalar;
+
+  protected:
+    typedef typename Traits::Lhs Lhs;
+    typedef typename Traits::Rhs Rhs;
 
   public:
     /*! \brief Constructor. */
-    KroneckerProduct(const Lhs& A, const Rhs& B)
+    KroneckerProductBase(const Lhs& A, const Rhs& B)
       : m_A(A), m_B(B)
     {}
 
-    /*! \brief Evaluate the Kronecker tensor product. */
-    template<typename Dest> void evalTo(Dest& dst) const;
-    
     inline Index rows() const { return m_A.rows() * m_B.rows(); }
     inline Index cols() const { return m_A.cols() * m_B.cols(); }
 
+    /*!
+     * This overrides ReturnByValue::coeff because this function is
+     * efficient enough.
+     */
     Scalar coeff(Index row, Index col) const
     {
       return m_A.coeff(row / m_B.rows(), col / m_B.cols()) *
              m_B.coeff(row % m_B.rows(), col % m_B.cols());
     }
 
+    /*!
+     * This overrides ReturnByValue::coeff because this function is
+     * efficient enough.
+     */
     Scalar coeff(Index i) const
     {
-      EIGEN_STATIC_ASSERT_VECTOR_ONLY(KroneckerProduct);
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
       return m_A.coeff(i / m_A.size()) * m_B.coeff(i % m_A.size());
     }
 
-  private:
+  protected:
     typename Lhs::Nested m_A;
     typename Rhs::Nested m_B;
 };
 
 /*!
+ * \ingroup KroneckerProduct_Module
+ *
+ * \brief Kronecker tensor product helper class for dense matrices
+ *
+ * This class is the return value of kroneckerProduct(MatrixBase,
+ * MatrixBase). Use the function rather than construct this class
+ * directly to avoid specifying template prarameters.
+ *
+ * \tparam Lhs  Type of the left-hand side, a matrix expression.
+ * \tparam Rhs  Type of the rignt-hand side, a matrix expression.
+ */
+template<typename Lhs, typename Rhs>
+class KroneckerProduct : public KroneckerProductBase<KroneckerProduct<Lhs,Rhs> >
+{
+  private:
+    typedef KroneckerProductBase<KroneckerProduct> Base;
+    using Base::m_A;
+    using Base::m_B;
+
+  public:
+    /*! \brief Constructor. */
+    KroneckerProduct(const Lhs& A, const Rhs& B)
+      : Base(A, B)
+    {}
+
+    /*! \brief Evaluate the Kronecker tensor product. */
+    template<typename Dest> void evalTo(Dest& dst) const;
+};
+
+/*!
+ * \ingroup KroneckerProduct_Module
+ *
  * \brief Kronecker tensor product helper class for sparse matrices
  *
  * If at least one of the operands is a sparse matrix expression,
@@ -77,34 +112,21 @@ class KroneckerProduct : public ReturnByValue<KroneckerProduct<Lhs,Rhs> >
  * \tparam Rhs  Type of the rignt-hand side, a matrix expression.
  */
 template<typename Lhs, typename Rhs>
-class KroneckerProductSparse : public EigenBase<KroneckerProductSparse<Lhs,Rhs> >
+class KroneckerProductSparse : public KroneckerProductBase<KroneckerProductSparse<Lhs,Rhs> >
 {
   private:
-    typedef typename internal::traits<KroneckerProductSparse>::Index Index;
+    typedef KroneckerProductBase<KroneckerProductSparse> Base;
+    using Base::m_A;
+    using Base::m_B;
 
   public:
     /*! \brief Constructor. */
     KroneckerProductSparse(const Lhs& A, const Rhs& B)
-      : m_A(A), m_B(B)
+      : Base(A, B)
     {}
 
     /*! \brief Evaluate the Kronecker tensor product. */
     template<typename Dest> void evalTo(Dest& dst) const;
-    
-    inline Index rows() const { return m_A.rows() * m_B.rows(); }
-    inline Index cols() const { return m_A.cols() * m_B.cols(); }
-
-    template<typename Scalar, int Options, typename Index>
-    operator SparseMatrix<Scalar, Options, Index>()
-    {
-      SparseMatrix<Scalar, Options, Index> result;
-      evalTo(result.derived());
-      return result;
-    }
-
-  private:
-    typename Lhs::Nested m_A;
-    typename Rhs::Nested m_B;
 };
 
 template<typename Lhs, typename Rhs>
@@ -124,22 +146,49 @@ template<typename Lhs, typename Rhs>
 template<typename Dest>
 void KroneckerProductSparse<Lhs,Rhs>::evalTo(Dest& dst) const
 {
-  const Index Br = m_B.rows(),
-              Bc = m_B.cols();
-  dst.resize(rows(),cols());
+  Index Br = m_B.rows(), Bc = m_B.cols();
+  dst.resize(this->rows(), this->cols());
   dst.resizeNonZeros(0);
-  dst.reserve(m_A.nonZeros() * m_B.nonZeros());
+  
+  // 1 - evaluate the operands if needed:
+  typedef typename internal::nested_eval<Lhs,Dynamic>::type Lhs1;
+  typedef typename internal::remove_all<Lhs1>::type Lhs1Cleaned;
+  const Lhs1 lhs1(m_A);
+  typedef typename internal::nested_eval<Rhs,Dynamic>::type Rhs1;
+  typedef typename internal::remove_all<Rhs1>::type Rhs1Cleaned;
+  const Rhs1 rhs1(m_B);
+    
+  // 2 - construct respective iterators
+  typedef Eigen::InnerIterator<Lhs1Cleaned> LhsInnerIterator;
+  typedef Eigen::InnerIterator<Rhs1Cleaned> RhsInnerIterator;
+  
+  // compute number of non-zeros per innervectors of dst
+  {
+    // TODO VectorXi is not necessarily big enough!
+    VectorXi nnzA = VectorXi::Zero(Dest::IsRowMajor ? m_A.rows() : m_A.cols());
+    for (Index kA=0; kA < m_A.outerSize(); ++kA)
+      for (LhsInnerIterator itA(lhs1,kA); itA; ++itA)
+        nnzA(Dest::IsRowMajor ? itA.row() : itA.col())++;
+      
+    VectorXi nnzB = VectorXi::Zero(Dest::IsRowMajor ? m_B.rows() : m_B.cols());
+    for (Index kB=0; kB < m_B.outerSize(); ++kB)
+      for (RhsInnerIterator itB(rhs1,kB); itB; ++itB)
+        nnzB(Dest::IsRowMajor ? itB.row() : itB.col())++;
+    
+    Matrix<int,Dynamic,Dynamic,ColMajor> nnzAB = nnzB * nnzA.transpose();
+    dst.reserve(VectorXi::Map(nnzAB.data(), nnzAB.size()));
+  }
 
   for (Index kA=0; kA < m_A.outerSize(); ++kA)
   {
     for (Index kB=0; kB < m_B.outerSize(); ++kB)
     {
-      for (typename Lhs::InnerIterator itA(m_A,kA); itA; ++itA)
+      for (LhsInnerIterator itA(lhs1,kA); itA; ++itA)
       {
-        for (typename Rhs::InnerIterator itB(m_B,kB); itB; ++itB)
+        for (RhsInnerIterator itB(rhs1,kB); itB; ++itB)
         {
-          const Index i = itA.row() * Br + itB.row(),
-                      j = itA.col() * Bc + itB.col();
+          Index i = itA.row() * Br + itB.row(),
+                j = itA.col() * Bc + itB.col();
           dst.insert(i,j) = itA.value() * itB.value();
         }
       }
@@ -154,14 +203,14 @@ struct traits<KroneckerProduct<_Lhs,_Rhs> >
 {
   typedef typename remove_all<_Lhs>::type Lhs;
   typedef typename remove_all<_Rhs>::type Rhs;
-  typedef typename scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar;
+  typedef typename ScalarBinaryOpTraits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar;
+  typedef typename promote_index_type<typename Lhs::StorageIndex, typename Rhs::StorageIndex>::type StorageIndex;
 
   enum {
     Rows = size_at_compile_time<traits<Lhs>::RowsAtCompileTime, traits<Rhs>::RowsAtCompileTime>::ret,
     Cols = size_at_compile_time<traits<Lhs>::ColsAtCompileTime, traits<Rhs>::ColsAtCompileTime>::ret,
     MaxRows = size_at_compile_time<traits<Lhs>::MaxRowsAtCompileTime, traits<Rhs>::MaxRowsAtCompileTime>::ret,
-    MaxCols = size_at_compile_time<traits<Lhs>::MaxColsAtCompileTime, traits<Rhs>::MaxColsAtCompileTime>::ret,
-    CoeffReadCost = Lhs::CoeffReadCost + Rhs::CoeffReadCost + NumTraits<Scalar>::MulCost
+    MaxCols = size_at_compile_time<traits<Lhs>::MaxColsAtCompileTime, traits<Rhs>::MaxColsAtCompileTime>::ret
   };
 
   typedef Matrix<Scalar,Rows,Cols> ReturnType;
@@ -173,9 +222,9 @@ struct traits<KroneckerProductSparse<_Lhs,_Rhs> >
   typedef MatrixXpr XprKind;
   typedef typename remove_all<_Lhs>::type Lhs;
   typedef typename remove_all<_Rhs>::type Rhs;
-  typedef typename scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar;
-  typedef typename promote_storage_type<typename traits<Lhs>::StorageKind, typename traits<Rhs>::StorageKind>::ret StorageKind;
-  typedef typename promote_index_type<typename Lhs::Index, typename Rhs::Index>::type Index;
+  typedef typename ScalarBinaryOpTraits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar;
+  typedef typename cwise_promote_storage_type<typename traits<Lhs>::StorageKind, typename traits<Rhs>::StorageKind, scalar_product_op<typename Lhs::Scalar, typename Rhs::Scalar> >::ret StorageKind;
+  typedef typename promote_index_type<typename Lhs::StorageIndex, typename Rhs::StorageIndex>::type StorageIndex;
 
   enum {
     LhsFlags = Lhs::Flags,
@@ -190,9 +239,11 @@ struct traits<KroneckerProductSparse<_Lhs,_Rhs> >
     RemovedBits = ~(EvalToRowMajor ? 0 : RowMajorBit),
 
     Flags = ((LhsFlags | RhsFlags) & HereditaryBits & RemovedBits)
-          | EvalBeforeNestingBit | EvalBeforeAssigningBit,
-    CoeffReadCost = Dynamic
+          | EvalBeforeNestingBit,
+    CoeffReadCost = HugeCost
   };
+
+  typedef SparseMatrix<Scalar, 0, StorageIndex> ReturnType;
 };
 
 } // end namespace internal
@@ -228,6 +279,16 @@ KroneckerProduct<A,B> kroneckerProduct(const MatrixBase<A>& a, const MatrixBase<
  * Computes Kronecker tensor product of two matrices, at least one of
  * which is sparse
  *
+ * \warning If you want to replace a matrix by its Kronecker product
+ *          with some matrix, do \b NOT do this:
+ * \code
+ * A = kroneckerProduct(A,B); // bug!!! caused by aliasing effect
+ * \endcode
+ * instead, use eval() to work around this:
+ * \code
+ * A = kroneckerProduct(A,B).eval();
+ * \endcode
+ *
  * \param a  Dense/sparse matrix a
  * \param b  Dense/sparse matrix b
  * \return   Kronecker tensor product of a and b, stored in a sparse
diff --git a/unsupported/Eigen/src/LevenbergMarquardt/CMakeLists.txt b/unsupported/Eigen/src/LevenbergMarquardt/CMakeLists.txt
deleted file mode 100644
index d9690854d..000000000
--- a/unsupported/Eigen/src/LevenbergMarquardt/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_LevenbergMarquardt_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_LevenbergMarquardt_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/LevenbergMarquardt COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/src/LevenbergMarquardt/LMcovar.h b/unsupported/Eigen/src/LevenbergMarquardt/LMcovar.h
index 32d3ad518..b75bea25f 100644
--- a/unsupported/Eigen/src/LevenbergMarquardt/LMcovar.h
+++ b/unsupported/Eigen/src/LevenbergMarquardt/LMcovar.h
@@ -23,7 +23,6 @@ void covar(
         Scalar tol = std::sqrt(NumTraits<Scalar>::epsilon()) )
 {
     using std::abs;
-    typedef DenseIndex Index;
     /* Local variables */
     Index i, j, k, l, ii, jj;
     bool sing;
diff --git a/unsupported/Eigen/src/LevenbergMarquardt/LMpar.h b/unsupported/Eigen/src/LevenbergMarquardt/LMpar.h
index 9532042d9..9a4836547 100644
--- a/unsupported/Eigen/src/LevenbergMarquardt/LMpar.h
+++ b/unsupported/Eigen/src/LevenbergMarquardt/LMpar.h
@@ -30,7 +30,7 @@ namespace internal {
     using std::abs;
     typedef typename QRSolver::MatrixType MatrixType;
     typedef typename QRSolver::Scalar Scalar;
-    typedef typename QRSolver::Index Index;
+//    typedef typename QRSolver::StorageIndex StorageIndex;
 
     /* Local variables */
     Index j;
diff --git a/unsupported/Eigen/src/LevenbergMarquardt/LMqrsolv.h b/unsupported/Eigen/src/LevenbergMarquardt/LMqrsolv.h
index f5290dee4..ae9d793b1 100644
--- a/unsupported/Eigen/src/LevenbergMarquardt/LMqrsolv.h
+++ b/unsupported/Eigen/src/LevenbergMarquardt/LMqrsolv.h
@@ -19,18 +19,17 @@ namespace Eigen {
 
 namespace internal {
 
-template <typename Scalar,int Rows, int Cols, typename Index>
+template <typename Scalar,int Rows, int Cols, typename PermIndex>
 void lmqrsolv(
   Matrix<Scalar,Rows,Cols> &s,
-  const PermutationMatrix<Dynamic,Dynamic,Index> &iPerm,
+  const PermutationMatrix<Dynamic,Dynamic,PermIndex> &iPerm,
   const Matrix<Scalar,Dynamic,1> &diag,
   const Matrix<Scalar,Dynamic,1> &qtb,
   Matrix<Scalar,Dynamic,1> &x,
   Matrix<Scalar,Dynamic,1> &sdiag)
 {
-
     /* Local variables */
-    Index i, j, k, l;
+    Index i, j, k;
     Scalar temp;
     Index n = s.cols();
     Matrix<Scalar,Dynamic,1>  wa(n);
@@ -52,7 +51,7 @@ void lmqrsolv(
 
         /*        prepare the row of d to be eliminated, locating the */
         /*        diagonal element using p from the qr factorization. */
-        l = iPerm.indices()(j);
+        const PermIndex l = iPerm.indices()(j);
         if (diag[l] == 0.)
             break;
         sdiag.tail(n-j).setZero();
diff --git a/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h b/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h
index 51dd1d3c4..995427978 100644
--- a/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h
+++ b/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h
@@ -115,8 +115,7 @@ class LevenbergMarquardt : internal::no_assignment_operator
     typedef typename FunctorType::JacobianType JacobianType;
     typedef typename JacobianType::Scalar Scalar;
     typedef typename JacobianType::RealScalar RealScalar; 
-    typedef typename JacobianType::Index Index;
-    typedef typename QRSolver::Index PermIndex;
+    typedef typename QRSolver::StorageIndex PermIndex;
     typedef Matrix<Scalar,Dynamic,1> FVectorType;
     typedef PermutationMatrix<Dynamic,Dynamic> PermutationType;
   public:
@@ -144,11 +143,13 @@ class LevenbergMarquardt : internal::no_assignment_operator
     
     /** Sets the default parameters */
     void resetParameters() 
-    { 
+    {
+      using std::sqrt;        
+
       m_factor = 100.; 
       m_maxfev = 400; 
-      m_ftol = std::sqrt(NumTraits<RealScalar>::epsilon());
-      m_xtol = std::sqrt(NumTraits<RealScalar>::epsilon());
+      m_ftol = sqrt(NumTraits<RealScalar>::epsilon());
+      m_xtol = sqrt(NumTraits<RealScalar>::epsilon());
       m_gtol = 0. ; 
       m_epsfcn = 0. ;
     }
@@ -174,6 +175,24 @@ class LevenbergMarquardt : internal::no_assignment_operator
     /** Use an external Scaling. If set to true, pass a nonzero diagonal to diag() */
     void setExternalScaling(bool value) {m_useExternalScaling  = value; }
     
+    /** \returns the tolerance for the norm of the solution vector */
+    RealScalar xtol() const {return m_xtol; }
+    
+    /** \returns the tolerance for the norm of the vector function */
+    RealScalar ftol() const {return m_ftol; }
+    
+    /** \returns the tolerance for the norm of the gradient of the error vector */
+    RealScalar gtol() const {return m_gtol; }
+    
+    /** \returns the step bound for the diagonal shift */
+    RealScalar factor() const {return m_factor; }
+    
+    /** \returns the error precision */
+    RealScalar epsilon() const {return m_epsfcn; }
+    
+    /** \returns the maximum number of function evaluation */
+    Index maxfev() const {return m_maxfev; }
+    
     /** \returns a reference to the diagonal of the jacobian */
     FVectorType& diag() {return m_diag; }
     
@@ -285,7 +304,7 @@ LevenbergMarquardt<FunctorType>::minimizeInit(FVectorType  &x)
 //     m_fjac.reserve(VectorXi::Constant(n,5)); // FIXME Find a better alternative
     if (!m_useExternalScaling)
         m_diag.resize(n);
-    eigen_assert( (!m_useExternalScaling || m_diag.size()==n) || "When m_useExternalScaling is set, the caller must provide a valid 'm_diag'");
+    eigen_assert( (!m_useExternalScaling || m_diag.size()==n) && "When m_useExternalScaling is set, the caller must provide a valid 'm_diag'");
     m_qtf.resize(n);
 
     /* Function Body */
diff --git a/unsupported/Eigen/src/MatrixFunctions/CMakeLists.txt b/unsupported/Eigen/src/MatrixFunctions/CMakeLists.txt
deleted file mode 100644
index cdde64d2c..000000000
--- a/unsupported/Eigen/src/MatrixFunctions/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_MatrixFunctions_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_MatrixFunctions_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/MatrixFunctions COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h b/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h
index 6825a7882..bb6d9e1fe 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h
@@ -1,8 +1,8 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2009, 2010 Jitse Niesen <jitse@maths.leeds.ac.uk>
-// Copyright (C) 2011 Chen-Pang He <jdh8@ms63.hinet.net>
+// Copyright (C) 2009, 2010, 2013 Jitse Niesen <jitse@maths.leeds.ac.uk>
+// Copyright (C) 2011, 2013 Chen-Pang He <jdh8@ms63.hinet.net>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -14,388 +14,374 @@
 #include "StemFunction.h"
 
 namespace Eigen {
+namespace internal {
 
-/** \ingroup MatrixFunctions_Module
-  * \brief Class for computing the matrix exponential.
-  * \tparam MatrixType type of the argument of the exponential,
-  * expected to be an instantiation of the Matrix class template.
-  */
-template <typename MatrixType>
-class MatrixExponential {
-
-  public:
+/** \brief Scaling operator.
+ *
+ * This struct is used by CwiseUnaryOp to scale a matrix by \f$ 2^{-s} \f$.
+ */
+template <typename RealScalar>
+struct MatrixExponentialScalingOp
+{
+  /** \brief Constructor.
+   *
+   * \param[in] squarings  The integer \f$ s \f$ in this document.
+   */
+  MatrixExponentialScalingOp(int squarings) : m_squarings(squarings) { }
+
+
+  /** \brief Scale a matrix coefficient.
+   *
+   * \param[in,out] x  The scalar to be scaled, becoming \f$ 2^{-s} x \f$.
+   */
+  inline const RealScalar operator() (const RealScalar& x) const
+  {
+    using std::ldexp;
+    return ldexp(x, -m_squarings);
+  }
 
-    /** \brief Constructor.
-      * 
-      * The class stores a reference to \p M, so it should not be
-      * changed (or destroyed) before compute() is called.
-      *
-      * \param[in] M  matrix whose exponential is to be computed.
-      */
-    MatrixExponential(const MatrixType &M);
+  typedef std::complex<RealScalar> ComplexScalar;
 
-    /** \brief Computes the matrix exponential.
-      *
-      * \param[out] result  the matrix exponential of \p M in the constructor.
-      */
-    template <typename ResultType> 
-    void compute(ResultType &result);
+  /** \brief Scale a matrix coefficient.
+   *
+   * \param[in,out] x  The scalar to be scaled, becoming \f$ 2^{-s} x \f$.
+   */
+  inline const ComplexScalar operator() (const ComplexScalar& x) const
+  {
+    using std::ldexp;
+    return ComplexScalar(ldexp(x.real(), -m_squarings), ldexp(x.imag(), -m_squarings));
+  }
 
   private:
-
-    // Prevent copying
-    MatrixExponential(const MatrixExponential&);
-    MatrixExponential& operator=(const MatrixExponential&);
-
-    /** \brief Compute the (3,3)-Pad&eacute; approximant to the exponential.
-     *
-     *  After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Pad&eacute;
-     *  approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$.
-     *
-     *  \param[in] A   Argument of matrix exponential
-     */
-    void pade3(const MatrixType &A);
-
-    /** \brief Compute the (5,5)-Pad&eacute; approximant to the exponential.
-     *
-     *  After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Pad&eacute;
-     *  approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$.
-     *
-     *  \param[in] A   Argument of matrix exponential
-     */
-    void pade5(const MatrixType &A);
-
-    /** \brief Compute the (7,7)-Pad&eacute; approximant to the exponential.
-     *
-     *  After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Pad&eacute;
-     *  approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$.
-     *
-     *  \param[in] A   Argument of matrix exponential
-     */
-    void pade7(const MatrixType &A);
-
-    /** \brief Compute the (9,9)-Pad&eacute; approximant to the exponential.
-     *
-     *  After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Pad&eacute;
-     *  approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$.
-     *
-     *  \param[in] A   Argument of matrix exponential
-     */
-    void pade9(const MatrixType &A);
-
-    /** \brief Compute the (13,13)-Pad&eacute; approximant to the exponential.
-     *
-     *  After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Pad&eacute;
-     *  approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$.
-     *
-     *  \param[in] A   Argument of matrix exponential
-     */
-    void pade13(const MatrixType &A);
-
-    /** \brief Compute the (17,17)-Pad&eacute; approximant to the exponential.
-     *
-     *  After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Pad&eacute;
-     *  approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$.
-     *
-     *  This function activates only if your long double is double-double or quadruple.
-     *
-     *  \param[in] A   Argument of matrix exponential
-     */
-    void pade17(const MatrixType &A);
-
-    /** \brief Compute Pad&eacute; approximant to the exponential.
-     *
-     * Computes \c m_U, \c m_V and \c m_squarings such that
-     * \f$ (V+U)(V-U)^{-1} \f$ is a Pad&eacute; of
-     * \f$ \exp(2^{-\mbox{squarings}}M) \f$ around \f$ M = 0 \f$. The
-     * degree of the Pad&eacute; approximant and the value of
-     * squarings are chosen such that the approximation error is no
-     * more than the round-off error.
-     *
-     * The argument of this function should correspond with the (real
-     * part of) the entries of \c m_M.  It is used to select the
-     * correct implementation using overloading.
-     */
-    void computeUV(double);
-
-    /** \brief Compute Pad&eacute; approximant to the exponential.
-     *
-     *  \sa computeUV(double);
-     */
-    void computeUV(float);
-    
-    /** \brief Compute Pad&eacute; approximant to the exponential.
-     *
-     *  \sa computeUV(double);
-     */
-    void computeUV(long double);
-
-    typedef typename internal::traits<MatrixType>::Scalar Scalar;
-    typedef typename NumTraits<Scalar>::Real RealScalar;
-    typedef typename std::complex<RealScalar> ComplexScalar;
-
-    /** \brief Reference to matrix whose exponential is to be computed. */
-    typename internal::nested<MatrixType>::type m_M;
-
-    /** \brief Odd-degree terms in numerator of Pad&eacute; approximant. */
-    MatrixType m_U;
-
-    /** \brief Even-degree terms in numerator of Pad&eacute; approximant. */
-    MatrixType m_V;
-
-    /** \brief Used for temporary storage. */
-    MatrixType m_tmp1;
-
-    /** \brief Used for temporary storage. */
-    MatrixType m_tmp2;
-
-    /** \brief Identity matrix of the same size as \c m_M. */
-    MatrixType m_Id;
-
-    /** \brief Number of squarings required in the last step. */
     int m_squarings;
-
-    /** \brief L1 norm of m_M. */
-    RealScalar m_l1norm;
 };
 
-template <typename MatrixType>
-MatrixExponential<MatrixType>::MatrixExponential(const MatrixType &M) :
-  m_M(M),
-  m_U(M.rows(),M.cols()),
-  m_V(M.rows(),M.cols()),
-  m_tmp1(M.rows(),M.cols()),
-  m_tmp2(M.rows(),M.cols()),
-  m_Id(MatrixType::Identity(M.rows(), M.cols())),
-  m_squarings(0),
-  m_l1norm(M.cwiseAbs().colwise().sum().maxCoeff())
-{
-  /* empty body */
-}
-
-template <typename MatrixType>
-template <typename ResultType> 
-void MatrixExponential<MatrixType>::compute(ResultType &result)
+/** \brief Compute the (3,3)-Pad&eacute; approximant to the exponential.
+ *
+ *  After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Pad&eacute;
+ *  approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$.
+ */
+template <typename MatA, typename MatU, typename MatV>
+void matrix_exp_pade3(const MatA& A, MatU& U, MatV& V)
 {
-#if LDBL_MANT_DIG > 112 // rarely happens
-  if(sizeof(RealScalar) > 14) {
-    result = m_M.matrixFunction(StdStemFunctions<ComplexScalar>::exp);
-    return;
-  }
-#endif
-  computeUV(RealScalar());
-  m_tmp1 = m_U + m_V;   // numerator of Pade approximant
-  m_tmp2 = -m_U + m_V;  // denominator of Pade approximant
-  result = m_tmp2.partialPivLu().solve(m_tmp1);
-  for (int i=0; i<m_squarings; i++)
-    result *= result;   // undo scaling by repeated squaring
+  typedef typename MatA::PlainObject MatrixType;
+  typedef typename NumTraits<typename traits<MatA>::Scalar>::Real RealScalar;
+  const RealScalar b[] = {120.L, 60.L, 12.L, 1.L};
+  const MatrixType A2 = A * A;
+  const MatrixType tmp = b[3] * A2 + b[1] * MatrixType::Identity(A.rows(), A.cols());
+  U.noalias() = A * tmp;
+  V = b[2] * A2 + b[0] * MatrixType::Identity(A.rows(), A.cols());
 }
 
-template <typename MatrixType>
-EIGEN_STRONG_INLINE void MatrixExponential<MatrixType>::pade3(const MatrixType &A)
+/** \brief Compute the (5,5)-Pad&eacute; approximant to the exponential.
+ *
+ *  After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Pad&eacute;
+ *  approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$.
+ */
+template <typename MatA, typename MatU, typename MatV>
+void matrix_exp_pade5(const MatA& A, MatU& U, MatV& V)
 {
-  const RealScalar b[] = {120., 60., 12., 1.};
-  m_tmp1.noalias() = A * A;
-  m_tmp2 = b[3]*m_tmp1 + b[1]*m_Id;
-  m_U.noalias() = A * m_tmp2;
-  m_V = b[2]*m_tmp1 + b[0]*m_Id;
+  typedef typename MatA::PlainObject MatrixType;
+  typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar;
+  const RealScalar b[] = {30240.L, 15120.L, 3360.L, 420.L, 30.L, 1.L};
+  const MatrixType A2 = A * A;
+  const MatrixType A4 = A2 * A2;
+  const MatrixType tmp = b[5] * A4 + b[3] * A2 + b[1] * MatrixType::Identity(A.rows(), A.cols());
+  U.noalias() = A * tmp;
+  V = b[4] * A4 + b[2] * A2 + b[0] * MatrixType::Identity(A.rows(), A.cols());
 }
 
-template <typename MatrixType>
-EIGEN_STRONG_INLINE void MatrixExponential<MatrixType>::pade5(const MatrixType &A)
+/** \brief Compute the (7,7)-Pad&eacute; approximant to the exponential.
+ *
+ *  After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Pad&eacute;
+ *  approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$.
+ */
+template <typename MatA, typename MatU, typename MatV>
+void matrix_exp_pade7(const MatA& A, MatU& U, MatV& V)
 {
-  const RealScalar b[] = {30240., 15120., 3360., 420., 30., 1.};
-  MatrixType A2 = A * A;
-  m_tmp1.noalias() = A2 * A2;
-  m_tmp2 = b[5]*m_tmp1 + b[3]*A2 + b[1]*m_Id;
-  m_U.noalias() = A * m_tmp2;
-  m_V = b[4]*m_tmp1 + b[2]*A2 + b[0]*m_Id;
-}
+  typedef typename MatA::PlainObject MatrixType;
+  typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar;
+  const RealScalar b[] = {17297280.L, 8648640.L, 1995840.L, 277200.L, 25200.L, 1512.L, 56.L, 1.L};
+  const MatrixType A2 = A * A;
+  const MatrixType A4 = A2 * A2;
+  const MatrixType A6 = A4 * A2;
+  const MatrixType tmp = b[7] * A6 + b[5] * A4 + b[3] * A2 
+    + b[1] * MatrixType::Identity(A.rows(), A.cols());
+  U.noalias() = A * tmp;
+  V = b[6] * A6 + b[4] * A4 + b[2] * A2 + b[0] * MatrixType::Identity(A.rows(), A.cols());
 
-template <typename MatrixType>
-EIGEN_STRONG_INLINE void MatrixExponential<MatrixType>::pade7(const MatrixType &A)
-{
-  const RealScalar b[] = {17297280., 8648640., 1995840., 277200., 25200., 1512., 56., 1.};
-  MatrixType A2 = A * A;
-  MatrixType A4 = A2 * A2;
-  m_tmp1.noalias() = A4 * A2;
-  m_tmp2 = b[7]*m_tmp1 + b[5]*A4 + b[3]*A2 + b[1]*m_Id;
-  m_U.noalias() = A * m_tmp2;
-  m_V = b[6]*m_tmp1 + b[4]*A4 + b[2]*A2 + b[0]*m_Id;
 }
 
-template <typename MatrixType>
-EIGEN_STRONG_INLINE void MatrixExponential<MatrixType>::pade9(const MatrixType &A)
+/** \brief Compute the (9,9)-Pad&eacute; approximant to the exponential.
+ *
+ *  After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Pad&eacute;
+ *  approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$.
+ */
+template <typename MatA, typename MatU, typename MatV>
+void matrix_exp_pade9(const MatA& A, MatU& U, MatV& V)
 {
-  const RealScalar b[] = {17643225600., 8821612800., 2075673600., 302702400., 30270240.,
-		      2162160., 110880., 3960., 90., 1.};
-  MatrixType A2 = A * A;
-  MatrixType A4 = A2 * A2;
-  MatrixType A6 = A4 * A2;
-  m_tmp1.noalias() = A6 * A2;
-  m_tmp2 = b[9]*m_tmp1 + b[7]*A6 + b[5]*A4 + b[3]*A2 + b[1]*m_Id;
-  m_U.noalias() = A * m_tmp2;
-  m_V = b[8]*m_tmp1 + b[6]*A6 + b[4]*A4 + b[2]*A2 + b[0]*m_Id;
+  typedef typename MatA::PlainObject MatrixType;
+  typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar;
+  const RealScalar b[] = {17643225600.L, 8821612800.L, 2075673600.L, 302702400.L, 30270240.L,
+                          2162160.L, 110880.L, 3960.L, 90.L, 1.L};
+  const MatrixType A2 = A * A;
+  const MatrixType A4 = A2 * A2;
+  const MatrixType A6 = A4 * A2;
+  const MatrixType A8 = A6 * A2;
+  const MatrixType tmp = b[9] * A8 + b[7] * A6 + b[5] * A4 + b[3] * A2 
+    + b[1] * MatrixType::Identity(A.rows(), A.cols());
+  U.noalias() = A * tmp;
+  V = b[8] * A8 + b[6] * A6 + b[4] * A4 + b[2] * A2 + b[0] * MatrixType::Identity(A.rows(), A.cols());
 }
 
-template <typename MatrixType>
-EIGEN_STRONG_INLINE void MatrixExponential<MatrixType>::pade13(const MatrixType &A)
+/** \brief Compute the (13,13)-Pad&eacute; approximant to the exponential.
+ *
+ *  After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Pad&eacute;
+ *  approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$.
+ */
+template <typename MatA, typename MatU, typename MatV>
+void matrix_exp_pade13(const MatA& A, MatU& U, MatV& V)
 {
-  const RealScalar b[] = {64764752532480000., 32382376266240000., 7771770303897600.,
-		      1187353796428800., 129060195264000., 10559470521600., 670442572800.,
-		      33522128640., 1323241920., 40840800., 960960., 16380., 182., 1.};
-  MatrixType A2 = A * A;
-  MatrixType A4 = A2 * A2;
-  m_tmp1.noalias() = A4 * A2;
-  m_V = b[13]*m_tmp1 + b[11]*A4 + b[9]*A2; // used for temporary storage
-  m_tmp2.noalias() = m_tmp1 * m_V;
-  m_tmp2 += b[7]*m_tmp1 + b[5]*A4 + b[3]*A2 + b[1]*m_Id;
-  m_U.noalias() = A * m_tmp2;
-  m_tmp2 = b[12]*m_tmp1 + b[10]*A4 + b[8]*A2;
-  m_V.noalias() = m_tmp1 * m_tmp2;
-  m_V += b[6]*m_tmp1 + b[4]*A4 + b[2]*A2 + b[0]*m_Id;
+  typedef typename MatA::PlainObject MatrixType;
+  typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar;
+  const RealScalar b[] = {64764752532480000.L, 32382376266240000.L, 7771770303897600.L,
+                          1187353796428800.L, 129060195264000.L, 10559470521600.L, 670442572800.L,
+                          33522128640.L, 1323241920.L, 40840800.L, 960960.L, 16380.L, 182.L, 1.L};
+  const MatrixType A2 = A * A;
+  const MatrixType A4 = A2 * A2;
+  const MatrixType A6 = A4 * A2;
+  V = b[13] * A6 + b[11] * A4 + b[9] * A2; // used for temporary storage
+  MatrixType tmp = A6 * V;
+  tmp += b[7] * A6 + b[5] * A4 + b[3] * A2 + b[1] * MatrixType::Identity(A.rows(), A.cols());
+  U.noalias() = A * tmp;
+  tmp = b[12] * A6 + b[10] * A4 + b[8] * A2;
+  V.noalias() = A6 * tmp;
+  V += b[6] * A6 + b[4] * A4 + b[2] * A2 + b[0] * MatrixType::Identity(A.rows(), A.cols());
 }
 
+/** \brief Compute the (17,17)-Pad&eacute; approximant to the exponential.
+ *
+ *  After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Pad&eacute;
+ *  approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$.
+ *
+ *  This function activates only if your long double is double-double or quadruple.
+ */
 #if LDBL_MANT_DIG > 64
-template <typename MatrixType>
-EIGEN_STRONG_INLINE void MatrixExponential<MatrixType>::pade17(const MatrixType &A)
+template <typename MatA, typename MatU, typename MatV>
+void matrix_exp_pade17(const MatA& A, MatU& U, MatV& V)
 {
+  typedef typename MatA::PlainObject MatrixType;
+  typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar;
   const RealScalar b[] = {830034394580628357120000.L, 415017197290314178560000.L,
-		      100610229646136770560000.L, 15720348382208870400000.L,
-		      1774878043152614400000.L, 153822763739893248000.L, 10608466464820224000.L,
-		      595373117923584000.L, 27563570274240000.L, 1060137318240000.L,
-		      33924394183680.L, 899510451840.L, 19554575040.L, 341863200.L, 4651200.L,
-		      46512.L, 306.L, 1.L};
-  MatrixType A2 = A * A;
-  MatrixType A4 = A2 * A2;
-  MatrixType A6 = A4 * A2;
-  m_tmp1.noalias() = A4 * A4;
-  m_V = b[17]*m_tmp1 + b[15]*A6 + b[13]*A4 + b[11]*A2; // used for temporary storage
-  m_tmp2.noalias() = m_tmp1 * m_V;
-  m_tmp2 += b[9]*m_tmp1 + b[7]*A6 + b[5]*A4 + b[3]*A2 + b[1]*m_Id;
-  m_U.noalias() = A * m_tmp2;
-  m_tmp2 = b[16]*m_tmp1 + b[14]*A6 + b[12]*A4 + b[10]*A2;
-  m_V.noalias() = m_tmp1 * m_tmp2;
-  m_V += b[8]*m_tmp1 + b[6]*A6 + b[4]*A4 + b[2]*A2 + b[0]*m_Id;
+                          100610229646136770560000.L, 15720348382208870400000.L,
+                          1774878043152614400000.L, 153822763739893248000.L, 10608466464820224000.L,
+                          595373117923584000.L, 27563570274240000.L, 1060137318240000.L,
+                          33924394183680.L, 899510451840.L, 19554575040.L, 341863200.L, 4651200.L,
+                          46512.L, 306.L, 1.L};
+  const MatrixType A2 = A * A;
+  const MatrixType A4 = A2 * A2;
+  const MatrixType A6 = A4 * A2;
+  const MatrixType A8 = A4 * A4;
+  V = b[17] * A8 + b[15] * A6 + b[13] * A4 + b[11] * A2; // used for temporary storage
+  MatrixType tmp = A8 * V;
+  tmp += b[9] * A8 + b[7] * A6 + b[5] * A4 + b[3] * A2 
+    + b[1] * MatrixType::Identity(A.rows(), A.cols());
+  U.noalias() = A * tmp;
+  tmp = b[16] * A8 + b[14] * A6 + b[12] * A4 + b[10] * A2;
+  V.noalias() = tmp * A8;
+  V += b[8] * A8 + b[6] * A6 + b[4] * A4 + b[2] * A2 
+    + b[0] * MatrixType::Identity(A.rows(), A.cols());
 }
 #endif
 
+template <typename MatrixType, typename RealScalar = typename NumTraits<typename traits<MatrixType>::Scalar>::Real>
+struct matrix_exp_computeUV
+{
+  /** \brief Compute Pad&eacute; approximant to the exponential.
+    *
+    * Computes \c U, \c V and \c squarings such that \f$ (V+U)(V-U)^{-1} \f$ is a Pad&eacute;
+    * approximant of \f$ \exp(2^{-\mbox{squarings}}M) \f$ around \f$ M = 0 \f$, where \f$ M \f$
+    * denotes the matrix \c arg. The degree of the Pad&eacute; approximant and the value of squarings
+    * are chosen such that the approximation error is no more than the round-off error.
+    */
+  static void run(const MatrixType& arg, MatrixType& U, MatrixType& V, int& squarings);
+};
+
 template <typename MatrixType>
-void MatrixExponential<MatrixType>::computeUV(float)
+struct matrix_exp_computeUV<MatrixType, float>
 {
-  using std::frexp;
-  using std::pow;
-  if (m_l1norm < 4.258730016922831e-001) {
-    pade3(m_M);
-  } else if (m_l1norm < 1.880152677804762e+000) {
-    pade5(m_M);
-  } else {
-    const float maxnorm = 3.925724783138660f;
-    frexp(m_l1norm / maxnorm, &m_squarings);
-    if (m_squarings < 0) m_squarings = 0;
-    MatrixType A = m_M / pow(Scalar(2), m_squarings);
-    pade7(A);
+  template <typename ArgType>
+  static void run(const ArgType& arg, MatrixType& U, MatrixType& V, int& squarings)
+  {
+    using std::frexp;
+    using std::pow;
+    const float l1norm = arg.cwiseAbs().colwise().sum().maxCoeff();
+    squarings = 0;
+    if (l1norm < 4.258730016922831e-001f) {
+      matrix_exp_pade3(arg, U, V);
+    } else if (l1norm < 1.880152677804762e+000f) {
+      matrix_exp_pade5(arg, U, V);
+    } else {
+      const float maxnorm = 3.925724783138660f;
+      frexp(l1norm / maxnorm, &squarings);
+      if (squarings < 0) squarings = 0;
+      MatrixType A = arg.unaryExpr(MatrixExponentialScalingOp<float>(squarings));
+      matrix_exp_pade7(A, U, V);
+    }
   }
-}
+};
 
 template <typename MatrixType>
-void MatrixExponential<MatrixType>::computeUV(double)
+struct matrix_exp_computeUV<MatrixType, double>
 {
-  using std::frexp;
-  using std::pow;
-  if (m_l1norm < 1.495585217958292e-002) {
-    pade3(m_M);
-  } else if (m_l1norm < 2.539398330063230e-001) {
-    pade5(m_M);
-  } else if (m_l1norm < 9.504178996162932e-001) {
-    pade7(m_M);
-  } else if (m_l1norm < 2.097847961257068e+000) {
-    pade9(m_M);
-  } else {
-    const double maxnorm = 5.371920351148152;
-    frexp(m_l1norm / maxnorm, &m_squarings);
-    if (m_squarings < 0) m_squarings = 0;
-    MatrixType A = m_M / pow(Scalar(2), m_squarings);
-    pade13(A);
+  template <typename ArgType>
+  static void run(const ArgType& arg, MatrixType& U, MatrixType& V, int& squarings)
+  {
+    using std::frexp;
+    using std::pow;
+    const double l1norm = arg.cwiseAbs().colwise().sum().maxCoeff();
+    squarings = 0;
+    if (l1norm < 1.495585217958292e-002) {
+      matrix_exp_pade3(arg, U, V);
+    } else if (l1norm < 2.539398330063230e-001) {
+      matrix_exp_pade5(arg, U, V);
+    } else if (l1norm < 9.504178996162932e-001) {
+      matrix_exp_pade7(arg, U, V);
+    } else if (l1norm < 2.097847961257068e+000) {
+      matrix_exp_pade9(arg, U, V);
+    } else {
+      const double maxnorm = 5.371920351148152;
+      frexp(l1norm / maxnorm, &squarings);
+      if (squarings < 0) squarings = 0;
+      MatrixType A = arg.unaryExpr(MatrixExponentialScalingOp<double>(squarings));
+      matrix_exp_pade13(A, U, V);
+    }
   }
-}
-
+};
+  
 template <typename MatrixType>
-void MatrixExponential<MatrixType>::computeUV(long double)
+struct matrix_exp_computeUV<MatrixType, long double>
 {
-  using std::frexp;
-  using std::pow;
+  template <typename ArgType>
+  static void run(const ArgType& arg, MatrixType& U, MatrixType& V, int& squarings)
+  {
 #if   LDBL_MANT_DIG == 53   // double precision
-  computeUV(double());
-#elif LDBL_MANT_DIG <= 64   // extended precision
-  if (m_l1norm < 4.1968497232266989671e-003L) {
-    pade3(m_M);
-  } else if (m_l1norm < 1.1848116734693823091e-001L) {
-    pade5(m_M);
-  } else if (m_l1norm < 5.5170388480686700274e-001L) {
-    pade7(m_M);
-  } else if (m_l1norm < 1.3759868875587845383e+000L) {
-    pade9(m_M);
-  } else {
-    const long double maxnorm = 4.0246098906697353063L;
-    frexp(m_l1norm / maxnorm, &m_squarings);
-    if (m_squarings < 0) m_squarings = 0;
-    MatrixType A = m_M / pow(Scalar(2), m_squarings);
-    pade13(A);
-  }
+    matrix_exp_computeUV<MatrixType, double>::run(arg, U, V, squarings);
+  
+#else
+  
+    using std::frexp;
+    using std::pow;
+    const long double l1norm = arg.cwiseAbs().colwise().sum().maxCoeff();
+    squarings = 0;
+  
+#if LDBL_MANT_DIG <= 64   // extended precision
+  
+    if (l1norm < 4.1968497232266989671e-003L) {
+      matrix_exp_pade3(arg, U, V);
+    } else if (l1norm < 1.1848116734693823091e-001L) {
+      matrix_exp_pade5(arg, U, V);
+    } else if (l1norm < 5.5170388480686700274e-001L) {
+      matrix_exp_pade7(arg, U, V);
+    } else if (l1norm < 1.3759868875587845383e+000L) {
+      matrix_exp_pade9(arg, U, V);
+    } else {
+      const long double maxnorm = 4.0246098906697353063L;
+      frexp(l1norm / maxnorm, &squarings);
+      if (squarings < 0) squarings = 0;
+      MatrixType A = arg.unaryExpr(MatrixExponentialScalingOp<long double>(squarings));
+      matrix_exp_pade13(A, U, V);
+    }
+  
 #elif LDBL_MANT_DIG <= 106  // double-double
-  if (m_l1norm < 3.2787892205607026992947488108213e-005L) {
-    pade3(m_M);
-  } else if (m_l1norm < 6.4467025060072760084130906076332e-003L) {
-    pade5(m_M);
-  } else if (m_l1norm < 6.8988028496595374751374122881143e-002L) {
-    pade7(m_M);
-  } else if (m_l1norm < 2.7339737518502231741495857201670e-001L) {
-    pade9(m_M);
-  } else if (m_l1norm < 1.3203382096514474905666448850278e+000L) {
-    pade13(m_M);
-  } else {
-    const long double maxnorm = 3.2579440895405400856599663723517L;
-    frexp(m_l1norm / maxnorm, &m_squarings);
-    if (m_squarings < 0) m_squarings = 0;
-    MatrixType A = m_M / pow(Scalar(2), m_squarings);
-    pade17(A);
-  }
+  
+    if (l1norm < 3.2787892205607026992947488108213e-005L) {
+      matrix_exp_pade3(arg, U, V);
+    } else if (l1norm < 6.4467025060072760084130906076332e-003L) {
+      matrix_exp_pade5(arg, U, V);
+    } else if (l1norm < 6.8988028496595374751374122881143e-002L) {
+      matrix_exp_pade7(arg, U, V);
+    } else if (l1norm < 2.7339737518502231741495857201670e-001L) {
+      matrix_exp_pade9(arg, U, V);
+    } else if (l1norm < 1.3203382096514474905666448850278e+000L) {
+      matrix_exp_pade13(arg, U, V);
+    } else {
+      const long double maxnorm = 3.2579440895405400856599663723517L;
+      frexp(l1norm / maxnorm, &squarings);
+      if (squarings < 0) squarings = 0;
+      MatrixType A = arg.unaryExpr(MatrixExponentialScalingOp<long double>(squarings));
+      matrix_exp_pade17(A, U, V);
+    }
+  
 #elif LDBL_MANT_DIG <= 112  // quadruple precison
-  if (m_l1norm < 1.639394610288918690547467954466970e-005L) {
-    pade3(m_M);
-  } else if (m_l1norm < 4.253237712165275566025884344433009e-003L) {
-    pade5(m_M);
-  } else if (m_l1norm < 5.125804063165764409885122032933142e-002L) {
-    pade7(m_M);
-  } else if (m_l1norm < 2.170000765161155195453205651889853e-001L) {
-    pade9(m_M);
-  } else if (m_l1norm < 1.125358383453143065081397882891878e+000L) {
-    pade13(m_M);
-  } else {
-    const long double maxnorm = 2.884233277829519311757165057717815L;
-    frexp(m_l1norm / maxnorm, &m_squarings);
-    if (m_squarings < 0) m_squarings = 0;
-    MatrixType A = m_M / pow(Scalar(2), m_squarings);
-    pade17(A);
-  }
+  
+    if (l1norm < 1.639394610288918690547467954466970e-005L) {
+      matrix_exp_pade3(arg, U, V);
+    } else if (l1norm < 4.253237712165275566025884344433009e-003L) {
+      matrix_exp_pade5(arg, U, V);
+    } else if (l1norm < 5.125804063165764409885122032933142e-002L) {
+      matrix_exp_pade7(arg, U, V);
+    } else if (l1norm < 2.170000765161155195453205651889853e-001L) {
+      matrix_exp_pade9(arg, U, V);
+    } else if (l1norm < 1.125358383453143065081397882891878e+000L) {
+      matrix_exp_pade13(arg, U, V);
+    } else {
+      frexp(l1norm / maxnorm, &squarings);
+      if (squarings < 0) squarings = 0;
+      MatrixType A = arg.unaryExpr(MatrixExponentialScalingOp<long double>(squarings));
+      matrix_exp_pade17(A, U, V);
+    }
+  
 #else
-  // this case should be handled in compute()
-  eigen_assert(false && "Bug in MatrixExponential"); 
+  
+    // this case should be handled in compute()
+    eigen_assert(false && "Bug in MatrixExponential"); 
+  
+#endif
 #endif  // LDBL_MANT_DIG
+  }
+};
+
+
+/* Computes the matrix exponential
+ *
+ * \param arg    argument of matrix exponential (should be plain object)
+ * \param result variable in which result will be stored
+ */
+template <typename ArgType, typename ResultType>
+void matrix_exp_compute(const ArgType& arg, ResultType &result)
+{
+  typedef typename ArgType::PlainObject MatrixType;
+#if LDBL_MANT_DIG > 112 // rarely happens
+  typedef typename traits<MatrixType>::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef typename std::complex<RealScalar> ComplexScalar;
+  if (sizeof(RealScalar) > 14) {
+    result = arg.matrixFunction(internal::stem_function_exp<ComplexScalar>);
+    return;
+  }
+#endif
+  MatrixType U, V;
+  int squarings; 
+  matrix_exp_computeUV<MatrixType>::run(arg, U, V, squarings); // Pade approximant is (U+V) / (-U+V)
+  MatrixType numer = U + V;
+  MatrixType denom = -U + V;
+  result = denom.partialPivLu().solve(numer);
+  for (int i=0; i<squarings; i++)
+    result *= result;   // undo scaling by repeated squaring
 }
 
+} // end namespace Eigen::internal
+
 /** \ingroup MatrixFunctions_Module
   *
   * \brief Proxy for the matrix exponential of some matrix (expression).
   *
   * \tparam Derived  Type of the argument to the matrix exponential.
   *
-  * This class holds the argument to the matrix exponential until it
-  * is assigned or evaluated for some other reason (so the argument
-  * should not be changed in the meantime). It is the return type of
-  * MatrixBase::exp() and most of the time this is the only way it is
-  * used.
+  * This class holds the argument to the matrix exponential until it is assigned or evaluated for
+  * some other reason (so the argument should not be changed in the meantime). It is the return type
+  * of MatrixBase::exp() and most of the time this is the only way it is used.
   */
 template<typename Derived> struct MatrixExponentialReturnValue
 : public ReturnByValue<MatrixExponentialReturnValue<Derived> >
@@ -404,31 +390,26 @@ template<typename Derived> struct MatrixExponentialReturnValue
   public:
     /** \brief Constructor.
       *
-      * \param[in] src %Matrix (expression) forming the argument of the
-      * matrix exponential.
+      * \param src %Matrix (expression) forming the argument of the matrix exponential.
       */
     MatrixExponentialReturnValue(const Derived& src) : m_src(src) { }
 
     /** \brief Compute the matrix exponential.
       *
-      * \param[out] result the matrix exponential of \p src in the
-      * constructor.
+      * \param result the matrix exponential of \p src in the constructor.
       */
     template <typename ResultType>
     inline void evalTo(ResultType& result) const
     {
-      const typename Derived::PlainObject srcEvaluated = m_src.eval();
-      MatrixExponential<typename Derived::PlainObject> me(srcEvaluated);
-      me.compute(result);
+      const typename internal::nested_eval<Derived, 10>::type tmp(m_src);
+      internal::matrix_exp_compute(tmp, result);
     }
 
     Index rows() const { return m_src.rows(); }
     Index cols() const { return m_src.cols(); }
 
   protected:
-    const Derived& m_src;
-  private:
-    MatrixExponentialReturnValue& operator=(const MatrixExponentialReturnValue&);
+    const typename internal::ref_selector<Derived>::type m_src;
 };
 
 namespace internal {
diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h b/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h
index 7d426640c..db2449d02 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2009-2011 Jitse Niesen <jitse@maths.leeds.ac.uk>
+// Copyright (C) 2009-2011, 2013 Jitse Niesen <jitse@maths.leeds.ac.uk>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -11,398 +11,245 @@
 #define EIGEN_MATRIX_FUNCTION
 
 #include "StemFunction.h"
-#include "MatrixFunctionAtomic.h"
 
 
 namespace Eigen { 
 
+namespace internal {
+
+/** \brief Maximum distance allowed between eigenvalues to be considered "close". */
+static const float matrix_function_separation = 0.1f;
+
 /** \ingroup MatrixFunctions_Module
-  * \brief Class for computing matrix functions.
-  * \tparam  MatrixType  type of the argument of the matrix function,
-  *                      expected to be an instantiation of the Matrix class template.
-  * \tparam  AtomicType  type for computing matrix function of atomic blocks.
-  * \tparam  IsComplex   used internally to select correct specialization.
+  * \class MatrixFunctionAtomic
+  * \brief Helper class for computing matrix functions of atomic matrices.
   *
-  * This class implements the Schur-Parlett algorithm for computing matrix functions. The spectrum of the
-  * matrix is divided in clustered of eigenvalues that lies close together. This class delegates the
-  * computation of the matrix function on every block corresponding to these clusters to an object of type
-  * \p AtomicType and uses these results to compute the matrix function of the whole matrix. The class
-  * \p AtomicType should have a \p compute() member function for computing the matrix function of a block.
-  *
-  * \sa class MatrixFunctionAtomic, class MatrixLogarithmAtomic
+  * Here, an atomic matrix is a triangular matrix whose diagonal entries are close to each other.
   */
-template <typename MatrixType, 
-	  typename AtomicType,  
-          int IsComplex = NumTraits<typename internal::traits<MatrixType>::Scalar>::IsComplex>
-class MatrixFunction
-{  
+template <typename MatrixType>
+class MatrixFunctionAtomic 
+{
   public:
 
-    /** \brief Constructor. 
-      *
-      * \param[in]  A       argument of matrix function, should be a square matrix.
-      * \param[in]  atomic  class for computing matrix function of atomic blocks.
-      *
-      * The class stores references to \p A and \p atomic, so they should not be
-      * changed (or destroyed) before compute() is called.
-      */
-    MatrixFunction(const MatrixType& A, AtomicType& atomic);
-
-    /** \brief Compute the matrix function.
-      *
-      * \param[out] result  the function \p f applied to \p A, as
-      * specified in the constructor.
-      *
-      * See MatrixBase::matrixFunction() for details on how this computation
-      * is implemented.
-      */
-    template <typename ResultType> 
-    void compute(ResultType &result);    
-};
-
-
-/** \internal \ingroup MatrixFunctions_Module 
-  * \brief Partial specialization of MatrixFunction for real matrices
-  */
-template <typename MatrixType, typename AtomicType>
-class MatrixFunction<MatrixType, AtomicType, 0>
-{  
-  private:
-
-    typedef internal::traits<MatrixType> Traits;
-    typedef typename Traits::Scalar Scalar;
-    static const int Rows = Traits::RowsAtCompileTime;
-    static const int Cols = Traits::ColsAtCompileTime;
-    static const int Options = MatrixType::Options;
-    static const int MaxRows = Traits::MaxRowsAtCompileTime;
-    static const int MaxCols = Traits::MaxColsAtCompileTime;
-
-    typedef std::complex<Scalar> ComplexScalar;
-    typedef Matrix<ComplexScalar, Rows, Cols, Options, MaxRows, MaxCols> ComplexMatrix;
-
-  public:
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename stem_function<Scalar>::type StemFunction;
 
-    /** \brief Constructor. 
-      *
-      * \param[in]  A       argument of matrix function, should be a square matrix.
-      * \param[in]  atomic  class for computing matrix function of atomic blocks.
+    /** \brief Constructor
+      * \param[in]  f  matrix function to compute.
       */
-    MatrixFunction(const MatrixType& A, AtomicType& atomic) : m_A(A), m_atomic(atomic) { }
+    MatrixFunctionAtomic(StemFunction f) : m_f(f) { }
 
-    /** \brief Compute the matrix function.
-      *
-      * \param[out] result  the function \p f applied to \p A, as
-      * specified in the constructor.
-      *
-      * This function converts the real matrix \c A to a complex matrix,
-      * uses MatrixFunction<MatrixType,1> and then converts the result back to
-      * a real matrix.
+    /** \brief Compute matrix function of atomic matrix
+      * \param[in]  A  argument of matrix function, should be upper triangular and atomic
+      * \returns  f(A), the matrix function evaluated at the given matrix
       */
-    template <typename ResultType>
-    void compute(ResultType& result) 
-    {
-      ComplexMatrix CA = m_A.template cast<ComplexScalar>();
-      ComplexMatrix Cresult;
-      MatrixFunction<ComplexMatrix, AtomicType> mf(CA, m_atomic);
-      mf.compute(Cresult);
-      result = Cresult.real();
-    }
-
-  private:
-    typename internal::nested<MatrixType>::type m_A; /**< \brief Reference to argument of matrix function. */
-    AtomicType& m_atomic; /**< \brief Class for computing matrix function of atomic blocks. */
-
-    MatrixFunction& operator=(const MatrixFunction&);
-};
-
-      
-/** \internal \ingroup MatrixFunctions_Module 
-  * \brief Partial specialization of MatrixFunction for complex matrices
-  */
-template <typename MatrixType, typename AtomicType>
-class MatrixFunction<MatrixType, AtomicType, 1>
-{
-  private:
-
-    typedef internal::traits<MatrixType> Traits;
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::Index Index;
-    static const int RowsAtCompileTime = Traits::RowsAtCompileTime;
-    static const int ColsAtCompileTime = Traits::ColsAtCompileTime;
-    static const int Options = MatrixType::Options;
-    typedef typename NumTraits<Scalar>::Real RealScalar;
-    typedef Matrix<Scalar, Traits::RowsAtCompileTime, 1> VectorType;
-    typedef Matrix<Index, Traits::RowsAtCompileTime, 1> IntVectorType;
-    typedef Matrix<Index, Dynamic, 1> DynamicIntVectorType;
-    typedef std::list<Scalar> Cluster;
-    typedef std::list<Cluster> ListOfClusters;
-    typedef Matrix<Scalar, Dynamic, Dynamic, Options, RowsAtCompileTime, ColsAtCompileTime> DynMatrixType;
-
-  public:
-
-    MatrixFunction(const MatrixType& A, AtomicType& atomic);
-    template <typename ResultType> void compute(ResultType& result);
+    MatrixType compute(const MatrixType& A);
 
   private:
-
-    void computeSchurDecomposition();
-    void partitionEigenvalues();
-    typename ListOfClusters::iterator findCluster(Scalar key);
-    void computeClusterSize();
-    void computeBlockStart();
-    void constructPermutation();
-    void permuteSchur();
-    void swapEntriesInSchur(Index index);
-    void computeBlockAtomic();
-    Block<MatrixType> block(MatrixType& A, Index i, Index j);
-    void computeOffDiagonal();
-    DynMatrixType solveTriangularSylvester(const DynMatrixType& A, const DynMatrixType& B, const DynMatrixType& C);
-
-    typename internal::nested<MatrixType>::type m_A; /**< \brief Reference to argument of matrix function. */
-    AtomicType& m_atomic; /**< \brief Class for computing matrix function of atomic blocks. */
-    MatrixType m_T; /**< \brief Triangular part of Schur decomposition */
-    MatrixType m_U; /**< \brief Unitary part of Schur decomposition */
-    MatrixType m_fT; /**< \brief %Matrix function applied to #m_T */
-    ListOfClusters m_clusters; /**< \brief Partition of eigenvalues into clusters of ei'vals "close" to each other */
-    DynamicIntVectorType m_eivalToCluster; /**< \brief m_eivalToCluster[i] = j means i-th ei'val is in j-th cluster */
-    DynamicIntVectorType m_clusterSize; /**< \brief Number of eigenvalues in each clusters  */
-    DynamicIntVectorType m_blockStart; /**< \brief Row index at which block corresponding to i-th cluster starts */
-    IntVectorType m_permutation; /**< \brief Permutation which groups ei'vals in the same cluster together */
-
-    /** \brief Maximum distance allowed between eigenvalues to be considered "close".
-      *
-      * This is morally a \c static \c const \c Scalar, but only
-      * integers can be static constant class members in C++. The
-      * separation constant is set to 0.1, a value taken from the
-      * paper by Davies and Higham. */
-    static const RealScalar separation() { return static_cast<RealScalar>(0.1); }
-
-    MatrixFunction& operator=(const MatrixFunction&);
+    StemFunction* m_f;
 };
 
-/** \brief Constructor. 
- *
- * \param[in]  A       argument of matrix function, should be a square matrix.
- * \param[in]  atomic  class for computing matrix function of atomic blocks.
- */
-template <typename MatrixType, typename AtomicType>
-MatrixFunction<MatrixType,AtomicType,1>::MatrixFunction(const MatrixType& A, AtomicType& atomic)
-  : m_A(A), m_atomic(atomic)
+template <typename MatrixType>
+typename NumTraits<typename MatrixType::Scalar>::Real matrix_function_compute_mu(const MatrixType& A)
 {
-  /* empty body */
+  typedef typename plain_col_type<MatrixType>::type VectorType;
+  typename MatrixType::Index rows = A.rows();
+  const MatrixType N = MatrixType::Identity(rows, rows) - A;
+  VectorType e = VectorType::Ones(rows);
+  N.template triangularView<Upper>().solveInPlace(e);
+  return e.cwiseAbs().maxCoeff();
 }
 
-/** \brief Compute the matrix function.
-  *
-  * \param[out] result  the function \p f applied to \p A, as
-  * specified in the constructor.
-  */
-template <typename MatrixType, typename AtomicType>
-template <typename ResultType>
-void MatrixFunction<MatrixType,AtomicType,1>::compute(ResultType& result) 
+template <typename MatrixType>
+MatrixType MatrixFunctionAtomic<MatrixType>::compute(const MatrixType& A)
 {
-  computeSchurDecomposition();
-  partitionEigenvalues();
-  computeClusterSize();
-  computeBlockStart();
-  constructPermutation();
-  permuteSchur();
-  computeBlockAtomic();
-  computeOffDiagonal();
-  result = m_U * (m_fT.template triangularView<Upper>() * m_U.adjoint());
+  // TODO: Use that A is upper triangular
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef typename MatrixType::Index Index;
+  Index rows = A.rows();
+  Scalar avgEival = A.trace() / Scalar(RealScalar(rows));
+  MatrixType Ashifted = A - avgEival * MatrixType::Identity(rows, rows);
+  RealScalar mu = matrix_function_compute_mu(Ashifted);
+  MatrixType F = m_f(avgEival, 0) * MatrixType::Identity(rows, rows);
+  MatrixType P = Ashifted;
+  MatrixType Fincr;
+  for (Index s = 1; s < 1.1 * rows + 10; s++) { // upper limit is fairly arbitrary
+    Fincr = m_f(avgEival, static_cast<int>(s)) * P;
+    F += Fincr;
+    P = Scalar(RealScalar(1.0/(s + 1))) * P * Ashifted;
+
+    // test whether Taylor series converged
+    const RealScalar F_norm = F.cwiseAbs().rowwise().sum().maxCoeff();
+    const RealScalar Fincr_norm = Fincr.cwiseAbs().rowwise().sum().maxCoeff();
+    if (Fincr_norm < NumTraits<Scalar>::epsilon() * F_norm) {
+      RealScalar delta = 0;
+      RealScalar rfactorial = 1;
+      for (Index r = 0; r < rows; r++) {
+        RealScalar mx = 0;
+        for (Index i = 0; i < rows; i++)
+          mx = (std::max)(mx, std::abs(m_f(Ashifted(i, i) + avgEival, static_cast<int>(s+r))));
+        if (r != 0)
+          rfactorial *= RealScalar(r);
+        delta = (std::max)(delta, mx / rfactorial);
+      }
+      const RealScalar P_norm = P.cwiseAbs().rowwise().sum().maxCoeff();
+      if (mu * delta * P_norm < NumTraits<Scalar>::epsilon() * F_norm) // series converged
+        break;
+    }
+  }
+  return F;
 }
 
-/** \brief Store the Schur decomposition of #m_A in #m_T and #m_U */
-template <typename MatrixType, typename AtomicType>
-void MatrixFunction<MatrixType,AtomicType,1>::computeSchurDecomposition()
+/** \brief Find cluster in \p clusters containing some value 
+  * \param[in] key Value to find
+  * \returns Iterator to cluster containing \p key, or \c clusters.end() if no cluster in \p m_clusters
+  * contains \p key.
+  */
+template <typename Index, typename ListOfClusters>
+typename ListOfClusters::iterator matrix_function_find_cluster(Index key, ListOfClusters& clusters)
 {
-  const ComplexSchur<MatrixType> schurOfA(m_A);  
-  m_T = schurOfA.matrixT();
-  m_U = schurOfA.matrixU();
+  typename std::list<Index>::iterator j;
+  for (typename ListOfClusters::iterator i = clusters.begin(); i != clusters.end(); ++i) {
+    j = std::find(i->begin(), i->end(), key);
+    if (j != i->end())
+      return i;
+  }
+  return clusters.end();
 }
 
 /** \brief Partition eigenvalues in clusters of ei'vals close to each other
   * 
-  * This function computes #m_clusters. This is a partition of the
-  * eigenvalues of #m_T in clusters, such that
-  * # Any eigenvalue in a certain cluster is at most separation() away
-  *   from another eigenvalue in the same cluster.
-  * # The distance between two eigenvalues in different clusters is
-  *   more than separation().
-  * The implementation follows Algorithm 4.1 in the paper of Davies
-  * and Higham. 
+  * \param[in]  eivals    Eigenvalues
+  * \param[out] clusters  Resulting partition of eigenvalues
+  *
+  * The partition satisfies the following two properties:
+  * # Any eigenvalue in a certain cluster is at most matrix_function_separation() away from another eigenvalue
+  *   in the same cluster.
+  * # The distance between two eigenvalues in different clusters is more than matrix_function_separation().  
+  * The implementation follows Algorithm 4.1 in the paper of Davies and Higham.
   */
-template <typename MatrixType, typename AtomicType>
-void MatrixFunction<MatrixType,AtomicType,1>::partitionEigenvalues()
+template <typename EivalsType, typename Cluster>
+void matrix_function_partition_eigenvalues(const EivalsType& eivals, std::list<Cluster>& clusters)
 {
-  using std::abs;
-  const Index rows = m_T.rows();
-  VectorType diag = m_T.diagonal(); // contains eigenvalues of A
-
-  for (Index i=0; i<rows; ++i) {
-    // Find set containing diag(i), adding a new set if necessary
-    typename ListOfClusters::iterator qi = findCluster(diag(i));
-    if (qi == m_clusters.end()) {
+  typedef typename EivalsType::Index Index;
+  typedef typename EivalsType::RealScalar RealScalar;
+  for (Index i=0; i<eivals.rows(); ++i) {
+    // Find cluster containing i-th ei'val, adding a new cluster if necessary
+    typename std::list<Cluster>::iterator qi = matrix_function_find_cluster(i, clusters);
+    if (qi == clusters.end()) {
       Cluster l;
-      l.push_back(diag(i));
-      m_clusters.push_back(l);
-      qi = m_clusters.end();
+      l.push_back(i);
+      clusters.push_back(l);
+      qi = clusters.end();
       --qi;
     }
 
     // Look for other element to add to the set
-    for (Index j=i+1; j<rows; ++j) {
-      if (abs(diag(j) - diag(i)) <= separation() && std::find(qi->begin(), qi->end(), diag(j)) == qi->end()) {
-        typename ListOfClusters::iterator qj = findCluster(diag(j));
-        if (qj == m_clusters.end()) {
-          qi->push_back(diag(j));
+    for (Index j=i+1; j<eivals.rows(); ++j) {
+      if (abs(eivals(j) - eivals(i)) <= RealScalar(matrix_function_separation)
+          && std::find(qi->begin(), qi->end(), j) == qi->end()) {
+        typename std::list<Cluster>::iterator qj = matrix_function_find_cluster(j, clusters);
+        if (qj == clusters.end()) {
+          qi->push_back(j);
         } else {
           qi->insert(qi->end(), qj->begin(), qj->end());
-          m_clusters.erase(qj);
+          clusters.erase(qj);
         }
       }
     }
   }
 }
 
-/** \brief Find cluster in #m_clusters containing some value 
-  * \param[in] key Value to find
-  * \returns Iterator to cluster containing \c key, or
-  * \c m_clusters.end() if no cluster in m_clusters contains \c key.
-  */
-template <typename MatrixType, typename AtomicType>
-typename MatrixFunction<MatrixType,AtomicType,1>::ListOfClusters::iterator MatrixFunction<MatrixType,AtomicType,1>::findCluster(Scalar key)
+/** \brief Compute size of each cluster given a partitioning */
+template <typename ListOfClusters, typename Index>
+void matrix_function_compute_cluster_size(const ListOfClusters& clusters, Matrix<Index, Dynamic, 1>& clusterSize)
 {
-  typename Cluster::iterator j;
-  for (typename ListOfClusters::iterator i = m_clusters.begin(); i != m_clusters.end(); ++i) {
-    j = std::find(i->begin(), i->end(), key);
-    if (j != i->end())
-      return i;
+  const Index numClusters = static_cast<Index>(clusters.size());
+  clusterSize.setZero(numClusters);
+  Index clusterIndex = 0;
+  for (typename ListOfClusters::const_iterator cluster = clusters.begin(); cluster != clusters.end(); ++cluster) {
+    clusterSize[clusterIndex] = cluster->size();
+    ++clusterIndex;
   }
-  return m_clusters.end();
 }
 
-/** \brief Compute #m_clusterSize and #m_eivalToCluster using #m_clusters */
-template <typename MatrixType, typename AtomicType>
-void MatrixFunction<MatrixType,AtomicType,1>::computeClusterSize()
+/** \brief Compute start of each block using clusterSize */
+template <typename VectorType>
+void matrix_function_compute_block_start(const VectorType& clusterSize, VectorType& blockStart)
 {
-  const Index rows = m_T.rows();
-  VectorType diag = m_T.diagonal(); 
-  const Index numClusters = static_cast<Index>(m_clusters.size());
+  blockStart.resize(clusterSize.rows());
+  blockStart(0) = 0;
+  for (typename VectorType::Index i = 1; i < clusterSize.rows(); i++) {
+    blockStart(i) = blockStart(i-1) + clusterSize(i-1);
+  }
+}
 
-  m_clusterSize.setZero(numClusters);
-  m_eivalToCluster.resize(rows);
+/** \brief Compute mapping of eigenvalue indices to cluster indices */
+template <typename EivalsType, typename ListOfClusters, typename VectorType>
+void matrix_function_compute_map(const EivalsType& eivals, const ListOfClusters& clusters, VectorType& eivalToCluster)
+{
+  typedef typename EivalsType::Index Index;
+  eivalToCluster.resize(eivals.rows());
   Index clusterIndex = 0;
-  for (typename ListOfClusters::const_iterator cluster = m_clusters.begin(); cluster != m_clusters.end(); ++cluster) {
-    for (Index i = 0; i < diag.rows(); ++i) {
-      if (std::find(cluster->begin(), cluster->end(), diag(i)) != cluster->end()) {
-        ++m_clusterSize[clusterIndex];
-        m_eivalToCluster[i] = clusterIndex;
+  for (typename ListOfClusters::const_iterator cluster = clusters.begin(); cluster != clusters.end(); ++cluster) {
+    for (Index i = 0; i < eivals.rows(); ++i) {
+      if (std::find(cluster->begin(), cluster->end(), i) != cluster->end()) {
+        eivalToCluster[i] = clusterIndex;
       }
     }
     ++clusterIndex;
   }
 }
 
-/** \brief Compute #m_blockStart using #m_clusterSize */
-template <typename MatrixType, typename AtomicType>
-void MatrixFunction<MatrixType,AtomicType,1>::computeBlockStart()
-{
-  m_blockStart.resize(m_clusterSize.rows());
-  m_blockStart(0) = 0;
-  for (Index i = 1; i < m_clusterSize.rows(); i++) {
-    m_blockStart(i) = m_blockStart(i-1) + m_clusterSize(i-1);
-  }
-}
-
-/** \brief Compute #m_permutation using #m_eivalToCluster and #m_blockStart */
-template <typename MatrixType, typename AtomicType>
-void MatrixFunction<MatrixType,AtomicType,1>::constructPermutation()
+/** \brief Compute permutation which groups ei'vals in same cluster together */
+template <typename DynVectorType, typename VectorType>
+void matrix_function_compute_permutation(const DynVectorType& blockStart, const DynVectorType& eivalToCluster, VectorType& permutation)
 {
-  DynamicIntVectorType indexNextEntry = m_blockStart;
-  m_permutation.resize(m_T.rows());
-  for (Index i = 0; i < m_T.rows(); i++) {
-    Index cluster = m_eivalToCluster[i];
-    m_permutation[i] = indexNextEntry[cluster];
+  typedef typename VectorType::Index Index;
+  DynVectorType indexNextEntry = blockStart;
+  permutation.resize(eivalToCluster.rows());
+  for (Index i = 0; i < eivalToCluster.rows(); i++) {
+    Index cluster = eivalToCluster[i];
+    permutation[i] = indexNextEntry[cluster];
     ++indexNextEntry[cluster];
   }
 }  
 
-/** \brief Permute Schur decomposition in #m_U and #m_T according to #m_permutation */
-template <typename MatrixType, typename AtomicType>
-void MatrixFunction<MatrixType,AtomicType,1>::permuteSchur()
+/** \brief Permute Schur decomposition in U and T according to permutation */
+template <typename VectorType, typename MatrixType>
+void matrix_function_permute_schur(VectorType& permutation, MatrixType& U, MatrixType& T)
 {
-  IntVectorType p = m_permutation;
-  for (Index i = 0; i < p.rows() - 1; i++) {
+  typedef typename VectorType::Index Index;
+  for (Index i = 0; i < permutation.rows() - 1; i++) {
     Index j;
-    for (j = i; j < p.rows(); j++) {
-      if (p(j) == i) break;
+    for (j = i; j < permutation.rows(); j++) {
+      if (permutation(j) == i) break;
     }
-    eigen_assert(p(j) == i);
+    eigen_assert(permutation(j) == i);
     for (Index k = j-1; k >= i; k--) {
-      swapEntriesInSchur(k);
-      std::swap(p.coeffRef(k), p.coeffRef(k+1));
+      JacobiRotation<typename MatrixType::Scalar> rotation;
+      rotation.makeGivens(T(k, k+1), T(k+1, k+1) - T(k, k));
+      T.applyOnTheLeft(k, k+1, rotation.adjoint());
+      T.applyOnTheRight(k, k+1, rotation);
+      U.applyOnTheRight(k, k+1, rotation);
+      std::swap(permutation.coeffRef(k), permutation.coeffRef(k+1));
     }
   }
 }
 
-/** \brief Swap rows \a index and \a index+1 in Schur decomposition in #m_U and #m_T */
-template <typename MatrixType, typename AtomicType>
-void MatrixFunction<MatrixType,AtomicType,1>::swapEntriesInSchur(Index index)
-{
-  JacobiRotation<Scalar> rotation;
-  rotation.makeGivens(m_T(index, index+1), m_T(index+1, index+1) - m_T(index, index));
-  m_T.applyOnTheLeft(index, index+1, rotation.adjoint());
-  m_T.applyOnTheRight(index, index+1, rotation);
-  m_U.applyOnTheRight(index, index+1, rotation);
-}  
-
-/** \brief Compute block diagonal part of #m_fT.
-  *
-  * This routine computes the matrix function applied to the block diagonal part of #m_T, with the blocking
-  * given by #m_blockStart. The matrix function of each diagonal block is computed by #m_atomic. The
-  * off-diagonal parts of #m_fT are set to zero.
-  */
-template <typename MatrixType, typename AtomicType>
-void MatrixFunction<MatrixType,AtomicType,1>::computeBlockAtomic()
-{ 
-  m_fT.resize(m_T.rows(), m_T.cols());
-  m_fT.setZero();
-  for (Index i = 0; i < m_clusterSize.rows(); ++i) {
-    block(m_fT, i, i) = m_atomic.compute(block(m_T, i, i));
-  }
-}
-
-/** \brief Return block of matrix according to blocking given by #m_blockStart */
-template <typename MatrixType, typename AtomicType>
-Block<MatrixType> MatrixFunction<MatrixType,AtomicType,1>::block(MatrixType& A, Index i, Index j)
-{
-  return A.block(m_blockStart(i), m_blockStart(j), m_clusterSize(i), m_clusterSize(j));
-}
-
-/** \brief Compute part of #m_fT above block diagonal.
+/** \brief Compute block diagonal part of matrix function.
   *
-  * This routine assumes that the block diagonal part of #m_fT (which
-  * equals the matrix function applied to #m_T) has already been computed and computes
-  * the part above the block diagonal. The part below the diagonal is
-  * zero, because #m_T is upper triangular.
+  * This routine computes the matrix function applied to the block diagonal part of \p T (which should be
+  * upper triangular), with the blocking given by \p blockStart and \p clusterSize. The matrix function of
+  * each diagonal block is computed by \p atomic. The off-diagonal parts of \p fT are set to zero.
   */
-template <typename MatrixType, typename AtomicType>
-void MatrixFunction<MatrixType,AtomicType,1>::computeOffDiagonal()
+template <typename MatrixType, typename AtomicType, typename VectorType>
+void matrix_function_compute_block_atomic(const MatrixType& T, AtomicType& atomic, const VectorType& blockStart, const VectorType& clusterSize, MatrixType& fT)
 { 
-  for (Index diagIndex = 1; diagIndex < m_clusterSize.rows(); diagIndex++) {
-    for (Index blockIndex = 0; blockIndex < m_clusterSize.rows() - diagIndex; blockIndex++) {
-      // compute (blockIndex, blockIndex+diagIndex) block
-      DynMatrixType A = block(m_T, blockIndex, blockIndex);
-      DynMatrixType B = -block(m_T, blockIndex+diagIndex, blockIndex+diagIndex);
-      DynMatrixType C = block(m_fT, blockIndex, blockIndex) * block(m_T, blockIndex, blockIndex+diagIndex);
-      C -= block(m_T, blockIndex, blockIndex+diagIndex) * block(m_fT, blockIndex+diagIndex, blockIndex+diagIndex);
-      for (Index k = blockIndex + 1; k < blockIndex + diagIndex; k++) {
-	C += block(m_fT, blockIndex, k) * block(m_T, k, blockIndex+diagIndex);
-	C -= block(m_T, blockIndex, k) * block(m_fT, k, blockIndex+diagIndex);
-      }
-      block(m_fT, blockIndex, blockIndex+diagIndex) = solveTriangularSylvester(A, B, C);
-    }
+  fT.setZero(T.rows(), T.cols());
+  for (typename VectorType::Index i = 0; i < clusterSize.rows(); ++i) {
+    fT.block(blockStart(i), blockStart(i), clusterSize(i), clusterSize(i))
+      = atomic.compute(T.block(blockStart(i), blockStart(i), clusterSize(i), clusterSize(i)));
   }
 }
 
@@ -414,8 +261,8 @@ void MatrixFunction<MatrixType,AtomicType,1>::computeOffDiagonal()
   *
   * \returns the solution X.
   *
-  * If A is m-by-m and B is n-by-n, then both C and X are m-by-n. 
-  * The (i,j)-th component of the Sylvester equation is
+  * If A is m-by-m and B is n-by-n, then both C and X are m-by-n.  The (i,j)-th component of the Sylvester
+  * equation is
   * \f[ 
   *     \sum_{k=i}^m A_{ik} X_{kj} + \sum_{k=1}^j X_{ik} B_{kj} = C_{ij}. 
   * \f]
@@ -424,16 +271,12 @@ void MatrixFunction<MatrixType,AtomicType,1>::computeOffDiagonal()
   *     X_{ij} = \frac{1}{A_{ii} + B_{jj}} \Bigl( C_{ij}
   *     - \sum_{k=i+1}^m A_{ik} X_{kj} - \sum_{k=1}^{j-1} X_{ik} B_{kj} \Bigr).
   * \f]
-  * It is assumed that A and B are such that the numerator is never
-  * zero (otherwise the Sylvester equation does not have a unique
-  * solution). In that case, these equations can be evaluated in the
-  * order \f$ i=m,\ldots,1 \f$ and \f$ j=1,\ldots,n \f$.
+  * It is assumed that A and B are such that the numerator is never zero (otherwise the Sylvester equation
+  * does not have a unique solution). In that case, these equations can be evaluated in the order 
+  * \f$ i=m,\ldots,1 \f$ and \f$ j=1,\ldots,n \f$.
   */
-template <typename MatrixType, typename AtomicType>
-typename MatrixFunction<MatrixType,AtomicType,1>::DynMatrixType MatrixFunction<MatrixType,AtomicType,1>::solveTriangularSylvester(
-  const DynMatrixType& A, 
-  const DynMatrixType& B, 
-  const DynMatrixType& C)
+template <typename MatrixType>
+MatrixType matrix_function_solve_triangular_sylvester(const MatrixType& A, const MatrixType& B, const MatrixType& C)
 {
   eigen_assert(A.rows() == A.cols());
   eigen_assert(A.isUpperTriangular());
@@ -442,9 +285,12 @@ typename MatrixFunction<MatrixType,AtomicType,1>::DynMatrixType MatrixFunction<M
   eigen_assert(C.rows() == A.rows());
   eigen_assert(C.cols() == B.rows());
 
+  typedef typename MatrixType::Index Index;
+  typedef typename MatrixType::Scalar Scalar;
+
   Index m = A.rows();
   Index n = B.rows();
-  DynMatrixType X(m, n);
+  MatrixType X(m, n);
 
   for (Index i = m - 1; i >= 0; --i) {
     for (Index j = 0; j < n; ++j) {
@@ -473,66 +319,210 @@ typename MatrixFunction<MatrixType,AtomicType,1>::DynMatrixType MatrixFunction<M
   return X;
 }
 
+/** \brief Compute part of matrix function above block diagonal.
+  *
+  * This routine completes the computation of \p fT, denoting a matrix function applied to the triangular
+  * matrix \p T. It assumes that the block diagonal part of \p fT has already been computed. The part below
+  * the diagonal is zero, because \p T is upper triangular.
+  */
+template <typename MatrixType, typename VectorType>
+void matrix_function_compute_above_diagonal(const MatrixType& T, const VectorType& blockStart, const VectorType& clusterSize, MatrixType& fT)
+{ 
+  typedef internal::traits<MatrixType> Traits;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::Index Index;
+  static const int RowsAtCompileTime = Traits::RowsAtCompileTime;
+  static const int ColsAtCompileTime = Traits::ColsAtCompileTime;
+  static const int Options = MatrixType::Options;
+  typedef Matrix<Scalar, Dynamic, Dynamic, Options, RowsAtCompileTime, ColsAtCompileTime> DynMatrixType;
+
+  for (Index k = 1; k < clusterSize.rows(); k++) {
+    for (Index i = 0; i < clusterSize.rows() - k; i++) {
+      // compute (i, i+k) block
+      DynMatrixType A = T.block(blockStart(i), blockStart(i), clusterSize(i), clusterSize(i));
+      DynMatrixType B = -T.block(blockStart(i+k), blockStart(i+k), clusterSize(i+k), clusterSize(i+k));
+      DynMatrixType C = fT.block(blockStart(i), blockStart(i), clusterSize(i), clusterSize(i))
+        * T.block(blockStart(i), blockStart(i+k), clusterSize(i), clusterSize(i+k));
+      C -= T.block(blockStart(i), blockStart(i+k), clusterSize(i), clusterSize(i+k))
+        * fT.block(blockStart(i+k), blockStart(i+k), clusterSize(i+k), clusterSize(i+k));
+      for (Index m = i + 1; m < i + k; m++) {
+        C += fT.block(blockStart(i), blockStart(m), clusterSize(i), clusterSize(m))
+          * T.block(blockStart(m), blockStart(i+k), clusterSize(m), clusterSize(i+k));
+        C -= T.block(blockStart(i), blockStart(m), clusterSize(i), clusterSize(m))
+          * fT.block(blockStart(m), blockStart(i+k), clusterSize(m), clusterSize(i+k));
+      }
+      fT.block(blockStart(i), blockStart(i+k), clusterSize(i), clusterSize(i+k))
+        = matrix_function_solve_triangular_sylvester(A, B, C);
+    }
+  }
+}
+
+/** \ingroup MatrixFunctions_Module
+  * \brief Class for computing matrix functions.
+  * \tparam  MatrixType  type of the argument of the matrix function,
+  *                      expected to be an instantiation of the Matrix class template.
+  * \tparam  AtomicType  type for computing matrix function of atomic blocks.
+  * \tparam  IsComplex   used internally to select correct specialization.
+  *
+  * This class implements the Schur-Parlett algorithm for computing matrix functions. The spectrum of the
+  * matrix is divided in clustered of eigenvalues that lies close together. This class delegates the
+  * computation of the matrix function on every block corresponding to these clusters to an object of type
+  * \p AtomicType and uses these results to compute the matrix function of the whole matrix. The class
+  * \p AtomicType should have a \p compute() member function for computing the matrix function of a block.
+  *
+  * \sa class MatrixFunctionAtomic, class MatrixLogarithmAtomic
+  */
+template <typename MatrixType, int IsComplex = NumTraits<typename internal::traits<MatrixType>::Scalar>::IsComplex>
+struct matrix_function_compute
+{  
+    /** \brief Compute the matrix function.
+      *
+      * \param[in]  A       argument of matrix function, should be a square matrix.
+      * \param[in]  atomic  class for computing matrix function of atomic blocks.
+      * \param[out] result  the function \p f applied to \p A, as
+      * specified in the constructor.
+      *
+      * See MatrixBase::matrixFunction() for details on how this computation
+      * is implemented.
+      */
+    template <typename AtomicType, typename ResultType> 
+    static void run(const MatrixType& A, AtomicType& atomic, ResultType &result);    
+};
+
+/** \internal \ingroup MatrixFunctions_Module 
+  * \brief Partial specialization of MatrixFunction for real matrices
+  *
+  * This converts the real matrix to a complex matrix, compute the matrix function of that matrix, and then
+  * converts the result back to a real matrix.
+  */
+template <typename MatrixType>
+struct matrix_function_compute<MatrixType, 0>
+{  
+  template <typename AtomicType, typename ResultType> 
+  static void run(const MatrixType& A, AtomicType& atomic, ResultType &result)
+  {
+    typedef internal::traits<MatrixType> Traits;
+    typedef typename Traits::Scalar Scalar;
+    static const int Rows = Traits::RowsAtCompileTime, Cols = Traits::ColsAtCompileTime;
+    static const int MaxRows = Traits::MaxRowsAtCompileTime, MaxCols = Traits::MaxColsAtCompileTime;
+
+    typedef std::complex<Scalar> ComplexScalar;
+    typedef Matrix<ComplexScalar, Rows, Cols, 0, MaxRows, MaxCols> ComplexMatrix;
+
+    ComplexMatrix CA = A.template cast<ComplexScalar>();
+    ComplexMatrix Cresult;
+    matrix_function_compute<ComplexMatrix>::run(CA, atomic, Cresult);
+    result = Cresult.real();
+  }
+};
+
+/** \internal \ingroup MatrixFunctions_Module 
+  * \brief Partial specialization of MatrixFunction for complex matrices
+  */
+template <typename MatrixType>
+struct matrix_function_compute<MatrixType, 1>
+{
+  template <typename AtomicType, typename ResultType> 
+  static void run(const MatrixType& A, AtomicType& atomic, ResultType &result)
+  {
+    typedef internal::traits<MatrixType> Traits;
+    typedef typename MatrixType::Index Index;
+    
+    // compute Schur decomposition of A
+    const ComplexSchur<MatrixType> schurOfA(A);  
+    MatrixType T = schurOfA.matrixT();
+    MatrixType U = schurOfA.matrixU();
+
+    // partition eigenvalues into clusters of ei'vals "close" to each other
+    std::list<std::list<Index> > clusters; 
+    matrix_function_partition_eigenvalues(T.diagonal(), clusters);
+
+    // compute size of each cluster
+    Matrix<Index, Dynamic, 1> clusterSize;
+    matrix_function_compute_cluster_size(clusters, clusterSize);
+
+    // blockStart[i] is row index at which block corresponding to i-th cluster starts 
+    Matrix<Index, Dynamic, 1> blockStart; 
+    matrix_function_compute_block_start(clusterSize, blockStart);
+
+    // compute map so that eivalToCluster[i] = j means that i-th ei'val is in j-th cluster 
+    Matrix<Index, Dynamic, 1> eivalToCluster;
+    matrix_function_compute_map(T.diagonal(), clusters, eivalToCluster);
+
+    // compute permutation which groups ei'vals in same cluster together 
+    Matrix<Index, Traits::RowsAtCompileTime, 1> permutation;
+    matrix_function_compute_permutation(blockStart, eivalToCluster, permutation);
+
+    // permute Schur decomposition
+    matrix_function_permute_schur(permutation, U, T);
+
+    // compute result
+    MatrixType fT; // matrix function applied to T
+    matrix_function_compute_block_atomic(T, atomic, blockStart, clusterSize, fT);
+    matrix_function_compute_above_diagonal(T, blockStart, clusterSize, fT);
+    result = U * (fT.template triangularView<Upper>() * U.adjoint());
+  }
+};
+
+} // end of namespace internal
+
 /** \ingroup MatrixFunctions_Module
   *
   * \brief Proxy for the matrix function of some matrix (expression).
   *
   * \tparam Derived  Type of the argument to the matrix function.
   *
-  * This class holds the argument to the matrix function until it is
-  * assigned or evaluated for some other reason (so the argument
-  * should not be changed in the meantime). It is the return type of
-  * matrixBase::matrixFunction() and related functions and most of the
-  * time this is the only way it is used.
+  * This class holds the argument to the matrix function until it is assigned or evaluated for some other
+  * reason (so the argument should not be changed in the meantime). It is the return type of
+  * matrixBase::matrixFunction() and related functions and most of the time this is the only way it is used.
   */
 template<typename Derived> class MatrixFunctionReturnValue
 : public ReturnByValue<MatrixFunctionReturnValue<Derived> >
 {
   public:
-
     typedef typename Derived::Scalar Scalar;
     typedef typename Derived::Index Index;
     typedef typename internal::stem_function<Scalar>::type StemFunction;
 
-   /** \brief Constructor.
+  protected:
+    typedef typename internal::ref_selector<Derived>::type DerivedNested;
+
+  public:
+
+    /** \brief Constructor.
       *
-      * \param[in] A  %Matrix (expression) forming the argument of the
-      * matrix function.
+      * \param[in] A  %Matrix (expression) forming the argument of the matrix function.
       * \param[in] f  Stem function for matrix function under consideration.
       */
     MatrixFunctionReturnValue(const Derived& A, StemFunction f) : m_A(A), m_f(f) { }
 
     /** \brief Compute the matrix function.
       *
-      * \param[out] result \p f applied to \p A, where \p f and \p A
-      * are as in the constructor.
+      * \param[out] result \p f applied to \p A, where \p f and \p A are as in the constructor.
       */
     template <typename ResultType>
     inline void evalTo(ResultType& result) const
     {
-      typedef typename Derived::PlainObject PlainObject;
-      typedef internal::traits<PlainObject> Traits;
+      typedef typename internal::nested_eval<Derived, 10>::type NestedEvalType;
+      typedef typename internal::remove_all<NestedEvalType>::type NestedEvalTypeClean;
+      typedef internal::traits<NestedEvalTypeClean> Traits;
       static const int RowsAtCompileTime = Traits::RowsAtCompileTime;
       static const int ColsAtCompileTime = Traits::ColsAtCompileTime;
-      static const int Options = PlainObject::Options;
       typedef std::complex<typename NumTraits<Scalar>::Real> ComplexScalar;
-      typedef Matrix<ComplexScalar, Dynamic, Dynamic, Options, RowsAtCompileTime, ColsAtCompileTime> DynMatrixType;
-      typedef MatrixFunctionAtomic<DynMatrixType> AtomicType;
+      typedef Matrix<ComplexScalar, Dynamic, Dynamic, 0, RowsAtCompileTime, ColsAtCompileTime> DynMatrixType;
+
+      typedef internal::MatrixFunctionAtomic<DynMatrixType> AtomicType;
       AtomicType atomic(m_f);
 
-      const PlainObject Aevaluated = m_A.eval();
-      MatrixFunction<PlainObject, AtomicType> mf(Aevaluated, atomic);
-      mf.compute(result);
+      internal::matrix_function_compute<NestedEvalTypeClean>::run(m_A, atomic, result);
     }
 
     Index rows() const { return m_A.rows(); }
     Index cols() const { return m_A.cols(); }
 
   private:
-    typename internal::nested<Derived>::type m_A;
+    const DerivedNested m_A;
     StemFunction *m_f;
-
-    MatrixFunctionReturnValue& operator=(const MatrixFunctionReturnValue&);
 };
 
 namespace internal {
@@ -559,7 +549,7 @@ const MatrixFunctionReturnValue<Derived> MatrixBase<Derived>::sin() const
 {
   eigen_assert(rows() == cols());
   typedef typename internal::stem_function<Scalar>::ComplexScalar ComplexScalar;
-  return MatrixFunctionReturnValue<Derived>(derived(), StdStemFunctions<ComplexScalar>::sin);
+  return MatrixFunctionReturnValue<Derived>(derived(), internal::stem_function_sin<ComplexScalar>);
 }
 
 template <typename Derived>
@@ -567,7 +557,7 @@ const MatrixFunctionReturnValue<Derived> MatrixBase<Derived>::cos() const
 {
   eigen_assert(rows() == cols());
   typedef typename internal::stem_function<Scalar>::ComplexScalar ComplexScalar;
-  return MatrixFunctionReturnValue<Derived>(derived(), StdStemFunctions<ComplexScalar>::cos);
+  return MatrixFunctionReturnValue<Derived>(derived(), internal::stem_function_cos<ComplexScalar>);
 }
 
 template <typename Derived>
@@ -575,7 +565,7 @@ const MatrixFunctionReturnValue<Derived> MatrixBase<Derived>::sinh() const
 {
   eigen_assert(rows() == cols());
   typedef typename internal::stem_function<Scalar>::ComplexScalar ComplexScalar;
-  return MatrixFunctionReturnValue<Derived>(derived(), StdStemFunctions<ComplexScalar>::sinh);
+  return MatrixFunctionReturnValue<Derived>(derived(), internal::stem_function_sinh<ComplexScalar>);
 }
 
 template <typename Derived>
@@ -583,7 +573,7 @@ const MatrixFunctionReturnValue<Derived> MatrixBase<Derived>::cosh() const
 {
   eigen_assert(rows() == cols());
   typedef typename internal::stem_function<Scalar>::ComplexScalar ComplexScalar;
-  return MatrixFunctionReturnValue<Derived>(derived(), StdStemFunctions<ComplexScalar>::cosh);
+  return MatrixFunctionReturnValue<Derived>(derived(), internal::stem_function_cosh<ComplexScalar>);
 }
 
 } // end namespace Eigen
diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixFunctionAtomic.h b/unsupported/Eigen/src/MatrixFunctions/MatrixFunctionAtomic.h
deleted file mode 100644
index efe332c48..000000000
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixFunctionAtomic.h
+++ /dev/null
@@ -1,131 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2009 Jitse Niesen <jitse@maths.leeds.ac.uk>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_MATRIX_FUNCTION_ATOMIC
-#define EIGEN_MATRIX_FUNCTION_ATOMIC
-
-namespace Eigen { 
-
-/** \ingroup MatrixFunctions_Module
-  * \class MatrixFunctionAtomic
-  * \brief Helper class for computing matrix functions of atomic matrices.
-  *
-  * \internal
-  * Here, an atomic matrix is a triangular matrix whose diagonal
-  * entries are close to each other.
-  */
-template <typename MatrixType>
-class MatrixFunctionAtomic
-{
-  public:
-
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::Index Index;
-    typedef typename NumTraits<Scalar>::Real RealScalar;
-    typedef typename internal::stem_function<Scalar>::type StemFunction;
-    typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
-
-    /** \brief Constructor
-      * \param[in]  f  matrix function to compute.
-      */
-    MatrixFunctionAtomic(StemFunction f) : m_f(f) { }
-
-    /** \brief Compute matrix function of atomic matrix
-      * \param[in]  A  argument of matrix function, should be upper triangular and atomic
-      * \returns  f(A), the matrix function evaluated at the given matrix
-      */
-    MatrixType compute(const MatrixType& A);
-
-  private:
-
-    // Prevent copying
-    MatrixFunctionAtomic(const MatrixFunctionAtomic&);
-    MatrixFunctionAtomic& operator=(const MatrixFunctionAtomic&);
-
-    void computeMu();
-    bool taylorConverged(Index s, const MatrixType& F, const MatrixType& Fincr, const MatrixType& P);
-
-    /** \brief Pointer to scalar function */
-    StemFunction* m_f;
-
-    /** \brief Size of matrix function */
-    Index m_Arows;
-
-    /** \brief Mean of eigenvalues */
-    Scalar m_avgEival;
-
-    /** \brief Argument shifted by mean of eigenvalues */
-    MatrixType m_Ashifted;
-
-    /** \brief Constant used to determine whether Taylor series has converged */
-    RealScalar m_mu;
-};
-
-template <typename MatrixType>
-MatrixType MatrixFunctionAtomic<MatrixType>::compute(const MatrixType& A)
-{
-  // TODO: Use that A is upper triangular
-  m_Arows = A.rows();
-  m_avgEival = A.trace() / Scalar(RealScalar(m_Arows));
-  m_Ashifted = A - m_avgEival * MatrixType::Identity(m_Arows, m_Arows);
-  computeMu();
-  MatrixType F = m_f(m_avgEival, 0) * MatrixType::Identity(m_Arows, m_Arows);
-  MatrixType P = m_Ashifted;
-  MatrixType Fincr;
-  for (Index s = 1; s < 1.1 * m_Arows + 10; s++) { // upper limit is fairly arbitrary
-    Fincr = m_f(m_avgEival, static_cast<int>(s)) * P;
-    F += Fincr;
-    P = Scalar(RealScalar(1.0/(s + 1))) * P * m_Ashifted;
-    if (taylorConverged(s, F, Fincr, P)) {
-      return F;
-    }
-  }
-  eigen_assert("Taylor series does not converge" && 0);
-  return F;
-}
-
-/** \brief Compute \c m_mu. */
-template <typename MatrixType>
-void MatrixFunctionAtomic<MatrixType>::computeMu()
-{
-  const MatrixType N = MatrixType::Identity(m_Arows, m_Arows) - m_Ashifted;
-  VectorType e = VectorType::Ones(m_Arows);
-  N.template triangularView<Upper>().solveInPlace(e);
-  m_mu = e.cwiseAbs().maxCoeff();
-}
-
-/** \brief Determine whether Taylor series has converged */
-template <typename MatrixType>
-bool MatrixFunctionAtomic<MatrixType>::taylorConverged(Index s, const MatrixType& F,
-						       const MatrixType& Fincr, const MatrixType& P)
-{
-  const Index n = F.rows();
-  const RealScalar F_norm = F.cwiseAbs().rowwise().sum().maxCoeff();
-  const RealScalar Fincr_norm = Fincr.cwiseAbs().rowwise().sum().maxCoeff();
-  if (Fincr_norm < NumTraits<Scalar>::epsilon() * F_norm) {
-    RealScalar delta = 0;
-    RealScalar rfactorial = 1;
-    for (Index r = 0; r < n; r++) {
-      RealScalar mx = 0;
-      for (Index i = 0; i < n; i++)
-        mx = (std::max)(mx, std::abs(m_f(m_Ashifted(i, i) + m_avgEival, static_cast<int>(s+r))));
-      if (r != 0)
-        rfactorial *= RealScalar(r);
-      delta = (std::max)(delta, mx / rfactorial);
-    }
-    const RealScalar P_norm = P.cwiseAbs().rowwise().sum().maxCoeff();
-    if (m_mu * delta * P_norm < NumTraits<Scalar>::epsilon() * F_norm)
-      return true;
-  }
-  return false;
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN_MATRIX_FUNCTION_ATOMIC
diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h b/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
index c744fc05f..1acfbed9e 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2011 Jitse Niesen <jitse@maths.leeds.ac.uk>
+// Copyright (C) 2011, 2013 Jitse Niesen <jitse@maths.leeds.ac.uk>
 // Copyright (C) 2011 Chen-Pang He <jdh8@ms63.hinet.net>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -11,91 +11,33 @@
 #ifndef EIGEN_MATRIX_LOGARITHM
 #define EIGEN_MATRIX_LOGARITHM
 
-#ifndef M_PI
-#define M_PI 3.141592653589793238462643383279503L
-#endif
-
 namespace Eigen { 
 
-/** \ingroup MatrixFunctions_Module
-  * \class MatrixLogarithmAtomic
-  * \brief Helper class for computing matrix logarithm of atomic matrices.
-  *
-  * \internal
-  * Here, an atomic matrix is a triangular matrix whose diagonal
-  * entries are close to each other.
-  *
-  * \sa class MatrixFunctionAtomic, MatrixBase::log()
-  */
-template <typename MatrixType>
-class MatrixLogarithmAtomic
-{
-public:
-
-  typedef typename MatrixType::Scalar Scalar;
-  // typedef typename MatrixType::Index Index;
-  typedef typename NumTraits<Scalar>::Real RealScalar;
-  // typedef typename internal::stem_function<Scalar>::type StemFunction;
-  // typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
-
-  /** \brief Constructor. */
-  MatrixLogarithmAtomic() { }
-
-  /** \brief Compute matrix logarithm of atomic matrix
-    * \param[in]  A  argument of matrix logarithm, should be upper triangular and atomic
-    * \returns  The logarithm of \p A.
-    */
-  MatrixType compute(const MatrixType& A);
-
-private:
+namespace internal { 
 
-  void compute2x2(const MatrixType& A, MatrixType& result);
-  void computeBig(const MatrixType& A, MatrixType& result);
-  int getPadeDegree(float normTminusI);
-  int getPadeDegree(double normTminusI);
-  int getPadeDegree(long double normTminusI);
-  void computePade(MatrixType& result, const MatrixType& T, int degree);
-  void computePade3(MatrixType& result, const MatrixType& T);
-  void computePade4(MatrixType& result, const MatrixType& T);
-  void computePade5(MatrixType& result, const MatrixType& T);
-  void computePade6(MatrixType& result, const MatrixType& T);
-  void computePade7(MatrixType& result, const MatrixType& T);
-  void computePade8(MatrixType& result, const MatrixType& T);
-  void computePade9(MatrixType& result, const MatrixType& T);
-  void computePade10(MatrixType& result, const MatrixType& T);
-  void computePade11(MatrixType& result, const MatrixType& T);
-
-  static const int minPadeDegree = 3;
-  static const int maxPadeDegree = std::numeric_limits<RealScalar>::digits<= 24?  5:  // single precision
-                                   std::numeric_limits<RealScalar>::digits<= 53?  7:  // double precision
-                                   std::numeric_limits<RealScalar>::digits<= 64?  8:  // extended precision
-                                   std::numeric_limits<RealScalar>::digits<=106? 10:  // double-double
-                                                                                 11;  // quadruple precision
-
-  // Prevent copying
-  MatrixLogarithmAtomic(const MatrixLogarithmAtomic&);
-  MatrixLogarithmAtomic& operator=(const MatrixLogarithmAtomic&);
+template <typename Scalar>
+struct matrix_log_min_pade_degree 
+{
+  static const int value = 3;
 };
 
-/** \brief Compute logarithm of triangular matrix with clustered eigenvalues. */
-template <typename MatrixType>
-MatrixType MatrixLogarithmAtomic<MatrixType>::compute(const MatrixType& A)
+template <typename Scalar>
+struct matrix_log_max_pade_degree 
 {
-  using std::log;
-  MatrixType result(A.rows(), A.rows());
-  if (A.rows() == 1)
-    result(0,0) = log(A(0,0));
-  else if (A.rows() == 2)
-    compute2x2(A, result);
-  else
-    computeBig(A, result);
-  return result;
-}
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  static const int value = std::numeric_limits<RealScalar>::digits<= 24?  5:  // single precision
+                           std::numeric_limits<RealScalar>::digits<= 53?  7:  // double precision
+                           std::numeric_limits<RealScalar>::digits<= 64?  8:  // extended precision
+                           std::numeric_limits<RealScalar>::digits<=106? 10:  // double-double
+                                                                         11;  // quadruple precision
+};
 
 /** \brief Compute logarithm of 2x2 triangular matrix. */
 template <typename MatrixType>
-void MatrixLogarithmAtomic<MatrixType>::compute2x2(const MatrixType& A, MatrixType& result)
+void matrix_log_compute_2x2(const MatrixType& A, MatrixType& result)
 {
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
   using std::abs;
   using std::ceil;
   using std::imag;
@@ -108,59 +50,31 @@ void MatrixLogarithmAtomic<MatrixType>::compute2x2(const MatrixType& A, MatrixTy
   result(1,0) = Scalar(0);
   result(1,1) = logA11;
 
-  if (A(0,0) == A(1,1)) {
+  Scalar y = A(1,1) - A(0,0);
+  if (y==Scalar(0))
+  {
     result(0,1) = A(0,1) / A(0,0);
-  } else if ((abs(A(0,0)) < 0.5*abs(A(1,1))) || (abs(A(0,0)) > 2*abs(A(1,1)))) {
-    result(0,1) = A(0,1) * (logA11 - logA00) / (A(1,1) - A(0,0));
-  } else {
-    // computation in previous branch is inaccurate if A(1,1) \approx A(0,0)
-    int unwindingNumber = static_cast<int>(ceil((imag(logA11 - logA00) - M_PI) / (2*M_PI)));
-    Scalar y = A(1,1) - A(0,0), x = A(1,1) + A(0,0);
-    result(0,1) = A(0,1) * (Scalar(2) * numext::atanh2(y,x) + Scalar(0,2*M_PI*unwindingNumber)) / y;
   }
-}
-
-/** \brief Compute logarithm of triangular matrices with size > 2. 
-  * \details This uses a inverse scale-and-square algorithm. */
-template <typename MatrixType>
-void MatrixLogarithmAtomic<MatrixType>::computeBig(const MatrixType& A, MatrixType& result)
-{
-  using std::pow;
-  int numberOfSquareRoots = 0;
-  int numberOfExtraSquareRoots = 0;
-  int degree;
-  MatrixType T = A, sqrtT;
-  const RealScalar maxNormForPade = maxPadeDegree<= 5? 5.3149729967117310e-1:                     // single precision
-                                    maxPadeDegree<= 7? 2.6429608311114350e-1:                     // double precision
-                                    maxPadeDegree<= 8? 2.32777776523703892094e-1L:                // extended precision
-                                    maxPadeDegree<=10? 1.05026503471351080481093652651105e-1L:    // double-double
-                                                       1.1880960220216759245467951592883642e-1L;  // quadruple precision
-
-  while (true) {
-    RealScalar normTminusI = (T - MatrixType::Identity(T.rows(), T.rows())).cwiseAbs().colwise().sum().maxCoeff();
-    if (normTminusI < maxNormForPade) {
-      degree = getPadeDegree(normTminusI);
-      int degree2 = getPadeDegree(normTminusI / RealScalar(2));
-      if ((degree - degree2 <= 1) || (numberOfExtraSquareRoots == 1)) 
-        break;
-      ++numberOfExtraSquareRoots;
-    }
-    MatrixSquareRootTriangular<MatrixType>(T).compute(sqrtT);
-    T = sqrtT.template triangularView<Upper>();
-    ++numberOfSquareRoots;
+  else if ((abs(A(0,0)) < RealScalar(0.5)*abs(A(1,1))) || (abs(A(0,0)) > 2*abs(A(1,1))))
+  {
+    result(0,1) = A(0,1) * (logA11 - logA00) / y;
+  }
+  else
+  {
+    // computation in previous branch is inaccurate if A(1,1) \approx A(0,0)
+    int unwindingNumber = static_cast<int>(ceil((imag(logA11 - logA00) - RealScalar(EIGEN_PI)) / RealScalar(2*EIGEN_PI)));
+    result(0,1) = A(0,1) * (numext::log1p(y/A(0,0)) + Scalar(0,2*EIGEN_PI*unwindingNumber)) / y;
   }
-
-  computePade(result, T, degree);
-  result *= pow(RealScalar(2), numberOfSquareRoots);
 }
 
 /* \brief Get suitable degree for Pade approximation. (specialized for RealScalar = float) */
-template <typename MatrixType>
-int MatrixLogarithmAtomic<MatrixType>::getPadeDegree(float normTminusI)
+inline int matrix_log_get_pade_degree(float normTminusI)
 {
   const float maxNormForPade[] = { 2.5111573934555054e-1 /* degree = 3 */ , 4.0535837411880493e-1,
             5.3149729967117310e-1 };
-  int degree = 3;
+  const int minPadeDegree = matrix_log_min_pade_degree<float>::value;
+  const int maxPadeDegree = matrix_log_max_pade_degree<float>::value;
+  int degree = minPadeDegree;
   for (; degree <= maxPadeDegree; ++degree) 
     if (normTminusI <= maxNormForPade[degree - minPadeDegree])
       break;
@@ -168,12 +82,13 @@ int MatrixLogarithmAtomic<MatrixType>::getPadeDegree(float normTminusI)
 }
 
 /* \brief Get suitable degree for Pade approximation. (specialized for RealScalar = double) */
-template <typename MatrixType>
-int MatrixLogarithmAtomic<MatrixType>::getPadeDegree(double normTminusI)
+inline int matrix_log_get_pade_degree(double normTminusI)
 {
   const double maxNormForPade[] = { 1.6206284795015624e-2 /* degree = 3 */ , 5.3873532631381171e-2,
             1.1352802267628681e-1, 1.8662860613541288e-1, 2.642960831111435e-1 };
-  int degree = 3;
+  const int minPadeDegree = matrix_log_min_pade_degree<double>::value;
+  const int maxPadeDegree = matrix_log_max_pade_degree<double>::value;
+  int degree = minPadeDegree;
   for (; degree <= maxPadeDegree; ++degree)
     if (normTminusI <= maxNormForPade[degree - minPadeDegree])
       break;
@@ -181,8 +96,7 @@ int MatrixLogarithmAtomic<MatrixType>::getPadeDegree(double normTminusI)
 }
 
 /* \brief Get suitable degree for Pade approximation. (specialized for RealScalar = long double) */
-template <typename MatrixType>
-int MatrixLogarithmAtomic<MatrixType>::getPadeDegree(long double normTminusI)
+inline int matrix_log_get_pade_degree(long double normTminusI)
 {
 #if   LDBL_MANT_DIG == 53         // double precision
   const long double maxNormForPade[] = { 1.6206284795015624e-2L /* degree = 3 */ , 5.3873532631381171e-2L,
@@ -204,7 +118,9 @@ int MatrixLogarithmAtomic<MatrixType>::getPadeDegree(long double normTminusI)
             3.6688019729653446926585242192447447e-2L, 5.9290962294020186998954055264528393e-2L,
             8.6998436081634343903250580992127677e-2L, 1.1880960220216759245467951592883642e-1L };
 #endif
-  int degree = 3;
+  const int minPadeDegree = matrix_log_min_pade_degree<long double>::value;
+  const int maxPadeDegree = matrix_log_max_pade_degree<long double>::value;
+  int degree = minPadeDegree;
   for (; degree <= maxPadeDegree; ++degree)
     if (normTminusI <= maxNormForPade[degree - minPadeDegree])
       break;
@@ -213,197 +129,168 @@ int MatrixLogarithmAtomic<MatrixType>::getPadeDegree(long double normTminusI)
 
 /* \brief Compute Pade approximation to matrix logarithm */
 template <typename MatrixType>
-void MatrixLogarithmAtomic<MatrixType>::computePade(MatrixType& result, const MatrixType& T, int degree)
+void matrix_log_compute_pade(MatrixType& result, const MatrixType& T, int degree)
 {
-  switch (degree) {
-    case 3:  computePade3(result, T);  break;
-    case 4:  computePade4(result, T);  break;
-    case 5:  computePade5(result, T);  break;
-    case 6:  computePade6(result, T);  break;
-    case 7:  computePade7(result, T);  break;
-    case 8:  computePade8(result, T);  break;
-    case 9:  computePade9(result, T);  break;
-    case 10: computePade10(result, T); break;
-    case 11: computePade11(result, T); break;
-    default: assert(false); // should never happen
-  }
-} 
+  typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
+  const int minPadeDegree = 3;
+  const int maxPadeDegree = 11;
+  assert(degree >= minPadeDegree && degree <= maxPadeDegree);
+
+  const RealScalar nodes[][maxPadeDegree] = { 
+    { 0.1127016653792583114820734600217600L, 0.5000000000000000000000000000000000L,  // degree 3
+      0.8872983346207416885179265399782400L }, 
+    { 0.0694318442029737123880267555535953L, 0.3300094782075718675986671204483777L,  // degree 4
+      0.6699905217924281324013328795516223L, 0.9305681557970262876119732444464048L },
+    { 0.0469100770306680036011865608503035L, 0.2307653449471584544818427896498956L,  // degree 5
+      0.5000000000000000000000000000000000L, 0.7692346550528415455181572103501044L,
+      0.9530899229693319963988134391496965L },
+    { 0.0337652428984239860938492227530027L, 0.1693953067668677431693002024900473L,  // degree 6
+      0.3806904069584015456847491391596440L, 0.6193095930415984543152508608403560L,
+      0.8306046932331322568306997975099527L, 0.9662347571015760139061507772469973L },
+    { 0.0254460438286207377369051579760744L, 0.1292344072003027800680676133596058L,  // degree 7
+      0.2970774243113014165466967939615193L, 0.5000000000000000000000000000000000L,
+      0.7029225756886985834533032060384807L, 0.8707655927996972199319323866403942L,
+      0.9745539561713792622630948420239256L },
+    { 0.0198550717512318841582195657152635L, 0.1016667612931866302042230317620848L,  // degree 8
+      0.2372337950418355070911304754053768L, 0.4082826787521750975302619288199080L,
+      0.5917173212478249024697380711800920L, 0.7627662049581644929088695245946232L,
+      0.8983332387068133697957769682379152L, 0.9801449282487681158417804342847365L },
+    { 0.0159198802461869550822118985481636L, 0.0819844463366821028502851059651326L,  // degree 9
+      0.1933142836497048013456489803292629L, 0.3378732882980955354807309926783317L,
+      0.5000000000000000000000000000000000L, 0.6621267117019044645192690073216683L,
+      0.8066857163502951986543510196707371L, 0.9180155536633178971497148940348674L,
+      0.9840801197538130449177881014518364L },
+    { 0.0130467357414141399610179939577740L, 0.0674683166555077446339516557882535L,  // degree 10
+      0.1602952158504877968828363174425632L, 0.2833023029353764046003670284171079L,
+      0.4255628305091843945575869994351400L, 0.5744371694908156054424130005648600L,
+      0.7166976970646235953996329715828921L, 0.8397047841495122031171636825574368L,
+      0.9325316833444922553660483442117465L, 0.9869532642585858600389820060422260L },
+    { 0.0108856709269715035980309994385713L, 0.0564687001159523504624211153480364L,  // degree 11
+      0.1349239972129753379532918739844233L, 0.2404519353965940920371371652706952L,
+      0.3652284220238275138342340072995692L, 0.5000000000000000000000000000000000L,
+      0.6347715779761724861657659927004308L, 0.7595480646034059079628628347293048L,
+      0.8650760027870246620467081260155767L, 0.9435312998840476495375788846519636L,
+      0.9891143290730284964019690005614287L } };
+
+  const RealScalar weights[][maxPadeDegree] = { 
+    { 0.2777777777777777777777777777777778L, 0.4444444444444444444444444444444444L,  // degree 3
+      0.2777777777777777777777777777777778L },
+    { 0.1739274225687269286865319746109997L, 0.3260725774312730713134680253890003L,  // degree 4
+      0.3260725774312730713134680253890003L, 0.1739274225687269286865319746109997L },
+    { 0.1184634425280945437571320203599587L, 0.2393143352496832340206457574178191L,  // degree 5
+      0.2844444444444444444444444444444444L, 0.2393143352496832340206457574178191L,
+      0.1184634425280945437571320203599587L },
+    { 0.0856622461895851725201480710863665L, 0.1803807865240693037849167569188581L,  // degree 6
+      0.2339569672863455236949351719947755L, 0.2339569672863455236949351719947755L,
+      0.1803807865240693037849167569188581L, 0.0856622461895851725201480710863665L },
+    { 0.0647424830844348466353057163395410L, 0.1398526957446383339507338857118898L,  // degree 7
+      0.1909150252525594724751848877444876L, 0.2089795918367346938775510204081633L,
+      0.1909150252525594724751848877444876L, 0.1398526957446383339507338857118898L,
+      0.0647424830844348466353057163395410L },
+    { 0.0506142681451881295762656771549811L, 0.1111905172266872352721779972131204L,  // degree 8
+      0.1568533229389436436689811009933007L, 0.1813418916891809914825752246385978L,
+      0.1813418916891809914825752246385978L, 0.1568533229389436436689811009933007L,
+      0.1111905172266872352721779972131204L, 0.0506142681451881295762656771549811L },
+    { 0.0406371941807872059859460790552618L, 0.0903240803474287020292360156214564L,  // degree 9
+      0.1303053482014677311593714347093164L, 0.1561735385200014200343152032922218L,
+      0.1651196775006298815822625346434870L, 0.1561735385200014200343152032922218L,
+      0.1303053482014677311593714347093164L, 0.0903240803474287020292360156214564L,
+      0.0406371941807872059859460790552618L },
+    { 0.0333356721543440687967844049466659L, 0.0747256745752902965728881698288487L,  // degree 10
+      0.1095431812579910219977674671140816L, 0.1346333596549981775456134607847347L,
+      0.1477621123573764350869464973256692L, 0.1477621123573764350869464973256692L,
+      0.1346333596549981775456134607847347L, 0.1095431812579910219977674671140816L,
+      0.0747256745752902965728881698288487L, 0.0333356721543440687967844049466659L },
+    { 0.0278342835580868332413768602212743L, 0.0627901847324523123173471496119701L,  // degree 11
+      0.0931451054638671257130488207158280L, 0.1165968822959952399592618524215876L,
+      0.1314022722551233310903444349452546L, 0.1364625433889503153572417641681711L,
+      0.1314022722551233310903444349452546L, 0.1165968822959952399592618524215876L,
+      0.0931451054638671257130488207158280L, 0.0627901847324523123173471496119701L,
+      0.0278342835580868332413768602212743L } };
 
-template <typename MatrixType>
-void MatrixLogarithmAtomic<MatrixType>::computePade3(MatrixType& result, const MatrixType& T)
-{
-  const int degree = 3;
-  const RealScalar nodes[]   = { 0.1127016653792583114820734600217600L, 0.5000000000000000000000000000000000L,
-            0.8872983346207416885179265399782400L };
-  const RealScalar weights[] = { 0.2777777777777777777777777777777778L, 0.4444444444444444444444444444444444L,
-            0.2777777777777777777777777777777778L };
-  eigen_assert(degree <= maxPadeDegree);
   MatrixType TminusI = T - MatrixType::Identity(T.rows(), T.rows());
   result.setZero(T.rows(), T.rows());
-  for (int k = 0; k < degree; ++k)
-    result += weights[k] * (MatrixType::Identity(T.rows(), T.rows()) + nodes[k] * TminusI)
-                           .template triangularView<Upper>().solve(TminusI);
-}
+  for (int k = 0; k < degree; ++k) {
+    RealScalar weight = weights[degree-minPadeDegree][k];
+    RealScalar node = nodes[degree-minPadeDegree][k];
+    result += weight * (MatrixType::Identity(T.rows(), T.rows()) + node * TminusI)
+                       .template triangularView<Upper>().solve(TminusI);
+  }
+} 
 
+/** \brief Compute logarithm of triangular matrices with size > 2. 
+  * \details This uses a inverse scale-and-square algorithm. */
 template <typename MatrixType>
-void MatrixLogarithmAtomic<MatrixType>::computePade4(MatrixType& result, const MatrixType& T)
+void matrix_log_compute_big(const MatrixType& A, MatrixType& result)
 {
-  const int degree = 4;
-  const RealScalar nodes[]   = { 0.0694318442029737123880267555535953L, 0.3300094782075718675986671204483777L,
-            0.6699905217924281324013328795516223L, 0.9305681557970262876119732444464048L };
-  const RealScalar weights[] = { 0.1739274225687269286865319746109997L, 0.3260725774312730713134680253890003L,
-            0.3260725774312730713134680253890003L, 0.1739274225687269286865319746109997L };
-  eigen_assert(degree <= maxPadeDegree);
-  MatrixType TminusI = T - MatrixType::Identity(T.rows(), T.rows());
-  result.setZero(T.rows(), T.rows());
-  for (int k = 0; k < degree; ++k)
-    result += weights[k] * (MatrixType::Identity(T.rows(), T.rows()) + nodes[k] * TminusI)
-                           .template triangularView<Upper>().solve(TminusI);
-}
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  using std::pow;
 
-template <typename MatrixType>
-void MatrixLogarithmAtomic<MatrixType>::computePade5(MatrixType& result, const MatrixType& T)
-{
-  const int degree = 5;
-  const RealScalar nodes[]   = { 0.0469100770306680036011865608503035L, 0.2307653449471584544818427896498956L,
-            0.5000000000000000000000000000000000L, 0.7692346550528415455181572103501044L,
-            0.9530899229693319963988134391496965L };
-  const RealScalar weights[] = { 0.1184634425280945437571320203599587L, 0.2393143352496832340206457574178191L,
-            0.2844444444444444444444444444444444L, 0.2393143352496832340206457574178191L,
-            0.1184634425280945437571320203599587L };
-  eigen_assert(degree <= maxPadeDegree);
-  MatrixType TminusI = T - MatrixType::Identity(T.rows(), T.rows());
-  result.setZero(T.rows(), T.rows());
-  for (int k = 0; k < degree; ++k)
-    result += weights[k] * (MatrixType::Identity(T.rows(), T.rows()) + nodes[k] * TminusI)
-                           .template triangularView<Upper>().solve(TminusI);
-}
+  int numberOfSquareRoots = 0;
+  int numberOfExtraSquareRoots = 0;
+  int degree;
+  MatrixType T = A, sqrtT;
 
-template <typename MatrixType>
-void MatrixLogarithmAtomic<MatrixType>::computePade6(MatrixType& result, const MatrixType& T)
-{
-  const int degree = 6;
-  const RealScalar nodes[]   = { 0.0337652428984239860938492227530027L, 0.1693953067668677431693002024900473L,
-            0.3806904069584015456847491391596440L, 0.6193095930415984543152508608403560L,
-            0.8306046932331322568306997975099527L, 0.9662347571015760139061507772469973L };
-  const RealScalar weights[] = { 0.0856622461895851725201480710863665L, 0.1803807865240693037849167569188581L,
-            0.2339569672863455236949351719947755L, 0.2339569672863455236949351719947755L,
-            0.1803807865240693037849167569188581L, 0.0856622461895851725201480710863665L };
-  eigen_assert(degree <= maxPadeDegree);
-  MatrixType TminusI = T - MatrixType::Identity(T.rows(), T.rows());
-  result.setZero(T.rows(), T.rows());
-  for (int k = 0; k < degree; ++k)
-    result += weights[k] * (MatrixType::Identity(T.rows(), T.rows()) + nodes[k] * TminusI)
-                           .template triangularView<Upper>().solve(TminusI);
-}
+  int maxPadeDegree = matrix_log_max_pade_degree<Scalar>::value;
+  const RealScalar maxNormForPade = maxPadeDegree<= 5? 5.3149729967117310e-1L:                    // single precision
+                                    maxPadeDegree<= 7? 2.6429608311114350e-1L:                    // double precision
+                                    maxPadeDegree<= 8? 2.32777776523703892094e-1L:                // extended precision
+                                    maxPadeDegree<=10? 1.05026503471351080481093652651105e-1L:    // double-double
+                                                       1.1880960220216759245467951592883642e-1L;  // quadruple precision
 
-template <typename MatrixType>
-void MatrixLogarithmAtomic<MatrixType>::computePade7(MatrixType& result, const MatrixType& T)
-{
-  const int degree = 7;
-  const RealScalar nodes[]   = { 0.0254460438286207377369051579760744L, 0.1292344072003027800680676133596058L,
-            0.2970774243113014165466967939615193L, 0.5000000000000000000000000000000000L,
-            0.7029225756886985834533032060384807L, 0.8707655927996972199319323866403942L,
-            0.9745539561713792622630948420239256L };
-  const RealScalar weights[] = { 0.0647424830844348466353057163395410L, 0.1398526957446383339507338857118898L,
-            0.1909150252525594724751848877444876L, 0.2089795918367346938775510204081633L,
-            0.1909150252525594724751848877444876L, 0.1398526957446383339507338857118898L,
-            0.0647424830844348466353057163395410L };
-  eigen_assert(degree <= maxPadeDegree);
-  MatrixType TminusI = T - MatrixType::Identity(T.rows(), T.rows());
-  result.setZero(T.rows(), T.rows());
-  for (int k = 0; k < degree; ++k)
-    result += weights[k] * (MatrixType::Identity(T.rows(), T.rows()) + nodes[k] * TminusI)
-                           .template triangularView<Upper>().solve(TminusI);
-}
+  while (true) {
+    RealScalar normTminusI = (T - MatrixType::Identity(T.rows(), T.rows())).cwiseAbs().colwise().sum().maxCoeff();
+    if (normTminusI < maxNormForPade) {
+      degree = matrix_log_get_pade_degree(normTminusI);
+      int degree2 = matrix_log_get_pade_degree(normTminusI / RealScalar(2));
+      if ((degree - degree2 <= 1) || (numberOfExtraSquareRoots == 1)) 
+        break;
+      ++numberOfExtraSquareRoots;
+    }
+    matrix_sqrt_triangular(T, sqrtT);
+    T = sqrtT.template triangularView<Upper>();
+    ++numberOfSquareRoots;
+  }
 
-template <typename MatrixType>
-void MatrixLogarithmAtomic<MatrixType>::computePade8(MatrixType& result, const MatrixType& T)
-{
-  const int degree = 8;
-  const RealScalar nodes[]   = { 0.0198550717512318841582195657152635L, 0.1016667612931866302042230317620848L,
-            0.2372337950418355070911304754053768L, 0.4082826787521750975302619288199080L,
-            0.5917173212478249024697380711800920L, 0.7627662049581644929088695245946232L,
-            0.8983332387068133697957769682379152L, 0.9801449282487681158417804342847365L };
-  const RealScalar weights[] = { 0.0506142681451881295762656771549811L, 0.1111905172266872352721779972131204L,
-            0.1568533229389436436689811009933007L, 0.1813418916891809914825752246385978L,
-            0.1813418916891809914825752246385978L, 0.1568533229389436436689811009933007L,
-            0.1111905172266872352721779972131204L, 0.0506142681451881295762656771549811L };
-  eigen_assert(degree <= maxPadeDegree);
-  MatrixType TminusI = T - MatrixType::Identity(T.rows(), T.rows());
-  result.setZero(T.rows(), T.rows());
-  for (int k = 0; k < degree; ++k)
-    result += weights[k] * (MatrixType::Identity(T.rows(), T.rows()) + nodes[k] * TminusI)
-                           .template triangularView<Upper>().solve(TminusI);
+  matrix_log_compute_pade(result, T, degree);
+  result *= pow(RealScalar(2), numberOfSquareRoots);
 }
 
+/** \ingroup MatrixFunctions_Module
+  * \class MatrixLogarithmAtomic
+  * \brief Helper class for computing matrix logarithm of atomic matrices.
+  *
+  * Here, an atomic matrix is a triangular matrix whose diagonal entries are close to each other.
+  *
+  * \sa class MatrixFunctionAtomic, MatrixBase::log()
+  */
 template <typename MatrixType>
-void MatrixLogarithmAtomic<MatrixType>::computePade9(MatrixType& result, const MatrixType& T)
+class MatrixLogarithmAtomic
 {
-  const int degree = 9;
-  const RealScalar nodes[]   = { 0.0159198802461869550822118985481636L, 0.0819844463366821028502851059651326L,
-            0.1933142836497048013456489803292629L, 0.3378732882980955354807309926783317L,
-            0.5000000000000000000000000000000000L, 0.6621267117019044645192690073216683L,
-            0.8066857163502951986543510196707371L, 0.9180155536633178971497148940348674L,
-            0.9840801197538130449177881014518364L };
-  const RealScalar weights[] = { 0.0406371941807872059859460790552618L, 0.0903240803474287020292360156214564L,
-            0.1303053482014677311593714347093164L, 0.1561735385200014200343152032922218L,
-            0.1651196775006298815822625346434870L, 0.1561735385200014200343152032922218L,
-            0.1303053482014677311593714347093164L, 0.0903240803474287020292360156214564L,
-            0.0406371941807872059859460790552618L };
-  eigen_assert(degree <= maxPadeDegree);
-  MatrixType TminusI = T - MatrixType::Identity(T.rows(), T.rows());
-  result.setZero(T.rows(), T.rows());
-  for (int k = 0; k < degree; ++k)
-    result += weights[k] * (MatrixType::Identity(T.rows(), T.rows()) + nodes[k] * TminusI)
-                           .template triangularView<Upper>().solve(TminusI);
-}
+public:
+  /** \brief Compute matrix logarithm of atomic matrix
+    * \param[in]  A  argument of matrix logarithm, should be upper triangular and atomic
+    * \returns  The logarithm of \p A.
+    */
+  MatrixType compute(const MatrixType& A);
+};
 
 template <typename MatrixType>
-void MatrixLogarithmAtomic<MatrixType>::computePade10(MatrixType& result, const MatrixType& T)
+MatrixType MatrixLogarithmAtomic<MatrixType>::compute(const MatrixType& A)
 {
-  const int degree = 10;
-  const RealScalar nodes[]   = { 0.0130467357414141399610179939577740L, 0.0674683166555077446339516557882535L,
-            0.1602952158504877968828363174425632L, 0.2833023029353764046003670284171079L,
-            0.4255628305091843945575869994351400L, 0.5744371694908156054424130005648600L,
-            0.7166976970646235953996329715828921L, 0.8397047841495122031171636825574368L,
-            0.9325316833444922553660483442117465L, 0.9869532642585858600389820060422260L };
-  const RealScalar weights[] = { 0.0333356721543440687967844049466659L, 0.0747256745752902965728881698288487L,
-            0.1095431812579910219977674671140816L, 0.1346333596549981775456134607847347L,
-            0.1477621123573764350869464973256692L, 0.1477621123573764350869464973256692L,
-            0.1346333596549981775456134607847347L, 0.1095431812579910219977674671140816L,
-            0.0747256745752902965728881698288487L, 0.0333356721543440687967844049466659L };
-  eigen_assert(degree <= maxPadeDegree);
-  MatrixType TminusI = T - MatrixType::Identity(T.rows(), T.rows());
-  result.setZero(T.rows(), T.rows());
-  for (int k = 0; k < degree; ++k)
-    result += weights[k] * (MatrixType::Identity(T.rows(), T.rows()) + nodes[k] * TminusI)
-                           .template triangularView<Upper>().solve(TminusI);
+  using std::log;
+  MatrixType result(A.rows(), A.rows());
+  if (A.rows() == 1)
+    result(0,0) = log(A(0,0));
+  else if (A.rows() == 2)
+    matrix_log_compute_2x2(A, result);
+  else
+    matrix_log_compute_big(A, result);
+  return result;
 }
 
-template <typename MatrixType>
-void MatrixLogarithmAtomic<MatrixType>::computePade11(MatrixType& result, const MatrixType& T)
-{
-  const int degree = 11;
-  const RealScalar nodes[]   = { 0.0108856709269715035980309994385713L, 0.0564687001159523504624211153480364L,
-            0.1349239972129753379532918739844233L, 0.2404519353965940920371371652706952L,
-            0.3652284220238275138342340072995692L, 0.5000000000000000000000000000000000L,
-            0.6347715779761724861657659927004308L, 0.7595480646034059079628628347293048L,
-            0.8650760027870246620467081260155767L, 0.9435312998840476495375788846519636L,
-            0.9891143290730284964019690005614287L };
-  const RealScalar weights[] = { 0.0278342835580868332413768602212743L, 0.0627901847324523123173471496119701L,
-            0.0931451054638671257130488207158280L, 0.1165968822959952399592618524215876L,
-            0.1314022722551233310903444349452546L, 0.1364625433889503153572417641681711L,
-            0.1314022722551233310903444349452546L, 0.1165968822959952399592618524215876L,
-            0.0931451054638671257130488207158280L, 0.0627901847324523123173471496119701L,
-            0.0278342835580868332413768602212743L };
-  eigen_assert(degree <= maxPadeDegree);
-  MatrixType TminusI = T - MatrixType::Identity(T.rows(), T.rows());
-  result.setZero(T.rows(), T.rows());
-  for (int k = 0; k < degree; ++k)
-    result += weights[k] * (MatrixType::Identity(T.rows(), T.rows()) + nodes[k] * TminusI)
-                           .template triangularView<Upper>().solve(TminusI);
-}
+} // end of namespace internal
 
 /** \ingroup MatrixFunctions_Module
   *
@@ -421,15 +308,19 @@ template<typename Derived> class MatrixLogarithmReturnValue
 : public ReturnByValue<MatrixLogarithmReturnValue<Derived> >
 {
 public:
-
   typedef typename Derived::Scalar Scalar;
   typedef typename Derived::Index Index;
 
+protected:
+  typedef typename internal::ref_selector<Derived>::type DerivedNested;
+
+public:
+
   /** \brief Constructor.
     *
     * \param[in]  A  %Matrix (expression) forming the argument of the matrix logarithm.
     */
-  MatrixLogarithmReturnValue(const Derived& A) : m_A(A) { }
+  explicit MatrixLogarithmReturnValue(const Derived& A) : m_A(A) { }
   
   /** \brief Compute the matrix logarithm.
     *
@@ -438,28 +329,24 @@ public:
   template <typename ResultType>
   inline void evalTo(ResultType& result) const
   {
-    typedef typename Derived::PlainObject PlainObject;
-    typedef internal::traits<PlainObject> Traits;
+    typedef typename internal::nested_eval<Derived, 10>::type DerivedEvalType;
+    typedef typename internal::remove_all<DerivedEvalType>::type DerivedEvalTypeClean;
+    typedef internal::traits<DerivedEvalTypeClean> Traits;
     static const int RowsAtCompileTime = Traits::RowsAtCompileTime;
     static const int ColsAtCompileTime = Traits::ColsAtCompileTime;
-    static const int Options = PlainObject::Options;
     typedef std::complex<typename NumTraits<Scalar>::Real> ComplexScalar;
-    typedef Matrix<ComplexScalar, Dynamic, Dynamic, Options, RowsAtCompileTime, ColsAtCompileTime> DynMatrixType;
-    typedef MatrixLogarithmAtomic<DynMatrixType> AtomicType;
+    typedef Matrix<ComplexScalar, Dynamic, Dynamic, 0, RowsAtCompileTime, ColsAtCompileTime> DynMatrixType;
+    typedef internal::MatrixLogarithmAtomic<DynMatrixType> AtomicType;
     AtomicType atomic;
     
-    const PlainObject Aevaluated = m_A.eval();
-    MatrixFunction<PlainObject, AtomicType> mf(Aevaluated, atomic);
-    mf.compute(result);
+    internal::matrix_function_compute<DerivedEvalTypeClean>::run(m_A, atomic, result);
   }
 
   Index rows() const { return m_A.rows(); }
   Index cols() const { return m_A.cols(); }
   
 private:
-  typename internal::nested<Derived>::type m_A;
-  
-  MatrixLogarithmReturnValue& operator=(const MatrixLogarithmReturnValue&);
+  const DerivedNested m_A;
 };
 
 namespace internal {
diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h b/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
index 78a307e96..ebc433d89 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
@@ -14,16 +14,48 @@ namespace Eigen {
 
 template<typename MatrixType> class MatrixPower;
 
+/**
+ * \ingroup MatrixFunctions_Module
+ *
+ * \brief Proxy for the matrix power of some matrix.
+ *
+ * \tparam MatrixType  type of the base, a matrix.
+ *
+ * This class holds the arguments to the matrix power until it is
+ * assigned or evaluated for some other reason (so the argument
+ * should not be changed in the meantime). It is the return type of
+ * MatrixPower::operator() and related functions and most of the
+ * time this is the only way it is used.
+ */
+/* TODO This class is only used by MatrixPower, so it should be nested
+ * into MatrixPower, like MatrixPower::ReturnValue. However, my
+ * compiler complained about unused template parameter in the
+ * following declaration in namespace internal.
+ *
+ * template<typename MatrixType>
+ * struct traits<MatrixPower<MatrixType>::ReturnValue>;
+ */
 template<typename MatrixType>
-class MatrixPowerRetval : public ReturnByValue< MatrixPowerRetval<MatrixType> >
+class MatrixPowerParenthesesReturnValue : public ReturnByValue< MatrixPowerParenthesesReturnValue<MatrixType> >
 {
   public:
     typedef typename MatrixType::RealScalar RealScalar;
     typedef typename MatrixType::Index Index;
 
-    MatrixPowerRetval(MatrixPower<MatrixType>& pow, RealScalar p) : m_pow(pow), m_p(p)
+    /**
+     * \brief Constructor.
+     *
+     * \param[in] pow  %MatrixPower storing the base.
+     * \param[in] p    scalar, the exponent of the matrix power.
+     */
+    MatrixPowerParenthesesReturnValue(MatrixPower<MatrixType>& pow, RealScalar p) : m_pow(pow), m_p(p)
     { }
 
+    /**
+     * \brief Compute the matrix power.
+     *
+     * \param[out] result
+     */
     template<typename ResultType>
     inline void evalTo(ResultType& res) const
     { m_pow.compute(res, m_p); }
@@ -34,11 +66,25 @@ class MatrixPowerRetval : public ReturnByValue< MatrixPowerRetval<MatrixType> >
   private:
     MatrixPower<MatrixType>& m_pow;
     const RealScalar m_p;
-    MatrixPowerRetval& operator=(const MatrixPowerRetval&);
 };
 
+/**
+ * \ingroup MatrixFunctions_Module
+ *
+ * \brief Class for computing matrix powers.
+ *
+ * \tparam MatrixType  type of the base, expected to be an instantiation
+ * of the Matrix class template.
+ *
+ * This class is capable of computing triangular real/complex matrices
+ * raised to a power in the interval \f$ (-1, 1) \f$.
+ *
+ * \note Currently this class is only used by MatrixPower. One may
+ * insist that this be nested into MatrixPower. This class is here to
+ * faciliate future development of triangular matrix functions.
+ */
 template<typename MatrixType>
-class MatrixPowerAtomic
+class MatrixPowerAtomic : internal::noncopyable
 {
   private:
     enum {
@@ -49,14 +95,14 @@ class MatrixPowerAtomic
     typedef typename MatrixType::RealScalar RealScalar;
     typedef std::complex<RealScalar> ComplexScalar;
     typedef typename MatrixType::Index Index;
-    typedef Array<Scalar, RowsAtCompileTime, 1, ColMajor, MaxRowsAtCompileTime> ArrayType;
+    typedef Block<MatrixType,Dynamic,Dynamic> ResultType;
 
     const MatrixType& m_A;
     RealScalar m_p;
 
-    void computePade(int degree, const MatrixType& IminusT, MatrixType& res) const;
-    void compute2x2(MatrixType& res, RealScalar p) const;
-    void computeBig(MatrixType& res) const;
+    void computePade(int degree, const MatrixType& IminusT, ResultType& res) const;
+    void compute2x2(ResultType& res, RealScalar p) const;
+    void computeBig(ResultType& res) const;
     static int getPadeDegree(float normIminusT);
     static int getPadeDegree(double normIminusT);
     static int getPadeDegree(long double normIminusT);
@@ -64,24 +110,45 @@ class MatrixPowerAtomic
     static RealScalar computeSuperDiag(RealScalar, RealScalar, RealScalar p);
 
   public:
+    /**
+     * \brief Constructor.
+     *
+     * \param[in] T  the base of the matrix power.
+     * \param[in] p  the exponent of the matrix power, should be in
+     * \f$ (-1, 1) \f$.
+     *
+     * The class stores a reference to T, so it should not be changed
+     * (or destroyed) before evaluation. Only the upper triangular
+     * part of T is read.
+     */
     MatrixPowerAtomic(const MatrixType& T, RealScalar p);
-    void compute(MatrixType& res) const;
+    
+    /**
+     * \brief Compute the matrix power.
+     *
+     * \param[out] res  \f$ A^p \f$ where A and p are specified in the
+     * constructor.
+     */
+    void compute(ResultType& res) const;
 };
 
 template<typename MatrixType>
 MatrixPowerAtomic<MatrixType>::MatrixPowerAtomic(const MatrixType& T, RealScalar p) :
   m_A(T), m_p(p)
-{ eigen_assert(T.rows() == T.cols()); }
+{
+  eigen_assert(T.rows() == T.cols());
+  eigen_assert(p > -1 && p < 1);
+}
 
 template<typename MatrixType>
-void MatrixPowerAtomic<MatrixType>::compute(MatrixType& res) const
+void MatrixPowerAtomic<MatrixType>::compute(ResultType& res) const
 {
-  res.resizeLike(m_A);
+  using std::pow;
   switch (m_A.rows()) {
     case 0:
       break;
     case 1:
-      res(0,0) = std::pow(m_A(0,0), m_p);
+      res(0,0) = pow(m_A(0,0), m_p);
       break;
     case 2:
       compute2x2(res, m_p);
@@ -92,24 +159,24 @@ void MatrixPowerAtomic<MatrixType>::compute(MatrixType& res) const
 }
 
 template<typename MatrixType>
-void MatrixPowerAtomic<MatrixType>::computePade(int degree, const MatrixType& IminusT, MatrixType& res) const
+void MatrixPowerAtomic<MatrixType>::computePade(int degree, const MatrixType& IminusT, ResultType& res) const
 {
-  int i = degree<<1;
-  res = (m_p-degree) / ((i-1)<<1) * IminusT;
+  int i = 2*degree;
+  res = (m_p-degree) / (2*i-2) * IminusT;
+
   for (--i; i; --i) {
     res = (MatrixType::Identity(IminusT.rows(), IminusT.cols()) + res).template triangularView<Upper>()
-	.solve((i==1 ? -m_p : i&1 ? (-m_p-(i>>1))/(i<<1) : (m_p-(i>>1))/((i-1)<<1)) * IminusT).eval();
+	.solve((i==1 ? -m_p : i&1 ? (-m_p-i/2)/(2*i) : (m_p-i/2)/(2*i-2)) * IminusT).eval();
   }
   res += MatrixType::Identity(IminusT.rows(), IminusT.cols());
 }
 
 // This function assumes that res has the correct size (see bug 614)
 template<typename MatrixType>
-void MatrixPowerAtomic<MatrixType>::compute2x2(MatrixType& res, RealScalar p) const
+void MatrixPowerAtomic<MatrixType>::compute2x2(ResultType& res, RealScalar p) const
 {
   using std::abs;
   using std::pow;
-  
   res.coeffRef(0,0) = pow(m_A.coeff(0,0), p);
 
   for (Index i=1; i < m_A.cols(); ++i) {
@@ -125,32 +192,20 @@ void MatrixPowerAtomic<MatrixType>::compute2x2(MatrixType& res, RealScalar p) co
 }
 
 template<typename MatrixType>
-void MatrixPowerAtomic<MatrixType>::computeBig(MatrixType& res) const
+void MatrixPowerAtomic<MatrixType>::computeBig(ResultType& res) const
 {
+  using std::ldexp;
   const int digits = std::numeric_limits<RealScalar>::digits;
-  const RealScalar maxNormForPade = digits <=  24? 4.3386528e-1f:                           // sigle precision
-				    digits <=  53? 2.789358995219730e-1:                    // double precision
-				    digits <=  64? 2.4471944416607995472e-1L:               // extended precision
-				    digits <= 106? 1.1016843812851143391275867258512e-1L:   // double-double
-						   9.134603732914548552537150753385375e-2L; // quadruple precision
+  const RealScalar maxNormForPade = digits <=  24? 4.3386528e-1L                            // single precision
+                                  : digits <=  53? 2.789358995219730e-1L                    // double precision
+                                  : digits <=  64? 2.4471944416607995472e-1L                // extended precision
+                                  : digits <= 106? 1.1016843812851143391275867258512e-1L    // double-double
+                                  :                9.134603732914548552537150753385375e-2L; // quadruple precision
   MatrixType IminusT, sqrtT, T = m_A.template triangularView<Upper>();
   RealScalar normIminusT;
   int degree, degree2, numberOfSquareRoots = 0;
   bool hasExtraSquareRoot = false;
 
-  /* FIXME
-   * For singular T, norm(I - T) >= 1 but maxNormForPade < 1, leads to infinite
-   * loop.  We should move 0 eigenvalues to bottom right corner.  We need not
-   * worry about tiny values (e.g. 1e-300) because they will reach 1 if
-   * repetitively sqrt'ed.
-   *
-   * If the 0 eigenvalues are semisimple, they can form a 0 matrix at the
-   * bottom right corner.
-   *
-   * [ T  A ]^p   [ T^p  (T^-1 T^p A) ]
-   * [      ]   = [                   ]
-   * [ 0  0 ]     [  0         0      ]
-   */
   for (Index i=0; i < m_A.cols(); ++i)
     eigen_assert(m_A(i,i) != RealScalar(0));
 
@@ -164,14 +219,14 @@ void MatrixPowerAtomic<MatrixType>::computeBig(MatrixType& res) const
 	break;
       hasExtraSquareRoot = true;
     }
-    MatrixSquareRootTriangular<MatrixType>(T).compute(sqrtT);
+    matrix_sqrt_triangular(T, sqrtT);
     T = sqrtT.template triangularView<Upper>();
     ++numberOfSquareRoots;
   }
   computePade(degree, IminusT, res);
 
   for (; numberOfSquareRoots; --numberOfSquareRoots) {
-    compute2x2(res, std::ldexp(m_p, -numberOfSquareRoots));
+    compute2x2(res, ldexp(m_p, -numberOfSquareRoots));
     res = res.template triangularView<Upper>() * res;
   }
   compute2x2(res, m_p);
@@ -209,7 +264,7 @@ inline int MatrixPowerAtomic<MatrixType>::getPadeDegree(long double normIminusT)
       1.999045567181744e-1L, 2.789358995219730e-1L };
 #elif LDBL_MANT_DIG <= 64
   const int maxPadeDegree = 8;
-  const double maxNormForPade[] = { 6.3854693117491799460e-3L /* degree = 3 */ , 2.6394893435456973676e-2L,
+  const long double maxNormForPade[] = { 6.3854693117491799460e-3L /* degree = 3 */ , 2.6394893435456973676e-2L,
       6.4216043030404063729e-2L, 1.1701165502926694307e-1L, 1.7904284231268670284e-1L, 2.4471944416607995472e-1L };
 #elif LDBL_MANT_DIG <= 106
   const int maxPadeDegree = 10;
@@ -236,19 +291,28 @@ template<typename MatrixType>
 inline typename MatrixPowerAtomic<MatrixType>::ComplexScalar
 MatrixPowerAtomic<MatrixType>::computeSuperDiag(const ComplexScalar& curr, const ComplexScalar& prev, RealScalar p)
 {
-  ComplexScalar logCurr = std::log(curr);
-  ComplexScalar logPrev = std::log(prev);
-  int unwindingNumber = std::ceil((numext::imag(logCurr - logPrev) - M_PI) / (2*M_PI));
-  ComplexScalar w = numext::atanh2(curr - prev, curr + prev) + ComplexScalar(0, M_PI*unwindingNumber);
-  return RealScalar(2) * std::exp(RealScalar(0.5) * p * (logCurr + logPrev)) * std::sinh(p * w) / (curr - prev);
+  using std::ceil;
+  using std::exp;
+  using std::log;
+  using std::sinh;
+
+  ComplexScalar logCurr = log(curr);
+  ComplexScalar logPrev = log(prev);
+  int unwindingNumber = ceil((numext::imag(logCurr - logPrev) - RealScalar(EIGEN_PI)) / RealScalar(2*EIGEN_PI));
+  ComplexScalar w = numext::log1p((curr-prev)/prev)/RealScalar(2) + ComplexScalar(0, EIGEN_PI*unwindingNumber);
+  return RealScalar(2) * exp(RealScalar(0.5) * p * (logCurr + logPrev)) * sinh(p * w) / (curr - prev);
 }
 
 template<typename MatrixType>
 inline typename MatrixPowerAtomic<MatrixType>::RealScalar
 MatrixPowerAtomic<MatrixType>::computeSuperDiag(RealScalar curr, RealScalar prev, RealScalar p)
 {
-  RealScalar w = numext::atanh2(curr - prev, curr + prev);
-  return 2 * std::exp(p * (std::log(curr) + std::log(prev)) / 2) * std::sinh(p * w) / (curr - prev);
+  using std::exp;
+  using std::log;
+  using std::sinh;
+
+  RealScalar w = numext::log1p((curr-prev)/prev)/RealScalar(2);
+  return 2 * exp(p * (log(curr) + log(prev)) / 2) * sinh(p * w) / (curr - prev);
 }
 
 /**
@@ -271,15 +335,9 @@ MatrixPowerAtomic<MatrixType>::computeSuperDiag(RealScalar curr, RealScalar prev
  * Output: \verbinclude MatrixPower_optimal.out
  */
 template<typename MatrixType>
-class MatrixPower
+class MatrixPower : internal::noncopyable
 {
   private:
-    enum {
-      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-      MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
-      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
-    };
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
     typedef typename MatrixType::Index Index;
@@ -293,7 +351,11 @@ class MatrixPower
      * The class stores a reference to A, so it should not be changed
      * (or destroyed) before evaluation.
      */
-    explicit MatrixPower(const MatrixType& A) : m_A(A), m_conditionNumber(0)
+    explicit MatrixPower(const MatrixType& A) :
+      m_A(A),
+      m_conditionNumber(0),
+      m_rank(A.cols()),
+      m_nulls(0)
     { eigen_assert(A.rows() == A.cols()); }
 
     /**
@@ -303,8 +365,8 @@ class MatrixPower
      * \return The expression \f$ A^p \f$, where A is specified in the
      * constructor.
      */
-    const MatrixPowerRetval<MatrixType> operator()(RealScalar p)
-    { return MatrixPowerRetval<MatrixType>(*this, p); }
+    const MatrixPowerParenthesesReturnValue<MatrixType> operator()(RealScalar p)
+    { return MatrixPowerParenthesesReturnValue<MatrixType>(*this, p); }
 
     /**
      * \brief Compute the matrix power.
@@ -321,21 +383,54 @@ class MatrixPower
 
   private:
     typedef std::complex<RealScalar> ComplexScalar;
-    typedef Matrix<ComplexScalar, RowsAtCompileTime, ColsAtCompileTime, MatrixType::Options,
-              MaxRowsAtCompileTime, MaxColsAtCompileTime> ComplexMatrix;
+    typedef Matrix<ComplexScalar, Dynamic, Dynamic, 0,
+              MatrixType::RowsAtCompileTime, MatrixType::ColsAtCompileTime> ComplexMatrix;
 
+    /** \brief Reference to the base of matrix power. */
     typename MatrixType::Nested m_A;
+
+    /** \brief Temporary storage. */
     MatrixType m_tmp;
-    ComplexMatrix m_T, m_U, m_fT;
+
+    /** \brief Store the result of Schur decomposition. */
+    ComplexMatrix m_T, m_U;
+    
+    /** \brief Store fractional power of m_T. */
+    ComplexMatrix m_fT;
+
+    /**
+     * \brief Condition number of m_A.
+     *
+     * It is initialized as 0 to avoid performing unnecessary Schur
+     * decomposition, which is the bottleneck.
+     */
     RealScalar m_conditionNumber;
 
-    RealScalar modfAndInit(RealScalar, RealScalar*);
+    /** \brief Rank of m_A. */
+    Index m_rank;
+    
+    /** \brief Rank deficiency of m_A. */
+    Index m_nulls;
+
+    /**
+     * \brief Split p into integral part and fractional part.
+     *
+     * \param[in]  p        The exponent.
+     * \param[out] p        The fractional part ranging in \f$ (-1, 1) \f$.
+     * \param[out] intpart  The integral part.
+     *
+     * Only if the fractional part is nonzero, it calls initialize().
+     */
+    void split(RealScalar& p, RealScalar& intpart);
+
+    /** \brief Perform Schur decomposition for fractional power. */
+    void initialize();
 
     template<typename ResultType>
-    void computeIntPower(ResultType&, RealScalar);
+    void computeIntPower(ResultType& res, RealScalar p);
 
     template<typename ResultType>
-    void computeFracPower(ResultType&, RealScalar);
+    void computeFracPower(ResultType& res, RealScalar p);
 
     template<int Rows, int Cols, int Options, int MaxRows, int MaxCols>
     static void revertSchur(
@@ -354,59 +449,102 @@ template<typename MatrixType>
 template<typename ResultType>
 void MatrixPower<MatrixType>::compute(ResultType& res, RealScalar p)
 {
+  using std::pow;
   switch (cols()) {
     case 0:
       break;
     case 1:
-      res(0,0) = std::pow(m_A.coeff(0,0), p);
+      res(0,0) = pow(m_A.coeff(0,0), p);
       break;
     default:
-      RealScalar intpart, x = modfAndInit(p, &intpart);
+      RealScalar intpart;
+      split(p, intpart);
+
+      res = MatrixType::Identity(rows(), cols());
       computeIntPower(res, intpart);
-      computeFracPower(res, x);
+      if (p) computeFracPower(res, p);
   }
 }
 
 template<typename MatrixType>
-typename MatrixPower<MatrixType>::RealScalar
-MatrixPower<MatrixType>::modfAndInit(RealScalar x, RealScalar* intpart)
+void MatrixPower<MatrixType>::split(RealScalar& p, RealScalar& intpart)
 {
-  typedef Array<RealScalar, RowsAtCompileTime, 1, ColMajor, MaxRowsAtCompileTime> RealArray;
+  using std::floor;
+  using std::pow;
 
-  *intpart = std::floor(x);
-  RealScalar res = x - *intpart;
+  intpart = floor(p);
+  p -= intpart;
 
-  if (!m_conditionNumber && res) {
-    const ComplexSchur<MatrixType> schurOfA(m_A);
-    m_T = schurOfA.matrixT();
-    m_U = schurOfA.matrixU();
-    
-    const RealArray absTdiag = m_T.diagonal().array().abs();
-    m_conditionNumber = absTdiag.maxCoeff() / absTdiag.minCoeff();
+  // Perform Schur decomposition if it is not yet performed and the power is
+  // not an integer.
+  if (!m_conditionNumber && p)
+    initialize();
+
+  // Choose the more stable of intpart = floor(p) and intpart = ceil(p).
+  if (p > RealScalar(0.5) && p > (1-p) * pow(m_conditionNumber, p)) {
+    --p;
+    ++intpart;
+  }
+}
+
+template<typename MatrixType>
+void MatrixPower<MatrixType>::initialize()
+{
+  const ComplexSchur<MatrixType> schurOfA(m_A);
+  JacobiRotation<ComplexScalar> rot;
+  ComplexScalar eigenvalue;
+
+  m_fT.resizeLike(m_A);
+  m_T = schurOfA.matrixT();
+  m_U = schurOfA.matrixU();
+  m_conditionNumber = m_T.diagonal().array().abs().maxCoeff() / m_T.diagonal().array().abs().minCoeff();
+
+  // Move zero eigenvalues to the bottom right corner.
+  for (Index i = cols()-1; i>=0; --i) {
+    if (m_rank <= 2)
+      return;
+    if (m_T.coeff(i,i) == RealScalar(0)) {
+      for (Index j=i+1; j < m_rank; ++j) {
+        eigenvalue = m_T.coeff(j,j);
+        rot.makeGivens(m_T.coeff(j-1,j), eigenvalue);
+        m_T.applyOnTheRight(j-1, j, rot);
+        m_T.applyOnTheLeft(j-1, j, rot.adjoint());
+        m_T.coeffRef(j-1,j-1) = eigenvalue;
+        m_T.coeffRef(j,j) = RealScalar(0);
+        m_U.applyOnTheRight(j-1, j, rot);
+      }
+      --m_rank;
+    }
   }
 
-  if (res>RealScalar(0.5) && res>(1-res)*std::pow(m_conditionNumber, res)) {
-    --res;
-    ++*intpart;
+  m_nulls = rows() - m_rank;
+  if (m_nulls) {
+    eigen_assert(m_T.bottomRightCorner(m_nulls, m_nulls).isZero()
+        && "Base of matrix power should be invertible or with a semisimple zero eigenvalue.");
+    m_fT.bottomRows(m_nulls).fill(RealScalar(0));
   }
-  return res;
 }
 
 template<typename MatrixType>
 template<typename ResultType>
 void MatrixPower<MatrixType>::computeIntPower(ResultType& res, RealScalar p)
 {
-  RealScalar pp = std::abs(p);
+  using std::abs;
+  using std::fmod;
+  RealScalar pp = abs(p);
 
-  if (p<0)  m_tmp = m_A.inverse();
-  else      m_tmp = m_A;
+  if (p<0) 
+    m_tmp = m_A.inverse();
+  else     
+    m_tmp = m_A;
 
-  res = MatrixType::Identity(rows(), cols());
-  while (pp >= 1) {
-    if (std::fmod(pp, 2) >= 1)
+  while (true) {
+    if (fmod(pp, 2) >= 1)
       res = m_tmp * res;
-    m_tmp *= m_tmp;
     pp /= 2;
+    if (pp < 1)
+      break;
+    m_tmp *= m_tmp;
   }
 }
 
@@ -414,12 +552,17 @@ template<typename MatrixType>
 template<typename ResultType>
 void MatrixPower<MatrixType>::computeFracPower(ResultType& res, RealScalar p)
 {
-  if (p) {
-    eigen_assert(m_conditionNumber);
-    MatrixPowerAtomic<ComplexMatrix>(m_T, p).compute(m_fT);
-    revertSchur(m_tmp, m_fT, m_U);
-    res = m_tmp * res;
+  Block<ComplexMatrix,Dynamic,Dynamic> blockTp(m_fT, 0, 0, m_rank, m_rank);
+  eigen_assert(m_conditionNumber);
+  eigen_assert(m_rank + m_nulls == rows());
+
+  MatrixPowerAtomic<ComplexMatrix>(m_T.topLeftCorner(m_rank, m_rank), p).compute(blockTp);
+  if (m_nulls) {
+    m_fT.topRightCorner(m_rank, m_nulls) = m_T.topLeftCorner(m_rank, m_rank).template triangularView<Upper>()
+        .solve(blockTp * m_T.topRightCorner(m_rank, m_nulls));
   }
+  revertSchur(m_tmp, m_fT, m_U);
+  res = m_tmp * res;
 }
 
 template<typename MatrixType>
@@ -463,7 +606,7 @@ class MatrixPowerReturnValue : public ReturnByValue< MatrixPowerReturnValue<Deri
      * \brief Constructor.
      *
      * \param[in] A  %Matrix (expression), the base of the matrix power.
-     * \param[in] p  scalar, the exponent of the matrix power.
+     * \param[in] p  real scalar, the exponent of the matrix power.
      */
     MatrixPowerReturnValue(const Derived& A, RealScalar p) : m_A(A), m_p(p)
     { }
@@ -484,25 +627,83 @@ class MatrixPowerReturnValue : public ReturnByValue< MatrixPowerReturnValue<Deri
   private:
     const Derived& m_A;
     const RealScalar m_p;
-    MatrixPowerReturnValue& operator=(const MatrixPowerReturnValue&);
+};
+
+/**
+ * \ingroup MatrixFunctions_Module
+ *
+ * \brief Proxy for the matrix power of some matrix (expression).
+ *
+ * \tparam Derived  type of the base, a matrix (expression).
+ *
+ * This class holds the arguments to the matrix power until it is
+ * assigned or evaluated for some other reason (so the argument
+ * should not be changed in the meantime). It is the return type of
+ * MatrixBase::pow() and related functions and most of the
+ * time this is the only way it is used.
+ */
+template<typename Derived>
+class MatrixComplexPowerReturnValue : public ReturnByValue< MatrixComplexPowerReturnValue<Derived> >
+{
+  public:
+    typedef typename Derived::PlainObject PlainObject;
+    typedef typename std::complex<typename Derived::RealScalar> ComplexScalar;
+    typedef typename Derived::Index Index;
+
+    /**
+     * \brief Constructor.
+     *
+     * \param[in] A  %Matrix (expression), the base of the matrix power.
+     * \param[in] p  complex scalar, the exponent of the matrix power.
+     */
+    MatrixComplexPowerReturnValue(const Derived& A, const ComplexScalar& p) : m_A(A), m_p(p)
+    { }
+
+    /**
+     * \brief Compute the matrix power.
+     *
+     * Because \p p is complex, \f$ A^p \f$ is simply evaluated as \f$
+     * \exp(p \log(A)) \f$.
+     *
+     * \param[out] result  \f$ A^p \f$ where \p A and \p p are as in the
+     * constructor.
+     */
+    template<typename ResultType>
+    inline void evalTo(ResultType& res) const
+    { res = (m_p * m_A.log()).exp(); }
+
+    Index rows() const { return m_A.rows(); }
+    Index cols() const { return m_A.cols(); }
+
+  private:
+    const Derived& m_A;
+    const ComplexScalar m_p;
 };
 
 namespace internal {
 
 template<typename MatrixPowerType>
-struct traits< MatrixPowerRetval<MatrixPowerType> >
+struct traits< MatrixPowerParenthesesReturnValue<MatrixPowerType> >
 { typedef typename MatrixPowerType::PlainObject ReturnType; };
 
 template<typename Derived>
 struct traits< MatrixPowerReturnValue<Derived> >
 { typedef typename Derived::PlainObject ReturnType; };
 
+template<typename Derived>
+struct traits< MatrixComplexPowerReturnValue<Derived> >
+{ typedef typename Derived::PlainObject ReturnType; };
+
 }
 
 template<typename Derived>
 const MatrixPowerReturnValue<Derived> MatrixBase<Derived>::pow(const RealScalar& p) const
 { return MatrixPowerReturnValue<Derived>(derived(), p); }
 
+template<typename Derived>
+const MatrixComplexPowerReturnValue<Derived> MatrixBase<Derived>::pow(const std::complex<RealScalar>& p) const
+{ return MatrixComplexPowerReturnValue<Derived>(derived(), p); }
+
 } // namespace Eigen
 
 #endif // EIGEN_MATRIX_POWER
diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h b/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h
index b48ea9d46..afd88ec4d 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2011 Jitse Niesen <jitse@maths.leeds.ac.uk>
+// Copyright (C) 2011, 2013 Jitse Niesen <jitse@maths.leeds.ac.uk>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -12,133 +12,16 @@
 
 namespace Eigen { 
 
-/** \ingroup MatrixFunctions_Module
-  * \brief Class for computing matrix square roots of upper quasi-triangular matrices.
-  * \tparam  MatrixType  type of the argument of the matrix square root,
-  *                      expected to be an instantiation of the Matrix class template.
-  *
-  * This class computes the square root of the upper quasi-triangular
-  * matrix stored in the upper Hessenberg part of the matrix passed to
-  * the constructor.
-  *
-  * \sa MatrixSquareRoot, MatrixSquareRootTriangular
-  */
-template <typename MatrixType>
-class MatrixSquareRootQuasiTriangular
-{
-  public:
-
-    /** \brief Constructor. 
-      *
-      * \param[in]  A  upper quasi-triangular matrix whose square root 
-      *                is to be computed.
-      *
-      * The class stores a reference to \p A, so it should not be
-      * changed (or destroyed) before compute() is called.
-      */
-    MatrixSquareRootQuasiTriangular(const MatrixType& A) 
-      : m_A(A) 
-    {
-      eigen_assert(A.rows() == A.cols());
-    }
-    
-    /** \brief Compute the matrix square root
-      *
-      * \param[out] result  square root of \p A, as specified in the constructor.
-      *
-      * Only the upper Hessenberg part of \p result is updated, the
-      * rest is not touched.  See MatrixBase::sqrt() for details on
-      * how this computation is implemented.
-      */
-    template <typename ResultType> void compute(ResultType &result);    
-    
-  private:
-    typedef typename MatrixType::Index Index;
-    typedef typename MatrixType::Scalar Scalar;
-    
-    void computeDiagonalPartOfSqrt(MatrixType& sqrtT, const MatrixType& T);
-    void computeOffDiagonalPartOfSqrt(MatrixType& sqrtT, const MatrixType& T);
-    void compute2x2diagonalBlock(MatrixType& sqrtT, const MatrixType& T, typename MatrixType::Index i);
-    void compute1x1offDiagonalBlock(MatrixType& sqrtT, const MatrixType& T, 
-				  typename MatrixType::Index i, typename MatrixType::Index j);
-    void compute1x2offDiagonalBlock(MatrixType& sqrtT, const MatrixType& T, 
-				  typename MatrixType::Index i, typename MatrixType::Index j);
-    void compute2x1offDiagonalBlock(MatrixType& sqrtT, const MatrixType& T, 
-				  typename MatrixType::Index i, typename MatrixType::Index j);
-    void compute2x2offDiagonalBlock(MatrixType& sqrtT, const MatrixType& T, 
-				  typename MatrixType::Index i, typename MatrixType::Index j);
-  
-    template <typename SmallMatrixType>
-    static void solveAuxiliaryEquation(SmallMatrixType& X, const SmallMatrixType& A, 
-				     const SmallMatrixType& B, const SmallMatrixType& C);
-  
-    const MatrixType& m_A;
-};
-
-template <typename MatrixType>
-template <typename ResultType> 
-void MatrixSquareRootQuasiTriangular<MatrixType>::compute(ResultType &result)
-{
-  result.resize(m_A.rows(), m_A.cols());
-  computeDiagonalPartOfSqrt(result, m_A);
-  computeOffDiagonalPartOfSqrt(result, m_A);
-}
-
-// pre:  T is quasi-upper-triangular and sqrtT is a zero matrix of the same size
-// post: the diagonal blocks of sqrtT are the square roots of the diagonal blocks of T
-template <typename MatrixType>
-void MatrixSquareRootQuasiTriangular<MatrixType>::computeDiagonalPartOfSqrt(MatrixType& sqrtT, 
-									  const MatrixType& T)
-{
-  using std::sqrt;
-  const Index size = m_A.rows();
-  for (Index i = 0; i < size; i++) {
-    if (i == size - 1 || T.coeff(i+1, i) == 0) {
-      eigen_assert(T(i,i) >= 0);
-      sqrtT.coeffRef(i,i) = sqrt(T.coeff(i,i));
-    }
-    else {
-      compute2x2diagonalBlock(sqrtT, T, i);
-      ++i;
-    }
-  }
-}
-
-// pre:  T is quasi-upper-triangular and diagonal blocks of sqrtT are square root of diagonal blocks of T.
-// post: sqrtT is the square root of T.
-template <typename MatrixType>
-void MatrixSquareRootQuasiTriangular<MatrixType>::computeOffDiagonalPartOfSqrt(MatrixType& sqrtT, 
-									     const MatrixType& T)
-{
-  const Index size = m_A.rows();
-  for (Index j = 1; j < size; j++) {
-      if (T.coeff(j, j-1) != 0)  // if T(j-1:j, j-1:j) is a 2-by-2 block
-	continue;
-    for (Index i = j-1; i >= 0; i--) {
-      if (i > 0 && T.coeff(i, i-1) != 0)  // if T(i-1:i, i-1:i) is a 2-by-2 block
-	continue;
-      bool iBlockIs2x2 = (i < size - 1) && (T.coeff(i+1, i) != 0);
-      bool jBlockIs2x2 = (j < size - 1) && (T.coeff(j+1, j) != 0);
-      if (iBlockIs2x2 && jBlockIs2x2) 
-	compute2x2offDiagonalBlock(sqrtT, T, i, j);
-      else if (iBlockIs2x2 && !jBlockIs2x2) 
-	compute2x1offDiagonalBlock(sqrtT, T, i, j);
-      else if (!iBlockIs2x2 && jBlockIs2x2) 
-	compute1x2offDiagonalBlock(sqrtT, T, i, j);
-      else if (!iBlockIs2x2 && !jBlockIs2x2) 
-	compute1x1offDiagonalBlock(sqrtT, T, i, j);
-    }
-  }
-}
+namespace internal {
 
 // pre:  T.block(i,i,2,2) has complex conjugate eigenvalues
 // post: sqrtT.block(i,i,2,2) is square root of T.block(i,i,2,2)
-template <typename MatrixType>
-void MatrixSquareRootQuasiTriangular<MatrixType>
-     ::compute2x2diagonalBlock(MatrixType& sqrtT, const MatrixType& T, typename MatrixType::Index i)
+template <typename MatrixType, typename ResultType>
+void matrix_sqrt_quasi_triangular_2x2_diagonal_block(const MatrixType& T, typename MatrixType::Index i, ResultType& sqrtT)
 {
   // TODO: This case (2-by-2 blocks with complex conjugate eigenvalues) is probably hidden somewhere
   //       in EigenSolver. If we expose it, we could call it directly from here.
+  typedef typename traits<MatrixType>::Scalar Scalar;
   Matrix<Scalar,2,2> block = T.template block<2,2>(i,i);
   EigenSolver<Matrix<Scalar,2,2> > es(block);
   sqrtT.template block<2,2>(i,i)
@@ -148,21 +31,19 @@ void MatrixSquareRootQuasiTriangular<MatrixType>
 // pre:  block structure of T is such that (i,j) is a 1x1 block,
 //       all blocks of sqrtT to left of and below (i,j) are correct
 // post: sqrtT(i,j) has the correct value
-template <typename MatrixType>
-void MatrixSquareRootQuasiTriangular<MatrixType>
-     ::compute1x1offDiagonalBlock(MatrixType& sqrtT, const MatrixType& T, 
-				  typename MatrixType::Index i, typename MatrixType::Index j)
+template <typename MatrixType, typename ResultType>
+void matrix_sqrt_quasi_triangular_1x1_off_diagonal_block(const MatrixType& T, typename MatrixType::Index i, typename MatrixType::Index j, ResultType& sqrtT)
 {
+  typedef typename traits<MatrixType>::Scalar Scalar;
   Scalar tmp = (sqrtT.row(i).segment(i+1,j-i-1) * sqrtT.col(j).segment(i+1,j-i-1)).value();
   sqrtT.coeffRef(i,j) = (T.coeff(i,j) - tmp) / (sqrtT.coeff(i,i) + sqrtT.coeff(j,j));
 }
 
 // similar to compute1x1offDiagonalBlock()
-template <typename MatrixType>
-void MatrixSquareRootQuasiTriangular<MatrixType>
-     ::compute1x2offDiagonalBlock(MatrixType& sqrtT, const MatrixType& T, 
-				  typename MatrixType::Index i, typename MatrixType::Index j)
+template <typename MatrixType, typename ResultType>
+void matrix_sqrt_quasi_triangular_1x2_off_diagonal_block(const MatrixType& T, typename MatrixType::Index i, typename MatrixType::Index j, ResultType& sqrtT)
 {
+  typedef typename traits<MatrixType>::Scalar Scalar;
   Matrix<Scalar,1,2> rhs = T.template block<1,2>(i,j);
   if (j-i > 1)
     rhs -= sqrtT.block(i, i+1, 1, j-i-1) * sqrtT.block(i+1, j, j-i-1, 2);
@@ -172,11 +53,10 @@ void MatrixSquareRootQuasiTriangular<MatrixType>
 }
 
 // similar to compute1x1offDiagonalBlock()
-template <typename MatrixType>
-void MatrixSquareRootQuasiTriangular<MatrixType>
-     ::compute2x1offDiagonalBlock(MatrixType& sqrtT, const MatrixType& T, 
-				  typename MatrixType::Index i, typename MatrixType::Index j)
+template <typename MatrixType, typename ResultType>
+void matrix_sqrt_quasi_triangular_2x1_off_diagonal_block(const MatrixType& T, typename MatrixType::Index i, typename MatrixType::Index j, ResultType& sqrtT)
 {
+  typedef typename traits<MatrixType>::Scalar Scalar;
   Matrix<Scalar,2,1> rhs = T.template block<2,1>(i,j);
   if (j-i > 2)
     rhs -= sqrtT.block(i, i+2, 2, j-i-2) * sqrtT.block(i+2, j, j-i-2, 1);
@@ -185,32 +65,11 @@ void MatrixSquareRootQuasiTriangular<MatrixType>
   sqrtT.template block<2,1>(i,j) = A.fullPivLu().solve(rhs);
 }
 
-// similar to compute1x1offDiagonalBlock()
-template <typename MatrixType>
-void MatrixSquareRootQuasiTriangular<MatrixType>
-     ::compute2x2offDiagonalBlock(MatrixType& sqrtT, const MatrixType& T, 
-				  typename MatrixType::Index i, typename MatrixType::Index j)
-{
-  Matrix<Scalar,2,2> A = sqrtT.template block<2,2>(i,i);
-  Matrix<Scalar,2,2> B = sqrtT.template block<2,2>(j,j);
-  Matrix<Scalar,2,2> C = T.template block<2,2>(i,j);
-  if (j-i > 2)
-    C -= sqrtT.block(i, i+2, 2, j-i-2) * sqrtT.block(i+2, j, j-i-2, 2);
-  Matrix<Scalar,2,2> X;
-  solveAuxiliaryEquation(X, A, B, C);
-  sqrtT.template block<2,2>(i,j) = X;
-}
-
 // solves the equation A X + X B = C where all matrices are 2-by-2
 template <typename MatrixType>
-template <typename SmallMatrixType>
-void MatrixSquareRootQuasiTriangular<MatrixType>
-     ::solveAuxiliaryEquation(SmallMatrixType& X, const SmallMatrixType& A,
-			      const SmallMatrixType& B, const SmallMatrixType& C)
+void matrix_sqrt_quasi_triangular_solve_auxiliary_equation(MatrixType& X, const MatrixType& A, const MatrixType& B, const MatrixType& C)
 {
-  EIGEN_STATIC_ASSERT((internal::is_same<SmallMatrixType, Matrix<Scalar,2,2> >::value),
-		      EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT);
-
+  typedef typename traits<MatrixType>::Scalar Scalar;
   Matrix<Scalar,4,4> coeffMatrix = Matrix<Scalar,4,4>::Zero();
   coeffMatrix.coeffRef(0,0) = A.coeff(0,0) + B.coeff(0,0);
   coeffMatrix.coeffRef(1,1) = A.coeff(0,0) + B.coeff(1,1);
@@ -224,13 +83,13 @@ void MatrixSquareRootQuasiTriangular<MatrixType>
   coeffMatrix.coeffRef(2,3) = B.coeff(1,0);
   coeffMatrix.coeffRef(3,1) = A.coeff(1,0);
   coeffMatrix.coeffRef(3,2) = B.coeff(0,1);
-  
+
   Matrix<Scalar,4,1> rhs;
   rhs.coeffRef(0) = C.coeff(0,0);
   rhs.coeffRef(1) = C.coeff(0,1);
   rhs.coeffRef(2) = C.coeff(1,0);
   rhs.coeffRef(3) = C.coeff(1,1);
-  
+
   Matrix<Scalar,4,1> result;
   result = coeffMatrix.fullPivLu().solve(rhs);
 
@@ -240,165 +99,208 @@ void MatrixSquareRootQuasiTriangular<MatrixType>
   X.coeffRef(1,1) = result.coeff(3);
 }
 
+// similar to compute1x1offDiagonalBlock()
+template <typename MatrixType, typename ResultType>
+void matrix_sqrt_quasi_triangular_2x2_off_diagonal_block(const MatrixType& T, typename MatrixType::Index i, typename MatrixType::Index j, ResultType& sqrtT)
+{
+  typedef typename traits<MatrixType>::Scalar Scalar;
+  Matrix<Scalar,2,2> A = sqrtT.template block<2,2>(i,i);
+  Matrix<Scalar,2,2> B = sqrtT.template block<2,2>(j,j);
+  Matrix<Scalar,2,2> C = T.template block<2,2>(i,j);
+  if (j-i > 2)
+    C -= sqrtT.block(i, i+2, 2, j-i-2) * sqrtT.block(i+2, j, j-i-2, 2);
+  Matrix<Scalar,2,2> X;
+  matrix_sqrt_quasi_triangular_solve_auxiliary_equation(X, A, B, C);
+  sqrtT.template block<2,2>(i,j) = X;
+}
+
+// pre:  T is quasi-upper-triangular and sqrtT is a zero matrix of the same size
+// post: the diagonal blocks of sqrtT are the square roots of the diagonal blocks of T
+template <typename MatrixType, typename ResultType>
+void matrix_sqrt_quasi_triangular_diagonal(const MatrixType& T, ResultType& sqrtT)
+{
+  using std::sqrt;
+  typedef typename MatrixType::Index Index;
+  const Index size = T.rows();
+  for (Index i = 0; i < size; i++) {
+    if (i == size - 1 || T.coeff(i+1, i) == 0) {
+      eigen_assert(T(i,i) >= 0);
+      sqrtT.coeffRef(i,i) = sqrt(T.coeff(i,i));
+    }
+    else {
+      matrix_sqrt_quasi_triangular_2x2_diagonal_block(T, i, sqrtT);
+      ++i;
+    }
+  }
+}
+
+// pre:  T is quasi-upper-triangular and diagonal blocks of sqrtT are square root of diagonal blocks of T.
+// post: sqrtT is the square root of T.
+template <typename MatrixType, typename ResultType>
+void matrix_sqrt_quasi_triangular_off_diagonal(const MatrixType& T, ResultType& sqrtT)
+{
+  typedef typename MatrixType::Index Index;
+  const Index size = T.rows();
+  for (Index j = 1; j < size; j++) {
+      if (T.coeff(j, j-1) != 0)  // if T(j-1:j, j-1:j) is a 2-by-2 block
+	continue;
+    for (Index i = j-1; i >= 0; i--) {
+      if (i > 0 && T.coeff(i, i-1) != 0)  // if T(i-1:i, i-1:i) is a 2-by-2 block
+	continue;
+      bool iBlockIs2x2 = (i < size - 1) && (T.coeff(i+1, i) != 0);
+      bool jBlockIs2x2 = (j < size - 1) && (T.coeff(j+1, j) != 0);
+      if (iBlockIs2x2 && jBlockIs2x2) 
+        matrix_sqrt_quasi_triangular_2x2_off_diagonal_block(T, i, j, sqrtT);
+      else if (iBlockIs2x2 && !jBlockIs2x2) 
+        matrix_sqrt_quasi_triangular_2x1_off_diagonal_block(T, i, j, sqrtT);
+      else if (!iBlockIs2x2 && jBlockIs2x2) 
+        matrix_sqrt_quasi_triangular_1x2_off_diagonal_block(T, i, j, sqrtT);
+      else if (!iBlockIs2x2 && !jBlockIs2x2) 
+        matrix_sqrt_quasi_triangular_1x1_off_diagonal_block(T, i, j, sqrtT);
+    }
+  }
+}
+
+} // end of namespace internal
 
 /** \ingroup MatrixFunctions_Module
-  * \brief Class for computing matrix square roots of upper triangular matrices.
-  * \tparam  MatrixType  type of the argument of the matrix square root,
+  * \brief Compute matrix square root of quasi-triangular matrix.
+  *
+  * \tparam  MatrixType  type of \p arg, the argument of matrix square root,
   *                      expected to be an instantiation of the Matrix class template.
+  * \tparam  ResultType  type of \p result, where result is to be stored.
+  * \param[in]  arg      argument of matrix square root.
+  * \param[out] result   matrix square root of upper Hessenberg part of \p arg.
   *
-  * This class computes the square root of the upper triangular matrix
-  * stored in the upper triangular part (including the diagonal) of
-  * the matrix passed to the constructor.
+  * This function computes the square root of the upper quasi-triangular matrix stored in the upper
+  * Hessenberg part of \p arg.  Only the upper Hessenberg part of \p result is updated, the rest is
+  * not touched.  See MatrixBase::sqrt() for details on how this computation is implemented.
   *
   * \sa MatrixSquareRoot, MatrixSquareRootQuasiTriangular
   */
-template <typename MatrixType>
-class MatrixSquareRootTriangular
+template <typename MatrixType, typename ResultType> 
+void matrix_sqrt_quasi_triangular(const MatrixType &arg, ResultType &result)
 {
-  public:
-    MatrixSquareRootTriangular(const MatrixType& A) 
-      : m_A(A) 
-    {
-      eigen_assert(A.rows() == A.cols());
-    }
-
-    /** \brief Compute the matrix square root
-      *
-      * \param[out] result  square root of \p A, as specified in the constructor.
-      *
-      * Only the upper triangular part (including the diagonal) of 
-      * \p result is updated, the rest is not touched.  See
-      * MatrixBase::sqrt() for details on how this computation is
-      * implemented.
-      */
-    template <typename ResultType> void compute(ResultType &result);    
+  eigen_assert(arg.rows() == arg.cols());
+  result.resize(arg.rows(), arg.cols());
+  internal::matrix_sqrt_quasi_triangular_diagonal(arg, result);
+  internal::matrix_sqrt_quasi_triangular_off_diagonal(arg, result);
+}
 
- private:
-    const MatrixType& m_A;
-};
 
-template <typename MatrixType>
-template <typename ResultType> 
-void MatrixSquareRootTriangular<MatrixType>::compute(ResultType &result)
+/** \ingroup MatrixFunctions_Module
+  * \brief Compute matrix square root of triangular matrix.
+  *
+  * \tparam  MatrixType  type of \p arg, the argument of matrix square root,
+  *                      expected to be an instantiation of the Matrix class template.
+  * \tparam  ResultType  type of \p result, where result is to be stored.
+  * \param[in]  arg      argument of matrix square root.
+  * \param[out] result   matrix square root of upper triangular part of \p arg.
+  *
+  * Only the upper triangular part (including the diagonal) of \p result is updated, the rest is not
+  * touched.  See MatrixBase::sqrt() for details on how this computation is implemented.
+  *
+  * \sa MatrixSquareRoot, MatrixSquareRootQuasiTriangular
+  */
+template <typename MatrixType, typename ResultType> 
+void matrix_sqrt_triangular(const MatrixType &arg, ResultType &result)
 {
   using std::sqrt;
+  typedef typename MatrixType::Index Index;
+      typedef typename MatrixType::Scalar Scalar;
 
-  // Compute square root of m_A and store it in upper triangular part of result
+  eigen_assert(arg.rows() == arg.cols());
+
+  // Compute square root of arg and store it in upper triangular part of result
   // This uses that the square root of triangular matrices can be computed directly.
-  result.resize(m_A.rows(), m_A.cols());
-  typedef typename MatrixType::Index Index;
-  for (Index i = 0; i < m_A.rows(); i++) {
-    result.coeffRef(i,i) = sqrt(m_A.coeff(i,i));
+  result.resize(arg.rows(), arg.cols());
+  for (Index i = 0; i < arg.rows(); i++) {
+    result.coeffRef(i,i) = sqrt(arg.coeff(i,i));
   }
-  for (Index j = 1; j < m_A.cols(); j++) {
+  for (Index j = 1; j < arg.cols(); j++) {
     for (Index i = j-1; i >= 0; i--) {
-      typedef typename MatrixType::Scalar Scalar;
       // if i = j-1, then segment has length 0 so tmp = 0
       Scalar tmp = (result.row(i).segment(i+1,j-i-1) * result.col(j).segment(i+1,j-i-1)).value();
       // denominator may be zero if original matrix is singular
-      result.coeffRef(i,j) = (m_A.coeff(i,j) - tmp) / (result.coeff(i,i) + result.coeff(j,j));
+      result.coeffRef(i,j) = (arg.coeff(i,j) - tmp) / (result.coeff(i,i) + result.coeff(j,j));
     }
   }
 }
 
 
+namespace internal {
+
 /** \ingroup MatrixFunctions_Module
-  * \brief Class for computing matrix square roots of general matrices.
+  * \brief Helper struct for computing matrix square roots of general matrices.
   * \tparam  MatrixType  type of the argument of the matrix square root,
   *                      expected to be an instantiation of the Matrix class template.
   *
   * \sa MatrixSquareRootTriangular, MatrixSquareRootQuasiTriangular, MatrixBase::sqrt()
   */
 template <typename MatrixType, int IsComplex = NumTraits<typename internal::traits<MatrixType>::Scalar>::IsComplex>
-class MatrixSquareRoot
+struct matrix_sqrt_compute
 {
-  public:
-
-    /** \brief Constructor. 
-      *
-      * \param[in]  A  matrix whose square root is to be computed.
-      *
-      * The class stores a reference to \p A, so it should not be
-      * changed (or destroyed) before compute() is called.
-      */
-    MatrixSquareRoot(const MatrixType& A); 
-    
-    /** \brief Compute the matrix square root
-      *
-      * \param[out] result  square root of \p A, as specified in the constructor.
-      *
-      * See MatrixBase::sqrt() for details on how this computation is
-      * implemented.
-      */
-    template <typename ResultType> void compute(ResultType &result);    
+  /** \brief Compute the matrix square root
+    *
+    * \param[in]  arg     matrix whose square root is to be computed.
+    * \param[out] result  square root of \p arg.
+    *
+    * See MatrixBase::sqrt() for details on how this computation is implemented.
+    */
+  template <typename ResultType> static void run(const MatrixType &arg, ResultType &result);    
 };
 
 
 // ********** Partial specialization for real matrices **********
 
 template <typename MatrixType>
-class MatrixSquareRoot<MatrixType, 0>
+struct matrix_sqrt_compute<MatrixType, 0>
 {
-  public:
-
-    MatrixSquareRoot(const MatrixType& A) 
-      : m_A(A) 
-    {  
-      eigen_assert(A.rows() == A.cols());
-    }
-  
-    template <typename ResultType> void compute(ResultType &result)
-    {
-      // Compute Schur decomposition of m_A
-      const RealSchur<MatrixType> schurOfA(m_A);  
-      const MatrixType& T = schurOfA.matrixT();
-      const MatrixType& U = schurOfA.matrixU();
-    
-      // Compute square root of T
-      MatrixType sqrtT = MatrixType::Zero(m_A.rows(), m_A.cols());
-      MatrixSquareRootQuasiTriangular<MatrixType>(T).compute(sqrtT);
+  template <typename ResultType>
+  static void run(const MatrixType &arg, ResultType &result)
+  {
+    eigen_assert(arg.rows() == arg.cols());
+
+    // Compute Schur decomposition of arg
+    const RealSchur<MatrixType> schurOfA(arg);  
+    const MatrixType& T = schurOfA.matrixT();
+    const MatrixType& U = schurOfA.matrixU();
     
-      // Compute square root of m_A
-      result = U * sqrtT * U.adjoint();
-    }
+    // Compute square root of T
+    MatrixType sqrtT = MatrixType::Zero(arg.rows(), arg.cols());
+    matrix_sqrt_quasi_triangular(T, sqrtT);
     
-  private:
-    const MatrixType& m_A;
+    // Compute square root of arg
+    result = U * sqrtT * U.adjoint();
+  }
 };
 
 
 // ********** Partial specialization for complex matrices **********
 
 template <typename MatrixType>
-class MatrixSquareRoot<MatrixType, 1>
+struct matrix_sqrt_compute<MatrixType, 1>
 {
-  public:
-
-    MatrixSquareRoot(const MatrixType& A) 
-      : m_A(A) 
-    {  
-      eigen_assert(A.rows() == A.cols());
-    }
-  
-    template <typename ResultType> void compute(ResultType &result)
-    {
-      // Compute Schur decomposition of m_A
-      const ComplexSchur<MatrixType> schurOfA(m_A);  
-      const MatrixType& T = schurOfA.matrixT();
-      const MatrixType& U = schurOfA.matrixU();
+  template <typename ResultType>
+  static void run(const MatrixType &arg, ResultType &result)
+  {
+    eigen_assert(arg.rows() == arg.cols());
+
+    // Compute Schur decomposition of arg
+    const ComplexSchur<MatrixType> schurOfA(arg);  
+    const MatrixType& T = schurOfA.matrixT();
+    const MatrixType& U = schurOfA.matrixU();
     
-      // Compute square root of T
-      MatrixType sqrtT;
-      MatrixSquareRootTriangular<MatrixType>(T).compute(sqrtT);
+    // Compute square root of T
+    MatrixType sqrtT;
+    matrix_sqrt_triangular(T, sqrtT);
     
-      // Compute square root of m_A
-      result = U * (sqrtT.template triangularView<Upper>() * U.adjoint());
-    }
-    
-  private:
-    const MatrixType& m_A;
+    // Compute square root of arg
+    result = U * (sqrtT.template triangularView<Upper>() * U.adjoint());
+  }
 };
 
+} // end namespace internal
 
 /** \ingroup MatrixFunctions_Module
   *
@@ -415,14 +317,17 @@ class MatrixSquareRoot<MatrixType, 1>
 template<typename Derived> class MatrixSquareRootReturnValue
 : public ReturnByValue<MatrixSquareRootReturnValue<Derived> >
 {
+  protected:
     typedef typename Derived::Index Index;
+    typedef typename internal::ref_selector<Derived>::type DerivedNested;
+
   public:
     /** \brief Constructor.
       *
       * \param[in]  src  %Matrix (expression) forming the argument of the
       * matrix square root.
       */
-    MatrixSquareRootReturnValue(const Derived& src) : m_src(src) { }
+    explicit MatrixSquareRootReturnValue(const Derived& src) : m_src(src) { }
 
     /** \brief Compute the matrix square root.
       *
@@ -432,18 +337,17 @@ template<typename Derived> class MatrixSquareRootReturnValue
     template <typename ResultType>
     inline void evalTo(ResultType& result) const
     {
-      const typename Derived::PlainObject srcEvaluated = m_src.eval();
-      MatrixSquareRoot<typename Derived::PlainObject> me(srcEvaluated);
-      me.compute(result);
+      typedef typename internal::nested_eval<Derived, 10>::type DerivedEvalType;
+      typedef typename internal::remove_all<DerivedEvalType>::type DerivedEvalTypeClean;
+      DerivedEvalType tmp(m_src);
+      internal::matrix_sqrt_compute<DerivedEvalTypeClean>::run(tmp, result);
     }
 
     Index rows() const { return m_src.rows(); }
     Index cols() const { return m_src.cols(); }
 
   protected:
-    const Derived& m_src;
-  private:
-    MatrixSquareRootReturnValue& operator=(const MatrixSquareRootReturnValue&);
+    const DerivedNested m_src;
 };
 
 namespace internal {
diff --git a/unsupported/Eigen/src/MatrixFunctions/StemFunction.h b/unsupported/Eigen/src/MatrixFunctions/StemFunction.h
index 724e55c1d..7604df903 100644
--- a/unsupported/Eigen/src/MatrixFunctions/StemFunction.h
+++ b/unsupported/Eigen/src/MatrixFunctions/StemFunction.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2010 Jitse Niesen <jitse@maths.leeds.ac.uk>
+// Copyright (C) 2010, 2013 Jitse Niesen <jitse@maths.leeds.ac.uk>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -12,93 +12,105 @@
 
 namespace Eigen { 
 
-/** \ingroup MatrixFunctions_Module 
-  * \brief Stem functions corresponding to standard mathematical functions.
-  */
+namespace internal {
+
+/** \brief The exponential function (and its derivatives). */
 template <typename Scalar>
-class StdStemFunctions
+Scalar stem_function_exp(Scalar x, int)
 {
-  public:
+  using std::exp;
+  return exp(x);
+}
 
-    /** \brief The exponential function (and its derivatives). */
-    static Scalar exp(Scalar x, int)
-    {
-      return std::exp(x);
-    }
+/** \brief Cosine (and its derivatives). */
+template <typename Scalar>
+Scalar stem_function_cos(Scalar x, int n)
+{
+  using std::cos;
+  using std::sin;
+  Scalar res;
 
-    /** \brief Cosine (and its derivatives). */
-    static Scalar cos(Scalar x, int n)
-    {
-      Scalar res;
-      switch (n % 4) {
-      case 0: 
-	res = std::cos(x);
-	break;
-      case 1:
-	res = -std::sin(x);
-	break;
-      case 2:
-	res = -std::cos(x);
-	break;
-      case 3:
-	res = std::sin(x);
-	break;
-      }
-      return res;
-    }
+  switch (n % 4) {
+  case 0: 
+    res = std::cos(x);
+    break;
+  case 1:
+    res = -std::sin(x);
+    break;
+  case 2:
+    res = -std::cos(x);
+    break;
+  case 3:
+    res = std::sin(x);
+    break;
+  }
+  return res;
+}
+
+/** \brief Sine (and its derivatives). */
+template <typename Scalar>
+Scalar stem_function_sin(Scalar x, int n)
+{
+  using std::cos;
+  using std::sin;
+  Scalar res;
 
-    /** \brief Sine (and its derivatives). */
-    static Scalar sin(Scalar x, int n)
-    {
-      Scalar res;
-      switch (n % 4) {
-      case 0:
-	res = std::sin(x);
-	break;
-      case 1:
-	res = std::cos(x);
-	break;
-      case 2:
-	res = -std::sin(x);
-	break;
-      case 3:
-	res = -std::cos(x);
-	break;
-      }
-      return res;
-    }
+  switch (n % 4) {
+  case 0:
+    res = std::sin(x);
+    break;
+  case 1:
+    res = std::cos(x);
+    break;
+  case 2:
+    res = -std::sin(x);
+    break;
+  case 3:
+    res = -std::cos(x);
+    break;
+  }
+  return res;
+}
 
-    /** \brief Hyperbolic cosine (and its derivatives). */
-    static Scalar cosh(Scalar x, int n)
-    {
-      Scalar res;
-      switch (n % 2) {
-      case 0:
-	res = std::cosh(x);
-	break;
-      case 1:
-	res = std::sinh(x);
-	break;
-      }
-      return res;
-    }
+/** \brief Hyperbolic cosine (and its derivatives). */
+template <typename Scalar>
+Scalar stem_function_cosh(Scalar x, int n)
+{
+  using std::cosh;
+  using std::sinh;
+  Scalar res;
+  
+  switch (n % 2) {
+  case 0:
+    res = std::cosh(x);
+    break;
+  case 1:
+    res = std::sinh(x);
+    break;
+  }
+  return res;
+}
 	
-    /** \brief Hyperbolic sine (and its derivatives). */
-    static Scalar sinh(Scalar x, int n)
-    {
-      Scalar res;
-      switch (n % 2) {
-      case 0:
-	res = std::sinh(x);
-	break;
-      case 1:
-	res = std::cosh(x);
-	break;
-      }
-      return res;
-    }
+/** \brief Hyperbolic sine (and its derivatives). */
+template <typename Scalar>
+Scalar stem_function_sinh(Scalar x, int n)
+{
+  using std::cosh;
+  using std::sinh;
+  Scalar res;
+  
+  switch (n % 2) {
+  case 0:
+    res = std::sinh(x);
+    break;
+  case 1:
+    res = std::cosh(x);
+    break;
+  }
+  return res;
+}
 
-}; // end of class StdStemFunctions
+} // end namespace internal
 
 } // end namespace Eigen
 
diff --git a/unsupported/Eigen/src/MoreVectorization/CMakeLists.txt b/unsupported/Eigen/src/MoreVectorization/CMakeLists.txt
deleted file mode 100644
index 1b887cc8e..000000000
--- a/unsupported/Eigen/src/MoreVectorization/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_MoreVectorization_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_MoreVectorization_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/MoreVectorization COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/src/NonLinearOptimization/CMakeLists.txt b/unsupported/Eigen/src/NonLinearOptimization/CMakeLists.txt
deleted file mode 100644
index 9322ddadf..000000000
--- a/unsupported/Eigen/src/NonLinearOptimization/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_NonLinearOptimization_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_NonLinearOptimization_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/NonLinearOptimization COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h b/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h
index b8ba6ddcb..8fe3ed86b 100644
--- a/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h
+++ b/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h
@@ -150,7 +150,7 @@ HybridNonLinearSolver<FunctorType,Scalar>::solveInit(FVectorType  &x)
     fjac.resize(n, n);
     if (!useExternalScaling)
         diag.resize(n);
-    eigen_assert( (!useExternalScaling || diag.size()==n) || "When useExternalScaling is set, the caller must provide a valid 'diag'");
+    eigen_assert( (!useExternalScaling || diag.size()==n) && "When useExternalScaling is set, the caller must provide a valid 'diag'");
 
     /* Function Body */
     nfev = 0;
@@ -390,7 +390,7 @@ HybridNonLinearSolver<FunctorType,Scalar>::solveNumericalDiffInit(FVectorType  &
     fvec.resize(n);
     if (!useExternalScaling)
         diag.resize(n);
-    eigen_assert( (!useExternalScaling || diag.size()==n) || "When useExternalScaling is set, the caller must provide a valid 'diag'");
+    eigen_assert( (!useExternalScaling || diag.size()==n) && "When useExternalScaling is set, the caller must provide a valid 'diag'");
 
     /* Function Body */
     nfev = 0;
diff --git a/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h b/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h
index bfeb26fc9..fe3b79ca7 100644
--- a/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h
+++ b/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h
@@ -45,18 +45,24 @@ namespace LevenbergMarquardtSpace {
 template<typename FunctorType, typename Scalar=double>
 class LevenbergMarquardt
 {
+    static Scalar sqrt_epsilon()
+    {
+      using std::sqrt;
+      return sqrt(NumTraits<Scalar>::epsilon());
+    }
+    
 public:
     LevenbergMarquardt(FunctorType &_functor)
         : functor(_functor) { nfev = njev = iter = 0;  fnorm = gnorm = 0.; useExternalScaling=false; }
 
     typedef DenseIndex Index;
-
+    
     struct Parameters {
         Parameters()
             : factor(Scalar(100.))
             , maxfev(400)
-            , ftol(std::sqrt(NumTraits<Scalar>::epsilon()))
-            , xtol(std::sqrt(NumTraits<Scalar>::epsilon()))
+            , ftol(sqrt_epsilon())
+            , xtol(sqrt_epsilon())
             , gtol(Scalar(0.))
             , epsfcn(Scalar(0.)) {}
         Scalar factor;
@@ -72,7 +78,7 @@ public:
 
     LevenbergMarquardtSpace::Status lmder1(
             FVectorType &x,
-            const Scalar tol = std::sqrt(NumTraits<Scalar>::epsilon())
+            const Scalar tol = sqrt_epsilon()
             );
 
     LevenbergMarquardtSpace::Status minimize(FVectorType &x);
@@ -83,12 +89,12 @@ public:
             FunctorType &functor,
             FVectorType &x,
             Index *nfev,
-            const Scalar tol = std::sqrt(NumTraits<Scalar>::epsilon())
+            const Scalar tol = sqrt_epsilon()
             );
 
     LevenbergMarquardtSpace::Status lmstr1(
             FVectorType  &x,
-            const Scalar tol = std::sqrt(NumTraits<Scalar>::epsilon())
+            const Scalar tol = sqrt_epsilon()
             );
 
     LevenbergMarquardtSpace::Status minimizeOptimumStorage(FVectorType  &x);
@@ -109,6 +115,7 @@ public:
 
     Scalar lm_param(void) { return par; }
 private:
+    
     FunctorType &functor;
     Index n;
     Index m;
@@ -172,7 +179,7 @@ LevenbergMarquardt<FunctorType,Scalar>::minimizeInit(FVectorType  &x)
     fjac.resize(m, n);
     if (!useExternalScaling)
         diag.resize(n);
-    eigen_assert( (!useExternalScaling || diag.size()==n) || "When useExternalScaling is set, the caller must provide a valid 'diag'");
+    eigen_assert( (!useExternalScaling || diag.size()==n) && "When useExternalScaling is set, the caller must provide a valid 'diag'");
     qtf.resize(n);
 
     /* Function Body */
@@ -208,7 +215,7 @@ LevenbergMarquardt<FunctorType,Scalar>::minimizeOneStep(FVectorType  &x)
 {
     using std::abs;
     using std::sqrt;
-    
+
     eigen_assert(x.size()==n); // check the caller is not cheating us
 
     /* calculate the jacobian matrix. */
@@ -391,7 +398,7 @@ LevenbergMarquardt<FunctorType,Scalar>::minimizeOptimumStorageInit(FVectorType
     fjac.resize(n, n);
     if (!useExternalScaling)
         diag.resize(n);
-    eigen_assert( (!useExternalScaling || diag.size()==n) || "When useExternalScaling is set, the caller must provide a valid 'diag'");
+    eigen_assert( (!useExternalScaling || diag.size()==n) && "When useExternalScaling is set, the caller must provide a valid 'diag'");
     qtf.resize(n);
 
     /* Function Body */
diff --git a/unsupported/Eigen/src/NumericalDiff/CMakeLists.txt b/unsupported/Eigen/src/NumericalDiff/CMakeLists.txt
deleted file mode 100644
index 1199aca2f..000000000
--- a/unsupported/Eigen/src/NumericalDiff/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_NumericalDiff_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_NumericalDiff_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/NumericalDiff COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/src/Polynomials/CMakeLists.txt b/unsupported/Eigen/src/Polynomials/CMakeLists.txt
deleted file mode 100644
index 51f13f3cb..000000000
--- a/unsupported/Eigen/src/Polynomials/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_Polynomials_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_Polynomials_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/Polynomials COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/src/Polynomials/PolynomialSolver.h b/unsupported/Eigen/src/Polynomials/PolynomialSolver.h
index cd5c04bbf..03198ec8e 100644
--- a/unsupported/Eigen/src/Polynomials/PolynomialSolver.h
+++ b/unsupported/Eigen/src/Polynomials/PolynomialSolver.h
@@ -41,7 +41,7 @@ class PolynomialSolverBase
   protected:
     template< typename OtherPolynomial >
     inline void setPolynomial( const OtherPolynomial& poly ){
-      m_roots.resize(poly.size()); }
+      m_roots.resize(poly.size()-1); }
 
   public:
     template< typename OtherPolynomial >
@@ -316,7 +316,7 @@ class PolynomialSolverBase
   * - real roots with greatest, smallest absolute real value.
   * - greatest, smallest real roots.
   *
-  * WARNING: this polynomial solver is experimental, part of the unsuported Eigen modules.
+  * WARNING: this polynomial solver is experimental, part of the unsupported Eigen modules.
   *
   *
   * Currently a QR algorithm is used to compute the eigenvalues of the companion matrix of
@@ -345,10 +345,19 @@ class PolynomialSolver : public PolynomialSolverBase<_Scalar,_Deg>
     void compute( const OtherPolynomial& poly )
     {
       eigen_assert( Scalar(0) != poly[poly.size()-1] );
-      internal::companion<Scalar,_Deg> companion( poly );
-      companion.balance();
-      m_eigenSolver.compute( companion.denseMatrix() );
-      m_roots = m_eigenSolver.eigenvalues();
+      eigen_assert( poly.size() > 1 );
+      if(poly.size() >  2 )
+      {
+        internal::companion<Scalar,_Deg> companion( poly );
+        companion.balance();
+        m_eigenSolver.compute( companion.denseMatrix() );
+        m_roots = m_eigenSolver.eigenvalues();
+      }
+      else if(poly.size () == 2)
+      {
+        m_roots.resize(1);
+        m_roots[0] = -poly[0]/poly[1];
+      }
     }
 
   public:
@@ -376,10 +385,18 @@ class PolynomialSolver<_Scalar,1> : public PolynomialSolverBase<_Scalar,1>
     template< typename OtherPolynomial >
     void compute( const OtherPolynomial& poly )
     {
-      eigen_assert( Scalar(0) != poly[poly.size()-1] );
-      m_roots[0] = -poly[0]/poly[poly.size()-1];
+      eigen_assert( poly.size() == 2 );
+      eigen_assert( Scalar(0) != poly[1] );
+      m_roots[0] = -poly[0]/poly[1];
     }
 
+  public:
+    template< typename OtherPolynomial >
+    inline PolynomialSolver( const OtherPolynomial& poly ){
+      compute( poly ); }
+
+    inline PolynomialSolver(){}
+
   protected:
     using                   PS_Base::m_roots;
 };
diff --git a/unsupported/Eigen/src/Polynomials/PolynomialUtils.h b/unsupported/Eigen/src/Polynomials/PolynomialUtils.h
index 2bb8bc84a..40ba65b7e 100644
--- a/unsupported/Eigen/src/Polynomials/PolynomialUtils.h
+++ b/unsupported/Eigen/src/Polynomials/PolynomialUtils.h
@@ -56,7 +56,7 @@ T poly_eval( const Polynomials& poly, const T& x )
     for( DenseIndex i=1; i<poly.size(); ++i ){
       val = val*inv_x + poly[i]; }
 
-    return std::pow(x,(T)(poly.size()-1)) * val;
+    return numext::pow(x,(T)(poly.size()-1)) * val;
   }
 }
 
diff --git a/unsupported/Eigen/src/SVD/BDCSVD.h b/unsupported/Eigen/src/SVD/BDCSVD.h
deleted file mode 100644
index 11d4882e4..000000000
--- a/unsupported/Eigen/src/SVD/BDCSVD.h
+++ /dev/null
@@ -1,748 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-// 
-// We used the "A Divide-And-Conquer Algorithm for the Bidiagonal SVD"
-// research report written by Ming Gu and Stanley C.Eisenstat
-// The code variable names correspond to the names they used in their 
-// report
-//
-// Copyright (C) 2013 Gauthier Brun <brun.gauthier@gmail.com>
-// Copyright (C) 2013 Nicolas Carre <nicolas.carre@ensimag.fr>
-// Copyright (C) 2013 Jean Ceccato <jean.ceccato@ensimag.fr>
-// Copyright (C) 2013 Pierre Zoppitelli <pierre.zoppitelli@ensimag.fr>
-//
-// Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_BDCSVD_H
-#define EIGEN_BDCSVD_H
-
-#define EPSILON 0.0000000000000001
-
-#define ALGOSWAP 32
-
-namespace Eigen {
-/** \ingroup SVD_Module
- *
- *
- * \class BDCSVD
- *
- * \brief class Bidiagonal Divide and Conquer SVD
- *
- * \param MatrixType the type of the matrix of which we are computing the SVD decomposition
- * We plan to have a very similar interface to JacobiSVD on this class.
- * It should be used to speed up the calcul of SVD for big matrices. 
- */
-template<typename _MatrixType> 
-class BDCSVD : public SVDBase<_MatrixType>
-{
-  typedef SVDBase<_MatrixType> Base;
-    
-public:
-  using Base::rows;
-  using Base::cols;
-  
-  typedef _MatrixType MatrixType;
-  typedef typename MatrixType::Scalar Scalar;
-  typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
-  typedef typename MatrixType::Index Index;
-  enum {
-    RowsAtCompileTime = MatrixType::RowsAtCompileTime, 
-    ColsAtCompileTime = MatrixType::ColsAtCompileTime, 
-    DiagSizeAtCompileTime = EIGEN_SIZE_MIN_PREFER_DYNAMIC(RowsAtCompileTime, ColsAtCompileTime), 
-    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, 
-    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime, 
-    MaxDiagSizeAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(MaxRowsAtCompileTime, MaxColsAtCompileTime), 
-    MatrixOptions = MatrixType::Options
-  };
-
-  typedef Matrix<Scalar, RowsAtCompileTime, RowsAtCompileTime, 
-		 MatrixOptions, MaxRowsAtCompileTime, MaxRowsAtCompileTime>
-  MatrixUType;
-  typedef Matrix<Scalar, ColsAtCompileTime, ColsAtCompileTime, 
-		 MatrixOptions, MaxColsAtCompileTime, MaxColsAtCompileTime>
-  MatrixVType;
-  typedef typename internal::plain_diag_type<MatrixType, RealScalar>::type SingularValuesType;
-  typedef typename internal::plain_row_type<MatrixType>::type RowType;
-  typedef typename internal::plain_col_type<MatrixType>::type ColType;
-  typedef Matrix<Scalar, Dynamic, Dynamic> MatrixX;
-  typedef Matrix<RealScalar, Dynamic, Dynamic> MatrixXr;
-  typedef Matrix<RealScalar, Dynamic, 1> VectorType;
-
-  /** \brief Default Constructor.
-   *
-   * The default constructor is useful in cases in which the user intends to
-   * perform decompositions via BDCSVD::compute(const MatrixType&).
-   */
-  BDCSVD()
-    : SVDBase<_MatrixType>::SVDBase(), 
-      algoswap(ALGOSWAP)
-  {}
-
-
-  /** \brief Default Constructor with memory preallocation
-   *
-   * Like the default constructor but with preallocation of the internal data
-   * according to the specified problem size.
-   * \sa BDCSVD()
-   */
-  BDCSVD(Index rows, Index cols, unsigned int computationOptions = 0)
-    : SVDBase<_MatrixType>::SVDBase(), 
-      algoswap(ALGOSWAP)
-  {
-    allocate(rows, cols, computationOptions);
-  }
-
-  /** \brief Constructor performing the decomposition of given matrix.
-   *
-   * \param matrix the matrix to decompose
-   * \param computationOptions optional parameter allowing to specify if you want full or thin U or V unitaries to be computed.
-   *                           By default, none is computed. This is a bit - field, the possible bits are #ComputeFullU, #ComputeThinU, 
-   *                           #ComputeFullV, #ComputeThinV.
-   *
-   * Thin unitaries are only available if your matrix type has a Dynamic number of columns (for example MatrixXf). They also are not
-   * available with the (non - default) FullPivHouseholderQR preconditioner.
-   */
-  BDCSVD(const MatrixType& matrix, unsigned int computationOptions = 0)
-    : SVDBase<_MatrixType>::SVDBase(), 
-      algoswap(ALGOSWAP)
-  {
-    compute(matrix, computationOptions);
-  }
-
-  ~BDCSVD() 
-  {
-  }
-  /** \brief Method performing the decomposition of given matrix using custom options.
-   *
-   * \param matrix the matrix to decompose
-   * \param computationOptions optional parameter allowing to specify if you want full or thin U or V unitaries to be computed.
-   *                           By default, none is computed. This is a bit - field, the possible bits are #ComputeFullU, #ComputeThinU, 
-   *                           #ComputeFullV, #ComputeThinV.
-   *
-   * Thin unitaries are only available if your matrix type has a Dynamic number of columns (for example MatrixXf). They also are not
-   * available with the (non - default) FullPivHouseholderQR preconditioner.
-   */
-  SVDBase<MatrixType>& compute(const MatrixType& matrix, unsigned int computationOptions);
-
-  /** \brief Method performing the decomposition of given matrix using current options.
-   *
-   * \param matrix the matrix to decompose
-   *
-   * This method uses the current \a computationOptions, as already passed to the constructor or to compute(const MatrixType&, unsigned int).
-   */
-  SVDBase<MatrixType>& compute(const MatrixType& matrix)
-  {
-    return compute(matrix, this->m_computationOptions);
-  }
-
-  void setSwitchSize(int s) 
-  {
-    eigen_assert(s>3 && "BDCSVD the size of the algo switch has to be greater than 4");
-    algoswap = s;
-  }
-
-
-  /** \returns a (least squares) solution of \f$ A x = b \f$ using the current SVD decomposition of A.
-   *
-   * \param b the right - hand - side of the equation to solve.
-   *
-   * \note Solving requires both U and V to be computed. Thin U and V are enough, there is no need for full U or V.
-   *
-   * \note SVD solving is implicitly least - squares. Thus, this method serves both purposes of exact solving and least - squares solving.
-   * In other words, the returned solution is guaranteed to minimize the Euclidean norm \f$ \Vert A x - b \Vert \f$.
-   */
-  template<typename Rhs>
-  inline const internal::solve_retval<BDCSVD, Rhs>
-  solve(const MatrixBase<Rhs>& b) const
-  {
-    eigen_assert(this->m_isInitialized && "BDCSVD is not initialized.");
-    eigen_assert(SVDBase<_MatrixType>::computeU() && SVDBase<_MatrixType>::computeV() && 
-		 "BDCSVD::solve() requires both unitaries U and V to be computed (thin unitaries suffice).");
-    return internal::solve_retval<BDCSVD, Rhs>(*this, b.derived());
-  }
-
- 
-  const MatrixUType& matrixU() const
-  {
-    eigen_assert(this->m_isInitialized && "SVD is not initialized.");
-    if (isTranspose){
-      eigen_assert(this->computeV() && "This SVD decomposition didn't compute U. Did you ask for it?");
-      return this->m_matrixV;
-    }
-    else 
-    {
-      eigen_assert(this->computeU() && "This SVD decomposition didn't compute U. Did you ask for it?");
-      return this->m_matrixU;
-    }
-     
-  }
-
-
-  const MatrixVType& matrixV() const
-  {
-    eigen_assert(this->m_isInitialized && "SVD is not initialized.");
-    if (isTranspose){
-      eigen_assert(this->computeU() && "This SVD decomposition didn't compute V. Did you ask for it?");
-      return this->m_matrixU;
-    }
-    else
-    {
-      eigen_assert(this->computeV() && "This SVD decomposition didn't compute V. Did you ask for it?");
-      return this->m_matrixV;
-    }
-  }
- 
-private:
-  void allocate(Index rows, Index cols, unsigned int computationOptions);
-  void divide (Index firstCol, Index lastCol, Index firstRowW, 
-	       Index firstColW, Index shift);
-  void deflation43(Index firstCol, Index shift, Index i, Index size);
-  void deflation44(Index firstColu , Index firstColm, Index firstRowW, Index firstColW, Index i, Index j, Index size);
-  void deflation(Index firstCol, Index lastCol, Index k, Index firstRowW, Index firstColW, Index shift);
-  void copyUV(MatrixXr naiveU, MatrixXr naiveV, MatrixX householderU, MatrixX houseHolderV);
-
-protected:
-  MatrixXr m_naiveU, m_naiveV;
-  MatrixXr m_computed;
-  Index nRec;
-  int algoswap;
-  bool isTranspose, compU, compV;
-  
-}; //end class BDCSVD
-
-
-// Methode to allocate ans initialize matrix and attributs
-template<typename MatrixType>
-void BDCSVD<MatrixType>::allocate(Index rows, Index cols, unsigned int computationOptions)
-{
-  isTranspose = (cols > rows);
-  if (SVDBase<MatrixType>::allocate(rows, cols, computationOptions)) return;
-  m_computed = MatrixXr::Zero(this->m_diagSize + 1, this->m_diagSize );
-  if (isTranspose){
-    compU = this->computeU();
-    compV = this->computeV();    
-  } 
-  else
-  {
-    compV = this->computeU();
-    compU = this->computeV();   
-  }
-  if (compU) m_naiveU = MatrixXr::Zero(this->m_diagSize + 1, this->m_diagSize + 1 );
-  else m_naiveU = MatrixXr::Zero(2, this->m_diagSize + 1 );
-  
-  if (compV) m_naiveV = MatrixXr::Zero(this->m_diagSize, this->m_diagSize);
-  
-
-  //should be changed for a cleaner implementation
-  if (isTranspose){
-    bool aux;
-    if (this->computeU()||this->computeV()){
-      aux = this->m_computeFullU;
-      this->m_computeFullU = this->m_computeFullV;
-      this->m_computeFullV = aux;
-      aux = this->m_computeThinU;
-      this->m_computeThinU = this->m_computeThinV;
-      this->m_computeThinV = aux;
-    } 
-  }
-}// end allocate
-
-// Methode which compute the BDCSVD for the int
-template<>
-SVDBase<Matrix<int, Dynamic, Dynamic> >&
-BDCSVD<Matrix<int, Dynamic, Dynamic> >::compute(const MatrixType& matrix, unsigned int computationOptions) {
-  allocate(matrix.rows(), matrix.cols(), computationOptions);
-  this->m_nonzeroSingularValues = 0;
-  m_computed = Matrix<int, Dynamic, Dynamic>::Zero(rows(), cols());
-  for (int i=0; i<this->m_diagSize; i++)   {
-    this->m_singularValues.coeffRef(i) = 0;
-  }
-  if (this->m_computeFullU) this->m_matrixU = Matrix<int, Dynamic, Dynamic>::Zero(rows(), rows());
-  if (this->m_computeFullV) this->m_matrixV = Matrix<int, Dynamic, Dynamic>::Zero(cols(), cols()); 
-  this->m_isInitialized = true;
-  return *this;
-}
-
-
-// Methode which compute the BDCSVD
-template<typename MatrixType>
-SVDBase<MatrixType>&
-BDCSVD<MatrixType>::compute(const MatrixType& matrix, unsigned int computationOptions) 
-{
-  allocate(matrix.rows(), matrix.cols(), computationOptions);
-  using std::abs;
-
-  //**** step 1 Bidiagonalization  isTranspose = (matrix.cols()>matrix.rows()) ;
-  MatrixType copy;
-  if (isTranspose) copy = matrix.adjoint();
-  else copy = matrix;
-  
-  internal::UpperBidiagonalization<MatrixX > bid(copy);
-
-  //**** step 2 Divide
-  // this is ugly and has to be redone (care of complex cast)
-  MatrixXr temp;
-  temp = bid.bidiagonal().toDenseMatrix().transpose();
-  m_computed.setZero();
-  for (int i=0; i<this->m_diagSize - 1; i++)   {
-    m_computed(i, i) = temp(i, i);
-    m_computed(i + 1, i) = temp(i + 1, i);
-  }
-  m_computed(this->m_diagSize - 1, this->m_diagSize - 1) = temp(this->m_diagSize - 1, this->m_diagSize - 1);
-  divide(0, this->m_diagSize - 1, 0, 0, 0);
-
-  //**** step 3 copy
-  for (int i=0; i<this->m_diagSize; i++)   {
-    RealScalar a = abs(m_computed.coeff(i, i));
-    this->m_singularValues.coeffRef(i) = a;
-    if (a == 0){
-      this->m_nonzeroSingularValues = i;
-      break;
-    }
-    else  if (i == this->m_diagSize - 1)
-    {
-      this->m_nonzeroSingularValues = i + 1;
-      break;
-    }
-  }
-  copyUV(m_naiveV, m_naiveU, bid.householderU(), bid.householderV());
-  this->m_isInitialized = true;
-  return *this;
-}// end compute
-
-
-template<typename MatrixType>
-void BDCSVD<MatrixType>::copyUV(MatrixXr naiveU, MatrixXr naiveV, MatrixX householderU, MatrixX householderV){
-  if (this->computeU()){
-    MatrixX temp = MatrixX::Zero(naiveU.rows(), naiveU.cols());
-    temp.real() = naiveU;
-    if (this->m_computeThinU){
-      this->m_matrixU = MatrixX::Identity(householderU.cols(), this->m_nonzeroSingularValues );
-      this->m_matrixU.block(0, 0, this->m_diagSize, this->m_nonzeroSingularValues) = 
-	temp.block(0, 0, this->m_diagSize, this->m_nonzeroSingularValues);
-      this->m_matrixU = householderU * this->m_matrixU ;
-    }
-    else
-    {
-      this->m_matrixU = MatrixX::Identity(householderU.cols(), householderU.cols());
-      this->m_matrixU.block(0, 0, this->m_diagSize, this->m_diagSize) = temp.block(0, 0, this->m_diagSize, this->m_diagSize);
-      this->m_matrixU = householderU * this->m_matrixU ;
-    }
-  }
-  if (this->computeV()){
-    MatrixX temp = MatrixX::Zero(naiveV.rows(), naiveV.cols());
-    temp.real() = naiveV;
-    if (this->m_computeThinV){
-      this->m_matrixV = MatrixX::Identity(householderV.cols(),this->m_nonzeroSingularValues );
-      this->m_matrixV.block(0, 0, this->m_nonzeroSingularValues, this->m_nonzeroSingularValues) = 
-	temp.block(0, 0, this->m_nonzeroSingularValues, this->m_nonzeroSingularValues);
-      this->m_matrixV = householderV * this->m_matrixV ;
-    }
-    else  
-    {
-      this->m_matrixV = MatrixX::Identity(householderV.cols(), householderV.cols());
-      this->m_matrixV.block(0, 0, this->m_diagSize, this->m_diagSize) = temp.block(0, 0, this->m_diagSize, this->m_diagSize);
-      this->m_matrixV = householderV * this->m_matrixV;
-    }
-  }
-}
-
-// The divide algorithm is done "in place", we are always working on subsets of the same matrix. The divide methods takes as argument the 
-// place of the submatrix we are currently working on.
-
-//@param firstCol : The Index of the first column of the submatrix of m_computed and for m_naiveU;
-//@param lastCol : The Index of the last column of the submatrix of m_computed and for m_naiveU; 
-// lastCol + 1 - firstCol is the size of the submatrix.
-//@param firstRowW : The Index of the first row of the matrix W that we are to change. (see the reference paper section 1 for more information on W)
-//@param firstRowW : Same as firstRowW with the column.
-//@param shift : Each time one takes the left submatrix, one must add 1 to the shift. Why? Because! We actually want the last column of the U submatrix 
-// to become the first column (*coeff) and to shift all the other columns to the right. There are more details on the reference paper.
-template<typename MatrixType>
-void BDCSVD<MatrixType>::divide (Index firstCol, Index lastCol, Index firstRowW, 
-				 Index firstColW, Index shift)
-{
-  // requires nbRows = nbCols + 1;
-  using std::pow;
-  using std::sqrt;
-  using std::abs;
-  const Index n = lastCol - firstCol + 1;
-  const Index k = n/2;
-  RealScalar alphaK;
-  RealScalar betaK; 
-  RealScalar r0; 
-  RealScalar lambda, phi, c0, s0;
-  MatrixXr l, f;
-  // We use the other algorithm which is more efficient for small 
-  // matrices.
-  if (n < algoswap){
-    JacobiSVD<MatrixXr> b(m_computed.block(firstCol, firstCol, n + 1, n), 
-			  ComputeFullU | (ComputeFullV * compV)) ;
-    if (compU) m_naiveU.block(firstCol, firstCol, n + 1, n + 1).real() << b.matrixU();
-    else 
-    {
-      m_naiveU.row(0).segment(firstCol, n + 1).real() << b.matrixU().row(0);
-      m_naiveU.row(1).segment(firstCol, n + 1).real() << b.matrixU().row(n);
-    }
-    if (compV) m_naiveV.block(firstRowW, firstColW, n, n).real() << b.matrixV();
-    m_computed.block(firstCol + shift, firstCol + shift, n + 1, n).setZero();
-    for (int i=0; i<n; i++)
-    {
-      m_computed(firstCol + shift + i, firstCol + shift +i) = b.singularValues().coeffRef(i);
-    }
-    return;
-  }
-  // We use the divide and conquer algorithm
-  alphaK =  m_computed(firstCol + k, firstCol + k);
-  betaK = m_computed(firstCol + k + 1, firstCol + k);
-  // The divide must be done in that order in order to have good results. Divide change the data inside the submatrices
-  // and the divide of the right submatrice reads one column of the left submatrice. That's why we need to treat the 
-  // right submatrix before the left one. 
-  divide(k + 1 + firstCol, lastCol, k + 1 + firstRowW, k + 1 + firstColW, shift);
-  divide(firstCol, k - 1 + firstCol, firstRowW, firstColW + 1, shift + 1);
-  if (compU)
-  {
-    lambda = m_naiveU(firstCol + k, firstCol + k);
-    phi = m_naiveU(firstCol + k + 1, lastCol + 1);
-  } 
-  else 
-  {
-    lambda = m_naiveU(1, firstCol + k);
-    phi = m_naiveU(0, lastCol + 1);
-  }
-  r0 = sqrt((abs(alphaK * lambda) * abs(alphaK * lambda))
-	    + abs(betaK * phi) * abs(betaK * phi));
-  if (compU)
-  {
-    l = m_naiveU.row(firstCol + k).segment(firstCol, k);
-    f = m_naiveU.row(firstCol + k + 1).segment(firstCol + k + 1, n - k - 1);
-  } 
-  else 
-  {
-    l = m_naiveU.row(1).segment(firstCol, k);
-    f = m_naiveU.row(0).segment(firstCol + k + 1, n - k - 1);
-  }
-  if (compV) m_naiveV(firstRowW+k, firstColW) = 1;
-  if (r0 == 0)
-  {
-    c0 = 1;
-    s0 = 0;
-  }
-  else
-  {
-    c0 = alphaK * lambda / r0;
-    s0 = betaK * phi / r0;
-  }
-  if (compU)
-  {
-    MatrixXr q1 (m_naiveU.col(firstCol + k).segment(firstCol, k + 1));     
-    // we shiftW Q1 to the right
-    for (Index i = firstCol + k - 1; i >= firstCol; i--) 
-    {
-      m_naiveU.col(i + 1).segment(firstCol, k + 1) << m_naiveU.col(i).segment(firstCol, k + 1);
-    }
-    // we shift q1 at the left with a factor c0
-    m_naiveU.col(firstCol).segment( firstCol, k + 1) << (q1 * c0);
-    // last column = q1 * - s0
-    m_naiveU.col(lastCol + 1).segment(firstCol, k + 1) << (q1 * ( - s0));
-    // first column = q2 * s0
-    m_naiveU.col(firstCol).segment(firstCol + k + 1, n - k) << 
-      m_naiveU.col(lastCol + 1).segment(firstCol + k + 1, n - k) *s0; 
-    // q2 *= c0
-    m_naiveU.col(lastCol + 1).segment(firstCol + k + 1, n - k) *= c0; 
-  } 
-  else 
-  {
-    RealScalar q1 = (m_naiveU(0, firstCol + k));
-    // we shift Q1 to the right
-    for (Index i = firstCol + k - 1; i >= firstCol; i--) 
-    {
-      m_naiveU(0, i + 1) = m_naiveU(0, i);
-    }
-    // we shift q1 at the left with a factor c0
-    m_naiveU(0, firstCol) = (q1 * c0);
-    // last column = q1 * - s0
-    m_naiveU(0, lastCol + 1) = (q1 * ( - s0));
-    // first column = q2 * s0
-    m_naiveU(1, firstCol) = m_naiveU(1, lastCol + 1) *s0; 
-    // q2 *= c0
-    m_naiveU(1, lastCol + 1) *= c0;
-    m_naiveU.row(1).segment(firstCol + 1, k).setZero();
-    m_naiveU.row(0).segment(firstCol + k + 1, n - k - 1).setZero();
-  }
-  m_computed(firstCol + shift, firstCol + shift) = r0;
-  m_computed.col(firstCol + shift).segment(firstCol + shift + 1, k) << alphaK * l.transpose().real();
-  m_computed.col(firstCol + shift).segment(firstCol + shift + k + 1, n - k - 1) << betaK * f.transpose().real();
-
-
-  // the line below do the deflation of the matrix for the third part of the algorithm
-  // Here the deflation is commented because the third part of the algorithm is not implemented
-  // the third part of the algorithm is a fast SVD on the matrix m_computed which works thanks to the deflation
-
-  deflation(firstCol, lastCol, k, firstRowW, firstColW, shift);
-
-  // Third part of the algorithm, since the real third part of the algorithm is not implemeted we use a JacobiSVD
-  JacobiSVD<MatrixXr> res= JacobiSVD<MatrixXr>(m_computed.block(firstCol + shift, firstCol +shift, n + 1, n), 
-					       ComputeFullU | (ComputeFullV * compV)) ;
-  if (compU) m_naiveU.block(firstCol, firstCol, n + 1, n + 1) *= res.matrixU();
-  else m_naiveU.block(0, firstCol, 2, n + 1) *= res.matrixU();
-  
-  if (compV) m_naiveV.block(firstRowW, firstColW, n, n) *= res.matrixV();
-  m_computed.block(firstCol + shift, firstCol + shift, n, n) << MatrixXr::Zero(n, n);
-  for (int i=0; i<n; i++)
-    m_computed(firstCol + shift + i, firstCol + shift +i) = res.singularValues().coeffRef(i);
-  // end of the third part
-
-
-}// end divide
-
-
-// page 12_13
-// i >= 1, di almost null and zi non null.
-// We use a rotation to zero out zi applied to the left of M
-template <typename MatrixType>
-void BDCSVD<MatrixType>::deflation43(Index firstCol, Index shift, Index i, Index size){
-  using std::abs;
-  using std::sqrt;
-  using std::pow;
-  RealScalar c = m_computed(firstCol + shift, firstCol + shift);
-  RealScalar s = m_computed(i, firstCol + shift);
-  RealScalar r = sqrt(pow(abs(c), 2) + pow(abs(s), 2));
-  if (r == 0){
-    m_computed(i, i)=0;
-    return;
-  }
-  c/=r;
-  s/=r;
-  m_computed(firstCol + shift, firstCol + shift) = r;  
-  m_computed(i, firstCol + shift) = 0;
-  m_computed(i, i) = 0;
-  if (compU){
-    m_naiveU.col(firstCol).segment(firstCol,size) = 
-      c * m_naiveU.col(firstCol).segment(firstCol, size) - 
-      s * m_naiveU.col(i).segment(firstCol, size) ;
-
-    m_naiveU.col(i).segment(firstCol, size) = 
-      (c + s*s/c) * m_naiveU.col(i).segment(firstCol, size) + 
-      (s/c) * m_naiveU.col(firstCol).segment(firstCol,size);
-  }
-}// end deflation 43
-
-
-// page 13
-// i,j >= 1, i != j and |di - dj| < epsilon * norm2(M)
-// We apply two rotations to have zj = 0;
-template <typename MatrixType>
-void BDCSVD<MatrixType>::deflation44(Index firstColu , Index firstColm, Index firstRowW, Index firstColW, Index i, Index j, Index size){
-  using std::abs;
-  using std::sqrt;
-  using std::conj;
-  using std::pow;
-  RealScalar c = m_computed(firstColm, firstColm + j - 1);
-  RealScalar s = m_computed(firstColm, firstColm + i - 1);
-  RealScalar r = sqrt(pow(abs(c), 2) + pow(abs(s), 2));
-  if (r==0){
-    m_computed(firstColm + i, firstColm + i) = m_computed(firstColm + j, firstColm + j);
-    return;
-  }
-  c/=r;
-  s/=r;
-  m_computed(firstColm + i, firstColm) = r;  
-  m_computed(firstColm + i, firstColm + i) = m_computed(firstColm + j, firstColm + j);
-  m_computed(firstColm + j, firstColm) = 0;
-  if (compU){
-    m_naiveU.col(firstColu + i).segment(firstColu, size) = 
-      c * m_naiveU.col(firstColu + i).segment(firstColu, size) - 
-      s * m_naiveU.col(firstColu + j).segment(firstColu, size) ;
-
-    m_naiveU.col(firstColu + j).segment(firstColu, size) = 
-      (c + s*s/c) *  m_naiveU.col(firstColu + j).segment(firstColu, size) + 
-      (s/c) * m_naiveU.col(firstColu + i).segment(firstColu, size);
-  } 
-  if (compV){
-    m_naiveV.col(firstColW + i).segment(firstRowW, size - 1) = 
-      c * m_naiveV.col(firstColW + i).segment(firstRowW, size - 1) + 
-      s * m_naiveV.col(firstColW + j).segment(firstRowW, size - 1) ;
-
-    m_naiveV.col(firstColW + j).segment(firstRowW, size - 1)  = 
-      (c + s*s/c) * m_naiveV.col(firstColW + j).segment(firstRowW, size - 1) - 
-      (s/c) * m_naiveV.col(firstColW + i).segment(firstRowW, size - 1);
-  }
-}// end deflation 44
-
-
-
-template <typename MatrixType>
-void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index firstRowW, Index firstColW, Index shift){
-  //condition 4.1
-  RealScalar EPS = EPSILON * (std::max<RealScalar>(m_computed(firstCol + shift + 1, firstCol + shift + 1), m_computed(firstCol + k, firstCol + k)));
-  const Index length = lastCol + 1 - firstCol;
-  if (m_computed(firstCol + shift, firstCol + shift) < EPS){
-    m_computed(firstCol + shift, firstCol + shift) = EPS;
-  }
-  //condition 4.2
-  for (Index i=firstCol + shift + 1;i<=lastCol + shift;i++){
-    if (std::abs(m_computed(i, firstCol + shift)) < EPS){
-      m_computed(i, firstCol + shift) = 0;
-    }
-  }
-
-  //condition 4.3
-  for (Index i=firstCol + shift + 1;i<=lastCol + shift; i++){
-    if (m_computed(i, i) < EPS){
-      deflation43(firstCol, shift, i, length);
-    }
-  }
-
-  //condition 4.4
- 
-  Index i=firstCol + shift + 1, j=firstCol + shift + k + 1;
-  //we stock the final place of each line
-  Index *permutation = new Index[length];
-
-  for (Index p =1; p < length; p++) {
-    if (i> firstCol + shift + k){
-      permutation[p] = j;
-      j++;
-    } else if (j> lastCol + shift) 
-    {
-      permutation[p] = i;
-      i++;
-    }
-    else 
-    {
-      if (m_computed(i, i) < m_computed(j, j)){
-        permutation[p] = j;
-        j++;
-      } 
-      else
-      {
-        permutation[p] = i;
-        i++;
-      }
-    }
-  }
-  //we do the permutation
-  RealScalar aux;
-  //we stock the current index of each col
-  //and the column of each index
-  Index *realInd = new Index[length];
-  Index *realCol = new Index[length];
-  for (int pos = 0; pos< length; pos++){
-    realCol[pos] = pos + firstCol + shift;
-    realInd[pos] = pos;
-  }
-  const Index Zero = firstCol + shift;
-  VectorType temp;
-  for (int i = 1; i < length - 1; i++){
-    const Index I = i + Zero;
-    const Index realI = realInd[i];
-    const Index j  = permutation[length - i] - Zero;
-    const Index J = realCol[j];
-    
-    //diag displace
-    aux = m_computed(I, I); 
-    m_computed(I, I) = m_computed(J, J);
-    m_computed(J, J) = aux;
-    
-    //firstrow displace
-    aux = m_computed(I, Zero); 
-    m_computed(I, Zero) = m_computed(J, Zero);
-    m_computed(J, Zero) = aux;
-
-    // change columns
-    if (compU) {
-      temp = m_naiveU.col(I - shift).segment(firstCol, length + 1);
-      m_naiveU.col(I - shift).segment(firstCol, length + 1) << 
-        m_naiveU.col(J - shift).segment(firstCol, length + 1);
-      m_naiveU.col(J - shift).segment(firstCol, length + 1) << temp;
-    } 
-    else
-    {
-      temp = m_naiveU.col(I - shift).segment(0, 2);
-      m_naiveU.col(I - shift).segment(0, 2) << 
-        m_naiveU.col(J - shift).segment(0, 2);
-      m_naiveU.col(J - shift).segment(0, 2) << temp;      
-    }
-    if (compV) {
-      const Index CWI = I + firstColW - Zero;
-      const Index CWJ = J + firstColW - Zero;
-      temp = m_naiveV.col(CWI).segment(firstRowW, length);
-      m_naiveV.col(CWI).segment(firstRowW, length) << m_naiveV.col(CWJ).segment(firstRowW, length);
-      m_naiveV.col(CWJ).segment(firstRowW, length) << temp;
-    }
-
-    //update real pos
-    realCol[realI] = J;
-    realCol[j] = I;
-    realInd[J - Zero] = realI;
-    realInd[I - Zero] = j;
-  }
-  for (Index i = firstCol + shift + 1; i<lastCol + shift;i++){
-    if ((m_computed(i + 1, i + 1) - m_computed(i, i)) < EPS){
-      deflation44(firstCol , 
-		  firstCol + shift, 
-		  firstRowW, 
-		  firstColW, 
-		  i - Zero, 
-		  i + 1 - Zero, 
-		  length);
-    }
-  }
-  delete [] permutation;
-  delete [] realInd;
-  delete [] realCol;
-
-}//end deflation
-
-
-namespace internal{
-
-template<typename _MatrixType, typename Rhs>
-struct solve_retval<BDCSVD<_MatrixType>, Rhs>
-  : solve_retval_base<BDCSVD<_MatrixType>, Rhs>
-{
-  typedef BDCSVD<_MatrixType> BDCSVDType;
-  EIGEN_MAKE_SOLVE_HELPERS(BDCSVDType, Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    eigen_assert(rhs().rows() == dec().rows());
-    // A = U S V^*
-    // So A^{ - 1} = V S^{ - 1} U^*    
-    Index diagSize = (std::min)(dec().rows(), dec().cols());
-    typename BDCSVDType::SingularValuesType invertedSingVals(diagSize);
-    Index nonzeroSingVals = dec().nonzeroSingularValues();
-    invertedSingVals.head(nonzeroSingVals) = dec().singularValues().head(nonzeroSingVals).array().inverse();
-    invertedSingVals.tail(diagSize - nonzeroSingVals).setZero();
-    
-    dst = dec().matrixV().leftCols(diagSize)
-      * invertedSingVals.asDiagonal()
-      * dec().matrixU().leftCols(diagSize).adjoint()
-      * rhs();	
-    return;
-  }
-};
-
-} //end namespace internal
-
-  /** \svd_module
-   *
-   * \return the singular value decomposition of \c *this computed by 
-   *  BDC Algorithm
-   *
-   * \sa class BDCSVD
-   */
-/*
-template<typename Derived>
-BDCSVD<typename MatrixBase<Derived>::PlainObject>
-MatrixBase<Derived>::bdcSvd(unsigned int computationOptions) const
-{
-  return BDCSVD<PlainObject>(*this, computationOptions);
-}
-*/
-
-} // end namespace Eigen
-
-#endif
diff --git a/unsupported/Eigen/src/SVD/CMakeLists.txt b/unsupported/Eigen/src/SVD/CMakeLists.txt
deleted file mode 100644
index b40baf092..000000000
--- a/unsupported/Eigen/src/SVD/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_SVD_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_SVD_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}unsupported/Eigen/src/SVD COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/src/SVD/JacobiSVD.h b/unsupported/Eigen/src/SVD/JacobiSVD.h
deleted file mode 100644
index 02fac409e..000000000
--- a/unsupported/Eigen/src/SVD/JacobiSVD.h
+++ /dev/null
@@ -1,782 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2009-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_JACOBISVD_H
-#define EIGEN_JACOBISVD_H
-
-namespace Eigen { 
-
-namespace internal {
-// forward declaration (needed by ICC)
-// the empty body is required by MSVC
-template<typename MatrixType, int QRPreconditioner,
-         bool IsComplex = NumTraits<typename MatrixType::Scalar>::IsComplex>
-struct svd_precondition_2x2_block_to_be_real {};
-
-/*** QR preconditioners (R-SVD)
- ***
- *** Their role is to reduce the problem of computing the SVD to the case of a square matrix.
- *** This approach, known as R-SVD, is an optimization for rectangular-enough matrices, and is a requirement for
- *** JacobiSVD which by itself is only able to work on square matrices.
- ***/
-
-enum { PreconditionIfMoreColsThanRows, PreconditionIfMoreRowsThanCols };
-
-template<typename MatrixType, int QRPreconditioner, int Case>
-struct qr_preconditioner_should_do_anything
-{
-  enum { a = MatrixType::RowsAtCompileTime != Dynamic &&
-             MatrixType::ColsAtCompileTime != Dynamic &&
-             MatrixType::ColsAtCompileTime <= MatrixType::RowsAtCompileTime,
-         b = MatrixType::RowsAtCompileTime != Dynamic &&
-             MatrixType::ColsAtCompileTime != Dynamic &&
-             MatrixType::RowsAtCompileTime <= MatrixType::ColsAtCompileTime,
-         ret = !( (QRPreconditioner == NoQRPreconditioner) ||
-                  (Case == PreconditionIfMoreColsThanRows && bool(a)) ||
-                  (Case == PreconditionIfMoreRowsThanCols && bool(b)) )
-  };
-};
-
-template<typename MatrixType, int QRPreconditioner, int Case,
-         bool DoAnything = qr_preconditioner_should_do_anything<MatrixType, QRPreconditioner, Case>::ret
-> struct qr_preconditioner_impl {};
-
-template<typename MatrixType, int QRPreconditioner, int Case>
-class qr_preconditioner_impl<MatrixType, QRPreconditioner, Case, false>
-{
-public:
-  typedef typename MatrixType::Index Index;
-  void allocate(const JacobiSVD<MatrixType, QRPreconditioner>&) {}
-  bool run(JacobiSVD<MatrixType, QRPreconditioner>&, const MatrixType&)
-  {
-    return false;
-  }
-};
-
-/*** preconditioner using FullPivHouseholderQR ***/
-
-template<typename MatrixType>
-class qr_preconditioner_impl<MatrixType, FullPivHouseholderQRPreconditioner, PreconditionIfMoreRowsThanCols, true>
-{
-public:
-  typedef typename MatrixType::Index Index;
-  typedef typename MatrixType::Scalar Scalar;
-  enum
-  {
-    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime
-  };
-  typedef Matrix<Scalar, 1, RowsAtCompileTime, RowMajor, 1, MaxRowsAtCompileTime> WorkspaceType;
-
-  void allocate(const JacobiSVD<MatrixType, FullPivHouseholderQRPreconditioner>& svd)
-  {
-    if (svd.rows() != m_qr.rows() || svd.cols() != m_qr.cols())
-    {
-      m_qr.~QRType();
-      ::new (&m_qr) QRType(svd.rows(), svd.cols());
-    }
-    if (svd.m_computeFullU) m_workspace.resize(svd.rows());
-  }
-
-  bool run(JacobiSVD<MatrixType, FullPivHouseholderQRPreconditioner>& svd, const MatrixType& matrix)
-  {
-    if(matrix.rows() > matrix.cols())
-    {
-      m_qr.compute(matrix);
-      svd.m_workMatrix = m_qr.matrixQR().block(0,0,matrix.cols(),matrix.cols()).template triangularView<Upper>();
-      if(svd.m_computeFullU) m_qr.matrixQ().evalTo(svd.m_matrixU, m_workspace);
-      if(svd.computeV()) svd.m_matrixV = m_qr.colsPermutation();
-      return true;
-    }
-    return false;
-  }
-private:
-  typedef FullPivHouseholderQR<MatrixType> QRType;
-  QRType m_qr;
-  WorkspaceType m_workspace;
-};
-
-template<typename MatrixType>
-class qr_preconditioner_impl<MatrixType, FullPivHouseholderQRPreconditioner, PreconditionIfMoreColsThanRows, true>
-{
-public:
-  typedef typename MatrixType::Index Index;
-  typedef typename MatrixType::Scalar Scalar;
-  enum
-  {
-    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-    ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
-    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
-    Options = MatrixType::Options
-  };
-  typedef Matrix<Scalar, ColsAtCompileTime, RowsAtCompileTime, Options, MaxColsAtCompileTime, MaxRowsAtCompileTime>
-          TransposeTypeWithSameStorageOrder;
-
-  void allocate(const JacobiSVD<MatrixType, FullPivHouseholderQRPreconditioner>& svd)
-  {
-    if (svd.cols() != m_qr.rows() || svd.rows() != m_qr.cols())
-    {
-      m_qr.~QRType();
-      ::new (&m_qr) QRType(svd.cols(), svd.rows());
-    }
-    m_adjoint.resize(svd.cols(), svd.rows());
-    if (svd.m_computeFullV) m_workspace.resize(svd.cols());
-  }
-
-  bool run(JacobiSVD<MatrixType, FullPivHouseholderQRPreconditioner>& svd, const MatrixType& matrix)
-  {
-    if(matrix.cols() > matrix.rows())
-    {
-      m_adjoint = matrix.adjoint();
-      m_qr.compute(m_adjoint);
-      svd.m_workMatrix = m_qr.matrixQR().block(0,0,matrix.rows(),matrix.rows()).template triangularView<Upper>().adjoint();
-      if(svd.m_computeFullV) m_qr.matrixQ().evalTo(svd.m_matrixV, m_workspace);
-      if(svd.computeU()) svd.m_matrixU = m_qr.colsPermutation();
-      return true;
-    }
-    else return false;
-  }
-private:
-  typedef FullPivHouseholderQR<TransposeTypeWithSameStorageOrder> QRType;
-  QRType m_qr;
-  TransposeTypeWithSameStorageOrder m_adjoint;
-  typename internal::plain_row_type<MatrixType>::type m_workspace;
-};
-
-/*** preconditioner using ColPivHouseholderQR ***/
-
-template<typename MatrixType>
-class qr_preconditioner_impl<MatrixType, ColPivHouseholderQRPreconditioner, PreconditionIfMoreRowsThanCols, true>
-{
-public:
-  typedef typename MatrixType::Index Index;
-
-  void allocate(const JacobiSVD<MatrixType, ColPivHouseholderQRPreconditioner>& svd)
-  {
-    if (svd.rows() != m_qr.rows() || svd.cols() != m_qr.cols())
-    {
-      m_qr.~QRType();
-      ::new (&m_qr) QRType(svd.rows(), svd.cols());
-    }
-    if (svd.m_computeFullU) m_workspace.resize(svd.rows());
-    else if (svd.m_computeThinU) m_workspace.resize(svd.cols());
-  }
-
-  bool run(JacobiSVD<MatrixType, ColPivHouseholderQRPreconditioner>& svd, const MatrixType& matrix)
-  {
-    if(matrix.rows() > matrix.cols())
-    {
-      m_qr.compute(matrix);
-      svd.m_workMatrix = m_qr.matrixQR().block(0,0,matrix.cols(),matrix.cols()).template triangularView<Upper>();
-      if(svd.m_computeFullU) m_qr.householderQ().evalTo(svd.m_matrixU, m_workspace);
-      else if(svd.m_computeThinU)
-      {
-        svd.m_matrixU.setIdentity(matrix.rows(), matrix.cols());
-        m_qr.householderQ().applyThisOnTheLeft(svd.m_matrixU, m_workspace);
-      }
-      if(svd.computeV()) svd.m_matrixV = m_qr.colsPermutation();
-      return true;
-    }
-    return false;
-  }
-
-private:
-  typedef ColPivHouseholderQR<MatrixType> QRType;
-  QRType m_qr;
-  typename internal::plain_col_type<MatrixType>::type m_workspace;
-};
-
-template<typename MatrixType>
-class qr_preconditioner_impl<MatrixType, ColPivHouseholderQRPreconditioner, PreconditionIfMoreColsThanRows, true>
-{
-public:
-  typedef typename MatrixType::Index Index;
-  typedef typename MatrixType::Scalar Scalar;
-  enum
-  {
-    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-    ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
-    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
-    Options = MatrixType::Options
-  };
-
-  typedef Matrix<Scalar, ColsAtCompileTime, RowsAtCompileTime, Options, MaxColsAtCompileTime, MaxRowsAtCompileTime>
-          TransposeTypeWithSameStorageOrder;
-
-  void allocate(const JacobiSVD<MatrixType, ColPivHouseholderQRPreconditioner>& svd)
-  {
-    if (svd.cols() != m_qr.rows() || svd.rows() != m_qr.cols())
-    {
-      m_qr.~QRType();
-      ::new (&m_qr) QRType(svd.cols(), svd.rows());
-    }
-    if (svd.m_computeFullV) m_workspace.resize(svd.cols());
-    else if (svd.m_computeThinV) m_workspace.resize(svd.rows());
-    m_adjoint.resize(svd.cols(), svd.rows());
-  }
-
-  bool run(JacobiSVD<MatrixType, ColPivHouseholderQRPreconditioner>& svd, const MatrixType& matrix)
-  {
-    if(matrix.cols() > matrix.rows())
-    {
-      m_adjoint = matrix.adjoint();
-      m_qr.compute(m_adjoint);
-
-      svd.m_workMatrix = m_qr.matrixQR().block(0,0,matrix.rows(),matrix.rows()).template triangularView<Upper>().adjoint();
-      if(svd.m_computeFullV) m_qr.householderQ().evalTo(svd.m_matrixV, m_workspace);
-      else if(svd.m_computeThinV)
-      {
-        svd.m_matrixV.setIdentity(matrix.cols(), matrix.rows());
-        m_qr.householderQ().applyThisOnTheLeft(svd.m_matrixV, m_workspace);
-      }
-      if(svd.computeU()) svd.m_matrixU = m_qr.colsPermutation();
-      return true;
-    }
-    else return false;
-  }
-
-private:
-  typedef ColPivHouseholderQR<TransposeTypeWithSameStorageOrder> QRType;
-  QRType m_qr;
-  TransposeTypeWithSameStorageOrder m_adjoint;
-  typename internal::plain_row_type<MatrixType>::type m_workspace;
-};
-
-/*** preconditioner using HouseholderQR ***/
-
-template<typename MatrixType>
-class qr_preconditioner_impl<MatrixType, HouseholderQRPreconditioner, PreconditionIfMoreRowsThanCols, true>
-{
-public:
-  typedef typename MatrixType::Index Index;
-
-  void allocate(const JacobiSVD<MatrixType, HouseholderQRPreconditioner>& svd)
-  {
-    if (svd.rows() != m_qr.rows() || svd.cols() != m_qr.cols())
-    {
-      m_qr.~QRType();
-      ::new (&m_qr) QRType(svd.rows(), svd.cols());
-    }
-    if (svd.m_computeFullU) m_workspace.resize(svd.rows());
-    else if (svd.m_computeThinU) m_workspace.resize(svd.cols());
-  }
-
-  bool run(JacobiSVD<MatrixType, HouseholderQRPreconditioner>& svd, const MatrixType& matrix)
-  {
-    if(matrix.rows() > matrix.cols())
-    {
-      m_qr.compute(matrix);
-      svd.m_workMatrix = m_qr.matrixQR().block(0,0,matrix.cols(),matrix.cols()).template triangularView<Upper>();
-      if(svd.m_computeFullU) m_qr.householderQ().evalTo(svd.m_matrixU, m_workspace);
-      else if(svd.m_computeThinU)
-      {
-        svd.m_matrixU.setIdentity(matrix.rows(), matrix.cols());
-        m_qr.householderQ().applyThisOnTheLeft(svd.m_matrixU, m_workspace);
-      }
-      if(svd.computeV()) svd.m_matrixV.setIdentity(matrix.cols(), matrix.cols());
-      return true;
-    }
-    return false;
-  }
-private:
-  typedef HouseholderQR<MatrixType> QRType;
-  QRType m_qr;
-  typename internal::plain_col_type<MatrixType>::type m_workspace;
-};
-
-template<typename MatrixType>
-class qr_preconditioner_impl<MatrixType, HouseholderQRPreconditioner, PreconditionIfMoreColsThanRows, true>
-{
-public:
-  typedef typename MatrixType::Index Index;
-  typedef typename MatrixType::Scalar Scalar;
-  enum
-  {
-    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-    ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
-    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
-    Options = MatrixType::Options
-  };
-
-  typedef Matrix<Scalar, ColsAtCompileTime, RowsAtCompileTime, Options, MaxColsAtCompileTime, MaxRowsAtCompileTime>
-          TransposeTypeWithSameStorageOrder;
-
-  void allocate(const JacobiSVD<MatrixType, HouseholderQRPreconditioner>& svd)
-  {
-    if (svd.cols() != m_qr.rows() || svd.rows() != m_qr.cols())
-    {
-      m_qr.~QRType();
-      ::new (&m_qr) QRType(svd.cols(), svd.rows());
-    }
-    if (svd.m_computeFullV) m_workspace.resize(svd.cols());
-    else if (svd.m_computeThinV) m_workspace.resize(svd.rows());
-    m_adjoint.resize(svd.cols(), svd.rows());
-  }
-
-  bool run(JacobiSVD<MatrixType, HouseholderQRPreconditioner>& svd, const MatrixType& matrix)
-  {
-    if(matrix.cols() > matrix.rows())
-    {
-      m_adjoint = matrix.adjoint();
-      m_qr.compute(m_adjoint);
-
-      svd.m_workMatrix = m_qr.matrixQR().block(0,0,matrix.rows(),matrix.rows()).template triangularView<Upper>().adjoint();
-      if(svd.m_computeFullV) m_qr.householderQ().evalTo(svd.m_matrixV, m_workspace);
-      else if(svd.m_computeThinV)
-      {
-        svd.m_matrixV.setIdentity(matrix.cols(), matrix.rows());
-        m_qr.householderQ().applyThisOnTheLeft(svd.m_matrixV, m_workspace);
-      }
-      if(svd.computeU()) svd.m_matrixU.setIdentity(matrix.rows(), matrix.rows());
-      return true;
-    }
-    else return false;
-  }
-
-private:
-  typedef HouseholderQR<TransposeTypeWithSameStorageOrder> QRType;
-  QRType m_qr;
-  TransposeTypeWithSameStorageOrder m_adjoint;
-  typename internal::plain_row_type<MatrixType>::type m_workspace;
-};
-
-/*** 2x2 SVD implementation
- ***
- *** JacobiSVD consists in performing a series of 2x2 SVD subproblems
- ***/
-
-template<typename MatrixType, int QRPreconditioner>
-struct svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner, false>
-{
-  typedef JacobiSVD<MatrixType, QRPreconditioner> SVD;
-  typedef typename SVD::Index Index;
-  static void run(typename SVD::WorkMatrixType&, SVD&, Index, Index) {}
-};
-
-template<typename MatrixType, int QRPreconditioner>
-struct svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner, true>
-{
-  typedef JacobiSVD<MatrixType, QRPreconditioner> SVD;
-  typedef typename MatrixType::Scalar Scalar;
-  typedef typename MatrixType::RealScalar RealScalar;
-  typedef typename SVD::Index Index;
-  static void run(typename SVD::WorkMatrixType& work_matrix, SVD& svd, Index p, Index q)
-  {
-    using std::sqrt;
-    Scalar z;
-    JacobiRotation<Scalar> rot;
-    RealScalar n = sqrt(numext::abs2(work_matrix.coeff(p,p)) + numext::abs2(work_matrix.coeff(q,p)));
-    if(n==0)
-    {
-      z = abs(work_matrix.coeff(p,q)) / work_matrix.coeff(p,q);
-      work_matrix.row(p) *= z;
-      if(svd.computeU()) svd.m_matrixU.col(p) *= conj(z);
-      z = abs(work_matrix.coeff(q,q)) / work_matrix.coeff(q,q);
-      work_matrix.row(q) *= z;
-      if(svd.computeU()) svd.m_matrixU.col(q) *= conj(z);
-    }
-    else
-    {
-      rot.c() = conj(work_matrix.coeff(p,p)) / n;
-      rot.s() = work_matrix.coeff(q,p) / n;
-      work_matrix.applyOnTheLeft(p,q,rot);
-      if(svd.computeU()) svd.m_matrixU.applyOnTheRight(p,q,rot.adjoint());
-      if(work_matrix.coeff(p,q) != Scalar(0))
-      {
-        Scalar z = abs(work_matrix.coeff(p,q)) / work_matrix.coeff(p,q);
-        work_matrix.col(q) *= z;
-        if(svd.computeV()) svd.m_matrixV.col(q) *= z;
-      }
-      if(work_matrix.coeff(q,q) != Scalar(0))
-      {
-        z = abs(work_matrix.coeff(q,q)) / work_matrix.coeff(q,q);
-        work_matrix.row(q) *= z;
-        if(svd.computeU()) svd.m_matrixU.col(q) *= conj(z);
-      }
-    }
-  }
-};
-
-template<typename MatrixType, typename RealScalar, typename Index>
-void real_2x2_jacobi_svd(const MatrixType& matrix, Index p, Index q,
-                            JacobiRotation<RealScalar> *j_left,
-                            JacobiRotation<RealScalar> *j_right)
-{
-  using std::sqrt;
-  Matrix<RealScalar,2,2> m;
-  m << numext::real(matrix.coeff(p,p)), numext::real(matrix.coeff(p,q)),
-       numext::real(matrix.coeff(q,p)), numext::real(matrix.coeff(q,q));
-  JacobiRotation<RealScalar> rot1;
-  RealScalar t = m.coeff(0,0) + m.coeff(1,1);
-  RealScalar d = m.coeff(1,0) - m.coeff(0,1);
-  if(t == RealScalar(0))
-  {
-    rot1.c() = RealScalar(0);
-    rot1.s() = d > RealScalar(0) ? RealScalar(1) : RealScalar(-1);
-  }
-  else
-  {
-    RealScalar u = d / t;
-    rot1.c() = RealScalar(1) / sqrt(RealScalar(1) + numext::abs2(u));
-    rot1.s() = rot1.c() * u;
-  }
-  m.applyOnTheLeft(0,1,rot1);
-  j_right->makeJacobi(m,0,1);
-  *j_left  = rot1 * j_right->transpose();
-}
-
-} // end namespace internal
-
-/** \ingroup SVD_Module
-  *
-  *
-  * \class JacobiSVD
-  *
-  * \brief Two-sided Jacobi SVD decomposition of a rectangular matrix
-  *
-  * \param MatrixType the type of the matrix of which we are computing the SVD decomposition
-  * \param QRPreconditioner this optional parameter allows to specify the type of QR decomposition that will be used internally
-  *                        for the R-SVD step for non-square matrices. See discussion of possible values below.
-  *
-  * SVD decomposition consists in decomposing any n-by-p matrix \a A as a product
-  *   \f[ A = U S V^* \f]
-  * where \a U is a n-by-n unitary, \a V is a p-by-p unitary, and \a S is a n-by-p real positive matrix which is zero outside of its main diagonal;
-  * the diagonal entries of S are known as the \em singular \em values of \a A and the columns of \a U and \a V are known as the left
-  * and right \em singular \em vectors of \a A respectively.
-  *
-  * Singular values are always sorted in decreasing order.
-  *
-  * This JacobiSVD decomposition computes only the singular values by default. If you want \a U or \a V, you need to ask for them explicitly.
-  *
-  * You can ask for only \em thin \a U or \a V to be computed, meaning the following. In case of a rectangular n-by-p matrix, letting \a m be the
-  * smaller value among \a n and \a p, there are only \a m singular vectors; the remaining columns of \a U and \a V do not correspond to actual
-  * singular vectors. Asking for \em thin \a U or \a V means asking for only their \a m first columns to be formed. So \a U is then a n-by-m matrix,
-  * and \a V is then a p-by-m matrix. Notice that thin \a U and \a V are all you need for (least squares) solving.
-  *
-  * Here's an example demonstrating basic usage:
-  * \include JacobiSVD_basic.cpp
-  * Output: \verbinclude JacobiSVD_basic.out
-  *
-  * This JacobiSVD class is a two-sided Jacobi R-SVD decomposition, ensuring optimal reliability and accuracy. The downside is that it's slower than
-  * bidiagonalizing SVD algorithms for large square matrices; however its complexity is still \f$ O(n^2p) \f$ where \a n is the smaller dimension and
-  * \a p is the greater dimension, meaning that it is still of the same order of complexity as the faster bidiagonalizing R-SVD algorithms.
-  * In particular, like any R-SVD, it takes advantage of non-squareness in that its complexity is only linear in the greater dimension.
-  *
-  * If the input matrix has inf or nan coefficients, the result of the computation is undefined, but the computation is guaranteed to
-  * terminate in finite (and reasonable) time.
-  *
-  * The possible values for QRPreconditioner are:
-  * \li ColPivHouseholderQRPreconditioner is the default. In practice it's very safe. It uses column-pivoting QR.
-  * \li FullPivHouseholderQRPreconditioner, is the safest and slowest. It uses full-pivoting QR.
-  *     Contrary to other QRs, it doesn't allow computing thin unitaries.
-  * \li HouseholderQRPreconditioner is the fastest, and less safe and accurate than the pivoting variants. It uses non-pivoting QR.
-  *     This is very similar in safety and accuracy to the bidiagonalization process used by bidiagonalizing SVD algorithms (since bidiagonalization
-  *     is inherently non-pivoting). However the resulting SVD is still more reliable than bidiagonalizing SVDs because the Jacobi-based iterarive
-  *     process is more reliable than the optimized bidiagonal SVD iterations.
-  * \li NoQRPreconditioner allows not to use a QR preconditioner at all. This is useful if you know that you will only be computing
-  *     JacobiSVD decompositions of square matrices. Non-square matrices require a QR preconditioner. Using this option will result in
-  *     faster compilation and smaller executable code. It won't significantly speed up computation, since JacobiSVD is always checking
-  *     if QR preconditioning is needed before applying it anyway.
-  *
-  * \sa MatrixBase::jacobiSvd()
-  */
-template<typename _MatrixType, int QRPreconditioner> 
-class JacobiSVD : public SVDBase<_MatrixType>
-{
-  public:
-
-    typedef _MatrixType MatrixType;
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
-    typedef typename MatrixType::Index Index;
-    enum {
-      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-      DiagSizeAtCompileTime = EIGEN_SIZE_MIN_PREFER_DYNAMIC(RowsAtCompileTime,ColsAtCompileTime),
-      MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
-      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
-      MaxDiagSizeAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(MaxRowsAtCompileTime,MaxColsAtCompileTime),
-      MatrixOptions = MatrixType::Options
-    };
-
-    typedef Matrix<Scalar, RowsAtCompileTime, RowsAtCompileTime,
-                   MatrixOptions, MaxRowsAtCompileTime, MaxRowsAtCompileTime>
-            MatrixUType;
-    typedef Matrix<Scalar, ColsAtCompileTime, ColsAtCompileTime,
-                   MatrixOptions, MaxColsAtCompileTime, MaxColsAtCompileTime>
-            MatrixVType;
-    typedef typename internal::plain_diag_type<MatrixType, RealScalar>::type SingularValuesType;
-    typedef typename internal::plain_row_type<MatrixType>::type RowType;
-    typedef typename internal::plain_col_type<MatrixType>::type ColType;
-    typedef Matrix<Scalar, DiagSizeAtCompileTime, DiagSizeAtCompileTime,
-                   MatrixOptions, MaxDiagSizeAtCompileTime, MaxDiagSizeAtCompileTime>
-            WorkMatrixType;
-
-    /** \brief Default Constructor.
-      *
-      * The default constructor is useful in cases in which the user intends to
-      * perform decompositions via JacobiSVD::compute(const MatrixType&).
-      */
-    JacobiSVD()
-      : SVDBase<_MatrixType>::SVDBase()
-    {}
-
-
-    /** \brief Default Constructor with memory preallocation
-      *
-      * Like the default constructor but with preallocation of the internal data
-      * according to the specified problem size.
-      * \sa JacobiSVD()
-      */
-    JacobiSVD(Index rows, Index cols, unsigned int computationOptions = 0)
-      : SVDBase<_MatrixType>::SVDBase() 
-    {
-      allocate(rows, cols, computationOptions);
-    }
-
-    /** \brief Constructor performing the decomposition of given matrix.
-     *
-     * \param matrix the matrix to decompose
-     * \param computationOptions optional parameter allowing to specify if you want full or thin U or V unitaries to be computed.
-     *                           By default, none is computed. This is a bit-field, the possible bits are #ComputeFullU, #ComputeThinU,
-     *                           #ComputeFullV, #ComputeThinV.
-     *
-     * Thin unitaries are only available if your matrix type has a Dynamic number of columns (for example MatrixXf). They also are not
-     * available with the (non-default) FullPivHouseholderQR preconditioner.
-     */
-    JacobiSVD(const MatrixType& matrix, unsigned int computationOptions = 0)
-      : SVDBase<_MatrixType>::SVDBase()
-    {
-      compute(matrix, computationOptions);
-    }
-
-    /** \brief Method performing the decomposition of given matrix using custom options.
-     *
-     * \param matrix the matrix to decompose
-     * \param computationOptions optional parameter allowing to specify if you want full or thin U or V unitaries to be computed.
-     *                           By default, none is computed. This is a bit-field, the possible bits are #ComputeFullU, #ComputeThinU,
-     *                           #ComputeFullV, #ComputeThinV.
-     *
-     * Thin unitaries are only available if your matrix type has a Dynamic number of columns (for example MatrixXf). They also are not
-     * available with the (non-default) FullPivHouseholderQR preconditioner.
-     */
-    SVDBase<MatrixType>& compute(const MatrixType& matrix, unsigned int computationOptions);
-
-    /** \brief Method performing the decomposition of given matrix using current options.
-     *
-     * \param matrix the matrix to decompose
-     *
-     * This method uses the current \a computationOptions, as already passed to the constructor or to compute(const MatrixType&, unsigned int).
-     */
-    SVDBase<MatrixType>& compute(const MatrixType& matrix)
-    {
-      return compute(matrix, this->m_computationOptions);
-    }
-    
-    /** \returns a (least squares) solution of \f$ A x = b \f$ using the current SVD decomposition of A.
-      *
-      * \param b the right-hand-side of the equation to solve.
-      *
-      * \note Solving requires both U and V to be computed. Thin U and V are enough, there is no need for full U or V.
-      *
-      * \note SVD solving is implicitly least-squares. Thus, this method serves both purposes of exact solving and least-squares solving.
-      * In other words, the returned solution is guaranteed to minimize the Euclidean norm \f$ \Vert A x - b \Vert \f$.
-      */
-    template<typename Rhs>
-    inline const internal::solve_retval<JacobiSVD, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(this->m_isInitialized && "JacobiSVD is not initialized.");
-      eigen_assert(SVDBase<MatrixType>::computeU() && SVDBase<MatrixType>::computeV() && "JacobiSVD::solve() requires both unitaries U and V to be computed (thin unitaries suffice).");
-      return internal::solve_retval<JacobiSVD, Rhs>(*this, b.derived());
-    }
-
-    
-
-  private:
-    void allocate(Index rows, Index cols, unsigned int computationOptions);
-
-  protected:
-    WorkMatrixType m_workMatrix;
-   
-    template<typename __MatrixType, int _QRPreconditioner, bool _IsComplex>
-    friend struct internal::svd_precondition_2x2_block_to_be_real;
-    template<typename __MatrixType, int _QRPreconditioner, int _Case, bool _DoAnything>
-    friend struct internal::qr_preconditioner_impl;
-
-    internal::qr_preconditioner_impl<MatrixType, QRPreconditioner, internal::PreconditionIfMoreColsThanRows> m_qr_precond_morecols;
-    internal::qr_preconditioner_impl<MatrixType, QRPreconditioner, internal::PreconditionIfMoreRowsThanCols> m_qr_precond_morerows;
-};
-
-template<typename MatrixType, int QRPreconditioner>
-void JacobiSVD<MatrixType, QRPreconditioner>::allocate(Index rows, Index cols, unsigned int computationOptions)
-{
-  if (SVDBase<MatrixType>::allocate(rows, cols, computationOptions)) return;
-
-  if (QRPreconditioner == FullPivHouseholderQRPreconditioner)
-  {
-      eigen_assert(!(this->m_computeThinU || this->m_computeThinV) &&
-              "JacobiSVD: can't compute thin U or thin V with the FullPivHouseholderQR preconditioner. "
-              "Use the ColPivHouseholderQR preconditioner instead.");
-  }
-
-  m_workMatrix.resize(this->m_diagSize, this->m_diagSize);
-  
-  if(this->m_cols>this->m_rows) m_qr_precond_morecols.allocate(*this);
-  if(this->m_rows>this->m_cols) m_qr_precond_morerows.allocate(*this);
-}
-
-template<typename MatrixType, int QRPreconditioner>
-SVDBase<MatrixType>&
-JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsigned int computationOptions)
-{
-  using std::abs;
-  allocate(matrix.rows(), matrix.cols(), computationOptions);
-
-  // currently we stop when we reach precision 2*epsilon as the last bit of precision can require an unreasonable number of iterations,
-  // only worsening the precision of U and V as we accumulate more rotations
-  const RealScalar precision = RealScalar(2) * NumTraits<Scalar>::epsilon();
-
-  // limit for very small denormal numbers to be considered zero in order to avoid infinite loops (see bug 286)
-  const RealScalar considerAsZero = RealScalar(2) * std::numeric_limits<RealScalar>::denorm_min();
-
-  /*** step 1. The R-SVD step: we use a QR decomposition to reduce to the case of a square matrix */
-
-  if(!m_qr_precond_morecols.run(*this, matrix) && !m_qr_precond_morerows.run(*this, matrix))
-  {
-    m_workMatrix = matrix.block(0,0,this->m_diagSize,this->m_diagSize);
-    if(this->m_computeFullU) this->m_matrixU.setIdentity(this->m_rows,this->m_rows);
-    if(this->m_computeThinU) this->m_matrixU.setIdentity(this->m_rows,this->m_diagSize);
-    if(this->m_computeFullV) this->m_matrixV.setIdentity(this->m_cols,this->m_cols);
-    if(this->m_computeThinV) this->m_matrixV.setIdentity(this->m_cols, this->m_diagSize);
-  }
-
-  /*** step 2. The main Jacobi SVD iteration. ***/
-
-  bool finished = false;
-  while(!finished)
-  {
-    finished = true;
-
-    // do a sweep: for all index pairs (p,q), perform SVD of the corresponding 2x2 sub-matrix
-
-    for(Index p = 1; p < this->m_diagSize; ++p)
-    {
-      for(Index q = 0; q < p; ++q)
-      {
-        // if this 2x2 sub-matrix is not diagonal already...
-        // notice that this comparison will evaluate to false if any NaN is involved, ensuring that NaN's don't
-        // keep us iterating forever. Similarly, small denormal numbers are considered zero.
-        using std::max;
-        RealScalar threshold = (max)(considerAsZero, precision * (max)(abs(m_workMatrix.coeff(p,p)),
-                                                                       abs(m_workMatrix.coeff(q,q))));
-        if((max)(abs(m_workMatrix.coeff(p,q)),abs(m_workMatrix.coeff(q,p))) > threshold)
-        {
-          finished = false;
-
-          // perform SVD decomposition of 2x2 sub-matrix corresponding to indices p,q to make it diagonal
-          internal::svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner>::run(m_workMatrix, *this, p, q);
-          JacobiRotation<RealScalar> j_left, j_right;
-          internal::real_2x2_jacobi_svd(m_workMatrix, p, q, &j_left, &j_right);
-
-          // accumulate resulting Jacobi rotations
-          m_workMatrix.applyOnTheLeft(p,q,j_left);
-          if(SVDBase<MatrixType>::computeU()) this->m_matrixU.applyOnTheRight(p,q,j_left.transpose());
-
-          m_workMatrix.applyOnTheRight(p,q,j_right);
-          if(SVDBase<MatrixType>::computeV()) this->m_matrixV.applyOnTheRight(p,q,j_right);
-        }
-      }
-    }
-  }
-
-  /*** step 3. The work matrix is now diagonal, so ensure it's positive so its diagonal entries are the singular values ***/
-
-  for(Index i = 0; i < this->m_diagSize; ++i)
-  {
-    RealScalar a = abs(m_workMatrix.coeff(i,i));
-    this->m_singularValues.coeffRef(i) = a;
-    if(SVDBase<MatrixType>::computeU() && (a!=RealScalar(0))) this->m_matrixU.col(i) *= this->m_workMatrix.coeff(i,i)/a;
-  }
-
-  /*** step 4. Sort singular values in descending order and compute the number of nonzero singular values ***/
-
-  this->m_nonzeroSingularValues = this->m_diagSize;
-  for(Index i = 0; i < this->m_diagSize; i++)
-  {
-    Index pos;
-    RealScalar maxRemainingSingularValue = this->m_singularValues.tail(this->m_diagSize-i).maxCoeff(&pos);
-    if(maxRemainingSingularValue == RealScalar(0))
-    {
-      this->m_nonzeroSingularValues = i;
-      break;
-    }
-    if(pos)
-    {
-      pos += i;
-      std::swap(this->m_singularValues.coeffRef(i), this->m_singularValues.coeffRef(pos));
-      if(SVDBase<MatrixType>::computeU()) this->m_matrixU.col(pos).swap(this->m_matrixU.col(i));
-      if(SVDBase<MatrixType>::computeV()) this->m_matrixV.col(pos).swap(this->m_matrixV.col(i));
-    }
-  }
-
-  this->m_isInitialized = true;
-  return *this;
-}
-
-namespace internal {
-template<typename _MatrixType, int QRPreconditioner, typename Rhs>
-struct solve_retval<JacobiSVD<_MatrixType, QRPreconditioner>, Rhs>
-  : solve_retval_base<JacobiSVD<_MatrixType, QRPreconditioner>, Rhs>
-{
-  typedef JacobiSVD<_MatrixType, QRPreconditioner> JacobiSVDType;
-  EIGEN_MAKE_SOLVE_HELPERS(JacobiSVDType,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    eigen_assert(rhs().rows() == dec().rows());
-
-    // A = U S V^*
-    // So A^{-1} = V S^{-1} U^*
-
-    Index diagSize = (std::min)(dec().rows(), dec().cols());
-    typename JacobiSVDType::SingularValuesType invertedSingVals(diagSize);
-
-    Index nonzeroSingVals = dec().nonzeroSingularValues();
-    invertedSingVals.head(nonzeroSingVals) = dec().singularValues().head(nonzeroSingVals).array().inverse();
-    invertedSingVals.tail(diagSize - nonzeroSingVals).setZero();
-
-    dst = dec().matrixV().leftCols(diagSize)
-        * invertedSingVals.asDiagonal()
-        * dec().matrixU().leftCols(diagSize).adjoint()
-        * rhs();
-  }
-};
-} // end namespace internal
-
-/** \svd_module
-  *
-  * \return the singular value decomposition of \c *this computed by two-sided
-  * Jacobi transformations.
-  *
-  * \sa class JacobiSVD
-  */
-template<typename Derived>
-JacobiSVD<typename MatrixBase<Derived>::PlainObject>
-MatrixBase<Derived>::jacobiSvd(unsigned int computationOptions) const
-{
-  return JacobiSVD<PlainObject>(*this, computationOptions);
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN_JACOBISVD_H
diff --git a/unsupported/Eigen/src/SVD/TODOBdcsvd.txt b/unsupported/Eigen/src/SVD/TODOBdcsvd.txt
deleted file mode 100644
index 0bc9a46e6..000000000
--- a/unsupported/Eigen/src/SVD/TODOBdcsvd.txt
+++ /dev/null
@@ -1,29 +0,0 @@
-TO DO LIST
-
-
-
-(optional optimization) - do all the allocations in the allocate part 
-                        - support static matrices
-                        - return a error at compilation time when using integer matrices (int, long, std::complex<int>, ...)
-
-to finish the algorithm :
-			-implement the last part of the algorithm as described on the reference paper. 
-			    You may find more information on that part on this paper
-
-			-to replace the call to JacobiSVD at the end of the divide algorithm, just after the call to 
-			    deflation.
-
-(suggested step by step resolution)
-                       0) comment the call to Jacobi in the last part of the divide method and everything right after
-                               until the end of the method. What is commented can be a guideline to steps 3) 4) and 6)
-                       1) solve the secular equation (Characteristic equation) on the values that are not null (zi!=0 and di!=0), after the deflation
-                               wich should be uncommented in the divide method
-                       2) remember the values of the singular values that are already computed (zi=0)
-                       3) assign the singular values found in m_computed at the right places (with the ones found in step 2) )
-                               in decreasing order
-                       4) set the firstcol to zero (except the first element) in m_computed
-                       5) compute all the singular vectors when CompV is set to true and only the left vectors when
-                               CompV is set to false
-                       6) multiply naiveU and naiveV to the right by the matrices found, only naiveU when CompV is set to
-                               false, /!\ if CompU is false NaiveU has only 2 rows
-                       7) delete everything commented in step 0)
diff --git a/unsupported/Eigen/src/SVD/doneInBDCSVD.txt b/unsupported/Eigen/src/SVD/doneInBDCSVD.txt
deleted file mode 100644
index 8563ddab8..000000000
--- a/unsupported/Eigen/src/SVD/doneInBDCSVD.txt
+++ /dev/null
@@ -1,21 +0,0 @@
-This unsupported package is about a divide and conquer algorithm to compute SVD.
-
-The implementation follows as closely as possible the following reference paper : 
-http://www.cs.yale.edu/publications/techreports/tr933.pdf
-
-The code documentation uses the same names for variables as the reference paper. The code, deflation included, is
-working  but there are a few things that could be optimised as explained in the TODOBdsvd. 
-
-In the code comments were put at the line where would be the third step of the algorithm so one could simply add the call 
-of a function doing the last part of the algorithm and that would not require any knowledge of the part we implemented.
-
-In the TODOBdcsvd we explain what is the main difficulty of the last part and suggest a reference paper to help solve it.
-
-The implemented has trouble with fixed size matrices. 
-
-In the actual implementation, it returns matrices of zero when ask to do a svd on an int matrix. 
-
-
-Paper for the third part:
-http://www.stat.uchicago.edu/~lekheng/courses/302/classics/greengard-rokhlin.pdf
-
diff --git a/unsupported/Eigen/src/Skyline/CMakeLists.txt b/unsupported/Eigen/src/Skyline/CMakeLists.txt
deleted file mode 100644
index 3bf1b0dd4..000000000
--- a/unsupported/Eigen/src/Skyline/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_Skyline_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_Skyline_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/Skyline COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/src/Skyline/SkylineProduct.h b/unsupported/Eigen/src/Skyline/SkylineProduct.h
index 1ddf455e2..d9eb814c1 100644
--- a/unsupported/Eigen/src/Skyline/SkylineProduct.h
+++ b/unsupported/Eigen/src/Skyline/SkylineProduct.h
@@ -14,8 +14,8 @@ namespace Eigen {
 
 template<typename Lhs, typename Rhs, int ProductMode>
 struct SkylineProductReturnType {
-    typedef const typename internal::nested<Lhs, Rhs::RowsAtCompileTime>::type LhsNested;
-    typedef const typename internal::nested<Rhs, Lhs::RowsAtCompileTime>::type RhsNested;
+    typedef const typename internal::nested_eval<Lhs, Rhs::RowsAtCompileTime>::type LhsNested;
+    typedef const typename internal::nested_eval<Rhs, Lhs::RowsAtCompileTime>::type RhsNested;
 
     typedef SkylineProduct<LhsNested, RhsNested, ProductMode> Type;
 };
@@ -49,7 +49,7 @@ struct internal::traits<SkylineProduct<LhsNested, RhsNested, ProductMode> > {
         | EvalBeforeAssigningBit
         | EvalBeforeNestingBit,
 
-        CoeffReadCost = Dynamic
+        CoeffReadCost = HugeCost
     };
 
     typedef typename internal::conditional<ResultIsSkyline,
diff --git a/unsupported/Eigen/src/SparseExtra/BlockSparseMatrix.h b/unsupported/Eigen/src/SparseExtra/BlockSparseMatrix.h
new file mode 100644
index 000000000..0e8350a7d
--- /dev/null
+++ b/unsupported/Eigen/src/SparseExtra/BlockSparseMatrix.h
@@ -0,0 +1,1079 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Desire Nuentsa <desire.nuentsa_wakam@inria.fr>
+// Copyright (C) 2013 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSEBLOCKMATRIX_H
+#define EIGEN_SPARSEBLOCKMATRIX_H
+
+namespace Eigen { 
+/** \ingroup SparseCore_Module
+  *
+  * \class BlockSparseMatrix
+  *
+  * \brief A versatile sparse matrix representation where each element is a block
+  *
+  * This class provides routines to manipulate block sparse matrices stored in a
+  * BSR-like representation. There are two main types :
+  *
+  * 1. All blocks have the same number of rows and columns, called block size
+  * in the following. In this case, if this block size is known at compile time,
+  * it can be given as a template parameter like
+  * \code
+  * BlockSparseMatrix<Scalar, 3, ColMajor> bmat(b_rows, b_cols);
+  * \endcode
+  * Here, bmat is a b_rows x b_cols block sparse matrix
+  * where each coefficient is a 3x3 dense matrix.
+  * If the block size is fixed but will be given at runtime,
+  * \code
+  * BlockSparseMatrix<Scalar, Dynamic, ColMajor> bmat(b_rows, b_cols);
+  * bmat.setBlockSize(block_size);
+  * \endcode
+  *
+  * 2. The second case is for variable-block sparse matrices.
+  * Here each block has its own dimensions. The only restriction is that all the blocks
+  * in a row (resp. a column) should have the same number of rows (resp. of columns).
+  * It is thus required in this case to describe the layout of the matrix by calling
+  * setBlockLayout(rowBlocks, colBlocks).
+  *
+  * In any of the previous case, the matrix can be filled by calling setFromTriplets().
+  * A regular sparse matrix can be converted to a block sparse matrix and vice versa.
+  * It is obviously required to describe the block layout beforehand by calling either
+  * setBlockSize() for fixed-size blocks or setBlockLayout for variable-size blocks.
+  *
+  * \tparam _Scalar The Scalar type
+  * \tparam _BlockAtCompileTime The block layout option. It takes the following values
+  * Dynamic : block size known at runtime
+  * a numeric number : fixed-size block known at compile time
+  */
+template<typename _Scalar, int _BlockAtCompileTime=Dynamic, int _Options=ColMajor, typename _StorageIndex=int> class BlockSparseMatrix;
+
+template<typename BlockSparseMatrixT> class BlockSparseMatrixView;
+
+namespace internal {
+template<typename _Scalar, int _BlockAtCompileTime, int _Options, typename _Index>
+struct traits<BlockSparseMatrix<_Scalar,_BlockAtCompileTime,_Options, _Index> >
+{
+  typedef _Scalar Scalar;
+  typedef _Index Index;
+  typedef Sparse StorageKind; // FIXME Where is it used ??
+  typedef MatrixXpr XprKind;
+  enum {
+    RowsAtCompileTime = Dynamic,
+    ColsAtCompileTime = Dynamic,
+    MaxRowsAtCompileTime = Dynamic,
+    MaxColsAtCompileTime = Dynamic,
+    BlockSize = _BlockAtCompileTime,
+    Flags = _Options | NestByRefBit | LvalueBit,
+    CoeffReadCost = NumTraits<Scalar>::ReadCost,
+    SupportedAccessPatterns = InnerRandomAccessPattern
+  };
+};
+template<typename BlockSparseMatrixT>
+struct traits<BlockSparseMatrixView<BlockSparseMatrixT> >
+{
+  typedef Ref<Matrix<typename BlockSparseMatrixT::Scalar, BlockSparseMatrixT::BlockSize, BlockSparseMatrixT::BlockSize> > Scalar;
+  typedef Ref<Matrix<typename BlockSparseMatrixT::RealScalar, BlockSparseMatrixT::BlockSize, BlockSparseMatrixT::BlockSize> > RealScalar;
+
+};
+
+// Function object to sort a triplet list
+template<typename Iterator, bool IsColMajor>
+struct TripletComp
+{
+  typedef typename Iterator::value_type Triplet;
+  bool operator()(const Triplet& a, const Triplet& b)
+  { if(IsColMajor)
+      return ((a.col() == b.col() && a.row() < b.row()) || (a.col() < b.col()));
+    else
+      return ((a.row() == b.row() && a.col() < b.col()) || (a.row() < b.row()));
+  }
+};
+} // end namespace internal
+
+
+/* Proxy to view the block sparse matrix as a regular sparse matrix */
+template<typename BlockSparseMatrixT>
+class BlockSparseMatrixView : public SparseMatrixBase<BlockSparseMatrixT>
+{
+  public:
+    typedef Ref<typename BlockSparseMatrixT::BlockScalar> Scalar;
+    typedef Ref<typename BlockSparseMatrixT::BlockRealScalar> RealScalar;
+    typedef typename BlockSparseMatrixT::Index Index;
+    typedef  BlockSparseMatrixT Nested;
+    enum {
+      Flags = BlockSparseMatrixT::Options,
+      Options = BlockSparseMatrixT::Options,
+      RowsAtCompileTime = BlockSparseMatrixT::RowsAtCompileTime,
+      ColsAtCompileTime = BlockSparseMatrixT::ColsAtCompileTime,
+      MaxColsAtCompileTime = BlockSparseMatrixT::MaxColsAtCompileTime,
+      MaxRowsAtCompileTime = BlockSparseMatrixT::MaxRowsAtCompileTime
+    };
+  public:
+    BlockSparseMatrixView(const BlockSparseMatrixT& spblockmat)
+     : m_spblockmat(spblockmat)
+    {}
+
+    Index outerSize() const
+    {
+      return (Flags&RowMajorBit) == 1 ? this->rows() : this->cols();
+    }
+    Index cols() const
+    {
+      return m_spblockmat.blockCols();
+    }
+    Index rows() const
+    {
+      return m_spblockmat.blockRows();
+    }
+    Scalar coeff(Index row, Index col)
+    {
+      return m_spblockmat.coeff(row, col);
+    }
+    Scalar coeffRef(Index row, Index col)
+    {
+      return m_spblockmat.coeffRef(row, col);
+    }
+    // Wrapper to iterate over all blocks
+    class InnerIterator : public BlockSparseMatrixT::BlockInnerIterator
+    {
+      public:
+      InnerIterator(const BlockSparseMatrixView& mat, Index outer)
+          : BlockSparseMatrixT::BlockInnerIterator(mat.m_spblockmat, outer)
+      {}
+
+    };
+
+  protected:
+    const BlockSparseMatrixT& m_spblockmat;
+};
+
+// Proxy to view a regular vector as a block vector
+template<typename BlockSparseMatrixT, typename VectorType>
+class BlockVectorView
+{
+  public:
+    enum {
+      BlockSize = BlockSparseMatrixT::BlockSize,
+      ColsAtCompileTime = VectorType::ColsAtCompileTime,
+      RowsAtCompileTime = VectorType::RowsAtCompileTime,
+      Flags = VectorType::Flags
+    };
+    typedef Ref<const Matrix<typename BlockSparseMatrixT::Scalar, (RowsAtCompileTime==1)? 1 : BlockSize, (ColsAtCompileTime==1)? 1 : BlockSize> >Scalar;
+    typedef typename BlockSparseMatrixT::Index Index;
+  public:
+    BlockVectorView(const BlockSparseMatrixT& spblockmat, const VectorType& vec)
+    : m_spblockmat(spblockmat),m_vec(vec)
+    { }
+    inline Index cols() const
+    {
+      return m_vec.cols();
+    }
+    inline Index size() const
+    {
+      return m_spblockmat.blockRows();
+    }
+    inline Scalar coeff(Index bi) const
+    {
+      Index startRow = m_spblockmat.blockRowsIndex(bi);
+      Index rowSize = m_spblockmat.blockRowsIndex(bi+1) - startRow;
+      return m_vec.middleRows(startRow, rowSize);
+    }
+    inline Scalar coeff(Index bi, Index j) const
+    {
+      Index startRow = m_spblockmat.blockRowsIndex(bi);
+      Index rowSize = m_spblockmat.blockRowsIndex(bi+1) - startRow;
+      return m_vec.block(startRow, j, rowSize, 1);
+    }
+  protected:
+    const BlockSparseMatrixT& m_spblockmat;
+    const VectorType& m_vec;
+};
+
+template<typename VectorType, typename Index> class BlockVectorReturn;
+
+
+// Proxy to view a regular vector as a block vector
+template<typename BlockSparseMatrixT, typename VectorType>
+class BlockVectorReturn
+{
+  public:
+    enum {
+      ColsAtCompileTime = VectorType::ColsAtCompileTime,
+      RowsAtCompileTime = VectorType::RowsAtCompileTime,
+      Flags = VectorType::Flags
+    };
+    typedef Ref<Matrix<typename VectorType::Scalar, RowsAtCompileTime, ColsAtCompileTime> > Scalar;
+    typedef typename BlockSparseMatrixT::Index Index;
+  public:
+    BlockVectorReturn(const BlockSparseMatrixT& spblockmat, VectorType& vec)
+    : m_spblockmat(spblockmat),m_vec(vec)
+    { }
+    inline Index size() const
+    {
+      return m_spblockmat.blockRows();
+    }
+    inline Scalar coeffRef(Index bi)
+    {
+      Index startRow = m_spblockmat.blockRowsIndex(bi);
+      Index rowSize = m_spblockmat.blockRowsIndex(bi+1) - startRow;
+      return m_vec.middleRows(startRow, rowSize);
+    }
+    inline Scalar coeffRef(Index bi, Index j)
+    {
+      Index startRow = m_spblockmat.blockRowsIndex(bi);
+      Index rowSize = m_spblockmat.blockRowsIndex(bi+1) - startRow;
+      return m_vec.block(startRow, j, rowSize, 1);
+    }
+
+  protected:
+    const BlockSparseMatrixT& m_spblockmat;
+    VectorType& m_vec;
+};
+
+// Block version of the sparse dense product
+template<typename Lhs, typename Rhs>
+class BlockSparseTimeDenseProduct;
+
+namespace internal {
+
+template<typename BlockSparseMatrixT, typename VecType>
+struct traits<BlockSparseTimeDenseProduct<BlockSparseMatrixT, VecType> >
+{
+  typedef Dense StorageKind;
+  typedef MatrixXpr XprKind;
+  typedef typename BlockSparseMatrixT::Scalar Scalar;
+  typedef typename BlockSparseMatrixT::Index Index;
+  enum {
+    RowsAtCompileTime = Dynamic,
+    ColsAtCompileTime = Dynamic,
+    MaxRowsAtCompileTime = Dynamic,
+    MaxColsAtCompileTime = Dynamic,
+    Flags = 0,
+    CoeffReadCost = internal::traits<BlockSparseMatrixT>::CoeffReadCost
+  };
+};
+} // end namespace internal
+
+template<typename Lhs, typename Rhs>
+class BlockSparseTimeDenseProduct
+  : public ProductBase<BlockSparseTimeDenseProduct<Lhs,Rhs>, Lhs, Rhs>
+{
+  public:
+    EIGEN_PRODUCT_PUBLIC_INTERFACE(BlockSparseTimeDenseProduct)
+
+    BlockSparseTimeDenseProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs)
+    {}
+
+    template<typename Dest> void scaleAndAddTo(Dest& dest, const typename Rhs::Scalar& alpha) const
+    {
+      BlockVectorReturn<Lhs,Dest> tmpDest(m_lhs, dest);
+      internal::sparse_time_dense_product( BlockSparseMatrixView<Lhs>(m_lhs),  BlockVectorView<Lhs, Rhs>(m_lhs, m_rhs), tmpDest, alpha);
+    }
+
+  private:
+    BlockSparseTimeDenseProduct& operator=(const BlockSparseTimeDenseProduct&);
+};
+
+template<typename _Scalar, int _BlockAtCompileTime, int _Options, typename _StorageIndex>
+class BlockSparseMatrix : public SparseMatrixBase<BlockSparseMatrix<_Scalar,_BlockAtCompileTime, _Options,_StorageIndex> >
+{
+  public:
+    typedef _Scalar Scalar;
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+    typedef _StorageIndex StorageIndex;
+    typedef typename internal::ref_selector<BlockSparseMatrix<_Scalar, _BlockAtCompileTime, _Options, _StorageIndex> >::type Nested;
+
+    enum {
+      Options = _Options,
+      Flags = Options,
+      BlockSize=_BlockAtCompileTime,
+      RowsAtCompileTime = Dynamic,
+      ColsAtCompileTime = Dynamic,
+      MaxRowsAtCompileTime = Dynamic,
+      MaxColsAtCompileTime = Dynamic,
+      IsVectorAtCompileTime = 0,
+      IsColMajor = Flags&RowMajorBit ? 0 : 1
+    };
+    typedef Matrix<Scalar, _BlockAtCompileTime, _BlockAtCompileTime,IsColMajor ? ColMajor : RowMajor> BlockScalar;
+    typedef Matrix<RealScalar, _BlockAtCompileTime, _BlockAtCompileTime,IsColMajor ? ColMajor : RowMajor> BlockRealScalar;
+    typedef typename internal::conditional<_BlockAtCompileTime==Dynamic, Scalar, BlockScalar>::type BlockScalarReturnType;
+    typedef BlockSparseMatrix<Scalar, BlockSize, IsColMajor ? ColMajor : RowMajor, StorageIndex> PlainObject;
+  public:
+    // Default constructor
+    BlockSparseMatrix()
+    : m_innerBSize(0),m_outerBSize(0),m_innerOffset(0),m_outerOffset(0),
+      m_nonzerosblocks(0),m_values(0),m_blockPtr(0),m_indices(0),
+      m_outerIndex(0),m_blockSize(BlockSize)
+    { }
+
+
+    /**
+     * \brief Construct and resize
+     *
+     */
+    BlockSparseMatrix(Index brow, Index bcol)
+      : m_innerBSize(IsColMajor ? brow : bcol),
+        m_outerBSize(IsColMajor ? bcol : brow),
+        m_innerOffset(0),m_outerOffset(0),m_nonzerosblocks(0),
+        m_values(0),m_blockPtr(0),m_indices(0),
+        m_outerIndex(0),m_blockSize(BlockSize)
+    { }
+
+    /**
+     * \brief Copy-constructor
+     */
+    BlockSparseMatrix(const BlockSparseMatrix& other)
+      : m_innerBSize(other.m_innerBSize),m_outerBSize(other.m_outerBSize),
+        m_nonzerosblocks(other.m_nonzerosblocks),m_nonzeros(other.m_nonzeros),
+        m_blockPtr(0),m_blockSize(other.m_blockSize)
+    {
+      // should we allow copying between variable-size blocks and fixed-size blocks ??
+      eigen_assert(m_blockSize == BlockSize && " CAN NOT COPY BETWEEN FIXED-SIZE AND VARIABLE-SIZE BLOCKS");
+
+      std::copy(other.m_innerOffset, other.m_innerOffset+m_innerBSize+1, m_innerOffset);
+      std::copy(other.m_outerOffset, other.m_outerOffset+m_outerBSize+1, m_outerOffset);
+      std::copy(other.m_values, other.m_values+m_nonzeros, m_values);
+
+      if(m_blockSize != Dynamic)
+        std::copy(other.m_blockPtr, other.m_blockPtr+m_nonzerosblocks, m_blockPtr);
+
+      std::copy(other.m_indices, other.m_indices+m_nonzerosblocks, m_indices);
+      std::copy(other.m_outerIndex, other.m_outerIndex+m_outerBSize, m_outerIndex);
+    }
+
+    friend void swap(BlockSparseMatrix& first, BlockSparseMatrix& second)
+    {
+      std::swap(first.m_innerBSize, second.m_innerBSize);
+      std::swap(first.m_outerBSize, second.m_outerBSize);
+      std::swap(first.m_innerOffset, second.m_innerOffset);
+      std::swap(first.m_outerOffset, second.m_outerOffset);
+      std::swap(first.m_nonzerosblocks, second.m_nonzerosblocks);
+      std::swap(first.m_nonzeros, second.m_nonzeros);
+      std::swap(first.m_values, second.m_values);
+      std::swap(first.m_blockPtr, second.m_blockPtr);
+      std::swap(first.m_indices, second.m_indices);
+      std::swap(first.m_outerIndex, second.m_outerIndex);
+      std::swap(first.m_BlockSize, second.m_blockSize);
+    }
+
+    BlockSparseMatrix& operator=(BlockSparseMatrix other)
+    {
+      //Copy-and-swap paradigm ... avoid leaked data if thrown
+      swap(*this, other);
+      return *this;
+    }
+
+    // Destructor
+    ~BlockSparseMatrix()
+    {
+      delete[] m_outerIndex;
+      delete[] m_innerOffset;
+      delete[] m_outerOffset;
+      delete[] m_indices;
+      delete[] m_blockPtr;
+      delete[] m_values;
+    }
+
+
+    /**
+      * \brief Constructor from a sparse matrix
+      *
+      */
+    template<typename MatrixType>
+    inline BlockSparseMatrix(const MatrixType& spmat) : m_blockSize(BlockSize)
+    {
+      EIGEN_STATIC_ASSERT((m_blockSize != Dynamic), THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE);
+
+      *this = spmat;
+    }
+
+    /**
+      * \brief Assignment from a sparse matrix with the same storage order
+      *
+      * Convert from a sparse matrix to block sparse matrix.
+      * \warning Before calling this function, tt is necessary to call
+      * either setBlockLayout() (matrices with variable-size blocks)
+      * or setBlockSize() (for fixed-size blocks).
+      */
+    template<typename MatrixType>
+    inline BlockSparseMatrix& operator=(const MatrixType& spmat)
+    {
+      eigen_assert((m_innerBSize != 0 && m_outerBSize != 0)
+                   && "Trying to assign to a zero-size matrix, call resize() first");
+      eigen_assert(((MatrixType::Options&RowMajorBit) != IsColMajor) && "Wrong storage order");
+      typedef SparseMatrix<bool,MatrixType::Options,typename MatrixType::Index> MatrixPatternType;
+      MatrixPatternType  blockPattern(blockRows(), blockCols());
+      m_nonzeros = 0;
+
+      // First, compute the number of nonzero blocks and their locations
+      for(StorageIndex bj = 0; bj < m_outerBSize; ++bj)
+      {
+        // Browse each outer block and compute the structure
+        std::vector<bool> nzblocksFlag(m_innerBSize,false);  // Record the existing blocks
+        blockPattern.startVec(bj);
+        for(StorageIndex j = blockOuterIndex(bj); j < blockOuterIndex(bj+1); ++j)
+        {
+          typename MatrixType::InnerIterator it_spmat(spmat, j);
+          for(; it_spmat; ++it_spmat)
+          {
+            StorageIndex bi = innerToBlock(it_spmat.index()); // Index of the current nonzero block
+            if(!nzblocksFlag[bi])
+            {
+              // Save the index of this nonzero block
+              nzblocksFlag[bi] = true;
+              blockPattern.insertBackByOuterInnerUnordered(bj, bi) = true;
+              // Compute the total number of nonzeros (including explicit zeros in blocks)
+              m_nonzeros += blockOuterSize(bj) * blockInnerSize(bi);
+            }
+          }
+        } // end current outer block
+      }
+      blockPattern.finalize();
+
+      // Allocate the internal arrays
+      setBlockStructure(blockPattern);
+
+      for(StorageIndex nz = 0; nz < m_nonzeros; ++nz) m_values[nz] = Scalar(0);
+      for(StorageIndex bj = 0; bj < m_outerBSize; ++bj)
+      {
+        // Now copy the values
+        for(StorageIndex j = blockOuterIndex(bj); j < blockOuterIndex(bj+1); ++j)
+        {
+          // Browse the outer block column by column (for column-major matrices)
+          typename MatrixType::InnerIterator it_spmat(spmat, j);
+          for(; it_spmat; ++it_spmat)
+          {
+            StorageIndex idx = 0; // Position of this block in the column block
+            StorageIndex bi = innerToBlock(it_spmat.index()); // Index of the current nonzero block
+            // Go to the inner block where this element belongs to
+            while(bi > m_indices[m_outerIndex[bj]+idx]) ++idx; // Not expensive for ordered blocks
+            StorageIndex idxVal;// Get the right position in the array of values for this element
+            if(m_blockSize == Dynamic)
+            {
+              // Offset from all blocks before ...
+              idxVal =  m_blockPtr[m_outerIndex[bj]+idx];
+              // ... and offset inside the block
+              idxVal += (j - blockOuterIndex(bj)) * blockOuterSize(bj) + it_spmat.index() - m_innerOffset[bi];
+            }
+            else
+            {
+              // All blocks before
+              idxVal = (m_outerIndex[bj] + idx) * m_blockSize * m_blockSize;
+              // inside the block
+              idxVal += (j - blockOuterIndex(bj)) * m_blockSize + (it_spmat.index()%m_blockSize);
+            }
+            // Insert the value
+            m_values[idxVal] = it_spmat.value();
+          } // end of this column
+        } // end of this block
+      } // end of this outer block
+
+      return *this;
+    }
+
+    /**
+      * \brief Set the nonzero block pattern of the matrix
+      *
+      * Given a sparse matrix describing the nonzero block pattern,
+      * this function prepares the internal pointers for values.
+      * After calling this function, any *nonzero* block (bi, bj) can be set
+      * with a simple call to coeffRef(bi,bj).
+      *
+      *
+      * \warning Before calling this function, tt is necessary to call
+      * either setBlockLayout() (matrices with variable-size blocks)
+      * or setBlockSize() (for fixed-size blocks).
+      *
+      * \param blockPattern Sparse matrix of boolean elements describing the block structure
+      *
+      * \sa setBlockLayout() \sa setBlockSize()
+      */
+    template<typename MatrixType>
+    void setBlockStructure(const MatrixType& blockPattern)
+    {
+      resize(blockPattern.rows(), blockPattern.cols());
+      reserve(blockPattern.nonZeros());
+
+      // Browse the block pattern and set up the various pointers
+      m_outerIndex[0] = 0;
+      if(m_blockSize == Dynamic) m_blockPtr[0] = 0;
+      for(StorageIndex nz = 0; nz < m_nonzeros; ++nz) m_values[nz] = Scalar(0);
+      for(StorageIndex bj = 0; bj < m_outerBSize; ++bj)
+      {
+        //Browse each outer block
+
+        //First, copy and save the indices of nonzero blocks
+        //FIXME : find a way to avoid this ...
+        std::vector<int> nzBlockIdx;
+        typename MatrixType::InnerIterator it(blockPattern, bj);
+        for(; it; ++it)
+        {
+          nzBlockIdx.push_back(it.index());
+        }
+        std::sort(nzBlockIdx.begin(), nzBlockIdx.end());
+
+        // Now, fill block indices and (eventually) pointers to blocks
+        for(StorageIndex idx = 0; idx < nzBlockIdx.size(); ++idx)
+        {
+          StorageIndex offset = m_outerIndex[bj]+idx; // offset in m_indices
+          m_indices[offset] = nzBlockIdx[idx];
+          if(m_blockSize == Dynamic)
+            m_blockPtr[offset] = m_blockPtr[offset-1] + blockInnerSize(nzBlockIdx[idx]) * blockOuterSize(bj);
+          // There is no blockPtr for fixed-size blocks... not needed !???
+        }
+        // Save the pointer to the next outer block
+        m_outerIndex[bj+1] = m_outerIndex[bj] + nzBlockIdx.size();
+      }
+    }
+
+    /**
+      * \brief Set the number of rows and columns blocks
+      */
+    inline void resize(Index brow, Index bcol)
+    {
+      m_innerBSize = IsColMajor ? brow : bcol;
+      m_outerBSize = IsColMajor ? bcol : brow;
+    }
+
+    /**
+      * \brief set the block size at runtime for fixed-size block layout
+      *
+      * Call this only for fixed-size blocks
+      */
+    inline void setBlockSize(Index blockSize)
+    {
+      m_blockSize = blockSize;
+    }
+
+    /**
+      * \brief Set the row and column block layouts,
+      *
+      * This function set the size of each row and column block.
+      * So this function should be used only for blocks with variable size.
+      * \param rowBlocks : Number of rows per row block
+      * \param colBlocks : Number of columns per column block
+      * \sa resize(), setBlockSize()
+      */
+    inline void setBlockLayout(const VectorXi& rowBlocks, const VectorXi& colBlocks)
+    {
+      const VectorXi& innerBlocks = IsColMajor ? rowBlocks : colBlocks;
+      const VectorXi& outerBlocks = IsColMajor ? colBlocks : rowBlocks;
+      eigen_assert(m_innerBSize == innerBlocks.size() && "CHECK THE NUMBER OF ROW OR COLUMN BLOCKS");
+      eigen_assert(m_outerBSize == outerBlocks.size() && "CHECK THE NUMBER OF ROW OR COLUMN BLOCKS");
+      m_outerBSize = outerBlocks.size();
+      //  starting index of blocks... cumulative sums
+      m_innerOffset = new StorageIndex[m_innerBSize+1];
+      m_outerOffset = new StorageIndex[m_outerBSize+1];
+      m_innerOffset[0] = 0;
+      m_outerOffset[0] = 0;
+      std::partial_sum(&innerBlocks[0], &innerBlocks[m_innerBSize-1]+1, &m_innerOffset[1]);
+      std::partial_sum(&outerBlocks[0], &outerBlocks[m_outerBSize-1]+1, &m_outerOffset[1]);
+
+      // Compute the total number of nonzeros
+      m_nonzeros = 0;
+      for(StorageIndex bj = 0; bj < m_outerBSize; ++bj)
+        for(StorageIndex bi = 0; bi < m_innerBSize; ++bi)
+          m_nonzeros += outerBlocks[bj] * innerBlocks[bi];
+
+    }
+
+    /**
+      * \brief Allocate the internal array of pointers to blocks and their inner indices
+      *
+      * \note For fixed-size blocks, call setBlockSize() to set the block.
+      * And For variable-size blocks, call setBlockLayout() before using this function
+      *
+      * \param nonzerosblocks Number of nonzero blocks. The total number of nonzeros is
+      * is computed in setBlockLayout() for variable-size blocks
+      * \sa setBlockSize()
+      */
+    inline void reserve(const Index nonzerosblocks)
+    {
+      eigen_assert((m_innerBSize != 0 && m_outerBSize != 0) &&
+          "TRYING TO RESERVE ZERO-SIZE MATRICES, CALL resize() first");
+
+      //FIXME Should free if already allocated
+      m_outerIndex = new StorageIndex[m_outerBSize+1];
+
+      m_nonzerosblocks = nonzerosblocks;
+      if(m_blockSize != Dynamic)
+      {
+        m_nonzeros = nonzerosblocks * (m_blockSize * m_blockSize);
+        m_blockPtr = 0;
+      }
+      else
+      {
+        // m_nonzeros  is already computed in setBlockLayout()
+        m_blockPtr = new StorageIndex[m_nonzerosblocks+1];
+      }
+      m_indices = new StorageIndex[m_nonzerosblocks+1];
+      m_values = new Scalar[m_nonzeros];
+    }
+
+
+    /**
+      * \brief Fill values in a matrix  from a triplet list.
+      *
+      * Each triplet item has a block stored in an Eigen dense matrix.
+      * The InputIterator class should provide the functions row(), col() and value()
+      *
+      * \note For fixed-size blocks, call setBlockSize() before this function.
+      *
+      * FIXME Do not accept duplicates
+      */
+    template<typename InputIterator>
+    void setFromTriplets(const InputIterator& begin, const InputIterator& end)
+    {
+      eigen_assert((m_innerBSize!=0 && m_outerBSize !=0) && "ZERO BLOCKS, PLEASE CALL resize() before");
+
+      /* First, sort the triplet list
+        * FIXME This can be unnecessarily expensive since only the inner indices have to be sorted
+        * The best approach is like in SparseMatrix::setFromTriplets()
+        */
+      internal::TripletComp<InputIterator, IsColMajor> tripletcomp;
+      std::sort(begin, end, tripletcomp);
+
+      /* Count the number of rows and column blocks,
+       * and the number of nonzero blocks per outer dimension
+       */
+      VectorXi rowBlocks(m_innerBSize); // Size of each block row
+      VectorXi colBlocks(m_outerBSize); // Size of each block column
+      rowBlocks.setZero(); colBlocks.setZero();
+      VectorXi nzblock_outer(m_outerBSize); // Number of nz blocks per outer vector
+      VectorXi nz_outer(m_outerBSize); // Number of nz per outer vector...for variable-size blocks
+      nzblock_outer.setZero();
+      nz_outer.setZero();
+      for(InputIterator it(begin); it !=end; ++it)
+      {
+        eigen_assert(it->row() >= 0 && it->row() < this->blockRows() && it->col() >= 0 && it->col() < this->blockCols());
+        eigen_assert((it->value().rows() == it->value().cols() && (it->value().rows() == m_blockSize))
+                     || (m_blockSize == Dynamic));
+
+        if(m_blockSize == Dynamic)
+        {
+          eigen_assert((rowBlocks[it->row()] == 0 || rowBlocks[it->row()] == it->value().rows()) &&
+              "NON CORRESPONDING SIZES FOR ROW BLOCKS");
+          eigen_assert((colBlocks[it->col()] == 0 || colBlocks[it->col()] == it->value().cols()) &&
+              "NON CORRESPONDING SIZES FOR COLUMN BLOCKS");
+          rowBlocks[it->row()] =it->value().rows();
+          colBlocks[it->col()] = it->value().cols();
+        }
+        nz_outer(IsColMajor ? it->col() : it->row()) += it->value().rows() * it->value().cols();
+        nzblock_outer(IsColMajor ? it->col() : it->row())++;
+      }
+      // Allocate member arrays
+      if(m_blockSize == Dynamic) setBlockLayout(rowBlocks, colBlocks);
+      StorageIndex nzblocks = nzblock_outer.sum();
+      reserve(nzblocks);
+
+       // Temporary markers
+      VectorXi block_id(m_outerBSize); // To be used as a block marker during insertion
+
+      // Setup outer index pointers and markers
+      m_outerIndex[0] = 0;
+      if (m_blockSize == Dynamic)  m_blockPtr[0] =  0;
+      for(StorageIndex bj = 0; bj < m_outerBSize; ++bj)
+      {
+        m_outerIndex[bj+1] = m_outerIndex[bj] + nzblock_outer(bj);
+        block_id(bj) = m_outerIndex[bj];
+        if(m_blockSize==Dynamic)
+        {
+          m_blockPtr[m_outerIndex[bj+1]] = m_blockPtr[m_outerIndex[bj]] + nz_outer(bj);
+        }
+      }
+
+      // Fill the matrix
+      for(InputIterator it(begin); it!=end; ++it)
+      {
+        StorageIndex outer = IsColMajor ? it->col() : it->row();
+        StorageIndex inner = IsColMajor ? it->row() : it->col();
+        m_indices[block_id(outer)] = inner;
+        StorageIndex block_size = it->value().rows()*it->value().cols();
+        StorageIndex nz_marker = blockPtr(block_id[outer]);
+        memcpy(&(m_values[nz_marker]), it->value().data(), block_size * sizeof(Scalar));
+        if(m_blockSize == Dynamic)
+        {
+          m_blockPtr[block_id(outer)+1] = m_blockPtr[block_id(outer)] + block_size;
+        }
+        block_id(outer)++;
+      }
+
+      // An alternative when the outer indices are sorted...no need to use an array of markers
+//      for(Index bcol = 0; bcol < m_outerBSize; ++bcol)
+//      {
+//      Index id = 0, id_nz = 0, id_nzblock = 0;
+//      for(InputIterator it(begin); it!=end; ++it)
+//      {
+//        while (id<bcol) // one pass should do the job unless there are empty columns
+//        {
+//          id++;
+//          m_outerIndex[id+1]=m_outerIndex[id];
+//        }
+//        m_outerIndex[id+1] += 1;
+//        m_indices[id_nzblock]=brow;
+//        Index block_size = it->value().rows()*it->value().cols();
+//        m_blockPtr[id_nzblock+1] = m_blockPtr[id_nzblock] + block_size;
+//        id_nzblock++;
+//        memcpy(&(m_values[id_nz]),it->value().data(), block_size*sizeof(Scalar));
+//        id_nz += block_size;
+//      }
+//      while(id < m_outerBSize-1) // Empty columns at the end
+//      {
+//        id++;
+//        m_outerIndex[id+1]=m_outerIndex[id];
+//      }
+//      }
+    }
+
+
+    /**
+      * \returns the number of rows
+      */
+    inline Index rows() const
+    {
+//      return blockRows();
+      return (IsColMajor ? innerSize() : outerSize());
+    }
+
+    /**
+      * \returns the number of cols
+      */
+    inline Index cols() const
+    {
+//      return blockCols();
+      return (IsColMajor ? outerSize() : innerSize());
+    }
+
+    inline Index innerSize() const
+    {
+      if(m_blockSize == Dynamic) return m_innerOffset[m_innerBSize];
+      else return  (m_innerBSize * m_blockSize) ;
+    }
+
+    inline Index outerSize() const
+    {
+      if(m_blockSize == Dynamic) return m_outerOffset[m_outerBSize];
+      else return  (m_outerBSize * m_blockSize) ;
+    }
+    /** \returns the number of rows grouped by blocks */
+    inline Index blockRows() const
+    {
+      return (IsColMajor ? m_innerBSize : m_outerBSize);
+    }
+    /** \returns the number of columns grouped by blocks */
+    inline Index blockCols() const
+    {
+      return (IsColMajor ? m_outerBSize : m_innerBSize);
+    }
+
+    inline Index outerBlocks() const { return m_outerBSize; }
+    inline Index innerBlocks() const { return m_innerBSize; }
+
+    /** \returns the block index where outer belongs to */
+    inline Index outerToBlock(Index outer) const
+    {
+      eigen_assert(outer < outerSize() && "OUTER INDEX OUT OF BOUNDS");
+
+      if(m_blockSize != Dynamic)
+        return (outer / m_blockSize); // Integer division
+
+      StorageIndex b_outer = 0;
+      while(m_outerOffset[b_outer] <= outer) ++b_outer;
+      return b_outer - 1;
+    }
+    /** \returns  the block index where inner belongs to */
+    inline Index innerToBlock(Index inner) const
+    {
+      eigen_assert(inner < innerSize() && "OUTER INDEX OUT OF BOUNDS");
+
+      if(m_blockSize != Dynamic)
+        return (inner / m_blockSize); // Integer division
+
+      StorageIndex b_inner = 0;
+      while(m_innerOffset[b_inner] <= inner) ++b_inner;
+      return b_inner - 1;
+    }
+
+    /**
+      *\returns a reference to the (i,j) block as an Eigen Dense Matrix
+      */
+    Ref<BlockScalar> coeffRef(Index brow, Index bcol)
+    {
+      eigen_assert(brow < blockRows() && "BLOCK ROW INDEX OUT OF BOUNDS");
+      eigen_assert(bcol < blockCols() && "BLOCK nzblocksFlagCOLUMN OUT OF BOUNDS");
+
+      StorageIndex rsize = IsColMajor ? blockInnerSize(brow): blockOuterSize(bcol);
+      StorageIndex csize = IsColMajor ? blockOuterSize(bcol) : blockInnerSize(brow);
+      StorageIndex inner = IsColMajor ? brow : bcol;
+      StorageIndex outer = IsColMajor ? bcol : brow;
+      StorageIndex offset = m_outerIndex[outer];
+      while(offset < m_outerIndex[outer+1] && m_indices[offset] != inner)
+        offset++;
+      if(m_indices[offset] == inner)
+      {
+        return Map<BlockScalar>(&(m_values[blockPtr(offset)]), rsize, csize);
+      }
+      else
+      {
+        //FIXME the block does not exist, Insert it !!!!!!!!!
+        eigen_assert("DYNAMIC INSERTION IS NOT YET SUPPORTED");
+      }
+    }
+
+    /**
+      * \returns the value of the (i,j) block as an Eigen Dense Matrix
+      */
+    Map<const BlockScalar> coeff(Index brow, Index bcol) const
+    {
+      eigen_assert(brow < blockRows() && "BLOCK ROW INDEX OUT OF BOUNDS");
+      eigen_assert(bcol < blockCols() && "BLOCK COLUMN OUT OF BOUNDS");
+
+      StorageIndex rsize = IsColMajor ? blockInnerSize(brow): blockOuterSize(bcol);
+      StorageIndex csize = IsColMajor ? blockOuterSize(bcol) : blockInnerSize(brow);
+      StorageIndex inner = IsColMajor ? brow : bcol;
+      StorageIndex outer = IsColMajor ? bcol : brow;
+      StorageIndex offset = m_outerIndex[outer];
+      while(offset < m_outerIndex[outer+1] && m_indices[offset] != inner) offset++;
+      if(m_indices[offset] == inner)
+      {
+        return Map<const BlockScalar> (&(m_values[blockPtr(offset)]), rsize, csize);
+      }
+      else
+//        return BlockScalar::Zero(rsize, csize);
+        eigen_assert("NOT YET SUPPORTED");
+    }
+
+    // Block Matrix times vector product
+    template<typename VecType>
+    BlockSparseTimeDenseProduct<BlockSparseMatrix, VecType> operator*(const VecType& lhs) const
+    {
+      return BlockSparseTimeDenseProduct<BlockSparseMatrix, VecType>(*this, lhs);
+    }
+
+    /** \returns the number of nonzero blocks */
+    inline Index nonZerosBlocks() const { return m_nonzerosblocks; }
+    /** \returns the total number of nonzero elements, including eventual explicit zeros in blocks */
+    inline Index nonZeros() const { return m_nonzeros; }
+
+    inline BlockScalarReturnType *valuePtr() {return static_cast<BlockScalarReturnType *>(m_values);}
+//    inline Scalar *valuePtr(){ return m_values; }
+    inline StorageIndex *innerIndexPtr() {return m_indices; }
+    inline const StorageIndex *innerIndexPtr() const {return m_indices; }
+    inline StorageIndex *outerIndexPtr() {return m_outerIndex; }
+    inline const StorageIndex* outerIndexPtr() const {return m_outerIndex; }
+
+    /** \brief for compatibility purposes with the SparseMatrix class */
+    inline bool isCompressed() const {return true;}
+    /**
+      * \returns the starting index of the bi row block
+      */
+    inline Index blockRowsIndex(Index bi) const
+    {
+      return IsColMajor ? blockInnerIndex(bi) : blockOuterIndex(bi);
+    }
+
+    /**
+      * \returns the starting index of the bj col block
+      */
+    inline Index blockColsIndex(Index bj) const
+    {
+      return IsColMajor ? blockOuterIndex(bj) : blockInnerIndex(bj);
+    }
+
+    inline Index blockOuterIndex(Index bj) const
+    {
+      return (m_blockSize == Dynamic) ? m_outerOffset[bj] : (bj * m_blockSize);
+    }
+    inline Index blockInnerIndex(Index bi) const
+    {
+      return (m_blockSize == Dynamic) ? m_innerOffset[bi] : (bi * m_blockSize);
+    }
+
+    // Not needed ???
+    inline Index blockInnerSize(Index bi) const
+    {
+      return (m_blockSize == Dynamic) ? (m_innerOffset[bi+1] - m_innerOffset[bi]) : m_blockSize;
+    }
+    inline Index blockOuterSize(Index bj) const
+    {
+      return (m_blockSize == Dynamic) ? (m_outerOffset[bj+1]- m_outerOffset[bj]) : m_blockSize;
+    }
+
+    /**
+      * \brief Browse the matrix by outer index
+      */
+    class InnerIterator; // Browse column by column
+
+    /**
+      * \brief Browse the matrix by block outer index
+      */
+    class BlockInnerIterator; // Browse block by block
+
+    friend std::ostream & operator << (std::ostream & s, const BlockSparseMatrix& m)
+    {
+      for (StorageIndex j = 0; j < m.outerBlocks(); ++j)
+      {
+        BlockInnerIterator itb(m, j);
+        for(; itb; ++itb)
+        {
+          s << "("<<itb.row() << ", " << itb.col() << ")\n";
+          s << itb.value() <<"\n";
+        }
+      }
+      s << std::endl;
+      return s;
+    }
+
+    /**
+      * \returns the starting position of the block <id> in the array of values
+      */
+    Index blockPtr(Index id) const
+    {
+      if(m_blockSize == Dynamic) return m_blockPtr[id];
+      else return id * m_blockSize * m_blockSize;
+      //return blockDynIdx(id, typename internal::conditional<(BlockSize==Dynamic), internal::true_type, internal::false_type>::type());
+    }
+
+
+  protected:
+//    inline Index blockDynIdx(Index id, internal::true_type) const
+//    {
+//      return m_blockPtr[id];
+//    }
+//    inline Index blockDynIdx(Index id, internal::false_type) const
+//    {
+//      return id * BlockSize * BlockSize;
+//    }
+
+    // To be implemented
+    // Insert a block at a particular location... need to make a room for that
+    Map<BlockScalar> insert(Index brow, Index bcol);
+
+    Index m_innerBSize; // Number of block rows
+    Index m_outerBSize; // Number of block columns
+    StorageIndex *m_innerOffset; // Starting index of each inner block (size m_innerBSize+1)
+    StorageIndex *m_outerOffset; // Starting index of each outer block (size m_outerBSize+1)
+    Index m_nonzerosblocks; // Total nonzeros blocks (lower than  m_innerBSize x m_outerBSize)
+    Index m_nonzeros; // Total nonzeros elements
+    Scalar *m_values; //Values stored block column after block column (size m_nonzeros)
+    StorageIndex *m_blockPtr; // Pointer to the beginning of each block in m_values, size m_nonzeroblocks ... null for fixed-size blocks
+    StorageIndex *m_indices; //Inner block indices, size m_nonzerosblocks ... OK
+    StorageIndex *m_outerIndex; // Starting pointer of each block column in m_indices (size m_outerBSize)... OK
+    Index m_blockSize; // Size of a block for fixed-size blocks, otherwise -1
+};
+
+template<typename _Scalar, int _BlockAtCompileTime, int _Options, typename _StorageIndex>
+class BlockSparseMatrix<_Scalar, _BlockAtCompileTime, _Options, _StorageIndex>::BlockInnerIterator
+{
+  public:
+
+    enum{
+      Flags = _Options
+    };
+
+    BlockInnerIterator(const BlockSparseMatrix& mat, const Index outer)
+    : m_mat(mat),m_outer(outer),
+      m_id(mat.m_outerIndex[outer]),
+      m_end(mat.m_outerIndex[outer+1])
+    {
+    }
+
+    inline BlockInnerIterator& operator++() {m_id++; return *this; }
+
+    inline const Map<const BlockScalar> value() const
+    {
+      return Map<const BlockScalar>(&(m_mat.m_values[m_mat.blockPtr(m_id)]),
+          rows(),cols());
+    }
+    inline Map<BlockScalar> valueRef()
+    {
+      return Map<BlockScalar>(&(m_mat.m_values[m_mat.blockPtr(m_id)]),
+          rows(),cols());
+    }
+    // Block inner index
+    inline Index index() const {return m_mat.m_indices[m_id]; }
+    inline Index outer() const { return m_outer; }
+    // block row index
+    inline Index row() const  {return index(); }
+    // block column index
+    inline Index col() const {return outer(); }
+    // FIXME Number of rows in the current block
+    inline Index rows() const { return (m_mat.m_blockSize==Dynamic) ? (m_mat.m_innerOffset[index()+1] - m_mat.m_innerOffset[index()]) : m_mat.m_blockSize; }
+    // Number of columns in the current block ...
+    inline Index cols() const { return (m_mat.m_blockSize==Dynamic) ? (m_mat.m_outerOffset[m_outer+1]-m_mat.m_outerOffset[m_outer]) : m_mat.m_blockSize;}
+    inline operator bool() const { return (m_id < m_end); }
+
+  protected:
+    const BlockSparseMatrix<_Scalar, _BlockAtCompileTime, _Options, StorageIndex>& m_mat;
+    const Index m_outer;
+    Index m_id;
+    Index m_end;
+};
+
+template<typename _Scalar, int _BlockAtCompileTime, int _Options, typename _StorageIndex>
+class BlockSparseMatrix<_Scalar, _BlockAtCompileTime, _Options, _StorageIndex>::InnerIterator
+{
+  public:
+    InnerIterator(const BlockSparseMatrix& mat, Index outer)
+    : m_mat(mat),m_outerB(mat.outerToBlock(outer)),m_outer(outer),
+      itb(mat, mat.outerToBlock(outer)),
+      m_offset(outer - mat.blockOuterIndex(m_outerB))
+     {
+        if (itb)
+        {
+          m_id = m_mat.blockInnerIndex(itb.index());
+          m_start = m_id;
+          m_end = m_mat.blockInnerIndex(itb.index()+1);
+        }
+     }
+    inline InnerIterator& operator++()
+    {
+      m_id++;
+      if (m_id >= m_end)
+      {
+        ++itb;
+        if (itb)
+        {
+          m_id = m_mat.blockInnerIndex(itb.index());
+          m_start = m_id;
+          m_end = m_mat.blockInnerIndex(itb.index()+1);
+        }
+      }
+      return *this;
+    }
+    inline const Scalar& value() const
+    {
+      return itb.value().coeff(m_id - m_start, m_offset);
+    }
+    inline Scalar& valueRef()
+    {
+      return itb.valueRef().coeff(m_id - m_start, m_offset);
+    }
+    inline Index index() const { return m_id; }
+    inline Index outer() const {return m_outer; }
+    inline Index col() const {return outer(); }
+    inline Index row() const { return index();}
+    inline operator bool() const
+    {
+      return itb;
+    }
+  protected:
+    const BlockSparseMatrix& m_mat;
+    const Index m_outer;
+    const Index m_outerB;
+    BlockInnerIterator itb; // Iterator through the blocks
+    const Index m_offset; // Position of this column in the block
+    Index m_start; // starting inner index of this block
+    Index m_id; // current inner index in the block
+    Index m_end; // starting inner index of the next block
+
+};
+} // end namespace Eigen
+
+#endif // EIGEN_SPARSEBLOCKMATRIX_H
diff --git a/unsupported/Eigen/src/SparseExtra/CMakeLists.txt b/unsupported/Eigen/src/SparseExtra/CMakeLists.txt
deleted file mode 100644
index 7ea32ca5e..000000000
--- a/unsupported/Eigen/src/SparseExtra/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_SparseExtra_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_SparseExtra_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/SparseExtra COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h b/unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h
index dec16df28..037a13f86 100644
--- a/unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h
+++ b/unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h
@@ -33,11 +33,11 @@ namespace Eigen {
   */
 
 namespace internal {
-template<typename _Scalar, int _Options, typename _Index>
-struct traits<DynamicSparseMatrix<_Scalar, _Options, _Index> >
+template<typename _Scalar, int _Options, typename _StorageIndex>
+struct traits<DynamicSparseMatrix<_Scalar, _Options, _StorageIndex> >
 {
   typedef _Scalar Scalar;
-  typedef _Index Index;
+  typedef _StorageIndex StorageIndex;
   typedef Sparse StorageKind;
   typedef MatrixXpr XprKind;
   enum {
@@ -52,10 +52,12 @@ struct traits<DynamicSparseMatrix<_Scalar, _Options, _Index> >
 };
 }
 
-template<typename _Scalar, int _Options, typename _Index>
+template<typename _Scalar, int _Options, typename _StorageIndex>
  class  DynamicSparseMatrix
-  : public SparseMatrixBase<DynamicSparseMatrix<_Scalar, _Options, _Index> >
+  : public SparseMatrixBase<DynamicSparseMatrix<_Scalar, _Options, _StorageIndex> >
 {
+    typedef SparseMatrixBase<DynamicSparseMatrix> Base;
+    using Base::convert_index;
   public:
     EIGEN_SPARSE_PUBLIC_INTERFACE(DynamicSparseMatrix)
     // FIXME: why are these operator already alvailable ???
@@ -70,21 +72,21 @@ template<typename _Scalar, int _Options, typename _Index>
 
   protected:
 
-    typedef DynamicSparseMatrix<Scalar,(Flags&~RowMajorBit)|(IsRowMajor?RowMajorBit:0)> TransposedSparseMatrix;
+    typedef DynamicSparseMatrix<Scalar,(Flags&~RowMajorBit)|(IsRowMajor?RowMajorBit:0), StorageIndex> TransposedSparseMatrix;
 
     Index m_innerSize;
-    std::vector<internal::CompressedStorage<Scalar,Index> > m_data;
+    std::vector<internal::CompressedStorage<Scalar,StorageIndex> > m_data;
 
   public:
 
     inline Index rows() const { return IsRowMajor ? outerSize() : m_innerSize; }
     inline Index cols() const { return IsRowMajor ? m_innerSize : outerSize(); }
     inline Index innerSize() const { return m_innerSize; }
-    inline Index outerSize() const { return static_cast<Index>(m_data.size()); }
+    inline Index outerSize() const { return convert_index(m_data.size()); }
     inline Index innerNonZeros(Index j) const { return m_data[j].size(); }
 
-    std::vector<internal::CompressedStorage<Scalar,Index> >& _data() { return m_data; }
-    const std::vector<internal::CompressedStorage<Scalar,Index> >& _data() const { return m_data; }
+    std::vector<internal::CompressedStorage<Scalar,StorageIndex> >& _data() { return m_data; }
+    const std::vector<internal::CompressedStorage<Scalar,StorageIndex> >& _data() const { return m_data; }
 
     /** \returns the coefficient value at given position \a row, \a col
       * This operation involes a log(rho*outer_size) binary search.
@@ -121,7 +123,7 @@ template<typename _Scalar, int _Options, typename _Index>
     {
       Index res = 0;
       for (Index j=0; j<outerSize(); ++j)
-        res += static_cast<Index>(m_data[j].size());
+        res += m_data[j].size();
       return res;
     }
 
@@ -197,7 +199,7 @@ template<typename _Scalar, int _Options, typename _Index>
     void resize(Index rows, Index cols)
     {
       const Index outerSize = IsRowMajor ? rows : cols;
-      m_innerSize = IsRowMajor ? cols : rows;
+      m_innerSize = convert_index(IsRowMajor ? cols : rows);
       setZero();
       if (Index(m_data.size()) != outerSize)
       {
@@ -320,10 +322,10 @@ template<typename _Scalar, int _Options, typename _Index>
 #   endif
  };
 
-template<typename Scalar, int _Options, typename _Index>
-class DynamicSparseMatrix<Scalar,_Options,_Index>::InnerIterator : public SparseVector<Scalar,_Options,_Index>::InnerIterator
+template<typename Scalar, int _Options, typename _StorageIndex>
+class DynamicSparseMatrix<Scalar,_Options,_StorageIndex>::InnerIterator : public SparseVector<Scalar,_Options,_StorageIndex>::InnerIterator
 {
-    typedef typename SparseVector<Scalar,_Options,_Index>::InnerIterator Base;
+    typedef typename SparseVector<Scalar,_Options,_StorageIndex>::InnerIterator Base;
   public:
     InnerIterator(const DynamicSparseMatrix& mat, Index outer)
       : Base(mat.m_data[outer]), m_outer(outer)
@@ -331,15 +333,16 @@ class DynamicSparseMatrix<Scalar,_Options,_Index>::InnerIterator : public Sparse
 
     inline Index row() const { return IsRowMajor ? m_outer : Base::index(); }
     inline Index col() const { return IsRowMajor ? Base::index() : m_outer; }
+    inline Index outer() const { return m_outer; }
 
   protected:
     const Index m_outer;
 };
 
-template<typename Scalar, int _Options, typename _Index>
-class DynamicSparseMatrix<Scalar,_Options,_Index>::ReverseInnerIterator : public SparseVector<Scalar,_Options,_Index>::ReverseInnerIterator
+template<typename Scalar, int _Options, typename _StorageIndex>
+class DynamicSparseMatrix<Scalar,_Options,_StorageIndex>::ReverseInnerIterator : public SparseVector<Scalar,_Options,_StorageIndex>::ReverseInnerIterator
 {
-    typedef typename SparseVector<Scalar,_Options,_Index>::ReverseInnerIterator Base;
+    typedef typename SparseVector<Scalar,_Options,_StorageIndex>::ReverseInnerIterator Base;
   public:
     ReverseInnerIterator(const DynamicSparseMatrix& mat, Index outer)
       : Base(mat.m_data[outer]), m_outer(outer)
@@ -347,11 +350,43 @@ class DynamicSparseMatrix<Scalar,_Options,_Index>::ReverseInnerIterator : public
 
     inline Index row() const { return IsRowMajor ? m_outer : Base::index(); }
     inline Index col() const { return IsRowMajor ? Base::index() : m_outer; }
+    inline Index outer() const { return m_outer; }
 
   protected:
     const Index m_outer;
 };
 
+namespace internal {
+
+template<typename _Scalar, int _Options, typename _StorageIndex>
+struct evaluator<DynamicSparseMatrix<_Scalar,_Options,_StorageIndex> >
+  : evaluator_base<DynamicSparseMatrix<_Scalar,_Options,_StorageIndex> >
+{
+  typedef _Scalar Scalar;
+  typedef DynamicSparseMatrix<_Scalar,_Options,_StorageIndex> SparseMatrixType;
+  typedef typename SparseMatrixType::InnerIterator InnerIterator;
+  typedef typename SparseMatrixType::ReverseInnerIterator ReverseInnerIterator;
+  
+  enum {
+    CoeffReadCost = NumTraits<_Scalar>::ReadCost,
+    Flags = SparseMatrixType::Flags
+  };
+  
+  evaluator() : m_matrix(0) {}
+  evaluator(const SparseMatrixType &mat) : m_matrix(&mat) {}
+  
+  operator SparseMatrixType&() { return m_matrix->const_cast_derived(); }
+  operator const SparseMatrixType&() const { return *m_matrix; }
+  
+  Scalar coeff(Index row, Index col) const { return m_matrix->coeff(row,col); }
+  
+  Index nonZerosEstimate() const { return m_matrix->nonZeros(); }
+
+  const SparseMatrixType *m_matrix;
+};
+
+}
+
 } // end namespace Eigen
 
 #endif // EIGEN_DYNAMIC_SPARSEMATRIX_H
diff --git a/unsupported/Eigen/src/SparseExtra/MarketIO.h b/unsupported/Eigen/src/SparseExtra/MarketIO.h
index 7aafce928..cdc14f86e 100644
--- a/unsupported/Eigen/src/SparseExtra/MarketIO.h
+++ b/unsupported/Eigen/src/SparseExtra/MarketIO.h
@@ -18,7 +18,7 @@ namespace Eigen {
 namespace internal 
 {
   template <typename Scalar>
-  inline bool GetMarketLine (std::stringstream& line, int& M, int& N, int& i, int& j, Scalar& value)
+  inline bool GetMarketLine (std::stringstream& line, Index& M, Index& N, Index& i, Index& j, Scalar& value)
   {
     line >> i >> j >> value;
     i--;
@@ -31,7 +31,7 @@ namespace internal
       return false;
   }
   template <typename Scalar>
-  inline bool GetMarketLine (std::stringstream& line, int& M, int& N, int& i, int& j, std::complex<Scalar>& value)
+  inline bool GetMarketLine (std::stringstream& line, Index& M, Index& N, Index& i, Index& j, std::complex<Scalar>& value)
   {
     Scalar valR, valI;
     line >> i >> j >> valR >> valI;
@@ -133,6 +133,7 @@ template<typename SparseMatrixType>
 bool loadMarket(SparseMatrixType& mat, const std::string& filename)
 {
   typedef typename SparseMatrixType::Scalar Scalar;
+  typedef typename SparseMatrixType::Index Index;
   std::ifstream input(filename.c_str(),std::ios::in);
   if(!input)
     return false;
@@ -142,11 +143,11 @@ bool loadMarket(SparseMatrixType& mat, const std::string& filename)
   
   bool readsizes = false;
 
-  typedef Triplet<Scalar,int> T;
+  typedef Triplet<Scalar,Index> T;
   std::vector<T> elements;
   
-  int M(-1), N(-1), NNZ(-1);
-  int count = 0;
+  Index M(-1), N(-1), NNZ(-1);
+  Index count = 0;
   while(input.getline(buffer, maxBuffersize))
   {
     // skip comments   
@@ -162,14 +163,14 @@ bool loadMarket(SparseMatrixType& mat, const std::string& filename)
       if(M > 0 && N > 0 && NNZ > 0) 
       {
         readsizes = true;
-        std::cout << "sizes: " << M << "," << N << "," << NNZ << "\n";
+        //std::cout << "sizes: " << M << "," << N << "," << NNZ << "\n";
         mat.resize(M,N);
         mat.reserve(NNZ);
       }
     }
     else
     { 
-      int i(-1), j(-1);
+      Index i(-1), j(-1);
       Scalar value; 
       if( internal::GetMarketLine(line, M, N, i, j, value) ) 
       {
@@ -238,9 +239,9 @@ bool saveMarket(const SparseMatrixType& mat, const std::string& filename, int sy
   for(int j=0; j<mat.outerSize(); ++j)
     for(typename SparseMatrixType::InnerIterator it(mat,j); it; ++it)
     {
-	++ count;
-	internal::PutMatrixElt(it.value(), it.row()+1, it.col()+1, out);
-	// out << it.row()+1 << " " << it.col()+1 << " " << it.value() << "\n";
+      ++ count;
+      internal::PutMatrixElt(it.value(), it.row()+1, it.col()+1, out);
+      // out << it.row()+1 << " " << it.col()+1 << " " << it.value() << "\n";
     }
   out.close();
   return true;
diff --git a/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h b/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h
index bf13cf21f..02916ea6f 100644
--- a/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h
+++ b/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h
@@ -41,20 +41,18 @@ enum {
 template <typename Scalar>
 class MatrixMarketIterator 
 {
+    typedef typename NumTraits<Scalar>::Real RealScalar;
   public:
     typedef Matrix<Scalar,Dynamic,1> VectorType; 
     typedef SparseMatrix<Scalar,ColMajor> MatrixType; 
   
   public:
-    MatrixMarketIterator(const std::string folder):m_sym(0),m_isvalid(false),m_matIsLoaded(false),m_hasRhs(false),m_hasrefX(false),m_folder(folder)
+    MatrixMarketIterator(const std::string &folder)
+      : m_sym(0), m_isvalid(false), m_matIsLoaded(false), m_hasRhs(false), m_hasrefX(false), m_folder(folder)
     {
       m_folder_id = opendir(folder.c_str());
-      if (!m_folder_id){
-        m_isvalid = false;
-        std::cerr << "The provided Matrix folder could not be opened \n\n";
-        abort();
-      }
-      Getnextvalidmatrix();
+      if(m_folder_id)
+        Getnextvalidmatrix();
     }
     
     ~MatrixMarketIterator()
@@ -81,16 +79,30 @@ class MatrixMarketIterator
       std::string matrix_file = m_folder + "/" + m_matname + ".mtx";
       if ( !loadMarket(m_mat, matrix_file)) 
       {
+        std::cerr << "Warning loadMarket failed when loading \"" << matrix_file << "\"" << std::endl;
         m_matIsLoaded = false;
         return m_mat;
       }
       m_matIsLoaded = true; 
-      
+
       if (m_sym != NonSymmetric) 
-      { // Store the upper part of the matrix. It is needed by the solvers dealing with nonsymmetric matrices ??
-        MatrixType B; 
-        B = m_mat;
-        m_mat = B.template selfadjointView<Lower>();
+      {
+        // Check whether we need to restore a full matrix:
+        RealScalar diag_norm  = m_mat.diagonal().norm();
+        RealScalar lower_norm = m_mat.template triangularView<Lower>().norm();
+        RealScalar upper_norm = m_mat.template triangularView<Upper>().norm();
+        if(lower_norm>diag_norm && upper_norm==diag_norm)
+        {
+          // only the lower part is stored
+          MatrixType tmp(m_mat);
+          m_mat = tmp.template selfadjointView<Lower>();
+        }
+        else if(upper_norm>diag_norm && lower_norm==diag_norm)
+        {
+          // only the upper part is stored
+          MatrixType tmp(m_mat);
+          m_mat = tmp.template selfadjointView<Upper>();
+        }
       }
       return m_mat; 
     }
@@ -143,6 +155,8 @@ class MatrixMarketIterator
         m_refX.resize(m_mat.cols());
         m_hasrefX = loadMarketVector(m_refX, lhs_file);
       }
+      else
+        m_refX.resize(0);
       return m_refX; 
     }
     
@@ -150,8 +164,9 @@ class MatrixMarketIterator
     
     inline int sym() { return m_sym; }
     
-    inline bool hasRhs() {return m_hasRhs; }
-    inline bool hasrefX() {return m_hasrefX; }
+    bool hasRhs() {return m_hasRhs; }
+    bool hasrefX() {return m_hasrefX; }
+    bool isFolderValid() { return bool(m_folder_id); }
     
   protected:
     
diff --git a/unsupported/Eigen/src/SparseExtra/RandomSetter.h b/unsupported/Eigen/src/SparseExtra/RandomSetter.h
index dee1708e7..ee97299af 100644
--- a/unsupported/Eigen/src/SparseExtra/RandomSetter.h
+++ b/unsupported/Eigen/src/SparseExtra/RandomSetter.h
@@ -95,10 +95,10 @@ template<typename Scalar> struct GoogleSparseHashMapTraits
   *
   * \brief The RandomSetter is a wrapper object allowing to set/update a sparse matrix with random access
   *
-  * \param SparseMatrixType the type of the sparse matrix we are updating
-  * \param MapTraits a traits class representing the map implementation used for the temporary sparse storage.
+  * \tparam SparseMatrixType the type of the sparse matrix we are updating
+  * \tparam MapTraits a traits class representing the map implementation used for the temporary sparse storage.
   *                  Its default value depends on the system.
-  * \param OuterPacketBits defines the number of rows (or columns) manage by a single map object
+  * \tparam OuterPacketBits defines the number of rows (or columns) manage by a single map object
   *                        as a power of two exponent.
   *
   * This class temporarily represents a sparse matrix object using a generic map implementation allowing for
@@ -154,7 +154,7 @@ template<typename SparseMatrixType,
 class RandomSetter
 {
     typedef typename SparseMatrixType::Scalar Scalar;
-    typedef typename SparseMatrixType::Index Index;
+    typedef typename SparseMatrixType::StorageIndex StorageIndex;
 
     struct ScalarWrapper
     {
@@ -296,7 +296,7 @@ class RandomSetter
       const Index inner = SetterRowMajor ? col : row;
       const Index outerMajor = outer >> OuterPacketBits; // index of the packet/map
       const Index outerMinor = outer & OuterPacketMask;  // index of the inner vector in the packet
-      const KeyType key = (KeyType(outerMinor)<<m_keyBitsOffset) | inner;
+      const KeyType key = internal::convert_index<KeyType>((outerMinor<<m_keyBitsOffset) | inner);
       return m_hashmaps[outerMajor][key].value;
     }
 
diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h
new file mode 100644
index 000000000..ed415db99
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h
@@ -0,0 +1,124 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#ifndef EIGEN_SPECIALFUNCTIONS_ARRAYAPI_H
+#define EIGEN_SPECIALFUNCTIONS_ARRAYAPI_H
+
+namespace Eigen {
+
+/** \cpp11 \returns an expression of the coefficient-wise igamma(\a a, \a x) to the given arrays.
+  *
+  * This function computes the coefficient-wise incomplete gamma function.
+  *
+  * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
+  * or float/double in non c++11 mode, the user has to provide implementations of igammac(T,T) for any scalar
+  * type T to be supported.
+  *
+  * \sa Eigen::igammac(), Eigen::lgamma()
+  */
+template<typename Derived,typename ExponentDerived>
+inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_op<typename Derived::Scalar>, const Derived, const ExponentDerived>
+igamma(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerived>& x)
+{
+  return Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_op<typename Derived::Scalar>, const Derived, const ExponentDerived>(
+    a.derived(),
+    x.derived()
+  );
+}
+
+/** \cpp11 \returns an expression of the coefficient-wise igammac(\a a, \a x) to the given arrays.
+  *
+  * This function computes the coefficient-wise complementary incomplete gamma function.
+  *
+  * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
+  * or float/double in non c++11 mode, the user has to provide implementations of igammac(T,T) for any scalar
+  * type T to be supported.
+  *
+  * \sa Eigen::igamma(), Eigen::lgamma()
+  */
+template<typename Derived,typename ExponentDerived>
+inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igammac_op<typename Derived::Scalar>, const Derived, const ExponentDerived>
+igammac(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerived>& x)
+{
+  return Eigen::CwiseBinaryOp<Eigen::internal::scalar_igammac_op<typename Derived::Scalar>, const Derived, const ExponentDerived>(
+    a.derived(),
+    x.derived()
+  );
+}
+
+/** \cpp11 \returns an expression of the coefficient-wise polygamma(\a n, \a x) to the given arrays.
+  *
+  * It returns the \a n -th derivative of the digamma(psi) evaluated at \c x.
+  *
+  * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
+  * or float/double in non c++11 mode, the user has to provide implementations of polygamma(T,T) for any scalar
+  * type T to be supported.
+  *
+  * \sa Eigen::digamma()
+  */
+// * \warning Be careful with the order of the parameters: x.polygamma(n) is equivalent to polygamma(n,x)
+// * \sa ArrayBase::polygamma()
+template<typename DerivedN,typename DerivedX>
+inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_polygamma_op<typename DerivedX::Scalar>, const DerivedN, const DerivedX>
+polygamma(const Eigen::ArrayBase<DerivedN>& n, const Eigen::ArrayBase<DerivedX>& x)
+{
+  return Eigen::CwiseBinaryOp<Eigen::internal::scalar_polygamma_op<typename DerivedX::Scalar>, const DerivedN, const DerivedX>(
+    n.derived(),
+    x.derived()
+  );
+}
+
+/** \cpp11 \returns an expression of the coefficient-wise betainc(\a x, \a a, \a b) to the given arrays.
+  *
+  * This function computes the regularized incomplete beta function (integral).
+  *
+  * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
+  * or float/double in non c++11 mode, the user has to provide implementations of betainc(T,T,T) for any scalar
+  * type T to be supported.
+  *
+  * \sa Eigen::betainc(), Eigen::lgamma()
+  */
+template<typename ArgADerived, typename ArgBDerived, typename ArgXDerived>
+inline const Eigen::CwiseTernaryOp<Eigen::internal::scalar_betainc_op<typename ArgXDerived::Scalar>, const ArgADerived, const ArgBDerived, const ArgXDerived>
+betainc(const Eigen::ArrayBase<ArgADerived>& a, const Eigen::ArrayBase<ArgBDerived>& b, const Eigen::ArrayBase<ArgXDerived>& x)
+{
+  return Eigen::CwiseTernaryOp<Eigen::internal::scalar_betainc_op<typename ArgXDerived::Scalar>, const ArgADerived, const ArgBDerived, const ArgXDerived>(
+    a.derived(),
+    b.derived(),
+    x.derived()
+  );
+}
+
+
+/** \returns an expression of the coefficient-wise zeta(\a x, \a q) to the given arrays.
+  *
+  * It returns the Riemann zeta function of two arguments \a x and \a q:
+  *
+  * \param x is the exposent, it must be > 1
+  * \param q is the shift, it must be > 0
+  *
+  * \note This function supports only float and double scalar types. To support other scalar types, the user has
+  * to provide implementations of zeta(T,T) for any scalar type T to be supported.
+  *
+  * \sa ArrayBase::zeta()
+  */
+template<typename DerivedX,typename DerivedQ>
+inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_zeta_op<typename DerivedX::Scalar>, const DerivedX, const DerivedQ>
+zeta(const Eigen::ArrayBase<DerivedX>& x, const Eigen::ArrayBase<DerivedQ>& q)
+{
+  return Eigen::CwiseBinaryOp<Eigen::internal::scalar_zeta_op<typename DerivedX::Scalar>, const DerivedX, const DerivedQ>(
+    x.derived(),
+    q.derived()
+  );
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPECIALFUNCTIONS_ARRAYAPI_H
diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h
new file mode 100644
index 000000000..d8f2363be
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h
@@ -0,0 +1,236 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Eugene Brevdo <ebrevdo@gmail.com>
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPECIALFUNCTIONS_FUNCTORS_H
+#define EIGEN_SPECIALFUNCTIONS_FUNCTORS_H
+
+namespace Eigen {
+
+namespace internal {
+
+
+/** \internal
+  * \brief Template functor to compute the incomplete gamma function igamma(a, x)
+  *
+  * \sa class CwiseBinaryOp, Cwise::igamma
+  */
+template<typename Scalar> struct scalar_igamma_op : binary_op_base<Scalar,Scalar>
+{
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_igamma_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& x) const {
+    using numext::igamma; return igamma(a, x);
+  }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& x) const {
+    return internal::pigamma(a, x);
+  }
+};
+template<typename Scalar>
+struct functor_traits<scalar_igamma_op<Scalar> > {
+  enum {
+    // Guesstimate
+    Cost = 20 * NumTraits<Scalar>::MulCost + 10 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasIGamma
+  };
+};
+
+
+/** \internal
+  * \brief Template functor to compute the complementary incomplete gamma function igammac(a, x)
+  *
+  * \sa class CwiseBinaryOp, Cwise::igammac
+  */
+template<typename Scalar> struct scalar_igammac_op : binary_op_base<Scalar,Scalar>
+{
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_igammac_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& x) const {
+    using numext::igammac; return igammac(a, x);
+  }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& x) const
+  {
+    return internal::pigammac(a, x);
+  }
+};
+template<typename Scalar>
+struct functor_traits<scalar_igammac_op<Scalar> > {
+  enum {
+    // Guesstimate
+    Cost = 20 * NumTraits<Scalar>::MulCost + 10 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasIGammac
+  };
+};
+
+
+/** \internal
+  * \brief Template functor to compute the incomplete beta integral betainc(a, b, x)
+  *
+  */
+template<typename Scalar> struct scalar_betainc_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_betainc_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& x, const Scalar& a, const Scalar& b) const {
+    using numext::betainc; return betainc(x, a, b);
+  }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& x, const Packet& a, const Packet& b) const
+  {
+    return internal::pbetainc(x, a, b);
+  }
+};
+template<typename Scalar>
+struct functor_traits<scalar_betainc_op<Scalar> > {
+  enum {
+    // Guesstimate
+    Cost = 400 * NumTraits<Scalar>::MulCost + 400 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBetaInc
+  };
+};
+
+
+/** \internal
+ * \brief Template functor to compute the natural log of the absolute
+ * value of Gamma of a scalar
+ * \sa class CwiseUnaryOp, Cwise::lgamma()
+ */
+template<typename Scalar> struct scalar_lgamma_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_lgamma_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
+    using numext::lgamma; return lgamma(a);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plgamma(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_lgamma_op<Scalar> >
+{
+  enum {
+    // Guesstimate
+    Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasLGamma
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute psi, the derivative of lgamma of a scalar.
+ * \sa class CwiseUnaryOp, Cwise::digamma()
+ */
+template<typename Scalar> struct scalar_digamma_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_digamma_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
+    using numext::digamma; return digamma(a);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pdigamma(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_digamma_op<Scalar> >
+{
+  enum {
+    // Guesstimate
+    Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasDiGamma
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the Riemann Zeta function of two arguments.
+ * \sa class CwiseUnaryOp, Cwise::zeta()
+ */
+template<typename Scalar> struct scalar_zeta_op {
+    EIGEN_EMPTY_STRUCT_CTOR(scalar_zeta_op)
+    EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& x, const Scalar& q) const {
+        using numext::zeta; return zeta(x, q);
+    }
+    typedef typename packet_traits<Scalar>::type Packet;
+    EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& x, const Packet& q) const { return internal::pzeta(x, q); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_zeta_op<Scalar> >
+{
+    enum {
+        // Guesstimate
+        Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
+        PacketAccess = packet_traits<Scalar>::HasZeta
+    };
+};
+
+/** \internal
+ * \brief Template functor to compute the polygamma function.
+ * \sa class CwiseUnaryOp, Cwise::polygamma()
+ */
+template<typename Scalar> struct scalar_polygamma_op {
+    EIGEN_EMPTY_STRUCT_CTOR(scalar_polygamma_op)
+    EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& n, const Scalar& x) const {
+        using numext::polygamma; return polygamma(n, x);
+    }
+    typedef typename packet_traits<Scalar>::type Packet;
+    EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& n, const Packet& x) const { return internal::ppolygamma(n, x); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_polygamma_op<Scalar> >
+{
+    enum {
+        // Guesstimate
+        Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
+        PacketAccess = packet_traits<Scalar>::HasPolygamma
+    };
+};
+
+/** \internal
+ * \brief Template functor to compute the Gauss error function of a
+ * scalar
+ * \sa class CwiseUnaryOp, Cwise::erf()
+ */
+template<typename Scalar> struct scalar_erf_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_erf_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
+    using numext::erf; return erf(a);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::perf(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_erf_op<Scalar> >
+{
+  enum {
+    // Guesstimate
+    Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasErf
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the Complementary Error Function
+ * of a scalar
+ * \sa class CwiseUnaryOp, Cwise::erfc()
+ */
+template<typename Scalar> struct scalar_erfc_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_erfc_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
+    using numext::erfc; return erfc(a);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::perfc(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_erfc_op<Scalar> >
+{
+  enum {
+    // Guesstimate
+    Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasErfc
+  };
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPECIALFUNCTIONS_FUNCTORS_H
diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h
new file mode 100644
index 000000000..553bcda6a
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h
@@ -0,0 +1,47 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPECIALFUNCTIONS_HALF_H
+#define EIGEN_SPECIALFUNCTIONS_HALF_H
+
+namespace Eigen {
+namespace numext {
+
+#if EIGEN_HAS_C99_MATH
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half lgamma(const Eigen::half& a) {
+  return Eigen::half(Eigen::numext::lgamma(static_cast<float>(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half digamma(const Eigen::half& a) {
+  return Eigen::half(Eigen::numext::digamma(static_cast<float>(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half zeta(const Eigen::half& x, const Eigen::half& q) {
+  return Eigen::half(Eigen::numext::zeta(static_cast<float>(x), static_cast<float>(q)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half polygamma(const Eigen::half& n, const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::polygamma(static_cast<float>(n), static_cast<float>(x)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erf(const Eigen::half& a) {
+  return Eigen::half(Eigen::numext::erf(static_cast<float>(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erfc(const Eigen::half& a) {
+  return Eigen::half(Eigen::numext::erfc(static_cast<float>(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igamma(const Eigen::half& a, const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::igamma(static_cast<float>(a), static_cast<float>(x)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igammac(const Eigen::half& a, const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::igammac(static_cast<float>(a), static_cast<float>(x)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half betainc(const Eigen::half& a, const Eigen::half& b, const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::betainc(static_cast<float>(a), static_cast<float>(b), static_cast<float>(x)));
+}
+#endif
+
+}  // end namespace numext
+}  // end namespace Eigen
+
+#endif  // EIGEN_SPECIALFUNCTIONS_HALF_H
diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
new file mode 100644
index 000000000..f524d7137
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
@@ -0,0 +1,1565 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Eugene Brevdo <ebrevdo@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPECIAL_FUNCTIONS_H
+#define EIGEN_SPECIAL_FUNCTIONS_H
+
+namespace Eigen {
+namespace internal {
+
+//  Parts of this code are based on the Cephes Math Library.
+//
+//  Cephes Math Library Release 2.8:  June, 2000
+//  Copyright 1984, 1987, 1992, 2000 by Stephen L. Moshier
+//
+//  Permission has been kindly provided by the original author
+//  to incorporate the Cephes software into the Eigen codebase:
+//
+//    From: Stephen Moshier
+//    To: Eugene Brevdo
+//    Subject: Re: Permission to wrap several cephes functions in Eigen
+//
+//    Hello Eugene,
+//
+//    Thank you for writing.
+//
+//    If your licensing is similar to BSD, the formal way that has been
+//    handled is simply to add a statement to the effect that you are incorporating
+//    the Cephes software by permission of the author.
+//
+//    Good luck with your project,
+//    Steve
+
+namespace cephes {
+
+/* polevl (modified for Eigen)
+ *
+ *      Evaluate polynomial
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * int N;
+ * Scalar x, y, coef[N+1];
+ *
+ * y = polevl<decltype(x), N>( x, coef);
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Evaluates polynomial of degree N:
+ *
+ *                     2          N
+ * y  =  C  + C x + C x  +...+ C x
+ *        0    1     2          N
+ *
+ * Coefficients are stored in reverse order:
+ *
+ * coef[0] = C  , ..., coef[N] = C  .
+ *            N                   0
+ *
+ *  The function p1evl() assumes that coef[N] = 1.0 and is
+ * omitted from the array.  Its calling arguments are
+ * otherwise the same as polevl().
+ *
+ *
+ * The Eigen implementation is templatized.  For best speed, store
+ * coef as a const array (constexpr), e.g.
+ *
+ * const double coef[] = {1.0, 2.0, 3.0, ...};
+ *
+ */
+template <typename Scalar, int N>
+struct polevl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE Scalar run(const Scalar x, const Scalar coef[]) {
+    EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+    return polevl<Scalar, N - 1>::run(x, coef) * x + coef[N];
+  }
+};
+
+template <typename Scalar>
+struct polevl<Scalar, 0> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE Scalar run(const Scalar, const Scalar coef[]) {
+    return coef[0];
+  }
+};
+
+}  // end namespace cephes
+
+/****************************************************************************
+ * Implementation of lgamma, requires C++11/C99                             *
+ ****************************************************************************/
+
+template <typename Scalar>
+struct lgamma_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE Scalar run(const Scalar) {
+    EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return Scalar(0);
+  }
+};
+
+template <typename Scalar>
+struct lgamma_retval {
+  typedef Scalar type;
+};
+
+#if EIGEN_HAS_C99_MATH
+template <>
+struct lgamma_impl<float> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE float run(float x) {
+#if !defined(__CUDA_ARCH__) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
+    int signgam;
+    return ::lgammaf_r(x, &signgam);
+#else
+    return ::lgammaf(x);
+#endif
+  }
+};
+
+template <>
+struct lgamma_impl<double> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE double run(double x) {
+#if !defined(__CUDA_ARCH__) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
+    int signgam;
+    return ::lgamma_r(x, &signgam);
+#else
+    return ::lgamma(x);
+#endif
+  }
+};
+#endif
+
+/****************************************************************************
+ * Implementation of digamma (psi), based on Cephes                         *
+ ****************************************************************************/
+
+template <typename Scalar>
+struct digamma_retval {
+  typedef Scalar type;
+};
+
+/*
+ *
+ * Polynomial evaluation helper for the Psi (digamma) function.
+ *
+ * digamma_impl_maybe_poly::run(s) evaluates the asymptotic Psi expansion for
+ * input Scalar s, assuming s is above 10.0.
+ *
+ * If s is above a certain threshold for the given Scalar type, zero
+ * is returned.  Otherwise the polynomial is evaluated with enough
+ * coefficients for results matching Scalar machine precision.
+ *
+ *
+ */
+template <typename Scalar>
+struct digamma_impl_maybe_poly {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE Scalar run(const Scalar) {
+    EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return Scalar(0);
+  }
+};
+
+
+template <>
+struct digamma_impl_maybe_poly<float> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE float run(const float s) {
+    const float A[] = {
+      -4.16666666666666666667E-3f,
+      3.96825396825396825397E-3f,
+      -8.33333333333333333333E-3f,
+      8.33333333333333333333E-2f
+    };
+
+    float z;
+    if (s < 1.0e8f) {
+      z = 1.0f / (s * s);
+      return z * cephes::polevl<float, 3>::run(z, A);
+    } else return 0.0f;
+  }
+};
+
+template <>
+struct digamma_impl_maybe_poly<double> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE double run(const double s) {
+    const double A[] = {
+      8.33333333333333333333E-2,
+      -2.10927960927960927961E-2,
+      7.57575757575757575758E-3,
+      -4.16666666666666666667E-3,
+      3.96825396825396825397E-3,
+      -8.33333333333333333333E-3,
+      8.33333333333333333333E-2
+    };
+
+    double z;
+    if (s < 1.0e17) {
+      z = 1.0 / (s * s);
+      return z * cephes::polevl<double, 6>::run(z, A);
+    }
+    else return 0.0;
+  }
+};
+
+template <typename Scalar>
+struct digamma_impl {
+  EIGEN_DEVICE_FUNC
+  static Scalar run(Scalar x) {
+    /*
+     *
+     *     Psi (digamma) function (modified for Eigen)
+     *
+     *
+     * SYNOPSIS:
+     *
+     * double x, y, psi();
+     *
+     * y = psi( x );
+     *
+     *
+     * DESCRIPTION:
+     *
+     *              d      -
+     *   psi(x)  =  -- ln | (x)
+     *              dx
+     *
+     * is the logarithmic derivative of the gamma function.
+     * For integer x,
+     *                   n-1
+     *                    -
+     * psi(n) = -EUL  +   >  1/k.
+     *                    -
+     *                   k=1
+     *
+     * If x is negative, it is transformed to a positive argument by the
+     * reflection formula  psi(1-x) = psi(x) + pi cot(pi x).
+     * For general positive x, the argument is made greater than 10
+     * using the recurrence  psi(x+1) = psi(x) + 1/x.
+     * Then the following asymptotic expansion is applied:
+     *
+     *                           inf.   B
+     *                            -      2k
+     * psi(x) = log(x) - 1/2x -   >   -------
+     *                            -        2k
+     *                           k=1   2k x
+     *
+     * where the B2k are Bernoulli numbers.
+     *
+     * ACCURACY (float):
+     *    Relative error (except absolute when |psi| < 1):
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0,30        30000       1.3e-15     1.4e-16
+     *    IEEE      -30,0       40000       1.5e-15     2.2e-16
+     *
+     * ACCURACY (double):
+     *    Absolute error,  relative when |psi| > 1 :
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      -33,0        30000      8.2e-7      1.2e-7
+     *    IEEE      0,33        100000      7.3e-7      7.7e-8
+     *
+     * ERROR MESSAGES:
+     *     message         condition      value returned
+     * psi singularity    x integer <=0      INFINITY
+     */
+
+    Scalar p, q, nz, s, w, y;
+    bool negative = false;
+
+    const Scalar maxnum = NumTraits<Scalar>::infinity();
+    const Scalar m_pi = Scalar(EIGEN_PI);
+
+    const Scalar zero = Scalar(0);
+    const Scalar one = Scalar(1);
+    const Scalar half = Scalar(0.5);
+    nz = zero;
+
+    if (x <= zero) {
+      negative = true;
+      q = x;
+      p = numext::floor(q);
+      if (p == q) {
+        return maxnum;
+      }
+      /* Remove the zeros of tan(m_pi x)
+       * by subtracting the nearest integer from x
+       */
+      nz = q - p;
+      if (nz != half) {
+        if (nz > half) {
+          p += one;
+          nz = q - p;
+        }
+        nz = m_pi / numext::tan(m_pi * nz);
+      }
+      else {
+        nz = zero;
+      }
+      x = one - x;
+    }
+
+    /* use the recurrence psi(x+1) = psi(x) + 1/x. */
+    s = x;
+    w = zero;
+    while (s < Scalar(10)) {
+      w += one / s;
+      s += one;
+    }
+
+    y = digamma_impl_maybe_poly<Scalar>::run(s);
+
+    y = numext::log(s) - (half / s) - y - w;
+
+    return (negative) ? y - nz : y;
+  }
+};
+
+/****************************************************************************
+ * Implementation of erf, requires C++11/C99                                *
+ ****************************************************************************/
+
+template <typename Scalar>
+struct erf_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE Scalar run(const Scalar) {
+    EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return Scalar(0);
+  }
+};
+
+template <typename Scalar>
+struct erf_retval {
+  typedef Scalar type;
+};
+
+#if EIGEN_HAS_C99_MATH
+template <>
+struct erf_impl<float> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE float run(float x) { return ::erff(x); }
+};
+
+template <>
+struct erf_impl<double> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE double run(double x) { return ::erf(x); }
+};
+#endif  // EIGEN_HAS_C99_MATH
+
+/***************************************************************************
+* Implementation of erfc, requires C++11/C99                               *
+****************************************************************************/
+
+template <typename Scalar>
+struct erfc_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE Scalar run(const Scalar) {
+    EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return Scalar(0);
+  }
+};
+
+template <typename Scalar>
+struct erfc_retval {
+  typedef Scalar type;
+};
+
+#if EIGEN_HAS_C99_MATH
+template <>
+struct erfc_impl<float> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE float run(const float x) { return ::erfcf(x); }
+};
+
+template <>
+struct erfc_impl<double> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE double run(const double x) { return ::erfc(x); }
+};
+#endif  // EIGEN_HAS_C99_MATH
+
+/**************************************************************************************************************
+ * Implementation of igammac (complemented incomplete gamma integral), based on Cephes but requires C++11/C99 *
+ **************************************************************************************************************/
+
+template <typename Scalar>
+struct igammac_retval {
+  typedef Scalar type;
+};
+
+// NOTE: cephes_helper is also used to implement zeta
+template <typename Scalar>
+struct cephes_helper {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE Scalar machep() { assert(false && "machep not supported for this type"); return 0.0; }
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE Scalar big() { assert(false && "big not supported for this type"); return 0.0; }
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE Scalar biginv() { assert(false && "biginv not supported for this type"); return 0.0; }
+};
+
+template <>
+struct cephes_helper<float> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE float machep() {
+    return NumTraits<float>::epsilon() / 2;  // 1.0 - machep == 1.0
+  }
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE float big() {
+    // use epsneg (1.0 - epsneg == 1.0)
+    return 1.0f / (NumTraits<float>::epsilon() / 2);
+  }
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE float biginv() {
+    // epsneg
+    return machep();
+  }
+};
+
+template <>
+struct cephes_helper<double> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE double machep() {
+    return NumTraits<double>::epsilon() / 2;  // 1.0 - machep == 1.0
+  }
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE double big() {
+    return 1.0 / NumTraits<double>::epsilon();
+  }
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE double biginv() {
+    // inverse of eps
+    return NumTraits<double>::epsilon();
+  }
+};
+
+#if !EIGEN_HAS_C99_MATH
+
+template <typename Scalar>
+struct igammac_impl {
+  EIGEN_DEVICE_FUNC
+  static Scalar run(Scalar a, Scalar x) {
+    EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return Scalar(0);
+  }
+};
+
+#else
+
+template <typename Scalar> struct igamma_impl;  // predeclare igamma_impl
+
+template <typename Scalar>
+struct igammac_impl {
+  EIGEN_DEVICE_FUNC
+  static Scalar run(Scalar a, Scalar x) {
+    /*  igamc()
+     *
+     *	Incomplete gamma integral (modified for Eigen)
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * double a, x, y, igamc();
+     *
+     * y = igamc( a, x );
+     *
+     * DESCRIPTION:
+     *
+     * The function is defined by
+     *
+     *
+     *  igamc(a,x)   =   1 - igam(a,x)
+     *
+     *                            inf.
+     *                              -
+     *                     1       | |  -t  a-1
+     *               =   -----     |   e   t   dt.
+     *                    -      | |
+     *                   | (a)    -
+     *                             x
+     *
+     *
+     * In this implementation both arguments must be positive.
+     * The integral is evaluated by either a power series or
+     * continued fraction expansion, depending on the relative
+     * values of a and x.
+     *
+     * ACCURACY (float):
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0,30        30000       7.8e-6      5.9e-7
+     *
+     *
+     * ACCURACY (double):
+     *
+     * Tested at random a, x.
+     *                a         x                      Relative error:
+     * arithmetic   domain   domain     # trials      peak         rms
+     *    IEEE     0.5,100   0,100      200000       1.9e-14     1.7e-15
+     *    IEEE     0.01,0.5  0,100      200000       1.4e-13     1.6e-15
+     *
+     */
+    /*
+      Cephes Math Library Release 2.2: June, 1992
+      Copyright 1985, 1987, 1992 by Stephen L. Moshier
+      Direct inquiries to 30 Frost Street, Cambridge, MA 02140
+    */
+    const Scalar zero = 0;
+    const Scalar one = 1;
+    const Scalar nan = NumTraits<Scalar>::quiet_NaN();
+
+    if ((x < zero) || (a <= zero)) {
+      // domain error
+      return nan;
+    }
+
+    if ((x < one) || (x < a)) {
+      /* The checks above ensure that we meet the preconditions for
+       * igamma_impl::Impl(), so call it, rather than igamma_impl::Run().
+       * Calling Run() would also work, but in that case the compiler may not be
+       * able to prove that igammac_impl::Run and igamma_impl::Run are not
+       * mutually recursive.  This leads to worse code, particularly on
+       * platforms like nvptx, where recursion is allowed only begrudgingly.
+       */
+      return (one - igamma_impl<Scalar>::Impl(a, x));
+    }
+
+    return Impl(a, x);
+  }
+
+ private:
+  /* igamma_impl calls igammac_impl::Impl. */
+  friend struct igamma_impl<Scalar>;
+
+  /* Actually computes igamc(a, x).
+   *
+   * Preconditions:
+   *   a > 0
+   *   x >= 1
+   *   x >= a
+   */
+  EIGEN_DEVICE_FUNC static Scalar Impl(Scalar a, Scalar x) {
+    const Scalar zero = 0;
+    const Scalar one = 1;
+    const Scalar two = 2;
+    const Scalar machep = cephes_helper<Scalar>::machep();
+    const Scalar maxlog = numext::log(NumTraits<Scalar>::highest());
+    const Scalar big = cephes_helper<Scalar>::big();
+    const Scalar biginv = cephes_helper<Scalar>::biginv();
+    const Scalar inf = NumTraits<Scalar>::infinity();
+
+    Scalar ans, ax, c, yc, r, t, y, z;
+    Scalar pk, pkm1, pkm2, qk, qkm1, qkm2;
+
+    if (x == inf) return zero;  // std::isinf crashes on CUDA
+
+    /* Compute  x**a * exp(-x) / gamma(a)  */
+    ax = a * numext::log(x) - x - lgamma_impl<Scalar>::run(a);
+    if (ax < -maxlog) {  // underflow
+      return zero;
+    }
+    ax = numext::exp(ax);
+
+    // continued fraction
+    y = one - a;
+    z = x + y + one;
+    c = zero;
+    pkm2 = one;
+    qkm2 = x;
+    pkm1 = x + one;
+    qkm1 = z * x;
+    ans = pkm1 / qkm1;
+
+    while (true) {
+      c += one;
+      y += one;
+      z += two;
+      yc = y * c;
+      pk = pkm1 * z - pkm2 * yc;
+      qk = qkm1 * z - qkm2 * yc;
+      if (qk != zero) {
+        r = pk / qk;
+        t = numext::abs((ans - r) / r);
+        ans = r;
+      } else {
+        t = one;
+      }
+      pkm2 = pkm1;
+      pkm1 = pk;
+      qkm2 = qkm1;
+      qkm1 = qk;
+      if (numext::abs(pk) > big) {
+        pkm2 *= biginv;
+        pkm1 *= biginv;
+        qkm2 *= biginv;
+        qkm1 *= biginv;
+      }
+      if (t <= machep) {
+        break;
+      }
+    }
+
+    return (ans * ax);
+  }
+};
+
+#endif  // EIGEN_HAS_C99_MATH
+
+/************************************************************************************************
+ * Implementation of igamma (incomplete gamma integral), based on Cephes but requires C++11/C99 *
+ ************************************************************************************************/
+
+template <typename Scalar>
+struct igamma_retval {
+  typedef Scalar type;
+};
+
+#if !EIGEN_HAS_C99_MATH
+
+template <typename Scalar>
+struct igamma_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE Scalar run(Scalar a, Scalar x) {
+    EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return Scalar(0);
+  }
+};
+
+#else
+
+template <typename Scalar>
+struct igamma_impl {
+  EIGEN_DEVICE_FUNC
+  static Scalar run(Scalar a, Scalar x) {
+    /*	igam()
+     *	Incomplete gamma integral
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * double a, x, y, igam();
+     *
+     * y = igam( a, x );
+     *
+     * DESCRIPTION:
+     *
+     * The function is defined by
+     *
+     *                           x
+     *                            -
+     *                   1       | |  -t  a-1
+     *  igam(a,x)  =   -----     |   e   t   dt.
+     *                  -      | |
+     *                 | (a)    -
+     *                           0
+     *
+     *
+     * In this implementation both arguments must be positive.
+     * The integral is evaluated by either a power series or
+     * continued fraction expansion, depending on the relative
+     * values of a and x.
+     *
+     * ACCURACY (double):
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0,30       200000       3.6e-14     2.9e-15
+     *    IEEE      0,100      300000       9.9e-14     1.5e-14
+     *
+     *
+     * ACCURACY (float):
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0,30        20000       7.8e-6      5.9e-7
+     *
+     */
+    /*
+      Cephes Math Library Release 2.2: June, 1992
+      Copyright 1985, 1987, 1992 by Stephen L. Moshier
+      Direct inquiries to 30 Frost Street, Cambridge, MA 02140
+    */
+
+
+    /* left tail of incomplete gamma function:
+     *
+     *          inf.      k
+     *   a  -x   -       x
+     *  x  e     >   ----------
+     *           -     -
+     *          k=0   | (a+k+1)
+     *
+     */
+    const Scalar zero = 0;
+    const Scalar one = 1;
+    const Scalar nan = NumTraits<Scalar>::quiet_NaN();
+
+    if (x == zero) return zero;
+
+    if ((x < zero) || (a <= zero)) {  // domain error
+      return nan;
+    }
+
+    if ((x > one) && (x > a)) {
+      /* The checks above ensure that we meet the preconditions for
+       * igammac_impl::Impl(), so call it, rather than igammac_impl::Run().
+       * Calling Run() would also work, but in that case the compiler may not be
+       * able to prove that igammac_impl::Run and igamma_impl::Run are not
+       * mutually recursive.  This leads to worse code, particularly on
+       * platforms like nvptx, where recursion is allowed only begrudgingly.
+       */
+      return (one - igammac_impl<Scalar>::Impl(a, x));
+    }
+
+    return Impl(a, x);
+  }
+
+ private:
+  /* igammac_impl calls igamma_impl::Impl. */
+  friend struct igammac_impl<Scalar>;
+
+  /* Actually computes igam(a, x).
+   *
+   * Preconditions:
+   *   x > 0
+   *   a > 0
+   *   !(x > 1 && x > a)
+   */
+  EIGEN_DEVICE_FUNC static Scalar Impl(Scalar a, Scalar x) {
+    const Scalar zero = 0;
+    const Scalar one = 1;
+    const Scalar machep = cephes_helper<Scalar>::machep();
+    const Scalar maxlog = numext::log(NumTraits<Scalar>::highest());
+
+    Scalar ans, ax, c, r;
+
+    /* Compute  x**a * exp(-x) / gamma(a)  */
+    ax = a * numext::log(x) - x - lgamma_impl<Scalar>::run(a);
+    if (ax < -maxlog) {
+      // underflow
+      return zero;
+    }
+    ax = numext::exp(ax);
+
+    /* power series */
+    r = a;
+    c = one;
+    ans = one;
+
+    while (true) {
+      r += one;
+      c *= x/r;
+      ans += c;
+      if (c/ans <= machep) {
+        break;
+      }
+    }
+
+    return (ans * ax / a);
+  }
+};
+
+#endif  // EIGEN_HAS_C99_MATH
+
+/*****************************************************************************
+ * Implementation of Riemann zeta function of two arguments, based on Cephes *
+ *****************************************************************************/
+
+template <typename Scalar>
+struct zeta_retval {
+    typedef Scalar type;
+};
+
+template <typename Scalar>
+struct zeta_impl_series {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE Scalar run(const Scalar) {
+    EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return Scalar(0);
+  }
+};
+
+template <>
+struct zeta_impl_series<float> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE bool run(float& a, float& b, float& s, const float x, const float machep) {
+    int i = 0;
+    while(i < 9)
+    {
+        i += 1;
+        a += 1.0f;
+        b = numext::pow( a, -x );
+        s += b;
+        if( numext::abs(b/s) < machep )
+            return true;
+    }
+
+    //Return whether we are done
+    return false;
+  }
+};
+
+template <>
+struct zeta_impl_series<double> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE bool run(double& a, double& b, double& s, const double x, const double machep) {
+    int i = 0;
+    while( (i < 9) || (a <= 9.0) )
+    {
+        i += 1;
+        a += 1.0;
+        b = numext::pow( a, -x );
+        s += b;
+        if( numext::abs(b/s) < machep )
+            return true;
+    }
+
+    //Return whether we are done
+    return false;
+  }
+};
+
+template <typename Scalar>
+struct zeta_impl {
+    EIGEN_DEVICE_FUNC
+    static Scalar run(Scalar x, Scalar q) {
+        /*							zeta.c
+         *
+         *	Riemann zeta function of two arguments
+         *
+         *
+         *
+         * SYNOPSIS:
+         *
+         * double x, q, y, zeta();
+         *
+         * y = zeta( x, q );
+         *
+         *
+         *
+         * DESCRIPTION:
+         *
+         *
+         *
+         *                 inf.
+         *                  -        -x
+         *   zeta(x,q)  =   >   (k+q)
+         *                  -
+         *                 k=0
+         *
+         * where x > 1 and q is not a negative integer or zero.
+         * The Euler-Maclaurin summation formula is used to obtain
+         * the expansion
+         *
+         *                n
+         *                -       -x
+         * zeta(x,q)  =   >  (k+q)
+         *                -
+         *               k=1
+         *
+         *           1-x                 inf.  B   x(x+1)...(x+2j)
+         *      (n+q)           1         -     2j
+         *  +  ---------  -  -------  +   >    --------------------
+         *        x-1              x      -                   x+2j+1
+         *                   2(n+q)      j=1       (2j)! (n+q)
+         *
+         * where the B2j are Bernoulli numbers.  Note that (see zetac.c)
+         * zeta(x,1) = zetac(x) + 1.
+         *
+         *
+         *
+         * ACCURACY:
+         *
+         * Relative error for single precision:
+         * arithmetic   domain     # trials      peak         rms
+         *    IEEE      0,25        10000       6.9e-7      1.0e-7
+         *
+         * Large arguments may produce underflow in powf(), in which
+         * case the results are inaccurate.
+         *
+         * REFERENCE:
+         *
+         * Gradshteyn, I. S., and I. M. Ryzhik, Tables of Integrals,
+         * Series, and Products, p. 1073; Academic Press, 1980.
+         *
+         */
+
+        int i;
+        Scalar p, r, a, b, k, s, t, w;
+
+        const Scalar A[] = {
+            Scalar(12.0),
+            Scalar(-720.0),
+            Scalar(30240.0),
+            Scalar(-1209600.0),
+            Scalar(47900160.0),
+            Scalar(-1.8924375803183791606e9), /*1.307674368e12/691*/
+            Scalar(7.47242496e10),
+            Scalar(-2.950130727918164224e12), /*1.067062284288e16/3617*/
+            Scalar(1.1646782814350067249e14), /*5.109094217170944e18/43867*/
+            Scalar(-4.5979787224074726105e15), /*8.028576626982912e20/174611*/
+            Scalar(1.8152105401943546773e17), /*1.5511210043330985984e23/854513*/
+            Scalar(-7.1661652561756670113e18) /*1.6938241367317436694528e27/236364091*/
+            };
+
+        const Scalar maxnum = NumTraits<Scalar>::infinity();
+        const Scalar zero = 0.0, half = 0.5, one = 1.0;
+        const Scalar machep = cephes_helper<Scalar>::machep();
+        const Scalar nan = NumTraits<Scalar>::quiet_NaN();
+
+        if( x == one )
+            return maxnum;
+
+        if( x < one )
+        {
+            return nan;
+        }
+
+        if( q <= zero )
+        {
+            if(q == numext::floor(q))
+            {
+                return maxnum;
+            }
+            p = x;
+            r = numext::floor(p);
+            if (p != r)
+                return nan;
+        }
+
+        /* Permit negative q but continue sum until n+q > +9 .
+         * This case should be handled by a reflection formula.
+         * If q<0 and x is an integer, there is a relation to
+         * the polygamma function.
+         */
+        s = numext::pow( q, -x );
+        a = q;
+        b = zero;
+        // Run the summation in a helper function that is specific to the floating precision
+        if (zeta_impl_series<Scalar>::run(a, b, s, x, machep)) {
+            return s;
+        }
+
+        w = a;
+        s += b*w/(x-one);
+        s -= half * b;
+        a = one;
+        k = zero;
+        for( i=0; i<12; i++ )
+        {
+            a *= x + k;
+            b /= w;
+            t = a*b/A[i];
+            s = s + t;
+            t = numext::abs(t/s);
+            if( t < machep ) {
+              break;
+            }
+            k += one;
+            a *= x + k;
+            b /= w;
+            k += one;
+        }
+        return s;
+  }
+};
+
+/****************************************************************************
+ * Implementation of polygamma function, requires C++11/C99                 *
+ ****************************************************************************/
+
+template <typename Scalar>
+struct polygamma_retval {
+    typedef Scalar type;
+};
+
+#if !EIGEN_HAS_C99_MATH
+
+template <typename Scalar>
+struct polygamma_impl {
+    EIGEN_DEVICE_FUNC
+    static EIGEN_STRONG_INLINE Scalar run(Scalar n, Scalar x) {
+        EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+                            THIS_TYPE_IS_NOT_SUPPORTED);
+        return Scalar(0);
+    }
+};
+
+#else
+
+template <typename Scalar>
+struct polygamma_impl {
+    EIGEN_DEVICE_FUNC
+    static Scalar run(Scalar n, Scalar x) {
+        Scalar zero = 0.0, one = 1.0;
+        Scalar nplus = n + one;
+        const Scalar nan = NumTraits<Scalar>::quiet_NaN();
+
+        // Check that n is an integer
+        if (numext::floor(n) != n) {
+            return nan;
+        }
+        // Just return the digamma function for n = 1
+        else if (n == zero) {
+            return digamma_impl<Scalar>::run(x);
+        }
+        // Use the same implementation as scipy
+        else {
+            Scalar factorial = numext::exp(lgamma_impl<Scalar>::run(nplus));
+            return numext::pow(-one, nplus) * factorial * zeta_impl<Scalar>::run(nplus, x);
+        }
+  }
+};
+
+#endif  // EIGEN_HAS_C99_MATH
+
+/************************************************************************************************
+ * Implementation of betainc (incomplete beta integral), based on Cephes but requires C++11/C99 *
+ ************************************************************************************************/
+
+template <typename Scalar>
+struct betainc_retval {
+  typedef Scalar type;
+};
+
+#if !EIGEN_HAS_C99_MATH
+
+template <typename Scalar>
+struct betainc_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE Scalar run(Scalar a, Scalar b, Scalar x) {
+    EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return Scalar(0);
+  }
+};
+
+#else
+
+template <typename Scalar>
+struct betainc_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE Scalar run(Scalar, Scalar, Scalar) {
+    /*	betaincf.c
+     *
+     *	Incomplete beta integral
+     *
+     *
+     * SYNOPSIS:
+     *
+     * float a, b, x, y, betaincf();
+     *
+     * y = betaincf( a, b, x );
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns incomplete beta integral of the arguments, evaluated
+     * from zero to x.  The function is defined as
+     *
+     *                  x
+     *     -            -
+     *    | (a+b)      | |  a-1     b-1
+     *  -----------    |   t   (1-t)   dt.
+     *   -     -     | |
+     *  | (a) | (b)   -
+     *                 0
+     *
+     * The domain of definition is 0 <= x <= 1.  In this
+     * implementation a and b are restricted to positive values.
+     * The integral from x to 1 may be obtained by the symmetry
+     * relation
+     *
+     *    1 - betainc( a, b, x )  =  betainc( b, a, 1-x ).
+     *
+     * The integral is evaluated by a continued fraction expansion.
+     * If a < 1, the function calls itself recursively after a
+     * transformation to increase a to a+1.
+     *
+     * ACCURACY (float):
+     *
+     * Tested at random points (a,b,x) with a and b in the indicated
+     * interval and x between 0 and 1.
+     *
+     * arithmetic   domain     # trials      peak         rms
+     * Relative error:
+     *    IEEE       0,30       10000       3.7e-5      5.1e-6
+     *    IEEE       0,100      10000       1.7e-4      2.5e-5
+     * The useful domain for relative error is limited by underflow
+     * of the single precision exponential function.
+     * Absolute error:
+     *    IEEE       0,30      100000       2.2e-5      9.6e-7
+     *    IEEE       0,100      10000       6.5e-5      3.7e-6
+     *
+     * Larger errors may occur for extreme ratios of a and b.
+     *
+     * ACCURACY (double):
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0,5         10000       6.9e-15     4.5e-16
+     *    IEEE      0,85       250000       2.2e-13     1.7e-14
+     *    IEEE      0,1000      30000       5.3e-12     6.3e-13
+     *    IEEE      0,10000    250000       9.3e-11     7.1e-12
+     *    IEEE      0,100000    10000       8.7e-10     4.8e-11
+     * Outputs smaller than the IEEE gradual underflow threshold
+     * were excluded from these statistics.
+     *
+     * ERROR MESSAGES:
+     *   message         condition      value returned
+     * incbet domain      x<0, x>1          nan
+     * incbet underflow                     nan
+     */
+
+    EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return Scalar(0);
+  }
+};
+
+/* Continued fraction expansion #1 for incomplete beta integral (small_branch = True)
+ * Continued fraction expansion #2 for incomplete beta integral (small_branch = False)
+ */
+template <typename Scalar>
+struct incbeta_cfe {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE Scalar run(Scalar a, Scalar b, Scalar x, bool small_branch) {
+    EIGEN_STATIC_ASSERT((internal::is_same<Scalar, float>::value ||
+                         internal::is_same<Scalar, double>::value),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    const Scalar big = cephes_helper<Scalar>::big();
+    const Scalar machep = cephes_helper<Scalar>::machep();
+    const Scalar biginv = cephes_helper<Scalar>::biginv();
+
+    const Scalar zero = 0;
+    const Scalar one = 1;
+    const Scalar two = 2;
+
+    Scalar xk, pk, pkm1, pkm2, qk, qkm1, qkm2;
+    Scalar k1, k2, k3, k4, k5, k6, k7, k8, k26update;
+    Scalar ans;
+    int n;
+
+    const int num_iters = (internal::is_same<Scalar, float>::value) ? 100 : 300;
+    const Scalar thresh =
+        (internal::is_same<Scalar, float>::value) ? machep : Scalar(3) * machep;
+    Scalar r = (internal::is_same<Scalar, float>::value) ? zero : one;
+
+    if (small_branch) {
+      k1 = a;
+      k2 = a + b;
+      k3 = a;
+      k4 = a + one;
+      k5 = one;
+      k6 = b - one;
+      k7 = k4;
+      k8 = a + two;
+      k26update = one;
+    } else {
+      k1 = a;
+      k2 = b - one;
+      k3 = a;
+      k4 = a + one;
+      k5 = one;
+      k6 = a + b;
+      k7 = a + one;
+      k8 = a + two;
+      k26update = -one;
+      x = x / (one - x);
+    }
+
+    pkm2 = zero;
+    qkm2 = one;
+    pkm1 = one;
+    qkm1 = one;
+    ans = one;
+    n = 0;
+
+    do {
+      xk = -(x * k1 * k2) / (k3 * k4);
+      pk = pkm1 + pkm2 * xk;
+      qk = qkm1 + qkm2 * xk;
+      pkm2 = pkm1;
+      pkm1 = pk;
+      qkm2 = qkm1;
+      qkm1 = qk;
+
+      xk = (x * k5 * k6) / (k7 * k8);
+      pk = pkm1 + pkm2 * xk;
+      qk = qkm1 + qkm2 * xk;
+      pkm2 = pkm1;
+      pkm1 = pk;
+      qkm2 = qkm1;
+      qkm1 = qk;
+
+      if (qk != zero) {
+        r = pk / qk;
+        if (numext::abs(ans - r) < numext::abs(r) * thresh) {
+          return r;
+        }
+        ans = r;
+      }
+
+      k1 += one;
+      k2 += k26update;
+      k3 += two;
+      k4 += two;
+      k5 += one;
+      k6 -= k26update;
+      k7 += two;
+      k8 += two;
+
+      if ((numext::abs(qk) + numext::abs(pk)) > big) {
+        pkm2 *= biginv;
+        pkm1 *= biginv;
+        qkm2 *= biginv;
+        qkm1 *= biginv;
+      }
+      if ((numext::abs(qk) < biginv) || (numext::abs(pk) < biginv)) {
+        pkm2 *= big;
+        pkm1 *= big;
+        qkm2 *= big;
+        qkm1 *= big;
+      }
+    } while (++n < num_iters);
+
+    return ans;
+  }
+};
+
+/* Helper functions depending on the Scalar type */
+template <typename Scalar>
+struct betainc_helper {};
+
+template <>
+struct betainc_helper<float> {
+  /* Core implementation, assumes a large (> 1.0) */
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE float incbsa(float aa, float bb,
+                                                            float xx) {
+    float ans, a, b, t, x, onemx;
+    bool reversed_a_b = false;
+
+    onemx = 1.0f - xx;
+
+    /* see if x is greater than the mean */
+    if (xx > (aa / (aa + bb))) {
+      reversed_a_b = true;
+      a = bb;
+      b = aa;
+      t = xx;
+      x = onemx;
+    } else {
+      a = aa;
+      b = bb;
+      t = onemx;
+      x = xx;
+    }
+
+    /* Choose expansion for optimal convergence */
+    if (b > 10.0f) {
+      if (numext::abs(b * x / a) < 0.3f) {
+        t = betainc_helper<float>::incbps(a, b, x);
+        if (reversed_a_b) t = 1.0f - t;
+        return t;
+      }
+    }
+
+    ans = x * (a + b - 2.0f) / (a - 1.0f);
+    if (ans < 1.0f) {
+      ans = incbeta_cfe<float>::run(a, b, x, true /* small_branch */);
+      t = b * numext::log(t);
+    } else {
+      ans = incbeta_cfe<float>::run(a, b, x, false /* small_branch */);
+      t = (b - 1.0f) * numext::log(t);
+    }
+
+    t += a * numext::log(x) + lgamma_impl<float>::run(a + b) -
+         lgamma_impl<float>::run(a) - lgamma_impl<float>::run(b);
+    t += numext::log(ans / a);
+    t = numext::exp(t);
+
+    if (reversed_a_b) t = 1.0f - t;
+    return t;
+  }
+
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE float incbps(float a, float b, float x) {
+    float t, u, y, s;
+    const float machep = cephes_helper<float>::machep();
+
+    y = a * numext::log(x) + (b - 1.0f) * numext::log1p(-x) - numext::log(a);
+    y -= lgamma_impl<float>::run(a) + lgamma_impl<float>::run(b);
+    y += lgamma_impl<float>::run(a + b);
+
+    t = x / (1.0f - x);
+    s = 0.0f;
+    u = 1.0f;
+    do {
+      b -= 1.0f;
+      if (b == 0.0f) {
+        break;
+      }
+      a += 1.0f;
+      u *= t * b / a;
+      s += u;
+    } while (numext::abs(u) > machep);
+
+    return numext::exp(y) * (1.0f + s);
+  }
+};
+
+template <>
+struct betainc_impl<float> {
+  EIGEN_DEVICE_FUNC
+  static float run(float a, float b, float x) {
+    const float nan = NumTraits<float>::quiet_NaN();
+    float ans, t;
+
+    if (a <= 0.0f) return nan;
+    if (b <= 0.0f) return nan;
+    if ((x <= 0.0f) || (x >= 1.0f)) {
+      if (x == 0.0f) return 0.0f;
+      if (x == 1.0f) return 1.0f;
+      // mtherr("betaincf", DOMAIN);
+      return nan;
+    }
+
+    /* transformation for small aa */
+    if (a <= 1.0f) {
+      ans = betainc_helper<float>::incbsa(a + 1.0f, b, x);
+      t = a * numext::log(x) + b * numext::log1p(-x) +
+          lgamma_impl<float>::run(a + b) - lgamma_impl<float>::run(a + 1.0f) -
+          lgamma_impl<float>::run(b);
+      return (ans + numext::exp(t));
+    } else {
+      return betainc_helper<float>::incbsa(a, b, x);
+    }
+  }
+};
+
+template <>
+struct betainc_helper<double> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE double incbps(double a, double b, double x) {
+    const double machep = cephes_helper<double>::machep();
+
+    double s, t, u, v, n, t1, z, ai;
+
+    ai = 1.0 / a;
+    u = (1.0 - b) * x;
+    v = u / (a + 1.0);
+    t1 = v;
+    t = u;
+    n = 2.0;
+    s = 0.0;
+    z = machep * ai;
+    while (numext::abs(v) > z) {
+      u = (n - b) * x / n;
+      t *= u;
+      v = t / (a + n);
+      s += v;
+      n += 1.0;
+    }
+    s += t1;
+    s += ai;
+
+    u = a * numext::log(x);
+    // TODO: gamma() is not directly implemented in Eigen.
+    /*
+    if ((a + b) < maxgam && numext::abs(u) < maxlog) {
+      t = gamma(a + b) / (gamma(a) * gamma(b));
+      s = s * t * pow(x, a);
+    } else {
+    */
+    t = lgamma_impl<double>::run(a + b) - lgamma_impl<double>::run(a) -
+        lgamma_impl<double>::run(b) + u + numext::log(s);
+    return s = numext::exp(t);
+  }
+};
+
+template <>
+struct betainc_impl<double> {
+  EIGEN_DEVICE_FUNC
+  static double run(double aa, double bb, double xx) {
+    const double nan = NumTraits<double>::quiet_NaN();
+    const double machep = cephes_helper<double>::machep();
+    // const double maxgam = 171.624376956302725;
+
+    double a, b, t, x, xc, w, y;
+    bool reversed_a_b = false;
+
+    if (aa <= 0.0 || bb <= 0.0) {
+      return nan;  // goto domerr;
+    }
+
+    if ((xx <= 0.0) || (xx >= 1.0)) {
+      if (xx == 0.0) return (0.0);
+      if (xx == 1.0) return (1.0);
+      // mtherr("incbet", DOMAIN);
+      return nan;
+    }
+
+    if ((bb * xx) <= 1.0 && xx <= 0.95) {
+      return betainc_helper<double>::incbps(aa, bb, xx);
+    }
+
+    w = 1.0 - xx;
+
+    /* Reverse a and b if x is greater than the mean. */
+    if (xx > (aa / (aa + bb))) {
+      reversed_a_b = true;
+      a = bb;
+      b = aa;
+      xc = xx;
+      x = w;
+    } else {
+      a = aa;
+      b = bb;
+      xc = w;
+      x = xx;
+    }
+
+    if (reversed_a_b && (b * x) <= 1.0 && x <= 0.95) {
+      t = betainc_helper<double>::incbps(a, b, x);
+      if (t <= machep) {
+        t = 1.0 - machep;
+      } else {
+        t = 1.0 - t;
+      }
+      return t;
+    }
+
+    /* Choose expansion for better convergence. */
+    y = x * (a + b - 2.0) - (a - 1.0);
+    if (y < 0.0) {
+      w = incbeta_cfe<double>::run(a, b, x, true /* small_branch */);
+    } else {
+      w = incbeta_cfe<double>::run(a, b, x, false /* small_branch */) / xc;
+    }
+
+    /* Multiply w by the factor
+         a      b   _             _     _
+        x  (1-x)   | (a+b) / ( a | (a) | (b) ) .   */
+
+    y = a * numext::log(x);
+    t = b * numext::log(xc);
+    // TODO: gamma is not directly implemented in Eigen.
+    /*
+    if ((a + b) < maxgam && numext::abs(y) < maxlog && numext::abs(t) < maxlog)
+    {
+      t = pow(xc, b);
+      t *= pow(x, a);
+      t /= a;
+      t *= w;
+      t *= gamma(a + b) / (gamma(a) * gamma(b));
+    } else {
+    */
+    /* Resort to logarithms.  */
+    y += t + lgamma_impl<double>::run(a + b) - lgamma_impl<double>::run(a) -
+         lgamma_impl<double>::run(b);
+    y += numext::log(w / a);
+    t = numext::exp(y);
+
+    /* } */
+    // done:
+
+    if (reversed_a_b) {
+      if (t <= machep) {
+        t = 1.0 - machep;
+      } else {
+        t = 1.0 - t;
+      }
+    }
+    return t;
+  }
+};
+
+#endif  // EIGEN_HAS_C99_MATH
+
+}  // end namespace internal
+
+namespace numext {
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(lgamma, Scalar)
+    lgamma(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(lgamma, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(digamma, Scalar)
+    digamma(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(digamma, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(zeta, Scalar)
+zeta(const Scalar& x, const Scalar& q) {
+    return EIGEN_MATHFUNC_IMPL(zeta, Scalar)::run(x, q);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(polygamma, Scalar)
+polygamma(const Scalar& n, const Scalar& x) {
+    return EIGEN_MATHFUNC_IMPL(polygamma, Scalar)::run(n, x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(erf, Scalar)
+    erf(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(erf, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(erfc, Scalar)
+    erfc(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(erfc, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(igamma, Scalar)
+    igamma(const Scalar& a, const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(igamma, Scalar)::run(a, x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(igammac, Scalar)
+    igammac(const Scalar& a, const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(igammac, Scalar)::run(a, x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(betainc, Scalar)
+    betainc(const Scalar& a, const Scalar& b, const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(betainc, Scalar)::run(a, b, x);
+}
+
+}  // end namespace numext
+
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_SPECIAL_FUNCTIONS_H
diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h
new file mode 100644
index 000000000..46d60d323
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h
@@ -0,0 +1,58 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPECIALFUNCTIONS_PACKETMATH_H
+#define EIGEN_SPECIALFUNCTIONS_PACKETMATH_H
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal \returns the ln(|gamma(\a a)|) (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet plgamma(const Packet& a) { using numext::lgamma; return lgamma(a); }
+
+/** \internal \returns the derivative of lgamma, psi(\a a) (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pdigamma(const Packet& a) { using numext::digamma; return digamma(a); }
+
+/** \internal \returns the zeta function of two arguments (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pzeta(const Packet& x, const Packet& q) { using numext::zeta; return zeta(x, q); }
+
+/** \internal \returns the polygamma function (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet ppolygamma(const Packet& n, const Packet& x) { using numext::polygamma; return polygamma(n, x); }
+
+/** \internal \returns the erf(\a a) (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet perf(const Packet& a) { using numext::erf; return erf(a); }
+
+/** \internal \returns the erfc(\a a) (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet perfc(const Packet& a) { using numext::erfc; return erfc(a); }
+
+/** \internal \returns the incomplete gamma function igamma(\a a, \a x) */
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+Packet pigamma(const Packet& a, const Packet& x) { using numext::igamma; return igamma(a, x); }
+
+/** \internal \returns the complementary incomplete gamma function igammac(\a a, \a x) */
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+Packet pigammac(const Packet& a, const Packet& x) { using numext::igammac; return igammac(a, x); }
+
+/** \internal \returns the complementary incomplete gamma function betainc(\a a, \a b, \a x) */
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+Packet pbetainc(const Packet& a, const Packet& b,const Packet& x) { using numext::betainc; return betainc(a, b, x); }
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPECIALFUNCTIONS_PACKETMATH_H
+
diff --git a/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h b/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h
new file mode 100644
index 000000000..ec4fa8448
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h
@@ -0,0 +1,165 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CUDA_SPECIALFUNCTIONS_H
+#define EIGEN_CUDA_SPECIALFUNCTIONS_H
+
+namespace Eigen {
+
+namespace internal {
+
+// Make sure this is only available when targeting a GPU: we don't want to
+// introduce conflicts between these packet_traits definitions and the ones
+// we'll use on the host side (SSE, AVX, ...)
+#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 plgamma<float4>(const float4& a)
+{
+  return make_float4(lgammaf(a.x), lgammaf(a.y), lgammaf(a.z), lgammaf(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 plgamma<double2>(const double2& a)
+{
+  using numext::lgamma;
+  return make_double2(lgamma(a.x), lgamma(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pdigamma<float4>(const float4& a)
+{
+  using numext::digamma;
+  return make_float4(digamma(a.x), digamma(a.y), digamma(a.z), digamma(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pdigamma<double2>(const double2& a)
+{
+  using numext::digamma;
+  return make_double2(digamma(a.x), digamma(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pzeta<float4>(const float4& x, const float4& q)
+{
+    using numext::zeta;
+    return make_float4(zeta(x.x, q.x), zeta(x.y, q.y), zeta(x.z, q.z), zeta(x.w, q.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pzeta<double2>(const double2& x, const double2& q)
+{
+    using numext::zeta;
+    return make_double2(zeta(x.x, q.x), zeta(x.y, q.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 ppolygamma<float4>(const float4& n, const float4& x)
+{
+    using numext::polygamma;
+    return make_float4(polygamma(n.x, x.x), polygamma(n.y, x.y), polygamma(n.z, x.z), polygamma(n.w, x.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 ppolygamma<double2>(const double2& n, const double2& x)
+{
+    using numext::polygamma;
+    return make_double2(polygamma(n.x, x.x), polygamma(n.y, x.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 perf<float4>(const float4& a)
+{
+  return make_float4(erff(a.x), erff(a.y), erff(a.z), erff(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 perf<double2>(const double2& a)
+{
+  using numext::erf;
+  return make_double2(erf(a.x), erf(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 perfc<float4>(const float4& a)
+{
+  using numext::erfc;
+  return make_float4(erfc(a.x), erfc(a.y), erfc(a.z), erfc(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 perfc<double2>(const double2& a)
+{
+  using numext::erfc;
+  return make_double2(erfc(a.x), erfc(a.y));
+}
+
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pigamma<float4>(const float4& a, const float4& x)
+{
+  using numext::igamma;
+  return make_float4(
+      igamma(a.x, x.x),
+      igamma(a.y, x.y),
+      igamma(a.z, x.z),
+      igamma(a.w, x.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pigamma<double2>(const double2& a, const double2& x)
+{
+  using numext::igamma;
+  return make_double2(igamma(a.x, x.x), igamma(a.y, x.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pigammac<float4>(const float4& a, const float4& x)
+{
+  using numext::igammac;
+  return make_float4(
+      igammac(a.x, x.x),
+      igammac(a.y, x.y),
+      igammac(a.z, x.z),
+      igammac(a.w, x.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pigammac<double2>(const double2& a, const double2& x)
+{
+  using numext::igammac;
+  return make_double2(igammac(a.x, x.x), igammac(a.y, x.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pbetainc<float4>(const float4& a, const float4& b, const float4& x)
+{
+  using numext::betainc;
+  return make_float4(
+      betainc(a.x, b.x, x.x),
+      betainc(a.y, b.y, x.y),
+      betainc(a.z, b.z, x.z),
+      betainc(a.w, b.w, x.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pbetainc<double2>(const double2& a, const double2& b, const double2& x)
+{
+  using numext::betainc;
+  return make_double2(betainc(a.x, b.x, x.x), betainc(a.y, b.y, x.y));
+}
+
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_CUDA_SPECIALFUNCTIONS_H
diff --git a/unsupported/Eigen/src/Splines/CMakeLists.txt b/unsupported/Eigen/src/Splines/CMakeLists.txt
deleted file mode 100644
index 55c6271e9..000000000
--- a/unsupported/Eigen/src/Splines/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_Splines_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_Splines_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/Splines COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/src/Splines/Spline.h b/unsupported/Eigen/src/Splines/Spline.h
index 771f10432..627f6e482 100644
--- a/unsupported/Eigen/src/Splines/Spline.h
+++ b/unsupported/Eigen/src/Splines/Spline.h
@@ -44,9 +44,15 @@ namespace Eigen
     
     /** \brief The data type used to store knot vectors. */
     typedef typename SplineTraits<Spline>::KnotVectorType KnotVectorType;
+
+    /** \brief The data type used to store parameter vectors. */
+    typedef typename SplineTraits<Spline>::ParameterVectorType ParameterVectorType;
     
     /** \brief The data type used to store non-zero basis functions. */
     typedef typename SplineTraits<Spline>::BasisVectorType BasisVectorType;
+
+    /** \brief The data type used to store the values of the basis function derivatives. */
+    typedef typename SplineTraits<Spline>::BasisDerivativeType BasisDerivativeType;
     
     /** \brief The data type representing the spline's control points. */
     typedef typename SplineTraits<Spline>::ControlPointVectorType ControlPointVectorType;
@@ -57,7 +63,7 @@ namespace Eigen
     **/
     Spline() 
     : m_knots(1, (Degree==Dynamic ? 2 : 2*Degree+2))
-    , m_ctrls(ControlPointVectorType::Zero(2,(Degree==Dynamic ? 1 : Degree+1))) 
+    , m_ctrls(ControlPointVectorType::Zero(Dimension,(Degree==Dynamic ? 1 : Degree+1))) 
     {
       // in theory this code can go to the initializer list but it will get pretty
       // much unreadable ...
@@ -88,7 +94,7 @@ namespace Eigen
     const KnotVectorType& knots() const { return m_knots; }
     
     /**
-     * \brief Returns the knots of the underlying spline.
+     * \brief Returns the ctrls of the underlying spline.
      **/    
     const ControlPointVectorType& ctrls() const { return m_ctrls; }
 
@@ -203,10 +209,25 @@ namespace Eigen
      **/
     static BasisVectorType BasisFunctions(Scalar u, DenseIndex degree, const KnotVectorType& knots);
 
+    /**
+     * \copydoc Spline::basisFunctionDerivatives
+     * \param degree The degree of the underlying spline
+     * \param knots The underlying spline's knot vector.
+     **/    
+    static BasisDerivativeType BasisFunctionDerivatives(
+      const Scalar u, const DenseIndex order, const DenseIndex degree, const KnotVectorType& knots);
 
   private:
     KnotVectorType m_knots; /*!< Knot vector. */
     ControlPointVectorType  m_ctrls; /*!< Control points. */
+
+    template <typename DerivativeType>
+    static void BasisFunctionDerivativesImpl(
+      const typename Spline<_Scalar, _Dim, _Degree>::Scalar u,
+      const DenseIndex order,
+      const DenseIndex p, 
+      const typename Spline<_Scalar, _Dim, _Degree>::KnotVectorType& U,
+      DerivativeType& N_);
   };
 
   template <typename _Scalar, int _Dim, int _Degree>
@@ -345,20 +366,24 @@ namespace Eigen
   }
 
   /* --------------------------------------------------------------------------------------------- */
-
-  template <typename SplineType, typename DerivativeType>
-  void basisFunctionDerivativesImpl(const SplineType& spline, typename SplineType::Scalar u, DenseIndex order, DerivativeType& N_)
+  
+  
+  template <typename _Scalar, int _Dim, int _Degree>
+  template <typename DerivativeType>
+  void Spline<_Scalar, _Dim, _Degree>::BasisFunctionDerivativesImpl(
+    const typename Spline<_Scalar, _Dim, _Degree>::Scalar u,
+    const DenseIndex order,
+    const DenseIndex p, 
+    const typename Spline<_Scalar, _Dim, _Degree>::KnotVectorType& U,
+    DerivativeType& N_)
   {
+    typedef Spline<_Scalar, _Dim, _Degree> SplineType;
     enum { Order = SplineTraits<SplineType>::OrderAtCompileTime };
 
     typedef typename SplineTraits<SplineType>::Scalar Scalar;
     typedef typename SplineTraits<SplineType>::BasisVectorType BasisVectorType;
-    typedef typename SplineTraits<SplineType>::KnotVectorType KnotVectorType;
-
-    const KnotVectorType& U = spline.knots();
-
-    const DenseIndex p = spline.degree();
-    const DenseIndex span = spline.span(u);
+  
+    const DenseIndex span = SplineType::Span(u, p, U);
 
     const DenseIndex n = (std::min)(p, order);
 
@@ -369,7 +394,7 @@ namespace Eigen
 
     Matrix<Scalar,Order,Order> ndu(p+1,p+1);
 
-    double saved, temp;
+    Scalar saved, temp; // FIXME These were double instead of Scalar. Was there a reason for that?
 
     ndu(0,0) = 1.0;
 
@@ -408,7 +433,7 @@ namespace Eigen
       // Compute the k-th derivative
       for (DenseIndex k=1; k<=static_cast<DenseIndex>(n); ++k)
       {
-        double d = 0.0;
+        Scalar d = 0.0;
         DenseIndex rk,pk,j1,j2;
         rk = r-k; pk = p-k;
 
@@ -446,7 +471,7 @@ namespace Eigen
     r = p;
     for (DenseIndex k=1; k<=static_cast<DenseIndex>(n); ++k)
     {
-      for (DenseIndex j=p; j>=0; --j) N_(k,j) *= r;
+      for (j=p; j>=0; --j) N_(k,j) *= r;
       r *= p-k;
     }
   }
@@ -455,8 +480,8 @@ namespace Eigen
   typename SplineTraits< Spline<_Scalar, _Dim, _Degree> >::BasisDerivativeType
     Spline<_Scalar, _Dim, _Degree>::basisFunctionDerivatives(Scalar u, DenseIndex order) const
   {
-    typename SplineTraits< Spline >::BasisDerivativeType der;
-    basisFunctionDerivativesImpl(*this, u, order, der);
+    typename SplineTraits<Spline<_Scalar, _Dim, _Degree> >::BasisDerivativeType der;
+    BasisFunctionDerivativesImpl(u, order, degree(), knots(), der);
     return der;
   }
 
@@ -465,8 +490,21 @@ namespace Eigen
   typename SplineTraits< Spline<_Scalar, _Dim, _Degree>, DerivativeOrder >::BasisDerivativeType
     Spline<_Scalar, _Dim, _Degree>::basisFunctionDerivatives(Scalar u, DenseIndex order) const
   {
-    typename SplineTraits< Spline, DerivativeOrder >::BasisDerivativeType der;
-    basisFunctionDerivativesImpl(*this, u, order, der);
+    typename SplineTraits< Spline<_Scalar, _Dim, _Degree>, DerivativeOrder >::BasisDerivativeType der;
+    BasisFunctionDerivativesImpl(u, order, degree(), knots(), der);
+    return der;
+  }
+
+  template <typename _Scalar, int _Dim, int _Degree>
+  typename SplineTraits<Spline<_Scalar, _Dim, _Degree> >::BasisDerivativeType
+  Spline<_Scalar, _Dim, _Degree>::BasisFunctionDerivatives(
+    const typename Spline<_Scalar, _Dim, _Degree>::Scalar u,
+    const DenseIndex order,
+    const DenseIndex degree,
+    const typename Spline<_Scalar, _Dim, _Degree>::KnotVectorType& knots)
+  {
+    typename SplineTraits<Spline>::BasisDerivativeType der;
+    BasisFunctionDerivativesImpl(u, order, degree, knots, der);
     return der;
   }
 }
diff --git a/unsupported/Eigen/src/Splines/SplineFitting.h b/unsupported/Eigen/src/Splines/SplineFitting.h
index 0265d532c..c761a9b3d 100644
--- a/unsupported/Eigen/src/Splines/SplineFitting.h
+++ b/unsupported/Eigen/src/Splines/SplineFitting.h
@@ -10,10 +10,14 @@
 #ifndef EIGEN_SPLINE_FITTING_H
 #define EIGEN_SPLINE_FITTING_H
 
+#include <algorithm>
+#include <functional>
 #include <numeric>
+#include <vector>
 
 #include "SplineFwd.h"
 
+#include <Eigen/LU>
 #include <Eigen/QR>
 
 namespace Eigen
@@ -50,6 +54,129 @@ namespace Eigen
   }
 
   /**
+   * \brief Computes knot averages when derivative constraints are present.
+   * Note that this is a technical interpretation of the referenced article
+   * since the algorithm contained therein is incorrect as written.
+   * \ingroup Splines_Module
+   *
+   * \param[in] parameters The parameters at which the interpolation B-Spline
+   *            will intersect the given interpolation points. The parameters
+   *            are assumed to be a non-decreasing sequence.
+   * \param[in] degree The degree of the interpolating B-Spline. This must be
+   *            greater than zero.
+   * \param[in] derivativeIndices The indices corresponding to parameters at
+   *            which there are derivative constraints. The indices are assumed
+   *            to be a non-decreasing sequence.
+   * \param[out] knots The calculated knot vector. These will be returned as a
+   *             non-decreasing sequence
+   *
+   * \sa Les A. Piegl, Khairan Rajab, Volha Smarodzinana. 2008.
+   * Curve interpolation with directional constraints for engineering design. 
+   * Engineering with Computers
+   **/
+  template <typename KnotVectorType, typename ParameterVectorType, typename IndexArray>
+  void KnotAveragingWithDerivatives(const ParameterVectorType& parameters,
+                                    const unsigned int degree,
+                                    const IndexArray& derivativeIndices,
+                                    KnotVectorType& knots)
+  {
+    typedef typename ParameterVectorType::Scalar Scalar;
+
+    DenseIndex numParameters = parameters.size();
+    DenseIndex numDerivatives = derivativeIndices.size();
+
+    if (numDerivatives < 1)
+    {
+      KnotAveraging(parameters, degree, knots);
+      return;
+    }
+
+    DenseIndex startIndex;
+    DenseIndex endIndex;
+  
+    DenseIndex numInternalDerivatives = numDerivatives;
+    
+    if (derivativeIndices[0] == 0)
+    {
+      startIndex = 0;
+      --numInternalDerivatives;
+    }
+    else
+    {
+      startIndex = 1;
+    }
+    if (derivativeIndices[numDerivatives - 1] == numParameters - 1)
+    {
+      endIndex = numParameters - degree;
+      --numInternalDerivatives;
+    }
+    else
+    {
+      endIndex = numParameters - degree - 1;
+    }
+
+    // There are (endIndex - startIndex + 1) knots obtained from the averaging
+    // and 2 for the first and last parameters.
+    DenseIndex numAverageKnots = endIndex - startIndex + 3;
+    KnotVectorType averageKnots(numAverageKnots);
+    averageKnots[0] = parameters[0];
+
+    int newKnotIndex = 0;
+    for (DenseIndex i = startIndex; i <= endIndex; ++i)
+      averageKnots[++newKnotIndex] = parameters.segment(i, degree).mean();
+    averageKnots[++newKnotIndex] = parameters[numParameters - 1];
+
+    newKnotIndex = -1;
+  
+    ParameterVectorType temporaryParameters(numParameters + 1);
+    KnotVectorType derivativeKnots(numInternalDerivatives);
+    for (DenseIndex i = 0; i < numAverageKnots - 1; ++i)
+    {
+      temporaryParameters[0] = averageKnots[i];
+      ParameterVectorType parameterIndices(numParameters);
+      int temporaryParameterIndex = 1;
+      for (DenseIndex j = 0; j < numParameters; ++j)
+      {
+        Scalar parameter = parameters[j];
+        if (parameter >= averageKnots[i] && parameter < averageKnots[i + 1])
+        {
+          parameterIndices[temporaryParameterIndex] = j;
+          temporaryParameters[temporaryParameterIndex++] = parameter;
+        }
+      }
+      temporaryParameters[temporaryParameterIndex] = averageKnots[i + 1];
+
+      for (int j = 0; j <= temporaryParameterIndex - 2; ++j)
+      {
+        for (DenseIndex k = 0; k < derivativeIndices.size(); ++k)
+        {
+          if (parameterIndices[j + 1] == derivativeIndices[k]
+              && parameterIndices[j + 1] != 0
+              && parameterIndices[j + 1] != numParameters - 1)
+          {
+            derivativeKnots[++newKnotIndex] = temporaryParameters.segment(j, 3).mean();
+            break;
+          }
+        }
+      }
+    }
+    
+    KnotVectorType temporaryKnots(averageKnots.size() + derivativeKnots.size());
+
+    std::merge(averageKnots.data(), averageKnots.data() + averageKnots.size(),
+               derivativeKnots.data(), derivativeKnots.data() + derivativeKnots.size(),
+               temporaryKnots.data());
+
+    // Number of knots (one for each point and derivative) plus spline order.
+    DenseIndex numKnots = numParameters + numDerivatives + degree + 1;
+    knots.resize(numKnots);
+
+    knots.head(degree).fill(temporaryKnots[0]);
+    knots.tail(degree).fill(temporaryKnots.template tail<1>()[0]);
+    knots.segment(degree, temporaryKnots.size()) = temporaryKnots;
+  }
+
+  /**
    * \brief Computes chord length parameters which are required for spline interpolation.
    * \ingroup Splines_Module
    *
@@ -86,6 +213,7 @@ namespace Eigen
   struct SplineFitting
   {
     typedef typename SplineType::KnotVectorType KnotVectorType;
+    typedef typename SplineType::ParameterVectorType ParameterVectorType;
 
     /**
      * \brief Fits an interpolating Spline to the given data points.
@@ -109,6 +237,52 @@ namespace Eigen
      **/
     template <typename PointArrayType>
     static SplineType Interpolate(const PointArrayType& pts, DenseIndex degree, const KnotVectorType& knot_parameters);
+
+    /**
+     * \brief Fits an interpolating spline to the given data points and
+     * derivatives.
+     * 
+     * \param points The points for which an interpolating spline will be computed.
+     * \param derivatives The desired derivatives of the interpolating spline at interpolation
+     *                    points.
+     * \param derivativeIndices An array indicating which point each derivative belongs to. This
+     *                          must be the same size as @a derivatives.
+     * \param degree The degree of the interpolating spline.
+     *
+     * \returns A spline interpolating @a points with @a derivatives at those points.
+     *
+     * \sa Les A. Piegl, Khairan Rajab, Volha Smarodzinana. 2008.
+     * Curve interpolation with directional constraints for engineering design. 
+     * Engineering with Computers
+     **/
+    template <typename PointArrayType, typename IndexArray>
+    static SplineType InterpolateWithDerivatives(const PointArrayType& points,
+                                                 const PointArrayType& derivatives,
+                                                 const IndexArray& derivativeIndices,
+                                                 const unsigned int degree);
+
+    /**
+     * \brief Fits an interpolating spline to the given data points and derivatives.
+     * 
+     * \param points The points for which an interpolating spline will be computed.
+     * \param derivatives The desired derivatives of the interpolating spline at interpolation points.
+     * \param derivativeIndices An array indicating which point each derivative belongs to. This
+     *                          must be the same size as @a derivatives.
+     * \param degree The degree of the interpolating spline.
+     * \param parameters The parameters corresponding to the interpolation points.
+     *
+     * \returns A spline interpolating @a points with @a derivatives at those points.
+     *
+     * \sa Les A. Piegl, Khairan Rajab, Volha Smarodzinana. 2008.
+     * Curve interpolation with directional constraints for engineering design. 
+     * Engineering with Computers
+     */
+    template <typename PointArrayType, typename IndexArray>
+    static SplineType InterpolateWithDerivatives(const PointArrayType& points,
+                                                 const PointArrayType& derivatives,
+                                                 const IndexArray& derivativeIndices,
+                                                 const unsigned int degree,
+                                                 const ParameterVectorType& parameters);
   };
 
   template <typename SplineType>
@@ -151,6 +325,106 @@ namespace Eigen
     ChordLengths(pts, chord_lengths);
     return Interpolate(pts, degree, chord_lengths);
   }
+  
+  template <typename SplineType>
+  template <typename PointArrayType, typename IndexArray>
+  SplineType 
+  SplineFitting<SplineType>::InterpolateWithDerivatives(const PointArrayType& points,
+                                                        const PointArrayType& derivatives,
+                                                        const IndexArray& derivativeIndices,
+                                                        const unsigned int degree,
+                                                        const ParameterVectorType& parameters)
+  {
+    typedef typename SplineType::KnotVectorType::Scalar Scalar;      
+    typedef typename SplineType::ControlPointVectorType ControlPointVectorType;
+
+    typedef Matrix<Scalar, Dynamic, Dynamic> MatrixType;
+
+    const DenseIndex n = points.cols() + derivatives.cols();
+    
+    KnotVectorType knots;
+
+    KnotAveragingWithDerivatives(parameters, degree, derivativeIndices, knots);
+    
+    // fill matrix
+    MatrixType A = MatrixType::Zero(n, n);
+
+    // Use these dimensions for quicker populating, then transpose for solving.
+    MatrixType b(points.rows(), n);
+
+    DenseIndex startRow;
+    DenseIndex derivativeStart;
+
+    // End derivatives.
+    if (derivativeIndices[0] == 0)
+    {
+      A.template block<1, 2>(1, 0) << -1, 1;
+      
+      Scalar y = (knots(degree + 1) - knots(0)) / degree;
+      b.col(1) = y*derivatives.col(0);
+      
+      startRow = 2;
+      derivativeStart = 1;
+    }
+    else
+    {
+      startRow = 1;
+      derivativeStart = 0;
+    }
+    if (derivativeIndices[derivatives.cols() - 1] == points.cols() - 1)
+    {
+      A.template block<1, 2>(n - 2, n - 2) << -1, 1;
+
+      Scalar y = (knots(knots.size() - 1) - knots(knots.size() - (degree + 2))) / degree;
+      b.col(b.cols() - 2) = y*derivatives.col(derivatives.cols() - 1);
+    }
+    
+    DenseIndex row = startRow;
+    DenseIndex derivativeIndex = derivativeStart;
+    for (DenseIndex i = 1; i < parameters.size() - 1; ++i)
+    {
+      const DenseIndex span = SplineType::Span(parameters[i], degree, knots);
+
+      if (derivativeIndices[derivativeIndex] == i)
+      {
+        A.block(row, span - degree, 2, degree + 1)
+          = SplineType::BasisFunctionDerivatives(parameters[i], 1, degree, knots);
+
+        b.col(row++) = points.col(i);
+        b.col(row++) = derivatives.col(derivativeIndex++);
+      }
+      else
+      {
+        A.row(row++).segment(span - degree, degree + 1)
+          = SplineType::BasisFunctions(parameters[i], degree, knots);
+      }
+    }
+    b.col(0) = points.col(0);
+    b.col(b.cols() - 1) = points.col(points.cols() - 1);
+    A(0,0) = 1;
+    A(n - 1, n - 1) = 1;
+    
+    // Solve
+    FullPivLU<MatrixType> lu(A);
+    ControlPointVectorType controlPoints = lu.solve(MatrixType(b.transpose())).transpose();
+
+    SplineType spline(knots, controlPoints);
+    
+    return spline;
+  }
+  
+  template <typename SplineType>
+  template <typename PointArrayType, typename IndexArray>
+  SplineType
+  SplineFitting<SplineType>::InterpolateWithDerivatives(const PointArrayType& points,
+                                                        const PointArrayType& derivatives,
+                                                        const IndexArray& derivativeIndices,
+                                                        const unsigned int degree)
+  {
+    ParameterVectorType parameters;
+    ChordLengths(points, parameters);
+    return InterpolateWithDerivatives(points, derivatives, derivativeIndices, degree, parameters);
+  }
 }
 
 #endif // EIGEN_SPLINE_FITTING_H
diff --git a/unsupported/Eigen/src/Splines/SplineFwd.h b/unsupported/Eigen/src/Splines/SplineFwd.h
index 49db8d35d..0a95fbf3e 100644
--- a/unsupported/Eigen/src/Splines/SplineFwd.h
+++ b/unsupported/Eigen/src/Splines/SplineFwd.h
@@ -31,6 +31,8 @@ namespace Eigen
 
       enum { OrderAtCompileTime = _Degree==Dynamic ? Dynamic : _Degree+1 /*!< The spline curve's order at compile-time. */ };
       enum { NumOfDerivativesAtCompileTime = OrderAtCompileTime /*!< The number of derivatives defined for the current spline. */ };
+      
+      enum { DerivativeMemoryLayout = Dimension==1 ? RowMajor : ColMajor /*!< The derivative type's memory layout. */ };
 
       /** \brief The data type used to store non-zero basis functions. */
       typedef Array<Scalar,1,OrderAtCompileTime> BasisVectorType;
@@ -39,13 +41,16 @@ namespace Eigen
       typedef Array<Scalar,Dynamic,Dynamic,RowMajor,NumOfDerivativesAtCompileTime,OrderAtCompileTime> BasisDerivativeType;
       
       /** \brief The data type used to store the spline's derivative values. */
-      typedef Array<Scalar,Dimension,Dynamic,ColMajor,Dimension,NumOfDerivativesAtCompileTime> DerivativeType;
+      typedef Array<Scalar,Dimension,Dynamic,DerivativeMemoryLayout,Dimension,NumOfDerivativesAtCompileTime> DerivativeType;
 
       /** \brief The point type the spline is representing. */
       typedef Array<Scalar,Dimension,1> PointType;
       
       /** \brief The data type used to store knot vectors. */
       typedef Array<Scalar,1,Dynamic> KnotVectorType;
+
+      /** \brief The data type used to store parameter vectors. */
+      typedef Array<Scalar,1,Dynamic> ParameterVectorType;
       
       /** \brief The data type representing the spline's control points. */
       typedef Array<Scalar,Dimension,Dynamic> ControlPointVectorType;
@@ -62,12 +67,14 @@ namespace Eigen
     {
       enum { OrderAtCompileTime = _Degree==Dynamic ? Dynamic : _Degree+1 /*!< The spline curve's order at compile-time. */ };
       enum { NumOfDerivativesAtCompileTime = _DerivativeOrder==Dynamic ? Dynamic : _DerivativeOrder+1 /*!< The number of derivatives defined for the current spline. */ };
+      
+      enum { DerivativeMemoryLayout = _Dim==1 ? RowMajor : ColMajor /*!< The derivative type's memory layout. */ };
 
       /** \brief The data type used to store the values of the basis function derivatives. */
       typedef Array<_Scalar,Dynamic,Dynamic,RowMajor,NumOfDerivativesAtCompileTime,OrderAtCompileTime> BasisDerivativeType;
       
       /** \brief The data type used to store the spline's derivative values. */      
-      typedef Array<_Scalar,_Dim,Dynamic,ColMajor,_Dim,NumOfDerivativesAtCompileTime> DerivativeType;
+      typedef Array<_Scalar,_Dim,Dynamic,DerivativeMemoryLayout,_Dim,NumOfDerivativesAtCompileTime> DerivativeType;
     };
 
     /** \brief 2D float B-spline with dynamic degree. */
diff --git a/unsupported/doc/Overview.dox b/unsupported/doc/Overview.dox
index d048377df..45464a545 100644
--- a/unsupported/doc/Overview.dox
+++ b/unsupported/doc/Overview.dox
@@ -1,14 +1,15 @@
+/// \brief Namespace containing all symbols from the %Eigen library.
 namespace Eigen {
 
-/** \mainpage Eigen's unsupported modules
+/** \mainpage %Eigen's unsupported modules
 
-This is the API documentation for Eigen's unsupported modules.
+This is the API documentation for %Eigen's unsupported modules.
 
 These modules are contributions from various users. They are provided "as is", without any support.
 
 Click on the \e Modules tab at the top of this page to get a list of all unsupported modules.
 
-Don't miss the <a href="..//index.html">official Eigen documentation</a>.
+Don't miss the <a href="../index.html">official Eigen documentation</a>.
 
 */
 
@@ -18,8 +19,10 @@ Don't miss the <a href="..//index.html">official Eigen documentation</a>.
 
 The unsupported modules are contributions from various users. They are
 provided "as is", without any support. Nevertheless, some of them are
-subject to be included in Eigen in the future.
+subject to be included in %Eigen in the future.
 
 */
 
+/// \internal \brief Namespace containing low-level routines from the %Eigen library.
+namespace internal {}
 }
diff --git a/unsupported/doc/examples/BVH_Example.cpp b/unsupported/doc/examples/BVH_Example.cpp
index 6b6fac075..afb0c94c2 100644
--- a/unsupported/doc/examples/BVH_Example.cpp
+++ b/unsupported/doc/examples/BVH_Example.cpp
@@ -6,9 +6,7 @@ using namespace Eigen;
 typedef AlignedBox<double, 2> Box2d;
 
 namespace Eigen {
-    namespace internal {
-        Box2d bounding_box(const Vector2d &v) { return Box2d(v, v); } //compute the bounding box of a single point
-    }
+  Box2d bounding_box(const Vector2d &v) { return Box2d(v, v); } //compute the bounding box of a single point
 }
 
 struct PointPointMinimizer //how to compute squared distances between points and rectangles
diff --git a/unsupported/doc/examples/EulerAngles.cpp b/unsupported/doc/examples/EulerAngles.cpp
new file mode 100644
index 000000000..1ef6aee18
--- /dev/null
+++ b/unsupported/doc/examples/EulerAngles.cpp
@@ -0,0 +1,46 @@
+#include <unsupported/Eigen/EulerAngles>
+#include <iostream>
+
+using namespace Eigen;
+
+int main()
+{
+  // A common Euler system by many armies around the world,
+  //  where the first one is the azimuth(the angle from the north -
+  //   the same angle that is show in compass)
+  //  and the second one is elevation(the angle from the horizon)
+  //  and the third one is roll(the angle between the horizontal body
+  //   direction and the plane ground surface)
+  // Keep remembering we're using radian angles here!
+  typedef EulerSystem<-EULER_Z, EULER_Y, EULER_X> MyArmySystem;
+  typedef EulerAngles<double, MyArmySystem> MyArmyAngles;
+  
+  MyArmyAngles vehicleAngles(
+    3.14/*PI*/ / 2, /* heading to east, notice that this angle is counter-clockwise */
+    -0.3, /* going down from a mountain */
+    0.1); /* slightly rolled to the right */
+  
+  // Some Euler angles representation that our plane use.
+  EulerAnglesZYZd planeAngles(0.78474, 0.5271, -0.513794);
+  
+  MyArmyAngles planeAnglesInMyArmyAngles = MyArmyAngles::FromRotation<true, false, false>(planeAngles);
+  
+  std::cout << "vehicle angles(MyArmy):     " << vehicleAngles << std::endl;
+  std::cout << "plane angles(ZYZ):        " << planeAngles << std::endl;
+  std::cout << "plane angles(MyArmy):     " << planeAnglesInMyArmyAngles << std::endl;
+  
+  // Now lets rotate the plane a little bit
+  std::cout << "==========================================================\n";
+  std::cout << "rotating plane now!\n";
+  std::cout << "==========================================================\n";
+  
+  Quaterniond planeRotated = AngleAxisd(-0.342, Vector3d::UnitY()) * planeAngles;
+  
+  planeAngles = planeRotated;
+  planeAnglesInMyArmyAngles = MyArmyAngles::FromRotation<true, false, false>(planeRotated);
+  
+  std::cout << "new plane angles(ZYZ):     " << planeAngles << std::endl;
+  std::cout << "new plane angles(MyArmy): " << planeAnglesInMyArmyAngles << std::endl;
+  
+  return 0;
+}
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 2e4cfdb2e..b5fa1c845 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -1,10 +1,26 @@
+# generate split test header file only if it does not yet exist
+# in order to prevent a rebuild everytime cmake is configured
+if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h)
+  file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h "")
+  foreach(i RANGE 1 999)
+    file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h
+      "#ifdef EIGEN_TEST_PART_${i}\n"
+      "#define CALL_SUBTEST_${i}(FUNC) CALL_SUBTEST(FUNC)\n"
+      "#else\n"
+      "#define CALL_SUBTEST_${i}(FUNC)\n"
+      "#endif\n\n"
+    )
+  endforeach()
+endif()
 
 set_property(GLOBAL PROPERTY EIGEN_CURRENT_SUBPROJECT "Unsupported")
 add_custom_target(BuildUnsupported)
 
-include_directories(../../test ../../unsupported ../../Eigen 
+include_directories(../../test ../../unsupported ../../Eigen
                     ${CMAKE_CURRENT_BINARY_DIR}/../../test)
 
+find_package (Threads)
+
 find_package(GoogleHash)
 if(GOOGLEHASH_FOUND)
   add_definitions("-DEIGEN_GOOGLEHASH_SUPPORT")
@@ -28,22 +44,30 @@ endif(ADOLC_FOUND)
 ei_add_test(NonLinearOptimization)
 
 ei_add_test(NumericalDiff)
+ei_add_test(autodiff_scalar)
 ei_add_test(autodiff)
+
+if (NOT CMAKE_CXX_COMPILER MATCHES "clang\\+\\+$")
 ei_add_test(BVH)
+endif()
+
 ei_add_test(matrix_exponential)
 ei_add_test(matrix_function)
 ei_add_test(matrix_power)
 ei_add_test(matrix_square_root)
 ei_add_test(alignedvector3)
+
 ei_add_test(FFT)
 
+ei_add_test(EulerAngles)
+
 find_package(MPFR 2.3.0)
 find_package(GMP)
-if(MPFR_FOUND)
+if(MPFR_FOUND AND EIGEN_COMPILER_SUPPORT_CXX11)
   include_directories(${MPFR_INCLUDES} ./mpreal)
   ei_add_property(EIGEN_TESTED_BACKENDS "MPFR C++, ")
   set(EIGEN_MPFR_TEST_LIBRARIES ${MPFR_LIBRARIES} ${GMP_LIBRARIES})
-  ei_add_test(mpreal_support "" "${EIGEN_MPFR_TEST_LIBRARIES}" )
+ ei_add_test(mpreal_support "-std=c++11" "${EIGEN_MPFR_TEST_LIBRARIES}" )
 else()
   ei_add_property(EIGEN_MISSING_BACKENDS "MPFR C++, ")
 endif()
@@ -82,9 +106,152 @@ endif()
 
 ei_add_test(polynomialsolver)
 ei_add_test(polynomialutils)
-ei_add_test(kronecker_product)
 ei_add_test(splines)
 ei_add_test(gmres)
 ei_add_test(minres)
 ei_add_test(levenberg_marquardt)
-ei_add_test(bdcsvd)
+ei_add_test(kronecker_product)
+ei_add_test(special_functions)
+
+# TODO: The following test names are prefixed with the cxx11 string, since historically
+# the tests depended on c++11. This isn't the case anymore so we ought to rename them.
+# FIXME: Old versions of MSVC fail to compile this code, so we just disable these tests
+# when using visual studio. We should make the check more strict to enable the tests for
+# newer versions of MSVC.
+if (NOT CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+ei_add_test(cxx11_tensor_dimension)
+ei_add_test(cxx11_tensor_map)
+ei_add_test(cxx11_tensor_assign)
+ei_add_test(cxx11_tensor_comparisons)
+ei_add_test(cxx11_tensor_forced_eval)
+ei_add_test(cxx11_tensor_math)
+ei_add_test(cxx11_tensor_const)
+ei_add_test(cxx11_tensor_intdiv)
+ei_add_test(cxx11_tensor_casts)
+ei_add_test(cxx11_tensor_empty)
+ei_add_test(cxx11_tensor_sugar)
+ei_add_test(cxx11_tensor_roundings)
+ei_add_test(cxx11_tensor_layout_swap)
+ei_add_test(cxx11_tensor_io)
+if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
+  # This test requires __uint128_t which is only available on 64bit systems
+  ei_add_test(cxx11_tensor_uint128)
+endif()
+endif()
+
+if(EIGEN_TEST_CXX11)
+  if(EIGEN_TEST_SYCL)
+    ei_add_test_sycl(cxx11_tensor_sycl "-std=c++11")
+    ei_add_test_sycl(cxx11_tensor_forced_eval_sycl "-std=c++11")
+    ei_add_test_sycl(cxx11_tensor_broadcast_sycl "-std=c++11")
+    ei_add_test_sycl(cxx11_tensor_device_sycl "-std=c++11")
+    ei_add_test_sycl(cxx11_tensor_reduction_sycl "-std=c++11")
+  endif(EIGEN_TEST_SYCL)
+  # It should be safe to always run these tests as there is some fallback code for
+  # older compiler that don't support cxx11.
+  set(CMAKE_CXX_STANDARD 11)
+
+  ei_add_test(cxx11_eventcount "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
+  ei_add_test(cxx11_runqueue "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
+  ei_add_test(cxx11_non_blocking_thread_pool "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
+
+  ei_add_test(cxx11_meta)
+  ei_add_test(cxx11_tensor_simple)
+#  ei_add_test(cxx11_tensor_symmetry)
+  ei_add_test(cxx11_tensor_index_list)
+  ei_add_test(cxx11_tensor_mixed_indices)
+  ei_add_test(cxx11_tensor_contraction)
+  ei_add_test(cxx11_tensor_convolution)
+  ei_add_test(cxx11_tensor_expr)
+  ei_add_test(cxx11_tensor_fixed_size)
+  ei_add_test(cxx11_tensor_of_const_values)
+  ei_add_test(cxx11_tensor_of_complex)
+  ei_add_test(cxx11_tensor_of_strings)
+  ei_add_test(cxx11_tensor_lvalue)
+  ei_add_test(cxx11_tensor_broadcasting)
+  ei_add_test(cxx11_tensor_chipping)
+  ei_add_test(cxx11_tensor_concatenation)
+  ei_add_test(cxx11_tensor_inflation)
+  ei_add_test(cxx11_tensor_morphing)
+  ei_add_test(cxx11_tensor_padding)
+  ei_add_test(cxx11_tensor_patch)
+  ei_add_test(cxx11_tensor_image_patch)
+  ei_add_test(cxx11_tensor_volume_patch)
+  ei_add_test(cxx11_tensor_reduction)
+  ei_add_test(cxx11_tensor_argmax)
+  ei_add_test(cxx11_tensor_shuffling)
+  ei_add_test(cxx11_tensor_striding)
+  ei_add_test(cxx11_tensor_notification "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
+  ei_add_test(cxx11_tensor_thread_pool "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
+  ei_add_test(cxx11_tensor_ref)
+  ei_add_test(cxx11_tensor_random)
+  ei_add_test(cxx11_tensor_generator)
+  ei_add_test(cxx11_tensor_custom_op)
+  ei_add_test(cxx11_tensor_custom_index)
+  ei_add_test(cxx11_tensor_fft)
+  ei_add_test(cxx11_tensor_ifft)
+  ei_add_test(cxx11_tensor_scan)
+
+endif()
+
+# These tests needs nvcc
+find_package(CUDA 7.0)
+if(CUDA_FOUND AND EIGEN_TEST_CUDA)
+  # Make sure to compile without the -pedantic, -Wundef, -Wnon-virtual-dtor
+  # and -fno-check-new flags since they trigger thousands of compilation warnings
+  # in the CUDA runtime
+  # Also remove -ansi that is incompatible with std=c++11.
+  string(REPLACE "-pedantic" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+  string(REPLACE "-Wundef" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+  string(REPLACE "-Wnon-virtual-dtor" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+  string(REPLACE "-fno-check-new" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+  string(REPLACE "-ansi" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+
+  message(STATUS "Flags used to compile cuda code: " ${CMAKE_CXX_FLAGS})
+
+  if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+    set(CUDA_NVCC_FLAGS "-ccbin ${CMAKE_C_COMPILER}" CACHE STRING "nvcc flags" FORCE)
+  endif()
+  if(EIGEN_TEST_CUDA_CLANG)
+   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 --cuda-gpu-arch=sm_${EIGEN_CUDA_COMPUTE_ARCH}")
+  endif()
+
+  set(EIGEN_CUDA_RELAXED_CONSTEXPR "--expt-relaxed-constexpr")
+  if (${CUDA_VERSION} STREQUAL "7.0")
+    set(EIGEN_CUDA_RELAXED_CONSTEXPR "--relaxed-constexpr")
+  endif()
+
+  if( (NOT EIGEN_TEST_CXX11) OR (CMAKE_VERSION VERSION_LESS 3.3))
+    set(EIGEN_CUDA_CXX11_FLAG "-std=c++11")
+  else()
+    # otherwise the flag has already been added because of the above set(CMAKE_CXX_STANDARD 11)
+    set(EIGEN_CUDA_CXX11_FLAG "")
+  endif()
+
+  set(CUDA_NVCC_FLAGS  "${EIGEN_CUDA_CXX11_FLAG} ${EIGEN_CUDA_RELAXED_CONSTEXPR} -arch compute_${EIGEN_CUDA_COMPUTE_ARCH} -Xcudafe \"--display_error_number\" ${CUDA_NVCC_FLAGS}")
+  cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include")
+  set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
+
+  ei_add_test(cxx11_tensor_complex_cuda)
+  ei_add_test(cxx11_tensor_complex_cwise_ops_cuda)
+  ei_add_test(cxx11_tensor_reduction_cuda)
+  ei_add_test(cxx11_tensor_argmax_cuda)
+  ei_add_test(cxx11_tensor_cast_float16_cuda)
+  ei_add_test(cxx11_tensor_scan_cuda)
+
+  # Contractions require arch 3.0 or higher
+  if (${EIGEN_CUDA_COMPUTE_ARCH} GREATER 29)
+    ei_add_test(cxx11_tensor_device)
+    ei_add_test(cxx11_tensor_cuda)
+    ei_add_test(cxx11_tensor_contract_cuda)
+    ei_add_test(cxx11_tensor_of_float16_cuda)
+  endif()
+
+  # The random number generation code requires arch 3.5 or greater.
+  if (${EIGEN_CUDA_COMPUTE_ARCH} GREATER 34)
+    ei_add_test(cxx11_tensor_random_cuda)
+  endif()
+
+
+  unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
+endif()
diff --git a/unsupported/test/EulerAngles.cpp b/unsupported/test/EulerAngles.cpp
new file mode 100644
index 000000000..a8cb52864
--- /dev/null
+++ b/unsupported/test/EulerAngles.cpp
@@ -0,0 +1,208 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Tal Hadad <tal_hd@hotmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <unsupported/Eigen/EulerAngles>
+
+using namespace Eigen;
+
+template<typename EulerSystem, typename Scalar>
+void verify_euler_ranged(const Matrix<Scalar,3,1>& ea,
+  bool positiveRangeAlpha, bool positiveRangeBeta, bool positiveRangeGamma)
+{
+  typedef EulerAngles<Scalar, EulerSystem> EulerAnglesType;
+  typedef Matrix<Scalar,3,3> Matrix3;
+  typedef Matrix<Scalar,3,1> Vector3;
+  typedef Quaternion<Scalar> QuaternionType;
+  typedef AngleAxis<Scalar> AngleAxisType;
+  using std::abs;
+  
+  Scalar alphaRangeStart, alphaRangeEnd;
+  Scalar betaRangeStart, betaRangeEnd;
+  Scalar gammaRangeStart, gammaRangeEnd;
+  
+  if (positiveRangeAlpha)
+  {
+    alphaRangeStart = Scalar(0);
+    alphaRangeEnd = Scalar(2 * EIGEN_PI);
+  }
+  else
+  {
+    alphaRangeStart = -Scalar(EIGEN_PI);
+    alphaRangeEnd = Scalar(EIGEN_PI);
+  }
+  
+  if (positiveRangeBeta)
+  {
+    betaRangeStart = Scalar(0);
+    betaRangeEnd = Scalar(2 * EIGEN_PI);
+  }
+  else
+  {
+    betaRangeStart = -Scalar(EIGEN_PI);
+    betaRangeEnd = Scalar(EIGEN_PI);
+  }
+  
+  if (positiveRangeGamma)
+  {
+    gammaRangeStart = Scalar(0);
+    gammaRangeEnd = Scalar(2 * EIGEN_PI);
+  }
+  else
+  {
+    gammaRangeStart = -Scalar(EIGEN_PI);
+    gammaRangeEnd = Scalar(EIGEN_PI);
+  }
+  
+  const int i = EulerSystem::AlphaAxisAbs - 1;
+  const int j = EulerSystem::BetaAxisAbs - 1;
+  const int k = EulerSystem::GammaAxisAbs - 1;
+  
+  const int iFactor = EulerSystem::IsAlphaOpposite ? -1 : 1;
+  const int jFactor = EulerSystem::IsBetaOpposite ? -1 : 1;
+  const int kFactor = EulerSystem::IsGammaOpposite ? -1 : 1;
+  
+  const Vector3 I = EulerAnglesType::AlphaAxisVector();
+  const Vector3 J = EulerAnglesType::BetaAxisVector();
+  const Vector3 K = EulerAnglesType::GammaAxisVector();
+  
+  EulerAnglesType e(ea[0], ea[1], ea[2]);
+  
+  Matrix3 m(e);
+  Vector3 eabis = EulerAnglesType(m, positiveRangeAlpha, positiveRangeBeta, positiveRangeGamma).angles();
+  
+  // Check that eabis in range
+  VERIFY(alphaRangeStart <= eabis[0] && eabis[0] <= alphaRangeEnd);
+  VERIFY(betaRangeStart <= eabis[1] && eabis[1] <= betaRangeEnd);
+  VERIFY(gammaRangeStart <= eabis[2] && eabis[2] <= gammaRangeEnd);
+  
+  Vector3 eabis2 = m.eulerAngles(i, j, k);
+  
+  // Invert the relevant axes
+  eabis2[0] *= iFactor;
+  eabis2[1] *= jFactor;
+  eabis2[2] *= kFactor;
+  
+  // Saturate the angles to the correct range
+  if (positiveRangeAlpha && (eabis2[0] < 0))
+    eabis2[0] += Scalar(2 * EIGEN_PI);
+  if (positiveRangeBeta && (eabis2[1] < 0))
+    eabis2[1] += Scalar(2 * EIGEN_PI);
+  if (positiveRangeGamma && (eabis2[2] < 0))
+    eabis2[2] += Scalar(2 * EIGEN_PI);
+  
+  VERIFY_IS_APPROX(eabis, eabis2);// Verify that our estimation is the same as m.eulerAngles() is
+  
+  Matrix3 mbis(AngleAxisType(eabis[0], I) * AngleAxisType(eabis[1], J) * AngleAxisType(eabis[2], K));
+  VERIFY_IS_APPROX(m,  mbis);
+  
+  // Tests that are only relevant for no possitive range
+  if (!(positiveRangeAlpha || positiveRangeBeta || positiveRangeGamma))
+  {
+    /* If I==K, and ea[1]==0, then there no unique solution. */ 
+    /* The remark apply in the case where I!=K, and |ea[1]| is close to pi/2. */ 
+    if( (i!=k || ea[1]!=0) && (i==k || !internal::isApprox(abs(ea[1]),Scalar(EIGEN_PI/2),test_precision<Scalar>())) ) 
+      VERIFY((ea-eabis).norm() <= test_precision<Scalar>());
+    
+    // approx_or_less_than does not work for 0
+    VERIFY(0 < eabis[0] || test_isMuchSmallerThan(eabis[0], Scalar(1)));
+  }
+  
+  // Quaternions
+  QuaternionType q(e);
+  eabis = EulerAnglesType(q, positiveRangeAlpha, positiveRangeBeta, positiveRangeGamma).angles();
+  VERIFY_IS_APPROX(eabis, eabis2);// Verify that the euler angles are still the same
+}
+
+template<typename EulerSystem, typename Scalar>
+void verify_euler(const Matrix<Scalar,3,1>& ea)
+{
+  verify_euler_ranged<EulerSystem>(ea, false, false, false);
+  verify_euler_ranged<EulerSystem>(ea, false, false, true);
+  verify_euler_ranged<EulerSystem>(ea, false, true, false);
+  verify_euler_ranged<EulerSystem>(ea, false, true, true);
+  verify_euler_ranged<EulerSystem>(ea, true, false, false);
+  verify_euler_ranged<EulerSystem>(ea, true, false, true);
+  verify_euler_ranged<EulerSystem>(ea, true, true, false);
+  verify_euler_ranged<EulerSystem>(ea, true, true, true);
+}
+
+template<typename Scalar> void check_all_var(const Matrix<Scalar,3,1>& ea)
+{
+  verify_euler<EulerSystemXYZ>(ea);
+  verify_euler<EulerSystemXYX>(ea);
+  verify_euler<EulerSystemXZY>(ea);
+  verify_euler<EulerSystemXZX>(ea);
+  
+  verify_euler<EulerSystemYZX>(ea);
+  verify_euler<EulerSystemYZY>(ea);
+  verify_euler<EulerSystemYXZ>(ea);
+  verify_euler<EulerSystemYXY>(ea);
+  
+  verify_euler<EulerSystemZXY>(ea);
+  verify_euler<EulerSystemZXZ>(ea);
+  verify_euler<EulerSystemZYX>(ea);
+  verify_euler<EulerSystemZYZ>(ea);
+}
+
+template<typename Scalar> void eulerangles()
+{
+  typedef Matrix<Scalar,3,3> Matrix3;
+  typedef Matrix<Scalar,3,1> Vector3;
+  typedef Array<Scalar,3,1> Array3;
+  typedef Quaternion<Scalar> Quaternionx;
+  typedef AngleAxis<Scalar> AngleAxisType;
+
+  Scalar a = internal::random<Scalar>(-Scalar(EIGEN_PI), Scalar(EIGEN_PI));
+  Quaternionx q1;
+  q1 = AngleAxisType(a, Vector3::Random().normalized());
+  Matrix3 m;
+  m = q1;
+  
+  Vector3 ea = m.eulerAngles(0,1,2);
+  check_all_var(ea);
+  ea = m.eulerAngles(0,1,0);
+  check_all_var(ea);
+  
+  // Check with purely random Quaternion:
+  q1.coeffs() = Quaternionx::Coefficients::Random().normalized();
+  m = q1;
+  ea = m.eulerAngles(0,1,2);
+  check_all_var(ea);
+  ea = m.eulerAngles(0,1,0);
+  check_all_var(ea);
+  
+  // Check with random angles in range [0:pi]x[-pi:pi]x[-pi:pi].
+  ea = (Array3::Random() + Array3(1,0,0))*Scalar(EIGEN_PI)*Array3(0.5,1,1);
+  check_all_var(ea);
+  
+  ea[2] = ea[0] = internal::random<Scalar>(0,Scalar(EIGEN_PI));
+  check_all_var(ea);
+  
+  ea[0] = ea[1] = internal::random<Scalar>(0,Scalar(EIGEN_PI));
+  check_all_var(ea);
+  
+  ea[1] = 0;
+  check_all_var(ea);
+  
+  ea.head(2).setZero();
+  check_all_var(ea);
+  
+  ea.setZero();
+  check_all_var(ea);
+}
+
+void test_EulerAngles()
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( eulerangles<float>() );
+    CALL_SUBTEST_2( eulerangles<double>() );
+  }
+}
diff --git a/unsupported/test/FFTW.cpp b/unsupported/test/FFTW.cpp
index d3718e2d2..8b7528fb7 100644
--- a/unsupported/test/FFTW.cpp
+++ b/unsupported/test/FFTW.cpp
@@ -18,11 +18,11 @@ using namespace Eigen;
 
 
 template < typename T>
-complex<long double>  promote(complex<T> x) { return complex<long double>(x.real(),x.imag()); }
+complex<long double>  promote(complex<T> x) { return complex<long double>((long double)x.real(),(long double)x.imag()); }
 
-complex<long double>  promote(float x) { return complex<long double>( x); }
-complex<long double>  promote(double x) { return complex<long double>( x); }
-complex<long double>  promote(long double x) { return complex<long double>( x); }
+complex<long double>  promote(float x) { return complex<long double>((long double)x); }
+complex<long double>  promote(double x) { return complex<long double>((long double)x); }
+complex<long double>  promote(long double x) { return complex<long double>((long double)x); }
     
 
     template <typename VT1,typename VT2>
@@ -33,7 +33,7 @@ complex<long double>  promote(long double x) { return complex<long double>( x);
         long double pi = acos((long double)-1 );
         for (size_t k0=0;k0<(size_t)fftbuf.size();++k0) {
             complex<long double> acc = 0;
-            long double phinc = -2.*k0* pi / timebuf.size();
+            long double phinc = (long double)(-2.)*k0* pi / timebuf.size();
             for (size_t k1=0;k1<(size_t)timebuf.size();++k1) {
                 acc +=  promote( timebuf[k1] ) * exp( complex<long double>(0,k1*phinc) );
             }
@@ -54,8 +54,8 @@ complex<long double>  promote(long double x) { return complex<long double>( x);
         long double difpower=0;
         size_t n = (min)( buf1.size(),buf2.size() );
         for (size_t k=0;k<n;++k) {
-            totalpower += (numext::abs2( buf1[k] ) + numext::abs2(buf2[k]) )/2.;
-            difpower += numext::abs2(buf1[k] - buf2[k]);
+            totalpower += (long double)((numext::abs2( buf1[k] ) + numext::abs2(buf2[k]) )/2);
+            difpower += (long double)(numext::abs2(buf1[k] - buf2[k]));
         }
         return sqrt(difpower/totalpower);
     }
@@ -93,19 +93,19 @@ void test_scalar_generic(int nfft)
     fft.SetFlag(fft.HalfSpectrum );
     fft.fwd( freqBuf,tbuf);
     VERIFY((size_t)freqBuf.size() == (size_t)( (nfft>>1)+1) );
-    VERIFY( fft_rmse(freqBuf,tbuf) < test_precision<T>()  );// gross check
+    VERIFY( T(fft_rmse(freqBuf,tbuf)) < test_precision<T>()  );// gross check
 
     fft.ClearFlag(fft.HalfSpectrum );
     fft.fwd( freqBuf,tbuf);
     VERIFY( (size_t)freqBuf.size() == (size_t)nfft);
-    VERIFY( fft_rmse(freqBuf,tbuf) < test_precision<T>()  );// gross check
+    VERIFY( T(fft_rmse(freqBuf,tbuf)) < test_precision<T>()  );// gross check
 
     if (nfft&1)
         return; // odd FFTs get the wrong size inverse FFT
 
     ScalarVector tbuf2;
     fft.inv( tbuf2 , freqBuf);
-    VERIFY( dif_rmse(tbuf,tbuf2) < test_precision<T>()  );// gross check
+    VERIFY( T(dif_rmse(tbuf,tbuf2)) < test_precision<T>()  );// gross check
 
 
     // verify that the Unscaled flag takes effect
@@ -121,12 +121,12 @@ void test_scalar_generic(int nfft)
     //for (size_t i=0;i<(size_t) tbuf.size();++i)
     //    cout << "freqBuf=" << freqBuf[i] << " in2=" << tbuf3[i] << " -  in=" << tbuf[i] << " => " << (tbuf3[i] - tbuf[i] ) <<  endl;
 
-    VERIFY( dif_rmse(tbuf,tbuf3) < test_precision<T>()  );// gross check
+    VERIFY( T(dif_rmse(tbuf,tbuf3)) < test_precision<T>()  );// gross check
 
     // verify that ClearFlag works
     fft.ClearFlag(fft.Unscaled);
     fft.inv( tbuf2 , freqBuf);
-    VERIFY( dif_rmse(tbuf,tbuf2) < test_precision<T>()  );// gross check
+    VERIFY( T(dif_rmse(tbuf,tbuf2)) < test_precision<T>()  );// gross check
 }
 
 template <typename T>
@@ -152,10 +152,10 @@ void test_complex_generic(int nfft)
         inbuf[k]= Complex( (T)(rand()/(double)RAND_MAX - .5), (T)(rand()/(double)RAND_MAX - .5) );
     fft.fwd( outbuf , inbuf);
 
-    VERIFY( fft_rmse(outbuf,inbuf) < test_precision<T>()  );// gross check
+    VERIFY( T(fft_rmse(outbuf,inbuf)) < test_precision<T>()  );// gross check
     fft.inv( buf3 , outbuf);
 
-    VERIFY( dif_rmse(inbuf,buf3) < test_precision<T>()  );// gross check
+    VERIFY( T(dif_rmse(inbuf,buf3)) < test_precision<T>()  );// gross check
 
     // verify that the Unscaled flag takes effect
     ComplexVector buf4;
@@ -163,12 +163,12 @@ void test_complex_generic(int nfft)
     fft.inv( buf4 , outbuf);
     for (int k=0;k<nfft;++k)
         buf4[k] *= T(1./nfft);
-    VERIFY( dif_rmse(inbuf,buf4) < test_precision<T>()  );// gross check
+    VERIFY( T(dif_rmse(inbuf,buf4)) < test_precision<T>()  );// gross check
 
     // verify that ClearFlag works
     fft.ClearFlag(fft.Unscaled);
     fft.inv( buf3 , outbuf);
-    VERIFY( dif_rmse(inbuf,buf3) < test_precision<T>()  );// gross check
+    VERIFY( T(dif_rmse(inbuf,buf3)) < test_precision<T>()  );// gross check
 }
 
 template <typename T>
diff --git a/unsupported/test/NonLinearOptimization.cpp b/unsupported/test/NonLinearOptimization.cpp
index d7376b0f5..1d682dd83 100644
--- a/unsupported/test/NonLinearOptimization.cpp
+++ b/unsupported/test/NonLinearOptimization.cpp
@@ -12,7 +12,8 @@
 // It is intended to be done for this test only.
 #include <Eigen/src/Core/util/DisableStupidWarnings.h>
 
-using std::sqrt;
+// tolerance for chekcing number of iterations
+#define LM_EVAL_COUNT_TOL 4/3
 
 int fcn_chkder(const VectorXd &x, VectorXd &fvec, MatrixXd &fjac, int iflag)
 {
@@ -246,9 +247,9 @@ struct hybrj_functor : Functor<double>
     int operator()(const VectorXd &x, VectorXd &fvec)
     {
         double temp, temp1, temp2;
-        const int n = x.size();
+        const VectorXd::Index n = x.size();
         assert(fvec.size()==n);
-        for (int k = 0; k < n; k++)
+        for (VectorXd::Index k = 0; k < n; k++)
         {
             temp = (3. - 2.*x[k])*x[k];
             temp1 = 0.;
@@ -261,12 +262,12 @@ struct hybrj_functor : Functor<double>
     }
     int df(const VectorXd &x, MatrixXd &fjac)
     {
-        const int n = x.size();
+        const VectorXd::Index n = x.size();
         assert(fjac.rows()==n);
         assert(fjac.cols()==n);
-        for (int k = 0; k < n; k++)
+        for (VectorXd::Index k = 0; k < n; k++)
         {
-            for (int j = 0; j < n; j++)
+            for (VectorXd::Index j = 0; j < n; j++)
                 fjac(k,j) = 0.;
             fjac(k,k) = 3.- 4.*x[k];
             if (k) fjac(k,k-1) = -1.;
@@ -351,10 +352,10 @@ struct hybrd_functor : Functor<double>
     int operator()(const VectorXd &x, VectorXd &fvec) const
     {
         double temp, temp1, temp2;
-        const int n = x.size();
+        const VectorXd::Index n = x.size();
 
         assert(fvec.size()==n);
-        for (int k=0; k < n; k++)
+        for (VectorXd::Index k=0; k < n; k++)
         {
             temp = (3. - 2.*x[k])*x[k];
             temp1 = 0.;
@@ -455,7 +456,7 @@ struct lmstr_functor : Functor<double>
         assert(jac_row.size()==x.size());
         double tmp1, tmp2, tmp3, tmp4;
 
-        int i = rownb-2;
+        VectorXd::Index i = rownb-2;
         tmp1 = i+1;
         tmp2 = 16 - i - 1;
         tmp3 = (i>=8)? tmp2 : tmp1;
@@ -1022,7 +1023,9 @@ void testNistLanczos1(void)
   VERIFY_IS_EQUAL(lm.nfev, 79);
   VERIFY_IS_EQUAL(lm.njev, 72);
   // check norm^2
-  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.430899764097e-25);  // should be 1.4307867721E-25, but nist results are on 128-bit floats
+  std::cout.precision(30);
+  std::cout << lm.fvec.squaredNorm() << "\n";
+  VERIFY(lm.fvec.squaredNorm() <= 1.4307867721E-25);
   // check x
   VERIFY_IS_APPROX(x[0], 9.5100000027E-02);
   VERIFY_IS_APPROX(x[1], 1.0000000001E+00);
@@ -1043,7 +1046,7 @@ void testNistLanczos1(void)
   VERIFY_IS_EQUAL(lm.nfev, 9);
   VERIFY_IS_EQUAL(lm.njev, 8);
   // check norm^2
-  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.428595533845e-25);  // should be 1.4307867721E-25, but nist results are on 128-bit floats
+  VERIFY(lm.fvec.squaredNorm() <= 1.4307867721E-25);
   // check x
   VERIFY_IS_APPROX(x[0], 9.5100000027E-02);
   VERIFY_IS_APPROX(x[1], 1.0000000001E+00);
@@ -1262,8 +1265,8 @@ void testNistBoxBOD(void)
 
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 31);
-  VERIFY_IS_EQUAL(lm.njev, 25);
+  VERIFY(lm.nfev < 31); // 31
+  VERIFY(lm.njev < 25); // 25
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.1680088766E+03);
   // check x
@@ -1342,10 +1345,6 @@ void testNistMGH17(void)
   lm.parameters.maxfev = 1000;
   info = lm.minimize(x);
 
-  // check return value
-  VERIFY_IS_EQUAL(info, 2); 
-  VERIFY_IS_EQUAL(lm.nfev, 602 ); 
-  VERIFY_IS_EQUAL(lm.njev, 545 ); 
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.4648946975E-05);
   // check x
@@ -1354,6 +1353,15 @@ void testNistMGH17(void)
   VERIFY_IS_APPROX(x[2], -1.4646871366E+00);
   VERIFY_IS_APPROX(x[3], 1.2867534640E-02);
   VERIFY_IS_APPROX(x[4], 2.2122699662E-02);
+  
+  // check return value
+  VERIFY_IS_EQUAL(info, 2); 
+  ++g_test_level;
+  VERIFY_IS_EQUAL(lm.nfev, 602);  // 602
+  VERIFY_IS_EQUAL(lm.njev, 545);  // 545
+  --g_test_level;
+  VERIFY(lm.nfev < 602 * LM_EVAL_COUNT_TOL);
+  VERIFY(lm.njev < 545 * LM_EVAL_COUNT_TOL);
 
   /*
    * Second try
@@ -1832,8 +1840,8 @@ void test_NonLinearOptimization()
     // NIST tests, level of difficulty = "Average"
     CALL_SUBTEST/*_5*/(testNistHahn1());
     CALL_SUBTEST/*_6*/(testNistMisra1d());
-//     CALL_SUBTEST/*_7*/(testNistMGH17());
-//     CALL_SUBTEST/*_8*/(testNistLanczos1());
+    CALL_SUBTEST/*_7*/(testNistMGH17());
+    CALL_SUBTEST/*_8*/(testNistLanczos1());
 
 //     // NIST tests, level of difficulty = "Higher"
     CALL_SUBTEST/*_9*/(testNistRat42());
diff --git a/unsupported/test/alignedvector3.cpp b/unsupported/test/alignedvector3.cpp
index fc2bc2135..252cb1d3f 100644
--- a/unsupported/test/alignedvector3.cpp
+++ b/unsupported/test/alignedvector3.cpp
@@ -10,6 +10,16 @@
 #include "main.h"
 #include <unsupported/Eigen/AlignedVector3>
 
+namespace Eigen {
+
+template<typename T,typename Derived>
+T test_relative_error(const AlignedVector3<T> &a, const MatrixBase<Derived> &b)
+{
+  return test_relative_error(a.coeffs().template head<3>(), b);
+}
+
+}
+
 template<typename Scalar>
 void alignedvector3()
 {
@@ -19,8 +29,8 @@ void alignedvector3()
   typedef Matrix<Scalar,3,3> Mat33;
   typedef AlignedVector3<Scalar> FastType;
   RefType  r1(RefType::Random()), r2(RefType::Random()), r3(RefType::Random()),
-           r4(RefType::Random()), r5(RefType::Random()), r6(RefType::Random());
-  FastType f1(r1), f2(r2), f3(r3), f4(r4), f5(r5), f6(r6);
+           r4(RefType::Random()), r5(RefType::Random());
+  FastType f1(r1), f2(r2), f3(r3), f4(r4), f5(r5);
   Mat33 m1(Mat33::Random());
   
   VERIFY_IS_APPROX(f1,r1);
@@ -49,6 +59,21 @@ void alignedvector3()
   f2.normalize();
   r2.normalize();
   VERIFY_IS_APPROX(f2,r2);
+  
+  {
+    FastType f6 = RefType::Zero();
+    FastType f7 = FastType::Zero();
+    VERIFY_IS_APPROX(f6,f7);
+    f6 = r4+r1;
+    VERIFY_IS_APPROX(f6,r4+r1);
+    f6 -= Scalar(2)*r4;
+    VERIFY_IS_APPROX(f6,r1-r4);
+  }
+  
+  std::stringstream ss1, ss2;
+  ss1 << f1;
+  ss2 << r1;
+  VERIFY(ss1.str()==ss2.str());
 }
 
 void test_alignedvector3()
diff --git a/unsupported/test/autodiff.cpp b/unsupported/test/autodiff.cpp
index 087e7c542..85743137e 100644
--- a/unsupported/test/autodiff.cpp
+++ b/unsupported/test/autodiff.cpp
@@ -16,7 +16,8 @@ EIGEN_DONT_INLINE Scalar foo(const Scalar& x, const Scalar& y)
   using namespace std;
 //   return x+std::sin(y);
   EIGEN_ASM_COMMENT("mybegin");
-  return static_cast<Scalar>(x*2 - pow(x,2) + 2*sqrt(y*y) - 4 * sin(x) + 2 * cos(y) - exp(-0.5*x*x));
+  // pow(float, int) promotes to pow(double, double)
+  return x*2 - 1 + static_cast<Scalar>(pow(1+x,2)) + 2*sqrt(y*y+0) - 4 * sin(0+x) + 2 * cos(y+0) - exp(Scalar(-0.5)*x*x+0);
   //return x+2*y*x;//x*2 -std::pow(x,2);//(2*y/x);// - y*2;
   EIGEN_ASM_COMMENT("myend");
 }
@@ -104,6 +105,89 @@ struct TestFunc1
   }
 };
 
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+/* Test functor for the C++11 features. */
+template <typename Scalar>
+struct integratorFunctor
+{
+    typedef Matrix<Scalar, 2, 1> InputType;
+    typedef Matrix<Scalar, 2, 1> ValueType;
+
+    /*
+     * Implementation starts here.
+     */
+    integratorFunctor(const Scalar gain) : _gain(gain) {}
+    integratorFunctor(const integratorFunctor& f) : _gain(f._gain) {}
+    const Scalar _gain;
+
+    template <typename T1, typename T2>
+    void operator() (const T1 &input, T2 *output, const Scalar dt) const
+    {
+        T2 &o = *output;
+
+        /* Integrator to test the AD. */
+        o[0] = input[0] + input[1] * dt * _gain;
+        o[1] = input[1] * _gain;
+    }
+
+    /* Only needed for the test */
+    template <typename T1, typename T2, typename T3>
+    void operator() (const T1 &input, T2 *output, T3 *jacobian, const Scalar dt) const
+    {
+        T2 &o = *output;
+
+        /* Integrator to test the AD. */
+        o[0] = input[0] + input[1] * dt * _gain;
+        o[1] = input[1] * _gain;
+
+        if (jacobian)
+        {
+            T3 &j = *jacobian;
+
+            j(0, 0) = 1;
+            j(0, 1) = dt * _gain;
+            j(1, 0) = 0;
+            j(1, 1) = _gain;
+        }
+    }
+
+};
+
+template<typename Func> void forward_jacobian_cpp11(const Func& f)
+{
+    typedef typename Func::ValueType::Scalar Scalar;
+    typedef typename Func::ValueType ValueType;
+    typedef typename Func::InputType InputType;
+    typedef typename AutoDiffJacobian<Func>::JacobianType JacobianType;
+
+    InputType x = InputType::Random(InputType::RowsAtCompileTime);
+    ValueType y, yref;
+    JacobianType j, jref;
+
+    const Scalar dt = internal::random<double>();
+
+    jref.setZero();
+    yref.setZero();
+    f(x, &yref, &jref, dt);
+
+    //std::cerr << "y, yref, jref: " << "\n";
+    //std::cerr << y.transpose() << "\n\n";
+    //std::cerr << yref << "\n\n";
+    //std::cerr << jref << "\n\n";
+
+    AutoDiffJacobian<Func> autoj(f);
+    autoj(x, &y, &j, dt);
+
+    //std::cerr << "y j (via autodiff): " << "\n";
+    //std::cerr << y.transpose() << "\n\n";
+    //std::cerr << j << "\n\n";
+
+    VERIFY_IS_APPROX(y, yref);
+    VERIFY_IS_APPROX(j, jref);
+}
+#endif
+
 template<typename Func> void forward_jacobian(const Func& f)
 {
     typename Func::InputType x = Func::InputType::Random(f.inputs());
@@ -127,8 +211,8 @@ template<typename Func> void forward_jacobian(const Func& f)
     VERIFY_IS_APPROX(j, jref);
 }
 
-
 // TODO also check actual derivatives!
+template <int>
 void test_autodiff_scalar()
 {
   Vector2f p = Vector2f::Random();
@@ -139,7 +223,9 @@ void test_autodiff_scalar()
   VERIFY_IS_APPROX(res.value(), foo(p.x(),p.y()));
 }
 
+
 // TODO also check actual derivatives!
+template <int>
 void test_autodiff_vector()
 {
   Vector2f p = Vector2f::Random();
@@ -148,11 +234,12 @@ void test_autodiff_vector()
   VectorAD ap = p.cast<AD>();
   ap.x().derivatives() = Vector2f::UnitX();
   ap.y().derivatives() = Vector2f::UnitY();
-  
+
   AD res = foo<VectorAD>(ap);
   VERIFY_IS_APPROX(res.value(), foo(p));
 }
 
+template <int>
 void test_autodiff_jacobian()
 {
   CALL_SUBTEST(( forward_jacobian(TestFunc1<double,2,2>()) ));
@@ -160,14 +247,121 @@ void test_autodiff_jacobian()
   CALL_SUBTEST(( forward_jacobian(TestFunc1<double,3,2>()) ));
   CALL_SUBTEST(( forward_jacobian(TestFunc1<double,3,3>()) ));
   CALL_SUBTEST(( forward_jacobian(TestFunc1<double>(3,3)) ));
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+  CALL_SUBTEST(( forward_jacobian_cpp11(integratorFunctor<double>(10)) ));
+#endif
+}
+
+
+template <int>
+void test_autodiff_hessian()
+{
+  typedef AutoDiffScalar<VectorXd> AD;
+  typedef Matrix<AD,Eigen::Dynamic,1> VectorAD;
+  typedef AutoDiffScalar<VectorAD> ADD;
+  typedef Matrix<ADD,Eigen::Dynamic,1> VectorADD;
+  VectorADD x(2);
+  double s1 = internal::random<double>(), s2 = internal::random<double>(), s3 = internal::random<double>(), s4 = internal::random<double>();
+  x(0).value()=s1;
+  x(1).value()=s2;
+
+  //set unit vectors for the derivative directions (partial derivatives of the input vector)
+  x(0).derivatives().resize(2);
+  x(0).derivatives().setZero();
+  x(0).derivatives()(0)= 1;
+  x(1).derivatives().resize(2);
+  x(1).derivatives().setZero();
+  x(1).derivatives()(1)=1;
+
+  //repeat partial derivatives for the inner AutoDiffScalar
+  x(0).value().derivatives() = VectorXd::Unit(2,0);
+  x(1).value().derivatives() = VectorXd::Unit(2,1);
+
+  //set the hessian matrix to zero
+  for(int idx=0; idx<2; idx++) {
+      x(0).derivatives()(idx).derivatives()  = VectorXd::Zero(2);
+      x(1).derivatives()(idx).derivatives()  = VectorXd::Zero(2);
+  }
+
+  ADD y = sin(AD(s3)*x(0) + AD(s4)*x(1));
+
+  VERIFY_IS_APPROX(y.value().derivatives()(0), y.derivatives()(0).value());
+  VERIFY_IS_APPROX(y.value().derivatives()(1), y.derivatives()(1).value());
+  VERIFY_IS_APPROX(y.value().derivatives()(0), s3*std::cos(s1*s3+s2*s4));
+  VERIFY_IS_APPROX(y.value().derivatives()(1), s4*std::cos(s1*s3+s2*s4));
+  VERIFY_IS_APPROX(y.derivatives()(0).derivatives(), -std::sin(s1*s3+s2*s4)*Vector2d(s3*s3,s4*s3));
+  VERIFY_IS_APPROX(y.derivatives()(1).derivatives(),  -std::sin(s1*s3+s2*s4)*Vector2d(s3*s4,s4*s4));
+
+  ADD z = x(0)*x(1);
+  VERIFY_IS_APPROX(z.derivatives()(0).derivatives(), Vector2d(0,1));
+  VERIFY_IS_APPROX(z.derivatives()(1).derivatives(), Vector2d(1,0));
+}
+
+double bug_1222() {
+  typedef Eigen::AutoDiffScalar<Eigen::Vector3d> AD;
+  const double _cv1_3 = 1.0;
+  const AD chi_3 = 1.0;
+  // this line did not work, because operator+ returns ADS<DerType&>, which then cannot be converted to ADS<DerType>
+  const AD denom = chi_3 + _cv1_3;
+  return denom.value();
+}
+
+double bug_1223() {
+  using std::min;
+  typedef Eigen::AutoDiffScalar<Eigen::Vector3d> AD;
+
+  const double _cv1_3 = 1.0;
+  const AD chi_3 = 1.0;
+  const AD denom = 1.0;
+
+  // failed because implementation of min attempts to construct ADS<DerType&> via constructor AutoDiffScalar(const Real& value)
+  // without initializing m_derivatives (which is a reference in this case)
+  #define EIGEN_TEST_SPACE
+  const AD t = min EIGEN_TEST_SPACE (denom / chi_3, 1.0);
+
+  const AD t2 = min EIGEN_TEST_SPACE (denom / (chi_3 * _cv1_3), 1.0);
+
+  return t.value() + t2.value();
+}
+
+// regression test for some compilation issues with specializations of ScalarBinaryOpTraits
+void bug_1260() {
+  Matrix4d A;
+  Vector4d v;
+  A*v;
+}
+
+// check a compilation issue with numext::max
+double bug_1261() {
+  typedef AutoDiffScalar<Matrix2d> AD;
+  typedef Matrix<AD,2,1> VectorAD;
+
+  VectorAD v;
+  const AD maxVal = v.maxCoeff();
+  const AD minVal = v.minCoeff();
+  return maxVal.value() + minVal.value();
+}
+
+double bug_1264() {
+  typedef AutoDiffScalar<Vector2d> AD;
+  const AD s;
+  const Matrix<AD, 3, 1> v1;
+  const Matrix<AD, 3, 1> v2 = (s + 3.0) * v1;
+  return v2(0).value();
 }
 
 void test_autodiff()
 {
   for(int i = 0; i < g_repeat; i++) {
-    CALL_SUBTEST_1( test_autodiff_scalar() );
-    CALL_SUBTEST_2( test_autodiff_vector() );
-    CALL_SUBTEST_3( test_autodiff_jacobian() );
+    CALL_SUBTEST_1( test_autodiff_scalar<1>() );
+    CALL_SUBTEST_2( test_autodiff_vector<1>() );
+    CALL_SUBTEST_3( test_autodiff_jacobian<1>() );
+    CALL_SUBTEST_4( test_autodiff_hessian<1>() );
   }
+
+  bug_1222();
+  bug_1223();
+  bug_1260();
+  bug_1261();
 }
 
diff --git a/unsupported/test/autodiff_scalar.cpp b/unsupported/test/autodiff_scalar.cpp
new file mode 100644
index 000000000..4df2f5c57
--- /dev/null
+++ b/unsupported/test/autodiff_scalar.cpp
@@ -0,0 +1,83 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Christoph Hertzberg <chtz@informatik.uni-bremen.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <unsupported/Eigen/AutoDiff>
+
+/*
+ * In this file scalar derivations are tested for correctness.
+ * TODO add more tests!
+ */
+
+template<typename Scalar> void check_atan2()
+{
+  typedef Matrix<Scalar, 1, 1> Deriv1;
+  typedef AutoDiffScalar<Deriv1> AD;
+  
+  AD x(internal::random<Scalar>(-3.0, 3.0), Deriv1::UnitX());
+  
+  using std::exp;
+  Scalar r = exp(internal::random<Scalar>(-10, 10));
+  
+  AD s = sin(x), c = cos(x);
+  AD res = atan2(r*s, r*c);
+  
+  VERIFY_IS_APPROX(res.value(), x.value());
+  VERIFY_IS_APPROX(res.derivatives(), x.derivatives());
+
+  res = atan2(r*s+0, r*c+0);
+  VERIFY_IS_APPROX(res.value(), x.value());
+  VERIFY_IS_APPROX(res.derivatives(), x.derivatives());
+}
+
+template<typename Scalar> void check_hyperbolic_functions()
+{
+  using std::sinh;
+  using std::cosh;
+  using std::tanh;
+  typedef Matrix<Scalar, 1, 1> Deriv1;
+  typedef AutoDiffScalar<Deriv1> AD;
+  Deriv1 p = Deriv1::Random();
+  AD val(p.x(),Deriv1::UnitX());
+
+  Scalar cosh_px = std::cosh(p.x());
+  AD res1 = tanh(val);
+  VERIFY_IS_APPROX(res1.value(), std::tanh(p.x()));
+  VERIFY_IS_APPROX(res1.derivatives().x(), Scalar(1.0) / (cosh_px * cosh_px));
+
+  AD res2 = sinh(val);
+  VERIFY_IS_APPROX(res2.value(), std::sinh(p.x()));
+  VERIFY_IS_APPROX(res2.derivatives().x(), cosh_px);
+
+  AD res3 = cosh(val);
+  VERIFY_IS_APPROX(res3.value(), cosh_px);
+  VERIFY_IS_APPROX(res3.derivatives().x(), std::sinh(p.x()));
+
+  // Check constant values.
+  const Scalar sample_point = Scalar(1) / Scalar(3); 
+  val = AD(sample_point,Deriv1::UnitX());
+  res1 = tanh(val);
+  VERIFY_IS_APPROX(res1.derivatives().x(), Scalar(0.896629559604914));
+
+  res2 = sinh(val);
+  VERIFY_IS_APPROX(res2.derivatives().x(), Scalar(1.056071867829939));
+
+  res3 = cosh(val);
+  VERIFY_IS_APPROX(res3.derivatives().x(), Scalar(0.339540557256150));
+}
+
+void test_autodiff_scalar()
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( check_atan2<float>() );
+    CALL_SUBTEST_2( check_atan2<double>() );
+    CALL_SUBTEST_3( check_hyperbolic_functions<float>() );
+    CALL_SUBTEST_4( check_hyperbolic_functions<double>() );
+  }
+}
diff --git a/unsupported/test/bdcsvd.cpp b/unsupported/test/bdcsvd.cpp
deleted file mode 100644
index 115a649b0..000000000
--- a/unsupported/test/bdcsvd.cpp
+++ /dev/null
@@ -1,213 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2013 Gauthier Brun <brun.gauthier@gmail.com>
-// Copyright (C) 2013 Nicolas Carre <nicolas.carre@ensimag.fr>
-// Copyright (C) 2013 Jean Ceccato <jean.ceccato@ensimag.fr>
-// Copyright (C) 2013 Pierre Zoppitelli <pierre.zoppitelli@ensimag.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/
-
-#include "svd_common.h"
-#include <iostream>
-#include <Eigen/LU>
-
-// check if "svd" is the good image of "m"  
-template<typename MatrixType>
-void bdcsvd_check_full(const MatrixType& m, const BDCSVD<MatrixType>& svd)
-{
-  svd_check_full< MatrixType, BDCSVD< MatrixType > >(m, svd);
-}
-
-// Compare to a reference value
-template<typename MatrixType>
-void bdcsvd_compare_to_full(const MatrixType& m,
-			    unsigned int computationOptions,
-			    const BDCSVD<MatrixType>& referenceSvd)
-{
-  svd_compare_to_full< MatrixType, BDCSVD< MatrixType > >(m, computationOptions, referenceSvd);
-} // end bdcsvd_compare_to_full
-
-
-template<typename MatrixType>
-void bdcsvd_solve(const MatrixType& m, unsigned int computationOptions)
-{
-  svd_solve< MatrixType, BDCSVD< MatrixType > >(m, computationOptions);
-} //  end template bdcsvd_solve
-
-
-// test the computations options
-template<typename MatrixType>
-void bdcsvd_test_all_computation_options(const MatrixType& m)
-{
-  BDCSVD<MatrixType> fullSvd(m, ComputeFullU|ComputeFullV);
-  svd_test_computation_options_1< MatrixType, BDCSVD< MatrixType > >(m, fullSvd); 
-  svd_test_computation_options_2< MatrixType, BDCSVD< MatrixType > >(m, fullSvd); 
-} // end bdcsvd_test_all_computation_options
-
-
-// Call a test with all the computations options
-template<typename MatrixType>
-void bdcsvd(const MatrixType& a = MatrixType(), bool pickrandom = true)
-{
-  MatrixType m = pickrandom ? MatrixType::Random(a.rows(), a.cols()) : a;
-  bdcsvd_test_all_computation_options<MatrixType>(m);
-} // end template bdcsvd
-
-
-// verify assert
-template<typename MatrixType> 
-void bdcsvd_verify_assert(const MatrixType& m)
-{
-  svd_verify_assert< MatrixType, BDCSVD< MatrixType > >(m);
-}// end template bdcsvd_verify_assert
-
-
-// test weird values
-template<typename MatrixType>
-void bdcsvd_inf_nan()
-{
-  svd_inf_nan< MatrixType, BDCSVD< MatrixType > >();
-}// end template bdcsvd_inf_nan
-
-
-
-void bdcsvd_preallocate()
-{
-  svd_preallocate< BDCSVD< MatrixXf > >();
-} // end bdcsvd_preallocate
-
-
-// compare the Singular values returned with Jacobi and Bdc
-template<typename MatrixType> 
-void compare_bdc_jacobi(const MatrixType& a = MatrixType(), unsigned int computationOptions = 0)
-{
-  std::cout << "debut compare" << std::endl;
-  MatrixType m = MatrixType::Random(a.rows(), a.cols());
-  BDCSVD<MatrixType> bdc_svd(m);
-  JacobiSVD<MatrixType> jacobi_svd(m);
-  VERIFY_IS_APPROX(bdc_svd.singularValues(), jacobi_svd.singularValues());
-  if(computationOptions & ComputeFullU)
-    VERIFY_IS_APPROX(bdc_svd.matrixU(), jacobi_svd.matrixU());
-  if(computationOptions & ComputeThinU)
-    VERIFY_IS_APPROX(bdc_svd.matrixU(), jacobi_svd.matrixU());
-  if(computationOptions & ComputeFullV)
-    VERIFY_IS_APPROX(bdc_svd.matrixV(), jacobi_svd.matrixV());
-  if(computationOptions & ComputeThinV)
-    VERIFY_IS_APPROX(bdc_svd.matrixV(), jacobi_svd.matrixV());
-  std::cout << "fin compare" << std::endl;
-} // end template compare_bdc_jacobi
-
-
-// call the tests
-void test_bdcsvd()
-{
-  // test of Dynamic defined Matrix (42, 42) of float 
-  CALL_SUBTEST_11(( bdcsvd_verify_assert<Matrix<float,Dynamic,Dynamic> >
-		    (Matrix<float,Dynamic,Dynamic>(42,42)) ));
-  CALL_SUBTEST_11(( compare_bdc_jacobi<Matrix<float,Dynamic,Dynamic> >
-		    (Matrix<float,Dynamic,Dynamic>(42,42), 0) ));
-  CALL_SUBTEST_11(( bdcsvd<Matrix<float,Dynamic,Dynamic> >
-		    (Matrix<float,Dynamic,Dynamic>(42,42)) ));
-
-  // test of Dynamic defined Matrix (50, 50) of double 
-  CALL_SUBTEST_13(( bdcsvd_verify_assert<Matrix<double,Dynamic,Dynamic> >
-		    (Matrix<double,Dynamic,Dynamic>(50,50)) ));
-  CALL_SUBTEST_13(( compare_bdc_jacobi<Matrix<double,Dynamic,Dynamic> >
-		    (Matrix<double,Dynamic,Dynamic>(50,50), 0) ));
-  CALL_SUBTEST_13(( bdcsvd<Matrix<double,Dynamic,Dynamic> >
-		    (Matrix<double,Dynamic,Dynamic>(50, 50)) )); 
-
-  // test of Dynamic defined Matrix (22, 22) of complex double
-  CALL_SUBTEST_14(( bdcsvd_verify_assert<Matrix<std::complex<double>,Dynamic,Dynamic> >
-  		    (Matrix<std::complex<double>,Dynamic,Dynamic>(22,22)) ));
-  CALL_SUBTEST_14(( compare_bdc_jacobi<Matrix<std::complex<double>,Dynamic,Dynamic> >
-  		    (Matrix<std::complex<double>, Dynamic, Dynamic> (22,22), 0) ));
-  CALL_SUBTEST_14(( bdcsvd<Matrix<std::complex<double>,Dynamic,Dynamic> >
-  		    (Matrix<std::complex<double>,Dynamic,Dynamic>(22, 22)) )); 
-
-  // test of Dynamic defined Matrix (10, 10) of int
-  //CALL_SUBTEST_15(( bdcsvd_verify_assert<Matrix<int,Dynamic,Dynamic> >
-  //		    (Matrix<int,Dynamic,Dynamic>(10,10)) ));		    
-  //CALL_SUBTEST_15(( compare_bdc_jacobi<Matrix<int,Dynamic,Dynamic> >
-  //		    (Matrix<int,Dynamic,Dynamic>(10,10), 0) ));
-  //CALL_SUBTEST_15(( bdcsvd<Matrix<int,Dynamic,Dynamic> >
-  //		    (Matrix<int,Dynamic,Dynamic>(10, 10)) )); 
-  
-
-  // test of Dynamic defined Matrix (8, 6) of double 
- 
-  CALL_SUBTEST_16(( bdcsvd_verify_assert<Matrix<double,Dynamic,Dynamic> >
-		    (Matrix<double,Dynamic,Dynamic>(8,6)) ));
-  CALL_SUBTEST_16(( compare_bdc_jacobi<Matrix<double,Dynamic,Dynamic> >
-		    (Matrix<double,Dynamic,Dynamic>(8, 6), 0) )); 
-  CALL_SUBTEST_16(( bdcsvd<Matrix<double,Dynamic,Dynamic> >
-		    (Matrix<double,Dynamic,Dynamic>(8, 6)) ));
-
-
-  
-  // test of Dynamic defined Matrix (36, 12) of float
-  CALL_SUBTEST_17(( compare_bdc_jacobi<Matrix<float,Dynamic,Dynamic> >
-		    (Matrix<float,Dynamic,Dynamic>(36, 12), 0) )); 
-  CALL_SUBTEST_17(( bdcsvd<Matrix<float,Dynamic,Dynamic> >
-		    (Matrix<float,Dynamic,Dynamic>(36, 12)) )); 
-
-  // test of Dynamic defined Matrix (5, 8) of double 
-  CALL_SUBTEST_18(( compare_bdc_jacobi<Matrix<double,Dynamic,Dynamic> >
-		    (Matrix<double,Dynamic,Dynamic>(5, 8), 0) )); 
-  CALL_SUBTEST_18(( bdcsvd<Matrix<double,Dynamic,Dynamic> >
-		    (Matrix<double,Dynamic,Dynamic>(5, 8)) )); 
-
-
-  // non regression tests
-  CALL_SUBTEST_3(( bdcsvd_verify_assert(Matrix3f()) ));
-  CALL_SUBTEST_4(( bdcsvd_verify_assert(Matrix4d()) ));
-  CALL_SUBTEST_7(( bdcsvd_verify_assert(MatrixXf(10,12)) ));
-  CALL_SUBTEST_8(( bdcsvd_verify_assert(MatrixXcd(7,5)) ));
-
-  // SUBTESTS 1 and 2 on specifics matrix
-  for(int i = 0; i < g_repeat; i++) {
-    Matrix2cd m;
-    m << 0, 1,
-      0, 1;
-    CALL_SUBTEST_1(( bdcsvd(m, false) ));
-    m << 1, 0,
-      1, 0;
-    CALL_SUBTEST_1(( bdcsvd(m, false) ));
-
-    Matrix2d n;
-    n << 0, 0,
-      0, 0;
-    CALL_SUBTEST_2(( bdcsvd(n, false) ));
-    n << 0, 0,
-      0, 1;
-    CALL_SUBTEST_2(( bdcsvd(n, false) ));
-    
-    // Statics matrix don't work with BDSVD yet
-    // bdc algo on a random 3x3 float matrix
-    // CALL_SUBTEST_3(( bdcsvd<Matrix3f>() ));
-    // bdc algo on a random 4x4 double matrix
-    // CALL_SUBTEST_4(( bdcsvd<Matrix4d>() ));
-    // bdc algo on a random 3x5 float matrix
-    // CALL_SUBTEST_5(( bdcsvd<Matrix<float,3,5> >() ));
-
-    int r = internal::random<int>(1, 30),
-      c = internal::random<int>(1, 30);
-    CALL_SUBTEST_7(( bdcsvd<MatrixXf>(MatrixXf(r,c)) ));
-    CALL_SUBTEST_8(( bdcsvd<MatrixXcd>(MatrixXcd(r,c)) ));
-    (void) r;
-    (void) c;
-
-    // Test on inf/nan matrix
-    CALL_SUBTEST_7( bdcsvd_inf_nan<MatrixXf>() );
-  }
-
-  CALL_SUBTEST_7(( bdcsvd<MatrixXf>(MatrixXf(internal::random<int>(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE/2), internal::random<int>(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE/2))) ));
-  CALL_SUBTEST_8(( bdcsvd<MatrixXcd>(MatrixXcd(internal::random<int>(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE/3), internal::random<int>(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE/3))) ));
-
-  // Test problem size constructors
-  CALL_SUBTEST_7( BDCSVD<MatrixXf>(10,10) );
-
-} // end test_bdcsvd
diff --git a/unsupported/test/cxx11_eventcount.cpp b/unsupported/test/cxx11_eventcount.cpp
new file mode 100644
index 000000000..3b598bf42
--- /dev/null
+++ b/unsupported/test/cxx11_eventcount.cpp
@@ -0,0 +1,142 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_USE_THREADS
+#include "main.h"
+#include <Eigen/CXX11/ThreadPool>
+
+// Visual studio doesn't implement a rand_r() function since its
+// implementation of rand() is already thread safe
+int rand_reentrant(unsigned int* s) {
+#ifdef EIGEN_COMP_MSVC_STRICT
+  EIGEN_UNUSED_VARIABLE(s);
+  return rand();
+#else
+  return rand_r(s);
+#endif
+}
+
+static void test_basic_eventcount()
+{
+  MaxSizeVector<EventCount::Waiter> waiters(1);
+  waiters.resize(1);
+  EventCount ec(waiters);
+  EventCount::Waiter& w = waiters[0];
+  ec.Notify(false);
+  ec.Prewait(&w);
+  ec.Notify(true);
+  ec.CommitWait(&w);
+  ec.Prewait(&w);
+  ec.CancelWait(&w);
+}
+
+// Fake bounded counter-based queue.
+struct TestQueue {
+  std::atomic<int> val_;
+  static const int kQueueSize = 10;
+
+  TestQueue() : val_() {}
+
+  ~TestQueue() { VERIFY_IS_EQUAL(val_.load(), 0); }
+
+  bool Push() {
+    int val = val_.load(std::memory_order_relaxed);
+    for (;;) {
+      VERIFY_GE(val, 0);
+      VERIFY_LE(val, kQueueSize);
+      if (val == kQueueSize) return false;
+      if (val_.compare_exchange_weak(val, val + 1, std::memory_order_relaxed))
+        return true;
+    }
+  }
+
+  bool Pop() {
+    int val = val_.load(std::memory_order_relaxed);
+    for (;;) {
+      VERIFY_GE(val, 0);
+      VERIFY_LE(val, kQueueSize);
+      if (val == 0) return false;
+      if (val_.compare_exchange_weak(val, val - 1, std::memory_order_relaxed))
+        return true;
+    }
+  }
+
+  bool Empty() { return val_.load(std::memory_order_relaxed) == 0; }
+};
+
+const int TestQueue::kQueueSize;
+
+// A number of producers send messages to a set of consumers using a set of
+// fake queues. Ensure that it does not crash, consumers don't deadlock and
+// number of blocked and unblocked threads match.
+static void test_stress_eventcount()
+{
+  const int kThreads = std::thread::hardware_concurrency();
+  static const int kEvents = 1 << 16;
+  static const int kQueues = 10;
+
+  MaxSizeVector<EventCount::Waiter> waiters(kThreads);
+  waiters.resize(kThreads);
+  EventCount ec(waiters);
+  TestQueue queues[kQueues];
+
+  std::vector<std::unique_ptr<std::thread>> producers;
+  for (int i = 0; i < kThreads; i++) {
+    producers.emplace_back(new std::thread([&ec, &queues]() {
+      unsigned int rnd = static_cast<unsigned int>(std::hash<std::thread::id>()(std::this_thread::get_id()));
+      for (int j = 0; j < kEvents; j++) {
+        unsigned idx = rand_reentrant(&rnd) % kQueues;
+        if (queues[idx].Push()) {
+          ec.Notify(false);
+          continue;
+        }
+        EIGEN_THREAD_YIELD();
+        j--;
+      }
+    }));
+  }
+
+  std::vector<std::unique_ptr<std::thread>> consumers;
+  for (int i = 0; i < kThreads; i++) {
+    consumers.emplace_back(new std::thread([&ec, &queues, &waiters, i]() {
+      EventCount::Waiter& w = waiters[i];
+      unsigned int rnd = static_cast<unsigned int>(std::hash<std::thread::id>()(std::this_thread::get_id()));
+      for (int j = 0; j < kEvents; j++) {
+        unsigned idx = rand_reentrant(&rnd) % kQueues;
+        if (queues[idx].Pop()) continue;
+        j--;
+        ec.Prewait(&w);
+        bool empty = true;
+        for (int q = 0; q < kQueues; q++) {
+          if (!queues[q].Empty()) {
+            empty = false;
+            break;
+          }
+        }
+        if (!empty) {
+          ec.CancelWait(&w);
+          continue;
+        }
+        ec.CommitWait(&w);
+      }
+    }));
+  }
+
+  for (int i = 0; i < kThreads; i++) {
+    producers[i]->join();
+    consumers[i]->join();
+  }
+}
+
+void test_cxx11_eventcount()
+{
+  CALL_SUBTEST(test_basic_eventcount());
+  CALL_SUBTEST(test_stress_eventcount());
+}
diff --git a/unsupported/test/cxx11_meta.cpp b/unsupported/test/cxx11_meta.cpp
new file mode 100644
index 000000000..8911c59d8
--- /dev/null
+++ b/unsupported/test/cxx11_meta.cpp
@@ -0,0 +1,357 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <array>
+#include <Eigen/CXX11/src/util/CXX11Meta.h>
+
+using Eigen::internal::is_same;
+using Eigen::internal::type_list;
+using Eigen::internal::numeric_list;
+using Eigen::internal::gen_numeric_list;
+using Eigen::internal::gen_numeric_list_reversed;
+using Eigen::internal::gen_numeric_list_swapped_pair;
+using Eigen::internal::gen_numeric_list_repeated;
+using Eigen::internal::concat;
+using Eigen::internal::mconcat;
+using Eigen::internal::take;
+using Eigen::internal::skip;
+using Eigen::internal::slice;
+using Eigen::internal::get;
+using Eigen::internal::id_numeric;
+using Eigen::internal::id_type;
+using Eigen::internal::is_same_gf;
+using Eigen::internal::apply_op_from_left;
+using Eigen::internal::apply_op_from_right;
+using Eigen::internal::contained_in_list;
+using Eigen::internal::contained_in_list_gf;
+using Eigen::internal::arg_prod;
+using Eigen::internal::arg_sum;
+using Eigen::internal::sum_op;
+using Eigen::internal::product_op;
+using Eigen::internal::array_reverse;
+using Eigen::internal::array_sum;
+using Eigen::internal::array_prod;
+using Eigen::internal::array_reduce;
+using Eigen::internal::array_zip;
+using Eigen::internal::array_zip_and_reduce;
+using Eigen::internal::array_apply;
+using Eigen::internal::array_apply_and_reduce;
+using Eigen::internal::repeat;
+using Eigen::internal::instantiate_by_c_array;
+
+struct dummy_a {};
+struct dummy_b {};
+struct dummy_c {};
+struct dummy_d {};
+struct dummy_e {};
+
+// dummy operation for testing apply
+template<typename A, typename B> struct dummy_op;
+template<> struct dummy_op<dummy_a, dummy_b> { typedef dummy_c type; };
+template<> struct dummy_op<dummy_b, dummy_a> { typedef dummy_d type; };
+template<> struct dummy_op<dummy_b, dummy_c> { typedef dummy_a type; };
+template<> struct dummy_op<dummy_c, dummy_b> { typedef dummy_d type; };
+template<> struct dummy_op<dummy_c, dummy_a> { typedef dummy_b type; };
+template<> struct dummy_op<dummy_a, dummy_c> { typedef dummy_d type; };
+template<> struct dummy_op<dummy_a, dummy_a> { typedef dummy_e type; };
+template<> struct dummy_op<dummy_b, dummy_b> { typedef dummy_e type; };
+template<> struct dummy_op<dummy_c, dummy_c> { typedef dummy_e type; };
+
+template<typename A, typename B> struct dummy_test { constexpr static bool value = false; constexpr static int global_flags = 0; };
+template<> struct dummy_test<dummy_a, dummy_a>     { constexpr static bool value = true;  constexpr static int global_flags = 1; };
+template<> struct dummy_test<dummy_b, dummy_b>     { constexpr static bool value = true;  constexpr static int global_flags = 2; };
+template<> struct dummy_test<dummy_c, dummy_c>     { constexpr static bool value = true;  constexpr static int global_flags = 4; };
+
+struct times2_op { template<typename A> static A run(A v) { return v * 2; } };
+
+struct dummy_inst
+{
+  int c;
+
+  dummy_inst() : c(0) {}
+  explicit dummy_inst(int) : c(1) {}
+  dummy_inst(int, int) : c(2) {}
+  dummy_inst(int, int, int) : c(3) {}
+  dummy_inst(int, int, int, int) : c(4) {}
+  dummy_inst(int, int, int, int, int) : c(5) {}
+};
+
+static void test_gen_numeric_list()
+{
+  VERIFY((is_same<typename gen_numeric_list<int, 0>::type, numeric_list<int>>::value));
+  VERIFY((is_same<typename gen_numeric_list<int, 1>::type, numeric_list<int, 0>>::value));
+  VERIFY((is_same<typename gen_numeric_list<int, 2>::type, numeric_list<int, 0, 1>>::value));
+  VERIFY((is_same<typename gen_numeric_list<int, 5>::type, numeric_list<int, 0, 1, 2, 3, 4>>::value));
+  VERIFY((is_same<typename gen_numeric_list<int, 10>::type, numeric_list<int, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9>>::value));
+
+  VERIFY((is_same<typename gen_numeric_list<int, 0, 42>::type, numeric_list<int>>::value));
+  VERIFY((is_same<typename gen_numeric_list<int, 1, 42>::type, numeric_list<int, 42>>::value));
+  VERIFY((is_same<typename gen_numeric_list<int, 2, 42>::type, numeric_list<int, 42, 43>>::value));
+  VERIFY((is_same<typename gen_numeric_list<int, 5, 42>::type, numeric_list<int, 42, 43, 44, 45, 46>>::value));
+  VERIFY((is_same<typename gen_numeric_list<int, 10, 42>::type, numeric_list<int, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51>>::value));
+
+  VERIFY((is_same<typename gen_numeric_list_reversed<int, 0>::type, numeric_list<int>>::value));
+  VERIFY((is_same<typename gen_numeric_list_reversed<int, 1>::type, numeric_list<int, 0>>::value));
+  VERIFY((is_same<typename gen_numeric_list_reversed<int, 2>::type, numeric_list<int, 1, 0>>::value));
+  VERIFY((is_same<typename gen_numeric_list_reversed<int, 5>::type, numeric_list<int, 4, 3, 2, 1, 0>>::value));
+  VERIFY((is_same<typename gen_numeric_list_reversed<int, 10>::type, numeric_list<int, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0>>::value));
+
+  VERIFY((is_same<typename gen_numeric_list_reversed<int, 0, 42>::type, numeric_list<int>>::value));
+  VERIFY((is_same<typename gen_numeric_list_reversed<int, 1, 42>::type, numeric_list<int, 42>>::value));
+  VERIFY((is_same<typename gen_numeric_list_reversed<int, 2, 42>::type, numeric_list<int, 43, 42>>::value));
+  VERIFY((is_same<typename gen_numeric_list_reversed<int, 5, 42>::type, numeric_list<int, 46, 45, 44, 43, 42>>::value));
+  VERIFY((is_same<typename gen_numeric_list_reversed<int, 10, 42>::type, numeric_list<int, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42>>::value));
+
+  VERIFY((is_same<typename gen_numeric_list_swapped_pair<int, 0, 2, 3>::type, numeric_list<int>>::value));
+  VERIFY((is_same<typename gen_numeric_list_swapped_pair<int, 1, 2, 3>::type, numeric_list<int, 0>>::value));
+  VERIFY((is_same<typename gen_numeric_list_swapped_pair<int, 2, 2, 3>::type, numeric_list<int, 0, 1>>::value));
+  VERIFY((is_same<typename gen_numeric_list_swapped_pair<int, 5, 2, 3>::type, numeric_list<int, 0, 1, 3, 2, 4>>::value));
+  VERIFY((is_same<typename gen_numeric_list_swapped_pair<int, 10, 2, 3>::type, numeric_list<int, 0, 1, 3, 2, 4, 5, 6, 7, 8, 9>>::value));
+
+  VERIFY((is_same<typename gen_numeric_list_swapped_pair<int, 0, 44, 45, 42>::type, numeric_list<int>>::value));
+  VERIFY((is_same<typename gen_numeric_list_swapped_pair<int, 1, 44, 45, 42>::type, numeric_list<int, 42>>::value));
+  VERIFY((is_same<typename gen_numeric_list_swapped_pair<int, 2, 44, 45, 42>::type, numeric_list<int, 42, 43>>::value));
+  VERIFY((is_same<typename gen_numeric_list_swapped_pair<int, 5, 44, 45, 42>::type, numeric_list<int, 42, 43, 45, 44, 46>>::value));
+  VERIFY((is_same<typename gen_numeric_list_swapped_pair<int, 10, 44, 45, 42>::type, numeric_list<int, 42, 43, 45, 44, 46, 47, 48, 49, 50, 51>>::value));
+
+  VERIFY((is_same<typename gen_numeric_list_repeated<int, 0, 0>::type, numeric_list<int>>::value));
+  VERIFY((is_same<typename gen_numeric_list_repeated<int, 1, 0>::type, numeric_list<int, 0>>::value));
+  VERIFY((is_same<typename gen_numeric_list_repeated<int, 2, 0>::type, numeric_list<int, 0, 0>>::value));
+  VERIFY((is_same<typename gen_numeric_list_repeated<int, 5, 0>::type, numeric_list<int, 0, 0, 0, 0, 0>>::value));
+  VERIFY((is_same<typename gen_numeric_list_repeated<int, 10, 0>::type, numeric_list<int, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>>::value));
+}
+
+static void test_concat()
+{
+  VERIFY((is_same<typename concat<type_list<dummy_a, dummy_a>, type_list<>>::type, type_list<dummy_a, dummy_a>>::value));
+  VERIFY((is_same<typename concat<type_list<>, type_list<dummy_a, dummy_a>>::type, type_list<dummy_a, dummy_a>>::value));
+  VERIFY((is_same<typename concat<type_list<dummy_a, dummy_a>, type_list<dummy_a, dummy_a>>::type, type_list<dummy_a, dummy_a, dummy_a, dummy_a>>::value));
+  VERIFY((is_same<typename concat<type_list<dummy_a, dummy_a>, type_list<dummy_b, dummy_c>>::type, type_list<dummy_a, dummy_a, dummy_b, dummy_c>>::value));
+  VERIFY((is_same<typename concat<type_list<dummy_a>, type_list<dummy_b, dummy_c>>::type, type_list<dummy_a, dummy_b, dummy_c>>::value));
+
+  VERIFY((is_same<typename concat<numeric_list<int, 0, 0>, numeric_list<int>>::type, numeric_list<int, 0, 0>>::value));
+  VERIFY((is_same<typename concat<numeric_list<int>, numeric_list<int, 0, 0>>::type, numeric_list<int, 0, 0>>::value));
+  VERIFY((is_same<typename concat<numeric_list<int, 0, 0>, numeric_list<int, 0, 0>>::type, numeric_list<int, 0, 0, 0, 0>>::value));
+  VERIFY((is_same<typename concat<numeric_list<int, 0, 0>, numeric_list<int, 1, 2>>::type, numeric_list<int, 0, 0, 1, 2>>::value));
+  VERIFY((is_same<typename concat<numeric_list<int, 0>, numeric_list<int, 1, 2>>::type, numeric_list<int, 0, 1, 2>>::value));
+
+  VERIFY((is_same<typename mconcat<type_list<dummy_a>>::type, type_list<dummy_a>>::value));
+  VERIFY((is_same<typename mconcat<type_list<dummy_a>, type_list<dummy_b>>::type, type_list<dummy_a, dummy_b>>::value));
+  VERIFY((is_same<typename mconcat<type_list<dummy_a>, type_list<dummy_b>, type_list<dummy_c>>::type, type_list<dummy_a, dummy_b, dummy_c>>::value));
+  VERIFY((is_same<typename mconcat<type_list<dummy_a>, type_list<dummy_b, dummy_c>>::type, type_list<dummy_a, dummy_b, dummy_c>>::value));
+  VERIFY((is_same<typename mconcat<type_list<dummy_a, dummy_b>, type_list<dummy_c>>::type, type_list<dummy_a, dummy_b, dummy_c>>::value));
+
+  VERIFY((is_same<typename mconcat<numeric_list<int, 0>>::type, numeric_list<int, 0>>::value));
+  VERIFY((is_same<typename mconcat<numeric_list<int, 0>, numeric_list<int, 1>>::type, numeric_list<int, 0, 1>>::value));
+  VERIFY((is_same<typename mconcat<numeric_list<int, 0>, numeric_list<int, 1>, numeric_list<int, 2>>::type, numeric_list<int, 0, 1, 2>>::value));
+  VERIFY((is_same<typename mconcat<numeric_list<int, 0>, numeric_list<int, 1, 2>>::type, numeric_list<int, 0, 1, 2>>::value));
+  VERIFY((is_same<typename mconcat<numeric_list<int, 0, 1>, numeric_list<int, 2>>::type, numeric_list<int, 0, 1, 2>>::value));
+}
+
+static void test_slice()
+{
+  typedef type_list<dummy_a, dummy_a, dummy_b, dummy_b, dummy_c, dummy_c> tl;
+  typedef numeric_list<int, 0, 1, 2, 3, 4, 5> il;
+
+  VERIFY((is_same<typename take<0, tl>::type, type_list<>>::value));
+  VERIFY((is_same<typename take<1, tl>::type, type_list<dummy_a>>::value));
+  VERIFY((is_same<typename take<2, tl>::type, type_list<dummy_a, dummy_a>>::value));
+  VERIFY((is_same<typename take<3, tl>::type, type_list<dummy_a, dummy_a, dummy_b>>::value));
+  VERIFY((is_same<typename take<4, tl>::type, type_list<dummy_a, dummy_a, dummy_b, dummy_b>>::value));
+  VERIFY((is_same<typename take<5, tl>::type, type_list<dummy_a, dummy_a, dummy_b, dummy_b, dummy_c>>::value));
+  VERIFY((is_same<typename take<6, tl>::type, type_list<dummy_a, dummy_a, dummy_b, dummy_b, dummy_c, dummy_c>>::value));
+
+  VERIFY((is_same<typename take<0, il>::type, numeric_list<int>>::value));
+  VERIFY((is_same<typename take<1, il>::type, numeric_list<int, 0>>::value));
+  VERIFY((is_same<typename take<2, il>::type, numeric_list<int, 0, 1>>::value));
+  VERIFY((is_same<typename take<3, il>::type, numeric_list<int, 0, 1, 2>>::value));
+  VERIFY((is_same<typename take<4, il>::type, numeric_list<int, 0, 1, 2, 3>>::value));
+  VERIFY((is_same<typename take<5, il>::type, numeric_list<int, 0, 1, 2, 3, 4>>::value));
+  VERIFY((is_same<typename take<6, il>::type, numeric_list<int, 0, 1, 2, 3, 4, 5>>::value));
+  
+  VERIFY((is_same<typename skip<0, tl>::type, type_list<dummy_a, dummy_a, dummy_b, dummy_b, dummy_c, dummy_c>>::value));
+  VERIFY((is_same<typename skip<1, tl>::type, type_list<dummy_a, dummy_b, dummy_b, dummy_c, dummy_c>>::value));
+  VERIFY((is_same<typename skip<2, tl>::type, type_list<dummy_b, dummy_b, dummy_c, dummy_c>>::value));
+  VERIFY((is_same<typename skip<3, tl>::type, type_list<dummy_b, dummy_c, dummy_c>>::value));
+  VERIFY((is_same<typename skip<4, tl>::type, type_list<dummy_c, dummy_c>>::value));
+  VERIFY((is_same<typename skip<5, tl>::type, type_list<dummy_c>>::value));
+  VERIFY((is_same<typename skip<6, tl>::type, type_list<>>::value));
+
+  VERIFY((is_same<typename skip<0, il>::type, numeric_list<int, 0, 1, 2, 3, 4, 5>>::value));
+  VERIFY((is_same<typename skip<1, il>::type, numeric_list<int, 1, 2, 3, 4, 5>>::value));
+  VERIFY((is_same<typename skip<2, il>::type, numeric_list<int, 2, 3, 4, 5>>::value));
+  VERIFY((is_same<typename skip<3, il>::type, numeric_list<int, 3, 4, 5>>::value));
+  VERIFY((is_same<typename skip<4, il>::type, numeric_list<int, 4, 5>>::value));
+  VERIFY((is_same<typename skip<5, il>::type, numeric_list<int, 5>>::value));
+  VERIFY((is_same<typename skip<6, il>::type, numeric_list<int>>::value));
+
+  VERIFY((is_same<typename slice<0, 3, tl>::type, typename take<3, tl>::type>::value));
+  VERIFY((is_same<typename slice<0, 3, il>::type, typename take<3, il>::type>::value));
+  VERIFY((is_same<typename slice<1, 3, tl>::type, type_list<dummy_a, dummy_b, dummy_b>>::value));
+  VERIFY((is_same<typename slice<1, 3, il>::type, numeric_list<int, 1, 2, 3>>::value));
+}
+
+static void test_get()
+{
+  typedef type_list<dummy_a, dummy_a, dummy_b, dummy_b, dummy_c, dummy_c> tl;
+  typedef numeric_list<int, 4, 8, 15, 16, 23, 42> il;
+
+  VERIFY((is_same<typename get<0, tl>::type, dummy_a>::value));
+  VERIFY((is_same<typename get<1, tl>::type, dummy_a>::value));
+  VERIFY((is_same<typename get<2, tl>::type, dummy_b>::value));
+  VERIFY((is_same<typename get<3, tl>::type, dummy_b>::value));
+  VERIFY((is_same<typename get<4, tl>::type, dummy_c>::value));
+  VERIFY((is_same<typename get<5, tl>::type, dummy_c>::value));
+
+  VERIFY_IS_EQUAL(((int)get<0, il>::value), 4);
+  VERIFY_IS_EQUAL(((int)get<1, il>::value), 8);
+  VERIFY_IS_EQUAL(((int)get<2, il>::value), 15);
+  VERIFY_IS_EQUAL(((int)get<3, il>::value), 16);
+  VERIFY_IS_EQUAL(((int)get<4, il>::value), 23);
+  VERIFY_IS_EQUAL(((int)get<5, il>::value), 42);
+}
+
+static void test_id_helper(dummy_a a, dummy_a b, dummy_a c)
+{
+  (void)a;
+  (void)b;
+  (void)c;
+}
+
+template<int... ii>
+static void test_id_numeric()
+{
+  test_id_helper(typename id_numeric<int, ii, dummy_a>::type()...);
+}
+
+template<typename... tt>
+static void test_id_type()
+{
+  test_id_helper(typename id_type<tt, dummy_a>::type()...);
+}
+
+static void test_id()
+{
+  // don't call VERIFY here, just assume it works if it compiles
+  // (otherwise it will complain that it can't find the function)
+  test_id_numeric<1, 4, 6>();
+  test_id_type<dummy_a, dummy_b, dummy_c>();
+}
+
+static void test_is_same_gf()
+{
+  VERIFY((!is_same_gf<dummy_a, dummy_b>::value));
+  VERIFY((!!is_same_gf<dummy_a, dummy_a>::value));
+  VERIFY_IS_EQUAL((!!is_same_gf<dummy_a, dummy_b>::global_flags), false);
+  VERIFY_IS_EQUAL((!!is_same_gf<dummy_a, dummy_a>::global_flags), false);
+}
+
+static void test_apply_op()
+{
+  typedef type_list<dummy_a, dummy_b, dummy_c> tl;
+  VERIFY((!!is_same<typename apply_op_from_left<dummy_op, dummy_a, tl>::type, type_list<dummy_e, dummy_c, dummy_d>>::value));
+  VERIFY((!!is_same<typename apply_op_from_right<dummy_op, dummy_a, tl>::type, type_list<dummy_e, dummy_d, dummy_b>>::value));
+}
+
+static void test_contained_in_list()
+{
+  typedef type_list<dummy_a, dummy_b, dummy_c> tl;
+
+  VERIFY((!!contained_in_list<is_same, dummy_a, tl>::value));
+  VERIFY((!!contained_in_list<is_same, dummy_b, tl>::value));
+  VERIFY((!!contained_in_list<is_same, dummy_c, tl>::value));
+  VERIFY((!contained_in_list<is_same, dummy_d, tl>::value));
+  VERIFY((!contained_in_list<is_same, dummy_e, tl>::value));
+
+  VERIFY((!!contained_in_list_gf<dummy_test, dummy_a, tl>::value));
+  VERIFY((!!contained_in_list_gf<dummy_test, dummy_b, tl>::value));
+  VERIFY((!!contained_in_list_gf<dummy_test, dummy_c, tl>::value));
+  VERIFY((!contained_in_list_gf<dummy_test, dummy_d, tl>::value));
+  VERIFY((!contained_in_list_gf<dummy_test, dummy_e, tl>::value));
+
+  VERIFY_IS_EQUAL(((int)contained_in_list_gf<dummy_test, dummy_a, tl>::global_flags), 1);
+  VERIFY_IS_EQUAL(((int)contained_in_list_gf<dummy_test, dummy_b, tl>::global_flags), 2);
+  VERIFY_IS_EQUAL(((int)contained_in_list_gf<dummy_test, dummy_c, tl>::global_flags), 4);
+  VERIFY_IS_EQUAL(((int)contained_in_list_gf<dummy_test, dummy_d, tl>::global_flags), 0);
+  VERIFY_IS_EQUAL(((int)contained_in_list_gf<dummy_test, dummy_e, tl>::global_flags), 0);
+}
+
+static void test_arg_reductions()
+{
+  VERIFY_IS_EQUAL(arg_sum(1,2,3,4), 10);
+  VERIFY_IS_EQUAL(arg_prod(1,2,3,4), 24);
+  VERIFY_IS_APPROX(arg_sum(0.5, 2, 5), 7.5);
+  VERIFY_IS_APPROX(arg_prod(0.5, 2, 5), 5.0);
+}
+
+static void test_array_reverse_and_reduce()
+{
+  array<int, 6> a{{4, 8, 15, 16, 23, 42}};
+  array<int, 6> b{{42, 23, 16, 15, 8, 4}};
+
+  // there is no operator<< for std::array, so VERIFY_IS_EQUAL will
+  // not compile
+  VERIFY((array_reverse(a) == b));
+  VERIFY((array_reverse(b) == a));
+  VERIFY_IS_EQUAL((array_sum(a)), 108);
+  VERIFY_IS_EQUAL((array_sum(b)), 108);
+  VERIFY_IS_EQUAL((array_prod(a)), 7418880);
+  VERIFY_IS_EQUAL((array_prod(b)), 7418880);
+}
+
+static void test_array_zip_and_apply()
+{
+  array<int, 6> a{{4, 8, 15, 16, 23, 42}};
+  array<int, 6> b{{0, 1, 2, 3, 4, 5}};
+  array<int, 6> c{{4, 9, 17, 19, 27, 47}};
+  array<int, 6> d{{0, 8, 30, 48, 92, 210}};
+  array<int, 6> e{{0, 2, 4, 6, 8, 10}};
+
+  VERIFY((array_zip<sum_op>(a, b) == c));
+  VERIFY((array_zip<product_op>(a, b) == d));
+  VERIFY((array_apply<times2_op>(b) == e));
+  VERIFY_IS_EQUAL((array_apply_and_reduce<sum_op, times2_op>(a)), 216);
+  VERIFY_IS_EQUAL((array_apply_and_reduce<sum_op, times2_op>(b)), 30);
+  VERIFY_IS_EQUAL((array_zip_and_reduce<product_op, sum_op>(a, b)), 14755932);
+  VERIFY_IS_EQUAL((array_zip_and_reduce<sum_op, product_op>(a, b)), 388);
+}
+
+static void test_array_misc()
+{
+  array<int, 3> a3{{1, 1, 1}};
+  array<int, 6> a6{{2, 2, 2, 2, 2, 2}};
+  VERIFY((repeat<3, int>(1) == a3));
+  VERIFY((repeat<6, int>(2) == a6));
+
+  int data[5] = { 0, 1, 2, 3, 4 };
+  VERIFY_IS_EQUAL((instantiate_by_c_array<dummy_inst, int, 0>(data).c), 0);
+  VERIFY_IS_EQUAL((instantiate_by_c_array<dummy_inst, int, 1>(data).c), 1);
+  VERIFY_IS_EQUAL((instantiate_by_c_array<dummy_inst, int, 2>(data).c), 2);
+  VERIFY_IS_EQUAL((instantiate_by_c_array<dummy_inst, int, 3>(data).c), 3);
+  VERIFY_IS_EQUAL((instantiate_by_c_array<dummy_inst, int, 4>(data).c), 4);
+  VERIFY_IS_EQUAL((instantiate_by_c_array<dummy_inst, int, 5>(data).c), 5);
+}
+
+void test_cxx11_meta()
+{
+  CALL_SUBTEST(test_gen_numeric_list());
+  CALL_SUBTEST(test_concat());
+  CALL_SUBTEST(test_slice());
+  CALL_SUBTEST(test_get());
+  CALL_SUBTEST(test_id());
+  CALL_SUBTEST(test_is_same_gf());
+  CALL_SUBTEST(test_apply_op());
+  CALL_SUBTEST(test_contained_in_list());
+  CALL_SUBTEST(test_arg_reductions());
+  CALL_SUBTEST(test_array_reverse_and_reduce());
+  CALL_SUBTEST(test_array_zip_and_apply());
+  CALL_SUBTEST(test_array_misc());
+}
diff --git a/unsupported/test/cxx11_non_blocking_thread_pool.cpp b/unsupported/test/cxx11_non_blocking_thread_pool.cpp
new file mode 100644
index 000000000..5f9bb938b
--- /dev/null
+++ b/unsupported/test/cxx11_non_blocking_thread_pool.cpp
@@ -0,0 +1,107 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_USE_THREADS
+#include "main.h"
+#include "Eigen/CXX11/ThreadPool"
+
+static void test_create_destroy_empty_pool()
+{
+  // Just create and destroy the pool. This will wind up and tear down worker
+  // threads. Ensure there are no issues in that logic.
+  for (int i = 0; i < 16; ++i) {
+    NonBlockingThreadPool tp(i);
+  }
+}
+
+
+static void test_parallelism()
+{
+  // Test we never-ever fail to match available tasks with idle threads.
+  const int kThreads = 16;  // code below expects that this is a multiple of 4
+  NonBlockingThreadPool tp(kThreads);
+  VERIFY_IS_EQUAL(tp.NumThreads(), kThreads);
+  VERIFY_IS_EQUAL(tp.CurrentThreadId(), -1);
+  for (int iter = 0; iter < 100; ++iter) {
+    std::atomic<int> running(0);
+    std::atomic<int> done(0);
+    std::atomic<int> phase(0);
+    // Schedule kThreads tasks and ensure that they all are running.
+    for (int i = 0; i < kThreads; ++i) {
+      tp.Schedule([&]() {
+        const int thread_id = tp.CurrentThreadId();
+        VERIFY_GE(thread_id, 0);
+        VERIFY_LE(thread_id, kThreads - 1);
+        running++;
+        while (phase < 1) {
+        }
+        done++;
+      });
+    }
+    while (running != kThreads) {
+    }
+    running = 0;
+    phase = 1;
+    // Now, while the previous tasks exit, schedule another kThreads tasks and
+    // ensure that they are running.
+    for (int i = 0; i < kThreads; ++i) {
+      tp.Schedule([&, i]() {
+        running++;
+        while (phase < 2) {
+        }
+        // When all tasks are running, half of tasks exit, quarter of tasks
+        // continue running and quarter of tasks schedule another 2 tasks each.
+        // Concurrently main thread schedules another quarter of tasks.
+        // This gives us another kThreads tasks and we ensure that they all
+        // are running.
+        if (i < kThreads / 2) {
+        } else if (i < 3 * kThreads / 4) {
+          running++;
+          while (phase < 3) {
+          }
+          done++;
+        } else {
+          for (int j = 0; j < 2; ++j) {
+            tp.Schedule([&]() {
+              running++;
+              while (phase < 3) {
+              }
+              done++;
+            });
+          }
+        }
+        done++;
+      });
+    }
+    while (running != kThreads) {
+    }
+    running = 0;
+    phase = 2;
+    for (int i = 0; i < kThreads / 4; ++i) {
+      tp.Schedule([&]() {
+        running++;
+        while (phase < 3) {
+        }
+        done++;
+      });
+    }
+    while (running != kThreads) {
+    }
+    phase = 3;
+    while (done != 3 * kThreads) {
+    }
+  }
+}
+
+void test_cxx11_non_blocking_thread_pool()
+{
+  CALL_SUBTEST(test_create_destroy_empty_pool());
+  CALL_SUBTEST(test_parallelism());
+}
diff --git a/unsupported/test/cxx11_runqueue.cpp b/unsupported/test/cxx11_runqueue.cpp
new file mode 100644
index 000000000..91f690114
--- /dev/null
+++ b/unsupported/test/cxx11_runqueue.cpp
@@ -0,0 +1,235 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_USE_THREADS
+#include <cstdlib>
+#include "main.h"
+#include <Eigen/CXX11/ThreadPool>
+
+
+// Visual studio doesn't implement a rand_r() function since its
+// implementation of rand() is already thread safe
+int rand_reentrant(unsigned int* s) {
+#ifdef EIGEN_COMP_MSVC_STRICT
+  EIGEN_UNUSED_VARIABLE(s);
+  return rand();
+#else
+  return rand_r(s);
+#endif
+}
+
+void test_basic_runqueue()
+{
+  RunQueue<int, 4> q;
+  // Check empty state.
+  VERIFY(q.Empty());
+  VERIFY_IS_EQUAL(0u, q.Size());
+  VERIFY_IS_EQUAL(0, q.PopFront());
+  std::vector<int> stolen;
+  VERIFY_IS_EQUAL(0u, q.PopBackHalf(&stolen));
+  VERIFY_IS_EQUAL(0u, stolen.size());
+  // Push one front, pop one front.
+  VERIFY_IS_EQUAL(0, q.PushFront(1));
+  VERIFY_IS_EQUAL(1u, q.Size());
+  VERIFY_IS_EQUAL(1, q.PopFront());
+  VERIFY_IS_EQUAL(0u, q.Size());
+  // Push front to overflow.
+  VERIFY_IS_EQUAL(0, q.PushFront(2));
+  VERIFY_IS_EQUAL(1u, q.Size());
+  VERIFY_IS_EQUAL(0, q.PushFront(3));
+  VERIFY_IS_EQUAL(2u, q.Size());
+  VERIFY_IS_EQUAL(0, q.PushFront(4));
+  VERIFY_IS_EQUAL(3u, q.Size());
+  VERIFY_IS_EQUAL(0, q.PushFront(5));
+  VERIFY_IS_EQUAL(4u, q.Size());
+  VERIFY_IS_EQUAL(6, q.PushFront(6));
+  VERIFY_IS_EQUAL(4u, q.Size());
+  VERIFY_IS_EQUAL(5, q.PopFront());
+  VERIFY_IS_EQUAL(3u, q.Size());
+  VERIFY_IS_EQUAL(4, q.PopFront());
+  VERIFY_IS_EQUAL(2u, q.Size());
+  VERIFY_IS_EQUAL(3, q.PopFront());
+  VERIFY_IS_EQUAL(1u, q.Size());
+  VERIFY_IS_EQUAL(2, q.PopFront());
+  VERIFY_IS_EQUAL(0u, q.Size());
+  VERIFY_IS_EQUAL(0, q.PopFront());
+  // Push one back, pop one back.
+  VERIFY_IS_EQUAL(0, q.PushBack(7));
+  VERIFY_IS_EQUAL(1u, q.Size());
+  VERIFY_IS_EQUAL(1u, q.PopBackHalf(&stolen));
+  VERIFY_IS_EQUAL(1u, stolen.size());
+  VERIFY_IS_EQUAL(7, stolen[0]);
+  VERIFY_IS_EQUAL(0u, q.Size());
+  stolen.clear();
+  // Push back to overflow.
+  VERIFY_IS_EQUAL(0, q.PushBack(8));
+  VERIFY_IS_EQUAL(1u, q.Size());
+  VERIFY_IS_EQUAL(0, q.PushBack(9));
+  VERIFY_IS_EQUAL(2u, q.Size());
+  VERIFY_IS_EQUAL(0, q.PushBack(10));
+  VERIFY_IS_EQUAL(3u, q.Size());
+  VERIFY_IS_EQUAL(0, q.PushBack(11));
+  VERIFY_IS_EQUAL(4u, q.Size());
+  VERIFY_IS_EQUAL(12, q.PushBack(12));
+  VERIFY_IS_EQUAL(4u, q.Size());
+  // Pop back in halves.
+  VERIFY_IS_EQUAL(2u, q.PopBackHalf(&stolen));
+  VERIFY_IS_EQUAL(2u, stolen.size());
+  VERIFY_IS_EQUAL(10, stolen[0]);
+  VERIFY_IS_EQUAL(11, stolen[1]);
+  VERIFY_IS_EQUAL(2u, q.Size());
+  stolen.clear();
+  VERIFY_IS_EQUAL(1u, q.PopBackHalf(&stolen));
+  VERIFY_IS_EQUAL(1u, stolen.size());
+  VERIFY_IS_EQUAL(9, stolen[0]);
+  VERIFY_IS_EQUAL(1u, q.Size());
+  stolen.clear();
+  VERIFY_IS_EQUAL(1u, q.PopBackHalf(&stolen));
+  VERIFY_IS_EQUAL(1u, stolen.size());
+  VERIFY_IS_EQUAL(8, stolen[0]);
+  stolen.clear();
+  VERIFY_IS_EQUAL(0u, q.PopBackHalf(&stolen));
+  VERIFY_IS_EQUAL(0u, stolen.size());
+  // Empty again.
+  VERIFY(q.Empty());
+  VERIFY_IS_EQUAL(0u, q.Size());
+  VERIFY_IS_EQUAL(0, q.PushFront(1));
+  VERIFY_IS_EQUAL(0, q.PushFront(2));
+  VERIFY_IS_EQUAL(0, q.PushFront(3));
+  VERIFY_IS_EQUAL(1, q.PopBack());
+  VERIFY_IS_EQUAL(2, q.PopBack());
+  VERIFY_IS_EQUAL(3, q.PopBack());
+  VERIFY(q.Empty());
+  VERIFY_IS_EQUAL(0u, q.Size());
+}
+
+// Empty tests that the queue is not claimed to be empty when is is in fact not.
+// Emptiness property is crucial part of thread pool blocking scheme,
+// so we go to great effort to ensure this property. We create a queue with
+// 1 element and then push 1 element (either front or back at random) and pop
+// 1 element (either front or back at random). So queue always contains at least
+// 1 element, but otherwise changes chaotically. Another thread constantly tests
+// that the queue is not claimed to be empty.
+void test_empty_runqueue()
+{
+  RunQueue<int, 4> q;
+  q.PushFront(1);
+  std::atomic<bool> done(false);
+  std::thread mutator([&q, &done]() {
+    unsigned rnd = 0;
+    std::vector<int> stolen;
+    for (int i = 0; i < 1 << 18; i++) {
+      if (rand_reentrant(&rnd) % 2)
+        VERIFY_IS_EQUAL(0, q.PushFront(1));
+      else
+        VERIFY_IS_EQUAL(0, q.PushBack(1));
+      if (rand_reentrant(&rnd) % 2)
+        VERIFY_IS_EQUAL(1, q.PopFront());
+      else {
+        for (;;) {
+          if (q.PopBackHalf(&stolen) == 1) {
+            stolen.clear();
+            break;
+          }
+          VERIFY_IS_EQUAL(0u, stolen.size());
+        }
+      }
+    }
+    done = true;
+  });
+  while (!done) {
+    VERIFY(!q.Empty());
+    int size = q.Size();
+    VERIFY_GE(size, 1);
+    VERIFY_LE(size, 2);
+  }
+  VERIFY_IS_EQUAL(1, q.PopFront());
+  mutator.join();
+}
+
+// Stress is a chaotic random test.
+// One thread (owner) calls PushFront/PopFront, other threads call PushBack/
+// PopBack. Ensure that we don't crash, deadlock, and all sanity checks pass.
+void test_stress_runqueue()
+{
+  static const int kEvents = 1 << 18;
+  RunQueue<int, 8> q;
+  std::atomic<int> total(0);
+  std::vector<std::unique_ptr<std::thread>> threads;
+  threads.emplace_back(new std::thread([&q, &total]() {
+    int sum = 0;
+    int pushed = 1;
+    int popped = 1;
+    while (pushed < kEvents || popped < kEvents) {
+      if (pushed < kEvents) {
+        if (q.PushFront(pushed) == 0) {
+          sum += pushed;
+          pushed++;
+        }
+      }
+      if (popped < kEvents) {
+        int v = q.PopFront();
+        if (v != 0) {
+          sum -= v;
+          popped++;
+        }
+      }
+    }
+    total += sum;
+  }));
+  for (int i = 0; i < 2; i++) {
+    threads.emplace_back(new std::thread([&q, &total]() {
+      int sum = 0;
+      for (int j = 1; j < kEvents; j++) {
+        if (q.PushBack(j) == 0) {
+          sum += j;
+          continue;
+        }
+        EIGEN_THREAD_YIELD();
+        j--;
+      }
+      total += sum;
+    }));
+    threads.emplace_back(new std::thread([&q, &total]() {
+      int sum = 0;
+      std::vector<int> stolen;
+      for (int j = 1; j < kEvents;) {
+        if (q.PopBackHalf(&stolen) == 0) {
+          EIGEN_THREAD_YIELD();
+          continue;
+        }
+        while (stolen.size() && j < kEvents) {
+          int v = stolen.back();
+          stolen.pop_back();
+          VERIFY_IS_NOT_EQUAL(v, 0);
+          sum += v;
+          j++;
+        }
+      }
+      while (stolen.size()) {
+        int v = stolen.back();
+        stolen.pop_back();
+        VERIFY_IS_NOT_EQUAL(v, 0);
+        while ((v = q.PushBack(v)) != 0) EIGEN_THREAD_YIELD();
+      }
+      total -= sum;
+    }));
+  }
+  for (size_t i = 0; i < threads.size(); i++) threads[i]->join();
+  VERIFY(q.Empty());
+  VERIFY(total.load() == 0);
+}
+
+void test_cxx11_runqueue()
+{
+  CALL_SUBTEST_1(test_basic_runqueue());
+  CALL_SUBTEST_2(test_empty_runqueue());
+  CALL_SUBTEST_3(test_stress_runqueue());
+}
diff --git a/unsupported/test/cxx11_tensor_argmax.cpp b/unsupported/test/cxx11_tensor_argmax.cpp
new file mode 100644
index 000000000..037767270
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_argmax.cpp
@@ -0,0 +1,294 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Eugene Brevdo <ebrevdo@google.com>
+//                    Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::array;
+using Eigen::Tuple;
+
+template <int DataLayout>
+static void test_simple_index_tuples()
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+  tensor = (tensor + tensor.constant(0.5)).log();
+
+  Tensor<Tuple<DenseIndex, float>, 4, DataLayout> index_tuples(2,3,5,7);
+  index_tuples = tensor.index_tuples();
+
+  for (DenseIndex n = 0; n < 2*3*5*7; ++n) {
+    const Tuple<DenseIndex, float>& v = index_tuples.coeff(n);
+    VERIFY_IS_EQUAL(v.first, n);
+    VERIFY_IS_EQUAL(v.second, tensor.coeff(n));
+  }
+}
+
+template <int DataLayout>
+static void test_index_tuples_dim()
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+  tensor = (tensor + tensor.constant(0.5)).log();
+
+  Tensor<Tuple<DenseIndex, float>, 4, DataLayout> index_tuples(2,3,5,7);
+
+  index_tuples = tensor.index_tuples();
+
+  for (Eigen::DenseIndex n = 0; n < tensor.size(); ++n) {
+    const Tuple<DenseIndex, float>& v = index_tuples(n); //(i, j, k, l);
+    VERIFY_IS_EQUAL(v.first, n);
+    VERIFY_IS_EQUAL(v.second, tensor(n));
+  }
+}
+
+template <int DataLayout>
+static void test_argmax_tuple_reducer()
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+  tensor = (tensor + tensor.constant(0.5)).log();
+
+  Tensor<Tuple<DenseIndex, float>, 4, DataLayout> index_tuples(2,3,5,7);
+  index_tuples = tensor.index_tuples();
+
+  Tensor<Tuple<DenseIndex, float>, 0, DataLayout> reduced;
+  DimensionList<DenseIndex, 4> dims;
+  reduced = index_tuples.reduce(
+      dims, internal::ArgMaxTupleReducer<Tuple<DenseIndex, float> >());
+
+  Tensor<float, 0, DataLayout> maxi = tensor.maximum();
+
+  VERIFY_IS_EQUAL(maxi(), reduced(0).second);
+
+  array<DenseIndex, 3> reduce_dims;
+  for (int d = 0; d < 3; ++d) reduce_dims[d] = d;
+  Tensor<Tuple<DenseIndex, float>, 1, DataLayout> reduced_by_dims(7);
+  reduced_by_dims = index_tuples.reduce(
+      reduce_dims, internal::ArgMaxTupleReducer<Tuple<DenseIndex, float> >());
+
+  Tensor<float, 1, DataLayout> max_by_dims = tensor.maximum(reduce_dims);
+
+  for (int l = 0; l < 7; ++l) {
+    VERIFY_IS_EQUAL(max_by_dims(l), reduced_by_dims(l).second);
+  }
+}
+
+template <int DataLayout>
+static void test_argmin_tuple_reducer()
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+  tensor = (tensor + tensor.constant(0.5)).log();
+
+  Tensor<Tuple<DenseIndex, float>, 4, DataLayout> index_tuples(2,3,5,7);
+  index_tuples = tensor.index_tuples();
+
+  Tensor<Tuple<DenseIndex, float>, 0, DataLayout> reduced;
+  DimensionList<DenseIndex, 4> dims;
+  reduced = index_tuples.reduce(
+      dims, internal::ArgMinTupleReducer<Tuple<DenseIndex, float> >());
+
+  Tensor<float, 0, DataLayout> mini = tensor.minimum();
+
+  VERIFY_IS_EQUAL(mini(), reduced(0).second);
+
+  array<DenseIndex, 3> reduce_dims;
+  for (int d = 0; d < 3; ++d) reduce_dims[d] = d;
+  Tensor<Tuple<DenseIndex, float>, 1, DataLayout> reduced_by_dims(7);
+  reduced_by_dims = index_tuples.reduce(
+      reduce_dims, internal::ArgMinTupleReducer<Tuple<DenseIndex, float> >());
+
+  Tensor<float, 1, DataLayout> min_by_dims = tensor.minimum(reduce_dims);
+
+  for (int l = 0; l < 7; ++l) {
+    VERIFY_IS_EQUAL(min_by_dims(l), reduced_by_dims(l).second);
+  }
+}
+
+template <int DataLayout>
+static void test_simple_argmax()
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+  tensor = (tensor + tensor.constant(0.5)).log();
+  tensor(0,0,0,0) = 10.0;
+
+  Tensor<DenseIndex, 0, DataLayout> tensor_argmax;
+
+  tensor_argmax = tensor.argmax();
+
+  VERIFY_IS_EQUAL(tensor_argmax(0), 0);
+
+  tensor(1,2,4,6) = 20.0;
+
+  tensor_argmax = tensor.argmax();
+
+  VERIFY_IS_EQUAL(tensor_argmax(0), 2*3*5*7 - 1);
+}
+
+template <int DataLayout>
+static void test_simple_argmin()
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+  tensor = (tensor + tensor.constant(0.5)).log();
+  tensor(0,0,0,0) = -10.0;
+
+  Tensor<DenseIndex, 0, DataLayout> tensor_argmin;
+
+  tensor_argmin = tensor.argmin();
+
+  VERIFY_IS_EQUAL(tensor_argmin(0), 0);
+
+  tensor(1,2,4,6) = -20.0;
+
+  tensor_argmin = tensor.argmin();
+
+  VERIFY_IS_EQUAL(tensor_argmin(0), 2*3*5*7 - 1);
+}
+
+template <int DataLayout>
+static void test_argmax_dim()
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  std::vector<int> dims {2, 3, 5, 7};
+
+  for (int dim = 0; dim < 4; ++dim) {
+    tensor.setRandom();
+    tensor = (tensor + tensor.constant(0.5)).log();
+
+    Tensor<DenseIndex, 3, DataLayout> tensor_argmax;
+    array<DenseIndex, 4> ix;
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 3; ++j) {
+        for (int k = 0; k < 5; ++k) {
+          for (int l = 0; l < 7; ++l) {
+            ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
+            if (ix[dim] != 0) continue;
+            // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = 10.0
+            tensor(ix) = 10.0;
+          }
+        }
+      }
+    }
+
+    tensor_argmax = tensor.argmax(dim);
+
+    VERIFY_IS_EQUAL(tensor_argmax.size(),
+                    ptrdiff_t(2*3*5*7 / tensor.dimension(dim)));
+    for (ptrdiff_t n = 0; n < tensor_argmax.size(); ++n) {
+      // Expect max to be in the first index of the reduced dimension
+      VERIFY_IS_EQUAL(tensor_argmax.data()[n], 0);
+    }
+
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 3; ++j) {
+        for (int k = 0; k < 5; ++k) {
+          for (int l = 0; l < 7; ++l) {
+            ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
+            if (ix[dim] != tensor.dimension(dim) - 1) continue;
+            // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0
+            tensor(ix) = 20.0;
+          }
+        }
+      }
+    }
+
+    tensor_argmax = tensor.argmax(dim);
+
+    VERIFY_IS_EQUAL(tensor_argmax.size(),
+                    ptrdiff_t(2*3*5*7 / tensor.dimension(dim)));
+    for (ptrdiff_t n = 0; n < tensor_argmax.size(); ++n) {
+      // Expect max to be in the last index of the reduced dimension
+      VERIFY_IS_EQUAL(tensor_argmax.data()[n], tensor.dimension(dim) - 1);
+    }
+  }
+}
+
+template <int DataLayout>
+static void test_argmin_dim()
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  std::vector<int> dims {2, 3, 5, 7};
+
+  for (int dim = 0; dim < 4; ++dim) {
+    tensor.setRandom();
+    tensor = (tensor + tensor.constant(0.5)).log();
+
+    Tensor<DenseIndex, 3, DataLayout> tensor_argmin;
+    array<DenseIndex, 4> ix;
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 3; ++j) {
+        for (int k = 0; k < 5; ++k) {
+          for (int l = 0; l < 7; ++l) {
+            ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
+            if (ix[dim] != 0) continue;
+            // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = -10.0
+            tensor(ix) = -10.0;
+          }
+        }
+      }
+    }
+
+    tensor_argmin = tensor.argmin(dim);
+
+    VERIFY_IS_EQUAL(tensor_argmin.size(),
+                    ptrdiff_t(2*3*5*7 / tensor.dimension(dim)));
+    for (ptrdiff_t n = 0; n < tensor_argmin.size(); ++n) {
+      // Expect min to be in the first index of the reduced dimension
+      VERIFY_IS_EQUAL(tensor_argmin.data()[n], 0);
+    }
+
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 3; ++j) {
+        for (int k = 0; k < 5; ++k) {
+          for (int l = 0; l < 7; ++l) {
+            ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
+            if (ix[dim] != tensor.dimension(dim) - 1) continue;
+            // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = -20.0
+            tensor(ix) = -20.0;
+          }
+        }
+      }
+    }
+
+    tensor_argmin = tensor.argmin(dim);
+
+    VERIFY_IS_EQUAL(tensor_argmin.size(),
+                    ptrdiff_t(2*3*5*7 / tensor.dimension(dim)));
+    for (ptrdiff_t n = 0; n < tensor_argmin.size(); ++n) {
+      // Expect min to be in the last index of the reduced dimension
+      VERIFY_IS_EQUAL(tensor_argmin.data()[n], tensor.dimension(dim) - 1);
+    }
+  }
+}
+
+void test_cxx11_tensor_argmax()
+{
+  CALL_SUBTEST(test_simple_index_tuples<RowMajor>());
+  CALL_SUBTEST(test_simple_index_tuples<ColMajor>());
+  CALL_SUBTEST(test_index_tuples_dim<RowMajor>());
+  CALL_SUBTEST(test_index_tuples_dim<ColMajor>());
+  CALL_SUBTEST(test_argmax_tuple_reducer<RowMajor>());
+  CALL_SUBTEST(test_argmax_tuple_reducer<ColMajor>());
+  CALL_SUBTEST(test_argmin_tuple_reducer<RowMajor>());
+  CALL_SUBTEST(test_argmin_tuple_reducer<ColMajor>());
+  CALL_SUBTEST(test_simple_argmax<RowMajor>());
+  CALL_SUBTEST(test_simple_argmax<ColMajor>());
+  CALL_SUBTEST(test_simple_argmin<RowMajor>());
+  CALL_SUBTEST(test_simple_argmin<ColMajor>());
+  CALL_SUBTEST(test_argmax_dim<RowMajor>());
+  CALL_SUBTEST(test_argmax_dim<ColMajor>());
+  CALL_SUBTEST(test_argmin_dim<RowMajor>());
+  CALL_SUBTEST(test_argmin_dim<ColMajor>());
+}
diff --git a/unsupported/test/cxx11_tensor_argmax_cuda.cu b/unsupported/test/cxx11_tensor_argmax_cuda.cu
new file mode 100644
index 000000000..653443dc5
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_argmax_cuda.cu
@@ -0,0 +1,254 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_FUNC cxx11_tensor_cuda
+#define EIGEN_USE_GPU
+
+#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+#include <cuda_fp16.h>
+#endif
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template <int Layout>
+void test_cuda_simple_argmax()
+{
+  Tensor<double, 3, Layout> in(Eigen::array<DenseIndex, 3>(72,53,97));
+  Tensor<DenseIndex, 1, Layout> out_max(Eigen::array<DenseIndex, 1>(1));
+  Tensor<DenseIndex, 1, Layout> out_min(Eigen::array<DenseIndex, 1>(1));
+  in.setRandom();
+  in *= in.constant(100.0);
+  in(0, 0, 0) = -1000.0;
+  in(71, 52, 96) = 1000.0;
+
+  std::size_t in_bytes = in.size() * sizeof(double);
+  std::size_t out_bytes = out_max.size() * sizeof(DenseIndex);
+
+  double* d_in;
+  DenseIndex* d_out_max;
+  DenseIndex* d_out_min;
+  cudaMalloc((void**)(&d_in), in_bytes);
+  cudaMalloc((void**)(&d_out_max), out_bytes);
+  cudaMalloc((void**)(&d_out_min), out_bytes);
+
+  cudaMemcpy(d_in, in.data(), in_bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<double, 3, Layout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 3>(72,53,97));
+  Eigen::TensorMap<Eigen::Tensor<DenseIndex, 1, Layout>, Aligned > gpu_out_max(d_out_max, Eigen::array<DenseIndex, 1>(1));
+  Eigen::TensorMap<Eigen::Tensor<DenseIndex, 1, Layout>, Aligned > gpu_out_min(d_out_min, Eigen::array<DenseIndex, 1>(1));
+
+  gpu_out_max.device(gpu_device) = gpu_in.argmax();
+  gpu_out_min.device(gpu_device) = gpu_in.argmin();
+
+  assert(cudaMemcpyAsync(out_max.data(), d_out_max, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  assert(cudaMemcpyAsync(out_min.data(), d_out_min, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  VERIFY_IS_EQUAL(out_max(Eigen::array<DenseIndex, 1>(0)), 72*53*97 - 1);
+  VERIFY_IS_EQUAL(out_min(Eigen::array<DenseIndex, 1>(0)), 0);
+
+  cudaFree(d_in);
+  cudaFree(d_out_max);
+  cudaFree(d_out_min);
+}
+
+template <int DataLayout>
+void test_cuda_argmax_dim()
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  std::vector<int> dims;
+  dims.push_back(2); dims.push_back(3); dims.push_back(5); dims.push_back(7);
+
+  for (int dim = 0; dim < 4; ++dim) {
+    tensor.setRandom();
+    tensor = (tensor + tensor.constant(0.5)).log();
+
+    array<DenseIndex, 3> out_shape;
+    for (int d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d+1];
+
+    Tensor<DenseIndex, 3, DataLayout> tensor_arg(out_shape);
+
+    array<DenseIndex, 4> ix;
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 3; ++j) {
+        for (int k = 0; k < 5; ++k) {
+          for (int l = 0; l < 7; ++l) {
+            ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
+            if (ix[dim] != 0) continue;
+            // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = 10.0
+            tensor(ix) = 10.0;
+          }
+        }
+      }
+    }
+
+    std::size_t in_bytes = tensor.size() * sizeof(float);
+    std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex);
+
+    float* d_in;
+    DenseIndex* d_out;
+    cudaMalloc((void**)(&d_in), in_bytes);
+    cudaMalloc((void**)(&d_out), out_bytes);
+
+    cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice);
+
+    Eigen::CudaStreamDevice stream;
+    Eigen::GpuDevice gpu_device(&stream);
+
+    Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7));
+    Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout>, Aligned > gpu_out(d_out, out_shape);
+
+    gpu_out.device(gpu_device) = gpu_in.argmax(dim);
+
+    assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+    assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+    VERIFY_IS_EQUAL(tensor_arg.size(),
+                    size_t(2*3*5*7 / tensor.dimension(dim)));
+
+    for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
+      // Expect max to be in the first index of the reduced dimension
+      VERIFY_IS_EQUAL(tensor_arg.data()[n], 0);
+    }
+
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 3; ++j) {
+        for (int k = 0; k < 5; ++k) {
+          for (int l = 0; l < 7; ++l) {
+            ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
+            if (ix[dim] != tensor.dimension(dim) - 1) continue;
+            // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0
+            tensor(ix) = 20.0;
+          }
+        }
+      }
+    }
+
+    cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice);
+
+    gpu_out.device(gpu_device) = gpu_in.argmax(dim);
+
+    assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+    assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+    for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
+      // Expect max to be in the last index of the reduced dimension
+      VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
+    }
+
+    cudaFree(d_in);
+    cudaFree(d_out);
+  }
+}
+
+template <int DataLayout>
+void test_cuda_argmin_dim()
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  std::vector<int> dims;
+  dims.push_back(2); dims.push_back(3); dims.push_back(5); dims.push_back(7);
+
+  for (int dim = 0; dim < 4; ++dim) {
+    tensor.setRandom();
+    tensor = (tensor + tensor.constant(0.5)).log();
+
+    array<DenseIndex, 3> out_shape;
+    for (int d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d+1];
+
+    Tensor<DenseIndex, 3, DataLayout> tensor_arg(out_shape);
+
+    array<DenseIndex, 4> ix;
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 3; ++j) {
+        for (int k = 0; k < 5; ++k) {
+          for (int l = 0; l < 7; ++l) {
+            ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
+            if (ix[dim] != 0) continue;
+            // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = 10.0
+            tensor(ix) = -10.0;
+          }
+        }
+      }
+    }
+
+    std::size_t in_bytes = tensor.size() * sizeof(float);
+    std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex);
+
+    float* d_in;
+    DenseIndex* d_out;
+    cudaMalloc((void**)(&d_in), in_bytes);
+    cudaMalloc((void**)(&d_out), out_bytes);
+
+    cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice);
+
+    Eigen::CudaStreamDevice stream;
+    Eigen::GpuDevice gpu_device(&stream);
+
+    Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7));
+    Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout>, Aligned > gpu_out(d_out, out_shape);
+
+    gpu_out.device(gpu_device) = gpu_in.argmin(dim);
+
+    assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+    assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+    VERIFY_IS_EQUAL(tensor_arg.size(),
+                    2*3*5*7 / tensor.dimension(dim));
+
+    for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
+      // Expect min to be in the first index of the reduced dimension
+      VERIFY_IS_EQUAL(tensor_arg.data()[n], 0);
+    }
+
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 3; ++j) {
+        for (int k = 0; k < 5; ++k) {
+          for (int l = 0; l < 7; ++l) {
+            ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
+            if (ix[dim] != tensor.dimension(dim) - 1) continue;
+            // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0
+            tensor(ix) = -20.0;
+          }
+        }
+      }
+    }
+
+    cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice);
+
+    gpu_out.device(gpu_device) = gpu_in.argmin(dim);
+
+    assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+    assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+    for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
+      // Expect max to be in the last index of the reduced dimension
+      VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
+    }
+
+    cudaFree(d_in);
+    cudaFree(d_out);
+  }
+}
+
+void test_cxx11_tensor_cuda()
+{
+  CALL_SUBTEST_1(test_cuda_simple_argmax<RowMajor>());
+  CALL_SUBTEST_1(test_cuda_simple_argmax<ColMajor>());
+  CALL_SUBTEST_2(test_cuda_argmax_dim<RowMajor>());
+  CALL_SUBTEST_2(test_cuda_argmax_dim<ColMajor>());
+  CALL_SUBTEST_3(test_cuda_argmin_dim<RowMajor>());
+  CALL_SUBTEST_3(test_cuda_argmin_dim<ColMajor>());
+}
diff --git a/unsupported/test/cxx11_tensor_assign.cpp b/unsupported/test/cxx11_tensor_assign.cpp
new file mode 100644
index 000000000..8fe85d83c
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_assign.cpp
@@ -0,0 +1,370 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+static void test_1d()
+{
+  Tensor<int, 1> vec1(6);
+  Tensor<int, 1, RowMajor> vec2(6);
+  vec1(0) = 4;  vec2(0) = 0;
+  vec1(1) = 8;  vec2(1) = 1;
+  vec1(2) = 15; vec2(2) = 2;
+  vec1(3) = 16; vec2(3) = 3;
+  vec1(4) = 23; vec2(4) = 4;
+  vec1(5) = 42; vec2(5) = 5;
+
+  int col_major[6];
+  int row_major[6];
+  memset(col_major, 0, 6*sizeof(int));
+  memset(row_major, 0, 6*sizeof(int));
+  TensorMap<Tensor<int, 1> > vec3(col_major, 6);
+  TensorMap<Tensor<int, 1, RowMajor> > vec4(row_major, 6);
+
+  vec3 = vec1;
+  vec4 = vec2;
+
+  VERIFY_IS_EQUAL(vec3(0), 4);
+  VERIFY_IS_EQUAL(vec3(1), 8);
+  VERIFY_IS_EQUAL(vec3(2), 15);
+  VERIFY_IS_EQUAL(vec3(3), 16);
+  VERIFY_IS_EQUAL(vec3(4), 23);
+  VERIFY_IS_EQUAL(vec3(5), 42);
+
+  VERIFY_IS_EQUAL(vec4(0), 0);
+  VERIFY_IS_EQUAL(vec4(1), 1);
+  VERIFY_IS_EQUAL(vec4(2), 2);
+  VERIFY_IS_EQUAL(vec4(3), 3);
+  VERIFY_IS_EQUAL(vec4(4), 4);
+  VERIFY_IS_EQUAL(vec4(5), 5);
+
+  vec1.setZero();
+  vec2.setZero();
+  vec1 = vec3;
+  vec2 = vec4;
+
+  VERIFY_IS_EQUAL(vec1(0), 4);
+  VERIFY_IS_EQUAL(vec1(1), 8);
+  VERIFY_IS_EQUAL(vec1(2), 15);
+  VERIFY_IS_EQUAL(vec1(3), 16);
+  VERIFY_IS_EQUAL(vec1(4), 23);
+  VERIFY_IS_EQUAL(vec1(5), 42);
+
+  VERIFY_IS_EQUAL(vec2(0), 0);
+  VERIFY_IS_EQUAL(vec2(1), 1);
+  VERIFY_IS_EQUAL(vec2(2), 2);
+  VERIFY_IS_EQUAL(vec2(3), 3);
+  VERIFY_IS_EQUAL(vec2(4), 4);
+  VERIFY_IS_EQUAL(vec2(5), 5);
+}
+
+static void test_2d()
+{
+  Tensor<int, 2> mat1(2,3);
+  Tensor<int, 2, RowMajor> mat2(2,3);
+
+  mat1(0,0) = 0;
+  mat1(0,1) = 1;
+  mat1(0,2) = 2;
+  mat1(1,0) = 3;
+  mat1(1,1) = 4;
+  mat1(1,2) = 5;
+
+  mat2(0,0) = 0;
+  mat2(0,1) = 1;
+  mat2(0,2) = 2;
+  mat2(1,0) = 3;
+  mat2(1,1) = 4;
+  mat2(1,2) = 5;
+
+  int col_major[6];
+  int row_major[6];
+  memset(col_major, 0, 6*sizeof(int));
+  memset(row_major, 0, 6*sizeof(int));
+  TensorMap<Tensor<int, 2> > mat3(row_major, 2, 3);
+  TensorMap<Tensor<int, 2, RowMajor> > mat4(col_major, 2, 3);
+
+  mat3 = mat1;
+  mat4 = mat2;
+
+  VERIFY_IS_EQUAL(mat3(0,0), 0);
+  VERIFY_IS_EQUAL(mat3(0,1), 1);
+  VERIFY_IS_EQUAL(mat3(0,2), 2);
+  VERIFY_IS_EQUAL(mat3(1,0), 3);
+  VERIFY_IS_EQUAL(mat3(1,1), 4);
+  VERIFY_IS_EQUAL(mat3(1,2), 5);
+
+  VERIFY_IS_EQUAL(mat4(0,0), 0);
+  VERIFY_IS_EQUAL(mat4(0,1), 1);
+  VERIFY_IS_EQUAL(mat4(0,2), 2);
+  VERIFY_IS_EQUAL(mat4(1,0), 3);
+  VERIFY_IS_EQUAL(mat4(1,1), 4);
+  VERIFY_IS_EQUAL(mat4(1,2), 5);
+
+  mat1.setZero();
+  mat2.setZero();
+  mat1 = mat3;
+  mat2 = mat4;
+
+  VERIFY_IS_EQUAL(mat1(0,0), 0);
+  VERIFY_IS_EQUAL(mat1(0,1), 1);
+  VERIFY_IS_EQUAL(mat1(0,2), 2);
+  VERIFY_IS_EQUAL(mat1(1,0), 3);
+  VERIFY_IS_EQUAL(mat1(1,1), 4);
+  VERIFY_IS_EQUAL(mat1(1,2), 5);
+
+  VERIFY_IS_EQUAL(mat2(0,0), 0);
+  VERIFY_IS_EQUAL(mat2(0,1), 1);
+  VERIFY_IS_EQUAL(mat2(0,2), 2);
+  VERIFY_IS_EQUAL(mat2(1,0), 3);
+  VERIFY_IS_EQUAL(mat2(1,1), 4);
+  VERIFY_IS_EQUAL(mat2(1,2), 5);
+}
+
+static void test_3d()
+{
+  Tensor<int, 3> mat1(2,3,7);
+  Tensor<int, 3, RowMajor> mat2(2,3,7);
+
+  int val = 0;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        mat1(i,j,k) = val;
+        mat2(i,j,k) = val;
+        val++;
+      }
+    }
+  }
+
+  int col_major[2*3*7];
+  int row_major[2*3*7];
+  memset(col_major, 0, 2*3*7*sizeof(int));
+  memset(row_major, 0, 2*3*7*sizeof(int));
+  TensorMap<Tensor<int, 3> > mat3(col_major, 2, 3, 7);
+  TensorMap<Tensor<int, 3, RowMajor> > mat4(row_major, 2, 3, 7);
+
+  mat3 = mat1;
+  mat4 = mat2;
+
+  val = 0;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(mat3(i,j,k), val);
+        VERIFY_IS_EQUAL(mat4(i,j,k), val);
+        val++;
+      }
+    }
+  }
+
+  mat1.setZero();
+  mat2.setZero();
+  mat1 = mat3;
+  mat2 = mat4;
+
+  val = 0;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(mat1(i,j,k), val);
+        VERIFY_IS_EQUAL(mat2(i,j,k), val);
+        val++;
+      }
+    }
+  }
+}
+
+static void test_same_type()
+{
+  Tensor<int, 1> orig_tensor(5);
+  Tensor<int, 1> dest_tensor(5);
+  orig_tensor.setRandom();
+  dest_tensor.setRandom();
+  int* orig_data = orig_tensor.data();
+  int* dest_data = dest_tensor.data();
+  dest_tensor = orig_tensor;
+  VERIFY_IS_EQUAL(orig_tensor.data(), orig_data);
+  VERIFY_IS_EQUAL(dest_tensor.data(), dest_data);
+  for (int i = 0; i < 5; ++i) {
+    VERIFY_IS_EQUAL(dest_tensor(i), orig_tensor(i));
+  }
+
+  TensorFixedSize<int, Sizes<5> > orig_array;
+  TensorFixedSize<int, Sizes<5> > dest_array;
+  orig_array.setRandom();
+  dest_array.setRandom();
+  orig_data = orig_array.data();
+  dest_data = dest_array.data();
+  dest_array = orig_array;
+  VERIFY_IS_EQUAL(orig_array.data(), orig_data);
+  VERIFY_IS_EQUAL(dest_array.data(), dest_data);
+  for (int i = 0; i < 5; ++i) {
+    VERIFY_IS_EQUAL(dest_array(i), orig_array(i));
+  }
+
+  int orig[5] = {1, 2, 3, 4, 5};
+  int dest[5] = {6, 7, 8, 9, 10};
+  TensorMap<Tensor<int, 1> > orig_map(orig, 5);
+  TensorMap<Tensor<int, 1> > dest_map(dest, 5);
+  orig_data = orig_map.data();
+  dest_data = dest_map.data();
+  dest_map = orig_map;
+  VERIFY_IS_EQUAL(orig_map.data(), orig_data);
+  VERIFY_IS_EQUAL(dest_map.data(), dest_data);
+  for (int i = 0; i < 5; ++i) {
+    VERIFY_IS_EQUAL(dest[i], i+1);
+  }
+}
+
+static void test_auto_resize()
+{
+  Tensor<int, 1> tensor1;
+  Tensor<int, 1> tensor2(3);
+  Tensor<int, 1> tensor3(5);
+  Tensor<int, 1> tensor4(7);
+
+  Tensor<int, 1> new_tensor(5);
+  new_tensor.setRandom();
+
+  tensor1 = tensor2 = tensor3 = tensor4 = new_tensor;
+
+  VERIFY_IS_EQUAL(tensor1.dimension(0), new_tensor.dimension(0));
+  VERIFY_IS_EQUAL(tensor2.dimension(0), new_tensor.dimension(0));
+  VERIFY_IS_EQUAL(tensor3.dimension(0), new_tensor.dimension(0));
+  VERIFY_IS_EQUAL(tensor4.dimension(0), new_tensor.dimension(0));
+  for (int i = 0; i < new_tensor.dimension(0); ++i) {
+    VERIFY_IS_EQUAL(tensor1(i), new_tensor(i));
+    VERIFY_IS_EQUAL(tensor2(i), new_tensor(i));
+    VERIFY_IS_EQUAL(tensor3(i), new_tensor(i));
+    VERIFY_IS_EQUAL(tensor4(i), new_tensor(i));
+  }
+}
+
+
+static void test_compound_assign()
+{
+  Tensor<int, 1> start_tensor(10);
+  Tensor<int, 1> offset_tensor(10);
+  start_tensor.setRandom();
+  offset_tensor.setRandom();
+
+  Tensor<int, 1> tensor = start_tensor;
+  tensor += offset_tensor;
+  for (int i = 0; i < 10; ++i) {
+    VERIFY_IS_EQUAL(tensor(i), start_tensor(i) + offset_tensor(i));
+  }
+
+  tensor = start_tensor;
+  tensor -= offset_tensor;
+  for (int i = 0; i < 10; ++i) {
+    VERIFY_IS_EQUAL(tensor(i), start_tensor(i) - offset_tensor(i));
+  }
+
+  tensor = start_tensor;
+  tensor *= offset_tensor;
+  for (int i = 0; i < 10; ++i) {
+    VERIFY_IS_EQUAL(tensor(i), start_tensor(i) * offset_tensor(i));
+  }
+
+  tensor = start_tensor;
+  tensor /= offset_tensor;
+  for (int i = 0; i < 10; ++i) {
+    VERIFY_IS_EQUAL(tensor(i), start_tensor(i) / offset_tensor(i));
+  }
+}
+
+static void test_std_initializers_tensor() {
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+  Tensor<int, 1> a(3);
+  a.setValues({0, 1, 2});
+  VERIFY_IS_EQUAL(a(0), 0);
+  VERIFY_IS_EQUAL(a(1), 1);
+  VERIFY_IS_EQUAL(a(2), 2);
+
+  // It fills the top-left slice.
+  a.setValues({10, 20});
+  VERIFY_IS_EQUAL(a(0), 10);
+  VERIFY_IS_EQUAL(a(1), 20);
+  VERIFY_IS_EQUAL(a(2), 2);
+
+  // Chaining.
+  Tensor<int, 1> a2(3);
+  a2 = a.setValues({100, 200, 300});
+  VERIFY_IS_EQUAL(a(0), 100);
+  VERIFY_IS_EQUAL(a(1), 200);
+  VERIFY_IS_EQUAL(a(2), 300);
+  VERIFY_IS_EQUAL(a2(0), 100);
+  VERIFY_IS_EQUAL(a2(1), 200);
+  VERIFY_IS_EQUAL(a2(2), 300);
+
+  Tensor<int, 2> b(2, 3);
+  b.setValues({{0, 1, 2}, {3, 4, 5}});
+  VERIFY_IS_EQUAL(b(0, 0), 0);
+  VERIFY_IS_EQUAL(b(0, 1), 1);
+  VERIFY_IS_EQUAL(b(0, 2), 2);
+  VERIFY_IS_EQUAL(b(1, 0), 3);
+  VERIFY_IS_EQUAL(b(1, 1), 4);
+  VERIFY_IS_EQUAL(b(1, 2), 5);
+
+  // It fills the top-left slice.
+  b.setValues({{10, 20}, {30}});
+  VERIFY_IS_EQUAL(b(0, 0), 10);
+  VERIFY_IS_EQUAL(b(0, 1), 20);
+  VERIFY_IS_EQUAL(b(0, 2), 2);
+  VERIFY_IS_EQUAL(b(1, 0), 30);
+  VERIFY_IS_EQUAL(b(1, 1), 4);
+  VERIFY_IS_EQUAL(b(1, 2), 5);
+
+  Eigen::Tensor<int, 3> c(3, 2, 4);
+  c.setValues({{{0, 1, 2, 3}, {4, 5, 6, 7}},
+               {{10, 11, 12, 13}, {14, 15, 16, 17}},
+               {{20, 21, 22, 23}, {24, 25, 26, 27}}});
+  VERIFY_IS_EQUAL(c(0, 0, 0), 0);
+  VERIFY_IS_EQUAL(c(0, 0, 1), 1);
+  VERIFY_IS_EQUAL(c(0, 0, 2), 2);
+  VERIFY_IS_EQUAL(c(0, 0, 3), 3);
+  VERIFY_IS_EQUAL(c(0, 1, 0), 4);
+  VERIFY_IS_EQUAL(c(0, 1, 1), 5);
+  VERIFY_IS_EQUAL(c(0, 1, 2), 6);
+  VERIFY_IS_EQUAL(c(0, 1, 3), 7);
+  VERIFY_IS_EQUAL(c(1, 0, 0), 10);
+  VERIFY_IS_EQUAL(c(1, 0, 1), 11);
+  VERIFY_IS_EQUAL(c(1, 0, 2), 12);
+  VERIFY_IS_EQUAL(c(1, 0, 3), 13);
+  VERIFY_IS_EQUAL(c(1, 1, 0), 14);
+  VERIFY_IS_EQUAL(c(1, 1, 1), 15);
+  VERIFY_IS_EQUAL(c(1, 1, 2), 16);
+  VERIFY_IS_EQUAL(c(1, 1, 3), 17);
+  VERIFY_IS_EQUAL(c(2, 0, 0), 20);
+  VERIFY_IS_EQUAL(c(2, 0, 1), 21);
+  VERIFY_IS_EQUAL(c(2, 0, 2), 22);
+  VERIFY_IS_EQUAL(c(2, 0, 3), 23);
+  VERIFY_IS_EQUAL(c(2, 1, 0), 24);
+  VERIFY_IS_EQUAL(c(2, 1, 1), 25);
+  VERIFY_IS_EQUAL(c(2, 1, 2), 26);
+  VERIFY_IS_EQUAL(c(2, 1, 3), 27);
+#endif  // EIGEN_HAS_VARIADIC_TEMPLATES
+}
+
+void test_cxx11_tensor_assign()
+{
+  CALL_SUBTEST(test_1d());
+  CALL_SUBTEST(test_2d());
+  CALL_SUBTEST(test_3d());
+  CALL_SUBTEST(test_same_type());
+  CALL_SUBTEST(test_auto_resize());
+  CALL_SUBTEST(test_compound_assign());
+  CALL_SUBTEST(test_std_initializers_tensor());
+}
diff --git a/unsupported/test/cxx11_tensor_broadcast_sycl.cpp b/unsupported/test/cxx11_tensor_broadcast_sycl.cpp
new file mode 100644
index 000000000..7201bfe37
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_broadcast_sycl.cpp
@@ -0,0 +1,74 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_broadcast_sycl
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+static void test_broadcast_sycl(const Eigen::SyclDevice &sycl_device){
+
+  // BROADCAST test:
+  array<int, 4> in_range   = {{2, 3, 5, 7}};
+  array<int, 4> broadcasts = {{2, 3, 1, 4}};
+  array<int, 4> out_range;  // = in_range * broadcasts
+  for (size_t i = 0; i < out_range.size(); ++i)
+    out_range[i] = in_range[i] * broadcasts[i];
+
+  Tensor<float, 4>  input(in_range);
+  Tensor<float, 4> out(out_range);
+
+  for (size_t i = 0; i < in_range.size(); ++i)
+    VERIFY_IS_EQUAL(out.dimension(i), out_range[i]);
+
+
+  for (int i = 0; i < input.size(); ++i)
+    input(i) = static_cast<float>(i);
+
+  float * gpu_in_data  = static_cast<float*>(sycl_device.allocate(input.dimensions().TotalSize()*sizeof(float)));
+  float * gpu_out_data  = static_cast<float*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(float)));
+
+  TensorMap<Tensor<float, 4>>  gpu_in(gpu_in_data, in_range);
+  TensorMap<Tensor<float, 4>> gpu_out(gpu_out_data, out_range);
+  sycl_device.memcpyHostToDevice(gpu_in_data, input.data(),(input.dimensions().TotalSize())*sizeof(float));
+  gpu_out.device(sycl_device) = gpu_in.broadcast(broadcasts);
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float));
+
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 9; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 28; ++l) {
+          VERIFY_IS_APPROX(input(i%2,j%3,k%5,l%7), out(i,j,k,l));
+        }
+      }
+    }
+  }
+  printf("Broadcast Test Passed\n");
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+void test_cxx11_tensor_broadcast_sycl() {
+  cl::sycl::gpu_selector s;
+  Eigen::SyclDevice sycl_device(s);
+  CALL_SUBTEST(test_broadcast_sycl(sycl_device));
+}
diff --git a/unsupported/test/cxx11_tensor_broadcasting.cpp b/unsupported/test/cxx11_tensor_broadcasting.cpp
new file mode 100644
index 000000000..5c0ea5889
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_broadcasting.cpp
@@ -0,0 +1,194 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template <int DataLayout>
+static void test_simple_broadcasting()
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+  array<ptrdiff_t, 4> broadcasts;
+  broadcasts[0] = 1;
+  broadcasts[1] = 1;
+  broadcasts[2] = 1;
+  broadcasts[3] = 1;
+
+  Tensor<float, 4, DataLayout> no_broadcast;
+  no_broadcast = tensor.broadcast(broadcasts);
+
+  VERIFY_IS_EQUAL(no_broadcast.dimension(0), 2);
+  VERIFY_IS_EQUAL(no_broadcast.dimension(1), 3);
+  VERIFY_IS_EQUAL(no_broadcast.dimension(2), 5);
+  VERIFY_IS_EQUAL(no_broadcast.dimension(3), 7);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), no_broadcast(i,j,k,l));
+        }
+      }
+    }
+  }
+
+  broadcasts[0] = 2;
+  broadcasts[1] = 3;
+  broadcasts[2] = 1;
+  broadcasts[3] = 4;
+  Tensor<float, 4, DataLayout> broadcast;
+  broadcast = tensor.broadcast(broadcasts);
+
+  VERIFY_IS_EQUAL(broadcast.dimension(0), 4);
+  VERIFY_IS_EQUAL(broadcast.dimension(1), 9);
+  VERIFY_IS_EQUAL(broadcast.dimension(2), 5);
+  VERIFY_IS_EQUAL(broadcast.dimension(3), 28);
+
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 9; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 28; ++l) {
+          VERIFY_IS_EQUAL(tensor(i%2,j%3,k%5,l%7), broadcast(i,j,k,l));
+        }
+      }
+    }
+  }
+}
+
+
+template <int DataLayout>
+static void test_vectorized_broadcasting()
+{
+  Tensor<float, 3, DataLayout> tensor(8,3,5);
+  tensor.setRandom();
+  array<ptrdiff_t, 3> broadcasts;
+  broadcasts[0] = 2;
+  broadcasts[1] = 3;
+  broadcasts[2] = 4;
+
+  Tensor<float, 3, DataLayout> broadcast;
+  broadcast = tensor.broadcast(broadcasts);
+
+  VERIFY_IS_EQUAL(broadcast.dimension(0), 16);
+  VERIFY_IS_EQUAL(broadcast.dimension(1), 9);
+  VERIFY_IS_EQUAL(broadcast.dimension(2), 20);
+
+  for (int i = 0; i < 16; ++i) {
+    for (int j = 0; j < 9; ++j) {
+      for (int k = 0; k < 20; ++k) {
+        VERIFY_IS_EQUAL(tensor(i%8,j%3,k%5), broadcast(i,j,k));
+      }
+    }
+  }
+
+  tensor.resize(11,3,5);
+  tensor.setRandom();
+  broadcast = tensor.broadcast(broadcasts);
+
+  VERIFY_IS_EQUAL(broadcast.dimension(0), 22);
+  VERIFY_IS_EQUAL(broadcast.dimension(1), 9);
+  VERIFY_IS_EQUAL(broadcast.dimension(2), 20);
+
+  for (int i = 0; i < 22; ++i) {
+    for (int j = 0; j < 9; ++j) {
+      for (int k = 0; k < 20; ++k) {
+        VERIFY_IS_EQUAL(tensor(i%11,j%3,k%5), broadcast(i,j,k));
+      }
+    }
+  }
+}
+
+
+template <int DataLayout>
+static void test_static_broadcasting()
+{
+  Tensor<float, 3, DataLayout> tensor(8,3,5);
+  tensor.setRandom();
+
+#if EIGEN_HAS_CONSTEXPR
+  Eigen::IndexList<Eigen::type2index<2>, Eigen::type2index<3>, Eigen::type2index<4>> broadcasts;
+#else
+  Eigen::array<int, 3> broadcasts;
+  broadcasts[0] = 2;
+  broadcasts[1] = 3;
+  broadcasts[2] = 4;
+#endif
+
+  Tensor<float, 3, DataLayout> broadcast;
+  broadcast = tensor.broadcast(broadcasts);
+
+  VERIFY_IS_EQUAL(broadcast.dimension(0), 16);
+  VERIFY_IS_EQUAL(broadcast.dimension(1), 9);
+  VERIFY_IS_EQUAL(broadcast.dimension(2), 20);
+
+  for (int i = 0; i < 16; ++i) {
+    for (int j = 0; j < 9; ++j) {
+      for (int k = 0; k < 20; ++k) {
+        VERIFY_IS_EQUAL(tensor(i%8,j%3,k%5), broadcast(i,j,k));
+      }
+    }
+  }
+
+  tensor.resize(11,3,5);
+  tensor.setRandom();
+  broadcast = tensor.broadcast(broadcasts);
+
+  VERIFY_IS_EQUAL(broadcast.dimension(0), 22);
+  VERIFY_IS_EQUAL(broadcast.dimension(1), 9);
+  VERIFY_IS_EQUAL(broadcast.dimension(2), 20);
+
+  for (int i = 0; i < 22; ++i) {
+    for (int j = 0; j < 9; ++j) {
+      for (int k = 0; k < 20; ++k) {
+        VERIFY_IS_EQUAL(tensor(i%11,j%3,k%5), broadcast(i,j,k));
+      }
+    }
+  }
+}
+
+
+template <int DataLayout>
+static void test_fixed_size_broadcasting()
+{
+  // Need to add a [] operator to the Size class for this to work
+#if 0
+  Tensor<float, 1, DataLayout> t1(10);
+  t1.setRandom();
+  TensorFixedSize<float, Sizes<1>, DataLayout> t2;
+  t2 = t2.constant(20.0f);
+
+  Tensor<float, 1, DataLayout> t3 = t1 + t2.broadcast(Eigen::array<int, 1>{{10}});
+  for (int i = 0; i < 10; ++i) {
+    VERIFY_IS_APPROX(t3(i), t1(i) + t2(0));
+  }
+
+  TensorMap<TensorFixedSize<float, Sizes<1>, DataLayout> > t4(t2.data(), {{1}});
+  Tensor<float, 1, DataLayout> t5 = t1 + t4.broadcast(Eigen::array<int, 1>{{10}});
+  for (int i = 0; i < 10; ++i) {
+    VERIFY_IS_APPROX(t5(i), t1(i) + t2(0));
+  }
+#endif
+}
+
+
+void test_cxx11_tensor_broadcasting()
+{
+  CALL_SUBTEST(test_simple_broadcasting<ColMajor>());
+  CALL_SUBTEST(test_simple_broadcasting<RowMajor>());
+  CALL_SUBTEST(test_vectorized_broadcasting<ColMajor>());
+  CALL_SUBTEST(test_vectorized_broadcasting<RowMajor>());
+  CALL_SUBTEST(test_static_broadcasting<ColMajor>());
+  CALL_SUBTEST(test_static_broadcasting<RowMajor>());
+  CALL_SUBTEST(test_fixed_size_broadcasting<ColMajor>());
+  CALL_SUBTEST(test_fixed_size_broadcasting<RowMajor>());
+}
diff --git a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu b/unsupported/test/cxx11_tensor_cast_float16_cuda.cu
new file mode 100644
index 000000000..88c233994
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_cast_float16_cuda.cu
@@ -0,0 +1,82 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_cast_float16_cuda
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_GPU
+
+#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+#include <cuda_fp16.h>
+#endif
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+void test_cuda_conversion() {
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  int num_elem = 101;
+
+  Tensor<float, 1> floats(num_elem);
+  floats.setRandom();
+
+  float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  Eigen::half* d_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+  float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
+      d_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_half(
+      d_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_conv(
+      d_conv, num_elem);
+
+  gpu_device.memcpyHostToDevice(d_float, floats.data(), num_elem*sizeof(float));
+
+  gpu_half.device(gpu_device) = gpu_float.cast<Eigen::half>();
+  gpu_conv.device(gpu_device) = gpu_half.cast<float>();
+
+  Tensor<float, 1> initial(num_elem);
+  Tensor<float, 1> final(num_elem);
+  gpu_device.memcpyDeviceToHost(initial.data(), d_float, num_elem*sizeof(float));
+  gpu_device.memcpyDeviceToHost(final.data(), d_conv, num_elem*sizeof(float));
+  gpu_device.synchronize();
+
+  for (int i = 0; i < num_elem; ++i) {
+    VERIFY_IS_APPROX(initial(i), final(i));
+  }
+
+  gpu_device.deallocate(d_float);
+  gpu_device.deallocate(d_half);
+  gpu_device.deallocate(d_conv);
+}
+
+
+void test_fallback_conversion() {
+  int num_elem = 101;
+  Tensor<float, 1> floats(num_elem);
+  floats.setRandom();
+
+  Eigen::Tensor<Eigen::half, 1> halfs = floats.cast<Eigen::half>();
+  Eigen::Tensor<float, 1> conv = halfs.cast<float>();
+
+  for (int i = 0; i < num_elem; ++i) {
+    VERIFY_IS_APPROX(floats(i), conv(i));
+  }
+}
+
+
+void test_cxx11_tensor_cast_float16_cuda()
+{
+  CALL_SUBTEST(test_cuda_conversion());
+  CALL_SUBTEST(test_fallback_conversion());
+}
diff --git a/unsupported/test/cxx11_tensor_casts.cpp b/unsupported/test/cxx11_tensor_casts.cpp
new file mode 100644
index 000000000..3c6d0d2ff
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_casts.cpp
@@ -0,0 +1,115 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::array;
+
+static void test_simple_cast()
+{
+  Tensor<float, 2> ftensor(20,30);
+  ftensor = ftensor.random() * 100.f;
+  Tensor<char, 2> chartensor(20,30);
+  chartensor.setRandom();
+  Tensor<std::complex<float>, 2> cplextensor(20,30);
+  cplextensor.setRandom();
+
+  chartensor = ftensor.cast<char>();
+  cplextensor = ftensor.cast<std::complex<float> >();
+
+  for (int i = 0; i < 20; ++i) {
+    for (int j = 0; j < 30; ++j) {
+      VERIFY_IS_EQUAL(chartensor(i,j), static_cast<char>(ftensor(i,j)));
+      VERIFY_IS_EQUAL(cplextensor(i,j), static_cast<std::complex<float> >(ftensor(i,j)));
+    }
+  }
+}
+
+
+static void test_vectorized_cast()
+{
+  Tensor<int, 2> itensor(20,30);
+  itensor = itensor.random() / 1000;
+  Tensor<float, 2> ftensor(20,30);
+  ftensor.setRandom();
+  Tensor<double, 2> dtensor(20,30);
+  dtensor.setRandom();
+
+  ftensor = itensor.cast<float>();
+  dtensor = itensor.cast<double>();
+
+  for (int i = 0; i < 20; ++i) {
+    for (int j = 0; j < 30; ++j) {
+      VERIFY_IS_EQUAL(itensor(i,j), static_cast<int>(ftensor(i,j)));
+      VERIFY_IS_EQUAL(dtensor(i,j), static_cast<double>(ftensor(i,j)));
+    }
+  }
+}
+
+
+static void test_float_to_int_cast()
+{
+  Tensor<float, 2> ftensor(20,30);
+  ftensor = ftensor.random() * 1000.0f;
+  Tensor<double, 2> dtensor(20,30);
+  dtensor = dtensor.random() * 1000.0;
+
+  Tensor<int, 2> i1tensor = ftensor.cast<int>();
+  Tensor<int, 2> i2tensor = dtensor.cast<int>();
+
+  for (int i = 0; i < 20; ++i) {
+    for (int j = 0; j < 30; ++j) {
+      VERIFY_IS_EQUAL(i1tensor(i,j), static_cast<int>(ftensor(i,j)));
+      VERIFY_IS_EQUAL(i2tensor(i,j), static_cast<int>(dtensor(i,j)));
+    }
+  }
+}
+
+
+static void test_big_to_small_type_cast()
+{
+  Tensor<double, 2> dtensor(20, 30);
+  dtensor.setRandom();
+  Tensor<float, 2> ftensor(20, 30);
+  ftensor = dtensor.cast<float>();
+
+  for (int i = 0; i < 20; ++i) {
+    for (int j = 0; j < 30; ++j) {
+      VERIFY_IS_APPROX(dtensor(i,j), static_cast<double>(ftensor(i,j)));
+    }
+  }
+}
+
+
+static void test_small_to_big_type_cast()
+{
+  Tensor<float, 2> ftensor(20, 30);
+  ftensor.setRandom();
+  Tensor<double, 2> dtensor(20, 30);
+  dtensor = ftensor.cast<double>();
+
+  for (int i = 0; i < 20; ++i) {
+    for (int j = 0; j < 30; ++j) {
+      VERIFY_IS_APPROX(dtensor(i,j), static_cast<double>(ftensor(i,j)));
+    }
+  }
+}
+
+
+void test_cxx11_tensor_casts()
+{
+   CALL_SUBTEST(test_simple_cast());
+   CALL_SUBTEST(test_vectorized_cast());
+   CALL_SUBTEST(test_float_to_int_cast());
+   CALL_SUBTEST(test_big_to_small_type_cast());
+   CALL_SUBTEST(test_small_to_big_type_cast());
+}
diff --git a/unsupported/test/cxx11_tensor_chipping.cpp b/unsupported/test/cxx11_tensor_chipping.cpp
new file mode 100644
index 000000000..1832dec8b
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_chipping.cpp
@@ -0,0 +1,425 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template<int DataLayout>
+static void test_simple_chip()
+{
+  Tensor<float, 5, DataLayout> tensor(2,3,5,7,11);
+  tensor.setRandom();
+
+  Tensor<float, 4, DataLayout> chip1;
+  chip1 = tensor.template chip<0>(1);
+
+  VERIFY_IS_EQUAL(chip1.dimension(0), 3);
+  VERIFY_IS_EQUAL(chip1.dimension(1), 5);
+  VERIFY_IS_EQUAL(chip1.dimension(2), 7);
+  VERIFY_IS_EQUAL(chip1.dimension(3), 11);
+
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        for (int l = 0; l < 11; ++l) {
+          VERIFY_IS_EQUAL(chip1(i,j,k,l), tensor(1,i,j,k,l));
+        }
+      }
+    }
+  }
+
+  Tensor<float, 4, DataLayout> chip2 = tensor.template chip<1>(1);
+  VERIFY_IS_EQUAL(chip2.dimension(0), 2);
+  VERIFY_IS_EQUAL(chip2.dimension(1), 5);
+  VERIFY_IS_EQUAL(chip2.dimension(2), 7);
+  VERIFY_IS_EQUAL(chip2.dimension(3), 11);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        for (int l = 0; l < 11; ++l) {
+          VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1,j,k,l));
+        }
+      }
+    }
+  }
+
+  Tensor<float, 4, DataLayout> chip3 = tensor.template chip<2>(2);
+  VERIFY_IS_EQUAL(chip3.dimension(0), 2);
+  VERIFY_IS_EQUAL(chip3.dimension(1), 3);
+  VERIFY_IS_EQUAL(chip3.dimension(2), 7);
+  VERIFY_IS_EQUAL(chip3.dimension(3), 11);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        for (int l = 0; l < 11; ++l) {
+          VERIFY_IS_EQUAL(chip3(i,j,k,l), tensor(i,j,2,k,l));
+        }
+      }
+    }
+  }
+
+  Tensor<float, 4, DataLayout> chip4(tensor.template chip<3>(5));
+  VERIFY_IS_EQUAL(chip4.dimension(0), 2);
+  VERIFY_IS_EQUAL(chip4.dimension(1), 3);
+  VERIFY_IS_EQUAL(chip4.dimension(2), 5);
+  VERIFY_IS_EQUAL(chip4.dimension(3), 11);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5,l));
+        }
+      }
+    }
+  }
+
+  Tensor<float, 4, DataLayout> chip5(tensor.template chip<4>(7));
+  VERIFY_IS_EQUAL(chip5.dimension(0), 2);
+  VERIFY_IS_EQUAL(chip5.dimension(1), 3);
+  VERIFY_IS_EQUAL(chip5.dimension(2), 5);
+  VERIFY_IS_EQUAL(chip5.dimension(3), 7);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(chip5(i,j,k,l), tensor(i,j,k,l,7));
+        }
+      }
+    }
+  }
+}
+
+template<int DataLayout>
+static void test_dynamic_chip()
+{
+  Tensor<float, 5, DataLayout> tensor(2,3,5,7,11);
+  tensor.setRandom();
+
+  Tensor<float, 4, DataLayout> chip1;
+  chip1 = tensor.chip(1, 0);
+  VERIFY_IS_EQUAL(chip1.dimension(0), 3);
+  VERIFY_IS_EQUAL(chip1.dimension(1), 5);
+  VERIFY_IS_EQUAL(chip1.dimension(2), 7);
+  VERIFY_IS_EQUAL(chip1.dimension(3), 11);
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        for (int l = 0; l < 11; ++l) {
+          VERIFY_IS_EQUAL(chip1(i,j,k,l), tensor(1,i,j,k,l));
+        }
+      }
+    }
+  }
+
+  Tensor<float, 4, DataLayout> chip2 = tensor.chip(1, 1);
+  VERIFY_IS_EQUAL(chip2.dimension(0), 2);
+  VERIFY_IS_EQUAL(chip2.dimension(1), 5);
+  VERIFY_IS_EQUAL(chip2.dimension(2), 7);
+  VERIFY_IS_EQUAL(chip2.dimension(3), 11);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        for (int l = 0; l < 11; ++l) {
+          VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1,j,k,l));
+        }
+      }
+    }
+  }
+
+  Tensor<float, 4, DataLayout> chip3 = tensor.chip(2, 2);
+  VERIFY_IS_EQUAL(chip3.dimension(0), 2);
+  VERIFY_IS_EQUAL(chip3.dimension(1), 3);
+  VERIFY_IS_EQUAL(chip3.dimension(2), 7);
+  VERIFY_IS_EQUAL(chip3.dimension(3), 11);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        for (int l = 0; l < 11; ++l) {
+          VERIFY_IS_EQUAL(chip3(i,j,k,l), tensor(i,j,2,k,l));
+        }
+      }
+    }
+  }
+
+  Tensor<float, 4, DataLayout> chip4(tensor.chip(5, 3));
+  VERIFY_IS_EQUAL(chip4.dimension(0), 2);
+  VERIFY_IS_EQUAL(chip4.dimension(1), 3);
+  VERIFY_IS_EQUAL(chip4.dimension(2), 5);
+  VERIFY_IS_EQUAL(chip4.dimension(3), 11);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5,l));
+        }
+      }
+    }
+  }
+
+  Tensor<float, 4, DataLayout> chip5(tensor.chip(7, 4));
+  VERIFY_IS_EQUAL(chip5.dimension(0), 2);
+  VERIFY_IS_EQUAL(chip5.dimension(1), 3);
+  VERIFY_IS_EQUAL(chip5.dimension(2), 5);
+  VERIFY_IS_EQUAL(chip5.dimension(3), 7);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(chip5(i,j,k,l), tensor(i,j,k,l,7));
+        }
+      }
+    }
+  }
+}
+
+template<int DataLayout>
+static void test_chip_in_expr() {
+  Tensor<float, 5, DataLayout> input1(2,3,5,7,11);
+  input1.setRandom();
+  Tensor<float, 4, DataLayout> input2(3,5,7,11);
+  input2.setRandom();
+
+  Tensor<float, 4, DataLayout> result = input1.template chip<0>(0) + input2;
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        for (int l = 0; l < 11; ++l) {
+          float expected = input1(0,i,j,k,l) + input2(i,j,k,l);
+          VERIFY_IS_EQUAL(result(i,j,k,l), expected);
+        }
+      }
+    }
+  }
+
+  Tensor<float, 3, DataLayout> input3(3,7,11);
+  input3.setRandom();
+  Tensor<float, 3, DataLayout> result2 = input1.template chip<0>(0).template chip<1>(2) + input3;
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 7; ++j) {
+      for (int k = 0; k < 11; ++k) {
+        float expected = input1(0,i,2,j,k) + input3(i,j,k);
+        VERIFY_IS_EQUAL(result2(i,j,k), expected);
+      }
+    }
+  }
+}
+
+template<int DataLayout>
+static void test_chip_as_lvalue()
+{
+  Tensor<float, 5, DataLayout> input1(2,3,5,7,11);
+  input1.setRandom();
+
+  Tensor<float, 4, DataLayout> input2(3,5,7,11);
+  input2.setRandom();
+  Tensor<float, 5, DataLayout> tensor = input1;
+  tensor.template chip<0>(1) = input2;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          for (int m = 0; m < 11; ++m) {
+            if (i != 1) {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
+            } else {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input2(j,k,l,m));
+            }
+          }
+        }
+      }
+    }
+  }
+
+  Tensor<float, 4, DataLayout> input3(2,5,7,11);
+  input3.setRandom();
+  tensor = input1;
+  tensor.template chip<1>(1) = input3;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          for (int m = 0; m < 11; ++m) {
+            if (j != 1) {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
+            } else {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input3(i,k,l,m));
+            }
+          }
+        }
+      }
+    }
+  }
+
+  Tensor<float, 4, DataLayout> input4(2,3,7,11);
+  input4.setRandom();
+  tensor = input1;
+  tensor.template chip<2>(3) = input4;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          for (int m = 0; m < 11; ++m) {
+            if (k != 3) {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
+            } else {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input4(i,j,l,m));
+            }
+          }
+        }
+      }
+    }
+  }
+
+  Tensor<float, 4, DataLayout> input5(2,3,5,11);
+  input5.setRandom();
+  tensor = input1;
+  tensor.template chip<3>(4) = input5;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          for (int m = 0; m < 11; ++m) {
+            if (l != 4) {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
+            } else {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input5(i,j,k,m));
+            }
+          }
+        }
+      }
+    }
+  }
+
+  Tensor<float, 4, DataLayout> input6(2,3,5,7);
+  input6.setRandom();
+  tensor = input1;
+  tensor.template chip<4>(5) = input6;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          for (int m = 0; m < 11; ++m) {
+            if (m != 5) {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
+            } else {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input6(i,j,k,l));
+            }
+          }
+        }
+      }
+    }
+  }
+
+  Tensor<float, 5, DataLayout> input7(2,3,5,7,11);
+  input7.setRandom();
+  tensor = input1;
+  tensor.chip(0, 0) = input7.chip(0, 0);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          for (int m = 0; m < 11; ++m) {
+            if (i != 0) {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
+            } else {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input7(i,j,k,l,m));
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+static void test_chip_raw_data_col_major()
+{
+  Tensor<float, 5, ColMajor> tensor(2,3,5,7,11);
+  tensor.setRandom();
+
+  typedef TensorEvaluator<decltype(tensor.chip<4>(3)), DefaultDevice> Evaluator4;
+  auto chip = Evaluator4(tensor.chip<4>(3), DefaultDevice());
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          int chip_index = i + 2 * (j + 3 * (k + 5 * l));
+          VERIFY_IS_EQUAL(chip.data()[chip_index], tensor(i,j,k,l,3));
+        }
+      }
+    }
+  }
+
+  typedef TensorEvaluator<decltype(tensor.chip<0>(0)), DefaultDevice> Evaluator0;
+  auto chip0 = Evaluator0(tensor.chip<0>(0), DefaultDevice());
+  VERIFY_IS_EQUAL(chip0.data(), static_cast<float*>(0));
+
+  typedef TensorEvaluator<decltype(tensor.chip<1>(0)), DefaultDevice> Evaluator1;
+  auto chip1 = Evaluator1(tensor.chip<1>(0), DefaultDevice());
+  VERIFY_IS_EQUAL(chip1.data(), static_cast<float*>(0));
+
+  typedef TensorEvaluator<decltype(tensor.chip<2>(0)), DefaultDevice> Evaluator2;
+  auto chip2 = Evaluator2(tensor.chip<2>(0), DefaultDevice());
+  VERIFY_IS_EQUAL(chip2.data(), static_cast<float*>(0));
+
+  typedef TensorEvaluator<decltype(tensor.chip<3>(0)), DefaultDevice> Evaluator3;
+  auto chip3 = Evaluator3(tensor.chip<3>(0), DefaultDevice());
+  VERIFY_IS_EQUAL(chip3.data(), static_cast<float*>(0));
+}
+
+static void test_chip_raw_data_row_major()
+{
+  Tensor<float, 5, RowMajor> tensor(11,7,5,3,2);
+  tensor.setRandom();
+
+  typedef TensorEvaluator<decltype(tensor.chip<0>(3)), DefaultDevice> Evaluator0;
+  auto chip = Evaluator0(tensor.chip<0>(3), DefaultDevice());
+  for (int i = 0; i < 7; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      for (int k = 0; k < 3; ++k) {
+        for (int l = 0; l < 2; ++l) {
+          int chip_index = l + 2 * (k + 3 * (j + 5 * i));
+          VERIFY_IS_EQUAL(chip.data()[chip_index], tensor(3,i,j,k,l));
+        }
+      }
+    }
+  }
+
+  typedef TensorEvaluator<decltype(tensor.chip<1>(0)), DefaultDevice> Evaluator1;
+  auto chip1 = Evaluator1(tensor.chip<1>(0), DefaultDevice());
+  VERIFY_IS_EQUAL(chip1.data(), static_cast<float*>(0));
+
+  typedef TensorEvaluator<decltype(tensor.chip<2>(0)), DefaultDevice> Evaluator2;
+  auto chip2 = Evaluator2(tensor.chip<2>(0), DefaultDevice());
+  VERIFY_IS_EQUAL(chip2.data(), static_cast<float*>(0));
+
+  typedef TensorEvaluator<decltype(tensor.chip<3>(0)), DefaultDevice> Evaluator3;
+  auto chip3 = Evaluator3(tensor.chip<3>(0), DefaultDevice());
+  VERIFY_IS_EQUAL(chip3.data(), static_cast<float*>(0));
+
+  typedef TensorEvaluator<decltype(tensor.chip<4>(0)), DefaultDevice> Evaluator4;
+  auto chip4 = Evaluator4(tensor.chip<4>(0), DefaultDevice());
+  VERIFY_IS_EQUAL(chip4.data(), static_cast<float*>(0));
+}
+
+void test_cxx11_tensor_chipping()
+{
+  CALL_SUBTEST(test_simple_chip<ColMajor>());
+  CALL_SUBTEST(test_simple_chip<RowMajor>());
+  CALL_SUBTEST(test_dynamic_chip<ColMajor>());
+  CALL_SUBTEST(test_dynamic_chip<RowMajor>());
+  CALL_SUBTEST(test_chip_in_expr<ColMajor>());
+  CALL_SUBTEST(test_chip_in_expr<RowMajor>());
+  CALL_SUBTEST(test_chip_as_lvalue<ColMajor>());
+  CALL_SUBTEST(test_chip_as_lvalue<RowMajor>());
+  CALL_SUBTEST(test_chip_raw_data_col_major());
+  CALL_SUBTEST(test_chip_raw_data_row_major());
+}
diff --git a/unsupported/test/cxx11_tensor_comparisons.cpp b/unsupported/test/cxx11_tensor_comparisons.cpp
new file mode 100644
index 000000000..b1ff8aecb
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_comparisons.cpp
@@ -0,0 +1,84 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+static void test_orderings()
+{
+  Tensor<float, 3> mat1(2,3,7);
+  Tensor<float, 3> mat2(2,3,7);
+  Tensor<bool, 3> lt(2,3,7);
+  Tensor<bool, 3> le(2,3,7);
+  Tensor<bool, 3> gt(2,3,7);
+  Tensor<bool, 3> ge(2,3,7);
+
+  mat1.setRandom();
+  mat2.setRandom();
+
+  lt = mat1 < mat2;
+  le = mat1 <= mat2;
+  gt = mat1 > mat2;
+  ge = mat1 >= mat2;
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(lt(i,j,k), mat1(i,j,k) < mat2(i,j,k));
+        VERIFY_IS_EQUAL(le(i,j,k), mat1(i,j,k) <= mat2(i,j,k));
+        VERIFY_IS_EQUAL(gt(i,j,k), mat1(i,j,k) > mat2(i,j,k));
+        VERIFY_IS_EQUAL(ge(i,j,k), mat1(i,j,k) >= mat2(i,j,k));
+      }
+    }
+  }
+}
+
+
+static void test_equality()
+{
+  Tensor<float, 3> mat1(2,3,7);
+  Tensor<float, 3> mat2(2,3,7);
+
+  mat1.setRandom();
+  mat2.setRandom();
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        if (internal::random<bool>()) {
+          mat2(i,j,k) = mat1(i,j,k);
+        }
+      }
+    }
+  }
+
+  Tensor<bool, 3> eq(2,3,7);
+  Tensor<bool, 3> ne(2,3,7);
+  eq = (mat1 == mat2);
+  ne = (mat1 != mat2);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(eq(i,j,k), mat1(i,j,k) == mat2(i,j,k));
+        VERIFY_IS_EQUAL(ne(i,j,k), mat1(i,j,k) != mat2(i,j,k));
+      }
+    }
+  }
+}
+
+
+void test_cxx11_tensor_comparisons()
+{
+  CALL_SUBTEST(test_orderings());
+  CALL_SUBTEST(test_equality());
+}
diff --git a/unsupported/test/cxx11_tensor_complex_cuda.cu b/unsupported/test/cxx11_tensor_complex_cuda.cu
new file mode 100644
index 000000000..d4e111f5d
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_complex_cuda.cu
@@ -0,0 +1,153 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_FUNC cxx11_tensor_complex
+#define EIGEN_USE_GPU
+
+#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+#include <cuda_fp16.h>
+#endif
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+void test_cuda_nullary() {
+  Tensor<std::complex<float>, 1, 0, int> in1(2);
+  Tensor<std::complex<float>, 1, 0, int> in2(2);
+  in1.setRandom();
+  in2.setRandom();
+
+  std::size_t float_bytes = in1.size() * sizeof(float);
+  std::size_t complex_bytes = in1.size() * sizeof(std::complex<float>);
+
+  std::complex<float>* d_in1;
+  std::complex<float>* d_in2;
+  float* d_out2;
+  cudaMalloc((void**)(&d_in1), complex_bytes);
+  cudaMalloc((void**)(&d_in2), complex_bytes);
+  cudaMalloc((void**)(&d_out2), float_bytes);
+  cudaMemcpy(d_in1, in1.data(), complex_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in2, in2.data(), complex_bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<std::complex<float>, 1, 0, int>, Eigen::Aligned> gpu_in1(
+      d_in1, 2);
+  Eigen::TensorMap<Eigen::Tensor<std::complex<float>, 1, 0, int>, Eigen::Aligned> gpu_in2(
+      d_in2, 2);
+  Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_out2(
+      d_out2, 2);
+
+  gpu_in1.device(gpu_device) = gpu_in1.constant(std::complex<float>(3.14f, 2.7f));
+  gpu_out2.device(gpu_device) = gpu_in2.abs();
+
+  Tensor<std::complex<float>, 1, 0, int> new1(2);
+  Tensor<float, 1, 0, int> new2(2);
+
+  assert(cudaMemcpyAsync(new1.data(), d_in1, complex_bytes, cudaMemcpyDeviceToHost,
+                         gpu_device.stream()) == cudaSuccess);
+  assert(cudaMemcpyAsync(new2.data(), d_out2, float_bytes, cudaMemcpyDeviceToHost,
+                         gpu_device.stream()) == cudaSuccess);
+
+  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 2; ++i) {
+    VERIFY_IS_APPROX(new1(i), std::complex<float>(3.14f, 2.7f));
+    VERIFY_IS_APPROX(new2(i), std::abs(in2(i)));
+  }
+
+  cudaFree(d_in1);
+  cudaFree(d_in2);
+  cudaFree(d_out2);
+}
+
+
+static void test_cuda_sum_reductions() {
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  const int num_rows = internal::random<int>(1024, 5*1024);
+  const int num_cols = internal::random<int>(1024, 5*1024);
+
+  Tensor<std::complex<float>, 2> in(num_rows, num_cols);
+  in.setRandom();
+
+  Tensor<std::complex<float>, 0> full_redux;
+  full_redux = in.sum();
+
+  std::size_t in_bytes = in.size() * sizeof(std::complex<float>);
+  std::size_t out_bytes = full_redux.size() * sizeof(std::complex<float>);
+  std::complex<float>* gpu_in_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(in_bytes));
+  std::complex<float>* gpu_out_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(out_bytes));
+  gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes);
+
+  TensorMap<Tensor<std::complex<float>, 2> > in_gpu(gpu_in_ptr, num_rows, num_cols);
+  TensorMap<Tensor<std::complex<float>, 0> > out_gpu(gpu_out_ptr);
+
+  out_gpu.device(gpu_device) = in_gpu.sum();
+
+  Tensor<std::complex<float>, 0> full_redux_gpu;
+  gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes);
+  gpu_device.synchronize();
+
+  // Check that the CPU and GPU reductions return the same result.
+  VERIFY_IS_APPROX(full_redux(), full_redux_gpu());
+
+  gpu_device.deallocate(gpu_in_ptr);
+  gpu_device.deallocate(gpu_out_ptr);
+}
+
+
+static void test_cuda_product_reductions() {
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  const int num_rows = internal::random<int>(1024, 5*1024);
+  const int num_cols = internal::random<int>(1024, 5*1024);
+
+  Tensor<std::complex<float>, 2> in(num_rows, num_cols);
+  in.setRandom();
+
+  Tensor<std::complex<float>, 0> full_redux;
+  full_redux = in.prod();
+
+  std::size_t in_bytes = in.size() * sizeof(std::complex<float>);
+  std::size_t out_bytes = full_redux.size() * sizeof(std::complex<float>);
+  std::complex<float>* gpu_in_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(in_bytes));
+  std::complex<float>* gpu_out_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(out_bytes));
+  gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes);
+
+  TensorMap<Tensor<std::complex<float>, 2> > in_gpu(gpu_in_ptr, num_rows, num_cols);
+  TensorMap<Tensor<std::complex<float>, 0> > out_gpu(gpu_out_ptr);
+
+  out_gpu.device(gpu_device) = in_gpu.prod();
+
+  Tensor<std::complex<float>, 0> full_redux_gpu;
+  gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes);
+  gpu_device.synchronize();
+
+  // Check that the CPU and GPU reductions return the same result.
+  VERIFY_IS_APPROX(full_redux(), full_redux_gpu());
+
+  gpu_device.deallocate(gpu_in_ptr);
+  gpu_device.deallocate(gpu_out_ptr);
+}
+
+
+void test_cxx11_tensor_complex()
+{
+  CALL_SUBTEST(test_cuda_nullary());
+  CALL_SUBTEST(test_cuda_sum_reductions());
+  CALL_SUBTEST(test_cuda_product_reductions());
+}
diff --git a/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu b/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu
new file mode 100644
index 000000000..2baf5eaad
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu
@@ -0,0 +1,97 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_FUNC cxx11_tensor_complex_cwise_ops
+#define EIGEN_USE_GPU
+
+#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+#include <cuda_fp16.h>
+#endif
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template<typename T>
+void test_cuda_complex_cwise_ops() {
+  const int kNumItems = 2;
+  std::size_t complex_bytes = kNumItems * sizeof(std::complex<T>);
+
+  std::complex<T>* d_in1;
+  std::complex<T>* d_in2;
+  std::complex<T>* d_out;
+  cudaMalloc((void**)(&d_in1), complex_bytes);
+  cudaMalloc((void**)(&d_in2), complex_bytes);
+  cudaMalloc((void**)(&d_out), complex_bytes);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_in1(
+      d_in1, kNumItems);
+  Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_in2(
+      d_in2, kNumItems);
+  Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_out(
+      d_out, kNumItems);
+
+  const std::complex<T> a(3.14f, 2.7f);
+  const std::complex<T> b(-10.6f, 1.4f);
+
+  gpu_in1.device(gpu_device) = gpu_in1.constant(a);
+  gpu_in2.device(gpu_device) = gpu_in2.constant(b);
+
+  enum CwiseOp {
+    Add = 0,
+    Sub,
+    Mul,
+    Div
+  };
+
+  Tensor<std::complex<T>, 1, 0, int> actual(kNumItems);
+  for (int op = Add; op <= Div; op++) {
+    std::complex<T> expected;
+    switch (static_cast<CwiseOp>(op)) {
+      case Add:
+        gpu_out.device(gpu_device) = gpu_in1 + gpu_in2;
+        expected = a + b;
+        break;
+      case Sub:
+        gpu_out.device(gpu_device) = gpu_in1 - gpu_in2;
+        expected = a - b;
+        break;
+      case Mul:
+        gpu_out.device(gpu_device) = gpu_in1 * gpu_in2;
+        expected = a * b;
+        break;
+      case Div:
+        gpu_out.device(gpu_device) = gpu_in1 / gpu_in2;
+        expected = a / b;
+        break;
+    }
+    assert(cudaMemcpyAsync(actual.data(), d_out, complex_bytes, cudaMemcpyDeviceToHost,
+                           gpu_device.stream()) == cudaSuccess);
+    assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+    for (int i = 0; i < kNumItems; ++i) {
+      VERIFY_IS_APPROX(actual(i), expected);
+    }
+  }
+
+  cudaFree(d_in1);
+  cudaFree(d_in2);
+  cudaFree(d_out);
+}
+
+
+void test_cxx11_tensor_complex_cwise_ops()
+{
+  CALL_SUBTEST(test_cuda_complex_cwise_ops<float>());
+  CALL_SUBTEST(test_cuda_complex_cwise_ops<double>());
+}
diff --git a/unsupported/test/cxx11_tensor_concatenation.cpp b/unsupported/test/cxx11_tensor_concatenation.cpp
new file mode 100644
index 000000000..03ef12e63
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_concatenation.cpp
@@ -0,0 +1,137 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template<int DataLayout>
+static void test_dimension_failures()
+{
+  Tensor<int, 3, DataLayout> left(2, 3, 1);
+  Tensor<int, 3, DataLayout> right(3, 3, 1);
+  left.setRandom();
+  right.setRandom();
+
+  // Okay; other dimensions are equal.
+  Tensor<int, 3, DataLayout> concatenation = left.concatenate(right, 0);
+
+  // Dimension mismatches.
+  VERIFY_RAISES_ASSERT(concatenation = left.concatenate(right, 1));
+  VERIFY_RAISES_ASSERT(concatenation = left.concatenate(right, 2));
+
+  // Axis > NumDims or < 0.
+  VERIFY_RAISES_ASSERT(concatenation = left.concatenate(right, 3));
+  VERIFY_RAISES_ASSERT(concatenation = left.concatenate(right, -1));
+}
+
+template<int DataLayout>
+static void test_static_dimension_failure()
+{
+  Tensor<int, 2, DataLayout> left(2, 3);
+  Tensor<int, 3, DataLayout> right(2, 3, 1);
+
+#ifdef CXX11_TENSOR_CONCATENATION_STATIC_DIMENSION_FAILURE
+  // Technically compatible, but we static assert that the inputs have same
+  // NumDims.
+  Tensor<int, 3, DataLayout> concatenation = left.concatenate(right, 0);
+#endif
+
+  // This can be worked around in this case.
+  Tensor<int, 3, DataLayout> concatenation = left
+      .reshape(Tensor<int, 3>::Dimensions(2, 3, 1))
+      .concatenate(right, 0);
+  Tensor<int, 2, DataLayout> alternative = left
+      .concatenate(right.reshape(Tensor<int, 2>::Dimensions{{{2, 3}}}), 0);
+}
+
+template<int DataLayout>
+static void test_simple_concatenation()
+{
+  Tensor<int, 3, DataLayout> left(2, 3, 1);
+  Tensor<int, 3, DataLayout> right(2, 3, 1);
+  left.setRandom();
+  right.setRandom();
+
+  Tensor<int, 3, DataLayout> concatenation = left.concatenate(right, 0);
+  VERIFY_IS_EQUAL(concatenation.dimension(0), 4);
+  VERIFY_IS_EQUAL(concatenation.dimension(1), 3);
+  VERIFY_IS_EQUAL(concatenation.dimension(2), 1);
+  for (int j = 0; j < 3; ++j) {
+    for (int i = 0; i < 2; ++i) {
+      VERIFY_IS_EQUAL(concatenation(i, j, 0), left(i, j, 0));
+    }
+    for (int i = 2; i < 4; ++i) {
+      VERIFY_IS_EQUAL(concatenation(i, j, 0), right(i - 2, j, 0));
+    }
+  }
+
+  concatenation = left.concatenate(right, 1);
+  VERIFY_IS_EQUAL(concatenation.dimension(0), 2);
+  VERIFY_IS_EQUAL(concatenation.dimension(1), 6);
+  VERIFY_IS_EQUAL(concatenation.dimension(2), 1);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      VERIFY_IS_EQUAL(concatenation(i, j, 0), left(i, j, 0));
+    }
+    for (int j = 3; j < 6; ++j) {
+      VERIFY_IS_EQUAL(concatenation(i, j, 0), right(i, j - 3, 0));
+    }
+  }
+
+  concatenation = left.concatenate(right, 2);
+  VERIFY_IS_EQUAL(concatenation.dimension(0), 2);
+  VERIFY_IS_EQUAL(concatenation.dimension(1), 3);
+  VERIFY_IS_EQUAL(concatenation.dimension(2), 2);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      VERIFY_IS_EQUAL(concatenation(i, j, 0), left(i, j, 0));
+      VERIFY_IS_EQUAL(concatenation(i, j, 1), right(i, j, 0));
+    }
+  }
+}
+
+
+// TODO(phli): Add test once we have a real vectorized implementation.
+// static void test_vectorized_concatenation() {}
+
+static void test_concatenation_as_lvalue()
+{
+  Tensor<int, 2> t1(2, 3);
+  Tensor<int, 2> t2(2, 3);
+  t1.setRandom();
+  t2.setRandom();
+
+  Tensor<int, 2> result(4, 3);
+  result.setRandom();
+  t1.concatenate(t2, 0) = result;
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      VERIFY_IS_EQUAL(t1(i, j), result(i, j));
+      VERIFY_IS_EQUAL(t2(i, j), result(i+2, j));
+    }
+  }
+}
+
+
+void test_cxx11_tensor_concatenation()
+{
+   CALL_SUBTEST(test_dimension_failures<ColMajor>());
+   CALL_SUBTEST(test_dimension_failures<RowMajor>());
+   CALL_SUBTEST(test_static_dimension_failure<ColMajor>());
+   CALL_SUBTEST(test_static_dimension_failure<RowMajor>());
+   CALL_SUBTEST(test_simple_concatenation<ColMajor>());
+   CALL_SUBTEST(test_simple_concatenation<RowMajor>());
+   // CALL_SUBTEST(test_vectorized_concatenation());
+   CALL_SUBTEST(test_concatenation_as_lvalue());
+
+}
diff --git a/unsupported/test/cxx11_tensor_const.cpp b/unsupported/test/cxx11_tensor_const.cpp
new file mode 100644
index 000000000..ad9c9da39
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_const.cpp
@@ -0,0 +1,62 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+using Eigen::Tensor;
+
+
+static void test_simple_assign()
+{
+  Tensor<int, 3> random(2,3,7);
+  random.setRandom();
+
+  TensorMap<Tensor<const int, 3> > constant(random.data(), 2, 3, 7);
+  Tensor<int, 3> result(2,3,7);
+  result = constant;
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL((result(i,j,k)), random(i,j,k));
+      }
+    }
+  }
+}
+
+
+static void test_assign_of_const_tensor()
+{
+  Tensor<int, 3> random(2,3,7);
+  random.setRandom();
+
+  TensorMap<Tensor<const int, 3> > constant1(random.data(), 2, 3, 7);
+  TensorMap<const Tensor<int, 3> > constant2(random.data(), 2, 3, 7);
+  const TensorMap<Tensor<int, 3> > constant3(random.data(), 2, 3, 7);
+
+  Tensor<int, 2> result1 = constant1.chip(0, 2);
+  Tensor<int, 2> result2 = constant2.chip(0, 2);
+  Tensor<int, 2> result3 = constant3.chip(0, 2);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      VERIFY_IS_EQUAL((result1(i,j)), random(i,j,0));
+      VERIFY_IS_EQUAL((result2(i,j)), random(i,j,0));
+      VERIFY_IS_EQUAL((result3(i,j)), random(i,j,0));
+    }
+  }
+}
+
+
+void test_cxx11_tensor_const()
+{
+  CALL_SUBTEST(test_simple_assign());
+  CALL_SUBTEST(test_assign_of_const_tensor());
+}
diff --git a/unsupported/test/cxx11_tensor_contract_cuda.cu b/unsupported/test/cxx11_tensor_contract_cuda.cu
new file mode 100644
index 000000000..dd68430ce
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_contract_cuda.cu
@@ -0,0 +1,216 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2014 Navdeep Jaitly <ndjaitly@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_cuda
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_GPU
+
+#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+#include <cuda_fp16.h>
+#endif
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+typedef Tensor<float, 1>::DimensionPair DimPair;
+
+template<int DataLayout>
+void test_cuda_contraction(int m_size, int k_size, int n_size)
+{
+  std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
+  // with these dimensions, the output has 300 * 140 elements, which is
+  // more than 30 * 1024, which is the number of threads in blocks on
+  // a 15 SM GK110 GPU
+  Tensor<float, 2, DataLayout> t_left(m_size, k_size);
+  Tensor<float, 2, DataLayout> t_right(k_size, n_size);
+  Tensor<float, 2, DataLayout> t_result(m_size, n_size);
+  Tensor<float, 2, DataLayout> t_result_gpu(m_size, n_size);
+  Eigen::array<DimPair, 1> dims(DimPair(1, 0));
+
+  t_left.setRandom();
+  t_right.setRandom();
+
+  std::size_t t_left_bytes = t_left.size()  * sizeof(float);
+  std::size_t t_right_bytes = t_right.size() * sizeof(float);
+  std::size_t t_result_bytes = t_result.size() * sizeof(float);
+
+  float* d_t_left;
+  float* d_t_right;
+  float* d_t_result;
+
+  cudaMalloc((void**)(&d_t_left), t_left_bytes);
+  cudaMalloc((void**)(&d_t_right), t_right_bytes);
+  cudaMalloc((void**)(&d_t_result), t_result_bytes);
+
+  cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
+      gpu_t_left(d_t_left, Eigen::array<int, 2>(m_size, k_size));
+  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
+      gpu_t_right(d_t_right, Eigen::array<int, 2>(k_size, n_size));
+  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
+      gpu_t_result(d_t_result, Eigen::array<int, 2>(m_size, n_size));
+
+
+  gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
+  t_result = t_left.contract(t_right, dims);
+
+  cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);
+  for (DenseIndex i = 0; i < t_result.size(); i++) {
+    if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) {
+      continue;
+    }
+    if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) {
+      continue;
+    }
+    std::cout << "mismatch detected at index " << i << ": " << t_result(i)
+              << " vs " <<  t_result_gpu(i) << std::endl;
+    assert(false);
+  }
+
+  cudaFree((void*)d_t_left);
+  cudaFree((void*)d_t_right);
+  cudaFree((void*)d_t_result);
+}
+
+
+template<int DataLayout>
+void test_scalar(int m_size, int k_size, int n_size)
+{
+  std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
+  // with these dimensions, the output has 300 * 140 elements, which is
+  // more than 30 * 1024, which is the number of threads in blocks on
+  // a 15 SM GK110 GPU
+  Tensor<float, 2, DataLayout> t_left(m_size, k_size);
+  Tensor<float, 2, DataLayout> t_right(k_size, n_size);
+  Tensor<float, 0, DataLayout> t_result;
+  Tensor<float, 0, DataLayout> t_result_gpu;
+  Eigen::array<DimPair, 2> dims(DimPair(0, 0), DimPair(1, 1));
+
+  t_left.setRandom();
+  t_right.setRandom();
+
+  std::size_t t_left_bytes = t_left.size()  * sizeof(float);
+  std::size_t t_right_bytes = t_right.size() * sizeof(float);
+  std::size_t t_result_bytes = sizeof(float);
+
+  float* d_t_left;
+  float* d_t_right;
+  float* d_t_result;
+
+  cudaMalloc((void**)(&d_t_left), t_left_bytes);
+  cudaMalloc((void**)(&d_t_right), t_right_bytes);
+  cudaMalloc((void**)(&d_t_result), t_result_bytes);
+
+  cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
+      gpu_t_left(d_t_left, m_size, k_size);
+  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
+      gpu_t_right(d_t_right, k_size, n_size);
+  Eigen::TensorMap<Eigen::Tensor<float, 0, DataLayout> >
+      gpu_t_result(d_t_result);
+
+  gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
+  t_result = t_left.contract(t_right, dims);
+
+  cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);
+  if (fabs(t_result() - t_result_gpu()) > 1e-4f &&
+      !Eigen::internal::isApprox(t_result(), t_result_gpu(), 1e-4f)) {
+    std::cout << "mismatch detected: " << t_result()
+              << " vs " <<  t_result_gpu() << std::endl;
+    assert(false);
+  }
+
+  cudaFree((void*)d_t_left);
+  cudaFree((void*)d_t_right);
+  cudaFree((void*)d_t_result);
+}
+
+
+template<int DataLayout>
+void test_cuda_contraction_m() {
+  for (int k = 32; k < 256; k++) {
+    test_cuda_contraction<ColMajor>(k, 128, 128);
+    test_cuda_contraction<RowMajor>(k, 128, 128);
+  }
+}
+
+template<int DataLayout>
+void test_cuda_contraction_k() {
+  for (int k = 32; k < 256; k++) {
+    test_cuda_contraction<ColMajor>(128, k, 128);
+    test_cuda_contraction<RowMajor>(128, k, 128);
+  }
+}
+
+template<int DataLayout>
+void test_cuda_contraction_n() {
+  for (int k = 32; k < 256; k++) {
+    test_cuda_contraction<ColMajor>(128, 128, k);
+    test_cuda_contraction<RowMajor>(128, 128, k);
+  }
+}
+
+
+template<int DataLayout>
+void test_cuda_contraction_sizes() {
+  int m_sizes[] = { 31,  39,   63,   64,   65,
+                   127, 129,  255,  257 , 511,
+                   512, 513, 1023, 1024, 1025};
+
+  int n_sizes[] = { 31,  39,   63,   64,   65,
+                   127, 129,  255,  257,  511,
+                   512, 513, 1023, 1024, 1025};
+
+  int k_sizes[] = {  31,   39,  63,  64,   65,
+                     95,   96, 127, 129,  255,
+                    257,  511, 512, 513, 1023,
+                   1024, 1025};
+
+  for (int i = 0; i < 15; i++) {
+    for (int j = 0; j < 15; j++) {
+      for (int k = 0; k < 17; k++) {
+        test_cuda_contraction<DataLayout>(m_sizes[i], n_sizes[j], k_sizes[k]);
+      }
+    }
+  }
+}
+
+void test_cxx11_tensor_cuda()
+{
+  CALL_SUBTEST_1(test_cuda_contraction<ColMajor>(128, 128, 128));
+  CALL_SUBTEST_1(test_cuda_contraction<RowMajor>(128, 128, 128));
+
+  CALL_SUBTEST_1(test_scalar<ColMajor>(128, 128, 128));
+  CALL_SUBTEST_1(test_scalar<RowMajor>(128, 128, 128));
+
+  CALL_SUBTEST_2(test_cuda_contraction_m<ColMajor>());
+  CALL_SUBTEST_3(test_cuda_contraction_m<RowMajor>());
+
+  CALL_SUBTEST_4(test_cuda_contraction_k<ColMajor>());
+  CALL_SUBTEST_5(test_cuda_contraction_k<RowMajor>());
+
+  CALL_SUBTEST_6(test_cuda_contraction_n<ColMajor>());
+  CALL_SUBTEST_7(test_cuda_contraction_n<RowMajor>());
+
+  CALL_SUBTEST_8(test_cuda_contraction_sizes<ColMajor>());
+  CALL_SUBTEST_9(test_cuda_contraction_sizes<RowMajor>());
+}
diff --git a/unsupported/test/cxx11_tensor_contraction.cpp b/unsupported/test/cxx11_tensor_contraction.cpp
new file mode 100644
index 000000000..ace97057f
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_contraction.cpp
@@ -0,0 +1,545 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::DefaultDevice;
+using Eigen::Tensor;
+
+typedef Tensor<float, 1>::DimensionPair DimPair;
+
+template<int DataLayout>
+static void test_evals()
+{
+  Tensor<float, 2, DataLayout> mat1(2, 3);
+  Tensor<float, 2, DataLayout> mat2(2, 3);
+  Tensor<float, 2, DataLayout> mat3(3, 2);
+
+  mat1.setRandom();
+  mat2.setRandom();
+  mat3.setRandom();
+
+  Tensor<float, 2, DataLayout> mat4(3,3);
+  mat4.setZero();
+  Eigen::array<DimPair, 1> dims3 = {{DimPair(0, 0)}};
+  typedef TensorEvaluator<decltype(mat1.contract(mat2, dims3)), DefaultDevice> Evaluator;
+  Evaluator eval(mat1.contract(mat2, dims3), DefaultDevice());
+  eval.evalTo(mat4.data());
+  EIGEN_STATIC_ASSERT(Evaluator::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE);
+  VERIFY_IS_EQUAL(eval.dimensions()[0], 3);
+  VERIFY_IS_EQUAL(eval.dimensions()[1], 3);
+
+  VERIFY_IS_APPROX(mat4(0,0), mat1(0,0)*mat2(0,0) + mat1(1,0)*mat2(1,0));
+  VERIFY_IS_APPROX(mat4(0,1), mat1(0,0)*mat2(0,1) + mat1(1,0)*mat2(1,1));
+  VERIFY_IS_APPROX(mat4(0,2), mat1(0,0)*mat2(0,2) + mat1(1,0)*mat2(1,2));
+  VERIFY_IS_APPROX(mat4(1,0), mat1(0,1)*mat2(0,0) + mat1(1,1)*mat2(1,0));
+  VERIFY_IS_APPROX(mat4(1,1), mat1(0,1)*mat2(0,1) + mat1(1,1)*mat2(1,1));
+  VERIFY_IS_APPROX(mat4(1,2), mat1(0,1)*mat2(0,2) + mat1(1,1)*mat2(1,2));
+  VERIFY_IS_APPROX(mat4(2,0), mat1(0,2)*mat2(0,0) + mat1(1,2)*mat2(1,0));
+  VERIFY_IS_APPROX(mat4(2,1), mat1(0,2)*mat2(0,1) + mat1(1,2)*mat2(1,1));
+  VERIFY_IS_APPROX(mat4(2,2), mat1(0,2)*mat2(0,2) + mat1(1,2)*mat2(1,2));
+
+  Tensor<float, 2, DataLayout> mat5(2,2);
+  mat5.setZero();
+  Eigen::array<DimPair, 1> dims4 = {{DimPair(1, 1)}};
+  typedef TensorEvaluator<decltype(mat1.contract(mat2, dims4)), DefaultDevice> Evaluator2;
+  Evaluator2 eval2(mat1.contract(mat2, dims4), DefaultDevice());
+  eval2.evalTo(mat5.data());
+  EIGEN_STATIC_ASSERT(Evaluator2::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE);
+  VERIFY_IS_EQUAL(eval2.dimensions()[0], 2);
+  VERIFY_IS_EQUAL(eval2.dimensions()[1], 2);
+
+  VERIFY_IS_APPROX(mat5(0,0), mat1(0,0)*mat2(0,0) + mat1(0,1)*mat2(0,1) + mat1(0,2)*mat2(0,2));
+  VERIFY_IS_APPROX(mat5(0,1), mat1(0,0)*mat2(1,0) + mat1(0,1)*mat2(1,1) + mat1(0,2)*mat2(1,2));
+  VERIFY_IS_APPROX(mat5(1,0), mat1(1,0)*mat2(0,0) + mat1(1,1)*mat2(0,1) + mat1(1,2)*mat2(0,2));
+  VERIFY_IS_APPROX(mat5(1,1), mat1(1,0)*mat2(1,0) + mat1(1,1)*mat2(1,1) + mat1(1,2)*mat2(1,2));
+
+  Tensor<float, 2, DataLayout> mat6(2,2);
+  mat6.setZero();
+  Eigen::array<DimPair, 1> dims6 = {{DimPair(1, 0)}};
+  typedef TensorEvaluator<decltype(mat1.contract(mat3, dims6)), DefaultDevice> Evaluator3;
+  Evaluator3 eval3(mat1.contract(mat3, dims6), DefaultDevice());
+  eval3.evalTo(mat6.data());
+  EIGEN_STATIC_ASSERT(Evaluator3::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE);
+  VERIFY_IS_EQUAL(eval3.dimensions()[0], 2);
+  VERIFY_IS_EQUAL(eval3.dimensions()[1], 2);
+
+  VERIFY_IS_APPROX(mat6(0,0), mat1(0,0)*mat3(0,0) + mat1(0,1)*mat3(1,0) + mat1(0,2)*mat3(2,0));
+  VERIFY_IS_APPROX(mat6(0,1), mat1(0,0)*mat3(0,1) + mat1(0,1)*mat3(1,1) + mat1(0,2)*mat3(2,1));
+  VERIFY_IS_APPROX(mat6(1,0), mat1(1,0)*mat3(0,0) + mat1(1,1)*mat3(1,0) + mat1(1,2)*mat3(2,0));
+  VERIFY_IS_APPROX(mat6(1,1), mat1(1,0)*mat3(0,1) + mat1(1,1)*mat3(1,1) + mat1(1,2)*mat3(2,1));
+}
+
+template<int DataLayout>
+static void test_scalar()
+{
+  Tensor<float, 1, DataLayout> vec1({6});
+  Tensor<float, 1, DataLayout> vec2({6});
+
+  vec1.setRandom();
+  vec2.setRandom();
+
+  Eigen::array<DimPair, 1> dims = {{DimPair(0, 0)}};
+  Tensor<float, 0, DataLayout> scalar = vec1.contract(vec2, dims);
+
+  float expected = 0.0f;
+  for (int i = 0; i < 6; ++i) {
+    expected += vec1(i) * vec2(i);
+  }
+  VERIFY_IS_APPROX(scalar(), expected);
+}
+
+template<int DataLayout>
+static void test_multidims()
+{
+  Tensor<float, 3, DataLayout> mat1(2, 2, 2);
+  Tensor<float, 4, DataLayout> mat2(2, 2, 2, 2);
+
+  mat1.setRandom();
+  mat2.setRandom();
+
+  Tensor<float, 3, DataLayout> mat3(2, 2, 2);
+  mat3.setZero();
+  Eigen::array<DimPair, 2> dims = {{DimPair(1, 2), DimPair(2, 3)}};
+  typedef TensorEvaluator<decltype(mat1.contract(mat2, dims)), DefaultDevice> Evaluator;
+  Evaluator eval(mat1.contract(mat2, dims), DefaultDevice());
+  eval.evalTo(mat3.data());
+  EIGEN_STATIC_ASSERT(Evaluator::NumDims==3ul, YOU_MADE_A_PROGRAMMING_MISTAKE);
+  VERIFY_IS_EQUAL(eval.dimensions()[0], 2);
+  VERIFY_IS_EQUAL(eval.dimensions()[1], 2);
+  VERIFY_IS_EQUAL(eval.dimensions()[2], 2);
+
+  VERIFY_IS_APPROX(mat3(0,0,0), mat1(0,0,0)*mat2(0,0,0,0) + mat1(0,1,0)*mat2(0,0,1,0) +
+                                mat1(0,0,1)*mat2(0,0,0,1) + mat1(0,1,1)*mat2(0,0,1,1));
+  VERIFY_IS_APPROX(mat3(0,0,1), mat1(0,0,0)*mat2(0,1,0,0) + mat1(0,1,0)*mat2(0,1,1,0) +
+                                mat1(0,0,1)*mat2(0,1,0,1) + mat1(0,1,1)*mat2(0,1,1,1));
+  VERIFY_IS_APPROX(mat3(0,1,0), mat1(0,0,0)*mat2(1,0,0,0) + mat1(0,1,0)*mat2(1,0,1,0) +
+                                mat1(0,0,1)*mat2(1,0,0,1) + mat1(0,1,1)*mat2(1,0,1,1));
+  VERIFY_IS_APPROX(mat3(0,1,1), mat1(0,0,0)*mat2(1,1,0,0) + mat1(0,1,0)*mat2(1,1,1,0) +
+                                mat1(0,0,1)*mat2(1,1,0,1) + mat1(0,1,1)*mat2(1,1,1,1));
+  VERIFY_IS_APPROX(mat3(1,0,0), mat1(1,0,0)*mat2(0,0,0,0) + mat1(1,1,0)*mat2(0,0,1,0) +
+                                mat1(1,0,1)*mat2(0,0,0,1) + mat1(1,1,1)*mat2(0,0,1,1));
+  VERIFY_IS_APPROX(mat3(1,0,1), mat1(1,0,0)*mat2(0,1,0,0) + mat1(1,1,0)*mat2(0,1,1,0) +
+                                mat1(1,0,1)*mat2(0,1,0,1) + mat1(1,1,1)*mat2(0,1,1,1));
+  VERIFY_IS_APPROX(mat3(1,1,0), mat1(1,0,0)*mat2(1,0,0,0) + mat1(1,1,0)*mat2(1,0,1,0) +
+                                mat1(1,0,1)*mat2(1,0,0,1) + mat1(1,1,1)*mat2(1,0,1,1));
+  VERIFY_IS_APPROX(mat3(1,1,1), mat1(1,0,0)*mat2(1,1,0,0) + mat1(1,1,0)*mat2(1,1,1,0) +
+                                mat1(1,0,1)*mat2(1,1,0,1) + mat1(1,1,1)*mat2(1,1,1,1));
+
+  Tensor<float, 2, DataLayout> mat4(2, 2);
+  Tensor<float, 3, DataLayout> mat5(2, 2, 2);
+
+  mat4.setRandom();
+  mat5.setRandom();
+
+  Tensor<float, 1, DataLayout> mat6(2);
+  mat6.setZero();
+  Eigen::array<DimPair, 2> dims2({{DimPair(0, 1), DimPair(1, 0)}});
+  typedef TensorEvaluator<decltype(mat4.contract(mat5, dims2)), DefaultDevice> Evaluator2;
+  Evaluator2 eval2(mat4.contract(mat5, dims2), DefaultDevice());
+  eval2.evalTo(mat6.data());
+  EIGEN_STATIC_ASSERT(Evaluator2::NumDims==1ul, YOU_MADE_A_PROGRAMMING_MISTAKE);
+  VERIFY_IS_EQUAL(eval2.dimensions()[0], 2);
+
+  VERIFY_IS_APPROX(mat6(0), mat4(0,0)*mat5(0,0,0) + mat4(1,0)*mat5(0,1,0) +
+                   mat4(0,1)*mat5(1,0,0) + mat4(1,1)*mat5(1,1,0));
+  VERIFY_IS_APPROX(mat6(1), mat4(0,0)*mat5(0,0,1) + mat4(1,0)*mat5(0,1,1) +
+                   mat4(0,1)*mat5(1,0,1) + mat4(1,1)*mat5(1,1,1));
+}
+
+template<int DataLayout>
+static void test_holes() {
+  Tensor<float, 4, DataLayout> t1(2, 5, 7, 3);
+  Tensor<float, 5, DataLayout> t2(2, 7, 11, 13, 3);
+  t1.setRandom();
+  t2.setRandom();
+
+  Eigen::array<DimPair, 2> dims = {{DimPair(0, 0), DimPair(3, 4)}};
+  Tensor<float, 5, DataLayout> result = t1.contract(t2, dims);
+  VERIFY_IS_EQUAL(result.dimension(0), 5);
+  VERIFY_IS_EQUAL(result.dimension(1), 7);
+  VERIFY_IS_EQUAL(result.dimension(2), 7);
+  VERIFY_IS_EQUAL(result.dimension(3), 11);
+  VERIFY_IS_EQUAL(result.dimension(4), 13);
+
+  for (int i = 0; i < 5; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 5; ++l) {
+          for (int m = 0; m < 5; ++m) {
+            VERIFY_IS_APPROX(result(i, j, k, l, m),
+                             t1(0, i, j, 0) * t2(0, k, l, m, 0) +
+                             t1(1, i, j, 0) * t2(1, k, l, m, 0) +
+                             t1(0, i, j, 1) * t2(0, k, l, m, 1) +
+                             t1(1, i, j, 1) * t2(1, k, l, m, 1) +
+                             t1(0, i, j, 2) * t2(0, k, l, m, 2) +
+                             t1(1, i, j, 2) * t2(1, k, l, m, 2));
+          }
+        }
+      }
+    }
+  }
+}
+
+template<int DataLayout>
+static void test_full_redux()
+{
+  Tensor<float, 2, DataLayout> t1(2, 2);
+  Tensor<float, 3, DataLayout> t2(2, 2, 2);
+  t1.setRandom();
+  t2.setRandom();
+
+  Eigen::array<DimPair, 2> dims = {{DimPair(0, 0), DimPair(1, 1)}};
+  Tensor<float, 1, DataLayout> result = t1.contract(t2, dims);
+  VERIFY_IS_EQUAL(result.dimension(0), 2);
+  VERIFY_IS_APPROX(result(0), t1(0, 0) * t2(0, 0, 0) +  t1(1, 0) * t2(1, 0, 0)
+                            + t1(0, 1) * t2(0, 1, 0) +  t1(1, 1) * t2(1, 1, 0));
+  VERIFY_IS_APPROX(result(1), t1(0, 0) * t2(0, 0, 1) +  t1(1, 0) * t2(1, 0, 1)
+                            + t1(0, 1) * t2(0, 1, 1) +  t1(1, 1) * t2(1, 1, 1));
+
+  dims[0] = DimPair(1, 0);
+  dims[1] = DimPair(2, 1);
+  result = t2.contract(t1, dims);
+  VERIFY_IS_EQUAL(result.dimension(0), 2);
+  VERIFY_IS_APPROX(result(0), t1(0, 0) * t2(0, 0, 0) +  t1(1, 0) * t2(0, 1, 0)
+                            + t1(0, 1) * t2(0, 0, 1) +  t1(1, 1) * t2(0, 1, 1));
+  VERIFY_IS_APPROX(result(1), t1(0, 0) * t2(1, 0, 0) +  t1(1, 0) * t2(1, 1, 0)
+                            + t1(0, 1) * t2(1, 0, 1) +  t1(1, 1) * t2(1, 1, 1));
+}
+
+template<int DataLayout>
+static void test_contraction_of_contraction()
+{
+  Tensor<float, 2, DataLayout> t1(2, 2);
+  Tensor<float, 2, DataLayout> t2(2, 2);
+  Tensor<float, 2, DataLayout> t3(2, 2);
+  Tensor<float, 2, DataLayout> t4(2, 2);
+  t1.setRandom();
+  t2.setRandom();
+  t3.setRandom();
+  t4.setRandom();
+
+  Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
+  auto contract1 = t1.contract(t2, dims);
+  auto diff = t3 - contract1;
+  auto contract2 = t1.contract(t4, dims);
+  Tensor<float, 2, DataLayout> result = contract2.contract(diff, dims);
+
+  VERIFY_IS_EQUAL(result.dimension(0), 2);
+  VERIFY_IS_EQUAL(result.dimension(1), 2);
+
+  Eigen::Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>>
+      m1(t1.data(), 2, 2), m2(t2.data(), 2, 2), m3(t3.data(), 2, 2),
+      m4(t4.data(), 2, 2);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>
+      expected = (m1 * m4) * (m3 - m1 * m2);
+
+  VERIFY_IS_APPROX(result(0, 0), expected(0, 0));
+  VERIFY_IS_APPROX(result(0, 1), expected(0, 1));
+  VERIFY_IS_APPROX(result(1, 0), expected(1, 0));
+  VERIFY_IS_APPROX(result(1, 1), expected(1, 1));
+}
+
+template<int DataLayout>
+static void test_expr()
+{
+  Tensor<float, 2, DataLayout> mat1(2, 3);
+  Tensor<float, 2, DataLayout> mat2(3, 2);
+  mat1.setRandom();
+  mat2.setRandom();
+
+  Tensor<float, 2, DataLayout> mat3(2,2);
+
+  Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
+  mat3 = mat1.contract(mat2, dims);
+
+  VERIFY_IS_APPROX(mat3(0,0), mat1(0,0)*mat2(0,0) + mat1(0,1)*mat2(1,0) + mat1(0,2)*mat2(2,0));
+  VERIFY_IS_APPROX(mat3(0,1), mat1(0,0)*mat2(0,1) + mat1(0,1)*mat2(1,1) + mat1(0,2)*mat2(2,1));
+  VERIFY_IS_APPROX(mat3(1,0), mat1(1,0)*mat2(0,0) + mat1(1,1)*mat2(1,0) + mat1(1,2)*mat2(2,0));
+  VERIFY_IS_APPROX(mat3(1,1), mat1(1,0)*mat2(0,1) + mat1(1,1)*mat2(1,1) + mat1(1,2)*mat2(2,1));
+}
+
+template<int DataLayout>
+static void test_out_of_order_contraction()
+{
+  Tensor<float, 3, DataLayout> mat1(2, 2, 2);
+  Tensor<float, 3, DataLayout> mat2(2, 2, 2);
+
+  mat1.setRandom();
+  mat2.setRandom();
+
+  Tensor<float, 2, DataLayout> mat3(2, 2);
+
+  Eigen::array<DimPair, 2> dims = {{DimPair(2, 0), DimPair(0, 2)}};
+  mat3 = mat1.contract(mat2, dims);
+
+  VERIFY_IS_APPROX(mat3(0, 0),
+                   mat1(0,0,0)*mat2(0,0,0) + mat1(1,0,0)*mat2(0,0,1) +
+                   mat1(0,0,1)*mat2(1,0,0) + mat1(1,0,1)*mat2(1,0,1));
+  VERIFY_IS_APPROX(mat3(1, 0),
+                   mat1(0,1,0)*mat2(0,0,0) + mat1(1,1,0)*mat2(0,0,1) +
+                   mat1(0,1,1)*mat2(1,0,0) + mat1(1,1,1)*mat2(1,0,1));
+  VERIFY_IS_APPROX(mat3(0, 1),
+                   mat1(0,0,0)*mat2(0,1,0) + mat1(1,0,0)*mat2(0,1,1) +
+                   mat1(0,0,1)*mat2(1,1,0) + mat1(1,0,1)*mat2(1,1,1));
+  VERIFY_IS_APPROX(mat3(1, 1),
+                   mat1(0,1,0)*mat2(0,1,0) + mat1(1,1,0)*mat2(0,1,1) +
+                   mat1(0,1,1)*mat2(1,1,0) + mat1(1,1,1)*mat2(1,1,1));
+
+  Eigen::array<DimPair, 2> dims2 = {{DimPair(0, 2), DimPair(2, 0)}};
+  mat3 = mat1.contract(mat2, dims2);
+
+  VERIFY_IS_APPROX(mat3(0, 0),
+                   mat1(0,0,0)*mat2(0,0,0) + mat1(1,0,0)*mat2(0,0,1) +
+                   mat1(0,0,1)*mat2(1,0,0) + mat1(1,0,1)*mat2(1,0,1));
+  VERIFY_IS_APPROX(mat3(1, 0),
+                   mat1(0,1,0)*mat2(0,0,0) + mat1(1,1,0)*mat2(0,0,1) +
+                   mat1(0,1,1)*mat2(1,0,0) + mat1(1,1,1)*mat2(1,0,1));
+  VERIFY_IS_APPROX(mat3(0, 1),
+                   mat1(0,0,0)*mat2(0,1,0) + mat1(1,0,0)*mat2(0,1,1) +
+                   mat1(0,0,1)*mat2(1,1,0) + mat1(1,0,1)*mat2(1,1,1));
+  VERIFY_IS_APPROX(mat3(1, 1),
+                   mat1(0,1,0)*mat2(0,1,0) + mat1(1,1,0)*mat2(0,1,1) +
+                   mat1(0,1,1)*mat2(1,1,0) + mat1(1,1,1)*mat2(1,1,1));
+
+}
+
+template<int DataLayout>
+static void test_consistency()
+{
+  // this does something like testing (A*B)^T = (B^T * A^T)
+
+  Tensor<float, 3, DataLayout> mat1(4, 3, 5);
+  Tensor<float, 5, DataLayout> mat2(3, 2, 1, 5, 4);
+  mat1.setRandom();
+  mat2.setRandom();
+
+  Tensor<float, 4, DataLayout> mat3(5, 2, 1, 5);
+  Tensor<float, 4, DataLayout> mat4(2, 1, 5, 5);
+
+  // contract on dimensions of size 4 and 3
+  Eigen::array<DimPair, 2> dims1 = {{DimPair(0, 4), DimPair(1, 0)}};
+  Eigen::array<DimPair, 2> dims2 = {{DimPair(4, 0), DimPair(0, 1)}};
+
+  mat3 = mat1.contract(mat2, dims1);
+  mat4 = mat2.contract(mat1, dims2);
+
+  // check that these are equal except for ordering of dimensions
+  if (DataLayout == ColMajor) {
+    for (size_t i = 0; i < 5; i++) {
+      for (size_t j = 0; j < 10; j++) {
+        VERIFY_IS_APPROX(mat3.data()[i + 5 * j], mat4.data()[j + 10 * i]);
+      }
+    }
+  } else {
+    // Row major
+    for (size_t i = 0; i < 5; i++) {
+      for (size_t j = 0; j < 10; j++) {
+        VERIFY_IS_APPROX(mat3.data()[10 * i + j], mat4.data()[i + 5 * j]);
+      }
+    }
+  }
+}
+
+template<int DataLayout>
+static void test_large_contraction()
+{
+  Tensor<float, 4, DataLayout> t_left(30, 50, 8, 31);
+  Tensor<float, 5, DataLayout> t_right(8, 31, 7, 20, 10);
+  Tensor<float, 5, DataLayout> t_result(30, 50, 7, 20, 10);
+
+  t_left.setRandom();
+  t_right.setRandom();
+
+  // Add a little offset so that the results won't be close to zero.
+  t_left += t_left.constant(1.0f);
+  t_right += t_right.constant(1.0f);
+
+  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+  MapXf m_left(t_left.data(), 1500, 248);
+  MapXf m_right(t_right.data(), 248, 1400);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(1500, 1400);
+
+  // this contraction should be equivalent to a single matrix multiplication
+  Eigen::array<DimPair, 2> dims = {{DimPair(2, 0), DimPair(3, 1)}};
+
+  // compute results by separate methods
+  t_result = t_left.contract(t_right, dims);
+  m_result = m_left * m_right;
+
+  for (int i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    VERIFY(&t_result.data()[i] != &m_result.data()[i]);
+    VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]);
+  }
+}
+
+template<int DataLayout>
+static void test_matrix_vector()
+{
+  Tensor<float, 2, DataLayout> t_left(30, 50);
+  Tensor<float, 1, DataLayout> t_right(50);
+  Tensor<float, 1, DataLayout> t_result(30);
+
+  t_left.setRandom();
+  t_right.setRandom();
+
+  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+  MapXf m_left(t_left.data(), 30, 50);
+  MapXf m_right(t_right.data(), 50, 1);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(30, 1);
+
+  // this contraction should be equivalent to a single matrix multiplication
+  Eigen::array<DimPair, 1> dims{{DimPair(1, 0)}};
+
+  // compute results by separate methods
+  t_result = t_left.contract(t_right, dims);
+  m_result = m_left * m_right;
+
+  for (int i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    VERIFY(internal::isApprox(t_result(i), m_result(i, 0), 1));
+  }
+}
+
+
+template<int DataLayout>
+static void test_tensor_vector()
+{
+  Tensor<float, 3, DataLayout> t_left(7, 13, 17);
+  Tensor<float, 2, DataLayout> t_right(1, 7);
+
+  t_left.setRandom();
+  t_right.setRandom();
+
+  typedef typename Tensor<float, 1, DataLayout>::DimensionPair DimensionPair;
+  Eigen::array<DimensionPair, 1> dim_pair01{{{0, 1}}};
+  Tensor<float, 3, DataLayout> t_result = t_left.contract(t_right, dim_pair01);
+
+  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+  MapXf m_left(t_left.data(), 7, 13*17);
+  MapXf m_right(t_right.data(), 1, 7);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result = m_left.transpose() * m_right.transpose();
+
+  for (int i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    VERIFY(internal::isApprox(t_result(i), m_result(i, 0), 1));
+  }
+}
+
+
+template<int DataLayout>
+static void test_small_blocking_factors()
+{
+  Tensor<float, 4, DataLayout> t_left(30, 5, 3, 31);
+  Tensor<float, 5, DataLayout> t_right(3, 31, 7, 20, 1);
+  t_left.setRandom();
+  t_right.setRandom();
+
+  // Add a little offset so that the results won't be close to zero.
+  t_left += t_left.constant(1.0f);
+  t_right += t_right.constant(1.0f);
+
+  // Force the cache sizes, which results in smaller blocking factors.
+  Eigen::setCpuCacheSizes(896, 1920, 2944);
+
+  // this contraction should be equivalent to a single matrix multiplication
+  Eigen::array<DimPair, 2> dims = {{DimPair(2, 0), DimPair(3, 1)}};
+  Tensor<float, 5, DataLayout> t_result;
+  t_result = t_left.contract(t_right, dims);
+
+  // compute result using a simple eigen matrix product
+  Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> m_left(t_left.data(), 150, 93);
+  Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> m_right(t_right.data(), 93, 140);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result = m_left * m_right;
+
+  for (int i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]);
+  }
+}
+
+template<int DataLayout>
+static void test_tensor_product()
+{
+  Tensor<float, 2, DataLayout> mat1(2, 3);
+  Tensor<float, 2, DataLayout> mat2(4, 1);
+  mat1.setRandom();
+  mat2.setRandom();
+
+  Tensor<float, 4, DataLayout> result = mat1.contract(mat2, Eigen::array<DimPair, 0>{{}});
+
+  VERIFY_IS_EQUAL(result.dimension(0), 2);
+  VERIFY_IS_EQUAL(result.dimension(1), 3);
+  VERIFY_IS_EQUAL(result.dimension(2), 4);
+  VERIFY_IS_EQUAL(result.dimension(3), 1);
+  for (int i = 0; i < result.dimension(0); ++i) {
+    for (int j = 0; j < result.dimension(1); ++j) {
+      for (int k = 0; k < result.dimension(2); ++k) {
+        for (int l = 0; l < result.dimension(3); ++l) {
+			VERIFY_IS_APPROX(result(i, j, k, l), mat1(i, j) * mat2(k, l) );
+        }
+      }
+    }
+  }
+}
+
+
+template<int DataLayout>
+static void test_const_inputs()
+{
+  Tensor<float, 2, DataLayout> in1(2, 3);
+  Tensor<float, 2, DataLayout> in2(3, 2);
+  in1.setRandom();
+  in2.setRandom();
+
+  TensorMap<Tensor<const float, 2, DataLayout> > mat1(in1.data(), 2, 3);
+  TensorMap<Tensor<const float, 2, DataLayout> > mat2(in2.data(), 3, 2);
+  Tensor<float, 2, DataLayout> mat3(2,2);
+
+  Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
+  mat3 = mat1.contract(mat2, dims);
+
+  VERIFY_IS_APPROX(mat3(0,0), mat1(0,0)*mat2(0,0) + mat1(0,1)*mat2(1,0) + mat1(0,2)*mat2(2,0));
+  VERIFY_IS_APPROX(mat3(0,1), mat1(0,0)*mat2(0,1) + mat1(0,1)*mat2(1,1) + mat1(0,2)*mat2(2,1));
+  VERIFY_IS_APPROX(mat3(1,0), mat1(1,0)*mat2(0,0) + mat1(1,1)*mat2(1,0) + mat1(1,2)*mat2(2,0));
+  VERIFY_IS_APPROX(mat3(1,1), mat1(1,0)*mat2(0,1) + mat1(1,1)*mat2(1,1) + mat1(1,2)*mat2(2,1));
+}
+
+void test_cxx11_tensor_contraction()
+{
+  CALL_SUBTEST(test_evals<ColMajor>());
+  CALL_SUBTEST(test_evals<RowMajor>());
+  CALL_SUBTEST(test_scalar<ColMajor>());
+  CALL_SUBTEST(test_scalar<RowMajor>());
+  CALL_SUBTEST(test_multidims<ColMajor>());
+  CALL_SUBTEST(test_multidims<RowMajor>());
+  CALL_SUBTEST(test_holes<ColMajor>());
+  CALL_SUBTEST(test_holes<RowMajor>());
+  CALL_SUBTEST(test_full_redux<ColMajor>());
+  CALL_SUBTEST(test_full_redux<RowMajor>());
+  CALL_SUBTEST(test_contraction_of_contraction<ColMajor>());
+  CALL_SUBTEST(test_contraction_of_contraction<RowMajor>());
+  CALL_SUBTEST(test_expr<ColMajor>());
+  CALL_SUBTEST(test_expr<RowMajor>());
+  CALL_SUBTEST(test_out_of_order_contraction<ColMajor>());
+  CALL_SUBTEST(test_out_of_order_contraction<RowMajor>());
+  CALL_SUBTEST(test_consistency<ColMajor>());
+  CALL_SUBTEST(test_consistency<RowMajor>());
+  CALL_SUBTEST(test_large_contraction<ColMajor>());
+  CALL_SUBTEST(test_large_contraction<RowMajor>());
+  CALL_SUBTEST(test_matrix_vector<ColMajor>());
+  CALL_SUBTEST(test_matrix_vector<RowMajor>());
+  CALL_SUBTEST(test_tensor_vector<ColMajor>());
+  CALL_SUBTEST(test_tensor_vector<RowMajor>());
+  CALL_SUBTEST(test_small_blocking_factors<ColMajor>());
+  CALL_SUBTEST(test_small_blocking_factors<RowMajor>());
+  CALL_SUBTEST(test_tensor_product<ColMajor>());
+  CALL_SUBTEST(test_tensor_product<RowMajor>());
+  CALL_SUBTEST(test_const_inputs<ColMajor>());
+  CALL_SUBTEST(test_const_inputs<RowMajor>());
+}
diff --git a/unsupported/test/cxx11_tensor_convolution.cpp b/unsupported/test/cxx11_tensor_convolution.cpp
new file mode 100644
index 000000000..e3d4675eb
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_convolution.cpp
@@ -0,0 +1,149 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::DefaultDevice;
+
+template <int DataLayout>
+static void test_evals()
+{
+  Tensor<float, 2, DataLayout> input(3, 3);
+  Tensor<float, 1, DataLayout> kernel(2);
+
+  input.setRandom();
+  kernel.setRandom();
+
+  Tensor<float, 2, DataLayout> result(2,3);
+  result.setZero();
+  Eigen::array<Tensor<float, 2>::Index, 1> dims3{{0}};
+
+  typedef TensorEvaluator<decltype(input.convolve(kernel, dims3)), DefaultDevice> Evaluator;
+  Evaluator eval(input.convolve(kernel, dims3), DefaultDevice());
+  eval.evalTo(result.data());
+  EIGEN_STATIC_ASSERT(Evaluator::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE);
+  VERIFY_IS_EQUAL(eval.dimensions()[0], 2);
+  VERIFY_IS_EQUAL(eval.dimensions()[1], 3);
+
+  VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0) + input(1,0)*kernel(1));  // index 0
+  VERIFY_IS_APPROX(result(0,1), input(0,1)*kernel(0) + input(1,1)*kernel(1));  // index 2
+  VERIFY_IS_APPROX(result(0,2), input(0,2)*kernel(0) + input(1,2)*kernel(1));  // index 4
+  VERIFY_IS_APPROX(result(1,0), input(1,0)*kernel(0) + input(2,0)*kernel(1));  // index 1
+  VERIFY_IS_APPROX(result(1,1), input(1,1)*kernel(0) + input(2,1)*kernel(1));  // index 3
+  VERIFY_IS_APPROX(result(1,2), input(1,2)*kernel(0) + input(2,2)*kernel(1));  // index 5
+}
+
+template <int DataLayout>
+static void test_expr()
+{
+  Tensor<float, 2, DataLayout> input(3, 3);
+  Tensor<float, 2, DataLayout> kernel(2, 2);
+  input.setRandom();
+  kernel.setRandom();
+
+  Tensor<float, 2, DataLayout> result(2,2);
+  Eigen::array<ptrdiff_t, 2> dims;
+  dims[0] = 0;
+  dims[1] = 1;
+  result = input.convolve(kernel, dims);
+
+  VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0,0) + input(0,1)*kernel(0,1) +
+                                input(1,0)*kernel(1,0) + input(1,1)*kernel(1,1));
+  VERIFY_IS_APPROX(result(0,1), input(0,1)*kernel(0,0) + input(0,2)*kernel(0,1) +
+                                input(1,1)*kernel(1,0) + input(1,2)*kernel(1,1));
+  VERIFY_IS_APPROX(result(1,0), input(1,0)*kernel(0,0) + input(1,1)*kernel(0,1) +
+                                input(2,0)*kernel(1,0) + input(2,1)*kernel(1,1));
+  VERIFY_IS_APPROX(result(1,1), input(1,1)*kernel(0,0) + input(1,2)*kernel(0,1) +
+                                input(2,1)*kernel(1,0) + input(2,2)*kernel(1,1));
+}
+
+template <int DataLayout>
+static void test_modes() {
+  Tensor<float, 1, DataLayout> input(3);
+  Tensor<float, 1, DataLayout> kernel(3);
+  input(0) = 1.0f;
+  input(1) = 2.0f;
+  input(2) = 3.0f;
+  kernel(0) = 0.5f;
+  kernel(1) = 1.0f;
+  kernel(2) = 0.0f;
+
+  Eigen::array<ptrdiff_t, 1> dims;
+  dims[0] = 0;
+  Eigen::array<std::pair<ptrdiff_t, ptrdiff_t>, 1> padding;
+
+  // Emulate VALID mode (as defined in
+  // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
+  padding[0] = std::make_pair(0, 0);
+  Tensor<float, 1, DataLayout> valid(1);
+  valid = input.pad(padding).convolve(kernel, dims);
+  VERIFY_IS_EQUAL(valid.dimension(0), 1);
+  VERIFY_IS_APPROX(valid(0), 2.5f);
+
+  // Emulate SAME mode (as defined in
+  // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
+  padding[0] = std::make_pair(1, 1);
+  Tensor<float, 1, DataLayout> same(3);
+  same = input.pad(padding).convolve(kernel, dims);
+  VERIFY_IS_EQUAL(same.dimension(0), 3);
+  VERIFY_IS_APPROX(same(0), 1.0f);
+  VERIFY_IS_APPROX(same(1), 2.5f);
+  VERIFY_IS_APPROX(same(2), 4.0f);
+
+  // Emulate FULL mode (as defined in
+  // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
+  padding[0] = std::make_pair(2, 2);
+  Tensor<float, 1, DataLayout> full(5);
+  full = input.pad(padding).convolve(kernel, dims);
+  VERIFY_IS_EQUAL(full.dimension(0), 5);
+  VERIFY_IS_APPROX(full(0), 0.0f);
+  VERIFY_IS_APPROX(full(1), 1.0f);
+  VERIFY_IS_APPROX(full(2), 2.5f);
+  VERIFY_IS_APPROX(full(3), 4.0f);
+  VERIFY_IS_APPROX(full(4), 1.5f);
+}
+
+template <int DataLayout>
+static void test_strides() {
+  Tensor<float, 1, DataLayout> input(13);
+  Tensor<float, 1, DataLayout> kernel(3);
+  input.setRandom();
+  kernel.setRandom();
+
+  Eigen::array<ptrdiff_t, 1> dims;
+  dims[0] = 0;
+  Eigen::array<ptrdiff_t, 1> stride_of_3;
+  stride_of_3[0] = 3;
+  Eigen::array<ptrdiff_t, 1> stride_of_2;
+  stride_of_2[0] = 2;
+
+  Tensor<float, 1, DataLayout> result;
+  result = input.stride(stride_of_3).convolve(kernel, dims).stride(stride_of_2);
+
+  VERIFY_IS_EQUAL(result.dimension(0), 2);
+  VERIFY_IS_APPROX(result(0), (input(0)*kernel(0) + input(3)*kernel(1) +
+                               input(6)*kernel(2)));
+  VERIFY_IS_APPROX(result(1), (input(6)*kernel(0) + input(9)*kernel(1) +
+                               input(12)*kernel(2)));
+}
+
+void test_cxx11_tensor_convolution()
+{
+  CALL_SUBTEST(test_evals<ColMajor>());
+  CALL_SUBTEST(test_evals<RowMajor>());
+  CALL_SUBTEST(test_expr<ColMajor>());
+  CALL_SUBTEST(test_expr<RowMajor>());
+  CALL_SUBTEST(test_modes<ColMajor>());
+  CALL_SUBTEST(test_modes<RowMajor>());
+  CALL_SUBTEST(test_strides<ColMajor>());
+  CALL_SUBTEST(test_strides<RowMajor>());
+}
diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu
new file mode 100644
index 000000000..0ba9d52e9
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_cuda.cu
@@ -0,0 +1,1287 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_cuda
+#define EIGEN_USE_GPU
+
+#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+#include <cuda_fp16.h>
+#endif
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+void test_cuda_nullary() {
+  Tensor<float, 1, 0, int> in1(2);
+  Tensor<float, 1, 0, int> in2(2);
+  in1.setRandom();
+  in2.setRandom();
+
+  std::size_t tensor_bytes = in1.size() * sizeof(float);
+
+  float* d_in1;
+  float* d_in2;
+  cudaMalloc((void**)(&d_in1), tensor_bytes);
+  cudaMalloc((void**)(&d_in2), tensor_bytes);
+  cudaMemcpy(d_in1, in1.data(), tensor_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in2, in2.data(), tensor_bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in1(
+      d_in1, 2);
+  Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in2(
+      d_in2, 2);
+
+  gpu_in1.device(gpu_device) = gpu_in1.constant(3.14f);
+  gpu_in2.device(gpu_device) = gpu_in2.random();
+
+  Tensor<float, 1, 0, int> new1(2);
+  Tensor<float, 1, 0, int> new2(2);
+
+  assert(cudaMemcpyAsync(new1.data(), d_in1, tensor_bytes, cudaMemcpyDeviceToHost,
+                         gpu_device.stream()) == cudaSuccess);
+  assert(cudaMemcpyAsync(new2.data(), d_in2, tensor_bytes, cudaMemcpyDeviceToHost,
+                         gpu_device.stream()) == cudaSuccess);
+
+  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 2; ++i) {
+    VERIFY_IS_APPROX(new1(i), 3.14f);
+    VERIFY_IS_NOT_EQUAL(new2(i), in2(i));
+  }
+
+  cudaFree(d_in1);
+  cudaFree(d_in2);
+}
+
+void test_cuda_elementwise_small() {
+  Tensor<float, 1> in1(Eigen::array<Eigen::DenseIndex, 1>(2));
+  Tensor<float, 1> in2(Eigen::array<Eigen::DenseIndex, 1>(2));
+  Tensor<float, 1> out(Eigen::array<Eigen::DenseIndex, 1>(2));
+  in1.setRandom();
+  in2.setRandom();
+
+  std::size_t in1_bytes = in1.size() * sizeof(float);
+  std::size_t in2_bytes = in2.size() * sizeof(float);
+  std::size_t out_bytes = out.size() * sizeof(float);
+
+  float* d_in1;
+  float* d_in2;
+  float* d_out;
+  cudaMalloc((void**)(&d_in1), in1_bytes);
+  cudaMalloc((void**)(&d_in2), in2_bytes);
+  cudaMalloc((void**)(&d_out), out_bytes);
+
+  cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
+      d_in1, Eigen::array<Eigen::DenseIndex, 1>(2));
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in2(
+      d_in2, Eigen::array<Eigen::DenseIndex, 1>(2));
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_out(
+      d_out, Eigen::array<Eigen::DenseIndex, 1>(2));
+
+  gpu_out.device(gpu_device) = gpu_in1 + gpu_in2;
+
+  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost,
+                         gpu_device.stream()) == cudaSuccess);
+  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 2; ++i) {
+    VERIFY_IS_APPROX(
+        out(Eigen::array<Eigen::DenseIndex, 1>(i)),
+        in1(Eigen::array<Eigen::DenseIndex, 1>(i)) + in2(Eigen::array<Eigen::DenseIndex, 1>(i)));
+  }
+
+  cudaFree(d_in1);
+  cudaFree(d_in2);
+  cudaFree(d_out);
+}
+
+void test_cuda_elementwise()
+{
+  Tensor<float, 3> in1(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
+  Tensor<float, 3> in2(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
+  Tensor<float, 3> in3(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
+  Tensor<float, 3> out(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
+  in1.setRandom();
+  in2.setRandom();
+  in3.setRandom();
+
+  std::size_t in1_bytes = in1.size() * sizeof(float);
+  std::size_t in2_bytes = in2.size() * sizeof(float);
+  std::size_t in3_bytes = in3.size() * sizeof(float);
+  std::size_t out_bytes = out.size() * sizeof(float);
+
+  float* d_in1;
+  float* d_in2;
+  float* d_in3;
+  float* d_out;
+  cudaMalloc((void**)(&d_in1), in1_bytes);
+  cudaMalloc((void**)(&d_in2), in2_bytes);
+  cudaMalloc((void**)(&d_in3), in3_bytes);
+  cudaMalloc((void**)(&d_out), out_bytes);
+
+  cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in3, in3.data(), in3_bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in3(d_in3, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
+
+  gpu_out.device(gpu_device) = gpu_in1 + gpu_in2 * gpu_in3;
+
+  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 72; ++i) {
+    for (int j = 0; j < 53; ++j) {
+      for (int k = 0; k < 97; ++k) {
+        VERIFY_IS_APPROX(out(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)), in1(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)) + in2(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)) * in3(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)));
+      }
+    }
+  }
+
+  cudaFree(d_in1);
+  cudaFree(d_in2);
+  cudaFree(d_in3);
+  cudaFree(d_out);
+}
+
+void test_cuda_props() {
+  Tensor<float, 1> in1(200);
+  Tensor<bool, 1> out(200);
+  in1.setRandom();
+
+  std::size_t in1_bytes = in1.size() * sizeof(float);
+  std::size_t out_bytes = out.size() * sizeof(bool);
+
+  float* d_in1;
+  bool* d_out;
+  cudaMalloc((void**)(&d_in1), in1_bytes);
+  cudaMalloc((void**)(&d_out), out_bytes);
+
+  cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
+      d_in1, 200);
+  Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_out(
+      d_out, 200);
+
+  gpu_out.device(gpu_device) = (gpu_in1.isnan)();
+
+  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost,
+                         gpu_device.stream()) == cudaSuccess);
+  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 200; ++i) {
+    VERIFY_IS_EQUAL(out(i), (std::isnan)(in1(i)));
+  }
+
+  cudaFree(d_in1);
+  cudaFree(d_out);
+}
+
+void test_cuda_reduction()
+{
+  Tensor<float, 4> in1(72,53,97,113);
+  Tensor<float, 2> out(72,97);
+  in1.setRandom();
+
+  std::size_t in1_bytes = in1.size() * sizeof(float);
+  std::size_t out_bytes = out.size() * sizeof(float);
+
+  float* d_in1;
+  float* d_out;
+  cudaMalloc((void**)(&d_in1), in1_bytes);
+  cudaMalloc((void**)(&d_out), out_bytes);
+
+  cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, 72,53,97,113);
+  Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
+
+  array<Eigen::DenseIndex, 2> reduction_axis;
+  reduction_axis[0] = 1;
+  reduction_axis[1] = 3;
+
+  gpu_out.device(gpu_device) = gpu_in1.maximum(reduction_axis);
+
+  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 72; ++i) {
+    for (int j = 0; j < 97; ++j) {
+      float expected = 0;
+      for (int k = 0; k < 53; ++k) {
+        for (int l = 0; l < 113; ++l) {
+          expected =
+              std::max<float>(expected, in1(i, k, j, l));
+        }
+      }
+      VERIFY_IS_APPROX(out(i,j), expected);
+    }
+  }
+
+  cudaFree(d_in1);
+  cudaFree(d_out);
+}
+
+template<int DataLayout>
+void test_cuda_contraction()
+{
+  // with these dimensions, the output has 300 * 140 elements, which is
+  // more than 30 * 1024, which is the number of threads in blocks on
+  // a 15 SM GK110 GPU
+  Tensor<float, 4, DataLayout> t_left(6, 50, 3, 31);
+  Tensor<float, 5, DataLayout> t_right(Eigen::array<Eigen::DenseIndex, 5>(3, 31, 7, 20, 1));
+  Tensor<float, 5, DataLayout> t_result(Eigen::array<Eigen::DenseIndex, 5>(6, 50, 7, 20, 1));
+
+  t_left.setRandom();
+  t_right.setRandom();
+
+  std::size_t t_left_bytes = t_left.size()  * sizeof(float);
+  std::size_t t_right_bytes = t_right.size() * sizeof(float);
+  std::size_t t_result_bytes = t_result.size() * sizeof(float);
+
+  float* d_t_left;
+  float* d_t_right;
+  float* d_t_result;
+
+  cudaMalloc((void**)(&d_t_left), t_left_bytes);
+  cudaMalloc((void**)(&d_t_right), t_right_bytes);
+  cudaMalloc((void**)(&d_t_result), t_result_bytes);
+
+  cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_t_left(d_t_left, 6, 50, 3, 31);
+  Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_right(d_t_right, 3, 31, 7, 20, 1);
+  Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_result(d_t_result, 6, 50, 7, 20, 1);
+
+  typedef Eigen::Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> > MapXf;
+  MapXf m_left(t_left.data(), 300, 93);
+  MapXf m_right(t_right.data(), 93, 140);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(300, 140);
+
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+  Eigen::array<DimPair, 2> dims;
+  dims[0] = DimPair(2, 0);
+  dims[1] = DimPair(3, 1);
+
+  m_result = m_left * m_right;
+  gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
+
+  cudaMemcpy(t_result.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);
+
+  for (DenseIndex i = 0; i < t_result.size(); i++) {
+    if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
+      std::cout << "mismatch detected at index " << i << ": " << t_result.data()[i] << " vs " <<  m_result.data()[i] << std::endl;
+      assert(false);
+    }
+  }
+
+  cudaFree(d_t_left);
+  cudaFree(d_t_right);
+  cudaFree(d_t_result);
+}
+
+template<int DataLayout>
+void test_cuda_convolution_1d()
+{
+  Tensor<float, 4, DataLayout> input(74,37,11,137);
+  Tensor<float, 1, DataLayout> kernel(4);
+  Tensor<float, 4, DataLayout> out(74,34,11,137);
+  input = input.constant(10.0f) + input.random();
+  kernel = kernel.constant(7.0f) + kernel.random();
+
+  std::size_t input_bytes = input.size() * sizeof(float);
+  std::size_t kernel_bytes = kernel.size() * sizeof(float);
+  std::size_t out_bytes = out.size() * sizeof(float);
+
+  float* d_input;
+  float* d_kernel;
+  float* d_out;
+  cudaMalloc((void**)(&d_input), input_bytes);
+  cudaMalloc((void**)(&d_kernel), kernel_bytes);
+  cudaMalloc((void**)(&d_out), out_bytes);
+
+  cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input, 74,37,11,137);
+  Eigen::TensorMap<Eigen::Tensor<float, 1, DataLayout> > gpu_kernel(d_kernel, 4);
+  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out, 74,34,11,137);
+
+  Eigen::array<Eigen::DenseIndex, 1> dims(1);
+  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
+
+  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 74; ++i) {
+    for (int j = 0; j < 34; ++j) {
+      for (int k = 0; k < 11; ++k) {
+        for (int l = 0; l < 137; ++l) {
+          const float result = out(i,j,k,l);
+          const float expected = input(i,j+0,k,l) * kernel(0) + input(i,j+1,k,l) * kernel(1) +
+                                 input(i,j+2,k,l) * kernel(2) + input(i,j+3,k,l) * kernel(3);
+          VERIFY_IS_APPROX(result, expected);
+        }
+      }
+    }
+  }
+
+  cudaFree(d_input);
+  cudaFree(d_kernel);
+  cudaFree(d_out);
+}
+
+void test_cuda_convolution_inner_dim_col_major_1d()
+{
+  Tensor<float, 4, ColMajor> input(74,9,11,7);
+  Tensor<float, 1, ColMajor> kernel(4);
+  Tensor<float, 4, ColMajor> out(71,9,11,7);
+  input = input.constant(10.0f) + input.random();
+  kernel = kernel.constant(7.0f) + kernel.random();
+
+  std::size_t input_bytes = input.size() * sizeof(float);
+  std::size_t kernel_bytes = kernel.size() * sizeof(float);
+  std::size_t out_bytes = out.size() * sizeof(float);
+
+  float* d_input;
+  float* d_kernel;
+  float* d_out;
+  cudaMalloc((void**)(&d_input), input_bytes);
+  cudaMalloc((void**)(&d_kernel), kernel_bytes);
+  cudaMalloc((void**)(&d_out), out_bytes);
+
+  cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_input(d_input,74,9,11,7);
+  Eigen::TensorMap<Eigen::Tensor<float, 1, ColMajor> > gpu_kernel(d_kernel,4);
+  Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_out(d_out,71,9,11,7);
+
+  Eigen::array<Eigen::DenseIndex, 1> dims(0);
+  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
+
+  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 71; ++i) {
+    for (int j = 0; j < 9; ++j) {
+      for (int k = 0; k < 11; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          const float result = out(i,j,k,l);
+          const float expected = input(i+0,j,k,l) * kernel(0) + input(i+1,j,k,l) * kernel(1) +
+                                 input(i+2,j,k,l) * kernel(2) + input(i+3,j,k,l) * kernel(3);
+          VERIFY_IS_APPROX(result, expected);
+        }
+      }
+    }
+  }
+
+  cudaFree(d_input);
+  cudaFree(d_kernel);
+  cudaFree(d_out);
+}
+
+void test_cuda_convolution_inner_dim_row_major_1d()
+{
+  Tensor<float, 4, RowMajor> input(7,9,11,74);
+  Tensor<float, 1, RowMajor> kernel(4);
+  Tensor<float, 4, RowMajor> out(7,9,11,71);
+  input = input.constant(10.0f) + input.random();
+  kernel = kernel.constant(7.0f) + kernel.random();
+
+  std::size_t input_bytes = input.size() * sizeof(float);
+  std::size_t kernel_bytes = kernel.size() * sizeof(float);
+  std::size_t out_bytes = out.size() * sizeof(float);
+
+  float* d_input;
+  float* d_kernel;
+  float* d_out;
+  cudaMalloc((void**)(&d_input), input_bytes);
+  cudaMalloc((void**)(&d_kernel), kernel_bytes);
+  cudaMalloc((void**)(&d_out), out_bytes);
+
+  cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_input(d_input, 7,9,11,74);
+  Eigen::TensorMap<Eigen::Tensor<float, 1, RowMajor> > gpu_kernel(d_kernel, 4);
+  Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_out(d_out, 7,9,11,71);
+
+  Eigen::array<Eigen::DenseIndex, 1> dims(3);
+  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
+
+  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 7; ++i) {
+    for (int j = 0; j < 9; ++j) {
+      for (int k = 0; k < 11; ++k) {
+        for (int l = 0; l < 71; ++l) {
+          const float result = out(i,j,k,l);
+          const float expected = input(i,j,k,l+0) * kernel(0) + input(i,j,k,l+1) * kernel(1) +
+                                 input(i,j,k,l+2) * kernel(2) + input(i,j,k,l+3) * kernel(3);
+          VERIFY_IS_APPROX(result, expected);
+        }
+      }
+    }
+  }
+
+  cudaFree(d_input);
+  cudaFree(d_kernel);
+  cudaFree(d_out);
+}
+
+template<int DataLayout>
+void test_cuda_convolution_2d()
+{
+  Tensor<float, 4, DataLayout> input(74,37,11,137);
+  Tensor<float, 2, DataLayout> kernel(3,4);
+  Tensor<float, 4, DataLayout> out(74,35,8,137);
+  input = input.constant(10.0f) + input.random();
+  kernel = kernel.constant(7.0f) + kernel.random();
+
+  std::size_t input_bytes = input.size() * sizeof(float);
+  std::size_t kernel_bytes = kernel.size() * sizeof(float);
+  std::size_t out_bytes = out.size() * sizeof(float);
+
+  float* d_input;
+  float* d_kernel;
+  float* d_out;
+  cudaMalloc((void**)(&d_input), input_bytes);
+  cudaMalloc((void**)(&d_kernel), kernel_bytes);
+  cudaMalloc((void**)(&d_out), out_bytes);
+
+  cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input,74,37,11,137);
+  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_kernel(d_kernel,3,4);
+  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out,74,35,8,137);
+
+  Eigen::array<Eigen::DenseIndex, 2> dims(1,2);
+  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
+
+  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 74; ++i) {
+    for (int j = 0; j < 35; ++j) {
+      for (int k = 0; k < 8; ++k) {
+        for (int l = 0; l < 137; ++l) {
+          const float result = out(i,j,k,l);
+          const float expected = input(i,j+0,k+0,l) * kernel(0,0) +
+                                 input(i,j+1,k+0,l) * kernel(1,0) +
+                                 input(i,j+2,k+0,l) * kernel(2,0) +
+                                 input(i,j+0,k+1,l) * kernel(0,1) +
+                                 input(i,j+1,k+1,l) * kernel(1,1) +
+                                 input(i,j+2,k+1,l) * kernel(2,1) +
+                                 input(i,j+0,k+2,l) * kernel(0,2) +
+                                 input(i,j+1,k+2,l) * kernel(1,2) +
+                                 input(i,j+2,k+2,l) * kernel(2,2) +
+                                 input(i,j+0,k+3,l) * kernel(0,3) +
+                                 input(i,j+1,k+3,l) * kernel(1,3) +
+                                 input(i,j+2,k+3,l) * kernel(2,3);
+          VERIFY_IS_APPROX(result, expected);
+        }
+      }
+    }
+  }
+
+  cudaFree(d_input);
+  cudaFree(d_kernel);
+  cudaFree(d_out);
+}
+
+template<int DataLayout>
+void test_cuda_convolution_3d()
+{
+  Tensor<float, 5, DataLayout> input(Eigen::array<Eigen::DenseIndex, 5>(74,37,11,137,17));
+  Tensor<float, 3, DataLayout> kernel(3,4,2);
+  Tensor<float, 5, DataLayout> out(Eigen::array<Eigen::DenseIndex, 5>(74,35,8,136,17));
+  input = input.constant(10.0f) + input.random();
+  kernel = kernel.constant(7.0f) + kernel.random();
+
+  std::size_t input_bytes = input.size() * sizeof(float);
+  std::size_t kernel_bytes = kernel.size() * sizeof(float);
+  std::size_t out_bytes = out.size() * sizeof(float);
+
+  float* d_input;
+  float* d_kernel;
+  float* d_out;
+  cudaMalloc((void**)(&d_input), input_bytes);
+  cudaMalloc((void**)(&d_kernel), kernel_bytes);
+  cudaMalloc((void**)(&d_out), out_bytes);
+
+  cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;    
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_input(d_input,74,37,11,137,17);
+  Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> > gpu_kernel(d_kernel,3,4,2);
+  Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_out(d_out,74,35,8,136,17);
+
+  Eigen::array<Eigen::DenseIndex, 3> dims(1,2,3);
+  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
+
+  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 74; ++i) {
+    for (int j = 0; j < 35; ++j) {
+      for (int k = 0; k < 8; ++k) {
+        for (int l = 0; l < 136; ++l) {
+          for (int m = 0; m < 17; ++m) {
+            const float result = out(i,j,k,l,m);
+            const float expected = input(i,j+0,k+0,l+0,m) * kernel(0,0,0) +
+                                   input(i,j+1,k+0,l+0,m) * kernel(1,0,0) +
+                                   input(i,j+2,k+0,l+0,m) * kernel(2,0,0) +
+                                   input(i,j+0,k+1,l+0,m) * kernel(0,1,0) +
+                                   input(i,j+1,k+1,l+0,m) * kernel(1,1,0) +
+                                   input(i,j+2,k+1,l+0,m) * kernel(2,1,0) +
+                                   input(i,j+0,k+2,l+0,m) * kernel(0,2,0) +
+                                   input(i,j+1,k+2,l+0,m) * kernel(1,2,0) +
+                                   input(i,j+2,k+2,l+0,m) * kernel(2,2,0) +
+                                   input(i,j+0,k+3,l+0,m) * kernel(0,3,0) +
+                                   input(i,j+1,k+3,l+0,m) * kernel(1,3,0) +
+                                   input(i,j+2,k+3,l+0,m) * kernel(2,3,0) +
+                                   input(i,j+0,k+0,l+1,m) * kernel(0,0,1) +
+                                   input(i,j+1,k+0,l+1,m) * kernel(1,0,1) +
+                                   input(i,j+2,k+0,l+1,m) * kernel(2,0,1) +
+                                   input(i,j+0,k+1,l+1,m) * kernel(0,1,1) +
+                                   input(i,j+1,k+1,l+1,m) * kernel(1,1,1) +
+                                   input(i,j+2,k+1,l+1,m) * kernel(2,1,1) +
+                                   input(i,j+0,k+2,l+1,m) * kernel(0,2,1) +
+                                   input(i,j+1,k+2,l+1,m) * kernel(1,2,1) +
+                                   input(i,j+2,k+2,l+1,m) * kernel(2,2,1) +
+                                   input(i,j+0,k+3,l+1,m) * kernel(0,3,1) +
+                                   input(i,j+1,k+3,l+1,m) * kernel(1,3,1) +
+                                   input(i,j+2,k+3,l+1,m) * kernel(2,3,1);
+            VERIFY_IS_APPROX(result, expected);
+          }
+        }
+      }
+    }
+  }
+
+  cudaFree(d_input);
+  cudaFree(d_kernel);
+  cudaFree(d_out);
+}
+
+
+template <typename Scalar>
+void test_cuda_lgamma(const Scalar stddev)
+{
+  Tensor<Scalar, 2> in(72,97);
+  in.setRandom();
+  in *= in.constant(stddev);
+  Tensor<Scalar, 2> out(72,97);
+  out.setZero();
+
+  std::size_t bytes = in.size() * sizeof(Scalar);
+
+  Scalar* d_in;
+  Scalar* d_out;
+  cudaMalloc((void**)(&d_in), bytes);
+  cudaMalloc((void**)(&d_out), bytes);
+
+  cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);
+
+  gpu_out.device(gpu_device) = gpu_in.lgamma();
+
+  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 72; ++i) {
+    for (int j = 0; j < 97; ++j) {
+      VERIFY_IS_APPROX(out(i,j), (std::lgamma)(in(i,j)));
+    }
+  }
+
+  cudaFree(d_in);
+  cudaFree(d_out);
+}
+
+template <typename Scalar>
+void test_cuda_digamma()
+{
+  Tensor<Scalar, 1> in(7);
+  Tensor<Scalar, 1> out(7);
+  Tensor<Scalar, 1> expected_out(7);
+  out.setZero();
+
+  in(0) = Scalar(1);
+  in(1) = Scalar(1.5);
+  in(2) = Scalar(4);
+  in(3) = Scalar(-10.5);
+  in(4) = Scalar(10000.5);
+  in(5) = Scalar(0);
+  in(6) = Scalar(-1);
+
+  expected_out(0) = Scalar(-0.5772156649015329);
+  expected_out(1) = Scalar(0.03648997397857645);
+  expected_out(2) = Scalar(1.2561176684318);
+  expected_out(3) = Scalar(2.398239129535781);
+  expected_out(4) = Scalar(9.210340372392849);
+  expected_out(5) = std::numeric_limits<Scalar>::infinity();
+  expected_out(6) = std::numeric_limits<Scalar>::infinity();
+
+  std::size_t bytes = in.size() * sizeof(Scalar);
+
+  Scalar* d_in;
+  Scalar* d_out;
+  cudaMalloc((void**)(&d_in), bytes);
+  cudaMalloc((void**)(&d_out), bytes);
+
+  cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 7);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7);
+
+  gpu_out.device(gpu_device) = gpu_in.digamma();
+
+  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 5; ++i) {
+    VERIFY_IS_APPROX(out(i), expected_out(i));
+  }
+  for (int i = 5; i < 7; ++i) {
+    VERIFY_IS_EQUAL(out(i), expected_out(i));
+  }
+
+  cudaFree(d_in);
+  cudaFree(d_out);
+}
+
+template <typename Scalar>
+void test_cuda_zeta()
+{
+  Tensor<Scalar, 1> in_x(6);
+  Tensor<Scalar, 1> in_q(6);
+  Tensor<Scalar, 1> out(6);
+  Tensor<Scalar, 1> expected_out(6);
+  out.setZero();
+
+  in_x(0) = Scalar(1);
+  in_x(1) = Scalar(1.5);
+  in_x(2) = Scalar(4);
+  in_x(3) = Scalar(-10.5);
+  in_x(4) = Scalar(10000.5);
+  in_x(5) = Scalar(3);
+  
+  in_q(0) = Scalar(1.2345);
+  in_q(1) = Scalar(2);
+  in_q(2) = Scalar(1.5);
+  in_q(3) = Scalar(3);
+  in_q(4) = Scalar(1.0001);
+  in_q(5) = Scalar(-2.5);
+
+  expected_out(0) = std::numeric_limits<Scalar>::infinity();
+  expected_out(1) = Scalar(1.61237534869);
+  expected_out(2) = Scalar(0.234848505667);
+  expected_out(3) = Scalar(1.03086757337e-5);
+  expected_out(4) = Scalar(0.367879440865);
+  expected_out(5) = Scalar(0.054102025820864097);
+
+  std::size_t bytes = in_x.size() * sizeof(Scalar);
+
+  Scalar* d_in_x;
+  Scalar* d_in_q;
+  Scalar* d_out;
+  cudaMalloc((void**)(&d_in_x), bytes);
+  cudaMalloc((void**)(&d_in_q), bytes);
+  cudaMalloc((void**)(&d_out), bytes);
+
+  cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in_q, in_q.data(), bytes, cudaMemcpyHostToDevice);
+  
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 6);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_q(d_in_q, 6);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 6);
+
+  gpu_out.device(gpu_device) = gpu_in_x.zeta(gpu_in_q);
+
+  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  VERIFY_IS_EQUAL(out(0), expected_out(0));
+  VERIFY((std::isnan)(out(3)));
+
+  for (int i = 1; i < 6; ++i) {
+    if (i != 3) {
+      VERIFY_IS_APPROX(out(i), expected_out(i));
+    }
+  }
+
+  cudaFree(d_in_x);
+  cudaFree(d_in_q);
+  cudaFree(d_out);
+}
+
+template <typename Scalar>
+void test_cuda_polygamma()
+{
+  Tensor<Scalar, 1> in_x(7);
+  Tensor<Scalar, 1> in_n(7);
+  Tensor<Scalar, 1> out(7);
+  Tensor<Scalar, 1> expected_out(7);
+  out.setZero();
+
+  in_n(0) = Scalar(1);
+  in_n(1) = Scalar(1);
+  in_n(2) = Scalar(1);
+  in_n(3) = Scalar(17);
+  in_n(4) = Scalar(31);
+  in_n(5) = Scalar(28);
+  in_n(6) = Scalar(8);
+  
+  in_x(0) = Scalar(2);
+  in_x(1) = Scalar(3);
+  in_x(2) = Scalar(25.5);
+  in_x(3) = Scalar(4.7);
+  in_x(4) = Scalar(11.8);
+  in_x(5) = Scalar(17.7);
+  in_x(6) = Scalar(30.2);
+
+  expected_out(0) = Scalar(0.644934066848);
+  expected_out(1) = Scalar(0.394934066848);
+  expected_out(2) = Scalar(0.0399946696496);
+  expected_out(3) = Scalar(293.334565435);
+  expected_out(4) = Scalar(0.445487887616);
+  expected_out(5) = Scalar(-2.47810300902e-07);
+  expected_out(6) = Scalar(-8.29668781082e-09);
+
+  std::size_t bytes = in_x.size() * sizeof(Scalar);
+
+  Scalar* d_in_x;
+  Scalar* d_in_n;
+  Scalar* d_out;
+  cudaMalloc((void**)(&d_in_x), bytes);
+  cudaMalloc((void**)(&d_in_n), bytes);
+  cudaMalloc((void**)(&d_out), bytes);
+
+  cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in_n, in_n.data(), bytes, cudaMemcpyHostToDevice);
+  
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 7);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_n(d_in_n, 7);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7);
+
+  gpu_out.device(gpu_device) = gpu_in_n.polygamma(gpu_in_x);
+
+  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 7; ++i) {
+    VERIFY_IS_APPROX(out(i), expected_out(i));
+  }
+
+  cudaFree(d_in_x);
+  cudaFree(d_in_n);
+  cudaFree(d_out);
+}
+
+template <typename Scalar>
+void test_cuda_igamma()
+{
+  Tensor<Scalar, 2> a(6, 6);
+  Tensor<Scalar, 2> x(6, 6);
+  Tensor<Scalar, 2> out(6, 6);
+  out.setZero();
+
+  Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
+  Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
+
+  for (int i = 0; i < 6; ++i) {
+    for (int j = 0; j < 6; ++j) {
+      a(i, j) = a_s[i];
+      x(i, j) = x_s[j];
+    }
+  }
+
+  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
+  Scalar igamma_s[][6] = {{0.0, nan, nan, nan, nan, nan},
+                          {0.0, 0.6321205588285578, 0.7768698398515702,
+                           0.9816843611112658, 9.999500016666262e-05, 1.0},
+                          {0.0, 0.4275932955291202, 0.608374823728911,
+                           0.9539882943107686, 7.522076445089201e-07, 1.0},
+                          {0.0, 0.01898815687615381, 0.06564245437845008,
+                           0.5665298796332909, 4.166333347221828e-18, 1.0},
+                          {0.0, 0.9999780593618628, 0.9999899967080838,
+                           0.9999996219837988, 0.9991370418689945, 1.0},
+                          {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}};
+
+
+
+  std::size_t bytes = a.size() * sizeof(Scalar);
+
+  Scalar* d_a;
+  Scalar* d_x;
+  Scalar* d_out;
+  assert(cudaMalloc((void**)(&d_a), bytes) == cudaSuccess);
+  assert(cudaMalloc((void**)(&d_x), bytes) == cudaSuccess);
+  assert(cudaMalloc((void**)(&d_out), bytes) == cudaSuccess);
+
+  cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6);
+
+  gpu_out.device(gpu_device) = gpu_a.igamma(gpu_x);
+
+  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 6; ++i) {
+    for (int j = 0; j < 6; ++j) {
+      if ((std::isnan)(igamma_s[i][j])) {
+        VERIFY((std::isnan)(out(i, j)));
+      } else {
+        VERIFY_IS_APPROX(out(i, j), igamma_s[i][j]);
+      }
+    }
+  }
+
+  cudaFree(d_a);
+  cudaFree(d_x);
+  cudaFree(d_out);
+}
+
+template <typename Scalar>
+void test_cuda_igammac()
+{
+  Tensor<Scalar, 2> a(6, 6);
+  Tensor<Scalar, 2> x(6, 6);
+  Tensor<Scalar, 2> out(6, 6);
+  out.setZero();
+
+  Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
+  Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
+
+  for (int i = 0; i < 6; ++i) {
+    for (int j = 0; j < 6; ++j) {
+      a(i, j) = a_s[i];
+      x(i, j) = x_s[j];
+    }
+  }
+
+  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
+  Scalar igammac_s[][6] = {{nan, nan, nan, nan, nan, nan},
+                           {1.0, 0.36787944117144233, 0.22313016014842982,
+                            0.018315638888734182, 0.9999000049998333, 0.0},
+                           {1.0, 0.5724067044708798, 0.3916251762710878,
+                            0.04601170568923136, 0.9999992477923555, 0.0},
+                           {1.0, 0.9810118431238462, 0.9343575456215499,
+                            0.4334701203667089, 1.0, 0.0},
+                           {1.0, 2.1940638138146658e-05, 1.0003291916285e-05,
+                            3.7801620118431334e-07, 0.0008629581310054535,
+                            0.0},
+                           {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}};
+
+  std::size_t bytes = a.size() * sizeof(Scalar);
+
+  Scalar* d_a;
+  Scalar* d_x;
+  Scalar* d_out;
+  cudaMalloc((void**)(&d_a), bytes);
+  cudaMalloc((void**)(&d_x), bytes);
+  cudaMalloc((void**)(&d_out), bytes);
+
+  cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6);
+
+  gpu_out.device(gpu_device) = gpu_a.igammac(gpu_x);
+
+  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 6; ++i) {
+    for (int j = 0; j < 6; ++j) {
+      if ((std::isnan)(igammac_s[i][j])) {
+        VERIFY((std::isnan)(out(i, j)));
+      } else {
+        VERIFY_IS_APPROX(out(i, j), igammac_s[i][j]);
+      }
+    }
+  }
+
+  cudaFree(d_a);
+  cudaFree(d_x);
+  cudaFree(d_out);
+}
+
+template <typename Scalar>
+void test_cuda_erf(const Scalar stddev)
+{
+  Tensor<Scalar, 2> in(72,97);
+  in.setRandom();
+  in *= in.constant(stddev);
+  Tensor<Scalar, 2> out(72,97);
+  out.setZero();
+
+  std::size_t bytes = in.size() * sizeof(Scalar);
+
+  Scalar* d_in;
+  Scalar* d_out;
+  assert(cudaMalloc((void**)(&d_in), bytes) == cudaSuccess);
+  assert(cudaMalloc((void**)(&d_out), bytes) == cudaSuccess);
+
+  cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);
+
+  gpu_out.device(gpu_device) = gpu_in.erf();
+
+  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 72; ++i) {
+    for (int j = 0; j < 97; ++j) {
+      VERIFY_IS_APPROX(out(i,j), (std::erf)(in(i,j)));
+    }
+  }
+
+  cudaFree(d_in);
+  cudaFree(d_out);
+}
+
+template <typename Scalar>
+void test_cuda_erfc(const Scalar stddev)
+{
+  Tensor<Scalar, 2> in(72,97);
+  in.setRandom();
+  in *= in.constant(stddev);
+  Tensor<Scalar, 2> out(72,97);
+  out.setZero();
+
+  std::size_t bytes = in.size() * sizeof(Scalar);
+
+  Scalar* d_in;
+  Scalar* d_out;
+  cudaMalloc((void**)(&d_in), bytes);
+  cudaMalloc((void**)(&d_out), bytes);
+
+  cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);
+
+  gpu_out.device(gpu_device) = gpu_in.erfc();
+
+  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 72; ++i) {
+    for (int j = 0; j < 97; ++j) {
+      VERIFY_IS_APPROX(out(i,j), (std::erfc)(in(i,j)));
+    }
+  }
+
+  cudaFree(d_in);
+  cudaFree(d_out);
+}
+
+template <typename Scalar>
+void test_cuda_betainc()
+{
+  Tensor<Scalar, 1> in_x(125);
+  Tensor<Scalar, 1> in_a(125);
+  Tensor<Scalar, 1> in_b(125);
+  Tensor<Scalar, 1> out(125);
+  Tensor<Scalar, 1> expected_out(125);
+  out.setZero();
+
+  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
+
+  Array<Scalar, 1, Dynamic> x(125);
+  Array<Scalar, 1, Dynamic> a(125);
+  Array<Scalar, 1, Dynamic> b(125);
+  Array<Scalar, 1, Dynamic> v(125);
+
+  a << 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
+      0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
+      0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
+      999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
+      999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
+      999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999;
+
+  b << 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
+      0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
+      999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379,
+      31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0,
+      0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
+      0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
+      999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379,
+      31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0,
+      0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
+      0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
+      999.999, 999.999, 999.999;
+
+  x << -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
+      1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
+      0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
+      0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1,
+      0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1,
+      -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
+      1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
+      0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
+      0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1;
+
+  v << nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
+      nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
+      nan, nan, 0.47972119876364683, 0.5, 0.5202788012363533, nan, nan,
+      0.9518683957740043, 0.9789663010413743, 0.9931729188073435, nan, nan,
+      0.999995949033062, 0.9999999999993698, 0.9999999999999999, nan, nan,
+      0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, nan,
+      nan, nan, nan, nan, 0.006827081192655869, 0.0210336989586256,
+      0.04813160422599567, nan, nan, 0.20014344256217678, 0.5000000000000001,
+      0.7998565574378232, nan, nan, 0.9991401428435834, 0.999999999698403,
+      0.9999999999999999, nan, nan, 0.9999999999999999, 0.9999999999999999,
+      0.9999999999999999, nan, nan, nan, nan, nan, nan, nan,
+      1.0646600232370887e-25, 6.301722877826246e-13, 4.050966937974938e-06, nan,
+      nan, 7.864342668429763e-23, 3.015969667594166e-10, 0.0008598571564165444,
+      nan, nan, 6.031987710123844e-08, 0.5000000000000007, 0.9999999396801229,
+      nan, nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan,
+      nan, nan, nan, nan, nan, nan, 0.0, 7.029920380986636e-306,
+      2.2450728208591345e-101, nan, nan, 0.0, 9.275871147869727e-302,
+      1.2232913026152827e-97, nan, nan, 0.0, 3.0891393081932924e-252,
+      2.9303043666183996e-60, nan, nan, 2.248913486879199e-196,
+      0.5000000000004947, 0.9999999999999999, nan;
+
+  for (int i = 0; i < 125; ++i) {
+    in_x(i) = x(i);
+    in_a(i) = a(i);
+    in_b(i) = b(i);
+    expected_out(i) = v(i);
+  }
+
+  std::size_t bytes = in_x.size() * sizeof(Scalar);
+
+  Scalar* d_in_x;
+  Scalar* d_in_a;
+  Scalar* d_in_b;
+  Scalar* d_out;
+  cudaMalloc((void**)(&d_in_x), bytes);
+  cudaMalloc((void**)(&d_in_a), bytes);
+  cudaMalloc((void**)(&d_in_b), bytes);
+  cudaMalloc((void**)(&d_out), bytes);
+
+  cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in_a, in_a.data(), bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in_b, in_b.data(), bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 125);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_a(d_in_a, 125);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_b(d_in_b, 125);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 125);
+
+  gpu_out.device(gpu_device) = betainc(gpu_in_a, gpu_in_b, gpu_in_x);
+
+  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 1; i < 125; ++i) {
+    if ((std::isnan)(expected_out(i))) {
+      VERIFY((std::isnan)(out(i)));
+    } else {
+      VERIFY_IS_APPROX(out(i), expected_out(i));
+    }
+  }
+
+  cudaFree(d_in_x);
+  cudaFree(d_in_a);
+  cudaFree(d_in_b);
+  cudaFree(d_out);
+}
+
+
+void test_cxx11_tensor_cuda()
+{
+  CALL_SUBTEST_1(test_cuda_nullary());
+  CALL_SUBTEST_1(test_cuda_elementwise_small());
+  CALL_SUBTEST_1(test_cuda_elementwise());
+  CALL_SUBTEST_1(test_cuda_props());
+  CALL_SUBTEST_1(test_cuda_reduction());
+  CALL_SUBTEST_2(test_cuda_contraction<ColMajor>());
+  CALL_SUBTEST_2(test_cuda_contraction<RowMajor>());
+  CALL_SUBTEST_3(test_cuda_convolution_1d<ColMajor>());
+  CALL_SUBTEST_3(test_cuda_convolution_1d<RowMajor>());
+  CALL_SUBTEST_3(test_cuda_convolution_inner_dim_col_major_1d());
+  CALL_SUBTEST_3(test_cuda_convolution_inner_dim_row_major_1d());
+  CALL_SUBTEST_3(test_cuda_convolution_2d<ColMajor>());
+  CALL_SUBTEST_3(test_cuda_convolution_2d<RowMajor>());
+  CALL_SUBTEST_3(test_cuda_convolution_3d<ColMajor>());
+  CALL_SUBTEST_3(test_cuda_convolution_3d<RowMajor>());
+
+#if __cplusplus > 199711L
+  // std::erf, std::erfc, and so on where only added in c++11. We use them
+  // as a golden reference to validate the results produced by Eigen. Therefore
+  // we can only run these tests if we use a c++11 compiler.
+  CALL_SUBTEST_4(test_cuda_lgamma<float>(1.0f));
+  CALL_SUBTEST_4(test_cuda_lgamma<float>(100.0f));
+  CALL_SUBTEST_4(test_cuda_lgamma<float>(0.01f));
+  CALL_SUBTEST_4(test_cuda_lgamma<float>(0.001f));
+
+  CALL_SUBTEST_4(test_cuda_lgamma<double>(1.0));
+  CALL_SUBTEST_4(test_cuda_lgamma<double>(100.0));
+  CALL_SUBTEST_4(test_cuda_lgamma<double>(0.01));
+  CALL_SUBTEST_4(test_cuda_lgamma<double>(0.001));
+
+  CALL_SUBTEST_4(test_cuda_erf<float>(1.0f));
+  CALL_SUBTEST_4(test_cuda_erf<float>(100.0f));
+  CALL_SUBTEST_4(test_cuda_erf<float>(0.01f));
+  CALL_SUBTEST_4(test_cuda_erf<float>(0.001f));
+
+  CALL_SUBTEST_4(test_cuda_erfc<float>(1.0f));
+  // CALL_SUBTEST(test_cuda_erfc<float>(100.0f));
+  CALL_SUBTEST_4(test_cuda_erfc<float>(5.0f)); // CUDA erfc lacks precision for large inputs
+  CALL_SUBTEST_4(test_cuda_erfc<float>(0.01f));
+  CALL_SUBTEST_4(test_cuda_erfc<float>(0.001f));
+
+  CALL_SUBTEST_4(test_cuda_erf<double>(1.0));
+  CALL_SUBTEST_4(test_cuda_erf<double>(100.0));
+  CALL_SUBTEST_4(test_cuda_erf<double>(0.01));
+  CALL_SUBTEST_4(test_cuda_erf<double>(0.001));
+
+  CALL_SUBTEST_4(test_cuda_erfc<double>(1.0));
+  // CALL_SUBTEST(test_cuda_erfc<double>(100.0));
+  CALL_SUBTEST_4(test_cuda_erfc<double>(5.0)); // CUDA erfc lacks precision for large inputs
+  CALL_SUBTEST_4(test_cuda_erfc<double>(0.01));
+  CALL_SUBTEST_4(test_cuda_erfc<double>(0.001));
+
+  CALL_SUBTEST_5(test_cuda_digamma<float>());
+  CALL_SUBTEST_5(test_cuda_digamma<double>());
+
+  CALL_SUBTEST_5(test_cuda_polygamma<float>());
+  CALL_SUBTEST_5(test_cuda_polygamma<double>());
+
+  CALL_SUBTEST_5(test_cuda_zeta<float>());
+  CALL_SUBTEST_5(test_cuda_zeta<double>());
+
+  CALL_SUBTEST_5(test_cuda_igamma<float>());
+  CALL_SUBTEST_5(test_cuda_igammac<float>());
+
+  CALL_SUBTEST_5(test_cuda_igamma<double>());
+  CALL_SUBTEST_5(test_cuda_igammac<double>());
+
+  CALL_SUBTEST_6(test_cuda_betainc<float>());
+  CALL_SUBTEST_6(test_cuda_betainc<double>());
+#endif
+}
diff --git a/unsupported/test/cxx11_tensor_custom_index.cpp b/unsupported/test/cxx11_tensor_custom_index.cpp
new file mode 100644
index 000000000..4528cc176
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_custom_index.cpp
@@ -0,0 +1,100 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <limits>
+#include <map>
+
+#include <Eigen/Dense>
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+
+template <int DataLayout>
+static void test_map_as_index()
+{
+#ifdef EIGEN_HAS_SFINAE
+  Tensor<float, 4, DataLayout> tensor(2, 3, 5, 7);
+  tensor.setRandom();
+
+  using NormalIndex = DSizes<ptrdiff_t, 4>;
+  using CustomIndex = std::map<ptrdiff_t, ptrdiff_t>;
+  CustomIndex coeffC;
+  coeffC[0] = 1;
+  coeffC[1] = 2;
+  coeffC[2] = 4;
+  coeffC[3] = 1;
+  NormalIndex coeff(1,2,4,1);
+
+  VERIFY_IS_EQUAL(tensor.coeff(coeffC), tensor.coeff(coeff));
+  VERIFY_IS_EQUAL(tensor.coeffRef(coeffC), tensor.coeffRef(coeff));
+#endif
+}
+
+
+template <int DataLayout>
+static void test_matrix_as_index()
+{
+#ifdef EIGEN_HAS_SFINAE
+  Tensor<float, 4, DataLayout> tensor(2, 3, 5, 7);
+  tensor.setRandom();
+
+  using NormalIndex = DSizes<ptrdiff_t, 4>;
+  using CustomIndex = Matrix<unsigned int, 4, 1>;
+  CustomIndex coeffC(1,2,4,1);
+  NormalIndex coeff(1,2,4,1);
+
+  VERIFY_IS_EQUAL(tensor.coeff(coeffC), tensor.coeff(coeff));
+  VERIFY_IS_EQUAL(tensor.coeffRef(coeffC), tensor.coeffRef(coeff));
+#endif
+}
+
+
+template <int DataLayout>
+static void test_varlist_as_index()
+{
+#ifdef EIGEN_HAS_SFINAE
+  Tensor<float, 4, DataLayout> tensor(2, 3, 5, 7);
+  tensor.setRandom();
+
+  DSizes<ptrdiff_t, 4> coeff(1,2,4,1);
+
+  VERIFY_IS_EQUAL(tensor.coeff({1,2,4,1}), tensor.coeff(coeff));
+  VERIFY_IS_EQUAL(tensor.coeffRef({1,2,4,1}), tensor.coeffRef(coeff));
+#endif
+}
+
+
+template <int DataLayout>
+static void test_sizes_as_index()
+{
+#ifdef EIGEN_HAS_SFINAE
+  Tensor<float, 4, DataLayout> tensor(2, 3, 5, 7);
+  tensor.setRandom();
+
+  DSizes<ptrdiff_t, 4> coeff(1,2,4,1);
+  Sizes<1,2,4,1> coeffC;
+
+  VERIFY_IS_EQUAL(tensor.coeff(coeffC), tensor.coeff(coeff));
+  VERIFY_IS_EQUAL(tensor.coeffRef(coeffC), tensor.coeffRef(coeff));
+#endif
+}
+
+
+void test_cxx11_tensor_custom_index() {
+  test_map_as_index<ColMajor>();
+  test_map_as_index<RowMajor>();
+  test_matrix_as_index<ColMajor>();
+  test_matrix_as_index<RowMajor>();
+  test_varlist_as_index<ColMajor>();
+  test_varlist_as_index<RowMajor>();
+  test_sizes_as_index<ColMajor>();
+  test_sizes_as_index<RowMajor>();
+}
diff --git a/unsupported/test/cxx11_tensor_custom_op.cpp b/unsupported/test/cxx11_tensor_custom_op.cpp
new file mode 100644
index 000000000..8baa477cc
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_custom_op.cpp
@@ -0,0 +1,111 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+
+struct InsertZeros {
+  DSizes<DenseIndex, 2> dimensions(const Tensor<float, 2>& input) const {
+    DSizes<DenseIndex, 2> result;
+    result[0] = input.dimension(0) * 2;
+    result[1] = input.dimension(1) * 2;
+    return result;
+  }
+
+  template <typename Output, typename Device>
+  void eval(const Tensor<float, 2>& input, Output& output, const Device& device) const
+  {
+    array<DenseIndex, 2> strides;
+    strides[0] = 2;
+    strides[1] = 2;
+    output.stride(strides).device(device) = input;
+
+    Eigen::DSizes<DenseIndex, 2> offsets(1,1);
+    Eigen::DSizes<DenseIndex, 2> extents(output.dimension(0)-1, output.dimension(1)-1);
+    output.slice(offsets, extents).stride(strides).device(device) = input.constant(0.0f);
+  }
+};
+
+static void test_custom_unary_op()
+{
+  Tensor<float, 2> tensor(3,5);
+  tensor.setRandom();
+
+  Tensor<float, 2> result = tensor.customOp(InsertZeros());
+  VERIFY_IS_EQUAL(result.dimension(0), 6);
+  VERIFY_IS_EQUAL(result.dimension(1), 10);
+
+  for (int i = 0; i < 6; i+=2) {
+    for (int j = 0; j < 10; j+=2) {
+      VERIFY_IS_EQUAL(result(i, j), tensor(i/2, j/2));
+    }
+  }
+  for (int i = 1; i < 6; i+=2) {
+    for (int j = 1; j < 10; j+=2) {
+      VERIFY_IS_EQUAL(result(i, j), 0);
+    }
+  }
+}
+
+
+struct BatchMatMul {
+  DSizes<DenseIndex, 3> dimensions(const Tensor<float, 3>& input1, const Tensor<float, 3>& input2) const {
+    DSizes<DenseIndex, 3> result;
+    result[0] = input1.dimension(0);
+    result[1] = input2.dimension(1);
+    result[2] = input2.dimension(2);
+    return result;
+  }
+
+  template <typename Output, typename Device>
+  void eval(const Tensor<float, 3>& input1, const Tensor<float, 3>& input2,
+            Output& output, const Device& device) const
+  {
+    typedef Tensor<float, 3>::DimensionPair DimPair;
+    array<DimPair, 1> dims;
+    dims[0] = DimPair(1, 0);
+    for (int i = 0; i < output.dimension(2); ++i) {
+      output.template chip<2>(i).device(device) = input1.chip<2>(i).contract(input2.chip<2>(i), dims);
+    }
+  }
+};
+
+
+static void test_custom_binary_op()
+{
+  Tensor<float, 3> tensor1(2,3,5);
+  tensor1.setRandom();
+  Tensor<float, 3> tensor2(3,7,5);
+  tensor2.setRandom();
+
+  Tensor<float, 3> result = tensor1.customOp(tensor2, BatchMatMul());
+  for (int i = 0; i < 5; ++i) {
+    typedef Tensor<float, 3>::DimensionPair DimPair;
+    array<DimPair, 1> dims;
+    dims[0] = DimPair(1, 0);
+    Tensor<float, 2> reference = tensor1.chip<2>(i).contract(tensor2.chip<2>(i), dims);
+    TensorRef<Tensor<float, 2> > val = result.chip<2>(i);
+    for (int j = 0; j < 2; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(val(j, k), reference(j, k));
+      }
+    }
+  }
+}
+
+
+void test_cxx11_tensor_custom_op()
+{
+  CALL_SUBTEST(test_custom_unary_op());
+  CALL_SUBTEST(test_custom_binary_op());
+}
diff --git a/unsupported/test/cxx11_tensor_device.cu b/unsupported/test/cxx11_tensor_device.cu
new file mode 100644
index 000000000..fde20ddf2
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_device.cu
@@ -0,0 +1,390 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_device
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_GPU
+
+#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+#include <cuda_fp16.h>
+#endif
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+// Context for evaluation on cpu
+struct CPUContext {
+  CPUContext(const Eigen::Tensor<float, 3>& in1, Eigen::Tensor<float, 3>& in2, Eigen::Tensor<float, 3>& out) : in1_(in1), in2_(in2), out_(out), kernel_1d_(2), kernel_2d_(2,2), kernel_3d_(2,2,2) {
+    kernel_1d_(0) = 3.14f;
+    kernel_1d_(1) = 2.7f;
+
+    kernel_2d_(0,0) = 3.14f;
+    kernel_2d_(1,0) = 2.7f;
+    kernel_2d_(0,1) = 0.2f;
+    kernel_2d_(1,1) = 7.0f;
+
+    kernel_3d_(0,0,0) = 3.14f;
+    kernel_3d_(0,1,0) = 2.7f;
+    kernel_3d_(0,0,1) = 0.2f;
+    kernel_3d_(0,1,1) = 7.0f;
+    kernel_3d_(1,0,0) = -1.0f;
+    kernel_3d_(1,1,0) = -0.3f;
+    kernel_3d_(1,0,1) = -0.7f;
+    kernel_3d_(1,1,1) = -0.5f;
+  }
+
+  const Eigen::DefaultDevice& device() const { return cpu_device_; }
+
+  const Eigen::Tensor<float, 3>& in1() const { return in1_; }
+  const Eigen::Tensor<float, 3>& in2() const { return in2_; }
+  Eigen::Tensor<float, 3>& out() { return out_; }
+  const Eigen::Tensor<float, 1>& kernel1d() const { return kernel_1d_; }
+  const Eigen::Tensor<float, 2>& kernel2d() const { return kernel_2d_; }
+  const Eigen::Tensor<float, 3>& kernel3d() const { return kernel_3d_; }
+
+ private:
+  const Eigen::Tensor<float, 3>& in1_;
+  const Eigen::Tensor<float, 3>& in2_;
+  Eigen::Tensor<float, 3>& out_;
+
+  Eigen::Tensor<float, 1> kernel_1d_;
+  Eigen::Tensor<float, 2> kernel_2d_;
+  Eigen::Tensor<float, 3> kernel_3d_;
+
+  Eigen::DefaultDevice cpu_device_;
+};
+
+
+// Context for evaluation on GPU
+struct GPUContext {
+  GPUContext(const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1, Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2, Eigen::TensorMap<Eigen::Tensor<float, 3> >& out) : in1_(in1), in2_(in2), out_(out), gpu_device_(&stream_) {
+    assert(cudaMalloc((void**)(&kernel_1d_), 2*sizeof(float)) == cudaSuccess);
+    float kernel_1d_val[] = {3.14f, 2.7f};
+    assert(cudaMemcpy(kernel_1d_, kernel_1d_val, 2*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess);
+
+    assert(cudaMalloc((void**)(&kernel_2d_), 4*sizeof(float)) == cudaSuccess);
+    float kernel_2d_val[] = {3.14f, 2.7f, 0.2f, 7.0f};
+    assert(cudaMemcpy(kernel_2d_, kernel_2d_val, 4*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess);
+
+    assert(cudaMalloc((void**)(&kernel_3d_), 8*sizeof(float)) == cudaSuccess);
+    float kernel_3d_val[] = {3.14f, -1.0f, 2.7f, -0.3f, 0.2f, -0.7f, 7.0f, -0.5f};
+    assert(cudaMemcpy(kernel_3d_, kernel_3d_val, 8*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess);
+  }
+  ~GPUContext() {
+    assert(cudaFree(kernel_1d_) == cudaSuccess);
+    assert(cudaFree(kernel_2d_) == cudaSuccess);
+    assert(cudaFree(kernel_3d_) == cudaSuccess);
+  }
+
+  const Eigen::GpuDevice& device() const { return gpu_device_; }
+
+  const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1() const { return in1_; }
+  const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2() const { return in2_; }
+  Eigen::TensorMap<Eigen::Tensor<float, 3> >& out() { return out_; }
+  Eigen::TensorMap<Eigen::Tensor<float, 1> > kernel1d() const { return Eigen::TensorMap<Eigen::Tensor<float, 1> >(kernel_1d_, 2); }
+  Eigen::TensorMap<Eigen::Tensor<float, 2> > kernel2d() const { return Eigen::TensorMap<Eigen::Tensor<float, 2> >(kernel_2d_, 2, 2); }
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > kernel3d() const { return Eigen::TensorMap<Eigen::Tensor<float, 3> >(kernel_3d_, 2, 2, 2); }
+
+ private:
+  const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1_;
+  const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2_;
+  Eigen::TensorMap<Eigen::Tensor<float, 3> >& out_;
+
+  float* kernel_1d_;
+  float* kernel_2d_;
+  float* kernel_3d_;
+
+  Eigen::CudaStreamDevice stream_;
+  Eigen::GpuDevice gpu_device_;
+};
+
+
+// The actual expression to evaluate
+template <typename Context>
+void test_contextual_eval(Context* context)
+{
+  context->out().device(context->device()) = context->in1() + context->in2() * 3.14f + context->in1().constant(2.718f);
+}
+
+template <typename Context>
+void test_forced_contextual_eval(Context* context)
+{
+  context->out().device(context->device()) = (context->in1() + context->in2()).eval() * 3.14f + context->in1().constant(2.718f);
+}
+
+template <typename Context>
+void test_compound_assignment(Context* context)
+{
+  context->out().device(context->device()) = context->in1().constant(2.718f);
+  context->out().device(context->device()) += context->in1() + context->in2() * 3.14f;
+}
+
+
+template <typename Context>
+void test_contraction(Context* context)
+{
+  Eigen::array<std::pair<int, int>, 2> dims;
+  dims[0] = std::make_pair(1, 1);
+  dims[1] = std::make_pair(2, 2);
+
+  Eigen::array<int, 2> shape(40, 50*70);
+
+  Eigen::DSizes<int, 2> indices(0,0);
+  Eigen::DSizes<int, 2> sizes(40,40);
+
+  context->out().reshape(shape).slice(indices, sizes).device(context->device()) = context->in1().contract(context->in2(), dims);
+}
+
+
+template <typename Context>
+void test_1d_convolution(Context* context)
+{
+  Eigen::DSizes<int, 3> indices(0,0,0);
+  Eigen::DSizes<int, 3> sizes(40,49,70);
+
+  Eigen::array<int, 1> dims(1);
+  context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel1d(), dims);
+}
+
+template <typename Context>
+void test_2d_convolution(Context* context)
+{
+  Eigen::DSizes<int, 3> indices(0,0,0);
+  Eigen::DSizes<int, 3> sizes(40,49,69);
+
+  Eigen::array<int, 2> dims(1,2);
+  context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel2d(), dims);
+}
+
+template <typename Context>
+void test_3d_convolution(Context* context)
+{
+  Eigen::DSizes<int, 3> indices(0,0,0);
+  Eigen::DSizes<int, 3> sizes(39,49,69);
+
+  Eigen::array<int, 3> dims(0,1,2);
+  context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel3d(), dims);
+}
+
+
+void test_cpu() {
+  Eigen::Tensor<float, 3> in1(40,50,70);
+  Eigen::Tensor<float, 3> in2(40,50,70);
+  Eigen::Tensor<float, 3> out(40,50,70);
+
+  in1 = in1.random() + in1.constant(10.0f);
+  in2 = in2.random() + in2.constant(10.0f);
+
+  CPUContext context(in1, in2, out);
+  test_contextual_eval(&context);
+  for (int i = 0; i < 40; ++i) {
+    for (int j = 0; j < 50; ++j) {
+      for (int k = 0; k < 70; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
+      }
+    }
+  }
+
+  test_forced_contextual_eval(&context);
+  for (int i = 0; i < 40; ++i) {
+    for (int j = 0; j < 50; ++j) {
+      for (int k = 0; k < 70; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f);
+      }
+    }
+  }
+
+  test_compound_assignment(&context);
+  for (int i = 0; i < 40; ++i) {
+    for (int j = 0; j < 50; ++j) {
+      for (int k = 0; k < 70; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
+      }
+    }
+  }
+
+  test_contraction(&context);
+  for (int i = 0; i < 40; ++i) {
+    for (int j = 0; j < 40; ++j) {
+      const float result = out(i,j,0);
+      float expected = 0;
+      for (int k = 0; k < 50; ++k) {
+        for (int l = 0; l < 70; ++l) {
+          expected += in1(i, k, l) * in2(j, k, l);
+        }
+      }
+      VERIFY_IS_APPROX(expected, result);
+    }
+  }
+
+  test_1d_convolution(&context);
+  for (int i = 0; i < 40; ++i) {
+    for (int j = 0; j < 49; ++j) {
+      for (int k = 0; k < 70; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f));
+      }
+    }
+  }
+
+  test_2d_convolution(&context);
+  for (int i = 0; i < 40; ++i) {
+    for (int j = 0; j < 49; ++j) {
+      for (int k = 0; k < 69; ++k) {
+        const float result = out(i,j,k);
+        const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f) +
+                               (in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f);
+        if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) {
+          continue;
+        }
+        VERIFY_IS_APPROX(expected, result);
+      }
+    }
+  }
+
+  test_3d_convolution(&context);
+  for (int i = 0; i < 39; ++i) {
+    for (int j = 0; j < 49; ++j) {
+      for (int k = 0; k < 69; ++k) {
+        const float result = out(i,j,k);
+        const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
+                                in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f) +
+                               (in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f +
+                                in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f);
+        if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) {
+          continue;
+        }
+        VERIFY_IS_APPROX(expected, result);
+      }
+    }
+  }
+}
+
+void test_gpu() {
+  Eigen::Tensor<float, 3> in1(40,50,70);
+  Eigen::Tensor<float, 3> in2(40,50,70);
+  Eigen::Tensor<float, 3> out(40,50,70);
+  in1 = in1.random() + in1.constant(10.0f);
+  in2 = in2.random() + in2.constant(10.0f);
+
+  std::size_t in1_bytes = in1.size() * sizeof(float);
+  std::size_t in2_bytes = in2.size() * sizeof(float);
+  std::size_t out_bytes = out.size() * sizeof(float);
+
+  float* d_in1;
+  float* d_in2;
+  float* d_out;
+  cudaMalloc((void**)(&d_in1), in1_bytes);
+  cudaMalloc((void**)(&d_in2), in2_bytes);
+  cudaMalloc((void**)(&d_out), out_bytes);
+
+  cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, 40,50,70);
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, 40,50,70);
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, 40,50,70);
+
+  GPUContext context(gpu_in1, gpu_in2, gpu_out);
+  test_contextual_eval(&context);
+  assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
+  for (int i = 0; i < 40; ++i) {
+    for (int j = 0; j < 50; ++j) {
+      for (int k = 0; k < 70; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
+      }
+    }
+  }
+
+  test_forced_contextual_eval(&context);
+  assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
+  for (int i = 0; i < 40; ++i) {
+    for (int j = 0; j < 50; ++j) {
+      for (int k = 0; k < 70; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f);
+      }
+    }
+  }
+
+  test_compound_assignment(&context);
+  assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
+  for (int i = 0; i < 40; ++i) {
+    for (int j = 0; j < 50; ++j) {
+      for (int k = 0; k < 70; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
+      }
+    }
+  }
+
+  test_contraction(&context);
+  assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
+  for (int i = 0; i < 40; ++i) {
+    for (int j = 0; j < 40; ++j) {
+      const float result = out(i,j,0);
+      float expected = 0;
+      for (int k = 0; k < 50; ++k) {
+        for (int l = 0; l < 70; ++l) {
+          expected += in1(i, k, l) * in2(j, k, l);
+        }
+      }
+      VERIFY_IS_APPROX(expected, result);
+    }
+  }
+
+  test_1d_convolution(&context);
+  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess);
+  assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess);
+  for (int i = 0; i < 40; ++i) {
+    for (int j = 0; j < 49; ++j) {
+      for (int k = 0; k < 70; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f));
+      }
+    }
+  }
+
+  test_2d_convolution(&context);
+  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess);
+  assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess);
+  for (int i = 0; i < 40; ++i) {
+    for (int j = 0; j < 49; ++j) {
+      for (int k = 0; k < 69; ++k) {
+        const float result = out(i,j,k);
+        const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
+                                in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f);
+        VERIFY_IS_APPROX(expected, result);
+      }
+    }
+  }
+
+  test_3d_convolution(&context);
+  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess);
+  assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess);
+  for (int i = 0; i < 39; ++i) {
+    for (int j = 0; j < 49; ++j) {
+      for (int k = 0; k < 69; ++k) {
+       const float result = out(i,j,k);
+        const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
+                                in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f +
+                                in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f +
+                                in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f);
+        VERIFY_IS_APPROX(expected, result);
+      }
+    }
+  }
+}
+
+
+void test_cxx11_tensor_device()
+{
+  CALL_SUBTEST_1(test_cpu());
+  CALL_SUBTEST_2(test_gpu());
+}
diff --git a/unsupported/test/cxx11_tensor_device_sycl.cpp b/unsupported/test/cxx11_tensor_device_sycl.cpp
new file mode 100644
index 000000000..7f79753c5
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_device_sycl.cpp
@@ -0,0 +1,31 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_device_sycl
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+void test_device_sycl(const Eigen::SyclDevice &sycl_device) {
+  std::cout <<"Helo from ComputeCpp: the requested device exists and the device name is : "
+    << sycl_device.m_queue.get_device(). template get_info<cl::sycl::info::device::name>() <<std::endl;;
+}
+void test_cxx11_tensor_device_sycl() {
+  cl::sycl::gpu_selector s;
+  Eigen::SyclDevice sycl_device(s);
+  CALL_SUBTEST(test_device_sycl(sycl_device));
+}
diff --git a/unsupported/test/cxx11_tensor_dimension.cpp b/unsupported/test/cxx11_tensor_dimension.cpp
new file mode 100644
index 000000000..16f168ed4
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_dimension.cpp
@@ -0,0 +1,69 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+
+static void test_dynamic_size()
+{
+  Eigen::DSizes<int, 3> dimensions(2,3,7);
+
+  VERIFY_IS_EQUAL((int)Eigen::internal::array_get<0>(dimensions), 2);
+  VERIFY_IS_EQUAL((int)Eigen::internal::array_get<1>(dimensions), 3);
+  VERIFY_IS_EQUAL((int)Eigen::internal::array_get<2>(dimensions), 7);
+  VERIFY_IS_EQUAL((int)dimensions.TotalSize(), 2*3*7);
+  VERIFY_IS_EQUAL((int)dimensions[0], 2);
+  VERIFY_IS_EQUAL((int)dimensions[1], 3);
+  VERIFY_IS_EQUAL((int)dimensions[2], 7);
+}
+
+static void test_fixed_size()
+{
+  Eigen::Sizes<2,3,7> dimensions;
+
+  VERIFY_IS_EQUAL((int)Eigen::internal::array_get<0>(dimensions), 2);
+  VERIFY_IS_EQUAL((int)Eigen::internal::array_get<1>(dimensions), 3);
+  VERIFY_IS_EQUAL((int)Eigen::internal::array_get<2>(dimensions), 7);
+  VERIFY_IS_EQUAL((int)dimensions.TotalSize(), 2*3*7);
+}
+
+static void test_match()
+{
+  Eigen::DSizes<unsigned int, 3> dyn((unsigned int)2,(unsigned int)3,(unsigned int)7);
+  Eigen::Sizes<2,3,7> stat;
+  VERIFY_IS_EQUAL(Eigen::dimensions_match(dyn, stat), true);
+
+  Eigen::DSizes<int, 3> dyn1(2,3,7);
+  Eigen::DSizes<int, 2> dyn2(2,3);
+  VERIFY_IS_EQUAL(Eigen::dimensions_match(dyn1, dyn2), false);
+}
+
+static void test_rank_zero()
+{
+  Eigen::Sizes<> scalar;
+  VERIFY_IS_EQUAL((int)scalar.TotalSize(), 1);
+  VERIFY_IS_EQUAL((int)scalar.rank(), 0);
+  VERIFY_IS_EQUAL((int)internal::array_prod(scalar), 1);
+
+  Eigen::DSizes<ptrdiff_t, 0> dscalar;
+  VERIFY_IS_EQUAL((int)dscalar.TotalSize(), 1);
+  VERIFY_IS_EQUAL((int)dscalar.rank(), 0);
+}
+
+void test_cxx11_tensor_dimension()
+{
+  CALL_SUBTEST(test_dynamic_size());
+  CALL_SUBTEST(test_fixed_size());
+  CALL_SUBTEST(test_match());
+  CALL_SUBTEST(test_rank_zero());
+}
diff --git a/unsupported/test/cxx11_tensor_empty.cpp b/unsupported/test/cxx11_tensor_empty.cpp
new file mode 100644
index 000000000..d7eea42d7
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_empty.cpp
@@ -0,0 +1,40 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+
+static void test_empty_tensor()
+{
+  Tensor<float, 2> source;
+  Tensor<float, 2> tgt1 = source;
+  Tensor<float, 2> tgt2(source);
+  Tensor<float, 2> tgt3;
+  tgt3 = tgt1;
+  tgt3 = tgt2;
+}
+
+static void test_empty_fixed_size_tensor()
+{
+  TensorFixedSize<float, Sizes<0> > source;
+  TensorFixedSize<float, Sizes<0> > tgt1 = source;
+  TensorFixedSize<float, Sizes<0> > tgt2(source);
+  TensorFixedSize<float, Sizes<0> > tgt3;
+  tgt3 = tgt1;
+  tgt3 = tgt2;
+}
+
+
+void test_cxx11_tensor_empty()
+{
+   CALL_SUBTEST(test_empty_tensor());
+   CALL_SUBTEST(test_empty_fixed_size_tensor());
+}
diff --git a/unsupported/test/cxx11_tensor_expr.cpp b/unsupported/test/cxx11_tensor_expr.cpp
new file mode 100644
index 000000000..77e24cb67
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_expr.cpp
@@ -0,0 +1,314 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+static void test_1d()
+{
+  Tensor<float, 1> vec1(6);
+  Tensor<float, 1, RowMajor> vec2(6);
+
+  vec1(0) = 4.0;  vec2(0) = 0.0;
+  vec1(1) = 8.0;  vec2(1) = 1.0;
+  vec1(2) = 15.0; vec2(2) = 2.0;
+  vec1(3) = 16.0; vec2(3) = 3.0;
+  vec1(4) = 23.0; vec2(4) = 4.0;
+  vec1(5) = 42.0; vec2(5) = 5.0;
+
+  float data3[6];
+  TensorMap<Tensor<float, 1>> vec3(data3, 6);
+  vec3 = vec1.sqrt();
+  float data4[6];
+  TensorMap<Tensor<float, 1, RowMajor>> vec4(data4, 6);
+  vec4 = vec2.square();
+  float data5[6];
+  TensorMap<Tensor<float, 1, RowMajor>> vec5(data5, 6);
+  vec5 = vec2.cube();
+
+  VERIFY_IS_APPROX(vec3(0), sqrtf(4.0));
+  VERIFY_IS_APPROX(vec3(1), sqrtf(8.0));
+  VERIFY_IS_APPROX(vec3(2), sqrtf(15.0));
+  VERIFY_IS_APPROX(vec3(3), sqrtf(16.0));
+  VERIFY_IS_APPROX(vec3(4), sqrtf(23.0));
+  VERIFY_IS_APPROX(vec3(5), sqrtf(42.0));
+
+  VERIFY_IS_APPROX(vec4(0), 0.0f);
+  VERIFY_IS_APPROX(vec4(1), 1.0f);
+  VERIFY_IS_APPROX(vec4(2), 2.0f * 2.0f);
+  VERIFY_IS_APPROX(vec4(3), 3.0f * 3.0f);
+  VERIFY_IS_APPROX(vec4(4), 4.0f * 4.0f);
+  VERIFY_IS_APPROX(vec4(5), 5.0f * 5.0f);
+
+  VERIFY_IS_APPROX(vec5(0), 0.0f);
+  VERIFY_IS_APPROX(vec5(1), 1.0f);
+  VERIFY_IS_APPROX(vec5(2), 2.0f * 2.0f * 2.0f);
+  VERIFY_IS_APPROX(vec5(3), 3.0f * 3.0f * 3.0f);
+  VERIFY_IS_APPROX(vec5(4), 4.0f * 4.0f * 4.0f);
+  VERIFY_IS_APPROX(vec5(5), 5.0f * 5.0f * 5.0f);
+
+  vec3 = vec1 + vec2;
+  VERIFY_IS_APPROX(vec3(0), 4.0f + 0.0f);
+  VERIFY_IS_APPROX(vec3(1), 8.0f + 1.0f);
+  VERIFY_IS_APPROX(vec3(2), 15.0f + 2.0f);
+  VERIFY_IS_APPROX(vec3(3), 16.0f + 3.0f);
+  VERIFY_IS_APPROX(vec3(4), 23.0f + 4.0f);
+  VERIFY_IS_APPROX(vec3(5), 42.0f + 5.0f);
+}
+
+static void test_2d()
+{
+  float data1[6];
+  TensorMap<Tensor<float, 2>> mat1(data1, 2, 3);
+  float data2[6];
+  TensorMap<Tensor<float, 2, RowMajor>> mat2(data2, 2, 3);
+
+  mat1(0,0) = 0.0;
+  mat1(0,1) = 1.0;
+  mat1(0,2) = 2.0;
+  mat1(1,0) = 3.0;
+  mat1(1,1) = 4.0;
+  mat1(1,2) = 5.0;
+
+  mat2(0,0) = -0.0;
+  mat2(0,1) = -1.0;
+  mat2(0,2) = -2.0;
+  mat2(1,0) = -3.0;
+  mat2(1,1) = -4.0;
+  mat2(1,2) = -5.0;
+
+  Tensor<float, 2> mat3(2,3);
+  Tensor<float, 2, RowMajor> mat4(2,3);
+  mat3 = mat1.abs();
+  mat4 = mat2.abs();
+
+  VERIFY_IS_APPROX(mat3(0,0), 0.0f);
+  VERIFY_IS_APPROX(mat3(0,1), 1.0f);
+  VERIFY_IS_APPROX(mat3(0,2), 2.0f);
+  VERIFY_IS_APPROX(mat3(1,0), 3.0f);
+  VERIFY_IS_APPROX(mat3(1,1), 4.0f);
+  VERIFY_IS_APPROX(mat3(1,2), 5.0f);
+
+  VERIFY_IS_APPROX(mat4(0,0), 0.0f);
+  VERIFY_IS_APPROX(mat4(0,1), 1.0f);
+  VERIFY_IS_APPROX(mat4(0,2), 2.0f);
+  VERIFY_IS_APPROX(mat4(1,0), 3.0f);
+  VERIFY_IS_APPROX(mat4(1,1), 4.0f);
+  VERIFY_IS_APPROX(mat4(1,2), 5.0f);
+}
+
+static void test_3d()
+{
+  Tensor<float, 3> mat1(2,3,7);
+  Tensor<float, 3, RowMajor> mat2(2,3,7);
+
+  float val = 1.0f;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        mat1(i,j,k) = val;
+        mat2(i,j,k) = val;
+        val += 1.0f;
+      }
+    }
+  }
+
+  Tensor<float, 3> mat3(2,3,7);
+  mat3 = mat1 + mat1;
+  Tensor<float, 3, RowMajor> mat4(2,3,7);
+  mat4 = mat2 * 3.14f;
+  Tensor<float, 3> mat5(2,3,7);
+  mat5 = mat1.inverse().log();
+  Tensor<float, 3, RowMajor> mat6(2,3,7);
+  mat6 = mat2.pow(0.5f) * 3.14f;
+  Tensor<float, 3> mat7(2,3,7);
+  mat7 = mat1.cwiseMax(mat5 * 2.0f).exp();
+  Tensor<float, 3, RowMajor> mat8(2,3,7);
+  mat8 = (-mat2).exp() * 3.14f;
+  Tensor<float, 3, RowMajor> mat9(2,3,7);
+  mat9 = mat2 + 3.14f;
+  Tensor<float, 3, RowMajor> mat10(2,3,7);
+  mat10 = mat2 - 3.14f;
+  Tensor<float, 3, RowMajor> mat11(2,3,7);
+  mat11 = mat2 / 3.14f;
+
+  val = 1.0f;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(mat3(i,j,k), val + val);
+        VERIFY_IS_APPROX(mat4(i,j,k), val * 3.14f);
+        VERIFY_IS_APPROX(mat5(i,j,k), logf(1.0f/val));
+        VERIFY_IS_APPROX(mat6(i,j,k), sqrtf(val) * 3.14f);
+        VERIFY_IS_APPROX(mat7(i,j,k), expf((std::max)(val, mat5(i,j,k) * 2.0f)));
+        VERIFY_IS_APPROX(mat8(i,j,k), expf(-val) * 3.14f);
+        VERIFY_IS_APPROX(mat9(i,j,k), val + 3.14f);
+        VERIFY_IS_APPROX(mat10(i,j,k), val - 3.14f);
+        VERIFY_IS_APPROX(mat11(i,j,k), val / 3.14f);
+        val += 1.0f;
+      }
+    }
+  }
+}
+
+static void test_constants()
+{
+  Tensor<float, 3> mat1(2,3,7);
+  Tensor<float, 3> mat2(2,3,7);
+  Tensor<float, 3> mat3(2,3,7);
+
+  float val = 1.0f;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        mat1(i,j,k) = val;
+        val += 1.0f;
+      }
+    }
+  }
+  mat2 = mat1.constant(3.14f);
+  mat3 = mat1.cwiseMax(7.3f).exp();
+
+  val = 1.0f;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(mat2(i,j,k), 3.14f);
+        VERIFY_IS_APPROX(mat3(i,j,k), expf((std::max)(val, 7.3f)));
+        val += 1.0f;
+      }
+    }
+  }
+}
+
+static void test_boolean()
+{
+  Tensor<int, 1> vec(6);
+  std::copy_n(std::begin({0, 1, 2, 3, 4, 5}), 6, vec.data());
+
+  // Test ||.
+  Tensor<bool, 1> bool1 = vec < vec.constant(1) || vec > vec.constant(4);
+  VERIFY_IS_EQUAL(bool1[0], true);
+  VERIFY_IS_EQUAL(bool1[1], false);
+  VERIFY_IS_EQUAL(bool1[2], false);
+  VERIFY_IS_EQUAL(bool1[3], false);
+  VERIFY_IS_EQUAL(bool1[4], false);
+  VERIFY_IS_EQUAL(bool1[5], true);
+
+  // Test &&, including cast of operand vec.
+  Tensor<bool, 1> bool2 = vec.cast<bool>() && vec < vec.constant(4);
+  VERIFY_IS_EQUAL(bool2[0], false);
+  VERIFY_IS_EQUAL(bool2[1], true);
+  VERIFY_IS_EQUAL(bool2[2], true);
+  VERIFY_IS_EQUAL(bool2[3], true);
+  VERIFY_IS_EQUAL(bool2[4], false);
+  VERIFY_IS_EQUAL(bool2[5], false);
+
+  // Compilation tests:
+  // Test Tensor<bool> against results of cast or comparison; verifies that
+  // CoeffReturnType is set to match Op return type of bool for Unary and Binary
+  // Ops.
+  Tensor<bool, 1> bool3 = vec.cast<bool>() && bool2;
+  bool3 = vec < vec.constant(4) && bool2;
+}
+
+static void test_functors()
+{
+  Tensor<float, 3> mat1(2,3,7);
+  Tensor<float, 3> mat2(2,3,7);
+  Tensor<float, 3> mat3(2,3,7);
+
+  float val = 1.0f;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        mat1(i,j,k) = val;
+        val += 1.0f;
+      }
+    }
+  }
+  mat2 = mat1.inverse().unaryExpr(&asinf);
+  mat3 = mat1.unaryExpr(&tanhf);
+
+  val = 1.0f;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(mat2(i,j,k), asinf(1.0f / mat1(i,j,k)));
+        VERIFY_IS_APPROX(mat3(i,j,k), tanhf(mat1(i,j,k)));
+        val += 1.0f;
+      }
+    }
+  }
+}
+
+static void test_type_casting()
+{
+  Tensor<bool, 3> mat1(2,3,7);
+  Tensor<float, 3> mat2(2,3,7);
+  Tensor<double, 3> mat3(2,3,7);
+  mat1.setRandom();
+  mat2.setRandom();
+
+  mat3 = mat1.cast<double>();
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(mat3(i,j,k), mat1(i,j,k) ? 1.0 : 0.0);
+      }
+    }
+  }
+
+  mat3 = mat2.cast<double>();
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(mat3(i,j,k), static_cast<double>(mat2(i,j,k)));
+      }
+    }
+  }
+}
+
+static void test_select()
+{
+  Tensor<float, 3> selector(2,3,7);
+  Tensor<float, 3> mat1(2,3,7);
+  Tensor<float, 3> mat2(2,3,7);
+  Tensor<float, 3> result(2,3,7);
+
+  selector.setRandom();
+  mat1.setRandom();
+  mat2.setRandom();
+  result = (selector > selector.constant(0.5f)).select(mat1, mat2);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(result(i,j,k), (selector(i,j,k) > 0.5f) ? mat1(i,j,k) : mat2(i,j,k));
+      }
+    }
+  }
+}
+
+
+void test_cxx11_tensor_expr()
+{
+  CALL_SUBTEST(test_1d());
+  CALL_SUBTEST(test_2d());
+  CALL_SUBTEST(test_3d());
+  CALL_SUBTEST(test_constants());
+  CALL_SUBTEST(test_boolean());
+  CALL_SUBTEST(test_functors());
+  CALL_SUBTEST(test_type_casting());
+  CALL_SUBTEST(test_select());
+}
diff --git a/unsupported/test/cxx11_tensor_fft.cpp b/unsupported/test/cxx11_tensor_fft.cpp
new file mode 100644
index 000000000..2f14ebc62
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_fft.cpp
@@ -0,0 +1,273 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Jianwei Cui <thucjw@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template <int DataLayout>
+static void test_fft_2D_golden() {
+  Tensor<float, 2, DataLayout> input(2, 3);
+  input(0, 0) = 1;
+  input(0, 1) = 2;
+  input(0, 2) = 3;
+  input(1, 0) = 4;
+  input(1, 1) = 5;
+  input(1, 2) = 6;
+
+  array<ptrdiff_t, 2> fft;
+  fft[0] = 0;
+  fft[1] = 1;
+
+  Tensor<std::complex<float>, 2, DataLayout> output = input.template fft<Eigen::BothParts, Eigen::FFT_FORWARD>(fft);
+
+  std::complex<float> output_golden[6]; // in ColMajor order
+  output_golden[0] = std::complex<float>(21, 0);
+  output_golden[1] = std::complex<float>(-9, 0);
+  output_golden[2] = std::complex<float>(-3, 1.73205);
+  output_golden[3] = std::complex<float>( 0, 0);
+  output_golden[4] = std::complex<float>(-3, -1.73205);
+  output_golden[5] = std::complex<float>(0 ,0);
+
+  std::complex<float> c_offset = std::complex<float>(1.0, 1.0);
+
+  if (DataLayout == ColMajor) {
+    VERIFY_IS_APPROX(output(0) + c_offset, output_golden[0] + c_offset);
+    VERIFY_IS_APPROX(output(1) + c_offset, output_golden[1] + c_offset);
+    VERIFY_IS_APPROX(output(2) + c_offset, output_golden[2] + c_offset);
+    VERIFY_IS_APPROX(output(3) + c_offset, output_golden[3] + c_offset);
+    VERIFY_IS_APPROX(output(4) + c_offset, output_golden[4] + c_offset);
+    VERIFY_IS_APPROX(output(5) + c_offset, output_golden[5] + c_offset);
+  }
+  else {
+    VERIFY_IS_APPROX(output(0)+ c_offset, output_golden[0]+ c_offset);
+    VERIFY_IS_APPROX(output(1)+ c_offset, output_golden[2]+ c_offset);
+    VERIFY_IS_APPROX(output(2)+ c_offset, output_golden[4]+ c_offset);
+    VERIFY_IS_APPROX(output(3)+ c_offset, output_golden[1]+ c_offset);
+    VERIFY_IS_APPROX(output(4)+ c_offset, output_golden[3]+ c_offset);
+    VERIFY_IS_APPROX(output(5)+ c_offset, output_golden[5]+ c_offset);
+  }
+}
+
+static void test_fft_complex_input_golden() {
+  Tensor<std::complex<float>, 1, ColMajor> input(5);
+  input(0) = std::complex<float>(1, 1);
+  input(1) = std::complex<float>(2, 2);
+  input(2) = std::complex<float>(3, 3);
+  input(3) = std::complex<float>(4, 4);
+  input(4) = std::complex<float>(5, 5);
+
+  array<ptrdiff_t, 1> fft;
+  fft[0] = 0;
+
+  Tensor<std::complex<float>, 1, ColMajor> forward_output_both_parts = input.fft<BothParts, FFT_FORWARD>(fft);
+  Tensor<std::complex<float>, 1, ColMajor> reverse_output_both_parts = input.fft<BothParts, FFT_REVERSE>(fft);
+
+  Tensor<float, 1, ColMajor> forward_output_real_part = input.fft<RealPart, FFT_FORWARD>(fft);
+  Tensor<float, 1, ColMajor> reverse_output_real_part = input.fft<RealPart, FFT_REVERSE>(fft);
+
+  Tensor<float, 1, ColMajor> forward_output_imag_part = input.fft<ImagPart, FFT_FORWARD>(fft);
+  Tensor<float, 1, ColMajor> reverse_output_imag_part = input.fft<ImagPart, FFT_REVERSE>(fft);
+
+  VERIFY_IS_EQUAL(forward_output_both_parts.dimension(0), input.dimension(0));
+  VERIFY_IS_EQUAL(reverse_output_both_parts.dimension(0), input.dimension(0));
+
+  VERIFY_IS_EQUAL(forward_output_real_part.dimension(0), input.dimension(0));
+  VERIFY_IS_EQUAL(reverse_output_real_part.dimension(0), input.dimension(0));
+
+  VERIFY_IS_EQUAL(forward_output_imag_part.dimension(0), input.dimension(0));
+  VERIFY_IS_EQUAL(reverse_output_imag_part.dimension(0), input.dimension(0));
+
+  std::complex<float> forward_golden_result[5];
+  std::complex<float> reverse_golden_result[5];
+
+  forward_golden_result[0] = std::complex<float>(15.000000000000000,+15.000000000000000);
+  forward_golden_result[1] = std::complex<float>(-5.940954801177935, +0.940954801177934);
+  forward_golden_result[2] = std::complex<float>(-3.312299240582266, -1.687700759417735);
+  forward_golden_result[3] = std::complex<float>(-1.687700759417735, -3.312299240582266);
+  forward_golden_result[4] = std::complex<float>( 0.940954801177934, -5.940954801177935);
+
+  reverse_golden_result[0] = std::complex<float>( 3.000000000000000, + 3.000000000000000);
+  reverse_golden_result[1] = std::complex<float>( 0.188190960235587, - 1.188190960235587);
+  reverse_golden_result[2] = std::complex<float>(-0.337540151883547, - 0.662459848116453);
+  reverse_golden_result[3] = std::complex<float>(-0.662459848116453, - 0.337540151883547);
+  reverse_golden_result[4] = std::complex<float>(-1.188190960235587, + 0.188190960235587);
+
+  for(int i = 0; i < 5; ++i) {
+    VERIFY_IS_APPROX(forward_output_both_parts(i), forward_golden_result[i]);
+    VERIFY_IS_APPROX(forward_output_real_part(i), forward_golden_result[i].real());
+    VERIFY_IS_APPROX(forward_output_imag_part(i), forward_golden_result[i].imag());
+  }
+
+  for(int i = 0; i < 5; ++i) {
+    VERIFY_IS_APPROX(reverse_output_both_parts(i), reverse_golden_result[i]);
+    VERIFY_IS_APPROX(reverse_output_real_part(i), reverse_golden_result[i].real());
+    VERIFY_IS_APPROX(reverse_output_imag_part(i), reverse_golden_result[i].imag());
+  }
+}
+
+static void test_fft_real_input_golden() {
+  Tensor<float, 1, ColMajor> input(5);
+  input(0) = 1.0;
+  input(1) = 2.0;
+  input(2) = 3.0;
+  input(3) = 4.0;
+  input(4) = 5.0;
+
+  array<ptrdiff_t, 1> fft;
+  fft[0] = 0;
+
+  Tensor<std::complex<float>, 1, ColMajor> forward_output_both_parts = input.fft<BothParts, FFT_FORWARD>(fft);
+  Tensor<std::complex<float>, 1, ColMajor> reverse_output_both_parts = input.fft<BothParts, FFT_REVERSE>(fft);
+
+  Tensor<float, 1, ColMajor> forward_output_real_part = input.fft<RealPart, FFT_FORWARD>(fft);
+  Tensor<float, 1, ColMajor> reverse_output_real_part = input.fft<RealPart, FFT_REVERSE>(fft);
+
+  Tensor<float, 1, ColMajor> forward_output_imag_part = input.fft<ImagPart, FFT_FORWARD>(fft);
+  Tensor<float, 1, ColMajor> reverse_output_imag_part = input.fft<ImagPart, FFT_REVERSE>(fft);
+
+  VERIFY_IS_EQUAL(forward_output_both_parts.dimension(0), input.dimension(0));
+  VERIFY_IS_EQUAL(reverse_output_both_parts.dimension(0), input.dimension(0));
+
+  VERIFY_IS_EQUAL(forward_output_real_part.dimension(0), input.dimension(0));
+  VERIFY_IS_EQUAL(reverse_output_real_part.dimension(0), input.dimension(0));
+
+  VERIFY_IS_EQUAL(forward_output_imag_part.dimension(0), input.dimension(0));
+  VERIFY_IS_EQUAL(reverse_output_imag_part.dimension(0), input.dimension(0));
+
+  std::complex<float> forward_golden_result[5];
+  std::complex<float> reverse_golden_result[5];
+
+
+  forward_golden_result[0] = std::complex<float>(  15, 0);
+  forward_golden_result[1] = std::complex<float>(-2.5, +3.44095480117793);
+  forward_golden_result[2] = std::complex<float>(-2.5, +0.81229924058227);
+  forward_golden_result[3] = std::complex<float>(-2.5, -0.81229924058227);
+  forward_golden_result[4] = std::complex<float>(-2.5, -3.44095480117793);
+
+  reverse_golden_result[0] = std::complex<float>( 3.0, 0);
+  reverse_golden_result[1] = std::complex<float>(-0.5, -0.688190960235587);
+  reverse_golden_result[2] = std::complex<float>(-0.5, -0.162459848116453);
+  reverse_golden_result[3] = std::complex<float>(-0.5, +0.162459848116453);
+  reverse_golden_result[4] = std::complex<float>(-0.5, +0.688190960235587);
+
+  std::complex<float> c_offset(1.0, 1.0);
+  float r_offset = 1.0;
+
+  for(int i = 0; i < 5; ++i) {
+    VERIFY_IS_APPROX(forward_output_both_parts(i) + c_offset, forward_golden_result[i] + c_offset);
+    VERIFY_IS_APPROX(forward_output_real_part(i)  + r_offset, forward_golden_result[i].real() + r_offset);
+    VERIFY_IS_APPROX(forward_output_imag_part(i)  + r_offset, forward_golden_result[i].imag() + r_offset);
+  }
+
+  for(int i = 0; i < 5; ++i) {
+    VERIFY_IS_APPROX(reverse_output_both_parts(i) + c_offset, reverse_golden_result[i] + c_offset);
+    VERIFY_IS_APPROX(reverse_output_real_part(i)  + r_offset, reverse_golden_result[i].real() + r_offset);
+    VERIFY_IS_APPROX(reverse_output_imag_part(i)  + r_offset, reverse_golden_result[i].imag() + r_offset);
+  }
+}
+
+
+template <int DataLayout, typename RealScalar, bool isComplexInput, int FFTResultType, int FFTDirection, int TensorRank>
+static void test_fft_real_input_energy() {
+
+  Eigen::DSizes<ptrdiff_t, TensorRank> dimensions;
+  ptrdiff_t total_size = 1;
+  for (int i = 0; i < TensorRank; ++i) {
+    dimensions[i] = rand() % 20 + 1;
+    total_size *= dimensions[i];
+  }
+  const DSizes<ptrdiff_t, TensorRank> arr = dimensions;
+
+  typedef typename internal::conditional<isComplexInput == true, std::complex<RealScalar>, RealScalar>::type InputScalar;
+
+  Tensor<InputScalar, TensorRank, DataLayout> input;
+  input.resize(arr);
+  input.setRandom();
+
+  array<ptrdiff_t, TensorRank> fft;
+  for (int i = 0; i < TensorRank; ++i) {
+    fft[i] = i;
+  }
+
+  typedef typename internal::conditional<FFTResultType == Eigen::BothParts, std::complex<RealScalar>, RealScalar>::type OutputScalar;
+  Tensor<OutputScalar, TensorRank, DataLayout> output;
+  output = input.template fft<FFTResultType, FFTDirection>(fft);
+
+  for (int i = 0; i < TensorRank; ++i) {
+    VERIFY_IS_EQUAL(output.dimension(i), input.dimension(i));
+  }
+
+  RealScalar energy_original = 0.0;
+  RealScalar energy_after_fft = 0.0;
+
+  for (int i = 0; i < total_size; ++i) {
+    energy_original += numext::abs2(input(i));
+  }
+
+  for (int i = 0; i < total_size; ++i) {
+    energy_after_fft += numext::abs2(output(i));
+  }
+
+  if(FFTDirection == FFT_FORWARD) {
+    VERIFY_IS_APPROX(energy_original, energy_after_fft / total_size);
+  }
+  else {
+    VERIFY_IS_APPROX(energy_original, energy_after_fft * total_size);
+  }
+}
+
+void test_cxx11_tensor_fft() {
+    test_fft_complex_input_golden();
+    test_fft_real_input_golden();
+
+    test_fft_2D_golden<ColMajor>();
+    test_fft_2D_golden<RowMajor>();
+
+    test_fft_real_input_energy<ColMajor, float,  true,  Eigen::BothParts, FFT_FORWARD, 1>();
+    test_fft_real_input_energy<ColMajor, double, true,  Eigen::BothParts, FFT_FORWARD, 1>();
+    test_fft_real_input_energy<ColMajor, float,  false,  Eigen::BothParts, FFT_FORWARD, 1>();
+    test_fft_real_input_energy<ColMajor, double, false,  Eigen::BothParts, FFT_FORWARD, 1>();
+
+    test_fft_real_input_energy<ColMajor, float,  true,  Eigen::BothParts, FFT_FORWARD, 2>();
+    test_fft_real_input_energy<ColMajor, double, true,  Eigen::BothParts, FFT_FORWARD, 2>();
+    test_fft_real_input_energy<ColMajor, float,  false,  Eigen::BothParts, FFT_FORWARD, 2>();
+    test_fft_real_input_energy<ColMajor, double, false,  Eigen::BothParts, FFT_FORWARD, 2>();
+
+    test_fft_real_input_energy<ColMajor, float,  true,  Eigen::BothParts, FFT_FORWARD, 3>();
+    test_fft_real_input_energy<ColMajor, double, true,  Eigen::BothParts, FFT_FORWARD, 3>();
+    test_fft_real_input_energy<ColMajor, float,  false,  Eigen::BothParts, FFT_FORWARD, 3>();
+    test_fft_real_input_energy<ColMajor, double, false,  Eigen::BothParts, FFT_FORWARD, 3>();
+
+    test_fft_real_input_energy<ColMajor, float,  true,  Eigen::BothParts, FFT_FORWARD, 4>();
+    test_fft_real_input_energy<ColMajor, double, true,  Eigen::BothParts, FFT_FORWARD, 4>();
+    test_fft_real_input_energy<ColMajor, float,  false,  Eigen::BothParts, FFT_FORWARD, 4>();
+    test_fft_real_input_energy<ColMajor, double, false,  Eigen::BothParts, FFT_FORWARD, 4>();
+
+    test_fft_real_input_energy<RowMajor, float,  true,  Eigen::BothParts, FFT_FORWARD, 1>();
+    test_fft_real_input_energy<RowMajor, double, true,  Eigen::BothParts, FFT_FORWARD, 1>();
+    test_fft_real_input_energy<RowMajor, float,  false,  Eigen::BothParts, FFT_FORWARD, 1>();
+    test_fft_real_input_energy<RowMajor, double, false,  Eigen::BothParts, FFT_FORWARD, 1>();
+
+    test_fft_real_input_energy<RowMajor, float,  true,  Eigen::BothParts, FFT_FORWARD, 2>();
+    test_fft_real_input_energy<RowMajor, double, true,  Eigen::BothParts, FFT_FORWARD, 2>();
+    test_fft_real_input_energy<RowMajor, float,  false,  Eigen::BothParts, FFT_FORWARD, 2>();
+    test_fft_real_input_energy<RowMajor, double, false,  Eigen::BothParts, FFT_FORWARD, 2>();
+
+    test_fft_real_input_energy<RowMajor, float,  true,  Eigen::BothParts, FFT_FORWARD, 3>();
+    test_fft_real_input_energy<RowMajor, double, true,  Eigen::BothParts, FFT_FORWARD, 3>();
+    test_fft_real_input_energy<RowMajor, float,  false,  Eigen::BothParts, FFT_FORWARD, 3>();
+    test_fft_real_input_energy<RowMajor, double, false,  Eigen::BothParts, FFT_FORWARD, 3>();
+
+    test_fft_real_input_energy<RowMajor, float,  true,  Eigen::BothParts, FFT_FORWARD, 4>();
+    test_fft_real_input_energy<RowMajor, double, true,  Eigen::BothParts, FFT_FORWARD, 4>();
+    test_fft_real_input_energy<RowMajor, float,  false,  Eigen::BothParts, FFT_FORWARD, 4>();
+    test_fft_real_input_energy<RowMajor, double, false,  Eigen::BothParts, FFT_FORWARD, 4>();
+}
diff --git a/unsupported/test/cxx11_tensor_fixed_size.cpp b/unsupported/test/cxx11_tensor_fixed_size.cpp
new file mode 100644
index 000000000..4c660de65
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_fixed_size.cpp
@@ -0,0 +1,261 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+
+static void test_0d()
+{
+  TensorFixedSize<float, Sizes<> > scalar1;
+  TensorFixedSize<float, Sizes<>, RowMajor> scalar2;
+  VERIFY_IS_EQUAL(scalar1.rank(), 0);
+  VERIFY_IS_EQUAL(scalar1.size(), 1);
+  VERIFY_IS_EQUAL(array_prod(scalar1.dimensions()), 1);
+
+  scalar1() = 7.0;
+  scalar2() = 13.0;
+
+  // Test against shallow copy.
+  TensorFixedSize<float, Sizes<> > copy = scalar1;
+  VERIFY_IS_NOT_EQUAL(scalar1.data(), copy.data());
+  VERIFY_IS_APPROX(scalar1(), copy());
+  copy = scalar1;
+  VERIFY_IS_NOT_EQUAL(scalar1.data(), copy.data());
+  VERIFY_IS_APPROX(scalar1(), copy());
+
+  TensorFixedSize<float, Sizes<> > scalar3 = scalar1.sqrt();
+  TensorFixedSize<float, Sizes<>, RowMajor> scalar4 = scalar2.sqrt();
+  VERIFY_IS_EQUAL(scalar3.rank(), 0);
+  VERIFY_IS_APPROX(scalar3(), sqrtf(7.0));
+  VERIFY_IS_APPROX(scalar4(), sqrtf(13.0));
+
+  scalar3 = scalar1 + scalar2;
+  VERIFY_IS_APPROX(scalar3(), 7.0f + 13.0f);
+}
+
+static void test_1d()
+{
+  TensorFixedSize<float, Sizes<6> > vec1;
+  TensorFixedSize<float, Sizes<6>, RowMajor> vec2;
+
+  VERIFY_IS_EQUAL((vec1.size()), 6);
+  //  VERIFY_IS_EQUAL((vec1.dimensions()[0]), 6);
+  //  VERIFY_IS_EQUAL((vec1.dimension(0)), 6);
+
+  vec1(0) = 4.0;  vec2(0) = 0.0;
+  vec1(1) = 8.0;  vec2(1) = 1.0;
+  vec1(2) = 15.0; vec2(2) = 2.0;
+  vec1(3) = 16.0; vec2(3) = 3.0;
+  vec1(4) = 23.0; vec2(4) = 4.0;
+  vec1(5) = 42.0; vec2(5) = 5.0;
+
+  // Test against shallow copy.
+  TensorFixedSize<float, Sizes<6> > copy = vec1;
+  VERIFY_IS_NOT_EQUAL(vec1.data(), copy.data());
+  for (int i = 0; i < 6; ++i) {
+    VERIFY_IS_APPROX(vec1(i), copy(i));
+  }
+  copy = vec1;
+  VERIFY_IS_NOT_EQUAL(vec1.data(), copy.data());
+  for (int i = 0; i < 6; ++i) {
+    VERIFY_IS_APPROX(vec1(i), copy(i));
+  }
+
+  TensorFixedSize<float, Sizes<6> > vec3 = vec1.sqrt();
+  TensorFixedSize<float, Sizes<6>, RowMajor> vec4 = vec2.sqrt();
+
+  VERIFY_IS_EQUAL((vec3.size()), 6);
+  VERIFY_IS_EQUAL(vec3.rank(), 1);
+  //  VERIFY_IS_EQUAL((vec3.dimensions()[0]), 6);
+  //  VERIFY_IS_EQUAL((vec3.dimension(0)), 6);
+
+  VERIFY_IS_APPROX(vec3(0), sqrtf(4.0));
+  VERIFY_IS_APPROX(vec3(1), sqrtf(8.0));
+  VERIFY_IS_APPROX(vec3(2), sqrtf(15.0));
+  VERIFY_IS_APPROX(vec3(3), sqrtf(16.0));
+  VERIFY_IS_APPROX(vec3(4), sqrtf(23.0));
+  VERIFY_IS_APPROX(vec3(5), sqrtf(42.0));
+
+  VERIFY_IS_APPROX(vec4(0), sqrtf(0.0));
+  VERIFY_IS_APPROX(vec4(1), sqrtf(1.0));
+  VERIFY_IS_APPROX(vec4(2), sqrtf(2.0));
+  VERIFY_IS_APPROX(vec4(3), sqrtf(3.0));
+  VERIFY_IS_APPROX(vec4(4), sqrtf(4.0));
+  VERIFY_IS_APPROX(vec4(5), sqrtf(5.0));
+
+  vec3 = vec1 + vec2;
+  VERIFY_IS_APPROX(vec3(0), 4.0f + 0.0f);
+  VERIFY_IS_APPROX(vec3(1), 8.0f + 1.0f);
+  VERIFY_IS_APPROX(vec3(2), 15.0f + 2.0f);
+  VERIFY_IS_APPROX(vec3(3), 16.0f + 3.0f);
+  VERIFY_IS_APPROX(vec3(4), 23.0f + 4.0f);
+  VERIFY_IS_APPROX(vec3(5), 42.0f + 5.0f);
+}
+
+static void test_tensor_map()
+{
+  TensorFixedSize<float, Sizes<6> > vec1;
+  TensorFixedSize<float, Sizes<6>, RowMajor> vec2;
+
+  vec1(0) = 4.0;  vec2(0) = 0.0;
+  vec1(1) = 8.0;  vec2(1) = 1.0;
+  vec1(2) = 15.0; vec2(2) = 2.0;
+  vec1(3) = 16.0; vec2(3) = 3.0;
+  vec1(4) = 23.0; vec2(4) = 4.0;
+  vec1(5) = 42.0; vec2(5) = 5.0;
+
+  float data3[6];
+  TensorMap<TensorFixedSize<float, Sizes<6> > > vec3(data3, 6);
+  vec3 = vec1.sqrt() + vec2;
+
+  VERIFY_IS_APPROX(vec3(0), sqrtf(4.0));
+  VERIFY_IS_APPROX(vec3(1), sqrtf(8.0) + 1.0f);
+  VERIFY_IS_APPROX(vec3(2), sqrtf(15.0) + 2.0f);
+  VERIFY_IS_APPROX(vec3(3), sqrtf(16.0) + 3.0f);
+  VERIFY_IS_APPROX(vec3(4), sqrtf(23.0) + 4.0f);
+  VERIFY_IS_APPROX(vec3(5), sqrtf(42.0) + 5.0f);
+}
+
+static void test_2d()
+{
+  float data1[6];
+  TensorMap<TensorFixedSize<float, Sizes<2, 3> > > mat1(data1,2,3);
+  float data2[6];
+  TensorMap<TensorFixedSize<float, Sizes<2, 3>, RowMajor> > mat2(data2,2,3);
+
+  VERIFY_IS_EQUAL((mat1.size()), 2*3);
+  VERIFY_IS_EQUAL(mat1.rank(), 2);
+  //  VERIFY_IS_EQUAL((mat1.dimension(0)), 2);
+  //  VERIFY_IS_EQUAL((mat1.dimension(1)), 3);
+
+  mat1(0,0) = 0.0;
+  mat1(0,1) = 1.0;
+  mat1(0,2) = 2.0;
+  mat1(1,0) = 3.0;
+  mat1(1,1) = 4.0;
+  mat1(1,2) = 5.0;
+
+  mat2(0,0) = -0.0;
+  mat2(0,1) = -1.0;
+  mat2(0,2) = -2.0;
+  mat2(1,0) = -3.0;
+  mat2(1,1) = -4.0;
+  mat2(1,2) = -5.0;
+
+  TensorFixedSize<float, Sizes<2, 3> > mat3;
+  TensorFixedSize<float, Sizes<2, 3>, RowMajor> mat4;
+  mat3 = mat1.abs();
+  mat4 = mat2.abs();
+
+  VERIFY_IS_EQUAL((mat3.size()), 2*3);
+    //  VERIFY_IS_EQUAL((mat3.dimension(0)), 2);
+    //  VERIFY_IS_EQUAL((mat3.dimension(1)), 3);
+
+  VERIFY_IS_APPROX(mat3(0,0), 0.0f);
+  VERIFY_IS_APPROX(mat3(0,1), 1.0f);
+  VERIFY_IS_APPROX(mat3(0,2), 2.0f);
+  VERIFY_IS_APPROX(mat3(1,0), 3.0f);
+  VERIFY_IS_APPROX(mat3(1,1), 4.0f);
+  VERIFY_IS_APPROX(mat3(1,2), 5.0f);
+
+  VERIFY_IS_APPROX(mat4(0,0), 0.0f);
+  VERIFY_IS_APPROX(mat4(0,1), 1.0f);
+  VERIFY_IS_APPROX(mat4(0,2), 2.0f);
+  VERIFY_IS_APPROX(mat4(1,0), 3.0f);
+  VERIFY_IS_APPROX(mat4(1,1), 4.0f);
+  VERIFY_IS_APPROX(mat4(1,2), 5.0f);
+}
+
+static void test_3d()
+{
+  TensorFixedSize<float, Sizes<2, 3, 7> > mat1;
+  TensorFixedSize<float, Sizes<2, 3, 7>, RowMajor> mat2;
+
+  VERIFY_IS_EQUAL((mat1.size()), 2*3*7);
+  VERIFY_IS_EQUAL(mat1.rank(), 3);
+  //  VERIFY_IS_EQUAL((mat1.dimension(0)), 2);
+  //  VERIFY_IS_EQUAL((mat1.dimension(1)), 3);
+  //  VERIFY_IS_EQUAL((mat1.dimension(2)), 7);
+
+  float val = 0.0f;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        mat1(i,j,k) = val;
+        mat2(i,j,k) = val;
+        val += 1.0f;
+      }
+    }
+  }
+
+  TensorFixedSize<float, Sizes<2, 3, 7> > mat3;
+  mat3 = mat1.sqrt();
+  TensorFixedSize<float, Sizes<2, 3, 7>, RowMajor> mat4;
+  mat4 = mat2.sqrt();
+
+  VERIFY_IS_EQUAL((mat3.size()), 2*3*7);
+  //  VERIFY_IS_EQUAL((mat3.dimension(0)), 2);
+  //  VERIFY_IS_EQUAL((mat3.dimension(1)), 3);
+  //  VERIFY_IS_EQUAL((mat3.dimension(2)), 7);
+
+
+  val = 0.0f;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(mat3(i,j,k), sqrtf(val));
+        VERIFY_IS_APPROX(mat4(i,j,k), sqrtf(val));
+        val += 1.0f;
+      }
+    }
+  }
+}
+
+
+static void test_array()
+{
+  TensorFixedSize<float, Sizes<2, 3, 7> > mat1;
+  float val = 0.0f;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        mat1(i,j,k) = val;
+        val += 1.0f;
+      }
+    }
+  }
+
+  TensorFixedSize<float, Sizes<2, 3, 7> > mat3;
+  mat3 = mat1.pow(3.5f);
+
+  val = 0.0f;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(mat3(i,j,k), powf(val, 3.5f));
+        val += 1.0f;
+      }
+    }
+  }
+}
+
+void test_cxx11_tensor_fixed_size()
+{
+  CALL_SUBTEST(test_0d());
+  CALL_SUBTEST(test_1d());
+  CALL_SUBTEST(test_tensor_map());
+  CALL_SUBTEST(test_2d());
+  CALL_SUBTEST(test_3d());
+  CALL_SUBTEST(test_array());
+}
diff --git a/unsupported/test/cxx11_tensor_forced_eval.cpp b/unsupported/test/cxx11_tensor_forced_eval.cpp
new file mode 100644
index 000000000..45d7345e9
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_forced_eval.cpp
@@ -0,0 +1,79 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/Core>
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::MatrixXf;
+using Eigen::Tensor;
+
+static void test_simple()
+{
+  MatrixXf m1(3,3);
+  MatrixXf m2(3,3);
+  m1.setRandom();
+  m2.setRandom();
+
+  TensorMap<Tensor<float, 2> > mat1(m1.data(), 3,3);
+  TensorMap<Tensor<float, 2> > mat2(m2.data(), 3,3);
+
+  Tensor<float, 2> mat3(3,3);
+  mat3 = mat1;
+
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+  Eigen::array<DimPair, 1> dims;
+  dims[0] = DimPair(1, 0);
+
+  mat3 = mat3.contract(mat2, dims).eval();
+
+  VERIFY_IS_APPROX(mat3(0, 0), (m1*m2).eval()(0,0));
+  VERIFY_IS_APPROX(mat3(0, 1), (m1*m2).eval()(0,1));
+  VERIFY_IS_APPROX(mat3(0, 2), (m1*m2).eval()(0,2));
+  VERIFY_IS_APPROX(mat3(1, 0), (m1*m2).eval()(1,0));
+  VERIFY_IS_APPROX(mat3(1, 1), (m1*m2).eval()(1,1));
+  VERIFY_IS_APPROX(mat3(1, 2), (m1*m2).eval()(1,2));
+  VERIFY_IS_APPROX(mat3(2, 0), (m1*m2).eval()(2,0));
+  VERIFY_IS_APPROX(mat3(2, 1), (m1*m2).eval()(2,1));
+  VERIFY_IS_APPROX(mat3(2, 2), (m1*m2).eval()(2,2));
+}
+
+
+static void test_const()
+{
+  MatrixXf input(3,3);
+  input.setRandom();
+  MatrixXf output = input;
+  output.rowwise() -= input.colwise().maxCoeff();
+
+  Eigen::array<int, 1> depth_dim;
+  depth_dim[0] = 0;
+  Tensor<float, 2>::Dimensions dims2d;
+  dims2d[0] = 1;
+  dims2d[1] = 3;
+  Eigen::array<int, 2> bcast;
+  bcast[0] = 3;
+  bcast[1] = 1;
+  const TensorMap<Tensor<const float, 2> > input_tensor(input.data(), 3, 3);
+  Tensor<float, 2> output_tensor= (input_tensor - input_tensor.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast));
+
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      VERIFY_IS_APPROX(output(i, j), output_tensor(i, j));
+    }
+  }
+}
+
+
+void test_cxx11_tensor_forced_eval()
+{
+  CALL_SUBTEST(test_simple());
+  CALL_SUBTEST(test_const());
+}
diff --git a/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp b/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp
new file mode 100644
index 000000000..5690da723
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp
@@ -0,0 +1,70 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_forced_eval_sycl
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+void test_forced_eval_sycl(const Eigen::SyclDevice &sycl_device) {
+
+  int sizeDim1 = 100;
+  int sizeDim2 = 200;
+  int sizeDim3 = 200;
+  Eigen::array<int, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
+  Eigen::Tensor<float, 3> in1(tensorRange);
+  Eigen::Tensor<float, 3> in2(tensorRange);
+  Eigen::Tensor<float, 3> out(tensorRange);
+
+  float * gpu_in1_data  = static_cast<float*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(float)));
+  float * gpu_in2_data  = static_cast<float*>(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(float)));
+  float * gpu_out_data =  static_cast<float*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(float)));
+
+  in1 = in1.random() + in1.constant(10.0f);
+  in2 = in2.random() + in2.constant(10.0f);
+
+  // creating TensorMap from tensor
+  Eigen::TensorMap<Eigen::Tensor<float, 3>> gpu_in1(gpu_in1_data, tensorRange);
+  Eigen::TensorMap<Eigen::Tensor<float, 3>> gpu_in2(gpu_in2_data, tensorRange);
+  Eigen::TensorMap<Eigen::Tensor<float, 3>> gpu_out(gpu_out_data, tensorRange);
+  sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(float));
+  sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in1.dimensions().TotalSize())*sizeof(float));
+  /// c=(a+b)*b
+  gpu_out.device(sycl_device) =(gpu_in1 + gpu_in2).eval() * gpu_in2;
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float));
+  for (int i = 0; i < sizeDim1; ++i) {
+    for (int j = 0; j < sizeDim2; ++j) {
+      for (int k = 0; k < sizeDim3; ++k) {
+        VERIFY_IS_APPROX(out(i, j, k),
+                         (in1(i, j, k) + in2(i, j, k)) * in2(i, j, k));
+      }
+    }
+  }
+  printf("(a+b)*b Test Passed\n");
+  sycl_device.deallocate(gpu_in1_data);
+  sycl_device.deallocate(gpu_in2_data);
+  sycl_device.deallocate(gpu_out_data);
+
+}
+
+void test_cxx11_tensor_forced_eval_sycl() {
+  cl::sycl::gpu_selector s;
+  Eigen::SyclDevice sycl_device(s);
+  CALL_SUBTEST(test_forced_eval_sycl(sycl_device));
+}
diff --git a/unsupported/test/cxx11_tensor_generator.cpp b/unsupported/test/cxx11_tensor_generator.cpp
new file mode 100644
index 000000000..dcb928714
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_generator.cpp
@@ -0,0 +1,91 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+struct Generator1D {
+  Generator1D() { }
+
+  float operator()(const array<Eigen::DenseIndex, 1>& coordinates) const {
+    return coordinates[0];
+  }
+};
+
+template <int DataLayout>
+static void test_1D()
+{
+  Tensor<float, 1> vec(6);
+  Tensor<float, 1> result = vec.generate(Generator1D());
+
+  for (int i = 0; i < 6; ++i) {
+    VERIFY_IS_EQUAL(result(i), i);
+  }
+}
+
+
+struct Generator2D {
+  Generator2D() { }
+
+  float operator()(const array<Eigen::DenseIndex, 2>& coordinates) const {
+    return 3 * coordinates[0] + 11 * coordinates[1];
+  }
+};
+
+template <int DataLayout>
+static void test_2D()
+{
+  Tensor<float, 2> matrix(5, 7);
+  Tensor<float, 2> result = matrix.generate(Generator2D());
+
+  for (int i = 0; i < 5; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      VERIFY_IS_EQUAL(result(i, j), 3*i + 11*j);
+    }
+  }
+}
+
+
+template <int DataLayout>
+static void test_gaussian()
+{
+  int rows = 32;
+  int cols = 48;
+  array<float, 2> means;
+  means[0] = rows / 2.0f;
+  means[1] = cols / 2.0f;
+  array<float, 2> std_devs;
+  std_devs[0] = 3.14f;
+  std_devs[1] = 2.7f;
+  internal::GaussianGenerator<float, Eigen::DenseIndex, 2> gaussian_gen(means, std_devs);
+
+  Tensor<float, 2> matrix(rows, cols);
+  Tensor<float, 2> result = matrix.generate(gaussian_gen);
+
+  for (int i = 0; i < rows; ++i) {
+    for (int j = 0; j < cols; ++j) {
+      float g_rows = powf(rows/2.0f - i, 2) / (3.14f * 3.14f) * 0.5f;
+      float g_cols = powf(cols/2.0f - j, 2) / (2.7f * 2.7f) * 0.5f;
+      float gaussian = expf(-g_rows - g_cols);
+      VERIFY_IS_EQUAL(result(i, j), gaussian);
+    }
+  }
+}
+
+
+void test_cxx11_tensor_generator()
+{
+  CALL_SUBTEST(test_1D<ColMajor>());
+  CALL_SUBTEST(test_1D<RowMajor>());
+  CALL_SUBTEST(test_2D<ColMajor>());
+  CALL_SUBTEST(test_2D<RowMajor>());
+  CALL_SUBTEST(test_gaussian<ColMajor>());
+  CALL_SUBTEST(test_gaussian<RowMajor>());
+}
diff --git a/unsupported/test/cxx11_tensor_ifft.cpp b/unsupported/test/cxx11_tensor_ifft.cpp
new file mode 100644
index 000000000..5fd88fa6c
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_ifft.cpp
@@ -0,0 +1,154 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Jianwei Cui <thucjw@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <complex>
+#include <cmath>
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template <int DataLayout>
+static void test_1D_fft_ifft_invariant(int sequence_length) {
+  Tensor<double, 1, DataLayout> tensor(sequence_length);
+  tensor.setRandom();
+
+  array<int, 1> fft;
+  fft[0] = 0;
+
+  Tensor<std::complex<double>, 1, DataLayout> tensor_after_fft;
+  Tensor<std::complex<double>, 1, DataLayout> tensor_after_fft_ifft;
+
+  tensor_after_fft = tensor.template fft<Eigen::BothParts, Eigen::FFT_FORWARD>(fft);
+  tensor_after_fft_ifft = tensor_after_fft.template fft<Eigen::BothParts, Eigen::FFT_REVERSE>(fft);
+
+  VERIFY_IS_EQUAL(tensor_after_fft.dimension(0), sequence_length);
+  VERIFY_IS_EQUAL(tensor_after_fft_ifft.dimension(0), sequence_length);
+
+  for (int i = 0; i < sequence_length; ++i) {
+    VERIFY_IS_APPROX(static_cast<float>(tensor(i)), static_cast<float>(std::real(tensor_after_fft_ifft(i))));
+  }
+}
+
+template <int DataLayout>
+static void test_2D_fft_ifft_invariant(int dim0, int dim1) {
+  Tensor<double, 2, DataLayout> tensor(dim0, dim1);
+  tensor.setRandom();
+
+  array<int, 2> fft;
+  fft[0] = 0;
+  fft[1] = 1;
+
+  Tensor<std::complex<double>, 2, DataLayout> tensor_after_fft;
+  Tensor<std::complex<double>, 2, DataLayout> tensor_after_fft_ifft;
+
+  tensor_after_fft = tensor.template fft<Eigen::BothParts, Eigen::FFT_FORWARD>(fft);
+  tensor_after_fft_ifft = tensor_after_fft.template fft<Eigen::BothParts, Eigen::FFT_REVERSE>(fft);
+
+  VERIFY_IS_EQUAL(tensor_after_fft.dimension(0), dim0);
+  VERIFY_IS_EQUAL(tensor_after_fft.dimension(1), dim1);
+  VERIFY_IS_EQUAL(tensor_after_fft_ifft.dimension(0), dim0);
+  VERIFY_IS_EQUAL(tensor_after_fft_ifft.dimension(1), dim1);
+
+  for (int i = 0; i < dim0; ++i) {
+    for (int j = 0; j < dim1; ++j) {
+      //std::cout << "[" << i << "][" << j << "]" <<  "  Original data: " << tensor(i,j) << " Transformed data:" << tensor_after_fft_ifft(i,j) << std::endl;
+      VERIFY_IS_APPROX(static_cast<float>(tensor(i,j)), static_cast<float>(std::real(tensor_after_fft_ifft(i,j))));
+    }
+  }
+}
+
+template <int DataLayout>
+static void test_3D_fft_ifft_invariant(int dim0, int dim1, int dim2) {
+  Tensor<double, 3, DataLayout> tensor(dim0, dim1, dim2);
+  tensor.setRandom();
+
+  array<int, 3> fft;
+  fft[0] = 0;
+  fft[1] = 1;
+  fft[2] = 2;
+
+  Tensor<std::complex<double>, 3, DataLayout> tensor_after_fft;
+  Tensor<std::complex<double>, 3, DataLayout> tensor_after_fft_ifft;
+
+  tensor_after_fft = tensor.template fft<Eigen::BothParts, Eigen::FFT_FORWARD>(fft);
+  tensor_after_fft_ifft = tensor_after_fft.template fft<Eigen::BothParts, Eigen::FFT_REVERSE>(fft);
+
+  VERIFY_IS_EQUAL(tensor_after_fft.dimension(0), dim0);
+  VERIFY_IS_EQUAL(tensor_after_fft.dimension(1), dim1);
+  VERIFY_IS_EQUAL(tensor_after_fft.dimension(2), dim2);
+  VERIFY_IS_EQUAL(tensor_after_fft_ifft.dimension(0), dim0);
+  VERIFY_IS_EQUAL(tensor_after_fft_ifft.dimension(1), dim1);
+  VERIFY_IS_EQUAL(tensor_after_fft_ifft.dimension(2), dim2);
+
+  for (int i = 0; i < dim0; ++i) {
+    for (int j = 0; j < dim1; ++j) {
+      for (int k = 0; k < dim2; ++k) {
+        VERIFY_IS_APPROX(static_cast<float>(tensor(i,j,k)), static_cast<float>(std::real(tensor_after_fft_ifft(i,j,k))));
+      }
+    }
+  }
+}
+
+template <int DataLayout>
+static void test_sub_fft_ifft_invariant(int dim0, int dim1, int dim2, int dim3) {
+  Tensor<double, 4, DataLayout> tensor(dim0, dim1, dim2, dim3);
+  tensor.setRandom();
+
+  array<int, 2> fft;
+  fft[0] = 2;
+  fft[1] = 0;
+
+  Tensor<std::complex<double>, 4, DataLayout> tensor_after_fft;
+  Tensor<double, 4, DataLayout> tensor_after_fft_ifft;
+
+  tensor_after_fft = tensor.template fft<Eigen::BothParts, Eigen::FFT_FORWARD>(fft);
+  tensor_after_fft_ifft = tensor_after_fft.template fft<Eigen::RealPart, Eigen::FFT_REVERSE>(fft);
+
+  VERIFY_IS_EQUAL(tensor_after_fft.dimension(0), dim0);
+  VERIFY_IS_EQUAL(tensor_after_fft.dimension(1), dim1);
+  VERIFY_IS_EQUAL(tensor_after_fft.dimension(2), dim2);
+  VERIFY_IS_EQUAL(tensor_after_fft.dimension(3), dim3);
+  VERIFY_IS_EQUAL(tensor_after_fft_ifft.dimension(0), dim0);
+  VERIFY_IS_EQUAL(tensor_after_fft_ifft.dimension(1), dim1);
+  VERIFY_IS_EQUAL(tensor_after_fft_ifft.dimension(2), dim2);
+  VERIFY_IS_EQUAL(tensor_after_fft_ifft.dimension(3), dim3);
+
+  for (int i = 0; i < dim0; ++i) {
+    for (int j = 0; j < dim1; ++j) {
+      for (int k = 0; k < dim2; ++k) {
+        for (int l = 0; l < dim3; ++l) {
+          VERIFY_IS_APPROX(static_cast<float>(tensor(i,j,k,l)), static_cast<float>(tensor_after_fft_ifft(i,j,k,l)));
+        }
+      }
+    }
+  }
+}
+
+void test_cxx11_tensor_ifft() {
+  CALL_SUBTEST(test_1D_fft_ifft_invariant<ColMajor>(4));
+  CALL_SUBTEST(test_1D_fft_ifft_invariant<ColMajor>(16));
+  CALL_SUBTEST(test_1D_fft_ifft_invariant<ColMajor>(32));
+  CALL_SUBTEST(test_1D_fft_ifft_invariant<ColMajor>(1024*1024));
+
+  CALL_SUBTEST(test_2D_fft_ifft_invariant<ColMajor>(4,4));
+  CALL_SUBTEST(test_2D_fft_ifft_invariant<ColMajor>(8,16));
+  CALL_SUBTEST(test_2D_fft_ifft_invariant<ColMajor>(16,32));
+  CALL_SUBTEST(test_2D_fft_ifft_invariant<ColMajor>(1024,1024));
+
+  CALL_SUBTEST(test_3D_fft_ifft_invariant<ColMajor>(4,4,4));
+  CALL_SUBTEST(test_3D_fft_ifft_invariant<ColMajor>(8,16,32));
+  CALL_SUBTEST(test_3D_fft_ifft_invariant<ColMajor>(16,4,8));
+  CALL_SUBTEST(test_3D_fft_ifft_invariant<ColMajor>(256,256,256));
+
+  CALL_SUBTEST(test_sub_fft_ifft_invariant<ColMajor>(4,4,4,4));
+  CALL_SUBTEST(test_sub_fft_ifft_invariant<ColMajor>(8,16,32,64));
+  CALL_SUBTEST(test_sub_fft_ifft_invariant<ColMajor>(16,4,8,12));
+  CALL_SUBTEST(test_sub_fft_ifft_invariant<ColMajor>(64,64,64,64));
+}
diff --git a/unsupported/test/cxx11_tensor_image_patch.cpp b/unsupported/test/cxx11_tensor_image_patch.cpp
new file mode 100644
index 000000000..475c59651
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_image_patch.cpp
@@ -0,0 +1,757 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+void test_simple_patch()
+{
+  Tensor<float, 4> tensor(2,3,5,7);
+  tensor.setRandom();
+  Tensor<float, 4, RowMajor> tensor_row_major = tensor.swap_layout();
+  VERIFY_IS_EQUAL(tensor.dimension(0), tensor_row_major.dimension(3));
+  VERIFY_IS_EQUAL(tensor.dimension(1), tensor_row_major.dimension(2));
+  VERIFY_IS_EQUAL(tensor.dimension(2), tensor_row_major.dimension(1));
+  VERIFY_IS_EQUAL(tensor.dimension(3), tensor_row_major.dimension(0));
+
+  // Single pixel patch: ColMajor
+  Tensor<float, 5> single_pixel_patch;
+  single_pixel_patch = tensor.extract_image_patches(1, 1);
+  VERIFY_IS_EQUAL(single_pixel_patch.dimension(0), 2);
+  VERIFY_IS_EQUAL(single_pixel_patch.dimension(1), 1);
+  VERIFY_IS_EQUAL(single_pixel_patch.dimension(2), 1);
+  VERIFY_IS_EQUAL(single_pixel_patch.dimension(3), 3*5);
+  VERIFY_IS_EQUAL(single_pixel_patch.dimension(4), 7);
+
+  // Single pixel patch: RowMajor
+  Tensor<float, 5, RowMajor> single_pixel_patch_row_major;
+  single_pixel_patch_row_major = tensor_row_major.extract_image_patches(1, 1);
+  VERIFY_IS_EQUAL(single_pixel_patch_row_major.dimension(0), 7);
+  VERIFY_IS_EQUAL(single_pixel_patch_row_major.dimension(1), 3*5);
+  VERIFY_IS_EQUAL(single_pixel_patch_row_major.dimension(2), 1);
+  VERIFY_IS_EQUAL(single_pixel_patch_row_major.dimension(3), 1);
+  VERIFY_IS_EQUAL(single_pixel_patch_row_major.dimension(4), 2);
+
+  for (int i = 0; i < tensor.size(); ++i) {
+    // ColMajor
+    if (tensor.data()[i] != single_pixel_patch.data()[i]) {
+      std::cout << "Mismatch detected at index " << i << " : "
+           << tensor.data()[i] << " vs " << single_pixel_patch.data()[i]
+           << std::endl;
+    }
+    VERIFY_IS_EQUAL(single_pixel_patch.data()[i], tensor.data()[i]);
+    // RowMajor
+    if (tensor_row_major.data()[i] != single_pixel_patch_row_major.data()[i]) {
+      std::cout << "Mismatch detected at index " << i << " : "
+           << tensor.data()[i] << " vs "
+           << single_pixel_patch_row_major.data()[i] << std::endl;
+    }
+    VERIFY_IS_EQUAL(single_pixel_patch_row_major.data()[i],
+                    tensor_row_major.data()[i]);
+    VERIFY_IS_EQUAL(tensor.data()[i], tensor_row_major.data()[i]);
+    VERIFY_IS_EQUAL(single_pixel_patch.data()[i],
+                    single_pixel_patch_row_major.data()[i]);
+  }
+
+  // Entire image patch: ColMajor
+  Tensor<float, 5> entire_image_patch;
+  entire_image_patch = tensor.extract_image_patches(3, 5);
+  VERIFY_IS_EQUAL(entire_image_patch.dimension(0), 2);
+  VERIFY_IS_EQUAL(entire_image_patch.dimension(1), 3);
+  VERIFY_IS_EQUAL(entire_image_patch.dimension(2), 5);
+  VERIFY_IS_EQUAL(entire_image_patch.dimension(3), 3*5);
+  VERIFY_IS_EQUAL(entire_image_patch.dimension(4), 7);
+
+  // Entire image patch: RowMajor
+  Tensor<float, 5, RowMajor> entire_image_patch_row_major;
+  entire_image_patch_row_major = tensor_row_major.extract_image_patches(3, 5);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(0), 7);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(1), 3*5);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(2), 5);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(3), 3);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(4), 2);
+
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      int patchId = i+3*j;
+      for (int r = 0; r < 3; ++r) {
+        for (int c = 0; c < 5; ++c) {
+          for (int d = 0; d < 2; ++d) {
+            for (int b = 0; b < 7; ++b) {
+              float expected = 0.0f;
+              float expected_row_major = 0.0f;
+              if (r-1+i >= 0 && c-2+j >= 0 && r-1+i < 3 && c-2+j < 5) {
+                expected = tensor(d, r-1+i, c-2+j, b);
+                expected_row_major = tensor_row_major(b, c-2+j, r-1+i, d);
+              }
+              // ColMajor
+              if (entire_image_patch(d, r, c, patchId, b) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(entire_image_patch(d, r, c, patchId, b), expected);
+              // RowMajor
+              if (entire_image_patch_row_major(b, patchId, c, r, d) !=
+                  expected_row_major) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j
+                     << " r=" << r << " c=" << c << " d=" << d << " b=" << b
+                     << std::endl;
+              }
+              VERIFY_IS_EQUAL(entire_image_patch_row_major(b, patchId, c, r, d),
+                              expected_row_major);
+              // Check that ColMajor and RowMajor agree.
+              VERIFY_IS_EQUAL(expected, expected_row_major);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // 2D patch: ColMajor
+  Tensor<float, 5> twod_patch;
+  twod_patch = tensor.extract_image_patches(2, 2);
+  VERIFY_IS_EQUAL(twod_patch.dimension(0), 2);
+  VERIFY_IS_EQUAL(twod_patch.dimension(1), 2);
+  VERIFY_IS_EQUAL(twod_patch.dimension(2), 2);
+  VERIFY_IS_EQUAL(twod_patch.dimension(3), 3*5);
+  VERIFY_IS_EQUAL(twod_patch.dimension(4), 7);
+
+  // 2D patch: RowMajor
+  Tensor<float, 5, RowMajor> twod_patch_row_major;
+  twod_patch_row_major = tensor_row_major.extract_image_patches(2, 2);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(0), 7);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(1), 3*5);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(2), 2);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(3), 2);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(4), 2);
+
+
+  // Based on the calculation described in TensorTraits.h, padding happens to be 0.
+  int row_padding = 0;
+  int col_padding = 0;
+  int stride = 1;
+
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      int patchId = i+3*j;
+      for (int r = 0; r < 2; ++r) {
+        for (int c = 0; c < 2; ++c) {
+          for (int d = 0; d < 2; ++d) {
+            for (int b = 0; b < 7; ++b) {
+              float expected = 0.0f;
+              float expected_row_major = 0.0f;
+              int row_offset = r*stride + i - row_padding;
+              int col_offset = c*stride + j - col_padding;
+              // ColMajor
+              if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor.dimension(1) && col_offset < tensor.dimension(2)) {
+                expected = tensor(d, row_offset, col_offset, b);
+              }
+              if (twod_patch(d, r, c, patchId, b) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(twod_patch(d, r, c, patchId, b), expected);
+
+              // RowMajor
+              if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_row_major.dimension(2) && col_offset < tensor_row_major.dimension(1)) {
+                expected_row_major = tensor_row_major(b, col_offset, row_offset, d);
+
+              }
+              if (twod_patch_row_major(b, patchId, c, r, d) != expected_row_major) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(twod_patch_row_major(b, patchId, c, r, d), expected_row_major);
+              // Check that ColMajor and RowMajor agree.
+              VERIFY_IS_EQUAL(expected, expected_row_major);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+// Verifies VALID padding (no padding) with incrementing values.
+void test_patch_padding_valid()
+{
+  int input_depth = 3;
+  int input_rows = 3;
+  int input_cols = 3;
+  int input_batches = 1;
+  int ksize = 2;  // Corresponds to the Rows and Cols for tensor.extract_image_patches<>.
+  int stride = 2;  // Only same stride is supported.
+  Tensor<float, 4> tensor(input_depth, input_rows, input_cols, input_batches);
+  // Initializes tensor with incrementing numbers.
+  for (int i = 0; i < tensor.size(); ++i) {
+    tensor.data()[i] = i + 1;
+  }
+  // ColMajor
+  Tensor<float, 5> result = tensor.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID);
+
+  VERIFY_IS_EQUAL(result.dimension(0), input_depth);  // depth
+  VERIFY_IS_EQUAL(result.dimension(1), ksize);  // kernel rows
+  VERIFY_IS_EQUAL(result.dimension(2), ksize);  // kernel cols
+  VERIFY_IS_EQUAL(result.dimension(3), 1);  // number of patches
+  VERIFY_IS_EQUAL(result.dimension(4), input_batches);  // number of batches
+
+  // RowMajor
+  Tensor<float, 4, RowMajor> tensor_row_major = tensor.swap_layout();
+  VERIFY_IS_EQUAL(tensor.dimension(0), tensor_row_major.dimension(3));
+  VERIFY_IS_EQUAL(tensor.dimension(1), tensor_row_major.dimension(2));
+  VERIFY_IS_EQUAL(tensor.dimension(2), tensor_row_major.dimension(1));
+  VERIFY_IS_EQUAL(tensor.dimension(3), tensor_row_major.dimension(0));
+
+  Tensor<float, 5, RowMajor> result_row_major = tensor_row_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID);
+  VERIFY_IS_EQUAL(result.dimension(0), result_row_major.dimension(4));
+  VERIFY_IS_EQUAL(result.dimension(1), result_row_major.dimension(3));
+  VERIFY_IS_EQUAL(result.dimension(2), result_row_major.dimension(2));
+  VERIFY_IS_EQUAL(result.dimension(3), result_row_major.dimension(1));
+  VERIFY_IS_EQUAL(result.dimension(4), result_row_major.dimension(0));
+
+  // No padding is carried out.
+  int row_padding = 0;
+  int col_padding = 0;
+
+  for (int i = 0; (i+stride+ksize-1) < input_rows; i += stride) {  // input rows
+    for (int j = 0; (j+stride+ksize-1) < input_cols; j += stride) {  // input cols
+      int patchId = i+input_rows*j;
+      for (int r = 0; r < ksize; ++r) {  // patch rows
+        for (int c = 0; c < ksize; ++c) {  // patch cols
+          for (int d = 0; d < input_depth; ++d) {  // depth
+            for (int b = 0; b < input_batches; ++b) {  // batch
+              float expected = 0.0f;
+              float expected_row_major = 0.0f;
+              int row_offset = r + i - row_padding;
+              int col_offset = c + j - col_padding;
+              if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) {
+                expected = tensor(d, row_offset, col_offset, b);
+                expected_row_major = tensor_row_major(b, col_offset, row_offset, d);
+              }
+              // ColMajor
+              if (result(d, r, c, patchId, b) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(result(d, r, c, patchId, b), expected);
+              // RowMajor
+              if (result_row_major(b, patchId, c, r, d) != expected_row_major) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(result_row_major(b, patchId, c, r, d), expected_row_major);
+              // Check that ColMajor and RowMajor agree.
+              VERIFY_IS_EQUAL(expected, expected_row_major);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+// Verifies VALID padding (no padding) with the same value.
+void test_patch_padding_valid_same_value()
+{
+  int input_depth = 1;
+  int input_rows = 5;
+  int input_cols = 5;
+  int input_batches = 2;
+  int ksize = 3;  // Corresponds to the Rows and Cols for tensor.extract_image_patches<>.
+  int stride = 2;  // Only same stride is supported.
+  // ColMajor
+  Tensor<float, 4> tensor(input_depth, input_rows, input_cols, input_batches);
+  tensor = tensor.constant(11.0f);
+  Tensor<float, 5> result = tensor.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID);
+
+  VERIFY_IS_EQUAL(result.dimension(0), input_depth);  // depth
+  VERIFY_IS_EQUAL(result.dimension(1), ksize);  // kernel rows
+  VERIFY_IS_EQUAL(result.dimension(2), ksize);  // kernel cols
+  VERIFY_IS_EQUAL(result.dimension(3), 4);  // number of patches
+  VERIFY_IS_EQUAL(result.dimension(4), input_batches);  // number of batches
+
+  // RowMajor
+  Tensor<float, 4, RowMajor> tensor_row_major = tensor.swap_layout();
+  VERIFY_IS_EQUAL(tensor.dimension(0), tensor_row_major.dimension(3));
+  VERIFY_IS_EQUAL(tensor.dimension(1), tensor_row_major.dimension(2));
+  VERIFY_IS_EQUAL(tensor.dimension(2), tensor_row_major.dimension(1));
+  VERIFY_IS_EQUAL(tensor.dimension(3), tensor_row_major.dimension(0));
+
+  Tensor<float, 5, RowMajor> result_row_major = tensor_row_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID);
+  VERIFY_IS_EQUAL(result.dimension(0), result_row_major.dimension(4));
+  VERIFY_IS_EQUAL(result.dimension(1), result_row_major.dimension(3));
+  VERIFY_IS_EQUAL(result.dimension(2), result_row_major.dimension(2));
+  VERIFY_IS_EQUAL(result.dimension(3), result_row_major.dimension(1));
+  VERIFY_IS_EQUAL(result.dimension(4), result_row_major.dimension(0));
+
+  // No padding is carried out.
+  int row_padding = 0;
+  int col_padding = 0;
+
+  for (int i = 0; (i+stride+ksize-1) <= input_rows; i += stride) {  // input rows
+    for (int j = 0; (j+stride+ksize-1) <= input_cols; j += stride) {  // input cols
+      int patchId = i+input_rows*j;
+      for (int r = 0; r < ksize; ++r) {  // patch rows
+        for (int c = 0; c < ksize; ++c) {  // patch cols
+          for (int d = 0; d < input_depth; ++d) {  // depth
+            for (int b = 0; b < input_batches; ++b) {  // batch
+              float expected = 0.0f;
+              float expected_row_major = 0.0f;
+              int row_offset = r + i - row_padding;
+              int col_offset = c + j - col_padding;
+              if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) {
+                expected = tensor(d, row_offset, col_offset, b);
+                expected_row_major = tensor_row_major(b, col_offset, row_offset, d);
+              }
+              // ColMajor
+              if (result(d, r, c, patchId, b) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(result(d, r, c, patchId, b), expected);
+              // RowMajor
+              if (result_row_major(b, patchId, c, r, d) != expected_row_major) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(result_row_major(b, patchId, c, r, d), expected_row_major);
+              // Check that ColMajor and RowMajor agree.
+              VERIFY_IS_EQUAL(expected, expected_row_major);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+// Verifies SAME padding.
+void test_patch_padding_same()
+{
+  int input_depth = 3;
+  int input_rows = 4;
+  int input_cols = 2;
+  int input_batches = 1;
+  int ksize = 2;  // Corresponds to the Rows and Cols for tensor.extract_image_patches<>.
+  int stride = 2;  // Only same stride is supported.
+  // ColMajor
+  Tensor<float, 4> tensor(input_depth, input_rows, input_cols, input_batches);
+  // Initializes tensor with incrementing numbers.
+  for (int i = 0; i < tensor.size(); ++i) {
+    tensor.data()[i] = i + 1;
+  }
+  Tensor<float, 5> result = tensor.extract_image_patches(ksize, ksize, stride, stride, PADDING_SAME);
+
+  VERIFY_IS_EQUAL(result.dimension(0), input_depth);  // depth
+  VERIFY_IS_EQUAL(result.dimension(1), ksize);  // kernel rows
+  VERIFY_IS_EQUAL(result.dimension(2), ksize);  // kernel cols
+  VERIFY_IS_EQUAL(result.dimension(3), 2);  // number of patches
+  VERIFY_IS_EQUAL(result.dimension(4), input_batches);  // number of batches
+
+  // RowMajor
+  Tensor<float, 4, RowMajor> tensor_row_major = tensor.swap_layout();
+  VERIFY_IS_EQUAL(tensor.dimension(0), tensor_row_major.dimension(3));
+  VERIFY_IS_EQUAL(tensor.dimension(1), tensor_row_major.dimension(2));
+  VERIFY_IS_EQUAL(tensor.dimension(2), tensor_row_major.dimension(1));
+  VERIFY_IS_EQUAL(tensor.dimension(3), tensor_row_major.dimension(0));
+
+  Tensor<float, 5, RowMajor> result_row_major = tensor_row_major.extract_image_patches(ksize, ksize, stride, stride, PADDING_SAME);
+  VERIFY_IS_EQUAL(result.dimension(0), result_row_major.dimension(4));
+  VERIFY_IS_EQUAL(result.dimension(1), result_row_major.dimension(3));
+  VERIFY_IS_EQUAL(result.dimension(2), result_row_major.dimension(2));
+  VERIFY_IS_EQUAL(result.dimension(3), result_row_major.dimension(1));
+  VERIFY_IS_EQUAL(result.dimension(4), result_row_major.dimension(0));
+
+  // Based on the calculation described in TensorTraits.h, padding happens to be
+  // 0.
+  int row_padding = 0;
+  int col_padding = 0;
+
+  for (int i = 0; (i+stride+ksize-1) <= input_rows; i += stride) {  // input rows
+    for (int j = 0; (j+stride+ksize-1) <= input_cols; j += stride) {  // input cols
+      int patchId = i+input_rows*j;
+      for (int r = 0; r < ksize; ++r) {  // patch rows
+        for (int c = 0; c < ksize; ++c) {  // patch cols
+          for (int d = 0; d < input_depth; ++d) {  // depth
+            for (int b = 0; b < input_batches; ++b) {  // batch
+              float expected = 0.0f;
+              float expected_row_major = 0.0f;
+              int row_offset = r*stride + i - row_padding;
+              int col_offset = c*stride + j - col_padding;
+              if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) {
+                expected = tensor(d, row_offset, col_offset, b);
+                expected_row_major = tensor_row_major(b, col_offset, row_offset, d);
+              }
+              // ColMajor
+              if (result(d, r, c, patchId, b) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(result(d, r, c, patchId, b), expected);
+              // RowMajor
+              if (result_row_major(b, patchId, c, r, d) != expected_row_major) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(result_row_major(b, patchId, c, r, d), expected_row_major);
+              // Check that ColMajor and RowMajor agree.
+              VERIFY_IS_EQUAL(expected, expected_row_major);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+void test_patch_no_extra_dim()
+{
+  Tensor<float, 3> tensor(2,3,5);
+  tensor.setRandom();
+  Tensor<float, 3, RowMajor> tensor_row_major = tensor.swap_layout();
+  VERIFY_IS_EQUAL(tensor.dimension(0), tensor_row_major.dimension(2));
+  VERIFY_IS_EQUAL(tensor.dimension(1), tensor_row_major.dimension(1));
+  VERIFY_IS_EQUAL(tensor.dimension(2), tensor_row_major.dimension(0));
+
+  // Single pixel patch: ColMajor
+  Tensor<float, 4> single_pixel_patch;
+  single_pixel_patch = tensor.extract_image_patches(1, 1);
+  VERIFY_IS_EQUAL(single_pixel_patch.dimension(0), 2);
+  VERIFY_IS_EQUAL(single_pixel_patch.dimension(1), 1);
+  VERIFY_IS_EQUAL(single_pixel_patch.dimension(2), 1);
+  VERIFY_IS_EQUAL(single_pixel_patch.dimension(3), 3*5);
+
+  // Single pixel patch: RowMajor
+  Tensor<float, 4, RowMajor> single_pixel_patch_row_major;
+  single_pixel_patch_row_major = tensor_row_major.extract_image_patches(1, 1);
+  VERIFY_IS_EQUAL(single_pixel_patch_row_major.dimension(0), 3*5);
+  VERIFY_IS_EQUAL(single_pixel_patch_row_major.dimension(1), 1);
+  VERIFY_IS_EQUAL(single_pixel_patch_row_major.dimension(2), 1);
+  VERIFY_IS_EQUAL(single_pixel_patch_row_major.dimension(3), 2);
+
+  for (int i = 0; i < tensor.size(); ++i) {
+    // ColMajor
+    if (tensor.data()[i] != single_pixel_patch.data()[i]) {
+      std::cout << "Mismatch detected at index " << i << " : " << tensor.data()[i] << " vs " << single_pixel_patch.data()[i] << std::endl;
+    }
+    VERIFY_IS_EQUAL(single_pixel_patch.data()[i], tensor.data()[i]);
+    // RowMajor
+    if (tensor_row_major.data()[i] != single_pixel_patch_row_major.data()[i]) {
+      std::cout << "Mismatch detected at index " << i << " : "
+           << tensor.data()[i] << " vs "
+           << single_pixel_patch_row_major.data()[i] << std::endl;
+    }
+    VERIFY_IS_EQUAL(single_pixel_patch_row_major.data()[i],
+                    tensor_row_major.data()[i]);
+    VERIFY_IS_EQUAL(tensor.data()[i], tensor_row_major.data()[i]);
+    VERIFY_IS_EQUAL(single_pixel_patch.data()[i],
+                    single_pixel_patch_row_major.data()[i]);
+  }
+
+  // Entire image patch: ColMajor
+  Tensor<float, 4> entire_image_patch;
+  entire_image_patch = tensor.extract_image_patches(3, 5);
+  VERIFY_IS_EQUAL(entire_image_patch.dimension(0), 2);
+  VERIFY_IS_EQUAL(entire_image_patch.dimension(1), 3);
+  VERIFY_IS_EQUAL(entire_image_patch.dimension(2), 5);
+  VERIFY_IS_EQUAL(entire_image_patch.dimension(3), 3*5);
+
+  // Entire image patch: RowMajor
+  Tensor<float, 4, RowMajor> entire_image_patch_row_major;
+  entire_image_patch_row_major = tensor_row_major.extract_image_patches(3, 5);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(0), 3*5);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(1), 5);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(2), 3);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(3), 2);
+
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      int patchId = i+3*j;
+      for (int r = 0; r < 3; ++r) {
+        for (int c = 0; c < 5; ++c) {
+          for (int d = 0; d < 2; ++d) {
+            float expected = 0.0f;
+            float expected_row_major = 0.0f;
+            if (r-1+i >= 0 && c-2+j >= 0 && r-1+i < 3 && c-2+j < 5) {
+              expected = tensor(d, r-1+i, c-2+j);
+              expected_row_major = tensor_row_major(c-2+j, r-1+i, d);
+            }
+            // ColMajor
+            if (entire_image_patch(d, r, c, patchId) != expected) {
+              std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl;
+            }
+            VERIFY_IS_EQUAL(entire_image_patch(d, r, c, patchId), expected);
+            // RowMajor
+            if (entire_image_patch_row_major(patchId, c, r, d) !=
+                expected_row_major) {
+              std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl;
+            }
+            VERIFY_IS_EQUAL(entire_image_patch_row_major(patchId, c, r, d),
+                            expected_row_major);
+            // Check that ColMajor and RowMajor agree.
+            VERIFY_IS_EQUAL(expected, expected_row_major);
+          }
+        }
+      }
+    }
+  }
+
+  // 2D patch: ColMajor
+  Tensor<float, 4> twod_patch;
+  twod_patch = tensor.extract_image_patches(2, 2);
+  VERIFY_IS_EQUAL(twod_patch.dimension(0), 2);
+  VERIFY_IS_EQUAL(twod_patch.dimension(1), 2);
+  VERIFY_IS_EQUAL(twod_patch.dimension(2), 2);
+  VERIFY_IS_EQUAL(twod_patch.dimension(3), 3*5);
+
+  // 2D patch: RowMajor
+  Tensor<float, 4, RowMajor> twod_patch_row_major;
+  twod_patch_row_major = tensor_row_major.extract_image_patches(2, 2);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(0), 3*5);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(1), 2);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(2), 2);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(3), 2);
+
+  // Based on the calculation described in TensorTraits.h, padding happens to be 0.
+  int row_padding = 0;
+  int col_padding = 0;
+  int stride = 1;
+
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      int patchId = i+3*j;
+      for (int r = 0; r < 2; ++r) {
+        for (int c = 0; c < 2; ++c) {
+          for (int d = 0; d < 2; ++d) {
+            float expected = 0.0f;
+            float expected_row_major = 0.0f;
+            int row_offset = r*stride + i - row_padding;
+            int col_offset = c*stride + j - col_padding;
+            // ColMajor
+            if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor.dimension(1) && col_offset < tensor.dimension(2)) {
+              expected = tensor(d, row_offset, col_offset);
+            }
+            if (twod_patch(d, r, c, patchId) != expected) {
+              std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl;
+            }
+            VERIFY_IS_EQUAL(twod_patch(d, r, c, patchId), expected);
+            // RowMajor
+            if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_row_major.dimension(1) && col_offset < tensor_row_major.dimension(0)) {
+              expected_row_major = tensor_row_major(col_offset, row_offset, d);
+            }
+            if (twod_patch_row_major(patchId, c, r, d) != expected_row_major) {
+              std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl;
+            }
+            VERIFY_IS_EQUAL(twod_patch_row_major(patchId, c, r, d), expected_row_major);
+            // Check that ColMajor and RowMajor agree.
+            VERIFY_IS_EQUAL(expected, expected_row_major);
+          }
+        }
+      }
+    }
+  }
+}
+
+void test_imagenet_patches()
+{
+  // Test the code on typical configurations used by the 'imagenet' benchmarks at
+  // https://github.com/soumith/convnet-benchmarks
+  // ColMajor
+  Tensor<float, 4> l_in(3, 128, 128, 16);
+  l_in.setRandom();
+  Tensor<float, 5> l_out = l_in.extract_image_patches(11, 11);
+  VERIFY_IS_EQUAL(l_out.dimension(0), 3);
+  VERIFY_IS_EQUAL(l_out.dimension(1), 11);
+  VERIFY_IS_EQUAL(l_out.dimension(2), 11);
+  VERIFY_IS_EQUAL(l_out.dimension(3), 128*128);
+  VERIFY_IS_EQUAL(l_out.dimension(4), 16);
+
+  // RowMajor
+  Tensor<float, 5, RowMajor> l_out_row_major = l_in.swap_layout().extract_image_patches(11, 11);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 16);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 128*128);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 11);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 11);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(4), 3);
+
+  for (int b = 0; b < 16; ++b) {
+    for (int i = 0; i < 128; ++i) {
+      for (int j = 0; j < 128; ++j) {
+        int patchId = i+128*j;
+        for (int c = 0; c < 11; ++c) {
+          for (int r = 0; r < 11; ++r) {
+            for (int d = 0; d < 3; ++d) {
+              float expected = 0.0f;
+              if (r-5+i >= 0 && c-5+j >= 0 && r-5+i < 128 && c-5+j < 128) {
+                expected = l_in(d, r-5+i, c-5+j, b);
+              }
+              // ColMajor
+              if (l_out(d, r, c, patchId, b) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected);
+              // RowMajor
+              if (l_out_row_major(b, patchId, c, r, d) !=
+                  expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j
+                     << " r=" << r << " c=" << c << " d=" << d << " b=" << b
+                     << std::endl;
+              }
+              VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d),
+                              expected);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // ColMajor
+  l_in.resize(16, 64, 64, 32);
+  l_in.setRandom();
+  l_out = l_in.extract_image_patches(9, 9);
+  VERIFY_IS_EQUAL(l_out.dimension(0), 16);
+  VERIFY_IS_EQUAL(l_out.dimension(1), 9);
+  VERIFY_IS_EQUAL(l_out.dimension(2), 9);
+  VERIFY_IS_EQUAL(l_out.dimension(3), 64*64);
+  VERIFY_IS_EQUAL(l_out.dimension(4), 32);
+
+  // RowMajor
+  l_out_row_major = l_in.swap_layout().extract_image_patches(9, 9);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 64*64);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 9);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 9);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(4), 16);
+
+  for (int b = 0; b < 32; ++b) {
+    for (int i = 0; i < 64; ++i) {
+      for (int j = 0; j < 64; ++j) {
+        int patchId = i+64*j;
+        for (int c = 0; c < 9; ++c) {
+          for (int r = 0; r < 9; ++r) {
+            for (int d = 0; d < 16; ++d) {
+              float expected = 0.0f;
+              if (r-4+i >= 0 && c-4+j >= 0 && r-4+i < 64 && c-4+j < 64) {
+                expected = l_in(d, r-4+i, c-4+j, b);
+              }
+              // ColMajor
+              if (l_out(d, r, c, patchId, b) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected);
+              // RowMajor
+              if (l_out_row_major(b, patchId, c, r, d) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // ColMajor
+  l_in.resize(32, 16, 16, 32);
+  l_in.setRandom();
+  l_out = l_in.extract_image_patches(7, 7);
+  VERIFY_IS_EQUAL(l_out.dimension(0), 32);
+  VERIFY_IS_EQUAL(l_out.dimension(1), 7);
+  VERIFY_IS_EQUAL(l_out.dimension(2), 7);
+  VERIFY_IS_EQUAL(l_out.dimension(3), 16*16);
+  VERIFY_IS_EQUAL(l_out.dimension(4), 32);
+
+  // RowMajor
+  l_out_row_major = l_in.swap_layout().extract_image_patches(7, 7);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 16*16);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 7);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 7);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(4), 32);
+
+  for (int b = 0; b < 32; ++b) {
+    for (int i = 0; i < 16; ++i) {
+      for (int j = 0; j < 16; ++j) {
+        int patchId = i+16*j;
+        for (int c = 0; c < 7; ++c) {
+          for (int r = 0; r < 7; ++r) {
+            for (int d = 0; d < 32; ++d) {
+              float expected = 0.0f;
+              if (r-3+i >= 0 && c-3+j >= 0 && r-3+i < 16 && c-3+j < 16) {
+                expected = l_in(d, r-3+i, c-3+j, b);
+              }
+              // ColMajor
+              if (l_out(d, r, c, patchId, b) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected);
+              // RowMajor
+              if (l_out_row_major(b, patchId, c, r, d) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // ColMajor
+  l_in.resize(64, 13, 13, 32);
+  l_in.setRandom();
+  l_out = l_in.extract_image_patches(3, 3);
+  VERIFY_IS_EQUAL(l_out.dimension(0), 64);
+  VERIFY_IS_EQUAL(l_out.dimension(1), 3);
+  VERIFY_IS_EQUAL(l_out.dimension(2), 3);
+  VERIFY_IS_EQUAL(l_out.dimension(3), 13*13);
+  VERIFY_IS_EQUAL(l_out.dimension(4), 32);
+
+  // RowMajor
+  l_out_row_major = l_in.swap_layout().extract_image_patches(3, 3);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 13*13);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 3);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 3);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(4), 64);
+
+  for (int b = 0; b < 32; ++b) {
+    for (int i = 0; i < 13; ++i) {
+      for (int j = 0; j < 13; ++j) {
+        int patchId = i+13*j;
+        for (int c = 0; c < 3; ++c) {
+          for (int r = 0; r < 3; ++r) {
+            for (int d = 0; d < 64; ++d) {
+              float expected = 0.0f;
+              if (r-1+i >= 0 && c-1+j >= 0 && r-1+i < 13 && c-1+j < 13) {
+                expected = l_in(d, r-1+i, c-1+j, b);
+              }
+              // ColMajor
+              if (l_out(d, r, c, patchId, b) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected);
+              // RowMajor
+              if (l_out_row_major(b, patchId, c, r, d) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+void test_cxx11_tensor_image_patch()
+{
+  CALL_SUBTEST_1(test_simple_patch());
+  CALL_SUBTEST_2(test_patch_no_extra_dim());
+  CALL_SUBTEST_3(test_patch_padding_valid());
+  CALL_SUBTEST_4(test_patch_padding_valid_same_value());
+  CALL_SUBTEST_5(test_patch_padding_same());
+  CALL_SUBTEST_6(test_imagenet_patches());
+}
diff --git a/unsupported/test/cxx11_tensor_index_list.cpp b/unsupported/test/cxx11_tensor_index_list.cpp
new file mode 100644
index 000000000..4cf5df666
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_index_list.cpp
@@ -0,0 +1,386 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+#ifdef EIGEN_HAS_INDEX_LIST
+
+static void test_static_index_list()
+{
+  Tensor<float, 4> tensor(2,3,5,7);
+  tensor.setRandom();
+
+  constexpr auto reduction_axis = make_index_list(0, 1, 2);
+  VERIFY_IS_EQUAL(internal::array_get<0>(reduction_axis), 0);
+  VERIFY_IS_EQUAL(internal::array_get<1>(reduction_axis), 1);
+  VERIFY_IS_EQUAL(internal::array_get<2>(reduction_axis), 2);
+  VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[0]), 0);
+  VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[1]), 1);
+  VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[2]), 2);
+
+  EIGEN_STATIC_ASSERT((internal::array_get<0>(reduction_axis) == 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::array_get<1>(reduction_axis) == 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::array_get<2>(reduction_axis) == 2), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  Tensor<float, 1> result = tensor.sum(reduction_axis);
+  for (int i = 0; i < result.size(); ++i) {
+    float expected = 0.0f;
+    for (int j = 0; j < 2; ++j) {
+      for (int k = 0; k < 3; ++k) {
+        for (int l = 0; l < 5; ++l) {
+          expected += tensor(j,k,l,i);
+        }
+      }
+    }
+    VERIFY_IS_APPROX(result(i), expected);
+  }
+}
+
+
+static void test_type2index_list()
+{
+  Tensor<float, 5> tensor(2,3,5,7,11);
+  tensor.setRandom();
+  tensor += tensor.constant(10.0f);
+
+  typedef Eigen::IndexList<Eigen::type2index<0>> Dims0;
+  typedef Eigen::IndexList<Eigen::type2index<0>, Eigen::type2index<1>> Dims1;
+  typedef Eigen::IndexList<Eigen::type2index<0>, Eigen::type2index<1>, Eigen::type2index<2>> Dims2;
+  typedef Eigen::IndexList<Eigen::type2index<0>, Eigen::type2index<1>, Eigen::type2index<2>, Eigen::type2index<3>> Dims3;
+  typedef Eigen::IndexList<Eigen::type2index<0>, Eigen::type2index<1>, Eigen::type2index<2>, Eigen::type2index<3>, Eigen::type2index<4>> Dims4;
+
+#if 0
+  EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<Dims0>() == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<Dims1>() == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<Dims2>() == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<Dims3>() == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<Dims4>() == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+#endif
+
+  EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims0, 1, ColMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims1, 2, ColMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims2, 3, ColMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims3, 4, ColMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims4, 5, ColMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims0, 1, RowMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims1, 2, RowMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims2, 3, RowMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims3, 4, RowMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims4, 5, RowMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  const Dims0 reduction_axis0;
+  Tensor<float, 4> result0 = tensor.sum(reduction_axis0);
+  for (int m = 0; m < 11; ++m) {
+    for (int l = 0; l < 7; ++l) {
+      for (int k = 0; k < 5; ++k) {
+        for (int j = 0; j < 3; ++j) {
+          float expected = 0.0f;
+          for (int i = 0; i < 2; ++i) {
+            expected += tensor(i,j,k,l,m);
+          }
+          VERIFY_IS_APPROX(result0(j,k,l,m), expected);
+        }
+      }
+    }
+  }
+
+  const Dims1 reduction_axis1;
+  Tensor<float, 3> result1 = tensor.sum(reduction_axis1);
+  for (int m = 0; m < 11; ++m) {
+    for (int l = 0; l < 7; ++l) {
+      for (int k = 0; k < 5; ++k) {
+        float expected = 0.0f;
+        for (int j = 0; j < 3; ++j) {
+          for (int i = 0; i < 2; ++i) {
+            expected += tensor(i,j,k,l,m);
+          }
+        }
+        VERIFY_IS_APPROX(result1(k,l,m), expected);
+      }
+    }
+  }
+
+  const Dims2 reduction_axis2;
+  Tensor<float, 2> result2 = tensor.sum(reduction_axis2);
+  for (int m = 0; m < 11; ++m) {
+    for (int l = 0; l < 7; ++l) {
+      float expected = 0.0f;
+      for (int k = 0; k < 5; ++k) {
+        for (int j = 0; j < 3; ++j) {
+          for (int i = 0; i < 2; ++i) {
+            expected += tensor(i,j,k,l,m);
+          }
+        }
+      }
+      VERIFY_IS_APPROX(result2(l,m), expected);
+    }
+  }
+
+  const Dims3 reduction_axis3;
+  Tensor<float, 1> result3 = tensor.sum(reduction_axis3);
+  for (int m = 0; m < 11; ++m) {
+    float expected = 0.0f;
+    for (int l = 0; l < 7; ++l) {
+      for (int k = 0; k < 5; ++k) {
+        for (int j = 0; j < 3; ++j) {
+          for (int i = 0; i < 2; ++i) {
+            expected += tensor(i,j,k,l,m);
+          }
+        }
+      }
+    }
+    VERIFY_IS_APPROX(result3(m), expected);
+  }
+
+  const Dims4 reduction_axis4;
+  Tensor<float, 0> result4 = tensor.sum(reduction_axis4);
+  float expected = 0.0f;
+  for (int m = 0; m < 11; ++m) {
+    for (int l = 0; l < 7; ++l) {
+      for (int k = 0; k < 5; ++k) {
+        for (int j = 0; j < 3; ++j) {
+          for (int i = 0; i < 2; ++i) {
+            expected += tensor(i,j,k,l,m);
+          }
+        }
+      }
+    }
+  }
+  VERIFY_IS_APPROX(result4(), expected);
+}
+
+
+static void test_type2indexpair_list()
+{
+  Tensor<float, 5> tensor(2,3,5,7,11);
+  tensor.setRandom();
+  tensor += tensor.constant(10.0f);
+
+  typedef Eigen::IndexPairList<Eigen::type2indexpair<0,10>> Dims0;
+  typedef Eigen::IndexPairList<Eigen::type2indexpair<0,10>, Eigen::type2indexpair<1,11>, Eigen::type2indexpair<2,12>> Dims2_a;
+  typedef Eigen::IndexPairList<Eigen::type2indexpair<0,10>, Eigen::IndexPair<DenseIndex>, Eigen::type2indexpair<2,12>> Dims2_b;
+  typedef Eigen::IndexPairList<Eigen::IndexPair<DenseIndex>, Eigen::type2indexpair<1,11>, Eigen::IndexPair<DenseIndex>> Dims2_c;
+
+  Dims0 d0;
+  Dims2_a d2_a;
+
+  Dims2_b d2_b;
+  d2_b.set(1, Eigen::IndexPair<DenseIndex>(1,11));
+
+  Dims2_c d2_c;
+  d2_c.set(0, Eigen::IndexPair<DenseIndex>(Eigen::IndexPair<DenseIndex>(0,10)));
+  d2_c.set(1, Eigen::IndexPair<DenseIndex>(1,11));  // setting type2indexpair to correct value.
+  d2_c.set(2, Eigen::IndexPair<DenseIndex>(2,12));
+
+  VERIFY_IS_EQUAL(d2_a[0].first, 0);
+  VERIFY_IS_EQUAL(d2_a[0].second, 10);
+  VERIFY_IS_EQUAL(d2_a[1].first, 1);
+  VERIFY_IS_EQUAL(d2_a[1].second, 11);
+  VERIFY_IS_EQUAL(d2_a[2].first, 2);
+  VERIFY_IS_EQUAL(d2_a[2].second, 12);
+
+  VERIFY_IS_EQUAL(d2_b[0].first, 0);
+  VERIFY_IS_EQUAL(d2_b[0].second, 10);
+  VERIFY_IS_EQUAL(d2_b[1].first, 1);
+  VERIFY_IS_EQUAL(d2_b[1].second, 11);
+  VERIFY_IS_EQUAL(d2_b[2].first, 2);
+  VERIFY_IS_EQUAL(d2_b[2].second, 12);
+
+  VERIFY_IS_EQUAL(d2_c[0].first, 0);
+  VERIFY_IS_EQUAL(d2_c[0].second, 10);
+  VERIFY_IS_EQUAL(d2_c[1].first, 1);
+  VERIFY_IS_EQUAL(d2_c[1].second, 11);
+  VERIFY_IS_EQUAL(d2_c[2].first, 2);
+  VERIFY_IS_EQUAL(d2_c[2].second, 12);
+
+  EIGEN_STATIC_ASSERT((d2_a.value_known_statically(0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((d2_a.value_known_statically(1) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((d2_a.value_known_statically(2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  EIGEN_STATIC_ASSERT((d2_b.value_known_statically(0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((d2_b.value_known_statically(1) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((d2_b.value_known_statically(2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  EIGEN_STATIC_ASSERT((d2_c.value_known_statically(0) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((d2_c.value_known_statically(1) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((d2_c.value_known_statically(2) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims0>(0, 0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims0>(0, 1) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_a>(0, 0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_a>(0, 1) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_a>(1, 1) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_a>(1, 2) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_a>(2, 2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_a>(2, 3) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_b>(0, 0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_b>(0, 1) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_b>(1, 1) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_b>(1, 2) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_b>(2, 2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_b>(2, 3) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_c>(0, 0) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_c>(0, 1) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_c>(1, 1) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_c>(1, 2) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_c>(2, 2) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_c>(2, 3) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims0>(0, 10) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims0>(0, 11) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_a>(0, 10) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_a>(0, 11) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_a>(1, 11) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_a>(1, 12) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_a>(2, 12) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_a>(2, 13) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_b>(0, 10) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_b>(0, 11) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_b>(1, 11) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_b>(1, 12) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_b>(2, 12) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_b>(2, 13) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_c>(0, 10) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_c>(0, 11) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_c>(1, 11) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_c>(1, 12) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_c>(2, 12) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_c>(2, 13) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+}
+
+
+static void test_dynamic_index_list()
+{
+  Tensor<float, 4> tensor(2,3,5,7);
+  tensor.setRandom();
+
+  int dim1 = 2;
+  int dim2 = 1;
+  int dim3 = 0;
+
+  auto reduction_axis = make_index_list(dim1, dim2, dim3);
+
+  VERIFY_IS_EQUAL(internal::array_get<0>(reduction_axis), 2);
+  VERIFY_IS_EQUAL(internal::array_get<1>(reduction_axis), 1);
+  VERIFY_IS_EQUAL(internal::array_get<2>(reduction_axis), 0);
+  VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[0]), 2);
+  VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[1]), 1);
+  VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[2]), 0);
+
+  Tensor<float, 1> result = tensor.sum(reduction_axis);
+  for (int i = 0; i < result.size(); ++i) {
+    float expected = 0.0f;
+    for (int j = 0; j < 2; ++j) {
+      for (int k = 0; k < 3; ++k) {
+        for (int l = 0; l < 5; ++l) {
+          expected += tensor(j,k,l,i);
+        }
+      }
+    }
+    VERIFY_IS_APPROX(result(i), expected);
+  }
+}
+
+static void test_mixed_index_list()
+{
+  Tensor<float, 4> tensor(2,3,5,7);
+  tensor.setRandom();
+
+  int dim2 = 1;
+  int dim4 = 3;
+
+  auto reduction_axis = make_index_list(0, dim2, 2, dim4);
+
+  VERIFY_IS_EQUAL(internal::array_get<0>(reduction_axis), 0);
+  VERIFY_IS_EQUAL(internal::array_get<1>(reduction_axis), 1);
+  VERIFY_IS_EQUAL(internal::array_get<2>(reduction_axis), 2);
+  VERIFY_IS_EQUAL(internal::array_get<3>(reduction_axis), 3);
+  VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[0]), 0);
+  VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[1]), 1);
+  VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[2]), 2);
+  VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[3]), 3);
+
+  typedef IndexList<type2index<0>, int, type2index<2>, int> ReductionIndices;
+  ReductionIndices reduction_indices;
+  reduction_indices.set(1, 1);
+  reduction_indices.set(3, 3);
+  EIGEN_STATIC_ASSERT((internal::array_get<0>(reduction_indices) == 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::array_get<2>(reduction_indices) == 2), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::index_known_statically<ReductionIndices>(0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::index_known_statically<ReductionIndices>(2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::index_statically_eq<ReductionIndices>(0, 0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::index_statically_eq<ReductionIndices>(2, 2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+#if 0
+  EIGEN_STATIC_ASSERT((internal::all_indices_known_statically<ReductionIndices>() == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<ReductionIndices>() == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+#endif
+
+  typedef IndexList<type2index<0>, type2index<1>, type2index<2>, type2index<3>> ReductionList;
+  ReductionList reduction_list;
+  EIGEN_STATIC_ASSERT((internal::index_statically_eq<ReductionList>(0, 0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::index_statically_eq<ReductionList>(1, 1) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::index_statically_eq<ReductionList>(2, 2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::index_statically_eq<ReductionList>(3, 3) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+#if 0
+  EIGEN_STATIC_ASSERT((internal::all_indices_known_statically<ReductionList>() == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<ReductionList>() == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+#endif
+
+  Tensor<float, 0> result1 = tensor.sum(reduction_axis);
+  Tensor<float, 0> result2 = tensor.sum(reduction_indices);
+  Tensor<float, 0> result3 = tensor.sum(reduction_list);
+
+  float expected = 0.0f;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          expected += tensor(i,j,k,l);
+        }
+      }
+    }
+  }
+  VERIFY_IS_APPROX(result1(), expected);
+  VERIFY_IS_APPROX(result2(), expected);
+  VERIFY_IS_APPROX(result3(), expected);
+}
+
+
+static void test_dim_check()
+{
+  Eigen::IndexList<Eigen::type2index<1>, int> dim1;
+  dim1.set(1, 2);
+  Eigen::IndexList<Eigen::type2index<1>, int> dim2;
+  dim2.set(1, 2);
+  VERIFY(dimensions_match(dim1, dim2));
+}
+
+
+#endif
+
+void test_cxx11_tensor_index_list()
+{
+#ifdef EIGEN_HAS_INDEX_LIST
+  CALL_SUBTEST(test_static_index_list());
+  CALL_SUBTEST(test_type2index_list());
+  CALL_SUBTEST(test_type2indexpair_list());
+  CALL_SUBTEST(test_dynamic_index_list());
+  CALL_SUBTEST(test_mixed_index_list());
+  CALL_SUBTEST(test_dim_check());
+#endif
+}
diff --git a/unsupported/test/cxx11_tensor_inflation.cpp b/unsupported/test/cxx11_tensor_inflation.cpp
new file mode 100644
index 000000000..4997935e9
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_inflation.cpp
@@ -0,0 +1,81 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Ke Yang <yangke@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template<int DataLayout>
+static void test_simple_inflation()
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+  array<ptrdiff_t, 4> strides;
+
+  strides[0] = 1;
+  strides[1] = 1;
+  strides[2] = 1;
+  strides[3] = 1;
+
+  Tensor<float, 4, DataLayout> no_stride;
+  no_stride = tensor.inflate(strides);
+
+  VERIFY_IS_EQUAL(no_stride.dimension(0), 2);
+  VERIFY_IS_EQUAL(no_stride.dimension(1), 3);
+  VERIFY_IS_EQUAL(no_stride.dimension(2), 5);
+  VERIFY_IS_EQUAL(no_stride.dimension(3), 7);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(i,j,k,l));
+        }
+      }
+    }
+  }
+
+  strides[0] = 2;
+  strides[1] = 4;
+  strides[2] = 2;
+  strides[3] = 3;
+  Tensor<float, 4, DataLayout> inflated;
+  inflated = tensor.inflate(strides);
+
+  VERIFY_IS_EQUAL(inflated.dimension(0), 3);
+  VERIFY_IS_EQUAL(inflated.dimension(1), 9);
+  VERIFY_IS_EQUAL(inflated.dimension(2), 9);
+  VERIFY_IS_EQUAL(inflated.dimension(3), 19);
+
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 9; ++j) {
+      for (int k = 0; k < 9; ++k) {
+        for (int l = 0; l < 19; ++l) {
+          if (i % 2 == 0 &&
+              j % 4 == 0 &&
+              k % 2 == 0 &&
+              l % 3 == 0) {
+            VERIFY_IS_EQUAL(inflated(i,j,k,l),
+                            tensor(i/2, j/4, k/2, l/3));
+          } else {
+            VERIFY_IS_EQUAL(0, inflated(i,j,k,l));
+          }
+        }
+      }
+    }
+  }
+}
+
+void test_cxx11_tensor_inflation()
+{
+  CALL_SUBTEST(test_simple_inflation<ColMajor>());
+  CALL_SUBTEST(test_simple_inflation<RowMajor>());
+}
diff --git a/unsupported/test/cxx11_tensor_intdiv.cpp b/unsupported/test/cxx11_tensor_intdiv.cpp
new file mode 100644
index 000000000..8e2b70b75
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_intdiv.cpp
@@ -0,0 +1,147 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014-2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+
+void test_signed_32bit()
+{
+  // Divide by one
+  const Eigen::internal::TensorIntDivisor<int32_t, false> div_by_one(1);
+
+  for (int32_t j = 0; j < 25000; ++j) {
+    const int32_t fast_div = j / div_by_one;
+    const int32_t slow_div = j / 1;
+    VERIFY_IS_EQUAL(fast_div, slow_div);
+  }
+
+  // Standard divide by 2 or more
+  for (int32_t i = 2; i < 25000; ++i) {
+    const Eigen::internal::TensorIntDivisor<int32_t, false> div(i);
+
+    for (int32_t j = 0; j < 25000; ++j) {
+      const int32_t fast_div = j / div;
+      const int32_t slow_div = j / i;
+      VERIFY_IS_EQUAL(fast_div, slow_div);
+    }
+  }
+
+  // Optimized divide by 2 or more
+  for (int32_t i = 2; i < 25000; ++i) {
+    const Eigen::internal::TensorIntDivisor<int32_t, true> div(i);
+
+    for (int32_t j = 0; j < 25000; ++j) {
+      const int32_t fast_div = j / div;
+      const int32_t slow_div = j / i;
+      VERIFY_IS_EQUAL(fast_div, slow_div);
+    }
+  }
+}
+
+
+void test_unsigned_32bit()
+{
+  for (uint32_t i = 1; i < 25000; ++i) {
+    const Eigen::internal::TensorIntDivisor<uint32_t> div(i);
+
+    for (uint32_t j = 0; j < 25000; ++j) {
+      const uint32_t fast_div = j / div;
+      const uint32_t slow_div = j / i;
+      VERIFY_IS_EQUAL(fast_div, slow_div);
+    }
+  }
+}
+
+
+void test_signed_64bit()
+{
+  for (int64_t i = 1; i < 25000; ++i) {
+    const Eigen::internal::TensorIntDivisor<int64_t> div(i);
+
+    for (int64_t j = 0; j < 25000; ++j) {
+      const int64_t fast_div = j / div;
+      const int64_t slow_div = j / i;
+      VERIFY_IS_EQUAL(fast_div, slow_div);
+    }
+  }
+}
+
+
+void test_unsigned_64bit()
+{
+  for (uint64_t i = 1; i < 25000; ++i) {
+    const Eigen::internal::TensorIntDivisor<uint64_t> div(i);
+
+    for (uint64_t j = 0; j < 25000; ++j) {
+      const uint64_t fast_div = j / div;
+      const uint64_t slow_div = j / i;
+      VERIFY_IS_EQUAL(fast_div, slow_div);
+    }
+  }
+}
+
+void test_powers_32bit() {
+  for (int expon = 1; expon < 31; expon++) {
+    int32_t div = (1 << expon);
+    for (int num_expon = 0; num_expon < 32; num_expon++) {
+      int32_t start_num = (1 << num_expon) - 100;
+      int32_t end_num = (1 << num_expon) + 100;
+      if (start_num < 0)
+        start_num = 0;
+      for (int32_t num = start_num; num < end_num; num++) {
+        Eigen::internal::TensorIntDivisor<int32_t> divider =
+          Eigen::internal::TensorIntDivisor<int32_t>(div);
+        int32_t result = num/div;
+        int32_t result_op = divider.divide(num);
+        VERIFY_IS_EQUAL(result_op, result);
+      }
+    }
+  }
+}
+
+void test_powers_64bit() {
+  for (int expon = 0; expon < 63; expon++) {
+    int64_t div = (1ull << expon);
+    for (int num_expon = 0; num_expon < 63; num_expon++) {
+      int64_t start_num = (1ull << num_expon) - 10;
+      int64_t end_num = (1ull << num_expon) + 10;
+      if (start_num < 0)
+        start_num = 0;
+      for (int64_t num = start_num; num < end_num; num++) {
+        Eigen::internal::TensorIntDivisor<int64_t> divider(div);
+        int64_t result = num/div;
+        int64_t result_op = divider.divide(num);
+        VERIFY_IS_EQUAL(result_op, result);
+      }
+    }
+  }
+}
+
+void test_specific() {
+  // A particular combination that was previously failing
+  int64_t div = 209715200;
+  int64_t num = 3238002688ll;
+  Eigen::internal::TensorIntDivisor<int64_t> divider(div);
+  int64_t result = num/div;
+  int64_t result_op = divider.divide(num);
+  VERIFY_IS_EQUAL(result, result_op);
+}
+
+void test_cxx11_tensor_intdiv()
+{
+  CALL_SUBTEST_1(test_signed_32bit());
+  CALL_SUBTEST_2(test_unsigned_32bit());
+  CALL_SUBTEST_3(test_signed_64bit());
+  CALL_SUBTEST_4(test_unsigned_64bit());
+  CALL_SUBTEST_5(test_powers_32bit());
+  CALL_SUBTEST_6(test_powers_64bit());
+  CALL_SUBTEST_7(test_specific());
+}
diff --git a/unsupported/test/cxx11_tensor_io.cpp b/unsupported/test/cxx11_tensor_io.cpp
new file mode 100644
index 000000000..489960529
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_io.cpp
@@ -0,0 +1,136 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <sstream>
+#include <string>
+#include <Eigen/CXX11/Tensor>
+
+
+template<int DataLayout>
+static void test_output_0d()
+{
+  Tensor<int, 0, DataLayout> tensor;
+  tensor() = 123;
+
+  std::stringstream os;
+  os << tensor;
+
+  std::string expected("123");
+  VERIFY_IS_EQUAL(std::string(os.str()), expected);
+}
+
+
+template<int DataLayout>
+static void test_output_1d()
+{
+  Tensor<int, 1, DataLayout> tensor(5);
+  for (int i = 0; i < 5; ++i) {
+    tensor(i) = i;
+  }
+
+  std::stringstream os;
+  os << tensor;
+
+  std::string expected("0\n1\n2\n3\n4");
+  VERIFY_IS_EQUAL(std::string(os.str()), expected);
+
+  Eigen::Tensor<double,1,DataLayout> empty_tensor(0);
+  std::stringstream empty_os;
+  empty_os << empty_tensor;
+  std::string empty_string;
+  VERIFY_IS_EQUAL(std::string(empty_os.str()), empty_string);
+}
+
+
+template<int DataLayout>
+static void test_output_2d()
+{
+  Tensor<int, 2, DataLayout> tensor(5, 3);
+  for (int i = 0; i < 5; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      tensor(i, j) = i*j;
+    }
+  }
+
+  std::stringstream os;
+  os << tensor;
+
+  std::string expected("0  0  0\n0  1  2\n0  2  4\n0  3  6\n0  4  8");
+  VERIFY_IS_EQUAL(std::string(os.str()), expected);
+}
+
+
+template<int DataLayout>
+static void test_output_expr()
+{
+  Tensor<int, 1, DataLayout> tensor1(5);
+  Tensor<int, 1, DataLayout> tensor2(5);
+  for (int i = 0; i < 5; ++i) {
+    tensor1(i) = i;
+    tensor2(i) = 7;
+  }
+
+  std::stringstream os;
+  os << tensor1 + tensor2;
+
+  std::string expected(" 7\n 8\n 9\n10\n11");
+  VERIFY_IS_EQUAL(std::string(os.str()), expected);
+}
+
+
+template<int DataLayout>
+static void test_output_string()
+{
+  Tensor<std::string, 2, DataLayout> tensor(5, 3);
+  tensor.setConstant(std::string("foo"));
+
+  std::cout << tensor << std::endl;
+
+  std::stringstream os;
+  os << tensor;
+
+  std::string expected("foo  foo  foo\nfoo  foo  foo\nfoo  foo  foo\nfoo  foo  foo\nfoo  foo  foo");
+  VERIFY_IS_EQUAL(std::string(os.str()), expected);
+}
+
+
+template<int DataLayout>
+static void test_output_const()
+{
+  Tensor<int, 1, DataLayout> tensor(5);
+  for (int i = 0; i < 5; ++i) {
+    tensor(i) = i;
+  }
+
+  TensorMap<Tensor<const int, 1, DataLayout> > tensor_map(tensor.data(), 5);
+
+  std::stringstream os;
+  os << tensor_map;
+
+  std::string expected("0\n1\n2\n3\n4");
+  VERIFY_IS_EQUAL(std::string(os.str()), expected);
+}
+
+
+void test_cxx11_tensor_io()
+{
+  CALL_SUBTEST(test_output_0d<ColMajor>());
+  CALL_SUBTEST(test_output_0d<RowMajor>());
+  CALL_SUBTEST(test_output_1d<ColMajor>());
+  CALL_SUBTEST(test_output_1d<RowMajor>());
+  CALL_SUBTEST(test_output_2d<ColMajor>());
+  CALL_SUBTEST(test_output_2d<RowMajor>());
+  CALL_SUBTEST(test_output_expr<ColMajor>());
+  CALL_SUBTEST(test_output_expr<RowMajor>());
+  CALL_SUBTEST(test_output_string<ColMajor>());
+  CALL_SUBTEST(test_output_string<RowMajor>());
+  CALL_SUBTEST(test_output_const<ColMajor>());
+  CALL_SUBTEST(test_output_const<RowMajor>());
+}
diff --git a/unsupported/test/cxx11_tensor_layout_swap.cpp b/unsupported/test/cxx11_tensor_layout_swap.cpp
new file mode 100644
index 000000000..ae297a9da
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_layout_swap.cpp
@@ -0,0 +1,61 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+static void test_simple_swap()
+{
+  Tensor<float, 3, ColMajor> tensor(2,3,7);
+  tensor.setRandom();
+
+  Tensor<float, 3, RowMajor> tensor2 = tensor.swap_layout();
+  VERIFY_IS_EQUAL(tensor.dimension(0), tensor2.dimension(2));
+  VERIFY_IS_EQUAL(tensor.dimension(1), tensor2.dimension(1));
+  VERIFY_IS_EQUAL(tensor.dimension(2), tensor2.dimension(0));
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(tensor(i,j,k), tensor2(k,j,i));
+      }
+    }
+  }
+}
+
+
+static void test_swap_as_lvalue()
+{
+  Tensor<float, 3, ColMajor> tensor(2,3,7);
+  tensor.setRandom();
+
+  Tensor<float, 3, RowMajor> tensor2(7,3,2);
+  tensor2.swap_layout() = tensor;
+  VERIFY_IS_EQUAL(tensor.dimension(0), tensor2.dimension(2));
+  VERIFY_IS_EQUAL(tensor.dimension(1), tensor2.dimension(1));
+  VERIFY_IS_EQUAL(tensor.dimension(2), tensor2.dimension(0));
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(tensor(i,j,k), tensor2(k,j,i));
+      }
+    }
+  }
+}
+
+
+void test_cxx11_tensor_layout_swap()
+{
+  CALL_SUBTEST(test_simple_swap());
+  CALL_SUBTEST(test_swap_as_lvalue());
+}
diff --git a/unsupported/test/cxx11_tensor_lvalue.cpp b/unsupported/test/cxx11_tensor_lvalue.cpp
new file mode 100644
index 000000000..071f5b406
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_lvalue.cpp
@@ -0,0 +1,42 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+
+static void test_compound_assignment()
+{
+  Tensor<float, 3> mat1(2,3,7);
+  Tensor<float, 3> mat2(2,3,7);
+  Tensor<float, 3> mat3(2,3,7);
+
+  mat1.setRandom();
+  mat2.setRandom();
+  mat3 = mat1;
+  mat3 += mat2;
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(mat3(i,j,k), mat1(i,j,k) + mat2(i,j,k));
+      }
+    }
+  }
+}
+
+
+void test_cxx11_tensor_lvalue()
+{
+  CALL_SUBTEST(test_compound_assignment());
+}
diff --git a/unsupported/test/cxx11_tensor_map.cpp b/unsupported/test/cxx11_tensor_map.cpp
new file mode 100644
index 000000000..3db0ee7c0
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_map.cpp
@@ -0,0 +1,277 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+static void test_0d()
+{
+  Tensor<int, 0> scalar1;
+  Tensor<int, 0, RowMajor> scalar2;
+
+  TensorMap<Tensor<const int, 0> > scalar3(scalar1.data());
+  TensorMap<Tensor<const int, 0, RowMajor> > scalar4(scalar2.data());
+
+  scalar1() = 7;
+  scalar2() = 13;
+
+  VERIFY_IS_EQUAL(scalar1.rank(), 0);
+  VERIFY_IS_EQUAL(scalar1.size(), 1);
+
+  VERIFY_IS_EQUAL(scalar3(), 7);
+  VERIFY_IS_EQUAL(scalar4(), 13);
+}
+
+static void test_1d()
+{
+  Tensor<int, 1> vec1(6);
+  Tensor<int, 1, RowMajor> vec2(6);
+
+  TensorMap<Tensor<const int, 1> > vec3(vec1.data(), 6);
+  TensorMap<Tensor<const int, 1, RowMajor> > vec4(vec2.data(), 6);
+
+  vec1(0) = 4;  vec2(0) = 0;
+  vec1(1) = 8;  vec2(1) = 1;
+  vec1(2) = 15; vec2(2) = 2;
+  vec1(3) = 16; vec2(3) = 3;
+  vec1(4) = 23; vec2(4) = 4;
+  vec1(5) = 42; vec2(5) = 5;
+
+  VERIFY_IS_EQUAL(vec1.rank(), 1);
+  VERIFY_IS_EQUAL(vec1.size(), 6);
+  VERIFY_IS_EQUAL(vec1.dimension(0), 6);
+
+  VERIFY_IS_EQUAL(vec3(0), 4);
+  VERIFY_IS_EQUAL(vec3(1), 8);
+  VERIFY_IS_EQUAL(vec3(2), 15);
+  VERIFY_IS_EQUAL(vec3(3), 16);
+  VERIFY_IS_EQUAL(vec3(4), 23);
+  VERIFY_IS_EQUAL(vec3(5), 42);
+
+  VERIFY_IS_EQUAL(vec4(0), 0);
+  VERIFY_IS_EQUAL(vec4(1), 1);
+  VERIFY_IS_EQUAL(vec4(2), 2);
+  VERIFY_IS_EQUAL(vec4(3), 3);
+  VERIFY_IS_EQUAL(vec4(4), 4);
+  VERIFY_IS_EQUAL(vec4(5), 5);
+}
+
+static void test_2d()
+{
+  Tensor<int, 2> mat1(2,3);
+  Tensor<int, 2, RowMajor> mat2(2,3);
+
+  mat1(0,0) = 0;
+  mat1(0,1) = 1;
+  mat1(0,2) = 2;
+  mat1(1,0) = 3;
+  mat1(1,1) = 4;
+  mat1(1,2) = 5;
+
+  mat2(0,0) = 0;
+  mat2(0,1) = 1;
+  mat2(0,2) = 2;
+  mat2(1,0) = 3;
+  mat2(1,1) = 4;
+  mat2(1,2) = 5;
+
+  TensorMap<Tensor<const int, 2> > mat3(mat1.data(), 2, 3);
+  TensorMap<Tensor<const int, 2, RowMajor> > mat4(mat2.data(), 2, 3);
+
+  VERIFY_IS_EQUAL(mat3.rank(), 2);
+  VERIFY_IS_EQUAL(mat3.size(), 6);
+  VERIFY_IS_EQUAL(mat3.dimension(0), 2);
+  VERIFY_IS_EQUAL(mat3.dimension(1), 3);
+
+  VERIFY_IS_EQUAL(mat4.rank(), 2);
+  VERIFY_IS_EQUAL(mat4.size(), 6);
+  VERIFY_IS_EQUAL(mat4.dimension(0), 2);
+  VERIFY_IS_EQUAL(mat4.dimension(1), 3);
+
+  VERIFY_IS_EQUAL(mat3(0,0), 0);
+  VERIFY_IS_EQUAL(mat3(0,1), 1);
+  VERIFY_IS_EQUAL(mat3(0,2), 2);
+  VERIFY_IS_EQUAL(mat3(1,0), 3);
+  VERIFY_IS_EQUAL(mat3(1,1), 4);
+  VERIFY_IS_EQUAL(mat3(1,2), 5);
+
+  VERIFY_IS_EQUAL(mat4(0,0), 0);
+  VERIFY_IS_EQUAL(mat4(0,1), 1);
+  VERIFY_IS_EQUAL(mat4(0,2), 2);
+  VERIFY_IS_EQUAL(mat4(1,0), 3);
+  VERIFY_IS_EQUAL(mat4(1,1), 4);
+  VERIFY_IS_EQUAL(mat4(1,2), 5);
+}
+
+static void test_3d()
+{
+  Tensor<int, 3> mat1(2,3,7);
+  Tensor<int, 3, RowMajor> mat2(2,3,7);
+
+  int val = 0;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        mat1(i,j,k) = val;
+        mat2(i,j,k) = val;
+        val++;
+      }
+    }
+  }
+
+  TensorMap<Tensor<const int, 3> > mat3(mat1.data(), 2, 3, 7);
+  TensorMap<Tensor<const int, 3, RowMajor> > mat4(mat2.data(), 2, 3, 7);
+
+  VERIFY_IS_EQUAL(mat3.rank(), 3);
+  VERIFY_IS_EQUAL(mat3.size(), 2*3*7);
+  VERIFY_IS_EQUAL(mat3.dimension(0), 2);
+  VERIFY_IS_EQUAL(mat3.dimension(1), 3);
+  VERIFY_IS_EQUAL(mat3.dimension(2), 7);
+
+  VERIFY_IS_EQUAL(mat4.rank(), 3);
+  VERIFY_IS_EQUAL(mat4.size(), 2*3*7);
+  VERIFY_IS_EQUAL(mat4.dimension(0), 2);
+  VERIFY_IS_EQUAL(mat4.dimension(1), 3);
+  VERIFY_IS_EQUAL(mat4.dimension(2), 7);
+
+  val = 0;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(mat3(i,j,k), val);
+        VERIFY_IS_EQUAL(mat4(i,j,k), val);
+        val++;
+      }
+    }
+  }
+}
+
+
+static void test_from_tensor()
+{
+  Tensor<int, 3> mat1(2,3,7);
+  Tensor<int, 3, RowMajor> mat2(2,3,7);
+
+  int val = 0;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        mat1(i,j,k) = val;
+        mat2(i,j,k) = val;
+        val++;
+      }
+    }
+  }
+
+  TensorMap<Tensor<int, 3> > mat3(mat1);
+  TensorMap<Tensor<int, 3, RowMajor> > mat4(mat2);
+
+  VERIFY_IS_EQUAL(mat3.rank(), 3);
+  VERIFY_IS_EQUAL(mat3.size(), 2*3*7);
+  VERIFY_IS_EQUAL(mat3.dimension(0), 2);
+  VERIFY_IS_EQUAL(mat3.dimension(1), 3);
+  VERIFY_IS_EQUAL(mat3.dimension(2), 7);
+
+  VERIFY_IS_EQUAL(mat4.rank(), 3);
+  VERIFY_IS_EQUAL(mat4.size(), 2*3*7);
+  VERIFY_IS_EQUAL(mat4.dimension(0), 2);
+  VERIFY_IS_EQUAL(mat4.dimension(1), 3);
+  VERIFY_IS_EQUAL(mat4.dimension(2), 7);
+
+  val = 0;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(mat3(i,j,k), val);
+        VERIFY_IS_EQUAL(mat4(i,j,k), val);
+        val++;
+      }
+    }
+  }
+
+  TensorFixedSize<int, Sizes<2,3,7> > mat5;
+
+  val = 0;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        array<ptrdiff_t, 3> coords;
+        coords[0] = i;
+        coords[1] = j;
+        coords[2] = k;
+        mat5(coords) = val;
+        val++;
+      }
+    }
+  }
+
+  TensorMap<TensorFixedSize<int, Sizes<2,3,7> > > mat6(mat5);
+
+  VERIFY_IS_EQUAL(mat6.rank(), 3);
+  VERIFY_IS_EQUAL(mat6.size(), 2*3*7);
+  VERIFY_IS_EQUAL(mat6.dimension(0), 2);
+  VERIFY_IS_EQUAL(mat6.dimension(1), 3);
+  VERIFY_IS_EQUAL(mat6.dimension(2), 7);
+
+  val = 0;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(mat6(i,j,k), val);
+        val++;
+      }
+    }
+  }
+}
+
+
+static int f(const TensorMap<Tensor<int, 3> >& tensor) {
+  //  Size<0> empty;
+  EIGEN_STATIC_ASSERT((internal::array_size<Sizes<> >::value == 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::array_size<DSizes<int, 0> >::value == 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  Tensor<int, 0> result = tensor.sum();
+  return result();
+}
+
+static void test_casting()
+{
+  Tensor<int, 3> tensor(2,3,7);
+
+  int val = 0;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        tensor(i,j,k) = val;
+        val++;
+      }
+    }
+  }
+
+  TensorMap<Tensor<int, 3> > map(tensor);
+  int sum1 = f(map);
+  int sum2 = f(tensor);
+
+  VERIFY_IS_EQUAL(sum1, sum2);
+  VERIFY_IS_EQUAL(sum1, 861);
+}
+
+void test_cxx11_tensor_map()
+{
+  CALL_SUBTEST(test_0d());
+  CALL_SUBTEST(test_1d());
+  CALL_SUBTEST(test_2d());
+  CALL_SUBTEST(test_3d());
+
+  CALL_SUBTEST(test_from_tensor());
+  CALL_SUBTEST(test_casting());
+}
diff --git a/unsupported/test/cxx11_tensor_math.cpp b/unsupported/test/cxx11_tensor_math.cpp
new file mode 100644
index 000000000..61c742a16
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_math.cpp
@@ -0,0 +1,46 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+static void test_tanh()
+{
+  Tensor<float, 1> vec1(6);
+  vec1.setRandom();
+
+  Tensor<float, 1> vec2 = vec1.tanh();
+
+  for (int i = 0; i < 6; ++i) {
+    VERIFY_IS_APPROX(vec2(i), tanhf(vec1(i)));
+  }
+}
+
+static void test_sigmoid()
+{
+  Tensor<float, 1> vec1(6);
+  vec1.setRandom();
+
+  Tensor<float, 1> vec2 = vec1.sigmoid();
+
+  for (int i = 0; i < 6; ++i) {
+    VERIFY_IS_APPROX(vec2(i), 1.0f / (1.0f + std::exp(-vec1(i))));
+  }
+}
+
+
+void test_cxx11_tensor_math()
+{
+  CALL_SUBTEST(test_tanh());
+  CALL_SUBTEST(test_sigmoid());
+}
diff --git a/unsupported/test/cxx11_tensor_mixed_indices.cpp b/unsupported/test/cxx11_tensor_mixed_indices.cpp
new file mode 100644
index 000000000..4fba6fdd1
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_mixed_indices.cpp
@@ -0,0 +1,53 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+
+static void test_simple()
+{
+  Tensor<float, 1, ColMajor> vec1(6);
+  Tensor<float, 1, ColMajor, int> vec2(6);
+
+  vec1(0) = 4.0;  vec2(0) = 0.0;
+  vec1(1) = 8.0;  vec2(1) = 1.0;
+  vec1(2) = 15.0; vec2(2) = 2.0;
+  vec1(3) = 16.0; vec2(3) = 3.0;
+  vec1(4) = 23.0; vec2(4) = 4.0;
+  vec1(5) = 42.0; vec2(5) = 5.0;
+
+  float data3[6];
+  TensorMap<Tensor<float, 1, ColMajor>> vec3(data3, 6);
+  vec3 = vec1.sqrt();
+  float data4[6];
+  TensorMap<Tensor<float, 1, ColMajor, int>> vec4(data4, 6);
+  vec4 = vec2.square();
+
+  VERIFY_IS_APPROX(vec3(0), sqrtf(4.0));
+  VERIFY_IS_APPROX(vec3(1), sqrtf(8.0));
+  VERIFY_IS_APPROX(vec3(2), sqrtf(15.0));
+  VERIFY_IS_APPROX(vec3(3), sqrtf(16.0));
+  VERIFY_IS_APPROX(vec3(4), sqrtf(23.0));
+  VERIFY_IS_APPROX(vec3(5), sqrtf(42.0));
+
+  VERIFY_IS_APPROX(vec4(0), 0.0f);
+  VERIFY_IS_APPROX(vec4(1), 1.0f);
+  VERIFY_IS_APPROX(vec4(2), 2.0f * 2.0f);
+  VERIFY_IS_APPROX(vec4(3), 3.0f * 3.0f);
+  VERIFY_IS_APPROX(vec4(4), 4.0f * 4.0f);
+  VERIFY_IS_APPROX(vec4(5), 5.0f * 5.0f);
+}
+
+
+void test_cxx11_tensor_mixed_indices()
+{
+  CALL_SUBTEST(test_simple());
+}
diff --git a/unsupported/test/cxx11_tensor_morphing.cpp b/unsupported/test/cxx11_tensor_morphing.cpp
new file mode 100644
index 000000000..f7de43110
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_morphing.cpp
@@ -0,0 +1,485 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template<typename>
+static void test_simple_reshape()
+{
+  Tensor<float, 5> tensor1(2,3,1,7,1);
+  tensor1.setRandom();
+
+  Tensor<float, 3> tensor2(2,3,7);
+  Tensor<float, 2> tensor3(6,7);
+  Tensor<float, 2> tensor4(2,21);
+
+  Tensor<float, 3>::Dimensions dim1(2,3,7);
+  tensor2 = tensor1.reshape(dim1);
+  Tensor<float, 2>::Dimensions dim2(6,7);
+  tensor3 = tensor1.reshape(dim2);
+  Tensor<float, 2>::Dimensions dim3(2,21);
+  tensor4 = tensor1.reshape(dim1).reshape(dim3);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor2(i,j,k));
+        VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor3(i+2*j,k));
+        VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor4(i,j+3*k));
+      }
+    }
+  }
+}
+
+template<typename>
+static void test_reshape_in_expr() {
+  MatrixXf m1(2,3*5*7*11);
+  MatrixXf m2(3*5*7*11,13);
+  m1.setRandom();
+  m2.setRandom();
+  MatrixXf m3 = m1 * m2;
+
+  TensorMap<Tensor<float, 5>> tensor1(m1.data(), 2,3,5,7,11);
+  TensorMap<Tensor<float, 5>> tensor2(m2.data(), 3,5,7,11,13);
+  Tensor<float, 2>::Dimensions newDims1(2,3*5*7*11);
+  Tensor<float, 2>::Dimensions newDims2(3*5*7*11,13);
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+  array<DimPair, 1> contract_along{{DimPair(1, 0)}};
+  Tensor<float, 2> tensor3(2,13);
+  tensor3 = tensor1.reshape(newDims1).contract(tensor2.reshape(newDims2), contract_along);
+
+  Map<MatrixXf> res(tensor3.data(), 2, 13);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 13; ++j) {
+      VERIFY_IS_APPROX(res(i,j), m3(i,j));
+    }
+  }
+}
+
+template<typename>
+static void test_reshape_as_lvalue()
+{
+  Tensor<float, 3> tensor(2,3,7);
+  tensor.setRandom();
+
+  Tensor<float, 2> tensor2d(6,7);
+  Tensor<float, 3>::Dimensions dim(2,3,7);
+  tensor2d.reshape(dim) = tensor;
+
+  float scratch[2*3*1*7*1];
+  TensorMap<Tensor<float, 5>> tensor5d(scratch, 2,3,1,7,1);
+  tensor5d.reshape(dim).device(Eigen::DefaultDevice()) = tensor;
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(tensor2d(i+2*j,k), tensor(i,j,k));
+        VERIFY_IS_EQUAL(tensor5d(i,j,0,k,0), tensor(i,j,k));
+      }
+    }
+  }
+}
+
+template<int DataLayout>
+static void test_simple_slice()
+{
+  Tensor<float, 5, DataLayout> tensor(2,3,5,7,11);
+  tensor.setRandom();
+
+  Tensor<float, 5, DataLayout> slice1(1,1,1,1,1);
+  Eigen::DSizes<ptrdiff_t, 5> indices(1,2,3,4,5);
+  Eigen::DSizes<ptrdiff_t, 5> sizes(1,1,1,1,1);
+  slice1 = tensor.slice(indices, sizes);
+  VERIFY_IS_EQUAL(slice1(0,0,0,0,0), tensor(1,2,3,4,5));
+
+  Tensor<float, 5, DataLayout> slice2(1,1,2,2,3);
+  Eigen::DSizes<ptrdiff_t, 5> indices2(1,1,3,4,5);
+  Eigen::DSizes<ptrdiff_t, 5> sizes2(1,1,2,2,3);
+  slice2 = tensor.slice(indices2, sizes2);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 2; ++j) {
+      for (int k = 0; k < 3; ++k) {
+        VERIFY_IS_EQUAL(slice2(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k));
+      }
+    }
+  }
+}
+
+template<typename=void>
+static void test_const_slice()
+{
+  const float b[1] = {42};
+  TensorMap<Tensor<const float, 1> > m(b, 1);
+  DSizes<DenseIndex, 1> offsets;
+  offsets[0] = 0;
+  TensorRef<Tensor<const float, 1> > slice_ref(m.slice(offsets, m.dimensions()));
+  VERIFY_IS_EQUAL(slice_ref(0), 42);
+}
+
+template<int DataLayout>
+static void test_slice_in_expr() {
+  typedef Matrix<float, Dynamic, Dynamic, DataLayout> Mtx;
+  Mtx m1(7,7);
+  Mtx m2(3,3);
+  m1.setRandom();
+  m2.setRandom();
+
+  Mtx m3 = m1.block(1, 2, 3, 3) * m2.block(0, 2, 3, 1);
+
+  TensorMap<Tensor<float, 2, DataLayout>> tensor1(m1.data(), 7, 7);
+  TensorMap<Tensor<float, 2, DataLayout>> tensor2(m2.data(), 3, 3);
+  Tensor<float, 2, DataLayout> tensor3(3,1);
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+  array<DimPair, 1> contract_along{{DimPair(1, 0)}};
+
+  Eigen::DSizes<ptrdiff_t, 2> indices1(1,2);
+  Eigen::DSizes<ptrdiff_t, 2> sizes1(3,3);
+  Eigen::DSizes<ptrdiff_t, 2> indices2(0,2);
+  Eigen::DSizes<ptrdiff_t, 2> sizes2(3,1);
+  tensor3 = tensor1.slice(indices1, sizes1).contract(tensor2.slice(indices2, sizes2), contract_along);
+
+  Map<Mtx> res(tensor3.data(), 3, 1);
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 1; ++j) {
+      VERIFY_IS_APPROX(res(i,j), m3(i,j));
+    }
+  }
+
+  // Take an arbitrary slice of an arbitrarily sized tensor.
+  TensorMap<Tensor<const float, 2, DataLayout>> tensor4(m1.data(), 7, 7);
+  Tensor<float, 1, DataLayout> tensor6 = tensor4.reshape(DSizes<ptrdiff_t, 1>(7*7)).exp().slice(DSizes<ptrdiff_t, 1>(0), DSizes<ptrdiff_t, 1>(35));
+  for (int i = 0; i < 35; ++i) {
+    VERIFY_IS_APPROX(tensor6(i), expf(tensor4.data()[i]));
+  }
+}
+
+template<int DataLayout>
+static void test_slice_as_lvalue()
+{
+  Tensor<float, 3, DataLayout> tensor1(2,2,7);
+  tensor1.setRandom();
+  Tensor<float, 3, DataLayout> tensor2(2,2,7);
+  tensor2.setRandom();
+  Tensor<float, 3, DataLayout> tensor3(4,3,5);
+  tensor3.setRandom();
+  Tensor<float, 3, DataLayout> tensor4(4,3,2);
+  tensor4.setRandom();
+  Tensor<float, 3, DataLayout> tensor5(10,13,12);
+  tensor5.setRandom();
+
+  Tensor<float, 3, DataLayout> result(4,5,7);
+  Eigen::DSizes<ptrdiff_t, 3> sizes12(2,2,7);
+  Eigen::DSizes<ptrdiff_t, 3> first_slice(0,0,0);
+  result.slice(first_slice, sizes12) = tensor1;
+  Eigen::DSizes<ptrdiff_t, 3> second_slice(2,0,0);
+  result.slice(second_slice, sizes12).device(Eigen::DefaultDevice()) = tensor2;
+
+  Eigen::DSizes<ptrdiff_t, 3> sizes3(4,3,5);
+  Eigen::DSizes<ptrdiff_t, 3> third_slice(0,2,0);
+  result.slice(third_slice, sizes3) = tensor3;
+
+  Eigen::DSizes<ptrdiff_t, 3> sizes4(4,3,2);
+  Eigen::DSizes<ptrdiff_t, 3> fourth_slice(0,2,5);
+  result.slice(fourth_slice, sizes4) = tensor4;
+
+  for (int j = 0; j < 2; ++j) {
+    for (int k = 0; k < 7; ++k) {
+      for (int i = 0; i < 2; ++i) {
+        VERIFY_IS_EQUAL(result(i,j,k), tensor1(i,j,k));
+        VERIFY_IS_EQUAL(result(i+2,j,k), tensor2(i,j,k));
+      }
+    }
+  }
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 2; j < 5; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        VERIFY_IS_EQUAL(result(i,j,k), tensor3(i,j-2,k));
+      }
+      for (int k = 5; k < 7; ++k) {
+        VERIFY_IS_EQUAL(result(i,j,k), tensor4(i,j-2,k-5));
+      }
+    }
+  }
+
+  Eigen::DSizes<ptrdiff_t, 3> sizes5(4,5,7);
+  Eigen::DSizes<ptrdiff_t, 3> fifth_slice(0,0,0);
+  result.slice(fifth_slice, sizes5) = tensor5.slice(fifth_slice, sizes5);
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 2; j < 5; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(result(i,j,k), tensor5(i,j,k));
+      }
+    }
+  }
+}
+
+template<int DataLayout>
+static void test_slice_raw_data()
+{
+  Tensor<float, 4, DataLayout> tensor(3,5,7,11);
+  tensor.setRandom();
+
+  Eigen::DSizes<ptrdiff_t, 4> offsets(1,2,3,4);
+  Eigen::DSizes<ptrdiff_t, 4> extents(1,1,1,1);
+  typedef TensorEvaluator<decltype(tensor.slice(offsets, extents)), DefaultDevice> SliceEvaluator;
+  auto slice1 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
+  VERIFY_IS_EQUAL(slice1.dimensions().TotalSize(), 1);
+  VERIFY_IS_EQUAL(slice1.data()[0], tensor(1,2,3,4));
+
+  if (DataLayout == ColMajor) {
+    extents = Eigen::DSizes<ptrdiff_t, 4>(2,1,1,1);
+    auto slice2 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
+    VERIFY_IS_EQUAL(slice2.dimensions().TotalSize(), 2);
+    VERIFY_IS_EQUAL(slice2.data()[0], tensor(1,2,3,4));
+    VERIFY_IS_EQUAL(slice2.data()[1], tensor(2,2,3,4));
+  } else {
+    extents = Eigen::DSizes<ptrdiff_t, 4>(1,1,1,2);
+    auto slice2 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
+    VERIFY_IS_EQUAL(slice2.dimensions().TotalSize(), 2);
+    VERIFY_IS_EQUAL(slice2.data()[0], tensor(1,2,3,4));
+    VERIFY_IS_EQUAL(slice2.data()[1], tensor(1,2,3,5));
+  }
+
+  extents = Eigen::DSizes<ptrdiff_t, 4>(1,2,1,1);
+  auto slice3 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
+  VERIFY_IS_EQUAL(slice3.dimensions().TotalSize(), 2);
+  VERIFY_IS_EQUAL(slice3.data(), static_cast<float*>(0));
+
+  if (DataLayout == ColMajor) {
+    offsets = Eigen::DSizes<ptrdiff_t, 4>(0,2,3,4);
+    extents = Eigen::DSizes<ptrdiff_t, 4>(3,2,1,1);
+    auto slice4 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
+    VERIFY_IS_EQUAL(slice4.dimensions().TotalSize(), 6);
+    for (int i = 0; i < 3; ++i) {
+      for (int j = 0; j < 2; ++j) {
+        VERIFY_IS_EQUAL(slice4.data()[i+3*j], tensor(i,2+j,3,4));
+      }
+    }
+  } else {
+    offsets = Eigen::DSizes<ptrdiff_t, 4>(1,2,3,0);
+    extents = Eigen::DSizes<ptrdiff_t, 4>(1,1,2,11);
+    auto slice4 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
+    VERIFY_IS_EQUAL(slice4.dimensions().TotalSize(), 22);
+    for (int l = 0; l < 11; ++l) {
+      for (int k = 0; k < 2; ++k) {
+        VERIFY_IS_EQUAL(slice4.data()[l+11*k], tensor(1,2,3+k,l));
+      }
+    }
+  }
+
+  if (DataLayout == ColMajor) {
+    offsets = Eigen::DSizes<ptrdiff_t, 4>(0,0,0,4);
+    extents = Eigen::DSizes<ptrdiff_t, 4>(3,5,7,2);
+    auto slice5 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
+    VERIFY_IS_EQUAL(slice5.dimensions().TotalSize(), 210);
+    for (int i = 0; i < 3; ++i) {
+      for (int j = 0; j < 5; ++j) {
+        for (int k = 0; k < 7; ++k) {
+          for (int l = 0; l < 2; ++l) {
+            int slice_index = i + 3 * (j + 5 * (k + 7 * l));
+            VERIFY_IS_EQUAL(slice5.data()[slice_index], tensor(i,j,k,l+4));
+          }
+        }
+      }
+    }
+  } else {
+    offsets = Eigen::DSizes<ptrdiff_t, 4>(1,0,0,0);
+    extents = Eigen::DSizes<ptrdiff_t, 4>(2,5,7,11);
+    auto slice5 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
+    VERIFY_IS_EQUAL(slice5.dimensions().TotalSize(), 770);
+    for (int l = 0; l < 11; ++l) {
+      for (int k = 0; k < 7; ++k) {
+        for (int j = 0; j < 5; ++j) {
+          for (int i = 0; i < 2; ++i) {
+            int slice_index = l + 11 * (k + 7 * (j + 5 * i));
+            VERIFY_IS_EQUAL(slice5.data()[slice_index], tensor(i+1,j,k,l));
+          }
+        }
+      }
+    }
+
+  }
+
+  offsets = Eigen::DSizes<ptrdiff_t, 4>(0,0,0,0);
+  extents = Eigen::DSizes<ptrdiff_t, 4>(3,5,7,11);
+  auto slice6 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
+  VERIFY_IS_EQUAL(slice6.dimensions().TotalSize(), 3*5*7*11);
+  VERIFY_IS_EQUAL(slice6.data(), tensor.data());
+}
+
+
+template<int DataLayout>
+static void test_strided_slice()
+{
+  typedef Tensor<float, 5, DataLayout> Tensor5f;
+  typedef Eigen::DSizes<Eigen::DenseIndex, 5> Index5;
+  typedef Tensor<float, 2, DataLayout> Tensor2f;
+  typedef Eigen::DSizes<Eigen::DenseIndex, 2> Index2;
+  Tensor<float, 5, DataLayout> tensor(2,3,5,7,11);
+  Tensor<float, 2, DataLayout> tensor2(7,11);
+  tensor.setRandom();
+  tensor2.setRandom();
+
+  if (true) {
+    Tensor2f slice(2,3);
+    Index2 strides(-2,-1);
+    Index2 indicesStart(5,7);
+    Index2 indicesStop(0,4);
+    slice = tensor2.stridedSlice(indicesStart, indicesStop, strides);
+    for (int j = 0; j < 2; ++j) {
+      for (int k = 0; k < 3; ++k) {
+        VERIFY_IS_EQUAL(slice(j,k), tensor2(5-2*j,7-k));
+      }
+    }
+  }
+
+  if(true) {
+    Tensor2f slice(0,1);
+    Index2 strides(1,1);
+    Index2 indicesStart(5,4);
+    Index2 indicesStop(5,5);
+    slice = tensor2.stridedSlice(indicesStart, indicesStop, strides);
+  }
+
+  if(true) { // test clamped degenerate interavls
+    Tensor2f slice(7,11);
+    Index2 strides(1,-1);
+    Index2 indicesStart(-3,20); // should become 0,10
+    Index2 indicesStop(20,-11); // should become 11, -1
+    slice = tensor2.stridedSlice(indicesStart, indicesStop, strides);
+    for (int j = 0; j < 7; ++j) {
+      for (int k = 0; k < 11; ++k) {
+        VERIFY_IS_EQUAL(slice(j,k), tensor2(j,10-k));
+      }
+    }
+  }
+
+  if(true) {
+    Tensor5f slice1(1,1,1,1,1);
+    Eigen::DSizes<Eigen::DenseIndex, 5> indicesStart(1, 2, 3, 4, 5);
+    Eigen::DSizes<Eigen::DenseIndex, 5> indicesStop(2, 3, 4, 5, 6);
+    Eigen::DSizes<Eigen::DenseIndex, 5> strides(1, 1, 1, 1, 1);
+    slice1 = tensor.stridedSlice(indicesStart, indicesStop, strides);
+    VERIFY_IS_EQUAL(slice1(0,0,0,0,0), tensor(1,2,3,4,5));
+  }
+
+  if(true) {
+    Tensor5f slice(1,1,2,2,3);
+    Index5 start(1, 1, 3, 4, 5);
+    Index5 stop(2, 2, 5, 6, 8);
+    Index5 strides(1, 1, 1, 1, 1);
+    slice = tensor.stridedSlice(start, stop, strides);
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 2; ++j) {
+        for (int k = 0; k < 3; ++k) {
+          VERIFY_IS_EQUAL(slice(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k));
+        }
+      }
+    }
+  }
+
+  if(true) {
+    Tensor5f slice(1,1,2,2,3);
+    Index5 strides3(1, 1, -2, 1, -1);
+    Index5 indices3Start(1, 1, 4, 4, 7);
+    Index5 indices3Stop(2, 2, 0, 6, 4);
+    slice = tensor.stridedSlice(indices3Start, indices3Stop, strides3);
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 2; ++j) {
+        for (int k = 0; k < 3; ++k) {
+          VERIFY_IS_EQUAL(slice(0,0,i,j,k), tensor(1,1,4-2*i,4+j,7-k));
+        }
+      }
+    }
+  }
+
+  if(false) { // tests degenerate interval
+    Tensor5f slice(1,1,2,2,3);
+    Index5 strides3(1, 1, 2, 1, 1);
+    Index5 indices3Start(1, 1, 4, 4, 7);
+    Index5 indices3Stop(2, 2, 0, 6, 4);
+    slice = tensor.stridedSlice(indices3Start, indices3Stop, strides3);
+  }
+}
+
+template<int DataLayout>
+static void test_strided_slice_write()
+{
+  typedef Tensor<float, 2, DataLayout> Tensor2f;
+  typedef Eigen::DSizes<Eigen::DenseIndex, 2> Index2;
+
+  Tensor<float, 2, DataLayout> tensor(7,11),tensor2(7,11);
+  tensor.setRandom();
+  tensor2=tensor;
+  Tensor2f slice(2,3);
+
+  slice.setRandom();
+
+  Index2 strides(1,1);
+  Index2 indicesStart(3,4);
+  Index2 indicesStop(5,7);
+  Index2 lengths(2,3);
+
+  tensor.slice(indicesStart,lengths)=slice;
+  tensor2.stridedSlice(indicesStart,indicesStop,strides)=slice;
+
+  for(int i=0;i<7;i++) for(int j=0;j<11;j++){
+    VERIFY_IS_EQUAL(tensor(i,j), tensor2(i,j));
+  }
+}
+
+
+template<int DataLayout>
+static void test_composition()
+{
+  Eigen::Tensor<float, 2, DataLayout> matrix(7, 11);
+  matrix.setRandom();
+
+  const DSizes<ptrdiff_t, 3> newDims(1, 1, 11);
+  Eigen::Tensor<float, 3, DataLayout> tensor =
+      matrix.slice(DSizes<ptrdiff_t, 2>(2, 0), DSizes<ptrdiff_t, 2>(1, 11)).reshape(newDims);
+
+  VERIFY_IS_EQUAL(tensor.dimensions().TotalSize(), 11);
+  VERIFY_IS_EQUAL(tensor.dimension(0), 1);
+  VERIFY_IS_EQUAL(tensor.dimension(1), 1);
+  VERIFY_IS_EQUAL(tensor.dimension(2), 11);
+  for (int i = 0; i < 11; ++i) {
+    VERIFY_IS_EQUAL(tensor(0,0,i), matrix(2,i));
+  }
+}
+
+
+void test_cxx11_tensor_morphing()
+{
+  CALL_SUBTEST_1(test_simple_reshape<void>());
+  CALL_SUBTEST_1(test_reshape_in_expr<void>());
+  CALL_SUBTEST_1(test_reshape_as_lvalue<void>());
+
+  CALL_SUBTEST_1(test_simple_slice<ColMajor>());
+  CALL_SUBTEST_1(test_simple_slice<RowMajor>());
+  CALL_SUBTEST_1(test_const_slice());
+  CALL_SUBTEST_2(test_slice_in_expr<ColMajor>());
+  CALL_SUBTEST_3(test_slice_in_expr<RowMajor>());
+  CALL_SUBTEST_4(test_slice_as_lvalue<ColMajor>());
+  CALL_SUBTEST_4(test_slice_as_lvalue<RowMajor>());
+  CALL_SUBTEST_5(test_slice_raw_data<ColMajor>());
+  CALL_SUBTEST_5(test_slice_raw_data<RowMajor>());
+
+  CALL_SUBTEST_6(test_strided_slice_write<ColMajor>());
+  CALL_SUBTEST_6(test_strided_slice<ColMajor>());
+  CALL_SUBTEST_6(test_strided_slice_write<RowMajor>());
+  CALL_SUBTEST_6(test_strided_slice<RowMajor>());
+
+  CALL_SUBTEST_7(test_composition<ColMajor>());
+  CALL_SUBTEST_7(test_composition<RowMajor>());
+}
diff --git a/unsupported/test/cxx11_tensor_notification.cpp b/unsupported/test/cxx11_tensor_notification.cpp
new file mode 100644
index 000000000..c946007b8
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_notification.cpp
@@ -0,0 +1,81 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Vijay Vasudevan <vrv@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_USE_THREADS
+
+#include <stdlib.h>
+#include "main.h"
+#include <Eigen/CXX11/Tensor>
+
+#if EIGEN_OS_WIN || EIGEN_OS_WIN64
+#include <windows.h>
+void sleep(int seconds) {
+  Sleep(seconds*1000);
+}
+#else
+#include <unistd.h>
+#endif
+
+
+namespace {
+
+void WaitAndAdd(Eigen::Notification* n, int* counter) {
+  n->Wait();
+  *counter = *counter + 1;
+}
+
+}  // namespace
+
+static void test_notification_single()
+{
+  ThreadPool thread_pool(1);
+
+  int counter = 0;
+  Eigen::Notification n;
+  std::function<void()> func = std::bind(&WaitAndAdd, &n, &counter);
+  thread_pool.Schedule(func);
+  sleep(1);
+
+  // The thread should be waiting for the notification.
+  VERIFY_IS_EQUAL(counter, 0);
+
+  // Unblock the thread
+  n.Notify();
+
+  sleep(1);
+
+  // Verify the counter has been incremented
+  VERIFY_IS_EQUAL(counter, 1);
+}
+
+// Like test_notification_single() but enqueues multiple threads to
+// validate that all threads get notified by Notify().
+static void test_notification_multiple()
+{
+  ThreadPool thread_pool(1);
+
+  int counter = 0;
+  Eigen::Notification n;
+  std::function<void()> func = std::bind(&WaitAndAdd, &n, &counter);
+  thread_pool.Schedule(func);
+  thread_pool.Schedule(func);
+  thread_pool.Schedule(func);
+  thread_pool.Schedule(func);
+  sleep(1);
+  VERIFY_IS_EQUAL(counter, 0);
+  n.Notify();
+  sleep(1);
+  VERIFY_IS_EQUAL(counter, 4);
+}
+
+void test_cxx11_tensor_notification()
+{
+  CALL_SUBTEST(test_notification_single());
+  CALL_SUBTEST(test_notification_multiple());
+}
diff --git a/unsupported/test/cxx11_tensor_of_complex.cpp b/unsupported/test/cxx11_tensor_of_complex.cpp
new file mode 100644
index 000000000..e9d1b2d3c
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_of_complex.cpp
@@ -0,0 +1,103 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+
+
+static void test_additions()
+{
+  Tensor<std::complex<float>, 1> data1(3);
+  Tensor<std::complex<float>, 1> data2(3);
+  for (int i = 0; i < 3; ++i) {
+    data1(i) = std::complex<float>(i, -i);
+    data2(i) = std::complex<float>(i, 7 * i);
+  }
+
+  Tensor<std::complex<float>, 1> sum = data1 + data2;
+  for (int i = 0; i < 3; ++i) {
+    VERIFY_IS_EQUAL(sum(i),  std::complex<float>(2*i, 6*i));
+  }
+}
+
+
+static void test_abs()
+{
+  Tensor<std::complex<float>, 1> data1(3);
+  Tensor<std::complex<double>, 1> data2(3);
+  data1.setRandom();
+  data2.setRandom();
+
+  Tensor<float, 1> abs1 = data1.abs();
+  Tensor<double, 1> abs2 = data2.abs();
+  for (int i = 0; i < 3; ++i) {
+    VERIFY_IS_APPROX(abs1(i), std::abs(data1(i)));
+    VERIFY_IS_APPROX(abs2(i), std::abs(data2(i)));
+  }
+}
+
+
+static void test_conjugate()
+{
+  Tensor<std::complex<float>, 1> data1(3);
+  Tensor<std::complex<double>, 1> data2(3);
+  Tensor<int, 1> data3(3);
+  data1.setRandom();
+  data2.setRandom();
+  data3.setRandom();
+
+  Tensor<std::complex<float>, 1> conj1 = data1.conjugate();
+  Tensor<std::complex<double>, 1> conj2 = data2.conjugate();
+  Tensor<int, 1> conj3 = data3.conjugate();
+  for (int i = 0; i < 3; ++i) {
+    VERIFY_IS_APPROX(conj1(i), std::conj(data1(i)));
+    VERIFY_IS_APPROX(conj2(i), std::conj(data2(i)));
+    VERIFY_IS_APPROX(conj3(i), data3(i));
+  }
+}
+
+static void test_contractions()
+{
+  Tensor<std::complex<float>, 4> t_left(30, 50, 8, 31);
+  Tensor<std::complex<float>, 5> t_right(8, 31, 7, 20, 10);
+  Tensor<std::complex<float>, 5> t_result(30, 50, 7, 20, 10);
+
+  t_left.setRandom();
+  t_right.setRandom();
+
+  typedef Map<Matrix<std::complex<float>, Dynamic, Dynamic>> MapXcf;
+  MapXcf m_left(t_left.data(), 1500, 248);
+  MapXcf m_right(t_right.data(), 248, 1400);
+  Matrix<std::complex<float>, Dynamic, Dynamic> m_result(1500, 1400);
+
+  // This contraction should be equivalent to a regular matrix multiplication
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+  Eigen::array<DimPair, 2> dims;
+  dims[0] = DimPair(2, 0);
+  dims[1] = DimPair(3, 1);
+  t_result = t_left.contract(t_right, dims);
+  m_result = m_left * m_right;
+  for (int i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]);
+  }
+}
+
+
+void test_cxx11_tensor_of_complex()
+{
+  CALL_SUBTEST(test_additions());
+  CALL_SUBTEST(test_abs());
+  CALL_SUBTEST(test_conjugate());
+  CALL_SUBTEST(test_contractions());
+}
diff --git a/unsupported/test/cxx11_tensor_of_const_values.cpp b/unsupported/test/cxx11_tensor_of_const_values.cpp
new file mode 100644
index 000000000..f179a0c21
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_of_const_values.cpp
@@ -0,0 +1,105 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+static void test_assign()
+{
+  float data1[6];
+  TensorMap<Tensor<const float, 2>> mat1(data1, 2, 3);
+  float data2[6];
+  const TensorMap<Tensor<float, 2>> mat2(data2, 2, 3);
+
+  for (int i = 0; i < 6; ++i) {
+    data1[i] = i;
+    data2[i] = -i;
+  }
+
+  Tensor<float, 2> rslt1;
+  rslt1 = mat1;
+  Tensor<float, 2> rslt2;
+  rslt2 = mat2;
+
+  Tensor<float, 2> rslt3 = mat1;
+  Tensor<float, 2> rslt4 = mat2;
+
+  Tensor<float, 2> rslt5(mat1);
+  Tensor<float, 2> rslt6(mat2);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      VERIFY_IS_APPROX(rslt1(i,j), static_cast<float>(i + 2*j));
+      VERIFY_IS_APPROX(rslt2(i,j), static_cast<float>(-i - 2*j));
+      VERIFY_IS_APPROX(rslt3(i,j), static_cast<float>(i + 2*j));
+      VERIFY_IS_APPROX(rslt4(i,j), static_cast<float>(-i - 2*j));
+      VERIFY_IS_APPROX(rslt5(i,j), static_cast<float>(i + 2*j));
+      VERIFY_IS_APPROX(rslt6(i,j), static_cast<float>(-i - 2*j));
+    }
+  }
+}
+
+
+static void test_plus()
+{
+  float data1[6];
+  TensorMap<Tensor<const float, 2>> mat1(data1, 2, 3);
+  float data2[6];
+  TensorMap<Tensor<float, 2>> mat2(data2, 2, 3);
+
+  for (int i = 0; i < 6; ++i) {
+    data1[i] = i;
+    data2[i] = -i;
+  }
+
+  Tensor<float, 2> sum1;
+  sum1 = mat1 + mat2;
+  Tensor<float, 2> sum2;
+  sum2 = mat2 + mat1;
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      VERIFY_IS_APPROX(sum1(i,j), 0.0f);
+      VERIFY_IS_APPROX(sum2(i,j), 0.0f);
+    }
+  }
+}
+
+
+static void test_plus_equal()
+{
+  float data1[6];
+  TensorMap<Tensor<const float, 2>> mat1(data1, 2, 3);
+  float data2[6];
+  TensorMap<Tensor<float, 2>> mat2(data2, 2, 3);
+
+  for (int i = 0; i < 6; ++i) {
+    data1[i] = i;
+    data2[i] = -i;
+  }
+  mat2 += mat1;
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      VERIFY_IS_APPROX(mat2(i,j), 0.0f);
+    }
+  }
+}
+
+
+void test_cxx11_tensor_of_const_values()
+{
+  CALL_SUBTEST(test_assign());
+  CALL_SUBTEST(test_plus());
+  CALL_SUBTEST(test_plus_equal());
+}
diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
new file mode 100644
index 000000000..2f86980a2
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
@@ -0,0 +1,494 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_of_float16_cuda
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_GPU
+
+#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+#include <cuda_fp16.h>
+#endif
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template<typename>
+void test_cuda_numext() {
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  int num_elem = 101;
+
+  float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  bool* d_res_half = (bool*)gpu_device.allocate(num_elem * sizeof(bool));
+  bool* d_res_float = (bool*)gpu_device.allocate(num_elem * sizeof(bool));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
+      d_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_res_half(
+      d_res_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_res_float(
+      d_res_float, num_elem);
+
+  gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
+  gpu_res_float.device(gpu_device) = gpu_float.unaryExpr(Eigen::internal::scalar_isnan_op<float>());
+  gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().unaryExpr(Eigen::internal::scalar_isnan_op<Eigen::half>());
+
+  Tensor<bool, 1> half_prec(num_elem);
+  Tensor<bool, 1> full_prec(num_elem);
+  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(bool));
+  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(bool));
+  gpu_device.synchronize();
+
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking numext " << i << std::endl;
+    VERIFY_IS_EQUAL(full_prec(i), half_prec(i));
+  }
+
+  gpu_device.deallocate(d_float);
+  gpu_device.deallocate(d_res_half);
+  gpu_device.deallocate(d_res_float);
+}
+
+
+#ifdef EIGEN_HAS_CUDA_FP16
+
+template<typename>
+void test_cuda_conversion() {
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  int num_elem = 101;
+  
+  float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  Eigen::half* d_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+  float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
+      d_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_half(
+      d_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_conv(
+      d_conv, num_elem);
+
+  gpu_float.device(gpu_device) = gpu_float.random();
+  gpu_half.device(gpu_device) = gpu_float.cast<Eigen::half>();
+  gpu_conv.device(gpu_device) = gpu_half.cast<float>();
+
+  Tensor<float, 1> initial(num_elem);
+  Tensor<float, 1> final(num_elem);
+  gpu_device.memcpyDeviceToHost(initial.data(), d_float, num_elem*sizeof(float));
+  gpu_device.memcpyDeviceToHost(final.data(), d_conv, num_elem*sizeof(float));
+
+  for (int i = 0; i < num_elem; ++i) {
+    VERIFY_IS_APPROX(initial(i), final(i));
+  }
+
+  gpu_device.deallocate(d_float);
+  gpu_device.deallocate(d_half);
+  gpu_device.deallocate(d_conv);
+}
+
+template<typename>
+void test_cuda_unary() {
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  int num_elem = 101;
+
+  float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
+      d_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
+      d_res_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
+      d_res_float, num_elem);
+
+  gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
+  gpu_res_float.device(gpu_device) = gpu_float.abs();
+  gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().cast<float>();
+
+  Tensor<float, 1> half_prec(num_elem);
+  Tensor<float, 1> full_prec(num_elem);
+  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float));
+  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
+  gpu_device.synchronize();
+
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking unary " << i << std::endl;
+    VERIFY_IS_APPROX(full_prec(i), half_prec(i));
+  }
+
+  gpu_device.deallocate(d_float);
+  gpu_device.deallocate(d_res_half);
+  gpu_device.deallocate(d_res_float);
+}
+
+template<typename>
+void test_cuda_elementwise() {
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  int num_elem = 101;
+
+  float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(
+      d_float1, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(
+      d_float2, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
+      d_res_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
+      d_res_float, num_elem);
+
+  gpu_float1.device(gpu_device) = gpu_float1.random();
+  gpu_float2.device(gpu_device) = gpu_float2.random();
+  gpu_res_float.device(gpu_device) = (gpu_float1 + gpu_float2) * gpu_float1;
+  gpu_res_half.device(gpu_device) = ((gpu_float1.cast<Eigen::half>() + gpu_float2.cast<Eigen::half>()) * gpu_float1.cast<Eigen::half>()).cast<float>();
+
+  Tensor<float, 1> half_prec(num_elem);
+  Tensor<float, 1> full_prec(num_elem);
+  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float));
+  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
+  gpu_device.synchronize();
+
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking elemwise " << i << ": full prec = " << full_prec(i) << " vs half prec = " << half_prec(i) << std::endl;
+    VERIFY_IS_APPROX(static_cast<Eigen::half>(full_prec(i)), static_cast<Eigen::half>(half_prec(i)));
+  }
+
+  gpu_device.deallocate(d_float1);
+  gpu_device.deallocate(d_float2);
+  gpu_device.deallocate(d_res_half);
+  gpu_device.deallocate(d_res_float);
+}
+
+template<typename>
+void test_cuda_trancendental() {
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  int num_elem = 101;
+
+  float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_float3 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  Eigen::half* d_res1_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+  Eigen::half* d_res1_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+  Eigen::half* d_res2_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+  Eigen::half* d_res2_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+  Eigen::half* d_res3_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+  Eigen::half* d_res3_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(d_float1, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(d_float2, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float3(d_float3, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_half(d_res1_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_float(d_res1_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_half(d_res2_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_float(d_res2_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_half(d_res3_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_float(d_res3_float, num_elem);
+
+  gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
+  gpu_float2.device(gpu_device) = gpu_float2.random() + gpu_float1.constant(0.5f);
+  gpu_float3.device(gpu_device) = gpu_float3.random();
+  gpu_res1_float.device(gpu_device) = gpu_float1.exp().cast<Eigen::half>();
+  gpu_res2_float.device(gpu_device) = gpu_float2.log().cast<Eigen::half>();
+  gpu_res3_float.device(gpu_device) = gpu_float3.log1p().cast<Eigen::half>();
+
+  gpu_res1_half.device(gpu_device) = gpu_float1.cast<Eigen::half>();
+  gpu_res1_half.device(gpu_device) = gpu_res1_half.exp();
+
+  gpu_res2_half.device(gpu_device) = gpu_float2.cast<Eigen::half>();
+  gpu_res2_half.device(gpu_device) = gpu_res2_half.log();
+
+  gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>();
+  gpu_res3_half.device(gpu_device) = gpu_res3_half.log1p();
+
+  Tensor<float, 1> input1(num_elem);
+  Tensor<Eigen::half, 1> half_prec1(num_elem);
+  Tensor<Eigen::half, 1> full_prec1(num_elem);
+  Tensor<float, 1> input2(num_elem);
+  Tensor<Eigen::half, 1> half_prec2(num_elem);
+  Tensor<Eigen::half, 1> full_prec2(num_elem);
+  Tensor<float, 1> input3(num_elem);
+  Tensor<Eigen::half, 1> half_prec3(num_elem);
+  Tensor<Eigen::half, 1> full_prec3(num_elem);
+  gpu_device.memcpyDeviceToHost(input1.data(), d_float1, num_elem*sizeof(float));
+  gpu_device.memcpyDeviceToHost(input2.data(), d_float2, num_elem*sizeof(float));
+  gpu_device.memcpyDeviceToHost(input3.data(), d_float3, num_elem*sizeof(float));
+  gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res1_half, num_elem*sizeof(Eigen::half));
+  gpu_device.memcpyDeviceToHost(full_prec1.data(), d_res1_float, num_elem*sizeof(Eigen::half));
+  gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res2_half, num_elem*sizeof(Eigen::half));
+  gpu_device.memcpyDeviceToHost(full_prec2.data(), d_res2_float, num_elem*sizeof(Eigen::half));
+  gpu_device.memcpyDeviceToHost(half_prec3.data(), d_res3_half, num_elem*sizeof(Eigen::half));
+  gpu_device.memcpyDeviceToHost(full_prec3.data(), d_res3_float, num_elem*sizeof(Eigen::half));
+  gpu_device.synchronize();
+
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking elemwise exp " << i << " input = " << input1(i) << " full = " << full_prec1(i) << " half = " << half_prec1(i) << std::endl;
+    VERIFY_IS_APPROX(full_prec1(i), half_prec1(i));
+  }
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking elemwise log " << i << " input = " << input2(i) << " full = " << full_prec2(i) << " half = " << half_prec2(i) << std::endl;
+    if(std::abs(input2(i)-1.f)<0.05f) // log lacks accurary nearby 1
+      VERIFY_IS_APPROX(full_prec2(i)+Eigen::half(0.1f), half_prec2(i)+Eigen::half(0.1f));
+    else
+      VERIFY_IS_APPROX(full_prec2(i), half_prec2(i));
+  }
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking elemwise plog1 " << i << " input = " << input3(i) << " full = " << full_prec3(i) << " half = " << half_prec3(i) << std::endl;
+    VERIFY_IS_APPROX(full_prec3(i), half_prec3(i));
+  }
+  gpu_device.deallocate(d_float1);
+  gpu_device.deallocate(d_float2);
+  gpu_device.deallocate(d_float3);
+  gpu_device.deallocate(d_res1_half);
+  gpu_device.deallocate(d_res1_float);
+  gpu_device.deallocate(d_res2_half);
+  gpu_device.deallocate(d_res2_float);
+  gpu_device.deallocate(d_res3_float);
+  gpu_device.deallocate(d_res3_half);
+}
+
+template<typename>
+void test_cuda_contractions() {
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  int rows = 23;
+  int cols = 23;
+  int num_elem = rows*cols;
+
+  float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+  Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
+      d_float1, rows, cols);
+  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
+      d_float2, rows, cols);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_half(
+      d_res_half, rows, cols);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_float(
+      d_res_float, rows, cols);
+
+  gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
+  gpu_float2.device(gpu_device) = gpu_float2.random() - gpu_float2.constant(0.5f);
+
+  typedef Tensor<float, 2>::DimensionPair DimPair;
+  Eigen::array<DimPair, 1> dims(DimPair(1, 0));
+  gpu_res_float.device(gpu_device) = gpu_float1.contract(gpu_float2, dims).cast<Eigen::half>();
+  gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().contract(gpu_float2.cast<Eigen::half>(), dims);
+
+  Tensor<Eigen::half, 2> half_prec(rows, cols);
+  Tensor<Eigen::half, 2> full_prec(rows, cols);
+  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(Eigen::half));
+  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(Eigen::half));
+  gpu_device.synchronize();
+
+  for (int i = 0; i < rows; ++i) {
+    for (int j = 0; j < cols; ++j) {
+      std::cout << "Checking contract " << i << " " << j << full_prec(i, j) << " " << half_prec(i, j) << std::endl;
+      if (numext::abs(full_prec(i, j) - half_prec(i, j)) > Eigen::half(1e-2f)) {
+        VERIFY_IS_APPROX(full_prec(i, j), half_prec(i, j));
+      }
+    }
+  }
+
+  gpu_device.deallocate(d_float1);
+  gpu_device.deallocate(d_float2);
+  gpu_device.deallocate(d_res_half);
+  gpu_device.deallocate(d_res_float);
+}
+
+template<typename>
+void test_cuda_reductions(int size1, int size2, int redux) {
+
+   std::cout << "Reducing " << size1 << " by " << size2
+             << " tensor along dim " << redux << std::endl; 
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  int num_elem = size1*size2;
+  int result_size = (redux == 1 ? size1 : size2);
+
+  float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half));
+  Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
+      d_float1, size1, size2);
+  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
+      d_float2, size1, size2);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_half(
+      d_res_half, result_size);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_float(
+      d_res_float, result_size);
+
+  gpu_float1.device(gpu_device) = gpu_float1.random() * 2.0f;
+  gpu_float2.device(gpu_device) = gpu_float2.random() * 2.0f;
+
+  Eigen::array<int, 1> redux_dim = {{redux}};
+  gpu_res_float.device(gpu_device) = gpu_float1.sum(redux_dim).cast<Eigen::half>();
+  gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum(redux_dim);
+
+  Tensor<Eigen::half, 1> half_prec(result_size);
+  Tensor<Eigen::half, 1> full_prec(result_size);
+  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, result_size*sizeof(Eigen::half));
+  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, result_size*sizeof(Eigen::half));
+  gpu_device.synchronize();
+
+  for (int i = 0; i < result_size; ++i) {
+    std::cout << "EXPECTED " << full_prec(i) << " GOT " << half_prec(i) << std::endl;
+    VERIFY_IS_APPROX(full_prec(i), half_prec(i));
+  }
+
+  gpu_device.deallocate(d_float1);
+  gpu_device.deallocate(d_float2);
+  gpu_device.deallocate(d_res_half);
+  gpu_device.deallocate(d_res_float);
+}
+
+template<typename>
+void test_cuda_reductions() {
+  test_cuda_reductions<void>(13, 13, 0);
+  test_cuda_reductions<void>(13, 13, 1);
+
+  test_cuda_reductions<void>(35, 36, 0);
+  test_cuda_reductions<void>(35, 36, 1);
+
+  test_cuda_reductions<void>(36, 35, 0);
+  test_cuda_reductions<void>(36, 35, 1);
+}
+
+template<typename>
+void test_cuda_full_reductions() {
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  int size = 13;
+  int num_elem = size*size;
+
+  float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half));
+  Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
+      d_float1, size, size);
+  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
+      d_float2, size, size);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_half(
+      d_res_half);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_float(
+      d_res_float);
+
+  gpu_float1.device(gpu_device) = gpu_float1.random();
+  gpu_float2.device(gpu_device) = gpu_float2.random();
+
+  gpu_res_float.device(gpu_device) = gpu_float1.sum().cast<Eigen::half>();
+  gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum();
+
+  Tensor<Eigen::half, 0> half_prec;
+  Tensor<Eigen::half, 0> full_prec;
+  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
+  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
+  gpu_device.synchronize();
+
+  VERIFY_IS_APPROX(full_prec(), half_prec());
+
+  gpu_res_float.device(gpu_device) = gpu_float1.maximum().cast<Eigen::half>();
+  gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().maximum();
+  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
+  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
+  gpu_device.synchronize();
+
+  VERIFY_IS_APPROX(full_prec(), half_prec());
+
+  gpu_device.deallocate(d_float1);
+  gpu_device.deallocate(d_float2);
+  gpu_device.deallocate(d_res_half);
+  gpu_device.deallocate(d_res_float);
+}
+
+template<typename>
+void test_cuda_forced_evals() {
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  int num_elem = 101;
+
+  float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_res_half1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_res_half2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
+      d_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half1(
+      d_res_half1, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Unaligned> gpu_res_half2(
+      d_res_half2, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
+      d_res_float, num_elem);
+
+  Eigen::array<int, 1> no_bcast;
+  no_bcast[0] = 1;
+
+  gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
+  gpu_res_float.device(gpu_device) = gpu_float.abs();
+  gpu_res_half1.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().eval().cast<float>();
+  gpu_res_half2.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().broadcast(no_bcast).eval().cast<float>();
+
+  Tensor<float, 1> half_prec1(num_elem);
+  Tensor<float, 1> half_prec2(num_elem);
+  Tensor<float, 1> full_prec(num_elem);
+  gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res_half1, num_elem*sizeof(float));
+  gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res_half1, num_elem*sizeof(float));
+  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
+  gpu_device.synchronize();
+
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking forced eval " << i << full_prec(i) << " vs " << half_prec1(i) << " vs " << half_prec2(i) << std::endl;
+    VERIFY_IS_APPROX(full_prec(i), half_prec1(i));
+    VERIFY_IS_APPROX(full_prec(i), half_prec2(i));
+  }
+
+  gpu_device.deallocate(d_float);
+  gpu_device.deallocate(d_res_half1);
+  gpu_device.deallocate(d_res_half2);
+  gpu_device.deallocate(d_res_float);
+}
+#endif
+
+
+void test_cxx11_tensor_of_float16_cuda()
+{
+  CALL_SUBTEST_1(test_cuda_numext<void>());
+
+#ifdef EIGEN_HAS_CUDA_FP16
+  CALL_SUBTEST_1(test_cuda_conversion<void>());
+  CALL_SUBTEST_1(test_cuda_unary<void>());
+  CALL_SUBTEST_1(test_cuda_elementwise<void>());
+  CALL_SUBTEST_1(test_cuda_trancendental<void>());
+  CALL_SUBTEST_2(test_cuda_contractions<void>());
+  CALL_SUBTEST_3(test_cuda_reductions<void>());
+  CALL_SUBTEST_4(test_cuda_full_reductions<void>());
+  CALL_SUBTEST_5(test_cuda_forced_evals<void>());
+#else
+  std::cout << "Half floats are not supported by this version of cuda: skipping the test" << std::endl;
+#endif
+}
diff --git a/unsupported/test/cxx11_tensor_of_strings.cpp b/unsupported/test/cxx11_tensor_of_strings.cpp
new file mode 100644
index 000000000..4ef9aed91
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_of_strings.cpp
@@ -0,0 +1,152 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+static void test_assign()
+{
+  std::string data1[6];
+  TensorMap<Tensor<std::string, 2>> mat1(data1, 2, 3);
+  std::string data2[6];
+  const TensorMap<Tensor<const std::string, 2>> mat2(data2, 2, 3);
+
+  for (int i = 0; i < 6; ++i) {
+    std::ostringstream s1;
+    s1 << "abc" << i*3;
+    data1[i] = s1.str();
+    std::ostringstream s2;
+    s2 << "def" << i*5;
+    data2[i] = s2.str();
+  }
+
+  Tensor<std::string, 2> rslt1;
+  rslt1 = mat1;
+  Tensor<std::string, 2> rslt2;
+  rslt2 = mat2;
+
+  Tensor<std::string, 2> rslt3 = mat1;
+  Tensor<std::string, 2> rslt4 = mat2;
+
+  Tensor<std::string, 2> rslt5(mat1);
+  Tensor<std::string, 2> rslt6(mat2);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      VERIFY_IS_EQUAL(rslt1(i,j), data1[i+2*j]);
+      VERIFY_IS_EQUAL(rslt2(i,j), data2[i+2*j]);
+      VERIFY_IS_EQUAL(rslt3(i,j), data1[i+2*j]);
+      VERIFY_IS_EQUAL(rslt4(i,j), data2[i+2*j]);
+      VERIFY_IS_EQUAL(rslt5(i,j), data1[i+2*j]);
+      VERIFY_IS_EQUAL(rslt6(i,j), data2[i+2*j]);
+    }
+  }
+}
+
+
+static void test_concat()
+{
+  Tensor<std::string, 2> t1(2, 3);
+  Tensor<std::string, 2> t2(2, 3);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      std::ostringstream s1;
+      s1 << "abc" << i + j*2;
+      t1(i, j) = s1.str();
+      std::ostringstream s2;
+      s2 << "def" << i*5 + j*32;
+      t2(i, j) = s2.str();
+    }
+  }
+
+  Tensor<std::string, 2> result = t1.concatenate(t2, 1);
+  VERIFY_IS_EQUAL(result.dimension(0), 2);
+  VERIFY_IS_EQUAL(result.dimension(1), 6);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      VERIFY_IS_EQUAL(result(i, j),   t1(i, j));
+      VERIFY_IS_EQUAL(result(i, j+3), t2(i, j));
+    }
+  }
+}
+
+
+static void test_slices()
+{
+  Tensor<std::string, 2> data(2, 6);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      std::ostringstream s1;
+      s1 << "abc" << i + j*2;
+      data(i, j) = s1.str();
+    }
+  }
+
+  const Eigen::DSizes<ptrdiff_t, 2> half_size(2, 3);
+  const Eigen::DSizes<ptrdiff_t, 2> first_half(0, 0);
+  const Eigen::DSizes<ptrdiff_t, 2> second_half(0, 3);
+
+  Tensor<std::string, 2> t1 = data.slice(first_half, half_size);
+  Tensor<std::string, 2> t2 = data.slice(second_half, half_size);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      VERIFY_IS_EQUAL(data(i, j),   t1(i, j));
+      VERIFY_IS_EQUAL(data(i, j+3), t2(i, j));
+    }
+  }
+}
+
+
+static void test_additions()
+{
+  Tensor<std::string, 1> data1(3);
+  Tensor<std::string, 1> data2(3);
+  for (int i = 0; i < 3; ++i) {
+    data1(i) = "abc";
+    std::ostringstream s1;
+    s1 << i;
+    data2(i) = s1.str();
+  }
+
+  Tensor<std::string, 1> sum = data1 + data2;
+  for (int i = 0; i < 3; ++i) {
+    std::ostringstream concat;
+    concat << "abc" << i;
+    std::string expected = concat.str();
+    VERIFY_IS_EQUAL(sum(i), expected);
+  }
+}
+
+
+static void test_initialization()
+{
+  Tensor<std::string, 2> a(2, 3);
+  a.setConstant(std::string("foo"));
+  for (int i = 0; i < 2*3; ++i) {
+    VERIFY_IS_EQUAL(a(i), std::string("foo"));
+  }
+}
+
+
+void test_cxx11_tensor_of_strings()
+{
+  // Beware: none of this is likely to ever work on a GPU.
+  CALL_SUBTEST(test_assign());
+  CALL_SUBTEST(test_concat());
+  CALL_SUBTEST(test_slices());
+  CALL_SUBTEST(test_additions());
+  CALL_SUBTEST(test_initialization());
+}
diff --git a/unsupported/test/cxx11_tensor_padding.cpp b/unsupported/test/cxx11_tensor_padding.cpp
new file mode 100644
index 000000000..ffa19896e
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_padding.cpp
@@ -0,0 +1,93 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template<int DataLayout>
+static void test_simple_padding()
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+
+  array<std::pair<ptrdiff_t, ptrdiff_t>, 4> paddings;
+  paddings[0] = std::make_pair(0, 0);
+  paddings[1] = std::make_pair(2, 1);
+  paddings[2] = std::make_pair(3, 4);
+  paddings[3] = std::make_pair(0, 0);
+
+  Tensor<float, 4, DataLayout> padded;
+  padded = tensor.pad(paddings);
+
+  VERIFY_IS_EQUAL(padded.dimension(0), 2+0);
+  VERIFY_IS_EQUAL(padded.dimension(1), 3+3);
+  VERIFY_IS_EQUAL(padded.dimension(2), 5+7);
+  VERIFY_IS_EQUAL(padded.dimension(3), 7+0);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 6; ++j) {
+      for (int k = 0; k < 12; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          if (j >= 2 && j < 5 && k >= 3 && k < 8) {
+            VERIFY_IS_EQUAL(padded(i,j,k,l), tensor(i,j-2,k-3,l));
+          } else {
+            VERIFY_IS_EQUAL(padded(i,j,k,l), 0.0f);
+          }
+        }
+      }
+    }
+  }
+}
+
+template<int DataLayout>
+static void test_padded_expr()
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+
+  array<std::pair<ptrdiff_t, ptrdiff_t>, 4> paddings;
+  paddings[0] = std::make_pair(0, 0);
+  paddings[1] = std::make_pair(2, 1);
+  paddings[2] = std::make_pair(3, 4);
+  paddings[3] = std::make_pair(0, 0);
+
+  Eigen::DSizes<ptrdiff_t, 2> reshape_dims;
+  reshape_dims[0] = 12;
+  reshape_dims[1] = 84;
+
+  Tensor<float, 2, DataLayout> result;
+  result = tensor.pad(paddings).reshape(reshape_dims);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 6; ++j) {
+      for (int k = 0; k < 12; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          const float result_value = DataLayout == ColMajor ?
+              result(i+2*j,k+12*l) : result(j+6*i,l+7*k);
+          if (j >= 2 && j < 5 && k >= 3 && k < 8) {
+            VERIFY_IS_EQUAL(result_value, tensor(i,j-2,k-3,l));
+          } else {
+            VERIFY_IS_EQUAL(result_value, 0.0f);
+          }
+        }
+      }
+    }
+  }
+}
+
+void test_cxx11_tensor_padding()
+{
+  CALL_SUBTEST(test_simple_padding<ColMajor>());
+  CALL_SUBTEST(test_simple_padding<RowMajor>());
+  CALL_SUBTEST(test_padded_expr<ColMajor>());
+  CALL_SUBTEST(test_padded_expr<RowMajor>());
+}
diff --git a/unsupported/test/cxx11_tensor_patch.cpp b/unsupported/test/cxx11_tensor_patch.cpp
new file mode 100644
index 000000000..434359730
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_patch.cpp
@@ -0,0 +1,172 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template<int DataLayout>
+static void test_simple_patch()
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+  array<ptrdiff_t, 4> patch_dims;
+
+  patch_dims[0] = 1;
+  patch_dims[1] = 1;
+  patch_dims[2] = 1;
+  patch_dims[3] = 1;
+
+  Tensor<float, 5, DataLayout> no_patch;
+  no_patch = tensor.extract_patches(patch_dims);
+
+  if (DataLayout == ColMajor) {
+    VERIFY_IS_EQUAL(no_patch.dimension(0), 1);
+    VERIFY_IS_EQUAL(no_patch.dimension(1), 1);
+    VERIFY_IS_EQUAL(no_patch.dimension(2), 1);
+    VERIFY_IS_EQUAL(no_patch.dimension(3), 1);
+    VERIFY_IS_EQUAL(no_patch.dimension(4), tensor.size());
+  } else {
+    VERIFY_IS_EQUAL(no_patch.dimension(0), tensor.size());
+    VERIFY_IS_EQUAL(no_patch.dimension(1), 1);
+    VERIFY_IS_EQUAL(no_patch.dimension(2), 1);
+    VERIFY_IS_EQUAL(no_patch.dimension(3), 1);
+    VERIFY_IS_EQUAL(no_patch.dimension(4), 1);
+  }
+
+  for (int i = 0; i < tensor.size(); ++i) {
+    VERIFY_IS_EQUAL(tensor.data()[i], no_patch.data()[i]);
+  }
+
+  patch_dims[0] = 2;
+  patch_dims[1] = 3;
+  patch_dims[2] = 5;
+  patch_dims[3] = 7;
+  Tensor<float, 5, DataLayout> single_patch;
+  single_patch = tensor.extract_patches(patch_dims);
+
+  if (DataLayout == ColMajor) {
+    VERIFY_IS_EQUAL(single_patch.dimension(0), 2);
+    VERIFY_IS_EQUAL(single_patch.dimension(1), 3);
+    VERIFY_IS_EQUAL(single_patch.dimension(2), 5);
+    VERIFY_IS_EQUAL(single_patch.dimension(3), 7);
+    VERIFY_IS_EQUAL(single_patch.dimension(4), 1);
+  } else {
+    VERIFY_IS_EQUAL(single_patch.dimension(0), 1);
+    VERIFY_IS_EQUAL(single_patch.dimension(1), 2);
+    VERIFY_IS_EQUAL(single_patch.dimension(2), 3);
+    VERIFY_IS_EQUAL(single_patch.dimension(3), 5);
+    VERIFY_IS_EQUAL(single_patch.dimension(4), 7);
+  }
+
+  for (int i = 0; i < tensor.size(); ++i) {
+    VERIFY_IS_EQUAL(tensor.data()[i], single_patch.data()[i]);
+  }
+
+  patch_dims[0] = 1;
+  patch_dims[1] = 2;
+  patch_dims[2] = 2;
+  patch_dims[3] = 1;
+  Tensor<float, 5, DataLayout> twod_patch;
+  twod_patch = tensor.extract_patches(patch_dims);
+
+  if (DataLayout == ColMajor) {
+    VERIFY_IS_EQUAL(twod_patch.dimension(0), 1);
+    VERIFY_IS_EQUAL(twod_patch.dimension(1), 2);
+    VERIFY_IS_EQUAL(twod_patch.dimension(2), 2);
+    VERIFY_IS_EQUAL(twod_patch.dimension(3), 1);
+    VERIFY_IS_EQUAL(twod_patch.dimension(4), 2*2*4*7);
+  } else {
+    VERIFY_IS_EQUAL(twod_patch.dimension(0), 2*2*4*7);
+    VERIFY_IS_EQUAL(twod_patch.dimension(1), 1);
+    VERIFY_IS_EQUAL(twod_patch.dimension(2), 2);
+    VERIFY_IS_EQUAL(twod_patch.dimension(3), 2);
+    VERIFY_IS_EQUAL(twod_patch.dimension(4), 1);
+  }
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 2; ++j) {
+      for (int k = 0; k < 4; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          int patch_loc;
+          if (DataLayout == ColMajor) {
+            patch_loc = i + 2 * (j + 2 * (k + 4 * l));
+          } else {
+            patch_loc = l + 7 * (k + 4 * (j + 2 * i));
+          }
+          for (int x = 0; x < 2; ++x) {
+            for (int y = 0; y < 2; ++y) {
+              if (DataLayout == ColMajor) {
+                VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l), twod_patch(0,x,y,0,patch_loc));
+              } else {
+                VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l), twod_patch(patch_loc,0,x,y,0));
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  patch_dims[0] = 1;
+  patch_dims[1] = 2;
+  patch_dims[2] = 3;
+  patch_dims[3] = 5;
+  Tensor<float, 5, DataLayout> threed_patch;
+  threed_patch = tensor.extract_patches(patch_dims);
+
+  if (DataLayout == ColMajor) {
+    VERIFY_IS_EQUAL(threed_patch.dimension(0), 1);
+    VERIFY_IS_EQUAL(threed_patch.dimension(1), 2);
+    VERIFY_IS_EQUAL(threed_patch.dimension(2), 3);
+    VERIFY_IS_EQUAL(threed_patch.dimension(3), 5);
+    VERIFY_IS_EQUAL(threed_patch.dimension(4), 2*2*3*3);
+  } else {
+    VERIFY_IS_EQUAL(threed_patch.dimension(0), 2*2*3*3);
+    VERIFY_IS_EQUAL(threed_patch.dimension(1), 1);
+    VERIFY_IS_EQUAL(threed_patch.dimension(2), 2);
+    VERIFY_IS_EQUAL(threed_patch.dimension(3), 3);
+    VERIFY_IS_EQUAL(threed_patch.dimension(4), 5);
+  }
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 2; ++j) {
+      for (int k = 0; k < 3; ++k) {
+        for (int l = 0; l < 3; ++l) {
+          int patch_loc;
+          if (DataLayout == ColMajor) {
+            patch_loc = i + 2 * (j + 2 * (k + 3 * l));
+          } else {
+            patch_loc = l + 3 * (k + 3 * (j + 2 * i));
+          }
+          for (int x = 0; x < 2; ++x) {
+            for (int y = 0; y < 3; ++y) {
+              for (int z = 0; z < 5; ++z) {
+                if (DataLayout == ColMajor) {
+                  VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l+z), threed_patch(0,x,y,z,patch_loc));
+                } else {
+                  VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l+z), threed_patch(patch_loc,0,x,y,z));
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+void test_cxx11_tensor_patch()
+{
+   CALL_SUBTEST(test_simple_patch<ColMajor>());
+   CALL_SUBTEST(test_simple_patch<RowMajor>());
+   //   CALL_SUBTEST(test_expr_shuffling());
+}
diff --git a/unsupported/test/cxx11_tensor_random.cpp b/unsupported/test/cxx11_tensor_random.cpp
new file mode 100644
index 000000000..0f3dc5787
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_random.cpp
@@ -0,0 +1,78 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+static void test_default()
+{
+  Tensor<float, 1> vec(6);
+  vec.setRandom();
+
+  // Fixme: we should check that the generated numbers follow a uniform
+  // distribution instead.
+  for (int i = 1; i < 6; ++i) {
+    VERIFY_IS_NOT_EQUAL(vec(i), vec(i-1));
+  }
+}
+
+static void test_normal()
+{
+  Tensor<float, 1> vec(6);
+  vec.setRandom<Eigen::internal::NormalRandomGenerator<float>>();
+
+  // Fixme: we should check that the generated numbers follow a gaussian
+  // distribution instead.
+  for (int i = 1; i < 6; ++i) {
+    VERIFY_IS_NOT_EQUAL(vec(i), vec(i-1));
+  }
+}
+
+
+struct MyGenerator {
+  MyGenerator() { }
+  MyGenerator(const MyGenerator&) { }
+
+  // Return a random value to be used.  "element_location" is the
+  // location of the entry to set in the tensor, it can typically
+  // be ignored.
+  int operator()(Eigen::DenseIndex element_location, Eigen::DenseIndex /*unused*/ = 0) const {
+    return static_cast<int>(3 * element_location);
+  }
+
+  // Same as above but generates several numbers at a time.
+  internal::packet_traits<int>::type packetOp(
+      Eigen::DenseIndex packet_location, Eigen::DenseIndex /*unused*/ = 0) const {
+    const int packetSize = internal::packet_traits<int>::size;
+    EIGEN_ALIGN_MAX int values[packetSize];
+    for (int i = 0; i < packetSize; ++i) {
+      values[i] = static_cast<int>(3 * (packet_location + i));
+    }
+    return internal::pload<typename internal::packet_traits<int>::type>(values);
+  }
+};
+
+
+static void test_custom()
+{
+  Tensor<int, 1> vec(6);
+  vec.setRandom<MyGenerator>();
+
+  for (int i = 0; i < 6; ++i) {
+    VERIFY_IS_EQUAL(vec(i), 3*i);
+  }
+}
+
+void test_cxx11_tensor_random()
+{
+  CALL_SUBTEST(test_default());
+  CALL_SUBTEST(test_normal());
+  CALL_SUBTEST(test_custom());
+}
diff --git a/unsupported/test/cxx11_tensor_random_cuda.cu b/unsupported/test/cxx11_tensor_random_cuda.cu
new file mode 100644
index 000000000..b3be199e1
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_random_cuda.cu
@@ -0,0 +1,88 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_random_cuda
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_GPU
+
+#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+#include <cuda_fp16.h>
+#endif
+#include "main.h"
+#include <Eigen/CXX11/Tensor>
+
+
+void test_cuda_random_uniform()
+{
+  Tensor<float, 2> out(72,97);
+  out.setZero();
+
+  std::size_t out_bytes = out.size() * sizeof(float);
+
+  float* d_out;
+  cudaMalloc((void**)(&d_out), out_bytes);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
+
+  gpu_out.device(gpu_device) = gpu_out.random();
+
+  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  // For now we just check thes code doesn't crash.
+  // TODO: come up with a valid test of randomness
+}
+
+
+void test_cuda_random_normal()
+{
+  Tensor<float, 2> out(72,97);
+  out.setZero();
+
+  std::size_t out_bytes = out.size() * sizeof(float);
+
+  float* d_out;
+  cudaMalloc((void**)(&d_out), out_bytes);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
+
+  Eigen::internal::NormalRandomGenerator<float> gen(true);
+  gpu_out.device(gpu_device) = gpu_out.random(gen);
+
+  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+}
+
+static void test_complex()
+{
+  Tensor<std::complex<float>, 1> vec(6);
+  vec.setRandom();
+
+  // Fixme: we should check that the generated numbers follow a uniform
+  // distribution instead.
+  for (int i = 1; i < 6; ++i) {
+    VERIFY_IS_NOT_EQUAL(vec(i), vec(i-1));
+  }
+}
+
+
+void test_cxx11_tensor_random_cuda()
+{
+  CALL_SUBTEST(test_cuda_random_uniform());
+  CALL_SUBTEST(test_cuda_random_normal());
+  CALL_SUBTEST(test_complex());
+}
diff --git a/unsupported/test/cxx11_tensor_reduction.cpp b/unsupported/test/cxx11_tensor_reduction.cpp
new file mode 100644
index 000000000..1490ec3da
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_reduction.cpp
@@ -0,0 +1,508 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <limits>
+#include <numeric>
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template <int DataLayout>
+static void test_trivial_reductions() {
+  {
+    Tensor<float, 0, DataLayout> tensor;
+    tensor.setRandom();
+    array<ptrdiff_t, 0> reduction_axis;
+
+    Tensor<float, 0, DataLayout> result = tensor.sum(reduction_axis);
+    VERIFY_IS_EQUAL(result(), tensor());
+  }
+
+  {
+    Tensor<float, 1, DataLayout> tensor(7);
+    tensor.setRandom();
+    array<ptrdiff_t, 0> reduction_axis;
+
+    Tensor<float, 1, DataLayout> result = tensor.sum(reduction_axis);
+    VERIFY_IS_EQUAL(result.dimension(0), 7);
+    for (int i = 0; i < 7; ++i) {
+      VERIFY_IS_EQUAL(result(i), tensor(i));
+    }
+  }
+
+  {
+    Tensor<float, 2, DataLayout> tensor(2, 3);
+    tensor.setRandom();
+    array<ptrdiff_t, 0> reduction_axis;
+
+    Tensor<float, 2, DataLayout> result = tensor.sum(reduction_axis);
+    VERIFY_IS_EQUAL(result.dimension(0), 2);
+    VERIFY_IS_EQUAL(result.dimension(1), 3);
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 3; ++j) {
+        VERIFY_IS_EQUAL(result(i, j), tensor(i, j));
+      }
+    }
+  }
+}
+
+template <int DataLayout>
+static void test_simple_reductions() {
+  Tensor<float, 4, DataLayout> tensor(2, 3, 5, 7);
+  tensor.setRandom();
+  array<ptrdiff_t, 2> reduction_axis2;
+  reduction_axis2[0] = 1;
+  reduction_axis2[1] = 3;
+
+  Tensor<float, 2, DataLayout> result = tensor.sum(reduction_axis2);
+  VERIFY_IS_EQUAL(result.dimension(0), 2);
+  VERIFY_IS_EQUAL(result.dimension(1), 5);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      float sum = 0.0f;
+      for (int k = 0; k < 3; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          sum += tensor(i, k, j, l);
+        }
+      }
+      VERIFY_IS_APPROX(result(i, j), sum);
+    }
+  }
+
+  {
+    Tensor<float, 0, DataLayout> sum1 = tensor.sum();
+    VERIFY_IS_EQUAL(sum1.rank(), 0);
+
+    array<ptrdiff_t, 4> reduction_axis4;
+    reduction_axis4[0] = 0;
+    reduction_axis4[1] = 1;
+    reduction_axis4[2] = 2;
+    reduction_axis4[3] = 3;
+    Tensor<float, 0, DataLayout> sum2 = tensor.sum(reduction_axis4);
+    VERIFY_IS_EQUAL(sum2.rank(), 0);
+
+    VERIFY_IS_APPROX(sum1(), sum2());
+  }
+
+  reduction_axis2[0] = 0;
+  reduction_axis2[1] = 2;
+  result = tensor.prod(reduction_axis2);
+  VERIFY_IS_EQUAL(result.dimension(0), 3);
+  VERIFY_IS_EQUAL(result.dimension(1), 7);
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 7; ++j) {
+      float prod = 1.0f;
+      for (int k = 0; k < 2; ++k) {
+        for (int l = 0; l < 5; ++l) {
+          prod *= tensor(k, i, l, j);
+        }
+      }
+      VERIFY_IS_APPROX(result(i, j), prod);
+    }
+  }
+
+  {
+    Tensor<float, 0, DataLayout> prod1 = tensor.prod();
+    VERIFY_IS_EQUAL(prod1.rank(), 0);
+
+    array<ptrdiff_t, 4> reduction_axis4;
+    reduction_axis4[0] = 0;
+    reduction_axis4[1] = 1;
+    reduction_axis4[2] = 2;
+    reduction_axis4[3] = 3;
+    Tensor<float, 0, DataLayout> prod2 = tensor.prod(reduction_axis4);
+    VERIFY_IS_EQUAL(prod2.rank(), 0);
+
+    VERIFY_IS_APPROX(prod1(), prod2());
+  }
+
+  reduction_axis2[0] = 0;
+  reduction_axis2[1] = 2;
+  result = tensor.maximum(reduction_axis2);
+  VERIFY_IS_EQUAL(result.dimension(0), 3);
+  VERIFY_IS_EQUAL(result.dimension(1), 7);
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 7; ++j) {
+      float max_val = std::numeric_limits<float>::lowest();
+      for (int k = 0; k < 2; ++k) {
+        for (int l = 0; l < 5; ++l) {
+          max_val = (std::max)(max_val, tensor(k, i, l, j));
+        }
+      }
+      VERIFY_IS_APPROX(result(i, j), max_val);
+    }
+  }
+
+  {
+    Tensor<float, 0, DataLayout> max1 = tensor.maximum();
+    VERIFY_IS_EQUAL(max1.rank(), 0);
+
+    array<ptrdiff_t, 4> reduction_axis4;
+    reduction_axis4[0] = 0;
+    reduction_axis4[1] = 1;
+    reduction_axis4[2] = 2;
+    reduction_axis4[3] = 3;
+    Tensor<float, 0, DataLayout> max2 = tensor.maximum(reduction_axis4);
+    VERIFY_IS_EQUAL(max2.rank(), 0);
+
+    VERIFY_IS_APPROX(max1(), max2());
+  }
+
+  reduction_axis2[0] = 0;
+  reduction_axis2[1] = 1;
+  result = tensor.minimum(reduction_axis2);
+  VERIFY_IS_EQUAL(result.dimension(0), 5);
+  VERIFY_IS_EQUAL(result.dimension(1), 7);
+  for (int i = 0; i < 5; ++i) {
+    for (int j = 0; j < 7; ++j) {
+      float min_val = (std::numeric_limits<float>::max)();
+      for (int k = 0; k < 2; ++k) {
+        for (int l = 0; l < 3; ++l) {
+          min_val = (std::min)(min_val, tensor(k, l, i, j));
+        }
+      }
+      VERIFY_IS_APPROX(result(i, j), min_val);
+    }
+  }
+
+  {
+    Tensor<float, 0, DataLayout> min1 = tensor.minimum();
+    VERIFY_IS_EQUAL(min1.rank(), 0);
+
+    array<ptrdiff_t, 4> reduction_axis4;
+    reduction_axis4[0] = 0;
+    reduction_axis4[1] = 1;
+    reduction_axis4[2] = 2;
+    reduction_axis4[3] = 3;
+    Tensor<float, 0, DataLayout> min2 = tensor.minimum(reduction_axis4);
+    VERIFY_IS_EQUAL(min2.rank(), 0);
+
+    VERIFY_IS_APPROX(min1(), min2());
+  }
+
+  reduction_axis2[0] = 0;
+  reduction_axis2[1] = 1;
+  result = tensor.mean(reduction_axis2);
+  VERIFY_IS_EQUAL(result.dimension(0), 5);
+  VERIFY_IS_EQUAL(result.dimension(1), 7);
+  for (int i = 0; i < 5; ++i) {
+    for (int j = 0; j < 7; ++j) {
+      float sum = 0.0f;
+      int count = 0;
+      for (int k = 0; k < 2; ++k) {
+        for (int l = 0; l < 3; ++l) {
+          sum += tensor(k, l, i, j);
+          ++count;
+        }
+      }
+      VERIFY_IS_APPROX(result(i, j), sum / count);
+    }
+  }
+
+  {
+    Tensor<float, 0, DataLayout> mean1 = tensor.mean();
+    VERIFY_IS_EQUAL(mean1.rank(), 0);
+
+    array<ptrdiff_t, 4> reduction_axis4;
+    reduction_axis4[0] = 0;
+    reduction_axis4[1] = 1;
+    reduction_axis4[2] = 2;
+    reduction_axis4[3] = 3;
+    Tensor<float, 0, DataLayout> mean2 = tensor.mean(reduction_axis4);
+    VERIFY_IS_EQUAL(mean2.rank(), 0);
+
+    VERIFY_IS_APPROX(mean1(), mean2());
+  }
+
+  {
+    Tensor<int, 1> ints(10);
+    std::iota(ints.data(), ints.data() + ints.dimension(0), 0);
+
+    TensorFixedSize<bool, Sizes<> > all;
+    all = ints.all();
+    VERIFY(!all());
+    all = (ints >= ints.constant(0)).all();
+    VERIFY(all());
+
+    TensorFixedSize<bool, Sizes<> > any;
+    any = (ints > ints.constant(10)).any();
+    VERIFY(!any());
+    any = (ints < ints.constant(1)).any();
+    VERIFY(any());
+  }
+}
+
+
+template <int DataLayout>
+static void test_reductions_in_expr() {
+  Tensor<float, 4, DataLayout> tensor(2, 3, 5, 7);
+  tensor.setRandom();
+  array<ptrdiff_t, 2> reduction_axis2;
+  reduction_axis2[0] = 1;
+  reduction_axis2[1] = 3;
+
+  Tensor<float, 2, DataLayout> result(2, 5);
+  result = result.constant(1.0f) - tensor.sum(reduction_axis2);
+  VERIFY_IS_EQUAL(result.dimension(0), 2);
+  VERIFY_IS_EQUAL(result.dimension(1), 5);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      float sum = 0.0f;
+      for (int k = 0; k < 3; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          sum += tensor(i, k, j, l);
+        }
+      }
+      VERIFY_IS_APPROX(result(i, j), 1.0f - sum);
+    }
+  }
+}
+
+
+template <int DataLayout>
+static void test_full_reductions() {
+  Tensor<float, 2, DataLayout> tensor(2, 3);
+  tensor.setRandom();
+  array<ptrdiff_t, 2> reduction_axis;
+  reduction_axis[0] = 0;
+  reduction_axis[1] = 1;
+
+  Tensor<float, 0, DataLayout> result = tensor.sum(reduction_axis);
+  VERIFY_IS_EQUAL(result.rank(), 0);
+
+  float sum = 0.0f;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      sum += tensor(i, j);
+    }
+  }
+  VERIFY_IS_APPROX(result(0), sum);
+
+  result = tensor.square().sum(reduction_axis).sqrt();
+  VERIFY_IS_EQUAL(result.rank(), 0);
+
+  sum = 0.0f;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      sum += tensor(i, j) * tensor(i, j);
+    }
+  }
+  VERIFY_IS_APPROX(result(), sqrtf(sum));
+}
+
+struct UserReducer {
+  static const bool PacketAccess = false;
+  UserReducer(float offset) : offset_(offset) {}
+  void reduce(const float val, float* accum) { *accum += val * val; }
+  float initialize() const { return 0; }
+  float finalize(const float accum) const { return 1.0f / (accum + offset_); }
+
+ private:
+  const float offset_;
+};
+
+template <int DataLayout>
+static void test_user_defined_reductions() {
+  Tensor<float, 2, DataLayout> tensor(5, 7);
+  tensor.setRandom();
+  array<ptrdiff_t, 1> reduction_axis;
+  reduction_axis[0] = 1;
+
+  UserReducer reducer(10.0f);
+  Tensor<float, 1, DataLayout> result = tensor.reduce(reduction_axis, reducer);
+  VERIFY_IS_EQUAL(result.dimension(0), 5);
+  for (int i = 0; i < 5; ++i) {
+    float expected = 10.0f;
+    for (int j = 0; j < 7; ++j) {
+      expected += tensor(i, j) * tensor(i, j);
+    }
+    expected = 1.0f / expected;
+    VERIFY_IS_APPROX(result(i), expected);
+  }
+}
+
+template <int DataLayout>
+static void test_tensor_maps() {
+  int inputs[2 * 3 * 5 * 7];
+  TensorMap<Tensor<int, 4, DataLayout> > tensor_map(inputs, 2, 3, 5, 7);
+  TensorMap<Tensor<const int, 4, DataLayout> > tensor_map_const(inputs, 2, 3, 5,
+                                                                7);
+  const TensorMap<Tensor<const int, 4, DataLayout> > tensor_map_const_const(
+      inputs, 2, 3, 5, 7);
+
+  tensor_map.setRandom();
+  array<ptrdiff_t, 2> reduction_axis;
+  reduction_axis[0] = 1;
+  reduction_axis[1] = 3;
+
+  Tensor<int, 2, DataLayout> result = tensor_map.sum(reduction_axis);
+  Tensor<int, 2, DataLayout> result2 = tensor_map_const.sum(reduction_axis);
+  Tensor<int, 2, DataLayout> result3 =
+      tensor_map_const_const.sum(reduction_axis);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      int sum = 0;
+      for (int k = 0; k < 3; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          sum += tensor_map(i, k, j, l);
+        }
+      }
+      VERIFY_IS_EQUAL(result(i, j), sum);
+      VERIFY_IS_EQUAL(result2(i, j), sum);
+      VERIFY_IS_EQUAL(result3(i, j), sum);
+    }
+  }
+}
+
+template <int DataLayout>
+static void test_static_dims() {
+  Tensor<float, 4, DataLayout> in(72, 53, 97, 113);
+  Tensor<float, 2, DataLayout> out(72, 97);
+  in.setRandom();
+
+#if !EIGEN_HAS_CONSTEXPR 
+  array<int, 2> reduction_axis;
+  reduction_axis[0] = 1;
+  reduction_axis[1] = 3;
+#else
+  Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<3> > reduction_axis;
+#endif
+
+  out = in.maximum(reduction_axis);
+
+  for (int i = 0; i < 72; ++i) {
+    for (int j = 0; j < 97; ++j) {
+      float expected = -1e10f;
+      for (int k = 0; k < 53; ++k) {
+        for (int l = 0; l < 113; ++l) {
+          expected = (std::max)(expected, in(i, k, j, l));
+        }
+      }
+      VERIFY_IS_APPROX(out(i, j), expected);
+    }
+  }
+}
+
+template <int DataLayout>
+static void test_innermost_last_dims() {
+  Tensor<float, 4, DataLayout> in(72, 53, 97, 113);
+  Tensor<float, 2, DataLayout> out(97, 113);
+  in.setRandom();
+
+// Reduce on the innermost dimensions.
+#if !EIGEN_HAS_CONSTEXPR
+  array<int, 2> reduction_axis;
+  reduction_axis[0] = 0;
+  reduction_axis[1] = 1;
+#else
+  // This triggers the use of packets for ColMajor.
+  Eigen::IndexList<Eigen::type2index<0>, Eigen::type2index<1> > reduction_axis;
+#endif
+
+  out = in.maximum(reduction_axis);
+
+  for (int i = 0; i < 97; ++i) {
+    for (int j = 0; j < 113; ++j) {
+      float expected = -1e10f;
+      for (int k = 0; k < 53; ++k) {
+        for (int l = 0; l < 72; ++l) {
+          expected = (std::max)(expected, in(l, k, i, j));
+        }
+      }
+      VERIFY_IS_APPROX(out(i, j), expected);
+    }
+  }
+}
+
+template <int DataLayout>
+static void test_innermost_first_dims() {
+  Tensor<float, 4, DataLayout> in(72, 53, 97, 113);
+  Tensor<float, 2, DataLayout> out(72, 53);
+  in.setRandom();
+
+// Reduce on the innermost dimensions.
+#if !EIGEN_HAS_CONSTEXPR
+  array<int, 2> reduction_axis;
+  reduction_axis[0] = 2;
+  reduction_axis[1] = 3;
+#else
+  // This triggers the use of packets for RowMajor.
+  Eigen::IndexList<Eigen::type2index<2>, Eigen::type2index<3>> reduction_axis;
+#endif
+
+  out = in.maximum(reduction_axis);
+
+  for (int i = 0; i < 72; ++i) {
+    for (int j = 0; j < 53; ++j) {
+      float expected = -1e10f;
+      for (int k = 0; k < 97; ++k) {
+        for (int l = 0; l < 113; ++l) {
+          expected = (std::max)(expected, in(i, j, k, l));
+        }
+      }
+      VERIFY_IS_APPROX(out(i, j), expected);
+    }
+  }
+}
+
+template <int DataLayout>
+static void test_reduce_middle_dims() {
+  Tensor<float, 4, DataLayout> in(72, 53, 97, 113);
+  Tensor<float, 2, DataLayout> out(72, 53);
+  in.setRandom();
+
+// Reduce on the innermost dimensions.
+#if !EIGEN_HAS_CONSTEXPR
+  array<int, 2> reduction_axis;
+  reduction_axis[0] = 1;
+  reduction_axis[1] = 2;
+#else
+  // This triggers the use of packets for RowMajor.
+  Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2>> reduction_axis;
+#endif
+
+  out = in.maximum(reduction_axis);
+
+  for (int i = 0; i < 72; ++i) {
+    for (int j = 0; j < 113; ++j) {
+      float expected = -1e10f;
+      for (int k = 0; k < 53; ++k) {
+        for (int l = 0; l < 97; ++l) {
+          expected = (std::max)(expected, in(i, k, l, j));
+        }
+      }
+      VERIFY_IS_APPROX(out(i, j), expected);
+    }
+  }
+}
+
+void test_cxx11_tensor_reduction() {
+  CALL_SUBTEST(test_trivial_reductions<ColMajor>());
+  CALL_SUBTEST(test_trivial_reductions<RowMajor>());
+  CALL_SUBTEST(test_simple_reductions<ColMajor>());
+  CALL_SUBTEST(test_simple_reductions<RowMajor>());
+  CALL_SUBTEST(test_reductions_in_expr<ColMajor>());
+  CALL_SUBTEST(test_reductions_in_expr<RowMajor>());
+  CALL_SUBTEST(test_full_reductions<ColMajor>());
+  CALL_SUBTEST(test_full_reductions<RowMajor>());
+  CALL_SUBTEST(test_user_defined_reductions<ColMajor>());
+  CALL_SUBTEST(test_user_defined_reductions<RowMajor>());
+  CALL_SUBTEST(test_tensor_maps<ColMajor>());
+  CALL_SUBTEST(test_tensor_maps<RowMajor>());
+  CALL_SUBTEST(test_static_dims<ColMajor>());
+  CALL_SUBTEST(test_static_dims<RowMajor>());
+  CALL_SUBTEST(test_innermost_last_dims<ColMajor>());
+  CALL_SUBTEST(test_innermost_last_dims<RowMajor>());
+  CALL_SUBTEST(test_innermost_first_dims<ColMajor>());
+  CALL_SUBTEST(test_innermost_first_dims<RowMajor>());
+  CALL_SUBTEST(test_reduce_middle_dims<ColMajor>());
+  CALL_SUBTEST(test_reduce_middle_dims<RowMajor>());
+}
diff --git a/unsupported/test/cxx11_tensor_reduction_cuda.cu b/unsupported/test/cxx11_tensor_reduction_cuda.cu
new file mode 100644
index 000000000..6858b43a7
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_reduction_cuda.cu
@@ -0,0 +1,157 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_reduction_cuda
+#define EIGEN_USE_GPU
+
+#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+#include <cuda_fp16.h>
+#endif
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+
+template<typename Type, int DataLayout>
+static void test_full_reductions() {
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  const int num_rows = internal::random<int>(1024, 5*1024);
+  const int num_cols = internal::random<int>(1024, 5*1024);
+
+  Tensor<Type, 2, DataLayout> in(num_rows, num_cols);
+  in.setRandom();
+
+  Tensor<Type, 0, DataLayout> full_redux;
+  full_redux = in.sum();
+
+  std::size_t in_bytes = in.size() * sizeof(Type);
+  std::size_t out_bytes = full_redux.size() * sizeof(Type);
+  Type* gpu_in_ptr = static_cast<Type*>(gpu_device.allocate(in_bytes));
+  Type* gpu_out_ptr = static_cast<Type*>(gpu_device.allocate(out_bytes));
+  gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes);
+
+  TensorMap<Tensor<Type, 2, DataLayout> > in_gpu(gpu_in_ptr, num_rows, num_cols);
+  TensorMap<Tensor<Type, 0, DataLayout> > out_gpu(gpu_out_ptr);
+
+  out_gpu.device(gpu_device) = in_gpu.sum();
+
+  Tensor<Type, 0, DataLayout> full_redux_gpu;
+  gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes);
+  gpu_device.synchronize();
+
+  // Check that the CPU and GPU reductions return the same result.
+  VERIFY_IS_APPROX(full_redux(), full_redux_gpu());
+
+  gpu_device.deallocate(gpu_in_ptr);
+  gpu_device.deallocate(gpu_out_ptr);
+}
+
+template<typename Type, int DataLayout>
+static void test_first_dim_reductions() {
+  int dim_x = 33;
+  int dim_y = 1;
+  int dim_z = 128;
+
+  Tensor<Type, 3, DataLayout> in(dim_x, dim_y, dim_z);
+  in.setRandom();
+
+  Eigen::array<int, 1> red_axis;
+  red_axis[0] = 0;
+  Tensor<Type, 2, DataLayout> redux = in.sum(red_axis);
+
+  // Create device
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice dev(&stream);
+  
+  // Create data(T)
+  Type* in_data = (Type*)dev.allocate(dim_x*dim_y*dim_z*sizeof(Type));
+  Type* out_data = (Type*)dev.allocate(dim_z*dim_y*sizeof(Type));
+  Eigen::TensorMap<Eigen::Tensor<Type, 3, DataLayout> > gpu_in(in_data, dim_x, dim_y, dim_z);
+  Eigen::TensorMap<Eigen::Tensor<Type, 2, DataLayout> > gpu_out(out_data, dim_y, dim_z);
+  
+  // Perform operation
+  dev.memcpyHostToDevice(in_data, in.data(), in.size()*sizeof(Type));
+  gpu_out.device(dev) = gpu_in.sum(red_axis);
+  gpu_out.device(dev) += gpu_in.sum(red_axis);
+  Tensor<Type, 2, DataLayout> redux_gpu(dim_y, dim_z);
+  dev.memcpyDeviceToHost(redux_gpu.data(), out_data, gpu_out.size()*sizeof(Type));
+  dev.synchronize();
+
+  // Check that the CPU and GPU reductions return the same result.
+  for (int i = 0; i < gpu_out.size(); ++i) {
+    VERIFY_IS_APPROX(2*redux(i), redux_gpu(i));
+  }
+
+  dev.deallocate(in_data);
+  dev.deallocate(out_data);
+}
+
+template<typename Type, int DataLayout>
+static void test_last_dim_reductions() {
+  int dim_x = 128;
+  int dim_y = 1;
+  int dim_z = 33;
+
+  Tensor<Type, 3, DataLayout> in(dim_x, dim_y, dim_z);
+  in.setRandom();
+
+  Eigen::array<int, 1> red_axis;
+  red_axis[0] = 2;
+  Tensor<Type, 2, DataLayout> redux = in.sum(red_axis);
+
+  // Create device
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice dev(&stream);
+  
+  // Create data
+  Type* in_data = (Type*)dev.allocate(dim_x*dim_y*dim_z*sizeof(Type));
+  Type* out_data = (Type*)dev.allocate(dim_x*dim_y*sizeof(Type));
+  Eigen::TensorMap<Eigen::Tensor<Type, 3, DataLayout> > gpu_in(in_data, dim_x, dim_y, dim_z);
+  Eigen::TensorMap<Eigen::Tensor<Type, 2, DataLayout> > gpu_out(out_data, dim_x, dim_y);
+  
+  // Perform operation
+  dev.memcpyHostToDevice(in_data, in.data(), in.size()*sizeof(Type));
+  gpu_out.device(dev) = gpu_in.sum(red_axis);
+  gpu_out.device(dev) += gpu_in.sum(red_axis);
+  Tensor<Type, 2, DataLayout> redux_gpu(dim_x, dim_y);
+  dev.memcpyDeviceToHost(redux_gpu.data(), out_data, gpu_out.size()*sizeof(Type));
+  dev.synchronize();
+
+  // Check that the CPU and GPU reductions return the same result.
+  for (int i = 0; i < gpu_out.size(); ++i) {
+    VERIFY_IS_APPROX(2*redux(i), redux_gpu(i));
+  }
+
+  dev.deallocate(in_data);
+  dev.deallocate(out_data);
+}
+
+
+void test_cxx11_tensor_reduction_cuda() {
+  CALL_SUBTEST_1((test_full_reductions<float, ColMajor>()));
+  CALL_SUBTEST_1((test_full_reductions<double, ColMajor>()));
+  CALL_SUBTEST_2((test_full_reductions<float, RowMajor>()));
+  CALL_SUBTEST_2((test_full_reductions<double, RowMajor>()));
+  
+  CALL_SUBTEST_3((test_first_dim_reductions<float, ColMajor>()));
+  CALL_SUBTEST_3((test_first_dim_reductions<double, ColMajor>()));
+  CALL_SUBTEST_4((test_first_dim_reductions<float, RowMajor>()));
+// Outer reductions of doubles aren't supported just yet.  					      
+//  CALL_SUBTEST_4((test_first_dim_reductions<double, RowMajor>()))
+
+  CALL_SUBTEST_5((test_last_dim_reductions<float, ColMajor>()));
+// Outer reductions of doubles aren't supported just yet.  					      
+//  CALL_SUBTEST_5((test_last_dim_reductions<double, ColMajor>()));
+  CALL_SUBTEST_6((test_last_dim_reductions<float, RowMajor>()));
+  CALL_SUBTEST_6((test_last_dim_reductions<double, RowMajor>()));
+}
diff --git a/unsupported/test/cxx11_tensor_reduction_sycl.cpp b/unsupported/test/cxx11_tensor_reduction_sycl.cpp
new file mode 100644
index 000000000..a9ef82907
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_reduction_sycl.cpp
@@ -0,0 +1,138 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_reduction_sycl
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+
+
+static void test_full_reductions_sycl(const Eigen::SyclDevice&  sycl_device) {
+
+  const int num_rows = 452;
+  const int num_cols = 765;
+  array<int, 2> tensorRange = {{num_rows, num_cols}};
+
+  Tensor<float, 2> in(tensorRange);
+  Tensor<float, 0> full_redux;
+  Tensor<float, 0> full_redux_gpu;
+
+  in.setRandom();
+
+  full_redux = in.sum();
+
+  float* gpu_in_data = static_cast<float*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(float)));
+  float* gpu_out_data =(float*)sycl_device.allocate(sizeof(float));
+
+  TensorMap<Tensor<float, 2> >  in_gpu(gpu_in_data, tensorRange);
+  TensorMap<Tensor<float, 0> >  out_gpu(gpu_out_data);
+
+  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(float));
+  out_gpu.device(sycl_device) = in_gpu.sum();
+  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(float));
+  // Check that the CPU and GPU reductions return the same result.
+  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+static void test_first_dim_reductions_sycl(const Eigen::SyclDevice& sycl_device) {
+
+  int dim_x = 145;
+  int dim_y = 1;
+  int dim_z = 67;
+
+  array<int, 3> tensorRange = {{dim_x, dim_y, dim_z}};
+  Eigen::array<int, 1> red_axis;
+  red_axis[0] = 0;
+  array<int, 2> reduced_tensorRange = {{dim_y, dim_z}};
+
+  Tensor<float, 3> in(tensorRange);
+  Tensor<float, 2> redux(reduced_tensorRange);
+  Tensor<float, 2> redux_gpu(reduced_tensorRange);
+
+  in.setRandom();
+
+  redux= in.sum(red_axis);
+
+  float* gpu_in_data = static_cast<float*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(float)));
+  float* gpu_out_data = static_cast<float*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize()*sizeof(float)));
+
+  TensorMap<Tensor<float, 3> >  in_gpu(gpu_in_data, tensorRange);
+  TensorMap<Tensor<float, 2> >  out_gpu(gpu_out_data, reduced_tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(float));
+  out_gpu.device(sycl_device) = in_gpu.sum(red_axis);
+  sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize()*sizeof(float));
+
+  // Check that the CPU and GPU reductions return the same result.
+  for(int j=0; j<reduced_tensorRange[0]; j++ )
+    for(int k=0; k<reduced_tensorRange[1]; k++ )
+      VERIFY_IS_APPROX(redux_gpu(j,k), redux(j,k));
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+static void test_last_dim_reductions_sycl(const Eigen::SyclDevice &sycl_device) {
+
+  int dim_x = 567;
+  int dim_y = 1;
+  int dim_z = 47;
+
+  array<int, 3> tensorRange = {{dim_x, dim_y, dim_z}};
+  Eigen::array<int, 1> red_axis;
+  red_axis[0] = 2;
+  array<int, 2> reduced_tensorRange = {{dim_x, dim_y}};
+
+  Tensor<float, 3> in(tensorRange);
+  Tensor<float, 2> redux(reduced_tensorRange);
+  Tensor<float, 2> redux_gpu(reduced_tensorRange);
+
+  in.setRandom();
+
+  redux= in.sum(red_axis);
+
+  float* gpu_in_data = static_cast<float*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(float)));
+  float* gpu_out_data = static_cast<float*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize()*sizeof(float)));
+
+  TensorMap<Tensor<float, 3> >  in_gpu(gpu_in_data, tensorRange);
+  TensorMap<Tensor<float, 2> >  out_gpu(gpu_out_data, reduced_tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(float));
+  out_gpu.device(sycl_device) = in_gpu.sum(red_axis);
+  sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize()*sizeof(float));
+  // Check that the CPU and GPU reductions return the same result.
+  for(int j=0; j<reduced_tensorRange[0]; j++ )
+    for(int k=0; k<reduced_tensorRange[1]; k++ )
+      VERIFY_IS_APPROX(redux_gpu(j,k), redux(j,k));
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+
+}
+
+void test_cxx11_tensor_reduction_sycl() {
+  cl::sycl::gpu_selector s;
+  Eigen::SyclDevice sycl_device(s);
+  CALL_SUBTEST((test_full_reductions_sycl(sycl_device)));
+  CALL_SUBTEST((test_first_dim_reductions_sycl(sycl_device)));
+  CALL_SUBTEST((test_last_dim_reductions_sycl(sycl_device)));
+
+}
diff --git a/unsupported/test/cxx11_tensor_ref.cpp b/unsupported/test/cxx11_tensor_ref.cpp
new file mode 100644
index 000000000..c8f105e3d
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_ref.cpp
@@ -0,0 +1,248 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+static void test_simple_lvalue_ref()
+{
+  Tensor<int, 1> input(6);
+  input.setRandom();
+
+  TensorRef<Tensor<int, 1>> ref3(input);
+  TensorRef<Tensor<int, 1>> ref4 = input;
+
+  VERIFY_IS_EQUAL(ref3.data(), input.data());
+  VERIFY_IS_EQUAL(ref4.data(), input.data());
+
+  for (int i = 0; i < 6; ++i) {
+    VERIFY_IS_EQUAL(ref3(i), input(i));
+    VERIFY_IS_EQUAL(ref4(i), input(i));
+  }
+
+  for (int i = 0; i < 6; ++i) {
+    ref3.coeffRef(i) = i;
+  }
+  for (int i = 0; i < 6; ++i) {
+    VERIFY_IS_EQUAL(input(i), i);
+  }
+  for (int i = 0; i < 6; ++i) {
+    ref4.coeffRef(i) = -i * 2;
+  }
+  for (int i = 0; i < 6; ++i) {
+    VERIFY_IS_EQUAL(input(i), -i*2);
+  }
+}
+
+
+static void test_simple_rvalue_ref()
+{
+  Tensor<int, 1> input1(6);
+  input1.setRandom();
+  Tensor<int, 1> input2(6);
+  input2.setRandom();
+
+  TensorRef<Tensor<int, 1>> ref3(input1 + input2);
+  TensorRef<Tensor<int, 1>> ref4 = input1 + input2;
+
+  VERIFY_IS_NOT_EQUAL(ref3.data(), input1.data());
+  VERIFY_IS_NOT_EQUAL(ref4.data(), input1.data());
+  VERIFY_IS_NOT_EQUAL(ref3.data(), input2.data());
+  VERIFY_IS_NOT_EQUAL(ref4.data(), input2.data());
+
+  for (int i = 0; i < 6; ++i) {
+    VERIFY_IS_EQUAL(ref3(i), input1(i) + input2(i));
+    VERIFY_IS_EQUAL(ref4(i), input1(i) + input2(i));
+  }
+}
+
+
+static void test_multiple_dims()
+{
+  Tensor<float, 3> input(3,5,7);
+  input.setRandom();
+
+  TensorRef<Tensor<float, 3>> ref(input);
+  VERIFY_IS_EQUAL(ref.data(), input.data());
+  VERIFY_IS_EQUAL(ref.dimension(0), 3);
+  VERIFY_IS_EQUAL(ref.dimension(1), 5);
+  VERIFY_IS_EQUAL(ref.dimension(2), 7);
+
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(ref(i,j,k), input(i,j,k));
+      }
+    }
+  }
+}
+
+
+static void test_slice()
+{
+  Tensor<float, 5> tensor(2,3,5,7,11);
+  tensor.setRandom();
+
+  Eigen::DSizes<ptrdiff_t, 5> indices(1,2,3,4,5);
+  Eigen::DSizes<ptrdiff_t, 5> sizes(1,1,1,1,1);
+  TensorRef<Tensor<float, 5>> slice = tensor.slice(indices, sizes);
+  VERIFY_IS_EQUAL(slice(0,0,0,0,0), tensor(1,2,3,4,5));
+
+  Eigen::DSizes<ptrdiff_t, 5> indices2(1,1,3,4,5);
+  Eigen::DSizes<ptrdiff_t, 5> sizes2(1,1,2,2,3);
+  slice = tensor.slice(indices2, sizes2);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 2; ++j) {
+      for (int k = 0; k < 3; ++k) {
+        VERIFY_IS_EQUAL(slice(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k));
+      }
+    }
+  }
+
+  Eigen::DSizes<ptrdiff_t, 5> indices3(0,0,0,0,0);
+  Eigen::DSizes<ptrdiff_t, 5> sizes3(2,3,1,1,1);
+  slice = tensor.slice(indices3, sizes3);
+  VERIFY_IS_EQUAL(slice.data(), tensor.data());
+}
+
+
+static void test_ref_of_ref()
+{
+  Tensor<float, 3> input(3,5,7);
+  input.setRandom();
+
+  TensorRef<Tensor<float, 3>> ref(input);
+  TensorRef<Tensor<float, 3>> ref_of_ref(ref);
+  TensorRef<Tensor<float, 3>> ref_of_ref2;
+  ref_of_ref2 = ref;
+
+  VERIFY_IS_EQUAL(ref_of_ref.data(), input.data());
+  VERIFY_IS_EQUAL(ref_of_ref.dimension(0), 3);
+  VERIFY_IS_EQUAL(ref_of_ref.dimension(1), 5);
+  VERIFY_IS_EQUAL(ref_of_ref.dimension(2), 7);
+
+  VERIFY_IS_EQUAL(ref_of_ref2.data(), input.data());
+  VERIFY_IS_EQUAL(ref_of_ref2.dimension(0), 3);
+  VERIFY_IS_EQUAL(ref_of_ref2.dimension(1), 5);
+  VERIFY_IS_EQUAL(ref_of_ref2.dimension(2), 7);
+
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(ref_of_ref(i,j,k), input(i,j,k));
+        VERIFY_IS_EQUAL(ref_of_ref2(i,j,k), input(i,j,k));
+     }
+    }
+  }
+}
+
+
+static void test_ref_in_expr()
+{
+  Tensor<float, 3> input(3,5,7);
+  input.setRandom();
+  TensorRef<Tensor<float, 3>> input_ref(input);
+
+  Tensor<float, 3> result(3,5,7);
+  result.setRandom();
+  TensorRef<Tensor<float, 3>> result_ref(result);
+
+  Tensor<float, 3> bias(3,5,7);
+  bias.setRandom();
+
+  result_ref = input_ref + bias;
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(result_ref(i,j,k), input(i,j,k) + bias(i,j,k));
+        VERIFY_IS_NOT_EQUAL(result(i,j,k), input(i,j,k) + bias(i,j,k));
+      }
+    }
+  }
+
+  result = result_ref;
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(result(i,j,k), input(i,j,k) + bias(i,j,k));
+      }
+    }
+  }
+}
+
+
+static void test_coeff_ref()
+{
+  Tensor<float, 5> tensor(2,3,5,7,11);
+  tensor.setRandom();
+  Tensor<float, 5> original = tensor;
+
+  TensorRef<Tensor<float, 4>> slice = tensor.chip(7, 4);
+  slice.coeffRef(0, 0, 0, 0) = 1.0f;
+  slice.coeffRef(1, 0, 0, 0) += 2.0f;
+
+  VERIFY_IS_EQUAL(tensor(0,0,0,0,7), 1.0f);
+  VERIFY_IS_EQUAL(tensor(1,0,0,0,7), original(1,0,0,0,7) + 2.0f);
+}
+
+
+static void test_nested_ops_with_ref()
+{
+  Tensor<float, 4> t(2, 3, 5, 7);
+  t.setRandom();
+  TensorMap<Tensor<const float, 4> > m(t.data(), 2, 3, 5, 7);
+  array<std::pair<ptrdiff_t, ptrdiff_t>, 4> paddings;
+  paddings[0] = std::make_pair(0, 0);
+  paddings[1] = std::make_pair(2, 1);
+  paddings[2] = std::make_pair(3, 4);
+  paddings[3] = std::make_pair(0, 0);
+  DSizes<Eigen::DenseIndex, 4> shuffle_dims(0, 1, 2, 3);
+  TensorRef<Tensor<const float, 4> > ref(m.pad(paddings));
+  array<std::pair<ptrdiff_t, ptrdiff_t>, 4> trivial;
+  trivial[0] = std::make_pair(0, 0);
+  trivial[1] = std::make_pair(0, 0);
+  trivial[2] = std::make_pair(0, 0);
+  trivial[3] = std::make_pair(0, 0);
+  Tensor<float, 4> padded = ref.shuffle(shuffle_dims).pad(trivial);
+  VERIFY_IS_EQUAL(padded.dimension(0), 2+0);
+  VERIFY_IS_EQUAL(padded.dimension(1), 3+3);
+  VERIFY_IS_EQUAL(padded.dimension(2), 5+7);
+  VERIFY_IS_EQUAL(padded.dimension(3), 7+0);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 6; ++j) {
+      for (int k = 0; k < 12; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          if (j >= 2 && j < 5 && k >= 3 && k < 8) {
+            VERIFY_IS_EQUAL(padded(i,j,k,l), t(i,j-2,k-3,l));
+          } else {
+            VERIFY_IS_EQUAL(padded(i,j,k,l), 0.0f);
+          }
+        }
+      }
+    }
+  }
+}
+
+
+void test_cxx11_tensor_ref()
+{
+  CALL_SUBTEST(test_simple_lvalue_ref());
+  CALL_SUBTEST(test_simple_rvalue_ref());
+  CALL_SUBTEST(test_multiple_dims());
+  CALL_SUBTEST(test_slice());
+  CALL_SUBTEST(test_ref_of_ref());
+  CALL_SUBTEST(test_ref_in_expr());
+  CALL_SUBTEST(test_coeff_ref());
+  CALL_SUBTEST(test_nested_ops_with_ref());
+}
diff --git a/unsupported/test/cxx11_tensor_reverse.cpp b/unsupported/test/cxx11_tensor_reverse.cpp
new file mode 100644
index 000000000..b35b8d29e
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_reverse.cpp
@@ -0,0 +1,190 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Navdeep Jaitly <ndjaitly@google.com and
+//                    Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::array;
+
+template <int DataLayout>
+static void test_simple_reverse()
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+
+  array<bool, 4> dim_rev;
+  dim_rev[0] = false;
+  dim_rev[1] = true;
+  dim_rev[2] = true;
+  dim_rev[3] = false;
+
+  Tensor<float, 4, DataLayout> reversed_tensor;
+  reversed_tensor = tensor.reverse(dim_rev);
+
+  VERIFY_IS_EQUAL(reversed_tensor.dimension(0), 2);
+  VERIFY_IS_EQUAL(reversed_tensor.dimension(1), 3);
+  VERIFY_IS_EQUAL(reversed_tensor.dimension(2), 5);
+  VERIFY_IS_EQUAL(reversed_tensor.dimension(3), 7);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(i,2-j,4-k,l));
+        }
+      }
+    }
+  }
+
+  dim_rev[0] = true;
+  dim_rev[1] = false;
+  dim_rev[2] = false;
+  dim_rev[3] = false;
+
+  reversed_tensor = tensor.reverse(dim_rev);
+
+  VERIFY_IS_EQUAL(reversed_tensor.dimension(0), 2);
+  VERIFY_IS_EQUAL(reversed_tensor.dimension(1), 3);
+  VERIFY_IS_EQUAL(reversed_tensor.dimension(2), 5);
+  VERIFY_IS_EQUAL(reversed_tensor.dimension(3), 7);
+
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(1-i,j,k,l));
+        }
+      }
+    }
+  }
+
+  dim_rev[0] = true;
+  dim_rev[1] = false;
+  dim_rev[2] = false;
+  dim_rev[3] = true;
+
+  reversed_tensor = tensor.reverse(dim_rev);
+
+  VERIFY_IS_EQUAL(reversed_tensor.dimension(0), 2);
+  VERIFY_IS_EQUAL(reversed_tensor.dimension(1), 3);
+  VERIFY_IS_EQUAL(reversed_tensor.dimension(2), 5);
+  VERIFY_IS_EQUAL(reversed_tensor.dimension(3), 7);
+
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(1-i,j,k,6-l));
+        }
+      }
+    }
+  }
+}
+
+
+template <int DataLayout>
+static void test_expr_reverse(bool LValue)
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+
+  array<bool, 4> dim_rev;
+  dim_rev[0] = false;
+  dim_rev[1] = true;
+  dim_rev[2] = false;
+  dim_rev[3] = true;
+
+  Tensor<float, 4, DataLayout> expected(2, 3, 5, 7);
+  if (LValue) {
+    expected.reverse(dim_rev) = tensor;
+  } else {
+    expected = tensor.reverse(dim_rev);
+  }
+
+  Tensor<float, 4, DataLayout> result(2,3,5,7);
+
+  array<ptrdiff_t, 4> src_slice_dim;
+  src_slice_dim[0] = 2;
+  src_slice_dim[1] = 3;
+  src_slice_dim[2] = 1;
+  src_slice_dim[3] = 7;
+  array<ptrdiff_t, 4> src_slice_start;
+  src_slice_start[0] = 0;
+  src_slice_start[1] = 0;
+  src_slice_start[2] = 0;
+  src_slice_start[3] = 0;
+  array<ptrdiff_t, 4> dst_slice_dim = src_slice_dim;
+  array<ptrdiff_t, 4> dst_slice_start = src_slice_start;
+
+  for (int i = 0; i < 5; ++i) {
+    if (LValue) {
+      result.slice(dst_slice_start, dst_slice_dim).reverse(dim_rev) =
+          tensor.slice(src_slice_start, src_slice_dim);
+    } else {
+      result.slice(dst_slice_start, dst_slice_dim) =
+          tensor.slice(src_slice_start, src_slice_dim).reverse(dim_rev);
+    }
+    src_slice_start[2] += 1;
+    dst_slice_start[2] += 1;
+  }
+
+  VERIFY_IS_EQUAL(result.dimension(0), 2);
+  VERIFY_IS_EQUAL(result.dimension(1), 3);
+  VERIFY_IS_EQUAL(result.dimension(2), 5);
+  VERIFY_IS_EQUAL(result.dimension(3), 7);
+
+  for (int i = 0; i < expected.dimension(0); ++i) {
+    for (int j = 0; j < expected.dimension(1); ++j) {
+      for (int k = 0; k < expected.dimension(2); ++k) {
+        for (int l = 0; l < expected.dimension(3); ++l) {
+          VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l));
+        }
+      }
+    }
+  }
+
+  dst_slice_start[2] = 0;
+  result.setRandom();
+  for (int i = 0; i < 5; ++i) {
+     if (LValue) {
+       result.slice(dst_slice_start, dst_slice_dim).reverse(dim_rev) =
+           tensor.slice(dst_slice_start, dst_slice_dim);
+     } else {
+       result.slice(dst_slice_start, dst_slice_dim) =
+           tensor.reverse(dim_rev).slice(dst_slice_start, dst_slice_dim);
+     }
+    dst_slice_start[2] += 1;
+  }
+
+  for (int i = 0; i < expected.dimension(0); ++i) {
+    for (int j = 0; j < expected.dimension(1); ++j) {
+      for (int k = 0; k < expected.dimension(2); ++k) {
+        for (int l = 0; l < expected.dimension(3); ++l) {
+          VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l));
+        }
+      }
+    }
+  }
+}
+
+
+void test_cxx11_tensor_reverse()
+{
+  CALL_SUBTEST(test_simple_reverse<ColMajor>());
+  CALL_SUBTEST(test_simple_reverse<RowMajor>());
+  CALL_SUBTEST(test_expr_reverse<ColMajor>(true));
+  CALL_SUBTEST(test_expr_reverse<RowMajor>(true));
+  CALL_SUBTEST(test_expr_reverse<ColMajor>(false));
+  CALL_SUBTEST(test_expr_reverse<RowMajor>(false));
+}
diff --git a/unsupported/test/cxx11_tensor_roundings.cpp b/unsupported/test/cxx11_tensor_roundings.cpp
new file mode 100644
index 000000000..2c26151ab
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_roundings.cpp
@@ -0,0 +1,62 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+
+static void test_float_rounding()
+{
+  Tensor<float, 2> ftensor(20,30);
+  ftensor = ftensor.random() * 100.f;
+
+  Tensor<float, 2> result = ftensor.round();
+
+  for (int i = 0; i < 20; ++i) {
+    for (int j = 0; j < 30; ++j) {
+      VERIFY_IS_EQUAL(result(i,j), numext::round(ftensor(i,j)));
+    }
+  }
+}
+
+static void test_float_flooring()
+{
+  Tensor<float, 2> ftensor(20,30);
+  ftensor = ftensor.random() * 100.f;
+
+  Tensor<float, 2> result = ftensor.floor();
+
+  for (int i = 0; i < 20; ++i) {
+    for (int j = 0; j < 30; ++j) {
+      VERIFY_IS_EQUAL(result(i,j), numext::floor(ftensor(i,j)));
+    }
+  }
+}
+
+static void test_float_ceiling()
+{
+  Tensor<float, 2> ftensor(20,30);
+  ftensor = ftensor.random() * 100.f;
+
+  Tensor<float, 2> result = ftensor.ceil();
+
+  for (int i = 0; i < 20; ++i) {
+    for (int j = 0; j < 30; ++j) {
+      VERIFY_IS_EQUAL(result(i,j), numext::ceil(ftensor(i,j)));
+    }
+  }
+}
+
+void test_cxx11_tensor_roundings()
+{
+   CALL_SUBTEST(test_float_rounding());
+   CALL_SUBTEST(test_float_ceiling());
+   CALL_SUBTEST(test_float_flooring());
+}
diff --git a/unsupported/test/cxx11_tensor_scan.cpp b/unsupported/test/cxx11_tensor_scan.cpp
new file mode 100644
index 000000000..af59aa3ef
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_scan.cpp
@@ -0,0 +1,110 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Igor Babuschkin <igor@babuschk.in>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <limits>
+#include <numeric>
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template <int DataLayout, typename Type=float, bool Exclusive = false>
+static void test_1d_scan()
+{
+  int size = 50;
+  Tensor<Type, 1, DataLayout> tensor(size);
+  tensor.setRandom();
+  Tensor<Type, 1, DataLayout> result = tensor.cumsum(0, Exclusive);
+
+  VERIFY_IS_EQUAL(tensor.dimension(0), result.dimension(0));
+
+  float accum = 0;
+  for (int i = 0; i < size; i++) {
+    if (Exclusive) {
+      VERIFY_IS_EQUAL(result(i), accum);
+      accum += tensor(i);
+    } else {
+      accum += tensor(i);
+      VERIFY_IS_EQUAL(result(i), accum);
+    }
+  }
+
+  accum = 1;
+  result = tensor.cumprod(0, Exclusive);
+  for (int i = 0; i < size; i++) {
+    if (Exclusive) {
+      VERIFY_IS_EQUAL(result(i), accum);
+      accum *= tensor(i);
+    } else {
+      accum *= tensor(i);
+      VERIFY_IS_EQUAL(result(i), accum);
+    }
+  }
+}
+
+template <int DataLayout, typename Type=float>
+static void test_4d_scan()
+{
+  int size = 5;
+  Tensor<Type, 4, DataLayout> tensor(size, size, size, size);
+  tensor.setRandom();
+
+  Tensor<Type, 4, DataLayout> result(size, size, size, size);
+
+  result = tensor.cumsum(0);
+  float accum = 0;
+  for (int i = 0; i < size; i++) {
+    accum += tensor(i, 1, 2, 3);
+    VERIFY_IS_EQUAL(result(i, 1, 2, 3), accum);
+  }
+  result = tensor.cumsum(1);
+  accum = 0;
+  for (int i = 0; i < size; i++) {
+    accum += tensor(1, i, 2, 3);
+    VERIFY_IS_EQUAL(result(1, i, 2, 3), accum);
+  }
+  result = tensor.cumsum(2);
+  accum = 0;
+  for (int i = 0; i < size; i++) {
+    accum += tensor(1, 2, i, 3);
+    VERIFY_IS_EQUAL(result(1, 2, i, 3), accum);
+  }
+  result = tensor.cumsum(3);
+  accum = 0;
+  for (int i = 0; i < size; i++) {
+    accum += tensor(1, 2, 3, i);
+    VERIFY_IS_EQUAL(result(1, 2, 3, i), accum);
+  }
+}
+
+template <int DataLayout>
+static void test_tensor_maps() {
+  int inputs[20];
+  TensorMap<Tensor<int, 1, DataLayout> > tensor_map(inputs, 20);
+  tensor_map.setRandom();
+
+  Tensor<int, 1, DataLayout> result = tensor_map.cumsum(0);
+
+  int accum = 0;
+  for (int i = 0; i < 20; ++i) {
+    accum += tensor_map(i);
+    VERIFY_IS_EQUAL(result(i), accum);
+  }
+}
+
+void test_cxx11_tensor_scan() {
+  CALL_SUBTEST((test_1d_scan<ColMajor, float, true>()));
+  CALL_SUBTEST((test_1d_scan<ColMajor, float, false>()));
+  CALL_SUBTEST((test_1d_scan<RowMajor, float, true>()));
+  CALL_SUBTEST((test_1d_scan<RowMajor, float, false>()));
+  CALL_SUBTEST(test_4d_scan<ColMajor>());
+  CALL_SUBTEST(test_4d_scan<RowMajor>());
+  CALL_SUBTEST(test_tensor_maps<ColMajor>());
+  CALL_SUBTEST(test_tensor_maps<RowMajor>());
+}
diff --git a/unsupported/test/cxx11_tensor_scan_cuda.cu b/unsupported/test/cxx11_tensor_scan_cuda.cu
new file mode 100644
index 000000000..5f146f3c9
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_scan_cuda.cu
@@ -0,0 +1,79 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_scan_cuda
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_GPU
+
+#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+#include <cuda_fp16.h>
+#endif
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+typedef Tensor<float, 1>::DimensionPair DimPair;
+
+template<int DataLayout>
+void test_cuda_cumsum(int m_size, int k_size, int n_size)
+{
+  std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
+  Tensor<float, 3, DataLayout> t_input(m_size, k_size, n_size);
+  Tensor<float, 3, DataLayout> t_result(m_size, k_size, n_size);
+  Tensor<float, 3, DataLayout> t_result_gpu(m_size, k_size, n_size);
+
+  t_input.setRandom();
+
+  std::size_t t_input_bytes = t_input.size()  * sizeof(float);
+  std::size_t t_result_bytes = t_result.size() * sizeof(float);
+
+  float* d_t_input;
+  float* d_t_result;
+
+  cudaMalloc((void**)(&d_t_input), t_input_bytes);
+  cudaMalloc((void**)(&d_t_result), t_result_bytes);
+
+  cudaMemcpy(d_t_input, t_input.data(), t_input_bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> >
+      gpu_t_input(d_t_input, Eigen::array<int, 3>(m_size, k_size, n_size));
+  Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> >
+      gpu_t_result(d_t_result, Eigen::array<int, 3>(m_size, k_size, n_size));
+
+  gpu_t_result.device(gpu_device) = gpu_t_input.cumsum(1);
+  t_result = t_input.cumsum(1);
+
+  cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);
+  for (DenseIndex i = 0; i < t_result.size(); i++) {
+    if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) {
+      continue;
+    }
+    if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) {
+      continue;
+    }
+    std::cout << "mismatch detected at index " << i << ": " << t_result(i)
+              << " vs " <<  t_result_gpu(i) << std::endl;
+    assert(false);
+  }
+
+  cudaFree((void*)d_t_input);
+  cudaFree((void*)d_t_result);
+}
+
+
+void test_cxx11_tensor_scan_cuda()
+{
+  CALL_SUBTEST_1(test_cuda_cumsum<ColMajor>(128, 128, 128));
+  CALL_SUBTEST_2(test_cuda_cumsum<RowMajor>(128, 128, 128));
+}
diff --git a/unsupported/test/cxx11_tensor_shuffling.cpp b/unsupported/test/cxx11_tensor_shuffling.cpp
new file mode 100644
index 000000000..d11444a14
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_shuffling.cpp
@@ -0,0 +1,228 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::array;
+
+template <int DataLayout>
+static void test_simple_shuffling()
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+  array<ptrdiff_t, 4> shuffles;
+  shuffles[0] = 0;
+  shuffles[1] = 1;
+  shuffles[2] = 2;
+  shuffles[3] = 3;
+
+  Tensor<float, 4, DataLayout> no_shuffle;
+  no_shuffle = tensor.shuffle(shuffles);
+
+  VERIFY_IS_EQUAL(no_shuffle.dimension(0), 2);
+  VERIFY_IS_EQUAL(no_shuffle.dimension(1), 3);
+  VERIFY_IS_EQUAL(no_shuffle.dimension(2), 5);
+  VERIFY_IS_EQUAL(no_shuffle.dimension(3), 7);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), no_shuffle(i,j,k,l));
+        }
+      }
+    }
+  }
+
+  shuffles[0] = 2;
+  shuffles[1] = 3;
+  shuffles[2] = 1;
+  shuffles[3] = 0;
+  Tensor<float, 4, DataLayout> shuffle;
+  shuffle = tensor.shuffle(shuffles);
+
+  VERIFY_IS_EQUAL(shuffle.dimension(0), 5);
+  VERIFY_IS_EQUAL(shuffle.dimension(1), 7);
+  VERIFY_IS_EQUAL(shuffle.dimension(2), 3);
+  VERIFY_IS_EQUAL(shuffle.dimension(3), 2);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,l,j,i));
+        }
+      }
+    }
+  }
+}
+
+
+template <int DataLayout>
+static void test_expr_shuffling()
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+
+  array<ptrdiff_t, 4> shuffles;
+  shuffles[0] = 2;
+  shuffles[1] = 3;
+  shuffles[2] = 1;
+  shuffles[3] = 0;
+  Tensor<float, 4, DataLayout> expected;
+  expected = tensor.shuffle(shuffles);
+
+  Tensor<float, 4, DataLayout> result(5,7,3,2);
+
+  array<int, 4> src_slice_dim{{2,3,1,7}};
+  array<int, 4> src_slice_start{{0,0,0,0}};
+  array<int, 4> dst_slice_dim{{1,7,3,2}};
+  array<int, 4> dst_slice_start{{0,0,0,0}};
+
+  for (int i = 0; i < 5; ++i) {
+    result.slice(dst_slice_start, dst_slice_dim) =
+        tensor.slice(src_slice_start, src_slice_dim).shuffle(shuffles);
+    src_slice_start[2] += 1;
+    dst_slice_start[0] += 1;
+  }
+
+  VERIFY_IS_EQUAL(result.dimension(0), 5);
+  VERIFY_IS_EQUAL(result.dimension(1), 7);
+  VERIFY_IS_EQUAL(result.dimension(2), 3);
+  VERIFY_IS_EQUAL(result.dimension(3), 2);
+
+  for (int i = 0; i < expected.dimension(0); ++i) {
+    for (int j = 0; j < expected.dimension(1); ++j) {
+      for (int k = 0; k < expected.dimension(2); ++k) {
+        for (int l = 0; l < expected.dimension(3); ++l) {
+          VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l));
+        }
+      }
+    }
+  }
+
+  dst_slice_start[0] = 0;
+  result.setRandom();
+  for (int i = 0; i < 5; ++i) {
+    result.slice(dst_slice_start, dst_slice_dim) =
+        tensor.shuffle(shuffles).slice(dst_slice_start, dst_slice_dim);
+    dst_slice_start[0] += 1;
+  }
+
+  for (int i = 0; i < expected.dimension(0); ++i) {
+    for (int j = 0; j < expected.dimension(1); ++j) {
+      for (int k = 0; k < expected.dimension(2); ++k) {
+        for (int l = 0; l < expected.dimension(3); ++l) {
+          VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l));
+        }
+      }
+    }
+  }
+}
+
+
+template <int DataLayout>
+static void test_shuffling_as_value()
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+  array<ptrdiff_t, 4> shuffles;
+  shuffles[2] = 0;
+  shuffles[3] = 1;
+  shuffles[1] = 2;
+  shuffles[0] = 3;
+  Tensor<float, 4, DataLayout> shuffle(5,7,3,2);
+  shuffle.shuffle(shuffles) = tensor;
+
+  VERIFY_IS_EQUAL(shuffle.dimension(0), 5);
+  VERIFY_IS_EQUAL(shuffle.dimension(1), 7);
+  VERIFY_IS_EQUAL(shuffle.dimension(2), 3);
+  VERIFY_IS_EQUAL(shuffle.dimension(3), 2);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,l,j,i));
+        }
+      }
+    }
+  }
+
+  array<ptrdiff_t, 4> no_shuffle;
+  no_shuffle[0] = 0;
+  no_shuffle[1] = 1;
+  no_shuffle[2] = 2;
+  no_shuffle[3] = 3;
+  Tensor<float, 4, DataLayout> shuffle2(5,7,3,2);
+  shuffle2.shuffle(shuffles) = tensor.shuffle(no_shuffle);
+  for (int i = 0; i < 5; ++i) {
+    for (int j = 0; j < 7; ++j) {
+      for (int k = 0; k < 3; ++k) {
+        for (int l = 0; l < 2; ++l) {
+          VERIFY_IS_EQUAL(shuffle2(i,j,k,l), shuffle(i,j,k,l));
+        }
+      }
+    }
+  }
+}
+
+
+template <int DataLayout>
+static void test_shuffle_unshuffle()
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+
+  // Choose a random permutation.
+  array<ptrdiff_t, 4> shuffles;
+  for (int i = 0; i < 4; ++i) {
+    shuffles[i] = i;
+  }
+  array<ptrdiff_t, 4> shuffles_inverse;
+  for (int i = 0; i < 4; ++i) {
+    const ptrdiff_t index = internal::random<ptrdiff_t>(i, 3);
+    shuffles_inverse[shuffles[index]] = i;
+    std::swap(shuffles[i], shuffles[index]);
+  }
+
+  Tensor<float, 4, DataLayout> shuffle;
+  shuffle = tensor.shuffle(shuffles).shuffle(shuffles_inverse);
+
+  VERIFY_IS_EQUAL(shuffle.dimension(0), 2);
+  VERIFY_IS_EQUAL(shuffle.dimension(1), 3);
+  VERIFY_IS_EQUAL(shuffle.dimension(2), 5);
+  VERIFY_IS_EQUAL(shuffle.dimension(3), 7);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(i,j,k,l));
+        }
+      }
+    }
+  }
+}
+
+
+void test_cxx11_tensor_shuffling()
+{
+  CALL_SUBTEST(test_simple_shuffling<ColMajor>());
+  CALL_SUBTEST(test_simple_shuffling<RowMajor>());
+  CALL_SUBTEST(test_expr_shuffling<ColMajor>());
+  CALL_SUBTEST(test_expr_shuffling<RowMajor>());
+  CALL_SUBTEST(test_shuffling_as_value<ColMajor>());
+  CALL_SUBTEST(test_shuffling_as_value<RowMajor>());
+  CALL_SUBTEST(test_shuffle_unshuffle<ColMajor>());
+  CALL_SUBTEST(test_shuffle_unshuffle<RowMajor>());
+}
diff --git a/unsupported/test/cxx11_tensor_simple.cpp b/unsupported/test/cxx11_tensor_simple.cpp
new file mode 100644
index 000000000..5a0d339ef
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_simple.cpp
@@ -0,0 +1,327 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+static void test_0d()
+{
+  Tensor<int, 0> scalar1;
+  Tensor<int, 0, RowMajor> scalar2;
+  Tensor<int, 0> scalar3;
+  Tensor<int, 0, RowMajor> scalar4;
+
+  scalar3.resize();
+  scalar4.resize();
+
+  scalar1() = 7;
+  scalar2() = 13;
+  scalar3.setValues(17);
+  scalar4.setZero();
+
+  VERIFY_IS_EQUAL(scalar1.rank(), 0);
+  VERIFY_IS_EQUAL(scalar1.size(), 1);
+
+  VERIFY_IS_EQUAL(scalar1(), 7);
+  VERIFY_IS_EQUAL(scalar2(), 13);
+  VERIFY_IS_EQUAL(scalar3(), 17);
+  VERIFY_IS_EQUAL(scalar4(), 0);
+
+  Tensor<int, 0> scalar5(scalar1);
+
+  VERIFY_IS_EQUAL(scalar5(), 7);
+  VERIFY_IS_EQUAL(scalar5.data()[0], 7);
+}
+
+static void test_1d()
+{
+  Tensor<int, 1> vec1(6);
+  Tensor<int, 1, RowMajor> vec2(6);
+  Tensor<int, 1> vec3;
+  Tensor<int, 1, RowMajor> vec4;
+
+  vec3.resize(6);
+  vec4.resize(6);
+
+  vec1(0) = 4;  vec2(0) = 0; vec3(0) = 5;
+  vec1(1) = 8;  vec2(1) = 1; vec3(1) = 4;
+  vec1(2) = 15; vec2(2) = 2; vec3(2) = 3;
+  vec1(3) = 16; vec2(3) = 3; vec3(3) = 2;
+  vec1(4) = 23; vec2(4) = 4; vec3(4) = 1;
+  vec1(5) = 42; vec2(5) = 5; vec3(5) = 0;
+  vec4.setZero();
+
+  VERIFY_IS_EQUAL((vec1.rank()), 1);
+  VERIFY_IS_EQUAL((vec1.size()), 6);
+  VERIFY_IS_EQUAL((vec1.dimensions()[0]), 6);
+
+  VERIFY_IS_EQUAL((vec1[0]), 4);
+  VERIFY_IS_EQUAL((vec1[1]), 8);
+  VERIFY_IS_EQUAL((vec1[2]), 15);
+  VERIFY_IS_EQUAL((vec1[3]), 16);
+  VERIFY_IS_EQUAL((vec1[4]), 23);
+  VERIFY_IS_EQUAL((vec1[5]), 42);
+
+  VERIFY_IS_EQUAL((vec2[0]), 0);
+  VERIFY_IS_EQUAL((vec2[1]), 1);
+  VERIFY_IS_EQUAL((vec2[2]), 2);
+  VERIFY_IS_EQUAL((vec2[3]), 3);
+  VERIFY_IS_EQUAL((vec2[4]), 4);
+  VERIFY_IS_EQUAL((vec2[5]), 5);
+
+  VERIFY_IS_EQUAL((vec3[0]), 5);
+  VERIFY_IS_EQUAL((vec3[1]), 4);
+  VERIFY_IS_EQUAL((vec3[2]), 3);
+  VERIFY_IS_EQUAL((vec3[3]), 2);
+  VERIFY_IS_EQUAL((vec3[4]), 1);
+  VERIFY_IS_EQUAL((vec3[5]), 0);
+
+  VERIFY_IS_EQUAL((vec4[0]), 0);
+  VERIFY_IS_EQUAL((vec4[1]), 0);
+  VERIFY_IS_EQUAL((vec4[2]), 0);
+  VERIFY_IS_EQUAL((vec4[3]), 0);
+  VERIFY_IS_EQUAL((vec4[4]), 0);
+  VERIFY_IS_EQUAL((vec4[5]), 0);
+
+  Tensor<int, 1> vec5(vec1);
+
+  VERIFY_IS_EQUAL((vec5(0)), 4);
+  VERIFY_IS_EQUAL((vec5(1)), 8);
+  VERIFY_IS_EQUAL((vec5(2)), 15);
+  VERIFY_IS_EQUAL((vec5(3)), 16);
+  VERIFY_IS_EQUAL((vec5(4)), 23);
+  VERIFY_IS_EQUAL((vec5(5)), 42);
+
+  VERIFY_IS_EQUAL((vec5.data()[0]), 4);
+  VERIFY_IS_EQUAL((vec5.data()[1]), 8);
+  VERIFY_IS_EQUAL((vec5.data()[2]), 15);
+  VERIFY_IS_EQUAL((vec5.data()[3]), 16);
+  VERIFY_IS_EQUAL((vec5.data()[4]), 23);
+  VERIFY_IS_EQUAL((vec5.data()[5]), 42);
+}
+
+static void test_2d()
+{
+  Tensor<int, 2> mat1(2,3);
+  Tensor<int, 2, RowMajor> mat2(2,3);
+
+  mat1(0,0) = 0;
+  mat1(0,1) = 1;
+  mat1(0,2) = 2;
+  mat1(1,0) = 3;
+  mat1(1,1) = 4;
+  mat1(1,2) = 5;
+
+  mat2(0,0) = 0;
+  mat2(0,1) = 1;
+  mat2(0,2) = 2;
+  mat2(1,0) = 3;
+  mat2(1,1) = 4;
+  mat2(1,2) = 5;
+
+  VERIFY_IS_EQUAL((mat1.rank()), 2);
+  VERIFY_IS_EQUAL((mat1.size()), 6);
+  VERIFY_IS_EQUAL((mat1.dimensions()[0]), 2);
+  VERIFY_IS_EQUAL((mat1.dimensions()[1]), 3);
+
+  VERIFY_IS_EQUAL((mat2.rank()), 2);
+  VERIFY_IS_EQUAL((mat2.size()), 6);
+  VERIFY_IS_EQUAL((mat2.dimensions()[0]), 2);
+  VERIFY_IS_EQUAL((mat2.dimensions()[1]), 3);
+
+  VERIFY_IS_EQUAL((mat1.data()[0]), 0);
+  VERIFY_IS_EQUAL((mat1.data()[1]), 3);
+  VERIFY_IS_EQUAL((mat1.data()[2]), 1);
+  VERIFY_IS_EQUAL((mat1.data()[3]), 4);
+  VERIFY_IS_EQUAL((mat1.data()[4]), 2);
+  VERIFY_IS_EQUAL((mat1.data()[5]), 5);
+
+  VERIFY_IS_EQUAL((mat2.data()[0]), 0);
+  VERIFY_IS_EQUAL((mat2.data()[1]), 1);
+  VERIFY_IS_EQUAL((mat2.data()[2]), 2);
+  VERIFY_IS_EQUAL((mat2.data()[3]), 3);
+  VERIFY_IS_EQUAL((mat2.data()[4]), 4);
+  VERIFY_IS_EQUAL((mat2.data()[5]), 5);
+}
+
+static void test_3d()
+{
+  Tensor<int, 3> epsilon(3,3,3);
+  epsilon.setZero();
+  epsilon(0,1,2) = epsilon(2,0,1) = epsilon(1,2,0) = 1;
+  epsilon(2,1,0) = epsilon(0,2,1) = epsilon(1,0,2) = -1;
+
+  VERIFY_IS_EQUAL((epsilon.size()), 27);
+  VERIFY_IS_EQUAL((epsilon.dimensions()[0]), 3);
+  VERIFY_IS_EQUAL((epsilon.dimensions()[1]), 3);
+  VERIFY_IS_EQUAL((epsilon.dimensions()[2]), 3);
+
+  VERIFY_IS_EQUAL((epsilon(0,0,0)), 0);
+  VERIFY_IS_EQUAL((epsilon(0,0,1)), 0);
+  VERIFY_IS_EQUAL((epsilon(0,0,2)), 0);
+  VERIFY_IS_EQUAL((epsilon(0,1,0)), 0);
+  VERIFY_IS_EQUAL((epsilon(0,1,1)), 0);
+  VERIFY_IS_EQUAL((epsilon(0,2,0)), 0);
+  VERIFY_IS_EQUAL((epsilon(0,2,2)), 0);
+  VERIFY_IS_EQUAL((epsilon(1,0,0)), 0);
+  VERIFY_IS_EQUAL((epsilon(1,0,1)), 0);
+  VERIFY_IS_EQUAL((epsilon(1,1,0)), 0);
+  VERIFY_IS_EQUAL((epsilon(1,1,1)), 0);
+  VERIFY_IS_EQUAL((epsilon(1,1,2)), 0);
+  VERIFY_IS_EQUAL((epsilon(1,2,1)), 0);
+  VERIFY_IS_EQUAL((epsilon(1,2,2)), 0);
+  VERIFY_IS_EQUAL((epsilon(2,0,0)), 0);
+  VERIFY_IS_EQUAL((epsilon(2,0,2)), 0);
+  VERIFY_IS_EQUAL((epsilon(2,1,1)), 0);
+  VERIFY_IS_EQUAL((epsilon(2,1,2)), 0);
+  VERIFY_IS_EQUAL((epsilon(2,2,0)), 0);
+  VERIFY_IS_EQUAL((epsilon(2,2,1)), 0);
+  VERIFY_IS_EQUAL((epsilon(2,2,2)), 0);
+
+  VERIFY_IS_EQUAL((epsilon(0,1,2)), 1);
+  VERIFY_IS_EQUAL((epsilon(2,0,1)), 1);
+  VERIFY_IS_EQUAL((epsilon(1,2,0)), 1);
+  VERIFY_IS_EQUAL((epsilon(2,1,0)), -1);
+  VERIFY_IS_EQUAL((epsilon(0,2,1)), -1);
+  VERIFY_IS_EQUAL((epsilon(1,0,2)), -1);
+
+  array<Eigen::DenseIndex, 3> dims;
+  dims[0] = 2;
+  dims[1] = 3;
+  dims[2] = 4;
+  Tensor<int, 3> t1(dims);
+  Tensor<int, 3, RowMajor> t2(dims);
+
+  VERIFY_IS_EQUAL((t1.size()), 24);
+  VERIFY_IS_EQUAL((t1.dimensions()[0]), 2);
+  VERIFY_IS_EQUAL((t1.dimensions()[1]), 3);
+  VERIFY_IS_EQUAL((t1.dimensions()[2]), 4);
+
+  VERIFY_IS_EQUAL((t2.size()), 24);
+  VERIFY_IS_EQUAL((t2.dimensions()[0]), 2);
+  VERIFY_IS_EQUAL((t2.dimensions()[1]), 3);
+  VERIFY_IS_EQUAL((t2.dimensions()[2]), 4);
+
+  for (int i = 0; i < 2; i++) {
+    for (int j = 0; j < 3; j++) {
+      for (int k = 0; k < 4; k++) {
+        t1(i, j, k) = 100 * i + 10 * j + k;
+        t2(i, j, k) = 100 * i + 10 * j + k;
+      }
+    }
+  }
+
+  VERIFY_IS_EQUAL((t1.data()[0]),    0);
+  VERIFY_IS_EQUAL((t1.data()[1]),  100);
+  VERIFY_IS_EQUAL((t1.data()[2]),   10);
+  VERIFY_IS_EQUAL((t1.data()[3]),  110);
+  VERIFY_IS_EQUAL((t1.data()[4]),   20);
+  VERIFY_IS_EQUAL((t1.data()[5]),  120);
+  VERIFY_IS_EQUAL((t1.data()[6]),    1);
+  VERIFY_IS_EQUAL((t1.data()[7]),  101);
+  VERIFY_IS_EQUAL((t1.data()[8]),   11);
+  VERIFY_IS_EQUAL((t1.data()[9]),  111);
+  VERIFY_IS_EQUAL((t1.data()[10]),  21);
+  VERIFY_IS_EQUAL((t1.data()[11]), 121);
+  VERIFY_IS_EQUAL((t1.data()[12]),   2);
+  VERIFY_IS_EQUAL((t1.data()[13]), 102);
+  VERIFY_IS_EQUAL((t1.data()[14]),  12);
+  VERIFY_IS_EQUAL((t1.data()[15]), 112);
+  VERIFY_IS_EQUAL((t1.data()[16]),  22);
+  VERIFY_IS_EQUAL((t1.data()[17]), 122);
+  VERIFY_IS_EQUAL((t1.data()[18]),   3);
+  VERIFY_IS_EQUAL((t1.data()[19]), 103);
+  VERIFY_IS_EQUAL((t1.data()[20]),  13);
+  VERIFY_IS_EQUAL((t1.data()[21]), 113);
+  VERIFY_IS_EQUAL((t1.data()[22]),  23);
+  VERIFY_IS_EQUAL((t1.data()[23]), 123);
+
+  VERIFY_IS_EQUAL((t2.data()[0]),    0);
+  VERIFY_IS_EQUAL((t2.data()[1]),    1);
+  VERIFY_IS_EQUAL((t2.data()[2]),    2);
+  VERIFY_IS_EQUAL((t2.data()[3]),    3);
+  VERIFY_IS_EQUAL((t2.data()[4]),   10);
+  VERIFY_IS_EQUAL((t2.data()[5]),   11);
+  VERIFY_IS_EQUAL((t2.data()[6]),   12);
+  VERIFY_IS_EQUAL((t2.data()[7]),   13);
+  VERIFY_IS_EQUAL((t2.data()[8]),   20);
+  VERIFY_IS_EQUAL((t2.data()[9]),   21);
+  VERIFY_IS_EQUAL((t2.data()[10]),  22);
+  VERIFY_IS_EQUAL((t2.data()[11]),  23);
+  VERIFY_IS_EQUAL((t2.data()[12]), 100);
+  VERIFY_IS_EQUAL((t2.data()[13]), 101);
+  VERIFY_IS_EQUAL((t2.data()[14]), 102);
+  VERIFY_IS_EQUAL((t2.data()[15]), 103);
+  VERIFY_IS_EQUAL((t2.data()[16]), 110);
+  VERIFY_IS_EQUAL((t2.data()[17]), 111);
+  VERIFY_IS_EQUAL((t2.data()[18]), 112);
+  VERIFY_IS_EQUAL((t2.data()[19]), 113);
+  VERIFY_IS_EQUAL((t2.data()[20]), 120);
+  VERIFY_IS_EQUAL((t2.data()[21]), 121);
+  VERIFY_IS_EQUAL((t2.data()[22]), 122);
+  VERIFY_IS_EQUAL((t2.data()[23]), 123);
+}
+
+static void test_simple_assign()
+{
+  Tensor<int, 3> epsilon(3,3,3);
+  epsilon.setZero();
+  epsilon(0,1,2) = epsilon(2,0,1) = epsilon(1,2,0) = 1;
+  epsilon(2,1,0) = epsilon(0,2,1) = epsilon(1,0,2) = -1;
+
+  Tensor<int, 3> e2(3,3,3);
+  e2.setZero();
+  VERIFY_IS_EQUAL((e2(1,2,0)), 0);
+
+  e2 = epsilon;
+  VERIFY_IS_EQUAL((e2(1,2,0)), 1);
+  VERIFY_IS_EQUAL((e2(0,1,2)), 1);
+  VERIFY_IS_EQUAL((e2(2,0,1)), 1);
+  VERIFY_IS_EQUAL((e2(2,1,0)), -1);
+  VERIFY_IS_EQUAL((e2(0,2,1)), -1);
+  VERIFY_IS_EQUAL((e2(1,0,2)), -1);
+}
+
+static void test_resize()
+{
+  Tensor<int, 3> epsilon;
+  epsilon.resize(2,3,7);
+  VERIFY_IS_EQUAL(epsilon.dimension(0), 2);
+  VERIFY_IS_EQUAL(epsilon.dimension(1), 3);
+  VERIFY_IS_EQUAL(epsilon.dimension(2), 7);
+  VERIFY_IS_EQUAL(epsilon.size(), 2*3*7);
+
+  const int* old_data = epsilon.data();
+  epsilon.resize(3,2,7);
+  VERIFY_IS_EQUAL(epsilon.dimension(0), 3);
+  VERIFY_IS_EQUAL(epsilon.dimension(1), 2);
+  VERIFY_IS_EQUAL(epsilon.dimension(2), 7);
+  VERIFY_IS_EQUAL(epsilon.size(), 2*3*7);
+  VERIFY_IS_EQUAL(epsilon.data(), old_data);
+
+  epsilon.resize(3,5,7);
+  VERIFY_IS_EQUAL(epsilon.dimension(0), 3);
+  VERIFY_IS_EQUAL(epsilon.dimension(1), 5);
+  VERIFY_IS_EQUAL(epsilon.dimension(2), 7);
+  VERIFY_IS_EQUAL(epsilon.size(), 3*5*7);
+}
+
+void test_cxx11_tensor_simple()
+{
+  CALL_SUBTEST(test_0d());
+  CALL_SUBTEST(test_1d());
+  CALL_SUBTEST(test_2d());
+  CALL_SUBTEST(test_3d());
+  CALL_SUBTEST(test_simple_assign());
+  CALL_SUBTEST(test_resize());
+}
diff --git a/unsupported/test/cxx11_tensor_striding.cpp b/unsupported/test/cxx11_tensor_striding.cpp
new file mode 100644
index 000000000..935b908cc
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_striding.cpp
@@ -0,0 +1,119 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template<int DataLayout>
+static void test_simple_striding()
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+  array<ptrdiff_t, 4> strides;
+  strides[0] = 1;
+  strides[1] = 1;
+  strides[2] = 1;
+  strides[3] = 1;
+
+  Tensor<float, 4, DataLayout> no_stride;
+  no_stride = tensor.stride(strides);
+
+  VERIFY_IS_EQUAL(no_stride.dimension(0), 2);
+  VERIFY_IS_EQUAL(no_stride.dimension(1), 3);
+  VERIFY_IS_EQUAL(no_stride.dimension(2), 5);
+  VERIFY_IS_EQUAL(no_stride.dimension(3), 7);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(i,j,k,l));
+        }
+      }
+    }
+  }
+
+  strides[0] = 2;
+  strides[1] = 4;
+  strides[2] = 2;
+  strides[3] = 3;
+  Tensor<float, 4, DataLayout> stride;
+  stride = tensor.stride(strides);
+
+  VERIFY_IS_EQUAL(stride.dimension(0), 1);
+  VERIFY_IS_EQUAL(stride.dimension(1), 1);
+  VERIFY_IS_EQUAL(stride.dimension(2), 3);
+  VERIFY_IS_EQUAL(stride.dimension(3), 3);
+
+  for (int i = 0; i < 1; ++i) {
+    for (int j = 0; j < 1; ++j) {
+      for (int k = 0; k < 3; ++k) {
+        for (int l = 0; l < 3; ++l) {
+          VERIFY_IS_EQUAL(tensor(2*i,4*j,2*k,3*l), stride(i,j,k,l));
+        }
+      }
+    }
+  }
+}
+
+
+template<int DataLayout>
+static void test_striding_as_lvalue()
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+  array<ptrdiff_t, 4> strides;
+  strides[0] = 2;
+  strides[1] = 4;
+  strides[2] = 2;
+  strides[3] = 3;
+
+  Tensor<float, 4, DataLayout> result(3, 12, 10, 21);
+  result.stride(strides) = tensor;
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), result(2*i,4*j,2*k,3*l));
+        }
+      }
+    }
+  }
+
+  array<ptrdiff_t, 4> no_strides;
+  no_strides[0] = 1;
+  no_strides[1] = 1;
+  no_strides[2] = 1;
+  no_strides[3] = 1;
+  Tensor<float, 4, DataLayout> result2(3, 12, 10, 21);
+  result2.stride(strides) = tensor.stride(no_strides);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), result2(2*i,4*j,2*k,3*l));
+        }
+      }
+    }
+  }
+}
+
+
+void test_cxx11_tensor_striding()
+{
+  CALL_SUBTEST(test_simple_striding<ColMajor>());
+  CALL_SUBTEST(test_simple_striding<RowMajor>());
+  CALL_SUBTEST(test_striding_as_lvalue<ColMajor>());
+  CALL_SUBTEST(test_striding_as_lvalue<RowMajor>());
+}
diff --git a/unsupported/test/cxx11_tensor_sugar.cpp b/unsupported/test/cxx11_tensor_sugar.cpp
new file mode 100644
index 000000000..2f56eb495
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_sugar.cpp
@@ -0,0 +1,81 @@
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+static void test_comparison_sugar() {
+  // we already trust comparisons between tensors, we're simply checking that
+  // the sugared versions are doing the same thing
+  Tensor<int, 3> t(6, 7, 5);
+
+  t.setRandom();
+  // make sure we have at least one value == 0
+  t(0,0,0) = 0;
+
+  Tensor<bool,0> b;
+
+#define TEST_TENSOR_EQUAL(e1, e2) \
+  b = ((e1) == (e2)).all();       \
+  VERIFY(b())
+
+#define TEST_OP(op) TEST_TENSOR_EQUAL(t op 0, t op t.constant(0))
+
+  TEST_OP(==);
+  TEST_OP(!=);
+  TEST_OP(<=);
+  TEST_OP(>=);
+  TEST_OP(<);
+  TEST_OP(>);
+#undef TEST_OP
+#undef TEST_TENSOR_EQUAL
+}
+
+
+static void test_scalar_sugar_add_mul() {
+  Tensor<float, 3> A(6, 7, 5);
+  Tensor<float, 3> B(6, 7, 5);
+  A.setRandom();
+  B.setRandom();
+
+  const float alpha = 0.43f;
+  const float beta = 0.21f;
+  const float gamma = 0.14f;
+
+  Tensor<float, 3> R = A.constant(gamma) + A * A.constant(alpha) + B * B.constant(beta);
+  Tensor<float, 3> S = A * alpha + B * beta + gamma;
+  Tensor<float, 3> T = gamma + alpha * A + beta * B;
+
+  for (int i = 0; i < 6*7*5; ++i) {
+    VERIFY_IS_APPROX(R(i), S(i));
+    VERIFY_IS_APPROX(R(i), T(i));
+  }
+}
+
+static void test_scalar_sugar_sub_div() {
+  Tensor<float, 3> A(6, 7, 5);
+  Tensor<float, 3> B(6, 7, 5);
+  A.setRandom();
+  B.setRandom();
+
+  const float alpha = 0.43f;
+  const float beta = 0.21f;
+  const float gamma = 0.14f;
+  const float delta = 0.32f;
+
+  Tensor<float, 3> R = A.constant(gamma) - A / A.constant(alpha)
+      - B.constant(beta) / B - A.constant(delta);
+  Tensor<float, 3> S = gamma - A / alpha - beta / B - delta;
+
+  for (int i = 0; i < 6*7*5; ++i) {
+    VERIFY_IS_APPROX(R(i), S(i));
+  }
+}
+
+void test_cxx11_tensor_sugar()
+{
+  CALL_SUBTEST(test_comparison_sugar());
+  CALL_SUBTEST(test_scalar_sugar_add_mul());
+  CALL_SUBTEST(test_scalar_sugar_sub_div());
+}
diff --git a/unsupported/test/cxx11_tensor_sycl.cpp b/unsupported/test/cxx11_tensor_sycl.cpp
new file mode 100644
index 000000000..6a9c33422
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_sycl.cpp
@@ -0,0 +1,159 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_sycl
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+void test_sycl_cpu(const Eigen::SyclDevice &sycl_device) {
+
+  int sizeDim1 = 100;
+  int sizeDim2 = 100;
+  int sizeDim3 = 100;
+  array<int, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
+  Tensor<float, 3> in1(tensorRange);
+  Tensor<float, 3> in2(tensorRange);
+  Tensor<float, 3> in3(tensorRange);
+  Tensor<float, 3> out(tensorRange);
+
+  in2 = in2.random();
+  in3 = in3.random();
+
+  float * gpu_in1_data  = static_cast<float*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(float)));
+  float * gpu_in2_data  = static_cast<float*>(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(float)));
+  float * gpu_in3_data  = static_cast<float*>(sycl_device.allocate(in3.dimensions().TotalSize()*sizeof(float)));
+  float * gpu_out_data =  static_cast<float*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(float)));
+
+  TensorMap<Tensor<float, 3>> gpu_in1(gpu_in1_data, tensorRange);
+  TensorMap<Tensor<float, 3>> gpu_in2(gpu_in2_data, tensorRange);
+  TensorMap<Tensor<float, 3>> gpu_in3(gpu_in3_data, tensorRange);
+  TensorMap<Tensor<float, 3>> gpu_out(gpu_out_data, tensorRange);
+
+  /// a=1.2f
+  gpu_in1.device(sycl_device) = gpu_in1.constant(1.2f);
+  sycl_device.memcpyDeviceToHost(in1.data(), gpu_in1_data ,(in1.dimensions().TotalSize())*sizeof(float));
+  for (int i = 0; i < sizeDim1; ++i) {
+    for (int j = 0; j < sizeDim2; ++j) {
+      for (int k = 0; k < sizeDim3; ++k) {
+        VERIFY_IS_APPROX(in1(i,j,k), 1.2f);
+      }
+    }
+  }
+  printf("a=1.2f Test passed\n");
+
+  /// a=b*1.2f
+  gpu_out.device(sycl_device) = gpu_in1 * 1.2f;
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data ,(out.dimensions().TotalSize())*sizeof(float));
+  for (int i = 0; i < sizeDim1; ++i) {
+    for (int j = 0; j < sizeDim2; ++j) {
+      for (int k = 0; k < sizeDim3; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k),
+                         in1(i,j,k) * 1.2f);
+      }
+    }
+  }
+  printf("a=b*1.2f Test Passed\n");
+
+  /// c=a*b
+  sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.dimensions().TotalSize())*sizeof(float));
+  gpu_out.device(sycl_device) = gpu_in1 * gpu_in2;
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float));
+  for (int i = 0; i < sizeDim1; ++i) {
+    for (int j = 0; j < sizeDim2; ++j) {
+      for (int k = 0; k < sizeDim3; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k),
+                         in1(i,j,k) *
+                             in2(i,j,k));
+      }
+    }
+  }
+  printf("c=a*b Test Passed\n");
+
+  /// c=a+b
+  gpu_out.device(sycl_device) = gpu_in1 + gpu_in2;
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float));
+  for (int i = 0; i < sizeDim1; ++i) {
+    for (int j = 0; j < sizeDim2; ++j) {
+      for (int k = 0; k < sizeDim3; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k),
+                         in1(i,j,k) +
+                             in2(i,j,k));
+      }
+    }
+  }
+  printf("c=a+b Test Passed\n");
+
+  /// c=a*a
+  gpu_out.device(sycl_device) = gpu_in1 * gpu_in1;
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float));
+  for (int i = 0; i < sizeDim1; ++i) {
+    for (int j = 0; j < sizeDim2; ++j) {
+      for (int k = 0; k < sizeDim3; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k),
+                         in1(i,j,k) *
+                             in1(i,j,k));
+      }
+    }
+  }
+  printf("c= a*a Test Passed\n");
+
+  //a*3.14f + b*2.7f
+  gpu_out.device(sycl_device) =  gpu_in1 * gpu_in1.constant(3.14f) + gpu_in2 * gpu_in2.constant(2.7f);
+  sycl_device.memcpyDeviceToHost(out.data(),gpu_out_data,(out.dimensions().TotalSize())*sizeof(float));
+  for (int i = 0; i < sizeDim1; ++i) {
+    for (int j = 0; j < sizeDim2; ++j) {
+      for (int k = 0; k < sizeDim3; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k),
+                         in1(i,j,k) * 3.14f
+                       + in2(i,j,k) * 2.7f);
+      }
+    }
+  }
+  printf("a*3.14f + b*2.7f Test Passed\n");
+
+  ///d= (a>0.5? b:c)
+  sycl_device.memcpyHostToDevice(gpu_in3_data, in3.data(),(in3.dimensions().TotalSize())*sizeof(float));
+  gpu_out.device(sycl_device) =(gpu_in1 > gpu_in1.constant(0.5f)).select(gpu_in2, gpu_in3);
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float));
+  for (int i = 0; i < sizeDim1; ++i) {
+    for (int j = 0; j < sizeDim2; ++j) {
+      for (int k = 0; k < sizeDim3; ++k) {
+        VERIFY_IS_APPROX(out(i, j, k), (in1(i, j, k) > 0.5f)
+                                                ? in2(i, j, k)
+                                                : in3(i, j, k));
+      }
+    }
+  }
+  printf("d= (a>0.5? b:c) Test Passed\n");
+  sycl_device.deallocate(gpu_in1_data);
+  sycl_device.deallocate(gpu_in2_data);
+  sycl_device.deallocate(gpu_in3_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+void test_cxx11_tensor_sycl() {
+  cl::sycl::gpu_selector s;
+  Eigen::SyclDevice sycl_device(s);
+  CALL_SUBTEST(test_sycl_cpu(sycl_device));
+}
diff --git a/unsupported/test/cxx11_tensor_symmetry.cpp b/unsupported/test/cxx11_tensor_symmetry.cpp
new file mode 100644
index 000000000..d680e9b3b
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_symmetry.cpp
@@ -0,0 +1,818 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+#include <Eigen/CXX11/TensorSymmetry>
+
+#include <map>
+#include <set>
+
+using Eigen::Tensor;
+using Eigen::SGroup;
+using Eigen::DynamicSGroup;
+using Eigen::StaticSGroup;
+using Eigen::Symmetry;
+using Eigen::AntiSymmetry;
+using Eigen::Hermiticity;
+using Eigen::AntiHermiticity;
+
+using Eigen::NegationFlag;
+using Eigen::ConjugationFlag;
+using Eigen::GlobalZeroFlag;
+using Eigen::GlobalRealFlag;
+using Eigen::GlobalImagFlag;
+
+// helper function to determine if the compiler intantiated a static
+// or dynamic symmetry group
+template<typename... Sym>
+bool isDynGroup(StaticSGroup<Sym...> const& dummy)
+{
+  (void)dummy;
+  return false;
+}
+
+bool isDynGroup(DynamicSGroup const& dummy)
+{
+  (void)dummy;
+  return true;
+}
+
+// helper class for checking that the symmetry groups are correct
+struct checkIdx {
+  template<typename ArrType>
+  static inline int doCheck_(ArrType e, int flags, int dummy, std::set<uint64_t>& found, std::map<uint64_t, int> const& expected)
+  {
+    // use decimal representation of value
+    uint64_t value = e[0];
+    for (std::size_t i = 1; i < e.size(); i++)
+      value = value * 10 + e[i];
+
+    // we want to make sure that we find each element
+    auto it = expected.find(value);
+    VERIFY((it != expected.end()));
+    VERIFY_IS_EQUAL(it->second, flags);
+
+    // we want to make sure we only have each element once;
+    // set::insert returns true for the second part of the pair
+    // if the element was really inserted and not already there
+    auto p = found.insert(value);
+    VERIFY((p.second));
+
+    return dummy;
+  }
+
+  static inline int run(std::vector<int> e, int flags, int dummy, std::set<uint64_t>& found, std::map<uint64_t, int> const& expected)
+  {
+    return doCheck_(e, flags, dummy, found, expected);
+  }
+
+  template<std::size_t N>
+  static inline int run(std::array<int, N> e, int flags, int dummy, std::set<uint64_t>& found, std::map<uint64_t, int> const& expected)
+  {
+    return doCheck_(e, flags, dummy, found, expected);
+  }
+};
+
+static void test_symgroups_static()
+{
+  std::array<int, 7> identity{{0,1,2,3,4,5,6}};
+
+  // Simple static symmetry group
+  StaticSGroup<
+    AntiSymmetry<0,1>,
+    Hermiticity<0,2>
+  > group;
+
+  std::set<uint64_t> found;
+  std::map<uint64_t, int> expected;
+  expected[ 123456] = 0;
+  expected[1023456] = NegationFlag;
+  expected[2103456] = ConjugationFlag;
+  expected[1203456] = ConjugationFlag | NegationFlag;
+  expected[2013456] = ConjugationFlag | NegationFlag;
+  expected[ 213456] = ConjugationFlag;
+
+  VERIFY_IS_EQUAL(group.size(), 6u);
+  VERIFY_IS_EQUAL(group.globalFlags(), GlobalImagFlag);
+  group.apply<checkIdx, int>(identity, 0, found, expected);
+  VERIFY_IS_EQUAL(found.size(), 6u);
+}
+
+static void test_symgroups_dynamic()
+{
+  std::vector<int> identity;
+  for (int i = 0; i <= 6; i++)
+    identity.push_back(i);
+
+  // Simple dynamic symmetry group
+  DynamicSGroup group;
+  group.add(0,1,NegationFlag);
+  group.add(0,2,ConjugationFlag);
+
+  VERIFY_IS_EQUAL(group.size(), 6u);
+  VERIFY_IS_EQUAL(group.globalFlags(), GlobalImagFlag);
+
+  std::set<uint64_t> found;
+  std::map<uint64_t, int> expected;
+  expected[ 123456] = 0;
+  expected[1023456] = NegationFlag;
+  expected[2103456] = ConjugationFlag;
+  expected[1203456] = ConjugationFlag | NegationFlag;
+  expected[2013456] = ConjugationFlag | NegationFlag;
+  expected[ 213456] = ConjugationFlag;
+
+  VERIFY_IS_EQUAL(group.size(), 6u);
+  VERIFY_IS_EQUAL(group.globalFlags(), GlobalImagFlag);
+  group.apply<checkIdx, int>(identity, 0, found, expected);
+  VERIFY_IS_EQUAL(found.size(), 6u);
+}
+
+static void test_symgroups_selection()
+{
+  std::array<int, 7> identity7{{0,1,2,3,4,5,6}};
+  std::array<int, 10> identity10{{0,1,2,3,4,5,6,7,8,9}};
+
+  {
+    // Do the same test as in test_symgroups_static but
+    // require selection via SGroup
+    SGroup<
+      AntiSymmetry<0,1>,
+      Hermiticity<0,2>
+    > group;
+
+    std::set<uint64_t> found;
+    std::map<uint64_t, int> expected;
+    expected[ 123456] = 0;
+    expected[1023456] = NegationFlag;
+    expected[2103456] = ConjugationFlag;
+    expected[1203456] = ConjugationFlag | NegationFlag;
+    expected[2013456] = ConjugationFlag | NegationFlag;
+    expected[ 213456] = ConjugationFlag;
+
+    VERIFY(!isDynGroup(group));
+    VERIFY_IS_EQUAL(group.size(), 6u);
+    VERIFY_IS_EQUAL(group.globalFlags(), GlobalImagFlag);
+    group.apply<checkIdx, int>(identity7, 0, found, expected);
+    VERIFY_IS_EQUAL(found.size(), 6u);
+  }
+
+  {
+    // simple factorizing group: 5 generators, 2^5 = 32 elements
+    // selection should make this dynamic, although static group
+    // can still be reasonably generated
+    SGroup<
+      Symmetry<0,1>,
+      Symmetry<2,3>,
+      Symmetry<4,5>,
+      Symmetry<6,7>,
+      Symmetry<8,9>
+    > group;
+
+    std::set<uint64_t> found;
+    std::map<uint64_t, int> expected;
+    expected[ 123456789] = 0; expected[ 123456798] = 0; expected[ 123457689] = 0; expected[ 123457698] = 0;
+    expected[ 123546789] = 0; expected[ 123546798] = 0; expected[ 123547689] = 0; expected[ 123547698] = 0;
+    expected[ 132456789] = 0; expected[ 132456798] = 0; expected[ 132457689] = 0; expected[ 132457698] = 0;
+    expected[ 132546789] = 0; expected[ 132546798] = 0; expected[ 132547689] = 0; expected[ 132547698] = 0;
+    expected[1023456789] = 0; expected[1023456798] = 0; expected[1023457689] = 0; expected[1023457698] = 0;
+    expected[1023546789] = 0; expected[1023546798] = 0; expected[1023547689] = 0; expected[1023547698] = 0;
+    expected[1032456789] = 0; expected[1032456798] = 0; expected[1032457689] = 0; expected[1032457698] = 0;
+    expected[1032546789] = 0; expected[1032546798] = 0; expected[1032547689] = 0; expected[1032547698] = 0;
+
+    VERIFY(isDynGroup(group));
+    VERIFY_IS_EQUAL(group.size(), 32u);
+    VERIFY_IS_EQUAL(group.globalFlags(), 0);
+    group.apply<checkIdx, int>(identity10, 0, found, expected);
+    VERIFY_IS_EQUAL(found.size(), 32u);
+
+    // no verify that we could also generate a static group
+    // with these generators
+    found.clear();
+    StaticSGroup<
+      Symmetry<0,1>,
+      Symmetry<2,3>,
+      Symmetry<4,5>,
+      Symmetry<6,7>,
+      Symmetry<8,9>
+    > group_static;
+    VERIFY_IS_EQUAL(group_static.size(), 32u);
+    VERIFY_IS_EQUAL(group_static.globalFlags(), 0);
+    group_static.apply<checkIdx, int>(identity10, 0, found, expected);
+    VERIFY_IS_EQUAL(found.size(), 32u);
+  }
+
+  {
+    // try to create a HUGE group
+    SGroup<
+      Symmetry<0,1>,
+      Symmetry<1,2>,
+      Symmetry<2,3>,
+      Symmetry<3,4>,
+      Symmetry<4,5>,
+      Symmetry<5,6>
+    > group;
+
+    std::set<uint64_t> found;
+    uint64_t pre_expected[5040] = {
+       123456, 1023456,  213456, 2013456, 1203456, 2103456,  132456, 1032456,  312456, 3012456, 1302456, 3102456,
+       231456, 2031456,  321456, 3021456, 2301456, 3201456, 1230456, 2130456, 1320456, 3120456, 2310456, 3210456,
+       124356, 1024356,  214356, 2014356, 1204356, 2104356,  142356, 1042356,  412356, 4012356, 1402356, 4102356,
+       241356, 2041356,  421356, 4021356, 2401356, 4201356, 1240356, 2140356, 1420356, 4120356, 2410356, 4210356,
+       134256, 1034256,  314256, 3014256, 1304256, 3104256,  143256, 1043256,  413256, 4013256, 1403256, 4103256,
+       341256, 3041256,  431256, 4031256, 3401256, 4301256, 1340256, 3140256, 1430256, 4130256, 3410256, 4310256,
+       234156, 2034156,  324156, 3024156, 2304156, 3204156,  243156, 2043156,  423156, 4023156, 2403156, 4203156,
+       342156, 3042156,  432156, 4032156, 3402156, 4302156, 2340156, 3240156, 2430156, 4230156, 3420156, 4320156,
+      1234056, 2134056, 1324056, 3124056, 2314056, 3214056, 1243056, 2143056, 1423056, 4123056, 2413056, 4213056,
+      1342056, 3142056, 1432056, 4132056, 3412056, 4312056, 2341056, 3241056, 2431056, 4231056, 3421056, 4321056,
+       123546, 1023546,  213546, 2013546, 1203546, 2103546,  132546, 1032546,  312546, 3012546, 1302546, 3102546,
+       231546, 2031546,  321546, 3021546, 2301546, 3201546, 1230546, 2130546, 1320546, 3120546, 2310546, 3210546,
+       125346, 1025346,  215346, 2015346, 1205346, 2105346,  152346, 1052346,  512346, 5012346, 1502346, 5102346,
+       251346, 2051346,  521346, 5021346, 2501346, 5201346, 1250346, 2150346, 1520346, 5120346, 2510346, 5210346,
+       135246, 1035246,  315246, 3015246, 1305246, 3105246,  153246, 1053246,  513246, 5013246, 1503246, 5103246,
+       351246, 3051246,  531246, 5031246, 3501246, 5301246, 1350246, 3150246, 1530246, 5130246, 3510246, 5310246,
+       235146, 2035146,  325146, 3025146, 2305146, 3205146,  253146, 2053146,  523146, 5023146, 2503146, 5203146,
+       352146, 3052146,  532146, 5032146, 3502146, 5302146, 2350146, 3250146, 2530146, 5230146, 3520146, 5320146,
+      1235046, 2135046, 1325046, 3125046, 2315046, 3215046, 1253046, 2153046, 1523046, 5123046, 2513046, 5213046,
+      1352046, 3152046, 1532046, 5132046, 3512046, 5312046, 2351046, 3251046, 2531046, 5231046, 3521046, 5321046,
+       124536, 1024536,  214536, 2014536, 1204536, 2104536,  142536, 1042536,  412536, 4012536, 1402536, 4102536,
+       241536, 2041536,  421536, 4021536, 2401536, 4201536, 1240536, 2140536, 1420536, 4120536, 2410536, 4210536,
+       125436, 1025436,  215436, 2015436, 1205436, 2105436,  152436, 1052436,  512436, 5012436, 1502436, 5102436,
+       251436, 2051436,  521436, 5021436, 2501436, 5201436, 1250436, 2150436, 1520436, 5120436, 2510436, 5210436,
+       145236, 1045236,  415236, 4015236, 1405236, 4105236,  154236, 1054236,  514236, 5014236, 1504236, 5104236,
+       451236, 4051236,  541236, 5041236, 4501236, 5401236, 1450236, 4150236, 1540236, 5140236, 4510236, 5410236,
+       245136, 2045136,  425136, 4025136, 2405136, 4205136,  254136, 2054136,  524136, 5024136, 2504136, 5204136,
+       452136, 4052136,  542136, 5042136, 4502136, 5402136, 2450136, 4250136, 2540136, 5240136, 4520136, 5420136,
+      1245036, 2145036, 1425036, 4125036, 2415036, 4215036, 1254036, 2154036, 1524036, 5124036, 2514036, 5214036,
+      1452036, 4152036, 1542036, 5142036, 4512036, 5412036, 2451036, 4251036, 2541036, 5241036, 4521036, 5421036,
+       134526, 1034526,  314526, 3014526, 1304526, 3104526,  143526, 1043526,  413526, 4013526, 1403526, 4103526,
+       341526, 3041526,  431526, 4031526, 3401526, 4301526, 1340526, 3140526, 1430526, 4130526, 3410526, 4310526,
+       135426, 1035426,  315426, 3015426, 1305426, 3105426,  153426, 1053426,  513426, 5013426, 1503426, 5103426,
+       351426, 3051426,  531426, 5031426, 3501426, 5301426, 1350426, 3150426, 1530426, 5130426, 3510426, 5310426,
+       145326, 1045326,  415326, 4015326, 1405326, 4105326,  154326, 1054326,  514326, 5014326, 1504326, 5104326,
+       451326, 4051326,  541326, 5041326, 4501326, 5401326, 1450326, 4150326, 1540326, 5140326, 4510326, 5410326,
+       345126, 3045126,  435126, 4035126, 3405126, 4305126,  354126, 3054126,  534126, 5034126, 3504126, 5304126,
+       453126, 4053126,  543126, 5043126, 4503126, 5403126, 3450126, 4350126, 3540126, 5340126, 4530126, 5430126,
+      1345026, 3145026, 1435026, 4135026, 3415026, 4315026, 1354026, 3154026, 1534026, 5134026, 3514026, 5314026,
+      1453026, 4153026, 1543026, 5143026, 4513026, 5413026, 3451026, 4351026, 3541026, 5341026, 4531026, 5431026,
+       234516, 2034516,  324516, 3024516, 2304516, 3204516,  243516, 2043516,  423516, 4023516, 2403516, 4203516,
+       342516, 3042516,  432516, 4032516, 3402516, 4302516, 2340516, 3240516, 2430516, 4230516, 3420516, 4320516,
+       235416, 2035416,  325416, 3025416, 2305416, 3205416,  253416, 2053416,  523416, 5023416, 2503416, 5203416,
+       352416, 3052416,  532416, 5032416, 3502416, 5302416, 2350416, 3250416, 2530416, 5230416, 3520416, 5320416,
+       245316, 2045316,  425316, 4025316, 2405316, 4205316,  254316, 2054316,  524316, 5024316, 2504316, 5204316,
+       452316, 4052316,  542316, 5042316, 4502316, 5402316, 2450316, 4250316, 2540316, 5240316, 4520316, 5420316,
+       345216, 3045216,  435216, 4035216, 3405216, 4305216,  354216, 3054216,  534216, 5034216, 3504216, 5304216,
+       453216, 4053216,  543216, 5043216, 4503216, 5403216, 3450216, 4350216, 3540216, 5340216, 4530216, 5430216,
+      2345016, 3245016, 2435016, 4235016, 3425016, 4325016, 2354016, 3254016, 2534016, 5234016, 3524016, 5324016,
+      2453016, 4253016, 2543016, 5243016, 4523016, 5423016, 3452016, 4352016, 3542016, 5342016, 4532016, 5432016,
+      1234506, 2134506, 1324506, 3124506, 2314506, 3214506, 1243506, 2143506, 1423506, 4123506, 2413506, 4213506,
+      1342506, 3142506, 1432506, 4132506, 3412506, 4312506, 2341506, 3241506, 2431506, 4231506, 3421506, 4321506,
+      1235406, 2135406, 1325406, 3125406, 2315406, 3215406, 1253406, 2153406, 1523406, 5123406, 2513406, 5213406,
+      1352406, 3152406, 1532406, 5132406, 3512406, 5312406, 2351406, 3251406, 2531406, 5231406, 3521406, 5321406,
+      1245306, 2145306, 1425306, 4125306, 2415306, 4215306, 1254306, 2154306, 1524306, 5124306, 2514306, 5214306,
+      1452306, 4152306, 1542306, 5142306, 4512306, 5412306, 2451306, 4251306, 2541306, 5241306, 4521306, 5421306,
+      1345206, 3145206, 1435206, 4135206, 3415206, 4315206, 1354206, 3154206, 1534206, 5134206, 3514206, 5314206,
+      1453206, 4153206, 1543206, 5143206, 4513206, 5413206, 3451206, 4351206, 3541206, 5341206, 4531206, 5431206,
+      2345106, 3245106, 2435106, 4235106, 3425106, 4325106, 2354106, 3254106, 2534106, 5234106, 3524106, 5324106,
+      2453106, 4253106, 2543106, 5243106, 4523106, 5423106, 3452106, 4352106, 3542106, 5342106, 4532106, 5432106,
+       123465, 1023465,  213465, 2013465, 1203465, 2103465,  132465, 1032465,  312465, 3012465, 1302465, 3102465,
+       231465, 2031465,  321465, 3021465, 2301465, 3201465, 1230465, 2130465, 1320465, 3120465, 2310465, 3210465,
+       124365, 1024365,  214365, 2014365, 1204365, 2104365,  142365, 1042365,  412365, 4012365, 1402365, 4102365,
+       241365, 2041365,  421365, 4021365, 2401365, 4201365, 1240365, 2140365, 1420365, 4120365, 2410365, 4210365,
+       134265, 1034265,  314265, 3014265, 1304265, 3104265,  143265, 1043265,  413265, 4013265, 1403265, 4103265,
+       341265, 3041265,  431265, 4031265, 3401265, 4301265, 1340265, 3140265, 1430265, 4130265, 3410265, 4310265,
+       234165, 2034165,  324165, 3024165, 2304165, 3204165,  243165, 2043165,  423165, 4023165, 2403165, 4203165,
+       342165, 3042165,  432165, 4032165, 3402165, 4302165, 2340165, 3240165, 2430165, 4230165, 3420165, 4320165,
+      1234065, 2134065, 1324065, 3124065, 2314065, 3214065, 1243065, 2143065, 1423065, 4123065, 2413065, 4213065,
+      1342065, 3142065, 1432065, 4132065, 3412065, 4312065, 2341065, 3241065, 2431065, 4231065, 3421065, 4321065,
+       123645, 1023645,  213645, 2013645, 1203645, 2103645,  132645, 1032645,  312645, 3012645, 1302645, 3102645,
+       231645, 2031645,  321645, 3021645, 2301645, 3201645, 1230645, 2130645, 1320645, 3120645, 2310645, 3210645,
+       126345, 1026345,  216345, 2016345, 1206345, 2106345,  162345, 1062345,  612345, 6012345, 1602345, 6102345,
+       261345, 2061345,  621345, 6021345, 2601345, 6201345, 1260345, 2160345, 1620345, 6120345, 2610345, 6210345,
+       136245, 1036245,  316245, 3016245, 1306245, 3106245,  163245, 1063245,  613245, 6013245, 1603245, 6103245,
+       361245, 3061245,  631245, 6031245, 3601245, 6301245, 1360245, 3160245, 1630245, 6130245, 3610245, 6310245,
+       236145, 2036145,  326145, 3026145, 2306145, 3206145,  263145, 2063145,  623145, 6023145, 2603145, 6203145,
+       362145, 3062145,  632145, 6032145, 3602145, 6302145, 2360145, 3260145, 2630145, 6230145, 3620145, 6320145,
+      1236045, 2136045, 1326045, 3126045, 2316045, 3216045, 1263045, 2163045, 1623045, 6123045, 2613045, 6213045,
+      1362045, 3162045, 1632045, 6132045, 3612045, 6312045, 2361045, 3261045, 2631045, 6231045, 3621045, 6321045,
+       124635, 1024635,  214635, 2014635, 1204635, 2104635,  142635, 1042635,  412635, 4012635, 1402635, 4102635,
+       241635, 2041635,  421635, 4021635, 2401635, 4201635, 1240635, 2140635, 1420635, 4120635, 2410635, 4210635,
+       126435, 1026435,  216435, 2016435, 1206435, 2106435,  162435, 1062435,  612435, 6012435, 1602435, 6102435,
+       261435, 2061435,  621435, 6021435, 2601435, 6201435, 1260435, 2160435, 1620435, 6120435, 2610435, 6210435,
+       146235, 1046235,  416235, 4016235, 1406235, 4106235,  164235, 1064235,  614235, 6014235, 1604235, 6104235,
+       461235, 4061235,  641235, 6041235, 4601235, 6401235, 1460235, 4160235, 1640235, 6140235, 4610235, 6410235,
+       246135, 2046135,  426135, 4026135, 2406135, 4206135,  264135, 2064135,  624135, 6024135, 2604135, 6204135,
+       462135, 4062135,  642135, 6042135, 4602135, 6402135, 2460135, 4260135, 2640135, 6240135, 4620135, 6420135,
+      1246035, 2146035, 1426035, 4126035, 2416035, 4216035, 1264035, 2164035, 1624035, 6124035, 2614035, 6214035,
+      1462035, 4162035, 1642035, 6142035, 4612035, 6412035, 2461035, 4261035, 2641035, 6241035, 4621035, 6421035,
+       134625, 1034625,  314625, 3014625, 1304625, 3104625,  143625, 1043625,  413625, 4013625, 1403625, 4103625,
+       341625, 3041625,  431625, 4031625, 3401625, 4301625, 1340625, 3140625, 1430625, 4130625, 3410625, 4310625,
+       136425, 1036425,  316425, 3016425, 1306425, 3106425,  163425, 1063425,  613425, 6013425, 1603425, 6103425,
+       361425, 3061425,  631425, 6031425, 3601425, 6301425, 1360425, 3160425, 1630425, 6130425, 3610425, 6310425,
+       146325, 1046325,  416325, 4016325, 1406325, 4106325,  164325, 1064325,  614325, 6014325, 1604325, 6104325,
+       461325, 4061325,  641325, 6041325, 4601325, 6401325, 1460325, 4160325, 1640325, 6140325, 4610325, 6410325,
+       346125, 3046125,  436125, 4036125, 3406125, 4306125,  364125, 3064125,  634125, 6034125, 3604125, 6304125,
+       463125, 4063125,  643125, 6043125, 4603125, 6403125, 3460125, 4360125, 3640125, 6340125, 4630125, 6430125,
+      1346025, 3146025, 1436025, 4136025, 3416025, 4316025, 1364025, 3164025, 1634025, 6134025, 3614025, 6314025,
+      1463025, 4163025, 1643025, 6143025, 4613025, 6413025, 3461025, 4361025, 3641025, 6341025, 4631025, 6431025,
+       234615, 2034615,  324615, 3024615, 2304615, 3204615,  243615, 2043615,  423615, 4023615, 2403615, 4203615,
+       342615, 3042615,  432615, 4032615, 3402615, 4302615, 2340615, 3240615, 2430615, 4230615, 3420615, 4320615,
+       236415, 2036415,  326415, 3026415, 2306415, 3206415,  263415, 2063415,  623415, 6023415, 2603415, 6203415,
+       362415, 3062415,  632415, 6032415, 3602415, 6302415, 2360415, 3260415, 2630415, 6230415, 3620415, 6320415,
+       246315, 2046315,  426315, 4026315, 2406315, 4206315,  264315, 2064315,  624315, 6024315, 2604315, 6204315,
+       462315, 4062315,  642315, 6042315, 4602315, 6402315, 2460315, 4260315, 2640315, 6240315, 4620315, 6420315,
+       346215, 3046215,  436215, 4036215, 3406215, 4306215,  364215, 3064215,  634215, 6034215, 3604215, 6304215,
+       463215, 4063215,  643215, 6043215, 4603215, 6403215, 3460215, 4360215, 3640215, 6340215, 4630215, 6430215,
+      2346015, 3246015, 2436015, 4236015, 3426015, 4326015, 2364015, 3264015, 2634015, 6234015, 3624015, 6324015,
+      2463015, 4263015, 2643015, 6243015, 4623015, 6423015, 3462015, 4362015, 3642015, 6342015, 4632015, 6432015,
+      1234605, 2134605, 1324605, 3124605, 2314605, 3214605, 1243605, 2143605, 1423605, 4123605, 2413605, 4213605,
+      1342605, 3142605, 1432605, 4132605, 3412605, 4312605, 2341605, 3241605, 2431605, 4231605, 3421605, 4321605,
+      1236405, 2136405, 1326405, 3126405, 2316405, 3216405, 1263405, 2163405, 1623405, 6123405, 2613405, 6213405,
+      1362405, 3162405, 1632405, 6132405, 3612405, 6312405, 2361405, 3261405, 2631405, 6231405, 3621405, 6321405,
+      1246305, 2146305, 1426305, 4126305, 2416305, 4216305, 1264305, 2164305, 1624305, 6124305, 2614305, 6214305,
+      1462305, 4162305, 1642305, 6142305, 4612305, 6412305, 2461305, 4261305, 2641305, 6241305, 4621305, 6421305,
+      1346205, 3146205, 1436205, 4136205, 3416205, 4316205, 1364205, 3164205, 1634205, 6134205, 3614205, 6314205,
+      1463205, 4163205, 1643205, 6143205, 4613205, 6413205, 3461205, 4361205, 3641205, 6341205, 4631205, 6431205,
+      2346105, 3246105, 2436105, 4236105, 3426105, 4326105, 2364105, 3264105, 2634105, 6234105, 3624105, 6324105,
+      2463105, 4263105, 2643105, 6243105, 4623105, 6423105, 3462105, 4362105, 3642105, 6342105, 4632105, 6432105,
+       123564, 1023564,  213564, 2013564, 1203564, 2103564,  132564, 1032564,  312564, 3012564, 1302564, 3102564,
+       231564, 2031564,  321564, 3021564, 2301564, 3201564, 1230564, 2130564, 1320564, 3120564, 2310564, 3210564,
+       125364, 1025364,  215364, 2015364, 1205364, 2105364,  152364, 1052364,  512364, 5012364, 1502364, 5102364,
+       251364, 2051364,  521364, 5021364, 2501364, 5201364, 1250364, 2150364, 1520364, 5120364, 2510364, 5210364,
+       135264, 1035264,  315264, 3015264, 1305264, 3105264,  153264, 1053264,  513264, 5013264, 1503264, 5103264,
+       351264, 3051264,  531264, 5031264, 3501264, 5301264, 1350264, 3150264, 1530264, 5130264, 3510264, 5310264,
+       235164, 2035164,  325164, 3025164, 2305164, 3205164,  253164, 2053164,  523164, 5023164, 2503164, 5203164,
+       352164, 3052164,  532164, 5032164, 3502164, 5302164, 2350164, 3250164, 2530164, 5230164, 3520164, 5320164,
+      1235064, 2135064, 1325064, 3125064, 2315064, 3215064, 1253064, 2153064, 1523064, 5123064, 2513064, 5213064,
+      1352064, 3152064, 1532064, 5132064, 3512064, 5312064, 2351064, 3251064, 2531064, 5231064, 3521064, 5321064,
+       123654, 1023654,  213654, 2013654, 1203654, 2103654,  132654, 1032654,  312654, 3012654, 1302654, 3102654,
+       231654, 2031654,  321654, 3021654, 2301654, 3201654, 1230654, 2130654, 1320654, 3120654, 2310654, 3210654,
+       126354, 1026354,  216354, 2016354, 1206354, 2106354,  162354, 1062354,  612354, 6012354, 1602354, 6102354,
+       261354, 2061354,  621354, 6021354, 2601354, 6201354, 1260354, 2160354, 1620354, 6120354, 2610354, 6210354,
+       136254, 1036254,  316254, 3016254, 1306254, 3106254,  163254, 1063254,  613254, 6013254, 1603254, 6103254,
+       361254, 3061254,  631254, 6031254, 3601254, 6301254, 1360254, 3160254, 1630254, 6130254, 3610254, 6310254,
+       236154, 2036154,  326154, 3026154, 2306154, 3206154,  263154, 2063154,  623154, 6023154, 2603154, 6203154,
+       362154, 3062154,  632154, 6032154, 3602154, 6302154, 2360154, 3260154, 2630154, 6230154, 3620154, 6320154,
+      1236054, 2136054, 1326054, 3126054, 2316054, 3216054, 1263054, 2163054, 1623054, 6123054, 2613054, 6213054,
+      1362054, 3162054, 1632054, 6132054, 3612054, 6312054, 2361054, 3261054, 2631054, 6231054, 3621054, 6321054,
+       125634, 1025634,  215634, 2015634, 1205634, 2105634,  152634, 1052634,  512634, 5012634, 1502634, 5102634,
+       251634, 2051634,  521634, 5021634, 2501634, 5201634, 1250634, 2150634, 1520634, 5120634, 2510634, 5210634,
+       126534, 1026534,  216534, 2016534, 1206534, 2106534,  162534, 1062534,  612534, 6012534, 1602534, 6102534,
+       261534, 2061534,  621534, 6021534, 2601534, 6201534, 1260534, 2160534, 1620534, 6120534, 2610534, 6210534,
+       156234, 1056234,  516234, 5016234, 1506234, 5106234,  165234, 1065234,  615234, 6015234, 1605234, 6105234,
+       561234, 5061234,  651234, 6051234, 5601234, 6501234, 1560234, 5160234, 1650234, 6150234, 5610234, 6510234,
+       256134, 2056134,  526134, 5026134, 2506134, 5206134,  265134, 2065134,  625134, 6025134, 2605134, 6205134,
+       562134, 5062134,  652134, 6052134, 5602134, 6502134, 2560134, 5260134, 2650134, 6250134, 5620134, 6520134,
+      1256034, 2156034, 1526034, 5126034, 2516034, 5216034, 1265034, 2165034, 1625034, 6125034, 2615034, 6215034,
+      1562034, 5162034, 1652034, 6152034, 5612034, 6512034, 2561034, 5261034, 2651034, 6251034, 5621034, 6521034,
+       135624, 1035624,  315624, 3015624, 1305624, 3105624,  153624, 1053624,  513624, 5013624, 1503624, 5103624,
+       351624, 3051624,  531624, 5031624, 3501624, 5301624, 1350624, 3150624, 1530624, 5130624, 3510624, 5310624,
+       136524, 1036524,  316524, 3016524, 1306524, 3106524,  163524, 1063524,  613524, 6013524, 1603524, 6103524,
+       361524, 3061524,  631524, 6031524, 3601524, 6301524, 1360524, 3160524, 1630524, 6130524, 3610524, 6310524,
+       156324, 1056324,  516324, 5016324, 1506324, 5106324,  165324, 1065324,  615324, 6015324, 1605324, 6105324,
+       561324, 5061324,  651324, 6051324, 5601324, 6501324, 1560324, 5160324, 1650324, 6150324, 5610324, 6510324,
+       356124, 3056124,  536124, 5036124, 3506124, 5306124,  365124, 3065124,  635124, 6035124, 3605124, 6305124,
+       563124, 5063124,  653124, 6053124, 5603124, 6503124, 3560124, 5360124, 3650124, 6350124, 5630124, 6530124,
+      1356024, 3156024, 1536024, 5136024, 3516024, 5316024, 1365024, 3165024, 1635024, 6135024, 3615024, 6315024,
+      1563024, 5163024, 1653024, 6153024, 5613024, 6513024, 3561024, 5361024, 3651024, 6351024, 5631024, 6531024,
+       235614, 2035614,  325614, 3025614, 2305614, 3205614,  253614, 2053614,  523614, 5023614, 2503614, 5203614,
+       352614, 3052614,  532614, 5032614, 3502614, 5302614, 2350614, 3250614, 2530614, 5230614, 3520614, 5320614,
+       236514, 2036514,  326514, 3026514, 2306514, 3206514,  263514, 2063514,  623514, 6023514, 2603514, 6203514,
+       362514, 3062514,  632514, 6032514, 3602514, 6302514, 2360514, 3260514, 2630514, 6230514, 3620514, 6320514,
+       256314, 2056314,  526314, 5026314, 2506314, 5206314,  265314, 2065314,  625314, 6025314, 2605314, 6205314,
+       562314, 5062314,  652314, 6052314, 5602314, 6502314, 2560314, 5260314, 2650314, 6250314, 5620314, 6520314,
+       356214, 3056214,  536214, 5036214, 3506214, 5306214,  365214, 3065214,  635214, 6035214, 3605214, 6305214,
+       563214, 5063214,  653214, 6053214, 5603214, 6503214, 3560214, 5360214, 3650214, 6350214, 5630214, 6530214,
+      2356014, 3256014, 2536014, 5236014, 3526014, 5326014, 2365014, 3265014, 2635014, 6235014, 3625014, 6325014,
+      2563014, 5263014, 2653014, 6253014, 5623014, 6523014, 3562014, 5362014, 3652014, 6352014, 5632014, 6532014,
+      1235604, 2135604, 1325604, 3125604, 2315604, 3215604, 1253604, 2153604, 1523604, 5123604, 2513604, 5213604,
+      1352604, 3152604, 1532604, 5132604, 3512604, 5312604, 2351604, 3251604, 2531604, 5231604, 3521604, 5321604,
+      1236504, 2136504, 1326504, 3126504, 2316504, 3216504, 1263504, 2163504, 1623504, 6123504, 2613504, 6213504,
+      1362504, 3162504, 1632504, 6132504, 3612504, 6312504, 2361504, 3261504, 2631504, 6231504, 3621504, 6321504,
+      1256304, 2156304, 1526304, 5126304, 2516304, 5216304, 1265304, 2165304, 1625304, 6125304, 2615304, 6215304,
+      1562304, 5162304, 1652304, 6152304, 5612304, 6512304, 2561304, 5261304, 2651304, 6251304, 5621304, 6521304,
+      1356204, 3156204, 1536204, 5136204, 3516204, 5316204, 1365204, 3165204, 1635204, 6135204, 3615204, 6315204,
+      1563204, 5163204, 1653204, 6153204, 5613204, 6513204, 3561204, 5361204, 3651204, 6351204, 5631204, 6531204,
+      2356104, 3256104, 2536104, 5236104, 3526104, 5326104, 2365104, 3265104, 2635104, 6235104, 3625104, 6325104,
+      2563104, 5263104, 2653104, 6253104, 5623104, 6523104, 3562104, 5362104, 3652104, 6352104, 5632104, 6532104,
+       124563, 1024563,  214563, 2014563, 1204563, 2104563,  142563, 1042563,  412563, 4012563, 1402563, 4102563,
+       241563, 2041563,  421563, 4021563, 2401563, 4201563, 1240563, 2140563, 1420563, 4120563, 2410563, 4210563,
+       125463, 1025463,  215463, 2015463, 1205463, 2105463,  152463, 1052463,  512463, 5012463, 1502463, 5102463,
+       251463, 2051463,  521463, 5021463, 2501463, 5201463, 1250463, 2150463, 1520463, 5120463, 2510463, 5210463,
+       145263, 1045263,  415263, 4015263, 1405263, 4105263,  154263, 1054263,  514263, 5014263, 1504263, 5104263,
+       451263, 4051263,  541263, 5041263, 4501263, 5401263, 1450263, 4150263, 1540263, 5140263, 4510263, 5410263,
+       245163, 2045163,  425163, 4025163, 2405163, 4205163,  254163, 2054163,  524163, 5024163, 2504163, 5204163,
+       452163, 4052163,  542163, 5042163, 4502163, 5402163, 2450163, 4250163, 2540163, 5240163, 4520163, 5420163,
+      1245063, 2145063, 1425063, 4125063, 2415063, 4215063, 1254063, 2154063, 1524063, 5124063, 2514063, 5214063,
+      1452063, 4152063, 1542063, 5142063, 4512063, 5412063, 2451063, 4251063, 2541063, 5241063, 4521063, 5421063,
+       124653, 1024653,  214653, 2014653, 1204653, 2104653,  142653, 1042653,  412653, 4012653, 1402653, 4102653,
+       241653, 2041653,  421653, 4021653, 2401653, 4201653, 1240653, 2140653, 1420653, 4120653, 2410653, 4210653,
+       126453, 1026453,  216453, 2016453, 1206453, 2106453,  162453, 1062453,  612453, 6012453, 1602453, 6102453,
+       261453, 2061453,  621453, 6021453, 2601453, 6201453, 1260453, 2160453, 1620453, 6120453, 2610453, 6210453,
+       146253, 1046253,  416253, 4016253, 1406253, 4106253,  164253, 1064253,  614253, 6014253, 1604253, 6104253,
+       461253, 4061253,  641253, 6041253, 4601253, 6401253, 1460253, 4160253, 1640253, 6140253, 4610253, 6410253,
+       246153, 2046153,  426153, 4026153, 2406153, 4206153,  264153, 2064153,  624153, 6024153, 2604153, 6204153,
+       462153, 4062153,  642153, 6042153, 4602153, 6402153, 2460153, 4260153, 2640153, 6240153, 4620153, 6420153,
+      1246053, 2146053, 1426053, 4126053, 2416053, 4216053, 1264053, 2164053, 1624053, 6124053, 2614053, 6214053,
+      1462053, 4162053, 1642053, 6142053, 4612053, 6412053, 2461053, 4261053, 2641053, 6241053, 4621053, 6421053,
+       125643, 1025643,  215643, 2015643, 1205643, 2105643,  152643, 1052643,  512643, 5012643, 1502643, 5102643,
+       251643, 2051643,  521643, 5021643, 2501643, 5201643, 1250643, 2150643, 1520643, 5120643, 2510643, 5210643,
+       126543, 1026543,  216543, 2016543, 1206543, 2106543,  162543, 1062543,  612543, 6012543, 1602543, 6102543,
+       261543, 2061543,  621543, 6021543, 2601543, 6201543, 1260543, 2160543, 1620543, 6120543, 2610543, 6210543,
+       156243, 1056243,  516243, 5016243, 1506243, 5106243,  165243, 1065243,  615243, 6015243, 1605243, 6105243,
+       561243, 5061243,  651243, 6051243, 5601243, 6501243, 1560243, 5160243, 1650243, 6150243, 5610243, 6510243,
+       256143, 2056143,  526143, 5026143, 2506143, 5206143,  265143, 2065143,  625143, 6025143, 2605143, 6205143,
+       562143, 5062143,  652143, 6052143, 5602143, 6502143, 2560143, 5260143, 2650143, 6250143, 5620143, 6520143,
+      1256043, 2156043, 1526043, 5126043, 2516043, 5216043, 1265043, 2165043, 1625043, 6125043, 2615043, 6215043,
+      1562043, 5162043, 1652043, 6152043, 5612043, 6512043, 2561043, 5261043, 2651043, 6251043, 5621043, 6521043,
+       145623, 1045623,  415623, 4015623, 1405623, 4105623,  154623, 1054623,  514623, 5014623, 1504623, 5104623,
+       451623, 4051623,  541623, 5041623, 4501623, 5401623, 1450623, 4150623, 1540623, 5140623, 4510623, 5410623,
+       146523, 1046523,  416523, 4016523, 1406523, 4106523,  164523, 1064523,  614523, 6014523, 1604523, 6104523,
+       461523, 4061523,  641523, 6041523, 4601523, 6401523, 1460523, 4160523, 1640523, 6140523, 4610523, 6410523,
+       156423, 1056423,  516423, 5016423, 1506423, 5106423,  165423, 1065423,  615423, 6015423, 1605423, 6105423,
+       561423, 5061423,  651423, 6051423, 5601423, 6501423, 1560423, 5160423, 1650423, 6150423, 5610423, 6510423,
+       456123, 4056123,  546123, 5046123, 4506123, 5406123,  465123, 4065123,  645123, 6045123, 4605123, 6405123,
+       564123, 5064123,  654123, 6054123, 5604123, 6504123, 4560123, 5460123, 4650123, 6450123, 5640123, 6540123,
+      1456023, 4156023, 1546023, 5146023, 4516023, 5416023, 1465023, 4165023, 1645023, 6145023, 4615023, 6415023,
+      1564023, 5164023, 1654023, 6154023, 5614023, 6514023, 4561023, 5461023, 4651023, 6451023, 5641023, 6541023,
+       245613, 2045613,  425613, 4025613, 2405613, 4205613,  254613, 2054613,  524613, 5024613, 2504613, 5204613,
+       452613, 4052613,  542613, 5042613, 4502613, 5402613, 2450613, 4250613, 2540613, 5240613, 4520613, 5420613,
+       246513, 2046513,  426513, 4026513, 2406513, 4206513,  264513, 2064513,  624513, 6024513, 2604513, 6204513,
+       462513, 4062513,  642513, 6042513, 4602513, 6402513, 2460513, 4260513, 2640513, 6240513, 4620513, 6420513,
+       256413, 2056413,  526413, 5026413, 2506413, 5206413,  265413, 2065413,  625413, 6025413, 2605413, 6205413,
+       562413, 5062413,  652413, 6052413, 5602413, 6502413, 2560413, 5260413, 2650413, 6250413, 5620413, 6520413,
+       456213, 4056213,  546213, 5046213, 4506213, 5406213,  465213, 4065213,  645213, 6045213, 4605213, 6405213,
+       564213, 5064213,  654213, 6054213, 5604213, 6504213, 4560213, 5460213, 4650213, 6450213, 5640213, 6540213,
+      2456013, 4256013, 2546013, 5246013, 4526013, 5426013, 2465013, 4265013, 2645013, 6245013, 4625013, 6425013,
+      2564013, 5264013, 2654013, 6254013, 5624013, 6524013, 4562013, 5462013, 4652013, 6452013, 5642013, 6542013,
+      1245603, 2145603, 1425603, 4125603, 2415603, 4215603, 1254603, 2154603, 1524603, 5124603, 2514603, 5214603,
+      1452603, 4152603, 1542603, 5142603, 4512603, 5412603, 2451603, 4251603, 2541603, 5241603, 4521603, 5421603,
+      1246503, 2146503, 1426503, 4126503, 2416503, 4216503, 1264503, 2164503, 1624503, 6124503, 2614503, 6214503,
+      1462503, 4162503, 1642503, 6142503, 4612503, 6412503, 2461503, 4261503, 2641503, 6241503, 4621503, 6421503,
+      1256403, 2156403, 1526403, 5126403, 2516403, 5216403, 1265403, 2165403, 1625403, 6125403, 2615403, 6215403,
+      1562403, 5162403, 1652403, 6152403, 5612403, 6512403, 2561403, 5261403, 2651403, 6251403, 5621403, 6521403,
+      1456203, 4156203, 1546203, 5146203, 4516203, 5416203, 1465203, 4165203, 1645203, 6145203, 4615203, 6415203,
+      1564203, 5164203, 1654203, 6154203, 5614203, 6514203, 4561203, 5461203, 4651203, 6451203, 5641203, 6541203,
+      2456103, 4256103, 2546103, 5246103, 4526103, 5426103, 2465103, 4265103, 2645103, 6245103, 4625103, 6425103,
+      2564103, 5264103, 2654103, 6254103, 5624103, 6524103, 4562103, 5462103, 4652103, 6452103, 5642103, 6542103,
+       134562, 1034562,  314562, 3014562, 1304562, 3104562,  143562, 1043562,  413562, 4013562, 1403562, 4103562,
+       341562, 3041562,  431562, 4031562, 3401562, 4301562, 1340562, 3140562, 1430562, 4130562, 3410562, 4310562,
+       135462, 1035462,  315462, 3015462, 1305462, 3105462,  153462, 1053462,  513462, 5013462, 1503462, 5103462,
+       351462, 3051462,  531462, 5031462, 3501462, 5301462, 1350462, 3150462, 1530462, 5130462, 3510462, 5310462,
+       145362, 1045362,  415362, 4015362, 1405362, 4105362,  154362, 1054362,  514362, 5014362, 1504362, 5104362,
+       451362, 4051362,  541362, 5041362, 4501362, 5401362, 1450362, 4150362, 1540362, 5140362, 4510362, 5410362,
+       345162, 3045162,  435162, 4035162, 3405162, 4305162,  354162, 3054162,  534162, 5034162, 3504162, 5304162,
+       453162, 4053162,  543162, 5043162, 4503162, 5403162, 3450162, 4350162, 3540162, 5340162, 4530162, 5430162,
+      1345062, 3145062, 1435062, 4135062, 3415062, 4315062, 1354062, 3154062, 1534062, 5134062, 3514062, 5314062,
+      1453062, 4153062, 1543062, 5143062, 4513062, 5413062, 3451062, 4351062, 3541062, 5341062, 4531062, 5431062,
+       134652, 1034652,  314652, 3014652, 1304652, 3104652,  143652, 1043652,  413652, 4013652, 1403652, 4103652,
+       341652, 3041652,  431652, 4031652, 3401652, 4301652, 1340652, 3140652, 1430652, 4130652, 3410652, 4310652,
+       136452, 1036452,  316452, 3016452, 1306452, 3106452,  163452, 1063452,  613452, 6013452, 1603452, 6103452,
+       361452, 3061452,  631452, 6031452, 3601452, 6301452, 1360452, 3160452, 1630452, 6130452, 3610452, 6310452,
+       146352, 1046352,  416352, 4016352, 1406352, 4106352,  164352, 1064352,  614352, 6014352, 1604352, 6104352,
+       461352, 4061352,  641352, 6041352, 4601352, 6401352, 1460352, 4160352, 1640352, 6140352, 4610352, 6410352,
+       346152, 3046152,  436152, 4036152, 3406152, 4306152,  364152, 3064152,  634152, 6034152, 3604152, 6304152,
+       463152, 4063152,  643152, 6043152, 4603152, 6403152, 3460152, 4360152, 3640152, 6340152, 4630152, 6430152,
+      1346052, 3146052, 1436052, 4136052, 3416052, 4316052, 1364052, 3164052, 1634052, 6134052, 3614052, 6314052,
+      1463052, 4163052, 1643052, 6143052, 4613052, 6413052, 3461052, 4361052, 3641052, 6341052, 4631052, 6431052,
+       135642, 1035642,  315642, 3015642, 1305642, 3105642,  153642, 1053642,  513642, 5013642, 1503642, 5103642,
+       351642, 3051642,  531642, 5031642, 3501642, 5301642, 1350642, 3150642, 1530642, 5130642, 3510642, 5310642,
+       136542, 1036542,  316542, 3016542, 1306542, 3106542,  163542, 1063542,  613542, 6013542, 1603542, 6103542,
+       361542, 3061542,  631542, 6031542, 3601542, 6301542, 1360542, 3160542, 1630542, 6130542, 3610542, 6310542,
+       156342, 1056342,  516342, 5016342, 1506342, 5106342,  165342, 1065342,  615342, 6015342, 1605342, 6105342,
+       561342, 5061342,  651342, 6051342, 5601342, 6501342, 1560342, 5160342, 1650342, 6150342, 5610342, 6510342,
+       356142, 3056142,  536142, 5036142, 3506142, 5306142,  365142, 3065142,  635142, 6035142, 3605142, 6305142,
+       563142, 5063142,  653142, 6053142, 5603142, 6503142, 3560142, 5360142, 3650142, 6350142, 5630142, 6530142,
+      1356042, 3156042, 1536042, 5136042, 3516042, 5316042, 1365042, 3165042, 1635042, 6135042, 3615042, 6315042,
+      1563042, 5163042, 1653042, 6153042, 5613042, 6513042, 3561042, 5361042, 3651042, 6351042, 5631042, 6531042,
+       145632, 1045632,  415632, 4015632, 1405632, 4105632,  154632, 1054632,  514632, 5014632, 1504632, 5104632,
+       451632, 4051632,  541632, 5041632, 4501632, 5401632, 1450632, 4150632, 1540632, 5140632, 4510632, 5410632,
+       146532, 1046532,  416532, 4016532, 1406532, 4106532,  164532, 1064532,  614532, 6014532, 1604532, 6104532,
+       461532, 4061532,  641532, 6041532, 4601532, 6401532, 1460532, 4160532, 1640532, 6140532, 4610532, 6410532,
+       156432, 1056432,  516432, 5016432, 1506432, 5106432,  165432, 1065432,  615432, 6015432, 1605432, 6105432,
+       561432, 5061432,  651432, 6051432, 5601432, 6501432, 1560432, 5160432, 1650432, 6150432, 5610432, 6510432,
+       456132, 4056132,  546132, 5046132, 4506132, 5406132,  465132, 4065132,  645132, 6045132, 4605132, 6405132,
+       564132, 5064132,  654132, 6054132, 5604132, 6504132, 4560132, 5460132, 4650132, 6450132, 5640132, 6540132,
+      1456032, 4156032, 1546032, 5146032, 4516032, 5416032, 1465032, 4165032, 1645032, 6145032, 4615032, 6415032,
+      1564032, 5164032, 1654032, 6154032, 5614032, 6514032, 4561032, 5461032, 4651032, 6451032, 5641032, 6541032,
+       345612, 3045612,  435612, 4035612, 3405612, 4305612,  354612, 3054612,  534612, 5034612, 3504612, 5304612,
+       453612, 4053612,  543612, 5043612, 4503612, 5403612, 3450612, 4350612, 3540612, 5340612, 4530612, 5430612,
+       346512, 3046512,  436512, 4036512, 3406512, 4306512,  364512, 3064512,  634512, 6034512, 3604512, 6304512,
+       463512, 4063512,  643512, 6043512, 4603512, 6403512, 3460512, 4360512, 3640512, 6340512, 4630512, 6430512,
+       356412, 3056412,  536412, 5036412, 3506412, 5306412,  365412, 3065412,  635412, 6035412, 3605412, 6305412,
+       563412, 5063412,  653412, 6053412, 5603412, 6503412, 3560412, 5360412, 3650412, 6350412, 5630412, 6530412,
+       456312, 4056312,  546312, 5046312, 4506312, 5406312,  465312, 4065312,  645312, 6045312, 4605312, 6405312,
+       564312, 5064312,  654312, 6054312, 5604312, 6504312, 4560312, 5460312, 4650312, 6450312, 5640312, 6540312,
+      3456012, 4356012, 3546012, 5346012, 4536012, 5436012, 3465012, 4365012, 3645012, 6345012, 4635012, 6435012,
+      3564012, 5364012, 3654012, 6354012, 5634012, 6534012, 4563012, 5463012, 4653012, 6453012, 5643012, 6543012,
+      1345602, 3145602, 1435602, 4135602, 3415602, 4315602, 1354602, 3154602, 1534602, 5134602, 3514602, 5314602,
+      1453602, 4153602, 1543602, 5143602, 4513602, 5413602, 3451602, 4351602, 3541602, 5341602, 4531602, 5431602,
+      1346502, 3146502, 1436502, 4136502, 3416502, 4316502, 1364502, 3164502, 1634502, 6134502, 3614502, 6314502,
+      1463502, 4163502, 1643502, 6143502, 4613502, 6413502, 3461502, 4361502, 3641502, 6341502, 4631502, 6431502,
+      1356402, 3156402, 1536402, 5136402, 3516402, 5316402, 1365402, 3165402, 1635402, 6135402, 3615402, 6315402,
+      1563402, 5163402, 1653402, 6153402, 5613402, 6513402, 3561402, 5361402, 3651402, 6351402, 5631402, 6531402,
+      1456302, 4156302, 1546302, 5146302, 4516302, 5416302, 1465302, 4165302, 1645302, 6145302, 4615302, 6415302,
+      1564302, 5164302, 1654302, 6154302, 5614302, 6514302, 4561302, 5461302, 4651302, 6451302, 5641302, 6541302,
+      3456102, 4356102, 3546102, 5346102, 4536102, 5436102, 3465102, 4365102, 3645102, 6345102, 4635102, 6435102,
+      3564102, 5364102, 3654102, 6354102, 5634102, 6534102, 4563102, 5463102, 4653102, 6453102, 5643102, 6543102,
+       234561, 2034561,  324561, 3024561, 2304561, 3204561,  243561, 2043561,  423561, 4023561, 2403561, 4203561,
+       342561, 3042561,  432561, 4032561, 3402561, 4302561, 2340561, 3240561, 2430561, 4230561, 3420561, 4320561,
+       235461, 2035461,  325461, 3025461, 2305461, 3205461,  253461, 2053461,  523461, 5023461, 2503461, 5203461,
+       352461, 3052461,  532461, 5032461, 3502461, 5302461, 2350461, 3250461, 2530461, 5230461, 3520461, 5320461,
+       245361, 2045361,  425361, 4025361, 2405361, 4205361,  254361, 2054361,  524361, 5024361, 2504361, 5204361,
+       452361, 4052361,  542361, 5042361, 4502361, 5402361, 2450361, 4250361, 2540361, 5240361, 4520361, 5420361,
+       345261, 3045261,  435261, 4035261, 3405261, 4305261,  354261, 3054261,  534261, 5034261, 3504261, 5304261,
+       453261, 4053261,  543261, 5043261, 4503261, 5403261, 3450261, 4350261, 3540261, 5340261, 4530261, 5430261,
+      2345061, 3245061, 2435061, 4235061, 3425061, 4325061, 2354061, 3254061, 2534061, 5234061, 3524061, 5324061,
+      2453061, 4253061, 2543061, 5243061, 4523061, 5423061, 3452061, 4352061, 3542061, 5342061, 4532061, 5432061,
+       234651, 2034651,  324651, 3024651, 2304651, 3204651,  243651, 2043651,  423651, 4023651, 2403651, 4203651,
+       342651, 3042651,  432651, 4032651, 3402651, 4302651, 2340651, 3240651, 2430651, 4230651, 3420651, 4320651,
+       236451, 2036451,  326451, 3026451, 2306451, 3206451,  263451, 2063451,  623451, 6023451, 2603451, 6203451,
+       362451, 3062451,  632451, 6032451, 3602451, 6302451, 2360451, 3260451, 2630451, 6230451, 3620451, 6320451,
+       246351, 2046351,  426351, 4026351, 2406351, 4206351,  264351, 2064351,  624351, 6024351, 2604351, 6204351,
+       462351, 4062351,  642351, 6042351, 4602351, 6402351, 2460351, 4260351, 2640351, 6240351, 4620351, 6420351,
+       346251, 3046251,  436251, 4036251, 3406251, 4306251,  364251, 3064251,  634251, 6034251, 3604251, 6304251,
+       463251, 4063251,  643251, 6043251, 4603251, 6403251, 3460251, 4360251, 3640251, 6340251, 4630251, 6430251,
+      2346051, 3246051, 2436051, 4236051, 3426051, 4326051, 2364051, 3264051, 2634051, 6234051, 3624051, 6324051,
+      2463051, 4263051, 2643051, 6243051, 4623051, 6423051, 3462051, 4362051, 3642051, 6342051, 4632051, 6432051,
+       235641, 2035641,  325641, 3025641, 2305641, 3205641,  253641, 2053641,  523641, 5023641, 2503641, 5203641,
+       352641, 3052641,  532641, 5032641, 3502641, 5302641, 2350641, 3250641, 2530641, 5230641, 3520641, 5320641,
+       236541, 2036541,  326541, 3026541, 2306541, 3206541,  263541, 2063541,  623541, 6023541, 2603541, 6203541,
+       362541, 3062541,  632541, 6032541, 3602541, 6302541, 2360541, 3260541, 2630541, 6230541, 3620541, 6320541,
+       256341, 2056341,  526341, 5026341, 2506341, 5206341,  265341, 2065341,  625341, 6025341, 2605341, 6205341,
+       562341, 5062341,  652341, 6052341, 5602341, 6502341, 2560341, 5260341, 2650341, 6250341, 5620341, 6520341,
+       356241, 3056241,  536241, 5036241, 3506241, 5306241,  365241, 3065241,  635241, 6035241, 3605241, 6305241,
+       563241, 5063241,  653241, 6053241, 5603241, 6503241, 3560241, 5360241, 3650241, 6350241, 5630241, 6530241,
+      2356041, 3256041, 2536041, 5236041, 3526041, 5326041, 2365041, 3265041, 2635041, 6235041, 3625041, 6325041,
+      2563041, 5263041, 2653041, 6253041, 5623041, 6523041, 3562041, 5362041, 3652041, 6352041, 5632041, 6532041,
+       245631, 2045631,  425631, 4025631, 2405631, 4205631,  254631, 2054631,  524631, 5024631, 2504631, 5204631,
+       452631, 4052631,  542631, 5042631, 4502631, 5402631, 2450631, 4250631, 2540631, 5240631, 4520631, 5420631,
+       246531, 2046531,  426531, 4026531, 2406531, 4206531,  264531, 2064531,  624531, 6024531, 2604531, 6204531,
+       462531, 4062531,  642531, 6042531, 4602531, 6402531, 2460531, 4260531, 2640531, 6240531, 4620531, 6420531,
+       256431, 2056431,  526431, 5026431, 2506431, 5206431,  265431, 2065431,  625431, 6025431, 2605431, 6205431,
+       562431, 5062431,  652431, 6052431, 5602431, 6502431, 2560431, 5260431, 2650431, 6250431, 5620431, 6520431,
+       456231, 4056231,  546231, 5046231, 4506231, 5406231,  465231, 4065231,  645231, 6045231, 4605231, 6405231,
+       564231, 5064231,  654231, 6054231, 5604231, 6504231, 4560231, 5460231, 4650231, 6450231, 5640231, 6540231,
+      2456031, 4256031, 2546031, 5246031, 4526031, 5426031, 2465031, 4265031, 2645031, 6245031, 4625031, 6425031,
+      2564031, 5264031, 2654031, 6254031, 5624031, 6524031, 4562031, 5462031, 4652031, 6452031, 5642031, 6542031,
+       345621, 3045621,  435621, 4035621, 3405621, 4305621,  354621, 3054621,  534621, 5034621, 3504621, 5304621,
+       453621, 4053621,  543621, 5043621, 4503621, 5403621, 3450621, 4350621, 3540621, 5340621, 4530621, 5430621,
+       346521, 3046521,  436521, 4036521, 3406521, 4306521,  364521, 3064521,  634521, 6034521, 3604521, 6304521,
+       463521, 4063521,  643521, 6043521, 4603521, 6403521, 3460521, 4360521, 3640521, 6340521, 4630521, 6430521,
+       356421, 3056421,  536421, 5036421, 3506421, 5306421,  365421, 3065421,  635421, 6035421, 3605421, 6305421,
+       563421, 5063421,  653421, 6053421, 5603421, 6503421, 3560421, 5360421, 3650421, 6350421, 5630421, 6530421,
+       456321, 4056321,  546321, 5046321, 4506321, 5406321,  465321, 4065321,  645321, 6045321, 4605321, 6405321,
+       564321, 5064321,  654321, 6054321, 5604321, 6504321, 4560321, 5460321, 4650321, 6450321, 5640321, 6540321,
+      3456021, 4356021, 3546021, 5346021, 4536021, 5436021, 3465021, 4365021, 3645021, 6345021, 4635021, 6435021,
+      3564021, 5364021, 3654021, 6354021, 5634021, 6534021, 4563021, 5463021, 4653021, 6453021, 5643021, 6543021,
+      2345601, 3245601, 2435601, 4235601, 3425601, 4325601, 2354601, 3254601, 2534601, 5234601, 3524601, 5324601,
+      2453601, 4253601, 2543601, 5243601, 4523601, 5423601, 3452601, 4352601, 3542601, 5342601, 4532601, 5432601,
+      2346501, 3246501, 2436501, 4236501, 3426501, 4326501, 2364501, 3264501, 2634501, 6234501, 3624501, 6324501,
+      2463501, 4263501, 2643501, 6243501, 4623501, 6423501, 3462501, 4362501, 3642501, 6342501, 4632501, 6432501,
+      2356401, 3256401, 2536401, 5236401, 3526401, 5326401, 2365401, 3265401, 2635401, 6235401, 3625401, 6325401,
+      2563401, 5263401, 2653401, 6253401, 5623401, 6523401, 3562401, 5362401, 3652401, 6352401, 5632401, 6532401,
+      2456301, 4256301, 2546301, 5246301, 4526301, 5426301, 2465301, 4265301, 2645301, 6245301, 4625301, 6425301,
+      2564301, 5264301, 2654301, 6254301, 5624301, 6524301, 4562301, 5462301, 4652301, 6452301, 5642301, 6542301,
+      3456201, 4356201, 3546201, 5346201, 4536201, 5436201, 3465201, 4365201, 3645201, 6345201, 4635201, 6435201,
+      3564201, 5364201, 3654201, 6354201, 5634201, 6534201, 4563201, 5463201, 4653201, 6453201, 5643201, 6543201,
+      1234560, 2134560, 1324560, 3124560, 2314560, 3214560, 1243560, 2143560, 1423560, 4123560, 2413560, 4213560,
+      1342560, 3142560, 1432560, 4132560, 3412560, 4312560, 2341560, 3241560, 2431560, 4231560, 3421560, 4321560,
+      1235460, 2135460, 1325460, 3125460, 2315460, 3215460, 1253460, 2153460, 1523460, 5123460, 2513460, 5213460,
+      1352460, 3152460, 1532460, 5132460, 3512460, 5312460, 2351460, 3251460, 2531460, 5231460, 3521460, 5321460,
+      1245360, 2145360, 1425360, 4125360, 2415360, 4215360, 1254360, 2154360, 1524360, 5124360, 2514360, 5214360,
+      1452360, 4152360, 1542360, 5142360, 4512360, 5412360, 2451360, 4251360, 2541360, 5241360, 4521360, 5421360,
+      1345260, 3145260, 1435260, 4135260, 3415260, 4315260, 1354260, 3154260, 1534260, 5134260, 3514260, 5314260,
+      1453260, 4153260, 1543260, 5143260, 4513260, 5413260, 3451260, 4351260, 3541260, 5341260, 4531260, 5431260,
+      2345160, 3245160, 2435160, 4235160, 3425160, 4325160, 2354160, 3254160, 2534160, 5234160, 3524160, 5324160,
+      2453160, 4253160, 2543160, 5243160, 4523160, 5423160, 3452160, 4352160, 3542160, 5342160, 4532160, 5432160,
+      1234650, 2134650, 1324650, 3124650, 2314650, 3214650, 1243650, 2143650, 1423650, 4123650, 2413650, 4213650,
+      1342650, 3142650, 1432650, 4132650, 3412650, 4312650, 2341650, 3241650, 2431650, 4231650, 3421650, 4321650,
+      1236450, 2136450, 1326450, 3126450, 2316450, 3216450, 1263450, 2163450, 1623450, 6123450, 2613450, 6213450,
+      1362450, 3162450, 1632450, 6132450, 3612450, 6312450, 2361450, 3261450, 2631450, 6231450, 3621450, 6321450,
+      1246350, 2146350, 1426350, 4126350, 2416350, 4216350, 1264350, 2164350, 1624350, 6124350, 2614350, 6214350,
+      1462350, 4162350, 1642350, 6142350, 4612350, 6412350, 2461350, 4261350, 2641350, 6241350, 4621350, 6421350,
+      1346250, 3146250, 1436250, 4136250, 3416250, 4316250, 1364250, 3164250, 1634250, 6134250, 3614250, 6314250,
+      1463250, 4163250, 1643250, 6143250, 4613250, 6413250, 3461250, 4361250, 3641250, 6341250, 4631250, 6431250,
+      2346150, 3246150, 2436150, 4236150, 3426150, 4326150, 2364150, 3264150, 2634150, 6234150, 3624150, 6324150,
+      2463150, 4263150, 2643150, 6243150, 4623150, 6423150, 3462150, 4362150, 3642150, 6342150, 4632150, 6432150,
+      1235640, 2135640, 1325640, 3125640, 2315640, 3215640, 1253640, 2153640, 1523640, 5123640, 2513640, 5213640,
+      1352640, 3152640, 1532640, 5132640, 3512640, 5312640, 2351640, 3251640, 2531640, 5231640, 3521640, 5321640,
+      1236540, 2136540, 1326540, 3126540, 2316540, 3216540, 1263540, 2163540, 1623540, 6123540, 2613540, 6213540,
+      1362540, 3162540, 1632540, 6132540, 3612540, 6312540, 2361540, 3261540, 2631540, 6231540, 3621540, 6321540,
+      1256340, 2156340, 1526340, 5126340, 2516340, 5216340, 1265340, 2165340, 1625340, 6125340, 2615340, 6215340,
+      1562340, 5162340, 1652340, 6152340, 5612340, 6512340, 2561340, 5261340, 2651340, 6251340, 5621340, 6521340,
+      1356240, 3156240, 1536240, 5136240, 3516240, 5316240, 1365240, 3165240, 1635240, 6135240, 3615240, 6315240,
+      1563240, 5163240, 1653240, 6153240, 5613240, 6513240, 3561240, 5361240, 3651240, 6351240, 5631240, 6531240,
+      2356140, 3256140, 2536140, 5236140, 3526140, 5326140, 2365140, 3265140, 2635140, 6235140, 3625140, 6325140,
+      2563140, 5263140, 2653140, 6253140, 5623140, 6523140, 3562140, 5362140, 3652140, 6352140, 5632140, 6532140,
+      1245630, 2145630, 1425630, 4125630, 2415630, 4215630, 1254630, 2154630, 1524630, 5124630, 2514630, 5214630,
+      1452630, 4152630, 1542630, 5142630, 4512630, 5412630, 2451630, 4251630, 2541630, 5241630, 4521630, 5421630,
+      1246530, 2146530, 1426530, 4126530, 2416530, 4216530, 1264530, 2164530, 1624530, 6124530, 2614530, 6214530,
+      1462530, 4162530, 1642530, 6142530, 4612530, 6412530, 2461530, 4261530, 2641530, 6241530, 4621530, 6421530,
+      1256430, 2156430, 1526430, 5126430, 2516430, 5216430, 1265430, 2165430, 1625430, 6125430, 2615430, 6215430,
+      1562430, 5162430, 1652430, 6152430, 5612430, 6512430, 2561430, 5261430, 2651430, 6251430, 5621430, 6521430,
+      1456230, 4156230, 1546230, 5146230, 4516230, 5416230, 1465230, 4165230, 1645230, 6145230, 4615230, 6415230,
+      1564230, 5164230, 1654230, 6154230, 5614230, 6514230, 4561230, 5461230, 4651230, 6451230, 5641230, 6541230,
+      2456130, 4256130, 2546130, 5246130, 4526130, 5426130, 2465130, 4265130, 2645130, 6245130, 4625130, 6425130,
+      2564130, 5264130, 2654130, 6254130, 5624130, 6524130, 4562130, 5462130, 4652130, 6452130, 5642130, 6542130,
+      1345620, 3145620, 1435620, 4135620, 3415620, 4315620, 1354620, 3154620, 1534620, 5134620, 3514620, 5314620,
+      1453620, 4153620, 1543620, 5143620, 4513620, 5413620, 3451620, 4351620, 3541620, 5341620, 4531620, 5431620,
+      1346520, 3146520, 1436520, 4136520, 3416520, 4316520, 1364520, 3164520, 1634520, 6134520, 3614520, 6314520,
+      1463520, 4163520, 1643520, 6143520, 4613520, 6413520, 3461520, 4361520, 3641520, 6341520, 4631520, 6431520,
+      1356420, 3156420, 1536420, 5136420, 3516420, 5316420, 1365420, 3165420, 1635420, 6135420, 3615420, 6315420,
+      1563420, 5163420, 1653420, 6153420, 5613420, 6513420, 3561420, 5361420, 3651420, 6351420, 5631420, 6531420,
+      1456320, 4156320, 1546320, 5146320, 4516320, 5416320, 1465320, 4165320, 1645320, 6145320, 4615320, 6415320,
+      1564320, 5164320, 1654320, 6154320, 5614320, 6514320, 4561320, 5461320, 4651320, 6451320, 5641320, 6541320,
+      3456120, 4356120, 3546120, 5346120, 4536120, 5436120, 3465120, 4365120, 3645120, 6345120, 4635120, 6435120,
+      3564120, 5364120, 3654120, 6354120, 5634120, 6534120, 4563120, 5463120, 4653120, 6453120, 5643120, 6543120,
+      2345610, 3245610, 2435610, 4235610, 3425610, 4325610, 2354610, 3254610, 2534610, 5234610, 3524610, 5324610,
+      2453610, 4253610, 2543610, 5243610, 4523610, 5423610, 3452610, 4352610, 3542610, 5342610, 4532610, 5432610,
+      2346510, 3246510, 2436510, 4236510, 3426510, 4326510, 2364510, 3264510, 2634510, 6234510, 3624510, 6324510,
+      2463510, 4263510, 2643510, 6243510, 4623510, 6423510, 3462510, 4362510, 3642510, 6342510, 4632510, 6432510,
+      2356410, 3256410, 2536410, 5236410, 3526410, 5326410, 2365410, 3265410, 2635410, 6235410, 3625410, 6325410,
+      2563410, 5263410, 2653410, 6253410, 5623410, 6523410, 3562410, 5362410, 3652410, 6352410, 5632410, 6532410,
+      2456310, 4256310, 2546310, 5246310, 4526310, 5426310, 2465310, 4265310, 2645310, 6245310, 4625310, 6425310,
+      2564310, 5264310, 2654310, 6254310, 5624310, 6524310, 4562310, 5462310, 4652310, 6452310, 5642310, 6542310,
+      3456210, 4356210, 3546210, 5346210, 4536210, 5436210, 3465210, 4365210, 3645210, 6345210, 4635210, 6435210,
+      3564210, 5364210, 3654210, 6354210, 5634210, 6534210, 4563210, 5463210, 4653210, 6453210, 5643210, 6543210
+    };
+    std::map<uint64_t, int> expected;
+    for (std::size_t i = 0; i < 5040; i++)
+      expected[pre_expected[i]] = 0; // flags are 0, everything is symmetric here
+
+    VERIFY(isDynGroup(group));
+    VERIFY_IS_EQUAL(group.size(), 5040u);
+    VERIFY_IS_EQUAL(group.globalFlags(), 0);
+    group.apply<checkIdx, int>(identity7, 0, found, expected);
+    VERIFY_IS_EQUAL(found.size(), 5040u);
+  }
+}
+
+static void test_tensor_epsilon()
+{
+  SGroup<AntiSymmetry<0,1>, AntiSymmetry<1,2>> sym;
+  Tensor<int, 3> epsilon(3,3,3);
+
+  epsilon.setZero();
+  sym(epsilon, 0, 1, 2) = 1;
+
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 3; j++) {
+      for (int k = 0; k < 3; k++) {
+        VERIFY_IS_EQUAL((epsilon(i,j,k)), (- (j - i) * (k - j) * (i - k) / 2) );
+      }
+    }
+  }
+}
+
+static void test_tensor_sym()
+{
+  SGroup<Symmetry<0,1>, Symmetry<2,3>> sym;
+  Tensor<int, 4> t(10,10,10,10);
+
+  t.setZero();
+
+  for (int l = 0; l < 10; l++) {
+    for (int k = l; k < 10; k++) {
+      for (int j = 0; j < 10; j++) {
+        for (int i = j; i < 10; i++) {
+          sym(t, i, j, k, l) = (i + j) * (k + l);
+        }
+      }
+    }
+  }
+
+  for (int l = 0; l < 10; l++) {
+    for (int k = 0; k < 10; k++) {
+      for (int j = 0; j < 10; j++) {
+        for (int i = 0; i < 10; i++) {
+          VERIFY_IS_EQUAL((t(i, j, k, l)), ((i + j) * (k + l)));
+        }
+      }
+    }
+  }
+
+}
+
+static void test_tensor_asym()
+{
+  SGroup<AntiSymmetry<0,1>, AntiSymmetry<2,3>> sym;
+  Tensor<int, 4> t(10,10,10,10);
+
+  t.setZero();
+
+  for (int l = 0; l < 10; l++) {
+    for (int k = l + 1; k < 10; k++) {
+      for (int j = 0; j < 10; j++) {
+        for (int i = j + 1; i < 10; i++) {
+          sym(t, i, j, k, l) = ((i * j) + (k * l));
+        }
+      }
+    }
+  }
+
+  for (int l = 0; l < 10; l++) {
+    for (int k = 0; k < 10; k++) {
+      for (int j = 0; j < 10; j++) {
+        for (int i = 0; i < 10; i++) {
+          if (i < j && k < l)
+            VERIFY_IS_EQUAL((t(i, j, k, l)), (((i * j) + (k * l))));
+          else if (i > j && k > l)
+            VERIFY_IS_EQUAL((t(i, j, k, l)), (((i * j) + (k * l))));
+          else if (i < j && k > l)
+            VERIFY_IS_EQUAL((t(i, j, k, l)), (- ((i * j) + (k * l))));
+          else if (i > j && k < l)
+            VERIFY_IS_EQUAL((t(i, j, k, l)), (- ((i * j) + (k * l))));
+          else
+            VERIFY_IS_EQUAL((t(i, j, k, l)), 0);
+        }
+      }
+    }
+  }
+}
+
+static void test_tensor_dynsym()
+{
+  DynamicSGroup sym;
+  sym.addSymmetry(0,1);
+  sym.addSymmetry(2,3);
+  Tensor<int, 4> t(10,10,10,10);
+
+  t.setZero();
+
+  for (int l = 0; l < 10; l++) {
+    for (int k = l; k < 10; k++) {
+      for (int j = 0; j < 10; j++) {
+        for (int i = j; i < 10; i++) {
+          sym(t, i, j, k, l) = (i + j) * (k + l);
+        }
+      }
+    }
+  }
+
+  for (int l = 0; l < 10; l++) {
+    for (int k = 0; k < 10; k++) {
+      for (int j = 0; j < 10; j++) {
+        for (int i = 0; i < 10; i++) {
+          VERIFY_IS_EQUAL((t(i, j, k, l)), ((i + j) * (k + l)));
+        }
+      }
+    }
+  }
+}
+
+static void test_tensor_randacc()
+{
+  SGroup<Symmetry<0,1>, Symmetry<2,3>> sym;
+  Tensor<int, 4> t(10,10,10,10);
+
+  t.setZero();
+
+  // set elements 1 million times, that way we access the
+  // entire matrix
+  for (int n = 0; n < 1000000; n++) {
+    int i = rand() % 10;
+    int j = rand() % 10;
+    int k = rand() % 10;
+    int l = rand() % 10;
+    // only access those indices in a given order
+    if (i < j)
+      std::swap(i, j);
+    if (k < l)
+      std::swap(k, l);
+    sym(t, i, j, k, l) = (i + j) * (k + l);
+  }
+
+  for (int l = 0; l < 10; l++) {
+    for (int k = 0; k < 10; k++) {
+      for (int j = 0; j < 10; j++) {
+        for (int i = 0; i < 10; i++) {
+          VERIFY_IS_EQUAL((t(i, j, k, l)), ((i + j) * (k + l)));
+        }
+      }
+    }
+  }
+}
+
+void test_cxx11_tensor_symmetry()
+{
+  CALL_SUBTEST(test_symgroups_static());
+  CALL_SUBTEST(test_symgroups_dynamic());
+  CALL_SUBTEST(test_symgroups_selection());
+  CALL_SUBTEST(test_tensor_epsilon());
+  CALL_SUBTEST(test_tensor_sym());
+  CALL_SUBTEST(test_tensor_asym());
+  CALL_SUBTEST(test_tensor_dynsym());
+  CALL_SUBTEST(test_tensor_randacc());
+}
+
+/*
+ * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
+ */
diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp
new file mode 100644
index 000000000..2ef665f30
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_thread_pool.cpp
@@ -0,0 +1,373 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_USE_THREADS
+
+
+#include "main.h"
+#include <iostream>
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+
+void test_multithread_elementwise()
+{
+  Tensor<float, 3> in1(2,3,7);
+  Tensor<float, 3> in2(2,3,7);
+  Tensor<float, 3> out(2,3,7);
+
+  in1.setRandom();
+  in2.setRandom();
+
+  Eigen::ThreadPool tp(internal::random<int>(3, 11));
+  Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(3, 11));
+  out.device(thread_pool_device) = in1 + in2 * 3.14f;
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f);
+      }
+    }
+  }
+}
+
+
+void test_multithread_compound_assignment()
+{
+  Tensor<float, 3> in1(2,3,7);
+  Tensor<float, 3> in2(2,3,7);
+  Tensor<float, 3> out(2,3,7);
+
+  in1.setRandom();
+  in2.setRandom();
+
+  Eigen::ThreadPool tp(internal::random<int>(3, 11));
+  Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(3, 11));
+  out.device(thread_pool_device) = in1;
+  out.device(thread_pool_device) += in2 * 3.14f;
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f);
+      }
+    }
+  }
+}
+
+template<int DataLayout>
+void test_multithread_contraction()
+{
+  Tensor<float, 4, DataLayout> t_left(30, 50, 37, 31);
+  Tensor<float, 5, DataLayout> t_right(37, 31, 70, 2, 10);
+  Tensor<float, 5, DataLayout> t_result(30, 50, 70, 2, 10);
+
+  t_left.setRandom();
+  t_right.setRandom();
+
+  // this contraction should be equivalent to a single matrix multiplication
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+  Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}});
+
+  typedef Map<Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+  MapXf m_left(t_left.data(), 1500, 1147);
+  MapXf m_right(t_right.data(), 1147, 1400);
+  Matrix<float, Dynamic, Dynamic, DataLayout> m_result(1500, 1400);
+
+  Eigen::ThreadPool tp(4);
+  Eigen::ThreadPoolDevice thread_pool_device(&tp, 4);
+
+  // compute results by separate methods
+  t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
+  m_result = m_left * m_right;
+
+ for (ptrdiff_t i = 0; i < t_result.size(); i++) {
+    VERIFY(&t_result.data()[i] != &m_result.data()[i]);
+    if (fabsf(t_result(i) - m_result(i)) < 1e-4f) {
+      continue;
+    }
+    if (Eigen::internal::isApprox(t_result(i), m_result(i), 1e-4f)) {
+      continue;
+    }
+    std::cout << "mismatch detected at index " << i << ": " << t_result(i)
+              << " vs " <<  m_result(i) << std::endl;
+    assert(false);
+  }
+}
+
+template<int DataLayout>
+void test_contraction_corner_cases()
+{
+  Tensor<float, 2, DataLayout> t_left(32, 500);
+  Tensor<float, 2, DataLayout> t_right(32, 28*28);
+  Tensor<float, 2, DataLayout> t_result(500, 28*28);
+
+  t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f;
+  t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f;
+  t_result = t_result.constant(NAN);
+
+  // this contraction should be equivalent to a single matrix multiplication
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+  Eigen::array<DimPair, 1> dims{{DimPair(0, 0)}};
+
+  typedef Map<Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+  MapXf m_left(t_left.data(), 32, 500);
+  MapXf m_right(t_right.data(), 32, 28*28);
+  Matrix<float, Dynamic, Dynamic, DataLayout> m_result(500, 28*28);
+
+  Eigen::ThreadPool tp(12);
+  Eigen::ThreadPoolDevice thread_pool_device(&tp, 12);
+
+  // compute results by separate methods
+  t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
+  m_result = m_left.transpose() * m_right;
+
+  for (ptrdiff_t i = 0; i < t_result.size(); i++) {
+    assert(!(numext::isnan)(t_result.data()[i]));
+    if (fabsf(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
+      std::cout << "mismatch detected at index " << i << " : " << t_result.data()[i] << " vs " <<  m_result.data()[i] << std::endl;
+      assert(false);
+    }
+  }
+
+  t_left.resize(32, 1);
+  t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f;
+  t_result.resize (1, 28*28);
+  t_result = t_result.constant(NAN);
+  t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
+  new(&m_left) MapXf(t_left.data(), 32, 1);
+  m_result = m_left.transpose() * m_right;
+  for (ptrdiff_t i = 0; i < t_result.size(); i++) {
+    assert(!(numext::isnan)(t_result.data()[i]));
+    if (fabsf(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
+      std::cout << "mismatch detected: " << t_result.data()[i] << " vs " <<  m_result.data()[i] << std::endl;
+      assert(false);
+    }
+  }
+
+  t_left.resize(32, 500);
+  t_right.resize(32, 4);
+  t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f;
+  t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f;
+  t_result.resize (500, 4);
+  t_result = t_result.constant(NAN);
+  t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
+  new(&m_left) MapXf(t_left.data(), 32, 500);
+  new(&m_right) MapXf(t_right.data(), 32, 4);
+  m_result = m_left.transpose() * m_right;
+  for (ptrdiff_t i = 0; i < t_result.size(); i++) {
+    assert(!(numext::isnan)(t_result.data()[i]));
+    if (fabsf(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
+      std::cout << "mismatch detected: " << t_result.data()[i] << " vs " <<  m_result.data()[i] << std::endl;
+      assert(false);
+    }
+  }
+
+  t_left.resize(32, 1);
+  t_right.resize(32, 4);
+  t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f;
+  t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f;
+  t_result.resize (1, 4);
+  t_result = t_result.constant(NAN);
+  t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
+  new(&m_left) MapXf(t_left.data(), 32, 1);
+  new(&m_right) MapXf(t_right.data(), 32, 4);
+  m_result = m_left.transpose() * m_right;
+  for (ptrdiff_t i = 0; i < t_result.size(); i++) {
+    assert(!(numext::isnan)(t_result.data()[i]));
+    if (fabsf(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
+      std::cout << "mismatch detected: " << t_result.data()[i] << " vs " <<  m_result.data()[i] << std::endl;
+      assert(false);
+    }
+  }
+}
+
+template<int DataLayout>
+void test_multithread_contraction_agrees_with_singlethread() {
+  int contract_size = internal::random<int>(1, 5000);
+
+  Tensor<float, 3, DataLayout> left(internal::random<int>(1, 80),
+                                    contract_size,
+                                    internal::random<int>(1, 100));
+
+  Tensor<float, 4, DataLayout> right(internal::random<int>(1, 25),
+                                     internal::random<int>(1, 37),
+                                     contract_size,
+                                     internal::random<int>(1, 51));
+
+  left.setRandom();
+  right.setRandom();
+
+  // add constants to shift values away from 0 for more precision
+  left += left.constant(1.5f);
+  right += right.constant(1.5f);
+
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+  Eigen::array<DimPair, 1> dims({{DimPair(1, 2)}});
+
+  Eigen::ThreadPool tp(internal::random<int>(2, 11));
+  Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(2, 11));
+
+  Tensor<float, 5, DataLayout> st_result;
+  st_result = left.contract(right, dims);
+
+  Tensor<float, 5, DataLayout> tp_result(st_result.dimensions());
+  tp_result.device(thread_pool_device) = left.contract(right, dims);
+
+  VERIFY(dimensions_match(st_result.dimensions(), tp_result.dimensions()));
+  for (ptrdiff_t i = 0; i < st_result.size(); i++) {
+    // if both of the values are very small, then do nothing (because the test will fail
+    // due to numerical precision issues when values are small)
+    if (numext::abs(st_result.data()[i] - tp_result.data()[i]) >= 1e-4f) {
+      VERIFY_IS_APPROX(st_result.data()[i], tp_result.data()[i]);
+    }
+  }
+}
+
+
+template<int DataLayout>
+void test_full_contraction() {
+  int contract_size1 = internal::random<int>(1, 500);
+  int contract_size2 = internal::random<int>(1, 500);
+
+  Tensor<float, 2, DataLayout> left(contract_size1,
+                                    contract_size2);
+  Tensor<float, 2, DataLayout> right(contract_size1,
+                                    contract_size2);
+  left.setRandom();
+  right.setRandom();
+
+  // add constants to shift values away from 0 for more precision
+  left += left.constant(1.5f);
+  right += right.constant(1.5f);
+
+  typedef Tensor<float, 2>::DimensionPair DimPair;
+  Eigen::array<DimPair, 2> dims({{DimPair(0, 0), DimPair(1, 1)}});
+
+  Eigen::ThreadPool tp(internal::random<int>(2, 11));
+  Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(2, 11));
+
+  Tensor<float, 0, DataLayout> st_result;
+  st_result = left.contract(right, dims);
+
+  Tensor<float, 0, DataLayout> tp_result;
+  tp_result.device(thread_pool_device) = left.contract(right, dims);
+
+  VERIFY(dimensions_match(st_result.dimensions(), tp_result.dimensions()));
+  // if both of the values are very small, then do nothing (because the test will fail
+  // due to numerical precision issues when values are small)
+  if (numext::abs(st_result() - tp_result()) >= 1e-4f) {
+    VERIFY_IS_APPROX(st_result(), tp_result());
+  }
+}
+
+template<int DataLayout>
+void test_multithreaded_reductions() {
+  const int num_threads = internal::random<int>(3, 11);
+  ThreadPool thread_pool(num_threads);
+  Eigen::ThreadPoolDevice thread_pool_device(&thread_pool, num_threads);
+
+  const int num_rows = internal::random<int>(13, 732);
+  const int num_cols = internal::random<int>(13, 732);
+  Tensor<float, 2, DataLayout> t1(num_rows, num_cols);
+  t1.setRandom();
+
+  Tensor<float, 0, DataLayout> full_redux;
+  full_redux = t1.sum();
+
+  Tensor<float, 0, DataLayout> full_redux_tp;
+  full_redux_tp.device(thread_pool_device) = t1.sum();
+
+  // Check that the single threaded and the multi threaded reductions return
+  // the same result.
+  VERIFY_IS_APPROX(full_redux(), full_redux_tp());
+}
+
+
+void test_memcpy() {
+
+  for (int i = 0; i < 5; ++i) {
+    const int num_threads = internal::random<int>(3, 11);
+    Eigen::ThreadPool tp(num_threads);
+    Eigen::ThreadPoolDevice thread_pool_device(&tp, num_threads);
+
+    const int size = internal::random<int>(13, 7632);
+    Tensor<float, 1> t1(size);
+    t1.setRandom();
+    std::vector<float> result(size);
+    thread_pool_device.memcpy(&result[0], t1.data(), size*sizeof(float));
+    for (int j = 0; j < size; j++) {
+      VERIFY_IS_EQUAL(t1(j), result[j]);
+    }
+  }
+}
+
+
+void test_multithread_random()
+{
+  Eigen::ThreadPool tp(2);
+  Eigen::ThreadPoolDevice device(&tp, 2);
+  Tensor<float, 1> t(1 << 20);
+  t.device(device) = t.random<Eigen::internal::NormalRandomGenerator<float>>();
+}
+
+template<int DataLayout>
+void test_multithread_shuffle()
+{
+  Tensor<float, 4, DataLayout> tensor(17,5,7,11);
+  tensor.setRandom();
+
+  const int num_threads = internal::random<int>(2, 11);
+  ThreadPool threads(num_threads);
+  Eigen::ThreadPoolDevice device(&threads, num_threads);
+
+  Tensor<float, 4, DataLayout> shuffle(7,5,11,17);
+  array<ptrdiff_t, 4> shuffles = {{2,1,3,0}};
+  shuffle.device(device) = tensor.shuffle(shuffles);
+
+  for (int i = 0; i < 17; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        for (int l = 0; l < 11; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,j,l,i));
+        }
+      }
+    }
+  }
+}
+
+
+void test_cxx11_tensor_thread_pool()
+{
+  CALL_SUBTEST_1(test_multithread_elementwise());
+  CALL_SUBTEST_1(test_multithread_compound_assignment());
+
+  CALL_SUBTEST_2(test_multithread_contraction<ColMajor>());
+  CALL_SUBTEST_2(test_multithread_contraction<RowMajor>());
+
+  CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread<ColMajor>());
+  CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread<RowMajor>());
+
+  // Exercise various cases that have been problematic in the past.
+  CALL_SUBTEST_4(test_contraction_corner_cases<ColMajor>());
+  CALL_SUBTEST_4(test_contraction_corner_cases<RowMajor>());
+
+  CALL_SUBTEST_4(test_full_contraction<ColMajor>());
+  CALL_SUBTEST_4(test_full_contraction<RowMajor>());
+
+  CALL_SUBTEST_5(test_multithreaded_reductions<ColMajor>());
+  CALL_SUBTEST_5(test_multithreaded_reductions<RowMajor>());
+
+  CALL_SUBTEST_6(test_memcpy());
+  CALL_SUBTEST_6(test_multithread_random());
+  CALL_SUBTEST_6(test_multithread_shuffle<ColMajor>());
+  CALL_SUBTEST_6(test_multithread_shuffle<RowMajor>());
+}
diff --git a/unsupported/test/cxx11_tensor_uint128.cpp b/unsupported/test/cxx11_tensor_uint128.cpp
new file mode 100644
index 000000000..d2a1e8673
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_uint128.cpp
@@ -0,0 +1,160 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+
+#if EIGEN_COMP_MSVC
+#define EIGEN_NO_INT128
+#else
+typedef __uint128_t uint128_t;
+#endif
+
+// Only run the test on compilers that support 128bit integers natively
+#ifndef EIGEN_NO_INT128
+
+using Eigen::internal::TensorUInt128;
+using Eigen::internal::static_val;
+
+void VERIFY_EQUAL(TensorUInt128<uint64_t, uint64_t> actual, uint128_t expected) {
+  bool matchl = actual.lower() == static_cast<uint64_t>(expected);
+  bool matchh = actual.upper() == static_cast<uint64_t>(expected >> 64);
+  if (!matchl || !matchh) {
+    const char* testname = g_test_stack.back().c_str();
+    std::cerr << "Test " << testname << " failed in " << __FILE__
+              << " (" << __LINE__ << ")"
+              << std::endl;
+    abort();
+  }
+}
+
+
+void test_add() {
+  uint64_t incr = internal::random<uint64_t>(1, 9999999999);
+  for (uint64_t i1 = 0; i1 < 100; ++i1) {
+    for (uint64_t i2 = 1; i2 < 100 * incr; i2 += incr) {
+      TensorUInt128<uint64_t, uint64_t> i(i1, i2);
+      uint128_t a = (static_cast<uint128_t>(i1) << 64) + static_cast<uint128_t>(i2);
+      for (uint64_t j1 = 0; j1 < 100; ++j1) {
+        for (uint64_t j2 = 1; j2 < 100 * incr; j2 += incr) {
+          TensorUInt128<uint64_t, uint64_t> j(j1, j2);
+          uint128_t b = (static_cast<uint128_t>(j1) << 64) + static_cast<uint128_t>(j2);
+          TensorUInt128<uint64_t, uint64_t> actual = i + j;
+          uint128_t expected = a + b;
+          VERIFY_EQUAL(actual, expected);
+        }
+      }
+    }
+  }
+}
+
+void test_sub() {
+  uint64_t incr = internal::random<uint64_t>(1, 9999999999);
+  for (uint64_t i1 = 0; i1 < 100; ++i1) {
+    for (uint64_t i2 = 1; i2 < 100 * incr; i2 += incr) {
+      TensorUInt128<uint64_t, uint64_t> i(i1, i2);
+      uint128_t a = (static_cast<uint128_t>(i1) << 64) + static_cast<uint128_t>(i2);
+      for (uint64_t j1 = 0; j1 < 100; ++j1) {
+        for (uint64_t j2 = 1; j2 < 100 * incr; j2 += incr) {
+          TensorUInt128<uint64_t, uint64_t> j(j1, j2);
+          uint128_t b = (static_cast<uint128_t>(j1) << 64) + static_cast<uint128_t>(j2);
+          TensorUInt128<uint64_t, uint64_t> actual = i - j;
+          uint128_t expected = a - b;
+          VERIFY_EQUAL(actual, expected);
+        }
+      }
+    }
+  }
+}
+
+void test_mul() {
+  uint64_t incr = internal::random<uint64_t>(1, 9999999999);
+  for (uint64_t i1 = 0; i1 < 100; ++i1) {
+    for (uint64_t i2 = 1; i2 < 100 * incr; i2 += incr) {
+      TensorUInt128<uint64_t, uint64_t> i(i1, i2);
+      uint128_t a = (static_cast<uint128_t>(i1) << 64) + static_cast<uint128_t>(i2);
+      for (uint64_t j1 = 0; j1 < 100; ++j1) {
+        for (uint64_t j2 = 1; j2 < 100 * incr; j2 += incr) {
+          TensorUInt128<uint64_t, uint64_t> j(j1, j2);
+          uint128_t b = (static_cast<uint128_t>(j1) << 64) + static_cast<uint128_t>(j2);
+          TensorUInt128<uint64_t, uint64_t> actual = i * j;
+          uint128_t expected = a * b;
+          VERIFY_EQUAL(actual, expected);
+        }
+      }
+    }
+  }
+}
+
+void test_div() {
+  uint64_t incr = internal::random<uint64_t>(1, 9999999999);
+  for (uint64_t i1 = 0; i1 < 100; ++i1) {
+    for (uint64_t i2 = 1; i2 < 100 * incr; i2 += incr) {
+      TensorUInt128<uint64_t, uint64_t> i(i1, i2);
+      uint128_t a = (static_cast<uint128_t>(i1) << 64) + static_cast<uint128_t>(i2);
+      for (uint64_t j1 = 0; j1 < 100; ++j1) {
+        for (uint64_t j2 = 1; j2 < 100 * incr; j2 += incr) {
+          TensorUInt128<uint64_t, uint64_t> j(j1, j2);
+          uint128_t b = (static_cast<uint128_t>(j1) << 64) + static_cast<uint128_t>(j2);
+          TensorUInt128<uint64_t, uint64_t> actual = i / j;
+          uint128_t expected = a / b;
+          VERIFY_EQUAL(actual, expected);
+        }
+      }
+    }
+  }
+}
+
+void test_misc1() {
+  uint64_t incr = internal::random<uint64_t>(1, 9999999999);
+  for (uint64_t i2 = 1; i2 < 100 * incr; i2 += incr) {
+    TensorUInt128<static_val<0>, uint64_t> i(0, i2);
+    uint128_t a = static_cast<uint128_t>(i2);
+    for (uint64_t j2 = 1; j2 < 100 * incr; j2 += incr) {
+      TensorUInt128<static_val<0>, uint64_t> j(0, j2);
+      uint128_t b = static_cast<uint128_t>(j2);
+      uint64_t actual = (i * j).upper();
+      uint64_t expected = (a * b) >> 64;
+      VERIFY_IS_EQUAL(actual, expected);
+    }
+  }
+}
+
+void test_misc2() {
+  int64_t incr = internal::random<int64_t>(1, 100);
+  for (int64_t log_div = 0; log_div < 63; ++log_div) {
+    for (int64_t divider = 1; divider <= 1000000 * incr; divider += incr) {
+      uint64_t expected = (static_cast<uint128_t>(1) << (64+log_div)) / static_cast<uint128_t>(divider) - (static_cast<uint128_t>(1) << 64) + 1;
+      uint64_t shift = 1ULL << log_div;
+
+      TensorUInt128<uint64_t, uint64_t> result = (TensorUInt128<uint64_t, static_val<0> >(shift, 0) / TensorUInt128<static_val<0>, uint64_t>(divider) - TensorUInt128<static_val<1>, static_val<0> >(1, 0) + TensorUInt128<static_val<0>, static_val<1> >(1));
+      uint64_t actual = static_cast<uint64_t>(result);
+      VERIFY_IS_EQUAL(actual, expected);
+    }
+  }
+}
+#endif
+
+
+void test_cxx11_tensor_uint128()
+{
+#ifdef EIGEN_NO_INT128
+  // Skip the test on compilers that don't support 128bit integers natively
+  return;
+#else
+  CALL_SUBTEST_1(test_add());
+  CALL_SUBTEST_2(test_sub());
+  CALL_SUBTEST_3(test_mul());
+  CALL_SUBTEST_4(test_div());
+  CALL_SUBTEST_5(test_misc1());
+  CALL_SUBTEST_6(test_misc2());
+#endif
+}
diff --git a/unsupported/test/cxx11_tensor_volume_patch.cpp b/unsupported/test/cxx11_tensor_volume_patch.cpp
new file mode 100644
index 000000000..ca6840f3b
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_volume_patch.cpp
@@ -0,0 +1,112 @@
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+static void test_single_voxel_patch()
+{
+  Tensor<float, 5> tensor(4,2,3,5,7);
+  tensor.setRandom();
+  Tensor<float, 5, RowMajor> tensor_row_major = tensor.swap_layout();
+
+  Tensor<float, 6> single_voxel_patch;
+  single_voxel_patch = tensor.extract_volume_patches(1, 1, 1);
+  VERIFY_IS_EQUAL(single_voxel_patch.dimension(0), 4);
+  VERIFY_IS_EQUAL(single_voxel_patch.dimension(1), 1);
+  VERIFY_IS_EQUAL(single_voxel_patch.dimension(2), 1);
+  VERIFY_IS_EQUAL(single_voxel_patch.dimension(3), 1);
+  VERIFY_IS_EQUAL(single_voxel_patch.dimension(4), 2 * 3 * 5);
+  VERIFY_IS_EQUAL(single_voxel_patch.dimension(5), 7);
+
+  Tensor<float, 6, RowMajor> single_voxel_patch_row_major;
+  single_voxel_patch_row_major = tensor_row_major.extract_volume_patches(1, 1, 1);
+  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(0), 7);
+  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(1), 2 * 3 * 5);
+  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(2), 1);
+  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(3), 1);
+  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(4), 1);
+  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(5), 4);
+
+  for (int i = 0; i < tensor.size(); ++i) {
+    VERIFY_IS_EQUAL(tensor.data()[i], single_voxel_patch.data()[i]);
+    VERIFY_IS_EQUAL(tensor_row_major.data()[i], single_voxel_patch_row_major.data()[i]);
+    VERIFY_IS_EQUAL(tensor.data()[i], tensor_row_major.data()[i]);
+  }
+}
+
+
+static void test_entire_volume_patch()
+{
+  const int depth = 4;
+  const int patch_z = 2;
+  const int patch_y = 3;
+  const int patch_x = 5;
+  const int batch = 7;
+
+  Tensor<float, 5> tensor(depth, patch_z, patch_y, patch_x, batch);
+  tensor.setRandom();
+  Tensor<float, 5, RowMajor> tensor_row_major = tensor.swap_layout();
+
+  Tensor<float, 6> entire_volume_patch;
+  entire_volume_patch = tensor.extract_volume_patches(patch_z, patch_y, patch_x);
+  VERIFY_IS_EQUAL(entire_volume_patch.dimension(0), depth);
+  VERIFY_IS_EQUAL(entire_volume_patch.dimension(1), patch_z);
+  VERIFY_IS_EQUAL(entire_volume_patch.dimension(2), patch_y);
+  VERIFY_IS_EQUAL(entire_volume_patch.dimension(3), patch_x);
+  VERIFY_IS_EQUAL(entire_volume_patch.dimension(4), patch_z * patch_y * patch_x);
+  VERIFY_IS_EQUAL(entire_volume_patch.dimension(5), batch);
+
+  Tensor<float, 6, RowMajor> entire_volume_patch_row_major;
+  entire_volume_patch_row_major = tensor_row_major.extract_volume_patches(patch_z, patch_y, patch_x);
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(0), batch);
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(1), patch_z * patch_y * patch_x);
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(2), patch_x);
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(3), patch_y);
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(4), patch_z);
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(5), depth);
+
+  const int dz = patch_z - 1;
+  const int dy = patch_y - 1;
+  const int dx = patch_x - 1;
+
+  const int forward_pad_z = dz - dz / 2;
+  const int forward_pad_y = dy - dy / 2;
+  const int forward_pad_x = dx - dx / 2;
+
+  for (int pz = 0; pz < patch_z; pz++) {
+    for (int py = 0; py < patch_y; py++) {
+      for (int px = 0; px < patch_x; px++) {
+        const int patchId = pz + patch_z * (py + px * patch_y);
+        for (int z = 0; z < patch_z; z++) {
+          for (int y = 0; y < patch_y; y++) {
+            for (int x = 0; x < patch_x; x++) {
+              for (int b = 0; b < batch; b++) {
+                for (int d = 0; d < depth; d++) {
+                  float expected = 0.0f;
+                  float expected_row_major = 0.0f;
+                  const int eff_z = z - forward_pad_z + pz;
+                  const int eff_y = y - forward_pad_y + py;
+                  const int eff_x = x - forward_pad_x + px;
+                  if (eff_z >= 0 && eff_y >= 0 && eff_x >= 0 &&
+                      eff_z < patch_z && eff_y < patch_y && eff_x < patch_x) {
+                    expected = tensor(d, eff_z, eff_y, eff_x, b);
+                    expected_row_major = tensor_row_major(b, eff_x, eff_y, eff_z, d);
+                  }
+                  VERIFY_IS_EQUAL(entire_volume_patch(d, z, y, x, patchId, b), expected);
+                  VERIFY_IS_EQUAL(entire_volume_patch_row_major(b, patchId, x, y, z, d), expected_row_major);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+void test_cxx11_tensor_volume_patch()
+{
+  CALL_SUBTEST(test_single_voxel_patch());
+  CALL_SUBTEST(test_entire_volume_patch());
+}
diff --git a/unsupported/test/forward_adolc.cpp b/unsupported/test/forward_adolc.cpp
index d4baafe62..866db8e86 100644
--- a/unsupported/test/forward_adolc.cpp
+++ b/unsupported/test/forward_adolc.cpp
@@ -13,8 +13,6 @@
 #define NUMBER_DIRECTIONS 16
 #include <unsupported/Eigen/AdolcForward>
 
-int adtl::ADOLC_numDir;
-
 template<typename Vector>
 EIGEN_DONT_INLINE typename Vector::Scalar foo(const Vector& p)
 {
@@ -123,7 +121,7 @@ template<typename Func> void adolc_forward_jacobian(const Func& f)
 
 void test_forward_adolc()
 {
-  adtl::ADOLC_numDir = NUMBER_DIRECTIONS;
+  adtl::setNumDir(NUMBER_DIRECTIONS);
 
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST(( adolc_forward_jacobian(TestFunc1<double,2,2>()) ));
diff --git a/unsupported/test/jacobisvd.cpp b/unsupported/test/jacobisvd.cpp
deleted file mode 100644
index b4e884eee..000000000
--- a/unsupported/test/jacobisvd.cpp
+++ /dev/null
@@ -1,198 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
-// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#include "svd_common.h"
-
-template<typename MatrixType, int QRPreconditioner>
-void jacobisvd_check_full(const MatrixType& m, const JacobiSVD<MatrixType, QRPreconditioner>& svd)
-{
-  svd_check_full<MatrixType, JacobiSVD<MatrixType, QRPreconditioner > >(m, svd);
-}
-
-template<typename MatrixType, int QRPreconditioner>
-void jacobisvd_compare_to_full(const MatrixType& m,
-                               unsigned int computationOptions,
-                               const JacobiSVD<MatrixType, QRPreconditioner>& referenceSvd)
-{
-  svd_compare_to_full<MatrixType, JacobiSVD<MatrixType, QRPreconditioner> >(m, computationOptions, referenceSvd);
-}
-
-
-template<typename MatrixType, int QRPreconditioner>
-void jacobisvd_solve(const MatrixType& m, unsigned int computationOptions)
-{
-  svd_solve< MatrixType, JacobiSVD< MatrixType, QRPreconditioner > >(m, computationOptions);
-}
-
-
-
-template<typename MatrixType, int QRPreconditioner>
-void jacobisvd_test_all_computation_options(const MatrixType& m)
-{
-  
-  if (QRPreconditioner == NoQRPreconditioner && m.rows() != m.cols())
-    return;
-
-  JacobiSVD< MatrixType, QRPreconditioner > fullSvd(m, ComputeFullU|ComputeFullV);
-  svd_test_computation_options_1< MatrixType, JacobiSVD< MatrixType, QRPreconditioner > >(m, fullSvd);
-
-  if(QRPreconditioner == FullPivHouseholderQRPreconditioner)
-    return;
-  svd_test_computation_options_2< MatrixType, JacobiSVD< MatrixType, QRPreconditioner > >(m, fullSvd);
-
-}
-
-template<typename MatrixType>
-void jacobisvd(const MatrixType& a = MatrixType(), bool pickrandom = true)
-{
-  MatrixType m = pickrandom ? MatrixType::Random(a.rows(), a.cols()) : a;
-
-  jacobisvd_test_all_computation_options<MatrixType, FullPivHouseholderQRPreconditioner>(m);
-  jacobisvd_test_all_computation_options<MatrixType, ColPivHouseholderQRPreconditioner>(m);
-  jacobisvd_test_all_computation_options<MatrixType, HouseholderQRPreconditioner>(m);
-  jacobisvd_test_all_computation_options<MatrixType, NoQRPreconditioner>(m);
-}
-
-
-template<typename MatrixType> 
-void jacobisvd_verify_assert(const MatrixType& m)
-{
-  
-  svd_verify_assert<MatrixType, JacobiSVD< MatrixType > >(m);
-
-  typedef typename MatrixType::Index Index;
-  Index rows = m.rows();
-  Index cols = m.cols();
-
-  enum {
-    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-    ColsAtCompileTime = MatrixType::ColsAtCompileTime
-  };
-
-  MatrixType a = MatrixType::Zero(rows, cols);
-  a.setZero();
-
-  if (ColsAtCompileTime == Dynamic)
-  {
-    JacobiSVD<MatrixType, FullPivHouseholderQRPreconditioner> svd_fullqr;
-    VERIFY_RAISES_ASSERT(svd_fullqr.compute(a, ComputeFullU|ComputeThinV))
-    VERIFY_RAISES_ASSERT(svd_fullqr.compute(a, ComputeThinU|ComputeThinV))
-    VERIFY_RAISES_ASSERT(svd_fullqr.compute(a, ComputeThinU|ComputeFullV))
-  }
-}
-
-template<typename MatrixType>
-void jacobisvd_method()
-{
-  enum { Size = MatrixType::RowsAtCompileTime };
-  typedef typename MatrixType::RealScalar RealScalar;
-  typedef Matrix<RealScalar, Size, 1> RealVecType;
-  MatrixType m = MatrixType::Identity();
-  VERIFY_IS_APPROX(m.jacobiSvd().singularValues(), RealVecType::Ones());
-  VERIFY_RAISES_ASSERT(m.jacobiSvd().matrixU());
-  VERIFY_RAISES_ASSERT(m.jacobiSvd().matrixV());
-  VERIFY_IS_APPROX(m.jacobiSvd(ComputeFullU|ComputeFullV).solve(m), m);
-}
-
-
-
-template<typename MatrixType>
-void jacobisvd_inf_nan()
-{
-  svd_inf_nan<MatrixType, JacobiSVD< MatrixType > >();
-}
-
-
-// Regression test for bug 286: JacobiSVD loops indefinitely with some
-// matrices containing denormal numbers.
-void jacobisvd_bug286()
-{
-#if defined __INTEL_COMPILER
-// shut up warning #239: floating point underflow
-#pragma warning push
-#pragma warning disable 239
-#endif
-  Matrix2d M;
-  M << -7.90884e-313, -4.94e-324,
-                 0, 5.60844e-313;
-#if defined __INTEL_COMPILER
-#pragma warning pop
-#endif
-  JacobiSVD<Matrix2d> svd;
-  svd.compute(M); // just check we don't loop indefinitely
-}
-
-
-void jacobisvd_preallocate()
-{
-  svd_preallocate< JacobiSVD <MatrixXf> >();
-}
-
-void test_jacobisvd()
-{
-  CALL_SUBTEST_11(( jacobisvd<Matrix<double,Dynamic,Dynamic> >
-		    (Matrix<double,Dynamic,Dynamic>(16, 6)) ));
-
-  CALL_SUBTEST_3(( jacobisvd_verify_assert(Matrix3f()) ));
-  CALL_SUBTEST_4(( jacobisvd_verify_assert(Matrix4d()) ));
-  CALL_SUBTEST_7(( jacobisvd_verify_assert(MatrixXf(10,12)) ));
-  CALL_SUBTEST_8(( jacobisvd_verify_assert(MatrixXcd(7,5)) ));
-
-  for(int i = 0; i < g_repeat; i++) {
-    Matrix2cd m;
-    m << 0, 1,
-         0, 1;
-    CALL_SUBTEST_1(( jacobisvd(m, false) ));
-    m << 1, 0,
-         1, 0;
-    CALL_SUBTEST_1(( jacobisvd(m, false) ));
-
-    Matrix2d n;
-    n << 0, 0,
-         0, 0;
-    CALL_SUBTEST_2(( jacobisvd(n, false) ));
-    n << 0, 0,
-         0, 1;
-    CALL_SUBTEST_2(( jacobisvd(n, false) ));
-    
-    CALL_SUBTEST_3(( jacobisvd<Matrix3f>() ));
-    CALL_SUBTEST_4(( jacobisvd<Matrix4d>() ));
-    CALL_SUBTEST_5(( jacobisvd<Matrix<float,3,5> >() ));
-    CALL_SUBTEST_6(( jacobisvd<Matrix<double,Dynamic,2> >(Matrix<double,Dynamic,2>(10,2)) ));
-
-    int r = internal::random<int>(1, 30),
-        c = internal::random<int>(1, 30);
-    CALL_SUBTEST_7(( jacobisvd<MatrixXf>(MatrixXf(r,c)) ));
-    CALL_SUBTEST_8(( jacobisvd<MatrixXcd>(MatrixXcd(r,c)) ));
-    (void) r;
-    (void) c;
-
-    // Test on inf/nan matrix
-    CALL_SUBTEST_7( jacobisvd_inf_nan<MatrixXf>() );
-  }
-
-  CALL_SUBTEST_7(( jacobisvd<MatrixXf>(MatrixXf(internal::random<int>(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE/2), internal::random<int>(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE/2))) ));
-  CALL_SUBTEST_8(( jacobisvd<MatrixXcd>(MatrixXcd(internal::random<int>(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE/3), internal::random<int>(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE/3))) ));
-
-
-  // test matrixbase method
-  CALL_SUBTEST_1(( jacobisvd_method<Matrix2cd>() ));
-  CALL_SUBTEST_3(( jacobisvd_method<Matrix3f>() ));
-
-
-  // Test problem size constructors
-  CALL_SUBTEST_7( JacobiSVD<MatrixXf>(10,10) );
-
-  // Check that preallocation avoids subsequent mallocs
-  CALL_SUBTEST_9( jacobisvd_preallocate() );
-
-  // Regression check for bug 286
-  CALL_SUBTEST_2( jacobisvd_bug286() );
-}
diff --git a/unsupported/test/kronecker_product.cpp b/unsupported/test/kronecker_product.cpp
index 8ddc6ec28..e770049e5 100644
--- a/unsupported/test/kronecker_product.cpp
+++ b/unsupported/test/kronecker_product.cpp
@@ -9,12 +9,12 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+#ifdef EIGEN_TEST_PART_1
 
 #include "sparse.h"
 #include <Eigen/SparseExtra>
 #include <Eigen/KroneckerProduct>
 
-
 template<typename MatrixType>
 void check_dimension(const MatrixType& ab, const int rows,  const int cols)
 {
@@ -107,31 +107,34 @@ void test_kronecker_product()
 
   SparseMatrix<double,RowMajor> SM_row_a(SM_a), SM_row_b(SM_b);
 
-  // test kroneckerProduct(DM_block,DM,DM_fixedSize)
+  // test DM_fixedSize = kroneckerProduct(DM_block,DM)
   Matrix<double, 6, 6> DM_fix_ab = kroneckerProduct(DM_a.topLeftCorner<2,3>(),DM_b);
 
   CALL_SUBTEST(check_kronecker_product(DM_fix_ab));
+  CALL_SUBTEST(check_kronecker_product(kroneckerProduct(DM_a.topLeftCorner<2,3>(),DM_b)));
 
   for(int i=0;i<DM_fix_ab.rows();++i)
     for(int j=0;j<DM_fix_ab.cols();++j)
        VERIFY_IS_APPROX(kroneckerProduct(DM_a,DM_b).coeff(i,j), DM_fix_ab(i,j));
 
-  // test kroneckerProduct(DM,DM,DM_block)
+  // test DM_block = kroneckerProduct(DM,DM)
   MatrixXd DM_block_ab(10,15);
   DM_block_ab.block<6,6>(2,5) = kroneckerProduct(DM_a,DM_b);
   CALL_SUBTEST(check_kronecker_product(DM_block_ab.block<6,6>(2,5)));
 
-  // test kroneckerProduct(DM,DM,DM)
+  // test DM = kroneckerProduct(DM,DM)
   MatrixXd DM_ab = kroneckerProduct(DM_a,DM_b);
   CALL_SUBTEST(check_kronecker_product(DM_ab));
+  CALL_SUBTEST(check_kronecker_product(kroneckerProduct(DM_a,DM_b)));
 
-  // test kroneckerProduct(SM,DM,SM)
+  // test SM = kroneckerProduct(SM,DM)
   SparseMatrix<double> SM_ab = kroneckerProduct(SM_a,DM_b);
   CALL_SUBTEST(check_kronecker_product(SM_ab));
   SparseMatrix<double,RowMajor> SM_ab2 = kroneckerProduct(SM_a,DM_b);
   CALL_SUBTEST(check_kronecker_product(SM_ab2));
+  CALL_SUBTEST(check_kronecker_product(kroneckerProduct(SM_a,DM_b)));
 
-  // test kroneckerProduct(DM,SM,SM)
+  // test SM = kroneckerProduct(DM,SM)
   SM_ab.setZero();
   SM_ab.insert(0,0)=37.0;
   SM_ab = kroneckerProduct(DM_a,SM_b);
@@ -140,8 +143,9 @@ void test_kronecker_product()
   SM_ab2.insert(0,0)=37.0;
   SM_ab2 = kroneckerProduct(DM_a,SM_b);
   CALL_SUBTEST(check_kronecker_product(SM_ab2));
+  CALL_SUBTEST(check_kronecker_product(kroneckerProduct(DM_a,SM_b)));
 
-  // test kroneckerProduct(SM,SM,SM)
+  // test SM = kroneckerProduct(SM,SM)
   SM_ab.resize(2,33);
   SM_ab.insert(0,0)=37.0;
   SM_ab = kroneckerProduct(SM_a,SM_b);
@@ -150,8 +154,9 @@ void test_kronecker_product()
   SM_ab2.insert(0,0)=37.0;
   SM_ab2 = kroneckerProduct(SM_a,SM_b);
   CALL_SUBTEST(check_kronecker_product(SM_ab2));
+  CALL_SUBTEST(check_kronecker_product(kroneckerProduct(SM_a,SM_b)));
 
-  // test kroneckerProduct(SM,SM,SM) with sparse pattern
+  // test SM = kroneckerProduct(SM,SM) with sparse pattern
   SM_a.resize(4,5);
   SM_b.resize(3,2);
   SM_a.resizeNonZeros(0);
@@ -169,7 +174,7 @@ void test_kronecker_product()
   SM_ab = kroneckerProduct(SM_a,SM_b);
   CALL_SUBTEST(check_sparse_kronecker_product(SM_ab));
 
-  // test dimension of result of kroneckerProduct(DM,DM,DM)
+  // test dimension of result of DM = kroneckerProduct(DM,DM)
   MatrixXd DM_a2(2,1);
   MatrixXd DM_b2(5,4);
   MatrixXd DM_ab2 = kroneckerProduct(DM_a2,DM_b2);
@@ -178,4 +183,70 @@ void test_kronecker_product()
   DM_b2.resize(4,8);
   DM_ab2 = kroneckerProduct(DM_a2,DM_b2);
   CALL_SUBTEST(check_dimension(DM_ab2,10*4,9*8));
+  
+  for(int i = 0; i < g_repeat; i++)
+  {
+    double density = Eigen::internal::random<double>(0.01,0.5);
+    int ra = Eigen::internal::random<int>(1,50);
+    int ca = Eigen::internal::random<int>(1,50);
+    int rb = Eigen::internal::random<int>(1,50);
+    int cb = Eigen::internal::random<int>(1,50);
+    SparseMatrix<float,ColMajor> sA(ra,ca), sB(rb,cb), sC;
+    SparseMatrix<float,RowMajor> sC2;
+    MatrixXf dA(ra,ca), dB(rb,cb), dC;
+    initSparse(density, dA, sA);
+    initSparse(density, dB, sB);
+    
+    sC = kroneckerProduct(sA,sB);
+    dC = kroneckerProduct(dA,dB);
+    VERIFY_IS_APPROX(MatrixXf(sC),dC);
+    
+    sC = kroneckerProduct(sA.transpose(),sB);
+    dC = kroneckerProduct(dA.transpose(),dB);
+    VERIFY_IS_APPROX(MatrixXf(sC),dC);
+    
+    sC = kroneckerProduct(sA.transpose(),sB.transpose());
+    dC = kroneckerProduct(dA.transpose(),dB.transpose());
+    VERIFY_IS_APPROX(MatrixXf(sC),dC);
+    
+    sC = kroneckerProduct(sA,sB.transpose());
+    dC = kroneckerProduct(dA,dB.transpose());
+    VERIFY_IS_APPROX(MatrixXf(sC),dC);
+    
+    sC2 = kroneckerProduct(sA,sB);
+    dC = kroneckerProduct(dA,dB);
+    VERIFY_IS_APPROX(MatrixXf(sC2),dC);
+    
+    sC2 = kroneckerProduct(dA,sB);
+    dC = kroneckerProduct(dA,dB);
+    VERIFY_IS_APPROX(MatrixXf(sC2),dC);
+    
+    sC2 = kroneckerProduct(sA,dB);
+    dC = kroneckerProduct(dA,dB);
+    VERIFY_IS_APPROX(MatrixXf(sC2),dC);
+    
+    sC2 = kroneckerProduct(2*sA,sB);
+    dC = kroneckerProduct(2*dA,dB);
+    VERIFY_IS_APPROX(MatrixXf(sC2),dC);
+  }
+}
+
+#endif
+
+#ifdef EIGEN_TEST_PART_2
+
+// simply check that for a dense kronecker product, sparse module is not needed
+
+#include "main.h"
+#include <Eigen/KroneckerProduct>
+
+void test_kronecker_product()
+{
+  MatrixXd a(2,2), b(3,3), c;
+  a.setRandom();
+  b.setRandom();
+  c = kroneckerProduct(a,b);
+  VERIFY_IS_APPROX(c.block(3,3,3,3), a(1,1)*b);
 }
+
+#endif
diff --git a/unsupported/test/levenberg_marquardt.cpp b/unsupported/test/levenberg_marquardt.cpp
index 04464727d..64f168c16 100644
--- a/unsupported/test/levenberg_marquardt.cpp
+++ b/unsupported/test/levenberg_marquardt.cpp
@@ -9,6 +9,9 @@
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 
+// FIXME: These tests all check for hard-coded values. Ideally, parameters and start estimates should be randomized.
+
+
 #include <stdio.h>
 
 #include "main.h"
@@ -20,6 +23,9 @@
 
 using std::sqrt;
 
+// tolerance for chekcing number of iterations
+#define LM_EVAL_COUNT_TOL 4/3
+
 struct lmder_functor : DenseFunctor<double>
 {
     lmder_functor(void): DenseFunctor<double>(3,15) {}
@@ -275,7 +281,7 @@ const double chwirut2_functor::m_y[54] = { 92.9000E0 ,57.1000E0 ,31.0500E0 ,11.5
 void testNistChwirut2(void)
 {
   const int n=3;
-  int info;
+  LevenbergMarquardtSpace::Status info;
 
   VectorXd x(n);
 
@@ -610,7 +616,7 @@ const double lanczos1_functor::y[24] = { 2.513400000000E+00 ,2.044333373291E+00
 void testNistLanczos1(void)
 {
   const int n=6;
-  int info;
+  LevenbergMarquardtSpace::Status info;
 
   VectorXd x(n);
 
@@ -624,11 +630,11 @@ void testNistLanczos1(void)
   info = lm.minimize(x);
 
   // check return value
-  VERIFY_IS_EQUAL(info, 2);
+  VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeErrorTooSmall);
   VERIFY_IS_EQUAL(lm.nfev(), 79);
   VERIFY_IS_EQUAL(lm.njev(), 72);
   // check norm^2
-//   VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.430899764097e-25);  // should be 1.4307867721E-25, but nist results are on 128-bit floats
+  VERIFY(lm.fvec().squaredNorm() <= 1.4307867721E-25);
   // check x
   VERIFY_IS_APPROX(x[0], 9.5100000027E-02);
   VERIFY_IS_APPROX(x[1], 1.0000000001E+00);
@@ -645,11 +651,11 @@ void testNistLanczos1(void)
   info = lm.minimize(x);
 
   // check return value
-  VERIFY_IS_EQUAL(info, 2);
+  VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeErrorTooSmall);
   VERIFY_IS_EQUAL(lm.nfev(), 9);
   VERIFY_IS_EQUAL(lm.njev(), 8);
   // check norm^2
-//   VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.428595533845e-25);  // should be 1.4307867721E-25, but nist results are on 128-bit floats
+  VERIFY(lm.fvec().squaredNorm() <= 1.4307867721E-25);
   // check x
   VERIFY_IS_APPROX(x[0], 9.5100000027E-02);
   VERIFY_IS_APPROX(x[1], 1.0000000001E+00);
@@ -696,7 +702,7 @@ const double rat42_functor::y[9] = { 8.930E0 ,10.800E0 ,18.590E0 ,22.330E0 ,39.3
 void testNistRat42(void)
 {
   const int n=3;
-  int info;
+  LevenbergMarquardtSpace::Status info;
 
   VectorXd x(n);
 
@@ -710,7 +716,7 @@ void testNistRat42(void)
   info = lm.minimize(x);
 
   // check return value
-  VERIFY_IS_EQUAL(info, 1);
+  VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall);
   VERIFY_IS_EQUAL(lm.nfev(), 10);
   VERIFY_IS_EQUAL(lm.njev(), 8);
   // check norm^2
@@ -728,7 +734,7 @@ void testNistRat42(void)
   info = lm.minimize(x);
 
   // check return value
-  VERIFY_IS_EQUAL(info, 1);
+  VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall);
   VERIFY_IS_EQUAL(lm.nfev(), 6);
   VERIFY_IS_EQUAL(lm.njev(), 5);
   // check norm^2
@@ -774,7 +780,7 @@ const double MGH10_functor::y[16] = { 3.478000E+04, 2.861000E+04, 2.365000E+04,
 void testNistMGH10(void)
 {
   const int n=3;
-  int info;
+  LevenbergMarquardtSpace::Status info;
 
   VectorXd x(n);
 
@@ -786,17 +792,26 @@ void testNistMGH10(void)
   MGH10_functor functor;
   LevenbergMarquardt<MGH10_functor> lm(functor);
   info = lm.minimize(x);
+  ++g_test_level;
+  VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall);
+  --g_test_level;
+  // was: VERIFY_IS_EQUAL(info, 1);
 
-  // check return value
-  VERIFY_IS_EQUAL(info, 1); 
-  VERIFY_IS_EQUAL(lm.nfev(), 284 ); 
-  VERIFY_IS_EQUAL(lm.njev(), 249 ); 
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 8.7945855171E+01);
   // check x
   VERIFY_IS_APPROX(x[0], 5.6096364710E-03);
   VERIFY_IS_APPROX(x[1], 6.1813463463E+03);
   VERIFY_IS_APPROX(x[2], 3.4522363462E+02);
+  
+  // check return value
+
+  ++g_test_level;
+  VERIFY_IS_EQUAL(lm.nfev(), 284 );
+  VERIFY_IS_EQUAL(lm.njev(), 249 );
+  --g_test_level;
+  VERIFY(lm.nfev() < 284 * LM_EVAL_COUNT_TOL);
+  VERIFY(lm.njev() < 249 * LM_EVAL_COUNT_TOL);
 
   /*
    * Second try
@@ -804,17 +819,25 @@ void testNistMGH10(void)
   x<< 0.02, 4000., 250.;
   // do the computation
   info = lm.minimize(x);
+  ++g_test_level;
+  VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall);
+  // was: VERIFY_IS_EQUAL(info, 1);
+  --g_test_level;
 
-  // check return value
-  VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev(), 126);
-  VERIFY_IS_EQUAL(lm.njev(), 116);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 8.7945855171E+01);
   // check x
   VERIFY_IS_APPROX(x[0], 5.6096364710E-03);
   VERIFY_IS_APPROX(x[1], 6.1813463463E+03);
   VERIFY_IS_APPROX(x[2], 3.4522363462E+02);
+  
+  // check return value
+  ++g_test_level;
+  VERIFY_IS_EQUAL(lm.nfev(), 126);
+  VERIFY_IS_EQUAL(lm.njev(), 116);
+  --g_test_level;
+  VERIFY(lm.nfev() < 126 * LM_EVAL_COUNT_TOL);
+  VERIFY(lm.njev() < 116 * LM_EVAL_COUNT_TOL);
 }
 
 
@@ -866,15 +889,16 @@ void testNistBoxBOD(void)
   lm.setFactor(10);
   info = lm.minimize(x);
 
-  // check return value
-  VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev(), 31);
-  VERIFY_IS_EQUAL(lm.njev(), 25);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.1680088766E+03);
   // check x
   VERIFY_IS_APPROX(x[0], 2.1380940889E+02);
   VERIFY_IS_APPROX(x[1], 5.4723748542E-01);
+  
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  VERIFY(lm.nfev() < 31); // 31
+  VERIFY(lm.njev() < 25); // 25
 
   /*
    * Second try
@@ -888,8 +912,12 @@ void testNistBoxBOD(void)
 
   // check return value
   VERIFY_IS_EQUAL(info, 1); 
-  VERIFY_IS_EQUAL(lm.nfev(), 15 ); 
-  VERIFY_IS_EQUAL(lm.njev(), 14 ); 
+  ++g_test_level;
+  VERIFY_IS_EQUAL(lm.nfev(), 16 );
+  VERIFY_IS_EQUAL(lm.njev(), 15 );
+  --g_test_level;
+  VERIFY(lm.nfev() < 16 * LM_EVAL_COUNT_TOL);
+  VERIFY(lm.njev() < 15 * LM_EVAL_COUNT_TOL);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.1680088766E+03);
   // check x
@@ -948,10 +976,6 @@ void testNistMGH17(void)
   lm.setMaxfev(1000);
   info = lm.minimize(x);
 
-  // check return value
-//   VERIFY_IS_EQUAL(info, 2);  //FIXME Use (lm.info() == Success)
-//   VERIFY_IS_EQUAL(lm.nfev(), 602 ); 
-  VERIFY_IS_EQUAL(lm.njev(), 545 ); 
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.4648946975E-05);
   // check x
@@ -960,6 +984,11 @@ void testNistMGH17(void)
   VERIFY_IS_APPROX(x[2], -1.4646871366E+00);
   VERIFY_IS_APPROX(x[3], 1.2867534640E-02);
   VERIFY_IS_APPROX(x[4], 2.2122699662E-02);
+  
+    // check return value
+//   VERIFY_IS_EQUAL(info, 2);  //FIXME Use (lm.info() == Success)
+  VERIFY(lm.nfev() < 700 ); // 602
+  VERIFY(lm.njev() < 600 ); // 545
 
   /*
    * Second try
@@ -1035,10 +1064,6 @@ void testNistMGH09(void)
   lm.setMaxfev(1000);
   info = lm.minimize(x);
 
-  // check return value
-  VERIFY_IS_EQUAL(info, 1); 
-  VERIFY_IS_EQUAL(lm.nfev(), 490 ); 
-  VERIFY_IS_EQUAL(lm.njev(), 376 ); 
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 3.0750560385E-04);
   // check x
@@ -1046,6 +1071,10 @@ void testNistMGH09(void)
   VERIFY_IS_APPROX(x[1], 0.19126423573); // should be 1.9128232873E-01
   VERIFY_IS_APPROX(x[2], 0.12305309914); // should be 1.2305650693E-01
   VERIFY_IS_APPROX(x[3], 0.13605395375); // should be 1.3606233068E-01
+  // check return value
+  VERIFY_IS_EQUAL(info, 1); 
+  VERIFY(lm.nfev() < 510 ); // 490
+  VERIFY(lm.njev() < 400 ); // 376
 
   /*
    * Second try
diff --git a/unsupported/test/matrix_function.cpp b/unsupported/test/matrix_function.cpp
index 3c76cfb65..7c9b68a3c 100644
--- a/unsupported/test/matrix_function.cpp
+++ b/unsupported/test/matrix_function.cpp
@@ -102,7 +102,7 @@ void testMatrixExponential(const MatrixType& A)
   typedef typename NumTraits<Scalar>::Real RealScalar;
   typedef std::complex<RealScalar> ComplexScalar;
 
-  VERIFY_IS_APPROX(A.exp(), A.matrixFunction(StdStemFunctions<ComplexScalar>::exp));
+  VERIFY_IS_APPROX(A.exp(), A.matrixFunction(internal::stem_function_exp<ComplexScalar>));
 }
 
 template<typename MatrixType>
@@ -113,8 +113,8 @@ void testMatrixLogarithm(const MatrixType& A)
 
   MatrixType scaledA;
   RealScalar maxImagPartOfSpectrum = A.eigenvalues().imag().cwiseAbs().maxCoeff();
-  if (maxImagPartOfSpectrum >= 0.9 * M_PI)
-    scaledA = A * 0.9 * M_PI / maxImagPartOfSpectrum;
+  if (maxImagPartOfSpectrum >= RealScalar(0.9L * EIGEN_PI))
+    scaledA = A * RealScalar(0.9L * EIGEN_PI) / maxImagPartOfSpectrum;
   else
     scaledA = A;
 
diff --git a/unsupported/test/matrix_functions.h b/unsupported/test/matrix_functions.h
index 5817caef6..4e2636404 100644
--- a/unsupported/test/matrix_functions.h
+++ b/unsupported/test/matrix_functions.h
@@ -10,27 +10,47 @@
 #include "main.h"
 #include <unsupported/Eigen/MatrixFunctions>
 
+// For complex matrices, any matrix is fine.
+template<typename MatrixType, int IsComplex = NumTraits<typename internal::traits<MatrixType>::Scalar>::IsComplex>
+struct processTriangularMatrix
+{
+  static void run(MatrixType&, MatrixType&, const MatrixType&)
+  { }
+};
+
+// For real matrices, make sure none of the eigenvalues are negative.
+template<typename MatrixType>
+struct processTriangularMatrix<MatrixType,0>
+{
+  static void run(MatrixType& m, MatrixType& T, const MatrixType& U)
+  {
+    const Index size = m.cols();
+
+    for (Index i=0; i < size; ++i) {
+      if (i == size - 1 || T.coeff(i+1,i) == 0)
+        T.coeffRef(i,i) = std::abs(T.coeff(i,i));
+      else
+        ++i;
+    }
+    m = U * T * U.transpose();
+  }
+};
+
 template <typename MatrixType, int IsComplex = NumTraits<typename internal::traits<MatrixType>::Scalar>::IsComplex>
 struct generateTestMatrix;
 
-// for real matrices, make sure none of the eigenvalues are negative
 template <typename MatrixType>
 struct generateTestMatrix<MatrixType,0>
 {
   static void run(MatrixType& result, typename MatrixType::Index size)
   {
-    MatrixType mat = MatrixType::Random(size, size);
-    EigenSolver<MatrixType> es(mat);
-    typename EigenSolver<MatrixType>::EigenvalueType eivals = es.eigenvalues();
-    for (typename MatrixType::Index i = 0; i < size; ++i) {
-      if (eivals(i).imag() == 0 && eivals(i).real() < 0)
-	eivals(i) = -eivals(i);
-    }
-    result = (es.eigenvectors() * eivals.asDiagonal() * es.eigenvectors().inverse()).real();
+    result = MatrixType::Random(size, size);
+    RealSchur<MatrixType> schur(result);
+    MatrixType T = schur.matrixT();
+    processTriangularMatrix<MatrixType>::run(result, T, schur.matrixU());
   }
 };
 
-// for complex matrices, any matrix is fine
 template <typename MatrixType>
 struct generateTestMatrix<MatrixType,1>
 {
@@ -41,7 +61,7 @@ struct generateTestMatrix<MatrixType,1>
 };
 
 template <typename Derived, typename OtherDerived>
-double relerr(const MatrixBase<Derived>& A, const MatrixBase<OtherDerived>& B)
+typename Derived::RealScalar relerr(const MatrixBase<Derived>& A, const MatrixBase<OtherDerived>& B)
 {
   return std::sqrt((A - B).cwiseAbs2().sum() / (std::min)(A.cwiseAbs2().sum(), B.cwiseAbs2().sum()));
 }
diff --git a/unsupported/test/matrix_power.cpp b/unsupported/test/matrix_power.cpp
index b9d513b45..7ccfacfdf 100644
--- a/unsupported/test/matrix_power.cpp
+++ b/unsupported/test/matrix_power.cpp
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2012 Chen-Pang He <jdh8@ms63.hinet.net>
+// Copyright (C) 2012, 2013 Chen-Pang He <jdh8@ms63.hinet.net>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -9,35 +9,8 @@
 
 #include "matrix_functions.h"
 
-template <typename MatrixType, int IsComplex = NumTraits<typename MatrixType::Scalar>::IsComplex>
-struct generateTriangularMatrix;
-
-// for real matrices, make sure none of the eigenvalues are negative
-template <typename MatrixType>
-struct generateTriangularMatrix<MatrixType,0>
-{
-  static void run(MatrixType& result, typename MatrixType::Index size)
-  {
-    result.resize(size, size);
-    result.template triangularView<Upper>() = MatrixType::Random(size, size);
-    for (typename MatrixType::Index i = 0; i < size; ++i)
-      result.coeffRef(i,i) = std::abs(result.coeff(i,i));
-  }
-};
-
-// for complex matrices, any matrix is fine
-template <typename MatrixType>
-struct generateTriangularMatrix<MatrixType,1>
-{
-  static void run(MatrixType& result, typename MatrixType::Index size)
-  {
-    result.resize(size, size);
-    result.template triangularView<Upper>() = MatrixType::Random(size, size);
-  }
-};
-
 template<typename T>
-void test2dRotation(double tol)
+void test2dRotation(const T& tol)
 {
   Matrix<T,2,2> A, B, C;
   T angle, c, s;
@@ -46,19 +19,19 @@ void test2dRotation(double tol)
   MatrixPower<Matrix<T,2,2> > Apow(A);
 
   for (int i=0; i<=20; ++i) {
-    angle = pow(10, (i-10) / 5.);
+    angle = std::pow(T(10), (i-10) / T(5.));
     c = std::cos(angle);
     s = std::sin(angle);
     B << c, s, -s, c;
 
-    C = Apow(std::ldexp(angle,1) / M_PI);
+    C = Apow(std::ldexp(angle,1) / T(EIGEN_PI));
     std::cout << "test2dRotation: i = " << i << "   error powerm = " << relerr(C,B) << '\n';
-    VERIFY(C.isApprox(B, static_cast<T>(tol)));
+    VERIFY(C.isApprox(B, tol));
   }
 }
 
 template<typename T>
-void test2dHyperbolicRotation(double tol)
+void test2dHyperbolicRotation(const T& tol)
 {
   Matrix<std::complex<T>,2,2> A, B, C;
   T angle, ch = std::cosh((T)1);
@@ -75,12 +48,26 @@ void test2dHyperbolicRotation(double tol)
 
     C = Apow(angle);
     std::cout << "test2dHyperbolicRotation: i = " << i << "   error powerm = " << relerr(C,B) << '\n';
-    VERIFY(C.isApprox(B, static_cast<T>(tol)));
+    VERIFY(C.isApprox(B, tol));
+  }
+}
+
+template<typename T>
+void test3dRotation(const T& tol)
+{
+  Matrix<T,3,1> v;
+  T angle;
+
+  for (int i=0; i<=20; ++i) {
+    v = Matrix<T,3,1>::Random();
+    v.normalize();
+    angle = std::pow(T(10), (i-10) / T(5.));
+    VERIFY(AngleAxis<T>(angle, v).matrix().isApprox(AngleAxis<T>(1,v).matrix().pow(angle), tol));
   }
 }
 
 template<typename MatrixType>
-void testExponentLaws(const MatrixType& m, double tol)
+void testGeneral(const MatrixType& m, const typename MatrixType::RealScalar& tol)
 {
   typedef typename MatrixType::RealScalar RealScalar;
   MatrixType m1, m2, m3, m4, m5;
@@ -97,37 +84,121 @@ void testExponentLaws(const MatrixType& m, double tol)
 
     m4 = mpow(x+y);
     m5.noalias() = m2 * m3;
-    VERIFY(m4.isApprox(m5, static_cast<RealScalar>(tol)));
+    VERIFY(m4.isApprox(m5, tol));
 
     m4 = mpow(x*y);
     m5 = m2.pow(y);
-    VERIFY(m4.isApprox(m5, static_cast<RealScalar>(tol)));
+    VERIFY(m4.isApprox(m5, tol));
 
     m4 = (std::abs(x) * m1).pow(y);
     m5 = std::pow(std::abs(x), y) * m3;
-    VERIFY(m4.isApprox(m5, static_cast<RealScalar>(tol)));
+    VERIFY(m4.isApprox(m5, tol));
+  }
+}
+
+template<typename MatrixType>
+void testSingular(const MatrixType& m_const, const typename MatrixType::RealScalar& tol)
+{
+  // we need to pass by reference in order to prevent errors with
+  // MSVC for aligned data types ...
+  MatrixType& m = const_cast<MatrixType&>(m_const);
+
+  const int IsComplex = NumTraits<typename internal::traits<MatrixType>::Scalar>::IsComplex;
+  typedef typename internal::conditional<IsComplex, TriangularView<MatrixType,Upper>, const MatrixType&>::type TriangularType;
+  typename internal::conditional< IsComplex, ComplexSchur<MatrixType>, RealSchur<MatrixType> >::type schur;
+  MatrixType T;
+
+  for (int i=0; i < g_repeat; ++i) {
+    m.setRandom();
+    m.col(0).fill(0);
+
+    schur.compute(m);
+    T = schur.matrixT();
+    const MatrixType& U = schur.matrixU();
+    processTriangularMatrix<MatrixType>::run(m, T, U);
+    MatrixPower<MatrixType> mpow(m);
+
+    T = T.sqrt();
+    VERIFY(mpow(0.5L).isApprox(U * (TriangularType(T) * U.adjoint()), tol));
+
+    T = T.sqrt();
+    VERIFY(mpow(0.25L).isApprox(U * (TriangularType(T) * U.adjoint()), tol));
+
+    T = T.sqrt();
+    VERIFY(mpow(0.125L).isApprox(U * (TriangularType(T) * U.adjoint()), tol));
+  }
+}
+
+template<typename MatrixType>
+void testLogThenExp(const MatrixType& m_const, const typename MatrixType::RealScalar& tol)
+{
+  // we need to pass by reference in order to prevent errors with
+  // MSVC for aligned data types ...
+  MatrixType& m = const_cast<MatrixType&>(m_const);
+
+  typedef typename MatrixType::Scalar Scalar;
+  Scalar x;
+
+  for (int i=0; i < g_repeat; ++i) {
+    generateTestMatrix<MatrixType>::run(m, m.rows());
+    x = internal::random<Scalar>();
+    VERIFY(m.pow(x).isApprox((x * m.log()).exp(), tol));
   }
 }
 
 typedef Matrix<double,3,3,RowMajor>         Matrix3dRowMajor;
+typedef Matrix<long double,3,3>             Matrix3e;
 typedef Matrix<long double,Dynamic,Dynamic> MatrixXe;
  
 void test_matrix_power()
 {
   CALL_SUBTEST_2(test2dRotation<double>(1e-13));
   CALL_SUBTEST_1(test2dRotation<float>(2e-5));  // was 1e-5, relaxed for clang 2.8 / linux / x86-64
-  CALL_SUBTEST_9(test2dRotation<long double>(1e-13)); 
+  CALL_SUBTEST_9(test2dRotation<long double>(1e-13L));
   CALL_SUBTEST_2(test2dHyperbolicRotation<double>(1e-14));
   CALL_SUBTEST_1(test2dHyperbolicRotation<float>(1e-5));
-  CALL_SUBTEST_9(test2dHyperbolicRotation<long double>(1e-14));
-
-  CALL_SUBTEST_2(testExponentLaws(Matrix2d(),         1e-13));
-  CALL_SUBTEST_7(testExponentLaws(Matrix3dRowMajor(), 1e-13));
-  CALL_SUBTEST_3(testExponentLaws(Matrix4cd(),        1e-13));
-  CALL_SUBTEST_4(testExponentLaws(MatrixXd(8,8),      2e-12));
-  CALL_SUBTEST_1(testExponentLaws(Matrix2f(),         1e-4));
-  CALL_SUBTEST_5(testExponentLaws(Matrix3cf(),        1e-4));
-  CALL_SUBTEST_8(testExponentLaws(Matrix4f(),         1e-4));
-  CALL_SUBTEST_6(testExponentLaws(MatrixXf(2,2),      1e-3)); // see bug 614
-  CALL_SUBTEST_9(testExponentLaws(MatrixXe(7,7),      1e-13));
+  CALL_SUBTEST_9(test2dHyperbolicRotation<long double>(1e-14L));
+
+  CALL_SUBTEST_10(test3dRotation<double>(1e-13));
+  CALL_SUBTEST_11(test3dRotation<float>(1e-5));
+  CALL_SUBTEST_12(test3dRotation<long double>(1e-13L));
+
+  CALL_SUBTEST_2(testGeneral(Matrix2d(),         1e-13));
+  CALL_SUBTEST_7(testGeneral(Matrix3dRowMajor(), 1e-13));
+  CALL_SUBTEST_3(testGeneral(Matrix4cd(),        1e-13));
+  CALL_SUBTEST_4(testGeneral(MatrixXd(8,8),      2e-12));
+  CALL_SUBTEST_1(testGeneral(Matrix2f(),         1e-4));
+  CALL_SUBTEST_5(testGeneral(Matrix3cf(),        1e-4));
+  CALL_SUBTEST_8(testGeneral(Matrix4f(),         1e-4));
+  CALL_SUBTEST_6(testGeneral(MatrixXf(2,2),      1e-3)); // see bug 614
+  CALL_SUBTEST_9(testGeneral(MatrixXe(7,7),      1e-13L));
+  CALL_SUBTEST_10(testGeneral(Matrix3d(),        1e-13));
+  CALL_SUBTEST_11(testGeneral(Matrix3f(),        1e-4));
+  CALL_SUBTEST_12(testGeneral(Matrix3e(),        1e-13L));
+
+  CALL_SUBTEST_2(testSingular(Matrix2d(),         1e-13));
+  CALL_SUBTEST_7(testSingular(Matrix3dRowMajor(), 1e-13));
+  CALL_SUBTEST_3(testSingular(Matrix4cd(),        1e-13));
+  CALL_SUBTEST_4(testSingular(MatrixXd(8,8),      2e-12));
+  CALL_SUBTEST_1(testSingular(Matrix2f(),         1e-4));
+  CALL_SUBTEST_5(testSingular(Matrix3cf(),        1e-4));
+  CALL_SUBTEST_8(testSingular(Matrix4f(),         1e-4));
+  CALL_SUBTEST_6(testSingular(MatrixXf(2,2),      1e-3));
+  CALL_SUBTEST_9(testSingular(MatrixXe(7,7),      1e-13L));
+  CALL_SUBTEST_10(testSingular(Matrix3d(),        1e-13));
+  CALL_SUBTEST_11(testSingular(Matrix3f(),        1e-4));
+  CALL_SUBTEST_12(testSingular(Matrix3e(),        1e-13L));
+
+  CALL_SUBTEST_2(testLogThenExp(Matrix2d(),         1e-13));
+  CALL_SUBTEST_7(testLogThenExp(Matrix3dRowMajor(), 1e-13));
+  CALL_SUBTEST_3(testLogThenExp(Matrix4cd(),        1e-13));
+  CALL_SUBTEST_4(testLogThenExp(MatrixXd(8,8),      2e-12));
+  CALL_SUBTEST_1(testLogThenExp(Matrix2f(),         1e-4));
+  CALL_SUBTEST_5(testLogThenExp(Matrix3cf(),        1e-4));
+  CALL_SUBTEST_8(testLogThenExp(Matrix4f(),         1e-4));
+  CALL_SUBTEST_6(testLogThenExp(MatrixXf(2,2),      1e-3));
+  CALL_SUBTEST_9(testLogThenExp(MatrixXe(7,7),      1e-13L));
+  CALL_SUBTEST_10(testLogThenExp(Matrix3d(),        1e-13));
+  CALL_SUBTEST_11(testLogThenExp(Matrix3f(),        1e-4));
+  CALL_SUBTEST_12(testLogThenExp(Matrix3e(),        1e-13L));
 }
diff --git a/unsupported/test/minres.cpp b/unsupported/test/minres.cpp
index 509ebe09a..8b300b78a 100644
--- a/unsupported/test/minres.cpp
+++ b/unsupported/test/minres.cpp
@@ -1,8 +1,8 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2011 Gael Guennebaud <g.gael@free.fr>
 // Copyright (C) 2012 Giacomo Po <gpo@ucla.edu>
+// Copyright (C) 2011 Gael Guennebaud <g.gael@free.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -14,21 +14,14 @@
 
 template<typename T> void test_minres_T()
 {
-  MINRES<SparseMatrix<T>, Lower|Upper, DiagonalPreconditioner<T> > minres_colmajor_diag;
+  // Identity preconditioner
   MINRES<SparseMatrix<T>, Lower, IdentityPreconditioner    > minres_colmajor_lower_I;
   MINRES<SparseMatrix<T>, Upper, IdentityPreconditioner    > minres_colmajor_upper_I;
-//  MINRES<SparseMatrix<T>, Lower, IncompleteLUT<T> >           minres_colmajor_ilut;
-  //minres<SparseMatrix<T>, SSORPreconditioner<T> >     minres_colmajor_ssor;
-
-
-//   CALL_SUBTEST( check_sparse_square_solving(minres_colmajor_diag)  );
- // CALL_SUBTEST( check_sparse_square_solving(minres_colmajor_ilut)     );
-  //CALL_SUBTEST( check_sparse_square_solving(minres_colmajor_ssor)     );
 
   // Diagonal preconditioner
   MINRES<SparseMatrix<T>, Lower, DiagonalPreconditioner<T> > minres_colmajor_lower_diag;
   MINRES<SparseMatrix<T>, Upper, DiagonalPreconditioner<T> > minres_colmajor_upper_diag;
-  MINRES<SparseMatrix<T>, Upper|Lower, DiagonalPreconditioner<T> > minres_colmajor_uplo_diag;
+  MINRES<SparseMatrix<T>, Lower|Upper, DiagonalPreconditioner<T> > minres_colmajor_uplo_diag;
   
   // call tests for SPD matrix
   CALL_SUBTEST( check_sparse_spd_solving(minres_colmajor_lower_I) );
@@ -36,14 +29,16 @@ template<typename T> void test_minres_T()
     
   CALL_SUBTEST( check_sparse_spd_solving(minres_colmajor_lower_diag)  );
   CALL_SUBTEST( check_sparse_spd_solving(minres_colmajor_upper_diag)  );
-//   CALL_SUBTEST( check_sparse_spd_solving(minres_colmajor_uplo_diag)  );
+  CALL_SUBTEST( check_sparse_spd_solving(minres_colmajor_uplo_diag)  );
     
   // TO DO: symmetric semi-definite matrix
   // TO DO: symmetric indefinite matrix
+
 }
 
 void test_minres()
 {
   CALL_SUBTEST_1(test_minres_T<double>());
-//  CALL_SUBTEST_2(test_minres_T<std::complex<double> >());
+//  CALL_SUBTEST_2(test_minres_T<std::compex<double> >());
+
 }
diff --git a/unsupported/test/mpreal/mpreal.h b/unsupported/test/mpreal/mpreal.h
index 7d6f4e79f..8404f1ff8 100644
--- a/unsupported/test/mpreal/mpreal.h
+++ b/unsupported/test/mpreal/mpreal.h
@@ -1,33 +1,34 @@
 /*
-    MPFR C++: Multi-precision floating point number class for C++. 
+    MPFR C++: Multi-precision floating point number class for C++.
     Based on MPFR library:    http://mpfr.org
 
     Project homepage:    http://www.holoborodko.com/pavel/mpfr
     Contact e-mail:      pavel@holoborodko.com
 
-    Copyright (c) 2008-2014 Pavel Holoborodko
+    Copyright (c) 2008-2015 Pavel Holoborodko
 
     Contributors:
-    Dmitriy Gubanov, Konstantin Holoborodko, Brian Gladman, 
-    Helmut Jarausch, Fokko Beekhof, Ulrich Mutze, Heinz van Saanen, 
-    Pere Constans, Peter van Hoof, Gael Guennebaud, Tsai Chia Cheng, 
+    Dmitriy Gubanov, Konstantin Holoborodko, Brian Gladman,
+    Helmut Jarausch, Fokko Beekhof, Ulrich Mutze, Heinz van Saanen,
+    Pere Constans, Peter van Hoof, Gael Guennebaud, Tsai Chia Cheng,
     Alexei Zubanov, Jauhien Piatlicki, Victor Berger, John Westwood,
-    Petr Aleksandrov, Orion Poplawski, Charles Karney.
+    Petr Aleksandrov, Orion Poplawski, Charles Karney, Arash Partow,
+    Rodney James, Jorge Leitao.
 
     Licensing:
     (A) MPFR C++ is under GNU General Public License ("GPL").
-    
-    (B) Non-free licenses may also be purchased from the author, for users who 
+
+    (B) Non-free licenses may also be purchased from the author, for users who
         do not want their programs protected by the GPL.
 
-        The non-free licenses are for users that wish to use MPFR C++ in 
-        their products but are unwilling to release their software 
-        under the GPL (which would require them to release source code 
+        The non-free licenses are for users that wish to use MPFR C++ in
+        their products but are unwilling to release their software
+        under the GPL (which would require them to release source code
         and allow free redistribution).
 
         Such users can purchase an unlimited-use license from the author.
         Contact us for more details.
-    
+
     GNU General Public License ("GPL") copyright permissions statement:
     **************************************************************************
     This program is free software: you can redistribute it and/or modify
@@ -55,10 +56,10 @@
 #include <cmath>
 #include <cstring>
 #include <limits>
+#include <complex>
+#include <algorithm>
 
 // Options
-// FIXME HAVE_INT64_SUPPORT leads to clashes with long int and int64_t on some systems.
-//#define MPREAL_HAVE_INT64_SUPPORT               // Enable int64_t support if possible. Available only for MSVC 2010 & GCC.
 #define MPREAL_HAVE_MSVC_DEBUGVIEW              // Enable Debugger Visualizer for "Debug" builds in MSVC.
 #define MPREAL_HAVE_DYNAMIC_STD_NUMERIC_LIMITS  // Enable extended std::numeric_limits<mpfr::mpreal> specialization.
                                                 // Meaning that "digits", "round_style" and similar members are defined as functions, not constants.
@@ -66,19 +67,17 @@
 
 // Library version
 #define MPREAL_VERSION_MAJOR 3
-#define MPREAL_VERSION_MINOR 5
-#define MPREAL_VERSION_PATCHLEVEL 9
-#define MPREAL_VERSION_STRING "3.5.9"
+#define MPREAL_VERSION_MINOR 6
+#define MPREAL_VERSION_PATCHLEVEL 2
+#define MPREAL_VERSION_STRING "3.6.2"
 
 // Detect compiler using signatures from http://predef.sourceforge.net/
-#if defined(__GNUC__) && defined(__INTEL_COMPILER)
-    #define IsInf(x) isinf(x)                   // Intel ICC compiler on Linux 
-
-#elif defined(_MSC_VER)                         // Microsoft Visual C++ 
-    #define IsInf(x) (!_finite(x))                           
-
+#if defined(__GNUC__)
+    #define IsInf(x) (isinf)(x)                 // GNU C++/Intel ICC compiler on Linux
+#elif defined(_MSC_VER)                         // Microsoft Visual C++
+    #define IsInf(x) (!_finite(x))
 #else
-    #define IsInf(x) std::isinf(x)              // GNU C/C++ (and/or other compilers), just hope for C99 conformance
+    #define IsInf(x) (std::isinf)(x)              // GNU C/C++ (and/or other compilers), just hope for C99 conformance
 #endif
 
 // A Clang feature extension to determine compiler features.
@@ -93,54 +92,27 @@
 
     #define MPREAL_HAVE_MOVE_SUPPORT
 
-    // Use fields in mpfr_t structure to check if it was initialized / set dummy initialization 
+    // Use fields in mpfr_t structure to check if it was initialized / set dummy initialization
     #define mpfr_is_initialized(x)      (0 != (x)->_mpfr_d)
     #define mpfr_set_uninitialized(x)   ((x)->_mpfr_d = 0 )
 #endif
 
-// Detect support for explicit converters. 
+// Detect support for explicit converters.
 #if (__has_feature(cxx_explicit_conversions) || \
-       defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L || \
-      (defined(_MSC_VER) && _MSC_VER >= 1800))
+       (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GNUC_MINOR__ >= 5) || __cplusplus >= 201103L || \
+       (defined(_MSC_VER) && _MSC_VER >= 1800))
 
     #define MPREAL_HAVE_EXPLICIT_CONVERTERS
 #endif
 
-// Detect available 64-bit capabilities
-#if defined(MPREAL_HAVE_INT64_SUPPORT)
-    
-    #define MPFR_USE_INTMAX_T                   // Should be defined before mpfr.h
-
-    #if defined(_MSC_VER)                       // MSVC + Windows
-        #if (_MSC_VER >= 1600)                    
-            #include <stdint.h>                 // <stdint.h> is available only in msvc2010!
-
-        #else                                   // MPFR relies on intmax_t which is available only in msvc2010
-            #undef MPREAL_HAVE_INT64_SUPPORT    // Besides, MPFR & MPIR have to be compiled with msvc2010
-            #undef MPFR_USE_INTMAX_T            // Since we cannot detect this, disable x64 by default
-                                                // Someone should change this manually if needed.
-        #endif
-
-    #elif defined (__GNUC__) && defined(__linux__)
-        #if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(__ia64) || defined(__itanium__) || defined(_M_IA64) || defined (__PPC64__)
-            #undef MPREAL_HAVE_INT64_SUPPORT    // Remove all shaman dances for x64 builds since
-            #undef MPFR_USE_INTMAX_T            // GCC already supports x64 as of "long int" is 64-bit integer, nothing left to do
-        #else
-            #include <stdint.h>                 // use int64_t, uint64_t otherwise
-        #endif
-
-    #else
-        #include <stdint.h>                     // rely on int64_t, uint64_t in all other cases, Mac OSX, etc.
-    #endif
-
-#endif 
+#define MPFR_USE_INTMAX_T   // Enable 64-bit integer types - should be defined before mpfr.h
 
 #if defined(MPREAL_HAVE_MSVC_DEBUGVIEW) && defined(_MSC_VER) && defined(_DEBUG)
     #define MPREAL_MSVC_DEBUGVIEW_CODE     DebugView = toString();
     #define MPREAL_MSVC_DEBUGVIEW_DATA     std::string DebugView;
 #else
-    #define MPREAL_MSVC_DEBUGVIEW_CODE 
-    #define MPREAL_MSVC_DEBUGVIEW_DATA 
+    #define MPREAL_MSVC_DEBUGVIEW_CODE
+    #define MPREAL_MSVC_DEBUGVIEW_DATA
 #endif
 
 #include <mpfr.h>
@@ -150,9 +122,15 @@
 #endif
 
 // Less important options
-#define MPREAL_DOUBLE_BITS_OVERFLOW -1          // Triggers overflow exception during conversion to double if mpreal 
+#define MPREAL_DOUBLE_BITS_OVERFLOW -1          // Triggers overflow exception during conversion to double if mpreal
                                                 // cannot fit in MPREAL_DOUBLE_BITS_OVERFLOW bits
                                                 // = -1 disables overflow checks (default)
+
+// Fast replacement for mpfr_set_zero(x, +1):
+// (a) uses low-level data members, might not be compatible with new versions of MPFR
+// (b) sign is not set, add (x)->_mpfr_sign = 1;
+#define mpfr_set_zero_fast(x)  ((x)->_mpfr_exp = __MPFR_EXP_ZERO)
+
 #if defined(__GNUC__)
   #define MPREAL_PERMISSIVE_EXPR __extension__
 #else
@@ -164,9 +142,9 @@ namespace mpfr {
 class mpreal {
 private:
     mpfr_t mp;
-    
+
 public:
-    
+
     // Get default rounding mode & precision
     inline static mp_rnd_t   get_default_rnd()    {    return (mp_rnd_t)(mpfr_get_default_rounding_mode());       }
     inline static mp_prec_t  get_default_prec()   {    return mpfr_get_default_prec();                            }
@@ -174,29 +152,26 @@ public:
     // Constructors && type conversions
     mpreal();
     mpreal(const mpreal& u);
-    mpreal(const mpf_t u);    
-    mpreal(const mpz_t u,             mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());    
-    mpreal(const mpq_t u,             mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());    
-    mpreal(const double u,            mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const long double u,       mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const unsigned long int u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const unsigned int u,      mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const long int u,          mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const int u,               mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-    
-    // Construct mpreal from mpfr_t structure.
-    // shared = true allows to avoid deep copy, so that mpreal and 'u' share the same data & pointers.    
-    mpreal(const mpfr_t  u, bool shared = false);   
+    mpreal(const mpf_t u);
+    mpreal(const mpz_t u,                  mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
+    mpreal(const mpq_t u,                  mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
+    mpreal(const double u,                 mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
+    mpreal(const long double u,            mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
+    mpreal(const unsigned long long int u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
+    mpreal(const long long int u,          mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
+    mpreal(const unsigned long int u,      mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
+    mpreal(const unsigned int u,           mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
+    mpreal(const long int u,               mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
+    mpreal(const int u,                    mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
 
-#if defined (MPREAL_HAVE_INT64_SUPPORT)
-    mpreal(const uint64_t u,          mp_prec_t prec = mpreal::get_default_prec(),  mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const int64_t u,           mp_prec_t prec = mpreal::get_default_prec(),  mp_rnd_t mode = mpreal::get_default_rnd());
-#endif
+    // Construct mpreal from mpfr_t structure.
+    // shared = true allows to avoid deep copy, so that mpreal and 'u' share the same data & pointers.
+    mpreal(const mpfr_t  u, bool shared = false);
 
     mpreal(const char* s,             mp_prec_t prec = mpreal::get_default_prec(), int base = 10, mp_rnd_t mode = mpreal::get_default_rnd());
     mpreal(const std::string& s,      mp_prec_t prec = mpreal::get_default_prec(), int base = 10, mp_rnd_t mode = mpreal::get_default_rnd());
 
-    ~mpreal();                           
+    ~mpreal();
 
 #ifdef MPREAL_HAVE_MOVE_SUPPORT
     mpreal& operator=(mpreal&& v);
@@ -205,7 +180,7 @@ public:
 
     // Operations
     // =
-    // +, -, *, /, ++, --, <<, >> 
+    // +, -, *, /, ++, --, <<, >>
     // *=, +=, -=, /=,
     // <, >, ==, <=, >=
 
@@ -215,13 +190,16 @@ public:
     mpreal& operator=(const mpz_t v);
     mpreal& operator=(const mpq_t v);
     mpreal& operator=(const long double v);
-    mpreal& operator=(const double v);        
+    mpreal& operator=(const double v);
     mpreal& operator=(const unsigned long int v);
+    mpreal& operator=(const unsigned long long int v);
+    mpreal& operator=(const long long int v);
     mpreal& operator=(const unsigned int v);
     mpreal& operator=(const long int v);
     mpreal& operator=(const int v);
     mpreal& operator=(const char* s);
     mpreal& operator=(const std::string& s);
+    template <typename real_t> mpreal& operator= (const std::complex<real_t>& z);
 
     // +
     mpreal& operator+=(const mpreal& v);
@@ -235,20 +213,18 @@ public:
     mpreal& operator+=(const long int u);
     mpreal& operator+=(const int u);
 
-#if defined (MPREAL_HAVE_INT64_SUPPORT)
-    mpreal& operator+=(const int64_t  u);
-    mpreal& operator+=(const uint64_t u);
-    mpreal& operator-=(const int64_t  u);
-    mpreal& operator-=(const uint64_t u);
-    mpreal& operator*=(const int64_t  u);
-    mpreal& operator*=(const uint64_t u);
-    mpreal& operator/=(const int64_t  u);
-    mpreal& operator/=(const uint64_t u);
-#endif 
+    mpreal& operator+=(const long long int  u);
+    mpreal& operator+=(const unsigned long long int u);
+    mpreal& operator-=(const long long int  u);
+    mpreal& operator-=(const unsigned long long int u);
+    mpreal& operator*=(const long long int  u);
+    mpreal& operator*=(const unsigned long long int u);
+    mpreal& operator/=(const long long int  u);
+    mpreal& operator/=(const unsigned long long int u);
 
     const mpreal operator+() const;
     mpreal& operator++ ();
-    const mpreal  operator++ (int); 
+    const mpreal  operator++ (int);
 
     // -
     mpreal& operator-=(const mpreal& v);
@@ -266,7 +242,7 @@ public:
     friend const mpreal operator-(const long int b,          const mpreal& a);
     friend const mpreal operator-(const int b,               const mpreal& a);
     friend const mpreal operator-(const double b,            const mpreal& a);
-    mpreal& operator-- ();    
+    mpreal& operator-- ();
     const mpreal  operator-- (int);
 
     // *
@@ -279,7 +255,7 @@ public:
     mpreal& operator*=(const unsigned int v);
     mpreal& operator*=(const long int v);
     mpreal& operator*=(const int v);
-    
+
     // /
     mpreal& operator/=(const mpreal& v);
     mpreal& operator/=(const mpz_t v);
@@ -308,51 +284,27 @@ public:
     mpreal& operator>>=(const long int u);
     mpreal& operator>>=(const int u);
 
-    // Boolean Operators
-    friend bool operator >  (const mpreal& a, const mpreal& b);
-    friend bool operator >= (const mpreal& a, const mpreal& b);
-    friend bool operator <  (const mpreal& a, const mpreal& b);
-    friend bool operator <= (const mpreal& a, const mpreal& b);
-    friend bool operator == (const mpreal& a, const mpreal& b);
-    friend bool operator != (const mpreal& a, const mpreal& b);
-
-    // Optimized specializations for boolean operators
-    friend bool operator == (const mpreal& a, const unsigned long int b);
-    friend bool operator == (const mpreal& a, const unsigned int b);
-    friend bool operator == (const mpreal& a, const long int b);
-    friend bool operator == (const mpreal& a, const int b);
-    friend bool operator == (const mpreal& a, const long double b);
-    friend bool operator == (const mpreal& a, const double b);
-
     // Type Conversion operators
-    bool            toBool      (mp_rnd_t mode = GMP_RNDZ)    const;
-    long            toLong      (mp_rnd_t mode = GMP_RNDZ)    const;
-    unsigned long   toULong     (mp_rnd_t mode = GMP_RNDZ)    const;
-    float           toFloat     (mp_rnd_t mode = GMP_RNDN)    const;
-    double          toDouble    (mp_rnd_t mode = GMP_RNDN)    const;
-    long double     toLDouble   (mp_rnd_t mode = GMP_RNDN)    const;
+    bool               toBool      (                        )    const;
+    long               toLong      (mp_rnd_t mode = GMP_RNDZ)    const;
+    unsigned long      toULong     (mp_rnd_t mode = GMP_RNDZ)    const;
+    long long          toLLong     (mp_rnd_t mode = GMP_RNDZ)    const;
+    unsigned long long toULLong    (mp_rnd_t mode = GMP_RNDZ)    const;
+    float              toFloat     (mp_rnd_t mode = GMP_RNDN)    const;
+    double             toDouble    (mp_rnd_t mode = GMP_RNDN)    const;
+    long double        toLDouble   (mp_rnd_t mode = GMP_RNDN)    const;
 
 #if defined (MPREAL_HAVE_EXPLICIT_CONVERTERS)
-    explicit operator bool               () const { return toBool();       }
-    explicit operator int                () const { return toLong();       }
-    explicit operator long               () const { return toLong();       }
-    explicit operator long long          () const { return toLong();       }
-    explicit operator unsigned           () const { return toULong();      }
-    explicit operator unsigned long      () const { return toULong();      }
-    explicit operator unsigned long long () const { return toULong();      }
-    explicit operator float              () const { return toFloat();      }
-    explicit operator double             () const { return toDouble();     }
-    explicit operator long double        () const { return toLDouble();    }
-#endif
-
-#if defined (MPREAL_HAVE_INT64_SUPPORT)
-    int64_t         toInt64     (mp_rnd_t mode = GMP_RNDZ)    const;
-    uint64_t        toUInt64    (mp_rnd_t mode = GMP_RNDZ)    const;
-
-    #if defined (MPREAL_HAVE_EXPLICIT_CONVERTERS)
-    explicit operator int64_t   () const { return toInt64();      }
-    explicit operator uint64_t  () const { return toUInt64();     }
-    #endif
+    explicit operator bool               () const { return toBool();                 }
+    explicit operator int                () const { return int(toLong());            }
+    explicit operator long               () const { return toLong();                 }
+    explicit operator long long          () const { return toLLong();                }
+    explicit operator unsigned           () const { return unsigned(toULong());      }
+    explicit operator unsigned long      () const { return toULong();                }
+    explicit operator unsigned long long () const { return toULLong();               }
+    explicit operator float              () const { return toFloat();                }
+    explicit operator double             () const { return toDouble();               }
+    explicit operator long double        () const { return toLDouble();              }
 #endif
 
     // Get raw pointers so that mpreal can be directly used in raw mpfr_* functions
@@ -391,11 +343,12 @@ public:
     friend inline const mpreal div_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode);
     friend inline const mpreal div_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode);
     friend int cmpabs(const mpreal& a,const mpreal& b);
-    
+
     friend const mpreal log  (const mpreal& v, mp_rnd_t rnd_mode);
     friend const mpreal log2 (const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal logb (const mpreal& v, mp_rnd_t rnd_mode);
     friend const mpreal log10(const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal exp  (const mpreal& v, mp_rnd_t rnd_mode); 
+    friend const mpreal exp  (const mpreal& v, mp_rnd_t rnd_mode);
     friend const mpreal exp2 (const mpreal& v, mp_rnd_t rnd_mode);
     friend const mpreal exp10(const mpreal& v, mp_rnd_t rnd_mode);
     friend const mpreal log1p(const mpreal& v, mp_rnd_t rnd_mode);
@@ -436,21 +389,22 @@ public:
     friend const mpreal eint   (const mpreal& v, mp_rnd_t rnd_mode);
 
     friend const mpreal gamma    (const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal tgamma   (const mpreal& v, mp_rnd_t rnd_mode);
     friend const mpreal lngamma  (const mpreal& v, mp_rnd_t rnd_mode);
     friend const mpreal lgamma   (const mpreal& v, int *signp, mp_rnd_t rnd_mode);
     friend const mpreal zeta     (const mpreal& v, mp_rnd_t rnd_mode);
     friend const mpreal erf      (const mpreal& v, mp_rnd_t rnd_mode);
     friend const mpreal erfc     (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal besselj0 (const mpreal& v, mp_rnd_t rnd_mode); 
-    friend const mpreal besselj1 (const mpreal& v, mp_rnd_t rnd_mode); 
+    friend const mpreal besselj0 (const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal besselj1 (const mpreal& v, mp_rnd_t rnd_mode);
     friend const mpreal besseljn (long n, const mpreal& v, mp_rnd_t rnd_mode);
     friend const mpreal bessely0 (const mpreal& v, mp_rnd_t rnd_mode);
     friend const mpreal bessely1 (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal besselyn (long n, const mpreal& v, mp_rnd_t rnd_mode); 
+    friend const mpreal besselyn (long n, const mpreal& v, mp_rnd_t rnd_mode);
     friend const mpreal fma      (const mpreal& v1, const mpreal& v2, const mpreal& v3, mp_rnd_t rnd_mode);
     friend const mpreal fms      (const mpreal& v1, const mpreal& v2, const mpreal& v3, mp_rnd_t rnd_mode);
     friend const mpreal agm      (const mpreal& v1, const mpreal& v2, mp_rnd_t rnd_mode);
-    friend const mpreal sum      (const mpreal tab[], unsigned long int n, mp_rnd_t rnd_mode);
+    friend const mpreal sum      (const mpreal tab[], const unsigned long int n, int& status, mp_rnd_t rnd_mode);
     friend int sgn(const mpreal& v); // returns -1 or +1
 
 // MPFR 2.4.0 Specifics
@@ -465,28 +419,26 @@ public:
     friend const mpreal mod (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode); // Modulus after division
 #endif
 
-// MPFR 3.0.0 Specifics
 #if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
     friend const mpreal digamma (const mpreal& v,        mp_rnd_t rnd_mode);
     friend const mpreal ai      (const mpreal& v,        mp_rnd_t rnd_mode);
     friend const mpreal urandom (gmp_randstate_t& state, mp_rnd_t rnd_mode);     // use gmp_randinit_default() to init state, gmp_randclear() to clear
+#endif
+
+#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,1,0))
     friend const mpreal grandom (gmp_randstate_t& state, mp_rnd_t rnd_mode);     // use gmp_randinit_default() to init state, gmp_randclear() to clear
     friend const mpreal grandom (unsigned int seed);
 #endif
-    
+
     // Uniformly distributed random number generation in [0,1] using
     // Mersenne-Twister algorithm by default.
     // Use parameter to setup seed, e.g.: random((unsigned)time(NULL))
     // Check urandom() for more precise control.
     friend const mpreal random(unsigned int seed);
 
-    // Exponent and mantissa manipulation
-    friend const mpreal frexp(const mpreal& v, mp_exp_t* exp);    
-    friend const mpreal ldexp(const mpreal& v, mp_exp_t exp);
-
     // Splits mpreal value into fractional and integer parts.
     // Returns fractional part and stores integer part in n.
-    friend const mpreal modf(const mpreal& v, mpreal& n);    
+    friend const mpreal modf(const mpreal& v, mpreal& n);
 
     // Constants
     // don't forget to call mpfr_free_cache() for every thread where you are using const-functions
@@ -515,14 +467,14 @@ public:
     friend const mpreal frac        (const mpreal& v, mp_rnd_t rnd_mode);
     friend const mpreal remainder   (         const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
     friend const mpreal remquo      (long* q, const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
-    
+
     // Miscellaneous Functions
     friend const mpreal nexttoward (const mpreal& x, const mpreal& y);
     friend const mpreal nextabove  (const mpreal& x);
     friend const mpreal nextbelow  (const mpreal& x);
 
     // use gmp_randinit_default() to init state, gmp_randclear() to clear
-    friend const mpreal urandomb (gmp_randstate_t& state); 
+    friend const mpreal urandomb (gmp_randstate_t& state);
 
 // MPFR < 2.4.2 Specifics
 #if (MPFR_VERSION <= MPFR_VERSION_NUM(2,4,2))
@@ -530,9 +482,9 @@ public:
 #endif
 
     // Instance Checkers
-    friend bool isnan    (const mpreal& v);
-    friend bool isinf    (const mpreal& v);
-    friend bool isfinite (const mpreal& v);
+    friend bool (isnan)    (const mpreal& v);
+    friend bool (isinf)    (const mpreal& v);
+    friend bool (isfinite) (const mpreal& v);
 
     friend bool isnum    (const mpreal& v);
     friend bool iszero   (const mpreal& v);
@@ -549,9 +501,9 @@ public:
     // Aliases for get_prec(), set_prec() - needed for compatibility with std::complex<mpreal> interface
     inline mpreal&      setPrecision(int Precision, mp_rnd_t RoundingMode = get_default_rnd());
     inline int          getPrecision() const;
-    
+
     // Set mpreal to +/- inf, NaN, +/-0
-    mpreal&        setInf  (int Sign = +1);    
+    mpreal&        setInf  (int Sign = +1);
     mpreal&        setNan  ();
     mpreal&        setZero (int Sign = +1);
     mpreal&        setSign (int Sign, mp_rnd_t RoundingMode = get_default_rnd());
@@ -560,7 +512,7 @@ public:
     mp_exp_t get_exp();
     int set_exp(mp_exp_t e);
     int check_range  (int t, mp_rnd_t rnd_mode = get_default_rnd());
-    int subnormalize (int t,mp_rnd_t rnd_mode = get_default_rnd());
+    int subnormalize (int t, mp_rnd_t rnd_mode = get_default_rnd());
 
     // Inexact conversion from float
     inline bool fits_in_bits(double x, int n);
@@ -580,7 +532,7 @@ public:
 
     // Efficient swapping of two mpreal values - needed for std algorithms
     friend void swap(mpreal& x, mpreal& y);
-    
+
     friend const mpreal fmax(const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
     friend const mpreal fmin(const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
 
@@ -590,7 +542,7 @@ private:
     //
     // mpfr::mpreal=<DebugView>                              ; Show value only
     // mpfr::mpreal=<DebugView>, <mp[0]._mpfr_prec,u>bits    ; Show value & precision
-    // 
+    //
     // at the beginning of
     // [Visual Studio Installation Folder]\Common7\Packages\Debugger\autoexp.dat
     MPREAL_MSVC_DEBUGVIEW_DATA
@@ -609,15 +561,15 @@ public:
 //////////////////////////////////////////////////////////////////////////
 // Constructors & converters
 // Default constructor: creates mp number and initializes it to 0.
-inline mpreal::mpreal() 
-{ 
-    mpfr_init2 (mpfr_ptr(), mpreal::get_default_prec()); 
-    mpfr_set_ui(mpfr_ptr(), 0, mpreal::get_default_rnd());
+inline mpreal::mpreal()
+{
+    mpfr_init2(mpfr_ptr(), mpreal::get_default_prec());
+    mpfr_set_zero_fast(mpfr_ptr());
 
     MPREAL_MSVC_DEBUGVIEW_CODE;
 }
 
-inline mpreal::mpreal(const mpreal& u) 
+inline mpreal::mpreal(const mpreal& u)
 {
     mpfr_init2(mpfr_ptr(),mpfr_get_prec(u.mpfr_srcptr()));
     mpfr_set  (mpfr_ptr(),u.mpfr_srcptr(),mpreal::get_default_rnd());
@@ -628,7 +580,7 @@ inline mpreal::mpreal(const mpreal& u)
 #ifdef MPREAL_HAVE_MOVE_SUPPORT
 inline mpreal::mpreal(mpreal&& other)
 {
-    mpfr_set_uninitialized(mpfr_ptr());     // make sure "other" holds no pinter to actual data 
+    mpfr_set_uninitialized(mpfr_ptr());     // make sure "other" holds no pointer to actual data
     mpfr_swap(mpfr_ptr(), other.mpfr_ptr());
 
     MPREAL_MSVC_DEBUGVIEW_CODE;
@@ -700,67 +652,65 @@ inline mpreal::mpreal(const double u, mp_prec_t prec, mp_rnd_t mode)
 }
 
 inline mpreal::mpreal(const long double u, mp_prec_t prec, mp_rnd_t mode)
-{ 
+{
     mpfr_init2 (mpfr_ptr(), prec);
     mpfr_set_ld(mpfr_ptr(), u, mode);
 
     MPREAL_MSVC_DEBUGVIEW_CODE;
 }
 
-inline mpreal::mpreal(const unsigned long int u, mp_prec_t prec, mp_rnd_t mode)
-{ 
+inline mpreal::mpreal(const unsigned long long int u, mp_prec_t prec, mp_rnd_t mode)
+{
     mpfr_init2 (mpfr_ptr(), prec);
-    mpfr_set_ui(mpfr_ptr(), u, mode);
+    mpfr_set_uj(mpfr_ptr(), u, mode);
 
     MPREAL_MSVC_DEBUGVIEW_CODE;
 }
 
-inline mpreal::mpreal(const unsigned int u, mp_prec_t prec, mp_rnd_t mode)
-{ 
+inline mpreal::mpreal(const long long int u, mp_prec_t prec, mp_rnd_t mode)
+{
     mpfr_init2 (mpfr_ptr(), prec);
-    mpfr_set_ui(mpfr_ptr(), u, mode);
+    mpfr_set_sj(mpfr_ptr(), u, mode);
 
     MPREAL_MSVC_DEBUGVIEW_CODE;
 }
 
-inline mpreal::mpreal(const long int u, mp_prec_t prec, mp_rnd_t mode)
-{ 
+inline mpreal::mpreal(const unsigned long int u, mp_prec_t prec, mp_rnd_t mode)
+{
     mpfr_init2 (mpfr_ptr(), prec);
-    mpfr_set_si(mpfr_ptr(), u, mode);
+    mpfr_set_ui(mpfr_ptr(), u, mode);
 
     MPREAL_MSVC_DEBUGVIEW_CODE;
 }
 
-inline mpreal::mpreal(const int u, mp_prec_t prec, mp_rnd_t mode)
-{ 
+inline mpreal::mpreal(const unsigned int u, mp_prec_t prec, mp_rnd_t mode)
+{
     mpfr_init2 (mpfr_ptr(), prec);
-    mpfr_set_si(mpfr_ptr(), u, mode);
+    mpfr_set_ui(mpfr_ptr(), u, mode);
 
     MPREAL_MSVC_DEBUGVIEW_CODE;
 }
 
-#if defined (MPREAL_HAVE_INT64_SUPPORT)
-inline mpreal::mpreal(const uint64_t u, mp_prec_t prec, mp_rnd_t mode)
+inline mpreal::mpreal(const long int u, mp_prec_t prec, mp_rnd_t mode)
 {
     mpfr_init2 (mpfr_ptr(), prec);
-    mpfr_set_uj(mpfr_ptr(), u, mode); 
+    mpfr_set_si(mpfr_ptr(), u, mode);
 
     MPREAL_MSVC_DEBUGVIEW_CODE;
 }
 
-inline mpreal::mpreal(const int64_t u, mp_prec_t prec, mp_rnd_t mode)
+inline mpreal::mpreal(const int u, mp_prec_t prec, mp_rnd_t mode)
 {
     mpfr_init2 (mpfr_ptr(), prec);
-    mpfr_set_sj(mpfr_ptr(), u, mode); 
+    mpfr_set_si(mpfr_ptr(), u, mode);
 
     MPREAL_MSVC_DEBUGVIEW_CODE;
 }
-#endif
 
 inline mpreal::mpreal(const char* s, mp_prec_t prec, int base, mp_rnd_t mode)
 {
     mpfr_init2  (mpfr_ptr(), prec);
-    mpfr_set_str(mpfr_ptr(), s, base, mode); 
+    mpfr_set_str(mpfr_ptr(), s, base, mode);
 
     MPREAL_MSVC_DEBUGVIEW_CODE;
 }
@@ -768,7 +718,7 @@ inline mpreal::mpreal(const char* s, mp_prec_t prec, int base, mp_rnd_t mode)
 inline mpreal::mpreal(const std::string& s, mp_prec_t prec, int base, mp_rnd_t mode)
 {
     mpfr_init2  (mpfr_ptr(), prec);
-    mpfr_set_str(mpfr_ptr(), s.c_str(), base, mode); 
+    mpfr_set_str(mpfr_ptr(), s.c_str(), base, mode);
 
     MPREAL_MSVC_DEBUGVIEW_CODE;
 }
@@ -776,15 +726,15 @@ inline mpreal::mpreal(const std::string& s, mp_prec_t prec, int base, mp_rnd_t m
 inline void mpreal::clear(::mpfr_ptr x)
 {
 #ifdef MPREAL_HAVE_MOVE_SUPPORT
-    if(mpfr_is_initialized(x)) 
+    if(mpfr_is_initialized(x))
 #endif
     mpfr_clear(x);
 }
 
-inline mpreal::~mpreal() 
-{ 
+inline mpreal::~mpreal()
+{
     clear(mpfr_ptr());
-}                           
+}
 
 // internal namespace needed for template magic
 namespace internal{
@@ -792,58 +742,55 @@ namespace internal{
     // Use SFINAE to restrict arithmetic operations instantiation only for numeric types
     // This is needed for smooth integration with libraries based on expression templates, like Eigen.
     // TODO: Do the same for boolean operators.
-    template <typename ArgumentType> struct result_type {};    
-    
-    template <> struct result_type<mpreal>              {typedef mpreal type;};    
-    template <> struct result_type<mpz_t>               {typedef mpreal type;};    
-    template <> struct result_type<mpq_t>               {typedef mpreal type;};    
-    template <> struct result_type<long double>         {typedef mpreal type;};    
-    template <> struct result_type<double>              {typedef mpreal type;};    
-    template <> struct result_type<unsigned long int>   {typedef mpreal type;};    
-    template <> struct result_type<unsigned int>        {typedef mpreal type;};    
-    template <> struct result_type<long int>            {typedef mpreal type;};    
-    template <> struct result_type<int>                 {typedef mpreal type;};    
-
-#if defined (MPREAL_HAVE_INT64_SUPPORT)
-    template <> struct result_type<int64_t  >           {typedef mpreal type;};    
-    template <> struct result_type<uint64_t >           {typedef mpreal type;};    
-#endif
+    template <typename ArgumentType> struct result_type {};
+
+    template <> struct result_type<mpreal>              {typedef mpreal type;};
+    template <> struct result_type<mpz_t>               {typedef mpreal type;};
+    template <> struct result_type<mpq_t>               {typedef mpreal type;};
+    template <> struct result_type<long double>         {typedef mpreal type;};
+    template <> struct result_type<double>              {typedef mpreal type;};
+    template <> struct result_type<unsigned long int>   {typedef mpreal type;};
+    template <> struct result_type<unsigned int>        {typedef mpreal type;};
+    template <> struct result_type<long int>            {typedef mpreal type;};
+    template <> struct result_type<int>                 {typedef mpreal type;};
+    template <> struct result_type<long long>           {typedef mpreal type;};
+    template <> struct result_type<unsigned long long>  {typedef mpreal type;};
 }
 
 // + Addition
-template <typename Rhs> 
-inline const typename internal::result_type<Rhs>::type 
+template <typename Rhs>
+inline const typename internal::result_type<Rhs>::type
     operator+(const mpreal& lhs, const Rhs& rhs){ return mpreal(lhs) += rhs;    }
 
-template <typename Lhs> 
-inline const typename internal::result_type<Lhs>::type 
-    operator+(const Lhs& lhs, const mpreal& rhs){ return mpreal(rhs) += lhs;    } 
+template <typename Lhs>
+inline const typename internal::result_type<Lhs>::type
+    operator+(const Lhs& lhs, const mpreal& rhs){ return mpreal(rhs) += lhs;    }
 
 // - Subtraction
-template <typename Rhs> 
-inline const typename internal::result_type<Rhs>::type 
+template <typename Rhs>
+inline const typename internal::result_type<Rhs>::type
     operator-(const mpreal& lhs, const Rhs& rhs){ return mpreal(lhs) -= rhs;    }
 
-template <typename Lhs> 
-inline const typename internal::result_type<Lhs>::type 
+template <typename Lhs>
+inline const typename internal::result_type<Lhs>::type
     operator-(const Lhs& lhs, const mpreal& rhs){ return mpreal(lhs) -= rhs;    }
 
 // * Multiplication
-template <typename Rhs> 
-inline const typename internal::result_type<Rhs>::type 
+template <typename Rhs>
+inline const typename internal::result_type<Rhs>::type
     operator*(const mpreal& lhs, const Rhs& rhs){ return mpreal(lhs) *= rhs;    }
 
-template <typename Lhs> 
-inline const typename internal::result_type<Lhs>::type 
-    operator*(const Lhs& lhs, const mpreal& rhs){ return mpreal(rhs) *= lhs;    } 
+template <typename Lhs>
+inline const typename internal::result_type<Lhs>::type
+    operator*(const Lhs& lhs, const mpreal& rhs){ return mpreal(rhs) *= lhs;    }
 
 // / Division
-template <typename Rhs> 
-inline const typename internal::result_type<Rhs>::type 
+template <typename Rhs>
+inline const typename internal::result_type<Rhs>::type
     operator/(const mpreal& lhs, const Rhs& rhs){ return mpreal(lhs) /= rhs;    }
 
-template <typename Lhs> 
-inline const typename internal::result_type<Lhs>::type 
+template <typename Lhs>
+inline const typename internal::result_type<Lhs>::type
     operator/(const Lhs& lhs, const mpreal& rhs){ return mpreal(lhs) /= rhs;    }
 
 //////////////////////////////////////////////////////////////////////////
@@ -893,17 +840,17 @@ const mpreal pow(const long int a, const double b, mp_rnd_t rnd_mode = mpreal::g
 const mpreal pow(const int a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
 const mpreal pow(const int a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
 const mpreal pow(const int a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const int a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); 
+const mpreal pow(const int a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
 const mpreal pow(const int a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const int a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); 
+const mpreal pow(const int a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
 
-const mpreal pow(const long double a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());    
+const mpreal pow(const long double a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
 const mpreal pow(const long double a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
 const mpreal pow(const long double a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
 const mpreal pow(const long double a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
 const mpreal pow(const long double a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
 
-const mpreal pow(const double a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());    
+const mpreal pow(const double a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
 const mpreal pow(const double a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
 const mpreal pow(const double a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
 const mpreal pow(const double a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
@@ -920,9 +867,9 @@ inline const mpreal div_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode = mpr
 inline mpreal machine_epsilon(mp_prec_t prec = mpreal::get_default_prec());
 
 // Returns smallest eps such that x + eps != x (relative machine epsilon)
-inline mpreal machine_epsilon(const mpreal& x);        
+inline mpreal machine_epsilon(const mpreal& x);
 
-// Gives max & min values for the required precision, 
+// Gives max & min values for the required precision,
 // minval is 'safe' meaning 1 / minval does not overflow
 // maxval is 'safe' meaning 1 / maxval does not underflow
 inline mpreal minval(mp_prec_t prec = mpreal::get_default_prec());
@@ -935,13 +882,13 @@ inline bool isEqualFuzzy(const mpreal& a, const mpreal& b, const mpreal& eps);
 inline bool isEqualFuzzy(const mpreal& a, const mpreal& b);
 
 // 'Bitwise' equality check
-//  maxUlps - a and b can be apart by maxUlps binary numbers. 
+//  maxUlps - a and b can be apart by maxUlps binary numbers.
 inline bool isEqualUlps(const mpreal& a, const mpreal& b, int maxUlps);
 
 //////////////////////////////////////////////////////////////////////////
-//     Convert precision in 'bits' to decimal digits and vice versa.
-//        bits   = ceil(digits*log[2](10))
-//        digits = floor(bits*log[10](2))
+// Convert precision in 'bits' to decimal digits and vice versa.
+//    bits   = ceil(digits*log[2](10))
+//    digits = floor(bits*log[10](2))
 
 inline mp_prec_t digits2bits(int d);
 inline int       bits2digits(mp_prec_t b);
@@ -979,7 +926,7 @@ inline mpreal& mpreal::operator=(const mpreal& v)
 inline mpreal& mpreal::operator=(const mpf_t v)
 {
     mpfr_set_f(mpfr_ptr(), v, mpreal::get_default_rnd());
-    
+
     MPREAL_MSVC_DEBUGVIEW_CODE;
     return *this;
 }
@@ -987,7 +934,7 @@ inline mpreal& mpreal::operator=(const mpf_t v)
 inline mpreal& mpreal::operator=(const mpz_t v)
 {
     mpfr_set_z(mpfr_ptr(), v, mpreal::get_default_rnd());
-    
+
     MPREAL_MSVC_DEBUGVIEW_CODE;
     return *this;
 }
@@ -1000,16 +947,16 @@ inline mpreal& mpreal::operator=(const mpq_t v)
     return *this;
 }
 
-inline mpreal& mpreal::operator=(const long double v)        
-{    
+inline mpreal& mpreal::operator=(const long double v)
+{
     mpfr_set_ld(mpfr_ptr(), v, mpreal::get_default_rnd());
 
     MPREAL_MSVC_DEBUGVIEW_CODE;
     return *this;
 }
 
-inline mpreal& mpreal::operator=(const double v)                
-{   
+inline mpreal& mpreal::operator=(const double v)
+{
 #if (MPREAL_DOUBLE_BITS_OVERFLOW > -1)
   if(fits_in_bits(v, MPREAL_DOUBLE_BITS_OVERFLOW))
   {
@@ -1024,33 +971,49 @@ inline mpreal& mpreal::operator=(const double v)
     return *this;
 }
 
-inline mpreal& mpreal::operator=(const unsigned long int v)    
-{    
-    mpfr_set_ui(mpfr_ptr(), v, mpreal::get_default_rnd());    
+inline mpreal& mpreal::operator=(const unsigned long int v)
+{
+    mpfr_set_ui(mpfr_ptr(), v, mpreal::get_default_rnd());
+
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+inline mpreal& mpreal::operator=(const unsigned int v)
+{
+    mpfr_set_ui(mpfr_ptr(), v, mpreal::get_default_rnd());
+
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+inline mpreal& mpreal::operator=(const unsigned long long int v)
+{
+    mpfr_set_uj(mpfr_ptr(), v, mpreal::get_default_rnd());
 
     MPREAL_MSVC_DEBUGVIEW_CODE;
     return *this;
 }
 
-inline mpreal& mpreal::operator=(const unsigned int v)        
-{    
-    mpfr_set_ui(mpfr_ptr(), v, mpreal::get_default_rnd());    
+inline mpreal& mpreal::operator=(const long long int v)
+{
+    mpfr_set_sj(mpfr_ptr(), v, mpreal::get_default_rnd());
 
     MPREAL_MSVC_DEBUGVIEW_CODE;
     return *this;
 }
 
-inline mpreal& mpreal::operator=(const long int v)            
-{    
-    mpfr_set_si(mpfr_ptr(), v, mpreal::get_default_rnd());    
+inline mpreal& mpreal::operator=(const long int v)
+{
+    mpfr_set_si(mpfr_ptr(), v, mpreal::get_default_rnd());
 
     MPREAL_MSVC_DEBUGVIEW_CODE;
     return *this;
 }
 
 inline mpreal& mpreal::operator=(const int v)
-{    
-    mpfr_set_si(mpfr_ptr(), v, mpreal::get_default_rnd());    
+{
+    mpfr_set_si(mpfr_ptr(), v, mpreal::get_default_rnd());
 
     MPREAL_MSVC_DEBUGVIEW_CODE;
     return *this;
@@ -1071,7 +1034,7 @@ inline mpreal& mpreal::operator=(const char* s)
 
     if(0 == mpfr_set_str(t, s, 10, mpreal::get_default_rnd()))
     {
-        mpfr_set(mpfr_ptr(), t, mpreal::get_default_rnd()); 
+        mpfr_set(mpfr_ptr(), t, mpreal::get_default_rnd());
         MPREAL_MSVC_DEBUGVIEW_CODE;
     }
 
@@ -1094,7 +1057,7 @@ inline mpreal& mpreal::operator=(const std::string& s)
 
     if(0 == mpfr_set_str(t, s.c_str(), 10, mpreal::get_default_rnd()))
     {
-        mpfr_set(mpfr_ptr(), t, mpreal::get_default_rnd()); 
+        mpfr_set(mpfr_ptr(), t, mpreal::get_default_rnd());
         MPREAL_MSVC_DEBUGVIEW_CODE;
     }
 
@@ -1102,6 +1065,11 @@ inline mpreal& mpreal::operator=(const std::string& s)
     return *this;
 }
 
+template <typename real_t>
+inline mpreal& mpreal::operator= (const std::complex<real_t>& z)
+{
+    return *this = z.real();
+}
 
 //////////////////////////////////////////////////////////////////////////
 // + Addition
@@ -1135,9 +1103,9 @@ inline mpreal& mpreal::operator+=(const mpq_t u)
 
 inline mpreal& mpreal::operator+= (const long double u)
 {
-    *this += mpreal(u);    
+    *this += mpreal(u);
     MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;    
+    return *this;
 }
 
 inline mpreal& mpreal::operator+= (const double u)
@@ -1180,16 +1148,14 @@ inline mpreal& mpreal::operator+=(const int u)
     return *this;
 }
 
-#if defined (MPREAL_HAVE_INT64_SUPPORT)
-inline mpreal& mpreal::operator+=(const int64_t  u){    *this += mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this;    }
-inline mpreal& mpreal::operator+=(const uint64_t u){    *this += mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this;    }
-inline mpreal& mpreal::operator-=(const int64_t  u){    *this -= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this;    }
-inline mpreal& mpreal::operator-=(const uint64_t u){    *this -= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this;    }
-inline mpreal& mpreal::operator*=(const int64_t  u){    *this *= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this;    }
-inline mpreal& mpreal::operator*=(const uint64_t u){    *this *= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this;    }
-inline mpreal& mpreal::operator/=(const int64_t  u){    *this /= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this;    }
-inline mpreal& mpreal::operator/=(const uint64_t u){    *this /= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this;    }
-#endif 
+inline mpreal& mpreal::operator+=(const long long int u)         {    *this += mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this;    }
+inline mpreal& mpreal::operator+=(const unsigned long long int u){    *this += mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this;    }
+inline mpreal& mpreal::operator-=(const long long int  u)        {    *this -= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this;    }
+inline mpreal& mpreal::operator-=(const unsigned long long int u){    *this -= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this;    }
+inline mpreal& mpreal::operator*=(const long long int  u)        {    *this *= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this;    }
+inline mpreal& mpreal::operator*=(const unsigned long long int u){    *this *= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this;    }
+inline mpreal& mpreal::operator/=(const long long int  u)        {    *this /= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this;    }
+inline mpreal& mpreal::operator/=(const unsigned long long int u){    *this /= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this;    }
 
 inline const mpreal mpreal::operator+()const    {    return mpreal(*this); }
 
@@ -1200,7 +1166,7 @@ inline const mpreal operator+(const mpreal& a, const mpreal& b)
   return c;
 }
 
-inline mpreal& mpreal::operator++() 
+inline mpreal& mpreal::operator++()
 {
     return *this += 1;
 }
@@ -1212,7 +1178,7 @@ inline const mpreal mpreal::operator++ (int)
     return x;
 }
 
-inline mpreal& mpreal::operator--() 
+inline mpreal& mpreal::operator--()
 {
     return *this -= 1;
 }
@@ -1249,9 +1215,9 @@ inline mpreal& mpreal::operator-=(const mpq_t v)
 
 inline mpreal& mpreal::operator-=(const long double v)
 {
-    *this -= mpreal(v);    
+    *this -= mpreal(v);
     MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;    
+    return *this;
 }
 
 inline mpreal& mpreal::operator-=(const double v)
@@ -1259,7 +1225,7 @@ inline mpreal& mpreal::operator-=(const double v)
 #if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
     mpfr_sub_d(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
 #else
-    *this -= mpreal(v);    
+    *this -= mpreal(v);
 #endif
 
     MPREAL_MSVC_DEBUGVIEW_CODE;
@@ -1374,9 +1340,9 @@ inline mpreal& mpreal::operator*=(const mpq_t v)
 
 inline mpreal& mpreal::operator*=(const long double v)
 {
-    *this *= mpreal(v);    
+    *this *= mpreal(v);
     MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;    
+    return *this;
 }
 
 inline mpreal& mpreal::operator*=(const double v)
@@ -1384,7 +1350,7 @@ inline mpreal& mpreal::operator*=(const double v)
 #if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
     mpfr_mul_d(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
 #else
-    *this *= mpreal(v);    
+    *this *= mpreal(v);
 #endif
     MPREAL_MSVC_DEBUGVIEW_CODE;
     return *this;
@@ -1452,7 +1418,7 @@ inline mpreal& mpreal::operator/=(const long double v)
 {
     *this /= mpreal(v);
     MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;    
+    return *this;
 }
 
 inline mpreal& mpreal::operator/=(const double v)
@@ -1460,7 +1426,7 @@ inline mpreal& mpreal::operator/=(const double v)
 #if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
     mpfr_div_d(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
 #else
-    *this /= mpreal(v);    
+    *this /= mpreal(v);
 #endif
     MPREAL_MSVC_DEBUGVIEW_CODE;
     return *this;
@@ -1671,45 +1637,86 @@ inline const mpreal div_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode)
 }
 
 //////////////////////////////////////////////////////////////////////////
-//Boolean operators
-inline bool operator >  (const mpreal& a, const mpreal& b){    return (mpfr_greater_p       (a.mpfr_srcptr(),b.mpfr_srcptr()) !=0 );    }
-inline bool operator >= (const mpreal& a, const mpreal& b){    return (mpfr_greaterequal_p  (a.mpfr_srcptr(),b.mpfr_srcptr()) !=0 );    }
-inline bool operator <  (const mpreal& a, const mpreal& b){    return (mpfr_less_p          (a.mpfr_srcptr(),b.mpfr_srcptr()) !=0 );    }
-inline bool operator <= (const mpreal& a, const mpreal& b){    return (mpfr_lessequal_p     (a.mpfr_srcptr(),b.mpfr_srcptr()) !=0 );    }
-inline bool operator == (const mpreal& a, const mpreal& b){    return (mpfr_equal_p         (a.mpfr_srcptr(),b.mpfr_srcptr()) !=0 );    }
-inline bool operator != (const mpreal& a, const mpreal& b){    return (mpfr_lessgreater_p   (a.mpfr_srcptr(),b.mpfr_srcptr()) !=0 );    }
-
-inline bool operator == (const mpreal& a, const unsigned long int b ){    return (mpfr_cmp_ui(a.mpfr_srcptr(),b) == 0 );    }
-inline bool operator == (const mpreal& a, const unsigned int b      ){    return (mpfr_cmp_ui(a.mpfr_srcptr(),b) == 0 );    }
-inline bool operator == (const mpreal& a, const long int b          ){    return (mpfr_cmp_si(a.mpfr_srcptr(),b) == 0 );    }
-inline bool operator == (const mpreal& a, const int b               ){    return (mpfr_cmp_si(a.mpfr_srcptr(),b) == 0 );    }
-inline bool operator == (const mpreal& a, const long double b       ){    return (mpfr_cmp_ld(a.mpfr_srcptr(),b) == 0 );    }
-inline bool operator == (const mpreal& a, const double b            ){    return (mpfr_cmp_d (a.mpfr_srcptr(),b) == 0 );    }
-
-
-inline bool isnan    (const mpreal& op){    return (mpfr_nan_p    (op.mpfr_srcptr()) != 0 );    }
-inline bool isinf    (const mpreal& op){    return (mpfr_inf_p    (op.mpfr_srcptr()) != 0 );    }
-inline bool isfinite (const mpreal& op){    return (mpfr_number_p (op.mpfr_srcptr()) != 0 );    }
+//Relational operators
+
+// WARNING:
+//
+// Please note that following checks for double-NaN are guaranteed to work only in IEEE math mode:
+//
+// isnan(b) =  (b != b)
+// isnan(b) = !(b == b)  (we use in code below)
+//
+// Be cautions if you use compiler options which break strict IEEE compliance (e.g. -ffast-math in GCC).
+// Use std::isnan instead (C++11).
+
+inline bool operator >  (const mpreal& a, const mpreal& b           ){  return (mpfr_greater_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 );            }
+inline bool operator >  (const mpreal& a, const unsigned long int b ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) > 0 );                 }
+inline bool operator >  (const mpreal& a, const unsigned int b      ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) > 0 );                 }
+inline bool operator >  (const mpreal& a, const long int b          ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) > 0 );                 }
+inline bool operator >  (const mpreal& a, const int b               ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) > 0 );                 }
+inline bool operator >  (const mpreal& a, const long double b       ){  return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) > 0 );    }
+inline bool operator >  (const mpreal& a, const double b            ){  return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) > 0 );    }
+
+inline bool operator >= (const mpreal& a, const mpreal& b           ){  return (mpfr_greaterequal_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 );       }
+inline bool operator >= (const mpreal& a, const unsigned long int b ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) >= 0 );                }
+// inline bool operator >= (const mpreal& a, const unsigned int b      ){  return !isnan EIGEN_NOT_A_MACRO (isnan()a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) >= 0 );                }
+inline bool operator >= (const mpreal& a, const long int b          ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) >= 0 );                }
+inline bool operator >= (const mpreal& a, const int b               ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) >= 0 );                }
+inline bool operator >= (const mpreal& a, const long double b       ){  return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) >= 0 );   }
+inline bool operator >= (const mpreal& a, const double b            ){  return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) >= 0 );   }
+
+inline bool operator <  (const mpreal& a, const mpreal& b           ){  return (mpfr_less_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 );               }
+inline bool operator <  (const mpreal& a, const unsigned long int b ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) < 0 );                 }
+inline bool operator <  (const mpreal& a, const unsigned int b      ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) < 0 );                 }
+inline bool operator <  (const mpreal& a, const long int b          ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) < 0 );                 }
+inline bool operator <  (const mpreal& a, const int b               ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) < 0 );                 }
+inline bool operator <  (const mpreal& a, const long double b       ){  return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) < 0 );    }
+inline bool operator <  (const mpreal& a, const double b            ){  return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) < 0 );    }
+
+inline bool operator <= (const mpreal& a, const mpreal& b           ){  return (mpfr_lessequal_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 );          }
+inline bool operator <= (const mpreal& a, const unsigned long int b ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) <= 0 );                }
+inline bool operator <= (const mpreal& a, const unsigned int b      ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) <= 0 );                }
+inline bool operator <= (const mpreal& a, const long int b          ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) <= 0 );                }
+inline bool operator <= (const mpreal& a, const int b               ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) <= 0 );                }
+inline bool operator <= (const mpreal& a, const long double b       ){  return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) <= 0 );   }
+inline bool operator <= (const mpreal& a, const double b            ){  return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) <= 0 );   }
+
+inline bool operator == (const mpreal& a, const mpreal& b           ){  return (mpfr_equal_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 );              }
+inline bool operator == (const mpreal& a, const unsigned long int b ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) == 0 );                }
+inline bool operator == (const mpreal& a, const unsigned int b      ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) == 0 );                }
+inline bool operator == (const mpreal& a, const long int b          ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) == 0 );                }
+inline bool operator == (const mpreal& a, const int b               ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) == 0 );                }
+inline bool operator == (const mpreal& a, const long double b       ){  return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) == 0 );   }
+inline bool operator == (const mpreal& a, const double b            ){  return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) == 0 );   }
+
+inline bool operator != (const mpreal& a, const mpreal& b           ){  return !(a == b);  }
+inline bool operator != (const mpreal& a, const unsigned long int b ){  return !(a == b);  }
+inline bool operator != (const mpreal& a, const unsigned int b      ){  return !(a == b);  }
+inline bool operator != (const mpreal& a, const long int b          ){  return !(a == b);  }
+inline bool operator != (const mpreal& a, const int b               ){  return !(a == b);  }
+inline bool operator != (const mpreal& a, const long double b       ){  return !(a == b);  }
+inline bool operator != (const mpreal& a, const double b            ){  return !(a == b);  }
+
+inline bool (isnan)    (const mpreal& op){    return (mpfr_nan_p    (op.mpfr_srcptr()) != 0 );    }
+inline bool (isinf)    (const mpreal& op){    return (mpfr_inf_p    (op.mpfr_srcptr()) != 0 );    }
+inline bool (isfinite) (const mpreal& op){    return (mpfr_number_p (op.mpfr_srcptr()) != 0 );    }
 inline bool iszero   (const mpreal& op){    return (mpfr_zero_p   (op.mpfr_srcptr()) != 0 );    }
 inline bool isint    (const mpreal& op){    return (mpfr_integer_p(op.mpfr_srcptr()) != 0 );    }
 
 #if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
 inline bool isregular(const mpreal& op){    return (mpfr_regular_p(op.mpfr_srcptr()));}
-#endif 
+#endif
 
 //////////////////////////////////////////////////////////////////////////
 // Type Converters
-inline bool             mpreal::toBool (mp_rnd_t /*mode*/) const   {    return  mpfr_zero_p (mpfr_srcptr()) == 0;     }
-inline long             mpreal::toLong   (mp_rnd_t mode)  const    {    return  mpfr_get_si (mpfr_srcptr(), mode);    }
-inline unsigned long    mpreal::toULong  (mp_rnd_t mode)  const    {    return  mpfr_get_ui (mpfr_srcptr(), mode);    }
-inline float            mpreal::toFloat  (mp_rnd_t mode)  const    {    return  mpfr_get_flt(mpfr_srcptr(), mode);    }
-inline double           mpreal::toDouble (mp_rnd_t mode)  const    {    return  mpfr_get_d  (mpfr_srcptr(), mode);    }
-inline long double      mpreal::toLDouble(mp_rnd_t mode)  const    {    return  mpfr_get_ld (mpfr_srcptr(), mode);    }
-
-#if defined (MPREAL_HAVE_INT64_SUPPORT)
-inline int64_t      mpreal::toInt64 (mp_rnd_t mode)    const{    return mpfr_get_sj(mpfr_srcptr(), mode);    }
-inline uint64_t     mpreal::toUInt64(mp_rnd_t mode)    const{    return mpfr_get_uj(mpfr_srcptr(), mode);    }
-#endif
+inline bool               mpreal::toBool   (             )  const    {    return  mpfr_zero_p (mpfr_srcptr()) == 0;     }
+inline long               mpreal::toLong   (mp_rnd_t mode)  const    {    return  mpfr_get_si (mpfr_srcptr(), mode);    }
+inline unsigned long      mpreal::toULong  (mp_rnd_t mode)  const    {    return  mpfr_get_ui (mpfr_srcptr(), mode);    }
+inline float              mpreal::toFloat  (mp_rnd_t mode)  const    {    return  mpfr_get_flt(mpfr_srcptr(), mode);    }
+inline double             mpreal::toDouble (mp_rnd_t mode)  const    {    return  mpfr_get_d  (mpfr_srcptr(), mode);    }
+inline long double        mpreal::toLDouble(mp_rnd_t mode)  const    {    return  mpfr_get_ld (mpfr_srcptr(), mode);    }
+inline long long          mpreal::toLLong  (mp_rnd_t mode)  const    {    return  mpfr_get_sj (mpfr_srcptr(), mode);    }
+inline unsigned long long mpreal::toULLong (mp_rnd_t mode)  const    {    return  mpfr_get_uj (mpfr_srcptr(), mode);    }
 
 inline ::mpfr_ptr     mpreal::mpfr_ptr()             { return mp; }
 inline ::mpfr_srcptr  mpreal::mpfr_ptr()    const    { return mp; }
@@ -1755,21 +1762,21 @@ inline std::string mpreal::toString(int n, int b, mp_rnd_t mode) const
 
     std::ostringstream format;
 
-    int digits = (n >= 0) ? n : bits2digits(mpfr_get_prec(mpfr_srcptr()));
-    
+    int digits = (n >= 0) ? n : 1 + bits2digits(mpfr_get_prec(mpfr_srcptr()));
+
     format << "%." << digits << "RNg";
 
     return toString(format.str());
 
 #else
 
-    char *s, *ns = NULL; 
+    char *s, *ns = NULL;
     size_t slen, nslen;
     mp_exp_t exp;
     std::string out;
 
     if(mpfr_inf_p(mp))
-    { 
+    {
         if(mpfr_sgn(mp)>0) return "+Inf";
         else               return "-Inf";
     }
@@ -1784,7 +1791,7 @@ inline std::string mpreal::toString(int n, int b, mp_rnd_t mode) const
     {
         slen  = strlen(s);
         nslen = strlen(ns);
-        if(nslen<=slen) 
+        if(nslen<=slen)
         {
             mpfr_free_str(s);
             s = ns;
@@ -1801,7 +1808,7 @@ inline std::string mpreal::toString(int n, int b, mp_rnd_t mode) const
             {
                 // Remove zeros starting from right end
                 char* ptr = s+slen-1;
-                while (*ptr=='0' && ptr>s+exp) ptr--; 
+                while (*ptr=='0' && ptr>s+exp) ptr--;
 
                 if(ptr==s+exp) out = std::string(s,exp+1);
                 else           out = std::string(s,exp+1)+'.'+std::string(s+exp+1,ptr-(s+exp+1)+1);
@@ -1812,7 +1819,7 @@ inline std::string mpreal::toString(int n, int b, mp_rnd_t mode) const
             {
                 // Remove zeros starting from right end
                 char* ptr = s+slen-1;
-                while (*ptr=='0' && ptr>s+exp-1) ptr--; 
+                while (*ptr=='0' && ptr>s+exp-1) ptr--;
 
                 if(ptr==s+exp-1) out = std::string(s,exp);
                 else             out = std::string(s,exp)+'.'+std::string(s+exp,ptr-(s+exp)+1);
@@ -1825,7 +1832,7 @@ inline std::string mpreal::toString(int n, int b, mp_rnd_t mode) const
             {
                 // Remove zeros starting from right end
                 char* ptr = s+slen-1;
-                while (*ptr=='0' && ptr>s+1) ptr--; 
+                while (*ptr=='0' && ptr>s+1) ptr--;
 
                 if(ptr==s+1) out = std::string(s,2);
                 else         out = std::string(s,2)+'.'+std::string(s+2,ptr-(s+2)+1);
@@ -1836,7 +1843,7 @@ inline std::string mpreal::toString(int n, int b, mp_rnd_t mode) const
             {
                 // Remove zeros starting from right end
                 char* ptr = s+slen-1;
-                while (*ptr=='0' && ptr>s) ptr--; 
+                while (*ptr=='0' && ptr>s) ptr--;
 
                 if(ptr==s) out = std::string(s,1);
                 else       out = std::string(s,1)+'.'+std::string(s+1,ptr-(s+1)+1);
@@ -1863,7 +1870,7 @@ inline std::string mpreal::toString(int n, int b, mp_rnd_t mode) const
 
 //////////////////////////////////////////////////////////////////////////
 // I/O
-inline std::ostream& mpreal::output(std::ostream& os) const 
+inline std::ostream& mpreal::output(std::ostream& os) const
 {
     std::ostringstream format;
     const std::ios::fmtflags flags = os.flags();
@@ -1926,8 +1933,7 @@ inline int bits2digits(mp_prec_t b)
 // Set/Get number properties
 inline int sgn(const mpreal& op)
 {
-    int r = mpfr_signbit(op.mpfr_srcptr());
-    return (r > 0? -1 : 1);
+    return mpfr_sgn(op.mpfr_srcptr());
 }
 
 inline mpreal& mpreal::setSign(int sign, mp_rnd_t RoundingMode)
@@ -1949,29 +1955,28 @@ inline mpreal& mpreal::setPrecision(int Precision, mp_rnd_t RoundingMode)
     return *this;
 }
 
-inline mpreal& mpreal::setInf(int sign) 
-{ 
+inline mpreal& mpreal::setInf(int sign)
+{
     mpfr_set_inf(mpfr_ptr(), sign);
     MPREAL_MSVC_DEBUGVIEW_CODE;
     return *this;
-}    
+}
 
-inline mpreal& mpreal::setNan() 
+inline mpreal& mpreal::setNan()
 {
     mpfr_set_nan(mpfr_ptr());
     MPREAL_MSVC_DEBUGVIEW_CODE;
     return *this;
 }
 
-inline mpreal&    mpreal::setZero(int sign)
+inline mpreal& mpreal::setZero(int sign)
 {
-
 #if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
     mpfr_set_zero(mpfr_ptr(), sign);
 #else
     mpfr_set_si(mpfr_ptr(), 0, (mpfr_get_default_rounding_mode)());
     setSign(sign);
-#endif 
+#endif
 
     MPREAL_MSVC_DEBUGVIEW_CODE;
     return *this;
@@ -2000,23 +2005,32 @@ inline int mpreal::set_exp (mp_exp_t e)
     return x;
 }
 
-inline const mpreal frexp(const mpreal& v, mp_exp_t* exp)
+inline const mpreal frexp(const mpreal& x, mp_exp_t* exp, mp_rnd_t mode = mpreal::get_default_rnd())
 {
-    mpreal x(v);
-    *exp = x.get_exp();
-    x.set_exp(0);
-    return x;
+    mpreal y(x);
+#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,1,0))
+    mpfr_frexp(exp,y.mpfr_ptr(),x.mpfr_srcptr(),mode);
+#else
+    *exp = mpfr_get_exp(y.mpfr_srcptr());
+    mpfr_set_exp(y.mpfr_ptr(),0);
+#endif
+    return y;
 }
 
 inline const mpreal ldexp(const mpreal& v, mp_exp_t exp)
 {
     mpreal x(v);
 
-    // rounding is not important since we just increasing the exponent
-    mpfr_mul_2si(x.mpfr_ptr(), x.mpfr_srcptr(), exp, mpreal::get_default_rnd()); 
+    // rounding is not important since we are just increasing the exponent (= exact operation)
+    mpfr_mul_2si(x.mpfr_ptr(), x.mpfr_srcptr(), exp, mpreal::get_default_rnd());
     return x;
 }
 
+inline const mpreal scalbn(const mpreal& v, mp_exp_t exp)
+{
+    return ldexp(v, exp);
+}
+
 inline mpreal machine_epsilon(mp_prec_t prec)
 {
     /* the smallest eps such that 1 + eps != 1 */
@@ -2024,7 +2038,7 @@ inline mpreal machine_epsilon(mp_prec_t prec)
 }
 
 inline mpreal machine_epsilon(const mpreal& x)
-{    
+{
     /* the smallest eps such that x + eps != x */
     if( x < 0)
     {
@@ -2045,7 +2059,7 @@ inline mpreal minval(mp_prec_t prec)
 inline mpreal maxval(mp_prec_t prec)
 {
     /* max = (1 - eps) * 2^emax, eps is machine epsilon */
-    return (mpreal(1, prec) - machine_epsilon(prec)) << mpreal::get_emax(); 
+    return (mpreal(1, prec) - machine_epsilon(prec)) << mpreal::get_emax();
 }
 
 inline bool isEqualUlps(const mpreal& a, const mpreal& b, int maxUlps)
@@ -2063,12 +2077,26 @@ inline bool isEqualFuzzy(const mpreal& a, const mpreal& b)
     return isEqualFuzzy(a, b, machine_epsilon((max)(1, (min)(abs(a), abs(b)))));
 }
 
+//////////////////////////////////////////////////////////////////////////
+// C++11 sign functions.
+inline mpreal copysign(const mpreal& x, const  mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
+{
+    mpreal rop(0, mpfr_get_prec(x.mpfr_ptr()));
+    mpfr_setsign(rop.mpfr_ptr(), x.mpfr_srcptr(), mpfr_signbit(y.mpfr_srcptr()), rnd_mode);
+    return rop;
+}
+
+inline bool signbit(const mpreal& x)
+{
+    return mpfr_signbit(x.mpfr_srcptr());
+}
+
 inline const mpreal modf(const mpreal& v, mpreal& n)
 {
     mpreal f(v);
 
     // rounding is not important since we are using the same number
-    mpfr_frac (f.mpfr_ptr(),f.mpfr_srcptr(),mpreal::get_default_rnd());    
+    mpfr_frac (f.mpfr_ptr(),f.mpfr_srcptr(),mpreal::get_default_rnd());
     mpfr_trunc(n.mpfr_ptr(),v.mpfr_srcptr());
     return f;
 }
@@ -2131,7 +2159,7 @@ inline mp_exp_t mpreal::get_emax_max (void)
 #define MPREAL_UNARY_MATH_FUNCTION_BODY(f)                    \
         mpreal y(0, mpfr_get_prec(x.mpfr_srcptr()));          \
         mpfr_##f(y.mpfr_ptr(), x.mpfr_srcptr(), r);           \
-        return y; 
+        return y;
 
 inline const mpreal sqr  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd())
 {   MPREAL_UNARY_MATH_FUNCTION_BODY(sqr );    }
@@ -2154,7 +2182,7 @@ inline const mpreal sqrt(const unsigned int v, mp_rnd_t rnd_mode)
 inline const mpreal sqrt(const long int v, mp_rnd_t rnd_mode)
 {
     if (v>=0)   return sqrt(static_cast<unsigned long int>(v),rnd_mode);
-    else        return mpreal().setNan(); // NaN  
+    else        return mpreal().setNan(); // NaN
 }
 
 inline const mpreal sqrt(const int v, mp_rnd_t rnd_mode)
@@ -2165,9 +2193,9 @@ inline const mpreal sqrt(const int v, mp_rnd_t rnd_mode)
 
 inline const mpreal root(const mpreal& x, unsigned long int k, mp_rnd_t r = mpreal::get_default_rnd())
 {
-    mpreal y(0, mpfr_get_prec(x.mpfr_srcptr())); 
-    mpfr_root(y.mpfr_ptr(), x.mpfr_srcptr(), k, r);  
-    return y; 
+    mpreal y(0, mpfr_get_prec(x.mpfr_srcptr()));
+    mpfr_root(y.mpfr_ptr(), x.mpfr_srcptr(), k, r);
+    return y;
 }
 
 inline const mpreal dim(const mpreal& a, const mpreal& b, mp_rnd_t r = mpreal::get_default_rnd())
@@ -2209,6 +2237,8 @@ inline const mpreal acos  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd
 inline const mpreal asin  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(asin );    }
 inline const mpreal atan  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(atan );    }
 
+inline const mpreal logb  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   return log2 (abs(x),r);                    }
+
 inline const mpreal acot  (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) {   return atan (1/v, r);                      }
 inline const mpreal asec  (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) {   return acos (1/v, r);                      }
 inline const mpreal acsc  (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) {   return asin (1/v, r);                      }
@@ -2230,6 +2260,7 @@ inline const mpreal log1p   (const mpreal& x, mp_rnd_t r = mpreal::get_default_r
 inline const mpreal expm1   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(expm1  );    }
 inline const mpreal eint    (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(eint   );    }
 inline const mpreal gamma   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(gamma  );    }
+inline const mpreal tgamma  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(gamma  );    }
 inline const mpreal lngamma (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(lngamma);    }
 inline const mpreal zeta    (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(zeta   );    }
 inline const mpreal erf     (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(erf    );    }
@@ -2254,7 +2285,7 @@ inline const mpreal hypot (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode =
 }
 
 inline const mpreal remainder (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{    
+{
     mpreal a(0,(std::max)(y.getPrecision(), x.getPrecision()));
     mpfr_remainder(a.mpfr_ptr(), x.mpfr_srcptr(), y.mpfr_srcptr(), rnd_mode);
     return a;
@@ -2307,9 +2338,9 @@ inline const mpreal fma (const mpreal& v1, const mpreal& v2, const mpreal& v3, m
     mpreal a;
     mp_prec_t p1, p2, p3;
 
-    p1 = v1.get_prec(); 
-    p2 = v2.get_prec(); 
-    p3 = v3.get_prec(); 
+    p1 = v1.get_prec();
+    p2 = v2.get_prec();
+    p3 = v3.get_prec();
 
     a.set_prec(p3>p2?(p3>p1?p3:p1):(p2>p1?p2:p1));
 
@@ -2322,9 +2353,9 @@ inline const mpreal fms (const mpreal& v1, const mpreal& v2, const mpreal& v3, m
     mpreal a;
     mp_prec_t p1, p2, p3;
 
-    p1 = v1.get_prec(); 
-    p2 = v2.get_prec(); 
-    p3 = v3.get_prec(); 
+    p1 = v1.get_prec();
+    p2 = v2.get_prec();
+    p3 = v3.get_prec();
 
     a.set_prec(p3>p2?(p3>p1?p3:p1):(p2>p1?p2:p1));
 
@@ -2337,8 +2368,8 @@ inline const mpreal agm (const mpreal& v1, const mpreal& v2, mp_rnd_t rnd_mode =
     mpreal a;
     mp_prec_t p1, p2;
 
-    p1 = v1.get_prec(); 
-    p2 = v2.get_prec(); 
+    p1 = v1.get_prec();
+    p2 = v2.get_prec();
 
     a.set_prec(p1>p2?p1:p2);
 
@@ -2347,16 +2378,17 @@ inline const mpreal agm (const mpreal& v1, const mpreal& v2, mp_rnd_t rnd_mode =
     return a;
 }
 
-inline const mpreal sum (const mpreal tab[], unsigned long int n, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
+inline const mpreal sum (const mpreal tab[], const unsigned long int n, int& status, mp_rnd_t mode = mpreal::get_default_rnd())
 {
+    mpfr_srcptr *p = new mpfr_srcptr[n];
+
+    for (unsigned long int  i = 0; i < n; i++)
+        p[i] = tab[i].mpfr_srcptr();
+
     mpreal x;
-    mpfr_ptr* t;
-    unsigned long int i;
+    status = mpfr_sum(x.mpfr_ptr(), (mpfr_ptr*)p, n, mode);
 
-    t = new mpfr_ptr[n];
-    for (i=0;i<n;i++) t[i] = (mpfr_ptr)tab[i].mp;
-    mpfr_sum(x.mp,t,n,rnd_mode);
-    delete[] t;
+    delete [] p;
     return x;
 }
 
@@ -2369,9 +2401,9 @@ inline int sinh_cosh(mpreal& s, mpreal& c, const mpreal& v, mp_rnd_t rnd_mode =
     return mpfr_sinh_cosh(s.mp,c.mp,v.mp,rnd_mode);
 }
 
-inline const mpreal li2 (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) 
-{   
-    MPREAL_UNARY_MATH_FUNCTION_BODY(li2);    
+inline const mpreal li2 (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd())
+{
+    MPREAL_UNARY_MATH_FUNCTION_BODY(li2);
 }
 
 inline const mpreal rem (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
@@ -2383,23 +2415,23 @@ inline const mpreal rem (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = m
 inline const mpreal mod (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
 {
     (void)rnd_mode;
-    
-    /*  
+
+    /*
 
     m = mod(x,y) if y != 0, returns x - n*y where n = floor(x/y)
 
     The following are true by convention:
     - mod(x,0) is x
     - mod(x,x) is 0
-    - mod(x,y) for x != y and y != 0 has the same sign as y.    
-    
+    - mod(x,y) for x != y and y != 0 has the same sign as y.
+
     */
 
     if(iszero(y)) return x;
     if(x == y) return 0;
 
     mpreal m = x - floor(x / y) * y;
-    
+
     m.setSign(sgn(y)); // make sure result has the same sign as Y
 
     return m;
@@ -2410,8 +2442,8 @@ inline const mpreal fmod (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode =
     mpreal a;
     mp_prec_t yp, xp;
 
-    yp = y.get_prec(); 
-    xp = x.get_prec(); 
+    yp = y.get_prec();
+    xp = x.get_prec();
 
     a.set_prec(yp>xp?yp:xp);
 
@@ -2553,33 +2585,24 @@ inline const mpreal nextbelow  (const mpreal& x)
 inline const mpreal urandomb (gmp_randstate_t& state)
 {
     mpreal x;
-    mpfr_urandomb(x.mp,state);
+    mpfr_urandomb(x.mpfr_ptr(),state);
     return x;
 }
 
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,1,0))
-// use gmp_randinit_default() to init state, gmp_randclear() to clear
+#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
 inline const mpreal urandom (gmp_randstate_t& state, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
 {
     mpreal x;
-    mpfr_urandom(x.mp,state,rnd_mode);
-    return x;
-}
-
-inline const mpreal grandom (gmp_randstate_t& state, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal x;
-    mpfr_grandom(x.mp, NULL, state, rnd_mode);
+    mpfr_urandom(x.mpfr_ptr(), state, rnd_mode);
     return x;
 }
-
-#endif 
+#endif
 
 #if (MPFR_VERSION <= MPFR_VERSION_NUM(2,4,2))
 inline const mpreal random2 (mp_size_t size, mp_exp_t exp)
 {
     mpreal x;
-    mpfr_random2(x.mp,size,exp);
+    mpfr_random2(x.mpfr_ptr(),size,exp);
     return x;
 }
 #endif
@@ -2590,16 +2613,15 @@ inline const mpreal random2 (mp_size_t size, mp_exp_t exp)
 // seed != 0
 inline const mpreal random(unsigned int seed = 0)
 {
-
 #if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
     static gmp_randstate_t state;
-    static bool isFirstTime = true;
+    static bool initialize = true;
 
-    if(isFirstTime)
+    if(initialize)
     {
         gmp_randinit_default(state);
         gmp_randseed_ui(state,0);
-        isFirstTime = false;
+        initialize = false;
     }
 
     if(seed != 0)    gmp_randseed_ui(state,seed);
@@ -2612,17 +2634,25 @@ inline const mpreal random(unsigned int seed = 0)
 
 }
 
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
+#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,1,0))
+
+inline const mpreal grandom (gmp_randstate_t& state, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
+{
+    mpreal x;
+    mpfr_grandom(x.mpfr_ptr(), NULL, state, rnd_mode);
+    return x;
+}
+
 inline const mpreal grandom(unsigned int seed = 0)
 {
     static gmp_randstate_t state;
-    static bool isFirstTime = true;
+    static bool initialize = true;
 
-    if(isFirstTime)
+    if(initialize)
     {
         gmp_randinit_default(state);
         gmp_randseed_ui(state,0);
-        isFirstTime = false;
+        initialize = false;
     }
 
     if(seed != 0) gmp_randseed_ui(state,seed);
@@ -2634,17 +2664,17 @@ inline const mpreal grandom(unsigned int seed = 0)
 //////////////////////////////////////////////////////////////////////////
 // Set/Get global properties
 inline void mpreal::set_default_prec(mp_prec_t prec)
-{ 
-    mpfr_set_default_prec(prec); 
+{
+    mpfr_set_default_prec(prec);
 }
 
 inline void mpreal::set_default_rnd(mp_rnd_t rnd_mode)
-{ 
-    mpfr_set_default_rounding_mode(rnd_mode); 
+{
+    mpfr_set_default_rounding_mode(rnd_mode);
 }
 
 inline bool mpreal::fits_in_bits(double x, int n)
-{   
+{
     int i;
     double t;
     return IsInf(x) || (std::modf ( std::ldexp ( std::frexp ( x, &i ), n ), &t ) == 0.0);
@@ -2894,7 +2924,7 @@ inline const mpreal pow(const int a, const double b, mp_rnd_t rnd_mode)
     else        return pow(mpreal(a),mpreal(b),rnd_mode); //mpfr_pow
 }
 
-// pow long double 
+// pow long double
 inline const mpreal pow(const long double a, const long double b, mp_rnd_t rnd_mode)
 {
     return pow(mpreal(a),mpreal(b),rnd_mode);
@@ -2953,9 +2983,9 @@ namespace std
 {
   // we are allowed to extend namespace std with specializations only
     template <>
-    inline void swap(mpfr::mpreal& x, mpfr::mpreal& y) 
-    { 
-        return mpfr::swap(x, y); 
+    inline void swap(mpfr::mpreal& x, mpfr::mpreal& y)
+    {
+        return mpfr::swap(x, y);
     }
 
     template<>
@@ -2966,7 +2996,7 @@ namespace std
         static const bool is_signed         = true;
         static const bool is_integer        = false;
         static const bool is_exact          = false;
-        static const int  radix             = 2;    
+        static const int  radix             = 2;
 
         static const bool has_infinity      = true;
         static const bool has_quiet_NaN     = true;
@@ -2986,7 +3016,7 @@ namespace std
 
         // Returns smallest eps such that 1 + eps != 1 (classic machine epsilon)
         inline static mpfr::mpreal epsilon(mp_prec_t precision = mpfr::mpreal::get_default_prec()) {  return  mpfr::machine_epsilon(precision); }
-    
+
         // Returns smallest eps such that x + eps != x (relative machine epsilon)
         inline static mpfr::mpreal epsilon(const mpfr::mpreal& x) {  return mpfr::machine_epsilon(x);  }
 
@@ -2994,8 +3024,8 @@ namespace std
         {
             mp_rnd_t r = mpfr::mpreal::get_default_rnd();
 
-            if(r == GMP_RNDN)  return mpfr::mpreal(0.5, precision); 
-            else               return mpfr::mpreal(1.0, precision);    
+            if(r == GMP_RNDN)  return mpfr::mpreal(0.5, precision);
+            else               return mpfr::mpreal(1.0, precision);
         }
 
         inline static const mpfr::mpreal infinity()         { return mpfr::const_infinity();     }
@@ -3006,17 +3036,17 @@ namespace std
         // Please note, exponent range is not fixed in MPFR
         static const int min_exponent = MPFR_EMIN_DEFAULT;
         static const int max_exponent = MPFR_EMAX_DEFAULT;
-        MPREAL_PERMISSIVE_EXPR static const int min_exponent10 = (int) (MPFR_EMIN_DEFAULT * 0.3010299956639811); 
-        MPREAL_PERMISSIVE_EXPR static const int max_exponent10 = (int) (MPFR_EMAX_DEFAULT * 0.3010299956639811); 
+        MPREAL_PERMISSIVE_EXPR static const int min_exponent10 = (int) (MPFR_EMIN_DEFAULT * 0.3010299956639811);
+        MPREAL_PERMISSIVE_EXPR static const int max_exponent10 = (int) (MPFR_EMAX_DEFAULT * 0.3010299956639811);
 
 #ifdef MPREAL_HAVE_DYNAMIC_STD_NUMERIC_LIMITS
 
         // Following members should be constant according to standard, but they can be variable in MPFR
-        // So we define them as functions here. 
+        // So we define them as functions here.
         //
         // This is preferable way for std::numeric_limits<mpfr::mpreal> specialization.
-        // But it is incompatible with standard std::numeric_limits and might not work with other libraries, e.g. boost. 
-        // See below for compatible implementation. 
+        // But it is incompatible with standard std::numeric_limits and might not work with other libraries, e.g. boost.
+        // See below for compatible implementation.
         inline static float_round_style round_style()
         {
             mp_rnd_t r = mpfr::mpreal::get_default_rnd();
@@ -3024,9 +3054,9 @@ namespace std
             switch (r)
             {
             case GMP_RNDN: return round_to_nearest;
-            case GMP_RNDZ: return round_toward_zero; 
-            case GMP_RNDU: return round_toward_infinity; 
-            case GMP_RNDD: return round_toward_neg_infinity; 
+            case GMP_RNDZ: return round_toward_zero;
+            case GMP_RNDU: return round_toward_infinity;
+            case GMP_RNDD: return round_toward_neg_infinity;
             default: return round_indeterminate;
             }
         }
@@ -3053,13 +3083,13 @@ namespace std
         // If possible, please use functions digits() and round_style() defined above.
         //
         // These (default) values are preserved for compatibility with existing libraries, e.g. boost.
-        // Change them accordingly to your application. 
+        // Change them accordingly to your application.
         //
         // For example, if you use 256 bits of precision uniformly in your program, then:
         // digits       = 256
-        // digits10     = 77 
+        // digits10     = 77
         // max_digits10 = 78
-        // 
+        //
         // Approximate formula for decimal digits is: digits10 = floor(log10(2) * digits). See bits2digits() for more details.
 
         static const std::float_round_style round_style = round_to_nearest;
diff --git a/unsupported/test/mpreal_support.cpp b/unsupported/test/mpreal_support.cpp
index bc00382be..685e7ea45 100644
--- a/unsupported/test/mpreal_support.cpp
+++ b/unsupported/test/mpreal_support.cpp
@@ -12,11 +12,13 @@ void test_mpreal_support()
   // set precision to 256 bits (double has only 53 bits)
   mpreal::set_default_prec(256);
   typedef Matrix<mpreal,Eigen::Dynamic,Eigen::Dynamic> MatrixXmp;
+  typedef Matrix<std::complex<mpreal>,Eigen::Dynamic,Eigen::Dynamic> MatrixXcmp;
 
   std::cerr << "epsilon =         " << NumTraits<mpreal>::epsilon() << "\n";
   std::cerr << "dummy_precision = " << NumTraits<mpreal>::dummy_precision() << "\n";
   std::cerr << "highest =         " << NumTraits<mpreal>::highest() << "\n";
   std::cerr << "lowest =          " << NumTraits<mpreal>::lowest() << "\n";
+  std::cerr << "digits10 =        " << NumTraits<mpreal>::digits10() << "\n";
 
   for(int i = 0; i < g_repeat; i++) {
     int s = Eigen::internal::random<int>(1,100);
@@ -24,6 +26,10 @@ void test_mpreal_support()
     MatrixXmp B = MatrixXmp::Random(s,s);
     MatrixXmp S = A.adjoint() * A;
     MatrixXmp X;
+    MatrixXcmp Ac = MatrixXcmp::Random(s,s);
+    MatrixXcmp Bc = MatrixXcmp::Random(s,s);
+    MatrixXcmp Sc = Ac.adjoint() * Ac;
+    MatrixXcmp Xc;
     
     // Basic stuffs
     VERIFY_IS_APPROX(A.real(), A);
@@ -32,12 +38,14 @@ void test_mpreal_support()
     VERIFY_IS_APPROX(A.array().abs2().sqrt(), A.array().abs());
     VERIFY_IS_APPROX(A.array().sin(),         sin(A.array()));
     VERIFY_IS_APPROX(A.array().cos(),         cos(A.array()));
-    
 
     // Cholesky
     X = S.selfadjointView<Lower>().llt().solve(B);
     VERIFY_IS_APPROX((S.selfadjointView<Lower>()*X).eval(),B);
 
+    Xc = Sc.selfadjointView<Lower>().llt().solve(Bc);
+    VERIFY_IS_APPROX((Sc.selfadjointView<Lower>()*Xc).eval(),Bc);
+    
     // partial LU
     X = A.lu().solve(B);
     VERIFY_IS_APPROX((A*X).eval(),B);
diff --git a/unsupported/test/polynomialsolver.cpp b/unsupported/test/polynomialsolver.cpp
index de79f1538..0c87478dd 100644
--- a/unsupported/test/polynomialsolver.cpp
+++ b/unsupported/test/polynomialsolver.cpp
@@ -38,6 +38,9 @@ bool aux_evalSolver( const POLYNOMIAL& pols, SOLVER& psolve )
 
   const Index deg = pols.size()-1;
 
+  // Test template constructor from coefficient vector
+  SOLVER solve_constr (pols);
+
   psolve.compute( pols );
   const RootsType& roots( psolve.roots() );
   EvalRootsType evr( deg );
@@ -104,6 +107,7 @@ void evalSolverSugarFunction( const POLYNOMIAL& pols, const ROOTS& roots, const
     // 1) the roots found are correct
     // 2) the roots have distinct moduli
 
+    typedef typename POLYNOMIAL::Scalar                 Scalar;
     typedef typename REAL_ROOTS::Scalar                 Real;
 
     //Test realRoots
@@ -118,7 +122,7 @@ void evalSolverSugarFunction( const POLYNOMIAL& pols, const ROOTS& roots, const
       bool found = false;
       for( size_t j=0; j<calc_realRoots.size()&& !found; ++j )
       {
-        if( internal::isApprox( calc_realRoots[i], real_roots[j] ), psPrec ){
+        if( internal::isApprox( calc_realRoots[i], real_roots[j], psPrec ) ){
           found = true; }
       }
       VERIFY( found );
@@ -209,5 +213,6 @@ void test_polynomialsolver()
     CALL_SUBTEST_10((polynomialsolver<double,Dynamic>(
             internal::random<int>(9,13)
             )) );
+    CALL_SUBTEST_11((polynomialsolver<float,Dynamic>(1)) );
   }
 }
diff --git a/unsupported/test/sparse_extra.cpp b/unsupported/test/sparse_extra.cpp
index 1ee791b0f..a010ceb93 100644
--- a/unsupported/test/sparse_extra.cpp
+++ b/unsupported/test/sparse_extra.cpp
@@ -49,7 +49,6 @@ bool test_random_setter(DynamicSparseMatrix<T>& sm, const DenseType& ref, const
 
 template<typename SparseMatrixType> void sparse_extra(const SparseMatrixType& ref)
 {
-  typedef typename SparseMatrixType::Index Index;
   const Index rows = ref.rows();
   const Index cols = ref.cols();
   typedef typename SparseMatrixType::Scalar Scalar;
diff --git a/unsupported/test/special_functions.cpp b/unsupported/test/special_functions.cpp
new file mode 100644
index 000000000..057fb3e92
--- /dev/null
+++ b/unsupported/test/special_functions.cpp
@@ -0,0 +1,345 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include "../Eigen/SpecialFunctions"
+
+template<typename X, typename Y>
+void verify_component_wise(const X& x, const Y& y)
+{
+  for(Index i=0; i<x.size(); ++i)
+  {
+    if((numext::isfinite)(y(i)))
+      VERIFY_IS_APPROX( x(i), y(i) );
+    else if((numext::isnan)(y(i)))
+      VERIFY((numext::isnan)(x(i)));
+    else
+      VERIFY_IS_EQUAL( x(i), y(i) );
+  }
+}
+
+template<typename ArrayType> void array_special_functions()
+{
+  using std::abs;
+  using std::sqrt;
+  typedef typename ArrayType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+
+  Scalar plusinf = std::numeric_limits<Scalar>::infinity();
+  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
+
+  Index rows = internal::random<Index>(1,30);
+  Index cols = 1;
+
+  // API
+  {
+    ArrayType m1 = ArrayType::Random(rows,cols);
+#if EIGEN_HAS_C99_MATH
+    VERIFY_IS_APPROX(m1.lgamma(), lgamma(m1));
+    VERIFY_IS_APPROX(m1.digamma(), digamma(m1));
+    VERIFY_IS_APPROX(m1.erf(), erf(m1));
+    VERIFY_IS_APPROX(m1.erfc(), erfc(m1));
+#endif  // EIGEN_HAS_C99_MATH
+  }
+
+
+#if EIGEN_HAS_C99_MATH
+  // check special functions (comparing against numpy implementation)
+  if (!NumTraits<Scalar>::IsComplex)
+  {
+
+    {
+      ArrayType m1 = ArrayType::Random(rows,cols);
+      ArrayType m2 = ArrayType::Random(rows,cols);
+
+      // Test various propreties of igamma & igammac.  These are normalized
+      // gamma integrals where
+      //   igammac(a, x) = Gamma(a, x) / Gamma(a)
+      //   igamma(a, x) = gamma(a, x) / Gamma(a)
+      // where Gamma and gamma are considered the standard unnormalized
+      // upper and lower incomplete gamma functions, respectively.
+      ArrayType a = m1.abs() + 2;
+      ArrayType x = m2.abs() + 2;
+      ArrayType zero = ArrayType::Zero(rows, cols);
+      ArrayType one = ArrayType::Constant(rows, cols, Scalar(1.0));
+      ArrayType a_m1 = a - one;
+      ArrayType Gamma_a_x = Eigen::igammac(a, x) * a.lgamma().exp();
+      ArrayType Gamma_a_m1_x = Eigen::igammac(a_m1, x) * a_m1.lgamma().exp();
+      ArrayType gamma_a_x = Eigen::igamma(a, x) * a.lgamma().exp();
+      ArrayType gamma_a_m1_x = Eigen::igamma(a_m1, x) * a_m1.lgamma().exp();
+
+      // Gamma(a, 0) == Gamma(a)
+      VERIFY_IS_APPROX(Eigen::igammac(a, zero), one);
+
+      // Gamma(a, x) + gamma(a, x) == Gamma(a)
+      VERIFY_IS_APPROX(Gamma_a_x + gamma_a_x, a.lgamma().exp());
+
+      // Gamma(a, x) == (a - 1) * Gamma(a-1, x) + x^(a-1) * exp(-x)
+      VERIFY_IS_APPROX(Gamma_a_x, (a - 1) * Gamma_a_m1_x + x.pow(a-1) * (-x).exp());
+
+      // gamma(a, x) == (a - 1) * gamma(a-1, x) - x^(a-1) * exp(-x)
+      VERIFY_IS_APPROX(gamma_a_x, (a - 1) * gamma_a_m1_x - x.pow(a-1) * (-x).exp());
+    }
+
+    {
+      // Check exact values of igamma and igammac against a third party calculation.
+      Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
+      Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
+
+      // location i*6+j corresponds to a_s[i], x_s[j].
+      Scalar igamma_s[][6] = {{0.0, nan, nan, nan, nan, nan},
+                              {0.0, 0.6321205588285578, 0.7768698398515702,
+                              0.9816843611112658, 9.999500016666262e-05, 1.0},
+                              {0.0, 0.4275932955291202, 0.608374823728911,
+                              0.9539882943107686, 7.522076445089201e-07, 1.0},
+                              {0.0, 0.01898815687615381, 0.06564245437845008,
+                              0.5665298796332909, 4.166333347221828e-18, 1.0},
+                              {0.0, 0.9999780593618628, 0.9999899967080838,
+                              0.9999996219837988, 0.9991370418689945, 1.0},
+                              {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}};
+      Scalar igammac_s[][6] = {{nan, nan, nan, nan, nan, nan},
+                              {1.0, 0.36787944117144233, 0.22313016014842982,
+                                0.018315638888734182, 0.9999000049998333, 0.0},
+                              {1.0, 0.5724067044708798, 0.3916251762710878,
+                                0.04601170568923136, 0.9999992477923555, 0.0},
+                              {1.0, 0.9810118431238462, 0.9343575456215499,
+                                0.4334701203667089, 1.0, 0.0},
+                              {1.0, 2.1940638138146658e-05, 1.0003291916285e-05,
+                                3.7801620118431334e-07, 0.0008629581310054535,
+                                0.0},
+                              {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}};
+      for (int i = 0; i < 6; ++i) {
+        for (int j = 0; j < 6; ++j) {
+          if ((std::isnan)(igamma_s[i][j])) {
+            VERIFY((std::isnan)(numext::igamma(a_s[i], x_s[j])));
+          } else {
+            VERIFY_IS_APPROX(numext::igamma(a_s[i], x_s[j]), igamma_s[i][j]);
+          }
+
+          if ((std::isnan)(igammac_s[i][j])) {
+            VERIFY((std::isnan)(numext::igammac(a_s[i], x_s[j])));
+          } else {
+            VERIFY_IS_APPROX(numext::igammac(a_s[i], x_s[j]), igammac_s[i][j]);
+          }
+        }
+      }
+    }
+  }
+#endif  // EIGEN_HAS_C99_MATH
+
+  // Check the zeta function against scipy.special.zeta
+  {
+    ArrayType x(7), q(7), res(7), ref(7);
+    x << 1.5,   4, 10.5, 10000.5,    3, 1,        0.9;
+    q << 2,   1.5,    3,  1.0001, -2.5, 1.2345, 1.2345;
+    ref << 1.61237534869, 0.234848505667, 1.03086757337e-5, 0.367879440865, 0.054102025820864097, plusinf, nan;
+    CALL_SUBTEST( verify_component_wise(ref, ref); );
+    CALL_SUBTEST( res = x.zeta(q); verify_component_wise(res, ref); );
+    CALL_SUBTEST( res = zeta(x,q); verify_component_wise(res, ref); );
+  }
+
+  // digamma
+  {
+    ArrayType x(7), res(7), ref(7);
+    x << 1, 1.5, 4, -10.5, 10000.5, 0, -1;
+    ref << -0.5772156649015329, 0.03648997397857645, 1.2561176684318, 2.398239129535781, 9.210340372392849, plusinf, plusinf;
+    CALL_SUBTEST( verify_component_wise(ref, ref); );
+
+    CALL_SUBTEST( res = x.digamma(); verify_component_wise(res, ref); );
+    CALL_SUBTEST( res = digamma(x);  verify_component_wise(res, ref); );
+  }
+
+
+#if EIGEN_HAS_C99_MATH
+  {
+    ArrayType n(11), x(11), res(11), ref(11);
+    n << 1, 1,    1, 1.5,   17,   31,   28,    8, 42, 147, 170;
+    x << 2, 3, 25.5, 1.5,  4.7, 11.8, 17.7, 30.2, 15.8, 54.1, 64;
+    ref << 0.644934066848, 0.394934066848, 0.0399946696496, nan, 293.334565435, 0.445487887616, -2.47810300902e-07, -8.29668781082e-09, -0.434562276666, 0.567742190178, -0.0108615497927;
+    CALL_SUBTEST( verify_component_wise(ref, ref); );
+
+    if(sizeof(RealScalar)>=8) {  // double
+      // Reason for commented line: http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1232
+      //       CALL_SUBTEST( res = x.polygamma(n); verify_component_wise(res, ref); );
+      CALL_SUBTEST( res = polygamma(n,x);  verify_component_wise(res, ref); );
+    }
+    else {
+      //       CALL_SUBTEST( res = x.polygamma(n); verify_component_wise(res.head(8), ref.head(8)); );
+      CALL_SUBTEST( res = polygamma(n,x); verify_component_wise(res.head(8), ref.head(8)); );
+    }
+  }
+#endif
+
+#if EIGEN_HAS_C99_MATH
+  {
+    // Inputs and ground truth generated with scipy via:
+    //   a = np.logspace(-3, 3, 5) - 1e-3
+    //   b = np.logspace(-3, 3, 5) - 1e-3
+    //   x = np.linspace(-0.1, 1.1, 5)
+    //   (full_a, full_b, full_x) = np.vectorize(lambda a, b, x: (a, b, x))(*np.ix_(a, b, x))
+    //   full_a = full_a.flatten().tolist()  # same for full_b, full_x
+    //   v = scipy.special.betainc(full_a, full_b, full_x).flatten().tolist()
+    //
+    // Note in Eigen, we call betainc with arguments in the order (x, a, b).
+    ArrayType a(125);
+    ArrayType b(125);
+    ArrayType x(125);
+    ArrayType v(125);
+    ArrayType res(125);
+
+    a << 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
+        0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
+        0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
+        31.62177660168379, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
+        999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
+        999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
+        999.999, 999.999, 999.999;
+
+    b << 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
+        0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999,
+        999.999, 999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.03062277660168379, 0.999, 0.999, 0.999, 0.999,
+        0.999, 31.62177660168379, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 31.62177660168379, 999.999, 999.999, 999.999,
+        999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999,
+        31.62177660168379, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 31.62177660168379, 999.999, 999.999, 999.999,
+        999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999,
+        31.62177660168379, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 31.62177660168379, 999.999, 999.999, 999.999,
+        999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999,
+        31.62177660168379, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 31.62177660168379, 999.999, 999.999, 999.999,
+        999.999, 999.999;
+
+    x << -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
+        0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
+        0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1,
+        0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1,
+        -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
+        1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
+        0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
+        0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1,
+        0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
+        0.8, 1.1;
+
+    v << nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
+        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
+        nan, nan, nan, 0.47972119876364683, 0.5, 0.5202788012363533, nan, nan,
+        0.9518683957740043, 0.9789663010413743, 0.9931729188073435, nan, nan,
+        0.999995949033062, 0.9999999999993698, 0.9999999999999999, nan, nan,
+        0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan,
+        nan, nan, nan, nan, nan, 0.006827081192655869, 0.0210336989586256,
+        0.04813160422599567, nan, nan, 0.20014344256217678, 0.5000000000000001,
+        0.7998565574378232, nan, nan, 0.9991401428435834, 0.999999999698403,
+        0.9999999999999999, nan, nan, 0.9999999999999999, 0.9999999999999999,
+        0.9999999999999999, nan, nan, nan, nan, nan, nan, nan,
+        1.0646600232370887e-25, 6.301722877826246e-13, 4.050966937974938e-06,
+        nan, nan, 7.864342668429763e-23, 3.015969667594166e-10,
+        0.0008598571564165444, nan, nan, 6.031987710123844e-08,
+        0.5000000000000007, 0.9999999396801229, nan, nan, 0.9999999999999999,
+        0.9999999999999999, 0.9999999999999999, nan, nan, nan, nan, nan, nan,
+        nan, 0.0, 7.029920380986636e-306, 2.2450728208591345e-101, nan, nan,
+        0.0, 9.275871147869727e-302, 1.2232913026152827e-97, nan, nan, 0.0,
+        3.0891393081932924e-252, 2.9303043666183996e-60, nan, nan,
+        2.248913486879199e-196, 0.5000000000004947, 0.9999999999999999, nan;
+
+    CALL_SUBTEST(res = betainc(a, b, x);
+                 verify_component_wise(res, v););
+  }
+
+  // Test various properties of betainc
+  {
+    ArrayType m1 = ArrayType::Random(32);
+    ArrayType m2 = ArrayType::Random(32);
+    ArrayType m3 = ArrayType::Random(32);
+    ArrayType one = ArrayType::Constant(32, Scalar(1.0));
+    const Scalar eps = std::numeric_limits<Scalar>::epsilon();
+    ArrayType a = (m1 * 4.0).exp();
+    ArrayType b = (m2 * 4.0).exp();
+    ArrayType x = m3.abs();
+
+    // betainc(a, 1, x) == x**a
+    CALL_SUBTEST(
+        ArrayType test = betainc(a, one, x);
+        ArrayType expected = x.pow(a);
+        verify_component_wise(test, expected););
+
+    // betainc(1, b, x) == 1 - (1 - x)**b
+    CALL_SUBTEST(
+        ArrayType test = betainc(one, b, x);
+        ArrayType expected = one - (one - x).pow(b);
+        verify_component_wise(test, expected););
+
+    // betainc(a, b, x) == 1 - betainc(b, a, 1-x)
+    CALL_SUBTEST(
+        ArrayType test = betainc(a, b, x) + betainc(b, a, one - x);
+        ArrayType expected = one;
+        verify_component_wise(test, expected););
+
+    // betainc(a+1, b, x) = betainc(a, b, x) - x**a * (1 - x)**b / (a * beta(a, b))
+    CALL_SUBTEST(
+        ArrayType num = x.pow(a) * (one - x).pow(b);
+        ArrayType denom = a * (a.lgamma() + b.lgamma() - (a + b).lgamma()).exp();
+        // Add eps to rhs and lhs so that component-wise test doesn't result in
+        // nans when both outputs are zeros.
+        ArrayType expected = betainc(a, b, x) - num / denom + eps;
+        ArrayType test = betainc(a + one, b, x) + eps;
+        if (sizeof(Scalar) >= 8) { // double
+          verify_component_wise(test, expected);
+        } else {
+          // Reason for limited test: http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1232
+          verify_component_wise(test.head(8), expected.head(8));
+        });
+
+    // betainc(a, b+1, x) = betainc(a, b, x) + x**a * (1 - x)**b / (b * beta(a, b))
+    CALL_SUBTEST(
+        // Add eps to rhs and lhs so that component-wise test doesn't result in
+        // nans when both outputs are zeros.
+        ArrayType num = x.pow(a) * (one - x).pow(b);
+        ArrayType denom = b * (a.lgamma() + b.lgamma() - (a + b).lgamma()).exp();
+        ArrayType expected = betainc(a, b, x) + num / denom + eps;
+        ArrayType test = betainc(a, b + one, x) + eps;
+        verify_component_wise(test, expected););
+  }
+#endif
+}
+
+void test_special_functions()
+{
+  CALL_SUBTEST_1(array_special_functions<ArrayXf>());
+  CALL_SUBTEST_2(array_special_functions<ArrayXd>());
+}
diff --git a/unsupported/test/splines.cpp b/unsupported/test/splines.cpp
index a7eb3e0c4..3be020434 100644
--- a/unsupported/test/splines.cpp
+++ b/unsupported/test/splines.cpp
@@ -13,23 +13,23 @@
 
 namespace Eigen {
   
-// lets do some explicit instantiations and thus
-// force the compilation of all spline functions...
-template class Spline<double, 2, Dynamic>;
-template class Spline<double, 3, Dynamic>;
+  // lets do some explicit instantiations and thus
+  // force the compilation of all spline functions...
+  template class Spline<double, 2, Dynamic>;
+  template class Spline<double, 3, Dynamic>;
 
-template class Spline<double, 2, 2>;
-template class Spline<double, 2, 3>;
-template class Spline<double, 2, 4>;
-template class Spline<double, 2, 5>;
+  template class Spline<double, 2, 2>;
+  template class Spline<double, 2, 3>;
+  template class Spline<double, 2, 4>;
+  template class Spline<double, 2, 5>;
 
-template class Spline<float, 2, Dynamic>;
-template class Spline<float, 3, Dynamic>;
+  template class Spline<float, 2, Dynamic>;
+  template class Spline<float, 3, Dynamic>;
 
-template class Spline<float, 3, 2>;
-template class Spline<float, 3, 3>;
-template class Spline<float, 3, 4>;
-template class Spline<float, 3, 5>;
+  template class Spline<float, 3, 2>;
+  template class Spline<float, 3, 3>;
+  template class Spline<float, 3, 4>;
+  template class Spline<float, 3, 5>;
 
 }
 
@@ -234,11 +234,48 @@ void check_global_interpolation2d()
   }
 }
 
+void check_global_interpolation_with_derivatives2d()
+{
+  typedef Spline2d::PointType PointType;
+  typedef Spline2d::KnotVectorType KnotVectorType;
+
+  const Eigen::DenseIndex numPoints = 100;
+  const unsigned int dimension = 2;
+  const unsigned int degree = 3;
+
+  ArrayXXd points = ArrayXXd::Random(dimension, numPoints);
+
+  KnotVectorType knots;
+  Eigen::ChordLengths(points, knots);
+
+  ArrayXXd derivatives = ArrayXXd::Random(dimension, numPoints);
+  VectorXd derivativeIndices(numPoints);
+
+  for (Eigen::DenseIndex i = 0; i < numPoints; ++i)
+      derivativeIndices(i) = static_cast<double>(i);
+
+  const Spline2d spline = SplineFitting<Spline2d>::InterpolateWithDerivatives(
+    points, derivatives, derivativeIndices, degree);  
+    
+  for (Eigen::DenseIndex i = 0; i < points.cols(); ++i)
+  {
+    PointType point = spline(knots(i));
+    PointType referencePoint = points.col(i);
+    VERIFY_IS_APPROX(point, referencePoint);
+    PointType derivative = spline.derivatives(knots(i), 1).col(1);
+    PointType referenceDerivative = derivatives.col(i);
+    VERIFY_IS_APPROX(derivative, referenceDerivative);
+  }
+}
 
 void test_splines()
 {
-  CALL_SUBTEST( eval_spline3d() );
-  CALL_SUBTEST( eval_spline3d_onbrks() );
-  CALL_SUBTEST( eval_closed_spline2d() );
-  CALL_SUBTEST( check_global_interpolation2d() );
+  for (int i = 0; i < g_repeat; ++i)
+  {
+    CALL_SUBTEST( eval_spline3d() );
+    CALL_SUBTEST( eval_spline3d_onbrks() );
+    CALL_SUBTEST( eval_closed_spline2d() );
+    CALL_SUBTEST( check_global_interpolation2d() );
+    CALL_SUBTEST( check_global_interpolation_with_derivatives2d() );
+  }
 }
diff --git a/unsupported/test/svd_common.h b/unsupported/test/svd_common.h
deleted file mode 100644
index b40c23a2b..000000000
--- a/unsupported/test/svd_common.h
+++ /dev/null
@@ -1,261 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
-// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// Copyright (C) 2013 Gauthier Brun <brun.gauthier@gmail.com>
-// Copyright (C) 2013 Nicolas Carre <nicolas.carre@ensimag.fr>
-// Copyright (C) 2013 Jean Ceccato <jean.ceccato@ensimag.fr>
-// Copyright (C) 2013 Pierre Zoppitelli <pierre.zoppitelli@ensimag.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// discard stack allocation as that too bypasses malloc
-#define EIGEN_STACK_ALLOCATION_LIMIT 0
-#define EIGEN_RUNTIME_NO_MALLOC
-
-#include "main.h"
-#include <unsupported/Eigen/SVD>
-#include <Eigen/LU>
-
-
-// check if "svd" is the good image of "m"  
-template<typename MatrixType, typename SVD>
-void svd_check_full(const MatrixType& m, const SVD& svd)
-{
-  typedef typename MatrixType::Index Index;
-  Index rows = m.rows();
-  Index cols = m.cols();
-  enum {
-    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-    ColsAtCompileTime = MatrixType::ColsAtCompileTime
-  };
-
-  typedef typename MatrixType::Scalar Scalar;
-  typedef Matrix<Scalar, RowsAtCompileTime, RowsAtCompileTime> MatrixUType;
-  typedef Matrix<Scalar, ColsAtCompileTime, ColsAtCompileTime> MatrixVType;
-
-  
-  MatrixType sigma = MatrixType::Zero(rows, cols);
-  sigma.diagonal() = svd.singularValues().template cast<Scalar>();
-  MatrixUType u = svd.matrixU();
-  MatrixVType v = svd.matrixV();
-  VERIFY_IS_APPROX(m, u * sigma * v.adjoint());
-  VERIFY_IS_UNITARY(u);
-  VERIFY_IS_UNITARY(v);
-} // end svd_check_full
-
-
-
-// Compare to a reference value
-template<typename MatrixType, typename SVD>
-void svd_compare_to_full(const MatrixType& m,
-			 unsigned int computationOptions,
-			 const SVD& referenceSvd)
-{
-  typedef typename MatrixType::Index Index;
-  Index rows = m.rows();
-  Index cols = m.cols();
-  Index diagSize = (std::min)(rows, cols);
-
-  SVD svd(m, computationOptions);
-
-  VERIFY_IS_APPROX(svd.singularValues(), referenceSvd.singularValues());
-  if(computationOptions & ComputeFullU)
-    VERIFY_IS_APPROX(svd.matrixU(), referenceSvd.matrixU());
-  if(computationOptions & ComputeThinU)
-    VERIFY_IS_APPROX(svd.matrixU(), referenceSvd.matrixU().leftCols(diagSize));
-  if(computationOptions & ComputeFullV)
-    VERIFY_IS_APPROX(svd.matrixV(), referenceSvd.matrixV());
-  if(computationOptions & ComputeThinV)
-    VERIFY_IS_APPROX(svd.matrixV(), referenceSvd.matrixV().leftCols(diagSize));
-} // end svd_compare_to_full
-
-
-
-template<typename MatrixType, typename SVD>
-void svd_solve(const MatrixType& m, unsigned int computationOptions)
-{
-  typedef typename MatrixType::Scalar Scalar;
-  typedef typename MatrixType::Index Index;
-  Index rows = m.rows();
-  Index cols = m.cols();
-
-  enum {
-    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-    ColsAtCompileTime = MatrixType::ColsAtCompileTime
-  };
-
-  typedef Matrix<Scalar, RowsAtCompileTime, Dynamic> RhsType;
-  typedef Matrix<Scalar, ColsAtCompileTime, Dynamic> SolutionType;
-
-  RhsType rhs = RhsType::Random(rows, internal::random<Index>(1, cols));
-  SVD svd(m, computationOptions);
-  SolutionType x = svd.solve(rhs);
-  // evaluate normal equation which works also for least-squares solutions
-  VERIFY_IS_APPROX(m.adjoint()*m*x,m.adjoint()*rhs);
-} // end svd_solve
-
-
-// test computations options
-// 2 functions because Jacobisvd can return before the second function
-template<typename MatrixType, typename SVD>
-void svd_test_computation_options_1(const MatrixType& m, const SVD& fullSvd)
-{
-  svd_check_full< MatrixType, SVD >(m, fullSvd);
-  svd_solve< MatrixType, SVD >(m, ComputeFullU | ComputeFullV);
-}
-
-
-template<typename MatrixType, typename SVD>
-void svd_test_computation_options_2(const MatrixType& m, const SVD& fullSvd)
-{
-  svd_compare_to_full< MatrixType, SVD >(m, ComputeFullU, fullSvd);
-  svd_compare_to_full< MatrixType, SVD >(m, ComputeFullV, fullSvd);
-  svd_compare_to_full< MatrixType, SVD >(m, 0, fullSvd);
-
-  if (MatrixType::ColsAtCompileTime == Dynamic) {
-    // thin U/V are only available with dynamic number of columns
- 
-    svd_compare_to_full< MatrixType, SVD >(m, ComputeFullU|ComputeThinV, fullSvd);
-    svd_compare_to_full< MatrixType, SVD >(m,              ComputeThinV, fullSvd);
-    svd_compare_to_full< MatrixType, SVD >(m, ComputeThinU|ComputeFullV, fullSvd);
-    svd_compare_to_full< MatrixType, SVD >(m, ComputeThinU             , fullSvd);
-    svd_compare_to_full< MatrixType, SVD >(m, ComputeThinU|ComputeThinV, fullSvd);
-    svd_solve<MatrixType, SVD>(m, ComputeFullU | ComputeThinV);
-    svd_solve<MatrixType, SVD>(m, ComputeThinU | ComputeFullV);
-    svd_solve<MatrixType, SVD>(m, ComputeThinU | ComputeThinV);
-    
-    typedef typename MatrixType::Index Index;
-    Index diagSize = (std::min)(m.rows(), m.cols());
-    SVD svd(m, ComputeThinU | ComputeThinV);
-    VERIFY_IS_APPROX(m, svd.matrixU().leftCols(diagSize) * svd.singularValues().asDiagonal() * svd.matrixV().leftCols(diagSize).adjoint());
-  }
-}
-
-template<typename MatrixType, typename SVD> 
-void svd_verify_assert(const MatrixType& m)
-{
-  typedef typename MatrixType::Scalar Scalar;
-  typedef typename MatrixType::Index Index;
-  Index rows = m.rows();
-  Index cols = m.cols();
-
-  enum {
-    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-    ColsAtCompileTime = MatrixType::ColsAtCompileTime
-  };
-
-  typedef Matrix<Scalar, RowsAtCompileTime, 1> RhsType;
-  RhsType rhs(rows);
-  SVD svd;
-  VERIFY_RAISES_ASSERT(svd.matrixU())
-  VERIFY_RAISES_ASSERT(svd.singularValues())
-  VERIFY_RAISES_ASSERT(svd.matrixV())
-  VERIFY_RAISES_ASSERT(svd.solve(rhs))
-  MatrixType a = MatrixType::Zero(rows, cols);
-  a.setZero();
-  svd.compute(a, 0);
-  VERIFY_RAISES_ASSERT(svd.matrixU())
-  VERIFY_RAISES_ASSERT(svd.matrixV())
-  svd.singularValues();
-  VERIFY_RAISES_ASSERT(svd.solve(rhs))
-    
-  if (ColsAtCompileTime == Dynamic)
-  {
-    svd.compute(a, ComputeThinU);
-    svd.matrixU();
-    VERIFY_RAISES_ASSERT(svd.matrixV())
-    VERIFY_RAISES_ASSERT(svd.solve(rhs))
-    svd.compute(a, ComputeThinV);
-    svd.matrixV();
-    VERIFY_RAISES_ASSERT(svd.matrixU())
-    VERIFY_RAISES_ASSERT(svd.solve(rhs))
-  }
-  else
-  {
-    VERIFY_RAISES_ASSERT(svd.compute(a, ComputeThinU))
-    VERIFY_RAISES_ASSERT(svd.compute(a, ComputeThinV))
-  }
-}
-
-// work around stupid msvc error when constructing at compile time an expression that involves
-// a division by zero, even if the numeric type has floating point
-template<typename Scalar>
-EIGEN_DONT_INLINE Scalar zero() { return Scalar(0); }
-
-// workaround aggressive optimization in ICC
-template<typename T> EIGEN_DONT_INLINE  T sub(T a, T b) { return a - b; }
-
-
-template<typename MatrixType, typename SVD>
-void svd_inf_nan()
-{
-  // all this function does is verify we don't iterate infinitely on nan/inf values
-
-  SVD svd;
-  typedef typename MatrixType::Scalar Scalar;
-  Scalar some_inf = Scalar(1) / zero<Scalar>();
-  VERIFY(sub(some_inf, some_inf) != sub(some_inf, some_inf));
-  svd.compute(MatrixType::Constant(10,10,some_inf), ComputeFullU | ComputeFullV);
-
-  Scalar some_nan = zero<Scalar> () / zero<Scalar> ();
-  VERIFY(some_nan != some_nan);
-  svd.compute(MatrixType::Constant(10,10,some_nan), ComputeFullU | ComputeFullV);
-
-  MatrixType m = MatrixType::Zero(10,10);
-  m(internal::random<int>(0,9), internal::random<int>(0,9)) = some_inf;
-  svd.compute(m, ComputeFullU | ComputeFullV);
-
-  m = MatrixType::Zero(10,10);
-  m(internal::random<int>(0,9), internal::random<int>(0,9)) = some_nan;
-  svd.compute(m, ComputeFullU | ComputeFullV);
-}
-
-
-template<typename SVD>
-void svd_preallocate()
-{
-  Vector3f v(3.f, 2.f, 1.f);
-  MatrixXf m = v.asDiagonal();
-
-  internal::set_is_malloc_allowed(false);
-  VERIFY_RAISES_ASSERT(VectorXf v(10);)
-    SVD svd;
-  internal::set_is_malloc_allowed(true);
-  svd.compute(m);
-  VERIFY_IS_APPROX(svd.singularValues(), v);
-
-  SVD svd2(3,3);
-  internal::set_is_malloc_allowed(false);
-  svd2.compute(m);
-  internal::set_is_malloc_allowed(true);
-  VERIFY_IS_APPROX(svd2.singularValues(), v);
-  VERIFY_RAISES_ASSERT(svd2.matrixU());
-  VERIFY_RAISES_ASSERT(svd2.matrixV());
-  svd2.compute(m, ComputeFullU | ComputeFullV);
-  VERIFY_IS_APPROX(svd2.matrixU(), Matrix3f::Identity());
-  VERIFY_IS_APPROX(svd2.matrixV(), Matrix3f::Identity());
-  internal::set_is_malloc_allowed(false);
-  svd2.compute(m);
-  internal::set_is_malloc_allowed(true);
-
-  SVD svd3(3,3,ComputeFullU|ComputeFullV);
-  internal::set_is_malloc_allowed(false);
-  svd2.compute(m);
-  internal::set_is_malloc_allowed(true);
-  VERIFY_IS_APPROX(svd2.singularValues(), v);
-  VERIFY_IS_APPROX(svd2.matrixU(), Matrix3f::Identity());
-  VERIFY_IS_APPROX(svd2.matrixV(), Matrix3f::Identity());
-  internal::set_is_malloc_allowed(false);
-  svd2.compute(m, ComputeFullU|ComputeFullV);
-  internal::set_is_malloc_allowed(true);
-}
-
-
-
-
-
author	Miao Wang <miaowang@google.com>	2017-03-06 13:45:08 -0800
committer	Miao Wang <miaowang@google.com>	2017-03-07 16:30:11 -0800
commit	2b8756b6f1de65d3f8bffab45be6c44ceb7411fc (patch)
tree	0488797fc544fe977bec6418c73445759f052482
parent	353bba589de58014a35f8f3666b7b96353c300f8 (diff)
download	eigen-2b8756b6f1de65d3f8bffab45be6c44ceb7411fc.tar.gz